michael@0: /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- michael@0: * vim: set ts=8 sts=4 et sw=4 tw=99: michael@0: * michael@0: * Copyright (C) 2009 Apple Inc. All rights reserved. michael@0: * michael@0: * Redistribution and use in source and binary forms, with or without michael@0: * modification, are permitted provided that the following conditions michael@0: * are met: michael@0: * 1. Redistributions of source code must retain the above copyright michael@0: * notice, this list of conditions and the following disclaimer. michael@0: * 2. Redistributions in binary form must reproduce the above copyright michael@0: * notice, this list of conditions and the following disclaimer in the michael@0: * documentation and/or other materials provided with the distribution. michael@0: * michael@0: * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY michael@0: * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE michael@0: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR michael@0: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR michael@0: * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, michael@0: * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, michael@0: * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR michael@0: * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY michael@0: * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT michael@0: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE michael@0: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. michael@0: */ michael@0: michael@0: #ifndef yarr_YarrParser_h michael@0: #define yarr_YarrParser_h michael@0: michael@0: #include "yarr/Yarr.h" michael@0: michael@0: namespace JSC { namespace Yarr { michael@0: michael@0: enum BuiltInCharacterClassID { michael@0: DigitClassID, michael@0: SpaceClassID, michael@0: WordClassID, michael@0: NewlineClassID michael@0: }; michael@0: michael@0: // The Parser class should not be used directly - only via the Yarr::parse() method. michael@0: template michael@0: class Parser { michael@0: private: michael@0: template michael@0: friend ErrorCode parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit); michael@0: michael@0: /* michael@0: * CharacterClassParserDelegate: michael@0: * michael@0: * The class CharacterClassParserDelegate is used in the parsing of character michael@0: * classes. This class handles detection of character ranges. This class michael@0: * implements enough of the delegate interface such that it can be passed to michael@0: * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused michael@0: * to perform the parsing of escape characters in character sets. michael@0: */ michael@0: class CharacterClassParserDelegate { michael@0: public: michael@0: CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) michael@0: : m_delegate(delegate) michael@0: , m_err(err) michael@0: , m_state(Empty) michael@0: , m_character(0) michael@0: { michael@0: } michael@0: michael@0: /* michael@0: * begin(): michael@0: * michael@0: * Called at beginning of construction. michael@0: */ michael@0: void begin(bool invert) michael@0: { michael@0: m_delegate.atomCharacterClassBegin(invert); michael@0: } michael@0: michael@0: /* michael@0: * atomPatternCharacter(): michael@0: * michael@0: * This method is called either from parseCharacterClass() (for an unescaped michael@0: * character in a character class), or from parseEscape(). In the former case michael@0: * the value true will be passed for the argument 'hyphenIsRange', and in this michael@0: * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ michael@0: * is different to /[a\-z]/). michael@0: */ michael@0: void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) michael@0: { michael@0: switch (m_state) { michael@0: case AfterCharacterClass: michael@0: // Following a builtin character class we need look out for a hyphen. michael@0: // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. michael@0: // If we see a hyphen following a charater class then unlike usual michael@0: // we'll report it to the delegate immediately, and put ourself into michael@0: // a poisoned state. Any following calls to add another character or michael@0: // character class will result in an error. (A hypen following a michael@0: // character-class is itself valid, but only at the end of a regex). michael@0: if (hyphenIsRange && ch == '-') { michael@0: m_delegate.atomCharacterClassAtom('-'); michael@0: m_state = AfterCharacterClassHyphen; michael@0: return; michael@0: } michael@0: // Otherwise just fall through - cached character so treat this as Empty. michael@0: michael@0: case Empty: michael@0: m_character = ch; michael@0: m_state = CachedCharacter; michael@0: return; michael@0: michael@0: case CachedCharacter: michael@0: if (hyphenIsRange && ch == '-') michael@0: m_state = CachedCharacterHyphen; michael@0: else { michael@0: m_delegate.atomCharacterClassAtom(m_character); michael@0: m_character = ch; michael@0: } michael@0: return; michael@0: michael@0: case CachedCharacterHyphen: michael@0: if (ch < m_character) { michael@0: m_err = CharacterClassOutOfOrder; michael@0: return; michael@0: } michael@0: m_delegate.atomCharacterClassRange(m_character, ch); michael@0: m_state = Empty; michael@0: return; michael@0: michael@0: case AfterCharacterClassHyphen: michael@0: m_delegate.atomCharacterClassAtom(ch); michael@0: m_state = Empty; michael@0: return; michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * atomBuiltInCharacterClass(): michael@0: * michael@0: * Adds a built-in character class, called by parseEscape(). michael@0: */ michael@0: void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) michael@0: { michael@0: switch (m_state) { michael@0: case CachedCharacter: michael@0: // Flush the currently cached character, then fall through. michael@0: m_delegate.atomCharacterClassAtom(m_character); michael@0: michael@0: case Empty: michael@0: case AfterCharacterClass: michael@0: m_state = AfterCharacterClass; michael@0: m_delegate.atomCharacterClassBuiltIn(classID, invert); michael@0: return; michael@0: michael@0: case CachedCharacterHyphen: michael@0: // Error! We have a range that looks like [x-\d]. We require michael@0: // the end of the range to be a single character. michael@0: m_err = CharacterClassInvalidRange; michael@0: return; michael@0: case AfterCharacterClassHyphen: michael@0: m_delegate.atomCharacterClassBuiltIn(classID, invert); michael@0: m_state = Empty; michael@0: return; michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * end(): michael@0: * michael@0: * Called at end of construction. michael@0: */ michael@0: void end() michael@0: { michael@0: if (m_state == CachedCharacter) michael@0: m_delegate.atomCharacterClassAtom(m_character); michael@0: else if (m_state == CachedCharacterHyphen) { michael@0: m_delegate.atomCharacterClassAtom(m_character); michael@0: m_delegate.atomCharacterClassAtom('-'); michael@0: } michael@0: m_delegate.atomCharacterClassEnd(); michael@0: } michael@0: michael@0: // parseEscape() should never call these delegate methods when michael@0: // invoked with inCharacterClass set. michael@0: NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); } michael@0: NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); } michael@0: michael@0: private: michael@0: Delegate& m_delegate; michael@0: ErrorCode& m_err; michael@0: enum CharacterClassConstructionState { michael@0: Empty, michael@0: CachedCharacter, michael@0: CachedCharacterHyphen, michael@0: AfterCharacterClass, michael@0: AfterCharacterClassHyphen michael@0: } m_state; michael@0: UChar m_character; michael@0: }; michael@0: michael@0: Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit) michael@0: : m_delegate(delegate) michael@0: , m_backReferenceLimit(backReferenceLimit) michael@0: , m_err(NoError) michael@0: , m_data(pattern.chars()) michael@0: , m_size(pattern.length()) michael@0: , m_index(0) michael@0: , m_parenthesesNestingDepth(0) michael@0: { michael@0: } michael@0: michael@0: /* michael@0: * parseEscape(): michael@0: * michael@0: * Helper for parseTokens() AND parseCharacterClass(). michael@0: * Unlike the other parser methods, this function does not report tokens michael@0: * directly to the member delegate (m_delegate), instead tokens are michael@0: * emitted to the delegate provided as an argument. In the case of atom michael@0: * escapes, parseTokens() will call parseEscape() passing m_delegate as michael@0: * an argument, and as such the escape will be reported to the delegate. michael@0: * michael@0: * However this method may also be used by parseCharacterClass(), in which michael@0: * case a CharacterClassParserDelegate will be passed as the delegate that michael@0: * tokens should be added to. A boolean flag is also provided to indicate michael@0: * whether that an escape in a CharacterClass is being parsed (some parsing michael@0: * rules change in this context). michael@0: * michael@0: * The boolean value returned by this method indicates whether the token michael@0: * parsed was an atom (outside of a characted class \b and \B will be michael@0: * interpreted as assertions). michael@0: */ michael@0: template michael@0: bool parseEscape(EscapeDelegate& delegate) michael@0: { michael@0: ASSERT(!m_err); michael@0: ASSERT(peek() == '\\'); michael@0: consume(); michael@0: michael@0: if (atEndOfPattern()) { michael@0: m_err = EscapeUnterminated; michael@0: return false; michael@0: } michael@0: michael@0: switch (peek()) { michael@0: // Assertions michael@0: case 'b': michael@0: consume(); michael@0: if (inCharacterClass) michael@0: delegate.atomPatternCharacter('\b'); michael@0: else { michael@0: delegate.assertionWordBoundary(false); michael@0: return false; michael@0: } michael@0: break; michael@0: case 'B': michael@0: consume(); michael@0: if (inCharacterClass) michael@0: delegate.atomPatternCharacter('B'); michael@0: else { michael@0: delegate.assertionWordBoundary(true); michael@0: return false; michael@0: } michael@0: break; michael@0: michael@0: // CharacterClassEscape michael@0: case 'd': michael@0: consume(); michael@0: delegate.atomBuiltInCharacterClass(DigitClassID, false); michael@0: break; michael@0: case 's': michael@0: consume(); michael@0: delegate.atomBuiltInCharacterClass(SpaceClassID, false); michael@0: break; michael@0: case 'w': michael@0: consume(); michael@0: delegate.atomBuiltInCharacterClass(WordClassID, false); michael@0: break; michael@0: case 'D': michael@0: consume(); michael@0: delegate.atomBuiltInCharacterClass(DigitClassID, true); michael@0: break; michael@0: case 'S': michael@0: consume(); michael@0: delegate.atomBuiltInCharacterClass(SpaceClassID, true); michael@0: break; michael@0: case 'W': michael@0: consume(); michael@0: delegate.atomBuiltInCharacterClass(WordClassID, true); michael@0: break; michael@0: michael@0: // DecimalEscape michael@0: case '1': michael@0: case '2': michael@0: case '3': michael@0: case '4': michael@0: case '5': michael@0: case '6': michael@0: case '7': michael@0: case '8': michael@0: case '9': { michael@0: // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. michael@0: // First, try to parse this as backreference. michael@0: if (!inCharacterClass) { michael@0: ParseState state = saveState(); michael@0: michael@0: unsigned backReference; michael@0: if (!consumeNumber(backReference)) michael@0: break; michael@0: if (backReference <= m_backReferenceLimit) { michael@0: delegate.atomBackReference(backReference); michael@0: break; michael@0: } michael@0: michael@0: restoreState(state); michael@0: } michael@0: michael@0: // Not a backreference, and not octal. michael@0: if (peek() >= '8') { michael@0: delegate.atomPatternCharacter('\\'); michael@0: break; michael@0: } michael@0: michael@0: // Fall-through to handle this as an octal escape. michael@0: } michael@0: michael@0: // Octal escape michael@0: case '0': michael@0: delegate.atomPatternCharacter(consumeOctal()); michael@0: break; michael@0: michael@0: // ControlEscape michael@0: case 'f': michael@0: consume(); michael@0: delegate.atomPatternCharacter('\f'); michael@0: break; michael@0: case 'n': michael@0: consume(); michael@0: delegate.atomPatternCharacter('\n'); michael@0: break; michael@0: case 'r': michael@0: consume(); michael@0: delegate.atomPatternCharacter('\r'); michael@0: break; michael@0: case 't': michael@0: consume(); michael@0: delegate.atomPatternCharacter('\t'); michael@0: break; michael@0: case 'v': michael@0: consume(); michael@0: delegate.atomPatternCharacter('\v'); michael@0: break; michael@0: michael@0: // ControlLetter michael@0: case 'c': { michael@0: ParseState state = saveState(); michael@0: consume(); michael@0: if (!atEndOfPattern()) { michael@0: int control = consume(); michael@0: michael@0: // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. michael@0: if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { michael@0: delegate.atomPatternCharacter(control & 0x1f); michael@0: break; michael@0: } michael@0: } michael@0: restoreState(state); michael@0: delegate.atomPatternCharacter('\\'); michael@0: break; michael@0: } michael@0: michael@0: // HexEscape michael@0: case 'x': { michael@0: consume(); michael@0: int x = tryConsumeHex(2); michael@0: if (x == -1) michael@0: delegate.atomPatternCharacter('x'); michael@0: else michael@0: delegate.atomPatternCharacter(x); michael@0: break; michael@0: } michael@0: michael@0: // UnicodeEscape michael@0: case 'u': { michael@0: consume(); michael@0: int u = tryConsumeHex(4); michael@0: if (u == -1) michael@0: delegate.atomPatternCharacter('u'); michael@0: else michael@0: delegate.atomPatternCharacter(u); michael@0: break; michael@0: } michael@0: michael@0: // IdentityEscape michael@0: default: michael@0: delegate.atomPatternCharacter(consume()); michael@0: } michael@0: michael@0: return true; michael@0: } michael@0: michael@0: /* michael@0: * parseAtomEscape(), parseCharacterClassEscape(): michael@0: * michael@0: * These methods alias to parseEscape(). michael@0: */ michael@0: bool parseAtomEscape() michael@0: { michael@0: return parseEscape(m_delegate); michael@0: } michael@0: void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) michael@0: { michael@0: parseEscape(delegate); michael@0: } michael@0: michael@0: /* michael@0: * parseCharacterClass(): michael@0: * michael@0: * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) michael@0: * to an instance of CharacterClassParserDelegate, to describe the character class to the michael@0: * delegate. michael@0: */ michael@0: void parseCharacterClass() michael@0: { michael@0: ASSERT(!m_err); michael@0: ASSERT(peek() == '['); michael@0: consume(); michael@0: michael@0: CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); michael@0: michael@0: characterClassConstructor.begin(tryConsume('^')); michael@0: michael@0: while (!atEndOfPattern()) { michael@0: switch (peek()) { michael@0: case ']': michael@0: consume(); michael@0: characterClassConstructor.end(); michael@0: return; michael@0: michael@0: case '\\': michael@0: parseCharacterClassEscape(characterClassConstructor); michael@0: break; michael@0: michael@0: default: michael@0: characterClassConstructor.atomPatternCharacter(consume(), true); michael@0: } michael@0: michael@0: if (m_err) michael@0: return; michael@0: } michael@0: michael@0: m_err = CharacterClassUnmatched; michael@0: } michael@0: michael@0: /* michael@0: * parseParenthesesBegin(): michael@0: * michael@0: * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. michael@0: */ michael@0: void parseParenthesesBegin() michael@0: { michael@0: ASSERT(!m_err); michael@0: ASSERT(peek() == '('); michael@0: consume(); michael@0: michael@0: if (tryConsume('?')) { michael@0: if (atEndOfPattern()) { michael@0: m_err = ParenthesesTypeInvalid; michael@0: return; michael@0: } michael@0: michael@0: switch (consume()) { michael@0: case ':': michael@0: m_delegate.atomParenthesesSubpatternBegin(false); michael@0: break; michael@0: michael@0: case '=': michael@0: m_delegate.atomParentheticalAssertionBegin(); michael@0: break; michael@0: michael@0: case '!': michael@0: m_delegate.atomParentheticalAssertionBegin(true); michael@0: break; michael@0: michael@0: default: michael@0: m_err = ParenthesesTypeInvalid; michael@0: } michael@0: } else michael@0: m_delegate.atomParenthesesSubpatternBegin(); michael@0: michael@0: ++m_parenthesesNestingDepth; michael@0: } michael@0: michael@0: /* michael@0: * parseParenthesesEnd(): michael@0: * michael@0: * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). michael@0: */ michael@0: void parseParenthesesEnd() michael@0: { michael@0: ASSERT(!m_err); michael@0: ASSERT(peek() == ')'); michael@0: consume(); michael@0: michael@0: if (m_parenthesesNestingDepth > 0) michael@0: m_delegate.atomParenthesesEnd(); michael@0: else michael@0: m_err = ParenthesesUnmatched; michael@0: michael@0: --m_parenthesesNestingDepth; michael@0: } michael@0: michael@0: /* michael@0: * parseQuantifier(): michael@0: * michael@0: * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. michael@0: */ michael@0: void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) michael@0: { michael@0: ASSERT(!m_err); michael@0: ASSERT(min <= max); michael@0: michael@0: if (min == UINT_MAX) { michael@0: m_err = QuantifierTooLarge; michael@0: return; michael@0: } michael@0: michael@0: if (lastTokenWasAnAtom) michael@0: m_delegate.quantifyAtom(min, max, !tryConsume('?')); michael@0: else michael@0: m_err = QuantifierWithoutAtom; michael@0: } michael@0: michael@0: /* michael@0: * parseTokens(): michael@0: * michael@0: * This method loops over the input pattern reporting tokens to the delegate. michael@0: * The method returns when a parse error is detected, or the end of the pattern michael@0: * is reached. One piece of state is tracked around the loop, which is whether michael@0: * the last token passed to the delegate was an atom (this is necessary to detect michael@0: * a parse error when a quantifier provided without an atom to quantify). michael@0: */ michael@0: void parseTokens() michael@0: { michael@0: bool lastTokenWasAnAtom = false; michael@0: michael@0: while (!atEndOfPattern()) { michael@0: switch (peek()) { michael@0: case '|': michael@0: consume(); michael@0: m_delegate.disjunction(); michael@0: lastTokenWasAnAtom = false; michael@0: break; michael@0: michael@0: case '(': michael@0: parseParenthesesBegin(); michael@0: lastTokenWasAnAtom = false; michael@0: break; michael@0: michael@0: case ')': michael@0: parseParenthesesEnd(); michael@0: lastTokenWasAnAtom = true; michael@0: break; michael@0: michael@0: case '^': michael@0: consume(); michael@0: m_delegate.assertionBOL(); michael@0: lastTokenWasAnAtom = false; michael@0: break; michael@0: michael@0: case '$': michael@0: consume(); michael@0: m_delegate.assertionEOL(); michael@0: lastTokenWasAnAtom = false; michael@0: break; michael@0: michael@0: case '.': michael@0: consume(); michael@0: m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); michael@0: lastTokenWasAnAtom = true; michael@0: break; michael@0: michael@0: case '[': michael@0: parseCharacterClass(); michael@0: lastTokenWasAnAtom = true; michael@0: break; michael@0: michael@0: case '\\': michael@0: lastTokenWasAnAtom = parseAtomEscape(); michael@0: break; michael@0: michael@0: case '*': michael@0: consume(); michael@0: parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); michael@0: lastTokenWasAnAtom = false; michael@0: break; michael@0: michael@0: case '+': michael@0: consume(); michael@0: parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); michael@0: lastTokenWasAnAtom = false; michael@0: break; michael@0: michael@0: case '?': michael@0: consume(); michael@0: parseQuantifier(lastTokenWasAnAtom, 0, 1); michael@0: lastTokenWasAnAtom = false; michael@0: break; michael@0: michael@0: case '{': { michael@0: ParseState state = saveState(); michael@0: michael@0: consume(); michael@0: if (peekIsDigit()) { michael@0: unsigned min; michael@0: if (!consumeNumber(min)) michael@0: break; michael@0: michael@0: unsigned max = min; michael@0: if (tryConsume(',')) { michael@0: if (peekIsDigit()) { michael@0: if (!consumeNumber(max)) michael@0: break; michael@0: } else { michael@0: max = quantifyInfinite; michael@0: } michael@0: } michael@0: michael@0: if (tryConsume('}')) { michael@0: if (min <= max) michael@0: parseQuantifier(lastTokenWasAnAtom, min, max); michael@0: else michael@0: m_err = QuantifierOutOfOrder; michael@0: lastTokenWasAnAtom = false; michael@0: break; michael@0: } michael@0: } michael@0: michael@0: restoreState(state); michael@0: } // if we did not find a complete quantifer, fall through to the default case. michael@0: michael@0: default: michael@0: m_delegate.atomPatternCharacter(consume()); michael@0: lastTokenWasAnAtom = true; michael@0: } michael@0: michael@0: if (m_err) michael@0: return; michael@0: } michael@0: michael@0: if (m_parenthesesNestingDepth > 0) michael@0: m_err = MissingParentheses; michael@0: } michael@0: michael@0: /* michael@0: * parse(): michael@0: * michael@0: * This method calls parseTokens() to parse over the input and converts any michael@0: * error code to a const char* for a result. michael@0: */ michael@0: ErrorCode parse() michael@0: { michael@0: if (m_size > MAX_PATTERN_SIZE) michael@0: m_err = PatternTooLarge; michael@0: else michael@0: parseTokens(); michael@0: ASSERT(atEndOfPattern() || m_err); michael@0: michael@0: return m_err; michael@0: } michael@0: michael@0: // Misc helper functions: michael@0: michael@0: typedef unsigned ParseState; michael@0: michael@0: ParseState saveState() michael@0: { michael@0: return m_index; michael@0: } michael@0: michael@0: void restoreState(ParseState state) michael@0: { michael@0: m_index = state; michael@0: } michael@0: michael@0: bool atEndOfPattern() michael@0: { michael@0: ASSERT(m_index <= m_size); michael@0: return m_index == m_size; michael@0: } michael@0: michael@0: int peek() michael@0: { michael@0: ASSERT(m_index < m_size); michael@0: return m_data[m_index]; michael@0: } michael@0: michael@0: bool peekIsDigit() michael@0: { michael@0: return !atEndOfPattern() && WTF::isASCIIDigit(peek()); michael@0: } michael@0: michael@0: unsigned peekDigit() michael@0: { michael@0: ASSERT(peekIsDigit()); michael@0: return peek() - '0'; michael@0: } michael@0: michael@0: int consume() michael@0: { michael@0: ASSERT(m_index < m_size); michael@0: return m_data[m_index++]; michael@0: } michael@0: michael@0: unsigned consumeDigit() michael@0: { michael@0: ASSERT(peekIsDigit()); michael@0: return consume() - '0'; michael@0: } michael@0: michael@0: bool consumeNumber(unsigned &accum) michael@0: { michael@0: accum = consumeDigit(); michael@0: while (peekIsDigit()) { michael@0: unsigned newValue = accum * 10 + peekDigit(); michael@0: if (newValue < accum) { /* Overflow check. */ michael@0: m_err = QuantifierTooLarge; michael@0: return false; michael@0: } michael@0: accum = newValue; michael@0: consume(); michael@0: } michael@0: return true; michael@0: } michael@0: michael@0: unsigned consumeOctal() michael@0: { michael@0: ASSERT(WTF::isASCIIOctalDigit(peek())); michael@0: michael@0: unsigned n = consumeDigit(); michael@0: while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) michael@0: n = n * 8 + consumeDigit(); michael@0: return n; michael@0: } michael@0: michael@0: bool tryConsume(UChar ch) michael@0: { michael@0: if (atEndOfPattern() || (m_data[m_index] != ch)) michael@0: return false; michael@0: ++m_index; michael@0: return true; michael@0: } michael@0: michael@0: int tryConsumeHex(int count) michael@0: { michael@0: ParseState state = saveState(); michael@0: michael@0: int n = 0; michael@0: while (count--) { michael@0: if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { michael@0: restoreState(state); michael@0: return -1; michael@0: } michael@0: n = (n << 4) | WTF::toASCIIHexValue(consume()); michael@0: } michael@0: return n; michael@0: } michael@0: michael@0: Delegate& m_delegate; michael@0: unsigned m_backReferenceLimit; michael@0: ErrorCode m_err; michael@0: const CharType* m_data; michael@0: unsigned m_size; michael@0: unsigned m_index; michael@0: unsigned m_parenthesesNestingDepth; michael@0: michael@0: // Derived by empirical testing of compile time in PCRE and WREC. michael@0: static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; michael@0: }; michael@0: michael@0: /* michael@0: * Yarr::parse(): michael@0: * michael@0: * The parse method is passed a pattern to be parsed and a delegate upon which michael@0: * callbacks will be made to record the parsed tokens forming the regex. michael@0: * Yarr::parse() returns null on success, or a const C string providing an error michael@0: * message where a parse error occurs. michael@0: * michael@0: * The Delegate must implement the following interface: michael@0: * michael@0: * void assertionBOL(); michael@0: * void assertionEOL(); michael@0: * void assertionWordBoundary(bool invert); michael@0: * michael@0: * void atomPatternCharacter(UChar ch); michael@0: * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); michael@0: * void atomCharacterClassBegin(bool invert) michael@0: * void atomCharacterClassAtom(UChar ch) michael@0: * void atomCharacterClassRange(UChar begin, UChar end) michael@0: * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) michael@0: * void atomCharacterClassEnd() michael@0: * void atomParenthesesSubpatternBegin(bool capture = true); michael@0: * void atomParentheticalAssertionBegin(bool invert = false); michael@0: * void atomParenthesesEnd(); michael@0: * void atomBackReference(unsigned subpatternId); michael@0: * michael@0: * void quantifyAtom(unsigned min, unsigned max, bool greedy); michael@0: * michael@0: * void disjunction(); michael@0: * michael@0: * The regular expression is described by a sequence of assertion*() and atom*() michael@0: * callbacks to the delegate, describing the terms in the regular expression. michael@0: * Following an atom a quantifyAtom() call may occur to indicate that the previous michael@0: * atom should be quantified. In the case of atoms described across multiple michael@0: * calls (parentheses and character classes) the call to quantifyAtom() will come michael@0: * after the call to the atom*End() method, never after atom*Begin(). michael@0: * michael@0: * Character classes may either be described by a single call to michael@0: * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. michael@0: * In the latter case, ...Begin() will be called, followed by a sequence of michael@0: * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). michael@0: * michael@0: * Sequences of atoms and assertions are broken into alternatives via calls to michael@0: * disjunction(). Assertions, atoms, and disjunctions emitted between calls to michael@0: * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. michael@0: * atomParenthesesBegin() is passed a subpatternId. In the case of a regular michael@0: * capturing subpattern, this will be the subpatternId associated with these michael@0: * parentheses, and will also by definition be the lowest subpatternId of these michael@0: * parentheses and of any nested paretheses. The atomParenthesesEnd() method michael@0: * is passed the subpatternId of the last capturing subexpression nested within michael@0: * these paretheses. In the case of a capturing subpattern with no nested michael@0: * capturing subpatterns, the same subpatternId will be passed to the begin and michael@0: * end functions. In the case of non-capturing subpatterns the subpatternId michael@0: * passed to the begin method is also the first possible subpatternId that might michael@0: * be nested within these paretheses. If a set of non-capturing parentheses does michael@0: * not contain any capturing subpatterns, then the subpatternId passed to begin michael@0: * will be greater than the subpatternId passed to end. michael@0: */ michael@0: michael@0: template michael@0: ErrorCode parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite) michael@0: { michael@0: #ifdef YARR_8BIT_CHAR_SUPPORT michael@0: if (pattern.is8Bit()) michael@0: return Parser(delegate, pattern, backReferenceLimit).parse(); michael@0: #endif michael@0: return Parser(delegate, pattern, backReferenceLimit).parse(); michael@0: } michael@0: michael@0: } } // namespace JSC::Yarr michael@0: michael@0: #endif /* yarr_YarrParser_h */