The Tor Browser: comparison js/src/yarr/YarrParser.h

--1:000000000000
+:51524ab7b5dc
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+* vim: set ts=8 sts=4 et sw=4 tw=99:
+*
+* Copyright (C) 2009 Apple Inc. All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+* 1. Redistributions of source code must retain the above copyright
+*    notice, this list of conditions and the following disclaimer.
+* 2. Redistributions in binary form must reproduce the above copyright
+*    notice, this list of conditions and the following disclaimer in the
+*    documentation and/or other materials provided with the distribution.
+*
+* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+* PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#ifndef yarr_YarrParser_h
+#define yarr_YarrParser_h
+#include "yarr/Yarr.h"
+namespace JSC { namespace Yarr {
+enum BuiltInCharacterClassID {
+DigitClassID,
+SpaceClassID,
+WordClassID,
+NewlineClassID
+};
+// The Parser class should not be used directly - only via the Yarr::parse() method.
+template<class Delegate, typename CharType>
+class Parser {
+private:
+template<class FriendDelegate>
+friend ErrorCode parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit);
+/*
+* CharacterClassParserDelegate:
+*
+* The class CharacterClassParserDelegate is used in the parsing of character
+* classes.  This class handles detection of character ranges.  This class
+* implements enough of the delegate interface such that it can be passed to
+* parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
+* to perform the parsing of escape characters in character sets.
+*/
+class CharacterClassParserDelegate {
+public:
+CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
+: m_delegate(delegate)
+, m_err(err)
+, m_state(Empty)
+, m_character(0)
+{
+}
+/*
+* begin():
+*
+* Called at beginning of construction.
+*/
+void begin(bool invert)
+{
+m_delegate.atomCharacterClassBegin(invert);
+}
+/*
+* atomPatternCharacter():
+*
+* This method is called either from parseCharacterClass() (for an unescaped
+* character in a character class), or from parseEscape(). In the former case
+* the value true will be passed for the argument 'hyphenIsRange', and in this
+* mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
+* is different to /[a\-z]/).
+*/
+void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
+{
+switch (m_state) {
+case AfterCharacterClass:
+// Following a builtin character class we need look out for a hyphen.
+// We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
+// If we see a hyphen following a charater class then unlike usual
+// we'll report it to the delegate immediately, and put ourself into
+// a poisoned state. Any following calls to add another character or
+// character class will result in an error. (A hypen following a
+// character-class is itself valid, but only  at the end of a regex).
+if (hyphenIsRange && ch == '-') {
+m_delegate.atomCharacterClassAtom('-');
+m_state = AfterCharacterClassHyphen;
+return;
+}
+// Otherwise just fall through - cached character so treat this as Empty.
+case Empty:
+m_character = ch;
+m_state = CachedCharacter;
+return;
+case CachedCharacter:
+if (hyphenIsRange && ch == '-')
+m_state = CachedCharacterHyphen;
+else {
+m_delegate.atomCharacterClassAtom(m_character);
+m_character = ch;
+}
+return;
+case CachedCharacterHyphen:
+if (ch < m_character) {
+m_err = CharacterClassOutOfOrder;
+return;
+}
+m_delegate.atomCharacterClassRange(m_character, ch);
+m_state = Empty;
+return;
+case AfterCharacterClassHyphen:
+m_delegate.atomCharacterClassAtom(ch);
+m_state = Empty;
+return;
+}
+}
+/*
+* atomBuiltInCharacterClass():
+*
+* Adds a built-in character class, called by parseEscape().
+*/
+void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
+{
+switch (m_state) {
+case CachedCharacter:
+// Flush the currently cached character, then fall through.
+m_delegate.atomCharacterClassAtom(m_character);
+case Empty:
+case AfterCharacterClass:
+m_state = AfterCharacterClass;
+m_delegate.atomCharacterClassBuiltIn(classID, invert);
+return;
+case CachedCharacterHyphen:
+// Error! We have a range that looks like [x-\d]. We require
+// the end of the range to be a single character.
+m_err = CharacterClassInvalidRange;
+return;
+case AfterCharacterClassHyphen:
+m_delegate.atomCharacterClassBuiltIn(classID, invert);
+m_state = Empty;
+return;
+}
+}
+/*
+* end():
+*
+* Called at end of construction.
+*/
+void end()
+{
+if (m_state == CachedCharacter)
+m_delegate.atomCharacterClassAtom(m_character);
+else if (m_state == CachedCharacterHyphen) {
+m_delegate.atomCharacterClassAtom(m_character);
+m_delegate.atomCharacterClassAtom('-');
+}
+m_delegate.atomCharacterClassEnd();
+}
+// parseEscape() should never call these delegate methods when
+// invoked with inCharacterClass set.
+NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
+NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
+private:
+Delegate& m_delegate;
+ErrorCode& m_err;
+enum CharacterClassConstructionState {
+Empty,
+CachedCharacter,
+CachedCharacterHyphen,
+AfterCharacterClass,
+AfterCharacterClassHyphen
+} m_state;
+UChar m_character;
+};
+Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit)
+: m_delegate(delegate)
+, m_backReferenceLimit(backReferenceLimit)
+, m_err(NoError)
+, m_data(pattern.chars())
+, m_size(pattern.length())
+, m_index(0)
+, m_parenthesesNestingDepth(0)
+{
+}
+/*
+* parseEscape():
+*
+* Helper for parseTokens() AND parseCharacterClass().
+* Unlike the other parser methods, this function does not report tokens
+* directly to the member delegate (m_delegate), instead tokens are
+* emitted to the delegate provided as an argument.  In the case of atom
+* escapes, parseTokens() will call parseEscape() passing m_delegate as
+* an argument, and as such the escape will be reported to the delegate.
+*
+* However this method may also be used by parseCharacterClass(), in which
+* case a CharacterClassParserDelegate will be passed as the delegate that
+* tokens should be added to.  A boolean flag is also provided to indicate
+* whether that an escape in a CharacterClass is being parsed (some parsing
+* rules change in this context).
+*
+* The boolean value returned by this method indicates whether the token
+* parsed was an atom (outside of a characted class \b and \B will be
+* interpreted as assertions).
+*/
+template<bool inCharacterClass, class EscapeDelegate>
+bool parseEscape(EscapeDelegate& delegate)
+{
+ASSERT(!m_err);
+ASSERT(peek() == '\\');
+consume();
+if (atEndOfPattern()) {
+m_err = EscapeUnterminated;
+return false;
+}
+switch (peek()) {
+// Assertions
+case 'b':
+consume();
+if (inCharacterClass)
+delegate.atomPatternCharacter('\b');
+else {
+delegate.assertionWordBoundary(false);
+return false;
+}
+break;
+case 'B':
+consume();
+if (inCharacterClass)
+delegate.atomPatternCharacter('B');
+else {
+delegate.assertionWordBoundary(true);
+return false;
+}
+break;
+// CharacterClassEscape
+case 'd':
+consume();
+delegate.atomBuiltInCharacterClass(DigitClassID, false);
+break;
+case 's':
+consume();
+delegate.atomBuiltInCharacterClass(SpaceClassID, false);
+break;
+case 'w':
+consume();
+delegate.atomBuiltInCharacterClass(WordClassID, false);
+break;
+case 'D':
+consume();
+delegate.atomBuiltInCharacterClass(DigitClassID, true);
+break;
+case 'S':
+consume();
+delegate.atomBuiltInCharacterClass(SpaceClassID, true);
+break;
+case 'W':
+consume();
+delegate.atomBuiltInCharacterClass(WordClassID, true);
+break;
+// DecimalEscape
+case '1':
+case '2':
+case '3':
+case '4':
+case '5':
+case '6':
+case '7':
+case '8':
+case '9': {
+// To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
+// First, try to parse this as backreference.
+if (!inCharacterClass) {
+ParseState state = saveState();
+unsigned backReference;
+if (!consumeNumber(backReference))
+break;
+if (backReference <= m_backReferenceLimit) {
+delegate.atomBackReference(backReference);
+break;
+}
+restoreState(state);
+}
+// Not a backreference, and not octal.
+if (peek() >= '8') {
+delegate.atomPatternCharacter('\\');
+break;
+}
+// Fall-through to handle this as an octal escape.
+}
+// Octal escape
+case '0':
+delegate.atomPatternCharacter(consumeOctal());
+break;
+// ControlEscape
+case 'f':
+consume();
+delegate.atomPatternCharacter('\f');
+break;
+case 'n':
+consume();
+delegate.atomPatternCharacter('\n');
+break;
+case 'r':
+consume();
+delegate.atomPatternCharacter('\r');
+break;
+case 't':
+consume();
+delegate.atomPatternCharacter('\t');
+break;
+case 'v':
+consume();
+delegate.atomPatternCharacter('\v');
+break;
+// ControlLetter
+case 'c': {
+ParseState state = saveState();
+consume();
+if (!atEndOfPattern()) {
+int control = consume();
+// To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
+if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
+delegate.atomPatternCharacter(control & 0x1f);
+break;
+}
+}
+restoreState(state);
+delegate.atomPatternCharacter('\\');
+break;
+}
+// HexEscape
+case 'x': {
+consume();
+int x = tryConsumeHex(2);
+if (x == -1)
+delegate.atomPatternCharacter('x');
+else
+delegate.atomPatternCharacter(x);
+break;
+}
+// UnicodeEscape
+case 'u': {
+consume();
+int u = tryConsumeHex(4);
+if (u == -1)
+delegate.atomPatternCharacter('u');
+else
+delegate.atomPatternCharacter(u);
+break;
+}
+// IdentityEscape
+default:
+delegate.atomPatternCharacter(consume());
+}
+return true;
+}
+/*
+* parseAtomEscape(), parseCharacterClassEscape():
+*
+* These methods alias to parseEscape().
+*/
+bool parseAtomEscape()
+{
+return parseEscape<false>(m_delegate);
+}
+void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
+{
+parseEscape<true>(delegate);
+}
+/*
+* parseCharacterClass():
+*
+* Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
+* to an instance of CharacterClassParserDelegate, to describe the character class to the
+* delegate.
+*/
+void parseCharacterClass()
+{
+ASSERT(!m_err);
+ASSERT(peek() == '[');
+consume();
+CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
+characterClassConstructor.begin(tryConsume('^'));
+while (!atEndOfPattern()) {
+switch (peek()) {
+case ']':
+consume();
+characterClassConstructor.end();
+return;
+case '\\':
+parseCharacterClassEscape(characterClassConstructor);
+break;
+default:
+characterClassConstructor.atomPatternCharacter(consume(), true);
+}
+if (m_err)
+return;
+}
+m_err = CharacterClassUnmatched;
+}
+/*
+* parseParenthesesBegin():
+*
+* Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
+*/
+void parseParenthesesBegin()
+{
+ASSERT(!m_err);
+ASSERT(peek() == '(');
+consume();
+if (tryConsume('?')) {
+if (atEndOfPattern()) {
+m_err = ParenthesesTypeInvalid;
+return;
+}
+switch (consume()) {
+case ':':
+m_delegate.atomParenthesesSubpatternBegin(false);
+break;
+case '=':
+m_delegate.atomParentheticalAssertionBegin();
+break;
+case '!':
+m_delegate.atomParentheticalAssertionBegin(true);
+break;
+default:
+m_err = ParenthesesTypeInvalid;
+}
+} else
+m_delegate.atomParenthesesSubpatternBegin();
+++m_parenthesesNestingDepth;
+}
+/*
+* parseParenthesesEnd():
+*
+* Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
+*/
+void parseParenthesesEnd()
+{
+ASSERT(!m_err);
+ASSERT(peek() == ')');
+consume();
+if (m_parenthesesNestingDepth > 0)
+m_delegate.atomParenthesesEnd();
+else
+m_err = ParenthesesUnmatched;
+--m_parenthesesNestingDepth;
+}
+/*
+* parseQuantifier():
+*
+* Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
+*/
+void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
+{
+ASSERT(!m_err);
+ASSERT(min <= max);
+if (min == UINT_MAX) {
+m_err = QuantifierTooLarge;
+return;
+}
+if (lastTokenWasAnAtom)
+m_delegate.quantifyAtom(min, max, !tryConsume('?'));
+else
+m_err = QuantifierWithoutAtom;
+}
+/*
+* parseTokens():
+*
+* This method loops over the input pattern reporting tokens to the delegate.
+* The method returns when a parse error is detected, or the end of the pattern
+* is reached.  One piece of state is tracked around the loop, which is whether
+* the last token passed to the delegate was an atom (this is necessary to detect
+* a parse error when a quantifier provided without an atom to quantify).
+*/
+void parseTokens()
+{
+bool lastTokenWasAnAtom = false;
+while (!atEndOfPattern()) {
+switch (peek()) {
+case '|':
+consume();
+m_delegate.disjunction();
+lastTokenWasAnAtom = false;
+break;
+case '(':
+parseParenthesesBegin();
+lastTokenWasAnAtom = false;
+break;
+case ')':
+parseParenthesesEnd();
+lastTokenWasAnAtom = true;
+break;
+case '^':
+consume();
+m_delegate.assertionBOL();
+lastTokenWasAnAtom = false;
+break;
+case '$':
+consume();
+m_delegate.assertionEOL();
+lastTokenWasAnAtom = false;
+break;
+case '.':
+consume();
+m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
+lastTokenWasAnAtom = true;
+break;
+case '[':
+parseCharacterClass();
+lastTokenWasAnAtom = true;
+break;
+case '\\':
+lastTokenWasAnAtom = parseAtomEscape();
+break;
+case '*':
+consume();
+parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
+lastTokenWasAnAtom = false;
+break;
+case '+':
+consume();
+parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
+lastTokenWasAnAtom = false;
+break;
+case '?':
+consume();
+parseQuantifier(lastTokenWasAnAtom, 0, 1);
+lastTokenWasAnAtom = false;
+break;
+case '{': {
+ParseState state = saveState();
+consume();
+if (peekIsDigit()) {
+unsigned min;
+if (!consumeNumber(min))
+break;
+unsigned max = min;
+if (tryConsume(',')) {
+if (peekIsDigit()) {
+if (!consumeNumber(max))
+break;
+} else {
+max = quantifyInfinite;
+}
+}
+if (tryConsume('}')) {
+if (min <= max)
+parseQuantifier(lastTokenWasAnAtom, min, max);
+else
+m_err = QuantifierOutOfOrder;
+lastTokenWasAnAtom = false;
+break;
+}
+}
+restoreState(state);
+} // if we did not find a complete quantifer, fall through to the default case.
+default:
+m_delegate.atomPatternCharacter(consume());
+lastTokenWasAnAtom = true;
+}
+if (m_err)
+return;
+}
+if (m_parenthesesNestingDepth > 0)
+m_err = MissingParentheses;
+}
+/*
+* parse():
+*
+* This method calls parseTokens() to parse over the input and converts any
+* error code to a const char* for a result.
+*/
+ErrorCode parse()
+{
+if (m_size > MAX_PATTERN_SIZE)
+m_err = PatternTooLarge;
+else
+parseTokens();
+ASSERT(atEndOfPattern() || m_err);
+return m_err;
+}
+// Misc helper functions:
+typedef unsigned ParseState;
+ParseState saveState()
+{
+return m_index;
+}
+void restoreState(ParseState state)
+{
+m_index = state;
+}
+bool atEndOfPattern()
+{
+ASSERT(m_index <= m_size);
+return m_index == m_size;
+}
+int peek()
+{
+ASSERT(m_index < m_size);
+return m_data[m_index];
+}
+bool peekIsDigit()
+{
+return !atEndOfPattern() && WTF::isASCIIDigit(peek());
+}
+unsigned peekDigit()
+{
+ASSERT(peekIsDigit());
+return peek() - '0';
+}
+int consume()
+{
+ASSERT(m_index < m_size);
+return m_data[m_index++];
+}
+unsigned consumeDigit()
+{
+ASSERT(peekIsDigit());
+return consume() - '0';
+}
+bool consumeNumber(unsigned &accum)
+{
+accum = consumeDigit();
+while (peekIsDigit()) {
+unsigned newValue = accum * 10 + peekDigit();
+if (newValue < accum) { /* Overflow check. */
+m_err = QuantifierTooLarge;
+return false;
+}
+accum = newValue;
+consume();
+}
+return true;
+}
+unsigned consumeOctal()
+{
+ASSERT(WTF::isASCIIOctalDigit(peek()));
+unsigned n = consumeDigit();
+while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
+n = n * 8 + consumeDigit();
+return n;
+}
+bool tryConsume(UChar ch)
+{
+if (atEndOfPattern() || (m_data[m_index] != ch))
+return false;
+++m_index;
+return true;
+}
+int tryConsumeHex(int count)
+{
+ParseState state = saveState();
+int n = 0;
+while (count--) {
+if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
+restoreState(state);
+return -1;
+}
+n = (n << 4) | WTF::toASCIIHexValue(consume());
+}
+return n;
+}
+Delegate& m_delegate;
+unsigned m_backReferenceLimit;
+ErrorCode m_err;
+const CharType* m_data;
+unsigned m_size;
+unsigned m_index;
+unsigned m_parenthesesNestingDepth;
+// Derived by empirical testing of compile time in PCRE and WREC.
+static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
+};
+/*
+* Yarr::parse():
+*
+* The parse method is passed a pattern to be parsed and a delegate upon which
+* callbacks will be made to record the parsed tokens forming the regex.
+* Yarr::parse() returns null on success, or a const C string providing an error
+* message where a parse error occurs.
+*
+* The Delegate must implement the following interface:
+*
+*    void assertionBOL();
+*    void assertionEOL();
+*    void assertionWordBoundary(bool invert);
+*
+*    void atomPatternCharacter(UChar ch);
+*    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
+*    void atomCharacterClassBegin(bool invert)
+*    void atomCharacterClassAtom(UChar ch)
+*    void atomCharacterClassRange(UChar begin, UChar end)
+*    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
+*    void atomCharacterClassEnd()
+*    void atomParenthesesSubpatternBegin(bool capture = true);
+*    void atomParentheticalAssertionBegin(bool invert = false);
+*    void atomParenthesesEnd();
+*    void atomBackReference(unsigned subpatternId);
+*
+*    void quantifyAtom(unsigned min, unsigned max, bool greedy);
+*
+*    void disjunction();
+*
+* The regular expression is described by a sequence of assertion*() and atom*()
+* callbacks to the delegate, describing the terms in the regular expression.
+* Following an atom a quantifyAtom() call may occur to indicate that the previous
+* atom should be quantified.  In the case of atoms described across multiple
+* calls (parentheses and character classes) the call to quantifyAtom() will come
+* after the call to the atom*End() method, never after atom*Begin().
+*
+* Character classes may either be described by a single call to
+* atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
+* In the latter case, ...Begin() will be called, followed by a sequence of
+* calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
+*
+* Sequences of atoms and assertions are broken into alternatives via calls to
+* disjunction().  Assertions, atoms, and disjunctions emitted between calls to
+* atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
+* atomParenthesesBegin() is passed a subpatternId.  In the case of a regular
+* capturing subpattern, this will be the subpatternId associated with these
+* parentheses, and will also by definition be the lowest subpatternId of these
+* parentheses and of any nested paretheses.  The atomParenthesesEnd() method
+* is passed the subpatternId of the last capturing subexpression nested within
+* these paretheses.  In the case of a capturing subpattern with no nested
+* capturing subpatterns, the same subpatternId will be passed to the begin and
+* end functions.  In the case of non-capturing subpatterns the subpatternId
+* passed to the begin method is also the first possible subpatternId that might
+* be nested within these paretheses.  If a set of non-capturing parentheses does
+* not contain any capturing subpatterns, then the subpatternId passed to begin
+* will be greater than the subpatternId passed to end.
+*/
+template<class Delegate>
+ErrorCode parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite)
+{
+#ifdef YARR_8BIT_CHAR_SUPPORT
+if (pattern.is8Bit())
+return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse();
+#endif
+return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse();
+}
+} } // namespace JSC::Yarr
+#endif /* yarr_YarrParser_h */

The Tor Browser / file comparison

comparison: js/src/yarr/YarrParser.h

js/src/yarr/YarrParser.h