js/src/yarr/YarrParser.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/js/src/yarr/YarrParser.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,849 @@
     1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
     1.5 + * vim: set ts=8 sts=4 et sw=4 tw=99:
     1.6 + *
     1.7 + * Copyright (C) 2009 Apple Inc. All rights reserved.
     1.8 + *
     1.9 + * Redistribution and use in source and binary forms, with or without
    1.10 + * modification, are permitted provided that the following conditions
    1.11 + * are met:
    1.12 + * 1. Redistributions of source code must retain the above copyright
    1.13 + *    notice, this list of conditions and the following disclaimer.
    1.14 + * 2. Redistributions in binary form must reproduce the above copyright
    1.15 + *    notice, this list of conditions and the following disclaimer in the
    1.16 + *    documentation and/or other materials provided with the distribution.
    1.17 + *
    1.18 + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
    1.19 + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    1.20 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
    1.21 + * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
    1.22 + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
    1.23 + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
    1.24 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
    1.25 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
    1.26 + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    1.27 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    1.28 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    1.29 + */
    1.30 +
    1.31 +#ifndef yarr_YarrParser_h
    1.32 +#define yarr_YarrParser_h
    1.33 +
    1.34 +#include "yarr/Yarr.h"
    1.35 +
    1.36 +namespace JSC { namespace Yarr {
    1.37 +
    1.38 +enum BuiltInCharacterClassID {
    1.39 +    DigitClassID,   // \d, or \D when inverted
    1.40 +    SpaceClassID,   // \s, or \S when inverted
    1.41 +    WordClassID,    // \w, or \W when inverted
    1.42 +    NewlineClassID  // reported inverted for '.' by parseTokens()
    1.43 +};
    1.44 +
    1.45 +// The Parser class should not be used directly - only via the Yarr::parse() method.
    1.46 +template<class Delegate, typename CharType>
    1.47 +class Parser {
    1.48 +private:
    1.49 +    template<class FriendDelegate>
    1.50 +    friend ErrorCode parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit);
    1.51 +
    1.52 +    /*
    1.53 +     * CharacterClassParserDelegate:
    1.54 +     *
    1.55 +     * The class CharacterClassParserDelegate is used in the parsing of character
    1.56 +     * classes.  This class handles detection of character ranges.  This class
    1.57 +     * implements enough of the delegate interface such that it can be passed to
    1.58 +     * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
    1.59 +     * to perform the parsing of escape characters in character sets.
    1.60 +     */
    1.61 +    class CharacterClassParserDelegate {
    1.62 +    public:
    1.63 +        CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
    1.64 +            : m_delegate(delegate)
    1.65 +            , m_err(err)
    1.66 +            , m_state(Empty)
    1.67 +            , m_character(0)
    1.68 +        {
    1.69 +        }
    1.70 +
    1.71 +        /*
    1.72 +         * begin():
    1.73 +         *
    1.74 +         * Called at beginning of construction.
    1.75 +         */
    1.76 +        void begin(bool invert)
    1.77 +        {
    1.78 +            m_delegate.atomCharacterClassBegin(invert);
    1.79 +        }
    1.80 +
    1.81 +        /*
    1.82 +         * atomPatternCharacter():
    1.83 +         *
    1.84 +         * This method is called either from parseCharacterClass() (for an unescaped
    1.85 +         * character in a character class), or from parseEscape(). In the former case
    1.86 +         * the value true will be passed for the argument 'hyphenIsRange', and in this
    1.87 +         * mode we will allow a hyphen to be treated as indicating a range (i.e. /[a-z]/
    1.88 +         * is different to /[a\-z]/).
    1.89 +         */
    1.90 +        void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
    1.91 +        {
    1.92 +            switch (m_state) {
    1.93 +            case AfterCharacterClass:
    1.94 +                // Following a builtin character class we need look out for a hyphen,
    1.95 +                // i.e. would-be ranges such as /[\d-x]/ or /[\d-\d]/, whose start is
    1.96 +                // a class rather than a single character.  If we see a hyphen after a
    1.97 +                // character class then, unlike usual, we report it to the delegate
    1.98 +                // immediately as a literal '-' and move to AfterCharacterClassHyphen.
    1.99 +                // In that state the next character or builtin class is simply added
   1.100 +                // as a plain atom and the state returns to Empty; no error is raised.
   1.101 +                if (hyphenIsRange && ch == '-') {
   1.102 +                    m_delegate.atomCharacterClassAtom('-');
   1.103 +                    m_state = AfterCharacterClassHyphen;
   1.104 +                    return;
   1.105 +                }
   1.106 +                // Otherwise just fall through - cached character so treat this as Empty.
   1.107 +
   1.108 +            case Empty:
   1.109 +                m_character = ch; // hold the character back: it may turn out to start a range
   1.110 +                m_state = CachedCharacter;
   1.111 +                return;
   1.112 +
   1.113 +            case CachedCharacter:
   1.114 +                if (hyphenIsRange && ch == '-')
   1.115 +                    m_state = CachedCharacterHyphen;
   1.116 +                else {
   1.117 +                    m_delegate.atomCharacterClassAtom(m_character); // flush the old character, cache the new one
   1.118 +                    m_character = ch;
   1.119 +                }
   1.120 +                return;
   1.121 +
   1.122 +            case CachedCharacterHyphen:
   1.123 +                if (ch < m_character) { // range end sorts before range start, e.g. /[z-a]/
   1.124 +                    m_err = CharacterClassOutOfOrder;
   1.125 +                    return;
   1.126 +                }
   1.127 +                m_delegate.atomCharacterClassRange(m_character, ch);
   1.128 +                m_state = Empty;
   1.129 +                return;
   1.130 +
   1.131 +            case AfterCharacterClassHyphen:
   1.132 +                m_delegate.atomCharacterClassAtom(ch); // the '-' was already reported; add ch as a plain atom
   1.133 +                m_state = Empty;
   1.134 +                return;
   1.135 +            }
   1.136 +        }
   1.137 +
   1.138 +        /*
   1.139 +         * atomBuiltInCharacterClass():
   1.140 +         *
   1.141 +         * Adds a built-in character class, called by parseEscape().
   1.142 +         */
   1.143 +        void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
   1.144 +        {
   1.145 +            switch (m_state) {
   1.146 +            case CachedCharacter:
   1.147 +                // Flush the currently cached character, then fall through.
   1.148 +                m_delegate.atomCharacterClassAtom(m_character);
   1.149 +
   1.150 +            case Empty:
   1.151 +            case AfterCharacterClass:
   1.152 +                m_state = AfterCharacterClass;
   1.153 +                m_delegate.atomCharacterClassBuiltIn(classID, invert);
   1.154 +                return;
   1.155 +
   1.156 +            case CachedCharacterHyphen:
   1.157 +                // Error! We have a range that looks like [x-\d]. We require
   1.158 +                // the end of the range to be a single character.
   1.159 +                m_err = CharacterClassInvalidRange;
   1.160 +                return;
   1.161 +            case AfterCharacterClassHyphen:
   1.162 +                m_delegate.atomCharacterClassBuiltIn(classID, invert); // e.g. /[\d-\w]/: the '-' was already reported
   1.163 +                m_state = Empty;
   1.164 +                return;
   1.165 +            }
   1.166 +        }
   1.167 +
   1.168 +        /*
   1.169 +         * end():
   1.170 +         *
   1.171 +         * Called at end of construction.
   1.172 +         */
   1.173 +        void end()
   1.174 +        {
   1.175 +            if (m_state == CachedCharacter)
   1.176 +                m_delegate.atomCharacterClassAtom(m_character); // flush the pending character
   1.177 +            else if (m_state == CachedCharacterHyphen) {
   1.178 +                m_delegate.atomCharacterClassAtom(m_character); // a trailing hyphen, as in /[a-]/,
   1.179 +                m_delegate.atomCharacterClassAtom('-');         // is reported as a literal '-'
   1.180 +            }
   1.181 +            m_delegate.atomCharacterClassEnd();
   1.182 +        }
   1.183 +
   1.184 +        // parseEscape() should never call these delegate methods when
   1.185 +        // invoked with inCharacterClass set.
   1.186 +        NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
   1.187 +        NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
   1.188 +
   1.189 +    private:
   1.190 +        Delegate& m_delegate;
   1.191 +        ErrorCode& m_err;
   1.192 +        enum CharacterClassConstructionState {
   1.193 +            Empty,                     // nothing pending
   1.194 +            CachedCharacter,           // m_character holds a character not yet reported
   1.195 +            CachedCharacterHyphen,     // m_character was followed by a range-forming '-'
   1.196 +            AfterCharacterClass,       // a builtin class was just reported
   1.197 +            AfterCharacterClassHyphen  // a builtin class was followed by a '-' (already reported)
   1.198 +        } m_state;
   1.199 +        UChar m_character;
   1.200 +    };
   1.201 +
   1.202 +    Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit)
   1.203 +        : m_delegate(delegate)
   1.204 +        , m_backReferenceLimit(backReferenceLimit) // largest \N that parseEscape() reports as a backreference
   1.205 +        , m_err(NoError)
   1.206 +        , m_data(pattern.chars())
   1.207 +        , m_size(pattern.length())
   1.208 +        , m_index(0) // parsing starts at the first character
   1.209 +        , m_parenthesesNestingDepth(0)
   1.210 +    {
   1.211 +    }
   1.212 +
   1.213 +    /*
   1.214 +     * parseEscape():
   1.215 +     *
   1.216 +     * Helper for parseTokens() AND parseCharacterClass().
   1.217 +     * Unlike the other parser methods, this function does not report tokens
   1.218 +     * directly to the member delegate (m_delegate), instead tokens are
   1.219 +     * emitted to the delegate provided as an argument.  In the case of atom
   1.220 +     * escapes, parseTokens() will call parseEscape() passing m_delegate as
   1.221 +     * an argument, and as such the escape will be reported to the delegate.
   1.222 +     *
   1.223 +     * However this method may also be used by parseCharacterClass(), in which
   1.224 +     * case a CharacterClassParserDelegate will be passed as the delegate that
   1.225 +     * tokens should be added to.  A boolean flag (the inCharacterClass template
   1.226 +     * argument) is also provided to indicate whether an escape in a
   1.227 +     * CharacterClass is being parsed (some parsing rules change in this context).
   1.228 +     *
   1.229 +     * The boolean value returned by this method indicates whether the token
   1.230 +     * parsed was an atom (outside of a character class \b and \B will be
   1.231 +     * interpreted as assertions).
   1.232 +     */
   1.233 +    template<bool inCharacterClass, class EscapeDelegate>
   1.234 +    bool parseEscape(EscapeDelegate& delegate)
   1.235 +    {
   1.236 +        ASSERT(!m_err);
   1.237 +        ASSERT(peek() == '\\');
   1.238 +        consume();
   1.239 +
   1.240 +        if (atEndOfPattern()) {
   1.241 +            m_err = EscapeUnterminated;
   1.242 +            return false;
   1.243 +        }
   1.244 +
   1.245 +        switch (peek()) {
   1.246 +        // Assertions
   1.247 +        case 'b':
   1.248 +            consume();
   1.249 +            if (inCharacterClass)
   1.250 +                delegate.atomPatternCharacter('\b'); // inside a class, \b is the backspace character
   1.251 +            else {
   1.252 +                delegate.assertionWordBoundary(false);
   1.253 +                return false;
   1.254 +            }
   1.255 +            break;
   1.256 +        case 'B':
   1.257 +            consume();
   1.258 +            if (inCharacterClass)
   1.259 +                delegate.atomPatternCharacter('B'); // inside a class, \B is just the letter 'B'
   1.260 +            else {
   1.261 +                delegate.assertionWordBoundary(true);
   1.262 +                return false;
   1.263 +            }
   1.264 +            break;
   1.265 +
   1.266 +        // CharacterClassEscape
   1.267 +        case 'd':
   1.268 +            consume();
   1.269 +            delegate.atomBuiltInCharacterClass(DigitClassID, false);
   1.270 +            break;
   1.271 +        case 's':
   1.272 +            consume();
   1.273 +            delegate.atomBuiltInCharacterClass(SpaceClassID, false);
   1.274 +            break;
   1.275 +        case 'w':
   1.276 +            consume();
   1.277 +            delegate.atomBuiltInCharacterClass(WordClassID, false);
   1.278 +            break;
   1.279 +        case 'D':
   1.280 +            consume();
   1.281 +            delegate.atomBuiltInCharacterClass(DigitClassID, true);
   1.282 +            break;
   1.283 +        case 'S':
   1.284 +            consume();
   1.285 +            delegate.atomBuiltInCharacterClass(SpaceClassID, true);
   1.286 +            break;
   1.287 +        case 'W':
   1.288 +            consume();
   1.289 +            delegate.atomBuiltInCharacterClass(WordClassID, true);
   1.290 +            break;
   1.291 +
   1.292 +        // DecimalEscape
   1.293 +        case '1':
   1.294 +        case '2':
   1.295 +        case '3':
   1.296 +        case '4':
   1.297 +        case '5':
   1.298 +        case '6':
   1.299 +        case '7':
   1.300 +        case '8':
   1.301 +        case '9': {
   1.302 +            // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
   1.303 +            // First, try to parse this as backreference.
   1.304 +            if (!inCharacterClass) {
   1.305 +                ParseState state = saveState();
   1.306 +
   1.307 +                unsigned backReference;
   1.308 +                if (!consumeNumber(backReference))
   1.309 +                    break; // consumeNumber() has already set m_err
   1.310 +                if (backReference <= m_backReferenceLimit) {
   1.311 +                    delegate.atomBackReference(backReference);
   1.312 +                    break;
   1.313 +                }
   1.314 +
   1.315 +                restoreState(state); // too large for a backreference: rewind to the first digit
   1.316 +            }
   1.317 +
   1.318 +            // Not a backreference, and not octal.
   1.319 +            if (peek() >= '8') {
   1.320 +                delegate.atomPatternCharacter('\\'); // emit the backslash; the digit itself is left unconsumed
   1.321 +                break;
   1.322 +            }
   1.323 +
   1.324 +            // Fall-through to handle this as an octal escape.
   1.325 +        }
   1.326 +
   1.327 +        // Octal escape
   1.328 +        case '0':
   1.329 +            delegate.atomPatternCharacter(consumeOctal());
   1.330 +            break;
   1.331 +
   1.332 +        // ControlEscape
   1.333 +        case 'f':
   1.334 +            consume();
   1.335 +            delegate.atomPatternCharacter('\f');
   1.336 +            break;
   1.337 +        case 'n':
   1.338 +            consume();
   1.339 +            delegate.atomPatternCharacter('\n');
   1.340 +            break;
   1.341 +        case 'r':
   1.342 +            consume();
   1.343 +            delegate.atomPatternCharacter('\r');
   1.344 +            break;
   1.345 +        case 't':
   1.346 +            consume();
   1.347 +            delegate.atomPatternCharacter('\t');
   1.348 +            break;
   1.349 +        case 'v':
   1.350 +            consume();
   1.351 +            delegate.atomPatternCharacter('\v');
   1.352 +            break;
   1.353 +
   1.354 +        // ControlLetter
   1.355 +        case 'c': {
   1.356 +            ParseState state = saveState();
   1.357 +            consume();
   1.358 +            if (!atEndOfPattern()) {
   1.359 +                int control = consume();
   1.360 +
   1.361 +                // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
   1.362 +                if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
   1.363 +                    delegate.atomPatternCharacter(control & 0x1f); // keep only the low five bits of the letter
   1.364 +                    break;
   1.365 +                }
   1.366 +            }
   1.367 +            restoreState(state); // not a control escape: rewind to the 'c' and emit only the backslash
   1.368 +            delegate.atomPatternCharacter('\\');
   1.369 +            break;
   1.370 +        }
   1.371 +
   1.372 +        // HexEscape
   1.373 +        case 'x': {
   1.374 +            consume();
   1.375 +            int x = tryConsumeHex(2);
   1.376 +            if (x == -1)
   1.377 +                delegate.atomPatternCharacter('x'); // fewer than two hex digits: a literal 'x'
   1.378 +            else
   1.379 +                delegate.atomPatternCharacter(x);
   1.380 +            break;
   1.381 +        }
   1.382 +
   1.383 +        // UnicodeEscape
   1.384 +        case 'u': {
   1.385 +            consume();
   1.386 +            int u = tryConsumeHex(4);
   1.387 +            if (u == -1)
   1.388 +                delegate.atomPatternCharacter('u'); // fewer than four hex digits: a literal 'u'
   1.389 +            else
   1.390 +                delegate.atomPatternCharacter(u);
   1.391 +            break;
   1.392 +        }
   1.393 +
   1.394 +        // IdentityEscape
   1.395 +        default:
   1.396 +            delegate.atomPatternCharacter(consume());
   1.397 +        }
   1.398 +
   1.399 +        return true;
   1.400 +    }
   1.401 +
   1.402 +    /*
   1.403 +     * parseAtomEscape(), parseCharacterClassEscape():
   1.404 +     *
   1.405 +     * These methods forward to parseEscape<false>() and parseEscape<true>() respectively.
   1.406 +     */
   1.407 +    bool parseAtomEscape()
   1.408 +    {
   1.409 +        return parseEscape<false>(m_delegate); // result: whether the escape was an atom
   1.410 +    }
   1.411 +    void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
   1.412 +    {
   1.413 +        parseEscape<true>(delegate); // the return value is not needed inside a character class
   1.414 +    }
   1.415 +
   1.416 +    /*
   1.417 +     * parseCharacterClass():
   1.418 +     *
   1.419 +     * Helper for parseTokens(); calls directly and indirectly (via parseCharacterClassEscape)
   1.420 +     * to an instance of CharacterClassParserDelegate, to describe the character class to the
   1.421 +     * delegate.
   1.422 +     */
   1.423 +    void parseCharacterClass()
   1.424 +    {
   1.425 +        ASSERT(!m_err);
   1.426 +        ASSERT(peek() == '[');
   1.427 +        consume();
   1.428 +
   1.429 +        CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
   1.430 +
   1.431 +        characterClassConstructor.begin(tryConsume('^')); // a leading '^' inverts the class
   1.432 +
   1.433 +        while (!atEndOfPattern()) {
   1.434 +            switch (peek()) {
   1.435 +            case ']':
   1.436 +                consume();
   1.437 +                characterClassConstructor.end();
   1.438 +                return;
   1.439 +
   1.440 +            case '\\':
   1.441 +                parseCharacterClassEscape(characterClassConstructor);
   1.442 +                break;
   1.443 +
   1.444 +            default:
   1.445 +                characterClassConstructor.atomPatternCharacter(consume(), true); // unescaped: '-' may form a range
   1.446 +            }
   1.447 +
   1.448 +            if (m_err)
   1.449 +                return;
   1.450 +        }
   1.451 +
   1.452 +        m_err = CharacterClassUnmatched; // the pattern ended before a closing ']'
   1.453 +    }
   1.454 +
   1.455 +    /*
   1.456 +     * parseParenthesesBegin():
   1.457 +     *
   1.458 +     * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
   1.459 +     */
   1.460 +    void parseParenthesesBegin()
   1.461 +    {
   1.462 +        ASSERT(!m_err);
   1.463 +        ASSERT(peek() == '(');
   1.464 +        consume();
   1.465 +
   1.466 +        if (tryConsume('?')) {
   1.467 +            if (atEndOfPattern()) {
   1.468 +                m_err = ParenthesesTypeInvalid;
   1.469 +                return;
   1.470 +            }
   1.471 +
   1.472 +            switch (consume()) {
   1.473 +            case ':':
   1.474 +                m_delegate.atomParenthesesSubpatternBegin(false); // "(?:" - subpattern with capture = false
   1.475 +                break;
   1.476 +
   1.477 +            case '=':
   1.478 +                m_delegate.atomParentheticalAssertionBegin(); // "(?=" - parenthetical assertion
   1.479 +                break;
   1.480 +
   1.481 +            case '!':
   1.482 +                m_delegate.atomParentheticalAssertionBegin(true); // "(?!" - inverted parenthetical assertion
   1.483 +                break;
   1.484 +
   1.485 +            default:
   1.486 +                m_err = ParenthesesTypeInvalid;
   1.487 +            }
   1.488 +        } else
   1.489 +            m_delegate.atomParenthesesSubpatternBegin();
   1.490 +
   1.491 +        ++m_parenthesesNestingDepth; // also reached after the default case above; parseTokens() then stops on m_err
   1.492 +    }
   1.493 +
   1.494 +    /*
   1.495 +     * parseParenthesesEnd():
   1.496 +     *
   1.497 +     * Helper for parseTokens(); consumes ')' and sets ParenthesesUnmatched if no parentheses are open.
   1.498 +     */
   1.499 +    void parseParenthesesEnd()
   1.500 +    {
   1.501 +        ASSERT(!m_err);
   1.502 +        ASSERT(peek() == ')');
   1.503 +        consume();
   1.504 +
   1.505 +        // Pop only a level that was opened: an unmatched ')' must not wrap the unsigned depth counter.
   1.506 +        if (m_parenthesesNestingDepth > 0) {
   1.507 +            m_delegate.atomParenthesesEnd();
   1.508 +            --m_parenthesesNestingDepth;
   1.509 +        } else
   1.510 +            m_err = ParenthesesUnmatched;
   1.511 +    }
   1.512 +
   1.513 +    /*
   1.514 +     * parseQuantifier():
   1.515 +     *
   1.516 +     * Helper for parseTokens(); validates a quantifier and reports it, detecting a non-greedy '?' suffix.
   1.517 +     */
   1.518 +    void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
   1.519 +    {
   1.520 +        ASSERT(!m_err);
   1.521 +        ASSERT(min <= max);
   1.522 +
   1.523 +        // Errors take precedence in this order: an out-of-range count first, then
   1.524 +        // a quantifier with nothing to quantify.  Only a valid quantifier goes on
   1.525 +        // to consume an optional trailing '?', which makes it non-greedy.
   1.526 +        if (min == UINT_MAX)
   1.527 +            m_err = QuantifierTooLarge;
   1.528 +        else if (!lastTokenWasAnAtom)
   1.529 +            m_err = QuantifierWithoutAtom;
   1.530 +        else
   1.531 +            m_delegate.quantifyAtom(min, max, !tryConsume('?'));
   1.532 +    }
   1.533 +
   1.534 +    /*
   1.535 +     * parseTokens():
   1.536 +     *
   1.537 +     * This method loops over the input pattern reporting tokens to the delegate.
   1.538 +     * The method returns when a parse error is detected, or the end of the pattern
   1.539 +     * is reached.  One piece of state is tracked around the loop, which is whether
   1.540 +     * the last token passed to the delegate was an atom (this is necessary to detect
   1.541 +     * a parse error when a quantifier is provided without an atom to quantify).
   1.542 +     */
   1.543 +    void parseTokens()
   1.544 +    {
   1.545 +        bool lastTokenWasAnAtom = false;
   1.546 +
   1.547 +        while (!atEndOfPattern()) {
   1.548 +            switch (peek()) {
   1.549 +            case '|':
   1.550 +                consume();
   1.551 +                m_delegate.disjunction();
   1.552 +                lastTokenWasAnAtom = false;
   1.553 +                break;
   1.554 +
   1.555 +            case '(':
   1.556 +                parseParenthesesBegin();
   1.557 +                lastTokenWasAnAtom = false;
   1.558 +                break;
   1.559 +
   1.560 +            case ')':
   1.561 +                parseParenthesesEnd();
   1.562 +                lastTokenWasAnAtom = true; // a closed group may be followed by a quantifier
   1.563 +                break;
   1.564 +
   1.565 +            case '^':
   1.566 +                consume();
   1.567 +                m_delegate.assertionBOL();
   1.568 +                lastTokenWasAnAtom = false;
   1.569 +                break;
   1.570 +
   1.571 +            case '$':
   1.572 +                consume();
   1.573 +                m_delegate.assertionEOL();
   1.574 +                lastTokenWasAnAtom = false;
   1.575 +                break;
   1.576 +
   1.577 +            case '.':
   1.578 +                consume();
   1.579 +                m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); // '.' is the inverted newline class
   1.580 +                lastTokenWasAnAtom = true;
   1.581 +                break;
   1.582 +
   1.583 +            case '[':
   1.584 +                parseCharacterClass();
   1.585 +                lastTokenWasAnAtom = true;
   1.586 +                break;
   1.587 +
   1.588 +            case '\\':
   1.589 +                lastTokenWasAnAtom = parseAtomEscape(); // false for the \b and \B assertions
   1.590 +                break;
   1.591 +
   1.592 +            case '*':
   1.593 +                consume();
   1.594 +                parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
   1.595 +                lastTokenWasAnAtom = false;
   1.596 +                break;
   1.597 +
   1.598 +            case '+':
   1.599 +                consume();
   1.600 +                parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
   1.601 +                lastTokenWasAnAtom = false;
   1.602 +                break;
   1.603 +
   1.604 +            case '?':
   1.605 +                consume();
   1.606 +                parseQuantifier(lastTokenWasAnAtom, 0, 1);
   1.607 +                lastTokenWasAnAtom = false;
   1.608 +                break;
   1.609 +
   1.610 +            case '{': {
   1.611 +                ParseState state = saveState();
   1.612 +
   1.613 +                consume();
   1.614 +                if (peekIsDigit()) {
   1.615 +                    unsigned min;
   1.616 +                    if (!consumeNumber(min))
   1.617 +                        break; // m_err is set; the check after the switch returns
   1.618 +
   1.619 +                    unsigned max = min; // "{n}" means exactly n
   1.620 +                    if (tryConsume(',')) {
   1.621 +                        if (peekIsDigit()) {
   1.622 +                            if (!consumeNumber(max))
   1.623 +                                break;
   1.624 +                        } else {
   1.625 +                            max = quantifyInfinite; // "{n,}" has no upper bound
   1.626 +                        }
   1.627 +                    }
   1.628 +
   1.629 +                    if (tryConsume('}')) {
   1.630 +                        if (min <= max)
   1.631 +                            parseQuantifier(lastTokenWasAnAtom, min, max);
   1.632 +                        else
   1.633 +                            m_err = QuantifierOutOfOrder;
   1.634 +                        lastTokenWasAnAtom = false;
   1.635 +                        break;
   1.636 +                    }
   1.637 +                }
   1.638 +
   1.639 +                restoreState(state); // rewind so that the '{' itself is consumed as a pattern character below
   1.640 +            } // if we did not find a complete quantifier, fall through to the default case.
   1.641 +
   1.642 +            default:
   1.643 +                m_delegate.atomPatternCharacter(consume());
   1.644 +                lastTokenWasAnAtom = true;
   1.645 +            }
   1.646 +
   1.647 +            if (m_err)
   1.648 +                return;
   1.649 +        }
   1.650 +
   1.651 +        if (m_parenthesesNestingDepth > 0)
   1.652 +            m_err = MissingParentheses; // the pattern ended with at least one '(' still open
   1.653 +    }
   1.654 +
   1.655 +    /*
   1.656 +     * parse():
   1.657 +     *
   1.658 +     * Runs parseTokens() over the input, unless the pattern exceeds MAX_PATTERN_SIZE,
   1.659 +     * and returns the resulting ErrorCode (m_err, which is NoError if nothing failed).
   1.660 +     */
   1.661 +    ErrorCode parse()
   1.662 +    {
   1.663 +        const bool patternFits = m_size <= MAX_PATTERN_SIZE;
   1.664 +        if (patternFits)
   1.665 +            parseTokens();
   1.666 +        else
   1.667 +            m_err = PatternTooLarge;
   1.668 +        ASSERT(atEndOfPattern() || m_err);
   1.669 +        return m_err;
   1.670 +    }
   1.671 +
   1.672 +    // Misc helper functions:
   1.673 +
   1.674 +    typedef unsigned ParseState; // a parse position, i.e. a value of m_index
   1.675 +
   1.676 +    ParseState saveState()
   1.677 +    {
   1.678 +        return m_index; // the current position is the only state captured
   1.679 +    }
   1.680 +
   1.681 +    void restoreState(ParseState state)
   1.682 +    {
   1.683 +        m_index = state; // move the cursor back to a position obtained from saveState()
   1.684 +    }
   1.685 +
   1.686 +    bool atEndOfPattern()
   1.687 +    {
   1.688 +        ASSERT(m_size >= m_index); // the cursor must never run past the end
   1.689 +        return m_size == m_index;
   1.690 +    }
   1.691 +
   1.692 +    int peek()
   1.693 +    {
   1.694 +        ASSERT(m_size > m_index);
   1.695 +        return *(m_data + m_index); // read the current character without advancing
   1.696 +    }
   1.697 +
   1.698 +    bool peekIsDigit()
   1.699 +    {
   1.700 +        return atEndOfPattern() ? false : WTF::isASCIIDigit(peek()); // never peek() past the end
   1.701 +    }
   1.702 +
   1.703 +    unsigned peekDigit()
   1.704 +    {
   1.705 +        ASSERT(peekIsDigit());
   1.706 +        return static_cast<unsigned>(peek() - '0'); // numeric value of the digit under the cursor
   1.707 +    }
   1.708 +
   1.709 +    int consume()
   1.710 +    {
   1.711 +        ASSERT(m_size > m_index);
   1.712 +        return *(m_data + m_index++); // read the current character, then step past it
   1.713 +    }
   1.714 +
   1.715 +    unsigned consumeDigit()
   1.716 +    {
   1.717 +        ASSERT(peekIsDigit());
   1.718 +        return static_cast<unsigned>(consume() - '0'); // numeric value of the digit just consumed
   1.719 +    }
   1.720 +
   1.721 +    bool consumeNumber(unsigned &accum) // parses a decimal number into accum; false, with m_err set, on overflow
   1.722 +    {
   1.723 +        accum = consumeDigit();
   1.724 +        while (peekIsDigit()) {
   1.725 +            unsigned digit = peekDigit();
   1.726 +            if (accum > (UINT_MAX - digit) / 10) { /* Overflow check, made before accum * 10 + digit can wrap. */
   1.727 +                m_err = QuantifierTooLarge;
   1.728 +                return false;
   1.729 +            }
   1.730 +            accum = accum * 10 + digit;
   1.731 +            consume();
   1.732 +        }
   1.733 +        return true;
   1.734 +    }
   1.735 +
   1.736 +    unsigned consumeOctal()
   1.737 +    {
   1.738 +        ASSERT(WTF::isASCIIOctalDigit(peek()));
   1.739 +
   1.740 +        unsigned value = consumeDigit();
   1.741 +        while (value < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
   1.742 +            value = 8 * value + consumeDigit(); // stops once value >= 32, so the result stays below 256
   1.743 +        return value;
   1.744 +    }
   1.745 +
   1.746 +    bool tryConsume(UChar ch)
   1.747 +    {
   1.748 +        const bool matched = !atEndOfPattern() && (m_data[m_index] == ch);
   1.749 +        if (matched)
   1.750 +            ++m_index; // step over the character only when it is the expected one
   1.751 +        return matched;
   1.752 +    }
   1.753 +
   1.754 +    int tryConsumeHex(int count)
   1.755 +    {
   1.756 +        const ParseState start = saveState();
   1.757 +
   1.758 +        int value = 0;
   1.759 +        for (; count != 0; --count) {
   1.760 +            if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
   1.761 +                restoreState(start); // all-or-nothing: un-consume any digits read so far
   1.762 +                return -1;
   1.763 +            }
   1.764 +            value = (value << 4) | WTF::toASCIIHexValue(consume());
   1.765 +        }
   1.766 +        return value;
   1.767 +    }
   1.768 +
   1.769 +    Delegate& m_delegate; // receives the token callbacks
   1.770 +    unsigned m_backReferenceLimit; // largest number parseEscape() reports as a backreference
   1.771 +    ErrorCode m_err; // NoError until a parse error is recorded
   1.772 +    const CharType* m_data; // the pattern's characters, from pattern.chars()
   1.773 +    unsigned m_size; // the pattern's length, from pattern.length()
   1.774 +    unsigned m_index; // index into m_data of the next character to read
   1.775 +    unsigned m_parenthesesNestingDepth; // incremented per '(' and decremented per ')' parsed
   1.776 +
   1.777 +    // Derived by empirical testing of compile time in PCRE and WREC.
   1.778 +    static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
   1.779 +};
   1.780 +
   1.781 +/*
   1.782 + * Yarr::parse():
   1.783 + *
   1.784 + * The parse method is passed a pattern to be parsed and a delegate upon which
   1.785 + * callbacks will be made to record the parsed tokens forming the regex.
   1.786 + * Yarr::parse() returns NoError on success, or the ErrorCode describing the
   1.787 + * parse error where one occurs.
   1.788 + *
   1.789 + * The Delegate must implement the following interface:
   1.790 + *
   1.791 + *    void assertionBOL();
   1.792 + *    void assertionEOL();
   1.793 + *    void assertionWordBoundary(bool invert);
   1.794 + *
   1.795 + *    void atomPatternCharacter(UChar ch);
   1.796 + *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
   1.797 + *    void atomCharacterClassBegin(bool invert)
   1.798 + *    void atomCharacterClassAtom(UChar ch)
   1.799 + *    void atomCharacterClassRange(UChar begin, UChar end)
   1.800 + *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
   1.801 + *    void atomCharacterClassEnd()
   1.802 + *    void atomParenthesesSubpatternBegin(bool capture = true);
   1.803 + *    void atomParentheticalAssertionBegin(bool invert = false);
   1.804 + *    void atomParenthesesEnd();
   1.805 + *    void atomBackReference(unsigned subpatternId);
   1.806 + *
   1.807 + *    void quantifyAtom(unsigned min, unsigned max, bool greedy);
   1.808 + *
   1.809 + *    void disjunction();
   1.810 + *
   1.811 + * The regular expression is described by a sequence of assertion*() and atom*()
   1.812 + * callbacks to the delegate, describing the terms in the regular expression.
   1.813 + * Following an atom a quantifyAtom() call may occur to indicate that the previous
   1.814 + * atom should be quantified.  In the case of atoms described across multiple
   1.815 + * calls (parentheses and character classes) the call to quantifyAtom() will come
   1.816 + * after the call to the atom*End() method, never after atom*Begin().
   1.817 + *
   1.818 + * Character classes may either be described by a single call to
   1.819 + * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
   1.820 + * In the latter case, ...Begin() will be called, followed by a sequence of
   1.821 + * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
   1.822 + *
   1.823 + * Sequences of atoms and assertions are broken into alternatives via calls to
   1.824 + * disjunction().  Assertions, atoms, and disjunctions emitted between calls to
   1.825 + * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
   1.826 + * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular
   1.827 + * capturing subpattern, this will be the subpatternId associated with these
   1.828 + * parentheses, and will also by definition be the lowest subpatternId of these
   1.829 + * parentheses and of any nested parentheses.  The atomParenthesesEnd() method
   1.830 + * is passed the subpatternId of the last capturing subexpression nested within
   1.831 + * these parentheses.  In the case of a capturing subpattern with no nested
   1.832 + * capturing subpatterns, the same subpatternId will be passed to the begin and
   1.833 + * end functions.  In the case of non-capturing subpatterns the subpatternId
   1.834 + * passed to the begin method is also the first possible subpatternId that might
   1.835 + * be nested within these parentheses.  If a set of non-capturing parentheses does
   1.836 + * not contain any capturing subpatterns, then the subpatternId passed to begin
   1.837 + * will be greater than the subpatternId passed to end.
   1.838 + */
   1.839 +
   1.840 +template<class Delegate>
   1.841 +ErrorCode parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite)
   1.842 +{
   1.843 +#ifdef YARR_8BIT_CHAR_SUPPORT
   1.844 +    if (pattern.is8Bit()) // 8-bit patterns are parsed as LChar when this support is compiled in
   1.845 +        return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse();
   1.846 +#endif
   1.847 +    return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse();
   1.848 +}
   1.849 +
   1.850 +} } // namespace JSC::Yarr
   1.851 +
   1.852 +#endif /* yarr_YarrParser_h */

mercurial