js/src/yarr/YarrPattern.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/js/src/yarr/YarrPattern.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,463 @@
     1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
     1.5 + * vim: set ts=8 sts=4 et sw=4 tw=99:
     1.6 + *
     1.7 + * Copyright (C) 2009, 2013 Apple Inc. All rights reserved.
     1.8 + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
     1.9 + *
    1.10 + * Redistribution and use in source and binary forms, with or without
    1.11 + * modification, are permitted provided that the following conditions
    1.12 + * are met:
    1.13 + * 1. Redistributions of source code must retain the above copyright
    1.14 + *    notice, this list of conditions and the following disclaimer.
    1.15 + * 2. Redistributions in binary form must reproduce the above copyright
    1.16 + *    notice, this list of conditions and the following disclaimer in the
    1.17 + *    documentation and/or other materials provided with the distribution.
    1.18 + *
    1.19 + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
    1.20 + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    1.21 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
    1.22 + * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
    1.23 + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
    1.24 + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
    1.25 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
    1.26 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
    1.27 + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    1.28 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    1.29 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    1.30 + */
    1.31 +
    1.32 +#ifndef yarr_YarrPattern_h
    1.33 +#define yarr_YarrPattern_h
    1.34 +
    1.35 +#include "yarr/wtfbridge.h"
    1.36 +#include "yarr/ASCIICType.h"
    1.37 +
    1.38 +namespace JSC { namespace Yarr {
    1.39 +
    1.40 +struct PatternDisjunction;
    1.41 +
    1.42 +enum ErrorCode {
    1.43 +    NoError,
    1.44 +    PatternTooLarge,
    1.45 +    QuantifierOutOfOrder,
    1.46 +    QuantifierWithoutAtom,
    1.47 +    QuantifierTooLarge,
    1.48 +    MissingParentheses,
    1.49 +    ParenthesesUnmatched,
    1.50 +    ParenthesesTypeInvalid,
    1.51 +    CharacterClassUnmatched,
    1.52 +    CharacterClassOutOfOrder,
    1.53 +    CharacterClassInvalidRange,
    1.54 +    EscapeUnterminated,
    1.55 +    RuntimeError,
    1.56 +    NumberOfErrorCodes
    1.57 +};
    1.58 +
    1.59 +static inline const char* errorMessage(ErrorCode code)
    1.60 +{
    1.61 +
    1.62 +#define REGEXP_ERROR_PREFIX "Invalid regular expression: "
    1.63 +   // The order of this array must match the ErrorCode enum.
    1.64 +   static const char* errorMessages[NumberOfErrorCodes] = {
    1.65 +       0, // NoError
    1.66 +       REGEXP_ERROR_PREFIX "regular expression too large",
    1.67 +       REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",
    1.68 +       REGEXP_ERROR_PREFIX "nothing to repeat",
    1.69 +       REGEXP_ERROR_PREFIX "number too large in {} quantifier",
    1.70 +       REGEXP_ERROR_PREFIX "missing )",
    1.71 +       REGEXP_ERROR_PREFIX "unmatched parentheses",
    1.72 +       REGEXP_ERROR_PREFIX "unrecognized character after (?",
    1.73 +       REGEXP_ERROR_PREFIX "missing terminating ] for character class",
    1.74 +       REGEXP_ERROR_PREFIX "character class out of order"
    1.75 +       REGEXP_ERROR_PREFIX "range out of order in character class",
    1.76 +       REGEXP_ERROR_PREFIX "\\ at end of pattern"
    1.77 +   };
    1.78 +#undef REGEXP_ERROR_PREFIX
    1.79 +
    1.80 +   return errorMessages[code];
    1.81 +}
    1.82 +
    1.83 +struct CharacterRange {
    1.84 +    UChar begin;
    1.85 +    UChar end;
    1.86 +
    1.87 +    CharacterRange(UChar begin, UChar end)
    1.88 +        : begin(begin)
    1.89 +        , end(end)
    1.90 +    {
    1.91 +    }
    1.92 +};
    1.93 +
    1.94 +struct CharacterClass {
    1.95 +    WTF_MAKE_FAST_ALLOCATED;
    1.96 +public:
    1.97 +    // All CharacterClass instances have to have the full set of matches and ranges,
    1.98 +    // they may have an optional m_table for faster lookups (which must match the
    1.99 +    // specified matches and ranges)
   1.100 +    CharacterClass()
   1.101 +        : m_table(0)
   1.102 +    {
   1.103 +    }
   1.104 +    CharacterClass(const char* table, bool inverted)
   1.105 +        : m_table(table)
   1.106 +        , m_tableInverted(inverted)
   1.107 +    {
   1.108 +    }
   1.109 +    Vector<UChar> m_matches;
   1.110 +    Vector<CharacterRange> m_ranges;
   1.111 +    Vector<UChar> m_matchesUnicode;
   1.112 +    Vector<CharacterRange> m_rangesUnicode;
   1.113 +
   1.114 +    const char* m_table;
   1.115 +    bool m_tableInverted;
   1.116 +};
   1.117 +
   1.118 +enum QuantifierType {
   1.119 +    QuantifierFixedCount,
   1.120 +    QuantifierGreedy,
   1.121 +    QuantifierNonGreedy
   1.122 +};
   1.123 +
   1.124 +struct PatternTerm {
   1.125 +    enum Type {
   1.126 +        TypeAssertionBOL,
   1.127 +        TypeAssertionEOL,
   1.128 +        TypeAssertionWordBoundary,
   1.129 +        TypePatternCharacter,
   1.130 +        TypeCharacterClass,
   1.131 +        TypeBackReference,
   1.132 +        TypeForwardReference,
   1.133 +        TypeParenthesesSubpattern,
   1.134 +        TypeParentheticalAssertion,
   1.135 +        TypeDotStarEnclosure
   1.136 +    } type;
   1.137 +    bool m_capture :1;
   1.138 +    bool m_invert :1;
   1.139 +    union {
   1.140 +        UChar patternCharacter;
   1.141 +        CharacterClass* characterClass;
   1.142 +        unsigned backReferenceSubpatternId;
   1.143 +        struct {
   1.144 +            PatternDisjunction* disjunction;
   1.145 +            unsigned subpatternId;
   1.146 +            unsigned lastSubpatternId;
   1.147 +            bool isCopy;
   1.148 +            bool isTerminal;
   1.149 +        } parentheses;
   1.150 +        struct {
   1.151 +            bool bolAnchor : 1;
   1.152 +            bool eolAnchor : 1;
   1.153 +        } anchors;
   1.154 +    };
   1.155 +    QuantifierType quantityType;
   1.156 +    Checked<unsigned> quantityCount;
   1.157 +    int inputPosition;
   1.158 +    unsigned frameLocation;
   1.159 +
   1.160 +    PatternTerm(UChar ch)
   1.161 +        : type(PatternTerm::TypePatternCharacter)
   1.162 +        , m_capture(false)
   1.163 +        , m_invert(false)
   1.164 +    {
   1.165 +        patternCharacter = ch;
   1.166 +        quantityType = QuantifierFixedCount;
   1.167 +        quantityCount = 1;
   1.168 +    }
   1.169 +
   1.170 +    PatternTerm(CharacterClass* charClass, bool invert)
   1.171 +        : type(PatternTerm::TypeCharacterClass)
   1.172 +        , m_capture(false)
   1.173 +        , m_invert(invert)
   1.174 +    {
   1.175 +        characterClass = charClass;
   1.176 +        quantityType = QuantifierFixedCount;
   1.177 +        quantityCount = 1;
   1.178 +    }
   1.179 +
   1.180 +    PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false)
   1.181 +        : type(type)
   1.182 +        , m_capture(capture)
   1.183 +        , m_invert(invert)
   1.184 +    {
   1.185 +        parentheses.disjunction = disjunction;
   1.186 +        parentheses.subpatternId = subpatternId;
   1.187 +        parentheses.isCopy = false;
   1.188 +        parentheses.isTerminal = false;
   1.189 +        quantityType = QuantifierFixedCount;
   1.190 +        quantityCount = 1;
   1.191 +    }
   1.192 +
   1.193 +    PatternTerm(Type type, bool invert = false)
   1.194 +        : type(type)
   1.195 +        , m_capture(false)
   1.196 +        , m_invert(invert)
   1.197 +    {
   1.198 +        quantityType = QuantifierFixedCount;
   1.199 +        quantityCount = 1;
   1.200 +    }
   1.201 +
   1.202 +    PatternTerm(unsigned spatternId)
   1.203 +        : type(TypeBackReference)
   1.204 +        , m_capture(false)
   1.205 +        , m_invert(false)
   1.206 +    {
   1.207 +        backReferenceSubpatternId = spatternId;
   1.208 +        quantityType = QuantifierFixedCount;
   1.209 +        quantityCount = 1;
   1.210 +    }
   1.211 +
   1.212 +    PatternTerm(bool bolAnchor, bool eolAnchor)
   1.213 +        : type(TypeDotStarEnclosure)
   1.214 +        , m_capture(false)
   1.215 +        , m_invert(false)
   1.216 +    {
   1.217 +        anchors.bolAnchor = bolAnchor;
   1.218 +        anchors.eolAnchor = eolAnchor;
   1.219 +        quantityType = QuantifierFixedCount;
   1.220 +        quantityCount = 1;
   1.221 +    }
   1.222 +
   1.223 +    // No-argument constructor for js::Vector.
   1.224 +    PatternTerm()
   1.225 +        : type(PatternTerm::TypePatternCharacter)
   1.226 +        , m_capture(false)
   1.227 +        , m_invert(false)
   1.228 +    {
   1.229 +        patternCharacter = 0;
   1.230 +        quantityType = QuantifierFixedCount;
   1.231 +        quantityCount = 1;
   1.232 +    }
   1.233 +
   1.234 +    static PatternTerm ForwardReference()
   1.235 +    {
   1.236 +        return PatternTerm(TypeForwardReference);
   1.237 +    }
   1.238 +
   1.239 +    static PatternTerm BOL()
   1.240 +    {
   1.241 +        return PatternTerm(TypeAssertionBOL);
   1.242 +    }
   1.243 +
   1.244 +    static PatternTerm EOL()
   1.245 +    {
   1.246 +        return PatternTerm(TypeAssertionEOL);
   1.247 +    }
   1.248 +
   1.249 +    static PatternTerm WordBoundary(bool invert)
   1.250 +    {
   1.251 +        return PatternTerm(TypeAssertionWordBoundary, invert);
   1.252 +    }
   1.253 +
   1.254 +    bool invert()
   1.255 +    {
   1.256 +        return m_invert;
   1.257 +    }
   1.258 +
   1.259 +    bool capture()
   1.260 +    {
   1.261 +        return m_capture;
   1.262 +    }
   1.263 +
   1.264 +    void quantify(unsigned count, QuantifierType type)
   1.265 +    {
   1.266 +        quantityCount = count;
   1.267 +        quantityType = type;
   1.268 +    }
   1.269 +};
   1.270 +
   1.271 +struct PatternAlternative {
   1.272 +    WTF_MAKE_FAST_ALLOCATED;
   1.273 +public:
   1.274 +    PatternAlternative(PatternDisjunction* disjunction)
   1.275 +        : m_parent(disjunction)
   1.276 +        , m_onceThrough(false)
   1.277 +        , m_hasFixedSize(false)
   1.278 +        , m_startsWithBOL(false)
   1.279 +        , m_containsBOL(false)
   1.280 +    {
   1.281 +    }
   1.282 +
   1.283 +    PatternTerm& lastTerm()
   1.284 +    {
   1.285 +        ASSERT(m_terms.size());
   1.286 +        return m_terms[m_terms.size() - 1];
   1.287 +    }
   1.288 +
   1.289 +    void removeLastTerm()
   1.290 +    {
   1.291 +        ASSERT(m_terms.size());
   1.292 +        m_terms.shrink(m_terms.size() - 1);
   1.293 +    }
   1.294 +
   1.295 +    void setOnceThrough()
   1.296 +    {
   1.297 +        m_onceThrough = true;
   1.298 +    }
   1.299 +
   1.300 +    bool onceThrough()
   1.301 +    {
   1.302 +        return m_onceThrough;
   1.303 +    }
   1.304 +
   1.305 +    Vector<PatternTerm> m_terms;
   1.306 +    PatternDisjunction* m_parent;
   1.307 +    unsigned m_minimumSize;
   1.308 +    bool m_onceThrough : 1;
   1.309 +    bool m_hasFixedSize : 1;
   1.310 +    bool m_startsWithBOL : 1;
   1.311 +    bool m_containsBOL : 1;
   1.312 +};
   1.313 +
   1.314 +struct PatternDisjunction {
   1.315 +    WTF_MAKE_FAST_ALLOCATED;
   1.316 +public:
   1.317 +    PatternDisjunction(PatternAlternative* parent = 0)
   1.318 +        : m_parent(parent)
   1.319 +        , m_hasFixedSize(false)
   1.320 +    {
   1.321 +    }
   1.322 +
   1.323 +    ~PatternDisjunction()
   1.324 +    {
   1.325 +        deleteAllValues(m_alternatives);
   1.326 +    }
   1.327 +
   1.328 +    PatternAlternative* addNewAlternative()
   1.329 +    {
   1.330 +        PatternAlternative* alternative = newOrCrash<PatternAlternative>(this);
   1.331 +        m_alternatives.append(alternative);
   1.332 +        return alternative;
   1.333 +    }
   1.334 +
   1.335 +    Vector<PatternAlternative*> m_alternatives;
   1.336 +    PatternAlternative* m_parent;
   1.337 +    unsigned m_minimumSize;
   1.338 +    unsigned m_callFrameSize;
   1.339 +    bool m_hasFixedSize;
   1.340 +};
   1.341 +
   1.342 +// You probably don't want to be calling these functions directly
   1.343 +// (please to be calling newlineCharacterClass() et al on your
   1.344 +// friendly neighborhood YarrPattern instance to get nicely
   1.345 +// cached copies).
   1.346 +CharacterClass* newlineCreate();
   1.347 +CharacterClass* digitsCreate();
   1.348 +CharacterClass* spacesCreate();
   1.349 +CharacterClass* wordcharCreate();
   1.350 +CharacterClass* nondigitsCreate();
   1.351 +CharacterClass* nonspacesCreate();
   1.352 +CharacterClass* nonwordcharCreate();
   1.353 +
   1.354 +struct TermChain {
   1.355 +    TermChain(PatternTerm term)
   1.356 +        : term(term)
   1.357 +    {}
   1.358 +
   1.359 +    PatternTerm term;
   1.360 +    Vector<TermChain> hotTerms;
   1.361 +};
   1.362 +
   1.363 +struct YarrPattern {
   1.364 +    YarrPattern(const String& pattern, bool ignoreCase, bool multiline, ErrorCode* error);
   1.365 +
   1.366 +    ~YarrPattern()
   1.367 +    {
   1.368 +        deleteAllValues(m_disjunctions);
   1.369 +        deleteAllValues(m_userCharacterClasses);
   1.370 +    }
   1.371 +
   1.372 +    void reset()
   1.373 +    {
   1.374 +        m_numSubpatterns = 0;
   1.375 +        m_maxBackReference = 0;
   1.376 +
   1.377 +        m_containsBackreferences = false;
   1.378 +        m_containsBOL = false;
   1.379 +
   1.380 +        newlineCached = 0;
   1.381 +        digitsCached = 0;
   1.382 +        spacesCached = 0;
   1.383 +        wordcharCached = 0;
   1.384 +        nondigitsCached = 0;
   1.385 +        nonspacesCached = 0;
   1.386 +        nonwordcharCached = 0;
   1.387 +
   1.388 +        deleteAllValues(m_disjunctions);
   1.389 +        m_disjunctions.clear();
   1.390 +        deleteAllValues(m_userCharacterClasses);
   1.391 +        m_userCharacterClasses.clear();
   1.392 +    }
   1.393 +
   1.394 +    bool containsIllegalBackReference()
   1.395 +    {
   1.396 +        return m_maxBackReference > m_numSubpatterns;
   1.397 +    }
   1.398 +
   1.399 +    CharacterClass* newlineCharacterClass()
   1.400 +    {
   1.401 +        if (!newlineCached)
   1.402 +            m_userCharacterClasses.append(newlineCached = newlineCreate());
   1.403 +        return newlineCached;
   1.404 +    }
   1.405 +    CharacterClass* digitsCharacterClass()
   1.406 +    {
   1.407 +        if (!digitsCached)
   1.408 +            m_userCharacterClasses.append(digitsCached = digitsCreate());
   1.409 +        return digitsCached;
   1.410 +    }
   1.411 +    CharacterClass* spacesCharacterClass()
   1.412 +    {
   1.413 +        if (!spacesCached)
   1.414 +            m_userCharacterClasses.append(spacesCached = spacesCreate());
   1.415 +        return spacesCached;
   1.416 +    }
   1.417 +    CharacterClass* wordcharCharacterClass()
   1.418 +    {
   1.419 +        if (!wordcharCached)
   1.420 +            m_userCharacterClasses.append(wordcharCached = wordcharCreate());
   1.421 +        return wordcharCached;
   1.422 +    }
   1.423 +    CharacterClass* nondigitsCharacterClass()
   1.424 +    {
   1.425 +        if (!nondigitsCached)
   1.426 +            m_userCharacterClasses.append(nondigitsCached = nondigitsCreate());
   1.427 +        return nondigitsCached;
   1.428 +    }
   1.429 +    CharacterClass* nonspacesCharacterClass()
   1.430 +    {
   1.431 +        if (!nonspacesCached)
   1.432 +            m_userCharacterClasses.append(nonspacesCached = nonspacesCreate());
   1.433 +        return nonspacesCached;
   1.434 +    }
   1.435 +    CharacterClass* nonwordcharCharacterClass()
   1.436 +    {
   1.437 +        if (!nonwordcharCached)
   1.438 +            m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate());
   1.439 +        return nonwordcharCached;
   1.440 +    }
   1.441 +
   1.442 +    bool m_ignoreCase : 1;
   1.443 +    bool m_multiline : 1;
   1.444 +    bool m_containsBackreferences : 1;
   1.445 +    bool m_containsBOL : 1;
   1.446 +    unsigned m_numSubpatterns;
   1.447 +    unsigned m_maxBackReference;
   1.448 +    PatternDisjunction* m_body;
   1.449 +    Vector<PatternDisjunction*, 4> m_disjunctions;
   1.450 +    Vector<CharacterClass*> m_userCharacterClasses;
   1.451 +
   1.452 +private:
   1.453 +    ErrorCode compile(const String& patternString);
   1.454 +
   1.455 +    CharacterClass* newlineCached;
   1.456 +    CharacterClass* digitsCached;
   1.457 +    CharacterClass* spacesCached;
   1.458 +    CharacterClass* wordcharCached;
   1.459 +    CharacterClass* nondigitsCached;
   1.460 +    CharacterClass* nonspacesCached;
   1.461 +    CharacterClass* nonwordcharCached;
   1.462 +};
   1.463 +
   1.464 +} } // namespace JSC::Yarr
   1.465 +
   1.466 +#endif /* yarr_YarrPattern_h */

mercurial