michael@0: /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- michael@0: * vim: set ts=8 sts=4 et sw=4 tw=99: michael@0: * michael@0: * Copyright (C) 2009, 2013 Apple Inc. All rights reserved. michael@0: * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged michael@0: * michael@0: * Redistribution and use in source and binary forms, with or without michael@0: * modification, are permitted provided that the following conditions michael@0: * are met: michael@0: * 1. Redistributions of source code must retain the above copyright michael@0: * notice, this list of conditions and the following disclaimer. michael@0: * 2. Redistributions in binary form must reproduce the above copyright michael@0: * notice, this list of conditions and the following disclaimer in the michael@0: * documentation and/or other materials provided with the distribution. michael@0: * michael@0: * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY michael@0: * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE michael@0: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR michael@0: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR michael@0: * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, michael@0: * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, michael@0: * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR michael@0: * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY michael@0: * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT michael@0: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE michael@0: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. michael@0: */ michael@0: michael@0: #ifndef yarr_YarrPattern_h michael@0: #define yarr_YarrPattern_h michael@0: michael@0: #include "yarr/wtfbridge.h" michael@0: #include "yarr/ASCIICType.h" michael@0: michael@0: namespace JSC { namespace Yarr { michael@0: michael@0: struct PatternDisjunction; michael@0: michael@0: enum ErrorCode { michael@0: NoError, michael@0: PatternTooLarge, michael@0: QuantifierOutOfOrder, michael@0: QuantifierWithoutAtom, michael@0: QuantifierTooLarge, michael@0: MissingParentheses, michael@0: ParenthesesUnmatched, michael@0: ParenthesesTypeInvalid, michael@0: CharacterClassUnmatched, michael@0: CharacterClassOutOfOrder, michael@0: CharacterClassInvalidRange, michael@0: EscapeUnterminated, michael@0: RuntimeError, michael@0: NumberOfErrorCodes michael@0: }; michael@0: michael@0: static inline const char* errorMessage(ErrorCode code) michael@0: { michael@0: michael@0: #define REGEXP_ERROR_PREFIX "Invalid regular expression: " michael@0: // The order of this array must match the ErrorCode enum. michael@0: static const char* errorMessages[NumberOfErrorCodes] = { michael@0: 0, // NoError michael@0: REGEXP_ERROR_PREFIX "regular expression too large", michael@0: REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", michael@0: REGEXP_ERROR_PREFIX "nothing to repeat", michael@0: REGEXP_ERROR_PREFIX "number too large in {} quantifier", michael@0: REGEXP_ERROR_PREFIX "missing )", michael@0: REGEXP_ERROR_PREFIX "unmatched parentheses", michael@0: REGEXP_ERROR_PREFIX "unrecognized character after (?", michael@0: REGEXP_ERROR_PREFIX "missing terminating ] for character class", michael@0: REGEXP_ERROR_PREFIX "character class out of order" michael@0: REGEXP_ERROR_PREFIX "range out of order in character class", michael@0: REGEXP_ERROR_PREFIX "\\ at end of pattern" michael@0: }; michael@0: #undef REGEXP_ERROR_PREFIX michael@0: michael@0: return errorMessages[code]; michael@0: } michael@0: michael@0: struct CharacterRange { michael@0: UChar begin; michael@0: UChar end; michael@0: michael@0: CharacterRange(UChar begin, UChar end) michael@0: : begin(begin) michael@0: , end(end) michael@0: { michael@0: } michael@0: }; michael@0: michael@0: struct CharacterClass { michael@0: WTF_MAKE_FAST_ALLOCATED; michael@0: public: michael@0: // All CharacterClass instances have to have the full set of matches and ranges, michael@0: // they may have an optional m_table for faster lookups (which must match the michael@0: // specified matches and ranges) michael@0: CharacterClass() michael@0: : m_table(0) michael@0: { michael@0: } michael@0: CharacterClass(const char* table, bool inverted) michael@0: : m_table(table) michael@0: , m_tableInverted(inverted) michael@0: { michael@0: } michael@0: Vector m_matches; michael@0: Vector m_ranges; michael@0: Vector m_matchesUnicode; michael@0: Vector m_rangesUnicode; michael@0: michael@0: const char* m_table; michael@0: bool m_tableInverted; michael@0: }; michael@0: michael@0: enum QuantifierType { michael@0: QuantifierFixedCount, michael@0: QuantifierGreedy, michael@0: QuantifierNonGreedy michael@0: }; michael@0: michael@0: struct PatternTerm { michael@0: enum Type { michael@0: TypeAssertionBOL, michael@0: TypeAssertionEOL, michael@0: TypeAssertionWordBoundary, michael@0: TypePatternCharacter, michael@0: TypeCharacterClass, michael@0: TypeBackReference, michael@0: TypeForwardReference, michael@0: TypeParenthesesSubpattern, michael@0: TypeParentheticalAssertion, michael@0: TypeDotStarEnclosure michael@0: } type; michael@0: bool m_capture :1; michael@0: bool m_invert :1; michael@0: union { michael@0: UChar patternCharacter; michael@0: CharacterClass* characterClass; michael@0: unsigned backReferenceSubpatternId; michael@0: struct { michael@0: PatternDisjunction* disjunction; michael@0: unsigned subpatternId; michael@0: unsigned lastSubpatternId; michael@0: bool isCopy; michael@0: bool isTerminal; michael@0: } parentheses; michael@0: struct { michael@0: bool bolAnchor : 1; michael@0: bool eolAnchor : 1; michael@0: } anchors; michael@0: }; michael@0: QuantifierType quantityType; michael@0: Checked quantityCount; michael@0: int inputPosition; michael@0: unsigned frameLocation; michael@0: michael@0: PatternTerm(UChar ch) michael@0: : type(PatternTerm::TypePatternCharacter) michael@0: , m_capture(false) michael@0: , m_invert(false) michael@0: { michael@0: patternCharacter = ch; michael@0: quantityType = QuantifierFixedCount; michael@0: quantityCount = 1; michael@0: } michael@0: michael@0: PatternTerm(CharacterClass* charClass, bool invert) michael@0: : type(PatternTerm::TypeCharacterClass) michael@0: , m_capture(false) michael@0: , m_invert(invert) michael@0: { michael@0: characterClass = charClass; michael@0: quantityType = QuantifierFixedCount; michael@0: quantityCount = 1; michael@0: } michael@0: michael@0: PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false) michael@0: : type(type) michael@0: , m_capture(capture) michael@0: , m_invert(invert) michael@0: { michael@0: parentheses.disjunction = disjunction; michael@0: parentheses.subpatternId = subpatternId; michael@0: parentheses.isCopy = false; michael@0: parentheses.isTerminal = false; michael@0: quantityType = QuantifierFixedCount; michael@0: quantityCount = 1; michael@0: } michael@0: michael@0: PatternTerm(Type type, bool invert = false) michael@0: : type(type) michael@0: , m_capture(false) michael@0: , m_invert(invert) michael@0: { michael@0: quantityType = QuantifierFixedCount; michael@0: quantityCount = 1; michael@0: } michael@0: michael@0: PatternTerm(unsigned spatternId) michael@0: : type(TypeBackReference) michael@0: , m_capture(false) michael@0: , m_invert(false) michael@0: { michael@0: backReferenceSubpatternId = spatternId; michael@0: quantityType = QuantifierFixedCount; michael@0: quantityCount = 1; michael@0: } michael@0: michael@0: PatternTerm(bool bolAnchor, bool eolAnchor) michael@0: : type(TypeDotStarEnclosure) michael@0: , m_capture(false) michael@0: , m_invert(false) michael@0: { michael@0: anchors.bolAnchor = bolAnchor; michael@0: anchors.eolAnchor = eolAnchor; michael@0: quantityType = QuantifierFixedCount; michael@0: quantityCount = 1; michael@0: } michael@0: michael@0: // No-argument constructor for js::Vector. michael@0: PatternTerm() michael@0: : type(PatternTerm::TypePatternCharacter) michael@0: , m_capture(false) michael@0: , m_invert(false) michael@0: { michael@0: patternCharacter = 0; michael@0: quantityType = QuantifierFixedCount; michael@0: quantityCount = 1; michael@0: } michael@0: michael@0: static PatternTerm ForwardReference() michael@0: { michael@0: return PatternTerm(TypeForwardReference); michael@0: } michael@0: michael@0: static PatternTerm BOL() michael@0: { michael@0: return PatternTerm(TypeAssertionBOL); michael@0: } michael@0: michael@0: static PatternTerm EOL() michael@0: { michael@0: return PatternTerm(TypeAssertionEOL); michael@0: } michael@0: michael@0: static PatternTerm WordBoundary(bool invert) michael@0: { michael@0: return PatternTerm(TypeAssertionWordBoundary, invert); michael@0: } michael@0: michael@0: bool invert() michael@0: { michael@0: return m_invert; michael@0: } michael@0: michael@0: bool capture() michael@0: { michael@0: return m_capture; michael@0: } michael@0: michael@0: void quantify(unsigned count, QuantifierType type) michael@0: { michael@0: quantityCount = count; michael@0: quantityType = type; michael@0: } michael@0: }; michael@0: michael@0: struct PatternAlternative { michael@0: WTF_MAKE_FAST_ALLOCATED; michael@0: public: michael@0: PatternAlternative(PatternDisjunction* disjunction) michael@0: : m_parent(disjunction) michael@0: , m_onceThrough(false) michael@0: , m_hasFixedSize(false) michael@0: , m_startsWithBOL(false) michael@0: , m_containsBOL(false) michael@0: { michael@0: } michael@0: michael@0: PatternTerm& lastTerm() michael@0: { michael@0: ASSERT(m_terms.size()); michael@0: return m_terms[m_terms.size() - 1]; michael@0: } michael@0: michael@0: void removeLastTerm() michael@0: { michael@0: ASSERT(m_terms.size()); michael@0: m_terms.shrink(m_terms.size() - 1); michael@0: } michael@0: michael@0: void setOnceThrough() michael@0: { michael@0: m_onceThrough = true; michael@0: } michael@0: michael@0: bool onceThrough() michael@0: { michael@0: return m_onceThrough; michael@0: } michael@0: michael@0: Vector m_terms; michael@0: PatternDisjunction* m_parent; michael@0: unsigned m_minimumSize; michael@0: bool m_onceThrough : 1; michael@0: bool m_hasFixedSize : 1; michael@0: bool m_startsWithBOL : 1; michael@0: bool m_containsBOL : 1; michael@0: }; michael@0: michael@0: struct PatternDisjunction { michael@0: WTF_MAKE_FAST_ALLOCATED; michael@0: public: michael@0: PatternDisjunction(PatternAlternative* parent = 0) michael@0: : m_parent(parent) michael@0: , m_hasFixedSize(false) michael@0: { michael@0: } michael@0: michael@0: ~PatternDisjunction() michael@0: { michael@0: deleteAllValues(m_alternatives); michael@0: } michael@0: michael@0: PatternAlternative* addNewAlternative() michael@0: { michael@0: PatternAlternative* alternative = newOrCrash(this); michael@0: m_alternatives.append(alternative); michael@0: return alternative; michael@0: } michael@0: michael@0: Vector m_alternatives; michael@0: PatternAlternative* m_parent; michael@0: unsigned m_minimumSize; michael@0: unsigned m_callFrameSize; michael@0: bool m_hasFixedSize; michael@0: }; michael@0: michael@0: // You probably don't want to be calling these functions directly michael@0: // (please to be calling newlineCharacterClass() et al on your michael@0: // friendly neighborhood YarrPattern instance to get nicely michael@0: // cached copies). michael@0: CharacterClass* newlineCreate(); michael@0: CharacterClass* digitsCreate(); michael@0: CharacterClass* spacesCreate(); michael@0: CharacterClass* wordcharCreate(); michael@0: CharacterClass* nondigitsCreate(); michael@0: CharacterClass* nonspacesCreate(); michael@0: CharacterClass* nonwordcharCreate(); michael@0: michael@0: struct TermChain { michael@0: TermChain(PatternTerm term) michael@0: : term(term) michael@0: {} michael@0: michael@0: PatternTerm term; michael@0: Vector hotTerms; michael@0: }; michael@0: michael@0: struct YarrPattern { michael@0: YarrPattern(const String& pattern, bool ignoreCase, bool multiline, ErrorCode* error); michael@0: michael@0: ~YarrPattern() michael@0: { michael@0: deleteAllValues(m_disjunctions); michael@0: deleteAllValues(m_userCharacterClasses); michael@0: } michael@0: michael@0: void reset() michael@0: { michael@0: m_numSubpatterns = 0; michael@0: m_maxBackReference = 0; michael@0: michael@0: m_containsBackreferences = false; michael@0: m_containsBOL = false; michael@0: michael@0: newlineCached = 0; michael@0: digitsCached = 0; michael@0: spacesCached = 0; michael@0: wordcharCached = 0; michael@0: nondigitsCached = 0; michael@0: nonspacesCached = 0; michael@0: nonwordcharCached = 0; michael@0: michael@0: deleteAllValues(m_disjunctions); michael@0: m_disjunctions.clear(); michael@0: deleteAllValues(m_userCharacterClasses); michael@0: m_userCharacterClasses.clear(); michael@0: } michael@0: michael@0: bool containsIllegalBackReference() michael@0: { michael@0: return m_maxBackReference > m_numSubpatterns; michael@0: } michael@0: michael@0: CharacterClass* newlineCharacterClass() michael@0: { michael@0: if (!newlineCached) michael@0: m_userCharacterClasses.append(newlineCached = newlineCreate()); michael@0: return newlineCached; michael@0: } michael@0: CharacterClass* digitsCharacterClass() michael@0: { michael@0: if (!digitsCached) michael@0: m_userCharacterClasses.append(digitsCached = digitsCreate()); michael@0: return digitsCached; michael@0: } michael@0: CharacterClass* spacesCharacterClass() michael@0: { michael@0: if (!spacesCached) michael@0: m_userCharacterClasses.append(spacesCached = spacesCreate()); michael@0: return spacesCached; michael@0: } michael@0: CharacterClass* wordcharCharacterClass() michael@0: { michael@0: if (!wordcharCached) michael@0: m_userCharacterClasses.append(wordcharCached = wordcharCreate()); michael@0: return wordcharCached; michael@0: } michael@0: CharacterClass* nondigitsCharacterClass() michael@0: { michael@0: if (!nondigitsCached) michael@0: m_userCharacterClasses.append(nondigitsCached = nondigitsCreate()); michael@0: return nondigitsCached; michael@0: } michael@0: CharacterClass* nonspacesCharacterClass() michael@0: { michael@0: if (!nonspacesCached) michael@0: m_userCharacterClasses.append(nonspacesCached = nonspacesCreate()); michael@0: return nonspacesCached; michael@0: } michael@0: CharacterClass* nonwordcharCharacterClass() michael@0: { michael@0: if (!nonwordcharCached) michael@0: m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate()); michael@0: return nonwordcharCached; michael@0: } michael@0: michael@0: bool m_ignoreCase : 1; michael@0: bool m_multiline : 1; michael@0: bool m_containsBackreferences : 1; michael@0: bool m_containsBOL : 1; michael@0: unsigned m_numSubpatterns; michael@0: unsigned m_maxBackReference; michael@0: PatternDisjunction* m_body; michael@0: Vector m_disjunctions; michael@0: Vector m_userCharacterClasses; michael@0: michael@0: private: michael@0: ErrorCode compile(const String& patternString); michael@0: michael@0: CharacterClass* newlineCached; michael@0: CharacterClass* digitsCached; michael@0: CharacterClass* spacesCached; michael@0: CharacterClass* wordcharCached; michael@0: CharacterClass* nondigitsCached; michael@0: CharacterClass* nonspacesCached; michael@0: CharacterClass* nonwordcharCached; michael@0: }; michael@0: michael@0: } } // namespace JSC::Yarr michael@0: michael@0: #endif /* yarr_YarrPattern_h */