1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/js/src/yarr/YarrPattern.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,463 @@ 1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- 1.5 + * vim: set ts=8 sts=4 et sw=4 tw=99: 1.6 + * 1.7 + * Copyright (C) 2009, 2013 Apple Inc. All rights reserved. 1.8 + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged 1.9 + * 1.10 + * Redistribution and use in source and binary forms, with or without 1.11 + * modification, are permitted provided that the following conditions 1.12 + * are met: 1.13 + * 1. Redistributions of source code must retain the above copyright 1.14 + * notice, this list of conditions and the following disclaimer. 1.15 + * 2. Redistributions in binary form must reproduce the above copyright 1.16 + * notice, this list of conditions and the following disclaimer in the 1.17 + * documentation and/or other materials provided with the distribution. 1.18 + * 1.19 + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 1.20 + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1.21 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 1.22 + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 1.23 + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 1.24 + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 1.25 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 1.26 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 1.27 + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 1.28 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 1.29 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 1.30 + */ 1.31 + 1.32 +#ifndef yarr_YarrPattern_h 1.33 +#define yarr_YarrPattern_h 1.34 + 1.35 +#include "yarr/wtfbridge.h" 1.36 +#include "yarr/ASCIICType.h" 1.37 + 1.38 +namespace JSC { namespace Yarr { 1.39 + 1.40 +struct PatternDisjunction; 1.41 + 1.42 +enum ErrorCode { 1.43 + NoError, 1.44 + PatternTooLarge, 1.45 + QuantifierOutOfOrder, 1.46 + QuantifierWithoutAtom, 1.47 + QuantifierTooLarge, 1.48 + MissingParentheses, 1.49 + ParenthesesUnmatched, 1.50 + ParenthesesTypeInvalid, 1.51 + CharacterClassUnmatched, 1.52 + CharacterClassOutOfOrder, 1.53 + CharacterClassInvalidRange, 1.54 + EscapeUnterminated, 1.55 + RuntimeError, 1.56 + NumberOfErrorCodes 1.57 +}; 1.58 + 1.59 +static inline const char* errorMessage(ErrorCode code) 1.60 +{ 1.61 + 1.62 +#define REGEXP_ERROR_PREFIX "Invalid regular expression: " 1.63 + // The order of this array must match the ErrorCode enum. 1.64 + static const char* errorMessages[NumberOfErrorCodes] = { 1.65 + 0, // NoError 1.66 + REGEXP_ERROR_PREFIX "regular expression too large", 1.67 + REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", 1.68 + REGEXP_ERROR_PREFIX "nothing to repeat", 1.69 + REGEXP_ERROR_PREFIX "number too large in {} quantifier", 1.70 + REGEXP_ERROR_PREFIX "missing )", 1.71 + REGEXP_ERROR_PREFIX "unmatched parentheses", 1.72 + REGEXP_ERROR_PREFIX "unrecognized character after (?", 1.73 + REGEXP_ERROR_PREFIX "missing terminating ] for character class", 1.74 + REGEXP_ERROR_PREFIX "character class out of order" 1.75 + REGEXP_ERROR_PREFIX "range out of order in character class", 1.76 + REGEXP_ERROR_PREFIX "\\ at end of pattern" 1.77 + }; 1.78 +#undef REGEXP_ERROR_PREFIX 1.79 + 1.80 + return errorMessages[code]; 1.81 +} 1.82 + 1.83 +struct CharacterRange { 1.84 + UChar begin; 1.85 + UChar end; 1.86 + 1.87 + CharacterRange(UChar begin, UChar end) 1.88 + : begin(begin) 1.89 + , end(end) 1.90 + { 1.91 + } 1.92 +}; 1.93 + 1.94 +struct CharacterClass { 1.95 + WTF_MAKE_FAST_ALLOCATED; 1.96 +public: 1.97 + // All CharacterClass instances have to have the full set of matches and ranges, 1.98 + // they may have an optional m_table for faster lookups (which must match the 1.99 + // specified matches and ranges) 1.100 + CharacterClass() 1.101 + : m_table(0) 1.102 + { 1.103 + } 1.104 + CharacterClass(const char* table, bool inverted) 1.105 + : m_table(table) 1.106 + , m_tableInverted(inverted) 1.107 + { 1.108 + } 1.109 + Vector<UChar> m_matches; 1.110 + Vector<CharacterRange> m_ranges; 1.111 + Vector<UChar> m_matchesUnicode; 1.112 + Vector<CharacterRange> m_rangesUnicode; 1.113 + 1.114 + const char* m_table; 1.115 + bool m_tableInverted; 1.116 +}; 1.117 + 1.118 +enum QuantifierType { 1.119 + QuantifierFixedCount, 1.120 + QuantifierGreedy, 1.121 + QuantifierNonGreedy 1.122 +}; 1.123 + 1.124 +struct PatternTerm { 1.125 + enum Type { 1.126 + TypeAssertionBOL, 1.127 + TypeAssertionEOL, 1.128 + TypeAssertionWordBoundary, 1.129 + TypePatternCharacter, 1.130 + TypeCharacterClass, 1.131 + TypeBackReference, 1.132 + TypeForwardReference, 1.133 + TypeParenthesesSubpattern, 1.134 + TypeParentheticalAssertion, 1.135 + TypeDotStarEnclosure 1.136 + } type; 1.137 + bool m_capture :1; 1.138 + bool m_invert :1; 1.139 + union { 1.140 + UChar patternCharacter; 1.141 + CharacterClass* characterClass; 1.142 + unsigned backReferenceSubpatternId; 1.143 + struct { 1.144 + PatternDisjunction* disjunction; 1.145 + unsigned subpatternId; 1.146 + unsigned lastSubpatternId; 1.147 + bool isCopy; 1.148 + bool isTerminal; 1.149 + } parentheses; 1.150 + struct { 1.151 + bool bolAnchor : 1; 1.152 + bool eolAnchor : 1; 1.153 + } anchors; 1.154 + }; 1.155 + QuantifierType quantityType; 1.156 + Checked<unsigned> quantityCount; 1.157 + int inputPosition; 1.158 + unsigned frameLocation; 1.159 + 1.160 + PatternTerm(UChar ch) 1.161 + : type(PatternTerm::TypePatternCharacter) 1.162 + , m_capture(false) 1.163 + , m_invert(false) 1.164 + { 1.165 + patternCharacter = ch; 1.166 + quantityType = QuantifierFixedCount; 1.167 + quantityCount = 1; 1.168 + } 1.169 + 1.170 + PatternTerm(CharacterClass* charClass, bool invert) 1.171 + : type(PatternTerm::TypeCharacterClass) 1.172 + , m_capture(false) 1.173 + , m_invert(invert) 1.174 + { 1.175 + characterClass = charClass; 1.176 + quantityType = QuantifierFixedCount; 1.177 + quantityCount = 1; 1.178 + } 1.179 + 1.180 + PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false) 1.181 + : type(type) 1.182 + , m_capture(capture) 1.183 + , m_invert(invert) 1.184 + { 1.185 + parentheses.disjunction = disjunction; 1.186 + parentheses.subpatternId = subpatternId; 1.187 + parentheses.isCopy = false; 1.188 + parentheses.isTerminal = false; 1.189 + quantityType = QuantifierFixedCount; 1.190 + quantityCount = 1; 1.191 + } 1.192 + 1.193 + PatternTerm(Type type, bool invert = false) 1.194 + : type(type) 1.195 + , m_capture(false) 1.196 + , m_invert(invert) 1.197 + { 1.198 + quantityType = QuantifierFixedCount; 1.199 + quantityCount = 1; 1.200 + } 1.201 + 1.202 + PatternTerm(unsigned spatternId) 1.203 + : type(TypeBackReference) 1.204 + , m_capture(false) 1.205 + , m_invert(false) 1.206 + { 1.207 + backReferenceSubpatternId = spatternId; 1.208 + quantityType = QuantifierFixedCount; 1.209 + quantityCount = 1; 1.210 + } 1.211 + 1.212 + PatternTerm(bool bolAnchor, bool eolAnchor) 1.213 + : type(TypeDotStarEnclosure) 1.214 + , m_capture(false) 1.215 + , m_invert(false) 1.216 + { 1.217 + anchors.bolAnchor = bolAnchor; 1.218 + anchors.eolAnchor = eolAnchor; 1.219 + quantityType = QuantifierFixedCount; 1.220 + quantityCount = 1; 1.221 + } 1.222 + 1.223 + // No-argument constructor for js::Vector. 1.224 + PatternTerm() 1.225 + : type(PatternTerm::TypePatternCharacter) 1.226 + , m_capture(false) 1.227 + , m_invert(false) 1.228 + { 1.229 + patternCharacter = 0; 1.230 + quantityType = QuantifierFixedCount; 1.231 + quantityCount = 1; 1.232 + } 1.233 + 1.234 + static PatternTerm ForwardReference() 1.235 + { 1.236 + return PatternTerm(TypeForwardReference); 1.237 + } 1.238 + 1.239 + static PatternTerm BOL() 1.240 + { 1.241 + return PatternTerm(TypeAssertionBOL); 1.242 + } 1.243 + 1.244 + static PatternTerm EOL() 1.245 + { 1.246 + return PatternTerm(TypeAssertionEOL); 1.247 + } 1.248 + 1.249 + static PatternTerm WordBoundary(bool invert) 1.250 + { 1.251 + return PatternTerm(TypeAssertionWordBoundary, invert); 1.252 + } 1.253 + 1.254 + bool invert() 1.255 + { 1.256 + return m_invert; 1.257 + } 1.258 + 1.259 + bool capture() 1.260 + { 1.261 + return m_capture; 1.262 + } 1.263 + 1.264 + void quantify(unsigned count, QuantifierType type) 1.265 + { 1.266 + quantityCount = count; 1.267 + quantityType = type; 1.268 + } 1.269 +}; 1.270 + 1.271 +struct PatternAlternative { 1.272 + WTF_MAKE_FAST_ALLOCATED; 1.273 +public: 1.274 + PatternAlternative(PatternDisjunction* disjunction) 1.275 + : m_parent(disjunction) 1.276 + , m_onceThrough(false) 1.277 + , m_hasFixedSize(false) 1.278 + , m_startsWithBOL(false) 1.279 + , m_containsBOL(false) 1.280 + { 1.281 + } 1.282 + 1.283 + PatternTerm& lastTerm() 1.284 + { 1.285 + ASSERT(m_terms.size()); 1.286 + return m_terms[m_terms.size() - 1]; 1.287 + } 1.288 + 1.289 + void removeLastTerm() 1.290 + { 1.291 + ASSERT(m_terms.size()); 1.292 + m_terms.shrink(m_terms.size() - 1); 1.293 + } 1.294 + 1.295 + void setOnceThrough() 1.296 + { 1.297 + m_onceThrough = true; 1.298 + } 1.299 + 1.300 + bool onceThrough() 1.301 + { 1.302 + return m_onceThrough; 1.303 + } 1.304 + 1.305 + Vector<PatternTerm> m_terms; 1.306 + PatternDisjunction* m_parent; 1.307 + unsigned m_minimumSize; 1.308 + bool m_onceThrough : 1; 1.309 + bool m_hasFixedSize : 1; 1.310 + bool m_startsWithBOL : 1; 1.311 + bool m_containsBOL : 1; 1.312 +}; 1.313 + 1.314 +struct PatternDisjunction { 1.315 + WTF_MAKE_FAST_ALLOCATED; 1.316 +public: 1.317 + PatternDisjunction(PatternAlternative* parent = 0) 1.318 + : m_parent(parent) 1.319 + , m_hasFixedSize(false) 1.320 + { 1.321 + } 1.322 + 1.323 + ~PatternDisjunction() 1.324 + { 1.325 + deleteAllValues(m_alternatives); 1.326 + } 1.327 + 1.328 + PatternAlternative* addNewAlternative() 1.329 + { 1.330 + PatternAlternative* alternative = newOrCrash<PatternAlternative>(this); 1.331 + m_alternatives.append(alternative); 1.332 + return alternative; 1.333 + } 1.334 + 1.335 + Vector<PatternAlternative*> m_alternatives; 1.336 + PatternAlternative* m_parent; 1.337 + unsigned m_minimumSize; 1.338 + unsigned m_callFrameSize; 1.339 + bool m_hasFixedSize; 1.340 +}; 1.341 + 1.342 +// You probably don't want to be calling these functions directly 1.343 +// (please to be calling newlineCharacterClass() et al on your 1.344 +// friendly neighborhood YarrPattern instance to get nicely 1.345 +// cached copies). 1.346 +CharacterClass* newlineCreate(); 1.347 +CharacterClass* digitsCreate(); 1.348 +CharacterClass* spacesCreate(); 1.349 +CharacterClass* wordcharCreate(); 1.350 +CharacterClass* nondigitsCreate(); 1.351 +CharacterClass* nonspacesCreate(); 1.352 +CharacterClass* nonwordcharCreate(); 1.353 + 1.354 +struct TermChain { 1.355 + TermChain(PatternTerm term) 1.356 + : term(term) 1.357 + {} 1.358 + 1.359 + PatternTerm term; 1.360 + Vector<TermChain> hotTerms; 1.361 +}; 1.362 + 1.363 +struct YarrPattern { 1.364 + YarrPattern(const String& pattern, bool ignoreCase, bool multiline, ErrorCode* error); 1.365 + 1.366 + ~YarrPattern() 1.367 + { 1.368 + deleteAllValues(m_disjunctions); 1.369 + deleteAllValues(m_userCharacterClasses); 1.370 + } 1.371 + 1.372 + void reset() 1.373 + { 1.374 + m_numSubpatterns = 0; 1.375 + m_maxBackReference = 0; 1.376 + 1.377 + m_containsBackreferences = false; 1.378 + m_containsBOL = false; 1.379 + 1.380 + newlineCached = 0; 1.381 + digitsCached = 0; 1.382 + spacesCached = 0; 1.383 + wordcharCached = 0; 1.384 + nondigitsCached = 0; 1.385 + nonspacesCached = 0; 1.386 + nonwordcharCached = 0; 1.387 + 1.388 + deleteAllValues(m_disjunctions); 1.389 + m_disjunctions.clear(); 1.390 + deleteAllValues(m_userCharacterClasses); 1.391 + m_userCharacterClasses.clear(); 1.392 + } 1.393 + 1.394 + bool containsIllegalBackReference() 1.395 + { 1.396 + return m_maxBackReference > m_numSubpatterns; 1.397 + } 1.398 + 1.399 + CharacterClass* newlineCharacterClass() 1.400 + { 1.401 + if (!newlineCached) 1.402 + m_userCharacterClasses.append(newlineCached = newlineCreate()); 1.403 + return newlineCached; 1.404 + } 1.405 + CharacterClass* digitsCharacterClass() 1.406 + { 1.407 + if (!digitsCached) 1.408 + m_userCharacterClasses.append(digitsCached = digitsCreate()); 1.409 + return digitsCached; 1.410 + } 1.411 + CharacterClass* spacesCharacterClass() 1.412 + { 1.413 + if (!spacesCached) 1.414 + m_userCharacterClasses.append(spacesCached = spacesCreate()); 1.415 + return spacesCached; 1.416 + } 1.417 + CharacterClass* wordcharCharacterClass() 1.418 + { 1.419 + if (!wordcharCached) 1.420 + m_userCharacterClasses.append(wordcharCached = wordcharCreate()); 1.421 + return wordcharCached; 1.422 + } 1.423 + CharacterClass* nondigitsCharacterClass() 1.424 + { 1.425 + if (!nondigitsCached) 1.426 + m_userCharacterClasses.append(nondigitsCached = nondigitsCreate()); 1.427 + return nondigitsCached; 1.428 + } 1.429 + CharacterClass* nonspacesCharacterClass() 1.430 + { 1.431 + if (!nonspacesCached) 1.432 + m_userCharacterClasses.append(nonspacesCached = nonspacesCreate()); 1.433 + return nonspacesCached; 1.434 + } 1.435 + CharacterClass* nonwordcharCharacterClass() 1.436 + { 1.437 + if (!nonwordcharCached) 1.438 + m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate()); 1.439 + return nonwordcharCached; 1.440 + } 1.441 + 1.442 + bool m_ignoreCase : 1; 1.443 + bool m_multiline : 1; 1.444 + bool m_containsBackreferences : 1; 1.445 + bool m_containsBOL : 1; 1.446 + unsigned m_numSubpatterns; 1.447 + unsigned m_maxBackReference; 1.448 + PatternDisjunction* m_body; 1.449 + Vector<PatternDisjunction*, 4> m_disjunctions; 1.450 + Vector<CharacterClass*> m_userCharacterClasses; 1.451 + 1.452 +private: 1.453 + ErrorCode compile(const String& patternString); 1.454 + 1.455 + CharacterClass* newlineCached; 1.456 + CharacterClass* digitsCached; 1.457 + CharacterClass* spacesCached; 1.458 + CharacterClass* wordcharCached; 1.459 + CharacterClass* nondigitsCached; 1.460 + CharacterClass* nonspacesCached; 1.461 + CharacterClass* nonwordcharCached; 1.462 +}; 1.463 + 1.464 +} } // namespace JSC::Yarr 1.465 + 1.466 +#endif /* yarr_YarrPattern_h */