1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/js/src/yarr/YarrPattern.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,933 @@ 1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- 1.5 + * vim: set ts=8 sts=4 et sw=4 tw=99: 1.6 + * 1.7 + * Copyright (C) 2009 Apple Inc. All rights reserved. 1.8 + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged 1.9 + * 1.10 + * Redistribution and use in source and binary forms, with or without 1.11 + * modification, are permitted provided that the following conditions 1.12 + * are met: 1.13 + * 1. Redistributions of source code must retain the above copyright 1.14 + * notice, this list of conditions and the following disclaimer. 1.15 + * 2. Redistributions in binary form must reproduce the above copyright 1.16 + * notice, this list of conditions and the following disclaimer in the 1.17 + * documentation and/or other materials provided with the distribution. 1.18 + * 1.19 + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 1.20 + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1.21 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 1.22 + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 1.23 + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 1.24 + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 1.25 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 1.26 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 1.27 + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 1.28 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 1.29 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 1.30 + */ 1.31 + 1.32 +#include "yarr/YarrPattern.h" 1.33 + 1.34 +#include "yarr/Yarr.h" 1.35 +#include "yarr/YarrCanonicalizeUCS2.h" 1.36 +#include "yarr/YarrParser.h" 1.37 + 1.38 +using namespace WTF; 1.39 + 1.40 +namespace JSC { namespace Yarr { 1.41 + 1.42 +#include "yarr/RegExpJitTables.h" 1.43 + 1.44 +#if WTF_CPU_SPARC 1.45 +# define BASE_FRAME_SIZE 24 1.46 +#else 1.47 +# define BASE_FRAME_SIZE 0 1.48 +#endif 1.49 + 1.50 +// Thanks, windows.h! 1.51 +#undef min 1.52 +#undef max 1.53 + 1.54 +class CharacterClassConstructor { 1.55 +public: 1.56 + CharacterClassConstructor(bool isCaseInsensitive = false) 1.57 + : m_isCaseInsensitive(isCaseInsensitive) 1.58 + { 1.59 + } 1.60 + 1.61 + void reset() 1.62 + { 1.63 + m_matches.clear(); 1.64 + m_ranges.clear(); 1.65 + m_matchesUnicode.clear(); 1.66 + m_rangesUnicode.clear(); 1.67 + } 1.68 + 1.69 + void append(const CharacterClass* other) 1.70 + { 1.71 + for (size_t i = 0; i < other->m_matches.size(); ++i) 1.72 + addSorted(m_matches, other->m_matches[i]); 1.73 + for (size_t i = 0; i < other->m_ranges.size(); ++i) 1.74 + addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end); 1.75 + for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i) 1.76 + addSorted(m_matchesUnicode, other->m_matchesUnicode[i]); 1.77 + for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i) 1.78 + addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end); 1.79 + } 1.80 + 1.81 + void putChar(UChar ch) 1.82 + { 1.83 + // Handle ascii cases. 1.84 + if (ch <= 0x7f) { 1.85 + if (m_isCaseInsensitive && isASCIIAlpha(ch)) { 1.86 + addSorted(m_matches, toASCIIUpper(ch)); 1.87 + addSorted(m_matches, toASCIILower(ch)); 1.88 + } else 1.89 + addSorted(m_matches, ch); 1.90 + return; 1.91 + } 1.92 + 1.93 + // Simple case, not a case-insensitive match. 1.94 + if (!m_isCaseInsensitive) { 1.95 + addSorted(m_matchesUnicode, ch); 1.96 + return; 1.97 + } 1.98 + 1.99 + // Add multiple matches, if necessary. 1.100 + const UCS2CanonicalizationRange* info = rangeInfoFor(ch); 1.101 + if (info->type == CanonicalizeUnique) 1.102 + addSorted(m_matchesUnicode, ch); 1.103 + else 1.104 + putUnicodeIgnoreCase(ch, info); 1.105 + } 1.106 + 1.107 + void putUnicodeIgnoreCase(UChar ch, const UCS2CanonicalizationRange* info) 1.108 + { 1.109 + ASSERT(m_isCaseInsensitive); 1.110 + ASSERT(ch > 0x7f); 1.111 + ASSERT(ch >= info->begin && ch <= info->end); 1.112 + ASSERT(info->type != CanonicalizeUnique); 1.113 + if (info->type == CanonicalizeSet) { 1.114 + for (const uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) 1.115 + addSorted(m_matchesUnicode, ch); 1.116 + } else { 1.117 + addSorted(m_matchesUnicode, ch); 1.118 + addSorted(m_matchesUnicode, getCanonicalPair(info, ch)); 1.119 + } 1.120 + } 1.121 + 1.122 + void putRange(UChar lo, UChar hi) 1.123 + { 1.124 + if (lo <= 0x7f) { 1.125 + char asciiLo = lo; 1.126 + char asciiHi = std::min(hi, (UChar)0x7f); 1.127 + addSortedRange(m_ranges, lo, asciiHi); 1.128 + 1.129 + if (m_isCaseInsensitive) { 1.130 + if ((asciiLo <= 'Z') && (asciiHi >= 'A')) 1.131 + addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); 1.132 + if ((asciiLo <= 'z') && (asciiHi >= 'a')) 1.133 + addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); 1.134 + } 1.135 + } 1.136 + if (hi <= 0x7f) 1.137 + return; 1.138 + 1.139 + lo = std::max(lo, (UChar)0x80); 1.140 + addSortedRange(m_rangesUnicode, lo, hi); 1.141 + 1.142 + if (!m_isCaseInsensitive) 1.143 + return; 1.144 + 1.145 + const UCS2CanonicalizationRange* info = rangeInfoFor(lo); 1.146 + while (true) { 1.147 + // Handle the range [lo .. end] 1.148 + UChar end = std::min<UChar>(info->end, hi); 1.149 + 1.150 + switch (info->type) { 1.151 + case CanonicalizeUnique: 1.152 + // Nothing to do - no canonical equivalents. 1.153 + break; 1.154 + case CanonicalizeSet: { 1.155 + UChar ch; 1.156 + for (const uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) 1.157 + addSorted(m_matchesUnicode, ch); 1.158 + break; 1.159 + } 1.160 + case CanonicalizeRangeLo: 1.161 + addSortedRange(m_rangesUnicode, lo + info->value, end + info->value); 1.162 + break; 1.163 + case CanonicalizeRangeHi: 1.164 + addSortedRange(m_rangesUnicode, lo - info->value, end - info->value); 1.165 + break; 1.166 + case CanonicalizeAlternatingAligned: 1.167 + // Use addSortedRange since there is likely an abutting range to combine with. 1.168 + if (lo & 1) 1.169 + addSortedRange(m_rangesUnicode, lo - 1, lo - 1); 1.170 + if (!(end & 1)) 1.171 + addSortedRange(m_rangesUnicode, end + 1, end + 1); 1.172 + break; 1.173 + case CanonicalizeAlternatingUnaligned: 1.174 + // Use addSortedRange since there is likely an abutting range to combine with. 1.175 + if (!(lo & 1)) 1.176 + addSortedRange(m_rangesUnicode, lo - 1, lo - 1); 1.177 + if (end & 1) 1.178 + addSortedRange(m_rangesUnicode, end + 1, end + 1); 1.179 + break; 1.180 + } 1.181 + 1.182 + if (hi == end) 1.183 + return; 1.184 + 1.185 + ++info; 1.186 + lo = info->begin; 1.187 + }; 1.188 + 1.189 + } 1.190 + 1.191 + CharacterClass* charClass() 1.192 + { 1.193 + CharacterClass* characterClass = newOrCrash<CharacterClass>(); 1.194 + 1.195 + characterClass->m_matches.swap(m_matches); 1.196 + characterClass->m_ranges.swap(m_ranges); 1.197 + characterClass->m_matchesUnicode.swap(m_matchesUnicode); 1.198 + characterClass->m_rangesUnicode.swap(m_rangesUnicode); 1.199 + 1.200 + return characterClass; 1.201 + } 1.202 + 1.203 +private: 1.204 + void addSorted(Vector<UChar>& matches, UChar ch) 1.205 + { 1.206 + unsigned pos = 0; 1.207 + unsigned range = matches.size(); 1.208 + 1.209 + // binary chop, find position to insert char. 1.210 + while (range) { 1.211 + unsigned index = range >> 1; 1.212 + 1.213 + int val = matches[pos+index] - ch; 1.214 + if (!val) 1.215 + return; 1.216 + else if (val > 0) 1.217 + range = index; 1.218 + else { 1.219 + pos += (index+1); 1.220 + range -= (index+1); 1.221 + } 1.222 + } 1.223 + 1.224 + if (pos == matches.size()) 1.225 + matches.append(ch); 1.226 + else 1.227 + matches.insert(pos, ch); 1.228 + } 1.229 + 1.230 + void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) 1.231 + { 1.232 + unsigned end = ranges.size(); 1.233 + 1.234 + // Simple linear scan - I doubt there are that many ranges anyway... 1.235 + // feel free to fix this with something faster (eg binary chop). 1.236 + for (unsigned i = 0; i < end; ++i) { 1.237 + // does the new range fall before the current position in the array 1.238 + if (hi < ranges[i].begin) { 1.239 + // optional optimization: concatenate appending ranges? - may not be worthwhile. 1.240 + if (hi == (ranges[i].begin - 1)) { 1.241 + ranges[i].begin = lo; 1.242 + return; 1.243 + } 1.244 + ranges.insert(i, CharacterRange(lo, hi)); 1.245 + return; 1.246 + } 1.247 + // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining 1.248 + // If the new range start at or before the end of the last range, then the overlap (if it starts one after the 1.249 + // end of the last range they concatenate, which is just as good. 1.250 + if (lo <= (ranges[i].end + 1)) { 1.251 + // found an intersect! we'll replace this entry in the array. 1.252 + ranges[i].begin = std::min(ranges[i].begin, lo); 1.253 + ranges[i].end = std::max(ranges[i].end, hi); 1.254 + 1.255 + // now check if the new range can subsume any subsequent ranges. 1.256 + unsigned next = i+1; 1.257 + // each iteration of the loop we will either remove something from the list, or break the loop. 1.258 + while (next < ranges.size()) { 1.259 + if (ranges[next].begin <= (ranges[i].end + 1)) { 1.260 + // the next entry now overlaps / concatenates this one. 1.261 + ranges[i].end = std::max(ranges[i].end, ranges[next].end); 1.262 + ranges.remove(next); 1.263 + } else 1.264 + break; 1.265 + } 1.266 + 1.267 + return; 1.268 + } 1.269 + } 1.270 + 1.271 + // CharacterRange comes after all existing ranges. 1.272 + ranges.append(CharacterRange(lo, hi)); 1.273 + } 1.274 + 1.275 + bool m_isCaseInsensitive; 1.276 + 1.277 + Vector<UChar> m_matches; 1.278 + Vector<CharacterRange> m_ranges; 1.279 + Vector<UChar> m_matchesUnicode; 1.280 + Vector<CharacterRange> m_rangesUnicode; 1.281 +}; 1.282 + 1.283 +class YarrPatternConstructor { 1.284 +public: 1.285 + YarrPatternConstructor(YarrPattern& pattern) 1.286 + : m_pattern(pattern) 1.287 + , m_stackBase(nullptr) 1.288 + , m_characterClassConstructor(pattern.m_ignoreCase) 1.289 + , m_invertParentheticalAssertion(false) 1.290 + { 1.291 + m_pattern.m_body = newOrCrash<PatternDisjunction>(); 1.292 + m_alternative = m_pattern.m_body->addNewAlternative(); 1.293 + m_pattern.m_disjunctions.append(m_pattern.m_body); 1.294 + } 1.295 + 1.296 + ~YarrPatternConstructor() 1.297 + { 1.298 + } 1.299 + 1.300 + void reset() 1.301 + { 1.302 + m_pattern.reset(); 1.303 + m_characterClassConstructor.reset(); 1.304 + 1.305 + m_pattern.m_body = newOrCrash<PatternDisjunction>(); 1.306 + m_alternative = m_pattern.m_body->addNewAlternative(); 1.307 + m_pattern.m_disjunctions.append(m_pattern.m_body); 1.308 + } 1.309 + 1.310 + void assertionBOL() 1.311 + { 1.312 + if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) { 1.313 + m_alternative->m_startsWithBOL = true; 1.314 + m_alternative->m_containsBOL = true; 1.315 + m_pattern.m_containsBOL = true; 1.316 + } 1.317 + m_alternative->m_terms.append(PatternTerm::BOL()); 1.318 + } 1.319 + void assertionEOL() 1.320 + { 1.321 + m_alternative->m_terms.append(PatternTerm::EOL()); 1.322 + } 1.323 + void assertionWordBoundary(bool invert) 1.324 + { 1.325 + m_alternative->m_terms.append(PatternTerm::WordBoundary(invert)); 1.326 + } 1.327 + 1.328 + void atomPatternCharacter(UChar ch) 1.329 + { 1.330 + // We handle case-insensitive checking of unicode characters which do have both 1.331 + // cases by handling them as if they were defined using a CharacterClass. 1.332 + if (!m_pattern.m_ignoreCase || isASCII(ch)) { 1.333 + m_alternative->m_terms.append(PatternTerm(ch)); 1.334 + return; 1.335 + } 1.336 + 1.337 + const UCS2CanonicalizationRange* info = rangeInfoFor(ch); 1.338 + if (info->type == CanonicalizeUnique) { 1.339 + m_alternative->m_terms.append(PatternTerm(ch)); 1.340 + return; 1.341 + } 1.342 + 1.343 + m_characterClassConstructor.putUnicodeIgnoreCase(ch, info); 1.344 + CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); 1.345 + m_pattern.m_userCharacterClasses.append(newCharacterClass); 1.346 + m_alternative->m_terms.append(PatternTerm(newCharacterClass, false)); 1.347 + } 1.348 + 1.349 + void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) 1.350 + { 1.351 + switch (classID) { 1.352 + case DigitClassID: 1.353 + m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert)); 1.354 + break; 1.355 + case SpaceClassID: 1.356 + m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert)); 1.357 + break; 1.358 + case WordClassID: 1.359 + m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); 1.360 + break; 1.361 + case NewlineClassID: 1.362 + m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert)); 1.363 + break; 1.364 + } 1.365 + } 1.366 + 1.367 + void atomCharacterClassBegin(bool invert = false) 1.368 + { 1.369 + m_invertCharacterClass = invert; 1.370 + } 1.371 + 1.372 + void atomCharacterClassAtom(UChar ch) 1.373 + { 1.374 + m_characterClassConstructor.putChar(ch); 1.375 + } 1.376 + 1.377 + void atomCharacterClassRange(UChar begin, UChar end) 1.378 + { 1.379 + m_characterClassConstructor.putRange(begin, end); 1.380 + } 1.381 + 1.382 + void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) 1.383 + { 1.384 + ASSERT(classID != NewlineClassID); 1.385 + 1.386 + switch (classID) { 1.387 + case DigitClassID: 1.388 + m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); 1.389 + break; 1.390 + 1.391 + case SpaceClassID: 1.392 + m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); 1.393 + break; 1.394 + 1.395 + case WordClassID: 1.396 + m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); 1.397 + break; 1.398 + 1.399 + default: 1.400 + ASSERT_NOT_REACHED(); 1.401 + } 1.402 + } 1.403 + 1.404 + void atomCharacterClassEnd() 1.405 + { 1.406 + CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); 1.407 + m_pattern.m_userCharacterClasses.append(newCharacterClass); 1.408 + m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass)); 1.409 + } 1.410 + 1.411 + void atomParenthesesSubpatternBegin(bool capture = true) 1.412 + { 1.413 + unsigned subpatternId = m_pattern.m_numSubpatterns + 1; 1.414 + if (capture) 1.415 + m_pattern.m_numSubpatterns++; 1.416 + 1.417 + PatternDisjunction* parenthesesDisjunction = newOrCrash<PatternDisjunction>(m_alternative); 1.418 + m_pattern.m_disjunctions.append(parenthesesDisjunction); 1.419 + m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, false)); 1.420 + m_alternative = parenthesesDisjunction->addNewAlternative(); 1.421 + } 1.422 + 1.423 + void atomParentheticalAssertionBegin(bool invert = false) 1.424 + { 1.425 + PatternDisjunction* parenthesesDisjunction = newOrCrash<PatternDisjunction>(m_alternative); 1.426 + m_pattern.m_disjunctions.append(parenthesesDisjunction); 1.427 + m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, false, invert)); 1.428 + m_alternative = parenthesesDisjunction->addNewAlternative(); 1.429 + m_invertParentheticalAssertion = invert; 1.430 + } 1.431 + 1.432 + void atomParenthesesEnd() 1.433 + { 1.434 + ASSERT(m_alternative->m_parent); 1.435 + ASSERT(m_alternative->m_parent->m_parent); 1.436 + 1.437 + PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent; 1.438 + m_alternative = m_alternative->m_parent->m_parent; 1.439 + 1.440 + PatternTerm& lastTerm = m_alternative->lastTerm(); 1.441 + 1.442 + unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size(); 1.443 + unsigned numBOLAnchoredAlts = 0; 1.444 + 1.445 + for (unsigned i = 0; i < numParenAlternatives; i++) { 1.446 + // Bubble up BOL flags 1.447 + if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL) 1.448 + numBOLAnchoredAlts++; 1.449 + } 1.450 + 1.451 + if (numBOLAnchoredAlts) { 1.452 + m_alternative->m_containsBOL = true; 1.453 + // If all the alternatives in parens start with BOL, then so does this one 1.454 + if (numBOLAnchoredAlts == numParenAlternatives) 1.455 + m_alternative->m_startsWithBOL = true; 1.456 + } 1.457 + 1.458 + lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; 1.459 + m_invertParentheticalAssertion = false; 1.460 + } 1.461 + 1.462 + void atomBackReference(unsigned subpatternId) 1.463 + { 1.464 + ASSERT(subpatternId); 1.465 + m_pattern.m_containsBackreferences = true; 1.466 + m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId); 1.467 + 1.468 + if (subpatternId > m_pattern.m_numSubpatterns) { 1.469 + m_alternative->m_terms.append(PatternTerm::ForwardReference()); 1.470 + return; 1.471 + } 1.472 + 1.473 + PatternAlternative* currentAlternative = m_alternative; 1.474 + ASSERT(currentAlternative); 1.475 + 1.476 + // Note to self: if we waited until the AST was baked, we could also remove forwards refs 1.477 + while ((currentAlternative = currentAlternative->m_parent->m_parent)) { 1.478 + PatternTerm& term = currentAlternative->lastTerm(); 1.479 + ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); 1.480 + 1.481 + if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) { 1.482 + m_alternative->m_terms.append(PatternTerm::ForwardReference()); 1.483 + return; 1.484 + } 1.485 + } 1.486 + 1.487 + m_alternative->m_terms.append(PatternTerm(subpatternId)); 1.488 + } 1.489 + 1.490 + // deep copy the argument disjunction. If filterStartsWithBOL is true, 1.491 + // skip alternatives with m_startsWithBOL set true. 1.492 + PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false) 1.493 + { 1.494 + PatternDisjunction* newDisjunction = 0; 1.495 + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { 1.496 + PatternAlternative* alternative = disjunction->m_alternatives[alt]; 1.497 + if (!filterStartsWithBOL || !alternative->m_startsWithBOL) { 1.498 + if (!newDisjunction) { 1.499 + newDisjunction = newOrCrash<PatternDisjunction>(); 1.500 + newDisjunction->m_parent = disjunction->m_parent; 1.501 + } 1.502 + PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); 1.503 + newAlternative->m_terms.reserve(alternative->m_terms.size()); 1.504 + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) 1.505 + newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL)); 1.506 + } 1.507 + } 1.508 + 1.509 + if (newDisjunction) 1.510 + m_pattern.m_disjunctions.append(newDisjunction); 1.511 + return newDisjunction; 1.512 + } 1.513 + 1.514 + PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false) 1.515 + { 1.516 + if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) 1.517 + return PatternTerm(term); 1.518 + 1.519 + PatternTerm termCopy = term; 1.520 + termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL); 1.521 + return termCopy; 1.522 + } 1.523 + 1.524 + void quantifyAtom(unsigned min, unsigned max, bool greedy) 1.525 + { 1.526 + ASSERT(min <= max); 1.527 + ASSERT(m_alternative->m_terms.size()); 1.528 + 1.529 + if (!max) { 1.530 + m_alternative->removeLastTerm(); 1.531 + return; 1.532 + } 1.533 + 1.534 + PatternTerm& term = m_alternative->lastTerm(); 1.535 + ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); 1.536 + ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); 1.537 + 1.538 + if (term.type == PatternTerm::TypeParentheticalAssertion) { 1.539 + // If an assertion is quantified with a minimum count of zero, it can simply be removed. 1.540 + // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never 1.541 + // results in any input being consumed, however the continuation passed to the assertion 1.542 + // (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will 1.543 + // reject all zero length matches (see step 2.1). A match from the continuation of the 1.544 + // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all 1.545 + // this is that matches from the assertion are not required, and won't be accepted anyway, 1.546 + // so no need to ever run it. 1.547 + if (!min) 1.548 + m_alternative->removeLastTerm(); 1.549 + // We never need to run an assertion more than once. Subsequent interations will be run 1.550 + // with the same start index (since assertions are non-capturing) and the same captures 1.551 + // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the 1.552 + // same result and captures. If the first match succeeds then the subsequent (min - 1) 1.553 + // matches will too. Any additional optional matches will fail (on the same basis as the 1.554 + // minimum zero quantified assertions, above), but this will still result in a match. 1.555 + return; 1.556 + } 1.557 + 1.558 + if (min == 0) 1.559 + term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy); 1.560 + else if (min == max) 1.561 + term.quantify(min, QuantifierFixedCount); 1.562 + else { 1.563 + term.quantify(min, QuantifierFixedCount); 1.564 + m_alternative->m_terms.append(copyTerm(term)); 1.565 + // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... 1.566 + m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); 1.567 + if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) 1.568 + m_alternative->lastTerm().parentheses.isCopy = true; 1.569 + } 1.570 + } 1.571 + 1.572 + void disjunction() 1.573 + { 1.574 + m_alternative = m_alternative->m_parent->addNewAlternative(); 1.575 + } 1.576 + 1.577 + ErrorCode setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition, 1.578 + unsigned *callFrameSizeOut) 1.579 + { 1.580 + /* 1.581 + * Attempt detection of over-recursion: 1.582 + * "1MB should be enough stack for anyone." 1.583 + */ 1.584 + uint8_t stackDummy_; 1.585 + if (m_stackBase - &stackDummy_ > 1024*1024) 1.586 + return PatternTooLarge; 1.587 + 1.588 + alternative->m_hasFixedSize = true; 1.589 + Checked<unsigned> currentInputPosition = initialInputPosition; 1.590 + 1.591 + for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { 1.592 + PatternTerm& term = alternative->m_terms[i]; 1.593 + 1.594 + switch (term.type) { 1.595 + case PatternTerm::TypeAssertionBOL: 1.596 + case PatternTerm::TypeAssertionEOL: 1.597 + case PatternTerm::TypeAssertionWordBoundary: 1.598 + if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) 1.599 + return RuntimeError; 1.600 + break; 1.601 + 1.602 + case PatternTerm::TypeBackReference: 1.603 + if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) 1.604 + return RuntimeError; 1.605 + term.frameLocation = currentCallFrameSize; 1.606 + currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference; 1.607 + alternative->m_hasFixedSize = false; 1.608 + break; 1.609 + 1.610 + case PatternTerm::TypeForwardReference: 1.611 + break; 1.612 + 1.613 + case PatternTerm::TypePatternCharacter: 1.614 + if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) 1.615 + return RuntimeError; 1.616 + if (term.quantityType != QuantifierFixedCount) { 1.617 + term.frameLocation = currentCallFrameSize; 1.618 + currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter; 1.619 + alternative->m_hasFixedSize = false; 1.620 + } else 1.621 + currentInputPosition += term.quantityCount; 1.622 + break; 1.623 + 1.624 + case PatternTerm::TypeCharacterClass: 1.625 + if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) 1.626 + return RuntimeError; 1.627 + if (term.quantityType != QuantifierFixedCount) { 1.628 + term.frameLocation = currentCallFrameSize; 1.629 + currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass; 1.630 + alternative->m_hasFixedSize = false; 1.631 + } else 1.632 + currentInputPosition += term.quantityCount; 1.633 + break; 1.634 + 1.635 + case PatternTerm::TypeParenthesesSubpattern: 1.636 + // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. 1.637 + term.frameLocation = currentCallFrameSize; 1.638 + unsigned position; 1.639 + if (currentInputPosition.safeGet(position)) 1.640 + return RuntimeError; 1.641 + if (term.quantityCount == 1 && !term.parentheses.isCopy) { 1.642 + if (term.quantityType != QuantifierFixedCount) 1.643 + currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce; 1.644 + if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, position, ¤tCallFrameSize)) 1.645 + return error; 1.646 + // If quantity is fixed, then pre-check its minimum size. 1.647 + if (term.quantityType == QuantifierFixedCount) 1.648 + currentInputPosition += term.parentheses.disjunction->m_minimumSize; 1.649 + if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) 1.650 + return RuntimeError; 1.651 + } else if (term.parentheses.isTerminal) { 1.652 + currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal; 1.653 + if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, position, ¤tCallFrameSize)) 1.654 + return error; 1.655 + if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) 1.656 + return RuntimeError; 1.657 + } else { 1.658 + if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) 1.659 + return RuntimeError; 1.660 + unsigned dummy; 1.661 + if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, BASE_FRAME_SIZE, position, &dummy)) 1.662 + return error; 1.663 + currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses; 1.664 + } 1.665 + // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. 1.666 + alternative->m_hasFixedSize = false; 1.667 + break; 1.668 + 1.669 + case PatternTerm::TypeParentheticalAssertion: 1.670 + if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) 1.671 + return RuntimeError; 1.672 + term.frameLocation = currentCallFrameSize; 1.673 + if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, term.inputPosition, ¤tCallFrameSize)) 1.674 + return error; 1.675 + break; 1.676 + 1.677 + case PatternTerm::TypeDotStarEnclosure: 1.678 + alternative->m_hasFixedSize = false; 1.679 + term.inputPosition = initialInputPosition; 1.680 + break; 1.681 + } 1.682 + } 1.683 + 1.684 + if ((currentInputPosition - initialInputPosition).safeGet(alternative->m_minimumSize)) 1.685 + return RuntimeError; 1.686 + *callFrameSizeOut = currentCallFrameSize; 1.687 + return NoError; 1.688 + } 1.689 + 1.690 + ErrorCode setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition, unsigned *maximumCallFrameSizeOut) 1.691 + { 1.692 + if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) 1.693 + initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative; 1.694 + 1.695 + unsigned minimumInputSize = UINT_MAX; 1.696 + unsigned maximumCallFrameSize = 0; 1.697 + bool hasFixedSize = true; 1.698 + 1.699 + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { 1.700 + PatternAlternative* alternative = disjunction->m_alternatives[alt]; 1.701 + unsigned currentAlternativeCallFrameSize; 1.702 + if (ErrorCode error = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition, ¤tAlternativeCallFrameSize)) 1.703 + return error; 1.704 + minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize); 1.705 + maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize); 1.706 + hasFixedSize &= alternative->m_hasFixedSize; 1.707 + } 1.708 + 1.709 + ASSERT(minimumInputSize != UINT_MAX); 1.710 + if (minimumInputSize == UINT_MAX) 1.711 + return PatternTooLarge; 1.712 + 1.713 + ASSERT(minimumInputSize != UINT_MAX); 1.714 + ASSERT(maximumCallFrameSize >= initialCallFrameSize); 1.715 + 1.716 + disjunction->m_hasFixedSize = hasFixedSize; 1.717 + disjunction->m_minimumSize = minimumInputSize; 1.718 + disjunction->m_callFrameSize = maximumCallFrameSize; 1.719 + *maximumCallFrameSizeOut = maximumCallFrameSize; 1.720 + return NoError; 1.721 + } 1.722 + 1.723 + ErrorCode setupOffsets() 1.724 + { 1.725 + unsigned dummy; 1.726 + return setupDisjunctionOffsets(m_pattern.m_body, BASE_FRAME_SIZE, 0, &dummy); 1.727 + } 1.728 + 1.729 + // This optimization identifies sets of parentheses that we will never need to backtrack. 1.730 + // In these cases we do not need to store state from prior iterations. 1.731 + // We can presently avoid backtracking for: 1.732 + // * where the parens are at the end of the regular expression (last term in any of the 1.733 + // alternatives of the main body disjunction). 1.734 + // * where the parens are non-capturing, and quantified unbounded greedy (*). 1.735 + // * where the parens do not contain any capturing subpatterns. 1.736 + void checkForTerminalParentheses() 1.737 + { 1.738 + // This check is much too crude; should be just checking whether the candidate 1.739 + // node contains nested capturing subpatterns, not the whole expression! 1.740 + if (m_pattern.m_numSubpatterns) 1.741 + return; 1.742 + 1.743 + Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives; 1.744 + for (size_t i = 0; i < alternatives.size(); ++i) { 1.745 + Vector<PatternTerm>& terms = alternatives[i]->m_terms; 1.746 + if (terms.size()) { 1.747 + PatternTerm& term = terms.last(); 1.748 + if (term.type == PatternTerm::TypeParenthesesSubpattern 1.749 + && term.quantityType == QuantifierGreedy 1.750 + && term.quantityCount == quantifyInfinite 1.751 + && !term.capture()) 1.752 + term.parentheses.isTerminal = true; 1.753 + } 1.754 + } 1.755 + } 1.756 + 1.757 + void optimizeBOL() 1.758 + { 1.759 + // Look for expressions containing beginning of line (^) anchoring and unroll them. 1.760 + // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops 1.761 + // This code relies on the parsing code tagging alternatives with m_containsBOL and 1.762 + // m_startsWithBOL and rolling those up to containing alternatives. 1.763 + // At this point, this is only valid for non-multiline expressions. 1.764 + PatternDisjunction* disjunction = m_pattern.m_body; 1.765 + 1.766 + if (!m_pattern.m_containsBOL || m_pattern.m_multiline) 1.767 + return; 1.768 + 1.769 + PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true); 1.770 + 1.771 + // Set alternatives in disjunction to "onceThrough" 1.772 + for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) 1.773 + disjunction->m_alternatives[alt]->setOnceThrough(); 1.774 + 1.775 + if (loopDisjunction) { 1.776 + // Move alternatives from loopDisjunction to disjunction 1.777 + for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt) 1.778 + disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]); 1.779 + 1.780 + loopDisjunction->m_alternatives.clear(); 1.781 + } 1.782 + } 1.783 + 1.784 + bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t lastTermIndex) 1.785 + { 1.786 + Vector<PatternTerm>& terms = alternative->m_terms; 1.787 + 1.788 + for (size_t termIndex = firstTermIndex; termIndex <= lastTermIndex; ++termIndex) { 1.789 + PatternTerm& term = terms[termIndex]; 1.790 + 1.791 + if (term.m_capture) 1.792 + return true; 1.793 + 1.794 + if (term.type == PatternTerm::TypeParenthesesSubpattern) { 1.795 + PatternDisjunction* nestedDisjunction = term.parentheses.disjunction; 1.796 + for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) { 1.797 + PatternAlternative *pattern = nestedDisjunction->m_alternatives[alt]; 1.798 + if (pattern->m_terms.size() == 0) 1.799 + continue; 1.800 + if (containsCapturingTerms(pattern, 0, pattern->m_terms.size() - 1)) 1.801 + return true; 1.802 + } 1.803 + } 1.804 + } 1.805 + 1.806 + return false; 1.807 + } 1.808 + 1.809 + // This optimization identifies alternatives in the form of 1.810 + // [^].*[?]<expression>.*[$] for expressions that don't have any 1.811 + // capturing terms. The alternative is changed to <expression> 1.812 + // followed by processing of the dot stars to find and adjust the 1.813 + // beginning and the end of the match. 1.814 + void optimizeDotStarWrappedExpressions() 1.815 + { 1.816 + Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives; 1.817 + if (alternatives.size() != 1) 1.818 + return; 1.819 + 1.820 + PatternAlternative* alternative = alternatives[0]; 1.821 + Vector<PatternTerm>& terms = alternative->m_terms; 1.822 + if (terms.size() >= 3) { 1.823 + bool startsWithBOL = false; 1.824 + bool endsWithEOL = false; 1.825 + size_t termIndex, firstExpressionTerm, lastExpressionTerm; 1.826 + 1.827 + termIndex = 0; 1.828 + if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) { 1.829 + startsWithBOL = true; 1.830 + ++termIndex; 1.831 + } 1.832 + 1.833 + PatternTerm& firstNonAnchorTerm = terms[termIndex]; 1.834 + if ((firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (firstNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || !((firstNonAnchorTerm.quantityType == QuantifierGreedy) || (firstNonAnchorTerm.quantityType == QuantifierNonGreedy))) 1.835 + return; 1.836 + 1.837 + firstExpressionTerm = termIndex + 1; 1.838 + 1.839 + termIndex = terms.size() - 1; 1.840 + if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) { 1.841 + endsWithEOL = true; 1.842 + --termIndex; 1.843 + } 1.844 + 1.845 + PatternTerm& lastNonAnchorTerm = terms[termIndex]; 1.846 + if ((lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (lastNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || (lastNonAnchorTerm.quantityType != QuantifierGreedy)) 1.847 + return; 1.848 + 1.849 + lastExpressionTerm = termIndex - 1; 1.850 + 1.851 + if (firstExpressionTerm > lastExpressionTerm) 1.852 + return; 1.853 + 1.854 + if (!containsCapturingTerms(alternative, firstExpressionTerm, lastExpressionTerm)) { 1.855 + for (termIndex = terms.size() - 1; termIndex > lastExpressionTerm; --termIndex) 1.856 + terms.remove(termIndex); 1.857 + 1.858 + for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex) 1.859 + terms.remove(termIndex - 1); 1.860 + 1.861 + terms.append(PatternTerm(startsWithBOL, endsWithEOL)); 1.862 + 1.863 + m_pattern.m_containsBOL = false; 1.864 + } 1.865 + } 1.866 + } 1.867 + 1.868 + void setStackBase(uint8_t *stackBase) { 1.869 + m_stackBase = stackBase; 1.870 + } 1.871 + 1.872 +private: 1.873 + YarrPattern& m_pattern; 1.874 + uint8_t * m_stackBase; 1.875 + PatternAlternative* m_alternative; 1.876 + CharacterClassConstructor m_characterClassConstructor; 1.877 + bool m_invertCharacterClass; 1.878 + bool m_invertParentheticalAssertion; 1.879 +}; 1.880 + 1.881 +ErrorCode YarrPattern::compile(const String& patternString) 1.882 +{ 1.883 + YarrPatternConstructor constructor(*this); 1.884 + 1.885 + if (ErrorCode error = parse(constructor, patternString)) 1.886 + return error; 1.887 + 1.888 + // If the pattern contains illegal backreferences reset & reparse. 1.889 + // Quoting Netscape's "What's new in JavaScript 1.2", 1.890 + // "Note: if the number of left parentheses is less than the number specified 1.891 + // in \#, the \# is taken as an octal escape as described in the next row." 1.892 + if (containsIllegalBackReference()) { 1.893 + unsigned numSubpatterns = m_numSubpatterns; 1.894 + 1.895 + constructor.reset(); 1.896 +#if !ASSERT_DISABLED 1.897 + ErrorCode error = 1.898 +#endif 1.899 + parse(constructor, patternString, numSubpatterns); 1.900 + 1.901 + ASSERT(!error); 1.902 + ASSERT(numSubpatterns == m_numSubpatterns); 1.903 + } 1.904 + 1.905 + uint8_t stackDummy_; 1.906 + constructor.setStackBase(&stackDummy_); 1.907 + 1.908 + constructor.checkForTerminalParentheses(); 1.909 + constructor.optimizeDotStarWrappedExpressions(); 1.910 + constructor.optimizeBOL(); 1.911 + 1.912 + if (ErrorCode error = constructor.setupOffsets()) 1.913 + return error; 1.914 + 1.915 + return NoError; 1.916 +} 1.917 + 1.918 +YarrPattern::YarrPattern(const String& pattern, bool ignoreCase, bool multiline, ErrorCode* error) 1.919 + : m_ignoreCase(ignoreCase) 1.920 + , m_multiline(multiline) 1.921 + , m_containsBackreferences(false) 1.922 + , m_containsBOL(false) 1.923 + , m_numSubpatterns(0) 1.924 + , m_maxBackReference(0) 1.925 + , newlineCached(0) 1.926 + , digitsCached(0) 1.927 + , spacesCached(0) 1.928 + , wordcharCached(0) 1.929 + , nondigitsCached(0) 1.930 + , nonspacesCached(0) 1.931 + , nonwordcharCached(0) 1.932 +{ 1.933 + *error = compile(pattern); 1.934 +} 1.935 + 1.936 +} }