Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- |
michael@0 | 2 | * vim: set ts=8 sts=4 et sw=4 tw=99: |
michael@0 | 3 | * |
michael@0 | 4 | * Copyright (C) 2009 Apple Inc. All rights reserved. |
michael@0 | 5 | * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged |
michael@0 | 6 | * |
michael@0 | 7 | * Redistribution and use in source and binary forms, with or without |
michael@0 | 8 | * modification, are permitted provided that the following conditions |
michael@0 | 9 | * are met: |
michael@0 | 10 | * 1. Redistributions of source code must retain the above copyright |
michael@0 | 11 | * notice, this list of conditions and the following disclaimer. |
michael@0 | 12 | * 2. Redistributions in binary form must reproduce the above copyright |
michael@0 | 13 | * notice, this list of conditions and the following disclaimer in the |
michael@0 | 14 | * documentation and/or other materials provided with the distribution. |
michael@0 | 15 | * |
michael@0 | 16 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
michael@0 | 17 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
michael@0 | 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
michael@0 | 19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
michael@0 | 20 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
michael@0 | 21 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
michael@0 | 22 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
michael@0 | 23 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
michael@0 | 24 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
michael@0 | 25 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
michael@0 | 26 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
michael@0 | 27 | */ |
michael@0 | 28 | |
michael@0 | 29 | #include "yarr/YarrPattern.h" |
michael@0 | 30 | |
michael@0 | 31 | #include "yarr/Yarr.h" |
michael@0 | 32 | #include "yarr/YarrCanonicalizeUCS2.h" |
michael@0 | 33 | #include "yarr/YarrParser.h" |
michael@0 | 34 | |
michael@0 | 35 | using namespace WTF; |
michael@0 | 36 | |
michael@0 | 37 | namespace JSC { namespace Yarr { |
michael@0 | 38 | |
michael@0 | 39 | #include "yarr/RegExpJitTables.h" |
michael@0 | 40 | |
michael@0 | 41 | #if WTF_CPU_SPARC |
michael@0 | 42 | # define BASE_FRAME_SIZE 24 |
michael@0 | 43 | #else |
michael@0 | 44 | # define BASE_FRAME_SIZE 0 |
michael@0 | 45 | #endif |
michael@0 | 46 | |
michael@0 | 47 | // Thanks, windows.h! |
michael@0 | 48 | #undef min |
michael@0 | 49 | #undef max |
michael@0 | 50 | |
michael@0 | 51 | class CharacterClassConstructor { |
michael@0 | 52 | public: |
michael@0 | 53 | CharacterClassConstructor(bool isCaseInsensitive = false) |
michael@0 | 54 | : m_isCaseInsensitive(isCaseInsensitive) |
michael@0 | 55 | { |
michael@0 | 56 | } |
michael@0 | 57 | |
michael@0 | 58 | void reset() |
michael@0 | 59 | { |
michael@0 | 60 | m_matches.clear(); |
michael@0 | 61 | m_ranges.clear(); |
michael@0 | 62 | m_matchesUnicode.clear(); |
michael@0 | 63 | m_rangesUnicode.clear(); |
michael@0 | 64 | } |
michael@0 | 65 | |
michael@0 | 66 | void append(const CharacterClass* other) |
michael@0 | 67 | { |
michael@0 | 68 | for (size_t i = 0; i < other->m_matches.size(); ++i) |
michael@0 | 69 | addSorted(m_matches, other->m_matches[i]); |
michael@0 | 70 | for (size_t i = 0; i < other->m_ranges.size(); ++i) |
michael@0 | 71 | addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end); |
michael@0 | 72 | for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i) |
michael@0 | 73 | addSorted(m_matchesUnicode, other->m_matchesUnicode[i]); |
michael@0 | 74 | for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i) |
michael@0 | 75 | addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end); |
michael@0 | 76 | } |
michael@0 | 77 | |
michael@0 | 78 | void putChar(UChar ch) |
michael@0 | 79 | { |
michael@0 | 80 | // Handle ascii cases. |
michael@0 | 81 | if (ch <= 0x7f) { |
michael@0 | 82 | if (m_isCaseInsensitive && isASCIIAlpha(ch)) { |
michael@0 | 83 | addSorted(m_matches, toASCIIUpper(ch)); |
michael@0 | 84 | addSorted(m_matches, toASCIILower(ch)); |
michael@0 | 85 | } else |
michael@0 | 86 | addSorted(m_matches, ch); |
michael@0 | 87 | return; |
michael@0 | 88 | } |
michael@0 | 89 | |
michael@0 | 90 | // Simple case, not a case-insensitive match. |
michael@0 | 91 | if (!m_isCaseInsensitive) { |
michael@0 | 92 | addSorted(m_matchesUnicode, ch); |
michael@0 | 93 | return; |
michael@0 | 94 | } |
michael@0 | 95 | |
michael@0 | 96 | // Add multiple matches, if necessary. |
michael@0 | 97 | const UCS2CanonicalizationRange* info = rangeInfoFor(ch); |
michael@0 | 98 | if (info->type == CanonicalizeUnique) |
michael@0 | 99 | addSorted(m_matchesUnicode, ch); |
michael@0 | 100 | else |
michael@0 | 101 | putUnicodeIgnoreCase(ch, info); |
michael@0 | 102 | } |
michael@0 | 103 | |
michael@0 | 104 | void putUnicodeIgnoreCase(UChar ch, const UCS2CanonicalizationRange* info) |
michael@0 | 105 | { |
michael@0 | 106 | ASSERT(m_isCaseInsensitive); |
michael@0 | 107 | ASSERT(ch > 0x7f); |
michael@0 | 108 | ASSERT(ch >= info->begin && ch <= info->end); |
michael@0 | 109 | ASSERT(info->type != CanonicalizeUnique); |
michael@0 | 110 | if (info->type == CanonicalizeSet) { |
michael@0 | 111 | for (const uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) |
michael@0 | 112 | addSorted(m_matchesUnicode, ch); |
michael@0 | 113 | } else { |
michael@0 | 114 | addSorted(m_matchesUnicode, ch); |
michael@0 | 115 | addSorted(m_matchesUnicode, getCanonicalPair(info, ch)); |
michael@0 | 116 | } |
michael@0 | 117 | } |
michael@0 | 118 | |
michael@0 | 119 | void putRange(UChar lo, UChar hi) |
michael@0 | 120 | { |
michael@0 | 121 | if (lo <= 0x7f) { |
michael@0 | 122 | char asciiLo = lo; |
michael@0 | 123 | char asciiHi = std::min(hi, (UChar)0x7f); |
michael@0 | 124 | addSortedRange(m_ranges, lo, asciiHi); |
michael@0 | 125 | |
michael@0 | 126 | if (m_isCaseInsensitive) { |
michael@0 | 127 | if ((asciiLo <= 'Z') && (asciiHi >= 'A')) |
michael@0 | 128 | addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); |
michael@0 | 129 | if ((asciiLo <= 'z') && (asciiHi >= 'a')) |
michael@0 | 130 | addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); |
michael@0 | 131 | } |
michael@0 | 132 | } |
michael@0 | 133 | if (hi <= 0x7f) |
michael@0 | 134 | return; |
michael@0 | 135 | |
michael@0 | 136 | lo = std::max(lo, (UChar)0x80); |
michael@0 | 137 | addSortedRange(m_rangesUnicode, lo, hi); |
michael@0 | 138 | |
michael@0 | 139 | if (!m_isCaseInsensitive) |
michael@0 | 140 | return; |
michael@0 | 141 | |
michael@0 | 142 | const UCS2CanonicalizationRange* info = rangeInfoFor(lo); |
michael@0 | 143 | while (true) { |
michael@0 | 144 | // Handle the range [lo .. end] |
michael@0 | 145 | UChar end = std::min<UChar>(info->end, hi); |
michael@0 | 146 | |
michael@0 | 147 | switch (info->type) { |
michael@0 | 148 | case CanonicalizeUnique: |
michael@0 | 149 | // Nothing to do - no canonical equivalents. |
michael@0 | 150 | break; |
michael@0 | 151 | case CanonicalizeSet: { |
michael@0 | 152 | UChar ch; |
michael@0 | 153 | for (const uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) |
michael@0 | 154 | addSorted(m_matchesUnicode, ch); |
michael@0 | 155 | break; |
michael@0 | 156 | } |
michael@0 | 157 | case CanonicalizeRangeLo: |
michael@0 | 158 | addSortedRange(m_rangesUnicode, lo + info->value, end + info->value); |
michael@0 | 159 | break; |
michael@0 | 160 | case CanonicalizeRangeHi: |
michael@0 | 161 | addSortedRange(m_rangesUnicode, lo - info->value, end - info->value); |
michael@0 | 162 | break; |
michael@0 | 163 | case CanonicalizeAlternatingAligned: |
michael@0 | 164 | // Use addSortedRange since there is likely an abutting range to combine with. |
michael@0 | 165 | if (lo & 1) |
michael@0 | 166 | addSortedRange(m_rangesUnicode, lo - 1, lo - 1); |
michael@0 | 167 | if (!(end & 1)) |
michael@0 | 168 | addSortedRange(m_rangesUnicode, end + 1, end + 1); |
michael@0 | 169 | break; |
michael@0 | 170 | case CanonicalizeAlternatingUnaligned: |
michael@0 | 171 | // Use addSortedRange since there is likely an abutting range to combine with. |
michael@0 | 172 | if (!(lo & 1)) |
michael@0 | 173 | addSortedRange(m_rangesUnicode, lo - 1, lo - 1); |
michael@0 | 174 | if (end & 1) |
michael@0 | 175 | addSortedRange(m_rangesUnicode, end + 1, end + 1); |
michael@0 | 176 | break; |
michael@0 | 177 | } |
michael@0 | 178 | |
michael@0 | 179 | if (hi == end) |
michael@0 | 180 | return; |
michael@0 | 181 | |
michael@0 | 182 | ++info; |
michael@0 | 183 | lo = info->begin; |
michael@0 | 184 | }; |
michael@0 | 185 | |
michael@0 | 186 | } |
michael@0 | 187 | |
michael@0 | 188 | CharacterClass* charClass() |
michael@0 | 189 | { |
michael@0 | 190 | CharacterClass* characterClass = newOrCrash<CharacterClass>(); |
michael@0 | 191 | |
michael@0 | 192 | characterClass->m_matches.swap(m_matches); |
michael@0 | 193 | characterClass->m_ranges.swap(m_ranges); |
michael@0 | 194 | characterClass->m_matchesUnicode.swap(m_matchesUnicode); |
michael@0 | 195 | characterClass->m_rangesUnicode.swap(m_rangesUnicode); |
michael@0 | 196 | |
michael@0 | 197 | return characterClass; |
michael@0 | 198 | } |
michael@0 | 199 | |
michael@0 | 200 | private: |
michael@0 | 201 | void addSorted(Vector<UChar>& matches, UChar ch) |
michael@0 | 202 | { |
michael@0 | 203 | unsigned pos = 0; |
michael@0 | 204 | unsigned range = matches.size(); |
michael@0 | 205 | |
michael@0 | 206 | // binary chop, find position to insert char. |
michael@0 | 207 | while (range) { |
michael@0 | 208 | unsigned index = range >> 1; |
michael@0 | 209 | |
michael@0 | 210 | int val = matches[pos+index] - ch; |
michael@0 | 211 | if (!val) |
michael@0 | 212 | return; |
michael@0 | 213 | else if (val > 0) |
michael@0 | 214 | range = index; |
michael@0 | 215 | else { |
michael@0 | 216 | pos += (index+1); |
michael@0 | 217 | range -= (index+1); |
michael@0 | 218 | } |
michael@0 | 219 | } |
michael@0 | 220 | |
michael@0 | 221 | if (pos == matches.size()) |
michael@0 | 222 | matches.append(ch); |
michael@0 | 223 | else |
michael@0 | 224 | matches.insert(pos, ch); |
michael@0 | 225 | } |
michael@0 | 226 | |
michael@0 | 227 | void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) |
michael@0 | 228 | { |
michael@0 | 229 | unsigned end = ranges.size(); |
michael@0 | 230 | |
michael@0 | 231 | // Simple linear scan - I doubt there are that many ranges anyway... |
michael@0 | 232 | // feel free to fix this with something faster (eg binary chop). |
michael@0 | 233 | for (unsigned i = 0; i < end; ++i) { |
michael@0 | 234 | // does the new range fall before the current position in the array |
michael@0 | 235 | if (hi < ranges[i].begin) { |
michael@0 | 236 | // optional optimization: concatenate appending ranges? - may not be worthwhile. |
michael@0 | 237 | if (hi == (ranges[i].begin - 1)) { |
michael@0 | 238 | ranges[i].begin = lo; |
michael@0 | 239 | return; |
michael@0 | 240 | } |
michael@0 | 241 | ranges.insert(i, CharacterRange(lo, hi)); |
michael@0 | 242 | return; |
michael@0 | 243 | } |
michael@0 | 244 | // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining |
michael@0 | 245 | // If the new range start at or before the end of the last range, then the overlap (if it starts one after the |
michael@0 | 246 | // end of the last range they concatenate, which is just as good. |
michael@0 | 247 | if (lo <= (ranges[i].end + 1)) { |
michael@0 | 248 | // found an intersect! we'll replace this entry in the array. |
michael@0 | 249 | ranges[i].begin = std::min(ranges[i].begin, lo); |
michael@0 | 250 | ranges[i].end = std::max(ranges[i].end, hi); |
michael@0 | 251 | |
michael@0 | 252 | // now check if the new range can subsume any subsequent ranges. |
michael@0 | 253 | unsigned next = i+1; |
michael@0 | 254 | // each iteration of the loop we will either remove something from the list, or break the loop. |
michael@0 | 255 | while (next < ranges.size()) { |
michael@0 | 256 | if (ranges[next].begin <= (ranges[i].end + 1)) { |
michael@0 | 257 | // the next entry now overlaps / concatenates this one. |
michael@0 | 258 | ranges[i].end = std::max(ranges[i].end, ranges[next].end); |
michael@0 | 259 | ranges.remove(next); |
michael@0 | 260 | } else |
michael@0 | 261 | break; |
michael@0 | 262 | } |
michael@0 | 263 | |
michael@0 | 264 | return; |
michael@0 | 265 | } |
michael@0 | 266 | } |
michael@0 | 267 | |
michael@0 | 268 | // CharacterRange comes after all existing ranges. |
michael@0 | 269 | ranges.append(CharacterRange(lo, hi)); |
michael@0 | 270 | } |
michael@0 | 271 | |
michael@0 | 272 | bool m_isCaseInsensitive; |
michael@0 | 273 | |
michael@0 | 274 | Vector<UChar> m_matches; |
michael@0 | 275 | Vector<CharacterRange> m_ranges; |
michael@0 | 276 | Vector<UChar> m_matchesUnicode; |
michael@0 | 277 | Vector<CharacterRange> m_rangesUnicode; |
michael@0 | 278 | }; |
michael@0 | 279 | |
michael@0 | 280 | class YarrPatternConstructor { |
michael@0 | 281 | public: |
michael@0 | 282 | YarrPatternConstructor(YarrPattern& pattern) |
michael@0 | 283 | : m_pattern(pattern) |
michael@0 | 284 | , m_stackBase(nullptr) |
michael@0 | 285 | , m_characterClassConstructor(pattern.m_ignoreCase) |
michael@0 | 286 | , m_invertParentheticalAssertion(false) |
michael@0 | 287 | { |
michael@0 | 288 | m_pattern.m_body = newOrCrash<PatternDisjunction>(); |
michael@0 | 289 | m_alternative = m_pattern.m_body->addNewAlternative(); |
michael@0 | 290 | m_pattern.m_disjunctions.append(m_pattern.m_body); |
michael@0 | 291 | } |
michael@0 | 292 | |
michael@0 | 293 | ~YarrPatternConstructor() |
michael@0 | 294 | { |
michael@0 | 295 | } |
michael@0 | 296 | |
michael@0 | 297 | void reset() |
michael@0 | 298 | { |
michael@0 | 299 | m_pattern.reset(); |
michael@0 | 300 | m_characterClassConstructor.reset(); |
michael@0 | 301 | |
michael@0 | 302 | m_pattern.m_body = newOrCrash<PatternDisjunction>(); |
michael@0 | 303 | m_alternative = m_pattern.m_body->addNewAlternative(); |
michael@0 | 304 | m_pattern.m_disjunctions.append(m_pattern.m_body); |
michael@0 | 305 | } |
michael@0 | 306 | |
michael@0 | 307 | void assertionBOL() |
michael@0 | 308 | { |
michael@0 | 309 | if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) { |
michael@0 | 310 | m_alternative->m_startsWithBOL = true; |
michael@0 | 311 | m_alternative->m_containsBOL = true; |
michael@0 | 312 | m_pattern.m_containsBOL = true; |
michael@0 | 313 | } |
michael@0 | 314 | m_alternative->m_terms.append(PatternTerm::BOL()); |
michael@0 | 315 | } |
michael@0 | 316 | void assertionEOL() |
michael@0 | 317 | { |
michael@0 | 318 | m_alternative->m_terms.append(PatternTerm::EOL()); |
michael@0 | 319 | } |
michael@0 | 320 | void assertionWordBoundary(bool invert) |
michael@0 | 321 | { |
michael@0 | 322 | m_alternative->m_terms.append(PatternTerm::WordBoundary(invert)); |
michael@0 | 323 | } |
michael@0 | 324 | |
michael@0 | 325 | void atomPatternCharacter(UChar ch) |
michael@0 | 326 | { |
michael@0 | 327 | // We handle case-insensitive checking of unicode characters which do have both |
michael@0 | 328 | // cases by handling them as if they were defined using a CharacterClass. |
michael@0 | 329 | if (!m_pattern.m_ignoreCase || isASCII(ch)) { |
michael@0 | 330 | m_alternative->m_terms.append(PatternTerm(ch)); |
michael@0 | 331 | return; |
michael@0 | 332 | } |
michael@0 | 333 | |
michael@0 | 334 | const UCS2CanonicalizationRange* info = rangeInfoFor(ch); |
michael@0 | 335 | if (info->type == CanonicalizeUnique) { |
michael@0 | 336 | m_alternative->m_terms.append(PatternTerm(ch)); |
michael@0 | 337 | return; |
michael@0 | 338 | } |
michael@0 | 339 | |
michael@0 | 340 | m_characterClassConstructor.putUnicodeIgnoreCase(ch, info); |
michael@0 | 341 | CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); |
michael@0 | 342 | m_pattern.m_userCharacterClasses.append(newCharacterClass); |
michael@0 | 343 | m_alternative->m_terms.append(PatternTerm(newCharacterClass, false)); |
michael@0 | 344 | } |
michael@0 | 345 | |
michael@0 | 346 | void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) |
michael@0 | 347 | { |
michael@0 | 348 | switch (classID) { |
michael@0 | 349 | case DigitClassID: |
michael@0 | 350 | m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert)); |
michael@0 | 351 | break; |
michael@0 | 352 | case SpaceClassID: |
michael@0 | 353 | m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert)); |
michael@0 | 354 | break; |
michael@0 | 355 | case WordClassID: |
michael@0 | 356 | m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); |
michael@0 | 357 | break; |
michael@0 | 358 | case NewlineClassID: |
michael@0 | 359 | m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert)); |
michael@0 | 360 | break; |
michael@0 | 361 | } |
michael@0 | 362 | } |
michael@0 | 363 | |
michael@0 | 364 | void atomCharacterClassBegin(bool invert = false) |
michael@0 | 365 | { |
michael@0 | 366 | m_invertCharacterClass = invert; |
michael@0 | 367 | } |
michael@0 | 368 | |
michael@0 | 369 | void atomCharacterClassAtom(UChar ch) |
michael@0 | 370 | { |
michael@0 | 371 | m_characterClassConstructor.putChar(ch); |
michael@0 | 372 | } |
michael@0 | 373 | |
michael@0 | 374 | void atomCharacterClassRange(UChar begin, UChar end) |
michael@0 | 375 | { |
michael@0 | 376 | m_characterClassConstructor.putRange(begin, end); |
michael@0 | 377 | } |
michael@0 | 378 | |
michael@0 | 379 | void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) |
michael@0 | 380 | { |
michael@0 | 381 | ASSERT(classID != NewlineClassID); |
michael@0 | 382 | |
michael@0 | 383 | switch (classID) { |
michael@0 | 384 | case DigitClassID: |
michael@0 | 385 | m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); |
michael@0 | 386 | break; |
michael@0 | 387 | |
michael@0 | 388 | case SpaceClassID: |
michael@0 | 389 | m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); |
michael@0 | 390 | break; |
michael@0 | 391 | |
michael@0 | 392 | case WordClassID: |
michael@0 | 393 | m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); |
michael@0 | 394 | break; |
michael@0 | 395 | |
michael@0 | 396 | default: |
michael@0 | 397 | ASSERT_NOT_REACHED(); |
michael@0 | 398 | } |
michael@0 | 399 | } |
michael@0 | 400 | |
michael@0 | 401 | void atomCharacterClassEnd() |
michael@0 | 402 | { |
michael@0 | 403 | CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); |
michael@0 | 404 | m_pattern.m_userCharacterClasses.append(newCharacterClass); |
michael@0 | 405 | m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass)); |
michael@0 | 406 | } |
michael@0 | 407 | |
michael@0 | 408 | void atomParenthesesSubpatternBegin(bool capture = true) |
michael@0 | 409 | { |
michael@0 | 410 | unsigned subpatternId = m_pattern.m_numSubpatterns + 1; |
michael@0 | 411 | if (capture) |
michael@0 | 412 | m_pattern.m_numSubpatterns++; |
michael@0 | 413 | |
michael@0 | 414 | PatternDisjunction* parenthesesDisjunction = newOrCrash<PatternDisjunction>(m_alternative); |
michael@0 | 415 | m_pattern.m_disjunctions.append(parenthesesDisjunction); |
michael@0 | 416 | m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, false)); |
michael@0 | 417 | m_alternative = parenthesesDisjunction->addNewAlternative(); |
michael@0 | 418 | } |
michael@0 | 419 | |
michael@0 | 420 | void atomParentheticalAssertionBegin(bool invert = false) |
michael@0 | 421 | { |
michael@0 | 422 | PatternDisjunction* parenthesesDisjunction = newOrCrash<PatternDisjunction>(m_alternative); |
michael@0 | 423 | m_pattern.m_disjunctions.append(parenthesesDisjunction); |
michael@0 | 424 | m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, false, invert)); |
michael@0 | 425 | m_alternative = parenthesesDisjunction->addNewAlternative(); |
michael@0 | 426 | m_invertParentheticalAssertion = invert; |
michael@0 | 427 | } |
michael@0 | 428 | |
michael@0 | 429 | void atomParenthesesEnd() |
michael@0 | 430 | { |
michael@0 | 431 | ASSERT(m_alternative->m_parent); |
michael@0 | 432 | ASSERT(m_alternative->m_parent->m_parent); |
michael@0 | 433 | |
michael@0 | 434 | PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent; |
michael@0 | 435 | m_alternative = m_alternative->m_parent->m_parent; |
michael@0 | 436 | |
michael@0 | 437 | PatternTerm& lastTerm = m_alternative->lastTerm(); |
michael@0 | 438 | |
michael@0 | 439 | unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size(); |
michael@0 | 440 | unsigned numBOLAnchoredAlts = 0; |
michael@0 | 441 | |
michael@0 | 442 | for (unsigned i = 0; i < numParenAlternatives; i++) { |
michael@0 | 443 | // Bubble up BOL flags |
michael@0 | 444 | if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL) |
michael@0 | 445 | numBOLAnchoredAlts++; |
michael@0 | 446 | } |
michael@0 | 447 | |
michael@0 | 448 | if (numBOLAnchoredAlts) { |
michael@0 | 449 | m_alternative->m_containsBOL = true; |
michael@0 | 450 | // If all the alternatives in parens start with BOL, then so does this one |
michael@0 | 451 | if (numBOLAnchoredAlts == numParenAlternatives) |
michael@0 | 452 | m_alternative->m_startsWithBOL = true; |
michael@0 | 453 | } |
michael@0 | 454 | |
michael@0 | 455 | lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; |
michael@0 | 456 | m_invertParentheticalAssertion = false; |
michael@0 | 457 | } |
michael@0 | 458 | |
michael@0 | 459 | void atomBackReference(unsigned subpatternId) |
michael@0 | 460 | { |
michael@0 | 461 | ASSERT(subpatternId); |
michael@0 | 462 | m_pattern.m_containsBackreferences = true; |
michael@0 | 463 | m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId); |
michael@0 | 464 | |
michael@0 | 465 | if (subpatternId > m_pattern.m_numSubpatterns) { |
michael@0 | 466 | m_alternative->m_terms.append(PatternTerm::ForwardReference()); |
michael@0 | 467 | return; |
michael@0 | 468 | } |
michael@0 | 469 | |
michael@0 | 470 | PatternAlternative* currentAlternative = m_alternative; |
michael@0 | 471 | ASSERT(currentAlternative); |
michael@0 | 472 | |
michael@0 | 473 | // Note to self: if we waited until the AST was baked, we could also remove forwards refs |
michael@0 | 474 | while ((currentAlternative = currentAlternative->m_parent->m_parent)) { |
michael@0 | 475 | PatternTerm& term = currentAlternative->lastTerm(); |
michael@0 | 476 | ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); |
michael@0 | 477 | |
michael@0 | 478 | if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) { |
michael@0 | 479 | m_alternative->m_terms.append(PatternTerm::ForwardReference()); |
michael@0 | 480 | return; |
michael@0 | 481 | } |
michael@0 | 482 | } |
michael@0 | 483 | |
michael@0 | 484 | m_alternative->m_terms.append(PatternTerm(subpatternId)); |
michael@0 | 485 | } |
michael@0 | 486 | |
michael@0 | 487 | // deep copy the argument disjunction. If filterStartsWithBOL is true, |
michael@0 | 488 | // skip alternatives with m_startsWithBOL set true. |
michael@0 | 489 | PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false) |
michael@0 | 490 | { |
michael@0 | 491 | PatternDisjunction* newDisjunction = 0; |
michael@0 | 492 | for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { |
michael@0 | 493 | PatternAlternative* alternative = disjunction->m_alternatives[alt]; |
michael@0 | 494 | if (!filterStartsWithBOL || !alternative->m_startsWithBOL) { |
michael@0 | 495 | if (!newDisjunction) { |
michael@0 | 496 | newDisjunction = newOrCrash<PatternDisjunction>(); |
michael@0 | 497 | newDisjunction->m_parent = disjunction->m_parent; |
michael@0 | 498 | } |
michael@0 | 499 | PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); |
michael@0 | 500 | newAlternative->m_terms.reserve(alternative->m_terms.size()); |
michael@0 | 501 | for (unsigned i = 0; i < alternative->m_terms.size(); ++i) |
michael@0 | 502 | newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL)); |
michael@0 | 503 | } |
michael@0 | 504 | } |
michael@0 | 505 | |
michael@0 | 506 | if (newDisjunction) |
michael@0 | 507 | m_pattern.m_disjunctions.append(newDisjunction); |
michael@0 | 508 | return newDisjunction; |
michael@0 | 509 | } |
michael@0 | 510 | |
michael@0 | 511 | PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false) |
michael@0 | 512 | { |
michael@0 | 513 | if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) |
michael@0 | 514 | return PatternTerm(term); |
michael@0 | 515 | |
michael@0 | 516 | PatternTerm termCopy = term; |
michael@0 | 517 | termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL); |
michael@0 | 518 | return termCopy; |
michael@0 | 519 | } |
michael@0 | 520 | |
michael@0 | 521 | void quantifyAtom(unsigned min, unsigned max, bool greedy) |
michael@0 | 522 | { |
michael@0 | 523 | ASSERT(min <= max); |
michael@0 | 524 | ASSERT(m_alternative->m_terms.size()); |
michael@0 | 525 | |
michael@0 | 526 | if (!max) { |
michael@0 | 527 | m_alternative->removeLastTerm(); |
michael@0 | 528 | return; |
michael@0 | 529 | } |
michael@0 | 530 | |
michael@0 | 531 | PatternTerm& term = m_alternative->lastTerm(); |
michael@0 | 532 | ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); |
michael@0 | 533 | ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); |
michael@0 | 534 | |
michael@0 | 535 | if (term.type == PatternTerm::TypeParentheticalAssertion) { |
michael@0 | 536 | // If an assertion is quantified with a minimum count of zero, it can simply be removed. |
michael@0 | 537 | // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never |
michael@0 | 538 | // results in any input being consumed, however the continuation passed to the assertion |
michael@0 | 539 | // (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will |
michael@0 | 540 | // reject all zero length matches (see step 2.1). A match from the continuation of the |
michael@0 | 541 | // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all |
michael@0 | 542 | // this is that matches from the assertion are not required, and won't be accepted anyway, |
michael@0 | 543 | // so no need to ever run it. |
michael@0 | 544 | if (!min) |
michael@0 | 545 | m_alternative->removeLastTerm(); |
michael@0 | 546 | // We never need to run an assertion more than once. Subsequent interations will be run |
michael@0 | 547 | // with the same start index (since assertions are non-capturing) and the same captures |
michael@0 | 548 | // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the |
michael@0 | 549 | // same result and captures. If the first match succeeds then the subsequent (min - 1) |
michael@0 | 550 | // matches will too. Any additional optional matches will fail (on the same basis as the |
michael@0 | 551 | // minimum zero quantified assertions, above), but this will still result in a match. |
michael@0 | 552 | return; |
michael@0 | 553 | } |
michael@0 | 554 | |
michael@0 | 555 | if (min == 0) |
michael@0 | 556 | term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy); |
michael@0 | 557 | else if (min == max) |
michael@0 | 558 | term.quantify(min, QuantifierFixedCount); |
michael@0 | 559 | else { |
michael@0 | 560 | term.quantify(min, QuantifierFixedCount); |
michael@0 | 561 | m_alternative->m_terms.append(copyTerm(term)); |
michael@0 | 562 | // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... |
michael@0 | 563 | m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); |
michael@0 | 564 | if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) |
michael@0 | 565 | m_alternative->lastTerm().parentheses.isCopy = true; |
michael@0 | 566 | } |
michael@0 | 567 | } |
michael@0 | 568 | |
michael@0 | 569 | void disjunction() |
michael@0 | 570 | { |
michael@0 | 571 | m_alternative = m_alternative->m_parent->addNewAlternative(); |
michael@0 | 572 | } |
michael@0 | 573 | |
michael@0 | 574 | ErrorCode setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition, |
michael@0 | 575 | unsigned *callFrameSizeOut) |
michael@0 | 576 | { |
michael@0 | 577 | /* |
michael@0 | 578 | * Attempt detection of over-recursion: |
michael@0 | 579 | * "1MB should be enough stack for anyone." |
michael@0 | 580 | */ |
michael@0 | 581 | uint8_t stackDummy_; |
michael@0 | 582 | if (m_stackBase - &stackDummy_ > 1024*1024) |
michael@0 | 583 | return PatternTooLarge; |
michael@0 | 584 | |
michael@0 | 585 | alternative->m_hasFixedSize = true; |
michael@0 | 586 | Checked<unsigned> currentInputPosition = initialInputPosition; |
michael@0 | 587 | |
michael@0 | 588 | for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { |
michael@0 | 589 | PatternTerm& term = alternative->m_terms[i]; |
michael@0 | 590 | |
michael@0 | 591 | switch (term.type) { |
michael@0 | 592 | case PatternTerm::TypeAssertionBOL: |
michael@0 | 593 | case PatternTerm::TypeAssertionEOL: |
michael@0 | 594 | case PatternTerm::TypeAssertionWordBoundary: |
michael@0 | 595 | if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) |
michael@0 | 596 | return RuntimeError; |
michael@0 | 597 | break; |
michael@0 | 598 | |
michael@0 | 599 | case PatternTerm::TypeBackReference: |
michael@0 | 600 | if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) |
michael@0 | 601 | return RuntimeError; |
michael@0 | 602 | term.frameLocation = currentCallFrameSize; |
michael@0 | 603 | currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference; |
michael@0 | 604 | alternative->m_hasFixedSize = false; |
michael@0 | 605 | break; |
michael@0 | 606 | |
michael@0 | 607 | case PatternTerm::TypeForwardReference: |
michael@0 | 608 | break; |
michael@0 | 609 | |
michael@0 | 610 | case PatternTerm::TypePatternCharacter: |
michael@0 | 611 | if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) |
michael@0 | 612 | return RuntimeError; |
michael@0 | 613 | if (term.quantityType != QuantifierFixedCount) { |
michael@0 | 614 | term.frameLocation = currentCallFrameSize; |
michael@0 | 615 | currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter; |
michael@0 | 616 | alternative->m_hasFixedSize = false; |
michael@0 | 617 | } else |
michael@0 | 618 | currentInputPosition += term.quantityCount; |
michael@0 | 619 | break; |
michael@0 | 620 | |
michael@0 | 621 | case PatternTerm::TypeCharacterClass: |
michael@0 | 622 | if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) |
michael@0 | 623 | return RuntimeError; |
michael@0 | 624 | if (term.quantityType != QuantifierFixedCount) { |
michael@0 | 625 | term.frameLocation = currentCallFrameSize; |
michael@0 | 626 | currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass; |
michael@0 | 627 | alternative->m_hasFixedSize = false; |
michael@0 | 628 | } else |
michael@0 | 629 | currentInputPosition += term.quantityCount; |
michael@0 | 630 | break; |
michael@0 | 631 | |
michael@0 | 632 | case PatternTerm::TypeParenthesesSubpattern: |
michael@0 | 633 | // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. |
michael@0 | 634 | term.frameLocation = currentCallFrameSize; |
michael@0 | 635 | unsigned position; |
michael@0 | 636 | if (currentInputPosition.safeGet(position)) |
michael@0 | 637 | return RuntimeError; |
michael@0 | 638 | if (term.quantityCount == 1 && !term.parentheses.isCopy) { |
michael@0 | 639 | if (term.quantityType != QuantifierFixedCount) |
michael@0 | 640 | currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce; |
michael@0 | 641 | if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, position, ¤tCallFrameSize)) |
michael@0 | 642 | return error; |
michael@0 | 643 | // If quantity is fixed, then pre-check its minimum size. |
michael@0 | 644 | if (term.quantityType == QuantifierFixedCount) |
michael@0 | 645 | currentInputPosition += term.parentheses.disjunction->m_minimumSize; |
michael@0 | 646 | if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) |
michael@0 | 647 | return RuntimeError; |
michael@0 | 648 | } else if (term.parentheses.isTerminal) { |
michael@0 | 649 | currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal; |
michael@0 | 650 | if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, position, ¤tCallFrameSize)) |
michael@0 | 651 | return error; |
michael@0 | 652 | if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) |
michael@0 | 653 | return RuntimeError; |
michael@0 | 654 | } else { |
michael@0 | 655 | if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) |
michael@0 | 656 | return RuntimeError; |
michael@0 | 657 | unsigned dummy; |
michael@0 | 658 | if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, BASE_FRAME_SIZE, position, &dummy)) |
michael@0 | 659 | return error; |
michael@0 | 660 | currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses; |
michael@0 | 661 | } |
michael@0 | 662 | // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. |
michael@0 | 663 | alternative->m_hasFixedSize = false; |
michael@0 | 664 | break; |
michael@0 | 665 | |
michael@0 | 666 | case PatternTerm::TypeParentheticalAssertion: |
michael@0 | 667 | if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) |
michael@0 | 668 | return RuntimeError; |
michael@0 | 669 | term.frameLocation = currentCallFrameSize; |
michael@0 | 670 | if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, term.inputPosition, ¤tCallFrameSize)) |
michael@0 | 671 | return error; |
michael@0 | 672 | break; |
michael@0 | 673 | |
michael@0 | 674 | case PatternTerm::TypeDotStarEnclosure: |
michael@0 | 675 | alternative->m_hasFixedSize = false; |
michael@0 | 676 | term.inputPosition = initialInputPosition; |
michael@0 | 677 | break; |
michael@0 | 678 | } |
michael@0 | 679 | } |
michael@0 | 680 | |
michael@0 | 681 | if ((currentInputPosition - initialInputPosition).safeGet(alternative->m_minimumSize)) |
michael@0 | 682 | return RuntimeError; |
michael@0 | 683 | *callFrameSizeOut = currentCallFrameSize; |
michael@0 | 684 | return NoError; |
michael@0 | 685 | } |
michael@0 | 686 | |
michael@0 | 687 | ErrorCode setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition, unsigned *maximumCallFrameSizeOut) |
michael@0 | 688 | { |
michael@0 | 689 | if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) |
michael@0 | 690 | initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative; |
michael@0 | 691 | |
michael@0 | 692 | unsigned minimumInputSize = UINT_MAX; |
michael@0 | 693 | unsigned maximumCallFrameSize = 0; |
michael@0 | 694 | bool hasFixedSize = true; |
michael@0 | 695 | |
michael@0 | 696 | for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { |
michael@0 | 697 | PatternAlternative* alternative = disjunction->m_alternatives[alt]; |
michael@0 | 698 | unsigned currentAlternativeCallFrameSize; |
michael@0 | 699 | if (ErrorCode error = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition, ¤tAlternativeCallFrameSize)) |
michael@0 | 700 | return error; |
michael@0 | 701 | minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize); |
michael@0 | 702 | maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize); |
michael@0 | 703 | hasFixedSize &= alternative->m_hasFixedSize; |
michael@0 | 704 | } |
michael@0 | 705 | |
michael@0 | 706 | ASSERT(minimumInputSize != UINT_MAX); |
michael@0 | 707 | if (minimumInputSize == UINT_MAX) |
michael@0 | 708 | return PatternTooLarge; |
michael@0 | 709 | |
michael@0 | 710 | ASSERT(minimumInputSize != UINT_MAX); |
michael@0 | 711 | ASSERT(maximumCallFrameSize >= initialCallFrameSize); |
michael@0 | 712 | |
michael@0 | 713 | disjunction->m_hasFixedSize = hasFixedSize; |
michael@0 | 714 | disjunction->m_minimumSize = minimumInputSize; |
michael@0 | 715 | disjunction->m_callFrameSize = maximumCallFrameSize; |
michael@0 | 716 | *maximumCallFrameSizeOut = maximumCallFrameSize; |
michael@0 | 717 | return NoError; |
michael@0 | 718 | } |
michael@0 | 719 | |
michael@0 | 720 | ErrorCode setupOffsets() |
michael@0 | 721 | { |
michael@0 | 722 | unsigned dummy; |
michael@0 | 723 | return setupDisjunctionOffsets(m_pattern.m_body, BASE_FRAME_SIZE, 0, &dummy); |
michael@0 | 724 | } |
michael@0 | 725 | |
michael@0 | 726 | // This optimization identifies sets of parentheses that we will never need to backtrack. |
michael@0 | 727 | // In these cases we do not need to store state from prior iterations. |
michael@0 | 728 | // We can presently avoid backtracking for: |
michael@0 | 729 | // * where the parens are at the end of the regular expression (last term in any of the |
michael@0 | 730 | // alternatives of the main body disjunction). |
michael@0 | 731 | // * where the parens are non-capturing, and quantified unbounded greedy (*). |
michael@0 | 732 | // * where the parens do not contain any capturing subpatterns. |
michael@0 | 733 | void checkForTerminalParentheses() |
michael@0 | 734 | { |
michael@0 | 735 | // This check is much too crude; should be just checking whether the candidate |
michael@0 | 736 | // node contains nested capturing subpatterns, not the whole expression! |
michael@0 | 737 | if (m_pattern.m_numSubpatterns) |
michael@0 | 738 | return; |
michael@0 | 739 | |
michael@0 | 740 | Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives; |
michael@0 | 741 | for (size_t i = 0; i < alternatives.size(); ++i) { |
michael@0 | 742 | Vector<PatternTerm>& terms = alternatives[i]->m_terms; |
michael@0 | 743 | if (terms.size()) { |
michael@0 | 744 | PatternTerm& term = terms.last(); |
michael@0 | 745 | if (term.type == PatternTerm::TypeParenthesesSubpattern |
michael@0 | 746 | && term.quantityType == QuantifierGreedy |
michael@0 | 747 | && term.quantityCount == quantifyInfinite |
michael@0 | 748 | && !term.capture()) |
michael@0 | 749 | term.parentheses.isTerminal = true; |
michael@0 | 750 | } |
michael@0 | 751 | } |
michael@0 | 752 | } |
michael@0 | 753 | |
michael@0 | 754 | void optimizeBOL() |
michael@0 | 755 | { |
michael@0 | 756 | // Look for expressions containing beginning of line (^) anchoring and unroll them. |
michael@0 | 757 | // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops |
michael@0 | 758 | // This code relies on the parsing code tagging alternatives with m_containsBOL and |
michael@0 | 759 | // m_startsWithBOL and rolling those up to containing alternatives. |
michael@0 | 760 | // At this point, this is only valid for non-multiline expressions. |
michael@0 | 761 | PatternDisjunction* disjunction = m_pattern.m_body; |
michael@0 | 762 | |
michael@0 | 763 | if (!m_pattern.m_containsBOL || m_pattern.m_multiline) |
michael@0 | 764 | return; |
michael@0 | 765 | |
michael@0 | 766 | PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true); |
michael@0 | 767 | |
michael@0 | 768 | // Set alternatives in disjunction to "onceThrough" |
michael@0 | 769 | for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) |
michael@0 | 770 | disjunction->m_alternatives[alt]->setOnceThrough(); |
michael@0 | 771 | |
michael@0 | 772 | if (loopDisjunction) { |
michael@0 | 773 | // Move alternatives from loopDisjunction to disjunction |
michael@0 | 774 | for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt) |
michael@0 | 775 | disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]); |
michael@0 | 776 | |
michael@0 | 777 | loopDisjunction->m_alternatives.clear(); |
michael@0 | 778 | } |
michael@0 | 779 | } |
michael@0 | 780 | |
michael@0 | 781 | bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t lastTermIndex) |
michael@0 | 782 | { |
michael@0 | 783 | Vector<PatternTerm>& terms = alternative->m_terms; |
michael@0 | 784 | |
michael@0 | 785 | for (size_t termIndex = firstTermIndex; termIndex <= lastTermIndex; ++termIndex) { |
michael@0 | 786 | PatternTerm& term = terms[termIndex]; |
michael@0 | 787 | |
michael@0 | 788 | if (term.m_capture) |
michael@0 | 789 | return true; |
michael@0 | 790 | |
michael@0 | 791 | if (term.type == PatternTerm::TypeParenthesesSubpattern) { |
michael@0 | 792 | PatternDisjunction* nestedDisjunction = term.parentheses.disjunction; |
michael@0 | 793 | for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) { |
michael@0 | 794 | PatternAlternative *pattern = nestedDisjunction->m_alternatives[alt]; |
michael@0 | 795 | if (pattern->m_terms.size() == 0) |
michael@0 | 796 | continue; |
michael@0 | 797 | if (containsCapturingTerms(pattern, 0, pattern->m_terms.size() - 1)) |
michael@0 | 798 | return true; |
michael@0 | 799 | } |
michael@0 | 800 | } |
michael@0 | 801 | } |
michael@0 | 802 | |
michael@0 | 803 | return false; |
michael@0 | 804 | } |
michael@0 | 805 | |
michael@0 | 806 | // This optimization identifies alternatives in the form of |
michael@0 | 807 | // [^].*[?]<expression>.*[$] for expressions that don't have any |
michael@0 | 808 | // capturing terms. The alternative is changed to <expression> |
michael@0 | 809 | // followed by processing of the dot stars to find and adjust the |
michael@0 | 810 | // beginning and the end of the match. |
michael@0 | 811 | void optimizeDotStarWrappedExpressions() |
michael@0 | 812 | { |
michael@0 | 813 | Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives; |
michael@0 | 814 | if (alternatives.size() != 1) |
michael@0 | 815 | return; |
michael@0 | 816 | |
michael@0 | 817 | PatternAlternative* alternative = alternatives[0]; |
michael@0 | 818 | Vector<PatternTerm>& terms = alternative->m_terms; |
michael@0 | 819 | if (terms.size() >= 3) { |
michael@0 | 820 | bool startsWithBOL = false; |
michael@0 | 821 | bool endsWithEOL = false; |
michael@0 | 822 | size_t termIndex, firstExpressionTerm, lastExpressionTerm; |
michael@0 | 823 | |
michael@0 | 824 | termIndex = 0; |
michael@0 | 825 | if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) { |
michael@0 | 826 | startsWithBOL = true; |
michael@0 | 827 | ++termIndex; |
michael@0 | 828 | } |
michael@0 | 829 | |
michael@0 | 830 | PatternTerm& firstNonAnchorTerm = terms[termIndex]; |
michael@0 | 831 | if ((firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (firstNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || !((firstNonAnchorTerm.quantityType == QuantifierGreedy) || (firstNonAnchorTerm.quantityType == QuantifierNonGreedy))) |
michael@0 | 832 | return; |
michael@0 | 833 | |
michael@0 | 834 | firstExpressionTerm = termIndex + 1; |
michael@0 | 835 | |
michael@0 | 836 | termIndex = terms.size() - 1; |
michael@0 | 837 | if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) { |
michael@0 | 838 | endsWithEOL = true; |
michael@0 | 839 | --termIndex; |
michael@0 | 840 | } |
michael@0 | 841 | |
michael@0 | 842 | PatternTerm& lastNonAnchorTerm = terms[termIndex]; |
michael@0 | 843 | if ((lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (lastNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || (lastNonAnchorTerm.quantityType != QuantifierGreedy)) |
michael@0 | 844 | return; |
michael@0 | 845 | |
michael@0 | 846 | lastExpressionTerm = termIndex - 1; |
michael@0 | 847 | |
michael@0 | 848 | if (firstExpressionTerm > lastExpressionTerm) |
michael@0 | 849 | return; |
michael@0 | 850 | |
michael@0 | 851 | if (!containsCapturingTerms(alternative, firstExpressionTerm, lastExpressionTerm)) { |
michael@0 | 852 | for (termIndex = terms.size() - 1; termIndex > lastExpressionTerm; --termIndex) |
michael@0 | 853 | terms.remove(termIndex); |
michael@0 | 854 | |
michael@0 | 855 | for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex) |
michael@0 | 856 | terms.remove(termIndex - 1); |
michael@0 | 857 | |
michael@0 | 858 | terms.append(PatternTerm(startsWithBOL, endsWithEOL)); |
michael@0 | 859 | |
michael@0 | 860 | m_pattern.m_containsBOL = false; |
michael@0 | 861 | } |
michael@0 | 862 | } |
michael@0 | 863 | } |
michael@0 | 864 | |
michael@0 | 865 | void setStackBase(uint8_t *stackBase) { |
michael@0 | 866 | m_stackBase = stackBase; |
michael@0 | 867 | } |
michael@0 | 868 | |
michael@0 | 869 | private: |
michael@0 | 870 | YarrPattern& m_pattern; |
michael@0 | 871 | uint8_t * m_stackBase; |
michael@0 | 872 | PatternAlternative* m_alternative; |
michael@0 | 873 | CharacterClassConstructor m_characterClassConstructor; |
michael@0 | 874 | bool m_invertCharacterClass; |
michael@0 | 875 | bool m_invertParentheticalAssertion; |
michael@0 | 876 | }; |
michael@0 | 877 | |
michael@0 | 878 | ErrorCode YarrPattern::compile(const String& patternString) |
michael@0 | 879 | { |
michael@0 | 880 | YarrPatternConstructor constructor(*this); |
michael@0 | 881 | |
michael@0 | 882 | if (ErrorCode error = parse(constructor, patternString)) |
michael@0 | 883 | return error; |
michael@0 | 884 | |
michael@0 | 885 | // If the pattern contains illegal backreferences reset & reparse. |
michael@0 | 886 | // Quoting Netscape's "What's new in JavaScript 1.2", |
michael@0 | 887 | // "Note: if the number of left parentheses is less than the number specified |
michael@0 | 888 | // in \#, the \# is taken as an octal escape as described in the next row." |
michael@0 | 889 | if (containsIllegalBackReference()) { |
michael@0 | 890 | unsigned numSubpatterns = m_numSubpatterns; |
michael@0 | 891 | |
michael@0 | 892 | constructor.reset(); |
michael@0 | 893 | #if !ASSERT_DISABLED |
michael@0 | 894 | ErrorCode error = |
michael@0 | 895 | #endif |
michael@0 | 896 | parse(constructor, patternString, numSubpatterns); |
michael@0 | 897 | |
michael@0 | 898 | ASSERT(!error); |
michael@0 | 899 | ASSERT(numSubpatterns == m_numSubpatterns); |
michael@0 | 900 | } |
michael@0 | 901 | |
michael@0 | 902 | uint8_t stackDummy_; |
michael@0 | 903 | constructor.setStackBase(&stackDummy_); |
michael@0 | 904 | |
michael@0 | 905 | constructor.checkForTerminalParentheses(); |
michael@0 | 906 | constructor.optimizeDotStarWrappedExpressions(); |
michael@0 | 907 | constructor.optimizeBOL(); |
michael@0 | 908 | |
michael@0 | 909 | if (ErrorCode error = constructor.setupOffsets()) |
michael@0 | 910 | return error; |
michael@0 | 911 | |
michael@0 | 912 | return NoError; |
michael@0 | 913 | } |
michael@0 | 914 | |
michael@0 | 915 | YarrPattern::YarrPattern(const String& pattern, bool ignoreCase, bool multiline, ErrorCode* error) |
michael@0 | 916 | : m_ignoreCase(ignoreCase) |
michael@0 | 917 | , m_multiline(multiline) |
michael@0 | 918 | , m_containsBackreferences(false) |
michael@0 | 919 | , m_containsBOL(false) |
michael@0 | 920 | , m_numSubpatterns(0) |
michael@0 | 921 | , m_maxBackReference(0) |
michael@0 | 922 | , newlineCached(0) |
michael@0 | 923 | , digitsCached(0) |
michael@0 | 924 | , spacesCached(0) |
michael@0 | 925 | , wordcharCached(0) |
michael@0 | 926 | , nondigitsCached(0) |
michael@0 | 927 | , nonspacesCached(0) |
michael@0 | 928 | , nonwordcharCached(0) |
michael@0 | 929 | { |
michael@0 | 930 | *error = compile(pattern); |
michael@0 | 931 | } |
michael@0 | 932 | |
michael@0 | 933 | } } |