|
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- |
|
2 * vim: set ts=8 sts=4 et sw=4 tw=99: |
|
3 * |
|
4 * Copyright (C) 2009 Apple Inc. All rights reserved. |
|
5 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged |
|
6 * |
|
7 * Redistribution and use in source and binary forms, with or without |
|
8 * modification, are permitted provided that the following conditions |
|
9 * are met: |
|
10 * 1. Redistributions of source code must retain the above copyright |
|
11 * notice, this list of conditions and the following disclaimer. |
|
12 * 2. Redistributions in binary form must reproduce the above copyright |
|
13 * notice, this list of conditions and the following disclaimer in the |
|
14 * documentation and/or other materials provided with the distribution. |
|
15 * |
|
16 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
|
17 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
|
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
|
20 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
|
21 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
|
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
|
23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
|
24 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|
26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
27 */ |
|
28 |
|
29 #include "yarr/YarrPattern.h" |
|
30 |
|
31 #include "yarr/Yarr.h" |
|
32 #include "yarr/YarrCanonicalizeUCS2.h" |
|
33 #include "yarr/YarrParser.h" |
|
34 |
|
35 using namespace WTF; |
|
36 |
|
37 namespace JSC { namespace Yarr { |
|
38 |
|
39 #include "yarr/RegExpJitTables.h" |
|
40 |
|
41 #if WTF_CPU_SPARC |
|
42 # define BASE_FRAME_SIZE 24 |
|
43 #else |
|
44 # define BASE_FRAME_SIZE 0 |
|
45 #endif |
|
46 |
|
47 // Thanks, windows.h! |
|
48 #undef min |
|
49 #undef max |
|
50 |
|
51 class CharacterClassConstructor { |
|
52 public: |
|
53 CharacterClassConstructor(bool isCaseInsensitive = false) |
|
54 : m_isCaseInsensitive(isCaseInsensitive) |
|
55 { |
|
56 } |
|
57 |
|
58 void reset() |
|
59 { |
|
60 m_matches.clear(); |
|
61 m_ranges.clear(); |
|
62 m_matchesUnicode.clear(); |
|
63 m_rangesUnicode.clear(); |
|
64 } |
|
65 |
|
66 void append(const CharacterClass* other) |
|
67 { |
|
68 for (size_t i = 0; i < other->m_matches.size(); ++i) |
|
69 addSorted(m_matches, other->m_matches[i]); |
|
70 for (size_t i = 0; i < other->m_ranges.size(); ++i) |
|
71 addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end); |
|
72 for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i) |
|
73 addSorted(m_matchesUnicode, other->m_matchesUnicode[i]); |
|
74 for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i) |
|
75 addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end); |
|
76 } |
|
77 |
|
78 void putChar(UChar ch) |
|
79 { |
|
80 // Handle ascii cases. |
|
81 if (ch <= 0x7f) { |
|
82 if (m_isCaseInsensitive && isASCIIAlpha(ch)) { |
|
83 addSorted(m_matches, toASCIIUpper(ch)); |
|
84 addSorted(m_matches, toASCIILower(ch)); |
|
85 } else |
|
86 addSorted(m_matches, ch); |
|
87 return; |
|
88 } |
|
89 |
|
90 // Simple case, not a case-insensitive match. |
|
91 if (!m_isCaseInsensitive) { |
|
92 addSorted(m_matchesUnicode, ch); |
|
93 return; |
|
94 } |
|
95 |
|
96 // Add multiple matches, if necessary. |
|
97 const UCS2CanonicalizationRange* info = rangeInfoFor(ch); |
|
98 if (info->type == CanonicalizeUnique) |
|
99 addSorted(m_matchesUnicode, ch); |
|
100 else |
|
101 putUnicodeIgnoreCase(ch, info); |
|
102 } |
|
103 |
|
104 void putUnicodeIgnoreCase(UChar ch, const UCS2CanonicalizationRange* info) |
|
105 { |
|
106 ASSERT(m_isCaseInsensitive); |
|
107 ASSERT(ch > 0x7f); |
|
108 ASSERT(ch >= info->begin && ch <= info->end); |
|
109 ASSERT(info->type != CanonicalizeUnique); |
|
110 if (info->type == CanonicalizeSet) { |
|
111 for (const uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) |
|
112 addSorted(m_matchesUnicode, ch); |
|
113 } else { |
|
114 addSorted(m_matchesUnicode, ch); |
|
115 addSorted(m_matchesUnicode, getCanonicalPair(info, ch)); |
|
116 } |
|
117 } |
|
118 |
|
119 void putRange(UChar lo, UChar hi) |
|
120 { |
|
121 if (lo <= 0x7f) { |
|
122 char asciiLo = lo; |
|
123 char asciiHi = std::min(hi, (UChar)0x7f); |
|
124 addSortedRange(m_ranges, lo, asciiHi); |
|
125 |
|
126 if (m_isCaseInsensitive) { |
|
127 if ((asciiLo <= 'Z') && (asciiHi >= 'A')) |
|
128 addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); |
|
129 if ((asciiLo <= 'z') && (asciiHi >= 'a')) |
|
130 addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); |
|
131 } |
|
132 } |
|
133 if (hi <= 0x7f) |
|
134 return; |
|
135 |
|
136 lo = std::max(lo, (UChar)0x80); |
|
137 addSortedRange(m_rangesUnicode, lo, hi); |
|
138 |
|
139 if (!m_isCaseInsensitive) |
|
140 return; |
|
141 |
|
142 const UCS2CanonicalizationRange* info = rangeInfoFor(lo); |
|
143 while (true) { |
|
144 // Handle the range [lo .. end] |
|
145 UChar end = std::min<UChar>(info->end, hi); |
|
146 |
|
147 switch (info->type) { |
|
148 case CanonicalizeUnique: |
|
149 // Nothing to do - no canonical equivalents. |
|
150 break; |
|
151 case CanonicalizeSet: { |
|
152 UChar ch; |
|
153 for (const uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) |
|
154 addSorted(m_matchesUnicode, ch); |
|
155 break; |
|
156 } |
|
157 case CanonicalizeRangeLo: |
|
158 addSortedRange(m_rangesUnicode, lo + info->value, end + info->value); |
|
159 break; |
|
160 case CanonicalizeRangeHi: |
|
161 addSortedRange(m_rangesUnicode, lo - info->value, end - info->value); |
|
162 break; |
|
163 case CanonicalizeAlternatingAligned: |
|
164 // Use addSortedRange since there is likely an abutting range to combine with. |
|
165 if (lo & 1) |
|
166 addSortedRange(m_rangesUnicode, lo - 1, lo - 1); |
|
167 if (!(end & 1)) |
|
168 addSortedRange(m_rangesUnicode, end + 1, end + 1); |
|
169 break; |
|
170 case CanonicalizeAlternatingUnaligned: |
|
171 // Use addSortedRange since there is likely an abutting range to combine with. |
|
172 if (!(lo & 1)) |
|
173 addSortedRange(m_rangesUnicode, lo - 1, lo - 1); |
|
174 if (end & 1) |
|
175 addSortedRange(m_rangesUnicode, end + 1, end + 1); |
|
176 break; |
|
177 } |
|
178 |
|
179 if (hi == end) |
|
180 return; |
|
181 |
|
182 ++info; |
|
183 lo = info->begin; |
|
184 }; |
|
185 |
|
186 } |
|
187 |
|
188 CharacterClass* charClass() |
|
189 { |
|
190 CharacterClass* characterClass = newOrCrash<CharacterClass>(); |
|
191 |
|
192 characterClass->m_matches.swap(m_matches); |
|
193 characterClass->m_ranges.swap(m_ranges); |
|
194 characterClass->m_matchesUnicode.swap(m_matchesUnicode); |
|
195 characterClass->m_rangesUnicode.swap(m_rangesUnicode); |
|
196 |
|
197 return characterClass; |
|
198 } |
|
199 |
|
200 private: |
|
201 void addSorted(Vector<UChar>& matches, UChar ch) |
|
202 { |
|
203 unsigned pos = 0; |
|
204 unsigned range = matches.size(); |
|
205 |
|
206 // binary chop, find position to insert char. |
|
207 while (range) { |
|
208 unsigned index = range >> 1; |
|
209 |
|
210 int val = matches[pos+index] - ch; |
|
211 if (!val) |
|
212 return; |
|
213 else if (val > 0) |
|
214 range = index; |
|
215 else { |
|
216 pos += (index+1); |
|
217 range -= (index+1); |
|
218 } |
|
219 } |
|
220 |
|
221 if (pos == matches.size()) |
|
222 matches.append(ch); |
|
223 else |
|
224 matches.insert(pos, ch); |
|
225 } |
|
226 |
|
227 void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) |
|
228 { |
|
229 unsigned end = ranges.size(); |
|
230 |
|
231 // Simple linear scan - I doubt there are that many ranges anyway... |
|
232 // feel free to fix this with something faster (eg binary chop). |
|
233 for (unsigned i = 0; i < end; ++i) { |
|
234 // does the new range fall before the current position in the array |
|
235 if (hi < ranges[i].begin) { |
|
236 // optional optimization: concatenate appending ranges? - may not be worthwhile. |
|
237 if (hi == (ranges[i].begin - 1)) { |
|
238 ranges[i].begin = lo; |
|
239 return; |
|
240 } |
|
241 ranges.insert(i, CharacterRange(lo, hi)); |
|
242 return; |
|
243 } |
|
244 // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining |
|
245 // If the new range start at or before the end of the last range, then the overlap (if it starts one after the |
|
246 // end of the last range they concatenate, which is just as good. |
|
247 if (lo <= (ranges[i].end + 1)) { |
|
248 // found an intersect! we'll replace this entry in the array. |
|
249 ranges[i].begin = std::min(ranges[i].begin, lo); |
|
250 ranges[i].end = std::max(ranges[i].end, hi); |
|
251 |
|
252 // now check if the new range can subsume any subsequent ranges. |
|
253 unsigned next = i+1; |
|
254 // each iteration of the loop we will either remove something from the list, or break the loop. |
|
255 while (next < ranges.size()) { |
|
256 if (ranges[next].begin <= (ranges[i].end + 1)) { |
|
257 // the next entry now overlaps / concatenates this one. |
|
258 ranges[i].end = std::max(ranges[i].end, ranges[next].end); |
|
259 ranges.remove(next); |
|
260 } else |
|
261 break; |
|
262 } |
|
263 |
|
264 return; |
|
265 } |
|
266 } |
|
267 |
|
268 // CharacterRange comes after all existing ranges. |
|
269 ranges.append(CharacterRange(lo, hi)); |
|
270 } |
|
271 |
|
272 bool m_isCaseInsensitive; |
|
273 |
|
274 Vector<UChar> m_matches; |
|
275 Vector<CharacterRange> m_ranges; |
|
276 Vector<UChar> m_matchesUnicode; |
|
277 Vector<CharacterRange> m_rangesUnicode; |
|
278 }; |
|
279 |
|
280 class YarrPatternConstructor { |
|
281 public: |
|
282 YarrPatternConstructor(YarrPattern& pattern) |
|
283 : m_pattern(pattern) |
|
284 , m_stackBase(nullptr) |
|
285 , m_characterClassConstructor(pattern.m_ignoreCase) |
|
286 , m_invertParentheticalAssertion(false) |
|
287 { |
|
288 m_pattern.m_body = newOrCrash<PatternDisjunction>(); |
|
289 m_alternative = m_pattern.m_body->addNewAlternative(); |
|
290 m_pattern.m_disjunctions.append(m_pattern.m_body); |
|
291 } |
|
292 |
|
293 ~YarrPatternConstructor() |
|
294 { |
|
295 } |
|
296 |
|
297 void reset() |
|
298 { |
|
299 m_pattern.reset(); |
|
300 m_characterClassConstructor.reset(); |
|
301 |
|
302 m_pattern.m_body = newOrCrash<PatternDisjunction>(); |
|
303 m_alternative = m_pattern.m_body->addNewAlternative(); |
|
304 m_pattern.m_disjunctions.append(m_pattern.m_body); |
|
305 } |
|
306 |
|
307 void assertionBOL() |
|
308 { |
|
309 if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) { |
|
310 m_alternative->m_startsWithBOL = true; |
|
311 m_alternative->m_containsBOL = true; |
|
312 m_pattern.m_containsBOL = true; |
|
313 } |
|
314 m_alternative->m_terms.append(PatternTerm::BOL()); |
|
315 } |
|
316 void assertionEOL() |
|
317 { |
|
318 m_alternative->m_terms.append(PatternTerm::EOL()); |
|
319 } |
|
320 void assertionWordBoundary(bool invert) |
|
321 { |
|
322 m_alternative->m_terms.append(PatternTerm::WordBoundary(invert)); |
|
323 } |
|
324 |
|
325 void atomPatternCharacter(UChar ch) |
|
326 { |
|
327 // We handle case-insensitive checking of unicode characters which do have both |
|
328 // cases by handling them as if they were defined using a CharacterClass. |
|
329 if (!m_pattern.m_ignoreCase || isASCII(ch)) { |
|
330 m_alternative->m_terms.append(PatternTerm(ch)); |
|
331 return; |
|
332 } |
|
333 |
|
334 const UCS2CanonicalizationRange* info = rangeInfoFor(ch); |
|
335 if (info->type == CanonicalizeUnique) { |
|
336 m_alternative->m_terms.append(PatternTerm(ch)); |
|
337 return; |
|
338 } |
|
339 |
|
340 m_characterClassConstructor.putUnicodeIgnoreCase(ch, info); |
|
341 CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); |
|
342 m_pattern.m_userCharacterClasses.append(newCharacterClass); |
|
343 m_alternative->m_terms.append(PatternTerm(newCharacterClass, false)); |
|
344 } |
|
345 |
|
346 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) |
|
347 { |
|
348 switch (classID) { |
|
349 case DigitClassID: |
|
350 m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert)); |
|
351 break; |
|
352 case SpaceClassID: |
|
353 m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert)); |
|
354 break; |
|
355 case WordClassID: |
|
356 m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert)); |
|
357 break; |
|
358 case NewlineClassID: |
|
359 m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert)); |
|
360 break; |
|
361 } |
|
362 } |
|
363 |
|
364 void atomCharacterClassBegin(bool invert = false) |
|
365 { |
|
366 m_invertCharacterClass = invert; |
|
367 } |
|
368 |
|
369 void atomCharacterClassAtom(UChar ch) |
|
370 { |
|
371 m_characterClassConstructor.putChar(ch); |
|
372 } |
|
373 |
|
374 void atomCharacterClassRange(UChar begin, UChar end) |
|
375 { |
|
376 m_characterClassConstructor.putRange(begin, end); |
|
377 } |
|
378 |
|
379 void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) |
|
380 { |
|
381 ASSERT(classID != NewlineClassID); |
|
382 |
|
383 switch (classID) { |
|
384 case DigitClassID: |
|
385 m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); |
|
386 break; |
|
387 |
|
388 case SpaceClassID: |
|
389 m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); |
|
390 break; |
|
391 |
|
392 case WordClassID: |
|
393 m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); |
|
394 break; |
|
395 |
|
396 default: |
|
397 ASSERT_NOT_REACHED(); |
|
398 } |
|
399 } |
|
400 |
|
401 void atomCharacterClassEnd() |
|
402 { |
|
403 CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); |
|
404 m_pattern.m_userCharacterClasses.append(newCharacterClass); |
|
405 m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass)); |
|
406 } |
|
407 |
|
408 void atomParenthesesSubpatternBegin(bool capture = true) |
|
409 { |
|
410 unsigned subpatternId = m_pattern.m_numSubpatterns + 1; |
|
411 if (capture) |
|
412 m_pattern.m_numSubpatterns++; |
|
413 |
|
414 PatternDisjunction* parenthesesDisjunction = newOrCrash<PatternDisjunction>(m_alternative); |
|
415 m_pattern.m_disjunctions.append(parenthesesDisjunction); |
|
416 m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, false)); |
|
417 m_alternative = parenthesesDisjunction->addNewAlternative(); |
|
418 } |
|
419 |
|
420 void atomParentheticalAssertionBegin(bool invert = false) |
|
421 { |
|
422 PatternDisjunction* parenthesesDisjunction = newOrCrash<PatternDisjunction>(m_alternative); |
|
423 m_pattern.m_disjunctions.append(parenthesesDisjunction); |
|
424 m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, false, invert)); |
|
425 m_alternative = parenthesesDisjunction->addNewAlternative(); |
|
426 m_invertParentheticalAssertion = invert; |
|
427 } |
|
428 |
|
429 void atomParenthesesEnd() |
|
430 { |
|
431 ASSERT(m_alternative->m_parent); |
|
432 ASSERT(m_alternative->m_parent->m_parent); |
|
433 |
|
434 PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent; |
|
435 m_alternative = m_alternative->m_parent->m_parent; |
|
436 |
|
437 PatternTerm& lastTerm = m_alternative->lastTerm(); |
|
438 |
|
439 unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size(); |
|
440 unsigned numBOLAnchoredAlts = 0; |
|
441 |
|
442 for (unsigned i = 0; i < numParenAlternatives; i++) { |
|
443 // Bubble up BOL flags |
|
444 if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL) |
|
445 numBOLAnchoredAlts++; |
|
446 } |
|
447 |
|
448 if (numBOLAnchoredAlts) { |
|
449 m_alternative->m_containsBOL = true; |
|
450 // If all the alternatives in parens start with BOL, then so does this one |
|
451 if (numBOLAnchoredAlts == numParenAlternatives) |
|
452 m_alternative->m_startsWithBOL = true; |
|
453 } |
|
454 |
|
455 lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; |
|
456 m_invertParentheticalAssertion = false; |
|
457 } |
|
458 |
|
459 void atomBackReference(unsigned subpatternId) |
|
460 { |
|
461 ASSERT(subpatternId); |
|
462 m_pattern.m_containsBackreferences = true; |
|
463 m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId); |
|
464 |
|
465 if (subpatternId > m_pattern.m_numSubpatterns) { |
|
466 m_alternative->m_terms.append(PatternTerm::ForwardReference()); |
|
467 return; |
|
468 } |
|
469 |
|
470 PatternAlternative* currentAlternative = m_alternative; |
|
471 ASSERT(currentAlternative); |
|
472 |
|
473 // Note to self: if we waited until the AST was baked, we could also remove forwards refs |
|
474 while ((currentAlternative = currentAlternative->m_parent->m_parent)) { |
|
475 PatternTerm& term = currentAlternative->lastTerm(); |
|
476 ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); |
|
477 |
|
478 if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) { |
|
479 m_alternative->m_terms.append(PatternTerm::ForwardReference()); |
|
480 return; |
|
481 } |
|
482 } |
|
483 |
|
484 m_alternative->m_terms.append(PatternTerm(subpatternId)); |
|
485 } |
|
486 |
|
487 // deep copy the argument disjunction. If filterStartsWithBOL is true, |
|
488 // skip alternatives with m_startsWithBOL set true. |
|
489 PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false) |
|
490 { |
|
491 PatternDisjunction* newDisjunction = 0; |
|
492 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { |
|
493 PatternAlternative* alternative = disjunction->m_alternatives[alt]; |
|
494 if (!filterStartsWithBOL || !alternative->m_startsWithBOL) { |
|
495 if (!newDisjunction) { |
|
496 newDisjunction = newOrCrash<PatternDisjunction>(); |
|
497 newDisjunction->m_parent = disjunction->m_parent; |
|
498 } |
|
499 PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); |
|
500 newAlternative->m_terms.reserve(alternative->m_terms.size()); |
|
501 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) |
|
502 newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL)); |
|
503 } |
|
504 } |
|
505 |
|
506 if (newDisjunction) |
|
507 m_pattern.m_disjunctions.append(newDisjunction); |
|
508 return newDisjunction; |
|
509 } |
|
510 |
|
511 PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false) |
|
512 { |
|
513 if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) |
|
514 return PatternTerm(term); |
|
515 |
|
516 PatternTerm termCopy = term; |
|
517 termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL); |
|
518 return termCopy; |
|
519 } |
|
520 |
|
521 void quantifyAtom(unsigned min, unsigned max, bool greedy) |
|
522 { |
|
523 ASSERT(min <= max); |
|
524 ASSERT(m_alternative->m_terms.size()); |
|
525 |
|
526 if (!max) { |
|
527 m_alternative->removeLastTerm(); |
|
528 return; |
|
529 } |
|
530 |
|
531 PatternTerm& term = m_alternative->lastTerm(); |
|
532 ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); |
|
533 ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); |
|
534 |
|
535 if (term.type == PatternTerm::TypeParentheticalAssertion) { |
|
536 // If an assertion is quantified with a minimum count of zero, it can simply be removed. |
|
537 // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never |
|
538 // results in any input being consumed, however the continuation passed to the assertion |
|
539 // (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will |
|
540 // reject all zero length matches (see step 2.1). A match from the continuation of the |
|
541 // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all |
|
542 // this is that matches from the assertion are not required, and won't be accepted anyway, |
|
543 // so no need to ever run it. |
|
544 if (!min) |
|
545 m_alternative->removeLastTerm(); |
|
546 // We never need to run an assertion more than once. Subsequent interations will be run |
|
547 // with the same start index (since assertions are non-capturing) and the same captures |
|
548 // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the |
|
549 // same result and captures. If the first match succeeds then the subsequent (min - 1) |
|
550 // matches will too. Any additional optional matches will fail (on the same basis as the |
|
551 // minimum zero quantified assertions, above), but this will still result in a match. |
|
552 return; |
|
553 } |
|
554 |
|
555 if (min == 0) |
|
556 term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy); |
|
557 else if (min == max) |
|
558 term.quantify(min, QuantifierFixedCount); |
|
559 else { |
|
560 term.quantify(min, QuantifierFixedCount); |
|
561 m_alternative->m_terms.append(copyTerm(term)); |
|
562 // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... |
|
563 m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy); |
|
564 if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) |
|
565 m_alternative->lastTerm().parentheses.isCopy = true; |
|
566 } |
|
567 } |
|
568 |
|
569 void disjunction() |
|
570 { |
|
571 m_alternative = m_alternative->m_parent->addNewAlternative(); |
|
572 } |
|
573 |
|
574 ErrorCode setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition, |
|
575 unsigned *callFrameSizeOut) |
|
576 { |
|
577 /* |
|
578 * Attempt detection of over-recursion: |
|
579 * "1MB should be enough stack for anyone." |
|
580 */ |
|
581 uint8_t stackDummy_; |
|
582 if (m_stackBase - &stackDummy_ > 1024*1024) |
|
583 return PatternTooLarge; |
|
584 |
|
585 alternative->m_hasFixedSize = true; |
|
586 Checked<unsigned> currentInputPosition = initialInputPosition; |
|
587 |
|
588 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { |
|
589 PatternTerm& term = alternative->m_terms[i]; |
|
590 |
|
591 switch (term.type) { |
|
592 case PatternTerm::TypeAssertionBOL: |
|
593 case PatternTerm::TypeAssertionEOL: |
|
594 case PatternTerm::TypeAssertionWordBoundary: |
|
595 if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) |
|
596 return RuntimeError; |
|
597 break; |
|
598 |
|
599 case PatternTerm::TypeBackReference: |
|
600 if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) |
|
601 return RuntimeError; |
|
602 term.frameLocation = currentCallFrameSize; |
|
603 currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference; |
|
604 alternative->m_hasFixedSize = false; |
|
605 break; |
|
606 |
|
607 case PatternTerm::TypeForwardReference: |
|
608 break; |
|
609 |
|
610 case PatternTerm::TypePatternCharacter: |
|
611 if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) |
|
612 return RuntimeError; |
|
613 if (term.quantityType != QuantifierFixedCount) { |
|
614 term.frameLocation = currentCallFrameSize; |
|
615 currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter; |
|
616 alternative->m_hasFixedSize = false; |
|
617 } else |
|
618 currentInputPosition += term.quantityCount; |
|
619 break; |
|
620 |
|
621 case PatternTerm::TypeCharacterClass: |
|
622 if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) |
|
623 return RuntimeError; |
|
624 if (term.quantityType != QuantifierFixedCount) { |
|
625 term.frameLocation = currentCallFrameSize; |
|
626 currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass; |
|
627 alternative->m_hasFixedSize = false; |
|
628 } else |
|
629 currentInputPosition += term.quantityCount; |
|
630 break; |
|
631 |
|
632 case PatternTerm::TypeParenthesesSubpattern: |
|
633 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. |
|
634 term.frameLocation = currentCallFrameSize; |
|
635 unsigned position; |
|
636 if (currentInputPosition.safeGet(position)) |
|
637 return RuntimeError; |
|
638 if (term.quantityCount == 1 && !term.parentheses.isCopy) { |
|
639 if (term.quantityType != QuantifierFixedCount) |
|
640 currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce; |
|
641 if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, position, ¤tCallFrameSize)) |
|
642 return error; |
|
643 // If quantity is fixed, then pre-check its minimum size. |
|
644 if (term.quantityType == QuantifierFixedCount) |
|
645 currentInputPosition += term.parentheses.disjunction->m_minimumSize; |
|
646 if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) |
|
647 return RuntimeError; |
|
648 } else if (term.parentheses.isTerminal) { |
|
649 currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal; |
|
650 if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, position, ¤tCallFrameSize)) |
|
651 return error; |
|
652 if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) |
|
653 return RuntimeError; |
|
654 } else { |
|
655 if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) |
|
656 return RuntimeError; |
|
657 unsigned dummy; |
|
658 if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, BASE_FRAME_SIZE, position, &dummy)) |
|
659 return error; |
|
660 currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses; |
|
661 } |
|
662 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. |
|
663 alternative->m_hasFixedSize = false; |
|
664 break; |
|
665 |
|
666 case PatternTerm::TypeParentheticalAssertion: |
|
667 if (Checked<int>(currentInputPosition).safeGet(term.inputPosition)) |
|
668 return RuntimeError; |
|
669 term.frameLocation = currentCallFrameSize; |
|
670 if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, term.inputPosition, ¤tCallFrameSize)) |
|
671 return error; |
|
672 break; |
|
673 |
|
674 case PatternTerm::TypeDotStarEnclosure: |
|
675 alternative->m_hasFixedSize = false; |
|
676 term.inputPosition = initialInputPosition; |
|
677 break; |
|
678 } |
|
679 } |
|
680 |
|
681 if ((currentInputPosition - initialInputPosition).safeGet(alternative->m_minimumSize)) |
|
682 return RuntimeError; |
|
683 *callFrameSizeOut = currentCallFrameSize; |
|
684 return NoError; |
|
685 } |
|
686 |
|
687 ErrorCode setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition, unsigned *maximumCallFrameSizeOut) |
|
688 { |
|
689 if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) |
|
690 initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative; |
|
691 |
|
692 unsigned minimumInputSize = UINT_MAX; |
|
693 unsigned maximumCallFrameSize = 0; |
|
694 bool hasFixedSize = true; |
|
695 |
|
696 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { |
|
697 PatternAlternative* alternative = disjunction->m_alternatives[alt]; |
|
698 unsigned currentAlternativeCallFrameSize; |
|
699 if (ErrorCode error = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition, ¤tAlternativeCallFrameSize)) |
|
700 return error; |
|
701 minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize); |
|
702 maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize); |
|
703 hasFixedSize &= alternative->m_hasFixedSize; |
|
704 } |
|
705 |
|
706 ASSERT(minimumInputSize != UINT_MAX); |
|
707 if (minimumInputSize == UINT_MAX) |
|
708 return PatternTooLarge; |
|
709 |
|
710 ASSERT(minimumInputSize != UINT_MAX); |
|
711 ASSERT(maximumCallFrameSize >= initialCallFrameSize); |
|
712 |
|
713 disjunction->m_hasFixedSize = hasFixedSize; |
|
714 disjunction->m_minimumSize = minimumInputSize; |
|
715 disjunction->m_callFrameSize = maximumCallFrameSize; |
|
716 *maximumCallFrameSizeOut = maximumCallFrameSize; |
|
717 return NoError; |
|
718 } |
|
719 |
|
720 ErrorCode setupOffsets() |
|
721 { |
|
722 unsigned dummy; |
|
723 return setupDisjunctionOffsets(m_pattern.m_body, BASE_FRAME_SIZE, 0, &dummy); |
|
724 } |
|
725 |
|
726 // This optimization identifies sets of parentheses that we will never need to backtrack. |
|
727 // In these cases we do not need to store state from prior iterations. |
|
728 // We can presently avoid backtracking for: |
|
729 // * where the parens are at the end of the regular expression (last term in any of the |
|
730 // alternatives of the main body disjunction). |
|
731 // * where the parens are non-capturing, and quantified unbounded greedy (*). |
|
732 // * where the parens do not contain any capturing subpatterns. |
|
733 void checkForTerminalParentheses() |
|
734 { |
|
735 // This check is much too crude; should be just checking whether the candidate |
|
736 // node contains nested capturing subpatterns, not the whole expression! |
|
737 if (m_pattern.m_numSubpatterns) |
|
738 return; |
|
739 |
|
740 Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives; |
|
741 for (size_t i = 0; i < alternatives.size(); ++i) { |
|
742 Vector<PatternTerm>& terms = alternatives[i]->m_terms; |
|
743 if (terms.size()) { |
|
744 PatternTerm& term = terms.last(); |
|
745 if (term.type == PatternTerm::TypeParenthesesSubpattern |
|
746 && term.quantityType == QuantifierGreedy |
|
747 && term.quantityCount == quantifyInfinite |
|
748 && !term.capture()) |
|
749 term.parentheses.isTerminal = true; |
|
750 } |
|
751 } |
|
752 } |
|
753 |
|
754 void optimizeBOL() |
|
755 { |
|
756 // Look for expressions containing beginning of line (^) anchoring and unroll them. |
|
757 // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops |
|
758 // This code relies on the parsing code tagging alternatives with m_containsBOL and |
|
759 // m_startsWithBOL and rolling those up to containing alternatives. |
|
760 // At this point, this is only valid for non-multiline expressions. |
|
761 PatternDisjunction* disjunction = m_pattern.m_body; |
|
762 |
|
763 if (!m_pattern.m_containsBOL || m_pattern.m_multiline) |
|
764 return; |
|
765 |
|
766 PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true); |
|
767 |
|
768 // Set alternatives in disjunction to "onceThrough" |
|
769 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) |
|
770 disjunction->m_alternatives[alt]->setOnceThrough(); |
|
771 |
|
772 if (loopDisjunction) { |
|
773 // Move alternatives from loopDisjunction to disjunction |
|
774 for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt) |
|
775 disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]); |
|
776 |
|
777 loopDisjunction->m_alternatives.clear(); |
|
778 } |
|
779 } |
|
780 |
|
781 bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t lastTermIndex) |
|
782 { |
|
783 Vector<PatternTerm>& terms = alternative->m_terms; |
|
784 |
|
785 for (size_t termIndex = firstTermIndex; termIndex <= lastTermIndex; ++termIndex) { |
|
786 PatternTerm& term = terms[termIndex]; |
|
787 |
|
788 if (term.m_capture) |
|
789 return true; |
|
790 |
|
791 if (term.type == PatternTerm::TypeParenthesesSubpattern) { |
|
792 PatternDisjunction* nestedDisjunction = term.parentheses.disjunction; |
|
793 for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) { |
|
794 PatternAlternative *pattern = nestedDisjunction->m_alternatives[alt]; |
|
795 if (pattern->m_terms.size() == 0) |
|
796 continue; |
|
797 if (containsCapturingTerms(pattern, 0, pattern->m_terms.size() - 1)) |
|
798 return true; |
|
799 } |
|
800 } |
|
801 } |
|
802 |
|
803 return false; |
|
804 } |
|
805 |
|
806 // This optimization identifies alternatives in the form of |
|
807 // [^].*[?]<expression>.*[$] for expressions that don't have any |
|
808 // capturing terms. The alternative is changed to <expression> |
|
809 // followed by processing of the dot stars to find and adjust the |
|
810 // beginning and the end of the match. |
|
811 void optimizeDotStarWrappedExpressions() |
|
812 { |
|
813 Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives; |
|
814 if (alternatives.size() != 1) |
|
815 return; |
|
816 |
|
817 PatternAlternative* alternative = alternatives[0]; |
|
818 Vector<PatternTerm>& terms = alternative->m_terms; |
|
819 if (terms.size() >= 3) { |
|
820 bool startsWithBOL = false; |
|
821 bool endsWithEOL = false; |
|
822 size_t termIndex, firstExpressionTerm, lastExpressionTerm; |
|
823 |
|
824 termIndex = 0; |
|
825 if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) { |
|
826 startsWithBOL = true; |
|
827 ++termIndex; |
|
828 } |
|
829 |
|
830 PatternTerm& firstNonAnchorTerm = terms[termIndex]; |
|
831 if ((firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (firstNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || !((firstNonAnchorTerm.quantityType == QuantifierGreedy) || (firstNonAnchorTerm.quantityType == QuantifierNonGreedy))) |
|
832 return; |
|
833 |
|
834 firstExpressionTerm = termIndex + 1; |
|
835 |
|
836 termIndex = terms.size() - 1; |
|
837 if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) { |
|
838 endsWithEOL = true; |
|
839 --termIndex; |
|
840 } |
|
841 |
|
842 PatternTerm& lastNonAnchorTerm = terms[termIndex]; |
|
843 if ((lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (lastNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || (lastNonAnchorTerm.quantityType != QuantifierGreedy)) |
|
844 return; |
|
845 |
|
846 lastExpressionTerm = termIndex - 1; |
|
847 |
|
848 if (firstExpressionTerm > lastExpressionTerm) |
|
849 return; |
|
850 |
|
851 if (!containsCapturingTerms(alternative, firstExpressionTerm, lastExpressionTerm)) { |
|
852 for (termIndex = terms.size() - 1; termIndex > lastExpressionTerm; --termIndex) |
|
853 terms.remove(termIndex); |
|
854 |
|
855 for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex) |
|
856 terms.remove(termIndex - 1); |
|
857 |
|
858 terms.append(PatternTerm(startsWithBOL, endsWithEOL)); |
|
859 |
|
860 m_pattern.m_containsBOL = false; |
|
861 } |
|
862 } |
|
863 } |
|
864 |
|
865 void setStackBase(uint8_t *stackBase) { |
|
866 m_stackBase = stackBase; |
|
867 } |
|
868 |
|
869 private: |
|
870 YarrPattern& m_pattern; |
|
871 uint8_t * m_stackBase; |
|
872 PatternAlternative* m_alternative; |
|
873 CharacterClassConstructor m_characterClassConstructor; |
|
874 bool m_invertCharacterClass; |
|
875 bool m_invertParentheticalAssertion; |
|
876 }; |
|
877 |
|
878 ErrorCode YarrPattern::compile(const String& patternString) |
|
879 { |
|
880 YarrPatternConstructor constructor(*this); |
|
881 |
|
882 if (ErrorCode error = parse(constructor, patternString)) |
|
883 return error; |
|
884 |
|
885 // If the pattern contains illegal backreferences reset & reparse. |
|
886 // Quoting Netscape's "What's new in JavaScript 1.2", |
|
887 // "Note: if the number of left parentheses is less than the number specified |
|
888 // in \#, the \# is taken as an octal escape as described in the next row." |
|
889 if (containsIllegalBackReference()) { |
|
890 unsigned numSubpatterns = m_numSubpatterns; |
|
891 |
|
892 constructor.reset(); |
|
893 #if !ASSERT_DISABLED |
|
894 ErrorCode error = |
|
895 #endif |
|
896 parse(constructor, patternString, numSubpatterns); |
|
897 |
|
898 ASSERT(!error); |
|
899 ASSERT(numSubpatterns == m_numSubpatterns); |
|
900 } |
|
901 |
|
902 uint8_t stackDummy_; |
|
903 constructor.setStackBase(&stackDummy_); |
|
904 |
|
905 constructor.checkForTerminalParentheses(); |
|
906 constructor.optimizeDotStarWrappedExpressions(); |
|
907 constructor.optimizeBOL(); |
|
908 |
|
909 if (ErrorCode error = constructor.setupOffsets()) |
|
910 return error; |
|
911 |
|
912 return NoError; |
|
913 } |
|
914 |
|
915 YarrPattern::YarrPattern(const String& pattern, bool ignoreCase, bool multiline, ErrorCode* error) |
|
916 : m_ignoreCase(ignoreCase) |
|
917 , m_multiline(multiline) |
|
918 , m_containsBackreferences(false) |
|
919 , m_containsBOL(false) |
|
920 , m_numSubpatterns(0) |
|
921 , m_maxBackReference(0) |
|
922 , newlineCached(0) |
|
923 , digitsCached(0) |
|
924 , spacesCached(0) |
|
925 , wordcharCached(0) |
|
926 , nondigitsCached(0) |
|
927 , nonspacesCached(0) |
|
928 , nonwordcharCached(0) |
|
929 { |
|
930 *error = compile(pattern); |
|
931 } |
|
932 |
|
933 } } |