|
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- |
|
2 * vim: set ts=8 sts=4 et sw=4 tw=99: |
|
3 * |
|
4 * Copyright (C) 2009, 2013 Apple Inc. All rights reserved. |
|
5 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged |
|
6 * |
|
7 * Redistribution and use in source and binary forms, with or without |
|
8 * modification, are permitted provided that the following conditions |
|
9 * are met: |
|
10 * 1. Redistributions of source code must retain the above copyright |
|
11 * notice, this list of conditions and the following disclaimer. |
|
12 * 2. Redistributions in binary form must reproduce the above copyright |
|
13 * notice, this list of conditions and the following disclaimer in the |
|
14 * documentation and/or other materials provided with the distribution. |
|
15 * |
|
16 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
|
17 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
|
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
|
20 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
|
21 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
|
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
|
23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
|
24 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|
26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
27 */ |
|
28 |
|
29 #ifndef yarr_YarrPattern_h |
|
30 #define yarr_YarrPattern_h |
|
31 |
|
32 #include "yarr/wtfbridge.h" |
|
33 #include "yarr/ASCIICType.h" |
|
34 |
|
35 namespace JSC { namespace Yarr { |
|
36 |
|
37 struct PatternDisjunction; |
|
38 |
|
39 enum ErrorCode { |
|
40 NoError, |
|
41 PatternTooLarge, |
|
42 QuantifierOutOfOrder, |
|
43 QuantifierWithoutAtom, |
|
44 QuantifierTooLarge, |
|
45 MissingParentheses, |
|
46 ParenthesesUnmatched, |
|
47 ParenthesesTypeInvalid, |
|
48 CharacterClassUnmatched, |
|
49 CharacterClassOutOfOrder, |
|
50 CharacterClassInvalidRange, |
|
51 EscapeUnterminated, |
|
52 RuntimeError, |
|
53 NumberOfErrorCodes |
|
54 }; |
|
55 |
|
56 static inline const char* errorMessage(ErrorCode code) |
|
57 { |
|
58 |
|
59 #define REGEXP_ERROR_PREFIX "Invalid regular expression: " |
|
60 // The order of this array must match the ErrorCode enum. |
|
61 static const char* errorMessages[NumberOfErrorCodes] = { |
|
62 0, // NoError |
|
63 REGEXP_ERROR_PREFIX "regular expression too large", |
|
64 REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", |
|
65 REGEXP_ERROR_PREFIX "nothing to repeat", |
|
66 REGEXP_ERROR_PREFIX "number too large in {} quantifier", |
|
67 REGEXP_ERROR_PREFIX "missing )", |
|
68 REGEXP_ERROR_PREFIX "unmatched parentheses", |
|
69 REGEXP_ERROR_PREFIX "unrecognized character after (?", |
|
70 REGEXP_ERROR_PREFIX "missing terminating ] for character class", |
|
71 REGEXP_ERROR_PREFIX "character class out of order" |
|
72 REGEXP_ERROR_PREFIX "range out of order in character class", |
|
73 REGEXP_ERROR_PREFIX "\\ at end of pattern" |
|
74 }; |
|
75 #undef REGEXP_ERROR_PREFIX |
|
76 |
|
77 return errorMessages[code]; |
|
78 } |
|
79 |
|
80 struct CharacterRange { |
|
81 UChar begin; |
|
82 UChar end; |
|
83 |
|
84 CharacterRange(UChar begin, UChar end) |
|
85 : begin(begin) |
|
86 , end(end) |
|
87 { |
|
88 } |
|
89 }; |
|
90 |
|
91 struct CharacterClass { |
|
92 WTF_MAKE_FAST_ALLOCATED; |
|
93 public: |
|
94 // All CharacterClass instances have to have the full set of matches and ranges, |
|
95 // they may have an optional m_table for faster lookups (which must match the |
|
96 // specified matches and ranges) |
|
97 CharacterClass() |
|
98 : m_table(0) |
|
99 { |
|
100 } |
|
101 CharacterClass(const char* table, bool inverted) |
|
102 : m_table(table) |
|
103 , m_tableInverted(inverted) |
|
104 { |
|
105 } |
|
106 Vector<UChar> m_matches; |
|
107 Vector<CharacterRange> m_ranges; |
|
108 Vector<UChar> m_matchesUnicode; |
|
109 Vector<CharacterRange> m_rangesUnicode; |
|
110 |
|
111 const char* m_table; |
|
112 bool m_tableInverted; |
|
113 }; |
|
114 |
|
115 enum QuantifierType { |
|
116 QuantifierFixedCount, |
|
117 QuantifierGreedy, |
|
118 QuantifierNonGreedy |
|
119 }; |
|
120 |
|
121 struct PatternTerm { |
|
122 enum Type { |
|
123 TypeAssertionBOL, |
|
124 TypeAssertionEOL, |
|
125 TypeAssertionWordBoundary, |
|
126 TypePatternCharacter, |
|
127 TypeCharacterClass, |
|
128 TypeBackReference, |
|
129 TypeForwardReference, |
|
130 TypeParenthesesSubpattern, |
|
131 TypeParentheticalAssertion, |
|
132 TypeDotStarEnclosure |
|
133 } type; |
|
134 bool m_capture :1; |
|
135 bool m_invert :1; |
|
136 union { |
|
137 UChar patternCharacter; |
|
138 CharacterClass* characterClass; |
|
139 unsigned backReferenceSubpatternId; |
|
140 struct { |
|
141 PatternDisjunction* disjunction; |
|
142 unsigned subpatternId; |
|
143 unsigned lastSubpatternId; |
|
144 bool isCopy; |
|
145 bool isTerminal; |
|
146 } parentheses; |
|
147 struct { |
|
148 bool bolAnchor : 1; |
|
149 bool eolAnchor : 1; |
|
150 } anchors; |
|
151 }; |
|
152 QuantifierType quantityType; |
|
153 Checked<unsigned> quantityCount; |
|
154 int inputPosition; |
|
155 unsigned frameLocation; |
|
156 |
|
157 PatternTerm(UChar ch) |
|
158 : type(PatternTerm::TypePatternCharacter) |
|
159 , m_capture(false) |
|
160 , m_invert(false) |
|
161 { |
|
162 patternCharacter = ch; |
|
163 quantityType = QuantifierFixedCount; |
|
164 quantityCount = 1; |
|
165 } |
|
166 |
|
167 PatternTerm(CharacterClass* charClass, bool invert) |
|
168 : type(PatternTerm::TypeCharacterClass) |
|
169 , m_capture(false) |
|
170 , m_invert(invert) |
|
171 { |
|
172 characterClass = charClass; |
|
173 quantityType = QuantifierFixedCount; |
|
174 quantityCount = 1; |
|
175 } |
|
176 |
|
177 PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false) |
|
178 : type(type) |
|
179 , m_capture(capture) |
|
180 , m_invert(invert) |
|
181 { |
|
182 parentheses.disjunction = disjunction; |
|
183 parentheses.subpatternId = subpatternId; |
|
184 parentheses.isCopy = false; |
|
185 parentheses.isTerminal = false; |
|
186 quantityType = QuantifierFixedCount; |
|
187 quantityCount = 1; |
|
188 } |
|
189 |
|
190 PatternTerm(Type type, bool invert = false) |
|
191 : type(type) |
|
192 , m_capture(false) |
|
193 , m_invert(invert) |
|
194 { |
|
195 quantityType = QuantifierFixedCount; |
|
196 quantityCount = 1; |
|
197 } |
|
198 |
|
199 PatternTerm(unsigned spatternId) |
|
200 : type(TypeBackReference) |
|
201 , m_capture(false) |
|
202 , m_invert(false) |
|
203 { |
|
204 backReferenceSubpatternId = spatternId; |
|
205 quantityType = QuantifierFixedCount; |
|
206 quantityCount = 1; |
|
207 } |
|
208 |
|
209 PatternTerm(bool bolAnchor, bool eolAnchor) |
|
210 : type(TypeDotStarEnclosure) |
|
211 , m_capture(false) |
|
212 , m_invert(false) |
|
213 { |
|
214 anchors.bolAnchor = bolAnchor; |
|
215 anchors.eolAnchor = eolAnchor; |
|
216 quantityType = QuantifierFixedCount; |
|
217 quantityCount = 1; |
|
218 } |
|
219 |
|
220 // No-argument constructor for js::Vector. |
|
221 PatternTerm() |
|
222 : type(PatternTerm::TypePatternCharacter) |
|
223 , m_capture(false) |
|
224 , m_invert(false) |
|
225 { |
|
226 patternCharacter = 0; |
|
227 quantityType = QuantifierFixedCount; |
|
228 quantityCount = 1; |
|
229 } |
|
230 |
|
231 static PatternTerm ForwardReference() |
|
232 { |
|
233 return PatternTerm(TypeForwardReference); |
|
234 } |
|
235 |
|
236 static PatternTerm BOL() |
|
237 { |
|
238 return PatternTerm(TypeAssertionBOL); |
|
239 } |
|
240 |
|
241 static PatternTerm EOL() |
|
242 { |
|
243 return PatternTerm(TypeAssertionEOL); |
|
244 } |
|
245 |
|
246 static PatternTerm WordBoundary(bool invert) |
|
247 { |
|
248 return PatternTerm(TypeAssertionWordBoundary, invert); |
|
249 } |
|
250 |
|
251 bool invert() |
|
252 { |
|
253 return m_invert; |
|
254 } |
|
255 |
|
256 bool capture() |
|
257 { |
|
258 return m_capture; |
|
259 } |
|
260 |
|
261 void quantify(unsigned count, QuantifierType type) |
|
262 { |
|
263 quantityCount = count; |
|
264 quantityType = type; |
|
265 } |
|
266 }; |
|
267 |
|
268 struct PatternAlternative { |
|
269 WTF_MAKE_FAST_ALLOCATED; |
|
270 public: |
|
271 PatternAlternative(PatternDisjunction* disjunction) |
|
272 : m_parent(disjunction) |
|
273 , m_onceThrough(false) |
|
274 , m_hasFixedSize(false) |
|
275 , m_startsWithBOL(false) |
|
276 , m_containsBOL(false) |
|
277 { |
|
278 } |
|
279 |
|
280 PatternTerm& lastTerm() |
|
281 { |
|
282 ASSERT(m_terms.size()); |
|
283 return m_terms[m_terms.size() - 1]; |
|
284 } |
|
285 |
|
286 void removeLastTerm() |
|
287 { |
|
288 ASSERT(m_terms.size()); |
|
289 m_terms.shrink(m_terms.size() - 1); |
|
290 } |
|
291 |
|
292 void setOnceThrough() |
|
293 { |
|
294 m_onceThrough = true; |
|
295 } |
|
296 |
|
297 bool onceThrough() |
|
298 { |
|
299 return m_onceThrough; |
|
300 } |
|
301 |
|
302 Vector<PatternTerm> m_terms; |
|
303 PatternDisjunction* m_parent; |
|
304 unsigned m_minimumSize; |
|
305 bool m_onceThrough : 1; |
|
306 bool m_hasFixedSize : 1; |
|
307 bool m_startsWithBOL : 1; |
|
308 bool m_containsBOL : 1; |
|
309 }; |
|
310 |
|
311 struct PatternDisjunction { |
|
312 WTF_MAKE_FAST_ALLOCATED; |
|
313 public: |
|
314 PatternDisjunction(PatternAlternative* parent = 0) |
|
315 : m_parent(parent) |
|
316 , m_hasFixedSize(false) |
|
317 { |
|
318 } |
|
319 |
|
320 ~PatternDisjunction() |
|
321 { |
|
322 deleteAllValues(m_alternatives); |
|
323 } |
|
324 |
|
325 PatternAlternative* addNewAlternative() |
|
326 { |
|
327 PatternAlternative* alternative = newOrCrash<PatternAlternative>(this); |
|
328 m_alternatives.append(alternative); |
|
329 return alternative; |
|
330 } |
|
331 |
|
332 Vector<PatternAlternative*> m_alternatives; |
|
333 PatternAlternative* m_parent; |
|
334 unsigned m_minimumSize; |
|
335 unsigned m_callFrameSize; |
|
336 bool m_hasFixedSize; |
|
337 }; |
|
338 |
|
339 // You probably don't want to be calling these functions directly |
|
340 // (please to be calling newlineCharacterClass() et al on your |
|
341 // friendly neighborhood YarrPattern instance to get nicely |
|
342 // cached copies). |
|
343 CharacterClass* newlineCreate(); |
|
344 CharacterClass* digitsCreate(); |
|
345 CharacterClass* spacesCreate(); |
|
346 CharacterClass* wordcharCreate(); |
|
347 CharacterClass* nondigitsCreate(); |
|
348 CharacterClass* nonspacesCreate(); |
|
349 CharacterClass* nonwordcharCreate(); |
|
350 |
|
351 struct TermChain { |
|
352 TermChain(PatternTerm term) |
|
353 : term(term) |
|
354 {} |
|
355 |
|
356 PatternTerm term; |
|
357 Vector<TermChain> hotTerms; |
|
358 }; |
|
359 |
|
360 struct YarrPattern { |
|
361 YarrPattern(const String& pattern, bool ignoreCase, bool multiline, ErrorCode* error); |
|
362 |
|
363 ~YarrPattern() |
|
364 { |
|
365 deleteAllValues(m_disjunctions); |
|
366 deleteAllValues(m_userCharacterClasses); |
|
367 } |
|
368 |
|
369 void reset() |
|
370 { |
|
371 m_numSubpatterns = 0; |
|
372 m_maxBackReference = 0; |
|
373 |
|
374 m_containsBackreferences = false; |
|
375 m_containsBOL = false; |
|
376 |
|
377 newlineCached = 0; |
|
378 digitsCached = 0; |
|
379 spacesCached = 0; |
|
380 wordcharCached = 0; |
|
381 nondigitsCached = 0; |
|
382 nonspacesCached = 0; |
|
383 nonwordcharCached = 0; |
|
384 |
|
385 deleteAllValues(m_disjunctions); |
|
386 m_disjunctions.clear(); |
|
387 deleteAllValues(m_userCharacterClasses); |
|
388 m_userCharacterClasses.clear(); |
|
389 } |
|
390 |
|
391 bool containsIllegalBackReference() |
|
392 { |
|
393 return m_maxBackReference > m_numSubpatterns; |
|
394 } |
|
395 |
|
396 CharacterClass* newlineCharacterClass() |
|
397 { |
|
398 if (!newlineCached) |
|
399 m_userCharacterClasses.append(newlineCached = newlineCreate()); |
|
400 return newlineCached; |
|
401 } |
|
402 CharacterClass* digitsCharacterClass() |
|
403 { |
|
404 if (!digitsCached) |
|
405 m_userCharacterClasses.append(digitsCached = digitsCreate()); |
|
406 return digitsCached; |
|
407 } |
|
408 CharacterClass* spacesCharacterClass() |
|
409 { |
|
410 if (!spacesCached) |
|
411 m_userCharacterClasses.append(spacesCached = spacesCreate()); |
|
412 return spacesCached; |
|
413 } |
|
414 CharacterClass* wordcharCharacterClass() |
|
415 { |
|
416 if (!wordcharCached) |
|
417 m_userCharacterClasses.append(wordcharCached = wordcharCreate()); |
|
418 return wordcharCached; |
|
419 } |
|
420 CharacterClass* nondigitsCharacterClass() |
|
421 { |
|
422 if (!nondigitsCached) |
|
423 m_userCharacterClasses.append(nondigitsCached = nondigitsCreate()); |
|
424 return nondigitsCached; |
|
425 } |
|
426 CharacterClass* nonspacesCharacterClass() |
|
427 { |
|
428 if (!nonspacesCached) |
|
429 m_userCharacterClasses.append(nonspacesCached = nonspacesCreate()); |
|
430 return nonspacesCached; |
|
431 } |
|
432 CharacterClass* nonwordcharCharacterClass() |
|
433 { |
|
434 if (!nonwordcharCached) |
|
435 m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate()); |
|
436 return nonwordcharCached; |
|
437 } |
|
438 |
|
439 bool m_ignoreCase : 1; |
|
440 bool m_multiline : 1; |
|
441 bool m_containsBackreferences : 1; |
|
442 bool m_containsBOL : 1; |
|
443 unsigned m_numSubpatterns; |
|
444 unsigned m_maxBackReference; |
|
445 PatternDisjunction* m_body; |
|
446 Vector<PatternDisjunction*, 4> m_disjunctions; |
|
447 Vector<CharacterClass*> m_userCharacterClasses; |
|
448 |
|
449 private: |
|
450 ErrorCode compile(const String& patternString); |
|
451 |
|
452 CharacterClass* newlineCached; |
|
453 CharacterClass* digitsCached; |
|
454 CharacterClass* spacesCached; |
|
455 CharacterClass* wordcharCached; |
|
456 CharacterClass* nondigitsCached; |
|
457 CharacterClass* nonspacesCached; |
|
458 CharacterClass* nonwordcharCached; |
|
459 }; |
|
460 |
|
461 } } // namespace JSC::Yarr |
|
462 |
|
463 #endif /* yarr_YarrPattern_h */ |