Wed, 31 Dec 2014 07:53:36 +0100
Correct small whitespace inconsistency, lost while renaming variables.
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 * vim: set ts=8 sts=4 et sw=4 tw=99:
3 *
4 * Copyright (C) 2009, 2013 Apple Inc. All rights reserved.
5 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
17 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
20 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
29 #ifndef yarr_YarrPattern_h
30 #define yarr_YarrPattern_h
32 #include "yarr/wtfbridge.h"
33 #include "yarr/ASCIICType.h"
35 namespace JSC { namespace Yarr {
37 struct PatternDisjunction;
39 enum ErrorCode {
40 NoError,
41 PatternTooLarge,
42 QuantifierOutOfOrder,
43 QuantifierWithoutAtom,
44 QuantifierTooLarge,
45 MissingParentheses,
46 ParenthesesUnmatched,
47 ParenthesesTypeInvalid,
48 CharacterClassUnmatched,
49 CharacterClassOutOfOrder,
50 CharacterClassInvalidRange,
51 EscapeUnterminated,
52 RuntimeError,
53 NumberOfErrorCodes
54 };
56 static inline const char* errorMessage(ErrorCode code)
57 {
59 #define REGEXP_ERROR_PREFIX "Invalid regular expression: "
60 // The order of this array must match the ErrorCode enum.
61 static const char* errorMessages[NumberOfErrorCodes] = {
62 0, // NoError
63 REGEXP_ERROR_PREFIX "regular expression too large",
64 REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",
65 REGEXP_ERROR_PREFIX "nothing to repeat",
66 REGEXP_ERROR_PREFIX "number too large in {} quantifier",
67 REGEXP_ERROR_PREFIX "missing )",
68 REGEXP_ERROR_PREFIX "unmatched parentheses",
69 REGEXP_ERROR_PREFIX "unrecognized character after (?",
70 REGEXP_ERROR_PREFIX "missing terminating ] for character class",
71 REGEXP_ERROR_PREFIX "character class out of order"
72 REGEXP_ERROR_PREFIX "range out of order in character class",
73 REGEXP_ERROR_PREFIX "\\ at end of pattern"
74 };
75 #undef REGEXP_ERROR_PREFIX
77 return errorMessages[code];
78 }
80 struct CharacterRange {
81 UChar begin;
82 UChar end;
84 CharacterRange(UChar begin, UChar end)
85 : begin(begin)
86 , end(end)
87 {
88 }
89 };
91 struct CharacterClass {
92 WTF_MAKE_FAST_ALLOCATED;
93 public:
94 // All CharacterClass instances have to have the full set of matches and ranges,
95 // they may have an optional m_table for faster lookups (which must match the
96 // specified matches and ranges)
97 CharacterClass()
98 : m_table(0)
99 {
100 }
101 CharacterClass(const char* table, bool inverted)
102 : m_table(table)
103 , m_tableInverted(inverted)
104 {
105 }
106 Vector<UChar> m_matches;
107 Vector<CharacterRange> m_ranges;
108 Vector<UChar> m_matchesUnicode;
109 Vector<CharacterRange> m_rangesUnicode;
111 const char* m_table;
112 bool m_tableInverted;
113 };
115 enum QuantifierType {
116 QuantifierFixedCount,
117 QuantifierGreedy,
118 QuantifierNonGreedy
119 };
121 struct PatternTerm {
122 enum Type {
123 TypeAssertionBOL,
124 TypeAssertionEOL,
125 TypeAssertionWordBoundary,
126 TypePatternCharacter,
127 TypeCharacterClass,
128 TypeBackReference,
129 TypeForwardReference,
130 TypeParenthesesSubpattern,
131 TypeParentheticalAssertion,
132 TypeDotStarEnclosure
133 } type;
134 bool m_capture :1;
135 bool m_invert :1;
136 union {
137 UChar patternCharacter;
138 CharacterClass* characterClass;
139 unsigned backReferenceSubpatternId;
140 struct {
141 PatternDisjunction* disjunction;
142 unsigned subpatternId;
143 unsigned lastSubpatternId;
144 bool isCopy;
145 bool isTerminal;
146 } parentheses;
147 struct {
148 bool bolAnchor : 1;
149 bool eolAnchor : 1;
150 } anchors;
151 };
152 QuantifierType quantityType;
153 Checked<unsigned> quantityCount;
154 int inputPosition;
155 unsigned frameLocation;
157 PatternTerm(UChar ch)
158 : type(PatternTerm::TypePatternCharacter)
159 , m_capture(false)
160 , m_invert(false)
161 {
162 patternCharacter = ch;
163 quantityType = QuantifierFixedCount;
164 quantityCount = 1;
165 }
167 PatternTerm(CharacterClass* charClass, bool invert)
168 : type(PatternTerm::TypeCharacterClass)
169 , m_capture(false)
170 , m_invert(invert)
171 {
172 characterClass = charClass;
173 quantityType = QuantifierFixedCount;
174 quantityCount = 1;
175 }
177 PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false)
178 : type(type)
179 , m_capture(capture)
180 , m_invert(invert)
181 {
182 parentheses.disjunction = disjunction;
183 parentheses.subpatternId = subpatternId;
184 parentheses.isCopy = false;
185 parentheses.isTerminal = false;
186 quantityType = QuantifierFixedCount;
187 quantityCount = 1;
188 }
190 PatternTerm(Type type, bool invert = false)
191 : type(type)
192 , m_capture(false)
193 , m_invert(invert)
194 {
195 quantityType = QuantifierFixedCount;
196 quantityCount = 1;
197 }
199 PatternTerm(unsigned spatternId)
200 : type(TypeBackReference)
201 , m_capture(false)
202 , m_invert(false)
203 {
204 backReferenceSubpatternId = spatternId;
205 quantityType = QuantifierFixedCount;
206 quantityCount = 1;
207 }
209 PatternTerm(bool bolAnchor, bool eolAnchor)
210 : type(TypeDotStarEnclosure)
211 , m_capture(false)
212 , m_invert(false)
213 {
214 anchors.bolAnchor = bolAnchor;
215 anchors.eolAnchor = eolAnchor;
216 quantityType = QuantifierFixedCount;
217 quantityCount = 1;
218 }
220 // No-argument constructor for js::Vector.
221 PatternTerm()
222 : type(PatternTerm::TypePatternCharacter)
223 , m_capture(false)
224 , m_invert(false)
225 {
226 patternCharacter = 0;
227 quantityType = QuantifierFixedCount;
228 quantityCount = 1;
229 }
231 static PatternTerm ForwardReference()
232 {
233 return PatternTerm(TypeForwardReference);
234 }
236 static PatternTerm BOL()
237 {
238 return PatternTerm(TypeAssertionBOL);
239 }
241 static PatternTerm EOL()
242 {
243 return PatternTerm(TypeAssertionEOL);
244 }
246 static PatternTerm WordBoundary(bool invert)
247 {
248 return PatternTerm(TypeAssertionWordBoundary, invert);
249 }
251 bool invert()
252 {
253 return m_invert;
254 }
256 bool capture()
257 {
258 return m_capture;
259 }
261 void quantify(unsigned count, QuantifierType type)
262 {
263 quantityCount = count;
264 quantityType = type;
265 }
266 };
268 struct PatternAlternative {
269 WTF_MAKE_FAST_ALLOCATED;
270 public:
271 PatternAlternative(PatternDisjunction* disjunction)
272 : m_parent(disjunction)
273 , m_onceThrough(false)
274 , m_hasFixedSize(false)
275 , m_startsWithBOL(false)
276 , m_containsBOL(false)
277 {
278 }
280 PatternTerm& lastTerm()
281 {
282 ASSERT(m_terms.size());
283 return m_terms[m_terms.size() - 1];
284 }
286 void removeLastTerm()
287 {
288 ASSERT(m_terms.size());
289 m_terms.shrink(m_terms.size() - 1);
290 }
292 void setOnceThrough()
293 {
294 m_onceThrough = true;
295 }
297 bool onceThrough()
298 {
299 return m_onceThrough;
300 }
302 Vector<PatternTerm> m_terms;
303 PatternDisjunction* m_parent;
304 unsigned m_minimumSize;
305 bool m_onceThrough : 1;
306 bool m_hasFixedSize : 1;
307 bool m_startsWithBOL : 1;
308 bool m_containsBOL : 1;
309 };
311 struct PatternDisjunction {
312 WTF_MAKE_FAST_ALLOCATED;
313 public:
314 PatternDisjunction(PatternAlternative* parent = 0)
315 : m_parent(parent)
316 , m_hasFixedSize(false)
317 {
318 }
320 ~PatternDisjunction()
321 {
322 deleteAllValues(m_alternatives);
323 }
325 PatternAlternative* addNewAlternative()
326 {
327 PatternAlternative* alternative = newOrCrash<PatternAlternative>(this);
328 m_alternatives.append(alternative);
329 return alternative;
330 }
332 Vector<PatternAlternative*> m_alternatives;
333 PatternAlternative* m_parent;
334 unsigned m_minimumSize;
335 unsigned m_callFrameSize;
336 bool m_hasFixedSize;
337 };
339 // You probably don't want to be calling these functions directly
340 // (please to be calling newlineCharacterClass() et al on your
341 // friendly neighborhood YarrPattern instance to get nicely
342 // cached copies).
343 CharacterClass* newlineCreate();
344 CharacterClass* digitsCreate();
345 CharacterClass* spacesCreate();
346 CharacterClass* wordcharCreate();
347 CharacterClass* nondigitsCreate();
348 CharacterClass* nonspacesCreate();
349 CharacterClass* nonwordcharCreate();
351 struct TermChain {
352 TermChain(PatternTerm term)
353 : term(term)
354 {}
356 PatternTerm term;
357 Vector<TermChain> hotTerms;
358 };
360 struct YarrPattern {
361 YarrPattern(const String& pattern, bool ignoreCase, bool multiline, ErrorCode* error);
363 ~YarrPattern()
364 {
365 deleteAllValues(m_disjunctions);
366 deleteAllValues(m_userCharacterClasses);
367 }
369 void reset()
370 {
371 m_numSubpatterns = 0;
372 m_maxBackReference = 0;
374 m_containsBackreferences = false;
375 m_containsBOL = false;
377 newlineCached = 0;
378 digitsCached = 0;
379 spacesCached = 0;
380 wordcharCached = 0;
381 nondigitsCached = 0;
382 nonspacesCached = 0;
383 nonwordcharCached = 0;
385 deleteAllValues(m_disjunctions);
386 m_disjunctions.clear();
387 deleteAllValues(m_userCharacterClasses);
388 m_userCharacterClasses.clear();
389 }
391 bool containsIllegalBackReference()
392 {
393 return m_maxBackReference > m_numSubpatterns;
394 }
396 CharacterClass* newlineCharacterClass()
397 {
398 if (!newlineCached)
399 m_userCharacterClasses.append(newlineCached = newlineCreate());
400 return newlineCached;
401 }
402 CharacterClass* digitsCharacterClass()
403 {
404 if (!digitsCached)
405 m_userCharacterClasses.append(digitsCached = digitsCreate());
406 return digitsCached;
407 }
408 CharacterClass* spacesCharacterClass()
409 {
410 if (!spacesCached)
411 m_userCharacterClasses.append(spacesCached = spacesCreate());
412 return spacesCached;
413 }
414 CharacterClass* wordcharCharacterClass()
415 {
416 if (!wordcharCached)
417 m_userCharacterClasses.append(wordcharCached = wordcharCreate());
418 return wordcharCached;
419 }
420 CharacterClass* nondigitsCharacterClass()
421 {
422 if (!nondigitsCached)
423 m_userCharacterClasses.append(nondigitsCached = nondigitsCreate());
424 return nondigitsCached;
425 }
426 CharacterClass* nonspacesCharacterClass()
427 {
428 if (!nonspacesCached)
429 m_userCharacterClasses.append(nonspacesCached = nonspacesCreate());
430 return nonspacesCached;
431 }
432 CharacterClass* nonwordcharCharacterClass()
433 {
434 if (!nonwordcharCached)
435 m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate());
436 return nonwordcharCached;
437 }
439 bool m_ignoreCase : 1;
440 bool m_multiline : 1;
441 bool m_containsBackreferences : 1;
442 bool m_containsBOL : 1;
443 unsigned m_numSubpatterns;
444 unsigned m_maxBackReference;
445 PatternDisjunction* m_body;
446 Vector<PatternDisjunction*, 4> m_disjunctions;
447 Vector<CharacterClass*> m_userCharacterClasses;
449 private:
450 ErrorCode compile(const String& patternString);
452 CharacterClass* newlineCached;
453 CharacterClass* digitsCached;
454 CharacterClass* spacesCached;
455 CharacterClass* wordcharCached;
456 CharacterClass* nondigitsCached;
457 CharacterClass* nonspacesCached;
458 CharacterClass* nonwordcharCached;
459 };
461 } } // namespace JSC::Yarr
463 #endif /* yarr_YarrPattern_h */