|
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- |
|
2 * vim: set ts=8 sts=4 et sw=4 tw=99: |
|
3 * |
|
4 * Copyright (C) 2009 Apple Inc. All rights reserved. |
|
5 * |
|
6 * Redistribution and use in source and binary forms, with or without |
|
7 * modification, are permitted provided that the following conditions |
|
8 * are met: |
|
9 * 1. Redistributions of source code must retain the above copyright |
|
10 * notice, this list of conditions and the following disclaimer. |
|
11 * 2. Redistributions in binary form must reproduce the above copyright |
|
12 * notice, this list of conditions and the following disclaimer in the |
|
13 * documentation and/or other materials provided with the distribution. |
|
14 * |
|
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
|
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
|
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
|
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
|
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
|
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
|
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
|
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
26 */ |
|
27 |
|
28 #ifndef yarr_YarrParser_h |
|
29 #define yarr_YarrParser_h |
|
30 |
|
31 #include "yarr/Yarr.h" |
|
32 |
|
33 namespace JSC { namespace Yarr { |
|
34 |
|
35 enum BuiltInCharacterClassID { |
|
36 DigitClassID, |
|
37 SpaceClassID, |
|
38 WordClassID, |
|
39 NewlineClassID |
|
40 }; |
|
41 |
|
42 // The Parser class should not be used directly - only via the Yarr::parse() method. |
|
43 template<class Delegate, typename CharType> |
|
44 class Parser { |
|
45 private: |
|
46 template<class FriendDelegate> |
|
47 friend ErrorCode parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit); |
|
48 |
|
49 /* |
|
50 * CharacterClassParserDelegate: |
|
51 * |
|
52 * The class CharacterClassParserDelegate is used in the parsing of character |
|
53 * classes. This class handles detection of character ranges. This class |
|
54 * implements enough of the delegate interface such that it can be passed to |
|
55 * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused |
|
56 * to perform the parsing of escape characters in character sets. |
|
57 */ |
|
58 class CharacterClassParserDelegate { |
|
59 public: |
|
60 CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) |
|
61 : m_delegate(delegate) |
|
62 , m_err(err) |
|
63 , m_state(Empty) |
|
64 , m_character(0) |
|
65 { |
|
66 } |
|
67 |
|
68 /* |
|
69 * begin(): |
|
70 * |
|
71 * Called at beginning of construction. |
|
72 */ |
|
73 void begin(bool invert) |
|
74 { |
|
75 m_delegate.atomCharacterClassBegin(invert); |
|
76 } |
|
77 |
|
78 /* |
|
79 * atomPatternCharacter(): |
|
80 * |
|
81 * This method is called either from parseCharacterClass() (for an unescaped |
|
82 * character in a character class), or from parseEscape(). In the former case |
|
83 * the value true will be passed for the argument 'hyphenIsRange', and in this |
|
84 * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ |
|
85 * is different to /[a\-z]/). |
|
86 */ |
|
87 void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) |
|
88 { |
|
89 switch (m_state) { |
|
90 case AfterCharacterClass: |
|
91 // Following a builtin character class we need look out for a hyphen. |
|
92 // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. |
|
93 // If we see a hyphen following a charater class then unlike usual |
|
94 // we'll report it to the delegate immediately, and put ourself into |
|
95 // a poisoned state. Any following calls to add another character or |
|
96 // character class will result in an error. (A hypen following a |
|
97 // character-class is itself valid, but only at the end of a regex). |
|
98 if (hyphenIsRange && ch == '-') { |
|
99 m_delegate.atomCharacterClassAtom('-'); |
|
100 m_state = AfterCharacterClassHyphen; |
|
101 return; |
|
102 } |
|
103 // Otherwise just fall through - cached character so treat this as Empty. |
|
104 |
|
105 case Empty: |
|
106 m_character = ch; |
|
107 m_state = CachedCharacter; |
|
108 return; |
|
109 |
|
110 case CachedCharacter: |
|
111 if (hyphenIsRange && ch == '-') |
|
112 m_state = CachedCharacterHyphen; |
|
113 else { |
|
114 m_delegate.atomCharacterClassAtom(m_character); |
|
115 m_character = ch; |
|
116 } |
|
117 return; |
|
118 |
|
119 case CachedCharacterHyphen: |
|
120 if (ch < m_character) { |
|
121 m_err = CharacterClassOutOfOrder; |
|
122 return; |
|
123 } |
|
124 m_delegate.atomCharacterClassRange(m_character, ch); |
|
125 m_state = Empty; |
|
126 return; |
|
127 |
|
128 case AfterCharacterClassHyphen: |
|
129 m_delegate.atomCharacterClassAtom(ch); |
|
130 m_state = Empty; |
|
131 return; |
|
132 } |
|
133 } |
|
134 |
|
135 /* |
|
136 * atomBuiltInCharacterClass(): |
|
137 * |
|
138 * Adds a built-in character class, called by parseEscape(). |
|
139 */ |
|
140 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) |
|
141 { |
|
142 switch (m_state) { |
|
143 case CachedCharacter: |
|
144 // Flush the currently cached character, then fall through. |
|
145 m_delegate.atomCharacterClassAtom(m_character); |
|
146 |
|
147 case Empty: |
|
148 case AfterCharacterClass: |
|
149 m_state = AfterCharacterClass; |
|
150 m_delegate.atomCharacterClassBuiltIn(classID, invert); |
|
151 return; |
|
152 |
|
153 case CachedCharacterHyphen: |
|
154 // Error! We have a range that looks like [x-\d]. We require |
|
155 // the end of the range to be a single character. |
|
156 m_err = CharacterClassInvalidRange; |
|
157 return; |
|
158 case AfterCharacterClassHyphen: |
|
159 m_delegate.atomCharacterClassBuiltIn(classID, invert); |
|
160 m_state = Empty; |
|
161 return; |
|
162 } |
|
163 } |
|
164 |
|
165 /* |
|
166 * end(): |
|
167 * |
|
168 * Called at end of construction. |
|
169 */ |
|
170 void end() |
|
171 { |
|
172 if (m_state == CachedCharacter) |
|
173 m_delegate.atomCharacterClassAtom(m_character); |
|
174 else if (m_state == CachedCharacterHyphen) { |
|
175 m_delegate.atomCharacterClassAtom(m_character); |
|
176 m_delegate.atomCharacterClassAtom('-'); |
|
177 } |
|
178 m_delegate.atomCharacterClassEnd(); |
|
179 } |
|
180 |
|
181 // parseEscape() should never call these delegate methods when |
|
182 // invoked with inCharacterClass set. |
|
183 NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); } |
|
184 NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); } |
|
185 |
|
186 private: |
|
187 Delegate& m_delegate; |
|
188 ErrorCode& m_err; |
|
189 enum CharacterClassConstructionState { |
|
190 Empty, |
|
191 CachedCharacter, |
|
192 CachedCharacterHyphen, |
|
193 AfterCharacterClass, |
|
194 AfterCharacterClassHyphen |
|
195 } m_state; |
|
196 UChar m_character; |
|
197 }; |
|
198 |
|
199 Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit) |
|
200 : m_delegate(delegate) |
|
201 , m_backReferenceLimit(backReferenceLimit) |
|
202 , m_err(NoError) |
|
203 , m_data(pattern.chars()) |
|
204 , m_size(pattern.length()) |
|
205 , m_index(0) |
|
206 , m_parenthesesNestingDepth(0) |
|
207 { |
|
208 } |
|
209 |
|
210 /* |
|
211 * parseEscape(): |
|
212 * |
|
213 * Helper for parseTokens() AND parseCharacterClass(). |
|
214 * Unlike the other parser methods, this function does not report tokens |
|
215 * directly to the member delegate (m_delegate), instead tokens are |
|
216 * emitted to the delegate provided as an argument. In the case of atom |
|
217 * escapes, parseTokens() will call parseEscape() passing m_delegate as |
|
218 * an argument, and as such the escape will be reported to the delegate. |
|
219 * |
|
220 * However this method may also be used by parseCharacterClass(), in which |
|
221 * case a CharacterClassParserDelegate will be passed as the delegate that |
|
222 * tokens should be added to. A boolean flag is also provided to indicate |
|
223 * whether that an escape in a CharacterClass is being parsed (some parsing |
|
224 * rules change in this context). |
|
225 * |
|
226 * The boolean value returned by this method indicates whether the token |
|
227 * parsed was an atom (outside of a characted class \b and \B will be |
|
228 * interpreted as assertions). |
|
229 */ |
|
230 template<bool inCharacterClass, class EscapeDelegate> |
|
231 bool parseEscape(EscapeDelegate& delegate) |
|
232 { |
|
233 ASSERT(!m_err); |
|
234 ASSERT(peek() == '\\'); |
|
235 consume(); |
|
236 |
|
237 if (atEndOfPattern()) { |
|
238 m_err = EscapeUnterminated; |
|
239 return false; |
|
240 } |
|
241 |
|
242 switch (peek()) { |
|
243 // Assertions |
|
244 case 'b': |
|
245 consume(); |
|
246 if (inCharacterClass) |
|
247 delegate.atomPatternCharacter('\b'); |
|
248 else { |
|
249 delegate.assertionWordBoundary(false); |
|
250 return false; |
|
251 } |
|
252 break; |
|
253 case 'B': |
|
254 consume(); |
|
255 if (inCharacterClass) |
|
256 delegate.atomPatternCharacter('B'); |
|
257 else { |
|
258 delegate.assertionWordBoundary(true); |
|
259 return false; |
|
260 } |
|
261 break; |
|
262 |
|
263 // CharacterClassEscape |
|
264 case 'd': |
|
265 consume(); |
|
266 delegate.atomBuiltInCharacterClass(DigitClassID, false); |
|
267 break; |
|
268 case 's': |
|
269 consume(); |
|
270 delegate.atomBuiltInCharacterClass(SpaceClassID, false); |
|
271 break; |
|
272 case 'w': |
|
273 consume(); |
|
274 delegate.atomBuiltInCharacterClass(WordClassID, false); |
|
275 break; |
|
276 case 'D': |
|
277 consume(); |
|
278 delegate.atomBuiltInCharacterClass(DigitClassID, true); |
|
279 break; |
|
280 case 'S': |
|
281 consume(); |
|
282 delegate.atomBuiltInCharacterClass(SpaceClassID, true); |
|
283 break; |
|
284 case 'W': |
|
285 consume(); |
|
286 delegate.atomBuiltInCharacterClass(WordClassID, true); |
|
287 break; |
|
288 |
|
289 // DecimalEscape |
|
290 case '1': |
|
291 case '2': |
|
292 case '3': |
|
293 case '4': |
|
294 case '5': |
|
295 case '6': |
|
296 case '7': |
|
297 case '8': |
|
298 case '9': { |
|
299 // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. |
|
300 // First, try to parse this as backreference. |
|
301 if (!inCharacterClass) { |
|
302 ParseState state = saveState(); |
|
303 |
|
304 unsigned backReference; |
|
305 if (!consumeNumber(backReference)) |
|
306 break; |
|
307 if (backReference <= m_backReferenceLimit) { |
|
308 delegate.atomBackReference(backReference); |
|
309 break; |
|
310 } |
|
311 |
|
312 restoreState(state); |
|
313 } |
|
314 |
|
315 // Not a backreference, and not octal. |
|
316 if (peek() >= '8') { |
|
317 delegate.atomPatternCharacter('\\'); |
|
318 break; |
|
319 } |
|
320 |
|
321 // Fall-through to handle this as an octal escape. |
|
322 } |
|
323 |
|
324 // Octal escape |
|
325 case '0': |
|
326 delegate.atomPatternCharacter(consumeOctal()); |
|
327 break; |
|
328 |
|
329 // ControlEscape |
|
330 case 'f': |
|
331 consume(); |
|
332 delegate.atomPatternCharacter('\f'); |
|
333 break; |
|
334 case 'n': |
|
335 consume(); |
|
336 delegate.atomPatternCharacter('\n'); |
|
337 break; |
|
338 case 'r': |
|
339 consume(); |
|
340 delegate.atomPatternCharacter('\r'); |
|
341 break; |
|
342 case 't': |
|
343 consume(); |
|
344 delegate.atomPatternCharacter('\t'); |
|
345 break; |
|
346 case 'v': |
|
347 consume(); |
|
348 delegate.atomPatternCharacter('\v'); |
|
349 break; |
|
350 |
|
351 // ControlLetter |
|
352 case 'c': { |
|
353 ParseState state = saveState(); |
|
354 consume(); |
|
355 if (!atEndOfPattern()) { |
|
356 int control = consume(); |
|
357 |
|
358 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. |
|
359 if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { |
|
360 delegate.atomPatternCharacter(control & 0x1f); |
|
361 break; |
|
362 } |
|
363 } |
|
364 restoreState(state); |
|
365 delegate.atomPatternCharacter('\\'); |
|
366 break; |
|
367 } |
|
368 |
|
369 // HexEscape |
|
370 case 'x': { |
|
371 consume(); |
|
372 int x = tryConsumeHex(2); |
|
373 if (x == -1) |
|
374 delegate.atomPatternCharacter('x'); |
|
375 else |
|
376 delegate.atomPatternCharacter(x); |
|
377 break; |
|
378 } |
|
379 |
|
380 // UnicodeEscape |
|
381 case 'u': { |
|
382 consume(); |
|
383 int u = tryConsumeHex(4); |
|
384 if (u == -1) |
|
385 delegate.atomPatternCharacter('u'); |
|
386 else |
|
387 delegate.atomPatternCharacter(u); |
|
388 break; |
|
389 } |
|
390 |
|
391 // IdentityEscape |
|
392 default: |
|
393 delegate.atomPatternCharacter(consume()); |
|
394 } |
|
395 |
|
396 return true; |
|
397 } |
|
398 |
|
399 /* |
|
400 * parseAtomEscape(), parseCharacterClassEscape(): |
|
401 * |
|
402 * These methods alias to parseEscape(). |
|
403 */ |
|
404 bool parseAtomEscape() |
|
405 { |
|
406 return parseEscape<false>(m_delegate); |
|
407 } |
|
408 void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) |
|
409 { |
|
410 parseEscape<true>(delegate); |
|
411 } |
|
412 |
|
413 /* |
|
414 * parseCharacterClass(): |
|
415 * |
|
416 * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) |
|
417 * to an instance of CharacterClassParserDelegate, to describe the character class to the |
|
418 * delegate. |
|
419 */ |
|
420 void parseCharacterClass() |
|
421 { |
|
422 ASSERT(!m_err); |
|
423 ASSERT(peek() == '['); |
|
424 consume(); |
|
425 |
|
426 CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); |
|
427 |
|
428 characterClassConstructor.begin(tryConsume('^')); |
|
429 |
|
430 while (!atEndOfPattern()) { |
|
431 switch (peek()) { |
|
432 case ']': |
|
433 consume(); |
|
434 characterClassConstructor.end(); |
|
435 return; |
|
436 |
|
437 case '\\': |
|
438 parseCharacterClassEscape(characterClassConstructor); |
|
439 break; |
|
440 |
|
441 default: |
|
442 characterClassConstructor.atomPatternCharacter(consume(), true); |
|
443 } |
|
444 |
|
445 if (m_err) |
|
446 return; |
|
447 } |
|
448 |
|
449 m_err = CharacterClassUnmatched; |
|
450 } |
|
451 |
|
452 /* |
|
453 * parseParenthesesBegin(): |
|
454 * |
|
455 * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. |
|
456 */ |
|
457 void parseParenthesesBegin() |
|
458 { |
|
459 ASSERT(!m_err); |
|
460 ASSERT(peek() == '('); |
|
461 consume(); |
|
462 |
|
463 if (tryConsume('?')) { |
|
464 if (atEndOfPattern()) { |
|
465 m_err = ParenthesesTypeInvalid; |
|
466 return; |
|
467 } |
|
468 |
|
469 switch (consume()) { |
|
470 case ':': |
|
471 m_delegate.atomParenthesesSubpatternBegin(false); |
|
472 break; |
|
473 |
|
474 case '=': |
|
475 m_delegate.atomParentheticalAssertionBegin(); |
|
476 break; |
|
477 |
|
478 case '!': |
|
479 m_delegate.atomParentheticalAssertionBegin(true); |
|
480 break; |
|
481 |
|
482 default: |
|
483 m_err = ParenthesesTypeInvalid; |
|
484 } |
|
485 } else |
|
486 m_delegate.atomParenthesesSubpatternBegin(); |
|
487 |
|
488 ++m_parenthesesNestingDepth; |
|
489 } |
|
490 |
|
491 /* |
|
492 * parseParenthesesEnd(): |
|
493 * |
|
494 * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). |
|
495 */ |
|
496 void parseParenthesesEnd() |
|
497 { |
|
498 ASSERT(!m_err); |
|
499 ASSERT(peek() == ')'); |
|
500 consume(); |
|
501 |
|
502 if (m_parenthesesNestingDepth > 0) |
|
503 m_delegate.atomParenthesesEnd(); |
|
504 else |
|
505 m_err = ParenthesesUnmatched; |
|
506 |
|
507 --m_parenthesesNestingDepth; |
|
508 } |
|
509 |
|
510 /* |
|
511 * parseQuantifier(): |
|
512 * |
|
513 * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. |
|
514 */ |
|
515 void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) |
|
516 { |
|
517 ASSERT(!m_err); |
|
518 ASSERT(min <= max); |
|
519 |
|
520 if (min == UINT_MAX) { |
|
521 m_err = QuantifierTooLarge; |
|
522 return; |
|
523 } |
|
524 |
|
525 if (lastTokenWasAnAtom) |
|
526 m_delegate.quantifyAtom(min, max, !tryConsume('?')); |
|
527 else |
|
528 m_err = QuantifierWithoutAtom; |
|
529 } |
|
530 |
|
531 /* |
|
532 * parseTokens(): |
|
533 * |
|
534 * This method loops over the input pattern reporting tokens to the delegate. |
|
535 * The method returns when a parse error is detected, or the end of the pattern |
|
536 * is reached. One piece of state is tracked around the loop, which is whether |
|
537 * the last token passed to the delegate was an atom (this is necessary to detect |
|
538 * a parse error when a quantifier provided without an atom to quantify). |
|
539 */ |
|
540 void parseTokens() |
|
541 { |
|
542 bool lastTokenWasAnAtom = false; |
|
543 |
|
544 while (!atEndOfPattern()) { |
|
545 switch (peek()) { |
|
546 case '|': |
|
547 consume(); |
|
548 m_delegate.disjunction(); |
|
549 lastTokenWasAnAtom = false; |
|
550 break; |
|
551 |
|
552 case '(': |
|
553 parseParenthesesBegin(); |
|
554 lastTokenWasAnAtom = false; |
|
555 break; |
|
556 |
|
557 case ')': |
|
558 parseParenthesesEnd(); |
|
559 lastTokenWasAnAtom = true; |
|
560 break; |
|
561 |
|
562 case '^': |
|
563 consume(); |
|
564 m_delegate.assertionBOL(); |
|
565 lastTokenWasAnAtom = false; |
|
566 break; |
|
567 |
|
568 case '$': |
|
569 consume(); |
|
570 m_delegate.assertionEOL(); |
|
571 lastTokenWasAnAtom = false; |
|
572 break; |
|
573 |
|
574 case '.': |
|
575 consume(); |
|
576 m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); |
|
577 lastTokenWasAnAtom = true; |
|
578 break; |
|
579 |
|
580 case '[': |
|
581 parseCharacterClass(); |
|
582 lastTokenWasAnAtom = true; |
|
583 break; |
|
584 |
|
585 case '\\': |
|
586 lastTokenWasAnAtom = parseAtomEscape(); |
|
587 break; |
|
588 |
|
589 case '*': |
|
590 consume(); |
|
591 parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); |
|
592 lastTokenWasAnAtom = false; |
|
593 break; |
|
594 |
|
595 case '+': |
|
596 consume(); |
|
597 parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); |
|
598 lastTokenWasAnAtom = false; |
|
599 break; |
|
600 |
|
601 case '?': |
|
602 consume(); |
|
603 parseQuantifier(lastTokenWasAnAtom, 0, 1); |
|
604 lastTokenWasAnAtom = false; |
|
605 break; |
|
606 |
|
607 case '{': { |
|
608 ParseState state = saveState(); |
|
609 |
|
610 consume(); |
|
611 if (peekIsDigit()) { |
|
612 unsigned min; |
|
613 if (!consumeNumber(min)) |
|
614 break; |
|
615 |
|
616 unsigned max = min; |
|
617 if (tryConsume(',')) { |
|
618 if (peekIsDigit()) { |
|
619 if (!consumeNumber(max)) |
|
620 break; |
|
621 } else { |
|
622 max = quantifyInfinite; |
|
623 } |
|
624 } |
|
625 |
|
626 if (tryConsume('}')) { |
|
627 if (min <= max) |
|
628 parseQuantifier(lastTokenWasAnAtom, min, max); |
|
629 else |
|
630 m_err = QuantifierOutOfOrder; |
|
631 lastTokenWasAnAtom = false; |
|
632 break; |
|
633 } |
|
634 } |
|
635 |
|
636 restoreState(state); |
|
637 } // if we did not find a complete quantifer, fall through to the default case. |
|
638 |
|
639 default: |
|
640 m_delegate.atomPatternCharacter(consume()); |
|
641 lastTokenWasAnAtom = true; |
|
642 } |
|
643 |
|
644 if (m_err) |
|
645 return; |
|
646 } |
|
647 |
|
648 if (m_parenthesesNestingDepth > 0) |
|
649 m_err = MissingParentheses; |
|
650 } |
|
651 |
|
652 /* |
|
653 * parse(): |
|
654 * |
|
655 * This method calls parseTokens() to parse over the input and converts any |
|
656 * error code to a const char* for a result. |
|
657 */ |
|
658 ErrorCode parse() |
|
659 { |
|
660 if (m_size > MAX_PATTERN_SIZE) |
|
661 m_err = PatternTooLarge; |
|
662 else |
|
663 parseTokens(); |
|
664 ASSERT(atEndOfPattern() || m_err); |
|
665 |
|
666 return m_err; |
|
667 } |
|
668 |
|
669 // Misc helper functions: |
|
670 |
|
671 typedef unsigned ParseState; |
|
672 |
|
673 ParseState saveState() |
|
674 { |
|
675 return m_index; |
|
676 } |
|
677 |
|
678 void restoreState(ParseState state) |
|
679 { |
|
680 m_index = state; |
|
681 } |
|
682 |
|
683 bool atEndOfPattern() |
|
684 { |
|
685 ASSERT(m_index <= m_size); |
|
686 return m_index == m_size; |
|
687 } |
|
688 |
|
689 int peek() |
|
690 { |
|
691 ASSERT(m_index < m_size); |
|
692 return m_data[m_index]; |
|
693 } |
|
694 |
|
695 bool peekIsDigit() |
|
696 { |
|
697 return !atEndOfPattern() && WTF::isASCIIDigit(peek()); |
|
698 } |
|
699 |
|
700 unsigned peekDigit() |
|
701 { |
|
702 ASSERT(peekIsDigit()); |
|
703 return peek() - '0'; |
|
704 } |
|
705 |
|
706 int consume() |
|
707 { |
|
708 ASSERT(m_index < m_size); |
|
709 return m_data[m_index++]; |
|
710 } |
|
711 |
|
712 unsigned consumeDigit() |
|
713 { |
|
714 ASSERT(peekIsDigit()); |
|
715 return consume() - '0'; |
|
716 } |
|
717 |
|
718 bool consumeNumber(unsigned &accum) |
|
719 { |
|
720 accum = consumeDigit(); |
|
721 while (peekIsDigit()) { |
|
722 unsigned newValue = accum * 10 + peekDigit(); |
|
723 if (newValue < accum) { /* Overflow check. */ |
|
724 m_err = QuantifierTooLarge; |
|
725 return false; |
|
726 } |
|
727 accum = newValue; |
|
728 consume(); |
|
729 } |
|
730 return true; |
|
731 } |
|
732 |
|
733 unsigned consumeOctal() |
|
734 { |
|
735 ASSERT(WTF::isASCIIOctalDigit(peek())); |
|
736 |
|
737 unsigned n = consumeDigit(); |
|
738 while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) |
|
739 n = n * 8 + consumeDigit(); |
|
740 return n; |
|
741 } |
|
742 |
|
743 bool tryConsume(UChar ch) |
|
744 { |
|
745 if (atEndOfPattern() || (m_data[m_index] != ch)) |
|
746 return false; |
|
747 ++m_index; |
|
748 return true; |
|
749 } |
|
750 |
|
751 int tryConsumeHex(int count) |
|
752 { |
|
753 ParseState state = saveState(); |
|
754 |
|
755 int n = 0; |
|
756 while (count--) { |
|
757 if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { |
|
758 restoreState(state); |
|
759 return -1; |
|
760 } |
|
761 n = (n << 4) | WTF::toASCIIHexValue(consume()); |
|
762 } |
|
763 return n; |
|
764 } |
|
765 |
|
766 Delegate& m_delegate; |
|
767 unsigned m_backReferenceLimit; |
|
768 ErrorCode m_err; |
|
769 const CharType* m_data; |
|
770 unsigned m_size; |
|
771 unsigned m_index; |
|
772 unsigned m_parenthesesNestingDepth; |
|
773 |
|
774 // Derived by empirical testing of compile time in PCRE and WREC. |
|
775 static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; |
|
776 }; |
|
777 |
|
778 /* |
|
779 * Yarr::parse(): |
|
780 * |
|
781 * The parse method is passed a pattern to be parsed and a delegate upon which |
|
782 * callbacks will be made to record the parsed tokens forming the regex. |
|
783 * Yarr::parse() returns null on success, or a const C string providing an error |
|
784 * message where a parse error occurs. |
|
785 * |
|
786 * The Delegate must implement the following interface: |
|
787 * |
|
788 * void assertionBOL(); |
|
789 * void assertionEOL(); |
|
790 * void assertionWordBoundary(bool invert); |
|
791 * |
|
792 * void atomPatternCharacter(UChar ch); |
|
793 * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); |
|
794 * void atomCharacterClassBegin(bool invert) |
|
795 * void atomCharacterClassAtom(UChar ch) |
|
796 * void atomCharacterClassRange(UChar begin, UChar end) |
|
797 * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) |
|
798 * void atomCharacterClassEnd() |
|
799 * void atomParenthesesSubpatternBegin(bool capture = true); |
|
800 * void atomParentheticalAssertionBegin(bool invert = false); |
|
801 * void atomParenthesesEnd(); |
|
802 * void atomBackReference(unsigned subpatternId); |
|
803 * |
|
804 * void quantifyAtom(unsigned min, unsigned max, bool greedy); |
|
805 * |
|
806 * void disjunction(); |
|
807 * |
|
808 * The regular expression is described by a sequence of assertion*() and atom*() |
|
809 * callbacks to the delegate, describing the terms in the regular expression. |
|
810 * Following an atom a quantifyAtom() call may occur to indicate that the previous |
|
811 * atom should be quantified. In the case of atoms described across multiple |
|
812 * calls (parentheses and character classes) the call to quantifyAtom() will come |
|
813 * after the call to the atom*End() method, never after atom*Begin(). |
|
814 * |
|
815 * Character classes may either be described by a single call to |
|
816 * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. |
|
817 * In the latter case, ...Begin() will be called, followed by a sequence of |
|
818 * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). |
|
819 * |
|
820 * Sequences of atoms and assertions are broken into alternatives via calls to |
|
821 * disjunction(). Assertions, atoms, and disjunctions emitted between calls to |
|
822 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. |
|
823 * atomParenthesesBegin() is passed a subpatternId. In the case of a regular |
|
824 * capturing subpattern, this will be the subpatternId associated with these |
|
825 * parentheses, and will also by definition be the lowest subpatternId of these |
|
826 * parentheses and of any nested paretheses. The atomParenthesesEnd() method |
|
827 * is passed the subpatternId of the last capturing subexpression nested within |
|
828 * these paretheses. In the case of a capturing subpattern with no nested |
|
829 * capturing subpatterns, the same subpatternId will be passed to the begin and |
|
830 * end functions. In the case of non-capturing subpatterns the subpatternId |
|
831 * passed to the begin method is also the first possible subpatternId that might |
|
832 * be nested within these paretheses. If a set of non-capturing parentheses does |
|
833 * not contain any capturing subpatterns, then the subpatternId passed to begin |
|
834 * will be greater than the subpatternId passed to end. |
|
835 */ |
|
836 |
|
837 template<class Delegate> |
|
838 ErrorCode parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite) |
|
839 { |
|
840 #ifdef YARR_8BIT_CHAR_SUPPORT |
|
841 if (pattern.is8Bit()) |
|
842 return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse(); |
|
843 #endif |
|
844 return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse(); |
|
845 } |
|
846 |
|
847 } } // namespace JSC::Yarr |
|
848 |
|
849 #endif /* yarr_YarrParser_h */ |