js/src/yarr/YarrParser.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
     2  * vim: set ts=8 sts=4 et sw=4 tw=99:
     3  *
     4  * Copyright (C) 2009 Apple Inc. All rights reserved.
     5  *
     6  * Redistribution and use in source and binary forms, with or without
     7  * modification, are permitted provided that the following conditions
     8  * are met:
     9  * 1. Redistributions of source code must retain the above copyright
    10  *    notice, this list of conditions and the following disclaimer.
    11  * 2. Redistributions in binary form must reproduce the above copyright
    12  *    notice, this list of conditions and the following disclaimer in the
    13  *    documentation and/or other materials provided with the distribution.
    14  *
    15  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
    16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
    18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
    19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
    20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
    21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
    22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
    23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    26  */
    28 #ifndef yarr_YarrParser_h
    29 #define yarr_YarrParser_h
    31 #include "yarr/Yarr.h"
    33 namespace JSC { namespace Yarr {
    35 enum BuiltInCharacterClassID {
    36     DigitClassID,   // Reported by parseEscape() for \d (and, inverted, for \D).
    37     SpaceClassID,   // Reported by parseEscape() for \s (and, inverted, for \S).
    38     WordClassID,    // Reported by parseEscape() for \w (and, inverted, for \W).
    39     NewlineClassID  // Reported, inverted, by parseTokens() for '.'.
    40 };
    42 // The Parser class should not be used directly - only via the Yarr::parse() method.
    43 template<class Delegate, typename CharType>
    44 class Parser {
    45 private:
    46     template<class FriendDelegate>
    47     friend ErrorCode parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit);
    49     /*
    50      * CharacterClassParserDelegate:
    51      *
    52      * The class CharacterClassParserDelegate is used in the parsing of character
    53      * classes.  This class handles detection of character ranges.  This class
    54      * implements enough of the delegate interface such that it can be passed to
    55      * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
    56      * to perform the parsing of escape characters in character sets.
    57      */
    58     class CharacterClassParserDelegate {
    59     public:
    60         CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
    61             : m_delegate(delegate) // Receives the atomCharacterClass*() callbacks.
    62             , m_err(err) // Held by reference; written when an invalid range is seen.
    63             , m_state(Empty) // Nothing is buffered yet.
    64             , m_character(0)
    65         {
    66         }
    68         /*
    69          * begin():
    70          *
    71          * Called at beginning of construction.
                * Forwards 'invert' to the delegate's atomCharacterClassBegin();
                * parseCharacterClass() passes true when the class opened with '^'.
    72          */
    73         void begin(bool invert)
    74         {
    75             m_delegate.atomCharacterClassBegin(invert);
    76         }
    78         /*
    79          * atomPatternCharacter():
    80          *
    81          * This method is called either from parseCharacterClass() (for an unescaped
    82          * character in a character class), or from parseEscape(). In the former case
    83          * the value true will be passed for the argument 'hyphenIsRange', and in this
    84          * mode we will allow a hyphen to be treated as indicating a range (i.e. /[a-z]/
    85          * is different to /[a\-z]/).
                *
                * A single character is buffered in m_character so that a following hyphen
                * can turn it into the start of a range; it is flushed to the delegate once
                * it is known not to start one.
    86          */
    87         void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
    88         {
    89             switch (m_state) {
    90             case AfterCharacterClass:
    91                 // Following a builtin character class we need look out for a hyphen,
    92                 // i.e. patterns such as /[\d-x]/ or /[\d-\d]/ where a class sits in the
    93                 // position of a range start. If we see a hyphen following a character
    94                 // class then unlike usual we'll report it to the delegate immediately,
    95                 // and move to the AfterCharacterClassHyphen state. In that state the next
    96                 // character or built-in class is reported as a plain atom (no range is
    97                 // formed, and no error is raised here) and the state returns to Empty.
    98                 if (hyphenIsRange && ch == '-') {
    99                     m_delegate.atomCharacterClassAtom('-');
   100                     m_state = AfterCharacterClassHyphen;
   101                     return;
   102                 }
   103                 // Otherwise just fall through - cache the character, exactly as for Empty.
   105             case Empty:
   106                 m_character = ch;
   107                 m_state = CachedCharacter;
   108                 return;
   110             case CachedCharacter:
   111                 if (hyphenIsRange && ch == '-')
   112                     m_state = CachedCharacterHyphen; // Possible range; wait for its end.
   113                 else {
   114                     m_delegate.atomCharacterClassAtom(m_character); // Flush the old character...
   115                     m_character = ch; // ...and buffer the new one in its place.
   116                 }
   117                 return;
   119             case CachedCharacterHyphen:
   120                 if (ch < m_character) { // The end of the range sorts before its start.
   121                     m_err = CharacterClassOutOfOrder;
   122                     return;
   123                 }
   124                 m_delegate.atomCharacterClassRange(m_character, ch);
   125                 m_state = Empty;
   126                 return;
   128             case AfterCharacterClassHyphen:
   129                 m_delegate.atomCharacterClassAtom(ch); // The '-' itself was reported earlier.
   130                 m_state = Empty;
   131                 return;
   132             }
   133         }
   135         /*
   136          * atomBuiltInCharacterClass():
   137          *
   138          * Called by parseEscape() to add a built-in class (optionally inverted)
   139          * to the character class under construction.
   140          */
   141         void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
   142         {
   143             // A class cannot close a range such as [x-\d]; the end of a range
   144             // has to be a single character.
   145             if (m_state == CachedCharacterHyphen) {
   146                 m_err = CharacterClassInvalidRange;
   147                 return;
   148             }
   150             // The hyphen after the previous class has already been reported, so
   151             // just add this class and start afresh.
   152             if (m_state == AfterCharacterClassHyphen) {
   153                 m_delegate.atomCharacterClassBuiltIn(classID, invert);
   154                 m_state = Empty;
   155                 return;
   156             }
   158             // Flush a buffered character, if any, before recording the class.
   159             if (m_state == CachedCharacter)
   160                 m_delegate.atomCharacterClassAtom(m_character);
   161             m_state = AfterCharacterClass;
   162             m_delegate.atomCharacterClassBuiltIn(classID, invert);
   163         }
   165         /*
   166          * end():
   167          *
   168          * Called at end of construction. Flushes whatever is still buffered (a pending
   169          * hyphen is emitted as a literal '-' after its character) and closes the class.
   170          */
   171         void end()
   172         {
   173             const bool pendingHyphen = (m_state == CachedCharacterHyphen);
   174             if (pendingHyphen || m_state == CachedCharacter)
   175                 m_delegate.atomCharacterClassAtom(m_character);
   176             if (pendingHyphen)
   177                 m_delegate.atomCharacterClassAtom('-');
   178             m_delegate.atomCharacterClassEnd();
   179         }
   181         // parseEscape() should never call these delegate methods when
   182         // invoked with inCharacterClass set.
               // They are declared all the same because parseEscape() names them in
               // branches of an ordinary 'if' on its template flag, and those branches
               // have to compile for every delegate type it is instantiated with.
   183         NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
   184         NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
   186     private:
   187         Delegate& m_delegate; // Target of the atomCharacterClass*() callbacks.
   188         ErrorCode& m_err; // Set to CharacterClassOutOfOrder / CharacterClassInvalidRange on bad ranges.
   189         enum CharacterClassConstructionState {
   190             Empty, // Nothing buffered.
   191             CachedCharacter, // m_character holds a character not yet reported.
   192             CachedCharacterHyphen, // m_character was followed by a range hyphen; awaiting the range end.
   193             AfterCharacterClass, // The last item reported was a built-in class.
   194             AfterCharacterClassHyphen // A built-in class was followed by a hyphen, which has been reported.
   195         } m_state;
   196         UChar m_character; // The buffered character; meaningful in the two Cached* states.
   197     };
   199     Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit)
   200         : m_delegate(delegate)
   201         , m_backReferenceLimit(backReferenceLimit) // Largest \N that parseEscape() reports as a back reference.
   202         , m_err(NoError)
   203         , m_data(pattern.chars()) // Pointer into 'pattern'; no copy of the characters is made here.
   204         , m_size(pattern.length())
   205         , m_index(0) // Parsing starts at the first character.
   206         , m_parenthesesNestingDepth(0)
   207     {
   208     }
   210     /*
   211      * parseEscape():
   212      *
   213      * Helper for parseTokens() AND parseCharacterClass().
   214      * Unlike the other parser methods, this function does not report tokens
   215      * directly to the member delegate (m_delegate), instead tokens are
   216      * emitted to the delegate provided as an argument.  In the case of atom
   217      * escapes, parseTokens() will call parseEscape() passing m_delegate as
   218      * an argument, and as such the escape will be reported to the delegate.
   219      *
   220      * However this method may also be used by parseCharacterClass(), in which
   221      * case a CharacterClassParserDelegate will be passed as the delegate that
   222      * tokens should be added to.  A boolean flag is also provided to indicate
   223      * whether an escape in a CharacterClass is being parsed (some parsing
   224      * rules change in this context).
   225      *
   226      * The boolean value returned by this method indicates whether the token
   227      * parsed was an atom (outside of a character class \b and \B will be
   228      * interpreted as assertions).
            *
            * False is also returned, with m_err set to EscapeUnterminated, when the
            * pattern ends immediately after the backslash.
   229      */
   230     template<bool inCharacterClass, class EscapeDelegate>
   231     bool parseEscape(EscapeDelegate& delegate)
   232     {
   233         ASSERT(!m_err);
   234         ASSERT(peek() == '\\');
   235         consume();
   237         if (atEndOfPattern()) {
   238             m_err = EscapeUnterminated;
   239             return false;
   240         }
   242         switch (peek()) {
   243         // Assertions
   244         case 'b':
   245             consume();
   246             if (inCharacterClass)
   247                 delegate.atomPatternCharacter('\b'); // Inside a class \b is the backspace character.
   248             else {
   249                 delegate.assertionWordBoundary(false);
   250                 return false;
   251             }
   252             break;
   253         case 'B':
   254             consume();
   255             if (inCharacterClass)
   256                 delegate.atomPatternCharacter('B'); // Inside a class \B is a literal 'B'.
   257             else {
   258                 delegate.assertionWordBoundary(true);
   259                 return false;
   260             }
   261             break;
   263         // CharacterClassEscape
   264         case 'd':
   265             consume();
   266             delegate.atomBuiltInCharacterClass(DigitClassID, false);
   267             break;
   268         case 's':
   269             consume();
   270             delegate.atomBuiltInCharacterClass(SpaceClassID, false);
   271             break;
   272         case 'w':
   273             consume();
   274             delegate.atomBuiltInCharacterClass(WordClassID, false);
   275             break;
   276         case 'D':
   277             consume();
   278             delegate.atomBuiltInCharacterClass(DigitClassID, true);
   279             break;
   280         case 'S':
   281             consume();
   282             delegate.atomBuiltInCharacterClass(SpaceClassID, true);
   283             break;
   284         case 'W':
   285             consume();
   286             delegate.atomBuiltInCharacterClass(WordClassID, true);
   287             break;
   289         // DecimalEscape
   290         case '1':
   291         case '2':
   292         case '3':
   293         case '4':
   294         case '5':
   295         case '6':
   296         case '7':
   297         case '8':
   298         case '9': {
   299             // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
   300             // First, try to parse this as backreference.
   301             if (!inCharacterClass) {
   302                 ParseState state = saveState();
   304                 unsigned backReference;
   305                 if (!consumeNumber(backReference)) // The number overflowed; consumeNumber() has set m_err.
   306                     break;
   307                 if (backReference <= m_backReferenceLimit) {
   308                     delegate.atomBackReference(backReference);
   309                     break;
   310                 }
   312                 restoreState(state); // Too large to be a back reference: rewind to the first digit.
   313             }
   315             // Not a backreference, and not octal.
   316             if (peek() >= '8') {
   317                 delegate.atomPatternCharacter('\\'); // The digit is not consumed; the caller's loop parses it next.
   318                 break;
   319             }
   321             // Fall-through to handle this as an octal escape.
   322         }
   324         // Octal escape
   325         case '0':
   326             delegate.atomPatternCharacter(consumeOctal());
   327             break;
   329         // ControlEscape
   330         case 'f':
   331             consume();
   332             delegate.atomPatternCharacter('\f');
   333             break;
   334         case 'n':
   335             consume();
   336             delegate.atomPatternCharacter('\n');
   337             break;
   338         case 'r':
   339             consume();
   340             delegate.atomPatternCharacter('\r');
   341             break;
   342         case 't':
   343             consume();
   344             delegate.atomPatternCharacter('\t');
   345             break;
   346         case 'v':
   347             consume();
   348             delegate.atomPatternCharacter('\v');
   349             break;
   351         // ControlLetter
   352         case 'c': {
   353             ParseState state = saveState();
   354             consume();
   355             if (!atEndOfPattern()) {
   356                 int control = consume();
   358                 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
   359                 if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
   360                     delegate.atomPatternCharacter(control & 0x1f);
   361                     break;
   362                 }
   363             }
   364             restoreState(state); // Rewind to the 'c' so that it is parsed next as an ordinary character.
   365             delegate.atomPatternCharacter('\\');
   366             break;
   367         }
   369         // HexEscape
   370         case 'x': {
   371             consume();
   372             int x = tryConsumeHex(2);
   373             if (x == -1)
   374                 delegate.atomPatternCharacter('x'); // Not followed by two hex digits: a literal 'x'.
   375             else
   376                 delegate.atomPatternCharacter(x);
   377             break;
   378         }
   380         // UnicodeEscape
   381         case 'u': {
   382             consume();
   383             int u = tryConsumeHex(4);
   384             if (u == -1)
   385                 delegate.atomPatternCharacter('u'); // Not followed by four hex digits: a literal 'u'.
   386             else
   387                 delegate.atomPatternCharacter(u);
   388             break;
   389         }
   391         // IdentityEscape
   392         default:
   393             delegate.atomPatternCharacter(consume());
   394         }
   396         return true;
   397     }
   399     /*
   400      * parseAtomEscape(), parseCharacterClassEscape():
   401      *
   402      * These methods alias to parseEscape().
            * parseAtomEscape() instantiates it with inCharacterClass = false, reports to
            * m_delegate, and passes parseEscape()'s return value through.
   403      */
   404     bool parseAtomEscape()
   405     {
   406         return parseEscape<false>(m_delegate);
   407     }
   408     void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
   409     {
               // Instantiates parseEscape() with inCharacterClass = true; its return
               // value is not used here.
   410         parseEscape<true>(delegate);
   411     }
   413     /*
   414      * parseCharacterClass():
   415      *
   416      * Helper for parseTokens(). Consumes a bracketed character class and describes it
   417      * to the delegate by calling, directly and indirectly (via parseCharacterClassEscape),
   418      * into a CharacterClassParserDelegate. Sets CharacterClassUnmatched if the pattern
   419      * ends before the closing ']'.
   420      */
   421     void parseCharacterClass()
   422     {
   423         ASSERT(!m_err);
   424         ASSERT(peek() == '[');
   425         consume();
   427         CharacterClassParserDelegate classBuilder(m_delegate, m_err);
   428         const bool inverted = tryConsume('^');
   429         classBuilder.begin(inverted);
   431         while (!atEndOfPattern()) {
   432             const int next = peek();
   433             if (next == ']') {
   434                 consume();
   435                 classBuilder.end();
   436                 return;
   437             }
   439             if (next == '\\')
   440                 parseCharacterClassEscape(classBuilder);
   441             else
   442                 classBuilder.atomPatternCharacter(consume(), true);
   444             // Either path may have recorded an error in m_err.
   445             if (m_err)
   446                 return;
   447         }
   449         m_err = CharacterClassUnmatched;
   450     }
   452     /*
   453      * parseParenthesesBegin():
   454      *
   455      * Helper for parseTokens(). Consumes '(' and, when it is followed by '?', the
   456      * group-type character, then reports the kind of group being opened: a capturing
   457      * subpattern, a non-capturing one (?:, or a parenthetical assertion (?= / (?!.
   458      */
   459     void parseParenthesesBegin()
   460     {
   461         ASSERT(!m_err);
   462         ASSERT(peek() == '(');
   463         consume();
   465         if (!tryConsume('?'))
   466             m_delegate.atomParenthesesSubpatternBegin();
   467         else if (atEndOfPattern()) {
   468             m_err = ParenthesesTypeInvalid;
   469             return;
   470         } else {
   471             const int groupType = consume();
   472             if (groupType == ':')
   473                 m_delegate.atomParenthesesSubpatternBegin(false);
   474             else if (groupType == '=')
   475                 m_delegate.atomParentheticalAssertionBegin();
   476             else if (groupType == '!')
   477                 m_delegate.atomParentheticalAssertionBegin(true);
   478             else
   479                 m_err = ParenthesesTypeInvalid;
   480         }
   482         ++m_parenthesesNestingDepth;
   483     }
   491     /*
   492      * parseParenthesesEnd():
   493      *
   494      * Helper for parseTokens(). Consumes ')' and reports the end of a group to the
   495      * delegate, or sets ParenthesesUnmatched when no group is open.
   496      */
   497     void parseParenthesesEnd()
   498     {
   499         ASSERT(!m_err);
   500         ASSERT(peek() == ')');
   501         consume();
   502         // The depth counter is unsigned, so it is only decremented while a group is
   503         // open; decrementing it at zero would wrap it around to a huge value.
   504         if (m_parenthesesNestingDepth > 0) {
   505             m_delegate.atomParenthesesEnd();
   506             --m_parenthesesNestingDepth;
   507         } else
   508             m_err = ParenthesesUnmatched;
   509     }
   510     /*
   511      * parseQuantifier():
   512      *
   513      * Helper for parseTokens(); reports a quantifier to the delegate (non-greedy when a
   514      * '?' follows), or sets an error if it is too large or does not follow an atom.
   515      */
   516     void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
   517     {
   518         ASSERT(!m_err);
   519         ASSERT(min <= max);
   520         if (min == UINT_MAX) {
   521             m_err = QuantifierTooLarge;
   522             return;
   523         }
   524         if (!lastTokenWasAnAtom) {
   525             m_err = QuantifierWithoutAtom;
   526             return;
   527         }
   528         const bool greedy = !tryConsume('?');
   529         m_delegate.quantifyAtom(min, max, greedy);
   530     }
   531     /*
   532      * parseTokens():
   533      *
   534      * This method loops over the input pattern reporting tokens to the delegate.
   535      * The method returns when a parse error is detected, or the end of the pattern
   536      * is reached.  One piece of state is tracked around the loop, which is whether
   537      * the last token passed to the delegate was an atom (this is necessary to detect
   538      * a parse error when a quantifier is provided without an atom to quantify).
            *
            * If the end of the pattern is reached while a group is still open, m_err is
            * set to MissingParentheses.
   539      */
   540     void parseTokens()
   541     {
   542         bool lastTokenWasAnAtom = false;
   544         while (!atEndOfPattern()) {
   545             switch (peek()) {
   546             case '|':
   547                 consume();
   548                 m_delegate.disjunction();
   549                 lastTokenWasAnAtom = false;
   550                 break;
   552             case '(':
   553                 parseParenthesesBegin();
   554                 lastTokenWasAnAtom = false;
   555                 break;
   557             case ')':
   558                 parseParenthesesEnd();
   559                 lastTokenWasAnAtom = true; // A closed group may be followed by a quantifier.
   560                 break;
   562             case '^':
   563                 consume();
   564                 m_delegate.assertionBOL();
   565                 lastTokenWasAnAtom = false;
   566                 break;
   568             case '$':
   569                 consume();
   570                 m_delegate.assertionEOL();
   571                 lastTokenWasAnAtom = false;
   572                 break;
   574             case '.':
   575                 consume();
   576                 m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
   577                 lastTokenWasAnAtom = true;
   578                 break;
   580             case '[':
   581                 parseCharacterClass();
   582                 lastTokenWasAnAtom = true;
   583                 break;
   585             case '\\':
   586                 lastTokenWasAnAtom = parseAtomEscape(); // False for the \b and \B assertions.
   587                 break;
   589             case '*':
   590                 consume();
   591                 parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
   592                 lastTokenWasAnAtom = false;
   593                 break;
   595             case '+':
   596                 consume();
   597                 parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
   598                 lastTokenWasAnAtom = false;
   599                 break;
   601             case '?':
   602                 consume();
   603                 parseQuantifier(lastTokenWasAnAtom, 0, 1);
   604                 lastTokenWasAnAtom = false;
   605                 break;
   607             case '{': {
   608                 ParseState state = saveState();
   610                 consume();
   611                 if (peekIsDigit()) {
   612                     unsigned min;
   613                     if (!consumeNumber(min)) // m_err is set; the check after the switch returns.
   614                         break;
   616                     unsigned max = min;
   617                     if (tryConsume(',')) {
   618                         if (peekIsDigit()) {
   619                             if (!consumeNumber(max))
   620                                 break;
   621                         } else {
   622                             max = quantifyInfinite;
   623                         }
   624                     }
   626                     if (tryConsume('}')) {
   627                         if (min <= max)
   628                             parseQuantifier(lastTokenWasAnAtom, min, max);
   629                         else
   630                             m_err = QuantifierOutOfOrder;
   631                         lastTokenWasAnAtom = false;
   632                         break;
   633                     }
   634                 }
   636                 restoreState(state); // Rewind to the '{' so the default case consumes it as a character.
   637             } // if we did not find a complete quantifier, fall through to the default case.
   639             default:
   640                 m_delegate.atomPatternCharacter(consume());
   641                 lastTokenWasAnAtom = true;
   642             }
   644             if (m_err)
   645                 return;
   646         }
   648         if (m_parenthesesNestingDepth > 0)
   649             m_err = MissingParentheses;
   650     }
   652     /*
   653      * parse():
   654      *
   655      * Runs parseTokens() over the pattern, unless the pattern is longer than
   656      * MAX_PATTERN_SIZE, and returns the resulting ErrorCode.
   657      */
   658     ErrorCode parse()
   659     {
   660         if (m_size <= MAX_PATTERN_SIZE)
   661             parseTokens();
   662         else
   663             m_err = PatternTooLarge;
   665         ASSERT(atEndOfPattern() || m_err);
   666         return m_err;
   667     }
   669     // Misc helper functions:
   671     typedef unsigned ParseState; // A saved value of m_index; see saveState() and restoreState().
   673     ParseState saveState() // Returns the current read position so that it can be restored later.
   674     {
   675         return m_index;
   676     }
   678     void restoreState(ParseState state) // Moves the read position back to a value from saveState().
   679     {
   680         m_index = state;
   681     }
   683     bool atEndOfPattern() // True once every character of the pattern has been consumed.
   684     {
   685         ASSERT(m_index <= m_size);
   686         return m_index == m_size;
   687     }
   689     int peek() // Returns the next character without consuming it; must not be called at the end.
   690     {
   691         ASSERT(m_index < m_size);
   692         return m_data[m_index];
   693     }
   695     bool peekIsDigit() // Safe at the end of the pattern, where it returns false.
   696     {
   697         return !atEndOfPattern() && WTF::isASCIIDigit(peek());
   698     }
   700     unsigned peekDigit() // Numeric value of the next character, which must be an ASCII digit.
   701     {
   702         ASSERT(peekIsDigit());
   703         return peek() - '0';
   704     }
   706     int consume() // Returns the next character and advances past it; must not be called at the end.
   707     {
   708         ASSERT(m_index < m_size);
   709         return m_data[m_index++];
   710     }
   712     unsigned consumeDigit() // Like peekDigit(), but also advances past the digit.
   713     {
   714         ASSERT(peekIsDigit());
   715         return consume() - '0';
   716     }
   718     bool consumeNumber(unsigned &accum)
   719     {
   720         // Reads a decimal number into 'accum'; on overflow sets m_err and returns false.
   721         accum = consumeDigit();
   722         while (peekIsDigit()) {
   723             // Test before multiplying: comparing the wrapped result with accum misses
   724             // overflows (with 32 bits, 536870912 * 10 wraps to a still larger 1073741824).
   725             if (accum > (UINT_MAX - peekDigit()) / 10) {
   726                 m_err = QuantifierTooLarge;
   727                 return false;
   728             }
   729             accum = accum * 10 + consumeDigit();
   730         }
   731         return true;
   732     }
   733     unsigned consumeOctal()
   734     {
   735         ASSERT(WTF::isASCIIOctalDigit(peek()));
   736         unsigned value = consumeDigit();
   737         for (;;) { // Stops at the end, at a non-octal character, or once the value reaches 32.
   738             if (value >= 32 || atEndOfPattern() || !WTF::isASCIIOctalDigit(peek()))
   739                 return value;
   740             value = value * 8 + consumeDigit();
   741         }
   742     }
   743     bool tryConsume(UChar ch)
   744     {
   745         const bool matched = !atEndOfPattern() && m_data[m_index] == ch; // Consume only on a match.
   746         if (matched)
   747             ++m_index;
   748         return matched;
   749     }
   751     int tryConsumeHex(int count)
   752     {
   753         // Reads exactly 'count' hex digits; on failure consumes nothing and returns -1.
   754         const ParseState start = saveState();
   755         int value = 0;
   756         for (int remaining = count; remaining != 0; --remaining) {
   757             const bool atHexDigit = !atEndOfPattern() && WTF::isASCIIHexDigit(peek());
   758             if (!atHexDigit) {
   759                 restoreState(start);
   760                 return -1;
   761             }
   762             value = (value << 4) | WTF::toASCIIHexValue(consume());
   763         }
   764         return value;
   765     }
   766     Delegate& m_delegate; // Receives the parse callbacks.
   767     unsigned m_backReferenceLimit; // Largest number that parseEscape() reports as a back reference.
   768     ErrorCode m_err; // NoError until a parse error is recorded.
   769     const CharType* m_data; // The pattern's characters.
   770     unsigned m_size; // Number of characters in m_data.
   771     unsigned m_index; // Read position; equal to m_size at the end of the pattern.
   772     unsigned m_parenthesesNestingDepth; // Counts '(' consumed and not yet matched by ')'.
   774     // Derived by empirical testing of compile time in PCRE and WREC.
   775     static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; // Longer patterns are rejected by parse().
   776 };
   778 /*
   779  * Yarr::parse():
   780  *
   781  * The parse method is passed a pattern to be parsed and a delegate upon which
   782  * callbacks will be made to record the parsed tokens forming the regex.
   783  * Yarr::parse() returns NoError on success, or the ErrorCode describing the
   784  * parse error where one occurs.
   785  *
   786  * The Delegate must implement the following interface:
   787  *
   788  *    void assertionBOL();
   789  *    void assertionEOL();
   790  *    void assertionWordBoundary(bool invert);
   791  *
   792  *    void atomPatternCharacter(UChar ch);
   793  *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
   794  *    void atomCharacterClassBegin(bool invert)
   795  *    void atomCharacterClassAtom(UChar ch)
   796  *    void atomCharacterClassRange(UChar begin, UChar end)
   797  *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
   798  *    void atomCharacterClassEnd()
   799  *    void atomParenthesesSubpatternBegin(bool capture = true);
   800  *    void atomParentheticalAssertionBegin(bool invert = false);
   801  *    void atomParenthesesEnd();
   802  *    void atomBackReference(unsigned subpatternId);
   803  *
   804  *    void quantifyAtom(unsigned min, unsigned max, bool greedy);
   805  *
   806  *    void disjunction();
   807  *
   808  * The regular expression is described by a sequence of assertion*() and atom*()
   809  * callbacks to the delegate, describing the terms in the regular expression.
   810  * Following an atom a quantifyAtom() call may occur to indicate that the previous
   811  * atom should be quantified.  In the case of atoms described across multiple
   812  * calls (parentheses and character classes) the call to quantifyAtom() will come
   813  * after the call to the atom*End() method, never after atom*Begin().
   814  *
   815  * Character classes may either be described by a single call to
   816  * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
   817  * In the latter case, ...Begin() will be called, followed by a sequence of
   818  * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
   819  *
   820  * Sequences of atoms and assertions are broken into alternatives via calls to
   821  * disjunction().  Assertions, atoms, and disjunctions emitted between calls to
   822  * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
   823  * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular
   824  * capturing subpattern, this will be the subpatternId associated with these
   825  * parentheses, and will also by definition be the lowest subpatternId of these
   826  * parentheses and of any nested parentheses.  The atomParenthesesEnd() method
   827  * is passed the subpatternId of the last capturing subexpression nested within
   828  * these parentheses.  In the case of a capturing subpattern with no nested
   829  * capturing subpatterns, the same subpatternId will be passed to the begin and
   830  * end functions.  In the case of non-capturing subpatterns the subpatternId
   831  * passed to the begin method is also the first possible subpatternId that might
   832  * be nested within these parentheses.  If a set of non-capturing parentheses does
   833  * not contain any capturing subpatterns, then the subpatternId passed to begin
   834  * will be greater than the subpatternId passed to end.
   835  */
   837 template<class Delegate>
   838 ErrorCode parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite)
   839 {
           // Builds a temporary Parser over 'pattern' and returns the result of its
           // parse(). When YARR_8BIT_CHAR_SUPPORT is defined and the pattern reports
           // is8Bit(), the parser is instantiated for LChar; otherwise for UChar.
   840 #ifdef YARR_8BIT_CHAR_SUPPORT
   841     if (pattern.is8Bit())
   842         return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse();
   843 #endif
   844     return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse();
   845 }
   847 } } // namespace JSC::Yarr
   849 #endif /* yarr_YarrParser_h */

mercurial