The Tor Browser: js/src/yarr/YarrParser.h@6474c204b198

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-

     2  * vim: set ts=8 sts=4 et sw=4 tw=99:

     3  *

     4  * Copyright (C) 2009 Apple Inc. All rights reserved.

     5  *

     6  * Redistribution and use in source and binary forms, with or without

     7  * modification, are permitted provided that the following conditions

     8  * are met:

     9  * 1. Redistributions of source code must retain the above copyright

    10  *    notice, this list of conditions and the following disclaimer.

    11  * 2. Redistributions in binary form must reproduce the above copyright

    12  *    notice, this list of conditions and the following disclaimer in the

    13  *    documentation and/or other materials provided with the distribution.

    14  *

    15  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY

    16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

    17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR

    18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR

    19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

    20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

    21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

    22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY

    23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

    24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

    25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

    26  */

    28 #ifndef yarr_YarrParser_h

    29 #define yarr_YarrParser_h

    31 #include "yarr/Yarr.h"

    33 namespace JSC { namespace Yarr {

    35 enum BuiltInCharacterClassID {

    36     DigitClassID,

    37     SpaceClassID,

    38     WordClassID,

    39     NewlineClassID

    40 };

    42 // The Parser class should not be used directly - only via the Yarr::parse() method.

    43 template<class Delegate, typename CharType>

    44 class Parser {

    45 private:

    46     template<class FriendDelegate>

    47     friend ErrorCode parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit);

    49     /*

    50      * CharacterClassParserDelegate:

    51      *

    52      * The class CharacterClassParserDelegate is used in the parsing of character

    53      * classes.  This class handles detection of character ranges.  This class

    54      * implements enough of the delegate interface such that it can be passed to

    55      * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused

    56      * to perform the parsing of escape characters in character sets.

    57      */

    58     class CharacterClassParserDelegate {

    59     public:

    60         CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)

    61             : m_delegate(delegate)

    62             , m_err(err)

    63             , m_state(Empty)

    64             , m_character(0)

    65         {

    66         }

    68         /*

    69          * begin():

    70          *

    71          * Called at beginning of construction.

    72          */

    73         void begin(bool invert)

    74         {

    75             m_delegate.atomCharacterClassBegin(invert);

    76         }

    78         /*

    79          * atomPatternCharacter():

    80          *

    81          * This method is called either from parseCharacterClass() (for an unescaped

    82          * character in a character class), or from parseEscape(). In the former case

    83          * the value true will be passed for the argument 'hyphenIsRange', and in this

    84          * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/

    85          * is different to /[a\-z]/).

    86          */

    87         void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)

    88         {

    89             switch (m_state) {

    90             case AfterCharacterClass:

    91                 // Following a builtin character class we need look out for a hyphen.

    92                 // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.

    93                 // If we see a hyphen following a charater class then unlike usual

    94                 // we'll report it to the delegate immediately, and put ourself into

    95                 // a poisoned state. Any following calls to add another character or

    96                 // character class will result in an error. (A hypen following a

    97                 // character-class is itself valid, but only  at the end of a regex).

    98                 if (hyphenIsRange && ch == '-') {

    99                     m_delegate.atomCharacterClassAtom('-');

   100                     m_state = AfterCharacterClassHyphen;

   101                     return;

   102                 }

   103                 // Otherwise just fall through - cached character so treat this as Empty.

   105             case Empty:

   106                 m_character = ch;

   107                 m_state = CachedCharacter;

   108                 return;

   110             case CachedCharacter:

   111                 if (hyphenIsRange && ch == '-')

   112                     m_state = CachedCharacterHyphen;

   113                 else {

   114                     m_delegate.atomCharacterClassAtom(m_character);

   115                     m_character = ch;

   116                 }

   117                 return;

   119             case CachedCharacterHyphen:

   120                 if (ch < m_character) {

   121                     m_err = CharacterClassOutOfOrder;

   122                     return;

   123                 }

   124                 m_delegate.atomCharacterClassRange(m_character, ch);

   125                 m_state = Empty;

   126                 return;

   128             case AfterCharacterClassHyphen:

   129                 m_delegate.atomCharacterClassAtom(ch);

   130                 m_state = Empty;

   131                 return;

   132             }

   133         }

   135         /*

   136          * atomBuiltInCharacterClass():

   137          *

   138          * Adds a built-in character class, called by parseEscape().

   139          */

   140         void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)

   141         {

   142             switch (m_state) {

   143             case CachedCharacter:

   144                 // Flush the currently cached character, then fall through.

   145                 m_delegate.atomCharacterClassAtom(m_character);

   147             case Empty:

   148             case AfterCharacterClass:

   149                 m_state = AfterCharacterClass;

   150                 m_delegate.atomCharacterClassBuiltIn(classID, invert);

   151                 return;

   153             case CachedCharacterHyphen:

   154                 // Error! We have a range that looks like [x-\d]. We require

   155                 // the end of the range to be a single character.

   156                 m_err = CharacterClassInvalidRange;

   157                 return;

   158             case AfterCharacterClassHyphen:

   159                 m_delegate.atomCharacterClassBuiltIn(classID, invert);

   160                 m_state = Empty;

   161                 return;

   162             }

   163         }

   165         /*

   166          * end():

   167          *

   168          * Called at end of construction.

   169          */

   170         void end()

   171         {

   172             if (m_state == CachedCharacter)

   173                 m_delegate.atomCharacterClassAtom(m_character);

   174             else if (m_state == CachedCharacterHyphen) {

   175                 m_delegate.atomCharacterClassAtom(m_character);

   176                 m_delegate.atomCharacterClassAtom('-');

   177             }

   178             m_delegate.atomCharacterClassEnd();

   179         }

   181         // parseEscape() should never call these delegate methods when

   182         // invoked with inCharacterClass set.

   183         NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }

   184         NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }

   186     private:

   187         Delegate& m_delegate;

   188         ErrorCode& m_err;

   189         enum CharacterClassConstructionState {

   190             Empty,

   191             CachedCharacter,

   192             CachedCharacterHyphen,

   193             AfterCharacterClass,

   194             AfterCharacterClassHyphen

   195         } m_state;

   196         UChar m_character;

   197     };

   199     Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit)

   200         : m_delegate(delegate)

   201         , m_backReferenceLimit(backReferenceLimit)

   202         , m_err(NoError)

   203         , m_data(pattern.chars())

   204         , m_size(pattern.length())

   205         , m_index(0)

   206         , m_parenthesesNestingDepth(0)

   207     {

   208     }

   210     /*

   211      * parseEscape():

   212      *

   213      * Helper for parseTokens() AND parseCharacterClass().

   214      * Unlike the other parser methods, this function does not report tokens

   215      * directly to the member delegate (m_delegate), instead tokens are

   216      * emitted to the delegate provided as an argument.  In the case of atom

   217      * escapes, parseTokens() will call parseEscape() passing m_delegate as

   218      * an argument, and as such the escape will be reported to the delegate.

   219      *

   220      * However this method may also be used by parseCharacterClass(), in which

   221      * case a CharacterClassParserDelegate will be passed as the delegate that

   222      * tokens should be added to.  A boolean flag is also provided to indicate

   223      * whether that an escape in a CharacterClass is being parsed (some parsing

   224      * rules change in this context).

   225      *

   226      * The boolean value returned by this method indicates whether the token

   227      * parsed was an atom (outside of a characted class \b and \B will be

   228      * interpreted as assertions).

   229      */

   230     template<bool inCharacterClass, class EscapeDelegate>

   231     bool parseEscape(EscapeDelegate& delegate)

   232     {

   233         ASSERT(!m_err);

   234         ASSERT(peek() == '\\');

   235         consume();

   237         if (atEndOfPattern()) {

   238             m_err = EscapeUnterminated;

   239             return false;

   240         }

   242         switch (peek()) {

   243         // Assertions

   244         case 'b':

   245             consume();

   246             if (inCharacterClass)

   247                 delegate.atomPatternCharacter('\b');

   248             else {

   249                 delegate.assertionWordBoundary(false);

   250                 return false;

   251             }

   252             break;

   253         case 'B':

   254             consume();

   255             if (inCharacterClass)

   256                 delegate.atomPatternCharacter('B');

   257             else {

   258                 delegate.assertionWordBoundary(true);

   259                 return false;

   260             }

   261             break;

   263         // CharacterClassEscape

   264         case 'd':

   265             consume();

   266             delegate.atomBuiltInCharacterClass(DigitClassID, false);

   267             break;

   268         case 's':

   269             consume();

   270             delegate.atomBuiltInCharacterClass(SpaceClassID, false);

   271             break;

   272         case 'w':

   273             consume();

   274             delegate.atomBuiltInCharacterClass(WordClassID, false);

   275             break;

   276         case 'D':

   277             consume();

   278             delegate.atomBuiltInCharacterClass(DigitClassID, true);

   279             break;

   280         case 'S':

   281             consume();

   282             delegate.atomBuiltInCharacterClass(SpaceClassID, true);

   283             break;

   284         case 'W':

   285             consume();

   286             delegate.atomBuiltInCharacterClass(WordClassID, true);

   287             break;

   289         // DecimalEscape

   290         case '1':

   291         case '2':

   292         case '3':

   293         case '4':

   294         case '5':

   295         case '6':

   296         case '7':

   297         case '8':

   298         case '9': {

   299             // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.

   300             // First, try to parse this as backreference.

   301             if (!inCharacterClass) {

   302                 ParseState state = saveState();

   304                 unsigned backReference;

   305                 if (!consumeNumber(backReference))

   306                     break;

   307                 if (backReference <= m_backReferenceLimit) {

   308                     delegate.atomBackReference(backReference);

   309                     break;

   310                 }

   312                 restoreState(state);

   313             }

   315             // Not a backreference, and not octal.

   316             if (peek() >= '8') {

   317                 delegate.atomPatternCharacter('\\');

   318                 break;

   319             }

   321             // Fall-through to handle this as an octal escape.

   322         }

   324         // Octal escape

   325         case '0':

   326             delegate.atomPatternCharacter(consumeOctal());

   327             break;

   329         // ControlEscape

   330         case 'f':

   331             consume();

   332             delegate.atomPatternCharacter('\f');

   333             break;

   334         case 'n':

   335             consume();

   336             delegate.atomPatternCharacter('\n');

   337             break;

   338         case 'r':

   339             consume();

   340             delegate.atomPatternCharacter('\r');

   341             break;

   342         case 't':

   343             consume();

   344             delegate.atomPatternCharacter('\t');

   345             break;

   346         case 'v':

   347             consume();

   348             delegate.atomPatternCharacter('\v');

   349             break;

   351         // ControlLetter

   352         case 'c': {

   353             ParseState state = saveState();

   354             consume();

   355             if (!atEndOfPattern()) {

   356                 int control = consume();

   358                 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.

   359                 if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {

   360                     delegate.atomPatternCharacter(control & 0x1f);

   361                     break;

   362                 }

   363             }

   364             restoreState(state);

   365             delegate.atomPatternCharacter('\\');

   366             break;

   367         }

   369         // HexEscape

   370         case 'x': {

   371             consume();

   372             int x = tryConsumeHex(2);

   373             if (x == -1)

   374                 delegate.atomPatternCharacter('x');

   375             else

   376                 delegate.atomPatternCharacter(x);

   377             break;

   378         }

   380         // UnicodeEscape

   381         case 'u': {

   382             consume();

   383             int u = tryConsumeHex(4);

   384             if (u == -1)

   385                 delegate.atomPatternCharacter('u');

   386             else

   387                 delegate.atomPatternCharacter(u);

   388             break;

   389         }

   391         // IdentityEscape

   392         default:

   393             delegate.atomPatternCharacter(consume());

   394         }

   396         return true;

   397     }

   399     /*

   400      * parseAtomEscape(), parseCharacterClassEscape():

   401      *

   402      * These methods alias to parseEscape().

   403      */

   404     bool parseAtomEscape()

   405     {

   406         return parseEscape<false>(m_delegate);

   407     }

   408     void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)

   409     {

   410         parseEscape<true>(delegate);

   411     }

   413     /*

   414      * parseCharacterClass():

   415      *

   416      * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)

   417      * to an instance of CharacterClassParserDelegate, to describe the character class to the

   418      * delegate.

   419      */

   420     void parseCharacterClass()

   421     {

   422         ASSERT(!m_err);

   423         ASSERT(peek() == '[');

   424         consume();

   426         CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);

   428         characterClassConstructor.begin(tryConsume('^'));

   430         while (!atEndOfPattern()) {

   431             switch (peek()) {

   432             case ']':

   433                 consume();

   434                 characterClassConstructor.end();

   435                 return;

   437             case '\\':

   438                 parseCharacterClassEscape(characterClassConstructor);

   439                 break;

   441             default:

   442                 characterClassConstructor.atomPatternCharacter(consume(), true);

   443             }

   445             if (m_err)

   446                 return;

   447         }

   449         m_err = CharacterClassUnmatched;

   450     }

   452     /*

   453      * parseParenthesesBegin():

   454      *

   455      * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.

   456      */

   457     void parseParenthesesBegin()

   458     {

   459         ASSERT(!m_err);

   460         ASSERT(peek() == '(');

   461         consume();

   463         if (tryConsume('?')) {

   464             if (atEndOfPattern()) {

   465                 m_err = ParenthesesTypeInvalid;

   466                 return;

   467             }

   469             switch (consume()) {

   470             case ':':

   471                 m_delegate.atomParenthesesSubpatternBegin(false);

   472                 break;

   474             case '=':

   475                 m_delegate.atomParentheticalAssertionBegin();

   476                 break;

   478             case '!':

   479                 m_delegate.atomParentheticalAssertionBegin(true);

   480                 break;

   482             default:

   483                 m_err = ParenthesesTypeInvalid;

   484             }

   485         } else

   486             m_delegate.atomParenthesesSubpatternBegin();

   488         ++m_parenthesesNestingDepth;

   489     }

   491     /*

   492      * parseParenthesesEnd():

   493      *

   494      * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).

   495      */

   496     void parseParenthesesEnd()

   497     {

   498         ASSERT(!m_err);

   499         ASSERT(peek() == ')');

   500         consume();

   502         if (m_parenthesesNestingDepth > 0)

   503             m_delegate.atomParenthesesEnd();

   504         else

   505             m_err = ParenthesesUnmatched;

   507         --m_parenthesesNestingDepth;

   508     }

   510     /*

   511      * parseQuantifier():

   512      *

   513      * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.

   514      */

   515     void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)

   516     {

   517         ASSERT(!m_err);

   518         ASSERT(min <= max);

   520         if (min == UINT_MAX) {

   521             m_err = QuantifierTooLarge;

   522             return;

   523         }

   525         if (lastTokenWasAnAtom)

   526             m_delegate.quantifyAtom(min, max, !tryConsume('?'));

   527         else

   528             m_err = QuantifierWithoutAtom;

   529     }

   531     /*

   532      * parseTokens():

   533      *

   534      * This method loops over the input pattern reporting tokens to the delegate.

   535      * The method returns when a parse error is detected, or the end of the pattern

   536      * is reached.  One piece of state is tracked around the loop, which is whether

   537      * the last token passed to the delegate was an atom (this is necessary to detect

   538      * a parse error when a quantifier provided without an atom to quantify).

   539      */

   540     void parseTokens()

   541     {

   542         bool lastTokenWasAnAtom = false;

   544         while (!atEndOfPattern()) {

   545             switch (peek()) {

   546             case '|':

   547                 consume();

   548                 m_delegate.disjunction();

   549                 lastTokenWasAnAtom = false;

   550                 break;

   552             case '(':

   553                 parseParenthesesBegin();

   554                 lastTokenWasAnAtom = false;

   555                 break;

   557             case ')':

   558                 parseParenthesesEnd();

   559                 lastTokenWasAnAtom = true;

   560                 break;

   562             case '^':

   563                 consume();

   564                 m_delegate.assertionBOL();

   565                 lastTokenWasAnAtom = false;

   566                 break;

   568             case '$':

   569                 consume();

   570                 m_delegate.assertionEOL();

   571                 lastTokenWasAnAtom = false;

   572                 break;

   574             case '.':

   575                 consume();

   576                 m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);

   577                 lastTokenWasAnAtom = true;

   578                 break;

   580             case '[':

   581                 parseCharacterClass();

   582                 lastTokenWasAnAtom = true;

   583                 break;

   585             case '\\':

   586                 lastTokenWasAnAtom = parseAtomEscape();

   587                 break;

   589             case '*':

   590                 consume();

   591                 parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);

   592                 lastTokenWasAnAtom = false;

   593                 break;

   595             case '+':

   596                 consume();

   597                 parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);

   598                 lastTokenWasAnAtom = false;

   599                 break;

   601             case '?':

   602                 consume();

   603                 parseQuantifier(lastTokenWasAnAtom, 0, 1);

   604                 lastTokenWasAnAtom = false;

   605                 break;

   607             case '{': {

   608                 ParseState state = saveState();

   610                 consume();

   611                 if (peekIsDigit()) {

   612                     unsigned min;

   613                     if (!consumeNumber(min))

   614                         break;

   616                     unsigned max = min;

   617                     if (tryConsume(',')) {

   618                         if (peekIsDigit()) {

   619                             if (!consumeNumber(max))

   620                                 break;

   621                         } else {

   622                             max = quantifyInfinite;

   623                         }

   624                     }

   626                     if (tryConsume('}')) {

   627                         if (min <= max)

   628                             parseQuantifier(lastTokenWasAnAtom, min, max);

   629                         else

   630                             m_err = QuantifierOutOfOrder;

   631                         lastTokenWasAnAtom = false;

   632                         break;

   633                     }

   634                 }

   636                 restoreState(state);

   637             } // if we did not find a complete quantifer, fall through to the default case.

   639             default:

   640                 m_delegate.atomPatternCharacter(consume());

   641                 lastTokenWasAnAtom = true;

   642             }

   644             if (m_err)

   645                 return;

   646         }

   648         if (m_parenthesesNestingDepth > 0)

   649             m_err = MissingParentheses;

   650     }

   652     /*

   653      * parse():

   654      *

   655      * This method calls parseTokens() to parse over the input and converts any

   656      * error code to a const char* for a result.

   657      */

   658     ErrorCode parse()

   659     {

   660         if (m_size > MAX_PATTERN_SIZE)

   661             m_err = PatternTooLarge;

   662         else

   663             parseTokens();

   664         ASSERT(atEndOfPattern() || m_err);

   666         return m_err;

   667     }

   669     // Misc helper functions:

   671     typedef unsigned ParseState;

   673     ParseState saveState()

   674     {

   675         return m_index;

   676     }

   678     void restoreState(ParseState state)

   679     {

   680         m_index = state;

   681     }

   683     bool atEndOfPattern()

   684     {

   685         ASSERT(m_index <= m_size);

   686         return m_index == m_size;

   687     }

   689     int peek()

   690     {

   691         ASSERT(m_index < m_size);

   692         return m_data[m_index];

   693     }

   695     bool peekIsDigit()

   696     {

   697         return !atEndOfPattern() && WTF::isASCIIDigit(peek());

   698     }

   700     unsigned peekDigit()

   701     {

   702         ASSERT(peekIsDigit());

   703         return peek() - '0';

   704     }

   706     int consume()

   707     {

   708         ASSERT(m_index < m_size);

   709         return m_data[m_index++];

   710     }

   712     unsigned consumeDigit()

   713     {

   714         ASSERT(peekIsDigit());

   715         return consume() - '0';

   716     }

   718     bool consumeNumber(unsigned &accum)

   719     {

   720         accum = consumeDigit();

   721         while (peekIsDigit()) {

   722             unsigned newValue = accum * 10 + peekDigit();

   723             if (newValue < accum) { /* Overflow check. */

   724                 m_err = QuantifierTooLarge;

   725                 return false;

   726             }

   727             accum = newValue;

   728             consume();

   729         }

   730         return true;

   731     }

   733     unsigned consumeOctal()

   734     {

   735         ASSERT(WTF::isASCIIOctalDigit(peek()));

   737         unsigned n = consumeDigit();

   738         while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))

   739             n = n * 8 + consumeDigit();

   740         return n;

   741     }

   743     bool tryConsume(UChar ch)

   744     {

   745         if (atEndOfPattern() || (m_data[m_index] != ch))

   746             return false;

   747         ++m_index;

   748         return true;

   749     }

   751     int tryConsumeHex(int count)

   752     {

   753         ParseState state = saveState();

   755         int n = 0;

   756         while (count--) {

   757             if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {

   758                 restoreState(state);

   759                 return -1;

   760             }

   761             n = (n << 4) | WTF::toASCIIHexValue(consume());

   762         }

   763         return n;

   764     }

   766     Delegate& m_delegate;

   767     unsigned m_backReferenceLimit;

   768     ErrorCode m_err;

   769     const CharType* m_data;

   770     unsigned m_size;

   771     unsigned m_index;

   772     unsigned m_parenthesesNestingDepth;

   774     // Derived by empirical testing of compile time in PCRE and WREC.

   775     static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;

   776 };

   778 /*

   779  * Yarr::parse():

   780  *

   781  * The parse method is passed a pattern to be parsed and a delegate upon which

   782  * callbacks will be made to record the parsed tokens forming the regex.

   783  * Yarr::parse() returns null on success, or a const C string providing an error

   784  * message where a parse error occurs.

   785  *

   786  * The Delegate must implement the following interface:

   787  *

   788  *    void assertionBOL();

   789  *    void assertionEOL();

   790  *    void assertionWordBoundary(bool invert);

   791  *

   792  *    void atomPatternCharacter(UChar ch);

   793  *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);

   794  *    void atomCharacterClassBegin(bool invert)

   795  *    void atomCharacterClassAtom(UChar ch)

   796  *    void atomCharacterClassRange(UChar begin, UChar end)

   797  *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)

   798  *    void atomCharacterClassEnd()

   799  *    void atomParenthesesSubpatternBegin(bool capture = true);

   800  *    void atomParentheticalAssertionBegin(bool invert = false);

   801  *    void atomParenthesesEnd();

   802  *    void atomBackReference(unsigned subpatternId);

   803  *

   804  *    void quantifyAtom(unsigned min, unsigned max, bool greedy);

   805  *

   806  *    void disjunction();

   807  *

   808  * The regular expression is described by a sequence of assertion*() and atom*()

   809  * callbacks to the delegate, describing the terms in the regular expression.

   810  * Following an atom a quantifyAtom() call may occur to indicate that the previous

   811  * atom should be quantified.  In the case of atoms described across multiple

   812  * calls (parentheses and character classes) the call to quantifyAtom() will come

   813  * after the call to the atom*End() method, never after atom*Begin().

   814  *

   815  * Character classes may either be described by a single call to

   816  * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.

   817  * In the latter case, ...Begin() will be called, followed by a sequence of

   818  * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().

   819  *

   820  * Sequences of atoms and assertions are broken into alternatives via calls to

   821  * disjunction().  Assertions, atoms, and disjunctions emitted between calls to

   822  * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.

   823  * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular

   824  * capturing subpattern, this will be the subpatternId associated with these

   825  * parentheses, and will also by definition be the lowest subpatternId of these

   826  * parentheses and of any nested paretheses.  The atomParenthesesEnd() method

   827  * is passed the subpatternId of the last capturing subexpression nested within

   828  * these paretheses.  In the case of a capturing subpattern with no nested

   829  * capturing subpatterns, the same subpatternId will be passed to the begin and

   830  * end functions.  In the case of non-capturing subpatterns the subpatternId

   831  * passed to the begin method is also the first possible subpatternId that might

   832  * be nested within these paretheses.  If a set of non-capturing parentheses does

   833  * not contain any capturing subpatterns, then the subpatternId passed to begin

   834  * will be greater than the subpatternId passed to end.

   835  */

   837 template<class Delegate>

   838 ErrorCode parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite)

   839 {

   840 #ifdef YARR_8BIT_CHAR_SUPPORT

   841     if (pattern.is8Bit())

   842         return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse();

   843 #endif

   844     return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse();

   845 }

   847 } } // namespace JSC::Yarr

   849 #endif /* yarr_YarrParser_h */

The Tor Browser / file revision

js/src/yarr/YarrParser.h@6474c204b198

js/src/yarr/YarrParser.h