Sat, 03 Jan 2015 20:18:00 +0100
Conditionally enable double key logic according to:
private browsing mode or privacy.thirdparty.isolate preference and
implement in GetCookieStringCommon and FindCookie where it counts...
With some reservations of how to convince FindCookie users to test
condition and pass a nullptr when disabling double key logic.
michael@0 | 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- |
michael@0 | 2 | * vim: set ts=8 sts=4 et sw=4 tw=99: |
michael@0 | 3 | * |
michael@0 | 4 | * Copyright (C) 2009 Apple Inc. All rights reserved. |
michael@0 | 5 | * |
michael@0 | 6 | * Redistribution and use in source and binary forms, with or without |
michael@0 | 7 | * modification, are permitted provided that the following conditions |
michael@0 | 8 | * are met: |
michael@0 | 9 | * 1. Redistributions of source code must retain the above copyright |
michael@0 | 10 | * notice, this list of conditions and the following disclaimer. |
michael@0 | 11 | * 2. Redistributions in binary form must reproduce the above copyright |
michael@0 | 12 | * notice, this list of conditions and the following disclaimer in the |
michael@0 | 13 | * documentation and/or other materials provided with the distribution. |
michael@0 | 14 | * |
michael@0 | 15 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
michael@0 | 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
michael@0 | 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
michael@0 | 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
michael@0 | 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
michael@0 | 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
michael@0 | 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
michael@0 | 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
michael@0 | 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
michael@0 | 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
michael@0 | 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
michael@0 | 26 | */ |
michael@0 | 27 | |
michael@0 | 28 | #ifndef yarr_YarrParser_h |
michael@0 | 29 | #define yarr_YarrParser_h |
michael@0 | 30 | |
michael@0 | 31 | #include "yarr/Yarr.h" |
michael@0 | 32 | |
michael@0 | 33 | namespace JSC { namespace Yarr { |
michael@0 | 34 | |
michael@0 | 35 | enum BuiltInCharacterClassID { |
michael@0 | 36 | DigitClassID, |
michael@0 | 37 | SpaceClassID, |
michael@0 | 38 | WordClassID, |
michael@0 | 39 | NewlineClassID |
michael@0 | 40 | }; |
michael@0 | 41 | |
michael@0 | 42 | // The Parser class should not be used directly - only via the Yarr::parse() method. |
michael@0 | 43 | template<class Delegate, typename CharType> |
michael@0 | 44 | class Parser { |
michael@0 | 45 | private: |
michael@0 | 46 | template<class FriendDelegate> |
michael@0 | 47 | friend ErrorCode parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit); |
michael@0 | 48 | |
michael@0 | 49 | /* |
michael@0 | 50 | * CharacterClassParserDelegate: |
michael@0 | 51 | * |
michael@0 | 52 | * The class CharacterClassParserDelegate is used in the parsing of character |
michael@0 | 53 | * classes. This class handles detection of character ranges. This class |
michael@0 | 54 | * implements enough of the delegate interface such that it can be passed to |
michael@0 | 55 | * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused |
michael@0 | 56 | * to perform the parsing of escape characters in character sets. |
michael@0 | 57 | */ |
michael@0 | 58 | class CharacterClassParserDelegate { |
michael@0 | 59 | public: |
michael@0 | 60 | CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) |
michael@0 | 61 | : m_delegate(delegate) |
michael@0 | 62 | , m_err(err) |
michael@0 | 63 | , m_state(Empty) |
michael@0 | 64 | , m_character(0) |
michael@0 | 65 | { |
michael@0 | 66 | } |
michael@0 | 67 | |
michael@0 | 68 | /* |
michael@0 | 69 | * begin(): |
michael@0 | 70 | * |
michael@0 | 71 | * Called at beginning of construction. |
michael@0 | 72 | */ |
michael@0 | 73 | void begin(bool invert) |
michael@0 | 74 | { |
michael@0 | 75 | m_delegate.atomCharacterClassBegin(invert); |
michael@0 | 76 | } |
michael@0 | 77 | |
michael@0 | 78 | /* |
michael@0 | 79 | * atomPatternCharacter(): |
michael@0 | 80 | * |
michael@0 | 81 | * This method is called either from parseCharacterClass() (for an unescaped |
michael@0 | 82 | * character in a character class), or from parseEscape(). In the former case |
michael@0 | 83 | * the value true will be passed for the argument 'hyphenIsRange', and in this |
michael@0 | 84 | * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ |
michael@0 | 85 | * is different to /[a\-z]/). |
michael@0 | 86 | */ |
michael@0 | 87 | void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) |
michael@0 | 88 | { |
michael@0 | 89 | switch (m_state) { |
michael@0 | 90 | case AfterCharacterClass: |
michael@0 | 91 | // Following a builtin character class we need look out for a hyphen. |
michael@0 | 92 | // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. |
michael@0 | 93 | // If we see a hyphen following a charater class then unlike usual |
michael@0 | 94 | // we'll report it to the delegate immediately, and put ourself into |
michael@0 | 95 | // a poisoned state. Any following calls to add another character or |
michael@0 | 96 | // character class will result in an error. (A hypen following a |
michael@0 | 97 | // character-class is itself valid, but only at the end of a regex). |
michael@0 | 98 | if (hyphenIsRange && ch == '-') { |
michael@0 | 99 | m_delegate.atomCharacterClassAtom('-'); |
michael@0 | 100 | m_state = AfterCharacterClassHyphen; |
michael@0 | 101 | return; |
michael@0 | 102 | } |
michael@0 | 103 | // Otherwise just fall through - cached character so treat this as Empty. |
michael@0 | 104 | |
michael@0 | 105 | case Empty: |
michael@0 | 106 | m_character = ch; |
michael@0 | 107 | m_state = CachedCharacter; |
michael@0 | 108 | return; |
michael@0 | 109 | |
michael@0 | 110 | case CachedCharacter: |
michael@0 | 111 | if (hyphenIsRange && ch == '-') |
michael@0 | 112 | m_state = CachedCharacterHyphen; |
michael@0 | 113 | else { |
michael@0 | 114 | m_delegate.atomCharacterClassAtom(m_character); |
michael@0 | 115 | m_character = ch; |
michael@0 | 116 | } |
michael@0 | 117 | return; |
michael@0 | 118 | |
michael@0 | 119 | case CachedCharacterHyphen: |
michael@0 | 120 | if (ch < m_character) { |
michael@0 | 121 | m_err = CharacterClassOutOfOrder; |
michael@0 | 122 | return; |
michael@0 | 123 | } |
michael@0 | 124 | m_delegate.atomCharacterClassRange(m_character, ch); |
michael@0 | 125 | m_state = Empty; |
michael@0 | 126 | return; |
michael@0 | 127 | |
michael@0 | 128 | case AfterCharacterClassHyphen: |
michael@0 | 129 | m_delegate.atomCharacterClassAtom(ch); |
michael@0 | 130 | m_state = Empty; |
michael@0 | 131 | return; |
michael@0 | 132 | } |
michael@0 | 133 | } |
michael@0 | 134 | |
michael@0 | 135 | /* |
michael@0 | 136 | * atomBuiltInCharacterClass(): |
michael@0 | 137 | * |
michael@0 | 138 | * Adds a built-in character class, called by parseEscape(). |
michael@0 | 139 | */ |
michael@0 | 140 | void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) |
michael@0 | 141 | { |
michael@0 | 142 | switch (m_state) { |
michael@0 | 143 | case CachedCharacter: |
michael@0 | 144 | // Flush the currently cached character, then fall through. |
michael@0 | 145 | m_delegate.atomCharacterClassAtom(m_character); |
michael@0 | 146 | |
michael@0 | 147 | case Empty: |
michael@0 | 148 | case AfterCharacterClass: |
michael@0 | 149 | m_state = AfterCharacterClass; |
michael@0 | 150 | m_delegate.atomCharacterClassBuiltIn(classID, invert); |
michael@0 | 151 | return; |
michael@0 | 152 | |
michael@0 | 153 | case CachedCharacterHyphen: |
michael@0 | 154 | // Error! We have a range that looks like [x-\d]. We require |
michael@0 | 155 | // the end of the range to be a single character. |
michael@0 | 156 | m_err = CharacterClassInvalidRange; |
michael@0 | 157 | return; |
michael@0 | 158 | case AfterCharacterClassHyphen: |
michael@0 | 159 | m_delegate.atomCharacterClassBuiltIn(classID, invert); |
michael@0 | 160 | m_state = Empty; |
michael@0 | 161 | return; |
michael@0 | 162 | } |
michael@0 | 163 | } |
michael@0 | 164 | |
michael@0 | 165 | /* |
michael@0 | 166 | * end(): |
michael@0 | 167 | * |
michael@0 | 168 | * Called at end of construction. |
michael@0 | 169 | */ |
michael@0 | 170 | void end() |
michael@0 | 171 | { |
michael@0 | 172 | if (m_state == CachedCharacter) |
michael@0 | 173 | m_delegate.atomCharacterClassAtom(m_character); |
michael@0 | 174 | else if (m_state == CachedCharacterHyphen) { |
michael@0 | 175 | m_delegate.atomCharacterClassAtom(m_character); |
michael@0 | 176 | m_delegate.atomCharacterClassAtom('-'); |
michael@0 | 177 | } |
michael@0 | 178 | m_delegate.atomCharacterClassEnd(); |
michael@0 | 179 | } |
michael@0 | 180 | |
michael@0 | 181 | // parseEscape() should never call these delegate methods when |
michael@0 | 182 | // invoked with inCharacterClass set. |
michael@0 | 183 | NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); } |
michael@0 | 184 | NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); } |
michael@0 | 185 | |
michael@0 | 186 | private: |
michael@0 | 187 | Delegate& m_delegate; |
michael@0 | 188 | ErrorCode& m_err; |
michael@0 | 189 | enum CharacterClassConstructionState { |
michael@0 | 190 | Empty, |
michael@0 | 191 | CachedCharacter, |
michael@0 | 192 | CachedCharacterHyphen, |
michael@0 | 193 | AfterCharacterClass, |
michael@0 | 194 | AfterCharacterClassHyphen |
michael@0 | 195 | } m_state; |
michael@0 | 196 | UChar m_character; |
michael@0 | 197 | }; |
michael@0 | 198 | |
michael@0 | 199 | Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit) |
michael@0 | 200 | : m_delegate(delegate) |
michael@0 | 201 | , m_backReferenceLimit(backReferenceLimit) |
michael@0 | 202 | , m_err(NoError) |
michael@0 | 203 | , m_data(pattern.chars()) |
michael@0 | 204 | , m_size(pattern.length()) |
michael@0 | 205 | , m_index(0) |
michael@0 | 206 | , m_parenthesesNestingDepth(0) |
michael@0 | 207 | { |
michael@0 | 208 | } |
michael@0 | 209 | |
michael@0 | 210 | /* |
michael@0 | 211 | * parseEscape(): |
michael@0 | 212 | * |
michael@0 | 213 | * Helper for parseTokens() AND parseCharacterClass(). |
michael@0 | 214 | * Unlike the other parser methods, this function does not report tokens |
michael@0 | 215 | * directly to the member delegate (m_delegate), instead tokens are |
michael@0 | 216 | * emitted to the delegate provided as an argument. In the case of atom |
michael@0 | 217 | * escapes, parseTokens() will call parseEscape() passing m_delegate as |
michael@0 | 218 | * an argument, and as such the escape will be reported to the delegate. |
michael@0 | 219 | * |
michael@0 | 220 | * However this method may also be used by parseCharacterClass(), in which |
michael@0 | 221 | * case a CharacterClassParserDelegate will be passed as the delegate that |
michael@0 | 222 | * tokens should be added to. A boolean flag is also provided to indicate |
michael@0 | 223 | * whether that an escape in a CharacterClass is being parsed (some parsing |
michael@0 | 224 | * rules change in this context). |
michael@0 | 225 | * |
michael@0 | 226 | * The boolean value returned by this method indicates whether the token |
michael@0 | 227 | * parsed was an atom (outside of a characted class \b and \B will be |
michael@0 | 228 | * interpreted as assertions). |
michael@0 | 229 | */ |
michael@0 | 230 | template<bool inCharacterClass, class EscapeDelegate> |
michael@0 | 231 | bool parseEscape(EscapeDelegate& delegate) |
michael@0 | 232 | { |
michael@0 | 233 | ASSERT(!m_err); |
michael@0 | 234 | ASSERT(peek() == '\\'); |
michael@0 | 235 | consume(); |
michael@0 | 236 | |
michael@0 | 237 | if (atEndOfPattern()) { |
michael@0 | 238 | m_err = EscapeUnterminated; |
michael@0 | 239 | return false; |
michael@0 | 240 | } |
michael@0 | 241 | |
michael@0 | 242 | switch (peek()) { |
michael@0 | 243 | // Assertions |
michael@0 | 244 | case 'b': |
michael@0 | 245 | consume(); |
michael@0 | 246 | if (inCharacterClass) |
michael@0 | 247 | delegate.atomPatternCharacter('\b'); |
michael@0 | 248 | else { |
michael@0 | 249 | delegate.assertionWordBoundary(false); |
michael@0 | 250 | return false; |
michael@0 | 251 | } |
michael@0 | 252 | break; |
michael@0 | 253 | case 'B': |
michael@0 | 254 | consume(); |
michael@0 | 255 | if (inCharacterClass) |
michael@0 | 256 | delegate.atomPatternCharacter('B'); |
michael@0 | 257 | else { |
michael@0 | 258 | delegate.assertionWordBoundary(true); |
michael@0 | 259 | return false; |
michael@0 | 260 | } |
michael@0 | 261 | break; |
michael@0 | 262 | |
michael@0 | 263 | // CharacterClassEscape |
michael@0 | 264 | case 'd': |
michael@0 | 265 | consume(); |
michael@0 | 266 | delegate.atomBuiltInCharacterClass(DigitClassID, false); |
michael@0 | 267 | break; |
michael@0 | 268 | case 's': |
michael@0 | 269 | consume(); |
michael@0 | 270 | delegate.atomBuiltInCharacterClass(SpaceClassID, false); |
michael@0 | 271 | break; |
michael@0 | 272 | case 'w': |
michael@0 | 273 | consume(); |
michael@0 | 274 | delegate.atomBuiltInCharacterClass(WordClassID, false); |
michael@0 | 275 | break; |
michael@0 | 276 | case 'D': |
michael@0 | 277 | consume(); |
michael@0 | 278 | delegate.atomBuiltInCharacterClass(DigitClassID, true); |
michael@0 | 279 | break; |
michael@0 | 280 | case 'S': |
michael@0 | 281 | consume(); |
michael@0 | 282 | delegate.atomBuiltInCharacterClass(SpaceClassID, true); |
michael@0 | 283 | break; |
michael@0 | 284 | case 'W': |
michael@0 | 285 | consume(); |
michael@0 | 286 | delegate.atomBuiltInCharacterClass(WordClassID, true); |
michael@0 | 287 | break; |
michael@0 | 288 | |
michael@0 | 289 | // DecimalEscape |
michael@0 | 290 | case '1': |
michael@0 | 291 | case '2': |
michael@0 | 292 | case '3': |
michael@0 | 293 | case '4': |
michael@0 | 294 | case '5': |
michael@0 | 295 | case '6': |
michael@0 | 296 | case '7': |
michael@0 | 297 | case '8': |
michael@0 | 298 | case '9': { |
michael@0 | 299 | // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. |
michael@0 | 300 | // First, try to parse this as backreference. |
michael@0 | 301 | if (!inCharacterClass) { |
michael@0 | 302 | ParseState state = saveState(); |
michael@0 | 303 | |
michael@0 | 304 | unsigned backReference; |
michael@0 | 305 | if (!consumeNumber(backReference)) |
michael@0 | 306 | break; |
michael@0 | 307 | if (backReference <= m_backReferenceLimit) { |
michael@0 | 308 | delegate.atomBackReference(backReference); |
michael@0 | 309 | break; |
michael@0 | 310 | } |
michael@0 | 311 | |
michael@0 | 312 | restoreState(state); |
michael@0 | 313 | } |
michael@0 | 314 | |
michael@0 | 315 | // Not a backreference, and not octal. |
michael@0 | 316 | if (peek() >= '8') { |
michael@0 | 317 | delegate.atomPatternCharacter('\\'); |
michael@0 | 318 | break; |
michael@0 | 319 | } |
michael@0 | 320 | |
michael@0 | 321 | // Fall-through to handle this as an octal escape. |
michael@0 | 322 | } |
michael@0 | 323 | |
michael@0 | 324 | // Octal escape |
michael@0 | 325 | case '0': |
michael@0 | 326 | delegate.atomPatternCharacter(consumeOctal()); |
michael@0 | 327 | break; |
michael@0 | 328 | |
michael@0 | 329 | // ControlEscape |
michael@0 | 330 | case 'f': |
michael@0 | 331 | consume(); |
michael@0 | 332 | delegate.atomPatternCharacter('\f'); |
michael@0 | 333 | break; |
michael@0 | 334 | case 'n': |
michael@0 | 335 | consume(); |
michael@0 | 336 | delegate.atomPatternCharacter('\n'); |
michael@0 | 337 | break; |
michael@0 | 338 | case 'r': |
michael@0 | 339 | consume(); |
michael@0 | 340 | delegate.atomPatternCharacter('\r'); |
michael@0 | 341 | break; |
michael@0 | 342 | case 't': |
michael@0 | 343 | consume(); |
michael@0 | 344 | delegate.atomPatternCharacter('\t'); |
michael@0 | 345 | break; |
michael@0 | 346 | case 'v': |
michael@0 | 347 | consume(); |
michael@0 | 348 | delegate.atomPatternCharacter('\v'); |
michael@0 | 349 | break; |
michael@0 | 350 | |
michael@0 | 351 | // ControlLetter |
michael@0 | 352 | case 'c': { |
michael@0 | 353 | ParseState state = saveState(); |
michael@0 | 354 | consume(); |
michael@0 | 355 | if (!atEndOfPattern()) { |
michael@0 | 356 | int control = consume(); |
michael@0 | 357 | |
michael@0 | 358 | // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. |
michael@0 | 359 | if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { |
michael@0 | 360 | delegate.atomPatternCharacter(control & 0x1f); |
michael@0 | 361 | break; |
michael@0 | 362 | } |
michael@0 | 363 | } |
michael@0 | 364 | restoreState(state); |
michael@0 | 365 | delegate.atomPatternCharacter('\\'); |
michael@0 | 366 | break; |
michael@0 | 367 | } |
michael@0 | 368 | |
michael@0 | 369 | // HexEscape |
michael@0 | 370 | case 'x': { |
michael@0 | 371 | consume(); |
michael@0 | 372 | int x = tryConsumeHex(2); |
michael@0 | 373 | if (x == -1) |
michael@0 | 374 | delegate.atomPatternCharacter('x'); |
michael@0 | 375 | else |
michael@0 | 376 | delegate.atomPatternCharacter(x); |
michael@0 | 377 | break; |
michael@0 | 378 | } |
michael@0 | 379 | |
michael@0 | 380 | // UnicodeEscape |
michael@0 | 381 | case 'u': { |
michael@0 | 382 | consume(); |
michael@0 | 383 | int u = tryConsumeHex(4); |
michael@0 | 384 | if (u == -1) |
michael@0 | 385 | delegate.atomPatternCharacter('u'); |
michael@0 | 386 | else |
michael@0 | 387 | delegate.atomPatternCharacter(u); |
michael@0 | 388 | break; |
michael@0 | 389 | } |
michael@0 | 390 | |
michael@0 | 391 | // IdentityEscape |
michael@0 | 392 | default: |
michael@0 | 393 | delegate.atomPatternCharacter(consume()); |
michael@0 | 394 | } |
michael@0 | 395 | |
michael@0 | 396 | return true; |
michael@0 | 397 | } |
michael@0 | 398 | |
michael@0 | 399 | /* |
michael@0 | 400 | * parseAtomEscape(), parseCharacterClassEscape(): |
michael@0 | 401 | * |
michael@0 | 402 | * These methods alias to parseEscape(). |
michael@0 | 403 | */ |
michael@0 | 404 | bool parseAtomEscape() |
michael@0 | 405 | { |
michael@0 | 406 | return parseEscape<false>(m_delegate); |
michael@0 | 407 | } |
michael@0 | 408 | void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) |
michael@0 | 409 | { |
michael@0 | 410 | parseEscape<true>(delegate); |
michael@0 | 411 | } |
michael@0 | 412 | |
michael@0 | 413 | /* |
michael@0 | 414 | * parseCharacterClass(): |
michael@0 | 415 | * |
michael@0 | 416 | * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) |
michael@0 | 417 | * to an instance of CharacterClassParserDelegate, to describe the character class to the |
michael@0 | 418 | * delegate. |
michael@0 | 419 | */ |
michael@0 | 420 | void parseCharacterClass() |
michael@0 | 421 | { |
michael@0 | 422 | ASSERT(!m_err); |
michael@0 | 423 | ASSERT(peek() == '['); |
michael@0 | 424 | consume(); |
michael@0 | 425 | |
michael@0 | 426 | CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); |
michael@0 | 427 | |
michael@0 | 428 | characterClassConstructor.begin(tryConsume('^')); |
michael@0 | 429 | |
michael@0 | 430 | while (!atEndOfPattern()) { |
michael@0 | 431 | switch (peek()) { |
michael@0 | 432 | case ']': |
michael@0 | 433 | consume(); |
michael@0 | 434 | characterClassConstructor.end(); |
michael@0 | 435 | return; |
michael@0 | 436 | |
michael@0 | 437 | case '\\': |
michael@0 | 438 | parseCharacterClassEscape(characterClassConstructor); |
michael@0 | 439 | break; |
michael@0 | 440 | |
michael@0 | 441 | default: |
michael@0 | 442 | characterClassConstructor.atomPatternCharacter(consume(), true); |
michael@0 | 443 | } |
michael@0 | 444 | |
michael@0 | 445 | if (m_err) |
michael@0 | 446 | return; |
michael@0 | 447 | } |
michael@0 | 448 | |
michael@0 | 449 | m_err = CharacterClassUnmatched; |
michael@0 | 450 | } |
michael@0 | 451 | |
michael@0 | 452 | /* |
michael@0 | 453 | * parseParenthesesBegin(): |
michael@0 | 454 | * |
michael@0 | 455 | * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. |
michael@0 | 456 | */ |
michael@0 | 457 | void parseParenthesesBegin() |
michael@0 | 458 | { |
michael@0 | 459 | ASSERT(!m_err); |
michael@0 | 460 | ASSERT(peek() == '('); |
michael@0 | 461 | consume(); |
michael@0 | 462 | |
michael@0 | 463 | if (tryConsume('?')) { |
michael@0 | 464 | if (atEndOfPattern()) { |
michael@0 | 465 | m_err = ParenthesesTypeInvalid; |
michael@0 | 466 | return; |
michael@0 | 467 | } |
michael@0 | 468 | |
michael@0 | 469 | switch (consume()) { |
michael@0 | 470 | case ':': |
michael@0 | 471 | m_delegate.atomParenthesesSubpatternBegin(false); |
michael@0 | 472 | break; |
michael@0 | 473 | |
michael@0 | 474 | case '=': |
michael@0 | 475 | m_delegate.atomParentheticalAssertionBegin(); |
michael@0 | 476 | break; |
michael@0 | 477 | |
michael@0 | 478 | case '!': |
michael@0 | 479 | m_delegate.atomParentheticalAssertionBegin(true); |
michael@0 | 480 | break; |
michael@0 | 481 | |
michael@0 | 482 | default: |
michael@0 | 483 | m_err = ParenthesesTypeInvalid; |
michael@0 | 484 | } |
michael@0 | 485 | } else |
michael@0 | 486 | m_delegate.atomParenthesesSubpatternBegin(); |
michael@0 | 487 | |
michael@0 | 488 | ++m_parenthesesNestingDepth; |
michael@0 | 489 | } |
michael@0 | 490 | |
michael@0 | 491 | /* |
michael@0 | 492 | * parseParenthesesEnd(): |
michael@0 | 493 | * |
michael@0 | 494 | * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). |
michael@0 | 495 | */ |
michael@0 | 496 | void parseParenthesesEnd() |
michael@0 | 497 | { |
michael@0 | 498 | ASSERT(!m_err); |
michael@0 | 499 | ASSERT(peek() == ')'); |
michael@0 | 500 | consume(); |
michael@0 | 501 | |
michael@0 | 502 | if (m_parenthesesNestingDepth > 0) |
michael@0 | 503 | m_delegate.atomParenthesesEnd(); |
michael@0 | 504 | else |
michael@0 | 505 | m_err = ParenthesesUnmatched; |
michael@0 | 506 | |
michael@0 | 507 | --m_parenthesesNestingDepth; |
michael@0 | 508 | } |
michael@0 | 509 | |
michael@0 | 510 | /* |
michael@0 | 511 | * parseQuantifier(): |
michael@0 | 512 | * |
michael@0 | 513 | * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. |
michael@0 | 514 | */ |
michael@0 | 515 | void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) |
michael@0 | 516 | { |
michael@0 | 517 | ASSERT(!m_err); |
michael@0 | 518 | ASSERT(min <= max); |
michael@0 | 519 | |
michael@0 | 520 | if (min == UINT_MAX) { |
michael@0 | 521 | m_err = QuantifierTooLarge; |
michael@0 | 522 | return; |
michael@0 | 523 | } |
michael@0 | 524 | |
michael@0 | 525 | if (lastTokenWasAnAtom) |
michael@0 | 526 | m_delegate.quantifyAtom(min, max, !tryConsume('?')); |
michael@0 | 527 | else |
michael@0 | 528 | m_err = QuantifierWithoutAtom; |
michael@0 | 529 | } |
michael@0 | 530 | |
michael@0 | 531 | /* |
michael@0 | 532 | * parseTokens(): |
michael@0 | 533 | * |
michael@0 | 534 | * This method loops over the input pattern reporting tokens to the delegate. |
michael@0 | 535 | * The method returns when a parse error is detected, or the end of the pattern |
michael@0 | 536 | * is reached. One piece of state is tracked around the loop, which is whether |
michael@0 | 537 | * the last token passed to the delegate was an atom (this is necessary to detect |
michael@0 | 538 | * a parse error when a quantifier provided without an atom to quantify). |
michael@0 | 539 | */ |
michael@0 | 540 | void parseTokens() |
michael@0 | 541 | { |
michael@0 | 542 | bool lastTokenWasAnAtom = false; |
michael@0 | 543 | |
michael@0 | 544 | while (!atEndOfPattern()) { |
michael@0 | 545 | switch (peek()) { |
michael@0 | 546 | case '|': |
michael@0 | 547 | consume(); |
michael@0 | 548 | m_delegate.disjunction(); |
michael@0 | 549 | lastTokenWasAnAtom = false; |
michael@0 | 550 | break; |
michael@0 | 551 | |
michael@0 | 552 | case '(': |
michael@0 | 553 | parseParenthesesBegin(); |
michael@0 | 554 | lastTokenWasAnAtom = false; |
michael@0 | 555 | break; |
michael@0 | 556 | |
michael@0 | 557 | case ')': |
michael@0 | 558 | parseParenthesesEnd(); |
michael@0 | 559 | lastTokenWasAnAtom = true; |
michael@0 | 560 | break; |
michael@0 | 561 | |
michael@0 | 562 | case '^': |
michael@0 | 563 | consume(); |
michael@0 | 564 | m_delegate.assertionBOL(); |
michael@0 | 565 | lastTokenWasAnAtom = false; |
michael@0 | 566 | break; |
michael@0 | 567 | |
michael@0 | 568 | case '$': |
michael@0 | 569 | consume(); |
michael@0 | 570 | m_delegate.assertionEOL(); |
michael@0 | 571 | lastTokenWasAnAtom = false; |
michael@0 | 572 | break; |
michael@0 | 573 | |
michael@0 | 574 | case '.': |
michael@0 | 575 | consume(); |
michael@0 | 576 | m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); |
michael@0 | 577 | lastTokenWasAnAtom = true; |
michael@0 | 578 | break; |
michael@0 | 579 | |
michael@0 | 580 | case '[': |
michael@0 | 581 | parseCharacterClass(); |
michael@0 | 582 | lastTokenWasAnAtom = true; |
michael@0 | 583 | break; |
michael@0 | 584 | |
michael@0 | 585 | case '\\': |
michael@0 | 586 | lastTokenWasAnAtom = parseAtomEscape(); |
michael@0 | 587 | break; |
michael@0 | 588 | |
michael@0 | 589 | case '*': |
michael@0 | 590 | consume(); |
michael@0 | 591 | parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); |
michael@0 | 592 | lastTokenWasAnAtom = false; |
michael@0 | 593 | break; |
michael@0 | 594 | |
michael@0 | 595 | case '+': |
michael@0 | 596 | consume(); |
michael@0 | 597 | parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); |
michael@0 | 598 | lastTokenWasAnAtom = false; |
michael@0 | 599 | break; |
michael@0 | 600 | |
michael@0 | 601 | case '?': |
michael@0 | 602 | consume(); |
michael@0 | 603 | parseQuantifier(lastTokenWasAnAtom, 0, 1); |
michael@0 | 604 | lastTokenWasAnAtom = false; |
michael@0 | 605 | break; |
michael@0 | 606 | |
michael@0 | 607 | case '{': { |
michael@0 | 608 | ParseState state = saveState(); |
michael@0 | 609 | |
michael@0 | 610 | consume(); |
michael@0 | 611 | if (peekIsDigit()) { |
michael@0 | 612 | unsigned min; |
michael@0 | 613 | if (!consumeNumber(min)) |
michael@0 | 614 | break; |
michael@0 | 615 | |
michael@0 | 616 | unsigned max = min; |
michael@0 | 617 | if (tryConsume(',')) { |
michael@0 | 618 | if (peekIsDigit()) { |
michael@0 | 619 | if (!consumeNumber(max)) |
michael@0 | 620 | break; |
michael@0 | 621 | } else { |
michael@0 | 622 | max = quantifyInfinite; |
michael@0 | 623 | } |
michael@0 | 624 | } |
michael@0 | 625 | |
michael@0 | 626 | if (tryConsume('}')) { |
michael@0 | 627 | if (min <= max) |
michael@0 | 628 | parseQuantifier(lastTokenWasAnAtom, min, max); |
michael@0 | 629 | else |
michael@0 | 630 | m_err = QuantifierOutOfOrder; |
michael@0 | 631 | lastTokenWasAnAtom = false; |
michael@0 | 632 | break; |
michael@0 | 633 | } |
michael@0 | 634 | } |
michael@0 | 635 | |
michael@0 | 636 | restoreState(state); |
michael@0 | 637 | } // if we did not find a complete quantifer, fall through to the default case. |
michael@0 | 638 | |
michael@0 | 639 | default: |
michael@0 | 640 | m_delegate.atomPatternCharacter(consume()); |
michael@0 | 641 | lastTokenWasAnAtom = true; |
michael@0 | 642 | } |
michael@0 | 643 | |
michael@0 | 644 | if (m_err) |
michael@0 | 645 | return; |
michael@0 | 646 | } |
michael@0 | 647 | |
michael@0 | 648 | if (m_parenthesesNestingDepth > 0) |
michael@0 | 649 | m_err = MissingParentheses; |
michael@0 | 650 | } |
michael@0 | 651 | |
michael@0 | 652 | /* |
michael@0 | 653 | * parse(): |
michael@0 | 654 | * |
michael@0 | 655 | * This method calls parseTokens() to parse over the input and converts any |
michael@0 | 656 | * error code to a const char* for a result. |
michael@0 | 657 | */ |
michael@0 | 658 | ErrorCode parse() |
michael@0 | 659 | { |
michael@0 | 660 | if (m_size > MAX_PATTERN_SIZE) |
michael@0 | 661 | m_err = PatternTooLarge; |
michael@0 | 662 | else |
michael@0 | 663 | parseTokens(); |
michael@0 | 664 | ASSERT(atEndOfPattern() || m_err); |
michael@0 | 665 | |
michael@0 | 666 | return m_err; |
michael@0 | 667 | } |
michael@0 | 668 | |
michael@0 | 669 | // Misc helper functions: |
michael@0 | 670 | |
michael@0 | 671 | typedef unsigned ParseState; |
michael@0 | 672 | |
michael@0 | 673 | ParseState saveState() |
michael@0 | 674 | { |
michael@0 | 675 | return m_index; |
michael@0 | 676 | } |
michael@0 | 677 | |
michael@0 | 678 | void restoreState(ParseState state) |
michael@0 | 679 | { |
michael@0 | 680 | m_index = state; |
michael@0 | 681 | } |
michael@0 | 682 | |
michael@0 | 683 | bool atEndOfPattern() |
michael@0 | 684 | { |
michael@0 | 685 | ASSERT(m_index <= m_size); |
michael@0 | 686 | return m_index == m_size; |
michael@0 | 687 | } |
michael@0 | 688 | |
michael@0 | 689 | int peek() |
michael@0 | 690 | { |
michael@0 | 691 | ASSERT(m_index < m_size); |
michael@0 | 692 | return m_data[m_index]; |
michael@0 | 693 | } |
michael@0 | 694 | |
michael@0 | 695 | bool peekIsDigit() |
michael@0 | 696 | { |
michael@0 | 697 | return !atEndOfPattern() && WTF::isASCIIDigit(peek()); |
michael@0 | 698 | } |
michael@0 | 699 | |
michael@0 | 700 | unsigned peekDigit() |
michael@0 | 701 | { |
michael@0 | 702 | ASSERT(peekIsDigit()); |
michael@0 | 703 | return peek() - '0'; |
michael@0 | 704 | } |
michael@0 | 705 | |
michael@0 | 706 | int consume() |
michael@0 | 707 | { |
michael@0 | 708 | ASSERT(m_index < m_size); |
michael@0 | 709 | return m_data[m_index++]; |
michael@0 | 710 | } |
michael@0 | 711 | |
michael@0 | 712 | unsigned consumeDigit() |
michael@0 | 713 | { |
michael@0 | 714 | ASSERT(peekIsDigit()); |
michael@0 | 715 | return consume() - '0'; |
michael@0 | 716 | } |
michael@0 | 717 | |
michael@0 | 718 | bool consumeNumber(unsigned &accum) |
michael@0 | 719 | { |
michael@0 | 720 | accum = consumeDigit(); |
michael@0 | 721 | while (peekIsDigit()) { |
michael@0 | 722 | unsigned newValue = accum * 10 + peekDigit(); |
michael@0 | 723 | if (newValue < accum) { /* Overflow check. */ |
michael@0 | 724 | m_err = QuantifierTooLarge; |
michael@0 | 725 | return false; |
michael@0 | 726 | } |
michael@0 | 727 | accum = newValue; |
michael@0 | 728 | consume(); |
michael@0 | 729 | } |
michael@0 | 730 | return true; |
michael@0 | 731 | } |
michael@0 | 732 | |
michael@0 | 733 | unsigned consumeOctal() |
michael@0 | 734 | { |
michael@0 | 735 | ASSERT(WTF::isASCIIOctalDigit(peek())); |
michael@0 | 736 | |
michael@0 | 737 | unsigned n = consumeDigit(); |
michael@0 | 738 | while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) |
michael@0 | 739 | n = n * 8 + consumeDigit(); |
michael@0 | 740 | return n; |
michael@0 | 741 | } |
michael@0 | 742 | |
michael@0 | 743 | bool tryConsume(UChar ch) |
michael@0 | 744 | { |
michael@0 | 745 | if (atEndOfPattern() || (m_data[m_index] != ch)) |
michael@0 | 746 | return false; |
michael@0 | 747 | ++m_index; |
michael@0 | 748 | return true; |
michael@0 | 749 | } |
michael@0 | 750 | |
michael@0 | 751 | int tryConsumeHex(int count) |
michael@0 | 752 | { |
michael@0 | 753 | ParseState state = saveState(); |
michael@0 | 754 | |
michael@0 | 755 | int n = 0; |
michael@0 | 756 | while (count--) { |
michael@0 | 757 | if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { |
michael@0 | 758 | restoreState(state); |
michael@0 | 759 | return -1; |
michael@0 | 760 | } |
michael@0 | 761 | n = (n << 4) | WTF::toASCIIHexValue(consume()); |
michael@0 | 762 | } |
michael@0 | 763 | return n; |
michael@0 | 764 | } |
michael@0 | 765 | |
michael@0 | 766 | Delegate& m_delegate; |
michael@0 | 767 | unsigned m_backReferenceLimit; |
michael@0 | 768 | ErrorCode m_err; |
michael@0 | 769 | const CharType* m_data; |
michael@0 | 770 | unsigned m_size; |
michael@0 | 771 | unsigned m_index; |
michael@0 | 772 | unsigned m_parenthesesNestingDepth; |
michael@0 | 773 | |
michael@0 | 774 | // Derived by empirical testing of compile time in PCRE and WREC. |
michael@0 | 775 | static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; |
michael@0 | 776 | }; |
michael@0 | 777 | |
michael@0 | 778 | /* |
michael@0 | 779 | * Yarr::parse(): |
michael@0 | 780 | * |
michael@0 | 781 | * The parse method is passed a pattern to be parsed and a delegate upon which |
michael@0 | 782 | * callbacks will be made to record the parsed tokens forming the regex. |
michael@0 | 783 | * Yarr::parse() returns null on success, or a const C string providing an error |
michael@0 | 784 | * message where a parse error occurs. |
michael@0 | 785 | * |
michael@0 | 786 | * The Delegate must implement the following interface: |
michael@0 | 787 | * |
michael@0 | 788 | * void assertionBOL(); |
michael@0 | 789 | * void assertionEOL(); |
michael@0 | 790 | * void assertionWordBoundary(bool invert); |
michael@0 | 791 | * |
michael@0 | 792 | * void atomPatternCharacter(UChar ch); |
michael@0 | 793 | * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); |
michael@0 | 794 | * void atomCharacterClassBegin(bool invert) |
michael@0 | 795 | * void atomCharacterClassAtom(UChar ch) |
michael@0 | 796 | * void atomCharacterClassRange(UChar begin, UChar end) |
michael@0 | 797 | * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) |
michael@0 | 798 | * void atomCharacterClassEnd() |
michael@0 | 799 | * void atomParenthesesSubpatternBegin(bool capture = true); |
michael@0 | 800 | * void atomParentheticalAssertionBegin(bool invert = false); |
michael@0 | 801 | * void atomParenthesesEnd(); |
michael@0 | 802 | * void atomBackReference(unsigned subpatternId); |
michael@0 | 803 | * |
michael@0 | 804 | * void quantifyAtom(unsigned min, unsigned max, bool greedy); |
michael@0 | 805 | * |
michael@0 | 806 | * void disjunction(); |
michael@0 | 807 | * |
michael@0 | 808 | * The regular expression is described by a sequence of assertion*() and atom*() |
michael@0 | 809 | * callbacks to the delegate, describing the terms in the regular expression. |
michael@0 | 810 | * Following an atom a quantifyAtom() call may occur to indicate that the previous |
michael@0 | 811 | * atom should be quantified. In the case of atoms described across multiple |
michael@0 | 812 | * calls (parentheses and character classes) the call to quantifyAtom() will come |
michael@0 | 813 | * after the call to the atom*End() method, never after atom*Begin(). |
michael@0 | 814 | * |
michael@0 | 815 | * Character classes may either be described by a single call to |
michael@0 | 816 | * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. |
michael@0 | 817 | * In the latter case, ...Begin() will be called, followed by a sequence of |
michael@0 | 818 | * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). |
michael@0 | 819 | * |
michael@0 | 820 | * Sequences of atoms and assertions are broken into alternatives via calls to |
michael@0 | 821 | * disjunction(). Assertions, atoms, and disjunctions emitted between calls to |
michael@0 | 822 | * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. |
michael@0 | 823 | * atomParenthesesBegin() is passed a subpatternId. In the case of a regular |
michael@0 | 824 | * capturing subpattern, this will be the subpatternId associated with these |
michael@0 | 825 | * parentheses, and will also by definition be the lowest subpatternId of these |
michael@0 | 826 | * parentheses and of any nested paretheses. The atomParenthesesEnd() method |
michael@0 | 827 | * is passed the subpatternId of the last capturing subexpression nested within |
michael@0 | 828 | * these paretheses. In the case of a capturing subpattern with no nested |
michael@0 | 829 | * capturing subpatterns, the same subpatternId will be passed to the begin and |
michael@0 | 830 | * end functions. In the case of non-capturing subpatterns the subpatternId |
michael@0 | 831 | * passed to the begin method is also the first possible subpatternId that might |
michael@0 | 832 | * be nested within these paretheses. If a set of non-capturing parentheses does |
michael@0 | 833 | * not contain any capturing subpatterns, then the subpatternId passed to begin |
michael@0 | 834 | * will be greater than the subpatternId passed to end. |
michael@0 | 835 | */ |
michael@0 | 836 | |
michael@0 | 837 | template<class Delegate> |
michael@0 | 838 | ErrorCode parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite) |
michael@0 | 839 | { |
michael@0 | 840 | #ifdef YARR_8BIT_CHAR_SUPPORT |
michael@0 | 841 | if (pattern.is8Bit()) |
michael@0 | 842 | return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse(); |
michael@0 | 843 | #endif |
michael@0 | 844 | return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse(); |
michael@0 | 845 | } |
michael@0 | 846 | |
michael@0 | 847 | } } // namespace JSC::Yarr |
michael@0 | 848 | |
michael@0 | 849 | #endif /* yarr_YarrParser_h */ |