parser/html/nsHtml5StreamParser.h

Wed, 31 Dec 2014 13:27:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 13:27:57 +0100
branch
TOR_BUG_3246
changeset 6
8bccb770b82d
permissions
-rw-r--r--

Ignore runtime configuration files generated during quality assurance.

michael@0 1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
michael@0 2 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 3 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 5
michael@0 6 #ifndef nsHtml5StreamParser_h
michael@0 7 #define nsHtml5StreamParser_h
michael@0 8
michael@0 9 #include "nsAutoPtr.h"
michael@0 10 #include "nsCOMPtr.h"
michael@0 11 #include "nsICharsetDetectionObserver.h"
michael@0 12 #include "nsHtml5MetaScanner.h"
michael@0 13 #include "nsIUnicodeDecoder.h"
michael@0 14 #include "nsHtml5TreeOpExecutor.h"
michael@0 15 #include "nsHtml5OwningUTF16Buffer.h"
michael@0 16 #include "nsIInputStream.h"
michael@0 17 #include "mozilla/Mutex.h"
michael@0 18 #include "nsHtml5AtomTable.h"
michael@0 19 #include "nsHtml5Speculation.h"
michael@0 20 #include "nsITimer.h"
michael@0 21 #include "nsICharsetDetector.h"
michael@0 22
michael@0 23 class nsHtml5Parser;
michael@0 24
michael@0 25 #define NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE 1024 /* size of the buffers in the pending UTF-16 buffer queue (see mLastBuffer) */
michael@0 26 #define NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE 1024 /* presumably the capacity of mSniffingBuffer -- TODO confirm in the .cpp */
michael@0 27
michael@0 28 enum eParserMode { // How the stream parser treats the document; stored in nsHtml5StreamParser::mMode.
michael@0 29 /**
michael@0 30 * Parse a document normally as HTML.
michael@0 31 */
michael@0 32 NORMAL,
michael@0 33
michael@0 34 /**
michael@0 35 * View document as HTML source.
michael@0 36 */
michael@0 37 VIEW_SOURCE_HTML,
michael@0 38
michael@0 39 /**
michael@0 40 * View document as XML source.
michael@0 41 */
michael@0 42 VIEW_SOURCE_XML,
michael@0 43
michael@0 44 /**
michael@0 45 * View document as plain text source.
michael@0 46 */
michael@0 47 VIEW_SOURCE_PLAIN,
michael@0 48
michael@0 49 /**
michael@0 50 * View document as plain text.
michael@0 51 */
michael@0 52 PLAIN_TEXT,
michael@0 53
michael@0 54 /**
michael@0 55 * Load as data (XHR).
michael@0 56 */
michael@0 57 LOAD_AS_DATA
michael@0 58 };
michael@0 59
michael@0 60 enum eBomState { // Progress of byte order mark (BOM) sniffing; stored in nsHtml5StreamParser::mBomState.
michael@0 61 /**
michael@0 62 * BOM sniffing hasn't started.
michael@0 63 */
michael@0 64 BOM_SNIFFING_NOT_STARTED = 0,
michael@0 65
michael@0 66 /**
michael@0 67 * BOM sniffing is ongoing, and the first byte of a UTF-16LE BOM has been
michael@0 68 * seen.
michael@0 69 */
michael@0 70 SEEN_UTF_16_LE_FIRST_BYTE = 1,
michael@0 71
michael@0 72 /**
michael@0 73 * BOM sniffing is ongoing, and the first byte of a UTF-16BE BOM has been
michael@0 74 * seen.
michael@0 75 */
michael@0 76 SEEN_UTF_16_BE_FIRST_BYTE = 2,
michael@0 77
michael@0 78 /**
michael@0 79 * BOM sniffing is ongoing, and the first byte of a UTF-8 BOM has been
michael@0 80 * seen.
michael@0 81 */
michael@0 82 SEEN_UTF_8_FIRST_BYTE = 3,
michael@0 83
michael@0 84 /**
michael@0 85 * BOM sniffing is ongoing, and the first and second bytes of a UTF-8 BOM
michael@0 86 * have been seen.
michael@0 87 */
michael@0 88 SEEN_UTF_8_SECOND_BYTE = 4,
michael@0 89
michael@0 90 /**
michael@0 91 * BOM sniffing was started but is now over for whatever reason.
michael@0 92 */
michael@0 93 BOM_SNIFFING_OVER = 5
michael@0 94 };
michael@0 95
michael@0 96 enum eHtml5StreamState { // Stream life cycle; stored in nsHtml5StreamParser::mStreamState.
michael@0 97 STREAM_NOT_STARTED = 0, // SetDocumentCharset() asserts that the parser is still in this state
michael@0 98 STREAM_BEING_READ = 1,
michael@0 99 STREAM_ENDED = 2
michael@0 100 };
michael@0 101
michael@0 102 class nsHtml5StreamParser : public nsICharsetDetectionObserver {
michael@0 103 // Takes bytes from the network stream, determines their character encoding (BOM, <meta> prescan, chardet) and decodes them for the HTML5 tokenizer.
michael@0 104 friend class nsHtml5RequestStopper;
michael@0 105 friend class nsHtml5DataAvailable;
michael@0 106 friend class nsHtml5StreamParserContinuation;
michael@0 107 friend class nsHtml5TimerKungFu;
michael@0 108
michael@0 109 public:
michael@0 110 NS_DECL_AND_IMPL_ZEROING_OPERATOR_NEW
michael@0 111 NS_DECL_CYCLE_COLLECTING_ISUPPORTS
michael@0 112 NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsHtml5StreamParser,
michael@0 113 nsICharsetDetectionObserver)
michael@0 114
michael@0 115 static void InitializeStatics();
michael@0 116
michael@0 117 nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
michael@0 118 nsHtml5Parser* aOwner,
michael@0 119 eParserMode aMode);
michael@0 120
michael@0 121 virtual ~nsHtml5StreamParser();
michael@0 122
michael@0 123 // Methods that nsHtml5StreamListener calls
michael@0 124 nsresult CheckListenerChain();
michael@0 125
michael@0 126 nsresult OnStartRequest(nsIRequest* aRequest, nsISupports* aContext);
michael@0 127
michael@0 128 nsresult OnDataAvailable(nsIRequest* aRequest,
michael@0 129 nsISupports* aContext,
michael@0 130 nsIInputStream* aInStream,
michael@0 131 uint64_t aSourceOffset,
michael@0 132 uint32_t aLength);
michael@0 133
michael@0 134 nsresult OnStopRequest(nsIRequest* aRequest,
michael@0 135 nsISupports* aContext,
michael@0 136 nsresult status);
michael@0 137
michael@0 138 // nsICharsetDetectionObserver
michael@0 139 /**
michael@0 140 * Chardet calls this to report the detection result
michael@0 141 */
michael@0 142 NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf);
michael@0 143
michael@0 144 // EncodingDeclarationHandler
michael@0 145 // http://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java
michael@0 146 /**
michael@0 147 * Tree builder uses this to report a late <meta charset>
michael@0 148 */
michael@0 149 bool internalEncodingDeclaration(nsString* aEncoding);
michael@0 150
michael@0 151 // Not from an external interface
michael@0 152
michael@0 153 /**
michael@0 154 * Call this method once you've created a parser, and want to instruct it
michael@0 155 * about what charset to load
michael@0 156 * Must be called on the main thread before the stream has started (asserted).
michael@0 157 * @param aCharset the charset of a document
michael@0 158 * @param aSource the source of the charset
michael@0 159 */
michael@0 160 inline void SetDocumentCharset(const nsACString& aCharset, int32_t aSource) {
michael@0 161 NS_PRECONDITION(mStreamState == STREAM_NOT_STARTED,
michael@0 162 "SetDocumentCharset called too late.");
michael@0 163 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
michael@0 164 mCharset = aCharset;
michael@0 165 mCharsetSource = aSource;
michael@0 166 }
michael@0 167
michael@0 168 inline void SetObserver(nsIRequestObserver* aObserver) { // Main thread only (asserted); stores the observer in mObserver.
michael@0 169 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
michael@0 170 mObserver = aObserver;
michael@0 171 }
michael@0 172
michael@0 173 nsresult GetChannel(nsIChannel** aChannel);
michael@0 174
michael@0 175 /**
michael@0 176 * The owner parser must call this after script execution
michael@0 177 * when no scripts are executing and the document.written
michael@0 178 * buffer has been exhausted.
michael@0 179 */
michael@0 180 void ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer,
michael@0 181 nsHtml5TreeBuilder* aTreeBuilder,
michael@0 182 bool aLastWasCR);
michael@0 183
michael@0 184 /**
michael@0 185 * Continues the stream parser if the charset switch failed.
michael@0 186 */
michael@0 187 void ContinueAfterFailedCharsetSwitch();
michael@0 188
michael@0 189 void Terminate() // Sets mTerminated while holding mTerminatedMutex.
michael@0 190 {
michael@0 191 mozilla::MutexAutoLock autoLock(mTerminatedMutex);
michael@0 192 mTerminated = true;
michael@0 193 }
michael@0 194
michael@0 195 void DropTimer();
michael@0 196
michael@0 197 /**
michael@0 198 * Sets mCharset and mCharsetSource appropriately for the XML View Source
michael@0 199 * case if aEncoding names a supported rough ASCII superset and sets
michael@0 200 * the mCharset and mCharsetSource to the UTF-8 default otherwise.
michael@0 201 */
michael@0 202 void SetEncodingFromExpat(const char16_t* aEncoding);
michael@0 203
michael@0 204 /**
michael@0 205 * Sets the URL for View Source title in case this parser ends up being
michael@0 206 * used for View Source. If aURL is a view-source: URL, takes the inner
michael@0 207 * URL. data: URLs are shown with an ellipsis instead of the actual data.
michael@0 208 */
michael@0 209 void SetViewSourceTitle(nsIURI* aURL);
michael@0 210
michael@0 211 private:
michael@0 212
michael@0 213 #ifdef DEBUG
michael@0 214 bool IsParserThread() { // Debug-only helper: asks mThread whether the caller is running on it.
michael@0 215 bool ret; // NOTE(review): left uninitialized if IsOnCurrentThread() fails; its nsresult is ignored below.
michael@0 216 mThread->IsOnCurrentThread(&ret);
michael@0 217 return ret;
michael@0 218 }
michael@0 219 #endif
michael@0 220
michael@0 221 void MarkAsBroken(nsresult aRv);
michael@0 222
michael@0 223 /**
michael@0 224 * Marks the stream parser as interrupted. If you ever add calls to this
michael@0 225 * method, be sure to review Uninterrupt usage very, very carefully to
michael@0 226 * avoid having a previous in-flight runnable cancel your Interrupt()
michael@0 227 * call on the other thread too soon.
michael@0 228 */
michael@0 229 void Interrupt()
michael@0 230 {
michael@0 231 mozilla::MutexAutoLock autoLock(mTerminatedMutex);
michael@0 232 mInterrupted = true;
michael@0 233 }
michael@0 234
michael@0 235 void Uninterrupt() // Clears mInterrupted; parser thread only, with mTokenizerMutex held (both asserted).
michael@0 236 {
michael@0 237 NS_ASSERTION(IsParserThread(), "Wrong thread!");
michael@0 238 mTokenizerMutex.AssertCurrentThreadOwns();
michael@0 239 // Not acquiring mTerminatedMutex because mTokenizerMutex is already
michael@0 240 // held at this point and is already stronger.
michael@0 241 mInterrupted = false;
michael@0 242 }
michael@0 243
michael@0 244 /**
michael@0 245 * Flushes the tree ops from the tree builder and disarms the flush
michael@0 246 * timer.
michael@0 247 */
michael@0 248 void FlushTreeOpsAndDisarmTimer();
michael@0 249
michael@0 250 void ParseAvailableData();
michael@0 251
michael@0 252 void DoStopRequest();
michael@0 253
michael@0 254 void DoDataAvailable(const uint8_t* aBuffer, uint32_t aLength);
michael@0 255
michael@0 256 static NS_METHOD CopySegmentsToParser(nsIInputStream *aInStream,
michael@0 257 void *aClosure,
michael@0 258 const char *aFromSegment,
michael@0 259 uint32_t aToOffset,
michael@0 260 uint32_t aCount,
michael@0 261 uint32_t *aWriteCount);
michael@0 262
michael@0 263 bool IsTerminatedOrInterrupted() // Reads both flags while holding mTerminatedMutex.
michael@0 264 {
michael@0 265 mozilla::MutexAutoLock autoLock(mTerminatedMutex);
michael@0 266 return mTerminated || mInterrupted;
michael@0 267 }
michael@0 268
michael@0 269 bool IsTerminated() // Reads mTerminated while holding mTerminatedMutex.
michael@0 270 {
michael@0 271 mozilla::MutexAutoLock autoLock(mTerminatedMutex);
michael@0 272 return mTerminated;
michael@0 273 }
michael@0 274
michael@0 275 /**
michael@0 276 * True when there is a Unicode decoder already
michael@0 277 */
michael@0 278 inline bool HasDecoder()
michael@0 279 {
michael@0 280 return !!mUnicodeDecoder;
michael@0 281 }
michael@0 282
michael@0 283 /**
michael@0 284 * Push bytes from network when there is no Unicode decoder yet
michael@0 285 */
michael@0 286 nsresult SniffStreamBytes(const uint8_t* aFromSegment,
michael@0 287 uint32_t aCount,
michael@0 288 uint32_t* aWriteCount);
michael@0 289
michael@0 290 /**
michael@0 291 * Push bytes from network when there is a Unicode decoder already
michael@0 292 */
michael@0 293 nsresult WriteStreamBytes(const uint8_t* aFromSegment,
michael@0 294 uint32_t aCount,
michael@0 295 uint32_t* aWriteCount);
michael@0 296
michael@0 297 /**
michael@0 298 * Check whether every other byte in the sniffing buffer is zero.
michael@0 299 */
michael@0 300 void SniffBOMlessUTF16BasicLatin(const uint8_t* aFromSegment,
michael@0 301 uint32_t aCountToSniffingLimit);
michael@0 302
michael@0 303 /**
michael@0 304 * <meta charset> scan failed. Try chardet if applicable. After this, the
michael@0 305 * parser will have some encoding even if a last resort fallback.
michael@0 306 *
michael@0 307 * @param aFromSegment The current network buffer or null if the sniffing
michael@0 308 * buffer is being flushed due to network stream ending.
michael@0 309 * @param aCount The number of bytes in aFromSegment (ignored if
michael@0 310 * aFromSegment is null)
michael@0 311 * @param aWriteCount Return value for how many bytes got read from the
michael@0 312 * buffer.
michael@0 313 * @param aCountToSniffingLimit The number of unfilled slots in
michael@0 314 * mSniffingBuffer
michael@0 315 */
michael@0 316 nsresult FinalizeSniffing(const uint8_t* aFromSegment,
michael@0 317 uint32_t aCount,
michael@0 318 uint32_t* aWriteCount,
michael@0 319 uint32_t aCountToSniffingLimit);
michael@0 320
michael@0 321 /**
michael@0 322 * Set up the Unicode decoder and write the sniffing buffer into it
michael@0 323 * followed by the current network buffer.
michael@0 324 *
michael@0 325 * @param aFromSegment The current network buffer or null if the sniffing
michael@0 326 * buffer is being flushed due to network stream ending.
michael@0 327 * @param aCount The number of bytes in aFromSegment (ignored if
michael@0 328 * aFromSegment is null)
michael@0 329 * @param aWriteCount Return value for how many bytes got read from the
michael@0 330 * buffer.
michael@0 331 */
michael@0 332 nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const uint8_t* aFromSegment,
michael@0 333 uint32_t aCount,
michael@0 334 uint32_t* aWriteCount);
michael@0 335
michael@0 336 /**
michael@0 337 * Initialize the Unicode decoder, mark the BOM as the source and
michael@0 338 * drop the sniffer.
michael@0 339 *
michael@0 340 * @param aDecoderCharsetName The name for the decoder's charset
michael@0 341 * (UTF-16BE, UTF-16LE or UTF-8; the BOM has
michael@0 342 * been swallowed)
michael@0 343 */
michael@0 344 nsresult SetupDecodingFromBom(const char* aDecoderCharsetName);
michael@0 345
michael@0 346 /**
michael@0 347 * Become confident or resolve an encoding name to its preferred form.
michael@0 348 * @param aEncoding the value of an internal encoding decl. Acts as an
michael@0 349 * out param, too, when the method returns true.
michael@0 350 * @return true if the parser needs to start using the new value of
michael@0 351 * aEncoding and false if the parser became confident or if
michael@0 352 * the encoding name did not specify a usable encoding
michael@0 353 */
michael@0 354 bool PreferredForInternalEncodingDecl(nsACString& aEncoding);
michael@0 355
michael@0 356 /**
michael@0 357 * Callback for mFlushTimer.
michael@0 358 */
michael@0 359 static void TimerCallback(nsITimer* aTimer, void* aClosure);
michael@0 360
michael@0 361 /**
michael@0 362 * Parser thread entry point for (maybe) flushing the ops and posting
michael@0 363 * a flush runnable back on the main thread.
michael@0 364 */
michael@0 365 void TimerFlush();
michael@0 366
michael@0 367 nsCOMPtr<nsIRequest> mRequest;
michael@0 368 nsCOMPtr<nsIRequestObserver> mObserver; // assigned by SetObserver()
michael@0 369
michael@0 370 /**
michael@0 371 * The document title to use if this turns out to be a View Source parser.
michael@0 372 */
michael@0 373 nsCString mViewSourceTitle;
michael@0 374
michael@0 375 /**
michael@0 376 * The Unicode decoder
michael@0 377 */
michael@0 378 nsCOMPtr<nsIUnicodeDecoder> mUnicodeDecoder;
michael@0 379
michael@0 380 /**
michael@0 381 * The buffer for sniffing the character encoding
michael@0 382 */
michael@0 383 nsAutoArrayPtr<uint8_t> mSniffingBuffer;
michael@0 384
michael@0 385 /**
michael@0 386 * The number of meaningful bytes in mSniffingBuffer
michael@0 387 */
michael@0 388 uint32_t mSniffingLength;
michael@0 389
michael@0 390 /**
michael@0 391 * BOM sniffing state
michael@0 392 */
michael@0 393 eBomState mBomState;
michael@0 394
michael@0 395 /**
michael@0 396 * <meta> prescan implementation
michael@0 397 */
michael@0 398 nsAutoPtr<nsHtml5MetaScanner> mMetaScanner;
michael@0 399
michael@0 400 // encoding-related stuff
michael@0 401 /**
michael@0 402 * The source (confidence) of the character encoding in use
michael@0 403 */
michael@0 404 int32_t mCharsetSource;
michael@0 405
michael@0 406 /**
michael@0 407 * The character encoding in use
michael@0 408 */
michael@0 409 nsCString mCharset;
michael@0 410
michael@0 411 /**
michael@0 412 * Whether reparse is forbidden
michael@0 413 */
michael@0 414 bool mReparseForbidden;
michael@0 415
michael@0 416 // Portable parser objects
michael@0 417 /**
michael@0 418 * The first buffer in the pending UTF-16 buffer queue
michael@0 419 */
michael@0 420 nsRefPtr<nsHtml5OwningUTF16Buffer> mFirstBuffer;
michael@0 421
michael@0 422 /**
michael@0 423 * The last buffer in the pending UTF-16 buffer queue
michael@0 424 */
michael@0 425 nsHtml5OwningUTF16Buffer* mLastBuffer; // weak ref; always points to
michael@0 426 // a buffer of the size NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE
michael@0 427
michael@0 428 /**
michael@0 429 * The tree operation executor
michael@0 430 */
michael@0 431 nsHtml5TreeOpExecutor* mExecutor;
michael@0 432
michael@0 433 /**
michael@0 434 * The HTML5 tree builder
michael@0 435 */
michael@0 436 nsAutoPtr<nsHtml5TreeBuilder> mTreeBuilder;
michael@0 437
michael@0 438 /**
michael@0 439 * The HTML5 tokenizer
michael@0 440 */
michael@0 441 nsAutoPtr<nsHtml5Tokenizer> mTokenizer;
michael@0 442
michael@0 443 /**
michael@0 444 * Makes sure the main thread can't mess the tokenizer state while it's
michael@0 445 * tokenizing. This mutex also protects the current speculation.
michael@0 446 */
michael@0 447 mozilla::Mutex mTokenizerMutex;
michael@0 448
michael@0 449 /**
michael@0 450 * The scoped atom table
michael@0 451 */
michael@0 452 nsHtml5AtomTable mAtomTable;
michael@0 453
michael@0 454 /**
michael@0 455 * The owner parser.
michael@0 456 */
michael@0 457 nsRefPtr<nsHtml5Parser> mOwner;
michael@0 458
michael@0 459 /**
michael@0 460 * Whether the last character tokenized was a carriage return (for CRLF)
michael@0 461 */
michael@0 462 bool mLastWasCR;
michael@0 463
michael@0 464 /**
michael@0 465 * For tracking stream life cycle
michael@0 466 */
michael@0 467 eHtml5StreamState mStreamState;
michael@0 468
michael@0 469 /**
michael@0 470 * Whether we are speculating.
michael@0 471 */
michael@0 472 bool mSpeculating;
michael@0 473
michael@0 474 /**
michael@0 475 * Whether the tokenizer has reached EOF. (Reset when stream rewound.)
michael@0 476 */
michael@0 477 bool mAtEOF;
michael@0 478
michael@0 479 /**
michael@0 480 * The speculations. The mutex protects the nsTArray itself.
michael@0 481 * To access the queue of current speculation, mTokenizerMutex must be
michael@0 482 * obtained.
michael@0 483 * The current speculation is the last element
michael@0 484 */
michael@0 485 nsTArray<nsAutoPtr<nsHtml5Speculation> > mSpeculations;
michael@0 486 mozilla::Mutex mSpeculationMutex;
michael@0 487
michael@0 488 /**
michael@0 489 * True to terminate early; protected by mTerminatedMutex
michael@0 490 */
michael@0 491 bool mTerminated;
michael@0 492 bool mInterrupted; // set by Interrupt() under mTerminatedMutex; cleared by Uninterrupt() under mTokenizerMutex
michael@0 493 mozilla::Mutex mTerminatedMutex;
michael@0 494
michael@0 495 /**
michael@0 496 * The thread this stream parser runs on.
michael@0 497 */
michael@0 498 nsCOMPtr<nsIThread> mThread;
michael@0 499
michael@0 500 nsCOMPtr<nsIRunnable> mExecutorFlusher;
michael@0 501
michael@0 502 nsCOMPtr<nsIRunnable> mLoadFlusher;
michael@0 503
michael@0 504 /**
michael@0 505 * The chardet instance if chardet is enabled.
michael@0 506 */
michael@0 507 nsCOMPtr<nsICharsetDetector> mChardet;
michael@0 508
michael@0 509 /**
michael@0 510 * If false, don't push data to chardet.
michael@0 511 */
michael@0 512 bool mFeedChardet;
michael@0 513
michael@0 514 /**
michael@0 515 * Whether the initial charset source was kCharsetFromParentFrame
michael@0 516 */
michael@0 517 bool mInitialEncodingWasFromParentFrame;
michael@0 518
michael@0 519 /**
michael@0 520 * Timer for flushing tree ops once in a while when not speculating.
michael@0 521 */
michael@0 522 nsCOMPtr<nsITimer> mFlushTimer;
michael@0 523
michael@0 524 /**
michael@0 525 * Keeps track whether mFlushTimer has been armed. Unfortunately,
michael@0 526 * nsITimer doesn't enable querying this from the timer itself.
michael@0 527 */
michael@0 528 bool mFlushTimerArmed;
michael@0 529
michael@0 530 /**
michael@0 531 * False initially and true after the timer has fired at least once.
michael@0 532 */
michael@0 533 bool mFlushTimerEverFired;
michael@0 534
michael@0 535 /**
michael@0 536 * Whether the parser is doing a normal parse, view source or plain text.
michael@0 537 */
michael@0 538 eParserMode mMode;
michael@0 539
michael@0 540 /**
michael@0 541 * The pref html5.flushtimer.initialdelay: Time in milliseconds between
michael@0 542 * the time a network buffer is seen and the timer firing when the
michael@0 543 * timer hasn't fired previously in this parse.
michael@0 544 */
michael@0 545 static int32_t sTimerInitialDelay;
michael@0 546
michael@0 547 /**
michael@0 548 * The pref html5.flushtimer.subsequentdelay: Time in milliseconds between
michael@0 549 * the time a network buffer is seen and the timer firing when the
michael@0 550 * timer has already fired previously in this parse.
michael@0 551 */
michael@0 552 static int32_t sTimerSubsequentDelay;
michael@0 553 };
michael@0 554
michael@0 555 #endif // nsHtml5StreamParser_h

mercurial