1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/parser/html/nsHtml5StreamParser.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,555 @@ 1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 +#ifndef nsHtml5StreamParser_h 1.10 +#define nsHtml5StreamParser_h 1.11 + 1.12 +#include "nsAutoPtr.h" 1.13 +#include "nsCOMPtr.h" 1.14 +#include "nsICharsetDetectionObserver.h" 1.15 +#include "nsHtml5MetaScanner.h" 1.16 +#include "nsIUnicodeDecoder.h" 1.17 +#include "nsHtml5TreeOpExecutor.h" 1.18 +#include "nsHtml5OwningUTF16Buffer.h" 1.19 +#include "nsIInputStream.h" 1.20 +#include "mozilla/Mutex.h" 1.21 +#include "nsHtml5AtomTable.h" 1.22 +#include "nsHtml5Speculation.h" 1.23 +#include "nsITimer.h" 1.24 +#include "nsICharsetDetector.h" 1.25 + 1.26 +class nsHtml5Parser; 1.27 + 1.28 +#define NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE 1024 1.29 +#define NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE 1024 1.30 + 1.31 +enum eParserMode { 1.32 + /** 1.33 + * Parse a document normally as HTML. 1.34 + */ 1.35 + NORMAL, 1.36 + 1.37 + /** 1.38 + * View document as HTML source. 1.39 + */ 1.40 + VIEW_SOURCE_HTML, 1.41 + 1.42 + /** 1.43 + * View document as XML source 1.44 + */ 1.45 + VIEW_SOURCE_XML, 1.46 + 1.47 + /** 1.48 + * View document as plain text source 1.49 + */ 1.50 + VIEW_SOURCE_PLAIN, 1.51 + 1.52 + /** 1.53 + * View document as plain text 1.54 + */ 1.55 + PLAIN_TEXT, 1.56 + 1.57 + /** 1.58 + * Load as data (XHR) 1.59 + */ 1.60 + LOAD_AS_DATA 1.61 +}; 1.62 + 1.63 +enum eBomState { 1.64 + /** 1.65 + * BOM sniffing hasn't started. 1.66 + */ 1.67 + BOM_SNIFFING_NOT_STARTED = 0, 1.68 + 1.69 + /** 1.70 + * BOM sniffing is ongoing, and the first byte of a UTF-16LE BOM has been 1.71 + * seen.
1.72 + */ 1.73 + SEEN_UTF_16_LE_FIRST_BYTE = 1, 1.74 + 1.75 + /** 1.76 + * BOM sniffing is ongoing, and the first byte of a UTF-16BE BOM has been 1.77 + * seen. 1.78 + */ 1.79 + SEEN_UTF_16_BE_FIRST_BYTE = 2, 1.80 + 1.81 + /** 1.82 + * BOM sniffing is ongoing, and the first byte of a UTF-8 BOM has been 1.83 + * seen. 1.84 + */ 1.85 + SEEN_UTF_8_FIRST_BYTE = 3, 1.86 + 1.87 + /** 1.88 + * BOM sniffing is ongoing, and the first and second bytes of a UTF-8 BOM 1.89 + * have been seen. 1.90 + */ 1.91 + SEEN_UTF_8_SECOND_BYTE = 4, 1.92 + 1.93 + /** 1.94 + * BOM sniffing was started but is now over for whatever reason. 1.95 + */ 1.96 + BOM_SNIFFING_OVER = 5 1.97 +}; 1.98 + 1.99 +/** Stream life cycle states; the current one is kept in mStreamState. */ enum eHtml5StreamState { 1.100 + STREAM_NOT_STARTED = 0, 1.101 + STREAM_BEING_READ = 1, 1.102 + STREAM_ENDED = 2 1.103 +}; 1.104 + 1.105 +class nsHtml5StreamParser : public nsICharsetDetectionObserver { 1.106 + 1.107 + friend class nsHtml5RequestStopper; 1.108 + friend class nsHtml5DataAvailable; 1.109 + friend class nsHtml5StreamParserContinuation; 1.110 + friend class nsHtml5TimerKungFu; 1.111 + 1.112 + public: 1.113 + NS_DECL_AND_IMPL_ZEROING_OPERATOR_NEW 1.114 + NS_DECL_CYCLE_COLLECTING_ISUPPORTS 1.115 + NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsHtml5StreamParser, 1.116 + nsICharsetDetectionObserver) 1.117 + 1.118 + static void InitializeStatics(); 1.119 + 1.120 + nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor, 1.121 + nsHtml5Parser* aOwner, 1.122 + eParserMode aMode); 1.123 + 1.124 + virtual ~nsHtml5StreamParser(); 1.125 + 1.126 + // Methods that nsHtml5StreamListener calls 1.127 + nsresult CheckListenerChain(); 1.128 + 1.129 + nsresult OnStartRequest(nsIRequest* aRequest, nsISupports* aContext); 1.130 + 1.131 + nsresult OnDataAvailable(nsIRequest* aRequest, 1.132 + nsISupports* aContext, 1.133 + nsIInputStream* aInStream, 1.134 + uint64_t aSourceOffset, 1.135 + uint32_t aLength); 1.136 + 1.137 + nsresult OnStopRequest(nsIRequest* aRequest, 1.138 + nsISupports* aContext, 1.139 +
nsresult status); 1.140 + 1.141 + // nsICharsetDetectionObserver 1.142 + /** 1.143 + * Chardet calls this to report the detection result 1.144 + */ 1.145 + NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf); 1.146 + 1.147 + // EncodingDeclarationHandler 1.148 + // http://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java 1.149 + /** 1.150 + * Tree builder uses this to report a late <meta charset> 1.151 + */ 1.152 + bool internalEncodingDeclaration(nsString* aEncoding); 1.153 + 1.154 + // Not from an external interface 1.155 + 1.156 + /** 1.157 + * Call this method once you've created a parser, and want to instruct it 1.158 + * about what charset to load 1.159 + * 1.160 + * @param aCharset the charset of a document 1.161 + * @param aSource the source of the charset 1.162 + */ 1.163 + inline void SetDocumentCharset(const nsACString& aCharset, int32_t aSource) { 1.164 + NS_PRECONDITION(mStreamState == STREAM_NOT_STARTED, 1.165 + "SetDocumentCharset called too late."); 1.166 + NS_ASSERTION(NS_IsMainThread(), "Wrong thread!"); 1.167 + mCharset = aCharset; 1.168 + mCharsetSource = aSource; 1.169 + } 1.170 + 1.171 + inline void SetObserver(nsIRequestObserver* aObserver) { 1.172 + NS_ASSERTION(NS_IsMainThread(), "Wrong thread!"); 1.173 + mObserver = aObserver; 1.174 + } 1.175 + 1.176 + nsresult GetChannel(nsIChannel** aChannel); 1.177 + 1.178 + /** 1.179 + * The owner parser must call this after script execution 1.180 + * when no scripts are executing and the document.written 1.181 + * buffer has been exhausted. 1.182 + */ 1.183 + void ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer, 1.184 + nsHtml5TreeBuilder* aTreeBuilder, 1.185 + bool aLastWasCR); 1.186 + 1.187 + /** 1.188 + * Continues the stream parser if the charset switch failed.
1.189 + */ 1.190 + void ContinueAfterFailedCharsetSwitch(); 1.191 + 1.192 + /** Sets mTerminated to true while holding mTerminatedMutex. */ void Terminate() 1.193 + { 1.194 + mozilla::MutexAutoLock autoLock(mTerminatedMutex); 1.195 + mTerminated = true; 1.196 + } 1.197 + 1.198 + void DropTimer(); 1.199 + 1.200 + /** 1.201 + * Sets mCharset and mCharsetSource appropriately for the XML View Source 1.202 + * case if aEncoding names a supported rough ASCII superset and sets 1.203 + * the mCharset and mCharsetSource to the UTF-8 default otherwise. 1.204 + */ 1.205 + void SetEncodingFromExpat(const char16_t* aEncoding); 1.206 + 1.207 + /** 1.208 + * Sets the URL for View Source title in case this parser ends up being 1.209 + * used for View Source. If aURL is a view-source: URL, takes the inner 1.210 + * URL. data: URLs are shown with an ellipsis instead of the actual data. 1.211 + */ 1.212 + void SetViewSourceTitle(nsIURI* aURL); 1.213 + 1.214 + private: 1.215 + 1.216 +#ifdef DEBUG 1.217 + bool IsParserThread() { 1.218 + bool ret; 1.219 + mThread->IsOnCurrentThread(&ret); 1.220 + return ret; 1.221 + } 1.222 +#endif 1.223 + 1.224 + void MarkAsBroken(nsresult aRv); 1.225 + 1.226 + /** 1.227 + * Marks the stream parser as interrupted. If you ever add calls to this 1.228 + * method, be sure to review Uninterrupt usage very, very carefully to 1.229 + * avoid having a previous in-flight runnable cancel your Interrupt() 1.230 + * call on the other thread too soon. 1.231 + */ 1.232 + void Interrupt() 1.233 + { 1.234 + mozilla::MutexAutoLock autoLock(mTerminatedMutex); 1.235 + mInterrupted = true; 1.236 + } 1.237 + 1.238 + void Uninterrupt() 1.239 + { 1.240 + NS_ASSERTION(IsParserThread(), "Wrong thread!"); 1.241 + mTokenizerMutex.AssertCurrentThreadOwns(); 1.242 + // Not acquiring mTerminatedMutex because mTokenizerMutex is already 1.243 + // held at this point and is already stronger. 1.244 + mInterrupted = false; 1.245 + } 1.246 + 1.247 + /** 1.248 + * Flushes the tree ops from the tree builder and disarms the flush 1.249 + * timer.
1.250 + */ 1.251 + void FlushTreeOpsAndDisarmTimer(); 1.252 + 1.253 + void ParseAvailableData(); 1.254 + 1.255 + void DoStopRequest(); 1.256 + 1.257 + void DoDataAvailable(const uint8_t* aBuffer, uint32_t aLength); 1.258 + 1.259 + static NS_METHOD CopySegmentsToParser(nsIInputStream *aInStream, 1.260 + void *aClosure, 1.261 + const char *aFromSegment, 1.262 + uint32_t aToOffset, 1.263 + uint32_t aCount, 1.264 + uint32_t *aWriteCount); 1.265 + 1.266 + bool IsTerminatedOrInterrupted() 1.267 + { 1.268 + mozilla::MutexAutoLock autoLock(mTerminatedMutex); 1.269 + return mTerminated || mInterrupted; 1.270 + } 1.271 + 1.272 + bool IsTerminated() 1.273 + { 1.274 + mozilla::MutexAutoLock autoLock(mTerminatedMutex); 1.275 + return mTerminated; 1.276 + } 1.277 + 1.278 + /** 1.279 + * True when there is a Unicode decoder already 1.280 + */ 1.281 + inline bool HasDecoder() 1.282 + { 1.283 + return !!mUnicodeDecoder; 1.284 + } 1.285 + 1.286 + /** 1.287 + * Push bytes from network when there is no Unicode decoder yet 1.288 + */ 1.289 + nsresult SniffStreamBytes(const uint8_t* aFromSegment, 1.290 + uint32_t aCount, 1.291 + uint32_t* aWriteCount); 1.292 + 1.293 + /** 1.294 + * Push bytes from network when there is a Unicode decoder already 1.295 + */ 1.296 + nsresult WriteStreamBytes(const uint8_t* aFromSegment, 1.297 + uint32_t aCount, 1.298 + uint32_t* aWriteCount); 1.299 + 1.300 + /** 1.301 + * Check whether every other byte in the sniffing buffer is zero. 1.302 + */ 1.303 + void SniffBOMlessUTF16BasicLatin(const uint8_t* aFromSegment, 1.304 + uint32_t aCountToSniffingLimit); 1.305 + 1.306 + /** 1.307 + * <meta charset> scan failed. Try chardet if applicable. After this, the 1.308 + * parser will have some encoding, even if only a last-resort fallback. 1.309 + * 1.310 + * @param aFromSegment The current network buffer or null if the sniffing 1.311 + * buffer is being flushed due to network stream ending.
1.312 + * @param aCount The number of bytes in aFromSegment (ignored if 1.313 + * aFromSegment is null) 1.314 + * @param aWriteCount Return value for how many bytes got read from the 1.315 + * buffer. 1.316 + * @param aCountToSniffingLimit The number of unfilled slots in 1.317 + * mSniffingBuffer 1.318 + */ 1.319 + nsresult FinalizeSniffing(const uint8_t* aFromSegment, 1.320 + uint32_t aCount, 1.321 + uint32_t* aWriteCount, 1.322 + uint32_t aCountToSniffingLimit); 1.323 + 1.324 + /** 1.325 + * Set up the Unicode decoder and write the sniffing buffer into it 1.326 + * followed by the current network buffer. 1.327 + * 1.328 + * @param aFromSegment The current network buffer or null if the sniffing 1.329 + * buffer is being flushed due to network stream ending. 1.330 + * @param aCount The number of bytes in aFromSegment (ignored if 1.331 + * aFromSegment is null) 1.332 + * @param aWriteCount Return value for how many bytes got read from the 1.333 + * buffer. 1.334 + */ 1.335 + nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const uint8_t* aFromSegment, 1.336 + uint32_t aCount, 1.337 + uint32_t* aWriteCount); 1.338 + 1.339 + /** 1.340 + * Initialize the Unicode decoder, mark the BOM as the source and 1.341 + * drop the sniffer. 1.342 + * 1.343 + * @param aDecoderCharsetName The name for the decoder's charset 1.344 + * (UTF-16BE, UTF-16LE or UTF-8; the BOM has 1.345 + * been swallowed) 1.346 + */ 1.347 + nsresult SetupDecodingFromBom(const char* aDecoderCharsetName); 1.348 + 1.349 + /** 1.350 + * Become confident or resolve an encoding name to its preferred form. 1.351 + * @param aEncoding the value of an internal encoding decl. Acts as an 1.352 + * out param, too, when the method returns true.
1.353 + * @return true if the parser needs to start using the new value of 1.354 + * aEncoding and false if the parser became confident or if 1.355 + * the encoding name did not specify a usable encoding 1.356 + */ 1.357 + bool PreferredForInternalEncodingDecl(nsACString& aEncoding); 1.358 + 1.359 + /** 1.360 + * Callback for mFlushTimer. 1.361 + */ 1.362 + static void TimerCallback(nsITimer* aTimer, void* aClosure); 1.363 + 1.364 + /** 1.365 + * Parser thread entry point for (maybe) flushing the ops and posting 1.366 + * a flush runnable back on the main thread. 1.367 + */ 1.368 + void TimerFlush(); 1.369 + 1.370 + nsCOMPtr<nsIRequest> mRequest; 1.371 + nsCOMPtr<nsIRequestObserver> mObserver; 1.372 + 1.373 + /** 1.374 + * The document title to use if this turns out to be a View Source parser. 1.375 + */ 1.376 + nsCString mViewSourceTitle; 1.377 + 1.378 + /** 1.379 + * The Unicode decoder 1.380 + */ 1.381 + nsCOMPtr<nsIUnicodeDecoder> mUnicodeDecoder; 1.382 + 1.383 + /** 1.384 + * The buffer for sniffing the character encoding 1.385 + */ 1.386 + nsAutoArrayPtr<uint8_t> mSniffingBuffer; 1.387 + 1.388 + /** 1.389 + * The number of meaningful bytes in mSniffingBuffer 1.390 + */ 1.391 + uint32_t mSniffingLength; 1.392 + 1.393 + /** 1.394 + * BOM sniffing state 1.395 + */ 1.396 + eBomState mBomState; 1.397 + 1.398 + /** 1.399 + * <meta> prescan implementation 1.400 + */ 1.401 + nsAutoPtr<nsHtml5MetaScanner> mMetaScanner; 1.402 + 1.403 + // encoding-related stuff 1.404 + /** 1.405 + * The source (confidence) of the character encoding in use 1.406 + */ 1.407 + int32_t mCharsetSource; 1.408 + 1.409 + /** 1.410 + * The character encoding in use 1.411 + */ 1.412 + nsCString mCharset; 1.413 + 1.414 + /** 1.415 + * Whether reparse is forbidden 1.416 + */ 1.417 + bool mReparseForbidden; 1.418 + 1.419 + // Portable parser objects 1.420 + /** 1.421 + * The first buffer in the pending UTF-16 buffer queue 1.422 + */ 1.423 + nsRefPtr<nsHtml5OwningUTF16Buffer> mFirstBuffer; 1.424 +
1.425 + /** 1.426 + * The last buffer in the pending UTF-16 buffer queue 1.427 + */ 1.428 + nsHtml5OwningUTF16Buffer* mLastBuffer; // weak ref; always points to 1.429 + // a buffer of the size NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE 1.430 + 1.431 + /** 1.432 + * The tree operation executor 1.433 + */ 1.434 + nsHtml5TreeOpExecutor* mExecutor; 1.435 + 1.436 + /** 1.437 + * The HTML5 tree builder 1.438 + */ 1.439 + nsAutoPtr<nsHtml5TreeBuilder> mTreeBuilder; 1.440 + 1.441 + /** 1.442 + * The HTML5 tokenizer 1.443 + */ 1.444 + nsAutoPtr<nsHtml5Tokenizer> mTokenizer; 1.445 + 1.446 + /** 1.447 + * Makes sure the main thread can't mess the tokenizer state while it's 1.448 + * tokenizing. This mutex also protects the current speculation. 1.449 + */ 1.450 + mozilla::Mutex mTokenizerMutex; 1.451 + 1.452 + /** 1.453 + * The scoped atom table 1.454 + */ 1.455 + nsHtml5AtomTable mAtomTable; 1.456 + 1.457 + /** 1.458 + * The owner parser. 1.459 + */ 1.460 + nsRefPtr<nsHtml5Parser> mOwner; 1.461 + 1.462 + /** 1.463 + * Whether the last character tokenized was a carriage return (for CRLF) 1.464 + */ 1.465 + bool mLastWasCR; 1.466 + 1.467 + /** 1.468 + * For tracking stream life cycle 1.469 + */ 1.470 + eHtml5StreamState mStreamState; 1.471 + 1.472 + /** 1.473 + * Whether we are speculating. 1.474 + */ 1.475 + bool mSpeculating; 1.476 + 1.477 + /** 1.478 + * Whether the tokenizer has reached EOF. (Reset when stream rewound.) 1.479 + */ 1.480 + bool mAtEOF; 1.481 + 1.482 + /** 1.483 + * The speculations. The mutex protects the nsTArray itself. 1.484 + * To access the queue of current speculation, mTokenizerMutex must be 1.485 + * obtained.
1.486 + * The current speculation is the last element 1.487 + */ 1.488 + nsTArray<nsAutoPtr<nsHtml5Speculation> > mSpeculations; 1.489 + mozilla::Mutex mSpeculationMutex; 1.490 + 1.491 + /** 1.492 + * True to terminate early; protected by mTerminatedMutex 1.493 + */ 1.494 + bool mTerminated; 1.495 + /** Set by Interrupt() under mTerminatedMutex; cleared by Uninterrupt() while mTokenizerMutex is held (without taking mTerminatedMutex). Read together with mTerminated in IsTerminatedOrInterrupted(). */ bool mInterrupted; 1.496 + mozilla::Mutex mTerminatedMutex; 1.497 + 1.498 + /** 1.499 + * The thread this stream parser runs on. 1.500 + */ 1.501 + nsCOMPtr<nsIThread> mThread; 1.502 + 1.503 + nsCOMPtr<nsIRunnable> mExecutorFlusher; 1.504 + 1.505 + nsCOMPtr<nsIRunnable> mLoadFlusher; 1.506 + 1.507 + /** 1.508 + * The chardet instance if chardet is enabled. 1.509 + */ 1.510 + nsCOMPtr<nsICharsetDetector> mChardet; 1.511 + 1.512 + /** 1.513 + * If false, don't push data to chardet. 1.514 + */ 1.515 + bool mFeedChardet; 1.516 + 1.517 + /** 1.518 + * Whether the initial charset source was kCharsetFromParentFrame 1.519 + */ 1.520 + bool mInitialEncodingWasFromParentFrame; 1.521 + 1.522 + /** 1.523 + * Timer for flushing tree ops once in a while when not speculating. 1.524 + */ 1.525 + nsCOMPtr<nsITimer> mFlushTimer; 1.526 + 1.527 + /** 1.528 + * Keeps track whether mFlushTimer has been armed. Unfortunately, 1.529 + * nsITimer doesn't enable querying this from the timer itself. 1.530 + */ 1.531 + bool mFlushTimerArmed; 1.532 + 1.533 + /** 1.534 + * False initially and true after the timer has fired at least once. 1.535 + */ 1.536 + bool mFlushTimerEverFired; 1.537 + 1.538 + /** 1.539 + * Whether the parser is doing a normal parse, view source or plain text. 1.540 + */ 1.541 + eParserMode mMode; 1.542 + 1.543 + /** 1.544 + * The pref html5.flushtimer.initialdelay: Time in milliseconds between 1.545 + * the time a network buffer is seen and the timer firing when the 1.546 + * timer hasn't fired previously in this parse.
1.547 + */ 1.548 + static int32_t sTimerInitialDelay; 1.549 + 1.550 + /** 1.551 + * The pref html5.flushtimer.subsequentdelay: Time in milliseconds between 1.552 + * the time a network buffer is seen and the timer firing when the 1.553 + * timer has already fired previously in this parse. 1.554 + */ 1.555 + static int32_t sTimerSubsequentDelay; 1.556 +}; 1.557 + 1.558 +#endif // nsHtml5StreamParser_h