michael@0: /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #ifndef nsHtml5StreamParser_h michael@0: #define nsHtml5StreamParser_h michael@0: michael@0: #include "nsAutoPtr.h" michael@0: #include "nsCOMPtr.h" michael@0: #include "nsICharsetDetectionObserver.h" michael@0: #include "nsHtml5MetaScanner.h" michael@0: #include "nsIUnicodeDecoder.h" michael@0: #include "nsHtml5TreeOpExecutor.h" michael@0: #include "nsHtml5OwningUTF16Buffer.h" michael@0: #include "nsIInputStream.h" michael@0: #include "mozilla/Mutex.h" michael@0: #include "nsHtml5AtomTable.h" michael@0: #include "nsHtml5Speculation.h" michael@0: #include "nsITimer.h" michael@0: #include "nsICharsetDetector.h" michael@0: michael@0: class nsHtml5Parser; michael@0: michael@0: #define NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE 1024 michael@0: #define NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE 1024 michael@0: michael@0: enum eParserMode { michael@0: /** michael@0: * Parse a document normally as HTML. michael@0: */ michael@0: NORMAL, michael@0: michael@0: /** michael@0: * View document as HTML source. michael@0: */ michael@0: VIEW_SOURCE_HTML, michael@0: michael@0: /** michael@0: * View document as XML source michael@0: */ michael@0: VIEW_SOURCE_XML, michael@0: michael@0: /** michael@0: * View document as plain text source michael@0: */ michael@0: VIEW_SOURCE_PLAIN, michael@0: michael@0: /** michael@0: * View document as plain text michael@0: */ michael@0: PLAIN_TEXT, michael@0: michael@0: /** michael@0: * Load as data (XHR) michael@0: */ michael@0: LOAD_AS_DATA michael@0: }; michael@0: michael@0: enum eBomState { michael@0: /** michael@0: * BOM sniffing hasn't started. michael@0: */ michael@0: BOM_SNIFFING_NOT_STARTED = 0, michael@0: michael@0: /** michael@0: * BOM sniffing is ongoing, and the first byte of an UTF-16LE BOM has been michael@0: * seen. michael@0: */ michael@0: SEEN_UTF_16_LE_FIRST_BYTE = 1, michael@0: michael@0: /** michael@0: * BOM sniffing is ongoing, and the first byte of an UTF-16BE BOM has been michael@0: * seen. michael@0: */ michael@0: SEEN_UTF_16_BE_FIRST_BYTE = 2, michael@0: michael@0: /** michael@0: * BOM sniffing is ongoing, and the first byte of an UTF-8 BOM has been michael@0: * seen. michael@0: */ michael@0: SEEN_UTF_8_FIRST_BYTE = 3, michael@0: michael@0: /** michael@0: * BOM sniffing is ongoing, and the first and second bytes of an UTF-8 BOM michael@0: * have been seen. michael@0: */ michael@0: SEEN_UTF_8_SECOND_BYTE = 4, michael@0: michael@0: /** michael@0: * BOM sniffing was started but is now over for whatever reason. michael@0: */ michael@0: BOM_SNIFFING_OVER = 5 michael@0: }; michael@0: michael@0: enum eHtml5StreamState { michael@0: STREAM_NOT_STARTED = 0, michael@0: STREAM_BEING_READ = 1, michael@0: STREAM_ENDED = 2 michael@0: }; michael@0: michael@0: class nsHtml5StreamParser : public nsICharsetDetectionObserver { michael@0: michael@0: friend class nsHtml5RequestStopper; michael@0: friend class nsHtml5DataAvailable; michael@0: friend class nsHtml5StreamParserContinuation; michael@0: friend class nsHtml5TimerKungFu; michael@0: michael@0: public: michael@0: NS_DECL_AND_IMPL_ZEROING_OPERATOR_NEW michael@0: NS_DECL_CYCLE_COLLECTING_ISUPPORTS michael@0: NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsHtml5StreamParser, michael@0: nsICharsetDetectionObserver) michael@0: michael@0: static void InitializeStatics(); michael@0: michael@0: nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor, michael@0: nsHtml5Parser* aOwner, michael@0: eParserMode aMode); michael@0: michael@0: virtual ~nsHtml5StreamParser(); michael@0: michael@0: // Methods that nsHtml5StreamListener calls michael@0: nsresult CheckListenerChain(); michael@0: michael@0: nsresult OnStartRequest(nsIRequest* aRequest, nsISupports* aContext); michael@0: michael@0: nsresult OnDataAvailable(nsIRequest* aRequest, michael@0: nsISupports* aContext, michael@0: nsIInputStream* aInStream, michael@0: uint64_t aSourceOffset, michael@0: uint32_t aLength); michael@0: michael@0: nsresult OnStopRequest(nsIRequest* aRequest, michael@0: nsISupports* aContext, michael@0: nsresult status); michael@0: michael@0: // nsICharsetDetectionObserver michael@0: /** michael@0: * Chardet calls this to report the detection result michael@0: */ michael@0: NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf); michael@0: michael@0: // EncodingDeclarationHandler michael@0: // http://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java michael@0: /** michael@0: * Tree builder uses this to report a late michael@0: */ michael@0: bool internalEncodingDeclaration(nsString* aEncoding); michael@0: michael@0: // Not from an external interface michael@0: michael@0: /** michael@0: * Call this method once you've created a parser, and want to instruct it michael@0: * about what charset to load michael@0: * michael@0: * @param aCharset the charset of a document michael@0: * @param aCharsetSource the source of the charset michael@0: */ michael@0: inline void SetDocumentCharset(const nsACString& aCharset, int32_t aSource) { michael@0: NS_PRECONDITION(mStreamState == STREAM_NOT_STARTED, michael@0: "SetDocumentCharset called too late."); michael@0: NS_ASSERTION(NS_IsMainThread(), "Wrong thread!"); michael@0: mCharset = aCharset; michael@0: mCharsetSource = aSource; michael@0: } michael@0: michael@0: inline void SetObserver(nsIRequestObserver* aObserver) { michael@0: NS_ASSERTION(NS_IsMainThread(), "Wrong thread!"); michael@0: mObserver = aObserver; michael@0: } michael@0: michael@0: nsresult GetChannel(nsIChannel** aChannel); michael@0: michael@0: /** michael@0: * The owner parser must call this after script execution michael@0: * when no scripts are executing and the document.written michael@0: * buffer has been exhausted. michael@0: */ michael@0: void ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer, michael@0: nsHtml5TreeBuilder* aTreeBuilder, michael@0: bool aLastWasCR); michael@0: michael@0: /** michael@0: * Continues the stream parser if the charset switch failed. michael@0: */ michael@0: void ContinueAfterFailedCharsetSwitch(); michael@0: michael@0: void Terminate() michael@0: { michael@0: mozilla::MutexAutoLock autoLock(mTerminatedMutex); michael@0: mTerminated = true; michael@0: } michael@0: michael@0: void DropTimer(); michael@0: michael@0: /** michael@0: * Sets mCharset and mCharsetSource appropriately for the XML View Source michael@0: * case if aEncoding names a supported rough ASCII superset and sets michael@0: * the mCharset and mCharsetSource to the UTF-8 default otherwise. michael@0: */ michael@0: void SetEncodingFromExpat(const char16_t* aEncoding); michael@0: michael@0: /** michael@0: * Sets the URL for View Source title in case this parser ends up being michael@0: * used for View Source. If aURL is a view-source: URL, takes the inner michael@0: * URL. data: URLs are shown with an ellipsis instead of the actual data. michael@0: */ michael@0: void SetViewSourceTitle(nsIURI* aURL); michael@0: michael@0: private: michael@0: michael@0: #ifdef DEBUG michael@0: bool IsParserThread() { michael@0: bool ret; michael@0: mThread->IsOnCurrentThread(&ret); michael@0: return ret; michael@0: } michael@0: #endif michael@0: michael@0: void MarkAsBroken(nsresult aRv); michael@0: michael@0: /** michael@0: * Marks the stream parser as interrupted. If you ever add calls to this michael@0: * method, be sure to review Uninterrupt usage very, very carefully to michael@0: * avoid having a previous in-flight runnable cancel your Interrupt() michael@0: * call on the other thread too soon. michael@0: */ michael@0: void Interrupt() michael@0: { michael@0: mozilla::MutexAutoLock autoLock(mTerminatedMutex); michael@0: mInterrupted = true; michael@0: } michael@0: michael@0: void Uninterrupt() michael@0: { michael@0: NS_ASSERTION(IsParserThread(), "Wrong thread!"); michael@0: mTokenizerMutex.AssertCurrentThreadOwns(); michael@0: // Not acquiring mTerminatedMutex because mTokenizerMutex is already michael@0: // held at this point and is already stronger. michael@0: mInterrupted = false; michael@0: } michael@0: michael@0: /** michael@0: * Flushes the tree ops from the tree builder and disarms the flush michael@0: * timer. michael@0: */ michael@0: void FlushTreeOpsAndDisarmTimer(); michael@0: michael@0: void ParseAvailableData(); michael@0: michael@0: void DoStopRequest(); michael@0: michael@0: void DoDataAvailable(const uint8_t* aBuffer, uint32_t aLength); michael@0: michael@0: static NS_METHOD CopySegmentsToParser(nsIInputStream *aInStream, michael@0: void *aClosure, michael@0: const char *aFromSegment, michael@0: uint32_t aToOffset, michael@0: uint32_t aCount, michael@0: uint32_t *aWriteCount); michael@0: michael@0: bool IsTerminatedOrInterrupted() michael@0: { michael@0: mozilla::MutexAutoLock autoLock(mTerminatedMutex); michael@0: return mTerminated || mInterrupted; michael@0: } michael@0: michael@0: bool IsTerminated() michael@0: { michael@0: mozilla::MutexAutoLock autoLock(mTerminatedMutex); michael@0: return mTerminated; michael@0: } michael@0: michael@0: /** michael@0: * True when there is a Unicode decoder already michael@0: */ michael@0: inline bool HasDecoder() michael@0: { michael@0: return !!mUnicodeDecoder; michael@0: } michael@0: michael@0: /** michael@0: * Push bytes from network when there is no Unicode decoder yet michael@0: */ michael@0: nsresult SniffStreamBytes(const uint8_t* aFromSegment, michael@0: uint32_t aCount, michael@0: uint32_t* aWriteCount); michael@0: michael@0: /** michael@0: * Push bytes from network when there is a Unicode decoder already michael@0: */ michael@0: nsresult WriteStreamBytes(const uint8_t* aFromSegment, michael@0: uint32_t aCount, michael@0: uint32_t* aWriteCount); michael@0: michael@0: /** michael@0: * Check whether every other byte in the sniffing buffer is zero. michael@0: */ michael@0: void SniffBOMlessUTF16BasicLatin(const uint8_t* aFromSegment, michael@0: uint32_t aCountToSniffingLimit); michael@0: michael@0: /** michael@0: * scan failed. Try chardet if applicable. After this, the michael@0: * the parser will have some encoding even if a last resolt fallback. michael@0: * michael@0: * @param aFromSegment The current network buffer or null if the sniffing michael@0: * buffer is being flushed due to network stream ending. michael@0: * @param aCount The number of bytes in aFromSegment (ignored if michael@0: * aFromSegment is null) michael@0: * @param aWriteCount Return value for how many bytes got read from the michael@0: * buffer. michael@0: * @param aCountToSniffingLimit The number of unfilled slots in michael@0: * mSniffingBuffer michael@0: */ michael@0: nsresult FinalizeSniffing(const uint8_t* aFromSegment, michael@0: uint32_t aCount, michael@0: uint32_t* aWriteCount, michael@0: uint32_t aCountToSniffingLimit); michael@0: michael@0: /** michael@0: * Set up the Unicode decoder and write the sniffing buffer into it michael@0: * followed by the current network buffer. michael@0: * michael@0: * @param aFromSegment The current network buffer or null if the sniffing michael@0: * buffer is being flushed due to network stream ending. michael@0: * @param aCount The number of bytes in aFromSegment (ignored if michael@0: * aFromSegment is null) michael@0: * @param aWriteCount Return value for how many bytes got read from the michael@0: * buffer. michael@0: */ michael@0: nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const uint8_t* aFromSegment, michael@0: uint32_t aCount, michael@0: uint32_t* aWriteCount); michael@0: michael@0: /** michael@0: * Initialize the Unicode decoder, mark the BOM as the source and michael@0: * drop the sniffer. michael@0: * michael@0: * @param aDecoderCharsetName The name for the decoder's charset michael@0: * (UTF-16BE, UTF-16LE or UTF-8; the BOM has michael@0: * been swallowed) michael@0: */ michael@0: nsresult SetupDecodingFromBom(const char* aDecoderCharsetName); michael@0: michael@0: /** michael@0: * Become confident or resolve and encoding name to its preferred form. michael@0: * @param aEncoding the value of an internal encoding decl. Acts as an michael@0: * out param, too, when the method returns true. michael@0: * @return true if the parser needs to start using the new value of michael@0: * aEncoding and false if the parser became confident or if michael@0: * the encoding name did not specify a usable encoding michael@0: */ michael@0: bool PreferredForInternalEncodingDecl(nsACString& aEncoding); michael@0: michael@0: /** michael@0: * Callback for mFlushTimer. michael@0: */ michael@0: static void TimerCallback(nsITimer* aTimer, void* aClosure); michael@0: michael@0: /** michael@0: * Parser thread entry point for (maybe) flushing the ops and posting michael@0: * a flush runnable back on the main thread. michael@0: */ michael@0: void TimerFlush(); michael@0: michael@0: nsCOMPtr mRequest; michael@0: nsCOMPtr mObserver; michael@0: michael@0: /** michael@0: * The document title to use if this turns out to be a View Source parser. michael@0: */ michael@0: nsCString mViewSourceTitle; michael@0: michael@0: /** michael@0: * The Unicode decoder michael@0: */ michael@0: nsCOMPtr mUnicodeDecoder; michael@0: michael@0: /** michael@0: * The buffer for sniffing the character encoding michael@0: */ michael@0: nsAutoArrayPtr mSniffingBuffer; michael@0: michael@0: /** michael@0: * The number of meaningful bytes in mSniffingBuffer michael@0: */ michael@0: uint32_t mSniffingLength; michael@0: michael@0: /** michael@0: * BOM sniffing state michael@0: */ michael@0: eBomState mBomState; michael@0: michael@0: /** michael@0: * prescan implementation michael@0: */ michael@0: nsAutoPtr mMetaScanner; michael@0: michael@0: // encoding-related stuff michael@0: /** michael@0: * The source (confidence) of the character encoding in use michael@0: */ michael@0: int32_t mCharsetSource; michael@0: michael@0: /** michael@0: * The character encoding in use michael@0: */ michael@0: nsCString mCharset; michael@0: michael@0: /** michael@0: * Whether reparse is forbidden michael@0: */ michael@0: bool mReparseForbidden; michael@0: michael@0: // Portable parser objects michael@0: /** michael@0: * The first buffer in the pending UTF-16 buffer queue michael@0: */ michael@0: nsRefPtr mFirstBuffer; michael@0: michael@0: /** michael@0: * The last buffer in the pending UTF-16 buffer queue michael@0: */ michael@0: nsHtml5OwningUTF16Buffer* mLastBuffer; // weak ref; always points to michael@0: // a buffer of the size NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE michael@0: michael@0: /** michael@0: * The tree operation executor michael@0: */ michael@0: nsHtml5TreeOpExecutor* mExecutor; michael@0: michael@0: /** michael@0: * The HTML5 tree builder michael@0: */ michael@0: nsAutoPtr mTreeBuilder; michael@0: michael@0: /** michael@0: * The HTML5 tokenizer michael@0: */ michael@0: nsAutoPtr mTokenizer; michael@0: michael@0: /** michael@0: * Makes sure the main thread can't mess the tokenizer state while it's michael@0: * tokenizing. This mutex also protects the current speculation. michael@0: */ michael@0: mozilla::Mutex mTokenizerMutex; michael@0: michael@0: /** michael@0: * The scoped atom table michael@0: */ michael@0: nsHtml5AtomTable mAtomTable; michael@0: michael@0: /** michael@0: * The owner parser. michael@0: */ michael@0: nsRefPtr mOwner; michael@0: michael@0: /** michael@0: * Whether the last character tokenized was a carriage return (for CRLF) michael@0: */ michael@0: bool mLastWasCR; michael@0: michael@0: /** michael@0: * For tracking stream life cycle michael@0: */ michael@0: eHtml5StreamState mStreamState; michael@0: michael@0: /** michael@0: * Whether we are speculating. michael@0: */ michael@0: bool mSpeculating; michael@0: michael@0: /** michael@0: * Whether the tokenizer has reached EOF. (Reset when stream rewinded.) michael@0: */ michael@0: bool mAtEOF; michael@0: michael@0: /** michael@0: * The speculations. The mutex protects the nsTArray itself. michael@0: * To access the queue of current speculation, mTokenizerMutex must be michael@0: * obtained. michael@0: * The current speculation is the last element michael@0: */ michael@0: nsTArray > mSpeculations; michael@0: mozilla::Mutex mSpeculationMutex; michael@0: michael@0: /** michael@0: * True to terminate early; protected by mTerminatedMutex michael@0: */ michael@0: bool mTerminated; michael@0: bool mInterrupted; michael@0: mozilla::Mutex mTerminatedMutex; michael@0: michael@0: /** michael@0: * The thread this stream parser runs on. michael@0: */ michael@0: nsCOMPtr mThread; michael@0: michael@0: nsCOMPtr mExecutorFlusher; michael@0: michael@0: nsCOMPtr mLoadFlusher; michael@0: michael@0: /** michael@0: * The chardet instance if chardet is enabled. michael@0: */ michael@0: nsCOMPtr mChardet; michael@0: michael@0: /** michael@0: * If false, don't push data to chardet. michael@0: */ michael@0: bool mFeedChardet; michael@0: michael@0: /** michael@0: * Whether the initial charset source was kCharsetFromParentFrame michael@0: */ michael@0: bool mInitialEncodingWasFromParentFrame; michael@0: michael@0: /** michael@0: * Timer for flushing tree ops once in a while when not speculating. michael@0: */ michael@0: nsCOMPtr mFlushTimer; michael@0: michael@0: /** michael@0: * Keeps track whether mFlushTimer has been armed. Unfortunately, michael@0: * nsITimer doesn't enable querying this from the timer itself. michael@0: */ michael@0: bool mFlushTimerArmed; michael@0: michael@0: /** michael@0: * False initially and true after the timer has fired at least once. michael@0: */ michael@0: bool mFlushTimerEverFired; michael@0: michael@0: /** michael@0: * Whether the parser is doing a normal parse, view source or plain text. michael@0: */ michael@0: eParserMode mMode; michael@0: michael@0: /** michael@0: * The pref html5.flushtimer.initialdelay: Time in milliseconds between michael@0: * the time a network buffer is seen and the timer firing when the michael@0: * timer hasn't fired previously in this parse. michael@0: */ michael@0: static int32_t sTimerInitialDelay; michael@0: michael@0: /** michael@0: * The pref html5.flushtimer.subsequentdelay: Time in milliseconds between michael@0: * the time a network buffer is seen and the timer firing when the michael@0: * timer has already fired previously in this parse. michael@0: */ michael@0: static int32_t sTimerSubsequentDelay; michael@0: }; michael@0: michael@0: #endif // nsHtml5StreamParser_h