michael@0: /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0:
michael@0: #ifndef nsHtml5StreamParser_h
michael@0: #define nsHtml5StreamParser_h
michael@0:
michael@0: #include "nsAutoPtr.h"
michael@0: #include "nsCOMPtr.h"
michael@0: #include "nsICharsetDetectionObserver.h"
michael@0: #include "nsHtml5MetaScanner.h"
michael@0: #include "nsIUnicodeDecoder.h"
michael@0: #include "nsHtml5TreeOpExecutor.h"
michael@0: #include "nsHtml5OwningUTF16Buffer.h"
michael@0: #include "nsIInputStream.h"
michael@0: #include "mozilla/Mutex.h"
michael@0: #include "nsHtml5AtomTable.h"
michael@0: #include "nsHtml5Speculation.h"
michael@0: #include "nsITimer.h"
michael@0: #include "nsICharsetDetector.h"
michael@0:
michael@0: class nsHtml5Parser; // Forward declaration; the nsHtml5StreamParser constructor below takes an nsHtml5Parser*.
michael@0:
michael@0: #define NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE 1024 /* Size of the UTF-16 buffers that mLastBuffer always points to (see the mLastBuffer comment). */
michael@0: #define NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE 1024 /* NOTE(review): presumably the capacity of mSniffingBuffer - confirm in the .cpp. */
michael@0:
michael@0: // The kind of parse a stream parser performs: a normal parse, one of the
michael@0: // view-source flavors, plain text, or a data load. The numeric values are
michael@0: // spelled out to match the other enums in this header; they are the same
michael@0: // values the implicit numbering produced.
michael@0: enum eParserMode {
michael@0: // Parse a document normally as HTML.
michael@0: NORMAL = 0,
michael@0:
michael@0: // View document as HTML source.
michael@0: VIEW_SOURCE_HTML = 1,
michael@0:
michael@0: // View document as XML source.
michael@0: VIEW_SOURCE_XML = 2,
michael@0:
michael@0: // View document as plain text source.
michael@0: VIEW_SOURCE_PLAIN = 3,
michael@0:
michael@0: // View document as plain text.
michael@0: PLAIN_TEXT = 4,
michael@0:
michael@0: // Load as data (XHR).
michael@0: LOAD_AS_DATA = 5
michael@0: };
michael@0:
michael@0: enum eBomState { // Progress of byte order mark (BOM) sniffing; this is the type of mBomState.
michael@0: /**
michael@0: * BOM sniffing hasn't started.
michael@0: */
michael@0: BOM_SNIFFING_NOT_STARTED = 0,
michael@0:
michael@0: /**
michael@0: * BOM sniffing is ongoing, and the first byte of a UTF-16LE BOM has been
michael@0: * seen. (The UTF-16LE BOM is the byte pair FF FE, so that byte is 0xFF.)
michael@0: */
michael@0: SEEN_UTF_16_LE_FIRST_BYTE = 1,
michael@0:
michael@0: /**
michael@0: * BOM sniffing is ongoing, and the first byte of a UTF-16BE BOM has been
michael@0: * seen. (The UTF-16BE BOM is the byte pair FE FF, so that byte is 0xFE.)
michael@0: */
michael@0: SEEN_UTF_16_BE_FIRST_BYTE = 2,
michael@0:
michael@0: /**
michael@0: * BOM sniffing is ongoing, and the first byte of a UTF-8 BOM has been
michael@0: * seen. (The UTF-8 BOM is the byte triple EF BB BF, so that byte is 0xEF.)
michael@0: */
michael@0: SEEN_UTF_8_FIRST_BYTE = 3,
michael@0:
michael@0: /**
michael@0: * BOM sniffing is ongoing, and the first and second bytes of a UTF-8 BOM
michael@0: * (EF BB) have been seen; only the final byte (BF) is still outstanding.
michael@0: */
michael@0: SEEN_UTF_8_SECOND_BYTE = 4,
michael@0:
michael@0: /**
michael@0: * BOM sniffing was started but is now over for whatever reason.
michael@0: */
michael@0: BOM_SNIFFING_OVER = 5
michael@0: };
michael@0:
michael@0: enum eHtml5StreamState { // Stream life-cycle phases; this is the type of mStreamState.
michael@0: STREAM_NOT_STARTED = 0, // SetDocumentCharset() asserts this state ("SetDocumentCharset called too late." otherwise).
michael@0: STREAM_BEING_READ = 1, // NOTE(review): presumably entered when the request starts - confirm in the .cpp.
michael@0: STREAM_ENDED = 2 // NOTE(review): presumably entered when the request stops - confirm in the .cpp.
michael@0: };
michael@0:
michael@0: /**
michael@0: * Receives the network byte stream of an HTML5 parse (through the methods
michael@0: * that nsHtml5StreamListener calls), settles on a character encoding (BOM
michael@0: * sniffing, <meta charset> prescan, optionally chardet), decodes the bytes
michael@0: * into a queue of UTF-16 buffers and feeds the tokenizer and tree builder
michael@0: * that it owns.
michael@0: *
michael@0: * Threading, as far as this header shows: the configuration setters assert
michael@0: * that they run on the main thread, while Uninterrupt() asserts that it runs
michael@0: * on mThread. State shared between the two sides is guarded by
michael@0: * mTokenizerMutex, mSpeculationMutex and mTerminatedMutex; see the member
michael@0: * comments for what each mutex protects.
michael@0: */
michael@0: class nsHtml5StreamParser : public nsICharsetDetectionObserver {
michael@0:
michael@0: friend class nsHtml5RequestStopper;
michael@0: friend class nsHtml5DataAvailable;
michael@0: friend class nsHtml5StreamParserContinuation;
michael@0: friend class nsHtml5TimerKungFu;
michael@0:
michael@0: public:
michael@0: NS_DECL_AND_IMPL_ZEROING_OPERATOR_NEW
michael@0: NS_DECL_CYCLE_COLLECTING_ISUPPORTS
michael@0: NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsHtml5StreamParser,
michael@0: nsICharsetDetectionObserver)
michael@0:
michael@0: /**
michael@0: * One-time static initialization.
michael@0: * NOTE(review): presumably where sTimerInitialDelay and
michael@0: * sTimerSubsequentDelay get their pref values - confirm in the .cpp.
michael@0: */
michael@0: static void InitializeStatics();
michael@0:
michael@0: nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
michael@0: nsHtml5Parser* aOwner,
michael@0: eParserMode aMode);
michael@0:
michael@0: virtual ~nsHtml5StreamParser();
michael@0:
michael@0: // Methods that nsHtml5StreamListener calls
michael@0: nsresult CheckListenerChain();
michael@0:
michael@0: nsresult OnStartRequest(nsIRequest* aRequest, nsISupports* aContext);
michael@0:
michael@0: nsresult OnDataAvailable(nsIRequest* aRequest,
michael@0: nsISupports* aContext,
michael@0: nsIInputStream* aInStream,
michael@0: uint64_t aSourceOffset,
michael@0: uint32_t aLength);
michael@0:
michael@0: nsresult OnStopRequest(nsIRequest* aRequest,
michael@0: nsISupports* aContext,
michael@0: nsresult status);
michael@0:
michael@0: // nsICharsetDetectionObserver
michael@0: /**
michael@0: * Chardet calls this to report the detection result
michael@0: */
michael@0: NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf);
michael@0:
michael@0: // EncodingDeclarationHandler
michael@0: // http://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java
michael@0: /**
michael@0: * Tree builder uses this to report a late internal encoding declaration
michael@0: * (<meta charset>).
michael@0: */
michael@0: bool internalEncodingDeclaration(nsString* aEncoding);
michael@0:
michael@0: // Not from an external interface
michael@0:
michael@0: /**
michael@0: * Call this method once you've created a parser, and want to instruct it
michael@0: * about what charset to load. Must be called on the main thread before the
michael@0: * stream has started (both are asserted).
michael@0: *
michael@0: * @param aCharset the charset of a document
michael@0: * @param aSource the source of the charset
michael@0: */
michael@0: inline void SetDocumentCharset(const nsACString& aCharset, int32_t aSource) {
michael@0: NS_PRECONDITION(mStreamState == STREAM_NOT_STARTED,
michael@0: "SetDocumentCharset called too late.");
michael@0: NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
michael@0: mCharset = aCharset;
michael@0: mCharsetSource = aSource;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Stores the request observer. Main thread only (asserted).
michael@0: */
michael@0: inline void SetObserver(nsIRequestObserver* aObserver) {
michael@0: NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
michael@0: mObserver = aObserver;
michael@0: }
michael@0:
michael@0: nsresult GetChannel(nsIChannel** aChannel);
michael@0:
michael@0: /**
michael@0: * The owner parser must call this after script execution
michael@0: * when no scripts are executing and the document.written
michael@0: * buffer has been exhausted.
michael@0: */
michael@0: void ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer,
michael@0: nsHtml5TreeBuilder* aTreeBuilder,
michael@0: bool aLastWasCR);
michael@0:
michael@0: /**
michael@0: * Continues the stream parser if the charset switch failed.
michael@0: */
michael@0: void ContinueAfterFailedCharsetSwitch();
michael@0:
michael@0: /**
michael@0: * Raises the mTerminated flag while holding mTerminatedMutex.
michael@0: */
michael@0: void Terminate()
michael@0: {
michael@0: mozilla::MutexAutoLock autoLock(mTerminatedMutex);
michael@0: mTerminated = true;
michael@0: }
michael@0:
michael@0: void DropTimer();
michael@0:
michael@0: /**
michael@0: * Sets mCharset and mCharsetSource appropriately for the XML View Source
michael@0: * case if aEncoding names a supported rough ASCII superset and sets
michael@0: * the mCharset and mCharsetSource to the UTF-8 default otherwise.
michael@0: */
michael@0: void SetEncodingFromExpat(const char16_t* aEncoding);
michael@0:
michael@0: /**
michael@0: * Sets the URL for View Source title in case this parser ends up being
michael@0: * used for View Source. If aURL is a view-source: URL, takes the inner
michael@0: * URL. data: URLs are shown with an ellipsis instead of the actual data.
michael@0: */
michael@0: void SetViewSourceTitle(nsIURI* aURL);
michael@0:
michael@0: private:
michael@0:
michael@0: #ifdef DEBUG
michael@0: /**
michael@0: * Debug-only helper for assertions: asks mThread whether the caller is
michael@0: * running on it.
michael@0: */
michael@0: bool IsParserThread() {
michael@0: // Start from false so that a failing IsOnCurrentThread() call cannot
michael@0: // leave an indeterminate value to be returned.
michael@0: bool ret = false;
michael@0: mThread->IsOnCurrentThread(&ret);
michael@0: return ret;
michael@0: }
michael@0: #endif
michael@0:
michael@0: void MarkAsBroken(nsresult aRv);
michael@0:
michael@0: /**
michael@0: * Marks the stream parser as interrupted. If you ever add calls to this
michael@0: * method, be sure to review Uninterrupt usage very, very carefully to
michael@0: * avoid having a previous in-flight runnable cancel your Interrupt()
michael@0: * call on the other thread too soon.
michael@0: */
michael@0: void Interrupt()
michael@0: {
michael@0: mozilla::MutexAutoLock autoLock(mTerminatedMutex);
michael@0: mInterrupted = true;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Clears the interrupted flag. Must run on the parser thread with
michael@0: * mTokenizerMutex held (both are asserted).
michael@0: */
michael@0: void Uninterrupt()
michael@0: {
michael@0: NS_ASSERTION(IsParserThread(), "Wrong thread!");
michael@0: mTokenizerMutex.AssertCurrentThreadOwns();
michael@0: // Not acquiring mTerminatedMutex because mTokenizerMutex is already
michael@0: // held at this point and is already stronger.
michael@0: mInterrupted = false;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Flushes the tree ops from the tree builder and disarms the flush
michael@0: * timer.
michael@0: */
michael@0: void FlushTreeOpsAndDisarmTimer();
michael@0:
michael@0: void ParseAvailableData();
michael@0:
michael@0: void DoStopRequest();
michael@0:
michael@0: void DoDataAvailable(const uint8_t* aBuffer, uint32_t aLength);
michael@0:
michael@0: /**
michael@0: * NOTE(review): the signature has the shape of a segment-writer callback
michael@0: * for nsIInputStream::ReadSegments, with aClosure presumably carrying the
michael@0: * stream parser - confirm against the call site in the .cpp.
michael@0: */
michael@0: static NS_METHOD CopySegmentsToParser(nsIInputStream *aInStream,
michael@0: void *aClosure,
michael@0: const char *aFromSegment,
michael@0: uint32_t aToOffset,
michael@0: uint32_t aCount,
michael@0: uint32_t *aWriteCount);
michael@0:
michael@0: /**
michael@0: * Reads mTerminated and mInterrupted under mTerminatedMutex; true if
michael@0: * either flag is set.
michael@0: */
michael@0: bool IsTerminatedOrInterrupted()
michael@0: {
michael@0: mozilla::MutexAutoLock autoLock(mTerminatedMutex);
michael@0: return mTerminated || mInterrupted;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Reads mTerminated under mTerminatedMutex.
michael@0: */
michael@0: bool IsTerminated()
michael@0: {
michael@0: mozilla::MutexAutoLock autoLock(mTerminatedMutex);
michael@0: return mTerminated;
michael@0: }
michael@0:
michael@0: /**
michael@0: * True when there is a Unicode decoder already
michael@0: */
michael@0: inline bool HasDecoder()
michael@0: {
michael@0: return !!mUnicodeDecoder;
michael@0: }
michael@0:
michael@0: /**
michael@0: * Push bytes from network when there is no Unicode decoder yet
michael@0: */
michael@0: nsresult SniffStreamBytes(const uint8_t* aFromSegment,
michael@0: uint32_t aCount,
michael@0: uint32_t* aWriteCount);
michael@0:
michael@0: /**
michael@0: * Push bytes from network when there is a Unicode decoder already
michael@0: */
michael@0: nsresult WriteStreamBytes(const uint8_t* aFromSegment,
michael@0: uint32_t aCount,
michael@0: uint32_t* aWriteCount);
michael@0:
michael@0: /**
michael@0: * Check whether every other byte in the sniffing buffer is zero.
michael@0: */
michael@0: void SniffBOMlessUTF16BasicLatin(const uint8_t* aFromSegment,
michael@0: uint32_t aCountToSniffingLimit);
michael@0:
michael@0: /**
michael@0: * <meta charset> scan failed. Try chardet if applicable. After this, the
michael@0: * parser will have some encoding even if a last resort fallback.
michael@0: *
michael@0: * @param aFromSegment The current network buffer or null if the sniffing
michael@0: * buffer is being flushed due to network stream ending.
michael@0: * @param aCount The number of bytes in aFromSegment (ignored if
michael@0: * aFromSegment is null)
michael@0: * @param aWriteCount Return value for how many bytes got read from the
michael@0: * buffer.
michael@0: * @param aCountToSniffingLimit The number of unfilled slots in
michael@0: * mSniffingBuffer
michael@0: */
michael@0: nsresult FinalizeSniffing(const uint8_t* aFromSegment,
michael@0: uint32_t aCount,
michael@0: uint32_t* aWriteCount,
michael@0: uint32_t aCountToSniffingLimit);
michael@0:
michael@0: /**
michael@0: * Set up the Unicode decoder and write the sniffing buffer into it
michael@0: * followed by the current network buffer.
michael@0: *
michael@0: * @param aFromSegment The current network buffer or null if the sniffing
michael@0: * buffer is being flushed due to network stream ending.
michael@0: * @param aCount The number of bytes in aFromSegment (ignored if
michael@0: * aFromSegment is null)
michael@0: * @param aWriteCount Return value for how many bytes got read from the
michael@0: * buffer.
michael@0: */
michael@0: nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const uint8_t* aFromSegment,
michael@0: uint32_t aCount,
michael@0: uint32_t* aWriteCount);
michael@0:
michael@0: /**
michael@0: * Initialize the Unicode decoder, mark the BOM as the source and
michael@0: * drop the sniffer.
michael@0: *
michael@0: * @param aDecoderCharsetName The name for the decoder's charset
michael@0: * (UTF-16BE, UTF-16LE or UTF-8; the BOM has
michael@0: * been swallowed)
michael@0: */
michael@0: nsresult SetupDecodingFromBom(const char* aDecoderCharsetName);
michael@0:
michael@0: /**
michael@0: * Become confident or resolve an encoding name to its preferred form.
michael@0: * @param aEncoding the value of an internal encoding decl. Acts as an
michael@0: * out param, too, when the method returns true.
michael@0: * @return true if the parser needs to start using the new value of
michael@0: * aEncoding and false if the parser became confident or if
michael@0: * the encoding name did not specify a usable encoding
michael@0: */
michael@0: bool PreferredForInternalEncodingDecl(nsACString& aEncoding);
michael@0:
michael@0: /**
michael@0: * Callback for mFlushTimer.
michael@0: */
michael@0: static void TimerCallback(nsITimer* aTimer, void* aClosure);
michael@0:
michael@0: /**
michael@0: * Parser thread entry point for (maybe) flushing the ops and posting
michael@0: * a flush runnable back on the main thread.
michael@0: */
michael@0: void TimerFlush();
michael@0:
michael@0: // NOTE: the template arguments of the smart-pointer and array members
michael@0: // below had been lost from this copy of the header. They are restored from
michael@0: // the types used elsewhere in this file (method signatures, includes and
michael@0: // the raw mLastBuffer pointer). nsIRunnable for the two flushers is the one
michael@0: // argument not visible in this header; NOTE(review): confirm it against
michael@0: // the .cpp.
michael@0:
michael@0: nsCOMPtr<nsIRequest> mRequest;
michael@0: nsCOMPtr<nsIRequestObserver> mObserver;
michael@0:
michael@0: /**
michael@0: * The document title to use if this turns out to be a View Source parser.
michael@0: */
michael@0: nsCString mViewSourceTitle;
michael@0:
michael@0: /**
michael@0: * The Unicode decoder
michael@0: */
michael@0: nsCOMPtr<nsIUnicodeDecoder> mUnicodeDecoder;
michael@0:
michael@0: /**
michael@0: * The buffer for sniffing the character encoding
michael@0: */
michael@0: nsAutoArrayPtr<uint8_t> mSniffingBuffer;
michael@0:
michael@0: /**
michael@0: * The number of meaningful bytes in mSniffingBuffer
michael@0: */
michael@0: uint32_t mSniffingLength;
michael@0:
michael@0: /**
michael@0: * BOM sniffing state
michael@0: */
michael@0: eBomState mBomState;
michael@0:
michael@0: /**
michael@0: * <meta charset> prescan implementation
michael@0: */
michael@0: nsAutoPtr<nsHtml5MetaScanner> mMetaScanner;
michael@0:
michael@0: // encoding-related stuff
michael@0: /**
michael@0: * The source (confidence) of the character encoding in use
michael@0: */
michael@0: int32_t mCharsetSource;
michael@0:
michael@0: /**
michael@0: * The character encoding in use
michael@0: */
michael@0: nsCString mCharset;
michael@0:
michael@0: /**
michael@0: * Whether reparse is forbidden
michael@0: */
michael@0: bool mReparseForbidden;
michael@0:
michael@0: // Portable parser objects
michael@0: /**
michael@0: * The first buffer in the pending UTF-16 buffer queue
michael@0: */
michael@0: nsRefPtr<nsHtml5OwningUTF16Buffer> mFirstBuffer;
michael@0:
michael@0: /**
michael@0: * The last buffer in the pending UTF-16 buffer queue
michael@0: */
michael@0: nsHtml5OwningUTF16Buffer* mLastBuffer; // weak ref; always points to
michael@0: // a buffer of the size NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE
michael@0:
michael@0: /**
michael@0: * The tree operation executor
michael@0: */
michael@0: nsHtml5TreeOpExecutor* mExecutor;
michael@0:
michael@0: /**
michael@0: * The HTML5 tree builder
michael@0: */
michael@0: nsAutoPtr<nsHtml5TreeBuilder> mTreeBuilder;
michael@0:
michael@0: /**
michael@0: * The HTML5 tokenizer
michael@0: */
michael@0: nsAutoPtr<nsHtml5Tokenizer> mTokenizer;
michael@0:
michael@0: /**
michael@0: * Makes sure the main thread can't mess the tokenizer state while it's
michael@0: * tokenizing. This mutex also protects the current speculation.
michael@0: */
michael@0: mozilla::Mutex mTokenizerMutex;
michael@0:
michael@0: /**
michael@0: * The scoped atom table
michael@0: */
michael@0: nsHtml5AtomTable mAtomTable;
michael@0:
michael@0: /**
michael@0: * The owner parser.
michael@0: */
michael@0: nsRefPtr<nsHtml5Parser> mOwner;
michael@0:
michael@0: /**
michael@0: * Whether the last character tokenized was a carriage return (for CRLF)
michael@0: */
michael@0: bool mLastWasCR;
michael@0:
michael@0: /**
michael@0: * For tracking stream life cycle
michael@0: */
michael@0: eHtml5StreamState mStreamState;
michael@0:
michael@0: /**
michael@0: * Whether we are speculating.
michael@0: */
michael@0: bool mSpeculating;
michael@0:
michael@0: /**
michael@0: * Whether the tokenizer has reached EOF. (Reset when stream rewound.)
michael@0: */
michael@0: bool mAtEOF;
michael@0:
michael@0: /**
michael@0: * The speculations. The mutex protects the nsTArray itself.
michael@0: * To access the queue of current speculation, mTokenizerMutex must be
michael@0: * obtained.
michael@0: * The current speculation is the last element
michael@0: */
michael@0: nsTArray<nsAutoPtr<nsHtml5Speculation> > mSpeculations;
michael@0: mozilla::Mutex mSpeculationMutex;
michael@0:
michael@0: /**
michael@0: * True to terminate early; protected by mTerminatedMutex
michael@0: */
michael@0: bool mTerminated;
michael@0:
michael@0: /**
michael@0: * True while the parser is marked as interrupted. Set by Interrupt() and
michael@0: * read by IsTerminatedOrInterrupted() under mTerminatedMutex; cleared by
michael@0: * Uninterrupt(), which relies on mTokenizerMutex being held instead.
michael@0: */
michael@0: bool mInterrupted;
michael@0: mozilla::Mutex mTerminatedMutex;
michael@0:
michael@0: /**
michael@0: * The thread this stream parser runs on.
michael@0: */
michael@0: nsCOMPtr<nsIThread> mThread;
michael@0:
michael@0: nsCOMPtr<nsIRunnable> mExecutorFlusher;
michael@0:
michael@0: nsCOMPtr<nsIRunnable> mLoadFlusher;
michael@0:
michael@0: /**
michael@0: * The chardet instance if chardet is enabled.
michael@0: */
michael@0: nsCOMPtr<nsICharsetDetector> mChardet;
michael@0:
michael@0: /**
michael@0: * If false, don't push data to chardet.
michael@0: */
michael@0: bool mFeedChardet;
michael@0:
michael@0: /**
michael@0: * Whether the initial charset source was kCharsetFromParentFrame
michael@0: */
michael@0: bool mInitialEncodingWasFromParentFrame;
michael@0:
michael@0: /**
michael@0: * Timer for flushing tree ops once in a while when not speculating.
michael@0: */
michael@0: nsCOMPtr<nsITimer> mFlushTimer;
michael@0:
michael@0: /**
michael@0: * Keeps track whether mFlushTimer has been armed. Unfortunately,
michael@0: * nsITimer doesn't enable querying this from the timer itself.
michael@0: */
michael@0: bool mFlushTimerArmed;
michael@0:
michael@0: /**
michael@0: * False initially and true after the timer has fired at least once.
michael@0: */
michael@0: bool mFlushTimerEverFired;
michael@0:
michael@0: /**
michael@0: * Whether the parser is doing a normal parse, view source or plain text.
michael@0: */
michael@0: eParserMode mMode;
michael@0:
michael@0: /**
michael@0: * The pref html5.flushtimer.initialdelay: Time in milliseconds between
michael@0: * the time a network buffer is seen and the timer firing when the
michael@0: * timer hasn't fired previously in this parse.
michael@0: */
michael@0: static int32_t sTimerInitialDelay;
michael@0:
michael@0: /**
michael@0: * The pref html5.flushtimer.subsequentdelay: Time in milliseconds between
michael@0: * the time a network buffer is seen and the timer firing when the
michael@0: * timer has already fired previously in this parse.
michael@0: */
michael@0: static int32_t sTimerSubsequentDelay;
michael@0: };
michael@0:
michael@0: #endif // nsHtml5StreamParser_h