parser/html/nsHtml5StreamParser.h

Wed, 31 Dec 2014 13:27:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 13:27:57 +0100
branch
TOR_BUG_3246
changeset 6
8bccb770b82d
permissions
-rw-r--r--

Ignore runtime configuration files generated during quality assurance.

     1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
     2 /* This Source Code Form is subject to the terms of the Mozilla Public
     3  * License, v. 2.0. If a copy of the MPL was not distributed with this
     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     6 #ifndef nsHtml5StreamParser_h
     7 #define nsHtml5StreamParser_h
     9 #include "nsAutoPtr.h"
    10 #include "nsCOMPtr.h"
    11 #include "nsICharsetDetectionObserver.h"
    12 #include "nsHtml5MetaScanner.h"
    13 #include "nsIUnicodeDecoder.h"
    14 #include "nsHtml5TreeOpExecutor.h"
    15 #include "nsHtml5OwningUTF16Buffer.h"
    16 #include "nsIInputStream.h"
    17 #include "mozilla/Mutex.h"
    18 #include "nsHtml5AtomTable.h"
    19 #include "nsHtml5Speculation.h"
    20 #include "nsITimer.h"
    21 #include "nsICharsetDetector.h"
    23 class nsHtml5Parser;
    25 #define NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE 1024 // size of the buffer mLastBuffer points to (per that member's comment)
    26 #define NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE 1024 // NOTE(review): presumably the capacity of mSniffingBuffer -- confirm in the .cpp
    28 enum eParserMode { // How the stream parser treats the document; see nsHtml5StreamParser::mMode.
    29   /**
    30    * Parse a document normally as HTML.
    31    */
    32   NORMAL,
    34   /**
    35    * View document as HTML source.
    36    */
    37   VIEW_SOURCE_HTML,
    39   /**
    40    * View document as XML source.
    41    */
    42   VIEW_SOURCE_XML,
    44   /**
    45    * View document as plain text source.
    46    */
    47   VIEW_SOURCE_PLAIN,
    49   /**
    50    * View document as plain text.
    51    */
    52   PLAIN_TEXT,
    54   /**
    55    * Load as data (XHR).
    56    */
    57   LOAD_AS_DATA
    58 };
    60 enum eBomState { // Progress of byte order mark (BOM) sniffing; see nsHtml5StreamParser::mBomState.
    61   /**
    62    * BOM sniffing hasn't started.
    63    */
    64   BOM_SNIFFING_NOT_STARTED = 0,
    66   /**
    67    * BOM sniffing is ongoing, and the first byte of a UTF-16LE BOM has been
    68    * seen.
    69    */
    70   SEEN_UTF_16_LE_FIRST_BYTE = 1,
    72   /**
    73    * BOM sniffing is ongoing, and the first byte of a UTF-16BE BOM has been
    74    * seen.
    75    */
    76   SEEN_UTF_16_BE_FIRST_BYTE = 2,
    78   /**
    79    * BOM sniffing is ongoing, and the first byte of a UTF-8 BOM has been
    80    * seen.
    81    */
    82   SEEN_UTF_8_FIRST_BYTE = 3,
    84   /**
    85    * BOM sniffing is ongoing, and the first and second bytes of a UTF-8 BOM
    86    * have been seen.
    87    */
    88   SEEN_UTF_8_SECOND_BYTE = 4,
    90   /**
    91    * BOM sniffing was started but is now over for whatever reason.
    92    */
    93   BOM_SNIFFING_OVER = 5
    94 };
    96 enum eHtml5StreamState { // Stream life-cycle states; tracked in nsHtml5StreamParser::mStreamState.
    97   STREAM_NOT_STARTED = 0, // SetDocumentCharset() asserts that the stream is still in this state
    98   STREAM_BEING_READ = 1,
    99   STREAM_ENDED = 2
   100 };
   102 class nsHtml5StreamParser : public nsICharsetDetectionObserver { // Receives network data from nsHtml5StreamListener, sniffs its encoding and decodes it for the HTML5 tokenizer.
   104   friend class nsHtml5RequestStopper;
   105   friend class nsHtml5DataAvailable;
   106   friend class nsHtml5StreamParserContinuation;
   107   friend class nsHtml5TimerKungFu;
   109   public:
   110     NS_DECL_AND_IMPL_ZEROING_OPERATOR_NEW
   111     NS_DECL_CYCLE_COLLECTING_ISUPPORTS
   112     NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsHtml5StreamParser,
   113                                              nsICharsetDetectionObserver)
   115     static void InitializeStatics();
   117     nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
   118                         nsHtml5Parser* aOwner,
   119                         eParserMode aMode);
   121     virtual ~nsHtml5StreamParser();
   123     // Methods that nsHtml5StreamListener calls
   124     nsresult CheckListenerChain();
   126     nsresult OnStartRequest(nsIRequest* aRequest, nsISupports* aContext);
   128     nsresult OnDataAvailable(nsIRequest* aRequest,
   129                              nsISupports* aContext,
   130                              nsIInputStream* aInStream,
   131                              uint64_t aSourceOffset,
   132                              uint32_t aLength);
   134     nsresult OnStopRequest(nsIRequest* aRequest,
   135                            nsISupports* aContext,
   136                            nsresult status);
   138     // nsICharsetDetectionObserver
   139     /**
   140      * Chardet calls this to report the detection result.
   141      */
   142     NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf);
   144     // EncodingDeclarationHandler
   145     // http://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java
   146     /**
   147      * Tree builder uses this to report a late <meta charset>.
   148      */
   149     bool internalEncodingDeclaration(nsString* aEncoding);
   151     // Not from an external interface
   153     /**
   154      *  Call this method once you've created a parser, and want to instruct it
   155      *  about what charset to load. Main thread only; the stream must not have started.
   156      *
   157      *  @param   aCharset the charset of a document
   158      *  @param   aSource the source of the charset
   159      */
   160     inline void SetDocumentCharset(const nsACString& aCharset, int32_t aSource) {
   161       NS_PRECONDITION(mStreamState == STREAM_NOT_STARTED,
   162                       "SetDocumentCharset called too late.");
   163       NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
   164       mCharset = aCharset;
   165       mCharsetSource = aSource;
   166     }
   168     inline void SetObserver(nsIRequestObserver* aObserver) {
   169       NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
   170       mObserver = aObserver;
   171     }
   173     nsresult GetChannel(nsIChannel** aChannel);
   175     /**
   176      * The owner parser must call this after script execution
   177      * when no scripts are executing and the document.written
   178      * buffer has been exhausted.
   179      */
   180     void ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer, 
   181                               nsHtml5TreeBuilder* aTreeBuilder,
   182                               bool aLastWasCR);
   184     /**
   185      * Continues the stream parser if the charset switch failed.
   186      */
   187     void ContinueAfterFailedCharsetSwitch();
   189     void Terminate() // marks the parser as terminated while holding mTerminatedMutex
   190     {
   191       mozilla::MutexAutoLock autoLock(mTerminatedMutex);
   192       mTerminated = true;
   193     }
   195     void DropTimer();
   197     /**
   198      * Sets mCharset and mCharsetSource appropriately for the XML View Source
   199      * case if aEncoding names a supported rough ASCII superset and sets
   200      * the mCharset and mCharsetSource to the UTF-8 default otherwise.
   201      */
   202     void SetEncodingFromExpat(const char16_t* aEncoding);
   204     /**
   205      * Sets the URL for View Source title in case this parser ends up being
   206      * used for View Source. If aURL is a view-source: URL, takes the inner
   207      * URL. data: URLs are shown with an ellipsis instead of the actual data.
   208      */
   209     void SetViewSourceTitle(nsIURI* aURL);
   211   private:
   213 #ifdef DEBUG
   214     bool IsParserThread() { // debug-only: reports whether the caller is on mThread
   215       bool ret;
   216       mThread->IsOnCurrentThread(&ret);
   217       return ret;
   218     }
   219 #endif
   221     void MarkAsBroken(nsresult aRv);
   223     /**
   224      * Marks the stream parser as interrupted. If you ever add calls to this
   225      * method, be sure to review Uninterrupt usage very, very carefully to
   226      * avoid having a previous in-flight runnable cancel your Interrupt()
   227      * call on the other thread too soon.
   228      */
   229     void Interrupt()
   230     {
   231       mozilla::MutexAutoLock autoLock(mTerminatedMutex);
   232       mInterrupted = true;
   233     }
   235     void Uninterrupt()
   236     {
   237       NS_ASSERTION(IsParserThread(), "Wrong thread!");
   238       mTokenizerMutex.AssertCurrentThreadOwns();
   239       // Not acquiring mTerminatedMutex because mTokenizerMutex is already
   240       // held at this point and is already stronger.
   241       mInterrupted = false;      
   242     }
   244     /**
   245      * Flushes the tree ops from the tree builder and disarms the flush
   246      * timer.
   247      */
   248     void FlushTreeOpsAndDisarmTimer();
   250     void ParseAvailableData();
   252     void DoStopRequest();
   254     void DoDataAvailable(const uint8_t* aBuffer, uint32_t aLength);
   256     static NS_METHOD CopySegmentsToParser(nsIInputStream *aInStream,
   257                                           void *aClosure,
   258                                           const char *aFromSegment,
   259                                           uint32_t aToOffset,
   260                                           uint32_t aCount,
   261                                           uint32_t *aWriteCount);
   263     bool IsTerminatedOrInterrupted()
   264     {
   265       mozilla::MutexAutoLock autoLock(mTerminatedMutex);
   266       return mTerminated || mInterrupted;
   267     }
   269     bool IsTerminated()
   270     {
   271       mozilla::MutexAutoLock autoLock(mTerminatedMutex);
   272       return mTerminated;
   273     }
   275     /**
   276      * True when there is a Unicode decoder already.
   277      */
   278     inline bool HasDecoder()
   279     {
   280       return !!mUnicodeDecoder;
   281     }
   283     /**
   284      * Push bytes from network when there is no Unicode decoder yet.
   285      */
   286     nsresult SniffStreamBytes(const uint8_t* aFromSegment,
   287                               uint32_t aCount,
   288                               uint32_t* aWriteCount);
   290     /**
   291      * Push bytes from network when there is a Unicode decoder already.
   292      */
   293     nsresult WriteStreamBytes(const uint8_t* aFromSegment,
   294                               uint32_t aCount,
   295                               uint32_t* aWriteCount);
   297     /**
   298      * Check whether every other byte in the sniffing buffer is zero.
   299      */
   300     void SniffBOMlessUTF16BasicLatin(const uint8_t* aFromSegment,
   301                                      uint32_t aCountToSniffingLimit);
   303     /**
   304      * <meta charset> scan failed. Try chardet if applicable. After this, the
   305      * parser will have some encoding, even if only a last-resort fallback.
   306      *
   307      * @param aFromSegment The current network buffer or null if the sniffing
   308      *                     buffer is being flushed due to network stream ending.
   309      * @param aCount       The number of bytes in aFromSegment (ignored if
   310      *                     aFromSegment is null)
   311      * @param aWriteCount  Return value for how many bytes got read from the
   312      *                     buffer.
   313      * @param aCountToSniffingLimit The number of unfilled slots in
   314      *                              mSniffingBuffer
   315      */
   316     nsresult FinalizeSniffing(const uint8_t* aFromSegment,
   317                               uint32_t aCount,
   318                               uint32_t* aWriteCount,
   319                               uint32_t aCountToSniffingLimit);
   321     /**
   322      * Set up the Unicode decoder and write the sniffing buffer into it
   323      * followed by the current network buffer.
   324      *
   325      * @param aFromSegment The current network buffer or null if the sniffing
   326      *                     buffer is being flushed due to network stream ending.
   327      * @param aCount       The number of bytes in aFromSegment (ignored if
   328      *                     aFromSegment is null)
   329      * @param aWriteCount  Return value for how many bytes got read from the
   330      *                     buffer.
   331      */
   332     nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const uint8_t* aFromSegment,
   333                                                                   uint32_t aCount,
   334                                                                   uint32_t* aWriteCount);
   336     /**
   337      * Initialize the Unicode decoder, mark the BOM as the source and
   338      * drop the sniffer.
   339      *
   340      * @param aDecoderCharsetName The name for the decoder's charset
   341      *                            (UTF-16BE, UTF-16LE or UTF-8; the BOM has
   342      *                            been swallowed)
   343      */
   344     nsresult SetupDecodingFromBom(const char* aDecoderCharsetName);
   346     /**
   347      * Become confident or resolve an encoding name to its preferred form.
   348      * @param aEncoding the value of an internal encoding decl. Acts as an
   349      *                  out param, too, when the method returns true.
   350      * @return true if the parser needs to start using the new value of
   351      *         aEncoding and false if the parser became confident or if
   352      *         the encoding name did not specify a usable encoding
   353      */
   354     bool PreferredForInternalEncodingDecl(nsACString& aEncoding);
   356     /**
   357      * Callback for mFlushTimer.
   358      */
   359     static void TimerCallback(nsITimer* aTimer, void* aClosure);
   361     /**
   362      * Parser thread entry point for (maybe) flushing the ops and posting
   363      * a flush runnable back on the main thread.
   364      */
   365     void TimerFlush();
   367     nsCOMPtr<nsIRequest>          mRequest;
   368     nsCOMPtr<nsIRequestObserver>  mObserver;
   370     /**
   371      * The document title to use if this turns out to be a View Source parser.
   372      */
   373     nsCString                     mViewSourceTitle;
   375     /**
   376      * The Unicode decoder
   377      */
   378     nsCOMPtr<nsIUnicodeDecoder>   mUnicodeDecoder;
   380     /**
   381      * The buffer for sniffing the character encoding
   382      */
   383     nsAutoArrayPtr<uint8_t>       mSniffingBuffer;
   385     /**
   386      * The number of meaningful bytes in mSniffingBuffer
   387      */
   388     uint32_t                      mSniffingLength;
   390     /**
   391      * BOM sniffing state
   392      */
   393     eBomState                     mBomState;
   395     /**
   396      * <meta> prescan implementation
   397      */
   398     nsAutoPtr<nsHtml5MetaScanner> mMetaScanner;
   400     // encoding-related stuff
   401     /**
   402      * The source (confidence) of the character encoding in use
   403      */
   404     int32_t                       mCharsetSource;
   406     /**
   407      * The character encoding in use
   408      */
   409     nsCString                     mCharset;
   411     /**
   412      * Whether reparse is forbidden
   413      */
   414     bool                          mReparseForbidden;
   416     // Portable parser objects
   417     /**
   418      * The first buffer in the pending UTF-16 buffer queue
   419      */
   420     nsRefPtr<nsHtml5OwningUTF16Buffer> mFirstBuffer;
   422     /**
   423      * The last buffer in the pending UTF-16 buffer queue
   424      */
   425     nsHtml5OwningUTF16Buffer*     mLastBuffer; // weak ref; always points to
   426                       // a buffer of the size NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE
   428     /**
   429      * The tree operation executor
   430      */
   431     nsHtml5TreeOpExecutor*        mExecutor;
   433     /**
   434      * The HTML5 tree builder
   435      */
   436     nsAutoPtr<nsHtml5TreeBuilder> mTreeBuilder;
   438     /**
   439      * The HTML5 tokenizer
   440      */
   441     nsAutoPtr<nsHtml5Tokenizer>   mTokenizer;
   443     /**
   444      * Makes sure the main thread can't mess the tokenizer state while it's
   445      * tokenizing. This mutex also protects the current speculation.
   446      */
   447     mozilla::Mutex                mTokenizerMutex;
   449     /**
   450      * The scoped atom table
   451      */
   452     nsHtml5AtomTable              mAtomTable;
   454     /**
   455      * The owner parser.
   456      */
   457     nsRefPtr<nsHtml5Parser>       mOwner;
   459     /**
   460      * Whether the last character tokenized was a carriage return (for CRLF)
   461      */
   462     bool                          mLastWasCR;
   464     /**
   465      * For tracking stream life cycle
   466      */
   467     eHtml5StreamState             mStreamState;
   469     /**
   470      * Whether we are speculating.
   471      */
   472     bool                          mSpeculating;
   474     /**
   475      * Whether the tokenizer has reached EOF. (Reset when stream rewound.)
   476      */
   477     bool                          mAtEOF;
   479     /**
   480      * The speculations. The mutex protects the nsTArray itself.
   481      * To access the queue of current speculation, mTokenizerMutex must be
   482      * obtained.
   483      * The current speculation is the last element.
   484      */
   485     nsTArray<nsAutoPtr<nsHtml5Speculation> >  mSpeculations;
   486     mozilla::Mutex                            mSpeculationMutex;
   488     /**
   489      * True to terminate early; protected by mTerminatedMutex
   490      */
   491     bool                          mTerminated;
   492     bool                          mInterrupted; // set by Interrupt(), cleared by Uninterrupt()
   493     mozilla::Mutex                mTerminatedMutex;
   495     /**
   496      * The thread this stream parser runs on.
   497      */
   498     nsCOMPtr<nsIThread>           mThread;
   500     nsCOMPtr<nsIRunnable>         mExecutorFlusher;
   502     nsCOMPtr<nsIRunnable>         mLoadFlusher;
   504     /**
   505      * The chardet instance if chardet is enabled.
   506      */
   507     nsCOMPtr<nsICharsetDetector>  mChardet;
   509     /**
   510      * If false, don't push data to chardet.
   511      */
   512     bool                          mFeedChardet;
   514     /**
   515      * Whether the initial charset source was kCharsetFromParentFrame
   516      */
   517     bool                          mInitialEncodingWasFromParentFrame;
   519     /**
   520      * Timer for flushing tree ops once in a while when not speculating.
   521      */
   522     nsCOMPtr<nsITimer>            mFlushTimer;
   524     /**
   525      * Keeps track whether mFlushTimer has been armed. Unfortunately,
   526      * nsITimer doesn't enable querying this from the timer itself.
   527      */
   528     bool                          mFlushTimerArmed;
   530     /**
   531      * False initially and true after the timer has fired at least once.
   532      */
   533     bool                          mFlushTimerEverFired;
   535     /**
   536      * Whether the parser is doing a normal parse, view source or plain text.
   537      */
   538     eParserMode                   mMode;
   540     /**
   541      * The pref html5.flushtimer.initialdelay: Time in milliseconds between
   542      * the time a network buffer is seen and the timer firing when the
   543      * timer hasn't fired previously in this parse.
   544      */
   545     static int32_t                sTimerInitialDelay;
   547     /**
   548      * The pref html5.flushtimer.subsequentdelay: Time in milliseconds between
   549      * the time a network buffer is seen and the timer firing when the
   550      * timer has already fired previously in this parse.
   551      */
   552     static int32_t                sTimerSubsequentDelay;
   553 };
   555 #endif // nsHtml5StreamParser_h

mercurial