parser/html/nsHtml5StreamParser.h

branch
TOR_BUG_9701
changeset 14
925c144e1f1f
equal deleted inserted replaced
-1:000000000000 0:8105a0596228
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5
6 #ifndef nsHtml5StreamParser_h
7 #define nsHtml5StreamParser_h
8
9 #include "nsAutoPtr.h"
10 #include "nsCOMPtr.h"
11 #include "nsICharsetDetectionObserver.h"
12 #include "nsHtml5MetaScanner.h"
13 #include "nsIUnicodeDecoder.h"
14 #include "nsHtml5TreeOpExecutor.h"
15 #include "nsHtml5OwningUTF16Buffer.h"
16 #include "nsIInputStream.h"
17 #include "mozilla/Mutex.h"
18 #include "nsHtml5AtomTable.h"
19 #include "nsHtml5Speculation.h"
20 #include "nsITimer.h"
21 #include "nsICharsetDetector.h"
22
23 class nsHtml5Parser;
24
// Size of the buffers in the pending UTF-16 buffer queue; the stream
// parser's last queued buffer is documented as always having this size.
#define NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE 1024

// Size limit used while sniffing the character encoding (presumably the
// capacity of the stream parser's sniffing buffer -- confirm in the .cpp).
#define NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE 1024
27
/**
 * The ways in which a stream parser can be asked to treat a document.
 * The numeric values are the implicit ones (counting up from zero), spelled
 * out here for readability.
 */
enum eParserMode {
  /** Ordinary parse of the document as HTML. */
  NORMAL = 0,

  /** Show the document as HTML source. */
  VIEW_SOURCE_HTML = 1,

  /** Show the document as XML source. */
  VIEW_SOURCE_XML = 2,

  /** Show the document as plain text source. */
  VIEW_SOURCE_PLAIN = 3,

  /** Show the document as plain text. */
  PLAIN_TEXT = 4,

  /** Load the document as data (XHR). */
  LOAD_AS_DATA = 5
};
59
/**
 * Progress of the byte order mark (BOM) sniffing performed on the first
 * bytes of the stream.
 */
enum eBomState {
  /** No BOM sniffing has taken place yet. */
  BOM_SNIFFING_NOT_STARTED = 0,

  /** Sniffing in progress: the first byte of a UTF-16LE BOM was seen. */
  SEEN_UTF_16_LE_FIRST_BYTE = 1,

  /** Sniffing in progress: the first byte of a UTF-16BE BOM was seen. */
  SEEN_UTF_16_BE_FIRST_BYTE = 2,

  /** Sniffing in progress: the first byte of a UTF-8 BOM was seen. */
  SEEN_UTF_8_FIRST_BYTE = 3,

  /**
   * Sniffing in progress: both the first and the second byte of a UTF-8 BOM
   * were seen.
   */
  SEEN_UTF_8_SECOND_BYTE = 4,

  /** Sniffing had been started and has since finished, for any reason. */
  BOM_SNIFFING_OVER = 5
};
95
/**
 * Life cycle of the network stream as tracked by the stream parser.
 */
enum eHtml5StreamState {
  /** The stream has not started yet. */
  STREAM_NOT_STARTED = 0,

  /** The stream is currently being read. */
  STREAM_BEING_READ = 1,

  /** The stream has ended. */
  STREAM_ENDED = 2
};
101
102 class nsHtml5StreamParser : public nsICharsetDetectionObserver {
103
104 friend class nsHtml5RequestStopper;
105 friend class nsHtml5DataAvailable;
106 friend class nsHtml5StreamParserContinuation;
107 friend class nsHtml5TimerKungFu;
108
109 public:
110 NS_DECL_AND_IMPL_ZEROING_OPERATOR_NEW
111 NS_DECL_CYCLE_COLLECTING_ISUPPORTS
112 NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsHtml5StreamParser,
113 nsICharsetDetectionObserver)
114
115 static void InitializeStatics();
116
117 nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
118 nsHtml5Parser* aOwner,
119 eParserMode aMode);
120
121 virtual ~nsHtml5StreamParser();
122
123 // Methods that nsHtml5StreamListener calls
124 nsresult CheckListenerChain();
125
126 nsresult OnStartRequest(nsIRequest* aRequest, nsISupports* aContext);
127
128 nsresult OnDataAvailable(nsIRequest* aRequest,
129 nsISupports* aContext,
130 nsIInputStream* aInStream,
131 uint64_t aSourceOffset,
132 uint32_t aLength);
133
134 nsresult OnStopRequest(nsIRequest* aRequest,
135 nsISupports* aContext,
136 nsresult status);
137
138 // nsICharsetDetectionObserver
139 /**
140 * Chardet calls this to report the detection result
141 */
142 NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf);
143
144 // EncodingDeclarationHandler
145 // http://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java
146 /**
147 * Tree builder uses this to report a late <meta charset>
148 */
149 bool internalEncodingDeclaration(nsString* aEncoding);
150
151 // Not from an external interface
152
153 /**
154 * Call this method once you've created a parser, and want to instruct it
155 * about what charset to load
156 *
157 * @param aCharset the charset of a document
158 * @param aCharsetSource the source of the charset
159 */
160 inline void SetDocumentCharset(const nsACString& aCharset, int32_t aSource) {
161 NS_PRECONDITION(mStreamState == STREAM_NOT_STARTED,
162 "SetDocumentCharset called too late.");
163 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
164 mCharset = aCharset;
165 mCharsetSource = aSource;
166 }
167
168 inline void SetObserver(nsIRequestObserver* aObserver) {
169 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
170 mObserver = aObserver;
171 }
172
173 nsresult GetChannel(nsIChannel** aChannel);
174
175 /**
176 * The owner parser must call this after script execution
177 * when no scripts are executing and the document.written
178 * buffer has been exhausted.
179 */
180 void ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer,
181 nsHtml5TreeBuilder* aTreeBuilder,
182 bool aLastWasCR);
183
184 /**
185 * Continues the stream parser if the charset switch failed.
186 */
187 void ContinueAfterFailedCharsetSwitch();
188
189 void Terminate()
190 {
191 mozilla::MutexAutoLock autoLock(mTerminatedMutex);
192 mTerminated = true;
193 }
194
195 void DropTimer();
196
197 /**
198 * Sets mCharset and mCharsetSource appropriately for the XML View Source
199 * case if aEncoding names a supported rough ASCII superset and sets
200 * the mCharset and mCharsetSource to the UTF-8 default otherwise.
201 */
202 void SetEncodingFromExpat(const char16_t* aEncoding);
203
204 /**
205 * Sets the URL for View Source title in case this parser ends up being
206 * used for View Source. If aURL is a view-source: URL, takes the inner
207 * URL. data: URLs are shown with an ellipsis instead of the actual data.
208 */
209 void SetViewSourceTitle(nsIURI* aURL);
210
211 private:
212
213 #ifdef DEBUG
214 bool IsParserThread() {
215 bool ret;
216 mThread->IsOnCurrentThread(&ret);
217 return ret;
218 }
219 #endif
220
221 void MarkAsBroken(nsresult aRv);
222
223 /**
224 * Marks the stream parser as interrupted. If you ever add calls to this
225 * method, be sure to review Uninterrupt usage very, very carefully to
226 * avoid having a previous in-flight runnable cancel your Interrupt()
227 * call on the other thread too soon.
228 */
229 void Interrupt()
230 {
231 mozilla::MutexAutoLock autoLock(mTerminatedMutex);
232 mInterrupted = true;
233 }
234
235 void Uninterrupt()
236 {
237 NS_ASSERTION(IsParserThread(), "Wrong thread!");
238 mTokenizerMutex.AssertCurrentThreadOwns();
239 // Not acquiring mTerminatedMutex because mTokenizerMutex is already
240 // held at this point and is already stronger.
241 mInterrupted = false;
242 }
243
244 /**
245 * Flushes the tree ops from the tree builder and disarms the flush
246 * timer.
247 */
248 void FlushTreeOpsAndDisarmTimer();
249
250 void ParseAvailableData();
251
252 void DoStopRequest();
253
254 void DoDataAvailable(const uint8_t* aBuffer, uint32_t aLength);
255
256 static NS_METHOD CopySegmentsToParser(nsIInputStream *aInStream,
257 void *aClosure,
258 const char *aFromSegment,
259 uint32_t aToOffset,
260 uint32_t aCount,
261 uint32_t *aWriteCount);
262
263 bool IsTerminatedOrInterrupted()
264 {
265 mozilla::MutexAutoLock autoLock(mTerminatedMutex);
266 return mTerminated || mInterrupted;
267 }
268
269 bool IsTerminated()
270 {
271 mozilla::MutexAutoLock autoLock(mTerminatedMutex);
272 return mTerminated;
273 }
274
275 /**
276 * True when there is a Unicode decoder already
277 */
278 inline bool HasDecoder()
279 {
280 return !!mUnicodeDecoder;
281 }
282
283 /**
284 * Push bytes from network when there is no Unicode decoder yet
285 */
286 nsresult SniffStreamBytes(const uint8_t* aFromSegment,
287 uint32_t aCount,
288 uint32_t* aWriteCount);
289
290 /**
291 * Push bytes from network when there is a Unicode decoder already
292 */
293 nsresult WriteStreamBytes(const uint8_t* aFromSegment,
294 uint32_t aCount,
295 uint32_t* aWriteCount);
296
297 /**
298 * Check whether every other byte in the sniffing buffer is zero.
299 */
300 void SniffBOMlessUTF16BasicLatin(const uint8_t* aFromSegment,
301 uint32_t aCountToSniffingLimit);
302
303 /**
304 * <meta charset> scan failed. Try chardet if applicable. After this, the
305 * the parser will have some encoding even if a last resolt fallback.
306 *
307 * @param aFromSegment The current network buffer or null if the sniffing
308 * buffer is being flushed due to network stream ending.
309 * @param aCount The number of bytes in aFromSegment (ignored if
310 * aFromSegment is null)
311 * @param aWriteCount Return value for how many bytes got read from the
312 * buffer.
313 * @param aCountToSniffingLimit The number of unfilled slots in
314 * mSniffingBuffer
315 */
316 nsresult FinalizeSniffing(const uint8_t* aFromSegment,
317 uint32_t aCount,
318 uint32_t* aWriteCount,
319 uint32_t aCountToSniffingLimit);
320
321 /**
322 * Set up the Unicode decoder and write the sniffing buffer into it
323 * followed by the current network buffer.
324 *
325 * @param aFromSegment The current network buffer or null if the sniffing
326 * buffer is being flushed due to network stream ending.
327 * @param aCount The number of bytes in aFromSegment (ignored if
328 * aFromSegment is null)
329 * @param aWriteCount Return value for how many bytes got read from the
330 * buffer.
331 */
332 nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const uint8_t* aFromSegment,
333 uint32_t aCount,
334 uint32_t* aWriteCount);
335
336 /**
337 * Initialize the Unicode decoder, mark the BOM as the source and
338 * drop the sniffer.
339 *
340 * @param aDecoderCharsetName The name for the decoder's charset
341 * (UTF-16BE, UTF-16LE or UTF-8; the BOM has
342 * been swallowed)
343 */
344 nsresult SetupDecodingFromBom(const char* aDecoderCharsetName);
345
346 /**
347 * Become confident or resolve and encoding name to its preferred form.
348 * @param aEncoding the value of an internal encoding decl. Acts as an
349 * out param, too, when the method returns true.
350 * @return true if the parser needs to start using the new value of
351 * aEncoding and false if the parser became confident or if
352 * the encoding name did not specify a usable encoding
353 */
354 bool PreferredForInternalEncodingDecl(nsACString& aEncoding);
355
356 /**
357 * Callback for mFlushTimer.
358 */
359 static void TimerCallback(nsITimer* aTimer, void* aClosure);
360
361 /**
362 * Parser thread entry point for (maybe) flushing the ops and posting
363 * a flush runnable back on the main thread.
364 */
365 void TimerFlush();
366
367 nsCOMPtr<nsIRequest> mRequest;
368 nsCOMPtr<nsIRequestObserver> mObserver;
369
370 /**
371 * The document title to use if this turns out to be a View Source parser.
372 */
373 nsCString mViewSourceTitle;
374
375 /**
376 * The Unicode decoder
377 */
378 nsCOMPtr<nsIUnicodeDecoder> mUnicodeDecoder;
379
380 /**
381 * The buffer for sniffing the character encoding
382 */
383 nsAutoArrayPtr<uint8_t> mSniffingBuffer;
384
385 /**
386 * The number of meaningful bytes in mSniffingBuffer
387 */
388 uint32_t mSniffingLength;
389
390 /**
391 * BOM sniffing state
392 */
393 eBomState mBomState;
394
395 /**
396 * <meta> prescan implementation
397 */
398 nsAutoPtr<nsHtml5MetaScanner> mMetaScanner;
399
400 // encoding-related stuff
401 /**
402 * The source (confidence) of the character encoding in use
403 */
404 int32_t mCharsetSource;
405
406 /**
407 * The character encoding in use
408 */
409 nsCString mCharset;
410
411 /**
412 * Whether reparse is forbidden
413 */
414 bool mReparseForbidden;
415
416 // Portable parser objects
417 /**
418 * The first buffer in the pending UTF-16 buffer queue
419 */
420 nsRefPtr<nsHtml5OwningUTF16Buffer> mFirstBuffer;
421
422 /**
423 * The last buffer in the pending UTF-16 buffer queue
424 */
425 nsHtml5OwningUTF16Buffer* mLastBuffer; // weak ref; always points to
426 // a buffer of the size NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE
427
428 /**
429 * The tree operation executor
430 */
431 nsHtml5TreeOpExecutor* mExecutor;
432
433 /**
434 * The HTML5 tree builder
435 */
436 nsAutoPtr<nsHtml5TreeBuilder> mTreeBuilder;
437
438 /**
439 * The HTML5 tokenizer
440 */
441 nsAutoPtr<nsHtml5Tokenizer> mTokenizer;
442
443 /**
444 * Makes sure the main thread can't mess the tokenizer state while it's
445 * tokenizing. This mutex also protects the current speculation.
446 */
447 mozilla::Mutex mTokenizerMutex;
448
449 /**
450 * The scoped atom table
451 */
452 nsHtml5AtomTable mAtomTable;
453
454 /**
455 * The owner parser.
456 */
457 nsRefPtr<nsHtml5Parser> mOwner;
458
459 /**
460 * Whether the last character tokenized was a carriage return (for CRLF)
461 */
462 bool mLastWasCR;
463
464 /**
465 * For tracking stream life cycle
466 */
467 eHtml5StreamState mStreamState;
468
469 /**
470 * Whether we are speculating.
471 */
472 bool mSpeculating;
473
474 /**
475 * Whether the tokenizer has reached EOF. (Reset when stream rewinded.)
476 */
477 bool mAtEOF;
478
479 /**
480 * The speculations. The mutex protects the nsTArray itself.
481 * To access the queue of current speculation, mTokenizerMutex must be
482 * obtained.
483 * The current speculation is the last element
484 */
485 nsTArray<nsAutoPtr<nsHtml5Speculation> > mSpeculations;
486 mozilla::Mutex mSpeculationMutex;
487
488 /**
489 * True to terminate early; protected by mTerminatedMutex
490 */
491 bool mTerminated;
492 bool mInterrupted;
493 mozilla::Mutex mTerminatedMutex;
494
495 /**
496 * The thread this stream parser runs on.
497 */
498 nsCOMPtr<nsIThread> mThread;
499
500 nsCOMPtr<nsIRunnable> mExecutorFlusher;
501
502 nsCOMPtr<nsIRunnable> mLoadFlusher;
503
504 /**
505 * The chardet instance if chardet is enabled.
506 */
507 nsCOMPtr<nsICharsetDetector> mChardet;
508
509 /**
510 * If false, don't push data to chardet.
511 */
512 bool mFeedChardet;
513
514 /**
515 * Whether the initial charset source was kCharsetFromParentFrame
516 */
517 bool mInitialEncodingWasFromParentFrame;
518
519 /**
520 * Timer for flushing tree ops once in a while when not speculating.
521 */
522 nsCOMPtr<nsITimer> mFlushTimer;
523
524 /**
525 * Keeps track whether mFlushTimer has been armed. Unfortunately,
526 * nsITimer doesn't enable querying this from the timer itself.
527 */
528 bool mFlushTimerArmed;
529
530 /**
531 * False initially and true after the timer has fired at least once.
532 */
533 bool mFlushTimerEverFired;
534
535 /**
536 * Whether the parser is doing a normal parse, view source or plain text.
537 */
538 eParserMode mMode;
539
540 /**
541 * The pref html5.flushtimer.initialdelay: Time in milliseconds between
542 * the time a network buffer is seen and the timer firing when the
543 * timer hasn't fired previously in this parse.
544 */
545 static int32_t sTimerInitialDelay;
546
547 /**
548 * The pref html5.flushtimer.subsequentdelay: Time in milliseconds between
549 * the time a network buffer is seen and the timer firing when the
550 * timer has already fired previously in this parse.
551 */
552 static int32_t sTimerSubsequentDelay;
553 };
554
555 #endif // nsHtml5StreamParser_h

mercurial