|
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
5 |
|
6 #ifndef nsHtml5StreamParser_h |
|
7 #define nsHtml5StreamParser_h |
|
8 |
|
9 #include "nsAutoPtr.h" |
|
10 #include "nsCOMPtr.h" |
|
11 #include "nsICharsetDetectionObserver.h" |
|
12 #include "nsHtml5MetaScanner.h" |
|
13 #include "nsIUnicodeDecoder.h" |
|
14 #include "nsHtml5TreeOpExecutor.h" |
|
15 #include "nsHtml5OwningUTF16Buffer.h" |
|
16 #include "nsIInputStream.h" |
|
17 #include "mozilla/Mutex.h" |
|
18 #include "nsHtml5AtomTable.h" |
|
19 #include "nsHtml5Speculation.h" |
|
20 #include "nsITimer.h" |
|
21 #include "nsICharsetDetector.h" |
|
22 |
|
// Forward declaration of the owning parser. This header only refers to it
// through a raw pointer parameter and an nsRefPtr member.
class nsHtml5Parser;

/**
 * Size of the buffers in the pending UTF-16 buffer queue of the stream
 * parser (see the note on nsHtml5StreamParser::mLastBuffer).
 * NOTE(review): presumably counted in UTF-16 code units -- confirm against
 * the allocation sites in the .cpp file.
 */
#define NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE 1024

/**
 * Size constant associated with the character encoding sniffing buffer.
 * NOTE(review): presumably the capacity of
 * nsHtml5StreamParser::mSniffingBuffer in bytes -- confirm against the
 * allocation site in the .cpp file.
 */
#define NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE 1024
|
27 |
|
/**
 * The mode a stream parser operates in. The enumerators carry the implicit
 * values 0..5 in declaration order.
 */
enum eParserMode {
  /**
   * Parse a document normally as HTML.
   */
  NORMAL,

  /**
   * View document as HTML source.
   */
  VIEW_SOURCE_HTML,

  /**
   * View document as XML source.
   */
  VIEW_SOURCE_XML,

  /**
   * View document as plain text source.
   */
  VIEW_SOURCE_PLAIN,

  /**
   * View document as plain text.
   */
  PLAIN_TEXT,

  /**
   * Load as data (XHR).
   */
  LOAD_AS_DATA
};
|
59 |
|
/**
 * States of the byte order mark (BOM) sniffer. The numeric values are
 * spelled out explicitly and must not change.
 */
enum eBomState {
  /**
   * BOM sniffing hasn't started.
   */
  BOM_SNIFFING_NOT_STARTED = 0,

  /**
   * BOM sniffing is ongoing, and the first byte of a UTF-16LE BOM has been
   * seen.
   */
  SEEN_UTF_16_LE_FIRST_BYTE = 1,

  /**
   * BOM sniffing is ongoing, and the first byte of a UTF-16BE BOM has been
   * seen.
   */
  SEEN_UTF_16_BE_FIRST_BYTE = 2,

  /**
   * BOM sniffing is ongoing, and the first byte of a UTF-8 BOM has been
   * seen.
   */
  SEEN_UTF_8_FIRST_BYTE = 3,

  /**
   * BOM sniffing is ongoing, and the first and second bytes of a UTF-8 BOM
   * have been seen.
   */
  SEEN_UTF_8_SECOND_BYTE = 4,

  /**
   * BOM sniffing was started but is now over for whatever reason.
   */
  BOM_SNIFFING_OVER = 5
};
|
95 |
|
/**
 * Life cycle of the network stream feeding the stream parser. The numeric
 * values are spelled out explicitly and must not change.
 */
enum eHtml5StreamState {
  /**
   * The stream has not started yet. nsHtml5StreamParser::SetDocumentCharset
   * requires this state.
   */
  STREAM_NOT_STARTED = 0,

  /**
   * The stream is being read.
   * NOTE(review): presumably entered on OnStartRequest -- confirm in the
   * .cpp file.
   */
  STREAM_BEING_READ = 1,

  /**
   * The stream has ended.
   * NOTE(review): presumably entered on OnStopRequest -- confirm in the
   * .cpp file.
   */
  STREAM_ENDED = 2
};
|
101 |
|
102 class nsHtml5StreamParser : public nsICharsetDetectionObserver { |
|
103 |
|
104 friend class nsHtml5RequestStopper; |
|
105 friend class nsHtml5DataAvailable; |
|
106 friend class nsHtml5StreamParserContinuation; |
|
107 friend class nsHtml5TimerKungFu; |
|
108 |
|
109 public: |
|
110 NS_DECL_AND_IMPL_ZEROING_OPERATOR_NEW |
|
111 NS_DECL_CYCLE_COLLECTING_ISUPPORTS |
|
112 NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsHtml5StreamParser, |
|
113 nsICharsetDetectionObserver) |
|
114 |
|
115 static void InitializeStatics(); |
|
116 |
|
117 nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor, |
|
118 nsHtml5Parser* aOwner, |
|
119 eParserMode aMode); |
|
120 |
|
121 virtual ~nsHtml5StreamParser(); |
|
122 |
|
123 // Methods that nsHtml5StreamListener calls |
|
124 nsresult CheckListenerChain(); |
|
125 |
|
126 nsresult OnStartRequest(nsIRequest* aRequest, nsISupports* aContext); |
|
127 |
|
128 nsresult OnDataAvailable(nsIRequest* aRequest, |
|
129 nsISupports* aContext, |
|
130 nsIInputStream* aInStream, |
|
131 uint64_t aSourceOffset, |
|
132 uint32_t aLength); |
|
133 |
|
134 nsresult OnStopRequest(nsIRequest* aRequest, |
|
135 nsISupports* aContext, |
|
136 nsresult status); |
|
137 |
|
138 // nsICharsetDetectionObserver |
|
139 /** |
|
140 * Chardet calls this to report the detection result |
|
141 */ |
|
142 NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf); |
|
143 |
|
144 // EncodingDeclarationHandler |
|
145 // http://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java |
|
146 /** |
|
147 * Tree builder uses this to report a late <meta charset> |
|
148 */ |
|
149 bool internalEncodingDeclaration(nsString* aEncoding); |
|
150 |
|
151 // Not from an external interface |
|
152 |
|
153 /** |
|
154 * Call this method once you've created a parser, and want to instruct it |
|
155 * about what charset to load |
|
156 * |
|
157 * @param aCharset the charset of a document |
|
158 * @param aCharsetSource the source of the charset |
|
159 */ |
|
160 inline void SetDocumentCharset(const nsACString& aCharset, int32_t aSource) { |
|
161 NS_PRECONDITION(mStreamState == STREAM_NOT_STARTED, |
|
162 "SetDocumentCharset called too late."); |
|
163 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!"); |
|
164 mCharset = aCharset; |
|
165 mCharsetSource = aSource; |
|
166 } |
|
167 |
|
168 inline void SetObserver(nsIRequestObserver* aObserver) { |
|
169 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!"); |
|
170 mObserver = aObserver; |
|
171 } |
|
172 |
|
173 nsresult GetChannel(nsIChannel** aChannel); |
|
174 |
|
175 /** |
|
176 * The owner parser must call this after script execution |
|
177 * when no scripts are executing and the document.written |
|
178 * buffer has been exhausted. |
|
179 */ |
|
180 void ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer, |
|
181 nsHtml5TreeBuilder* aTreeBuilder, |
|
182 bool aLastWasCR); |
|
183 |
|
184 /** |
|
185 * Continues the stream parser if the charset switch failed. |
|
186 */ |
|
187 void ContinueAfterFailedCharsetSwitch(); |
|
188 |
|
189 void Terminate() |
|
190 { |
|
191 mozilla::MutexAutoLock autoLock(mTerminatedMutex); |
|
192 mTerminated = true; |
|
193 } |
|
194 |
|
195 void DropTimer(); |
|
196 |
|
197 /** |
|
198 * Sets mCharset and mCharsetSource appropriately for the XML View Source |
|
199 * case if aEncoding names a supported rough ASCII superset and sets |
|
200 * the mCharset and mCharsetSource to the UTF-8 default otherwise. |
|
201 */ |
|
202 void SetEncodingFromExpat(const char16_t* aEncoding); |
|
203 |
|
204 /** |
|
205 * Sets the URL for View Source title in case this parser ends up being |
|
206 * used for View Source. If aURL is a view-source: URL, takes the inner |
|
207 * URL. data: URLs are shown with an ellipsis instead of the actual data. |
|
208 */ |
|
209 void SetViewSourceTitle(nsIURI* aURL); |
|
210 |
|
211 private: |
|
212 |
|
213 #ifdef DEBUG |
|
214 bool IsParserThread() { |
|
215 bool ret; |
|
216 mThread->IsOnCurrentThread(&ret); |
|
217 return ret; |
|
218 } |
|
219 #endif |
|
220 |
|
221 void MarkAsBroken(nsresult aRv); |
|
222 |
|
223 /** |
|
224 * Marks the stream parser as interrupted. If you ever add calls to this |
|
225 * method, be sure to review Uninterrupt usage very, very carefully to |
|
226 * avoid having a previous in-flight runnable cancel your Interrupt() |
|
227 * call on the other thread too soon. |
|
228 */ |
|
229 void Interrupt() |
|
230 { |
|
231 mozilla::MutexAutoLock autoLock(mTerminatedMutex); |
|
232 mInterrupted = true; |
|
233 } |
|
234 |
|
235 void Uninterrupt() |
|
236 { |
|
237 NS_ASSERTION(IsParserThread(), "Wrong thread!"); |
|
238 mTokenizerMutex.AssertCurrentThreadOwns(); |
|
239 // Not acquiring mTerminatedMutex because mTokenizerMutex is already |
|
240 // held at this point and is already stronger. |
|
241 mInterrupted = false; |
|
242 } |
|
243 |
|
244 /** |
|
245 * Flushes the tree ops from the tree builder and disarms the flush |
|
246 * timer. |
|
247 */ |
|
248 void FlushTreeOpsAndDisarmTimer(); |
|
249 |
|
250 void ParseAvailableData(); |
|
251 |
|
252 void DoStopRequest(); |
|
253 |
|
254 void DoDataAvailable(const uint8_t* aBuffer, uint32_t aLength); |
|
255 |
|
256 static NS_METHOD CopySegmentsToParser(nsIInputStream *aInStream, |
|
257 void *aClosure, |
|
258 const char *aFromSegment, |
|
259 uint32_t aToOffset, |
|
260 uint32_t aCount, |
|
261 uint32_t *aWriteCount); |
|
262 |
|
263 bool IsTerminatedOrInterrupted() |
|
264 { |
|
265 mozilla::MutexAutoLock autoLock(mTerminatedMutex); |
|
266 return mTerminated || mInterrupted; |
|
267 } |
|
268 |
|
269 bool IsTerminated() |
|
270 { |
|
271 mozilla::MutexAutoLock autoLock(mTerminatedMutex); |
|
272 return mTerminated; |
|
273 } |
|
274 |
|
275 /** |
|
276 * True when there is a Unicode decoder already |
|
277 */ |
|
278 inline bool HasDecoder() |
|
279 { |
|
280 return !!mUnicodeDecoder; |
|
281 } |
|
282 |
|
283 /** |
|
284 * Push bytes from network when there is no Unicode decoder yet |
|
285 */ |
|
286 nsresult SniffStreamBytes(const uint8_t* aFromSegment, |
|
287 uint32_t aCount, |
|
288 uint32_t* aWriteCount); |
|
289 |
|
290 /** |
|
291 * Push bytes from network when there is a Unicode decoder already |
|
292 */ |
|
293 nsresult WriteStreamBytes(const uint8_t* aFromSegment, |
|
294 uint32_t aCount, |
|
295 uint32_t* aWriteCount); |
|
296 |
|
297 /** |
|
298 * Check whether every other byte in the sniffing buffer is zero. |
|
299 */ |
|
300 void SniffBOMlessUTF16BasicLatin(const uint8_t* aFromSegment, |
|
301 uint32_t aCountToSniffingLimit); |
|
302 |
|
303 /** |
|
304 * <meta charset> scan failed. Try chardet if applicable. After this, the |
|
305 * the parser will have some encoding even if a last resolt fallback. |
|
306 * |
|
307 * @param aFromSegment The current network buffer or null if the sniffing |
|
308 * buffer is being flushed due to network stream ending. |
|
309 * @param aCount The number of bytes in aFromSegment (ignored if |
|
310 * aFromSegment is null) |
|
311 * @param aWriteCount Return value for how many bytes got read from the |
|
312 * buffer. |
|
313 * @param aCountToSniffingLimit The number of unfilled slots in |
|
314 * mSniffingBuffer |
|
315 */ |
|
316 nsresult FinalizeSniffing(const uint8_t* aFromSegment, |
|
317 uint32_t aCount, |
|
318 uint32_t* aWriteCount, |
|
319 uint32_t aCountToSniffingLimit); |
|
320 |
|
321 /** |
|
322 * Set up the Unicode decoder and write the sniffing buffer into it |
|
323 * followed by the current network buffer. |
|
324 * |
|
325 * @param aFromSegment The current network buffer or null if the sniffing |
|
326 * buffer is being flushed due to network stream ending. |
|
327 * @param aCount The number of bytes in aFromSegment (ignored if |
|
328 * aFromSegment is null) |
|
329 * @param aWriteCount Return value for how many bytes got read from the |
|
330 * buffer. |
|
331 */ |
|
332 nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const uint8_t* aFromSegment, |
|
333 uint32_t aCount, |
|
334 uint32_t* aWriteCount); |
|
335 |
|
336 /** |
|
337 * Initialize the Unicode decoder, mark the BOM as the source and |
|
338 * drop the sniffer. |
|
339 * |
|
340 * @param aDecoderCharsetName The name for the decoder's charset |
|
341 * (UTF-16BE, UTF-16LE or UTF-8; the BOM has |
|
342 * been swallowed) |
|
343 */ |
|
344 nsresult SetupDecodingFromBom(const char* aDecoderCharsetName); |
|
345 |
|
346 /** |
|
347 * Become confident or resolve and encoding name to its preferred form. |
|
348 * @param aEncoding the value of an internal encoding decl. Acts as an |
|
349 * out param, too, when the method returns true. |
|
350 * @return true if the parser needs to start using the new value of |
|
351 * aEncoding and false if the parser became confident or if |
|
352 * the encoding name did not specify a usable encoding |
|
353 */ |
|
354 bool PreferredForInternalEncodingDecl(nsACString& aEncoding); |
|
355 |
|
356 /** |
|
357 * Callback for mFlushTimer. |
|
358 */ |
|
359 static void TimerCallback(nsITimer* aTimer, void* aClosure); |
|
360 |
|
361 /** |
|
362 * Parser thread entry point for (maybe) flushing the ops and posting |
|
363 * a flush runnable back on the main thread. |
|
364 */ |
|
365 void TimerFlush(); |
|
366 |
|
367 nsCOMPtr<nsIRequest> mRequest; |
|
368 nsCOMPtr<nsIRequestObserver> mObserver; |
|
369 |
|
370 /** |
|
371 * The document title to use if this turns out to be a View Source parser. |
|
372 */ |
|
373 nsCString mViewSourceTitle; |
|
374 |
|
375 /** |
|
376 * The Unicode decoder |
|
377 */ |
|
378 nsCOMPtr<nsIUnicodeDecoder> mUnicodeDecoder; |
|
379 |
|
380 /** |
|
381 * The buffer for sniffing the character encoding |
|
382 */ |
|
383 nsAutoArrayPtr<uint8_t> mSniffingBuffer; |
|
384 |
|
385 /** |
|
386 * The number of meaningful bytes in mSniffingBuffer |
|
387 */ |
|
388 uint32_t mSniffingLength; |
|
389 |
|
390 /** |
|
391 * BOM sniffing state |
|
392 */ |
|
393 eBomState mBomState; |
|
394 |
|
395 /** |
|
396 * <meta> prescan implementation |
|
397 */ |
|
398 nsAutoPtr<nsHtml5MetaScanner> mMetaScanner; |
|
399 |
|
400 // encoding-related stuff |
|
401 /** |
|
402 * The source (confidence) of the character encoding in use |
|
403 */ |
|
404 int32_t mCharsetSource; |
|
405 |
|
406 /** |
|
407 * The character encoding in use |
|
408 */ |
|
409 nsCString mCharset; |
|
410 |
|
411 /** |
|
412 * Whether reparse is forbidden |
|
413 */ |
|
414 bool mReparseForbidden; |
|
415 |
|
416 // Portable parser objects |
|
417 /** |
|
418 * The first buffer in the pending UTF-16 buffer queue |
|
419 */ |
|
420 nsRefPtr<nsHtml5OwningUTF16Buffer> mFirstBuffer; |
|
421 |
|
422 /** |
|
423 * The last buffer in the pending UTF-16 buffer queue |
|
424 */ |
|
425 nsHtml5OwningUTF16Buffer* mLastBuffer; // weak ref; always points to |
|
426 // a buffer of the size NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE |
|
427 |
|
428 /** |
|
429 * The tree operation executor |
|
430 */ |
|
431 nsHtml5TreeOpExecutor* mExecutor; |
|
432 |
|
433 /** |
|
434 * The HTML5 tree builder |
|
435 */ |
|
436 nsAutoPtr<nsHtml5TreeBuilder> mTreeBuilder; |
|
437 |
|
438 /** |
|
439 * The HTML5 tokenizer |
|
440 */ |
|
441 nsAutoPtr<nsHtml5Tokenizer> mTokenizer; |
|
442 |
|
443 /** |
|
444 * Makes sure the main thread can't mess the tokenizer state while it's |
|
445 * tokenizing. This mutex also protects the current speculation. |
|
446 */ |
|
447 mozilla::Mutex mTokenizerMutex; |
|
448 |
|
449 /** |
|
450 * The scoped atom table |
|
451 */ |
|
452 nsHtml5AtomTable mAtomTable; |
|
453 |
|
454 /** |
|
455 * The owner parser. |
|
456 */ |
|
457 nsRefPtr<nsHtml5Parser> mOwner; |
|
458 |
|
459 /** |
|
460 * Whether the last character tokenized was a carriage return (for CRLF) |
|
461 */ |
|
462 bool mLastWasCR; |
|
463 |
|
464 /** |
|
465 * For tracking stream life cycle |
|
466 */ |
|
467 eHtml5StreamState mStreamState; |
|
468 |
|
469 /** |
|
470 * Whether we are speculating. |
|
471 */ |
|
472 bool mSpeculating; |
|
473 |
|
474 /** |
|
475 * Whether the tokenizer has reached EOF. (Reset when stream rewinded.) |
|
476 */ |
|
477 bool mAtEOF; |
|
478 |
|
479 /** |
|
480 * The speculations. The mutex protects the nsTArray itself. |
|
481 * To access the queue of current speculation, mTokenizerMutex must be |
|
482 * obtained. |
|
483 * The current speculation is the last element |
|
484 */ |
|
485 nsTArray<nsAutoPtr<nsHtml5Speculation> > mSpeculations; |
|
486 mozilla::Mutex mSpeculationMutex; |
|
487 |
|
488 /** |
|
489 * True to terminate early; protected by mTerminatedMutex |
|
490 */ |
|
491 bool mTerminated; |
|
492 bool mInterrupted; |
|
493 mozilla::Mutex mTerminatedMutex; |
|
494 |
|
495 /** |
|
496 * The thread this stream parser runs on. |
|
497 */ |
|
498 nsCOMPtr<nsIThread> mThread; |
|
499 |
|
500 nsCOMPtr<nsIRunnable> mExecutorFlusher; |
|
501 |
|
502 nsCOMPtr<nsIRunnable> mLoadFlusher; |
|
503 |
|
504 /** |
|
505 * The chardet instance if chardet is enabled. |
|
506 */ |
|
507 nsCOMPtr<nsICharsetDetector> mChardet; |
|
508 |
|
509 /** |
|
510 * If false, don't push data to chardet. |
|
511 */ |
|
512 bool mFeedChardet; |
|
513 |
|
514 /** |
|
515 * Whether the initial charset source was kCharsetFromParentFrame |
|
516 */ |
|
517 bool mInitialEncodingWasFromParentFrame; |
|
518 |
|
519 /** |
|
520 * Timer for flushing tree ops once in a while when not speculating. |
|
521 */ |
|
522 nsCOMPtr<nsITimer> mFlushTimer; |
|
523 |
|
524 /** |
|
525 * Keeps track whether mFlushTimer has been armed. Unfortunately, |
|
526 * nsITimer doesn't enable querying this from the timer itself. |
|
527 */ |
|
528 bool mFlushTimerArmed; |
|
529 |
|
530 /** |
|
531 * False initially and true after the timer has fired at least once. |
|
532 */ |
|
533 bool mFlushTimerEverFired; |
|
534 |
|
535 /** |
|
536 * Whether the parser is doing a normal parse, view source or plain text. |
|
537 */ |
|
538 eParserMode mMode; |
|
539 |
|
540 /** |
|
541 * The pref html5.flushtimer.initialdelay: Time in milliseconds between |
|
542 * the time a network buffer is seen and the timer firing when the |
|
543 * timer hasn't fired previously in this parse. |
|
544 */ |
|
545 static int32_t sTimerInitialDelay; |
|
546 |
|
547 /** |
|
548 * The pref html5.flushtimer.subsequentdelay: Time in milliseconds between |
|
549 * the time a network buffer is seen and the timer firing when the |
|
550 * timer has already fired previously in this parse. |
|
551 */ |
|
552 static int32_t sTimerSubsequentDelay; |
|
553 }; |
|
554 |
|
555 #endif // nsHtml5StreamParser_h |