parser/html/nsHtml5StreamParser.cpp

branch
TOR_BUG_9701
changeset 14
925c144e1f1f
equal deleted inserted replaced
-1:000000000000 0:41b06b94efbe
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set sw=2 ts=2 et tw=79: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 #include "mozilla/DebugOnly.h"
8
9 #include "nsHtml5StreamParser.h"
10 #include "nsContentUtils.h"
11 #include "nsHtml5Tokenizer.h"
12 #include "nsIHttpChannel.h"
13 #include "nsHtml5Parser.h"
14 #include "nsHtml5TreeBuilder.h"
15 #include "nsHtml5AtomTable.h"
16 #include "nsHtml5Module.h"
17 #include "nsHtml5RefPtr.h"
18 #include "nsIScriptError.h"
19 #include "mozilla/Preferences.h"
20 #include "nsHtml5Highlighter.h"
21 #include "expat_config.h"
22 #include "expat.h"
23 #include "nsINestedURI.h"
24 #include "nsCharsetSource.h"
25 #include "nsIWyciwygChannel.h"
26 #include "nsIThreadRetargetableRequest.h"
27 #include "nsPrintfCString.h"
28 #include "nsNetUtil.h"
29
30 #include "mozilla/dom/EncodingUtils.h"
31
32 using namespace mozilla;
33 using mozilla::dom::EncodingUtils;
34
35 int32_t nsHtml5StreamParser::sTimerInitialDelay = 120;
36 int32_t nsHtml5StreamParser::sTimerSubsequentDelay = 120;
37
38 // static
39 void
40 nsHtml5StreamParser::InitializeStatics()
41 {
42 Preferences::AddIntVarCache(&sTimerInitialDelay,
43 "html5.flushtimer.initialdelay");
44 Preferences::AddIntVarCache(&sTimerSubsequentDelay,
45 "html5.flushtimer.subsequentdelay");
46 }
47
48 /*
49 * Note that nsHtml5StreamParser implements cycle collecting AddRef and
50 * Release. Therefore, nsHtml5StreamParser must never be refcounted from
51 * the parser thread!
52 *
53 * To work around this limitation, runnables posted by the main thread to the
54 * parser thread hold their reference to the stream parser in an
55 * nsHtml5RefPtr. Upon creation, nsHtml5RefPtr addrefs the object it holds
56 * just like a regular nsRefPtr. This is OK, since the creation of the
57 * runnable and the nsHtml5RefPtr happens on the main thread.
58 *
59 * When the runnable is done on the parser thread, the destructor of
60 * nsHtml5RefPtr runs there. It doesn't call Release on the held object
61 * directly. Instead, it posts another runnable back to the main thread where
62 * that runnable calls Release on the wrapped object.
63 *
64 * When posting runnables in the other direction, the runnables have to be
65 * created on the main thread when nsHtml5StreamParser is instantiated and
66 * held for the lifetime of the nsHtml5StreamParser. This works, because the
67 * same runnable can be dispatched multiple times and currently runnables
68 * posted from the parser thread to main thread don't need to wrap any
69 * runnable-specific data. (In the other direction, the runnables most notably
70 * wrap the byte data of the stream.)
71 */
// Cycle-collecting AddRef/Release for nsHtml5StreamParser. Per the comment
// preceding these macros, refcounting must not happen on the parser thread.
72 NS_IMPL_CYCLE_COLLECTING_ADDREF(nsHtml5StreamParser)
73 NS_IMPL_CYCLE_COLLECTING_RELEASE(nsHtml5StreamParser)
74
// Interface table: nsICharsetDetectionObserver is the only interface listed
// here; the table then segues into the cycle collection interface map.
75 NS_INTERFACE_TABLE_HEAD(nsHtml5StreamParser)
76 NS_INTERFACE_TABLE(nsHtml5StreamParser,
77 nsICharsetDetectionObserver)
78 NS_INTERFACE_TABLE_TO_MAP_SEGUE_CYCLE_COLLECTION(nsHtml5StreamParser)
79 NS_INTERFACE_MAP_END
80
81 NS_IMPL_CYCLE_COLLECTION_CLASS(nsHtml5StreamParser)
82
// Unlink: drops the flush timer first, then releases the observer, request
// and owner, clears both flusher runnables and the executor pointer by plain
// assignment, and finally unlinks the charset detector.
83 NS_IMPL_CYCLE_COLLECTION_UNLINK_BEGIN(nsHtml5StreamParser)
84 tmp->DropTimer();
85 NS_IMPL_CYCLE_COLLECTION_UNLINK(mObserver)
86 NS_IMPL_CYCLE_COLLECTION_UNLINK(mRequest)
87 NS_IMPL_CYCLE_COLLECTION_UNLINK(mOwner)
// These three are cleared by direct assignment rather than through the
// UNLINK macro.
88 tmp->mExecutorFlusher = nullptr;
89 tmp->mLoadFlusher = nullptr;
90 tmp->mExecutor = nullptr;
91 NS_IMPL_CYCLE_COLLECTION_UNLINK(mChardet)
92 NS_IMPL_CYCLE_COLLECTION_UNLINK_END
93
// Traverse: reports mObserver, mRequest and mOwner through the standard
// macro, then manually notes edges that the collector cannot see directly
// (the executor held inside each flusher runnable, and this object as held
// by the charset detector).
94 NS_IMPL_CYCLE_COLLECTION_TRAVERSE_BEGIN(nsHtml5StreamParser)
95 NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mObserver)
96 NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mRequest)
97 NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mOwner)
98 // hack: count the strongly owned edge wrapped in the runnable
// The noted child is tmp->mExecutor; the edge is reported once per flusher
// that exists.
99 if (tmp->mExecutorFlusher) {
100 NS_CYCLE_COLLECTION_NOTE_EDGE_NAME(cb, "mExecutorFlusher->mExecutor");
101 cb.NoteXPCOMChild(static_cast<nsIContentSink*> (tmp->mExecutor));
102 }
103 // hack: count the strongly owned edge wrapped in the runnable
104 if (tmp->mLoadFlusher) {
105 NS_CYCLE_COLLECTION_NOTE_EDGE_NAME(cb, "mLoadFlusher->mExecutor");
106 cb.NoteXPCOMChild(static_cast<nsIContentSink*> (tmp->mExecutor));
107 }
108 // hack: count self if held by mChardet
// The constructor passes `this` to mChardet->Init(), hence the self edge.
109 if (tmp->mChardet) {
110 NS_CYCLE_COLLECTION_NOTE_EDGE_NAME(cb, "mChardet->mObserver");
111 cb.NoteXPCOMChild(static_cast<nsICharsetDetectionObserver*>(tmp));
112 }
113 NS_IMPL_CYCLE_COLLECTION_TRAVERSE_END
114
115 class nsHtml5ExecutorFlusher : public nsRunnable
116 {
117 private:
118 nsRefPtr<nsHtml5TreeOpExecutor> mExecutor;
119 public:
120 nsHtml5ExecutorFlusher(nsHtml5TreeOpExecutor* aExecutor)
121 : mExecutor(aExecutor)
122 {}
123 NS_IMETHODIMP Run()
124 {
125 if (!mExecutor->isInList()) {
126 mExecutor->RunFlushLoop();
127 }
128 return NS_OK;
129 }
130 };
131
132 class nsHtml5LoadFlusher : public nsRunnable
133 {
134 private:
135 nsRefPtr<nsHtml5TreeOpExecutor> mExecutor;
136 public:
137 nsHtml5LoadFlusher(nsHtml5TreeOpExecutor* aExecutor)
138 : mExecutor(aExecutor)
139 {}
140 NS_IMETHODIMP Run()
141 {
142 mExecutor->FlushSpeculativeLoads();
143 return NS_OK;
144 }
145 };
146
147 nsHtml5StreamParser::nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
148 nsHtml5Parser* aOwner,
149 eParserMode aMode)
150 : mFirstBuffer(nullptr) // Will be filled when starting
151 , mLastBuffer(nullptr) // Will be filled when starting
152 , mExecutor(aExecutor)
153 , mTreeBuilder(new nsHtml5TreeBuilder((aMode == VIEW_SOURCE_HTML ||
154 aMode == VIEW_SOURCE_XML) ?
155 nullptr : mExecutor->GetStage(),
156 aMode == NORMAL ?
157 mExecutor->GetStage() : nullptr))
158 , mTokenizer(new nsHtml5Tokenizer(mTreeBuilder, aMode == VIEW_SOURCE_XML))
159 , mTokenizerMutex("nsHtml5StreamParser mTokenizerMutex")
160 , mOwner(aOwner)
161 , mSpeculationMutex("nsHtml5StreamParser mSpeculationMutex")
162 , mTerminatedMutex("nsHtml5StreamParser mTerminatedMutex")
163 , mThread(nsHtml5Module::GetStreamParserThread())
164 , mExecutorFlusher(new nsHtml5ExecutorFlusher(aExecutor))
165 , mLoadFlusher(new nsHtml5LoadFlusher(aExecutor))
166 , mFlushTimer(do_CreateInstance("@mozilla.org/timer;1"))
167 , mMode(aMode)
168 {
169 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
170 mFlushTimer->SetTarget(mThread);
171 #ifdef DEBUG
172 mAtomTable.SetPermittedLookupThread(mThread);
173 #endif
174 mTokenizer->setInterner(&mAtomTable);
175 mTokenizer->setEncodingDeclarationHandler(this);
176
177 if (aMode == VIEW_SOURCE_HTML || aMode == VIEW_SOURCE_XML) {
178 nsHtml5Highlighter* highlighter =
179 new nsHtml5Highlighter(mExecutor->GetStage());
180 mTokenizer->EnableViewSource(highlighter); // takes ownership
181 mTreeBuilder->EnableViewSource(highlighter); // doesn't own
182 }
183
184 // Chardet instantiation adapted from nsDOMFile.
185 // Chardet is initialized here even if it turns out to be useless
186 // to make the chardet refcount its observer (nsHtml5StreamParser)
187 // on the main thread.
188 const nsAdoptingCString& detectorName =
189 Preferences::GetLocalizedCString("intl.charset.detector");
190 if (!detectorName.IsEmpty()) {
191 nsAutoCString detectorContractID;
192 detectorContractID.AssignLiteral(NS_CHARSET_DETECTOR_CONTRACTID_BASE);
193 detectorContractID += detectorName;
194 if ((mChardet = do_CreateInstance(detectorContractID.get()))) {
195 (void) mChardet->Init(this);
196 mFeedChardet = true;
197 }
198 }
199
200 // There's a zeroing operator new for everything else
201 }
202
// Destructor. Asserts that it runs on the main thread, ends the tokenizer,
// and asserts that the flush timer has already been dropped.
203 nsHtml5StreamParser::~nsHtml5StreamParser()
204 {
205 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
206 mTokenizer->end();
207 NS_ASSERTION(!mFlushTimer, "Flush timer was not dropped before dtor!");
// Debug builds only: explicitly null out the member pointers below.
208 #ifdef DEBUG
209 mRequest = nullptr;
210 mObserver = nullptr;
211 mUnicodeDecoder = nullptr;
212 mSniffingBuffer = nullptr;
213 mMetaScanner = nullptr;
214 mFirstBuffer = nullptr;
215 mExecutor = nullptr;
216 mTreeBuilder = nullptr;
217 mTokenizer = nullptr;
218 mOwner = nullptr;
219 #endif
220 }
221
222 nsresult
223 nsHtml5StreamParser::GetChannel(nsIChannel** aChannel)
224 {
225 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
226 return mRequest ? CallQueryInterface(mRequest, aChannel) :
227 NS_ERROR_NOT_AVAILABLE;
228 }
229
230 NS_IMETHODIMP
231 nsHtml5StreamParser::Notify(const char* aCharset, nsDetectionConfident aConf)
232 {
233 NS_ASSERTION(IsParserThread(), "Wrong thread!");
234 if (aConf == eBestAnswer || aConf == eSureAnswer) {
235 mFeedChardet = false; // just in case
236 nsAutoCString encoding;
237 if (!EncodingUtils::FindEncodingForLabel(nsDependentCString(aCharset),
238 encoding)) {
239 return NS_OK;
240 }
241 if (encoding.EqualsLiteral("replacement")) {
242 return NS_OK;
243 }
244 if (HasDecoder()) {
245 if (mCharset.Equals(encoding)) {
246 NS_ASSERTION(mCharsetSource < kCharsetFromAutoDetection,
247 "Why are we running chardet at all?");
248 mCharsetSource = kCharsetFromAutoDetection;
249 mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
250 } else {
251 // We've already committed to a decoder. Request a reload from the
252 // docshell.
253 mTreeBuilder->NeedsCharsetSwitchTo(encoding,
254 kCharsetFromAutoDetection,
255 0);
256 FlushTreeOpsAndDisarmTimer();
257 Interrupt();
258 }
259 } else {
260 // Got a confident answer from the sniffing buffer. That code will
261 // take care of setting up the decoder.
262 mCharset.Assign(encoding);
263 mCharsetSource = kCharsetFromAutoDetection;
264 mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
265 }
266 }
267 return NS_OK;
268 }
269
270 void
271 nsHtml5StreamParser::SetViewSourceTitle(nsIURI* aURL)
272 {
273 if (aURL) {
274 nsCOMPtr<nsIURI> temp;
275 bool isViewSource;
276 aURL->SchemeIs("view-source", &isViewSource);
277 if (isViewSource) {
278 nsCOMPtr<nsINestedURI> nested = do_QueryInterface(aURL);
279 nested->GetInnerURI(getter_AddRefs(temp));
280 } else {
281 temp = aURL;
282 }
283 bool isData;
284 temp->SchemeIs("data", &isData);
285 if (isData) {
286 // Avoid showing potentially huge data: URLs. The three last bytes are
287 // UTF-8 for an ellipsis.
288 mViewSourceTitle.AssignLiteral("data:\xE2\x80\xA6");
289 } else {
290 temp->GetSpec(mViewSourceTitle);
291 }
292 }
293 }
294
295 nsresult
296 nsHtml5StreamParser::SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const uint8_t* aFromSegment, // can be null
297 uint32_t aCount,
298 uint32_t* aWriteCount)
299 {
300 NS_ASSERTION(IsParserThread(), "Wrong thread!");
301 nsresult rv = NS_OK;
302 mUnicodeDecoder = EncodingUtils::DecoderForEncoding(mCharset);
303 if (mSniffingBuffer) {
304 uint32_t writeCount;
305 rv = WriteStreamBytes(mSniffingBuffer, mSniffingLength, &writeCount);
306 NS_ENSURE_SUCCESS(rv, rv);
307 mSniffingBuffer = nullptr;
308 }
309 mMetaScanner = nullptr;
310 if (aFromSegment) {
311 rv = WriteStreamBytes(aFromSegment, aCount, aWriteCount);
312 }
313 return rv;
314 }
315
316 nsresult
317 nsHtml5StreamParser::SetupDecodingFromBom(const char* aDecoderCharsetName)
318 {
319 NS_ASSERTION(IsParserThread(), "Wrong thread!");
320 mCharset.Assign(aDecoderCharsetName);
321 mUnicodeDecoder = EncodingUtils::DecoderForEncoding(mCharset);
322 mCharsetSource = kCharsetFromByteOrderMark;
323 mFeedChardet = false;
324 mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
325 mSniffingBuffer = nullptr;
326 mMetaScanner = nullptr;
327 mBomState = BOM_SNIFFING_OVER;
328 return NS_OK;
329 }
330
331 void
332 nsHtml5StreamParser::SniffBOMlessUTF16BasicLatin(const uint8_t* aFromSegment,
333 uint32_t aCountToSniffingLimit)
334 {
335 // Avoid underspecified heuristic craziness for XHR
336 if (mMode == LOAD_AS_DATA) {
337 return;
338 }
339 // Make sure there's enough data. Require room for "<title></title>"
340 if (mSniffingLength + aCountToSniffingLimit < 30) {
341 return;
342 }
343 // even-numbered bytes tracked at 0, odd-numbered bytes tracked at 1
344 bool byteZero[2] = { false, false };
345 bool byteNonZero[2] = { false, false };
346 uint32_t i = 0;
347 if (mSniffingBuffer) {
348 for (; i < mSniffingLength; ++i) {
349 if (mSniffingBuffer[i]) {
350 if (byteNonZero[1 - (i % 2)]) {
351 return;
352 }
353 byteNonZero[i % 2] = true;
354 } else {
355 if (byteZero[1 - (i % 2)]) {
356 return;
357 }
358 byteZero[i % 2] = true;
359 }
360 }
361 }
362 if (aFromSegment) {
363 for (uint32_t j = 0; j < aCountToSniffingLimit; ++j) {
364 if (aFromSegment[j]) {
365 if (byteNonZero[1 - ((i + j) % 2)]) {
366 return;
367 }
368 byteNonZero[(i + j) % 2] = true;
369 } else {
370 if (byteZero[1 - ((i + j) % 2)]) {
371 return;
372 }
373 byteZero[(i + j) % 2] = true;
374 }
375 }
376 }
377
378 if (byteNonZero[0]) {
379 mCharset.Assign("UTF-16LE");
380 } else {
381 mCharset.Assign("UTF-16BE");
382 }
383 mCharsetSource = kCharsetFromIrreversibleAutoDetection;
384 mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
385 mFeedChardet = false;
386 mTreeBuilder->MaybeComplainAboutCharset("EncBomlessUtf16",
387 true,
388 0);
389
390 }
391
392 void
393 nsHtml5StreamParser::SetEncodingFromExpat(const char16_t* aEncoding)
394 {
395 if (aEncoding) {
396 nsDependentString utf16(aEncoding);
397 nsAutoCString utf8;
398 CopyUTF16toUTF8(utf16, utf8);
399 if (PreferredForInternalEncodingDecl(utf8)) {
400 mCharset.Assign(utf8);
401 mCharsetSource = kCharsetFromMetaTag; // closest for XML
402 return;
403 }
404 // else the page declared an encoding Gecko doesn't support and we'd
405 // end up defaulting to UTF-8 anyway. Might as well fall through here
406 // right away and let the encoding be set to UTF-8 which we'd default to
407 // anyway.
408 }
409 mCharset.AssignLiteral("UTF-8"); // XML defaults to UTF-8 without a BOM
410 mCharsetSource = kCharsetFromMetaTag; // means confident
411 }
412
413 // A separate user data struct is used instead of passing the
414 // nsHtml5StreamParser instance as user data in order to avoid including
415 // expat.h in nsHtml5StreamParser.h. Doing that would cause naming conflicts.
416 // Using a separate user data struct also avoids bloating nsHtml5StreamParser
417 // by one pointer.
418 struct UserData {
// Expat parser handle; the handler callbacks pass it to XML_StopParser().
419 XML_Parser mExpat;
// Stream parser that HandleXMLDeclaration() reports the encoding to.
420 nsHtml5StreamParser* mStreamParser;
421 };
422
423 // Using no-namespace handler callbacks to avoid including expat.h in
424 // nsHtml5StreamParser.h, since doing so would cause naming conclicts.
425 static void
426 HandleXMLDeclaration(void* aUserData,
427 const XML_Char* aVersion,
428 const XML_Char* aEncoding,
429 int aStandalone)
430 {
431 UserData* ud = static_cast<UserData*>(aUserData);
432 ud->mStreamParser->SetEncodingFromExpat(
433 reinterpret_cast<const char16_t*>(aEncoding));
434 XML_StopParser(ud->mExpat, false);
435 }
436
437 static void
438 HandleStartElement(void* aUserData,
439 const XML_Char* aName,
440 const XML_Char **aAtts)
441 {
442 UserData* ud = static_cast<UserData*>(aUserData);
443 XML_StopParser(ud->mExpat, false);
444 }
445
446 static void
447 HandleEndElement(void* aUserData,
448 const XML_Char* aName)
449 {
450 UserData* ud = static_cast<UserData*>(aUserData);
451 XML_StopParser(ud->mExpat, false);
452 }
453
454 static void
455 HandleComment(void* aUserData,
456 const XML_Char* aName)
457 {
458 UserData* ud = static_cast<UserData*>(aUserData);
459 XML_StopParser(ud->mExpat, false);
460 }
461
462 static void
463 HandleProcessingInstruction(void* aUserData,
464 const XML_Char* aTarget,
465 const XML_Char* aData)
466 {
467 UserData* ud = static_cast<UserData*>(aUserData);
468 XML_StopParser(ud->mExpat, false);
469 }
470
471 nsresult
472 nsHtml5StreamParser::FinalizeSniffing(const uint8_t* aFromSegment, // can be null
473 uint32_t aCount,
474 uint32_t* aWriteCount,
475 uint32_t aCountToSniffingLimit)
476 {
477 NS_ASSERTION(IsParserThread(), "Wrong thread!");
478 NS_ASSERTION(mCharsetSource < kCharsetFromParentForced,
479 "Should not finalize sniffing when using forced charset.");
480 if (mMode == VIEW_SOURCE_XML) {
481 static const XML_Memory_Handling_Suite memsuite =
482 {
483 (void *(*)(size_t))moz_xmalloc,
484 (void *(*)(void *, size_t))moz_xrealloc,
485 moz_free
486 };
487
488 static const char16_t kExpatSeparator[] = { 0xFFFF, '\0' };
489
490 static const char16_t kISO88591[] =
491 { 'I', 'S', 'O', '-', '8', '8', '5', '9', '-', '1', '\0' };
492
493 UserData ud;
494 ud.mStreamParser = this;
495
496 // If we got this far, the stream didn't have a BOM. UTF-16-encoded XML
497 // documents MUST begin with a BOM. We don't support EBCDIC and such.
498 // Thus, at this point, what we have is garbage or something encoded using
499 // a rough ASCII superset. ISO-8859-1 allows us to decode ASCII bytes
500 // without throwing errors when bytes have the most significant bit set
501 // and without triggering expat's unknown encoding code paths. This is
502 // enough to be able to use expat to parse the XML declaration in order
503 // to extract the encoding name from it.
504 ud.mExpat = XML_ParserCreate_MM(kISO88591, &memsuite, kExpatSeparator);
505 XML_SetXmlDeclHandler(ud.mExpat, HandleXMLDeclaration);
506 XML_SetElementHandler(ud.mExpat, HandleStartElement, HandleEndElement);
507 XML_SetCommentHandler(ud.mExpat, HandleComment);
508 XML_SetProcessingInstructionHandler(ud.mExpat, HandleProcessingInstruction);
509 XML_SetUserData(ud.mExpat, static_cast<void*>(&ud));
510
511 XML_Status status = XML_STATUS_OK;
512
513 // aFromSegment points to the data obtained from the current network
514 // event. mSniffingBuffer (if it exists) contains the data obtained before
515 // the current event. Thus, mSniffingLenth bytes of mSniffingBuffer
516 // followed by aCountToSniffingLimit bytes from aFromSegment are the
517 // first 1024 bytes of the file (or the file as a whole if the file is
518 // 1024 bytes long or shorter). Thus, we parse both buffers, but if the
519 // first call succeeds already, we skip parsing the second buffer.
520 if (mSniffingBuffer) {
521 status = XML_Parse(ud.mExpat,
522 reinterpret_cast<const char*>(mSniffingBuffer.get()),
523 mSniffingLength,
524 false);
525 }
526 if (status == XML_STATUS_OK &&
527 mCharsetSource < kCharsetFromMetaTag &&
528 aFromSegment) {
529 status = XML_Parse(ud.mExpat,
530 reinterpret_cast<const char*>(aFromSegment),
531 aCountToSniffingLimit,
532 false);
533 }
534 XML_ParserFree(ud.mExpat);
535
536 if (mCharsetSource < kCharsetFromMetaTag) {
537 // Failed to get an encoding from the XML declaration. XML defaults
538 // confidently to UTF-8 in this case.
539 // It is also possible that the document has an XML declaration that is
540 // longer than 1024 bytes, but that case is not worth worrying about.
541 mCharset.AssignLiteral("UTF-8");
542 mCharsetSource = kCharsetFromMetaTag; // means confident
543 }
544
545 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment,
546 aCount,
547 aWriteCount);
548 }
549
550 // meta scan failed.
551 if (mCharsetSource >= kCharsetFromHintPrevDoc) {
552 mFeedChardet = false;
553 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment, aCount, aWriteCount);
554 }
555 // Check for BOMless UTF-16 with Basic
556 // Latin content for compat with IE. See bug 631751.
557 SniffBOMlessUTF16BasicLatin(aFromSegment, aCountToSniffingLimit);
558 // the charset may have been set now
559 // maybe try chardet now;
560 if (mFeedChardet) {
561 bool dontFeed;
562 nsresult rv;
563 if (mSniffingBuffer) {
564 rv = mChardet->DoIt((const char*)mSniffingBuffer.get(), mSniffingLength, &dontFeed);
565 mFeedChardet = !dontFeed;
566 NS_ENSURE_SUCCESS(rv, rv);
567 }
568 if (mFeedChardet && aFromSegment) {
569 rv = mChardet->DoIt((const char*)aFromSegment,
570 // Avoid buffer boundary-dependent behavior when
571 // reparsing is forbidden. If reparse is forbidden,
572 // act as if we only saw the first 1024 bytes.
573 // When reparsing isn't forbidden, buffer boundaries
574 // can have an effect on whether the page is loaded
575 // once or twice. :-(
576 mReparseForbidden ? aCountToSniffingLimit : aCount,
577 &dontFeed);
578 mFeedChardet = !dontFeed;
579 NS_ENSURE_SUCCESS(rv, rv);
580 }
581 if (mFeedChardet && (!aFromSegment || mReparseForbidden)) {
582 // mReparseForbidden is checked so that we get to use the sniffing
583 // buffer with the best guess so far if we aren't allowed to guess
584 // better later.
585 mFeedChardet = false;
586 rv = mChardet->Done();
587 NS_ENSURE_SUCCESS(rv, rv);
588 }
589 // fall thru; callback may have changed charset
590 }
591 if (mCharsetSource == kCharsetUninitialized) {
592 // Hopefully this case is never needed, but dealing with it anyway
593 mCharset.AssignLiteral("windows-1252");
594 mCharsetSource = kCharsetFromFallback;
595 mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
596 } else if (mMode == LOAD_AS_DATA &&
597 mCharsetSource == kCharsetFromFallback) {
598 NS_ASSERTION(mReparseForbidden, "Reparse should be forbidden for XHR");
599 NS_ASSERTION(!mFeedChardet, "Should not feed chardet for XHR");
600 NS_ASSERTION(mCharset.EqualsLiteral("UTF-8"),
601 "XHR should default to UTF-8");
602 // Now mark charset source as non-weak to signal that we have a decision
603 mCharsetSource = kCharsetFromDocTypeDefault;
604 mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
605 }
606 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment, aCount, aWriteCount);
607 }
608
// Sniffs the encoding of the incoming bytes: first a byte order mark state
// machine, then (for NORMAL, VIEW_SOURCE_HTML and LOAD_AS_DATA modes) a
// <meta> prescan. If no decision is reached and the sniffing buffer is not
// yet full, the bytes are copied into mSniffingBuffer for a later call.
//
// aFromSegment: the bytes of the current segment.
// aCount:       number of bytes in aFromSegment.
// aWriteCount:  receives the number of bytes accounted for; when the bytes
//               are only buffered this is aCount.
// Returns NS_ERROR_OUT_OF_MEMORY if the sniffing buffer cannot be allocated,
// a failure propagated from the decoding setup or write calls, or the result
// of whichever call ended the sniffing; NS_OK when the bytes were buffered.
609 nsresult
610 nsHtml5StreamParser::SniffStreamBytes(const uint8_t* aFromSegment,
611 uint32_t aCount,
612 uint32_t* aWriteCount)
613 {
614 NS_ASSERTION(IsParserThread(), "Wrong thread!");
615 nsresult rv = NS_OK;
616 uint32_t writeCount;
617
618 // mCharset and mCharsetSource potentially have come from channel or higher
619 // by now. If we find a BOM, SetupDecodingFromBom() will overwrite them.
620 // If we don't find a BOM, the previously set values of mCharset and
621 // mCharsetSource are not modified by the BOM sniffing here.
// mBomState is a member, so a BOM split across calls is still recognized.
622 for (uint32_t i = 0; i < aCount && mBomState != BOM_SNIFFING_OVER; i++) {
623 switch (mBomState) {
624 case BOM_SNIFFING_NOT_STARTED:
625 NS_ASSERTION(i == 0, "Bad BOM sniffing state.");
626 switch (*aFromSegment) {
627 case 0xEF:
628 mBomState = SEEN_UTF_8_FIRST_BYTE;
629 break;
630 case 0xFF:
631 mBomState = SEEN_UTF_16_LE_FIRST_BYTE;
632 break;
633 case 0xFE:
634 mBomState = SEEN_UTF_16_BE_FIRST_BYTE;
635 break;
636 default:
637 mBomState = BOM_SNIFFING_OVER;
638 break;
639 }
640 break;
641 case SEEN_UTF_16_LE_FIRST_BYTE:
642 if (aFromSegment[i] == 0xFE) {
643 rv = SetupDecodingFromBom("UTF-16LE"); // upper case is the raw form
644 NS_ENSURE_SUCCESS(rv, rv);
// Only the bytes after the BOM are decoded; the reported count adds the
// i + 1 bytes of this segment up to and including the BOM's last byte.
645 uint32_t count = aCount - (i + 1);
646 rv = WriteStreamBytes(aFromSegment + (i + 1), count, &writeCount);
647 NS_ENSURE_SUCCESS(rv, rv);
648 *aWriteCount = writeCount + (i + 1);
649 return rv;
650 }
651 mBomState = BOM_SNIFFING_OVER;
652 break;
653 case SEEN_UTF_16_BE_FIRST_BYTE:
654 if (aFromSegment[i] == 0xFF) {
655 rv = SetupDecodingFromBom("UTF-16BE"); // upper case is the raw form
656 NS_ENSURE_SUCCESS(rv, rv);
657 uint32_t count = aCount - (i + 1);
658 rv = WriteStreamBytes(aFromSegment + (i + 1), count, &writeCount);
659 NS_ENSURE_SUCCESS(rv, rv);
660 *aWriteCount = writeCount + (i + 1);
661 return rv;
662 }
663 mBomState = BOM_SNIFFING_OVER;
664 break;
665 case SEEN_UTF_8_FIRST_BYTE:
666 if (aFromSegment[i] == 0xBB) {
667 mBomState = SEEN_UTF_8_SECOND_BYTE;
668 } else {
669 mBomState = BOM_SNIFFING_OVER;
670 }
671 break;
672 case SEEN_UTF_8_SECOND_BYTE:
673 if (aFromSegment[i] == 0xBF) {
674 rv = SetupDecodingFromBom("UTF-8"); // upper case is the raw form
675 NS_ENSURE_SUCCESS(rv, rv);
676 uint32_t count = aCount - (i + 1);
677 rv = WriteStreamBytes(aFromSegment + (i + 1), count, &writeCount);
678 NS_ENSURE_SUCCESS(rv, rv);
679 *aWriteCount = writeCount + (i + 1);
680 return rv;
681 }
682 mBomState = BOM_SNIFFING_OVER;
683 break;
684 default:
685 mBomState = BOM_SNIFFING_OVER;
686 break;
687 }
688 }
689 // if we get here, there either was no BOM or the BOM sniffing isn't complete
690 // yet
691
692 MOZ_ASSERT(mCharsetSource != kCharsetFromByteOrderMark,
693 "Should not come here if BOM was found.");
694 MOZ_ASSERT(mCharsetSource != kCharsetFromOtherComponent,
695 "kCharsetFromOtherComponent is for XSLT.");
696
697 if (mBomState == BOM_SNIFFING_OVER &&
698 mCharsetSource == kCharsetFromChannel) {
699 // There was no BOM and the charset came from channel. mCharset
700 // still contains the charset from the channel as set by an
701 // earlier call to SetDocumentCharset(), since we didn't find a BOM and
702 // overwrite mCharset. (Note that if the user has overridden the charset,
703 // we don't come here but check <meta> for XSS-dangerous charsets first.)
704 mFeedChardet = false;
705 mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
706 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment,
707 aCount, aWriteCount);
708 }
709
// The meta scanner is created lazily, and only for the modes that use it
// below.
710 if (!mMetaScanner && (mMode == NORMAL ||
711 mMode == VIEW_SOURCE_HTML ||
712 mMode == LOAD_AS_DATA)) {
713 mMetaScanner = new nsHtml5MetaScanner();
714 }
715
716 if (mSniffingLength + aCount >= NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE) {
717 // this is the last buffer
718 uint32_t countToSniffingLimit =
719 NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE - mSniffingLength;
720 if (mMode == NORMAL || mMode == VIEW_SOURCE_HTML || mMode == LOAD_AS_DATA) {
// Only the bytes up to the sniffing limit are shown to the meta scanner.
721 nsHtml5ByteReadable readable(aFromSegment, aFromSegment +
722 countToSniffingLimit);
723 nsAutoCString encoding;
724 mMetaScanner->sniff(&readable, encoding);
725 if (!encoding.IsEmpty()) {
726 // meta scan successful; honor overrides unless meta is XSS-dangerous
727 if ((mCharsetSource == kCharsetFromParentForced ||
728 mCharsetSource == kCharsetFromUserForced) &&
729 EncodingUtils::IsAsciiCompatible(encoding)) {
730 // Honor override
731 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
732 aFromSegment, aCount, aWriteCount);
733 }
734 mCharset.Assign(encoding);
735 mCharsetSource = kCharsetFromMetaPrescan;
736 mFeedChardet = false;
737 mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
738 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
739 aFromSegment, aCount, aWriteCount);
740 }
741 }
742 if (mCharsetSource == kCharsetFromParentForced ||
743 mCharsetSource == kCharsetFromUserForced) {
744 // meta not found, honor override
745 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
746 aFromSegment, aCount, aWriteCount);
747 }
748 return FinalizeSniffing(aFromSegment, aCount, aWriteCount,
749 countToSniffingLimit);
750 }
751
752 // not the last buffer
753 if (mMode == NORMAL || mMode == VIEW_SOURCE_HTML || mMode == LOAD_AS_DATA) {
754 nsHtml5ByteReadable readable(aFromSegment, aFromSegment + aCount);
755 nsAutoCString encoding;
756 mMetaScanner->sniff(&readable, encoding);
757 if (!encoding.IsEmpty()) {
758 // meta scan successful; honor overrides unless meta is XSS-dangerous
759 if ((mCharsetSource == kCharsetFromParentForced ||
760 mCharsetSource == kCharsetFromUserForced) &&
761 EncodingUtils::IsAsciiCompatible(encoding)) {
762 // Honor override
763 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment,
764 aCount, aWriteCount);
765 }
766 mCharset.Assign(encoding);
767 mCharsetSource = kCharsetFromMetaPrescan;
768 mFeedChardet = false;
769 mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
770 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment,
771 aCount, aWriteCount);
772 }
773 }
774
// No decision yet: keep the bytes so a later call can sniff or decode them.
// The buffer is allocated fallibly on first use.
775 if (!mSniffingBuffer) {
776 const mozilla::fallible_t fallible = mozilla::fallible_t();
777 mSniffingBuffer = new (fallible)
778 uint8_t[NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE];
779 if (!mSniffingBuffer) {
780 return NS_ERROR_OUT_OF_MEMORY;
781 }
782 }
// This point is only reached when mSniffingLength + aCount is below the
// buffer size (see the "last buffer" branch above), so the copy fits.
783 memcpy(mSniffingBuffer + mSniffingLength, aFromSegment, aCount);
784 mSniffingLength += aCount;
785 *aWriteCount = aCount;
786 return NS_OK;
787 }
788
789 nsresult
790 nsHtml5StreamParser::WriteStreamBytes(const uint8_t* aFromSegment,
791 uint32_t aCount,
792 uint32_t* aWriteCount)
793 {
794 NS_ASSERTION(IsParserThread(), "Wrong thread!");
795 // mLastBuffer should always point to a buffer of the size
796 // NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE.
797 if (!mLastBuffer) {
798 NS_WARNING("mLastBuffer should not be null!");
799 MarkAsBroken(NS_ERROR_NULL_POINTER);
800 return NS_ERROR_NULL_POINTER;
801 }
802 if (mLastBuffer->getEnd() == NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE) {
803 nsRefPtr<nsHtml5OwningUTF16Buffer> newBuf =
804 nsHtml5OwningUTF16Buffer::FalliblyCreate(
805 NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE);
806 if (!newBuf) {
807 return NS_ERROR_OUT_OF_MEMORY;
808 }
809 mLastBuffer = (mLastBuffer->next = newBuf.forget());
810 }
811 int32_t totalByteCount = 0;
812 for (;;) {
813 int32_t end = mLastBuffer->getEnd();
814 int32_t byteCount = aCount - totalByteCount;
815 int32_t utf16Count = NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE - end;
816
817 NS_ASSERTION(utf16Count, "Trying to convert into a buffer with no free space!");
818 // byteCount may be zero to force the decoder to output a pending surrogate
819 // pair.
820
821 nsresult convResult = mUnicodeDecoder->Convert((const char*)aFromSegment, &byteCount, mLastBuffer->getBuffer() + end, &utf16Count);
822 MOZ_ASSERT(NS_SUCCEEDED(convResult));
823
824 end += utf16Count;
825 mLastBuffer->setEnd(end);
826 totalByteCount += byteCount;
827 aFromSegment += byteCount;
828
829 NS_ASSERTION(end <= NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE,
830 "The Unicode decoder wrote too much data.");
831 NS_ASSERTION(byteCount >= -1, "The decoder consumed fewer than -1 bytes.");
832
833 if (convResult == NS_PARTIAL_MORE_OUTPUT) {
834 nsRefPtr<nsHtml5OwningUTF16Buffer> newBuf =
835 nsHtml5OwningUTF16Buffer::FalliblyCreate(
836 NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE);
837 if (!newBuf) {
838 return NS_ERROR_OUT_OF_MEMORY;
839 }
840 mLastBuffer = (mLastBuffer->next = newBuf.forget());
841 // All input may have been consumed if there is a pending surrogate pair
842 // that doesn't fit in the output buffer. Loop back to push a zero-length
843 // input to the decoder in that case.
844 } else {
845 NS_ASSERTION(totalByteCount == (int32_t)aCount,
846 "The Unicode decoder consumed the wrong number of bytes.");
847 *aWriteCount = (uint32_t)totalByteCount;
848 return NS_OK;
849 }
850 }
851 }
852
853 nsresult
854 nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest, nsISupports* aContext)
855 {
856 NS_PRECONDITION(STREAM_NOT_STARTED == mStreamState,
857 "Got OnStartRequest when the stream had already started.");
858 NS_PRECONDITION(!mExecutor->HasStarted(),
859 "Got OnStartRequest at the wrong stage in the executor life cycle.");
860 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
861 if (mObserver) {
862 mObserver->OnStartRequest(aRequest, aContext);
863 }
864 mRequest = aRequest;
865
866 mStreamState = STREAM_BEING_READ;
867
868 if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) {
869 mTokenizer->StartViewSource(NS_ConvertUTF8toUTF16(mViewSourceTitle));
870 }
871
872 // For View Source, the parser should run with scripts "enabled" if a normal
873 // load would have scripts enabled.
874 bool scriptingEnabled = mMode == LOAD_AS_DATA ?
875 false : mExecutor->IsScriptEnabled();
876 mOwner->StartTokenizer(scriptingEnabled);
877
878 bool isSrcdoc = false;
879 nsCOMPtr<nsIChannel> channel;
880 nsresult rv = GetChannel(getter_AddRefs(channel));
881 if (NS_SUCCEEDED(rv)) {
882 isSrcdoc = NS_IsSrcdocChannel(channel);
883 }
884 mTreeBuilder->setIsSrcdocDocument(isSrcdoc);
885 mTreeBuilder->setScriptingEnabled(scriptingEnabled);
886 mTreeBuilder->SetPreventScriptExecution(!((mMode == NORMAL) &&
887 scriptingEnabled));
888 mTokenizer->start();
889 mExecutor->Start();
890 mExecutor->StartReadingFromStage();
891
892 if (mMode == PLAIN_TEXT) {
893 mTreeBuilder->StartPlainText();
894 mTokenizer->StartPlainText();
895 } else if (mMode == VIEW_SOURCE_PLAIN) {
896 mTreeBuilder->StartPlainTextViewSource(NS_ConvertUTF8toUTF16(mViewSourceTitle));
897 mTokenizer->StartPlainText();
898 }
899
900 /*
901 * If you move the following line, be very careful not to cause
902 * WillBuildModel to be called before the document has had its
903 * script global object set.
904 */
905 rv = mExecutor->WillBuildModel(eDTDMode_unknown);
906 NS_ENSURE_SUCCESS(rv, rv);
907
908 nsRefPtr<nsHtml5OwningUTF16Buffer> newBuf =
909 nsHtml5OwningUTF16Buffer::FalliblyCreate(
910 NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE);
911 if (!newBuf) {
912 // marks this stream parser as terminated,
913 // which prevents entry to code paths that
914 // would use mFirstBuffer or mLastBuffer.
915 return mExecutor->MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
916 }
917 NS_ASSERTION(!mFirstBuffer, "How come we have the first buffer set?");
918 NS_ASSERTION(!mLastBuffer, "How come we have the last buffer set?");
919 mFirstBuffer = mLastBuffer = newBuf;
920
921 rv = NS_OK;
922
923 // The line below means that the encoding can end up being wrong if
924 // a view-source URL is loaded without having the encoding hint from a
925 // previous normal load in the history.
926 mReparseForbidden = !(mMode == NORMAL || mMode == PLAIN_TEXT);
927
928 nsCOMPtr<nsIHttpChannel> httpChannel(do_QueryInterface(mRequest, &rv));
929 if (NS_SUCCEEDED(rv)) {
930 nsAutoCString method;
931 httpChannel->GetRequestMethod(method);
932 // XXX does Necko have a way to renavigate POST, etc. without hitting
933 // the network?
934 if (!method.EqualsLiteral("GET")) {
935 // This is the old Gecko behavior but the HTML5 spec disagrees.
936 // Don't reparse on POST.
937 mReparseForbidden = true;
938 mFeedChardet = false; // can't restart anyway
939 }
940
941 // Attempt to retarget delivery of data (via OnDataAvailable) to the parser
942 // thread, rather than through the main thread.
943 nsCOMPtr<nsIThreadRetargetableRequest> threadRetargetableRequest =
944 do_QueryInterface(mRequest);
945 if (threadRetargetableRequest) {
946 threadRetargetableRequest->RetargetDeliveryTo(mThread);
947 }
948 }
949
950 if (mCharsetSource == kCharsetFromParentFrame) {
951 // Remember this in case chardet overwrites mCharsetSource
952 mInitialEncodingWasFromParentFrame = true;
953 }
954
955 if (mCharsetSource >= kCharsetFromAutoDetection) {
956 mFeedChardet = false;
957 }
958
959 nsCOMPtr<nsIWyciwygChannel> wyciwygChannel(do_QueryInterface(mRequest));
960 if (!wyciwygChannel) {
961 // we aren't ready to commit to an encoding yet
962 // leave converter uninstantiated for now
963 return NS_OK;
964 }
965
966 // We are reloading a document.open()ed doc.
967 mReparseForbidden = true;
968 mFeedChardet = false;
969
970 // Instantiate the converter here to avoid BOM sniffing.
971 mUnicodeDecoder = EncodingUtils::DecoderForEncoding(mCharset);
972 return NS_OK;
973 }
974
975 nsresult
976 nsHtml5StreamParser::CheckListenerChain()
977 {
978 NS_ASSERTION(NS_IsMainThread(), "Should be on the main thread!");
979 if (!mObserver) {
980 return NS_OK;
981 }
982 nsresult rv;
983 nsCOMPtr<nsIThreadRetargetableStreamListener> retargetable =
984 do_QueryInterface(mObserver, &rv);
985 if (NS_SUCCEEDED(rv) && retargetable) {
986 rv = retargetable->CheckListenerChain();
987 }
988 return rv;
989 }
990
991 void
992 nsHtml5StreamParser::DoStopRequest()
993 {
994 NS_ASSERTION(IsParserThread(), "Wrong thread!");
995 NS_PRECONDITION(STREAM_BEING_READ == mStreamState,
996 "Stream ended without being open.");
997 mTokenizerMutex.AssertCurrentThreadOwns();
998
999 if (IsTerminated()) {
1000 return;
1001 }
1002
1003 mStreamState = STREAM_ENDED;
1004
1005 if (!mUnicodeDecoder) {
1006 uint32_t writeCount;
1007 nsresult rv;
1008 if (NS_FAILED(rv = FinalizeSniffing(nullptr, 0, &writeCount, 0))) {
1009 MarkAsBroken(rv);
1010 return;
1011 }
1012 } else if (mFeedChardet) {
1013 mChardet->Done();
1014 }
1015
1016 if (IsTerminatedOrInterrupted()) {
1017 return;
1018 }
1019
1020 ParseAvailableData();
1021 }
1022
1023 class nsHtml5RequestStopper : public nsRunnable
1024 {
1025 private:
1026 nsHtml5RefPtr<nsHtml5StreamParser> mStreamParser;
1027 public:
1028 nsHtml5RequestStopper(nsHtml5StreamParser* aStreamParser)
1029 : mStreamParser(aStreamParser)
1030 {}
1031 NS_IMETHODIMP Run()
1032 {
1033 mozilla::MutexAutoLock autoLock(mStreamParser->mTokenizerMutex);
1034 mStreamParser->DoStopRequest();
1035 return NS_OK;
1036 }
1037 };
1038
1039 nsresult
1040 nsHtml5StreamParser::OnStopRequest(nsIRequest* aRequest,
1041 nsISupports* aContext,
1042 nsresult status)
1043 {
1044 NS_ASSERTION(mRequest == aRequest, "Got Stop on wrong stream.");
1045 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
1046 if (mObserver) {
1047 mObserver->OnStopRequest(aRequest, aContext, status);
1048 }
1049 nsCOMPtr<nsIRunnable> stopper = new nsHtml5RequestStopper(this);
1050 if (NS_FAILED(mThread->Dispatch(stopper, nsIThread::DISPATCH_NORMAL))) {
1051 NS_WARNING("Dispatching StopRequest event failed.");
1052 }
1053 return NS_OK;
1054 }
1055
1056 void
1057 nsHtml5StreamParser::DoDataAvailable(const uint8_t* aBuffer, uint32_t aLength)
1058 {
1059 NS_ASSERTION(IsParserThread(), "Wrong thread!");
1060 NS_PRECONDITION(STREAM_BEING_READ == mStreamState,
1061 "DoDataAvailable called when stream not open.");
1062 mTokenizerMutex.AssertCurrentThreadOwns();
1063
1064 if (IsTerminated()) {
1065 return;
1066 }
1067
1068 uint32_t writeCount;
1069 nsresult rv;
1070 if (HasDecoder()) {
1071 if (mFeedChardet) {
1072 bool dontFeed;
1073 mChardet->DoIt((const char*)aBuffer, aLength, &dontFeed);
1074 mFeedChardet = !dontFeed;
1075 }
1076 rv = WriteStreamBytes(aBuffer, aLength, &writeCount);
1077 } else {
1078 rv = SniffStreamBytes(aBuffer, aLength, &writeCount);
1079 }
1080 if (NS_FAILED(rv)) {
1081 MarkAsBroken(rv);
1082 return;
1083 }
1084 NS_ASSERTION(writeCount == aLength, "Wrong number of stream bytes written/sniffed.");
1085
1086 if (IsTerminatedOrInterrupted()) {
1087 return;
1088 }
1089
1090 ParseAvailableData();
1091
1092 if (mFlushTimerArmed || mSpeculating) {
1093 return;
1094 }
1095
1096 mFlushTimer->InitWithFuncCallback(nsHtml5StreamParser::TimerCallback,
1097 static_cast<void*> (this),
1098 mFlushTimerEverFired ?
1099 sTimerInitialDelay :
1100 sTimerSubsequentDelay,
1101 nsITimer::TYPE_ONE_SHOT);
1102 mFlushTimerArmed = true;
1103 }
1104
1105 class nsHtml5DataAvailable : public nsRunnable
1106 {
1107 private:
1108 nsHtml5RefPtr<nsHtml5StreamParser> mStreamParser;
1109 nsAutoArrayPtr<uint8_t> mData;
1110 uint32_t mLength;
1111 public:
1112 nsHtml5DataAvailable(nsHtml5StreamParser* aStreamParser,
1113 uint8_t* aData,
1114 uint32_t aLength)
1115 : mStreamParser(aStreamParser)
1116 , mData(aData)
1117 , mLength(aLength)
1118 {}
1119 NS_IMETHODIMP Run()
1120 {
1121 mozilla::MutexAutoLock autoLock(mStreamParser->mTokenizerMutex);
1122 mStreamParser->DoDataAvailable(mData, mLength);
1123 return NS_OK;
1124 }
1125 };
1126
1127 nsresult
1128 nsHtml5StreamParser::OnDataAvailable(nsIRequest* aRequest,
1129 nsISupports* aContext,
1130 nsIInputStream* aInStream,
1131 uint64_t aSourceOffset,
1132 uint32_t aLength)
1133 {
1134 nsresult rv;
1135 if (NS_FAILED(rv = mExecutor->IsBroken())) {
1136 return rv;
1137 }
1138
1139 NS_ASSERTION(mRequest == aRequest, "Got data on wrong stream.");
1140 uint32_t totalRead;
1141 // Main thread to parser thread dispatch requires copying to buffer first.
1142 if (NS_IsMainThread()) {
1143 const mozilla::fallible_t fallible = mozilla::fallible_t();
1144 nsAutoArrayPtr<uint8_t> data(new (fallible) uint8_t[aLength]);
1145 if (!data) {
1146 return mExecutor->MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
1147 }
1148 rv = aInStream->Read(reinterpret_cast<char*>(data.get()),
1149 aLength, &totalRead);
1150 NS_ENSURE_SUCCESS(rv, rv);
1151 NS_ASSERTION(totalRead <= aLength, "Read more bytes than were available?");
1152
1153 nsCOMPtr<nsIRunnable> dataAvailable = new nsHtml5DataAvailable(this,
1154 data.forget(),
1155 totalRead);
1156 if (NS_FAILED(mThread->Dispatch(dataAvailable, nsIThread::DISPATCH_NORMAL))) {
1157 NS_WARNING("Dispatching DataAvailable event failed.");
1158 }
1159 return rv;
1160 } else {
1161 NS_ASSERTION(IsParserThread(), "Wrong thread!");
1162 mozilla::MutexAutoLock autoLock(mTokenizerMutex);
1163
1164 // Read directly from response buffer.
1165 rv = aInStream->ReadSegments(CopySegmentsToParser, this, aLength,
1166 &totalRead);
1167 if (NS_FAILED(rv)) {
1168 NS_WARNING("Failed reading response data to parser");
1169 return rv;
1170 }
1171 return NS_OK;
1172 }
1173 }
1174
1175 /* static */
1176 NS_METHOD
1177 nsHtml5StreamParser::CopySegmentsToParser(nsIInputStream *aInStream,
1178 void *aClosure,
1179 const char *aFromSegment,
1180 uint32_t aToOffset,
1181 uint32_t aCount,
1182 uint32_t *aWriteCount)
1183 {
1184 nsHtml5StreamParser* parser = static_cast<nsHtml5StreamParser*>(aClosure);
1185
1186 parser->DoDataAvailable((const uint8_t*)aFromSegment, aCount);
1187 // Assume DoDataAvailable consumed all available bytes.
1188 *aWriteCount = aCount;
1189 return NS_OK;
1190 }
1191
1192 bool
1193 nsHtml5StreamParser::PreferredForInternalEncodingDecl(nsACString& aEncoding)
1194 {
1195 nsAutoCString newEncoding;
1196 if (!EncodingUtils::FindEncodingForLabel(aEncoding, newEncoding)) {
1197 // the encoding name is bogus
1198 mTreeBuilder->MaybeComplainAboutCharset("EncMetaUnsupported",
1199 true,
1200 mTokenizer->getLineNumber());
1201 return false;
1202 }
1203
1204 if (newEncoding.EqualsLiteral("UTF-16BE") ||
1205 newEncoding.EqualsLiteral("UTF-16LE")) {
1206 mTreeBuilder->MaybeComplainAboutCharset("EncMetaUtf16",
1207 true,
1208 mTokenizer->getLineNumber());
1209 newEncoding.Assign("UTF-8");
1210 }
1211
1212 if (newEncoding.EqualsLiteral("x-user-defined")) {
1213 // WebKit/Blink hack for Indian and Armenian legacy sites
1214 mTreeBuilder->MaybeComplainAboutCharset("EncMetaUserDefined",
1215 true,
1216 mTokenizer->getLineNumber());
1217 newEncoding.Assign("windows-1252");
1218 }
1219
1220 if (newEncoding.Equals(mCharset)) {
1221 if (mCharsetSource < kCharsetFromMetaPrescan) {
1222 if (mInitialEncodingWasFromParentFrame) {
1223 mTreeBuilder->MaybeComplainAboutCharset("EncLateMetaFrame",
1224 false,
1225 mTokenizer->getLineNumber());
1226 } else {
1227 mTreeBuilder->MaybeComplainAboutCharset("EncLateMeta",
1228 false,
1229 mTokenizer->getLineNumber());
1230 }
1231 }
1232 mCharsetSource = kCharsetFromMetaTag; // become confident
1233 mFeedChardet = false; // don't feed chardet when confident
1234 return false;
1235 }
1236
1237 aEncoding.Assign(newEncoding);
1238 return true;
1239 }
1240
1241 bool
1242 nsHtml5StreamParser::internalEncodingDeclaration(nsString* aEncoding)
1243 {
1244 // This code needs to stay in sync with
1245 // nsHtml5MetaScanner::tryCharset. Unfortunately, the
1246 // trickery with member fields there leads to some copy-paste reuse. :-(
1247 NS_ASSERTION(IsParserThread(), "Wrong thread!");
1248 if (mCharsetSource >= kCharsetFromMetaTag) { // this threshold corresponds to "confident" in the HTML5 spec
1249 return false;
1250 }
1251
1252 nsAutoCString newEncoding;
1253 CopyUTF16toUTF8(*aEncoding, newEncoding);
1254
1255 if (!PreferredForInternalEncodingDecl(newEncoding)) {
1256 return false;
1257 }
1258
1259 if (mReparseForbidden) {
1260 // This mReparseForbidden check happens after the call to
1261 // PreferredForInternalEncodingDecl so that if that method calls
1262 // MaybeComplainAboutCharset, its charset complaint wins over the one
1263 // below.
1264 mTreeBuilder->MaybeComplainAboutCharset("EncLateMetaTooLate",
1265 true,
1266 mTokenizer->getLineNumber());
1267 return false; // not reparsing even if we wanted to
1268 }
1269
1270 // Avoid having the chardet ask for another restart after this restart
1271 // request.
1272 mFeedChardet = false;
1273 mTreeBuilder->NeedsCharsetSwitchTo(newEncoding,
1274 kCharsetFromMetaTag,
1275 mTokenizer->getLineNumber());
1276 FlushTreeOpsAndDisarmTimer();
1277 Interrupt();
1278 // the tree op executor will cause the stream parser to terminate
1279 // if the charset switch request is accepted or it'll uninterrupt
1280 // if the request failed. Note that if the restart request fails,
1281 // we don't bother trying to make chardet resume. Might as well
1282 // assume that chardet-requested restarts would fail, too.
1283 return true;
1284 }
1285
1286 void
1287 nsHtml5StreamParser::FlushTreeOpsAndDisarmTimer()
1288 {
1289 NS_ASSERTION(IsParserThread(), "Wrong thread!");
1290 if (mFlushTimerArmed) {
1291 // avoid calling Cancel if the flush timer isn't armed to avoid acquiring
1292 // a mutex
1293 mFlushTimer->Cancel();
1294 mFlushTimerArmed = false;
1295 }
1296 if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) {
1297 mTokenizer->FlushViewSource();
1298 }
1299 mTreeBuilder->Flush();
1300 if (NS_FAILED(NS_DispatchToMainThread(mExecutorFlusher))) {
1301 NS_WARNING("failed to dispatch executor flush event");
1302 }
1303 }
1304
1305 void
1306 nsHtml5StreamParser::ParseAvailableData()
1307 {
1308 NS_ASSERTION(IsParserThread(), "Wrong thread!");
1309 mTokenizerMutex.AssertCurrentThreadOwns();
1310
1311 if (IsTerminatedOrInterrupted()) {
1312 return;
1313 }
1314
1315 for (;;) {
1316 if (!mFirstBuffer->hasMore()) {
1317 if (mFirstBuffer == mLastBuffer) {
1318 switch (mStreamState) {
1319 case STREAM_BEING_READ:
1320 // never release the last buffer.
1321 if (!mSpeculating) {
1322 // reuse buffer space if not speculating
1323 mFirstBuffer->setStart(0);
1324 mFirstBuffer->setEnd(0);
1325 }
1326 mTreeBuilder->FlushLoads();
1327 // Dispatch this runnable unconditionally, because the loads
1328 // that need flushing may have been flushed earlier even if the
1329 // flush right above here did nothing.
1330 if (NS_FAILED(NS_DispatchToMainThread(mLoadFlusher))) {
1331 NS_WARNING("failed to dispatch load flush event");
1332 }
1333 return; // no more data for now but expecting more
1334 case STREAM_ENDED:
1335 if (mAtEOF) {
1336 return;
1337 }
1338 mAtEOF = true;
1339 if (mCharsetSource < kCharsetFromMetaTag) {
1340 if (mInitialEncodingWasFromParentFrame) {
1341 // Unfortunately, this check doesn't take effect for
1342 // cross-origin frames, so cross-origin ad frames that have
1343 // no text and only an image or a Flash embed get the more
1344 // severe message from the next if block. The message is
1345 // technically accurate, though.
1346 mTreeBuilder->MaybeComplainAboutCharset("EncNoDeclarationFrame",
1347 false,
1348 0);
1349 } else if (mMode == NORMAL) {
1350 mTreeBuilder->MaybeComplainAboutCharset("EncNoDeclaration",
1351 true,
1352 0);
1353 } else if (mMode == PLAIN_TEXT) {
1354 mTreeBuilder->MaybeComplainAboutCharset("EncNoDeclarationPlain",
1355 true,
1356 0);
1357 }
1358 }
1359 mTokenizer->eof();
1360 mTreeBuilder->StreamEnded();
1361 if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) {
1362 mTokenizer->EndViewSource();
1363 }
1364 FlushTreeOpsAndDisarmTimer();
1365 return; // no more data and not expecting more
1366 default:
1367 NS_NOTREACHED("It should be impossible to reach this.");
1368 return;
1369 }
1370 }
1371 mFirstBuffer = mFirstBuffer->next;
1372 continue;
1373 }
1374
1375 // now we have a non-empty buffer
1376 mFirstBuffer->adjust(mLastWasCR);
1377 mLastWasCR = false;
1378 if (mFirstBuffer->hasMore()) {
1379 mLastWasCR = mTokenizer->tokenizeBuffer(mFirstBuffer);
1380 // At this point, internalEncodingDeclaration() may have called
1381 // Terminate, but that never happens together with script.
1382 // Can't assert that here, though, because it's possible that the main
1383 // thread has called Terminate() while this thread was parsing.
1384 if (mTreeBuilder->HasScript()) {
1385 // HasScript() cannot return true if the tree builder is preventing
1386 // script execution.
1387 MOZ_ASSERT(mMode == NORMAL);
1388 mozilla::MutexAutoLock speculationAutoLock(mSpeculationMutex);
1389 nsHtml5Speculation* speculation =
1390 new nsHtml5Speculation(mFirstBuffer,
1391 mFirstBuffer->getStart(),
1392 mTokenizer->getLineNumber(),
1393 mTreeBuilder->newSnapshot());
1394 mTreeBuilder->AddSnapshotToScript(speculation->GetSnapshot(),
1395 speculation->GetStartLineNumber());
1396 FlushTreeOpsAndDisarmTimer();
1397 mTreeBuilder->SetOpSink(speculation);
1398 mSpeculations.AppendElement(speculation); // adopts the pointer
1399 mSpeculating = true;
1400 }
1401 if (IsTerminatedOrInterrupted()) {
1402 return;
1403 }
1404 }
1405 continue;
1406 }
1407 }
1408
1409 class nsHtml5StreamParserContinuation : public nsRunnable
1410 {
1411 private:
1412 nsHtml5RefPtr<nsHtml5StreamParser> mStreamParser;
1413 public:
1414 nsHtml5StreamParserContinuation(nsHtml5StreamParser* aStreamParser)
1415 : mStreamParser(aStreamParser)
1416 {}
1417 NS_IMETHODIMP Run()
1418 {
1419 mozilla::MutexAutoLock autoLock(mStreamParser->mTokenizerMutex);
1420 mStreamParser->Uninterrupt();
1421 mStreamParser->ParseAvailableData();
1422 return NS_OK;
1423 }
1424 };
1425
1426 void
1427 nsHtml5StreamParser::ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer,
1428 nsHtml5TreeBuilder* aTreeBuilder,
1429 bool aLastWasCR)
1430 {
1431 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
1432 NS_ASSERTION(!(mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML),
1433 "ContinueAfterScripts called in view source mode!");
1434 if (NS_FAILED(mExecutor->IsBroken())) {
1435 return;
1436 }
1437 #ifdef DEBUG
1438 mExecutor->AssertStageEmpty();
1439 #endif
1440 bool speculationFailed = false;
1441 {
1442 mozilla::MutexAutoLock speculationAutoLock(mSpeculationMutex);
1443 if (mSpeculations.IsEmpty()) {
1444 NS_NOTREACHED("ContinueAfterScripts called without speculations.");
1445 return;
1446 }
1447 nsHtml5Speculation* speculation = mSpeculations.ElementAt(0);
1448 if (aLastWasCR ||
1449 !aTokenizer->isInDataState() ||
1450 !aTreeBuilder->snapshotMatches(speculation->GetSnapshot())) {
1451 speculationFailed = true;
1452 // We've got a failed speculation :-(
1453 Interrupt(); // Make the parser thread release the tokenizer mutex sooner
1454 // now fall out of the speculationAutoLock into the tokenizerAutoLock block
1455 } else {
1456 // We've got a successful speculation!
1457 if (mSpeculations.Length() > 1) {
1458 // the first speculation isn't the current speculation, so there's
1459 // no need to bother the parser thread.
1460 speculation->FlushToSink(mExecutor);
1461 NS_ASSERTION(!mExecutor->IsScriptExecuting(),
1462 "ParseUntilBlocked() was supposed to ensure we don't come "
1463 "here when scripts are executing.");
1464 NS_ASSERTION(mExecutor->IsInFlushLoop(), "How are we here if "
1465 "RunFlushLoop() didn't call ParseUntilBlocked() which is the "
1466 "only caller of this method?");
1467 mSpeculations.RemoveElementAt(0);
1468 return;
1469 }
1470 // else
1471 Interrupt(); // Make the parser thread release the tokenizer mutex sooner
1472
1473 // now fall through
1474 // the first speculation is the current speculation. Need to
1475 // release the the speculation mutex and acquire the tokenizer
1476 // mutex. (Just acquiring the other mutex here would deadlock)
1477 }
1478 }
1479 {
1480 mozilla::MutexAutoLock tokenizerAutoLock(mTokenizerMutex);
1481 #ifdef DEBUG
1482 {
1483 nsCOMPtr<nsIThread> mainThread;
1484 NS_GetMainThread(getter_AddRefs(mainThread));
1485 mAtomTable.SetPermittedLookupThread(mainThread);
1486 }
1487 #endif
1488 // In principle, the speculation mutex should be acquired here,
1489 // but there's no point, because the parser thread only acquires it
1490 // when it has also acquired the tokenizer mutex and we are already
1491 // holding the tokenizer mutex.
1492 if (speculationFailed) {
1493 // Rewind the stream
1494 mAtEOF = false;
1495 nsHtml5Speculation* speculation = mSpeculations.ElementAt(0);
1496 mFirstBuffer = speculation->GetBuffer();
1497 mFirstBuffer->setStart(speculation->GetStart());
1498 mTokenizer->setLineNumber(speculation->GetStartLineNumber());
1499
1500 nsContentUtils::ReportToConsole(nsIScriptError::warningFlag,
1501 NS_LITERAL_CSTRING("DOM Events"),
1502 mExecutor->GetDocument(),
1503 nsContentUtils::eDOM_PROPERTIES,
1504 "SpeculationFailed",
1505 nullptr, 0,
1506 nullptr,
1507 EmptyString(),
1508 speculation->GetStartLineNumber());
1509
1510 nsHtml5OwningUTF16Buffer* buffer = mFirstBuffer->next;
1511 while (buffer) {
1512 buffer->setStart(0);
1513 buffer = buffer->next;
1514 }
1515
1516 mSpeculations.Clear(); // potentially a huge number of destructors
1517 // run here synchronously on the main thread...
1518
1519 mTreeBuilder->flushCharacters(); // empty the pending buffer
1520 mTreeBuilder->ClearOps(); // now get rid of the failed ops
1521
1522 mTreeBuilder->SetOpSink(mExecutor->GetStage());
1523 mExecutor->StartReadingFromStage();
1524 mSpeculating = false;
1525
1526 // Copy state over
1527 mLastWasCR = aLastWasCR;
1528 mTokenizer->loadState(aTokenizer);
1529 mTreeBuilder->loadState(aTreeBuilder, &mAtomTable);
1530 } else {
1531 // We've got a successful speculation and at least a moment ago it was
1532 // the current speculation
1533 mSpeculations.ElementAt(0)->FlushToSink(mExecutor);
1534 NS_ASSERTION(!mExecutor->IsScriptExecuting(),
1535 "ParseUntilBlocked() was supposed to ensure we don't come "
1536 "here when scripts are executing.");
1537 NS_ASSERTION(mExecutor->IsInFlushLoop(), "How are we here if "
1538 "RunFlushLoop() didn't call ParseUntilBlocked() which is the "
1539 "only caller of this method?");
1540 mSpeculations.RemoveElementAt(0);
1541 if (mSpeculations.IsEmpty()) {
1542 // yes, it was still the only speculation. Now stop speculating
1543 // However, before telling the executor to read from stage, flush
1544 // any pending ops straight to the executor, because otherwise
1545 // they remain unflushed until we get more data from the network.
1546 mTreeBuilder->SetOpSink(mExecutor);
1547 mTreeBuilder->Flush(true);
1548 mTreeBuilder->SetOpSink(mExecutor->GetStage());
1549 mExecutor->StartReadingFromStage();
1550 mSpeculating = false;
1551 }
1552 }
1553 nsCOMPtr<nsIRunnable> event = new nsHtml5StreamParserContinuation(this);
1554 if (NS_FAILED(mThread->Dispatch(event, nsIThread::DISPATCH_NORMAL))) {
1555 NS_WARNING("Failed to dispatch nsHtml5StreamParserContinuation");
1556 }
1557 // A stream event might run before this event runs, but that's harmless.
1558 #ifdef DEBUG
1559 mAtomTable.SetPermittedLookupThread(mThread);
1560 #endif
1561 }
1562 }
1563
1564 void
1565 nsHtml5StreamParser::ContinueAfterFailedCharsetSwitch()
1566 {
1567 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
1568 nsCOMPtr<nsIRunnable> event = new nsHtml5StreamParserContinuation(this);
1569 if (NS_FAILED(mThread->Dispatch(event, nsIThread::DISPATCH_NORMAL))) {
1570 NS_WARNING("Failed to dispatch nsHtml5StreamParserContinuation");
1571 }
1572 }
1573
1574 class nsHtml5TimerKungFu : public nsRunnable
1575 {
1576 private:
1577 nsHtml5RefPtr<nsHtml5StreamParser> mStreamParser;
1578 public:
1579 nsHtml5TimerKungFu(nsHtml5StreamParser* aStreamParser)
1580 : mStreamParser(aStreamParser)
1581 {}
1582 NS_IMETHODIMP Run()
1583 {
1584 if (mStreamParser->mFlushTimer) {
1585 mStreamParser->mFlushTimer->Cancel();
1586 mStreamParser->mFlushTimer = nullptr;
1587 }
1588 return NS_OK;
1589 }
1590 };
1591
1592 void
1593 nsHtml5StreamParser::DropTimer()
1594 {
1595 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
1596 /*
1597 * Simply nulling out the timer wouldn't work, because if the timer is
1598 * armed, it needs to be canceled first. Simply canceling it first wouldn't
1599 * work, because nsTimerImpl::Cancel is not safe for calling from outside
1600 * the thread where nsTimerImpl::Fire would run. It's not safe to
1601 * dispatch a runnable to cancel the timer from the destructor of this
1602 * class, because the timer has a weak (void*) pointer back to this instance
1603 * of the stream parser and having the timer fire before the runnable
1604 * cancels it would make the timer access a deleted object.
1605 *
1606 * This DropTimer method addresses these issues. This method must be called
1607 * on the main thread before the destructor of this class is reached.
1608 * The nsHtml5TimerKungFu object has an nsHtml5RefPtr that addrefs this
1609 * stream parser object to keep it alive until the runnable is done.
1610 * The runnable cancels the timer on the parser thread, drops the timer
1611 * and lets nsHtml5RefPtr send a runnable back to the main thread to
1612 * release the stream parser.
1613 */
1614 if (mFlushTimer) {
1615 nsCOMPtr<nsIRunnable> event = new nsHtml5TimerKungFu(this);
1616 if (NS_FAILED(mThread->Dispatch(event, nsIThread::DISPATCH_NORMAL))) {
1617 NS_WARNING("Failed to dispatch TimerKungFu event");
1618 }
1619 }
1620 }
1621
1622 // Using a static, because the method name Notify is taken by the chardet
1623 // callback.
1624 void
1625 nsHtml5StreamParser::TimerCallback(nsITimer* aTimer, void* aClosure)
1626 {
1627 (static_cast<nsHtml5StreamParser*> (aClosure))->TimerFlush();
1628 }
1629
1630 void
1631 nsHtml5StreamParser::TimerFlush()
1632 {
1633 NS_ASSERTION(IsParserThread(), "Wrong thread!");
1634 mozilla::MutexAutoLock autoLock(mTokenizerMutex);
1635
1636 NS_ASSERTION(!mSpeculating, "Flush timer fired while speculating.");
1637
1638 // The timer fired if we got here. No need to cancel it. Mark it as
1639 // not armed, though.
1640 mFlushTimerArmed = false;
1641
1642 mFlushTimerEverFired = true;
1643
1644 if (IsTerminatedOrInterrupted()) {
1645 return;
1646 }
1647
1648 if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) {
1649 mTreeBuilder->Flush(); // delete useless ops
1650 if (mTokenizer->FlushViewSource()) {
1651 if (NS_FAILED(NS_DispatchToMainThread(mExecutorFlusher))) {
1652 NS_WARNING("failed to dispatch executor flush event");
1653 }
1654 }
1655 } else {
1656 // we aren't speculating and we don't know when new data is
1657 // going to arrive. Send data to the main thread.
1658 if (mTreeBuilder->Flush(true)) {
1659 if (NS_FAILED(NS_DispatchToMainThread(mExecutorFlusher))) {
1660 NS_WARNING("failed to dispatch executor flush event");
1661 }
1662 }
1663 }
1664 }
1665
1666 void
1667 nsHtml5StreamParser::MarkAsBroken(nsresult aRv)
1668 {
1669 NS_ASSERTION(IsParserThread(), "Wrong thread!");
1670 mTokenizerMutex.AssertCurrentThreadOwns();
1671
1672 Terminate();
1673 mTreeBuilder->MarkAsBroken(aRv);
1674 mozilla::DebugOnly<bool> hadOps = mTreeBuilder->Flush(false);
1675 NS_ASSERTION(hadOps, "Should have had the markAsBroken op!");
1676 if (NS_FAILED(NS_DispatchToMainThread(mExecutorFlusher))) {
1677 NS_WARNING("failed to dispatch executor flush event");
1678 }
1679 }

mercurial