parser/htmlparser/src/nsParser.cpp

changeset 0
6474c204b198
equal deleted inserted replaced
-1:000000000000 0:612bb51e23a0
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set sw=2 ts=2 et tw=79: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 #include "nsIAtom.h"
8 #include "nsParser.h"
9 #include "nsString.h"
10 #include "nsCRT.h"
11 #include "nsScanner.h"
12 #include "plstr.h"
13 #include "nsIStringStream.h"
14 #include "nsIChannel.h"
15 #include "nsICachingChannel.h"
16 #include "nsICacheEntryDescriptor.h"
17 #include "nsIInputStream.h"
18 #include "CNavDTD.h"
19 #include "prenv.h"
20 #include "prlock.h"
21 #include "prcvar.h"
22 #include "nsParserCIID.h"
23 #include "nsReadableUtils.h"
24 #include "nsCOMPtr.h"
25 #include "nsExpatDriver.h"
26 #include "nsIServiceManager.h"
27 #include "nsICategoryManager.h"
28 #include "nsISupportsPrimitives.h"
29 #include "nsIFragmentContentSink.h"
30 #include "nsStreamUtils.h"
31 #include "nsHTMLTokenizer.h"
32 #include "nsNetUtil.h"
33 #include "nsScriptLoader.h"
34 #include "nsDataHashtable.h"
35 #include "nsXPCOMCIDInternal.h"
36 #include "nsMimeTypes.h"
37 #include "mozilla/CondVar.h"
38 #include "mozilla/Mutex.h"
39 #include "nsParserConstants.h"
40 #include "nsCharsetSource.h"
41 #include "nsContentUtils.h"
42 #include "nsThreadUtils.h"
43 #include "nsIHTMLContentSink.h"
44
45 #include "mozilla/dom/EncodingUtils.h"
46
47 using namespace mozilla;
48 using mozilla::dom::EncodingUtils;
49
// Bit values stored in nsParser::mFlags.
//
// nsParser::Initialize() turns on PARSER_ENABLED, OBSERVERS_ENABLED and
// CAN_TOKENIZE by default.  PENDING_CONTINUE_EVENT is set while a dispatched
// nsParserContinueEvent is outstanding (see PostContinueEvent() and
// CancelParsingEvents()).
#define NS_PARSER_FLAG_PARSER_ENABLED         0x00000002
#define NS_PARSER_FLAG_OBSERVERS_ENABLED      0x00000004
#define NS_PARSER_FLAG_PENDING_CONTINUE_EVENT 0x00000008
#define NS_PARSER_FLAG_FLUSH_TOKENS           0x00000020
#define NS_PARSER_FLAG_CAN_TOKENIZE           0x00000040
55
56 //-------------- Begin ParseContinue Event Definition ------------------------
57 /*
58 The parser can be explicitly interrupted by passing a return value of
59 NS_ERROR_HTMLPARSER_INTERRUPTED from BuildModel on the DTD. This will cause
60 the parser to stop processing and allow the application to return to the event
61 loop. The data which was left at the time of interruption will be processed
62 the next time OnDataAvailable is called. If the parser has received its final
63 chunk of data then OnDataAvailable will no longer be called by the networking
64 module, so the parser will schedule a nsParserContinueEvent which will call
65 the parser to process the remaining data after returning to the event loop.
66 If the parser is interrupted while processing the remaining data it will
67 schedule another ParseContinueEvent. The processing of data followed by
68 scheduling of the continue events will proceed until either:
69
70 1) All of the remaining data can be processed without interrupting
71 2) The parser has been cancelled.
72
73
74 This capability is currently used in CNavDTD and nsHTMLContentSink. The
75 nsHTMLContentSink is notified by CNavDTD when a chunk of tokens is going to be
76 processed and when each token is processed. The nsHTML content sink records
77 the time when the chunk has started processing and will return
78 NS_ERROR_HTMLPARSER_INTERRUPTED if the token processing time has exceeded a
79 threshold called max tokenizing processing time. This allows the content sink
80 to limit how much data is processed in a single chunk which in turn gates how
81 much time is spent away from the event loop. Processing smaller chunks of data
82 also reduces the time spent in subsequent reflows.
83
84 This capability is most apparent when loading large documents. If the maximum
85 token processing time is set small enough the application will remain
86 responsive during document load.
87
88 A side-effect of this capability is that document load is not complete when
89 the last chunk of data is passed to OnDataAvailable since the parser may have
90 been interrupted when the last chunk of data arrived. The document is complete
91 when all of the document has been tokenized and there aren't any pending
92 nsParserContinueEvents. This can cause problems if the application assumes
93 that it can monitor the load requests to determine when the document load has
94 been completed. This is what happens in Mozilla. The document is considered
95 completely loaded when all of the load requests have been satisfied. To delay
96 the document load until all of the parsing has been completed the
97 nsHTMLContentSink adds a dummy parser load request which is not removed until
98 the nsHTMLContentSink's DidBuildModel is called. The CNavDTD will not call
99 DidBuildModel until the final chunk of data has been passed to the parser
100 through the OnDataAvailable and there aren't any pending
101 nsParserContinueEvents.
102
103 Currently the parser ignores requests to be interrupted during the
104 processing of script. This is because a document.write followed by JavaScript
105 calls to manipulate the DOM may fail if the parser was interrupted during the
106 document.write.
107
108 For more details @see bugzilla bug 76722
109 */
110
111
112 class nsParserContinueEvent : public nsRunnable
113 {
114 public:
115 nsRefPtr<nsParser> mParser;
116
117 nsParserContinueEvent(nsParser* aParser)
118 : mParser(aParser)
119 {}
120
121 NS_IMETHOD Run()
122 {
123 mParser->HandleParserContinueEvent(this);
124 return NS_OK;
125 }
126 };
127
128 //-------------- End ParseContinue Event Definition ------------------------
129
130 /**
131 * default constructor
132 */
133 nsParser::nsParser()
134 {
135 Initialize(true);
136 }
137
138 nsParser::~nsParser()
139 {
140 Cleanup();
141 }
142
143 void
144 nsParser::Initialize(bool aConstructor)
145 {
146 if (aConstructor) {
147 // Raw pointer
148 mParserContext = 0;
149 }
150 else {
151 // nsCOMPtrs
152 mObserver = nullptr;
153 mUnusedInput.Truncate();
154 }
155
156 mContinueEvent = nullptr;
157 mCharsetSource = kCharsetUninitialized;
158 mCharset.AssignLiteral("ISO-8859-1");
159 mInternalState = NS_OK;
160 mStreamStatus = NS_OK;
161 mCommand = eViewNormal;
162 mFlags = NS_PARSER_FLAG_OBSERVERS_ENABLED |
163 NS_PARSER_FLAG_PARSER_ENABLED |
164 NS_PARSER_FLAG_CAN_TOKENIZE;
165
166 mProcessingNetworkData = false;
167 mIsAboutBlank = false;
168 }
169
170 void
171 nsParser::Cleanup()
172 {
173 #ifdef DEBUG
174 if (mParserContext && mParserContext->mPrevContext) {
175 NS_WARNING("Extra parser contexts still on the parser stack");
176 }
177 #endif
178
179 while (mParserContext) {
180 CParserContext *pc = mParserContext->mPrevContext;
181 delete mParserContext;
182 mParserContext = pc;
183 }
184
185 // It should not be possible for this flag to be set when we are getting
186 // destroyed since this flag implies a pending nsParserContinueEvent, which
187 // has an owning reference to |this|.
188 NS_ASSERTION(!(mFlags & NS_PARSER_FLAG_PENDING_CONTINUE_EVENT), "bad");
189 }
190
191 NS_IMPL_CYCLE_COLLECTION_CLASS(nsParser)
192
193 NS_IMPL_CYCLE_COLLECTION_UNLINK_BEGIN(nsParser)
194 NS_IMPL_CYCLE_COLLECTION_UNLINK(mDTD)
195 NS_IMPL_CYCLE_COLLECTION_UNLINK(mSink)
196 NS_IMPL_CYCLE_COLLECTION_UNLINK(mObserver)
197 NS_IMPL_CYCLE_COLLECTION_UNLINK_END
198
199 NS_IMPL_CYCLE_COLLECTION_TRAVERSE_BEGIN(nsParser)
200 NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mDTD)
201 NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mSink)
202 NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mObserver)
203 CParserContext *pc = tmp->mParserContext;
204 while (pc) {
205 cb.NoteXPCOMChild(pc->mTokenizer);
206 pc = pc->mPrevContext;
207 }
208 NS_IMPL_CYCLE_COLLECTION_TRAVERSE_END
209
210 NS_IMPL_CYCLE_COLLECTING_ADDREF(nsParser)
211 NS_IMPL_CYCLE_COLLECTING_RELEASE(nsParser)
212 NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(nsParser)
213 NS_INTERFACE_MAP_ENTRY(nsIStreamListener)
214 NS_INTERFACE_MAP_ENTRY(nsIParser)
215 NS_INTERFACE_MAP_ENTRY(nsIRequestObserver)
216 NS_INTERFACE_MAP_ENTRY(nsISupportsWeakReference)
217 NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsIParser)
218 NS_INTERFACE_MAP_END
219
220 // The parser continue event is posted only if
221 // all of the data to parse has been passed to ::OnDataAvailable
222 // and the parser has been interrupted by the content sink
223 // because the processing of tokens took too long.
224
225 nsresult
226 nsParser::PostContinueEvent()
227 {
228 if (!(mFlags & NS_PARSER_FLAG_PENDING_CONTINUE_EVENT)) {
229 // If this flag isn't set, then there shouldn't be a live continue event!
230 NS_ASSERTION(!mContinueEvent, "bad");
231
232 // This creates a reference cycle between this and the event that is
233 // broken when the event fires.
234 nsCOMPtr<nsIRunnable> event = new nsParserContinueEvent(this);
235 if (NS_FAILED(NS_DispatchToCurrentThread(event))) {
236 NS_WARNING("failed to dispatch parser continuation event");
237 } else {
238 mFlags |= NS_PARSER_FLAG_PENDING_CONTINUE_EVENT;
239 mContinueEvent = event;
240 }
241 }
242 return NS_OK;
243 }
244
245 NS_IMETHODIMP_(void)
246 nsParser::GetCommand(nsCString& aCommand)
247 {
248 aCommand = mCommandStr;
249 }
250
251 /**
252 * Call this method once you've created a parser, and want to instruct it
253 * about the command which caused the parser to be constructed. For example,
254 * this allows us to select a DTD which can do, say, view-source.
255 *
256 * @param aCommand the command string to set
257 */
258 NS_IMETHODIMP_(void)
259 nsParser::SetCommand(const char* aCommand)
260 {
261 mCommandStr.Assign(aCommand);
262 if (mCommandStr.Equals("view-source")) {
263 mCommand = eViewSource;
264 } else if (mCommandStr.Equals("view-fragment")) {
265 mCommand = eViewFragment;
266 } else {
267 mCommand = eViewNormal;
268 }
269 }
270
271 /**
272 * Call this method once you've created a parser, and want to instruct it
273 * about the command which caused the parser to be constructed. For example,
274 * this allows us to select a DTD which can do, say, view-source.
275 *
276 * @param aParserCommand the command to set
277 */
278 NS_IMETHODIMP_(void)
279 nsParser::SetCommand(eParserCommands aParserCommand)
280 {
281 mCommand = aParserCommand;
282 }
283
284 /**
285 * Call this method once you've created a parser, and want to instruct it
286 * about what charset to load
287 *
288 * @param aCharset- the charset of a document
289 * @param aCharsetSource- the source of the charset
290 */
291 NS_IMETHODIMP_(void)
292 nsParser::SetDocumentCharset(const nsACString& aCharset, int32_t aCharsetSource)
293 {
294 mCharset = aCharset;
295 mCharsetSource = aCharsetSource;
296 if (mParserContext && mParserContext->mScanner) {
297 mParserContext->mScanner->SetDocumentCharset(aCharset, aCharsetSource);
298 }
299 }
300
301 void
302 nsParser::SetSinkCharset(nsACString& aCharset)
303 {
304 if (mSink) {
305 mSink->SetDocumentCharset(aCharset);
306 }
307 }
308
309 /**
310 * This method gets called in order to set the content
311 * sink for this parser to dump nodes to.
312 *
313 * @param nsIContentSink interface for node receiver
314 */
315 NS_IMETHODIMP_(void)
316 nsParser::SetContentSink(nsIContentSink* aSink)
317 {
318 NS_PRECONDITION(aSink, "sink cannot be null!");
319 mSink = aSink;
320
321 if (mSink) {
322 mSink->SetParser(this);
323 nsCOMPtr<nsIHTMLContentSink> htmlSink = do_QueryInterface(mSink);
324 if (htmlSink) {
325 mIsAboutBlank = true;
326 }
327 }
328 }
329
330 /**
331 * retrieve the sink set into the parser
332 * @return current sink
333 */
334 NS_IMETHODIMP_(nsIContentSink*)
335 nsParser::GetContentSink()
336 {
337 return mSink;
338 }
339
340 /**
341 * Determine what DTD mode (and thus what layout nsCompatibility mode)
342 * to use for this document based on the first chunk of data received
343 * from the network (each parsercontext can have its own mode). (No,
344 * this is not an optimal solution -- we really don't need to know until
345 * after we've received the DOCTYPE, and this could easily be part of
346 * the regular parsing process if the parser were designed in a way that
347 * made such modifications easy.)
348 */
349
350 // Parse the PS production in the SGML spec (excluding the part dealing
351 // with entity references) starting at theIndex into theBuffer, and
352 // return the first index after the end of the production.
353 static int32_t
354 ParsePS(const nsString& aBuffer, int32_t aIndex)
355 {
356 for (;;) {
357 char16_t ch = aBuffer.CharAt(aIndex);
358 if ((ch == char16_t(' ')) || (ch == char16_t('\t')) ||
359 (ch == char16_t('\n')) || (ch == char16_t('\r'))) {
360 ++aIndex;
361 } else if (ch == char16_t('-')) {
362 int32_t tmpIndex;
363 if (aBuffer.CharAt(aIndex+1) == char16_t('-') &&
364 kNotFound != (tmpIndex=aBuffer.Find("--",false,aIndex+2,-1))) {
365 aIndex = tmpIndex + 2;
366 } else {
367 return aIndex;
368 }
369 } else {
370 return aIndex;
371 }
372 }
373 }
374
375 #define PARSE_DTD_HAVE_DOCTYPE (1<<0)
376 #define PARSE_DTD_HAVE_PUBLIC_ID (1<<1)
377 #define PARSE_DTD_HAVE_SYSTEM_ID (1<<2)
378 #define PARSE_DTD_HAVE_INTERNAL_SUBSET (1<<3)
379
380 // return true on success (includes not present), false on failure
381 static bool
382 ParseDocTypeDecl(const nsString &aBuffer,
383 int32_t *aResultFlags,
384 nsString &aPublicID,
385 nsString &aSystemID)
386 {
387 bool haveDoctype = false;
388 *aResultFlags = 0;
389
390 // Skip through any comments and processing instructions
391 // The PI-skipping is a bit of a hack.
392 int32_t theIndex = 0;
393 do {
394 theIndex = aBuffer.FindChar('<', theIndex);
395 if (theIndex == kNotFound) break;
396 char16_t nextChar = aBuffer.CharAt(theIndex+1);
397 if (nextChar == char16_t('!')) {
398 int32_t tmpIndex = theIndex + 2;
399 if (kNotFound !=
400 (theIndex=aBuffer.Find("DOCTYPE", true, tmpIndex, 0))) {
401 haveDoctype = true;
402 theIndex += 7; // skip "DOCTYPE"
403 break;
404 }
405 theIndex = ParsePS(aBuffer, tmpIndex);
406 theIndex = aBuffer.FindChar('>', theIndex);
407 } else if (nextChar == char16_t('?')) {
408 theIndex = aBuffer.FindChar('>', theIndex);
409 } else {
410 break;
411 }
412 } while (theIndex != kNotFound);
413
414 if (!haveDoctype)
415 return true;
416 *aResultFlags |= PARSE_DTD_HAVE_DOCTYPE;
417
418 theIndex = ParsePS(aBuffer, theIndex);
419 theIndex = aBuffer.Find("HTML", true, theIndex, 0);
420 if (kNotFound == theIndex)
421 return false;
422 theIndex = ParsePS(aBuffer, theIndex+4);
423 int32_t tmpIndex = aBuffer.Find("PUBLIC", true, theIndex, 0);
424
425 if (kNotFound != tmpIndex) {
426 theIndex = ParsePS(aBuffer, tmpIndex+6);
427
428 // We get here only if we've read <!DOCTYPE HTML PUBLIC
429 // (not case sensitive) possibly with comments within.
430
431 // Now find the beginning and end of the public identifier
432 // and the system identifier (if present).
433
434 char16_t lit = aBuffer.CharAt(theIndex);
435 if ((lit != char16_t('\"')) && (lit != char16_t('\'')))
436 return false;
437
438 // Start is the first character, excluding the quote, and End is
439 // the final quote, so there are (end-start) characters.
440
441 int32_t PublicIDStart = theIndex + 1;
442 int32_t PublicIDEnd = aBuffer.FindChar(lit, PublicIDStart);
443 if (kNotFound == PublicIDEnd)
444 return false;
445 theIndex = ParsePS(aBuffer, PublicIDEnd + 1);
446 char16_t next = aBuffer.CharAt(theIndex);
447 if (next == char16_t('>')) {
448 // There was a public identifier, but no system
449 // identifier,
450 // so do nothing.
451 // This is needed to avoid the else at the end, and it's
452 // also the most common case.
453 } else if ((next == char16_t('\"')) ||
454 (next == char16_t('\''))) {
455 // We found a system identifier.
456 *aResultFlags |= PARSE_DTD_HAVE_SYSTEM_ID;
457 int32_t SystemIDStart = theIndex + 1;
458 int32_t SystemIDEnd = aBuffer.FindChar(next, SystemIDStart);
459 if (kNotFound == SystemIDEnd)
460 return false;
461 aSystemID =
462 Substring(aBuffer, SystemIDStart, SystemIDEnd - SystemIDStart);
463 } else if (next == char16_t('[')) {
464 // We found an internal subset.
465 *aResultFlags |= PARSE_DTD_HAVE_INTERNAL_SUBSET;
466 } else {
467 // Something's wrong.
468 return false;
469 }
470
471 // Since a public ID is a minimum literal, we must trim
472 // and collapse whitespace
473 aPublicID = Substring(aBuffer, PublicIDStart, PublicIDEnd - PublicIDStart);
474 aPublicID.CompressWhitespace(true, true);
475 *aResultFlags |= PARSE_DTD_HAVE_PUBLIC_ID;
476 } else {
477 tmpIndex=aBuffer.Find("SYSTEM", true, theIndex, 0);
478 if (kNotFound != tmpIndex) {
479 // DOCTYPES with system ID but no Public ID
480 *aResultFlags |= PARSE_DTD_HAVE_SYSTEM_ID;
481
482 theIndex = ParsePS(aBuffer, tmpIndex+6);
483 char16_t next = aBuffer.CharAt(theIndex);
484 if (next != char16_t('\"') && next != char16_t('\''))
485 return false;
486
487 int32_t SystemIDStart = theIndex + 1;
488 int32_t SystemIDEnd = aBuffer.FindChar(next, SystemIDStart);
489
490 if (kNotFound == SystemIDEnd)
491 return false;
492 aSystemID =
493 Substring(aBuffer, SystemIDStart, SystemIDEnd - SystemIDStart);
494 theIndex = ParsePS(aBuffer, SystemIDEnd + 1);
495 }
496
497 char16_t nextChar = aBuffer.CharAt(theIndex);
498 if (nextChar == char16_t('['))
499 *aResultFlags |= PARSE_DTD_HAVE_INTERNAL_SUBSET;
500 else if (nextChar != char16_t('>'))
501 return false;
502 }
503 return true;
504 }
505
// One row of the public-identifier table: a public ID and the compatibility
// mode it selects, depending on whether the DOCTYPE also carries a system ID.
struct PubIDInfo
{
  enum eMode {
    eQuirks,          /* always quirks mode, unless there's an internal subset */
    eAlmostStandards, /* eCompatibility_AlmostStandards */
    eFullStandards    /* eCompatibility_FullStandards */
    /*
     * Public IDs that should trigger strict mode are not listed, since
     * we want all future public IDs to trigger strict mode as well.
     */
  };

  const char* name;        // public identifier, lower-cased
  eMode mode_if_no_sysid;  // mode when the DOCTYPE has no system identifier
  eMode mode_if_sysid;     // mode when the DOCTYPE has a system identifier
};
523
524 #define ELEMENTS_OF(array_) (sizeof(array_)/sizeof(array_[0]))
525
526 // These must be in nsCRT::strcmp order so binary-search can be used.
527 // This is verified, |#ifdef DEBUG|, below.
528
529 // Even though public identifiers should be case sensitive, we will do
530 // all comparisons after converting to lower case in order to do
531 // case-insensitive comparison since there are a number of existing web
532 // sites that use the incorrect case. Therefore all of the public
533 // identifiers below are in lower case (with the correct case following,
534 // in comments). The case is verified, |#ifdef DEBUG|, below.
535 static const PubIDInfo kPublicIDs[] = {
536 {"+//silmaril//dtd html pro v0r11 19970101//en" /* "+//Silmaril//dtd html Pro v0r11 19970101//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
537 {"-//advasoft ltd//dtd html 3.0 aswedit + extensions//en" /* "-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
538 {"-//as//dtd html 3.0 aswedit + extensions//en" /* "-//AS//DTD HTML 3.0 asWedit + extensions//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
539 {"-//ietf//dtd html 2.0 level 1//en" /* "-//IETF//DTD HTML 2.0 Level 1//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
540 {"-//ietf//dtd html 2.0 level 2//en" /* "-//IETF//DTD HTML 2.0 Level 2//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
541 {"-//ietf//dtd html 2.0 strict level 1//en" /* "-//IETF//DTD HTML 2.0 Strict Level 1//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
542 {"-//ietf//dtd html 2.0 strict level 2//en" /* "-//IETF//DTD HTML 2.0 Strict Level 2//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
543 {"-//ietf//dtd html 2.0 strict//en" /* "-//IETF//DTD HTML 2.0 Strict//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
544 {"-//ietf//dtd html 2.0//en" /* "-//IETF//DTD HTML 2.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
545 {"-//ietf//dtd html 2.1e//en" /* "-//IETF//DTD HTML 2.1E//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
546 {"-//ietf//dtd html 3.0//en" /* "-//IETF//DTD HTML 3.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
547 {"-//ietf//dtd html 3.0//en//" /* "-//IETF//DTD HTML 3.0//EN//" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
548 {"-//ietf//dtd html 3.2 final//en" /* "-//IETF//DTD HTML 3.2 Final//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
549 {"-//ietf//dtd html 3.2//en" /* "-//IETF//DTD HTML 3.2//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
550 {"-//ietf//dtd html 3//en" /* "-//IETF//DTD HTML 3//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
551 {"-//ietf//dtd html level 0//en" /* "-//IETF//DTD HTML Level 0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
552 {"-//ietf//dtd html level 0//en//2.0" /* "-//IETF//DTD HTML Level 0//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
553 {"-//ietf//dtd html level 1//en" /* "-//IETF//DTD HTML Level 1//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
554 {"-//ietf//dtd html level 1//en//2.0" /* "-//IETF//DTD HTML Level 1//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
555 {"-//ietf//dtd html level 2//en" /* "-//IETF//DTD HTML Level 2//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
556 {"-//ietf//dtd html level 2//en//2.0" /* "-//IETF//DTD HTML Level 2//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
557 {"-//ietf//dtd html level 3//en" /* "-//IETF//DTD HTML Level 3//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
558 {"-//ietf//dtd html level 3//en//3.0" /* "-//IETF//DTD HTML Level 3//EN//3.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
559 {"-//ietf//dtd html strict level 0//en" /* "-//IETF//DTD HTML Strict Level 0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
560 {"-//ietf//dtd html strict level 0//en//2.0" /* "-//IETF//DTD HTML Strict Level 0//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
561 {"-//ietf//dtd html strict level 1//en" /* "-//IETF//DTD HTML Strict Level 1//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
562 {"-//ietf//dtd html strict level 1//en//2.0" /* "-//IETF//DTD HTML Strict Level 1//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
563 {"-//ietf//dtd html strict level 2//en" /* "-//IETF//DTD HTML Strict Level 2//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
564 {"-//ietf//dtd html strict level 2//en//2.0" /* "-//IETF//DTD HTML Strict Level 2//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
565 {"-//ietf//dtd html strict level 3//en" /* "-//IETF//DTD HTML Strict Level 3//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
566 {"-//ietf//dtd html strict level 3//en//3.0" /* "-//IETF//DTD HTML Strict Level 3//EN//3.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
567 {"-//ietf//dtd html strict//en" /* "-//IETF//DTD HTML Strict//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
568 {"-//ietf//dtd html strict//en//2.0" /* "-//IETF//DTD HTML Strict//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
569 {"-//ietf//dtd html strict//en//3.0" /* "-//IETF//DTD HTML Strict//EN//3.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
570 {"-//ietf//dtd html//en" /* "-//IETF//DTD HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
571 {"-//ietf//dtd html//en//2.0" /* "-//IETF//DTD HTML//EN//2.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
572 {"-//ietf//dtd html//en//3.0" /* "-//IETF//DTD HTML//EN//3.0" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
573 {"-//metrius//dtd metrius presentational//en" /* "-//Metrius//DTD Metrius Presentational//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
574 {"-//microsoft//dtd internet explorer 2.0 html strict//en" /* "-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
575 {"-//microsoft//dtd internet explorer 2.0 html//en" /* "-//Microsoft//DTD Internet Explorer 2.0 HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
576 {"-//microsoft//dtd internet explorer 2.0 tables//en" /* "-//Microsoft//DTD Internet Explorer 2.0 Tables//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
577 {"-//microsoft//dtd internet explorer 3.0 html strict//en" /* "-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
578 {"-//microsoft//dtd internet explorer 3.0 html//en" /* "-//Microsoft//DTD Internet Explorer 3.0 HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
579 {"-//microsoft//dtd internet explorer 3.0 tables//en" /* "-//Microsoft//DTD Internet Explorer 3.0 Tables//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
580 {"-//netscape comm. corp.//dtd html//en" /* "-//Netscape Comm. Corp.//DTD HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
581 {"-//netscape comm. corp.//dtd strict html//en" /* "-//Netscape Comm. Corp.//DTD Strict HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
582 {"-//o'reilly and associates//dtd html 2.0//en" /* "-//O'Reilly and Associates//DTD HTML 2.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
583 {"-//o'reilly and associates//dtd html extended 1.0//en" /* "-//O'Reilly and Associates//DTD HTML Extended 1.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
584 {"-//o'reilly and associates//dtd html extended relaxed 1.0//en" /* "-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
585 {"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//en" /* "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
586 {"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//en" /* "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
587 {"-//spyglass//dtd html 2.0 extended//en" /* "-//Spyglass//DTD HTML 2.0 Extended//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
588 {"-//sq//dtd html 2.0 hotmetal + extensions//en" /* "-//SQ//DTD HTML 2.0 HoTMetaL + extensions//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
589 {"-//sun microsystems corp.//dtd hotjava html//en" /* "-//Sun Microsystems Corp.//DTD HotJava HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
590 {"-//sun microsystems corp.//dtd hotjava strict html//en" /* "-//Sun Microsystems Corp.//DTD HotJava Strict HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
591 {"-//w3c//dtd html 3 1995-03-24//en" /* "-//W3C//DTD HTML 3 1995-03-24//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
592 {"-//w3c//dtd html 3.2 draft//en" /* "-//W3C//DTD HTML 3.2 Draft//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
593 {"-//w3c//dtd html 3.2 final//en" /* "-//W3C//DTD HTML 3.2 Final//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
594 {"-//w3c//dtd html 3.2//en" /* "-//W3C//DTD HTML 3.2//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
595 {"-//w3c//dtd html 3.2s draft//en" /* "-//W3C//DTD HTML 3.2S Draft//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
596 {"-//w3c//dtd html 4.0 frameset//en" /* "-//W3C//DTD HTML 4.0 Frameset//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
597 {"-//w3c//dtd html 4.0 transitional//en" /* "-//W3C//DTD HTML 4.0 Transitional//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
598 {"-//w3c//dtd html 4.01 frameset//en" /* "-//W3C//DTD HTML 4.01 Frameset//EN" */, PubIDInfo::eQuirks, PubIDInfo::eAlmostStandards},
599 {"-//w3c//dtd html 4.01 transitional//en" /* "-//W3C//DTD HTML 4.01 Transitional//EN" */, PubIDInfo::eQuirks, PubIDInfo::eAlmostStandards},
600 {"-//w3c//dtd html experimental 19960712//en" /* "-//W3C//DTD HTML Experimental 19960712//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
601 {"-//w3c//dtd html experimental 970421//en" /* "-//W3C//DTD HTML Experimental 970421//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
602 {"-//w3c//dtd w3 html//en" /* "-//W3C//DTD W3 HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
603 {"-//w3c//dtd xhtml 1.0 frameset//en" /* "-//W3C//DTD XHTML 1.0 Frameset//EN" */, PubIDInfo::eAlmostStandards, PubIDInfo::eAlmostStandards},
604 {"-//w3c//dtd xhtml 1.0 transitional//en" /* "-//W3C//DTD XHTML 1.0 Transitional//EN" */, PubIDInfo::eAlmostStandards, PubIDInfo::eAlmostStandards},
605 {"-//w3o//dtd w3 html 3.0//en" /* "-//W3O//DTD W3 HTML 3.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
606 {"-//w3o//dtd w3 html 3.0//en//" /* "-//W3O//DTD W3 HTML 3.0//EN//" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
607 {"-//w3o//dtd w3 html strict 3.0//en//" /* "-//W3O//DTD W3 HTML Strict 3.0//EN//" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
608 {"-//webtechs//dtd mozilla html 2.0//en" /* "-//WebTechs//DTD Mozilla HTML 2.0//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
609 {"-//webtechs//dtd mozilla html//en" /* "-//WebTechs//DTD Mozilla HTML//EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
610 {"-/w3c/dtd html 4.0 transitional/en" /* "-/W3C/DTD HTML 4.0 Transitional/EN" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
611 {"html" /* "HTML" */, PubIDInfo::eQuirks, PubIDInfo::eQuirks},
612 };
613
#ifdef DEBUG
/**
 * Debug-only sanity check of kPublicIDs, run at most once per process:
 *  - adjacent entries must be in strictly increasing nsCRT::strcmp order
 *    (the lookup uses binary search), and
 *  - every entry must already be lower case (lookups lower-case the key).
 *
 * For each violation the offending entries are printed first and the
 * assertion is raised afterwards, so the diagnostic is still emitted when
 * assertions are configured to stop the process.
 */
static void
VerifyPublicIDs()
{
  static bool gVerified = false;
  if (gVerified) {
    return;
  }
  gVerified = true;

  uint32_t i;

  // Ordering check over each adjacent pair.
  for (i = 0; i < ELEMENTS_OF(kPublicIDs) - 1; ++i) {
    if (nsCRT::strcmp(kPublicIDs[i].name, kPublicIDs[i+1].name) >= 0) {
      printf("Doctypes %s and %s out of order.\n",
             kPublicIDs[i].name, kPublicIDs[i+1].name);
      NS_NOTREACHED("doctypes out of order");
    }
  }

  // Case check: an entry must equal its own lower-cased copy.
  for (i = 0; i < ELEMENTS_OF(kPublicIDs); ++i) {
    nsAutoCString lcPubID(kPublicIDs[i].name);
    ToLowerCase(lcPubID);
    if (nsCRT::strcmp(kPublicIDs[i].name, lcPubID.get()) != 0) {
      printf("Doctype %s not lower case.\n", kPublicIDs[i].name);
      NS_NOTREACHED("doctype not lower case");
    }
  }
}
#endif
640
641 static void
642 DetermineHTMLParseMode(const nsString& aBuffer,
643 nsDTDMode& aParseMode,
644 eParserDocType& aDocType)
645 {
646 #ifdef DEBUG
647 VerifyPublicIDs();
648 #endif
649 int32_t resultFlags;
650 nsAutoString publicIDUCS2, sysIDUCS2;
651 if (ParseDocTypeDecl(aBuffer, &resultFlags, publicIDUCS2, sysIDUCS2)) {
652 if (!(resultFlags & PARSE_DTD_HAVE_DOCTYPE)) {
653 // no DOCTYPE
654 aParseMode = eDTDMode_quirks;
655 aDocType = eHTML_Quirks;
656 } else if ((resultFlags & PARSE_DTD_HAVE_INTERNAL_SUBSET) ||
657 !(resultFlags & PARSE_DTD_HAVE_PUBLIC_ID)) {
658 // A doctype with an internal subset is always full_standards.
659 // A doctype without a public ID is always full_standards.
660 aDocType = eHTML_Strict;
661 aParseMode = eDTDMode_full_standards;
662
663 // Special hack for IBM's custom DOCTYPE.
664 if (!(resultFlags & PARSE_DTD_HAVE_INTERNAL_SUBSET) &&
665 sysIDUCS2 == NS_LITERAL_STRING(
666 "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")) {
667 aParseMode = eDTDMode_quirks;
668 aDocType = eHTML_Quirks;
669 }
670
671 } else {
672 // We have to check our list of public IDs to see what to do.
673 // Yes, we want UCS2 to ASCII lossy conversion.
674 nsAutoCString publicID;
675 publicID.AssignWithConversion(publicIDUCS2);
676
677 // See comment above definition of kPublicIDs about case
678 // sensitivity.
679 ToLowerCase(publicID);
680
681 // Binary search to see if we can find the correct public ID
682 // These must be signed since maximum can go below zero and we'll
683 // crash if it's unsigned.
684 int32_t minimum = 0;
685 int32_t maximum = ELEMENTS_OF(kPublicIDs) - 1;
686 int32_t index;
687 for (;;) {
688 index = (minimum + maximum) / 2;
689 int32_t comparison =
690 nsCRT::strcmp(publicID.get(), kPublicIDs[index].name);
691 if (comparison == 0)
692 break;
693 if (comparison < 0)
694 maximum = index - 1;
695 else
696 minimum = index + 1;
697
698 if (maximum < minimum) {
699 // The DOCTYPE is not in our list, so it must be full_standards.
700 aParseMode = eDTDMode_full_standards;
701 aDocType = eHTML_Strict;
702 return;
703 }
704 }
705
706 switch ((resultFlags & PARSE_DTD_HAVE_SYSTEM_ID)
707 ? kPublicIDs[index].mode_if_sysid
708 : kPublicIDs[index].mode_if_no_sysid)
709 {
710 case PubIDInfo::eQuirks:
711 aParseMode = eDTDMode_quirks;
712 aDocType = eHTML_Quirks;
713 break;
714 case PubIDInfo::eAlmostStandards:
715 aParseMode = eDTDMode_almost_standards;
716 aDocType = eHTML_Strict;
717 break;
718 case PubIDInfo::eFullStandards:
719 aParseMode = eDTDMode_full_standards;
720 aDocType = eHTML_Strict;
721 break;
722 default:
723 NS_NOTREACHED("no other cases!");
724 }
725 }
726 } else {
727 // badly formed DOCTYPE -> quirks
728 aParseMode = eDTDMode_quirks;
729 aDocType = eHTML_Quirks;
730 }
731 }
732
733 static void
734 DetermineParseMode(const nsString& aBuffer, nsDTDMode& aParseMode,
735 eParserDocType& aDocType, const nsACString& aMimeType)
736 {
737 if (aMimeType.EqualsLiteral(TEXT_HTML)) {
738 DetermineHTMLParseMode(aBuffer, aParseMode, aDocType);
739 } else if (nsContentUtils::IsPlainTextType(aMimeType)) {
740 aDocType = ePlainText;
741 aParseMode = eDTDMode_quirks;
742 } else { // Some form of XML
743 aDocType = eXML;
744 aParseMode = eDTDMode_full_standards;
745 }
746 }
747
748 static nsIDTD*
749 FindSuitableDTD(CParserContext& aParserContext)
750 {
751 // We always find a DTD.
752 aParserContext.mAutoDetectStatus = ePrimaryDetect;
753
754 // Quick check for view source.
755 NS_ABORT_IF_FALSE(aParserContext.mParserCommand != eViewSource,
756 "The old parser is not supposed to be used for View Source anymore.");
757
758 // Now see if we're parsing HTML (which, as far as we're concerned, simply
759 // means "not XML").
760 if (aParserContext.mDocType != eXML) {
761 return new CNavDTD();
762 }
763
764 // If we're here, then we'd better be parsing XML.
765 NS_ASSERTION(aParserContext.mDocType == eXML, "What are you trying to send me, here?");
766 return new nsExpatDriver();
767 }
768
769 NS_IMETHODIMP
770 nsParser::CancelParsingEvents()
771 {
772 if (mFlags & NS_PARSER_FLAG_PENDING_CONTINUE_EVENT) {
773 NS_ASSERTION(mContinueEvent, "mContinueEvent is null");
774 // Revoke the pending continue parsing event
775 mContinueEvent = nullptr;
776 mFlags &= ~NS_PARSER_FLAG_PENDING_CONTINUE_EVENT;
777 }
778 return NS_OK;
779 }
780
781 ////////////////////////////////////////////////////////////////////////
782
/**
 * Evaluates EXPR1 and EXPR2 exactly once each, in that order.  Stores the value
 * of EXPR2 in RV if EXPR2 fails, otherwise RV contains the result of EXPR1
 * (which could be success or failure).
 *
 * To understand the motivation for this construct, consider these example
 * methods:
 *
 *   nsresult nsSomething::DoThatThing(nsIWhatever* obj) {
 *     nsresult rv = NS_OK;
 *     ...
 *     rv = obj->DoThatThing();
 *     NS_ENSURE_SUCCESS(rv, rv);
 *     ...
 *     return rv;
 *   }
 *
 *   nsresult nsCaller::MakeThingsHappen() {
 *     return mSomething->DoThatThing(mWhatever);
 *   }
 *
 * Suppose, for whatever reason*, we want to shift responsibility for calling
 * mWhatever->DoThatThing() from nsSomething::DoThatThing up to
 * nsCaller::MakeThingsHappen.  We might rewrite the two methods as follows:
 *
 *   nsresult nsSomething::DoThatThing() {
 *     nsresult rv = NS_OK;
 *     ...
 *     ...
 *     return rv;
 *   }
 *
 *   nsresult nsCaller::MakeThingsHappen() {
 *     nsresult rv;
 *     PREFER_LATTER_ERROR_CODE(mSomething->DoThatThing(),
 *                              mWhatever->DoThatThing(),
 *                              rv);
 *     return rv;
 *   }
 *
 * *Possible reasons include: nsCaller doesn't want to give mSomething access
 * to mWhatever, nsCaller wants to guarantee that mWhatever->DoThatThing() will
 * be called regardless of how nsSomething::DoThatThing behaves, &c.
 *
 * RV must be a plain identifier naming an nsresult lvalue: it is pasted into
 * the name of the temporary that holds EXPR1's result.
 */
#define PREFER_LATTER_ERROR_CODE(EXPR1, EXPR2, RV) {                          \
  nsresult RV##__temp = EXPR1;                                                \
  RV = EXPR2;                                                                 \
  /* Only fall back to EXPR1's result when EXPR2 did not fail. */             \
  if (NS_SUCCEEDED(RV)) {                                                     \
    RV = RV##__temp;                                                          \
  }                                                                           \
}
834
835 /**
836 * This gets called just prior to the model actually
837 * being constructed. It's important to make this the
838 * last thing that happens right before parsing, so we
839 * can delay until the last moment the resolution of
840 * which DTD to use (unless of course we're assigned one).
841 */
842 nsresult
843 nsParser::WillBuildModel(nsString& aFilename)
844 {
845 if (!mParserContext)
846 return kInvalidParserContext;
847
848 if (eUnknownDetect != mParserContext->mAutoDetectStatus)
849 return NS_OK;
850
851 if (eDTDMode_unknown == mParserContext->mDTDMode ||
852 eDTDMode_autodetect == mParserContext->mDTDMode) {
853 char16_t buf[1025];
854 nsFixedString theBuffer(buf, 1024, 0);
855
856 // Grab 1024 characters, starting at the first non-whitespace
857 // character, to look for the doctype in.
858 mParserContext->mScanner->Peek(theBuffer, 1024, mParserContext->mScanner->FirstNonWhitespacePosition());
859 DetermineParseMode(theBuffer, mParserContext->mDTDMode,
860 mParserContext->mDocType, mParserContext->mMimeType);
861 }
862
863 NS_ASSERTION(!mDTD || !mParserContext->mPrevContext,
864 "Clobbering DTD for non-root parser context!");
865 mDTD = FindSuitableDTD(*mParserContext);
866 NS_ENSURE_TRUE(mDTD, NS_ERROR_OUT_OF_MEMORY);
867
868 nsITokenizer* tokenizer;
869 nsresult rv = mParserContext->GetTokenizer(mDTD, mSink, tokenizer);
870 NS_ENSURE_SUCCESS(rv, rv);
871
872 rv = mDTD->WillBuildModel(*mParserContext, tokenizer, mSink);
873 nsresult sinkResult = mSink->WillBuildModel(mDTD->GetMode());
874 // nsIDTD::WillBuildModel used to be responsible for calling
875 // nsIContentSink::WillBuildModel, but that obligation isn't expressible
876 // in the nsIDTD interface itself, so it's sounder and simpler to give that
877 // responsibility back to the parser. The former behavior of the DTD was to
878 // NS_ENSURE_SUCCESS the sink WillBuildModel call, so if the sink returns
879 // failure we should use sinkResult instead of rv, to preserve the old error
880 // handling behavior of the DTD:
881 return NS_FAILED(sinkResult) ? sinkResult : rv;
882 }
883
884 /**
885 * This gets called when the parser is done with its input.
886 * Note that the parser may have been called recursively, so we
887 * have to check for a prev. context before closing out the DTD/sink.
888 */
889 nsresult
890 nsParser::DidBuildModel(nsresult anErrorCode)
891 {
892 nsresult result = anErrorCode;
893
894 if (IsComplete()) {
895 if (mParserContext && !mParserContext->mPrevContext) {
896 // Let sink know if we're about to end load because we've been terminated.
897 // In that case we don't want it to run deferred scripts.
898 bool terminated = mInternalState == NS_ERROR_HTMLPARSER_STOPPARSING;
899 if (mDTD && mSink) {
900 nsresult dtdResult = mDTD->DidBuildModel(anErrorCode),
901 sinkResult = mSink->DidBuildModel(terminated);
902 // nsIDTD::DidBuildModel used to be responsible for calling
903 // nsIContentSink::DidBuildModel, but that obligation isn't expressible
904 // in the nsIDTD interface itself, so it's sounder and simpler to give
905 // that responsibility back to the parser. The former behavior of the
906 // DTD was to NS_ENSURE_SUCCESS the sink DidBuildModel call, so if the
907 // sink returns failure we should use sinkResult instead of dtdResult,
908 // to preserve the old error handling behavior of the DTD:
909 result = NS_FAILED(sinkResult) ? sinkResult : dtdResult;
910 }
911
912 //Ref. to bug 61462.
913 mParserContext->mRequest = 0;
914 }
915 }
916
917 return result;
918 }
919
920 /**
921 * This method adds a new parser context to the list,
922 * pushing the current one to the next position.
923 *
924 * @param ptr to new context
925 */
926 void
927 nsParser::PushContext(CParserContext& aContext)
928 {
929 NS_ASSERTION(aContext.mPrevContext == mParserContext,
930 "Trying to push a context whose previous context differs from "
931 "the current parser context.");
932 mParserContext = &aContext;
933 }
934
935 /**
936 * This method pops the topmost context off the stack,
937 * returning it to the user. The next context (if any)
938 * becomes the current context.
939 * @update gess7/22/98
940 * @return prev. context
941 */
942 CParserContext*
943 nsParser::PopContext()
944 {
945 CParserContext* oldContext = mParserContext;
946 if (oldContext) {
947 mParserContext = oldContext->mPrevContext;
948 if (mParserContext) {
949 // If the old context was blocked, propagate the blocked state
950 // back to the new one. Also, propagate the stream listener state
951 // but don't override onStop state to guarantee the call to DidBuildModel().
952 if (mParserContext->mStreamListenerState != eOnStop) {
953 mParserContext->mStreamListenerState = oldContext->mStreamListenerState;
954 }
955 }
956 }
957 return oldContext;
958 }
959
960 /**
961 * Call this when you want control whether or not the parser will parse
962 * and tokenize input (TRUE), or whether it just caches input to be
963 * parsed later (FALSE).
964 *
965 * @param aState determines whether we parse/tokenize or just cache.
966 * @return current state
967 */
968 void
969 nsParser::SetUnusedInput(nsString& aBuffer)
970 {
971 mUnusedInput = aBuffer;
972 }
973
974 /**
975 * Call this when you want to *force* the parser to terminate the
976 * parsing process altogether. This is binary -- so once you terminate
977 * you can't resume without restarting altogether.
978 */
979 NS_IMETHODIMP
980 nsParser::Terminate(void)
981 {
982 // We should only call DidBuildModel once, so don't do anything if this is
983 // the second time that Terminate has been called.
984 if (mInternalState == NS_ERROR_HTMLPARSER_STOPPARSING) {
985 return NS_OK;
986 }
987
988 nsresult result = NS_OK;
989 // XXX - [ until we figure out a way to break parser-sink circularity ]
990 // Hack - Hold a reference until we are completely done...
991 nsCOMPtr<nsIParser> kungFuDeathGrip(this);
992 mInternalState = result = NS_ERROR_HTMLPARSER_STOPPARSING;
993
994 // CancelParsingEvents must be called to avoid leaking the nsParser object
995 // @see bug 108049
996 // If NS_PARSER_FLAG_PENDING_CONTINUE_EVENT is set then CancelParsingEvents
997 // will reset it so DidBuildModel will call DidBuildModel on the DTD. Note:
998 // The IsComplete() call inside of DidBuildModel looks at the pendingContinueEvents flag.
999 CancelParsingEvents();
1000
1001 // If we got interrupted in the middle of a document.write, then we might
1002 // have more than one parser context on our parsercontext stack. This has
1003 // the effect of making DidBuildModel a no-op, meaning that we never call
1004 // our sink's DidBuildModel and break the reference cycle, causing a leak.
1005 // Since we're getting terminated, we manually clean up our context stack.
1006 while (mParserContext && mParserContext->mPrevContext) {
1007 CParserContext *prev = mParserContext->mPrevContext;
1008 delete mParserContext;
1009 mParserContext = prev;
1010 }
1011
1012 if (mDTD) {
1013 mDTD->Terminate();
1014 DidBuildModel(result);
1015 } else if (mSink) {
1016 // We have no parser context or no DTD yet (so we got terminated before we
1017 // got any data). Manually break the reference cycle with the sink.
1018 result = mSink->DidBuildModel(true);
1019 NS_ENSURE_SUCCESS(result, result);
1020 }
1021
1022 return NS_OK;
1023 }
1024
1025 NS_IMETHODIMP
1026 nsParser::ContinueInterruptedParsing()
1027 {
1028 // If there are scripts executing, then the content sink is jumping the gun
1029 // (probably due to a synchronous XMLHttpRequest) and will re-enable us
1030 // later, see bug 460706.
1031 if (!IsOkToProcessNetworkData()) {
1032 return NS_OK;
1033 }
1034
1035 // If the stream has already finished, there's a good chance
1036 // that we might start closing things down when the parser
1037 // is reenabled. To make sure that we're not deleted across
1038 // the reenabling process, hold a reference to ourselves.
1039 nsresult result=NS_OK;
1040 nsCOMPtr<nsIParser> kungFuDeathGrip(this);
1041 nsCOMPtr<nsIContentSink> sinkDeathGrip(mSink);
1042
1043 #ifdef DEBUG
1044 if (!(mFlags & NS_PARSER_FLAG_PARSER_ENABLED)) {
1045 NS_WARNING("Don't call ContinueInterruptedParsing on a blocked parser.");
1046 }
1047 #endif
1048
1049 bool isFinalChunk = mParserContext &&
1050 mParserContext->mStreamListenerState == eOnStop;
1051
1052 mProcessingNetworkData = true;
1053 if (mSink) {
1054 mSink->WillParse();
1055 }
1056 result = ResumeParse(true, isFinalChunk); // Ref. bug 57999
1057 mProcessingNetworkData = false;
1058
1059 if (result != NS_OK) {
1060 result=mInternalState;
1061 }
1062
1063 return result;
1064 }
1065
1066 /**
1067 * Stops parsing temporarily. That's it will prevent the
1068 * parser from building up content model.
1069 */
1070 NS_IMETHODIMP_(void)
1071 nsParser::BlockParser()
1072 {
1073 mFlags &= ~NS_PARSER_FLAG_PARSER_ENABLED;
1074 }
1075
1076 /**
1077 * Open up the parser for tokenization, building up content
1078 * model..etc. However, this method does not resume parsing
1079 * automatically. It's the callers' responsibility to restart
1080 * the parsing engine.
1081 */
1082 NS_IMETHODIMP_(void)
1083 nsParser::UnblockParser()
1084 {
1085 if (!(mFlags & NS_PARSER_FLAG_PARSER_ENABLED)) {
1086 mFlags |= NS_PARSER_FLAG_PARSER_ENABLED;
1087 } else {
1088 NS_WARNING("Trying to unblock an unblocked parser.");
1089 }
1090 }
1091
1092 NS_IMETHODIMP_(void)
1093 nsParser::ContinueInterruptedParsingAsync()
1094 {
1095 mSink->ContinueInterruptedParsingAsync();
1096 }
1097
1098 /**
1099 * Call this to query whether the parser is enabled or not.
1100 */
1101 NS_IMETHODIMP_(bool)
1102 nsParser::IsParserEnabled()
1103 {
1104 return (mFlags & NS_PARSER_FLAG_PARSER_ENABLED) != 0;
1105 }
1106
1107 /**
1108 * Call this to query whether the parser thinks it's done with parsing.
1109 */
1110 NS_IMETHODIMP_(bool)
1111 nsParser::IsComplete()
1112 {
1113 return !(mFlags & NS_PARSER_FLAG_PENDING_CONTINUE_EVENT);
1114 }
1115
1116
1117 void nsParser::HandleParserContinueEvent(nsParserContinueEvent *ev)
1118 {
1119 // Ignore any revoked continue events...
1120 if (mContinueEvent != ev)
1121 return;
1122
1123 mFlags &= ~NS_PARSER_FLAG_PENDING_CONTINUE_EVENT;
1124 mContinueEvent = nullptr;
1125
1126 NS_ASSERTION(IsOkToProcessNetworkData(),
1127 "Interrupted in the middle of a script?");
1128 ContinueInterruptedParsing();
1129 }
1130
1131 bool
1132 nsParser::IsInsertionPointDefined()
1133 {
1134 return false;
1135 }
1136
1137 void
1138 nsParser::BeginEvaluatingParserInsertedScript()
1139 {
1140 }
1141
1142 void
1143 nsParser::EndEvaluatingParserInsertedScript()
1144 {
1145 }
1146
1147 void
1148 nsParser::MarkAsNotScriptCreated(const char* aCommand)
1149 {
1150 }
1151
1152 bool
1153 nsParser::IsScriptCreated()
1154 {
1155 return false;
1156 }
1157
1158 /**
1159 * This is the main controlling routine in the parsing process.
1160 * Note that it may get called multiple times for the same scanner,
1161 * since this is a pushed based system, and all the tokens may
1162 * not have been consumed by the scanner during a given invocation
1163 * of this method.
1164 */
1165 NS_IMETHODIMP
1166 nsParser::Parse(nsIURI* aURL,
1167 nsIRequestObserver* aListener,
1168 void* aKey,
1169 nsDTDMode aMode)
1170 {
1171
1172 NS_PRECONDITION(aURL, "Error: Null URL given");
1173
1174 nsresult result=kBadURL;
1175 mObserver = aListener;
1176
1177 if (aURL) {
1178 nsAutoCString spec;
1179 nsresult rv = aURL->GetSpec(spec);
1180 if (rv != NS_OK) {
1181 return rv;
1182 }
1183 NS_ConvertUTF8toUTF16 theName(spec);
1184
1185 nsScanner* theScanner = new nsScanner(theName, false);
1186 CParserContext* pc = new CParserContext(mParserContext, theScanner, aKey,
1187 mCommand, aListener);
1188 if (pc && theScanner) {
1189 pc->mMultipart = true;
1190 pc->mContextType = CParserContext::eCTURL;
1191 pc->mDTDMode = aMode;
1192 PushContext(*pc);
1193
1194 result = NS_OK;
1195 } else {
1196 result = mInternalState = NS_ERROR_HTMLPARSER_BADCONTEXT;
1197 }
1198 }
1199 return result;
1200 }
1201
1202 /**
1203 * Used by XML fragment parsing below.
1204 *
1205 * @param aSourceBuffer contains a string-full of real content
1206 */
1207 nsresult
1208 nsParser::Parse(const nsAString& aSourceBuffer,
1209 void* aKey,
1210 bool aLastCall)
1211 {
1212 nsresult result = NS_OK;
1213
1214 // Don't bother if we're never going to parse this.
1215 if (mInternalState == NS_ERROR_HTMLPARSER_STOPPARSING) {
1216 return result;
1217 }
1218
1219 if (!aLastCall && aSourceBuffer.IsEmpty()) {
1220 // Nothing is being passed to the parser so return
1221 // immediately. mUnusedInput will get processed when
1222 // some data is actually passed in.
1223 // But if this is the last call, make sure to finish up
1224 // stuff correctly.
1225 return result;
1226 }
1227
1228 // Maintain a reference to ourselves so we don't go away
1229 // till we're completely done.
1230 nsCOMPtr<nsIParser> kungFuDeathGrip(this);
1231
1232 if (aLastCall || !aSourceBuffer.IsEmpty() || !mUnusedInput.IsEmpty()) {
1233 // Note: The following code will always find the parser context associated
1234 // with the given key, even if that context has been suspended (e.g., for
1235 // another document.write call). This doesn't appear to be exactly what IE
1236 // does in the case where this happens, but this makes more sense.
1237 CParserContext* pc = mParserContext;
1238 while (pc && pc->mKey != aKey) {
1239 pc = pc->mPrevContext;
1240 }
1241
1242 if (!pc) {
1243 // Only make a new context if we don't have one, OR if we do, but has a
1244 // different context key.
1245 nsScanner* theScanner = new nsScanner(mUnusedInput);
1246 NS_ENSURE_TRUE(theScanner, NS_ERROR_OUT_OF_MEMORY);
1247
1248 eAutoDetectResult theStatus = eUnknownDetect;
1249
1250 if (mParserContext &&
1251 mParserContext->mMimeType.EqualsLiteral("application/xml")) {
1252 // Ref. Bug 90379
1253 NS_ASSERTION(mDTD, "How come the DTD is null?");
1254
1255 if (mParserContext) {
1256 theStatus = mParserContext->mAutoDetectStatus;
1257 // Added this to fix bug 32022.
1258 }
1259 }
1260
1261 pc = new CParserContext(mParserContext, theScanner, aKey, mCommand,
1262 0, theStatus, aLastCall);
1263 NS_ENSURE_TRUE(pc, NS_ERROR_OUT_OF_MEMORY);
1264
1265 PushContext(*pc);
1266
1267 pc->mMultipart = !aLastCall; // By default
1268 if (pc->mPrevContext) {
1269 pc->mMultipart |= pc->mPrevContext->mMultipart;
1270 }
1271
1272 // Start fix bug 40143
1273 if (pc->mMultipart) {
1274 pc->mStreamListenerState = eOnDataAvail;
1275 if (pc->mScanner) {
1276 pc->mScanner->SetIncremental(true);
1277 }
1278 } else {
1279 pc->mStreamListenerState = eOnStop;
1280 if (pc->mScanner) {
1281 pc->mScanner->SetIncremental(false);
1282 }
1283 }
1284 // end fix for 40143
1285
1286 pc->mContextType=CParserContext::eCTString;
1287 pc->SetMimeType(NS_LITERAL_CSTRING("application/xml"));
1288 pc->mDTDMode = eDTDMode_full_standards;
1289
1290 mUnusedInput.Truncate();
1291
1292 pc->mScanner->Append(aSourceBuffer);
1293 // Do not interrupt document.write() - bug 95487
1294 result = ResumeParse(false, false, false);
1295 } else {
1296 pc->mScanner->Append(aSourceBuffer);
1297 if (!pc->mPrevContext) {
1298 // Set stream listener state to eOnStop, on the final context - Fix 68160,
1299 // to guarantee DidBuildModel() call - Fix 36148
1300 if (aLastCall) {
1301 pc->mStreamListenerState = eOnStop;
1302 pc->mScanner->SetIncremental(false);
1303 }
1304
1305 if (pc == mParserContext) {
1306 // If pc is not mParserContext, then this call to ResumeParse would
1307 // do the wrong thing and try to continue parsing using
1308 // mParserContext. We need to wait to actually resume parsing on pc.
1309 ResumeParse(false, false, false);
1310 }
1311 }
1312 }
1313 }
1314
1315 return result;
1316 }
1317
1318 NS_IMETHODIMP
1319 nsParser::ParseFragment(const nsAString& aSourceBuffer,
1320 nsTArray<nsString>& aTagStack)
1321 {
1322 nsresult result = NS_OK;
1323 nsAutoString theContext;
1324 uint32_t theCount = aTagStack.Length();
1325 uint32_t theIndex = 0;
1326
1327 // Disable observers for fragments
1328 mFlags &= ~NS_PARSER_FLAG_OBSERVERS_ENABLED;
1329
1330 for (theIndex = 0; theIndex < theCount; theIndex++) {
1331 theContext.AppendLiteral("<");
1332 theContext.Append(aTagStack[theCount - theIndex - 1]);
1333 theContext.AppendLiteral(">");
1334 }
1335
1336 if (theCount == 0) {
1337 // Ensure that the buffer is not empty. Because none of the DTDs care
1338 // about leading whitespace, this doesn't change the result.
1339 theContext.AssignLiteral(" ");
1340 }
1341
1342 // First, parse the context to build up the DTD's tag stack. Note that we
1343 // pass false for the aLastCall parameter.
1344 result = Parse(theContext,
1345 (void*)&theContext,
1346 false);
1347 if (NS_FAILED(result)) {
1348 mFlags |= NS_PARSER_FLAG_OBSERVERS_ENABLED;
1349 return result;
1350 }
1351
1352 if (!mSink) {
1353 // Parse must have failed in the XML case and so the sink was killed.
1354 return NS_ERROR_HTMLPARSER_STOPPARSING;
1355 }
1356
1357 nsCOMPtr<nsIFragmentContentSink> fragSink = do_QueryInterface(mSink);
1358 NS_ASSERTION(fragSink, "ParseFragment requires a fragment content sink");
1359
1360 fragSink->WillBuildContent();
1361 // Now, parse the actual content. Note that this is the last call
1362 // for HTML content, but for XML, we will want to build and parse
1363 // the end tags. However, if tagStack is empty, it's the last call
1364 // for XML as well.
1365 if (theCount == 0) {
1366 result = Parse(aSourceBuffer,
1367 &theContext,
1368 true);
1369 fragSink->DidBuildContent();
1370 } else {
1371 // Add an end tag chunk, so expat will read the whole source buffer,
1372 // and not worry about ']]' etc.
1373 result = Parse(aSourceBuffer + NS_LITERAL_STRING("</"),
1374 &theContext,
1375 false);
1376 fragSink->DidBuildContent();
1377
1378 if (NS_SUCCEEDED(result)) {
1379 nsAutoString endContext;
1380 for (theIndex = 0; theIndex < theCount; theIndex++) {
1381 // we already added an end tag chunk above
1382 if (theIndex > 0) {
1383 endContext.AppendLiteral("</");
1384 }
1385
1386 nsString& thisTag = aTagStack[theIndex];
1387 // was there an xmlns=?
1388 int32_t endOfTag = thisTag.FindChar(char16_t(' '));
1389 if (endOfTag == -1) {
1390 endContext.Append(thisTag);
1391 } else {
1392 endContext.Append(Substring(thisTag,0,endOfTag));
1393 }
1394
1395 endContext.AppendLiteral(">");
1396 }
1397
1398 result = Parse(endContext,
1399 &theContext,
1400 true);
1401 }
1402 }
1403
1404 mFlags |= NS_PARSER_FLAG_OBSERVERS_ENABLED;
1405
1406 return result;
1407 }
1408
1409 /**
1410 * This routine is called to cause the parser to continue parsing its
1411 * underlying stream. This call allows the parse process to happen in
1412 * chunks, such as when the content is push based, and we need to parse in
1413 * pieces.
1414 *
1415 * An interesting change in how the parser gets used has led us to add extra
1416 * processing to this method. The case occurs when the parser is blocked in
1417 * one context, and gets a parse(string) call in another context. In this
1418 * case, the parserContexts are linked. No problem.
1419 *
1420 * The problem is that Parse(string) assumes that it can proceed unabated,
1421 * but if the parser is already blocked that assumption is false. So we
1422 * needed to add a mechanism here to allow the parser to continue to process
1423 * (the pop and free) contexts until 1) it gets blocked again; 2) it runs
1424 * out of contexts.
1425 *
1426 * Nothing is done (and NS_OK is returned) when the parser is disabled or has
* already been told to stop parsing.
*
1427 * @param allowIteration : set to true if non-script resumption is requested
1428 * @param aIsFinalChunk : tells us when the last chunk of data is provided.
* @param aCanInterrupt : not referenced anywhere in this function body.
1429 * @return error code -- 0 if ok, non-zero if error.
*         NS_ERROR_HTMLPARSER_INTERRUPTED is never returned to the caller;
*         it is mapped to NS_OK.
1430 */
1431 nsresult
1432 nsParser::ResumeParse(bool allowIteration, bool aIsFinalChunk,
1433 bool aCanInterrupt)
1434 {
1435 nsresult result = NS_OK;
1436
1437 if ((mFlags & NS_PARSER_FLAG_PARSER_ENABLED) &&
1438 mInternalState != NS_ERROR_HTMLPARSER_STOPPARSING) {
1439
// NOTE(review): mParserContext is dereferenced here, before
// WillBuildModel gets a chance to run its own null check -- confirm that
// callers always have a context at this point.
1440 result = WillBuildModel(mParserContext->mScanner->GetFilename());
1441 if (NS_FAILED(result)) {
// Setup failed: turn tokenizing off and report the failure.
1442 mFlags &= ~NS_PARSER_FLAG_CAN_TOKENIZE;
1443 return result;
1444 }
1445
1446 if (mDTD) {
1447 mSink->WillResume();
1448 bool theIterationIsOk = true;
1449
// Each pass tokenizes (if allowed) and builds the model for the current
// context; the loop ends on any non-NS_OK result or once
// theIterationIsOk is cleared.
1450 while (result == NS_OK && theIterationIsOk) {
1451 if (!mUnusedInput.IsEmpty() && mParserContext->mScanner) {
1452 // -- Ref: Bug# 22485 --
1453 // Insert the unused input into the source buffer
1454 // as if it was read from the input stream.
1455 // Adding UngetReadable() per vidur!!
1456 mParserContext->mScanner->UngetReadable(mUnusedInput);
1457 mUnusedInput.Truncate(0);
1458 }
1459
1460 // Only allow parsing to be interrupted in the subsequent call to
1461 // build model.
1462 nsresult theTokenizerResult = (mFlags & NS_PARSER_FLAG_CAN_TOKENIZE)
1463 ? Tokenize(aIsFinalChunk)
1464 : NS_OK;
1465 result = BuildModel();
1466
// Interrupted while handling the final chunk: post a continue event.
1467 if (result == NS_ERROR_HTMLPARSER_INTERRUPTED && aIsFinalChunk) {
1468 PostContinueEvent();
1469 }
1470
1471 theIterationIsOk = theTokenizerResult != kEOF &&
1472 result != NS_ERROR_HTMLPARSER_INTERRUPTED;
1473
1474 // Make sure not to stop parsing too early. Therefore, before shutting
1475 // down the parser, it's important to check whether the input buffer
1476 // has been scanned to completion (theTokenizerResult should be kEOF).
1477 // kEOF -> End of buffer.
1478
1479 // If we're told to block the parser, we disable all further parsing
1480 // (and cache any data coming in) until the parser is re-enabled.
1481 if (NS_ERROR_HTMLPARSER_BLOCK == result) {
1482 mSink->WillInterrupt();
1483 if (mFlags & NS_PARSER_FLAG_PARSER_ENABLED) {
1484 // If we were blocked by a recursive invocation, don't re-block.
1485 BlockParser();
1486 }
1487 return NS_OK;
1488 }
1489 if (NS_ERROR_HTMLPARSER_STOPPARSING == result) {
1490 // Note: Parser Terminate() calls DidBuildModel.
1491 if (mInternalState != NS_ERROR_HTMLPARSER_STOPPARSING) {
1492 DidBuildModel(mStreamStatus);
1493 mInternalState = result;
1494 }
1495
1496 return NS_OK;
1497 }
// The current buffer has been fully consumed, or model building was
// interrupted: decide whether this context is finished.
1498 if ((NS_OK == result && theTokenizerResult == kEOF) ||
1499 result == NS_ERROR_HTMLPARSER_INTERRUPTED) {
1500 bool theContextIsStringBased =
1501 CParserContext::eCTString == mParserContext->mContextType;
1502
1503 if (mParserContext->mStreamListenerState == eOnStop ||
1504 !mParserContext->mMultipart || theContextIsStringBased) {
1505 if (!mParserContext->mPrevContext) {
// Root context: only finish up once the stream has stopped.
1506 if (mParserContext->mStreamListenerState == eOnStop) {
1507 DidBuildModel(mStreamStatus);
1508 return NS_OK;
1509 }
1510 } else {
// Nested context: pop and free it, carrying over its unused data
// when the context asks for that, then continue with the parent.
1511 CParserContext* theContext = PopContext();
1512 if (theContext) {
1513 theIterationIsOk = allowIteration && theContextIsStringBased;
1514 if (theContext->mCopyUnused) {
1515 theContext->mScanner->CopyUnusedData(mUnusedInput);
1516 }
1517
1518 delete theContext;
1519 }
1520
1521 result = mInternalState;
1522 aIsFinalChunk = mParserContext &&
1523 mParserContext->mStreamListenerState == eOnStop;
1524 // ...then intentionally fall through to mSink->WillInterrupt()...
1525 }
1526 }
1527 }
1528
1529 if (theTokenizerResult == kEOF ||
1530 result == NS_ERROR_HTMLPARSER_INTERRUPTED) {
// An interruption is not an error for the purposes of the loop condition.
1531 result = (result == NS_ERROR_HTMLPARSER_INTERRUPTED) ? NS_OK : result;
1532 mSink->WillInterrupt();
1533 }
1534 }
1535 } else {
// WillBuildModel succeeded but left us without a DTD.
1536 mInternalState = result = NS_ERROR_HTMLPARSER_UNRESOLVEDDTD;
1537 }
1538 }
1539
1540 return (result == NS_ERROR_HTMLPARSER_INTERRUPTED) ? NS_OK : result;
1541 }
1542
1543 /**
1544 * This is where we loop over the tokens created in the
1545 * tokenization phase, and try to make sense out of them.
1546 */
1547 nsresult
1548 nsParser::BuildModel()
1549 {
1550 nsITokenizer* theTokenizer = nullptr;
1551
1552 nsresult result = NS_OK;
1553 if (mParserContext) {
1554 result = mParserContext->GetTokenizer(mDTD, mSink, theTokenizer);
1555 }
1556
1557 if (NS_SUCCEEDED(result)) {
1558 if (mDTD) {
1559 result = mDTD->BuildModel(theTokenizer, mSink);
1560 }
1561 } else {
1562 mInternalState = result = NS_ERROR_HTMLPARSER_BADTOKENIZER;
1563 }
1564 return result;
1565 }
1566
1567 /*******************************************************************
1568 These methods are used to talk to the netlib system...
1569 *******************************************************************/
1570
1571 nsresult
1572 nsParser::OnStartRequest(nsIRequest *request, nsISupports* aContext)
1573 {
1574 NS_PRECONDITION(eNone == mParserContext->mStreamListenerState,
1575 "Parser's nsIStreamListener API was not setup "
1576 "correctly in constructor.");
1577 if (mObserver) {
1578 mObserver->OnStartRequest(request, aContext);
1579 }
1580 mParserContext->mStreamListenerState = eOnStart;
1581 mParserContext->mAutoDetectStatus = eUnknownDetect;
1582 mParserContext->mRequest = request;
1583
1584 NS_ASSERTION(!mParserContext->mPrevContext,
1585 "Clobbering DTD for non-root parser context!");
1586 mDTD = nullptr;
1587
1588 nsresult rv;
1589 nsAutoCString contentType;
1590 nsCOMPtr<nsIChannel> channel = do_QueryInterface(request);
1591 if (channel) {
1592 rv = channel->GetContentType(contentType);
1593 if (NS_SUCCEEDED(rv)) {
1594 mParserContext->SetMimeType(contentType);
1595 }
1596 }
1597
1598 rv = NS_OK;
1599
1600 return rv;
1601 }
1602
/**
 * Scans a raw byte buffer for an XML declaration ("<?xml ... ?>") and pulls
 * out the value of its encoding pseudo-attribute.
 *
 * The scan is a fast heuristic rather than a real parse: it looks for the
 * last letter of "version" / "encoding" and then checks the letters before
 * it.  The encoding is only looked for after a quoted version value has been
 * seen.
 *
 * @param aBytes   start of the buffer to inspect.
 * @param aLen     number of bytes available at aBytes.
 * @param oCharset out: truncated on entry; receives the encoding value if
 *                 one is found and accepted.
 * @return true if oCharset was set to a non-empty value.
 */
1603 static bool
1604 ExtractCharsetFromXmlDeclaration(const unsigned char* aBytes, int32_t aLen,
1605 nsCString& oCharset)
1606 {
1607 // This code is rather pointless to have. Might as well reuse expat as
1608 // seen in nsHtml5StreamParser. -- hsivonen
1609 oCharset.Truncate();
// The buffer must begin with the literal bytes "<?xml".
1610 if ((aLen >= 5) &&
1611 ('<' == aBytes[0]) &&
1612 ('?' == aBytes[1]) &&
1613 ('x' == aBytes[2]) &&
1614 ('m' == aBytes[3]) &&
1615 ('l' == aBytes[4])) {
1616 int32_t i;
1617 bool versionFound = false, encodingFound = false;
// Scanning starts at index 6; the i < aLen test keeps short buffers safe.
1618 for (i = 6; i < aLen && !encodingFound; ++i) {
1619 // end of XML declaration?
1620 if ((((char*) aBytes)[i] == '?') &&
1621 ((i + 1) < aLen) &&
1622 (((char*) aBytes)[i + 1] == '>')) {
1623 break;
1624 }
1625 // Version is required.
1626 if (!versionFound) {
1627 // Want to avoid string comparisons, hence looking for 'n'
1628 // and only if found check the string leading to it. Not
1629 // foolproof, but fast.
1630 // The shortest string allowed before this is (strlen==13):
1631 // <?xml version
1632 if ((((char*) aBytes)[i] == 'n') &&
1633 (i >= 12) &&
1634 (0 == PL_strncmp("versio", (char*) (aBytes + i - 6), 6))) {
1635 // Fast forward through version
// q remembers the opening quote character; the value ends at the next
// occurrence of that same character.
1636 char q = 0;
1637 for (++i; i < aLen; ++i) {
1638 char qi = ((char*) aBytes)[i];
1639 if (qi == '\'' || qi == '"') {
1640 if (q && q == qi) {
1641 // ending quote
1642 versionFound = true;
1643 break;
1644 } else {
1645 // Starting quote
1646 q = qi;
1647 }
1648 }
1649 }
1650 }
1651 } else {
1652 // encoding must follow version
1653 // Want to avoid string comparisons, hence looking for 'g'
1654 // and only if found check the string leading to it. Not
1655 // foolproof, but fast.
1656 // The shortest allowed string before this (strlen==26):
1657 // <?xml version="1" encoding
1658 if ((((char*) aBytes)[i] == 'g') && (i >= 25) && (0 == PL_strncmp(
1659 "encodin", (char*) (aBytes + i - 7), 7))) {
// encStart is the index of the first byte after the opening quote.
1660 int32_t encStart = 0;
1661 char q = 0;
1662 for (++i; i < aLen; ++i) {
1663 char qi = ((char*) aBytes)[i];
1664 if (qi == '\'' || qi == '"') {
1665 if (q && q == qi) {
1666 int32_t count = i - encStart;
1667 // encoding value is invalid if it is UTF-16
// NOTE(review): the comparison is limited to `count` characters, so a
// value that is a proper prefix of "UTF-16" presumably also compares
// equal and is rejected -- confirm against PL_strncasecmp semantics.
1668 if (count > 0 && PL_strncasecmp("UTF-16",
1669 (char*) (aBytes + encStart), count)) {
1670 oCharset.Assign((char*) (aBytes + encStart), count);
1671 }
// The search stops here whether or not the value was accepted.
1672 encodingFound = true;
1673 break;
1674 } else {
// Opening quote, or a quote character different from the one that opened
// the value; either way the value is (re)started just after it.
1675 encStart = i + 1;
1676 q = qi;
1677 }
1678 }
1679 }
1680 }
1681 } // if (!versionFound)
1682 } // for
1683 }
1684 return !oCharset.IsEmpty();
1685 }
1686
1687 inline const char
1688 GetNextChar(nsACString::const_iterator& aStart,
1689 nsACString::const_iterator& aEnd)
1690 {
1691 NS_ASSERTION(aStart != aEnd, "end of buffer");
1692 return (++aStart != aEnd) ? *aStart : '\0';
1693 }
1694
1695 static NS_METHOD
1696 NoOpParserWriteFunc(nsIInputStream* in,
1697 void* closure,
1698 const char* fromRawSegment,
1699 uint32_t toOffset,
1700 uint32_t count,
1701 uint32_t *writeCount)
1702 {
1703 *writeCount = count;
1704 return NS_OK;
1705 }
1706
1707 typedef struct {
1708 bool mNeedCharsetCheck;
1709 nsParser* mParser;
1710 nsScanner* mScanner;
1711 nsIRequest* mRequest;
1712 } ParserWriteStruct;
1713
1714 /*
1715 * This function is invoked as a result of a call to a stream's
1716 * ReadSegments() method. It is called for each contiguous buffer
1717 * of data in the underlying stream or pipe. Using ReadSegments
1718 * allows us to avoid copying data to read out of the stream.
1719 */
1720 static NS_METHOD
1721 ParserWriteFunc(nsIInputStream* in,
1722 void* closure,
1723 const char* fromRawSegment,
1724 uint32_t toOffset,
1725 uint32_t count,
1726 uint32_t *writeCount)
1727 {
1728 nsresult result;
1729 ParserWriteStruct* pws = static_cast<ParserWriteStruct*>(closure);
1730 const unsigned char* buf =
1731 reinterpret_cast<const unsigned char*> (fromRawSegment);
1732 uint32_t theNumRead = count;
1733
1734 if (!pws) {
1735 return NS_ERROR_FAILURE;
1736 }
1737
1738 if (pws->mNeedCharsetCheck) {
1739 pws->mNeedCharsetCheck = false;
1740 int32_t source;
1741 nsAutoCString preferred;
1742 nsAutoCString maybePrefer;
1743 pws->mParser->GetDocumentCharset(preferred, source);
1744
1745 // This code was bogus when I found it. It expects the BOM or the XML
1746 // declaration to be entirely in the first network buffer. -- hsivonen
1747 if (nsContentUtils::CheckForBOM(buf, count, maybePrefer)) {
1748 // The decoder will swallow the BOM. The UTF-16 will re-sniff for
1749 // endianness. The value of preferred is now either "UTF-8" or "UTF-16".
1750 preferred.Assign(maybePrefer);
1751 source = kCharsetFromByteOrderMark;
1752 } else if (source < kCharsetFromChannel) {
1753 nsAutoCString declCharset;
1754
1755 if (ExtractCharsetFromXmlDeclaration(buf, count, declCharset)) {
1756 if (EncodingUtils::FindEncodingForLabel(declCharset, maybePrefer)) {
1757 preferred.Assign(maybePrefer);
1758 source = kCharsetFromMetaTag;
1759 }
1760 }
1761 }
1762
1763 pws->mParser->SetDocumentCharset(preferred, source);
1764 pws->mParser->SetSinkCharset(preferred);
1765
1766 }
1767
1768 result = pws->mScanner->Append(fromRawSegment, theNumRead, pws->mRequest);
1769 if (NS_SUCCEEDED(result)) {
1770 *writeCount = count;
1771 }
1772
1773 return result;
1774 }
1775
1776 nsresult
1777 nsParser::OnDataAvailable(nsIRequest *request, nsISupports* aContext,
1778 nsIInputStream *pIStream, uint64_t sourceOffset,
1779 uint32_t aLength)
1780 {
1781 NS_PRECONDITION((eOnStart == mParserContext->mStreamListenerState ||
1782 eOnDataAvail == mParserContext->mStreamListenerState),
1783 "Error: OnStartRequest() must be called before OnDataAvailable()");
1784 NS_PRECONDITION(NS_InputStreamIsBuffered(pIStream),
1785 "Must have a buffered input stream");
1786
1787 nsresult rv = NS_OK;
1788
1789 if (mIsAboutBlank) {
1790 MOZ_ASSERT(false, "Must not get OnDataAvailable for about:blank");
1791 // ... but if an extension tries to feed us data for about:blank in a
1792 // release build, silently ignore the data.
1793 uint32_t totalRead;
1794 rv = pIStream->ReadSegments(NoOpParserWriteFunc,
1795 nullptr,
1796 aLength,
1797 &totalRead);
1798 return rv;
1799 }
1800
1801 CParserContext *theContext = mParserContext;
1802
1803 while (theContext && theContext->mRequest != request) {
1804 theContext = theContext->mPrevContext;
1805 }
1806
1807 if (theContext) {
1808 theContext->mStreamListenerState = eOnDataAvail;
1809
1810 if (eInvalidDetect == theContext->mAutoDetectStatus) {
1811 if (theContext->mScanner) {
1812 nsScannerIterator iter;
1813 theContext->mScanner->EndReading(iter);
1814 theContext->mScanner->SetPosition(iter, true);
1815 }
1816 }
1817
1818 uint32_t totalRead;
1819 ParserWriteStruct pws;
1820 pws.mNeedCharsetCheck = true;
1821 pws.mParser = this;
1822 pws.mScanner = theContext->mScanner;
1823 pws.mRequest = request;
1824
1825 rv = pIStream->ReadSegments(ParserWriteFunc, &pws, aLength, &totalRead);
1826 if (NS_FAILED(rv)) {
1827 return rv;
1828 }
1829
1830 // Don't bother to start parsing until we've seen some
1831 // non-whitespace data
1832 if (IsOkToProcessNetworkData() &&
1833 theContext->mScanner->FirstNonWhitespacePosition() >= 0) {
1834 nsCOMPtr<nsIParser> kungFuDeathGrip(this);
1835 nsCOMPtr<nsIContentSink> sinkDeathGrip(mSink);
1836 mProcessingNetworkData = true;
1837 if (mSink) {
1838 mSink->WillParse();
1839 }
1840 rv = ResumeParse();
1841 mProcessingNetworkData = false;
1842 }
1843 } else {
1844 rv = NS_ERROR_UNEXPECTED;
1845 }
1846
1847 return rv;
1848 }
1849
1850 /**
1851 * This is called by the networking library once the last block of data
1852 * has been collected from the net.
1853 */
1854 nsresult
1855 nsParser::OnStopRequest(nsIRequest *request, nsISupports* aContext,
1856 nsresult status)
1857 {
1858 nsresult rv = NS_OK;
1859
1860 CParserContext *pc = mParserContext;
1861 while (pc) {
1862 if (pc->mRequest == request) {
1863 pc->mStreamListenerState = eOnStop;
1864 pc->mScanner->SetIncremental(false);
1865 break;
1866 }
1867
1868 pc = pc->mPrevContext;
1869 }
1870
1871 mStreamStatus = status;
1872
1873 if (IsOkToProcessNetworkData() && NS_SUCCEEDED(rv)) {
1874 mProcessingNetworkData = true;
1875 if (mSink) {
1876 mSink->WillParse();
1877 }
1878 rv = ResumeParse(true, true);
1879 mProcessingNetworkData = false;
1880 }
1881
1882 // If the parser isn't enabled, we don't finish parsing till
1883 // it is reenabled.
1884
1885
1886 // XXX Should we wait to notify our observers as well if the
1887 // parser isn't yet enabled?
1888 if (mObserver) {
1889 mObserver->OnStopRequest(request, aContext, status);
1890 }
1891
1892 return rv;
1893 }
1894
1895
1896 /*******************************************************************
1897 Here come the tokenization methods...
1898 *******************************************************************/
1899
1900
1901 /**
1902 * Part of the code sandwich, this gets called right before
1903 * the tokenization process begins. The main reason for
1904 * this call is to allow the delegate to do initialization.
1905 */
1906 bool
1907 nsParser::WillTokenize(bool aIsFinalChunk)
1908 {
1909 if (!mParserContext) {
1910 return true;
1911 }
1912
1913 nsITokenizer* theTokenizer;
1914 nsresult result = mParserContext->GetTokenizer(mDTD, mSink, theTokenizer);
1915 NS_ENSURE_SUCCESS(result, false);
1916 return NS_SUCCEEDED(theTokenizer->WillTokenize(aIsFinalChunk));
1917 }
1918
1919
1920 /**
1921 * This is the primary control routine to consume tokens.
1922 * It iteratively consumes tokens until an error occurs or
1923 * you run out of data.
1924 */
1925 nsresult nsParser::Tokenize(bool aIsFinalChunk)
1926 {
1927 nsITokenizer* theTokenizer;
1928
1929 nsresult result = NS_ERROR_NOT_AVAILABLE;
1930 if (mParserContext) {
1931 result = mParserContext->GetTokenizer(mDTD, mSink, theTokenizer);
1932 }
1933
1934 if (NS_SUCCEEDED(result)) {
1935 bool flushTokens = false;
1936
1937 bool killSink = false;
1938
1939 WillTokenize(aIsFinalChunk);
1940 while (NS_SUCCEEDED(result)) {
1941 mParserContext->mScanner->Mark();
1942 result = theTokenizer->ConsumeToken(*mParserContext->mScanner,
1943 flushTokens);
1944 if (NS_FAILED(result)) {
1945 mParserContext->mScanner->RewindToMark();
1946 if (kEOF == result){
1947 break;
1948 }
1949 if (NS_ERROR_HTMLPARSER_STOPPARSING == result) {
1950 killSink = true;
1951 result = Terminate();
1952 break;
1953 }
1954 } else if (flushTokens && (mFlags & NS_PARSER_FLAG_OBSERVERS_ENABLED)) {
1955 // I added the extra test of NS_PARSER_FLAG_OBSERVERS_ENABLED to fix Bug# 23931.
1956 // Flush tokens on seeing </SCRIPT> -- Ref: Bug# 22485 --
1957 // Also remember to update the marked position.
1958 mFlags |= NS_PARSER_FLAG_FLUSH_TOKENS;
1959 mParserContext->mScanner->Mark();
1960 break;
1961 }
1962 }
1963
1964 if (killSink) {
1965 mSink = nullptr;
1966 }
1967 } else {
1968 result = mInternalState = NS_ERROR_HTMLPARSER_BADTOKENIZER;
1969 }
1970
1971 return result;
1972 }
1973
1974 /**
1975 * Get the channel associated with this parser
1976 *
1977 * @param aChannel out param that will contain the result
1978 * @return NS_OK if successful
1979 */
1980 NS_IMETHODIMP
1981 nsParser::GetChannel(nsIChannel** aChannel)
1982 {
1983 nsresult result = NS_ERROR_NOT_AVAILABLE;
1984 if (mParserContext && mParserContext->mRequest) {
1985 result = CallQueryInterface(mParserContext->mRequest, aChannel);
1986 }
1987 return result;
1988 }
1989
1990 /**
1991 * Get the DTD associated with this parser
1992 */
1993 NS_IMETHODIMP
1994 nsParser::GetDTD(nsIDTD** aDTD)
1995 {
1996 if (mParserContext) {
1997 NS_IF_ADDREF(*aDTD = mDTD);
1998 }
1999
2000 return NS_OK;
2001 }
2002
2003 /**
2004 * Get this as nsIStreamListener
2005 */
2006 nsIStreamListener*
2007 nsParser::GetStreamListener()
2008 {
2009 return this;
2010 }

mercurial