netwerk/streamconv/converters/nsUnknownDecoder.cpp

branch
TOR_BUG_9701
changeset 9
a63d609f5ebe
equal deleted inserted replaced
-1:000000000000 0:774c62b034e5
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5
6 #include "nsUnknownDecoder.h"
7 #include "nsIPipe.h"
8 #include "nsIInputStream.h"
9 #include "nsIOutputStream.h"
10 #include "nsMimeTypes.h"
11 #include "nsIPrefService.h"
12 #include "nsIPrefBranch.h"
13
14 #include "nsCRT.h"
15
16 #include "nsIMIMEService.h"
17
18 #include "nsIViewSourceChannel.h"
19 #include "nsIHttpChannel.h"
20 #include "nsNetCID.h"
21 #include "nsNetUtil.h"
22
23
// Capacity, in bytes, of the sniffer buffer that is allocated in
// OnStartRequest() and filled by OnDataAvailable().  The same value is
// used for the segment size and maximum size of the replay pipe.
#define MAX_BUFFER_SIZE 512
25
26 nsUnknownDecoder::nsUnknownDecoder()
27 : mBuffer(nullptr)
28 , mBufferLen(0)
29 , mRequireHTMLsuffix(false)
30 {
31 nsCOMPtr<nsIPrefBranch> prefs = do_GetService(NS_PREFSERVICE_CONTRACTID);
32 if (prefs) {
33 bool val;
34 if (NS_SUCCEEDED(prefs->GetBoolPref("security.requireHTMLsuffix", &val)))
35 mRequireHTMLsuffix = val;
36 }
37 }
38
39 nsUnknownDecoder::~nsUnknownDecoder()
40 {
41 if (mBuffer) {
42 delete [] mBuffer;
43 mBuffer = nullptr;
44 }
45 }
46
47 // ----
48 //
49 // nsISupports implementation...
50 //
51 // ----
52
53 NS_IMPL_ADDREF(nsUnknownDecoder)
54 NS_IMPL_RELEASE(nsUnknownDecoder)
55
56 NS_INTERFACE_MAP_BEGIN(nsUnknownDecoder)
57 NS_INTERFACE_MAP_ENTRY(nsIStreamConverter)
58 NS_INTERFACE_MAP_ENTRY(nsIStreamListener)
59 NS_INTERFACE_MAP_ENTRY(nsIRequestObserver)
60 NS_INTERFACE_MAP_ENTRY(nsIContentSniffer)
61 NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsIStreamListener)
62 NS_INTERFACE_MAP_END
63
64
65 // ----
66 //
67 // nsIStreamConverter methods...
68 //
69 // ----
70
71 NS_IMETHODIMP
72 nsUnknownDecoder::Convert(nsIInputStream *aFromStream,
73 const char *aFromType,
74 const char *aToType,
75 nsISupports *aCtxt,
76 nsIInputStream **aResultStream)
77 {
78 return NS_ERROR_NOT_IMPLEMENTED;
79 }
80
81 NS_IMETHODIMP
82 nsUnknownDecoder::AsyncConvertData(const char *aFromType,
83 const char *aToType,
84 nsIStreamListener *aListener,
85 nsISupports *aCtxt)
86 {
87 NS_ASSERTION(aListener && aFromType && aToType,
88 "null pointer passed into multi mixed converter");
89 // hook up our final listener. this guy gets the various On*() calls we want to throw
90 // at him.
91 //
92 mNextListener = aListener;
93 return (aListener) ? NS_OK : NS_ERROR_FAILURE;
94 }
95
96 // ----
97 //
98 // nsIStreamListener methods...
99 //
100 // ----
101
102 NS_IMETHODIMP
103 nsUnknownDecoder::OnDataAvailable(nsIRequest* request,
104 nsISupports *aCtxt,
105 nsIInputStream *aStream,
106 uint64_t aSourceOffset,
107 uint32_t aCount)
108 {
109 nsresult rv = NS_OK;
110
111 if (!mNextListener) return NS_ERROR_FAILURE;
112
113 if (mContentType.IsEmpty()) {
114 uint32_t count, len;
115
116 // If the buffer has not been allocated by now, just fail...
117 if (!mBuffer) return NS_ERROR_OUT_OF_MEMORY;
118
119 //
120 // Determine how much of the stream should be read to fill up the
121 // sniffer buffer...
122 //
123 if (mBufferLen + aCount >= MAX_BUFFER_SIZE) {
124 count = MAX_BUFFER_SIZE-mBufferLen;
125 } else {
126 count = aCount;
127 }
128
129 // Read the data into the buffer...
130 rv = aStream->Read((mBuffer+mBufferLen), count, &len);
131 if (NS_FAILED(rv)) return rv;
132
133 mBufferLen += len;
134 aCount -= len;
135
136 if (aCount) {
137 //
138 // Adjust the source offset... The call to FireListenerNotifications(...)
139 // will make the first OnDataAvailable(...) call with an offset of 0.
140 // So, this offset needs to be adjusted to reflect that...
141 //
142 aSourceOffset += mBufferLen;
143
144 DetermineContentType(request);
145
146 rv = FireListenerNotifications(request, aCtxt);
147 }
148 }
149
150 // Must not fire ODA again if it failed once
151 if (aCount && NS_SUCCEEDED(rv)) {
152 NS_ASSERTION(!mContentType.IsEmpty(),
153 "Content type should be known by now.");
154
155 rv = mNextListener->OnDataAvailable(request, aCtxt, aStream,
156 aSourceOffset, aCount);
157 }
158
159 return rv;
160 }
161
162 // ----
163 //
164 // nsIRequestObserver methods...
165 //
166 // ----
167
168 NS_IMETHODIMP
169 nsUnknownDecoder::OnStartRequest(nsIRequest* request, nsISupports *aCtxt)
170 {
171 nsresult rv = NS_OK;
172
173 if (!mNextListener) return NS_ERROR_FAILURE;
174
175 // Allocate the sniffer buffer...
176 if (NS_SUCCEEDED(rv) && !mBuffer) {
177 mBuffer = new char[MAX_BUFFER_SIZE];
178
179 if (!mBuffer) {
180 rv = NS_ERROR_OUT_OF_MEMORY;
181 }
182 }
183
184 // Do not pass the OnStartRequest on to the next listener (yet)...
185 return rv;
186 }
187
188 NS_IMETHODIMP
189 nsUnknownDecoder::OnStopRequest(nsIRequest* request, nsISupports *aCtxt,
190 nsresult aStatus)
191 {
192 nsresult rv = NS_OK;
193
194 if (!mNextListener) return NS_ERROR_FAILURE;
195
196 //
197 // The total amount of data is less than the size of the sniffer buffer.
198 // Analyze the buffer now...
199 //
200 if (mContentType.IsEmpty()) {
201 DetermineContentType(request);
202
203 rv = FireListenerNotifications(request, aCtxt);
204
205 if (NS_FAILED(rv)) {
206 aStatus = rv;
207 }
208 }
209
210 rv = mNextListener->OnStopRequest(request, aCtxt, aStatus);
211 mNextListener = 0;
212
213 return rv;
214 }
215
216 // ----
217 //
218 // nsIContentSniffer methods...
219 //
220 // ----
221 NS_IMETHODIMP
222 nsUnknownDecoder::GetMIMETypeFromContent(nsIRequest* aRequest,
223 const uint8_t* aData,
224 uint32_t aLength,
225 nsACString& type)
226 {
227 mBuffer = const_cast<char*>(reinterpret_cast<const char*>(aData));
228 mBufferLen = aLength;
229 DetermineContentType(aRequest);
230 mBuffer = nullptr;
231 mBufferLen = 0;
232 type.Assign(mContentType);
233 mContentType.Truncate();
234 return type.IsEmpty() ? NS_ERROR_NOT_AVAILABLE : NS_OK;
235 }
236
237
238 // Actual sniffing code
239
240 bool nsUnknownDecoder::AllowSniffing(nsIRequest* aRequest)
241 {
242 if (!mRequireHTMLsuffix) {
243 return true;
244 }
245
246 nsCOMPtr<nsIChannel> channel = do_QueryInterface(aRequest);
247 if (!channel) {
248 NS_ERROR("QI failed");
249 return false;
250 }
251
252 nsCOMPtr<nsIURI> uri;
253 if (NS_FAILED(channel->GetURI(getter_AddRefs(uri))) || !uri) {
254 return false;
255 }
256
257 bool isLocalFile = false;
258 if (NS_FAILED(uri->SchemeIs("file", &isLocalFile)) || isLocalFile) {
259 return false;
260 }
261
262 return true;
263 }
264
265 /**
266 * This is the array of sniffer entries that depend on "magic numbers"
267 * in the file. Each entry has either a type associated with it (set
268 * these with the SNIFFER_ENTRY macro) or a function to be executed
269 * (set these with the SNIFFER_ENTRY_WITH_FUNC macro). The function
270 * should take a single nsIRequest* and returns bool -- true if
271 * it sets mContentType, false otherwise
272 */
273 nsUnknownDecoder::nsSnifferEntry nsUnknownDecoder::sSnifferEntries[] = {
274 SNIFFER_ENTRY("%PDF-", APPLICATION_PDF),
275
276 SNIFFER_ENTRY("%!PS-Adobe-", APPLICATION_POSTSCRIPT),
277
278 // Files that start with mailbox delimiters let's provisionally call
279 // text/plain
280 SNIFFER_ENTRY("From", TEXT_PLAIN),
281 SNIFFER_ENTRY(">From", TEXT_PLAIN),
282
283 // If the buffer begins with "#!" or "%!" then it is a script of
284 // some sort... "Scripts" can include arbitrary data to be passed
285 // to an interpreter, so we need to decide whether we can call this
286 // text or whether it's data.
287 SNIFFER_ENTRY_WITH_FUNC("#!", &nsUnknownDecoder::LastDitchSniff),
288
289 // XXXbz should (and can) we also include the various ways that <?xml can
290 // appear as UTF-16 and such? See http://www.w3.org/TR/REC-xml#sec-guessing
291 SNIFFER_ENTRY_WITH_FUNC("<?xml", &nsUnknownDecoder::SniffForXML)
292 };
293
294 uint32_t nsUnknownDecoder::sSnifferEntryNum =
295 sizeof(nsUnknownDecoder::sSnifferEntries) /
296 sizeof(nsUnknownDecoder::nsSnifferEntry);
297
298 void nsUnknownDecoder::DetermineContentType(nsIRequest* aRequest)
299 {
300 NS_ASSERTION(mContentType.IsEmpty(), "Content type is already known.");
301 if (!mContentType.IsEmpty()) return;
302
303 // First, run through all the types we can detect reliably based on
304 // magic numbers
305 uint32_t i;
306 for (i = 0; i < sSnifferEntryNum; ++i) {
307 if (mBufferLen >= sSnifferEntries[i].mByteLen && // enough data
308 memcmp(mBuffer, sSnifferEntries[i].mBytes, sSnifferEntries[i].mByteLen) == 0) { // and type matches
309 NS_ASSERTION(sSnifferEntries[i].mMimeType ||
310 sSnifferEntries[i].mContentTypeSniffer,
311 "Must have either a type string or a function to set the type");
312 NS_ASSERTION(!sSnifferEntries[i].mMimeType ||
313 !sSnifferEntries[i].mContentTypeSniffer,
314 "Both a type string and a type sniffing function set;"
315 " using type string");
316 if (sSnifferEntries[i].mMimeType) {
317 mContentType = sSnifferEntries[i].mMimeType;
318 NS_ASSERTION(!mContentType.IsEmpty(),
319 "Content type should be known by now.");
320 return;
321 }
322 if ((this->*(sSnifferEntries[i].mContentTypeSniffer))(aRequest)) {
323 NS_ASSERTION(!mContentType.IsEmpty(),
324 "Content type should be known by now.");
325 return;
326 }
327 }
328 }
329
330 NS_SniffContent(NS_DATA_SNIFFER_CATEGORY, aRequest,
331 (const uint8_t*)mBuffer, mBufferLen, mContentType);
332 if (!mContentType.IsEmpty()) {
333 return;
334 }
335
336 if (SniffForHTML(aRequest)) {
337 NS_ASSERTION(!mContentType.IsEmpty(),
338 "Content type should be known by now.");
339 return;
340 }
341
342 // We don't know what this is yet. Before we just give up, try
343 // the URI from the request.
344 if (SniffURI(aRequest)) {
345 NS_ASSERTION(!mContentType.IsEmpty(),
346 "Content type should be known by now.");
347 return;
348 }
349
350 LastDitchSniff(aRequest);
351 NS_ASSERTION(!mContentType.IsEmpty(),
352 "Content type should be known by now.");
353 }
354
355 bool nsUnknownDecoder::SniffForHTML(nsIRequest* aRequest)
356 {
357 /*
358 * To prevent a possible attack, we will not consider this to be
359 * html content if it comes from the local file system and our prefs
360 * are set right
361 */
362 if (!AllowSniffing(aRequest)) {
363 return false;
364 }
365
366 // Now look for HTML.
367 const char* str = mBuffer;
368 const char* end = mBuffer + mBufferLen;
369
370 // skip leading whitespace
371 while (str != end && nsCRT::IsAsciiSpace(*str)) {
372 ++str;
373 }
374
375 // did we find something like a start tag?
376 if (str == end || *str != '<' || ++str == end) {
377 return false;
378 }
379
380 // If we seem to be SGML or XML and we got down here, just pretend we're HTML
381 if (*str == '!' || *str == '?') {
382 mContentType = TEXT_HTML;
383 return true;
384 }
385
386 uint32_t bufSize = end - str;
387 // We use sizeof(_tagstr) below because that's the length of _tagstr
388 // with the one char " " or ">" appended.
389 #define MATCHES_TAG(_tagstr) \
390 (bufSize >= sizeof(_tagstr) && \
391 (PL_strncasecmp(str, _tagstr " ", sizeof(_tagstr)) == 0 || \
392 PL_strncasecmp(str, _tagstr ">", sizeof(_tagstr)) == 0))
393
394 if (MATCHES_TAG("html") ||
395 MATCHES_TAG("frameset") ||
396 MATCHES_TAG("body") ||
397 MATCHES_TAG("head") ||
398 MATCHES_TAG("script") ||
399 MATCHES_TAG("iframe") ||
400 MATCHES_TAG("a") ||
401 MATCHES_TAG("img") ||
402 MATCHES_TAG("table") ||
403 MATCHES_TAG("title") ||
404 MATCHES_TAG("link") ||
405 MATCHES_TAG("base") ||
406 MATCHES_TAG("style") ||
407 MATCHES_TAG("div") ||
408 MATCHES_TAG("p") ||
409 MATCHES_TAG("font") ||
410 MATCHES_TAG("applet") ||
411 MATCHES_TAG("meta") ||
412 MATCHES_TAG("center") ||
413 MATCHES_TAG("form") ||
414 MATCHES_TAG("isindex") ||
415 MATCHES_TAG("h1") ||
416 MATCHES_TAG("h2") ||
417 MATCHES_TAG("h3") ||
418 MATCHES_TAG("h4") ||
419 MATCHES_TAG("h5") ||
420 MATCHES_TAG("h6") ||
421 MATCHES_TAG("b") ||
422 MATCHES_TAG("pre")) {
423
424 mContentType = TEXT_HTML;
425 return true;
426 }
427
428 #undef MATCHES_TAG
429
430 return false;
431 }
432
433 bool nsUnknownDecoder::SniffForXML(nsIRequest* aRequest)
434 {
435 // Just like HTML, this should be able to be shut off.
436 if (!AllowSniffing(aRequest)) {
437 return false;
438 }
439
440 // First see whether we can glean anything from the uri...
441 if (!SniffURI(aRequest)) {
442 // Oh well; just generic XML will have to do
443 mContentType = TEXT_XML;
444 }
445
446 return true;
447 }
448
449 bool nsUnknownDecoder::SniffURI(nsIRequest* aRequest)
450 {
451 nsCOMPtr<nsIMIMEService> mimeService(do_GetService("@mozilla.org/mime;1"));
452 if (mimeService) {
453 nsCOMPtr<nsIChannel> channel = do_QueryInterface(aRequest);
454 if (channel) {
455 nsCOMPtr<nsIURI> uri;
456 nsresult result = channel->GetURI(getter_AddRefs(uri));
457 if (NS_SUCCEEDED(result) && uri) {
458 nsAutoCString type;
459 result = mimeService->GetTypeFromURI(uri, type);
460 if (NS_SUCCEEDED(result)) {
461 mContentType = type;
462 return true;
463 }
464 }
465 }
466 }
467
468 return false;
469 }
470
// Text/non-text classification based on RFC 2046 Section 4.1.2.
// Control characters 0-31 count as non-text, with two exceptions:
// the 9-13 range (\t, \n, \v, \f, \r) and char 27 (used by encodings
// like Shift_JIS).  Everything from 32 upwards, looked at as an
// unsigned char, counts as text.
#define IS_TEXT_CHAR(ch)                                                  \
  (((unsigned char)(ch)) >= 32 || ((ch) >= 9 && (ch) <= 13) || 27 == (ch))
476
477 bool nsUnknownDecoder::LastDitchSniff(nsIRequest* aRequest)
478 {
479 // All we can do now is try to guess whether this is text/plain or
480 // application/octet-stream
481
482 // First, check for a BOM. If we see one, assume this is text/plain
483 // in whatever encoding. If there is a BOM _and_ text we will
484 // always have at least 4 bytes in the buffer (since the 2-byte BOMs
485 // are for 2-byte encodings and the UTF-8 BOM is 3 bytes).
486 if (mBufferLen >= 4) {
487 const unsigned char* buf = (const unsigned char*)mBuffer;
488 if ((buf[0] == 0xFE && buf[1] == 0xFF) || // UTF-16, Big Endian
489 (buf[0] == 0xFF && buf[1] == 0xFE) || // UTF-16 or UCS-4, Little Endian
490 (buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF) || // UTF-8
491 (buf[0] == 0 && buf[1] == 0 && buf[2] == 0xFE && buf[3] == 0xFF)) { // UCS-4, Big Endian
492
493 mContentType = TEXT_PLAIN;
494 return true;
495 }
496 }
497
498 // Now see whether the buffer has any non-text chars. If not, then let's
499 // just call it text/plain...
500 //
501 uint32_t i;
502 for (i = 0; i < mBufferLen && IS_TEXT_CHAR(mBuffer[i]); i++) {
503 continue;
504 }
505
506 if (i == mBufferLen) {
507 mContentType = TEXT_PLAIN;
508 }
509 else {
510 mContentType = APPLICATION_OCTET_STREAM;
511 }
512
513 return true;
514 }
515
516
517 nsresult nsUnknownDecoder::FireListenerNotifications(nsIRequest* request,
518 nsISupports *aCtxt)
519 {
520 nsresult rv = NS_OK;
521
522 if (!mNextListener) return NS_ERROR_FAILURE;
523
524 if (!mContentType.IsEmpty()) {
525 nsCOMPtr<nsIViewSourceChannel> viewSourceChannel =
526 do_QueryInterface(request);
527 if (viewSourceChannel) {
528 rv = viewSourceChannel->SetOriginalContentType(mContentType);
529 } else {
530 nsCOMPtr<nsIChannel> channel = do_QueryInterface(request, &rv);
531 if (NS_SUCCEEDED(rv)) {
532 // Set the new content type on the channel...
533 rv = channel->SetContentType(mContentType);
534 }
535 }
536
537 NS_ASSERTION(NS_SUCCEEDED(rv), "Unable to set content type on channel!");
538
539 if (NS_FAILED(rv)) {
540 // Cancel the request to make sure it has the correct status if
541 // mNextListener looks at it.
542 request->Cancel(rv);
543 mNextListener->OnStartRequest(request, aCtxt);
544 return rv;
545 }
546 }
547
548 // Fire the OnStartRequest(...)
549 rv = mNextListener->OnStartRequest(request, aCtxt);
550
551 if (!mBuffer) return NS_ERROR_OUT_OF_MEMORY;
552
553 // If the request was canceled, then we need to treat that equivalently
554 // to an error returned by OnStartRequest.
555 if (NS_SUCCEEDED(rv))
556 request->GetStatus(&rv);
557
558 // Fire the first OnDataAvailable for the data that was read from the
559 // stream into the sniffer buffer...
560 if (NS_SUCCEEDED(rv) && (mBufferLen > 0)) {
561 uint32_t len = 0;
562 nsCOMPtr<nsIInputStream> in;
563 nsCOMPtr<nsIOutputStream> out;
564
565 // Create a pipe and fill it with the data from the sniffer buffer.
566 rv = NS_NewPipe(getter_AddRefs(in), getter_AddRefs(out),
567 MAX_BUFFER_SIZE, MAX_BUFFER_SIZE);
568
569 if (NS_SUCCEEDED(rv)) {
570 rv = out->Write(mBuffer, mBufferLen, &len);
571 if (NS_SUCCEEDED(rv)) {
572 if (len == mBufferLen) {
573 rv = mNextListener->OnDataAvailable(request, aCtxt, in, 0, len);
574 } else {
575 NS_ERROR("Unable to write all the data into the pipe.");
576 rv = NS_ERROR_FAILURE;
577 }
578 }
579 }
580 }
581
582 delete [] mBuffer;
583 mBuffer = nullptr;
584 mBufferLen = 0;
585
586 return rv;
587 }
588
589 void
590 nsBinaryDetector::DetermineContentType(nsIRequest* aRequest)
591 {
592 nsCOMPtr<nsIHttpChannel> httpChannel = do_QueryInterface(aRequest);
593 if (!httpChannel) {
594 return;
595 }
596
597 // It's an HTTP channel. Check for the text/plain mess
598 nsAutoCString contentTypeHdr;
599 httpChannel->GetResponseHeader(NS_LITERAL_CSTRING("Content-Type"),
600 contentTypeHdr);
601 nsAutoCString contentType;
602 httpChannel->GetContentType(contentType);
603
604 // Make sure to do a case-sensitive exact match comparison here. Apache
605 // 1.x just sends text/plain for "unknown", while Apache 2.x sends
606 // text/plain with a ISO-8859-1 charset. Debian's Apache version, just to
607 // be different, sends text/plain with iso-8859-1 charset. For extra fun,
608 // FC7, RHEL4, and Ubuntu Feisty send charset=UTF-8. Don't do general
609 // case-insensitive comparison, since we really want to apply this crap as
610 // rarely as we can.
611 if (!contentType.EqualsLiteral("text/plain") ||
612 (!contentTypeHdr.EqualsLiteral("text/plain") &&
613 !contentTypeHdr.EqualsLiteral("text/plain; charset=ISO-8859-1") &&
614 !contentTypeHdr.EqualsLiteral("text/plain; charset=iso-8859-1") &&
615 !contentTypeHdr.EqualsLiteral("text/plain; charset=UTF-8"))) {
616 return;
617 }
618
619 // Check whether we have content-encoding. If we do, don't try to
620 // detect the type.
621 // XXXbz we could improve this by doing a local decompress if we
622 // wanted, I'm sure.
623 nsAutoCString contentEncoding;
624 httpChannel->GetResponseHeader(NS_LITERAL_CSTRING("Content-Encoding"),
625 contentEncoding);
626 if (!contentEncoding.IsEmpty()) {
627 return;
628 }
629
630 LastDitchSniff(aRequest);
631 if (mContentType.Equals(APPLICATION_OCTET_STREAM)) {
632 // We want to guess at it instead
633 mContentType = APPLICATION_GUESS_FROM_EXT;
634 } else {
635 // Let the text/plain type we already have be, so that other content
636 // sniffers can also get a shot at this data.
637 mContentType.Truncate();
638 }
639 }

mercurial