browser/components/feeds/src/nsFeedSniffer.cpp

Wed, 31 Dec 2014 13:27:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 13:27:57 +0100
branch
TOR_BUG_3246
changeset 6
8bccb770b82d
permissions
-rw-r--r--

Ignore runtime configuration files generated during quality assurance.

     1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     2 /* This Source Code Form is subject to the terms of the Mozilla Public
     3  * License, v. 2.0. If a copy of the MPL was not distributed with this
     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     6 #include "nsFeedSniffer.h"
     9 #include "nsNetCID.h"
    10 #include "nsXPCOM.h"
    11 #include "nsCOMPtr.h"
    12 #include "nsStringStream.h"
    14 #include "nsBrowserCompsCID.h"
    16 #include "nsICategoryManager.h"
    17 #include "nsIServiceManager.h"
    18 #include "nsComponentManagerUtils.h"
    19 #include "nsServiceManagerUtils.h"
    21 #include "nsIStreamConverterService.h"
    22 #include "nsIStreamConverter.h"
    24 #include "nsIStreamListener.h"
    26 #include "nsIHttpChannel.h"
    27 #include "nsIMIMEHeaderParam.h"
    29 #include "nsMimeTypes.h"
    30 #include "nsIURI.h"
    31 #include <algorithm>
    33 #define TYPE_ATOM "application/atom+xml"
    34 #define TYPE_RSS "application/rss+xml"
    35 #define TYPE_MAYBE_FEED "application/vnd.mozilla.maybe.feed"
    37 #define NS_RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    38 #define NS_RSS "http://purl.org/rss/1.0/"
    40 #define MAX_BYTES 512u
    42 NS_IMPL_ISUPPORTS(nsFeedSniffer,
    43                   nsIContentSniffer,
    44                   nsIStreamListener,
    45                   nsIRequestObserver)
    47 nsresult
    48 nsFeedSniffer::ConvertEncodedData(nsIRequest* request,
    49                                   const uint8_t* data,
    50                                   uint32_t length)
    51 {
    52   nsresult rv = NS_OK;
    54  mDecodedData = "";
    55  nsCOMPtr<nsIHttpChannel> httpChannel(do_QueryInterface(request));
    56   if (!httpChannel)
    57     return NS_ERROR_NO_INTERFACE;
    59   nsAutoCString contentEncoding;
    60   httpChannel->GetResponseHeader(NS_LITERAL_CSTRING("Content-Encoding"), 
    61                                  contentEncoding);
    62   if (!contentEncoding.IsEmpty()) {
    63     nsCOMPtr<nsIStreamConverterService> converterService(do_GetService(NS_STREAMCONVERTERSERVICE_CONTRACTID));
    64     if (converterService) {
    65       ToLowerCase(contentEncoding);
    67       nsCOMPtr<nsIStreamListener> converter;
    68       rv = converterService->AsyncConvertData(contentEncoding.get(), 
    69                                               "uncompressed", this, nullptr, 
    70                                               getter_AddRefs(converter));
    71       NS_ENSURE_SUCCESS(rv, rv);
    73       converter->OnStartRequest(request, nullptr);
    75       nsCOMPtr<nsIStringInputStream> rawStream =
    76         do_CreateInstance(NS_STRINGINPUTSTREAM_CONTRACTID);
    77       if (!rawStream)
    78         return NS_ERROR_FAILURE;
    80       rv = rawStream->SetData((const char*)data, length);
    81       NS_ENSURE_SUCCESS(rv, rv);
    83       rv = converter->OnDataAvailable(request, nullptr, rawStream, 0, length);
    84       NS_ENSURE_SUCCESS(rv, rv);
    86       converter->OnStopRequest(request, nullptr, NS_OK);
    87     }
    88   }
    89   return rv;
    90 }
    92 template<int N>
    93 static bool
    94 StringBeginsWithLowercaseLiteral(nsAString& aString,
    95                                  const char (&aSubstring)[N])
    96 {
    97   return StringHead(aString, N).LowerCaseEqualsLiteral(aSubstring);
    98 }
   100 bool
   101 HasAttachmentDisposition(nsIHttpChannel* httpChannel)
   102 {
   103   if (!httpChannel)
   104     return false;
   106   uint32_t disp;
   107   nsresult rv = httpChannel->GetContentDisposition(&disp);
   109   if (NS_SUCCEEDED(rv) && disp == nsIChannel::DISPOSITION_ATTACHMENT)
   110     return true;
   112   return false;
   113 }
   115 /**
   116  * @return the first occurrence of a character within a string buffer,
   117  *         or nullptr if not found
   118  */
   119 static const char*
   120 FindChar(char c, const char *begin, const char *end)
   121 {
   122   for (; begin < end; ++begin) {
   123     if (*begin == c)
   124       return begin;
   125   }
   126   return nullptr;
   127 }
   129 /**
   130  *
   131  * Determine if a substring is the "documentElement" in the document.
   132  *
   133  * All of our sniffed substrings: <rss, <feed, <rdf:RDF must be the "document"
   134  * element within the XML DOM, i.e. the root container element. Otherwise,
   135  * it's possible that someone embedded one of these tags inside a document of
   136  * another type, e.g. a HTML document, and we don't want to show the preview
   137  * page if the document isn't actually a feed.
   138  * 
   139  * @param   start
   140  *          The beginning of the data being sniffed
   141  * @param   end
   142  *          The end of the data being sniffed, right before the substring that
   143  *          was found.
   144  * @returns true if the found substring is the documentElement, false 
   145  *          otherwise.
   146  */
   147 static bool
   148 IsDocumentElement(const char *start, const char* end)
   149 {
   150   // For every tag in the buffer, check to see if it's a PI, Doctype or 
   151   // comment, our desired substring or something invalid.
   152   while ( (start = FindChar('<', start, end)) ) {
   153     ++start;
   154     if (start >= end)
   155       return false;
   157     // Check to see if the character following the '<' is either '?' or '!'
   158     // (processing instruction or doctype or comment)... these are valid nodes
   159     // to have in the prologue. 
   160     if (*start != '?' && *start != '!')
   161       return false;
   163     // Now advance the iterator until the '>' (We do this because we don't want
   164     // to sniff indicator substrings that are embedded within other nodes, e.g.
   165     // comments: <!-- <rdf:RDF .. > -->
   166     start = FindChar('>', start, end);
   167     if (!start)
   168       return false;
   170     ++start;
   171   }
   172   return true;
   173 }
   175 /**
   176  * Determines whether or not a string exists as the root element in an XML data
   177  * string buffer.
   178  * @param   dataString
   179  *          The data being sniffed
   180  * @param   substring
   181  *          The substring being tested for existence and root-ness.
   182  * @returns true if the substring exists and is the documentElement, false
   183  *          otherwise.
   184  */
   185 static bool
   186 ContainsTopLevelSubstring(nsACString& dataString, const char *substring) 
   187 {
   188   int32_t offset = dataString.Find(substring);
   189   if (offset == -1)
   190     return false;
   192   const char *begin = dataString.BeginReading();
   194   // Only do the validation when we find the substring.
   195   return IsDocumentElement(begin, begin + offset);
   196 }
   198 NS_IMETHODIMP
   199 nsFeedSniffer::GetMIMETypeFromContent(nsIRequest* request, 
   200                                       const uint8_t* data, 
   201                                       uint32_t length, 
   202                                       nsACString& sniffedType)
   203 {
   204   nsCOMPtr<nsIHttpChannel> channel(do_QueryInterface(request));
   205   if (!channel)
   206     return NS_ERROR_NO_INTERFACE;
   208   // Check that this is a GET request, since you can't subscribe to a POST...
   209   nsAutoCString method;
   210   channel->GetRequestMethod(method);
   211   if (!method.Equals("GET")) {
   212     sniffedType.Truncate();
   213     return NS_OK;
   214   }
   216   // We need to find out if this is a load of a view-source document. In this
   217   // case we do not want to override the content type, since the source display
   218   // does not need to be converted from feed format to XUL. More importantly, 
   219   // we don't want to change the content type from something 
   220   // nsContentDLF::CreateInstance knows about (e.g. application/xml, text/html 
   221   // etc) to something that only the application fe knows about (maybe.feed) 
   222   // thus deactivating syntax highlighting.
   223   nsCOMPtr<nsIURI> originalURI;
   224   channel->GetOriginalURI(getter_AddRefs(originalURI));
   226   nsAutoCString scheme;
   227   originalURI->GetScheme(scheme);
   228   if (scheme.EqualsLiteral("view-source")) {
   229     sniffedType.Truncate();
   230     return NS_OK;
   231   }
   233   // Check the Content-Type to see if it is set correctly. If it is set to 
   234   // something specific that we think is a reliable indication of a feed, don't
   235   // bother sniffing since we assume the site maintainer knows what they're 
   236   // doing. 
   237   nsAutoCString contentType;
   238   channel->GetContentType(contentType);
   239   bool noSniff = contentType.EqualsLiteral(TYPE_RSS) ||
   240                    contentType.EqualsLiteral(TYPE_ATOM);
   242   // Check to see if this was a feed request from the location bar or from
   243   // the feed: protocol. This is also a reliable indication.
   244   // The value of the header doesn't matter.  
   245   if (!noSniff) {
   246     nsAutoCString sniffHeader;
   247     nsresult foundHeader =
   248       channel->GetRequestHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
   249                                 sniffHeader);
   250     noSniff = NS_SUCCEEDED(foundHeader);
   251   }
   253   if (noSniff) {
   254     // check for an attachment after we have a likely feed.
   255     if(HasAttachmentDisposition(channel)) {
   256       sniffedType.Truncate();
   257       return NS_OK;
   258     }
   260     // set the feed header as a response header, since we have good metadata
   261     // telling us that the feed is supposed to be RSS or Atom
   262     channel->SetResponseHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
   263                                NS_LITERAL_CSTRING("1"), false);
   264     sniffedType.AssignLiteral(TYPE_MAYBE_FEED);
   265     return NS_OK;
   266   }
   268   // Don't sniff arbitrary types.  Limit sniffing to situations that
   269   // we think can reasonably arise.
   270   if (!contentType.EqualsLiteral(TEXT_HTML) &&
   271       !contentType.EqualsLiteral(APPLICATION_OCTET_STREAM) &&
   272       // Same criterion as XMLHttpRequest.  Should we be checking for "+xml"
   273       // and check for text/xml and application/xml by hand instead?
   274       contentType.Find("xml") == -1) {
   275     sniffedType.Truncate();
   276     return NS_OK;
   277   }
   279   // Now we need to potentially decompress data served with 
   280   // Content-Encoding: gzip
   281   nsresult rv = ConvertEncodedData(request, data, length);
   282   if (NS_FAILED(rv))
   283     return rv;
   285   // We cap the number of bytes to scan at MAX_BYTES to prevent picking up 
   286   // false positives by accidentally reading document content, e.g. a "how to
   287   // make a feed" page.
   288   const char* testData;
   289   if (mDecodedData.IsEmpty()) {
   290     testData = (const char*)data;
   291     length = std::min(length, MAX_BYTES);
   292   } else {
   293     testData = mDecodedData.get();
   294     length = std::min(mDecodedData.Length(), MAX_BYTES);
   295   }
   297   // The strategy here is based on that described in:
   298   // http://blogs.msdn.com/rssteam/articles/PublishersGuide.aspx
   299   // for interoperarbility purposes.
   301   // Thus begins the actual sniffing.
   302   nsDependentCSubstring dataString((const char*)testData, length);
   304   bool isFeed = false;
   306   // RSS 0.91/0.92/2.0
   307   isFeed = ContainsTopLevelSubstring(dataString, "<rss");
   309   // Atom 1.0
   310   if (!isFeed)
   311     isFeed = ContainsTopLevelSubstring(dataString, "<feed");
   313   // RSS 1.0
   314   if (!isFeed) {
   315     isFeed = ContainsTopLevelSubstring(dataString, "<rdf:RDF") &&
   316       dataString.Find(NS_RDF) != -1 &&
   317       dataString.Find(NS_RSS) != -1;
   318   }
   320   // If we sniffed a feed, coerce our internal type
   321   if (isFeed && !HasAttachmentDisposition(channel))
   322     sniffedType.AssignLiteral(TYPE_MAYBE_FEED);
   323   else
   324     sniffedType.Truncate();
   325   return NS_OK;
   326 }
   328 NS_IMETHODIMP
   329 nsFeedSniffer::OnStartRequest(nsIRequest* request, nsISupports* context)
   330 {
   331   return NS_OK;
   332 }
   334 NS_METHOD
   335 nsFeedSniffer::AppendSegmentToString(nsIInputStream* inputStream,
   336                                      void* closure,
   337                                      const char* rawSegment,
   338                                      uint32_t toOffset,
   339                                      uint32_t count,
   340                                      uint32_t* writeCount)
   341 {
   342   nsCString* decodedData = static_cast<nsCString*>(closure);
   343   decodedData->Append(rawSegment, count);
   344   *writeCount = count;
   345   return NS_OK;
   346 }
   348 NS_IMETHODIMP
   349 nsFeedSniffer::OnDataAvailable(nsIRequest* request, nsISupports* context,
   350                                nsIInputStream* stream, uint64_t offset, 
   351                                uint32_t count)
   352 {
   353   uint32_t read;
   354   return stream->ReadSegments(AppendSegmentToString, &mDecodedData, count, 
   355                               &read);
   356 }
   358 NS_IMETHODIMP
   359 nsFeedSniffer::OnStopRequest(nsIRequest* request, nsISupports* context, 
   360                              nsresult status)
   361 {
   362   return NS_OK; 
   363 }

mercurial