browser/components/feeds/src/nsFeedSniffer.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/browser/components/feeds/src/nsFeedSniffer.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,363 @@
     1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +
     1.9 +#include "nsFeedSniffer.h"
    1.10 +
    1.11 +
    1.12 +#include "nsNetCID.h"
    1.13 +#include "nsXPCOM.h"
    1.14 +#include "nsCOMPtr.h"
    1.15 +#include "nsStringStream.h"
    1.16 +
    1.17 +#include "nsBrowserCompsCID.h"
    1.18 +
    1.19 +#include "nsICategoryManager.h"
    1.20 +#include "nsIServiceManager.h"
    1.21 +#include "nsComponentManagerUtils.h"
    1.22 +#include "nsServiceManagerUtils.h"
    1.23 +
    1.24 +#include "nsIStreamConverterService.h"
    1.25 +#include "nsIStreamConverter.h"
    1.26 +
    1.27 +#include "nsIStreamListener.h"
    1.28 +
    1.29 +#include "nsIHttpChannel.h"
    1.30 +#include "nsIMIMEHeaderParam.h"
    1.31 +
    1.32 +#include "nsMimeTypes.h"
    1.33 +#include "nsIURI.h"
    1.34 +#include <algorithm>
    1.35 +
    1.36 +#define TYPE_ATOM "application/atom+xml"
    1.37 +#define TYPE_RSS "application/rss+xml"
    1.38 +#define TYPE_MAYBE_FEED "application/vnd.mozilla.maybe.feed"
    1.39 +
    1.40 +#define NS_RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    1.41 +#define NS_RSS "http://purl.org/rss/1.0/"
    1.42 +
    1.43 +#define MAX_BYTES 512u
    1.44 +
    1.45 +NS_IMPL_ISUPPORTS(nsFeedSniffer,
    1.46 +                  nsIContentSniffer,
    1.47 +                  nsIStreamListener,
    1.48 +                  nsIRequestObserver)
    1.49 +
    1.50 +nsresult
    1.51 +nsFeedSniffer::ConvertEncodedData(nsIRequest* request,
    1.52 +                                  const uint8_t* data,
    1.53 +                                  uint32_t length)
    1.54 +{
    1.55 +  nsresult rv = NS_OK;
    1.56 +
    1.57 + mDecodedData = "";
    1.58 + nsCOMPtr<nsIHttpChannel> httpChannel(do_QueryInterface(request));
    1.59 +  if (!httpChannel)
    1.60 +    return NS_ERROR_NO_INTERFACE;
    1.61 +
    1.62 +  nsAutoCString contentEncoding;
    1.63 +  httpChannel->GetResponseHeader(NS_LITERAL_CSTRING("Content-Encoding"), 
    1.64 +                                 contentEncoding);
    1.65 +  if (!contentEncoding.IsEmpty()) {
    1.66 +    nsCOMPtr<nsIStreamConverterService> converterService(do_GetService(NS_STREAMCONVERTERSERVICE_CONTRACTID));
    1.67 +    if (converterService) {
    1.68 +      ToLowerCase(contentEncoding);
    1.69 +
    1.70 +      nsCOMPtr<nsIStreamListener> converter;
    1.71 +      rv = converterService->AsyncConvertData(contentEncoding.get(), 
    1.72 +                                              "uncompressed", this, nullptr, 
    1.73 +                                              getter_AddRefs(converter));
    1.74 +      NS_ENSURE_SUCCESS(rv, rv);
    1.75 +
    1.76 +      converter->OnStartRequest(request, nullptr);
    1.77 +
    1.78 +      nsCOMPtr<nsIStringInputStream> rawStream =
    1.79 +        do_CreateInstance(NS_STRINGINPUTSTREAM_CONTRACTID);
    1.80 +      if (!rawStream)
    1.81 +        return NS_ERROR_FAILURE;
    1.82 +
    1.83 +      rv = rawStream->SetData((const char*)data, length);
    1.84 +      NS_ENSURE_SUCCESS(rv, rv);
    1.85 +
    1.86 +      rv = converter->OnDataAvailable(request, nullptr, rawStream, 0, length);
    1.87 +      NS_ENSURE_SUCCESS(rv, rv);
    1.88 +
    1.89 +      converter->OnStopRequest(request, nullptr, NS_OK);
    1.90 +    }
    1.91 +  }
    1.92 +  return rv;
    1.93 +}
    1.94 +
    1.95 +template<int N>
    1.96 +static bool
    1.97 +StringBeginsWithLowercaseLiteral(nsAString& aString,
    1.98 +                                 const char (&aSubstring)[N])
    1.99 +{
   1.100 +  return StringHead(aString, N).LowerCaseEqualsLiteral(aSubstring);
   1.101 +}
   1.102 +
   1.103 +bool
   1.104 +HasAttachmentDisposition(nsIHttpChannel* httpChannel)
   1.105 +{
   1.106 +  if (!httpChannel)
   1.107 +    return false;
   1.108 +
   1.109 +  uint32_t disp;
   1.110 +  nsresult rv = httpChannel->GetContentDisposition(&disp);
   1.111 +
   1.112 +  if (NS_SUCCEEDED(rv) && disp == nsIChannel::DISPOSITION_ATTACHMENT)
   1.113 +    return true;
   1.114 +
   1.115 +  return false;
   1.116 +}
   1.117 +
   1.118 +/**
   1.119 + * @return the first occurrence of a character within a string buffer,
   1.120 + *         or nullptr if not found
   1.121 + */
   1.122 +static const char*
   1.123 +FindChar(char c, const char *begin, const char *end)
   1.124 +{
   1.125 +  for (; begin < end; ++begin) {
   1.126 +    if (*begin == c)
   1.127 +      return begin;
   1.128 +  }
   1.129 +  return nullptr;
   1.130 +}
   1.131 +
   1.132 +/**
   1.133 + *
   1.134 + * Determine if a substring is the "documentElement" in the document.
   1.135 + *
   1.136 + * All of our sniffed substrings: <rss, <feed, <rdf:RDF must be the "document"
   1.137 + * element within the XML DOM, i.e. the root container element. Otherwise,
   1.138 + * it's possible that someone embedded one of these tags inside a document of
   1.139 + * another type, e.g. a HTML document, and we don't want to show the preview
   1.140 + * page if the document isn't actually a feed.
   1.141 + * 
   1.142 + * @param   start
   1.143 + *          The beginning of the data being sniffed
   1.144 + * @param   end
   1.145 + *          The end of the data being sniffed, right before the substring that
   1.146 + *          was found.
   1.147 + * @returns true if the found substring is the documentElement, false 
   1.148 + *          otherwise.
   1.149 + */
   1.150 +static bool
   1.151 +IsDocumentElement(const char *start, const char* end)
   1.152 +{
   1.153 +  // For every tag in the buffer, check to see if it's a PI, Doctype or 
   1.154 +  // comment, our desired substring or something invalid.
   1.155 +  while ( (start = FindChar('<', start, end)) ) {
   1.156 +    ++start;
   1.157 +    if (start >= end)
   1.158 +      return false;
   1.159 +
   1.160 +    // Check to see if the character following the '<' is either '?' or '!'
   1.161 +    // (processing instruction or doctype or comment)... these are valid nodes
   1.162 +    // to have in the prologue. 
   1.163 +    if (*start != '?' && *start != '!')
   1.164 +      return false;
   1.165 +    
   1.166 +    // Now advance the iterator until the '>' (We do this because we don't want
   1.167 +    // to sniff indicator substrings that are embedded within other nodes, e.g.
   1.168 +    // comments: <!-- <rdf:RDF .. > -->
   1.169 +    start = FindChar('>', start, end);
   1.170 +    if (!start)
   1.171 +      return false;
   1.172 +
   1.173 +    ++start;
   1.174 +  }
   1.175 +  return true;
   1.176 +}
   1.177 +
   1.178 +/**
   1.179 + * Determines whether or not a string exists as the root element in an XML data
   1.180 + * string buffer.
   1.181 + * @param   dataString
   1.182 + *          The data being sniffed
   1.183 + * @param   substring
   1.184 + *          The substring being tested for existence and root-ness.
   1.185 + * @returns true if the substring exists and is the documentElement, false
   1.186 + *          otherwise.
   1.187 + */
   1.188 +static bool
   1.189 +ContainsTopLevelSubstring(nsACString& dataString, const char *substring) 
   1.190 +{
   1.191 +  int32_t offset = dataString.Find(substring);
   1.192 +  if (offset == -1)
   1.193 +    return false;
   1.194 +
   1.195 +  const char *begin = dataString.BeginReading();
   1.196 +
   1.197 +  // Only do the validation when we find the substring.
   1.198 +  return IsDocumentElement(begin, begin + offset);
   1.199 +}
   1.200 +
   1.201 +NS_IMETHODIMP
   1.202 +nsFeedSniffer::GetMIMETypeFromContent(nsIRequest* request, 
   1.203 +                                      const uint8_t* data, 
   1.204 +                                      uint32_t length, 
   1.205 +                                      nsACString& sniffedType)
   1.206 +{
   1.207 +  nsCOMPtr<nsIHttpChannel> channel(do_QueryInterface(request));
   1.208 +  if (!channel)
   1.209 +    return NS_ERROR_NO_INTERFACE;
   1.210 +
   1.211 +  // Check that this is a GET request, since you can't subscribe to a POST...
   1.212 +  nsAutoCString method;
   1.213 +  channel->GetRequestMethod(method);
   1.214 +  if (!method.Equals("GET")) {
   1.215 +    sniffedType.Truncate();
   1.216 +    return NS_OK;
   1.217 +  }
   1.218 +
   1.219 +  // We need to find out if this is a load of a view-source document. In this
   1.220 +  // case we do not want to override the content type, since the source display
   1.221 +  // does not need to be converted from feed format to XUL. More importantly, 
   1.222 +  // we don't want to change the content type from something 
   1.223 +  // nsContentDLF::CreateInstance knows about (e.g. application/xml, text/html 
   1.224 +  // etc) to something that only the application fe knows about (maybe.feed) 
   1.225 +  // thus deactivating syntax highlighting.
   1.226 +  nsCOMPtr<nsIURI> originalURI;
   1.227 +  channel->GetOriginalURI(getter_AddRefs(originalURI));
   1.228 +
   1.229 +  nsAutoCString scheme;
   1.230 +  originalURI->GetScheme(scheme);
   1.231 +  if (scheme.EqualsLiteral("view-source")) {
   1.232 +    sniffedType.Truncate();
   1.233 +    return NS_OK;
   1.234 +  }
   1.235 +
   1.236 +  // Check the Content-Type to see if it is set correctly. If it is set to 
   1.237 +  // something specific that we think is a reliable indication of a feed, don't
   1.238 +  // bother sniffing since we assume the site maintainer knows what they're 
   1.239 +  // doing. 
   1.240 +  nsAutoCString contentType;
   1.241 +  channel->GetContentType(contentType);
   1.242 +  bool noSniff = contentType.EqualsLiteral(TYPE_RSS) ||
   1.243 +                   contentType.EqualsLiteral(TYPE_ATOM);
   1.244 +
   1.245 +  // Check to see if this was a feed request from the location bar or from
   1.246 +  // the feed: protocol. This is also a reliable indication.
   1.247 +  // The value of the header doesn't matter.  
   1.248 +  if (!noSniff) {
   1.249 +    nsAutoCString sniffHeader;
   1.250 +    nsresult foundHeader =
   1.251 +      channel->GetRequestHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
   1.252 +                                sniffHeader);
   1.253 +    noSniff = NS_SUCCEEDED(foundHeader);
   1.254 +  }
   1.255 +
   1.256 +  if (noSniff) {
   1.257 +    // check for an attachment after we have a likely feed.
   1.258 +    if(HasAttachmentDisposition(channel)) {
   1.259 +      sniffedType.Truncate();
   1.260 +      return NS_OK;
   1.261 +    }
   1.262 +
   1.263 +    // set the feed header as a response header, since we have good metadata
   1.264 +    // telling us that the feed is supposed to be RSS or Atom
   1.265 +    channel->SetResponseHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
   1.266 +                               NS_LITERAL_CSTRING("1"), false);
   1.267 +    sniffedType.AssignLiteral(TYPE_MAYBE_FEED);
   1.268 +    return NS_OK;
   1.269 +  }
   1.270 +
   1.271 +  // Don't sniff arbitrary types.  Limit sniffing to situations that
   1.272 +  // we think can reasonably arise.
   1.273 +  if (!contentType.EqualsLiteral(TEXT_HTML) &&
   1.274 +      !contentType.EqualsLiteral(APPLICATION_OCTET_STREAM) &&
   1.275 +      // Same criterion as XMLHttpRequest.  Should we be checking for "+xml"
   1.276 +      // and check for text/xml and application/xml by hand instead?
   1.277 +      contentType.Find("xml") == -1) {
   1.278 +    sniffedType.Truncate();
   1.279 +    return NS_OK;
   1.280 +  }
   1.281 +
   1.282 +  // Now we need to potentially decompress data served with 
   1.283 +  // Content-Encoding: gzip
   1.284 +  nsresult rv = ConvertEncodedData(request, data, length);
   1.285 +  if (NS_FAILED(rv))
   1.286 +    return rv;
   1.287 +
   1.288 +  // We cap the number of bytes to scan at MAX_BYTES to prevent picking up 
   1.289 +  // false positives by accidentally reading document content, e.g. a "how to
   1.290 +  // make a feed" page.
   1.291 +  const char* testData;
   1.292 +  if (mDecodedData.IsEmpty()) {
   1.293 +    testData = (const char*)data;
   1.294 +    length = std::min(length, MAX_BYTES);
   1.295 +  } else {
   1.296 +    testData = mDecodedData.get();
   1.297 +    length = std::min(mDecodedData.Length(), MAX_BYTES);
   1.298 +  }
   1.299 +
   1.300 +  // The strategy here is based on that described in:
   1.301 +  // http://blogs.msdn.com/rssteam/articles/PublishersGuide.aspx
   1.302 +  // for interoperarbility purposes.
   1.303 +
   1.304 +  // Thus begins the actual sniffing.
   1.305 +  nsDependentCSubstring dataString((const char*)testData, length);
   1.306 +
   1.307 +  bool isFeed = false;
   1.308 +
   1.309 +  // RSS 0.91/0.92/2.0
   1.310 +  isFeed = ContainsTopLevelSubstring(dataString, "<rss");
   1.311 +
   1.312 +  // Atom 1.0
   1.313 +  if (!isFeed)
   1.314 +    isFeed = ContainsTopLevelSubstring(dataString, "<feed");
   1.315 +
   1.316 +  // RSS 1.0
   1.317 +  if (!isFeed) {
   1.318 +    isFeed = ContainsTopLevelSubstring(dataString, "<rdf:RDF") &&
   1.319 +      dataString.Find(NS_RDF) != -1 &&
   1.320 +      dataString.Find(NS_RSS) != -1;
   1.321 +  }
   1.322 +
   1.323 +  // If we sniffed a feed, coerce our internal type
   1.324 +  if (isFeed && !HasAttachmentDisposition(channel))
   1.325 +    sniffedType.AssignLiteral(TYPE_MAYBE_FEED);
   1.326 +  else
   1.327 +    sniffedType.Truncate();
   1.328 +  return NS_OK;
   1.329 +}
   1.330 +
   1.331 +NS_IMETHODIMP
   1.332 +nsFeedSniffer::OnStartRequest(nsIRequest* request, nsISupports* context)
   1.333 +{
   1.334 +  return NS_OK;
   1.335 +}
   1.336 +
   1.337 +NS_METHOD
   1.338 +nsFeedSniffer::AppendSegmentToString(nsIInputStream* inputStream,
   1.339 +                                     void* closure,
   1.340 +                                     const char* rawSegment,
   1.341 +                                     uint32_t toOffset,
   1.342 +                                     uint32_t count,
   1.343 +                                     uint32_t* writeCount)
   1.344 +{
   1.345 +  nsCString* decodedData = static_cast<nsCString*>(closure);
   1.346 +  decodedData->Append(rawSegment, count);
   1.347 +  *writeCount = count;
   1.348 +  return NS_OK;
   1.349 +}
   1.350 +
   1.351 +NS_IMETHODIMP
   1.352 +nsFeedSniffer::OnDataAvailable(nsIRequest* request, nsISupports* context,
   1.353 +                               nsIInputStream* stream, uint64_t offset, 
   1.354 +                               uint32_t count)
   1.355 +{
   1.356 +  uint32_t read;
   1.357 +  return stream->ReadSegments(AppendSegmentToString, &mDecodedData, count, 
   1.358 +                              &read);
   1.359 +}
   1.360 +
   1.361 +NS_IMETHODIMP
   1.362 +nsFeedSniffer::OnStopRequest(nsIRequest* request, nsISupports* context, 
   1.363 +                             nsresult status)
   1.364 +{
   1.365 +  return NS_OK; 
   1.366 +}

mercurial