1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/browser/components/feeds/src/nsFeedSniffer.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,363 @@ 1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 +#include "nsFeedSniffer.h" 1.10 + 1.11 + 1.12 +#include "nsNetCID.h" 1.13 +#include "nsXPCOM.h" 1.14 +#include "nsCOMPtr.h" 1.15 +#include "nsStringStream.h" 1.16 + 1.17 +#include "nsBrowserCompsCID.h" 1.18 + 1.19 +#include "nsICategoryManager.h" 1.20 +#include "nsIServiceManager.h" 1.21 +#include "nsComponentManagerUtils.h" 1.22 +#include "nsServiceManagerUtils.h" 1.23 + 1.24 +#include "nsIStreamConverterService.h" 1.25 +#include "nsIStreamConverter.h" 1.26 + 1.27 +#include "nsIStreamListener.h" 1.28 + 1.29 +#include "nsIHttpChannel.h" 1.30 +#include "nsIMIMEHeaderParam.h" 1.31 + 1.32 +#include "nsMimeTypes.h" 1.33 +#include "nsIURI.h" 1.34 +#include <algorithm> 1.35 + 1.36 +#define TYPE_ATOM "application/atom+xml" 1.37 +#define TYPE_RSS "application/rss+xml" 1.38 +#define TYPE_MAYBE_FEED "application/vnd.mozilla.maybe.feed" 1.39 + 1.40 +#define NS_RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#" 1.41 +#define NS_RSS "http://purl.org/rss/1.0/" 1.42 + 1.43 +#define MAX_BYTES 512u 1.44 + 1.45 +NS_IMPL_ISUPPORTS(nsFeedSniffer, 1.46 + nsIContentSniffer, 1.47 + nsIStreamListener, 1.48 + nsIRequestObserver) 1.49 + 1.50 +nsresult 1.51 +nsFeedSniffer::ConvertEncodedData(nsIRequest* request, 1.52 + const uint8_t* data, 1.53 + uint32_t length) 1.54 +{ 1.55 + nsresult rv = NS_OK; 1.56 + 1.57 + mDecodedData = ""; 1.58 + nsCOMPtr<nsIHttpChannel> httpChannel(do_QueryInterface(request)); 1.59 + if (!httpChannel) 1.60 + return NS_ERROR_NO_INTERFACE; 1.61 + 1.62 + nsAutoCString contentEncoding; 1.63 + httpChannel->GetResponseHeader(NS_LITERAL_CSTRING("Content-Encoding"), 1.64 + contentEncoding); 1.65 + if (!contentEncoding.IsEmpty()) { 1.66 + nsCOMPtr<nsIStreamConverterService> converterService(do_GetService(NS_STREAMCONVERTERSERVICE_CONTRACTID)); 1.67 + if (converterService) { 1.68 + ToLowerCase(contentEncoding); 1.69 + 1.70 + nsCOMPtr<nsIStreamListener> converter; 1.71 + rv = converterService->AsyncConvertData(contentEncoding.get(), 1.72 + "uncompressed", this, nullptr, 1.73 + getter_AddRefs(converter)); 1.74 + NS_ENSURE_SUCCESS(rv, rv); 1.75 + 1.76 + converter->OnStartRequest(request, nullptr); 1.77 + 1.78 + nsCOMPtr<nsIStringInputStream> rawStream = 1.79 + do_CreateInstance(NS_STRINGINPUTSTREAM_CONTRACTID); 1.80 + if (!rawStream) 1.81 + return NS_ERROR_FAILURE; 1.82 + 1.83 + rv = rawStream->SetData((const char*)data, length); 1.84 + NS_ENSURE_SUCCESS(rv, rv); 1.85 + 1.86 + rv = converter->OnDataAvailable(request, nullptr, rawStream, 0, length); 1.87 + NS_ENSURE_SUCCESS(rv, rv); 1.88 + 1.89 + converter->OnStopRequest(request, nullptr, NS_OK); 1.90 + } 1.91 + } 1.92 + return rv; 1.93 +} 1.94 + 1.95 +template<int N> 1.96 +static bool 1.97 +StringBeginsWithLowercaseLiteral(nsAString& aString, 1.98 + const char (&aSubstring)[N]) 1.99 +{ 1.100 + return StringHead(aString, N).LowerCaseEqualsLiteral(aSubstring); 1.101 +} 1.102 + 1.103 +bool 1.104 +HasAttachmentDisposition(nsIHttpChannel* httpChannel) 1.105 +{ 1.106 + if (!httpChannel) 1.107 + return false; 1.108 + 1.109 + uint32_t disp; 1.110 + nsresult rv = httpChannel->GetContentDisposition(&disp); 1.111 + 1.112 + if (NS_SUCCEEDED(rv) && disp == nsIChannel::DISPOSITION_ATTACHMENT) 1.113 + return true; 1.114 + 1.115 + return false; 1.116 +} 1.117 + 1.118 +/** 1.119 + * @return the first occurrence of a character within a string buffer, 1.120 + * or nullptr if not found 1.121 + */ 1.122 +static const char* 1.123 +FindChar(char c, const char *begin, const char *end) 1.124 +{ 1.125 + for (; begin < end; ++begin) { 1.126 + if (*begin == c) 1.127 + return begin; 1.128 + } 1.129 + return nullptr; 1.130 +} 1.131 + 1.132 +/** 1.133 + * 1.134 + * Determine if a substring is the "documentElement" in the document. 1.135 + * 1.136 + * All of our sniffed substrings: <rss, <feed, <rdf:RDF must be the "document" 1.137 + * element within the XML DOM, i.e. the root container element. Otherwise, 1.138 + * it's possible that someone embedded one of these tags inside a document of 1.139 + * another type, e.g. a HTML document, and we don't want to show the preview 1.140 + * page if the document isn't actually a feed. 1.141 + * 1.142 + * @param start 1.143 + * The beginning of the data being sniffed 1.144 + * @param end 1.145 + * The end of the data being sniffed, right before the substring that 1.146 + * was found. 1.147 + * @returns true if the found substring is the documentElement, false 1.148 + * otherwise. 1.149 + */ 1.150 +static bool 1.151 +IsDocumentElement(const char *start, const char* end) 1.152 +{ 1.153 + // For every tag in the buffer, check to see if it's a PI, Doctype or 1.154 + // comment, our desired substring or something invalid. 1.155 + while ( (start = FindChar('<', start, end)) ) { 1.156 + ++start; 1.157 + if (start >= end) 1.158 + return false; 1.159 + 1.160 + // Check to see if the character following the '<' is either '?' or '!' 1.161 + // (processing instruction or doctype or comment)... these are valid nodes 1.162 + // to have in the prologue. 1.163 + if (*start != '?' && *start != '!') 1.164 + return false; 1.165 + 1.166 + // Now advance the iterator until the '>' (We do this because we don't want 1.167 + // to sniff indicator substrings that are embedded within other nodes, e.g. 1.168 + // comments: <!-- <rdf:RDF .. > --> 1.169 + start = FindChar('>', start, end); 1.170 + if (!start) 1.171 + return false; 1.172 + 1.173 + ++start; 1.174 + } 1.175 + return true; 1.176 +} 1.177 + 1.178 +/** 1.179 + * Determines whether or not a string exists as the root element in an XML data 1.180 + * string buffer. 1.181 + * @param dataString 1.182 + * The data being sniffed 1.183 + * @param substring 1.184 + * The substring being tested for existence and root-ness. 1.185 + * @returns true if the substring exists and is the documentElement, false 1.186 + * otherwise. 1.187 + */ 1.188 +static bool 1.189 +ContainsTopLevelSubstring(nsACString& dataString, const char *substring) 1.190 +{ 1.191 + int32_t offset = dataString.Find(substring); 1.192 + if (offset == -1) 1.193 + return false; 1.194 + 1.195 + const char *begin = dataString.BeginReading(); 1.196 + 1.197 + // Only do the validation when we find the substring. 1.198 + return IsDocumentElement(begin, begin + offset); 1.199 +} 1.200 + 1.201 +NS_IMETHODIMP 1.202 +nsFeedSniffer::GetMIMETypeFromContent(nsIRequest* request, 1.203 + const uint8_t* data, 1.204 + uint32_t length, 1.205 + nsACString& sniffedType) 1.206 +{ 1.207 + nsCOMPtr<nsIHttpChannel> channel(do_QueryInterface(request)); 1.208 + if (!channel) 1.209 + return NS_ERROR_NO_INTERFACE; 1.210 + 1.211 + // Check that this is a GET request, since you can't subscribe to a POST... 1.212 + nsAutoCString method; 1.213 + channel->GetRequestMethod(method); 1.214 + if (!method.Equals("GET")) { 1.215 + sniffedType.Truncate(); 1.216 + return NS_OK; 1.217 + } 1.218 + 1.219 + // We need to find out if this is a load of a view-source document. In this 1.220 + // case we do not want to override the content type, since the source display 1.221 + // does not need to be converted from feed format to XUL. More importantly, 1.222 + // we don't want to change the content type from something 1.223 + // nsContentDLF::CreateInstance knows about (e.g. application/xml, text/html 1.224 + // etc) to something that only the application fe knows about (maybe.feed) 1.225 + // thus deactivating syntax highlighting. 1.226 + nsCOMPtr<nsIURI> originalURI; 1.227 + channel->GetOriginalURI(getter_AddRefs(originalURI)); 1.228 + 1.229 + nsAutoCString scheme; 1.230 + originalURI->GetScheme(scheme); 1.231 + if (scheme.EqualsLiteral("view-source")) { 1.232 + sniffedType.Truncate(); 1.233 + return NS_OK; 1.234 + } 1.235 + 1.236 + // Check the Content-Type to see if it is set correctly. If it is set to 1.237 + // something specific that we think is a reliable indication of a feed, don't 1.238 + // bother sniffing since we assume the site maintainer knows what they're 1.239 + // doing. 1.240 + nsAutoCString contentType; 1.241 + channel->GetContentType(contentType); 1.242 + bool noSniff = contentType.EqualsLiteral(TYPE_RSS) || 1.243 + contentType.EqualsLiteral(TYPE_ATOM); 1.244 + 1.245 + // Check to see if this was a feed request from the location bar or from 1.246 + // the feed: protocol. This is also a reliable indication. 1.247 + // The value of the header doesn't matter. 1.248 + if (!noSniff) { 1.249 + nsAutoCString sniffHeader; 1.250 + nsresult foundHeader = 1.251 + channel->GetRequestHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"), 1.252 + sniffHeader); 1.253 + noSniff = NS_SUCCEEDED(foundHeader); 1.254 + } 1.255 + 1.256 + if (noSniff) { 1.257 + // check for an attachment after we have a likely feed. 1.258 + if(HasAttachmentDisposition(channel)) { 1.259 + sniffedType.Truncate(); 1.260 + return NS_OK; 1.261 + } 1.262 + 1.263 + // set the feed header as a response header, since we have good metadata 1.264 + // telling us that the feed is supposed to be RSS or Atom 1.265 + channel->SetResponseHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"), 1.266 + NS_LITERAL_CSTRING("1"), false); 1.267 + sniffedType.AssignLiteral(TYPE_MAYBE_FEED); 1.268 + return NS_OK; 1.269 + } 1.270 + 1.271 + // Don't sniff arbitrary types. Limit sniffing to situations that 1.272 + // we think can reasonably arise. 1.273 + if (!contentType.EqualsLiteral(TEXT_HTML) && 1.274 + !contentType.EqualsLiteral(APPLICATION_OCTET_STREAM) && 1.275 + // Same criterion as XMLHttpRequest. Should we be checking for "+xml" 1.276 + // and check for text/xml and application/xml by hand instead? 1.277 + contentType.Find("xml") == -1) { 1.278 + sniffedType.Truncate(); 1.279 + return NS_OK; 1.280 + } 1.281 + 1.282 + // Now we need to potentially decompress data served with 1.283 + // Content-Encoding: gzip 1.284 + nsresult rv = ConvertEncodedData(request, data, length); 1.285 + if (NS_FAILED(rv)) 1.286 + return rv; 1.287 + 1.288 + // We cap the number of bytes to scan at MAX_BYTES to prevent picking up 1.289 + // false positives by accidentally reading document content, e.g. a "how to 1.290 + // make a feed" page. 1.291 + const char* testData; 1.292 + if (mDecodedData.IsEmpty()) { 1.293 + testData = (const char*)data; 1.294 + length = std::min(length, MAX_BYTES); 1.295 + } else { 1.296 + testData = mDecodedData.get(); 1.297 + length = std::min(mDecodedData.Length(), MAX_BYTES); 1.298 + } 1.299 + 1.300 + // The strategy here is based on that described in: 1.301 + // http://blogs.msdn.com/rssteam/articles/PublishersGuide.aspx 1.302 + // for interoperarbility purposes. 1.303 + 1.304 + // Thus begins the actual sniffing. 1.305 + nsDependentCSubstring dataString((const char*)testData, length); 1.306 + 1.307 + bool isFeed = false; 1.308 + 1.309 + // RSS 0.91/0.92/2.0 1.310 + isFeed = ContainsTopLevelSubstring(dataString, "<rss"); 1.311 + 1.312 + // Atom 1.0 1.313 + if (!isFeed) 1.314 + isFeed = ContainsTopLevelSubstring(dataString, "<feed"); 1.315 + 1.316 + // RSS 1.0 1.317 + if (!isFeed) { 1.318 + isFeed = ContainsTopLevelSubstring(dataString, "<rdf:RDF") && 1.319 + dataString.Find(NS_RDF) != -1 && 1.320 + dataString.Find(NS_RSS) != -1; 1.321 + } 1.322 + 1.323 + // If we sniffed a feed, coerce our internal type 1.324 + if (isFeed && !HasAttachmentDisposition(channel)) 1.325 + sniffedType.AssignLiteral(TYPE_MAYBE_FEED); 1.326 + else 1.327 + sniffedType.Truncate(); 1.328 + return NS_OK; 1.329 +} 1.330 + 1.331 +NS_IMETHODIMP 1.332 +nsFeedSniffer::OnStartRequest(nsIRequest* request, nsISupports* context) 1.333 +{ 1.334 + return NS_OK; 1.335 +} 1.336 + 1.337 +NS_METHOD 1.338 +nsFeedSniffer::AppendSegmentToString(nsIInputStream* inputStream, 1.339 + void* closure, 1.340 + const char* rawSegment, 1.341 + uint32_t toOffset, 1.342 + uint32_t count, 1.343 + uint32_t* writeCount) 1.344 +{ 1.345 + nsCString* decodedData = static_cast<nsCString*>(closure); 1.346 + decodedData->Append(rawSegment, count); 1.347 + *writeCount = count; 1.348 + return NS_OK; 1.349 +} 1.350 + 1.351 +NS_IMETHODIMP 1.352 +nsFeedSniffer::OnDataAvailable(nsIRequest* request, nsISupports* context, 1.353 + nsIInputStream* stream, uint64_t offset, 1.354 + uint32_t count) 1.355 +{ 1.356 + uint32_t read; 1.357 + return stream->ReadSegments(AppendSegmentToString, &mDecodedData, count, 1.358 + &read); 1.359 +} 1.360 + 1.361 +NS_IMETHODIMP 1.362 +nsFeedSniffer::OnStopRequest(nsIRequest* request, nsISupports* context, 1.363 + nsresult status) 1.364 +{ 1.365 + return NS_OK; 1.366 +}