browser/components/feeds/src/nsFeedSniffer.cpp

Wed, 31 Dec 2014 13:27:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 13:27:57 +0100
branch
TOR_BUG_3246
changeset 6
8bccb770b82d
permissions
-rw-r--r--

Ignore runtime configuration files generated during quality assurance.

michael@0 1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0 2 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 3 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 5
michael@0 6 #include "nsFeedSniffer.h"
michael@0 7
michael@0 8
michael@0 9 #include "nsNetCID.h"
michael@0 10 #include "nsXPCOM.h"
michael@0 11 #include "nsCOMPtr.h"
michael@0 12 #include "nsStringStream.h"
michael@0 13
michael@0 14 #include "nsBrowserCompsCID.h"
michael@0 15
michael@0 16 #include "nsICategoryManager.h"
michael@0 17 #include "nsIServiceManager.h"
michael@0 18 #include "nsComponentManagerUtils.h"
michael@0 19 #include "nsServiceManagerUtils.h"
michael@0 20
michael@0 21 #include "nsIStreamConverterService.h"
michael@0 22 #include "nsIStreamConverter.h"
michael@0 23
michael@0 24 #include "nsIStreamListener.h"
michael@0 25
michael@0 26 #include "nsIHttpChannel.h"
michael@0 27 #include "nsIMIMEHeaderParam.h"
michael@0 28
michael@0 29 #include "nsMimeTypes.h"
michael@0 30 #include "nsIURI.h"
michael@0 31 #include <algorithm>
michael@0 32
// MIME types: the two feed types a server may declare itself, and the
// internal type this sniffer reports when it decides a response is a feed.
#define TYPE_ATOM "application/atom+xml"
#define TYPE_RSS "application/rss+xml"
#define TYPE_MAYBE_FEED "application/vnd.mozilla.maybe.feed"

// Namespace URIs that must both be present for an <rdf:RDF> document to be
// treated as RSS 1.0.
#define NS_RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
#define NS_RSS "http://purl.org/rss/1.0/"

// Upper bound on the number of bytes examined while sniffing.
#define MAX_BYTES 512u
michael@0 41
michael@0 42 NS_IMPL_ISUPPORTS(nsFeedSniffer,
michael@0 43 nsIContentSniffer,
michael@0 44 nsIStreamListener,
michael@0 45 nsIRequestObserver)
michael@0 46
michael@0 47 nsresult
michael@0 48 nsFeedSniffer::ConvertEncodedData(nsIRequest* request,
michael@0 49 const uint8_t* data,
michael@0 50 uint32_t length)
michael@0 51 {
michael@0 52 nsresult rv = NS_OK;
michael@0 53
michael@0 54 mDecodedData = "";
michael@0 55 nsCOMPtr<nsIHttpChannel> httpChannel(do_QueryInterface(request));
michael@0 56 if (!httpChannel)
michael@0 57 return NS_ERROR_NO_INTERFACE;
michael@0 58
michael@0 59 nsAutoCString contentEncoding;
michael@0 60 httpChannel->GetResponseHeader(NS_LITERAL_CSTRING("Content-Encoding"),
michael@0 61 contentEncoding);
michael@0 62 if (!contentEncoding.IsEmpty()) {
michael@0 63 nsCOMPtr<nsIStreamConverterService> converterService(do_GetService(NS_STREAMCONVERTERSERVICE_CONTRACTID));
michael@0 64 if (converterService) {
michael@0 65 ToLowerCase(contentEncoding);
michael@0 66
michael@0 67 nsCOMPtr<nsIStreamListener> converter;
michael@0 68 rv = converterService->AsyncConvertData(contentEncoding.get(),
michael@0 69 "uncompressed", this, nullptr,
michael@0 70 getter_AddRefs(converter));
michael@0 71 NS_ENSURE_SUCCESS(rv, rv);
michael@0 72
michael@0 73 converter->OnStartRequest(request, nullptr);
michael@0 74
michael@0 75 nsCOMPtr<nsIStringInputStream> rawStream =
michael@0 76 do_CreateInstance(NS_STRINGINPUTSTREAM_CONTRACTID);
michael@0 77 if (!rawStream)
michael@0 78 return NS_ERROR_FAILURE;
michael@0 79
michael@0 80 rv = rawStream->SetData((const char*)data, length);
michael@0 81 NS_ENSURE_SUCCESS(rv, rv);
michael@0 82
michael@0 83 rv = converter->OnDataAvailable(request, nullptr, rawStream, 0, length);
michael@0 84 NS_ENSURE_SUCCESS(rv, rv);
michael@0 85
michael@0 86 converter->OnStopRequest(request, nullptr, NS_OK);
michael@0 87 }
michael@0 88 }
michael@0 89 return rv;
michael@0 90 }
michael@0 91
michael@0 92 template<int N>
michael@0 93 static bool
michael@0 94 StringBeginsWithLowercaseLiteral(nsAString& aString,
michael@0 95 const char (&aSubstring)[N])
michael@0 96 {
michael@0 97 return StringHead(aString, N).LowerCaseEqualsLiteral(aSubstring);
michael@0 98 }
michael@0 99
michael@0 100 bool
michael@0 101 HasAttachmentDisposition(nsIHttpChannel* httpChannel)
michael@0 102 {
michael@0 103 if (!httpChannel)
michael@0 104 return false;
michael@0 105
michael@0 106 uint32_t disp;
michael@0 107 nsresult rv = httpChannel->GetContentDisposition(&disp);
michael@0 108
michael@0 109 if (NS_SUCCEEDED(rv) && disp == nsIChannel::DISPOSITION_ATTACHMENT)
michael@0 110 return true;
michael@0 111
michael@0 112 return false;
michael@0 113 }
michael@0 114
/**
 * Searches the half-open character range [begin, end) for a character.
 *
 * @param c
 *        The character to look for.
 * @param begin
 *        The first character of the range.
 * @param end
 *        One past the last character of the range.
 * @return a pointer to the first occurrence of c within the range, or
 *         nullptr if c does not occur or the range is empty.
 */
static const char*
FindChar(char c, const char *begin, const char *end)
{
  // An empty (or inverted) range contains nothing to search.
  if (begin >= end)
    return nullptr;

  const char* hit = std::find(begin, end, c);
  return hit != end ? hit : nullptr;
}
michael@0 128
michael@0 129 /**
michael@0 130 *
michael@0 131 * Determine if a substring is the "documentElement" in the document.
michael@0 132 *
michael@0 133 * All of our sniffed substrings: <rss, <feed, <rdf:RDF must be the "document"
michael@0 134 * element within the XML DOM, i.e. the root container element. Otherwise,
michael@0 135 * it's possible that someone embedded one of these tags inside a document of
michael@0 136 * another type, e.g. a HTML document, and we don't want to show the preview
michael@0 137 * page if the document isn't actually a feed.
michael@0 138 *
michael@0 139 * @param start
michael@0 140 * The beginning of the data being sniffed
michael@0 141 * @param end
michael@0 142 * The end of the data being sniffed, right before the substring that
michael@0 143 * was found.
michael@0 144 * @returns true if the found substring is the documentElement, false
michael@0 145 * otherwise.
michael@0 146 */
michael@0 147 static bool
michael@0 148 IsDocumentElement(const char *start, const char* end)
michael@0 149 {
michael@0 150 // For every tag in the buffer, check to see if it's a PI, Doctype or
michael@0 151 // comment, our desired substring or something invalid.
michael@0 152 while ( (start = FindChar('<', start, end)) ) {
michael@0 153 ++start;
michael@0 154 if (start >= end)
michael@0 155 return false;
michael@0 156
michael@0 157 // Check to see if the character following the '<' is either '?' or '!'
michael@0 158 // (processing instruction or doctype or comment)... these are valid nodes
michael@0 159 // to have in the prologue.
michael@0 160 if (*start != '?' && *start != '!')
michael@0 161 return false;
michael@0 162
michael@0 163 // Now advance the iterator until the '>' (We do this because we don't want
michael@0 164 // to sniff indicator substrings that are embedded within other nodes, e.g.
michael@0 165 // comments: <!-- <rdf:RDF .. > -->
michael@0 166 start = FindChar('>', start, end);
michael@0 167 if (!start)
michael@0 168 return false;
michael@0 169
michael@0 170 ++start;
michael@0 171 }
michael@0 172 return true;
michael@0 173 }
michael@0 174
michael@0 175 /**
michael@0 176 * Determines whether or not a string exists as the root element in an XML data
michael@0 177 * string buffer.
michael@0 178 * @param dataString
michael@0 179 * The data being sniffed
michael@0 180 * @param substring
michael@0 181 * The substring being tested for existence and root-ness.
michael@0 182 * @returns true if the substring exists and is the documentElement, false
michael@0 183 * otherwise.
michael@0 184 */
michael@0 185 static bool
michael@0 186 ContainsTopLevelSubstring(nsACString& dataString, const char *substring)
michael@0 187 {
michael@0 188 int32_t offset = dataString.Find(substring);
michael@0 189 if (offset == -1)
michael@0 190 return false;
michael@0 191
michael@0 192 const char *begin = dataString.BeginReading();
michael@0 193
michael@0 194 // Only do the validation when we find the substring.
michael@0 195 return IsDocumentElement(begin, begin + offset);
michael@0 196 }
michael@0 197
michael@0 198 NS_IMETHODIMP
michael@0 199 nsFeedSniffer::GetMIMETypeFromContent(nsIRequest* request,
michael@0 200 const uint8_t* data,
michael@0 201 uint32_t length,
michael@0 202 nsACString& sniffedType)
michael@0 203 {
michael@0 204 nsCOMPtr<nsIHttpChannel> channel(do_QueryInterface(request));
michael@0 205 if (!channel)
michael@0 206 return NS_ERROR_NO_INTERFACE;
michael@0 207
michael@0 208 // Check that this is a GET request, since you can't subscribe to a POST...
michael@0 209 nsAutoCString method;
michael@0 210 channel->GetRequestMethod(method);
michael@0 211 if (!method.Equals("GET")) {
michael@0 212 sniffedType.Truncate();
michael@0 213 return NS_OK;
michael@0 214 }
michael@0 215
michael@0 216 // We need to find out if this is a load of a view-source document. In this
michael@0 217 // case we do not want to override the content type, since the source display
michael@0 218 // does not need to be converted from feed format to XUL. More importantly,
michael@0 219 // we don't want to change the content type from something
michael@0 220 // nsContentDLF::CreateInstance knows about (e.g. application/xml, text/html
michael@0 221 // etc) to something that only the application fe knows about (maybe.feed)
michael@0 222 // thus deactivating syntax highlighting.
michael@0 223 nsCOMPtr<nsIURI> originalURI;
michael@0 224 channel->GetOriginalURI(getter_AddRefs(originalURI));
michael@0 225
michael@0 226 nsAutoCString scheme;
michael@0 227 originalURI->GetScheme(scheme);
michael@0 228 if (scheme.EqualsLiteral("view-source")) {
michael@0 229 sniffedType.Truncate();
michael@0 230 return NS_OK;
michael@0 231 }
michael@0 232
michael@0 233 // Check the Content-Type to see if it is set correctly. If it is set to
michael@0 234 // something specific that we think is a reliable indication of a feed, don't
michael@0 235 // bother sniffing since we assume the site maintainer knows what they're
michael@0 236 // doing.
michael@0 237 nsAutoCString contentType;
michael@0 238 channel->GetContentType(contentType);
michael@0 239 bool noSniff = contentType.EqualsLiteral(TYPE_RSS) ||
michael@0 240 contentType.EqualsLiteral(TYPE_ATOM);
michael@0 241
michael@0 242 // Check to see if this was a feed request from the location bar or from
michael@0 243 // the feed: protocol. This is also a reliable indication.
michael@0 244 // The value of the header doesn't matter.
michael@0 245 if (!noSniff) {
michael@0 246 nsAutoCString sniffHeader;
michael@0 247 nsresult foundHeader =
michael@0 248 channel->GetRequestHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
michael@0 249 sniffHeader);
michael@0 250 noSniff = NS_SUCCEEDED(foundHeader);
michael@0 251 }
michael@0 252
michael@0 253 if (noSniff) {
michael@0 254 // check for an attachment after we have a likely feed.
michael@0 255 if(HasAttachmentDisposition(channel)) {
michael@0 256 sniffedType.Truncate();
michael@0 257 return NS_OK;
michael@0 258 }
michael@0 259
michael@0 260 // set the feed header as a response header, since we have good metadata
michael@0 261 // telling us that the feed is supposed to be RSS or Atom
michael@0 262 channel->SetResponseHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"),
michael@0 263 NS_LITERAL_CSTRING("1"), false);
michael@0 264 sniffedType.AssignLiteral(TYPE_MAYBE_FEED);
michael@0 265 return NS_OK;
michael@0 266 }
michael@0 267
michael@0 268 // Don't sniff arbitrary types. Limit sniffing to situations that
michael@0 269 // we think can reasonably arise.
michael@0 270 if (!contentType.EqualsLiteral(TEXT_HTML) &&
michael@0 271 !contentType.EqualsLiteral(APPLICATION_OCTET_STREAM) &&
michael@0 272 // Same criterion as XMLHttpRequest. Should we be checking for "+xml"
michael@0 273 // and check for text/xml and application/xml by hand instead?
michael@0 274 contentType.Find("xml") == -1) {
michael@0 275 sniffedType.Truncate();
michael@0 276 return NS_OK;
michael@0 277 }
michael@0 278
michael@0 279 // Now we need to potentially decompress data served with
michael@0 280 // Content-Encoding: gzip
michael@0 281 nsresult rv = ConvertEncodedData(request, data, length);
michael@0 282 if (NS_FAILED(rv))
michael@0 283 return rv;
michael@0 284
michael@0 285 // We cap the number of bytes to scan at MAX_BYTES to prevent picking up
michael@0 286 // false positives by accidentally reading document content, e.g. a "how to
michael@0 287 // make a feed" page.
michael@0 288 const char* testData;
michael@0 289 if (mDecodedData.IsEmpty()) {
michael@0 290 testData = (const char*)data;
michael@0 291 length = std::min(length, MAX_BYTES);
michael@0 292 } else {
michael@0 293 testData = mDecodedData.get();
michael@0 294 length = std::min(mDecodedData.Length(), MAX_BYTES);
michael@0 295 }
michael@0 296
michael@0 297 // The strategy here is based on that described in:
michael@0 298 // http://blogs.msdn.com/rssteam/articles/PublishersGuide.aspx
michael@0 299 // for interoperarbility purposes.
michael@0 300
michael@0 301 // Thus begins the actual sniffing.
michael@0 302 nsDependentCSubstring dataString((const char*)testData, length);
michael@0 303
michael@0 304 bool isFeed = false;
michael@0 305
michael@0 306 // RSS 0.91/0.92/2.0
michael@0 307 isFeed = ContainsTopLevelSubstring(dataString, "<rss");
michael@0 308
michael@0 309 // Atom 1.0
michael@0 310 if (!isFeed)
michael@0 311 isFeed = ContainsTopLevelSubstring(dataString, "<feed");
michael@0 312
michael@0 313 // RSS 1.0
michael@0 314 if (!isFeed) {
michael@0 315 isFeed = ContainsTopLevelSubstring(dataString, "<rdf:RDF") &&
michael@0 316 dataString.Find(NS_RDF) != -1 &&
michael@0 317 dataString.Find(NS_RSS) != -1;
michael@0 318 }
michael@0 319
michael@0 320 // If we sniffed a feed, coerce our internal type
michael@0 321 if (isFeed && !HasAttachmentDisposition(channel))
michael@0 322 sniffedType.AssignLiteral(TYPE_MAYBE_FEED);
michael@0 323 else
michael@0 324 sniffedType.Truncate();
michael@0 325 return NS_OK;
michael@0 326 }
michael@0 327
michael@0 328 NS_IMETHODIMP
michael@0 329 nsFeedSniffer::OnStartRequest(nsIRequest* request, nsISupports* context)
michael@0 330 {
michael@0 331 return NS_OK;
michael@0 332 }
michael@0 333
michael@0 334 NS_METHOD
michael@0 335 nsFeedSniffer::AppendSegmentToString(nsIInputStream* inputStream,
michael@0 336 void* closure,
michael@0 337 const char* rawSegment,
michael@0 338 uint32_t toOffset,
michael@0 339 uint32_t count,
michael@0 340 uint32_t* writeCount)
michael@0 341 {
michael@0 342 nsCString* decodedData = static_cast<nsCString*>(closure);
michael@0 343 decodedData->Append(rawSegment, count);
michael@0 344 *writeCount = count;
michael@0 345 return NS_OK;
michael@0 346 }
michael@0 347
michael@0 348 NS_IMETHODIMP
michael@0 349 nsFeedSniffer::OnDataAvailable(nsIRequest* request, nsISupports* context,
michael@0 350 nsIInputStream* stream, uint64_t offset,
michael@0 351 uint32_t count)
michael@0 352 {
michael@0 353 uint32_t read;
michael@0 354 return stream->ReadSegments(AppendSegmentToString, &mDecodedData, count,
michael@0 355 &read);
michael@0 356 }
michael@0 357
michael@0 358 NS_IMETHODIMP
michael@0 359 nsFeedSniffer::OnStopRequest(nsIRequest* request, nsISupports* context,
michael@0 360 nsresult status)
michael@0 361 {
michael@0 362 return NS_OK;
michael@0 363 }

mercurial