Wed, 31 Dec 2014 13:27:57 +0100
Ignore runtime configuration files generated during quality assurance.
michael@0 | 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
michael@0 | 2 | /* This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 5 | |
michael@0 | 6 | #include "nsFeedSniffer.h" |
michael@0 | 7 | |
michael@0 | 8 | |
michael@0 | 9 | #include "nsNetCID.h" |
michael@0 | 10 | #include "nsXPCOM.h" |
michael@0 | 11 | #include "nsCOMPtr.h" |
michael@0 | 12 | #include "nsStringStream.h" |
michael@0 | 13 | |
michael@0 | 14 | #include "nsBrowserCompsCID.h" |
michael@0 | 15 | |
michael@0 | 16 | #include "nsICategoryManager.h" |
michael@0 | 17 | #include "nsIServiceManager.h" |
michael@0 | 18 | #include "nsComponentManagerUtils.h" |
michael@0 | 19 | #include "nsServiceManagerUtils.h" |
michael@0 | 20 | |
michael@0 | 21 | #include "nsIStreamConverterService.h" |
michael@0 | 22 | #include "nsIStreamConverter.h" |
michael@0 | 23 | |
michael@0 | 24 | #include "nsIStreamListener.h" |
michael@0 | 25 | |
michael@0 | 26 | #include "nsIHttpChannel.h" |
michael@0 | 27 | #include "nsIMIMEHeaderParam.h" |
michael@0 | 28 | |
michael@0 | 29 | #include "nsMimeTypes.h" |
michael@0 | 30 | #include "nsIURI.h" |
michael@0 | 31 | #include <algorithm> |
michael@0 | 32 | |
// Content types that are accepted as a feed without inspecting the body, and
// the internal content type reported once a response is judged to be a feed.
#define TYPE_ATOM       "application/atom+xml"
#define TYPE_RSS        "application/rss+xml"
#define TYPE_MAYBE_FEED "application/vnd.mozilla.maybe.feed"

// Namespace URIs searched for in the sniffed data when testing for RSS 1.0.
#define NS_RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
#define NS_RSS "http://purl.org/rss/1.0/"

// Upper bound on the number of bytes examined while sniffing.
#define MAX_BYTES 512u
michael@0 | 41 | |
michael@0 | 42 | NS_IMPL_ISUPPORTS(nsFeedSniffer, |
michael@0 | 43 | nsIContentSniffer, |
michael@0 | 44 | nsIStreamListener, |
michael@0 | 45 | nsIRequestObserver) |
michael@0 | 46 | |
michael@0 | 47 | nsresult |
michael@0 | 48 | nsFeedSniffer::ConvertEncodedData(nsIRequest* request, |
michael@0 | 49 | const uint8_t* data, |
michael@0 | 50 | uint32_t length) |
michael@0 | 51 | { |
michael@0 | 52 | nsresult rv = NS_OK; |
michael@0 | 53 | |
michael@0 | 54 | mDecodedData = ""; |
michael@0 | 55 | nsCOMPtr<nsIHttpChannel> httpChannel(do_QueryInterface(request)); |
michael@0 | 56 | if (!httpChannel) |
michael@0 | 57 | return NS_ERROR_NO_INTERFACE; |
michael@0 | 58 | |
michael@0 | 59 | nsAutoCString contentEncoding; |
michael@0 | 60 | httpChannel->GetResponseHeader(NS_LITERAL_CSTRING("Content-Encoding"), |
michael@0 | 61 | contentEncoding); |
michael@0 | 62 | if (!contentEncoding.IsEmpty()) { |
michael@0 | 63 | nsCOMPtr<nsIStreamConverterService> converterService(do_GetService(NS_STREAMCONVERTERSERVICE_CONTRACTID)); |
michael@0 | 64 | if (converterService) { |
michael@0 | 65 | ToLowerCase(contentEncoding); |
michael@0 | 66 | |
michael@0 | 67 | nsCOMPtr<nsIStreamListener> converter; |
michael@0 | 68 | rv = converterService->AsyncConvertData(contentEncoding.get(), |
michael@0 | 69 | "uncompressed", this, nullptr, |
michael@0 | 70 | getter_AddRefs(converter)); |
michael@0 | 71 | NS_ENSURE_SUCCESS(rv, rv); |
michael@0 | 72 | |
michael@0 | 73 | converter->OnStartRequest(request, nullptr); |
michael@0 | 74 | |
michael@0 | 75 | nsCOMPtr<nsIStringInputStream> rawStream = |
michael@0 | 76 | do_CreateInstance(NS_STRINGINPUTSTREAM_CONTRACTID); |
michael@0 | 77 | if (!rawStream) |
michael@0 | 78 | return NS_ERROR_FAILURE; |
michael@0 | 79 | |
michael@0 | 80 | rv = rawStream->SetData((const char*)data, length); |
michael@0 | 81 | NS_ENSURE_SUCCESS(rv, rv); |
michael@0 | 82 | |
michael@0 | 83 | rv = converter->OnDataAvailable(request, nullptr, rawStream, 0, length); |
michael@0 | 84 | NS_ENSURE_SUCCESS(rv, rv); |
michael@0 | 85 | |
michael@0 | 86 | converter->OnStopRequest(request, nullptr, NS_OK); |
michael@0 | 87 | } |
michael@0 | 88 | } |
michael@0 | 89 | return rv; |
michael@0 | 90 | } |
michael@0 | 91 | |
michael@0 | 92 | template<int N> |
michael@0 | 93 | static bool |
michael@0 | 94 | StringBeginsWithLowercaseLiteral(nsAString& aString, |
michael@0 | 95 | const char (&aSubstring)[N]) |
michael@0 | 96 | { |
michael@0 | 97 | return StringHead(aString, N).LowerCaseEqualsLiteral(aSubstring); |
michael@0 | 98 | } |
michael@0 | 99 | |
michael@0 | 100 | bool |
michael@0 | 101 | HasAttachmentDisposition(nsIHttpChannel* httpChannel) |
michael@0 | 102 | { |
michael@0 | 103 | if (!httpChannel) |
michael@0 | 104 | return false; |
michael@0 | 105 | |
michael@0 | 106 | uint32_t disp; |
michael@0 | 107 | nsresult rv = httpChannel->GetContentDisposition(&disp); |
michael@0 | 108 | |
michael@0 | 109 | if (NS_SUCCEEDED(rv) && disp == nsIChannel::DISPOSITION_ATTACHMENT) |
michael@0 | 110 | return true; |
michael@0 | 111 | |
michael@0 | 112 | return false; |
michael@0 | 113 | } |
michael@0 | 114 | |
/**
 * Searches the half-open range [begin, end) for a character.
 *
 * @param c
 *        The character to look for.
 * @param begin
 *        First character of the buffer.
 * @param end
 *        One past the last character of the buffer.
 * @returns a pointer to the first occurrence of |c|, or nullptr if the range
 *          is empty or does not contain |c|.
 */
static const char*
FindChar(char c, const char *begin, const char *end)
{
  // Treat an empty (or inverted) range as "not found" without touching it.
  if (!(begin < end))
    return nullptr;

  const char* hit = std::find(begin, end, c);
  return hit == end ? nullptr : hit;
}
michael@0 | 128 | |
michael@0 | 129 | /** |
michael@0 | 130 | * |
michael@0 | 131 | * Determine if a substring is the "documentElement" in the document. |
michael@0 | 132 | * |
michael@0 | 133 | * All of our sniffed substrings: <rss, <feed, <rdf:RDF must be the "document" |
michael@0 | 134 | * element within the XML DOM, i.e. the root container element. Otherwise, |
michael@0 | 135 | * it's possible that someone embedded one of these tags inside a document of |
michael@0 | 136 | * another type, e.g. a HTML document, and we don't want to show the preview |
michael@0 | 137 | * page if the document isn't actually a feed. |
michael@0 | 138 | * |
michael@0 | 139 | * @param start |
michael@0 | 140 | * The beginning of the data being sniffed |
michael@0 | 141 | * @param end |
michael@0 | 142 | * The end of the data being sniffed, right before the substring that |
michael@0 | 143 | * was found. |
michael@0 | 144 | * @returns true if the found substring is the documentElement, false |
michael@0 | 145 | * otherwise. |
michael@0 | 146 | */ |
michael@0 | 147 | static bool |
michael@0 | 148 | IsDocumentElement(const char *start, const char* end) |
michael@0 | 149 | { |
michael@0 | 150 | // For every tag in the buffer, check to see if it's a PI, Doctype or |
michael@0 | 151 | // comment, our desired substring or something invalid. |
michael@0 | 152 | while ( (start = FindChar('<', start, end)) ) { |
michael@0 | 153 | ++start; |
michael@0 | 154 | if (start >= end) |
michael@0 | 155 | return false; |
michael@0 | 156 | |
michael@0 | 157 | // Check to see if the character following the '<' is either '?' or '!' |
michael@0 | 158 | // (processing instruction or doctype or comment)... these are valid nodes |
michael@0 | 159 | // to have in the prologue. |
michael@0 | 160 | if (*start != '?' && *start != '!') |
michael@0 | 161 | return false; |
michael@0 | 162 | |
michael@0 | 163 | // Now advance the iterator until the '>' (We do this because we don't want |
michael@0 | 164 | // to sniff indicator substrings that are embedded within other nodes, e.g. |
michael@0 | 165 | // comments: <!-- <rdf:RDF .. > --> |
michael@0 | 166 | start = FindChar('>', start, end); |
michael@0 | 167 | if (!start) |
michael@0 | 168 | return false; |
michael@0 | 169 | |
michael@0 | 170 | ++start; |
michael@0 | 171 | } |
michael@0 | 172 | return true; |
michael@0 | 173 | } |
michael@0 | 174 | |
michael@0 | 175 | /** |
michael@0 | 176 | * Determines whether or not a string exists as the root element in an XML data |
michael@0 | 177 | * string buffer. |
michael@0 | 178 | * @param dataString |
michael@0 | 179 | * The data being sniffed |
michael@0 | 180 | * @param substring |
michael@0 | 181 | * The substring being tested for existence and root-ness. |
michael@0 | 182 | * @returns true if the substring exists and is the documentElement, false |
michael@0 | 183 | * otherwise. |
michael@0 | 184 | */ |
michael@0 | 185 | static bool |
michael@0 | 186 | ContainsTopLevelSubstring(nsACString& dataString, const char *substring) |
michael@0 | 187 | { |
michael@0 | 188 | int32_t offset = dataString.Find(substring); |
michael@0 | 189 | if (offset == -1) |
michael@0 | 190 | return false; |
michael@0 | 191 | |
michael@0 | 192 | const char *begin = dataString.BeginReading(); |
michael@0 | 193 | |
michael@0 | 194 | // Only do the validation when we find the substring. |
michael@0 | 195 | return IsDocumentElement(begin, begin + offset); |
michael@0 | 196 | } |
michael@0 | 197 | |
michael@0 | 198 | NS_IMETHODIMP |
michael@0 | 199 | nsFeedSniffer::GetMIMETypeFromContent(nsIRequest* request, |
michael@0 | 200 | const uint8_t* data, |
michael@0 | 201 | uint32_t length, |
michael@0 | 202 | nsACString& sniffedType) |
michael@0 | 203 | { |
michael@0 | 204 | nsCOMPtr<nsIHttpChannel> channel(do_QueryInterface(request)); |
michael@0 | 205 | if (!channel) |
michael@0 | 206 | return NS_ERROR_NO_INTERFACE; |
michael@0 | 207 | |
michael@0 | 208 | // Check that this is a GET request, since you can't subscribe to a POST... |
michael@0 | 209 | nsAutoCString method; |
michael@0 | 210 | channel->GetRequestMethod(method); |
michael@0 | 211 | if (!method.Equals("GET")) { |
michael@0 | 212 | sniffedType.Truncate(); |
michael@0 | 213 | return NS_OK; |
michael@0 | 214 | } |
michael@0 | 215 | |
michael@0 | 216 | // We need to find out if this is a load of a view-source document. In this |
michael@0 | 217 | // case we do not want to override the content type, since the source display |
michael@0 | 218 | // does not need to be converted from feed format to XUL. More importantly, |
michael@0 | 219 | // we don't want to change the content type from something |
michael@0 | 220 | // nsContentDLF::CreateInstance knows about (e.g. application/xml, text/html |
michael@0 | 221 | // etc) to something that only the application fe knows about (maybe.feed) |
michael@0 | 222 | // thus deactivating syntax highlighting. |
michael@0 | 223 | nsCOMPtr<nsIURI> originalURI; |
michael@0 | 224 | channel->GetOriginalURI(getter_AddRefs(originalURI)); |
michael@0 | 225 | |
michael@0 | 226 | nsAutoCString scheme; |
michael@0 | 227 | originalURI->GetScheme(scheme); |
michael@0 | 228 | if (scheme.EqualsLiteral("view-source")) { |
michael@0 | 229 | sniffedType.Truncate(); |
michael@0 | 230 | return NS_OK; |
michael@0 | 231 | } |
michael@0 | 232 | |
michael@0 | 233 | // Check the Content-Type to see if it is set correctly. If it is set to |
michael@0 | 234 | // something specific that we think is a reliable indication of a feed, don't |
michael@0 | 235 | // bother sniffing since we assume the site maintainer knows what they're |
michael@0 | 236 | // doing. |
michael@0 | 237 | nsAutoCString contentType; |
michael@0 | 238 | channel->GetContentType(contentType); |
michael@0 | 239 | bool noSniff = contentType.EqualsLiteral(TYPE_RSS) || |
michael@0 | 240 | contentType.EqualsLiteral(TYPE_ATOM); |
michael@0 | 241 | |
michael@0 | 242 | // Check to see if this was a feed request from the location bar or from |
michael@0 | 243 | // the feed: protocol. This is also a reliable indication. |
michael@0 | 244 | // The value of the header doesn't matter. |
michael@0 | 245 | if (!noSniff) { |
michael@0 | 246 | nsAutoCString sniffHeader; |
michael@0 | 247 | nsresult foundHeader = |
michael@0 | 248 | channel->GetRequestHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"), |
michael@0 | 249 | sniffHeader); |
michael@0 | 250 | noSniff = NS_SUCCEEDED(foundHeader); |
michael@0 | 251 | } |
michael@0 | 252 | |
michael@0 | 253 | if (noSniff) { |
michael@0 | 254 | // check for an attachment after we have a likely feed. |
michael@0 | 255 | if(HasAttachmentDisposition(channel)) { |
michael@0 | 256 | sniffedType.Truncate(); |
michael@0 | 257 | return NS_OK; |
michael@0 | 258 | } |
michael@0 | 259 | |
michael@0 | 260 | // set the feed header as a response header, since we have good metadata |
michael@0 | 261 | // telling us that the feed is supposed to be RSS or Atom |
michael@0 | 262 | channel->SetResponseHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"), |
michael@0 | 263 | NS_LITERAL_CSTRING("1"), false); |
michael@0 | 264 | sniffedType.AssignLiteral(TYPE_MAYBE_FEED); |
michael@0 | 265 | return NS_OK; |
michael@0 | 266 | } |
michael@0 | 267 | |
michael@0 | 268 | // Don't sniff arbitrary types. Limit sniffing to situations that |
michael@0 | 269 | // we think can reasonably arise. |
michael@0 | 270 | if (!contentType.EqualsLiteral(TEXT_HTML) && |
michael@0 | 271 | !contentType.EqualsLiteral(APPLICATION_OCTET_STREAM) && |
michael@0 | 272 | // Same criterion as XMLHttpRequest. Should we be checking for "+xml" |
michael@0 | 273 | // and check for text/xml and application/xml by hand instead? |
michael@0 | 274 | contentType.Find("xml") == -1) { |
michael@0 | 275 | sniffedType.Truncate(); |
michael@0 | 276 | return NS_OK; |
michael@0 | 277 | } |
michael@0 | 278 | |
michael@0 | 279 | // Now we need to potentially decompress data served with |
michael@0 | 280 | // Content-Encoding: gzip |
michael@0 | 281 | nsresult rv = ConvertEncodedData(request, data, length); |
michael@0 | 282 | if (NS_FAILED(rv)) |
michael@0 | 283 | return rv; |
michael@0 | 284 | |
michael@0 | 285 | // We cap the number of bytes to scan at MAX_BYTES to prevent picking up |
michael@0 | 286 | // false positives by accidentally reading document content, e.g. a "how to |
michael@0 | 287 | // make a feed" page. |
michael@0 | 288 | const char* testData; |
michael@0 | 289 | if (mDecodedData.IsEmpty()) { |
michael@0 | 290 | testData = (const char*)data; |
michael@0 | 291 | length = std::min(length, MAX_BYTES); |
michael@0 | 292 | } else { |
michael@0 | 293 | testData = mDecodedData.get(); |
michael@0 | 294 | length = std::min(mDecodedData.Length(), MAX_BYTES); |
michael@0 | 295 | } |
michael@0 | 296 | |
michael@0 | 297 | // The strategy here is based on that described in: |
michael@0 | 298 | // http://blogs.msdn.com/rssteam/articles/PublishersGuide.aspx |
michael@0 | 299 | // for interoperarbility purposes. |
michael@0 | 300 | |
michael@0 | 301 | // Thus begins the actual sniffing. |
michael@0 | 302 | nsDependentCSubstring dataString((const char*)testData, length); |
michael@0 | 303 | |
michael@0 | 304 | bool isFeed = false; |
michael@0 | 305 | |
michael@0 | 306 | // RSS 0.91/0.92/2.0 |
michael@0 | 307 | isFeed = ContainsTopLevelSubstring(dataString, "<rss"); |
michael@0 | 308 | |
michael@0 | 309 | // Atom 1.0 |
michael@0 | 310 | if (!isFeed) |
michael@0 | 311 | isFeed = ContainsTopLevelSubstring(dataString, "<feed"); |
michael@0 | 312 | |
michael@0 | 313 | // RSS 1.0 |
michael@0 | 314 | if (!isFeed) { |
michael@0 | 315 | isFeed = ContainsTopLevelSubstring(dataString, "<rdf:RDF") && |
michael@0 | 316 | dataString.Find(NS_RDF) != -1 && |
michael@0 | 317 | dataString.Find(NS_RSS) != -1; |
michael@0 | 318 | } |
michael@0 | 319 | |
michael@0 | 320 | // If we sniffed a feed, coerce our internal type |
michael@0 | 321 | if (isFeed && !HasAttachmentDisposition(channel)) |
michael@0 | 322 | sniffedType.AssignLiteral(TYPE_MAYBE_FEED); |
michael@0 | 323 | else |
michael@0 | 324 | sniffedType.Truncate(); |
michael@0 | 325 | return NS_OK; |
michael@0 | 326 | } |
michael@0 | 327 | |
michael@0 | 328 | NS_IMETHODIMP |
michael@0 | 329 | nsFeedSniffer::OnStartRequest(nsIRequest* request, nsISupports* context) |
michael@0 | 330 | { |
michael@0 | 331 | return NS_OK; |
michael@0 | 332 | } |
michael@0 | 333 | |
michael@0 | 334 | NS_METHOD |
michael@0 | 335 | nsFeedSniffer::AppendSegmentToString(nsIInputStream* inputStream, |
michael@0 | 336 | void* closure, |
michael@0 | 337 | const char* rawSegment, |
michael@0 | 338 | uint32_t toOffset, |
michael@0 | 339 | uint32_t count, |
michael@0 | 340 | uint32_t* writeCount) |
michael@0 | 341 | { |
michael@0 | 342 | nsCString* decodedData = static_cast<nsCString*>(closure); |
michael@0 | 343 | decodedData->Append(rawSegment, count); |
michael@0 | 344 | *writeCount = count; |
michael@0 | 345 | return NS_OK; |
michael@0 | 346 | } |
michael@0 | 347 | |
michael@0 | 348 | NS_IMETHODIMP |
michael@0 | 349 | nsFeedSniffer::OnDataAvailable(nsIRequest* request, nsISupports* context, |
michael@0 | 350 | nsIInputStream* stream, uint64_t offset, |
michael@0 | 351 | uint32_t count) |
michael@0 | 352 | { |
michael@0 | 353 | uint32_t read; |
michael@0 | 354 | return stream->ReadSegments(AppendSegmentToString, &mDecodedData, count, |
michael@0 | 355 | &read); |
michael@0 | 356 | } |
michael@0 | 357 | |
michael@0 | 358 | NS_IMETHODIMP |
michael@0 | 359 | nsFeedSniffer::OnStopRequest(nsIRequest* request, nsISupports* context, |
michael@0 | 360 | nsresult status) |
michael@0 | 361 | { |
michael@0 | 362 | return NS_OK; |
michael@0 | 363 | } |