michael@0: /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #include "nsFeedSniffer.h" michael@0: michael@0: michael@0: #include "nsNetCID.h" michael@0: #include "nsXPCOM.h" michael@0: #include "nsCOMPtr.h" michael@0: #include "nsStringStream.h" michael@0: michael@0: #include "nsBrowserCompsCID.h" michael@0: michael@0: #include "nsICategoryManager.h" michael@0: #include "nsIServiceManager.h" michael@0: #include "nsComponentManagerUtils.h" michael@0: #include "nsServiceManagerUtils.h" michael@0: michael@0: #include "nsIStreamConverterService.h" michael@0: #include "nsIStreamConverter.h" michael@0: michael@0: #include "nsIStreamListener.h" michael@0: michael@0: #include "nsIHttpChannel.h" michael@0: #include "nsIMIMEHeaderParam.h" michael@0: michael@0: #include "nsMimeTypes.h" michael@0: #include "nsIURI.h" michael@0: #include michael@0: michael@0: #define TYPE_ATOM "application/atom+xml" michael@0: #define TYPE_RSS "application/rss+xml" michael@0: #define TYPE_MAYBE_FEED "application/vnd.mozilla.maybe.feed" michael@0: michael@0: #define NS_RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#" michael@0: #define NS_RSS "http://purl.org/rss/1.0/" michael@0: michael@0: #define MAX_BYTES 512u michael@0: michael@0: NS_IMPL_ISUPPORTS(nsFeedSniffer, michael@0: nsIContentSniffer, michael@0: nsIStreamListener, michael@0: nsIRequestObserver) michael@0: michael@0: nsresult michael@0: nsFeedSniffer::ConvertEncodedData(nsIRequest* request, michael@0: const uint8_t* data, michael@0: uint32_t length) michael@0: { michael@0: nsresult rv = NS_OK; michael@0: michael@0: mDecodedData = ""; michael@0: nsCOMPtr httpChannel(do_QueryInterface(request)); michael@0: if (!httpChannel) michael@0: return NS_ERROR_NO_INTERFACE; michael@0: michael@0: nsAutoCString contentEncoding; michael@0: httpChannel->GetResponseHeader(NS_LITERAL_CSTRING("Content-Encoding"), michael@0: contentEncoding); michael@0: if (!contentEncoding.IsEmpty()) { michael@0: nsCOMPtr converterService(do_GetService(NS_STREAMCONVERTERSERVICE_CONTRACTID)); michael@0: if (converterService) { michael@0: ToLowerCase(contentEncoding); michael@0: michael@0: nsCOMPtr converter; michael@0: rv = converterService->AsyncConvertData(contentEncoding.get(), michael@0: "uncompressed", this, nullptr, michael@0: getter_AddRefs(converter)); michael@0: NS_ENSURE_SUCCESS(rv, rv); michael@0: michael@0: converter->OnStartRequest(request, nullptr); michael@0: michael@0: nsCOMPtr rawStream = michael@0: do_CreateInstance(NS_STRINGINPUTSTREAM_CONTRACTID); michael@0: if (!rawStream) michael@0: return NS_ERROR_FAILURE; michael@0: michael@0: rv = rawStream->SetData((const char*)data, length); michael@0: NS_ENSURE_SUCCESS(rv, rv); michael@0: michael@0: rv = converter->OnDataAvailable(request, nullptr, rawStream, 0, length); michael@0: NS_ENSURE_SUCCESS(rv, rv); michael@0: michael@0: converter->OnStopRequest(request, nullptr, NS_OK); michael@0: } michael@0: } michael@0: return rv; michael@0: } michael@0: michael@0: template michael@0: static bool michael@0: StringBeginsWithLowercaseLiteral(nsAString& aString, michael@0: const char (&aSubstring)[N]) michael@0: { michael@0: return StringHead(aString, N).LowerCaseEqualsLiteral(aSubstring); michael@0: } michael@0: michael@0: bool michael@0: HasAttachmentDisposition(nsIHttpChannel* httpChannel) michael@0: { michael@0: if (!httpChannel) michael@0: return false; michael@0: michael@0: uint32_t disp; michael@0: nsresult rv = httpChannel->GetContentDisposition(&disp); michael@0: michael@0: if (NS_SUCCEEDED(rv) && disp == nsIChannel::DISPOSITION_ATTACHMENT) michael@0: return true; michael@0: michael@0: return false; michael@0: } michael@0: michael@0: /** michael@0: * @return the first occurrence of a character within a string buffer, michael@0: * or nullptr if not found michael@0: */ michael@0: static const char* michael@0: FindChar(char c, const char *begin, const char *end) michael@0: { michael@0: for (; begin < end; ++begin) { michael@0: if (*begin == c) michael@0: return begin; michael@0: } michael@0: return nullptr; michael@0: } michael@0: michael@0: /** michael@0: * michael@0: * Determine if a substring is the "documentElement" in the document. michael@0: * michael@0: * All of our sniffed substrings: = end) michael@0: return false; michael@0: michael@0: // Check to see if the character following the '<' is either '?' or '!' michael@0: // (processing instruction or doctype or comment)... these are valid nodes michael@0: // to have in the prologue. michael@0: if (*start != '?' && *start != '!') michael@0: return false; michael@0: michael@0: // Now advance the iterator until the '>' (We do this because we don't want michael@0: // to sniff indicator substrings that are embedded within other nodes, e.g. michael@0: // comments: michael@0: start = FindChar('>', start, end); michael@0: if (!start) michael@0: return false; michael@0: michael@0: ++start; michael@0: } michael@0: return true; michael@0: } michael@0: michael@0: /** michael@0: * Determines whether or not a string exists as the root element in an XML data michael@0: * string buffer. michael@0: * @param dataString michael@0: * The data being sniffed michael@0: * @param substring michael@0: * The substring being tested for existence and root-ness. michael@0: * @returns true if the substring exists and is the documentElement, false michael@0: * otherwise. michael@0: */ michael@0: static bool michael@0: ContainsTopLevelSubstring(nsACString& dataString, const char *substring) michael@0: { michael@0: int32_t offset = dataString.Find(substring); michael@0: if (offset == -1) michael@0: return false; michael@0: michael@0: const char *begin = dataString.BeginReading(); michael@0: michael@0: // Only do the validation when we find the substring. michael@0: return IsDocumentElement(begin, begin + offset); michael@0: } michael@0: michael@0: NS_IMETHODIMP michael@0: nsFeedSniffer::GetMIMETypeFromContent(nsIRequest* request, michael@0: const uint8_t* data, michael@0: uint32_t length, michael@0: nsACString& sniffedType) michael@0: { michael@0: nsCOMPtr channel(do_QueryInterface(request)); michael@0: if (!channel) michael@0: return NS_ERROR_NO_INTERFACE; michael@0: michael@0: // Check that this is a GET request, since you can't subscribe to a POST... michael@0: nsAutoCString method; michael@0: channel->GetRequestMethod(method); michael@0: if (!method.Equals("GET")) { michael@0: sniffedType.Truncate(); michael@0: return NS_OK; michael@0: } michael@0: michael@0: // We need to find out if this is a load of a view-source document. In this michael@0: // case we do not want to override the content type, since the source display michael@0: // does not need to be converted from feed format to XUL. More importantly, michael@0: // we don't want to change the content type from something michael@0: // nsContentDLF::CreateInstance knows about (e.g. application/xml, text/html michael@0: // etc) to something that only the application fe knows about (maybe.feed) michael@0: // thus deactivating syntax highlighting. michael@0: nsCOMPtr originalURI; michael@0: channel->GetOriginalURI(getter_AddRefs(originalURI)); michael@0: michael@0: nsAutoCString scheme; michael@0: originalURI->GetScheme(scheme); michael@0: if (scheme.EqualsLiteral("view-source")) { michael@0: sniffedType.Truncate(); michael@0: return NS_OK; michael@0: } michael@0: michael@0: // Check the Content-Type to see if it is set correctly. If it is set to michael@0: // something specific that we think is a reliable indication of a feed, don't michael@0: // bother sniffing since we assume the site maintainer knows what they're michael@0: // doing. michael@0: nsAutoCString contentType; michael@0: channel->GetContentType(contentType); michael@0: bool noSniff = contentType.EqualsLiteral(TYPE_RSS) || michael@0: contentType.EqualsLiteral(TYPE_ATOM); michael@0: michael@0: // Check to see if this was a feed request from the location bar or from michael@0: // the feed: protocol. This is also a reliable indication. michael@0: // The value of the header doesn't matter. michael@0: if (!noSniff) { michael@0: nsAutoCString sniffHeader; michael@0: nsresult foundHeader = michael@0: channel->GetRequestHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"), michael@0: sniffHeader); michael@0: noSniff = NS_SUCCEEDED(foundHeader); michael@0: } michael@0: michael@0: if (noSniff) { michael@0: // check for an attachment after we have a likely feed. michael@0: if(HasAttachmentDisposition(channel)) { michael@0: sniffedType.Truncate(); michael@0: return NS_OK; michael@0: } michael@0: michael@0: // set the feed header as a response header, since we have good metadata michael@0: // telling us that the feed is supposed to be RSS or Atom michael@0: channel->SetResponseHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"), michael@0: NS_LITERAL_CSTRING("1"), false); michael@0: sniffedType.AssignLiteral(TYPE_MAYBE_FEED); michael@0: return NS_OK; michael@0: } michael@0: michael@0: // Don't sniff arbitrary types. Limit sniffing to situations that michael@0: // we think can reasonably arise. michael@0: if (!contentType.EqualsLiteral(TEXT_HTML) && michael@0: !contentType.EqualsLiteral(APPLICATION_OCTET_STREAM) && michael@0: // Same criterion as XMLHttpRequest. Should we be checking for "+xml" michael@0: // and check for text/xml and application/xml by hand instead? michael@0: contentType.Find("xml") == -1) { michael@0: sniffedType.Truncate(); michael@0: return NS_OK; michael@0: } michael@0: michael@0: // Now we need to potentially decompress data served with michael@0: // Content-Encoding: gzip michael@0: nsresult rv = ConvertEncodedData(request, data, length); michael@0: if (NS_FAILED(rv)) michael@0: return rv; michael@0: michael@0: // We cap the number of bytes to scan at MAX_BYTES to prevent picking up michael@0: // false positives by accidentally reading document content, e.g. a "how to michael@0: // make a feed" page. michael@0: const char* testData; michael@0: if (mDecodedData.IsEmpty()) { michael@0: testData = (const char*)data; michael@0: length = std::min(length, MAX_BYTES); michael@0: } else { michael@0: testData = mDecodedData.get(); michael@0: length = std::min(mDecodedData.Length(), MAX_BYTES); michael@0: } michael@0: michael@0: // The strategy here is based on that described in: michael@0: // http://blogs.msdn.com/rssteam/articles/PublishersGuide.aspx michael@0: // for interoperarbility purposes. michael@0: michael@0: // Thus begins the actual sniffing. michael@0: nsDependentCSubstring dataString((const char*)testData, length); michael@0: michael@0: bool isFeed = false; michael@0: michael@0: // RSS 0.91/0.92/2.0 michael@0: isFeed = ContainsTopLevelSubstring(dataString, "(closure); michael@0: decodedData->Append(rawSegment, count); michael@0: *writeCount = count; michael@0: return NS_OK; michael@0: } michael@0: michael@0: NS_IMETHODIMP michael@0: nsFeedSniffer::OnDataAvailable(nsIRequest* request, nsISupports* context, michael@0: nsIInputStream* stream, uint64_t offset, michael@0: uint32_t count) michael@0: { michael@0: uint32_t read; michael@0: return stream->ReadSegments(AppendSegmentToString, &mDecodedData, count, michael@0: &read); michael@0: } michael@0: michael@0: NS_IMETHODIMP michael@0: nsFeedSniffer::OnStopRequest(nsIRequest* request, nsISupports* context, michael@0: nsresult status) michael@0: { michael@0: return NS_OK; michael@0: }