|
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
|
2 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
3 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
5 |
|
6 #include "nsFeedSniffer.h" |
|
7 |
|
8 |
|
9 #include "nsNetCID.h" |
|
10 #include "nsXPCOM.h" |
|
11 #include "nsCOMPtr.h" |
|
12 #include "nsStringStream.h" |
|
13 |
|
14 #include "nsBrowserCompsCID.h" |
|
15 |
|
16 #include "nsICategoryManager.h" |
|
17 #include "nsIServiceManager.h" |
|
18 #include "nsComponentManagerUtils.h" |
|
19 #include "nsServiceManagerUtils.h" |
|
20 |
|
21 #include "nsIStreamConverterService.h" |
|
22 #include "nsIStreamConverter.h" |
|
23 |
|
24 #include "nsIStreamListener.h" |
|
25 |
|
26 #include "nsIHttpChannel.h" |
|
27 #include "nsIMIMEHeaderParam.h" |
|
28 |
|
29 #include "nsMimeTypes.h" |
|
30 #include "nsIURI.h" |
|
31 #include <algorithm> |
|
32 |
|
33 #define TYPE_ATOM "application/atom+xml" |
|
34 #define TYPE_RSS "application/rss+xml" |
|
35 #define TYPE_MAYBE_FEED "application/vnd.mozilla.maybe.feed" |
|
36 |
|
37 #define NS_RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#" |
|
38 #define NS_RSS "http://purl.org/rss/1.0/" |
|
39 |
|
40 #define MAX_BYTES 512u |
|
41 |
|
42 NS_IMPL_ISUPPORTS(nsFeedSniffer, |
|
43 nsIContentSniffer, |
|
44 nsIStreamListener, |
|
45 nsIRequestObserver) |
|
46 |
|
47 nsresult |
|
48 nsFeedSniffer::ConvertEncodedData(nsIRequest* request, |
|
49 const uint8_t* data, |
|
50 uint32_t length) |
|
51 { |
|
52 nsresult rv = NS_OK; |
|
53 |
|
54 mDecodedData = ""; |
|
55 nsCOMPtr<nsIHttpChannel> httpChannel(do_QueryInterface(request)); |
|
56 if (!httpChannel) |
|
57 return NS_ERROR_NO_INTERFACE; |
|
58 |
|
59 nsAutoCString contentEncoding; |
|
60 httpChannel->GetResponseHeader(NS_LITERAL_CSTRING("Content-Encoding"), |
|
61 contentEncoding); |
|
62 if (!contentEncoding.IsEmpty()) { |
|
63 nsCOMPtr<nsIStreamConverterService> converterService(do_GetService(NS_STREAMCONVERTERSERVICE_CONTRACTID)); |
|
64 if (converterService) { |
|
65 ToLowerCase(contentEncoding); |
|
66 |
|
67 nsCOMPtr<nsIStreamListener> converter; |
|
68 rv = converterService->AsyncConvertData(contentEncoding.get(), |
|
69 "uncompressed", this, nullptr, |
|
70 getter_AddRefs(converter)); |
|
71 NS_ENSURE_SUCCESS(rv, rv); |
|
72 |
|
73 converter->OnStartRequest(request, nullptr); |
|
74 |
|
75 nsCOMPtr<nsIStringInputStream> rawStream = |
|
76 do_CreateInstance(NS_STRINGINPUTSTREAM_CONTRACTID); |
|
77 if (!rawStream) |
|
78 return NS_ERROR_FAILURE; |
|
79 |
|
80 rv = rawStream->SetData((const char*)data, length); |
|
81 NS_ENSURE_SUCCESS(rv, rv); |
|
82 |
|
83 rv = converter->OnDataAvailable(request, nullptr, rawStream, 0, length); |
|
84 NS_ENSURE_SUCCESS(rv, rv); |
|
85 |
|
86 converter->OnStopRequest(request, nullptr, NS_OK); |
|
87 } |
|
88 } |
|
89 return rv; |
|
90 } |
|
91 |
|
92 template<int N> |
|
93 static bool |
|
94 StringBeginsWithLowercaseLiteral(nsAString& aString, |
|
95 const char (&aSubstring)[N]) |
|
96 { |
|
97 return StringHead(aString, N).LowerCaseEqualsLiteral(aSubstring); |
|
98 } |
|
99 |
|
100 bool |
|
101 HasAttachmentDisposition(nsIHttpChannel* httpChannel) |
|
102 { |
|
103 if (!httpChannel) |
|
104 return false; |
|
105 |
|
106 uint32_t disp; |
|
107 nsresult rv = httpChannel->GetContentDisposition(&disp); |
|
108 |
|
109 if (NS_SUCCEEDED(rv) && disp == nsIChannel::DISPOSITION_ATTACHMENT) |
|
110 return true; |
|
111 |
|
112 return false; |
|
113 } |
|
114 |
|
/**
 * Searches the half-open buffer range [begin, end) for a character.
 *
 * @param c
 *        The character to look for.
 * @param begin
 *        The first character of the range.
 * @param end
 *        One past the last character of the range.
 * @return a pointer to the first occurrence of the character, or nullptr if
 *         the range is empty or does not contain it
 */
static const char*
FindChar(char c, const char *begin, const char *end)
{
  // An empty (or inverted) range contains nothing.
  if (!(begin < end)) {
    return nullptr;
  }

  const char* match = std::find(begin, end, c);
  return match == end ? nullptr : match;
}
|
128 |
|
/**
 *
 * Determine if a substring is the "documentElement" in the document.
 *
 * All of our sniffed substrings: <rss, <feed, <rdf:RDF must be the "document"
 * element within the XML DOM, i.e. the root container element. Otherwise,
 * it's possible that someone embedded one of these tags inside a document of
 * another type, e.g. a HTML document, and we don't want to show the preview
 * page if the document isn't actually a feed.
 *
 * Only processing instructions, doctypes and comments ("<?..." and "<!...")
 * may precede the substring. Each such node has to be closed before |end|;
 * a comment is only closed by "-->", so a '>' inside a comment does not end
 * it.
 *
 * @param start
 *        The beginning of the data being sniffed
 * @param end
 *        The end of the data being sniffed, right before the substring that
 *        was found.
 * @returns true if the found substring is the documentElement, false
 *          otherwise.
 */
static bool
IsDocumentElement(const char *start, const char* end)
{
  static const char kCommentClose[] = "-->";
  static const ptrdiff_t kCommentCloseLen = 3;

  // For every tag in the buffer, check to see if it's a PI, Doctype or
  // comment, our desired substring or something invalid.
  while (start < end) {
    const char* open = std::find(start, end, '<');
    if (open == end)
      break;

    const char* node = open + 1;
    if (node >= end)
      return false;

    // Check to see if the character following the '<' is either '?' or '!'
    // (processing instruction or doctype or comment)... these are valid nodes
    // to have in the prologue.
    if (*node != '?' && *node != '!')
      return false;

    // A comment ("<!--") runs until "-->", no matter how many '>' or '<' it
    // contains. If it is not closed before |end|, the sniffed substring lies
    // inside the comment, e.g. <!-- > <rdf:RDF .. > -->, and is not the
    // documentElement.
    if (*node == '!' && end - node >= 3 && node[1] == '-' && node[2] == '-') {
      const char* close = std::search(node + 3, end, kCommentClose,
                                      kCommentClose + kCommentCloseLen);
      if (close == end)
        return false;

      start = close + kCommentCloseLen;
      continue;
    }

    // Any other prologue node: advance until the '>' that closes it. (We do
    // this because we don't want to sniff indicator substrings that are
    // embedded within other nodes.)
    const char* close = std::find(node, end, '>');
    if (close == end)
      return false;

    start = close + 1;
  }
  return true;
}
|
174 |
|
175 /** |
|
176 * Determines whether or not a string exists as the root element in an XML data |
|
177 * string buffer. |
|
178 * @param dataString |
|
179 * The data being sniffed |
|
180 * @param substring |
|
181 * The substring being tested for existence and root-ness. |
|
182 * @returns true if the substring exists and is the documentElement, false |
|
183 * otherwise. |
|
184 */ |
|
185 static bool |
|
186 ContainsTopLevelSubstring(nsACString& dataString, const char *substring) |
|
187 { |
|
188 int32_t offset = dataString.Find(substring); |
|
189 if (offset == -1) |
|
190 return false; |
|
191 |
|
192 const char *begin = dataString.BeginReading(); |
|
193 |
|
194 // Only do the validation when we find the substring. |
|
195 return IsDocumentElement(begin, begin + offset); |
|
196 } |
|
197 |
|
198 NS_IMETHODIMP |
|
199 nsFeedSniffer::GetMIMETypeFromContent(nsIRequest* request, |
|
200 const uint8_t* data, |
|
201 uint32_t length, |
|
202 nsACString& sniffedType) |
|
203 { |
|
204 nsCOMPtr<nsIHttpChannel> channel(do_QueryInterface(request)); |
|
205 if (!channel) |
|
206 return NS_ERROR_NO_INTERFACE; |
|
207 |
|
208 // Check that this is a GET request, since you can't subscribe to a POST... |
|
209 nsAutoCString method; |
|
210 channel->GetRequestMethod(method); |
|
211 if (!method.Equals("GET")) { |
|
212 sniffedType.Truncate(); |
|
213 return NS_OK; |
|
214 } |
|
215 |
|
216 // We need to find out if this is a load of a view-source document. In this |
|
217 // case we do not want to override the content type, since the source display |
|
218 // does not need to be converted from feed format to XUL. More importantly, |
|
219 // we don't want to change the content type from something |
|
220 // nsContentDLF::CreateInstance knows about (e.g. application/xml, text/html |
|
221 // etc) to something that only the application fe knows about (maybe.feed) |
|
222 // thus deactivating syntax highlighting. |
|
223 nsCOMPtr<nsIURI> originalURI; |
|
224 channel->GetOriginalURI(getter_AddRefs(originalURI)); |
|
225 |
|
226 nsAutoCString scheme; |
|
227 originalURI->GetScheme(scheme); |
|
228 if (scheme.EqualsLiteral("view-source")) { |
|
229 sniffedType.Truncate(); |
|
230 return NS_OK; |
|
231 } |
|
232 |
|
233 // Check the Content-Type to see if it is set correctly. If it is set to |
|
234 // something specific that we think is a reliable indication of a feed, don't |
|
235 // bother sniffing since we assume the site maintainer knows what they're |
|
236 // doing. |
|
237 nsAutoCString contentType; |
|
238 channel->GetContentType(contentType); |
|
239 bool noSniff = contentType.EqualsLiteral(TYPE_RSS) || |
|
240 contentType.EqualsLiteral(TYPE_ATOM); |
|
241 |
|
242 // Check to see if this was a feed request from the location bar or from |
|
243 // the feed: protocol. This is also a reliable indication. |
|
244 // The value of the header doesn't matter. |
|
245 if (!noSniff) { |
|
246 nsAutoCString sniffHeader; |
|
247 nsresult foundHeader = |
|
248 channel->GetRequestHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"), |
|
249 sniffHeader); |
|
250 noSniff = NS_SUCCEEDED(foundHeader); |
|
251 } |
|
252 |
|
253 if (noSniff) { |
|
254 // check for an attachment after we have a likely feed. |
|
255 if(HasAttachmentDisposition(channel)) { |
|
256 sniffedType.Truncate(); |
|
257 return NS_OK; |
|
258 } |
|
259 |
|
260 // set the feed header as a response header, since we have good metadata |
|
261 // telling us that the feed is supposed to be RSS or Atom |
|
262 channel->SetResponseHeader(NS_LITERAL_CSTRING("X-Moz-Is-Feed"), |
|
263 NS_LITERAL_CSTRING("1"), false); |
|
264 sniffedType.AssignLiteral(TYPE_MAYBE_FEED); |
|
265 return NS_OK; |
|
266 } |
|
267 |
|
268 // Don't sniff arbitrary types. Limit sniffing to situations that |
|
269 // we think can reasonably arise. |
|
270 if (!contentType.EqualsLiteral(TEXT_HTML) && |
|
271 !contentType.EqualsLiteral(APPLICATION_OCTET_STREAM) && |
|
272 // Same criterion as XMLHttpRequest. Should we be checking for "+xml" |
|
273 // and check for text/xml and application/xml by hand instead? |
|
274 contentType.Find("xml") == -1) { |
|
275 sniffedType.Truncate(); |
|
276 return NS_OK; |
|
277 } |
|
278 |
|
279 // Now we need to potentially decompress data served with |
|
280 // Content-Encoding: gzip |
|
281 nsresult rv = ConvertEncodedData(request, data, length); |
|
282 if (NS_FAILED(rv)) |
|
283 return rv; |
|
284 |
|
285 // We cap the number of bytes to scan at MAX_BYTES to prevent picking up |
|
286 // false positives by accidentally reading document content, e.g. a "how to |
|
287 // make a feed" page. |
|
288 const char* testData; |
|
289 if (mDecodedData.IsEmpty()) { |
|
290 testData = (const char*)data; |
|
291 length = std::min(length, MAX_BYTES); |
|
292 } else { |
|
293 testData = mDecodedData.get(); |
|
294 length = std::min(mDecodedData.Length(), MAX_BYTES); |
|
295 } |
|
296 |
|
297 // The strategy here is based on that described in: |
|
298 // http://blogs.msdn.com/rssteam/articles/PublishersGuide.aspx |
|
299 // for interoperarbility purposes. |
|
300 |
|
301 // Thus begins the actual sniffing. |
|
302 nsDependentCSubstring dataString((const char*)testData, length); |
|
303 |
|
304 bool isFeed = false; |
|
305 |
|
306 // RSS 0.91/0.92/2.0 |
|
307 isFeed = ContainsTopLevelSubstring(dataString, "<rss"); |
|
308 |
|
309 // Atom 1.0 |
|
310 if (!isFeed) |
|
311 isFeed = ContainsTopLevelSubstring(dataString, "<feed"); |
|
312 |
|
313 // RSS 1.0 |
|
314 if (!isFeed) { |
|
315 isFeed = ContainsTopLevelSubstring(dataString, "<rdf:RDF") && |
|
316 dataString.Find(NS_RDF) != -1 && |
|
317 dataString.Find(NS_RSS) != -1; |
|
318 } |
|
319 |
|
320 // If we sniffed a feed, coerce our internal type |
|
321 if (isFeed && !HasAttachmentDisposition(channel)) |
|
322 sniffedType.AssignLiteral(TYPE_MAYBE_FEED); |
|
323 else |
|
324 sniffedType.Truncate(); |
|
325 return NS_OK; |
|
326 } |
|
327 |
|
328 NS_IMETHODIMP |
|
329 nsFeedSniffer::OnStartRequest(nsIRequest* request, nsISupports* context) |
|
330 { |
|
331 return NS_OK; |
|
332 } |
|
333 |
|
334 NS_METHOD |
|
335 nsFeedSniffer::AppendSegmentToString(nsIInputStream* inputStream, |
|
336 void* closure, |
|
337 const char* rawSegment, |
|
338 uint32_t toOffset, |
|
339 uint32_t count, |
|
340 uint32_t* writeCount) |
|
341 { |
|
342 nsCString* decodedData = static_cast<nsCString*>(closure); |
|
343 decodedData->Append(rawSegment, count); |
|
344 *writeCount = count; |
|
345 return NS_OK; |
|
346 } |
|
347 |
|
348 NS_IMETHODIMP |
|
349 nsFeedSniffer::OnDataAvailable(nsIRequest* request, nsISupports* context, |
|
350 nsIInputStream* stream, uint64_t offset, |
|
351 uint32_t count) |
|
352 { |
|
353 uint32_t read; |
|
354 return stream->ReadSegments(AppendSegmentToString, &mDecodedData, count, |
|
355 &read); |
|
356 } |
|
357 |
|
358 NS_IMETHODIMP |
|
359 nsFeedSniffer::OnStopRequest(nsIRequest* request, nsISupports* context, |
|
360 nsresult status) |
|
361 { |
|
362 return NS_OK; |
|
363 } |