toolkit/components/feeds/FeedProcessor.js

changeset 0
6474c204b198
equal deleted inserted replaced
-1:000000000000 0:28893db83e72
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5
/**
 * Writes a debug line through the global dump() function.
 *
 * @param str  The message; it is emitted as "*** " + str + "\n".
 */
function LOG(str) {
  var line = "*** " + str + "\n";
  dump(line);
}
9
10 const Ci = Components.interfaces;
11 const Cc = Components.classes;
12 const Cr = Components.results;
13 Components.utils.import("resource://gre/modules/XPCOMUtils.jsm");
14
// Registration constants (contract ID, class ID, class name) for each
// component implemented in this file.

// Feed processor
const FP_CONTRACTID = "@mozilla.org/feed-processor;1";
const FP_CLASSID = Components.ID("{26acb1f0-28fc-43bc-867a-a46aabc85dd4}");
const FP_CLASSNAME = "Feed Processor";

// Feed result
const FR_CONTRACTID = "@mozilla.org/feed-result;1";
const FR_CLASSID = Components.ID("{072a5c3d-30c6-4f07-b87f-9f63d51403f2}");
const FR_CLASSNAME = "Feed Result";

// Feed
const FEED_CONTRACTID = "@mozilla.org/feed;1";
const FEED_CLASSID = Components.ID("{5d0cfa97-69dd-4e5e-ac84-f253162e8f9a}");
const FEED_CLASSNAME = "Feed";

// Feed entry
const ENTRY_CONTRACTID = "@mozilla.org/feed-entry;1";
const ENTRY_CLASSID = Components.ID("{8e4444ff-8e99-4bdd-aa7f-fb3c1c77319f}");
const ENTRY_CLASSNAME = "Feed Entry";

// Text construct
const TEXTCONSTRUCT_CONTRACTID = "@mozilla.org/feed-textconstruct;1";
const TEXTCONSTRUCT_CLASSID =
  Components.ID("{b992ddcd-3899-4320-9909-924b3e72c922}");
const TEXTCONSTRUCT_CLASSNAME = "Feed Text Construct";

// Generator
const GENERATOR_CONTRACTID = "@mozilla.org/feed-generator;1";
const GENERATOR_CLASSID =
  Components.ID("{414af362-9ad8-4296-898e-62247f25a20e}");
const GENERATOR_CLASSNAME = "Feed Generator";

// Person
const PERSON_CONTRACTID = "@mozilla.org/feed-person;1";
const PERSON_CLASSID = Components.ID("{95c963b7-20b2-11db-92f6-001422106990}");
const PERSON_CLASSNAME = "Feed Person";

// Contract IDs of the platform services and helper objects used below.
const IO_CONTRACTID = "@mozilla.org/network/io-service;1";
const BAG_CONTRACTID = "@mozilla.org/hash-property-bag;1";
const ARRAY_CONTRACTID = "@mozilla.org/array;1";
const SAX_CONTRACTID = "@mozilla.org/saxparser/xmlreader;1";
const PARSERUTILS_CONTRACTID = "@mozilla.org/parserutils;1";

// Cached nsIIOService; filled in lazily by strToURI().
var gIoService = null;

// Namespace URIs referenced by name elsewhere in the file.
const XMLNS = "http://www.w3.org/XML/1998/namespace";
const RSS090NS = "http://my.netscape.com/rdf/simple/0.9/";
50
51 /***** Some general utils *****/
/**
 * Turns a URI spec into a URI object via the IO service's newURI().
 *
 * @param link  The spec to convert.
 * @param base  Optional base passed to newURI(); any falsy value is
 *              forwarded as null.
 * @returns Whatever newURI() returns, or null if newURI() throws.
 */
function strToURI(link, base) {
  var baseURI = base ? base : null;
  if (!gIoService) {
    // Fetch the IO service once and keep it for later calls.
    gIoService = Cc[IO_CONTRACTID].getService(Ci.nsIIOService);
  }
  var result;
  try {
    result = gIoService.newURI(link, null, baseURI);
  } catch (e) {
    result = null;
  }
  return result;
}
63
/**
 * @returns true when |a| passes isObject() and its constructor is this
 *          global's Array.
 */
function isArray(a) {
  if (!isObject(a))
    return false;
  return a.constructor == Array;
}
67
/**
 * @returns true for functions and for truthy values whose typeof is
 *          "object"; false for everything else (including null).
 */
function isObject(a) {
  if (isFunction(a))
    return true;
  return Boolean(a) && typeof a == "object";
}
71
/**
 * @returns true when typeof |a| is "function".
 */
function isFunction(a) {
  var kind = typeof a;
  return kind == "function";
}
75
/**
 * Tests whether |a| answers QueryInterface(iid) without throwing.
 *
 * @param a    The object to probe.
 * @param iid  The interface handed to QueryInterface.
 * @returns true if the call completed, false if it threw (which includes
 *          |a| being null or lacking QueryInterface).
 */
function isIID(a, iid) {
  try {
    a.QueryInterface(iid);
  } catch (e) {
    return false;
  }
  return true;
}
86
/**
 * @returns the result of isIID() for |a| and Ci.nsIArray.
 */
function isIArray(a) {
  var iface = Ci.nsIArray;
  return isIID(a, iface);
}
90
/**
 * @returns the result of isIID() for |a| and Ci.nsIFeedContainer.
 */
function isIFeedContainer(a) {
  var iface = Ci.nsIFeedContainer;
  return isIID(a, iface);
}
94
/**
 * Deletes every "<...>" run (a "<", one or more non-">" characters, then
 * ">") from a string. This is a plain regex pass, not an HTML parser.
 *
 * @param someHTML  The markup string.
 * @returns The string with those runs removed.
 */
function stripTags(someHTML) {
  var tagPattern = /<[^>]+>/g;
  return someHTML.replace(tagPattern, "");
}
98
/**
 * Picks the link property bags whose relation matches |rel|.
 *
 * A link matches when its "rel" property equals |rel|, equals the IANA
 * relation URI followed by |rel|, or is missing/empty while |rel| is
 * "alternate". Links without an "href" property are never returned.
 *
 * @param rel    The relation name to look for (e.g. "alternate").
 * @param links  An array-like exposing length and queryElementAt().
 * @returns A JS array of the matching property bags, in input order.
 */
const IANA_URI = "http://www.iana.org/assignments/relation/";
function findAtomLinks(rel, links) {
  var matches = [];
  for (var idx = 0; idx < links.length; ++idx) {
    var bag = links.queryElementAt(idx, Ci.nsIPropertyBag2);

    // atom:link MUST have @href
    if (!bagHasKey(bag, "href"))
      continue;

    var relValue = bagHasKey(bag, "rel") ? bag.getPropertyAsAString("rel")
                                         : null;

    var isMatch = (!relValue && rel == "alternate") ||
                  relValue == rel ||
                  // relations may also be spelled as an IANA URI
                  relValue == IANA_URI + rel;
    if (isMatch)
      matches.push(bag);
  }
  return matches;
}
125
/**
 * Replaces the five XML-special characters with their entity references.
 *
 * @param s  The string to escape.
 * @returns |s| with & > < " ' replaced by &amp; &gt; &lt; &quot; &apos;.
 */
function xmlEscape(s) {
  var entities = {
    "&": "&amp;",
    ">": "&gt;",
    "<": "&lt;",
    '"': "&quot;",
    "'": "&apos;"
  };
  // One pass over the input, so the "&" of an entity we just produced is
  // never escaped a second time.
  return s.replace(/[&<>"']/g, function (ch) {
    return entities[ch];
  });
}
134
/**
 * Linear search using loose (==) equality.
 *
 * @param array    An array or array-like with length and indexed access.
 * @param element  The value to look for.
 * @returns true if some array[i] == element, else false.
 */
function arrayContains(array, element) {
  var found = false;
  for (var idx = 0; !found && idx < array.length; ++idx)
    found = (array[idx] == element);
  return found;
}
143
// XXX add hasKey to nsIPropertyBag
/**
 * Reports whether bag.getProperty(key) can be called without throwing.
 *
 * @param bag  The object to probe.
 * @param key  The property name.
 * @returns true if getProperty(key) returned, false if it threw (which
 *          includes |bag| being null or having no getProperty).
 */
function bagHasKey(bag, key) {
  var present = true;
  try {
    bag.getProperty(key);
  } catch (e) {
    present = false;
  }
  return present;
}
154
/**
 * Builds a function that reads one fixed property out of a property bag.
 *
 * @param key  The property name the returned getter will read.
 * @returns A function taking a bag and returning bag.getProperty(key), or
 *          null if that call throws.
 */
function makePropGetter(key) {
  return function FeedPropGetter(bag) {
    try {
      // Return the value directly. The earlier form assigned it to an
      // undeclared |value| first, which created a global in sloppy mode
      // and threw (then returned null) in strict mode.
      return bag.getProperty(key);
    }
    catch (e) {
      // A failed lookup is reported as null below.
    }
    return null;
  };
}
165
const RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";

// Namespace map: namespace URI -> the prefix this file uses when building
// field names. An empty prefix means the element name is used unprefixed.
var gNamespaces = {
  "http://webns.net/mvcb/": "admin",
  "http://backend.userland.com/rss": "",
  "http://blogs.law.harvard.edu/tech/rss": "",
  "http://www.w3.org/2005/Atom": "atom",
  "http://purl.org/atom/ns#": "atom03",
  "http://purl.org/rss/1.0/modules/content/": "content",
  "http://purl.org/dc/elements/1.1/": "dc",
  "http://purl.org/dc/terms/": "dcterms",
  "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
  "http://purl.org/rss/1.0/": "rss1",
  "http://my.netscape.com/rdf/simple/0.9/": "rss1",
  "http://wellformedweb.org/CommentAPI/": "wfw",
  "http://purl.org/rss/1.0/modules/wiki/": "wiki",
  "http://www.w3.org/XML/1998/namespace": "xml",
  "http://search.yahoo.com/mrss/": "media",
  "http://search.yahoo.com/mrss": "media"
};

// We allow a very small set of namespaces in XHTML content,
// for attributes only.
var gAllowedXHTMLNamespaces = {
  "http://www.w3.org/XML/1998/namespace": "xml",
  // If someone namespace-qualifies XHTML, we have to prefix it to avoid an
  // attribute collision.
  "http://www.w3.org/1999/xhtml": "xhtml"
};
195
/**
 * FeedResult is the nsIFeedResult implementation. The constructor does
 * nothing; every instance starts from the prototype defaults below.
 */
function FeedResult() {}

FeedResult.prototype = {
  // Defaults shared by all instances until overwritten.
  bozo: false,
  doc: null,
  version: null,
  headers: null,
  uri: null,
  stylesheet: null,

  /**
   * Not supported: always throws NS_ERROR_NOT_IMPLEMENTED.
   */
  registerExtensionPrefix: function FR_registerExtensionPrefix(ns, prefix) {
    throw Cr.NS_ERROR_NOT_IMPLEMENTED;
  },

  // XPCOM plumbing
  classID: FR_CLASSID,
  QueryInterface: XPCOMUtils.generateQI([Ci.nsIFeedResult])
};
213
/**
 * Feed is the nsIFeed / nsIFeedContainer implementation. The constructor
 * sets up empty item, author and contributor arrays and null/zero
 * defaults; normalize() later fills the object from this.fields.
 *
 * NOTE(review): this.fields is not created here, so it is presumably
 * assigned by the code that builds the Feed before normalize() runs.
 */
function Feed() {
  this.subtitle = null;
  this.title = null;
  this.items = Cc[ARRAY_CONTRACTID].createInstance(Ci.nsIMutableArray);
  this.link = null;
  this.id = null;
  this.generator = null;
  this.authors = Cc[ARRAY_CONTRACTID].createInstance(Ci.nsIMutableArray);
  this.contributors = Cc[ARRAY_CONTRACTID].createInstance(Ci.nsIMutableArray);
  this.baseURI = null;
  this.enclosureCount = 0;
  this.type = Ci.nsIFeed.TYPE_FEED;
}

Feed.prototype = {
  // Maps each property of this object to the field names searched for it in
  // this.fields (see fieldsToObj). A [name, func] pair passes the raw value
  // through func. When several listed fields are present, the last one in
  // the list wins.
  //
  // "title" used to be declared twice in this literal; the identical second
  // declaration has been dropped.
  searchLists: {
    title: ["title", "rss1:title", "atom03:title", "atom:title"],
    subtitle: ["description","dc:description","rss1:description",
               "atom03:tagline","atom:subtitle"],
    items: ["items","atom03_entries","entries"],
    id: ["atom:id","rdf:about"],
    generator: ["generator"],
    authors : ["authors"],
    contributors: ["contributors"],
    link: [["link",strToURI],["rss1:link",strToURI]],
    categories: ["categories", "dc:subject"],
    rights: ["atom03:rights","atom:rights"],
    cloud: ["cloud"],
    image: ["image", "rss1:image", "atom:logo"],
    textInput: ["textInput", "rss1:textinput"],
    skipDays: ["skipDays"],
    skipHours: ["skipHours"],
    updated: ["pubDate", "lastBuildDate", "atom03:modified", "dc:date",
              "dcterms:modified", "atom:updated"]
  },

  /**
   * Copies the parsed fields onto this object and post-processes them:
   * unwraps skipDays/skipHours, parses the update date, resolves the Atom
   * alternate link and the image URL, computes the enclosure count and
   * feed type, and rewrites the title/subtitle fields to plain strings.
   */
  normalize: function Feed_normalize() {
    fieldsToObj(this, this.searchLists);
    if (this.skipDays)
      this.skipDays = this.skipDays.getProperty("days");
    if (this.skipHours)
      this.skipHours = this.skipHours.getProperty("hours");

    if (this.updated)
      this.updated = dateParse(this.updated);

    // Assign Atom link if needed
    if (bagHasKey(this.fields, "links"))
      this._atomLinksToURI();

    this._calcEnclosureCountAndFeedType();

    // Resolve relative image links
    if (this.image && bagHasKey(this.image, "url"))
      this._resolveImageLink();

    this._resetBagMembersToRawText([this.searchLists.subtitle,
                                    this.searchLists.title]);
  },

  /**
   * Walks every item's enclosures, tallies them by the leading part of
   * their "type" property (audio / image / video / other), and sets
   * this.type and this.enclosureCount from the tallies.
   */
  _calcEnclosureCountAndFeedType: function Feed_calcEnclosureCountAndFeedType() {
    var entries_with_enclosures = 0;
    var audio_count = 0;
    var image_count = 0;
    var video_count = 0;
    var other_count = 0;

    for (var i = 0; i < this.items.length; ++i) {
      var entry = this.items.queryElementAt(i, Ci.nsIFeedEntry);
      entry.QueryInterface(Ci.nsIFeedContainer);

      if (entry.enclosures && entry.enclosures.length > 0) {
        ++entries_with_enclosures;

        for (var e = 0; e < entry.enclosures.length; ++e) {
          var enc = entry.enclosures.queryElementAt(e, Ci.nsIWritablePropertyBag2);
          if (enc.hasKey("type")) {
            var enctype = enc.get("type");

            if (/^audio/.test(enctype)) {
              ++audio_count;
            } else if (/^image/.test(enctype)) {
              ++image_count;
            } else if (/^video/.test(enctype)) {
              ++video_count;
            } else {
              ++other_count;
            }
          } else {
            // An enclosure without a type counts as "other".
            ++other_count;
          }
        }
      }
    }

    var feedtype = Ci.nsIFeed.TYPE_FEED;

    // For a feed to be marked as TYPE_VIDEO, TYPE_AUDIO and TYPE_IMAGE,
    // we enforce two things:
    //
    //    1. all entries must have at least one enclosure
    //    2. all enclosures must be video for TYPE_VIDEO, audio for TYPE_AUDIO or image
    //       for TYPE_IMAGE
    //
    // Otherwise it's a TYPE_FEED.
    if (entries_with_enclosures == this.items.length && other_count == 0) {
      if (audio_count > 0 && !video_count && !image_count) {
        feedtype = Ci.nsIFeed.TYPE_AUDIO;

      } else if (image_count > 0 && !audio_count && !video_count) {
        feedtype = Ci.nsIFeed.TYPE_IMAGE;

      } else if (video_count > 0 && !audio_count && !image_count) {
        feedtype = Ci.nsIFeed.TYPE_VIDEO;
      }
    }

    this.type = feedtype;
    this.enclosureCount = other_count + video_count + audio_count + image_count;
  },

  /**
   * Sets this.link from the first "alternate" Atom link in the "links"
   * field, resolved against that link's xml:base when it has one.
   * Also installed on Entry.prototype.
   */
  _atomLinksToURI: function Feed_linkToURI() {
    var links = this.fields.getPropertyAsInterface("links", Ci.nsIArray);
    var alternates = findAtomLinks("alternate", links);
    if (alternates.length > 0) {
      var href = alternates[0].getPropertyAsAString("href");
      var base;
      if (bagHasKey(alternates[0], "xml:base"))
        base = alternates[0].getPropertyAsAString("xml:base");
      this.link = this._resolveURI(href, base);
    }
  },

  /**
   * Rewrites the image bag's "url" property to the resolved absolute spec.
   * The property is left untouched when resolution yields null.
   */
  _resolveImageLink: function Feed_resolveImageLink() {
    var base;
    if (bagHasKey(this.image, "xml:base"))
      base = this.image.getPropertyAsAString("xml:base");
    var url = this._resolveURI(this.image.getPropertyAsAString("url"), base);
    if (url)
      this.image.setPropertyAsAString("url", url.spec);
  },

  /**
   * Resolves linkSpec against baseSpec (itself resolved against
   * this.baseURI), or directly against this.baseURI when no baseSpec is
   * given. Also installed on Entry.prototype.
   *
   * @returns The URI from strToURI, or null on failure (exceptions are
   *          logged through LOG).
   */
  _resolveURI: function Feed_resolveURI(linkSpec, baseSpec) {
    var uri = null;
    try {
      var base = baseSpec ? strToURI(baseSpec, this.baseURI) : this.baseURI;
      uri = strToURI(linkSpec, base);
    }
    catch(e) {
      LOG(e);
    }

    return uri;
  },

  /**
   * Reset the bag to raw contents, not text constructs: for every listed
   * field present in this.fields, replace the stored object with its
   * .text string. Also installed on Entry.prototype.
   *
   * @param fieldLists  An array of field-name lists, one level deep.
   */
  _resetBagMembersToRawText: function Feed_resetBagMembers(fieldLists) {
    for (var i=0; i<fieldLists.length; i++) {
      for (var j=0; j<fieldLists[i].length; j++) {
        if (bagHasKey(this.fields, fieldLists[i][j])) {
          var textConstruct = this.fields.getProperty(fieldLists[i][j]);
          this.fields.setPropertyAsAString(fieldLists[i][j],
                                           textConstruct.text);
        }
      }
    }
  },

  // XPCOM stuff
  classID: FEED_CLASSID,
  QueryInterface: XPCOMUtils.generateQI([Ci.nsIFeed, Ci.nsIFeedContainer])
};
387
/**
 * Entry is the nsIFeedEntry / nsIFeedContainer implementation. The
 * constructor creates an empty writable property bag for the fields and
 * empty author/contributor arrays; normalize() later fills the object in
 * from this.fields.
 */
function Entry() {
  this.summary = null;
  this.content = null;
  this.title = null;
  this.fields = Cc[BAG_CONTRACTID].createInstance(Ci.nsIWritablePropertyBag2);
  this.link = null;
  this.id = null;
  this.baseURI = null;
  this.updated = null;
  this.published = null;
  this.authors = Cc[ARRAY_CONTRACTID].createInstance(Ci.nsIMutableArray);
  this.contributors = Cc[ARRAY_CONTRACTID].createInstance(Ci.nsIMutableArray);
}

Entry.prototype = {
  fields: null,
  enclosures: null,
  mediaContent: null,

  // Maps each property of this object to the field names searched for it in
  // this.fields (see fieldsToObj). A [name, func] pair passes the raw value
  // through func. When several listed fields are present, the last one in
  // the list wins.
  searchLists: {
    title: ["title", "rss1:title", "atom03:title", "atom:title"],
    link: [["link",strToURI],["rss1:link",strToURI]],
    id: [["guid", makePropGetter("guid")], "rdf:about",
         "atom03:id", "atom:id"],
    authors : ["authors"],
    contributors: ["contributors"],
    summary: ["description", "rss1:description", "dc:description",
              "atom03:summary", "atom:summary"],
    content: ["content:encoded","atom03:content","atom:content"],
    rights: ["atom03:rights","atom:rights"],
    published: ["pubDate", "atom03:issued", "dcterms:issued", "atom:published"],
    updated: ["pubDate", "atom03:modified", "dc:date", "dcterms:modified",
              "atom:updated"]
  },

  /**
   * Copies the parsed fields onto this object and post-processes them:
   * resolves the Atom alternate link, collects enclosures, falls back to a
   * permalink guid for the link, parses the dates, and rewrites the
   * content/summary/title fields to plain strings.
   */
  normalize: function Entry_normalize() {
    fieldsToObj(this, this.searchLists);

    // Assign Atom link if needed
    if (bagHasKey(this.fields, "links"))
      this._atomLinksToURI();

    // Populate enclosures array
    this._populateEnclosures();

    // The link might be a guid w/ permalink=true
    if (!this.link && bagHasKey(this.fields, "guid")) {
      var guid = this.fields.getProperty("guid");
      // A guid is treated as a permalink unless isPermaLink says "false".
      var isPermaLink = true;

      if (bagHasKey(guid, "isPermaLink"))
        isPermaLink = guid.getProperty("isPermaLink").toLowerCase() != "false";

      if (guid && isPermaLink)
        this.link = strToURI(guid.getProperty("guid"));
    }

    if (this.updated)
      this.updated = dateParse(this.updated);
    if (this.published)
      this.published = dateParse(this.published);

    this._resetBagMembersToRawText([this.searchLists.content,
                                    this.searchLists.summary,
                                    this.searchLists.title]);
  },

  /**
   * Feeds this.enclosures from every supported source, in this order:
   * Atom enclosure links, the RSS2 enclosure, media:content, and
   * media:content nested inside media:group.
   */
  _populateEnclosures: function Entry_populateEnclosures() {
    if (bagHasKey(this.fields, "links"))
      this._atomLinksToEnclosures();

    // Add RSS2 enclosure to enclosures
    if (bagHasKey(this.fields, "enclosure"))
      this._enclosureToEnclosures();

    // Add media:content to enclosures
    if (bagHasKey(this.fields, "mediacontent"))
      this._mediacontentToEnclosures();

    // Add media:content in media:group to enclosures
    if (bagHasKey(this.fields, "mediagroup"))
      this._mediagroupToEnclosures();
  },

  // url -> enclosure bag already appended; created on first use.
  __enclosure_map: null,

  /**
   * Appends an enclosure bag to this.enclosures, de-duplicating by url.
   * When the url was seen before, the earlier bag only gains the "type"
   * and "length" values it was missing and nothing is appended. Bags with
   * a missing or empty "url" are ignored.
   *
   * @param new_enc  A property bag describing the enclosure.
   */
  _addToEnclosures: function Entry_addToEnclosures(new_enc) {
    // items we add to the enclosures array get displayed in the FeedWriter and
    // they must have non-empty urls.
    if (!bagHasKey(new_enc, "url"))
      return;
    var url = new_enc.getPropertyAsAString("url");
    if (url == "")
      return;

    if (this.__enclosure_map == null)
      this.__enclosure_map = {};

    var previous_enc = this.__enclosure_map[url];

    if (previous_enc != undefined) {
      previous_enc.QueryInterface(Ci.nsIWritablePropertyBag2);

      if (!bagHasKey(previous_enc, "type") && bagHasKey(new_enc, "type"))
        previous_enc.setPropertyAsAString("type", new_enc.getPropertyAsAString("type"));

      if (!bagHasKey(previous_enc, "length") && bagHasKey(new_enc, "length"))
        previous_enc.setPropertyAsAString("length", new_enc.getPropertyAsAString("length"));

      return;
    }

    if (this.enclosures == null) {
      this.enclosures = Cc[ARRAY_CONTRACTID].createInstance(Ci.nsIMutableArray);
      this.enclosures.QueryInterface(Ci.nsIMutableArray);
    }

    this.enclosures.appendElement(new_enc, false);
    this.__enclosure_map[url] = new_enc;
  },

  /**
   * Converts every Atom link with rel="enclosure" into an enclosure bag
   * (href -> url, plus type and length when present) and adds it.
   */
  _atomLinksToEnclosures: function Entry_linkToEnclosure() {
    var links = this.fields.getPropertyAsInterface("links", Ci.nsIArray);
    var enc_links = findAtomLinks("enclosure", links);

    for (var i = 0; i < enc_links.length; ++i) {
      var link = enc_links[i];

      // An enclosure must have an href. Skip just this link: returning
      // here would also discard every enclosure link that follows it.
      if (!(link.getProperty("href")))
        continue;

      var enc = Cc[BAG_CONTRACTID].createInstance(Ci.nsIWritablePropertyBag2);

      // copy Atom bits over to equivalent enclosure bits
      enc.setPropertyAsAString("url", link.getPropertyAsAString("href"));
      if (bagHasKey(link, "type"))
        enc.setPropertyAsAString("type", link.getPropertyAsAString("type"));
      if (bagHasKey(link, "length"))
        enc.setPropertyAsAString("length", link.getPropertyAsAString("length"));

      this._addToEnclosures(enc);
    }
  },

  /**
   * Adds the RSS2 "enclosure" field. _addToEnclosures ignores a bag whose
   * url is missing or empty, so no separate check is made here; the old
   * unguarded getProperty("url") threw when the property was absent.
   */
  _enclosureToEnclosures: function Entry_enclosureToEnclosures() {
    var enc = this.fields.getPropertyAsInterface("enclosure", Ci.nsIPropertyBag2);
    this._addToEnclosures(enc);
  },

  /**
   * Shared worker for the two media:content sources: converts each element
   * of |contentArray| that has a url into an enclosure bag
   * (url, type, fileSize -> length) and adds it.
   *
   * @param contentArray  An nsIArray of media:content property bags.
   */
  _addMediaContentArray: function Entry_addMediaContentArray(contentArray) {
    for (var i = 0; i < contentArray.length; ++i) {
      var contentElement = contentArray.queryElementAt(i, Ci.nsIWritablePropertyBag2);

      // media:content don't require url, but if it's not there, we should
      // skip it.
      if (!bagHasKey(contentElement, "url"))
        continue;

      var enc = Cc[BAG_CONTRACTID].createInstance(Ci.nsIWritablePropertyBag2);

      // copy media:content bits over to equivalent enclosure bits
      enc.setPropertyAsAString("url", contentElement.getPropertyAsAString("url"));
      if (bagHasKey(contentElement, "type")) {
        enc.setPropertyAsAString("type", contentElement.getPropertyAsAString("type"));
      }
      if (bagHasKey(contentElement, "fileSize")) {
        enc.setPropertyAsAString("length", contentElement.getPropertyAsAString("fileSize"));
      }

      this._addToEnclosures(enc);
    }
  },

  /**
   * Adds the entry-level media:content elements as enclosures.
   */
  _mediacontentToEnclosures: function Entry_mediacontentToEnclosures() {
    var mediacontent = this.fields.getPropertyAsInterface("mediacontent", Ci.nsIArray);
    this._addMediaContentArray(mediacontent);
  },

  /**
   * Adds the media:content elements found inside media:group as enclosures.
   */
  _mediagroupToEnclosures: function Entry_mediagroupToEnclosures() {
    var group = this.fields.getPropertyAsInterface("mediagroup", Ci.nsIPropertyBag2);
    var content = group.getPropertyAsInterface("mediacontent", Ci.nsIArray);
    this._addMediaContentArray(content);
  },

  // XPCOM stuff
  classID: ENTRY_CLASSID,
  QueryInterface: XPCOMUtils.generateQI(
    [Ci.nsIFeedEntry, Ci.nsIFeedContainer]
  )
};

// Entry reuses Feed's link-resolution and raw-text helpers unchanged.
Entry.prototype._atomLinksToURI = Feed.prototype._atomLinksToURI;
Entry.prototype._resolveURI = Feed.prototype._resolveURI;
Entry.prototype._resetBagMembersToRawText =
  Feed.prototype._resetBagMembersToRawText;
605
// TextConstruct represents an element that could contain (X)HTML.
// The type defaults to "text"; the nsIParserUtils service is fetched once
// per instance.
function TextConstruct() {
  this.lang = null;
  this.base = null;
  this.type = "text";
  this.text = null;
  this.parserUtils = Cc[PARSERUTILS_CONTRACTID].getService(Ci.nsIParserUtils);
}

TextConstruct.prototype = {
  /**
   * @returns this.text as-is for type "text"; otherwise the text with tags
   *          stripped and run through convertToPlainText().
   */
  plainText: function TC_plainText() {
    if (this.type == "text")
      return this.text;

    var stripped = stripTags(this.text);
    var encoderFlags = Ci.nsIDocumentEncoder.OutputSelectionOnly |
                       Ci.nsIDocumentEncoder.OutputAbsoluteLinks;
    return this.parserUtils.convertToPlainText(stripped, encoderFlags, 0);
  },

  /**
   * Builds a document fragment for this construct.
   *
   * @param element  The element handed to parseFragment(); for type "text"
   *                 its ownerDocument is used to create the nodes.
   * @returns A fragment holding a single text node (type "text"), the
   *          result of parseFragment() (types "xhtml" and "html"), or null
   *          for any other type.
   */
  createDocumentFragment: function TC_createDocumentFragment(element) {
    var kind = this.type;

    if (kind == "text") {
      var ownerDoc = element.ownerDocument;
      var fragment = ownerDoc.createDocumentFragment();
      fragment.appendChild(ownerDoc.createTextNode(this.text));
      return fragment;
    }

    if (kind != "xhtml" && kind != "html")
      return null;

    // Only "xhtml" is parsed as XML.
    var parseAsXML = (kind == "xhtml");
    return this.parserUtils.parseFragment(this.text, 0, parseAsXML,
                                          this.base, element);
  },

  // XPCOM plumbing
  classID: TEXTCONSTRUCT_CLASSID,
  QueryInterface: XPCOMUtils.generateQI([Ci.nsIFeedTextConstruct])
};
650
// Generator represents the software that produced the feed
function Generator() {
  this.lang = null;
  this.agent = null;
  this.version = null;
  this.uri = null;

  // nsIFeedElementBase
  this._attributes = null;
  this.baseURI = null;
}

Generator.prototype = {

  get attributes() {
    return this._attributes;
  },

  /**
   * Stores the attribute set and derives version and uri from it. The uri
   * comes from the "uri" attribute, falling back to "url". When an
   * rdf:resource attribute is present (RSS1), it becomes both the agent
   * and the uri.
   */
  set attributes(value) {
    this._attributes = value;
    var attrs = this._attributes;

    this.version = attrs.getValueFromName("", "version");

    var spec = attrs.getValueFromName("", "uri") ||
               attrs.getValueFromName("", "url");
    this.uri = strToURI(spec, this.baseURI);

    // RSS1
    var resource = attrs.getValueFromName(RDF_NS, "resource");
    if (resource) {
      this.agent = resource;
      this.uri = strToURI(resource, this.baseURI);
    }
  },

  // XPCOM plumbing
  classID: GENERATOR_CLASSID,
  QueryInterface: XPCOMUtils.generateQI(
    [Ci.nsIFeedGenerator, Ci.nsIFeedElementBase]
  )
};
690
/**
 * Person is the nsIFeedPerson / nsIFeedElementBase implementation; all of
 * its properties start out null.
 */
function Person() {
  this.name = null;
  this.uri = null;
  this.email = null;

  // nsIFeedElementBase members
  this.attributes = null;
  this.baseURI = null;
}

Person.prototype = {
  // XPCOM plumbing
  classID: PERSON_CLASSID,
  QueryInterface: XPCOMUtils.generateQI(
    [Ci.nsIFeedPerson, Ci.nsIFeedElementBase]
  )
};
708
/**
 * Map a list of fields into properties on a container.
 *
 * For every key in |fields|, each candidate field name is looked up in
 * container.fields; every truthy value found is assigned to
 * container[key], so when several candidates are present the last one in
 * the list wins.
 *
 * @param container  An nsIFeedContainer (only its |fields| bag is read).
 * @param fields     An object mapping property names to search lists. A
 *                   list member can itself be a list, in which case its
 *                   second member is a transformation function (like
 *                   parseInt) applied to the value found.
 */
function fieldsToObj(container, fields) {
  for (var key in fields) {
    var candidates = fields[key];
    for (var idx = 0; idx < candidates.length; ++idx) {
      var spec = candidates[idx];
      var hasTransform = isArray(spec);
      var fieldName = hasTransform ? spec[0] : spec;

      var found = null;
      try {
        found = container.fields.getProperty(fieldName);
      } catch (e) {
        // Field not present; |found| stays null.
      }

      if (!found)
        continue;

      container[key] = hasTransform ? spec[1](found) : found;
    }
  }
}
737
/**
 * Lower-cases an element's localName.
 *
 * @param element  An object with a string localName (a DOM element).
 * @returns element.localName converted with toLowerCase().
 */
function LC(element) {
  var name = element.localName;
  return name.toLowerCase();
}
747
// TODO move these post-processor functions

/**
 * Create a generator element: stores the trimmed text as the agent.
 *
 * @param s          The element's text.
 * @param generator  The generator object; it is QueryInterface'd to
 *                   nsIFeedGenerator before being written to.
 * @returns The same generator object.
 */
function atomGenerator(s, generator) {
  generator.QueryInterface(Ci.nsIFeedGenerator);
  var agentName = s.trim();
  generator.agent = agentName;
  return generator;
}
755
/**
 * Post-process atom:logo to create an RSS2-like structure: the trimmed
 * text is stored as the "url" property of |logo|. Nothing is returned.
 *
 * @param s     The element's text.
 * @param logo  A writable property bag.
 */
function atomLogo(s, logo) {
  var url = s.trim();
  logo.setPropertyAsAString("url", url);
}
760
/**
 * Post-process an RSS category, mapping it to the Atom fields: the
 * trimmed text is stored as the "term" property.
 *
 * @param s    The element's text.
 * @param cat  A writable property bag.
 * @returns The same bag.
 */
function rssCatTerm(s, cat) {
  // add slash handling?
  var term = s.trim();
  cat.setPropertyAsAString("term", term);
  return cat;
}
767
/**
 * Post-process a GUID: the trimmed text is stored as the "guid" property.
 *
 * @param s     The element's text.
 * @param guid  A writable property bag.
 * @returns The same bag.
 */
function rssGuid(s, guid) {
  var id = s.trim();
  guid.setPropertyAsAString("guid", id);
  return guid;
}
773
// post-process an RSS author element
//
// It can contain a field like this:
//
//    <author>lawyer@boyer.net (Lawyer Boyer)</author>
//
// or, delightfully, a field like this:
//
//    <dc:creator>Simon St.Laurent (mailto:simonstl@simonstl.com)</dc:creator>
//
// We want to split this up and assign it to corresponding Atom
// fields.
//
/**
 * @param s       The element's text.
 * @param author  The person object; it is QueryInterface'd to
 *                nsIFeedPerson and its name/email are filled in.
 * @returns The same author object.
 */
function rssAuthor(s,author) {
  author.QueryInterface(Ci.nsIFeedPerson);
  // check for RSS2 string format
  var chars = s.trim();
  var matches = chars.match(/(.*)\((.*)\)/);
  var emailCheck =
    /^([a-zA-Z0-9_\.\-])+\@(([a-zA-Z0-9\-])+\.)+([a-zA-Z0-9]{2,4})+$/;
  if (matches) {
    var match1 = matches[1].trim();
    var match2 = matches[2].trim();
    if (match2.indexOf("mailto:") == 0)
      match2 = match2.substring(7);
    // Whichever half looks like an address is the email; the other half
    // is the name.
    if (emailCheck.test(match1)) {
      author.email = match1;
      author.name = match2;
    }
    else if (emailCheck.test(match2)) {
      author.email = match2;
      author.name = match1;
    }
    else {
      // put it back together
      author.name = match1 + " (" + match2 + ")";
    }
  }
  else {
    author.name = chars;
    // indexOf returns -1 (truthy) when "@" is absent and 0 (falsy) when it
    // comes first, so the result must be compared against -1 explicitly.
    if (chars.indexOf('@') != -1)
      author.email = chars;
  }
  return author;
}
819
//
// skipHours and skipDays map to arrays, so we need to change the
// string to an nsISupports in order to stick it in there.
//
/**
 * Wraps a string in a new nsISupportsString instance.
 *
 * @param s  The string to wrap; stored as the instance's |data|.
 * @returns The new wrapper object.
 */
function rssArrayElement(s) {
  var wrapperClass = Cc["@mozilla.org/supports-string;1"];
  var wrapper = wrapperClass.createInstance(Ci.nsISupportsString);
  wrapper.data = s;
  wrapper.QueryInterface(Ci.nsISupportsString);
  return wrapper;
}
831
/**
 * Tries parsing a string through the JavaScript Date object.
 *
 * @param aDateString
 *        A string that is supposedly an RFC822 or RFC3339 date.
 * @return The date formatted with Date.prototype.toUTCString, or null if
 *         the string can't be parsed.
 */
function dateParse(aDateString) {
  // Without bug 682781 fixed, JS won't parse an RFC822 date with a Z for the
  // timezone, so convert to -00:00 which works for any date format.
  let normalized = aDateString.trim().replace(/z$/i, "-00:00");
  let parsed = new Date(normalized);
  if (isNaN(parsed.getTime())) {
    return null;
  }
  return parsed.toUTCString();
}
849
const XHTML_NS = "http://www.w3.org/1999/xhtml";

/**
 * The XHTMLHandler handles inline XHTML found in things like atom:summary.
 * It serializes the XHTML elements it is fed into a string buffer and
 * hands the buffer back to the processor once the enclosing element ends.
 *
 * @param processor  Object whose returnFromXHTMLHandler() receives the
 *                   finished buffer.
 * @param isAtom     When truthy, the outermost <div> wrapper is skipped.
 */
function XHTMLHandler(processor, isAtom) {
  this._buf = "";
  this._processor = processor;
  this._depth = 0;
  this._isAtom = isAtom;
  // a stack of lists tracking in-scope namespaces
  this._inScopeNS = [];
}

// The fidelity can be improved here, to allow handling of stuff like
// SVG and MathML. XXX
XHTMLHandler.prototype = {

  /**
   * Look back up at the declared namespaces; we always use the same
   * prefixes for our safe stuff.
   *
   * @returns true if |ns| was declared by any currently open element.
   */
  _isInScope: function XH__isInScope(ns) {
    for (var level = 0; level < this._inScopeNS.length; ++level) {
      var declared = this._inScopeNS[level];
      for (var j = 0; j < declared.length; ++j) {
        if (declared[j] == ns)
          return true;
      }
    }
    return false;
  },

  startDocument: function XH_startDocument() {
  },
  endDocument: function XH_endDocument() {
  },

  /**
   * Writes the start tag of an XHTML element to the buffer. Elements in
   * other namespaces only affect the depth and namespace stack.
   */
  startElement: function XH_startElement(uri, localName, qName, attributes) {
    ++this._depth;
    var currentScope = [];
    this._inScopeNS.push(currentScope);

    // RFC4287 requires XHTML to be wrapped in a div that is *not* part of
    // the content. This prevents people from screwing up namespaces, but
    // we need to skip it here.
    if (this._isAtom && this._depth == 1 && localName == "div")
      return;

    // Only XHTML elements are recorded; anything else is ignored.
    if (uri != XHTML_NS)
      return;

    this._buf += "<" + localName;
    for (var idx = 0; idx < attributes.length; ++idx) {
      var attrNS = attributes.getURI(idx);

      // XHTML attributes aren't in a namespace
      if (attrNS == "") {
        this._buf += (" " + attributes.getLocalName(idx) + "='" +
                      xmlEscape(attributes.getValue(idx)) + "'");
        continue;
      }

      // Beyond that, only a small set of attribute namespaces is written.
      var prefix = gAllowedXHTMLNamespaces[attrNS];
      if (prefix == null)
        continue;

      var escapedValue = xmlEscape(attributes.getValue(idx));
      this._buf += (" " + prefix + ":" +
                    attributes.getLocalName(idx) +
                    "='" + escapedValue + "'");

      // write an xmlns declaration if necessary
      if (prefix != "xml" && !this._isInScope(attrNS)) {
        currentScope.push(attrNS);
        this._buf += " xmlns:" + prefix + "='" + attrNS + "'";
      }
    }
    this._buf += ">";
  },

  /**
   * Writes the end tag of an XHTML element, or, once the depth drops below
   * zero, returns the trimmed buffer to the processor.
   */
  endElement: function XH_endElement(uri, localName, qName) {
    --this._depth;
    this._inScopeNS.pop();

    // We need to skip outer divs in Atom. See comment in startElement.
    if (this._isAtom && this._depth == 0 && localName == "div")
      return;

    // When we peek too far, go back to the main processor
    if (this._depth < 0) {
      this._processor.returnFromXHTMLHandler(this._buf.trim(),
                                             uri, localName, qName);
      return;
    }

    // Only XHTML elements are recorded; anything else is ignored.
    if (uri == XHTML_NS)
      this._buf += "</" + localName + ">";
  },

  characters: function XH_characters(data) {
    this._buf += xmlEscape(data);
  },
  startPrefixMapping: function XH_startPrefixMapping(prefix, uri) {
  },
  endPrefixMapping: function FP_endPrefixMapping(prefix) {
  },
  processingInstruction: function XH_processingInstruction() {
  },
};
955
/**
 * The ExtensionHandler deals with elements we haven't explicitly
 * added to our transition table in the FeedProcessor.
 *
 * It remembers the outermost extension element and collects its text.
 * When that element closes, the element's uri, localName, text and
 * attributes are handed to processor.returnFromExtHandler(). The text is
 * null if the element contained child elements.
 *
 * @param processor  Object whose returnFromExtHandler() receives the result.
 */
function ExtensionHandler(processor) {
  this._buf = "";
  this._depth = 0;
  this._hasChildElements = false;

  // The FeedProcessor
  this._processor = processor;

  // Fields of the outermost extension element.
  this._localName = null;
  this._uri = null;
  this._qName = null;
  this._attrs = null;
}

ExtensionHandler.prototype = {
  startDocument: function EH_startDocument() {
  },
  endDocument: function EH_endDocument() {
  },
  startElement: function EH_startElement(uri, localName, qName, attrs) {
    ++this._depth;

    // The previous version also computed a prefixed key from gNamespaces
    // here, but never used it; that dead lookup has been removed.
    if (this._depth == 1) {
      this._uri = uri;
      this._localName = localName;
      this._qName = qName;
      this._attrs = attrs;
    }

    // if we descend into another element, we won't send text
    this._hasChildElements = (this._depth > 1);
  },
  endElement: function EH_endElement(uri, localName, qName) {
    --this._depth;
    if (this._depth == 0) {
      var text = this._hasChildElements ? null : this._buf.trim();
      this._processor.returnFromExtHandler(this._uri, this._localName,
                                           text, this._attrs);
    }
  },
  characters: function EH_characters(data) {
    // Text is only accumulated until the first child element is seen.
    if (!this._hasChildElements)
      this._buf += data;
  },
  startPrefixMapping: function EH_startPrefixMapping() {
  },
  endPrefixMapping: function EH_endPrefixMapping() {
  },
  processingInstruction: function EH_processingInstruction() {
  },
};
1015
1016
/**
 * ElementInfo is a simple container object that describes
 * some characteristics of a feed element. For example, it
 * says whether an element can be expected to appear more
 * than once inside a given entry or feed.
 *
 * All four arguments are stored unchanged under the same names;
 * isWrapper is always false for this kind of info object.
 */
function ElementInfo(fieldName, containerClass, closeFunc, isArray) {
  // The four descriptors supplied by the caller.
  this.fieldName = fieldName;
  this.containerClass = containerClass;
  this.closeFunc = closeFunc;
  this.isArray = isArray;

  // Distinguishes this from WrapperElementInfo.
  this.isWrapper = false;
}
1030
/**
 * Describes a feed element, usually the document root.
 *
 * @param fieldName    name the element is tracked under
 * @param feedVersion  version identifier of the feed format it introduces
 */
function FeedElementInfo(fieldName, feedVersion) {
  // A feed element is a real element, not a vestigial wrapper.
  this.isWrapper   = false;
  this.fieldName   = fieldName;
  this.feedVersion = feedVersion;
}
1039
/**
 * Describes a vestigial wrapper element: one that some feed formats include
 * and that must be tracked while parsing, but that is not wanted in the
 * object model.
 *
 * @param fieldName  name the wrapper is tracked under
 */
function WrapperElementInfo(fieldName) {
  this.isWrapper = true;
  this.fieldName = fieldName;
}
1049
/***** The Processor *****/

/**
 * Creates a feed processor: a SAX reader, the feed-level property bag, the
 * bookkeeping stacks used while parsing, and the state-transition table
 * (this._trans) that maps "current state" + "element key" to a description
 * of how the element is to be handled.
 */
function FeedProcessor() {
  // Shorthands for building the transition table. Every call produces a
  // fresh ElementInfo, so no table entry is shared between states.
  function personList(fieldName, closeFunc) {
    return new ElementInfo(fieldName, Cc[PERSON_CONTRACTID], closeFunc, true);
  }
  function entryList(fieldName) {
    return new ElementInfo(fieldName, Cc[ENTRY_CONTRACTID], null, true);
  }
  function generatorInfo(closeFunc) {
    return new ElementInfo("generator", Cc[GENERATOR_CONTRACTID],
                           closeFunc, false);
  }
  function plain(fieldName, closeFunc, isArray) {
    return new ElementInfo(fieldName, null, closeFunc, isArray);
  }

  this._reader = Cc[SAX_CONTRACTID].createInstance(Ci.nsISAXXMLReader);
  this._buf = "";
  this._feed = Cc[BAG_CONTRACTID].createInstance(Ci.nsIWritablePropertyBag2);
  this._handlerStack = [];
  this._xmlBaseStack = []; // sparse array keyed to nesting depth
  this._depth = 0;
  this._state = "START";
  this._result = null;
  this._extensionHandler = null;
  this._xhtmlHandler = null;
  this._haveSentResult = false;

  // The nsIFeedResultListener waiting for the parse results
  this.listener = null;

  // These elements can contain (X)HTML or plain text.
  // The table records the default treatment of each of them.
  this._textConstructs = {
    "atom:title": "text",
    "atom:summary": "text",
    "atom:rights": "text",
    "atom:content": "text",
    "atom:subtitle": "text",
    "description": "html",
    "rss1:description": "html",
    "dc:description": "html",
    "content:encoded": "html",
    "title": "text",
    "rss1:title": "text",
    "atom03:title": "text",
    "atom03:tagline": "text",
    "atom03:summary": "text",
    "atom03:content": "text"
  };
  this._stack = [];

  this._trans = {
    "START": {
      // If we hit a root RSS element, treat as RSS2.
      "rss": new FeedElementInfo("RSS2", "rss2"),

      // If we hit an RDF element, it could be RSS1, but we can't
      // verify that until we hit a rss1:channel element.
      "rdf:RDF": new WrapperElementInfo("RDF"),

      // If we hit an Atom 1.0 element, treat as Atom 1.0.
      "atom:feed": new FeedElementInfo("Atom", "atom"),

      // Treat as Atom 0.3
      "atom03:feed": new FeedElementInfo("Atom03", "atom03")
    },

    /********* RSS2 **********/
    "IN_RSS2": {
      "channel": new WrapperElementInfo("channel")
    },

    "IN_CHANNEL": {
      "item": entryList("items"),
      "managingEditor": personList("authors", rssAuthor),
      "dc:creator": personList("authors", rssAuthor),
      "dc:author": personList("authors", rssAuthor),
      "dc:contributor": personList("contributors", rssAuthor),
      "category": plain("categories", rssCatTerm, true),
      "cloud": plain("cloud", null, false),
      "image": plain("image", null, false),
      "textInput": plain("textInput", null, false),
      "skipDays": plain("skipDays", null, false),
      "skipHours": plain("skipHours", null, false),
      "generator": generatorInfo(atomGenerator)
    },

    "IN_ITEMS": {
      "author": personList("authors", rssAuthor),
      "dc:creator": personList("authors", rssAuthor),
      "dc:author": personList("authors", rssAuthor),
      "dc:contributor": personList("contributors", rssAuthor),
      "category": plain("categories", rssCatTerm, true),
      "enclosure": plain("enclosure", null, false),
      "media:content": plain("mediacontent", null, true),
      "media:group": plain("mediagroup", null, false),
      "guid": plain("guid", rssGuid, false)
    },

    "IN_SKIPDAYS": {
      "day": plain("days", rssArrayElement, true)
    },

    "IN_SKIPHOURS": {
      "hour": plain("hours", rssArrayElement, true)
    },

    "IN_MEDIAGROUP": {
      "media:content": plain("mediacontent", null, true)
    },

    /********* RSS1 **********/
    "IN_RDF": {
      // If we hit a rss1:channel, we can verify that we have RSS1
      "rss1:channel": new FeedElementInfo("rdf_channel", "rss1"),
      "rss1:image": plain("image", null, false),
      "rss1:textinput": plain("textInput", null, false),
      "rss1:item": entryList("items")
    },

    "IN_RDF_CHANNEL": {
      "admin:generatorAgent": generatorInfo(null),
      "dc:creator": personList("authors", rssAuthor),
      "dc:author": personList("authors", rssAuthor),
      "dc:contributor": personList("contributors", rssAuthor)
    },

    /********* ATOM 1.0 **********/
    "IN_ATOM": {
      "atom:author": personList("authors", null),
      "atom:generator": generatorInfo(atomGenerator),
      "atom:contributor": personList("contributors", null),
      "atom:link": plain("links", null, true),
      "atom:logo": plain("atom:logo", atomLogo, false),
      "atom:entry": entryList("entries")
    },

    "IN_ENTRIES": {
      "atom:author": personList("authors", null),
      "atom:contributor": personList("contributors", null),
      "atom:link": plain("links", null, true)
    },

    /********* ATOM 0.3 **********/
    "IN_ATOM03": {
      "atom03:author": personList("authors", null),
      "atom03:contributor": personList("contributors", null),
      "atom03:link": plain("links", null, true),
      "atom03:entry": entryList("atom03_entries"),
      "atom03:generator": generatorInfo(atomGenerator)
    },

    "IN_ATOM03_ENTRIES": {
      "atom03:author": personList("authors", null),
      "atom03:contributor": personList("contributors", null),
      "atom03:link": plain("links", null, true),
      "atom03:entry": entryList("atom03_entries")
    }
  };
}
1224
1225 // See startElement for a long description of how feeds are processed.
1226 FeedProcessor.prototype = {
1227
1228 // Set ourselves as the SAX handler, and set the base URI
1229 _init: function FP_init(uri) {
1230 this._reader.contentHandler = this;
1231 this._reader.errorHandler = this;
1232 this._result = Cc[FR_CONTRACTID].createInstance(Ci.nsIFeedResult);
1233 if (uri) {
1234 this._result.uri = uri;
1235 this._reader.baseURI = uri;
1236 this._xmlBaseStack[0] = uri;
1237 }
1238 },
1239
1240 // This function is called once we figure out what type of feed
1241 // we're dealing with. Some feed types require digging a bit further
1242 // than the root.
1243 _docVerified: function FP_docVerified(version) {
1244 this._result.doc = Cc[FEED_CONTRACTID].createInstance(Ci.nsIFeed);
1245 this._result.doc.baseURI =
1246 this._xmlBaseStack[this._xmlBaseStack.length - 1];
1247 this._result.doc.fields = this._feed;
1248 this._result.version = version;
1249 },
1250
1251 // When we're done with the feed, let the listener know what
1252 // happened.
1253 _sendResult: function FP_sendResult() {
1254 this._haveSentResult = true;
1255 try {
1256 // Can be null when a non-feed is fed to us
1257 if (this._result.doc)
1258 this._result.doc.normalize();
1259 }
1260 catch (e) {
1261 LOG("FIXME: " + e);
1262 }
1263
1264 try {
1265 if (this.listener != null)
1266 this.listener.handleResult(this._result);
1267 }
1268 finally {
1269 this._result = null;
1270 }
1271 },
1272
1273 // Parsing functions
1274 parseFromStream: function FP_parseFromStream(stream, uri) {
1275 this._init(uri);
1276 this._reader.parseFromStream(stream, null, stream.available(),
1277 "application/xml");
1278 this._reader = null;
1279 },
1280
1281 parseFromString: function FP_parseFromString(inputString, uri) {
1282 this._init(uri);
1283 this._reader.parseFromString(inputString, "application/xml");
1284 this._reader = null;
1285 },
1286
1287 parseAsync: function FP_parseAsync(requestObserver, uri) {
1288 this._init(uri);
1289 this._reader.parseAsync(requestObserver);
1290 },
1291
1292 // nsIStreamListener
1293
1294 // The XMLReader will throw sensible exceptions if these get called
1295 // out of order.
1296 onStartRequest: function FP_onStartRequest(request, context) {
1297 // this will throw if the request is not a channel, but so will nsParser.
1298 var channel = request.QueryInterface(Ci.nsIChannel);
1299 channel.contentType = "application/vnd.mozilla.maybe.feed";
1300 this._reader.onStartRequest(request, context);
1301 },
1302
1303 onStopRequest: function FP_onStopRequest(request, context, statusCode) {
1304 try {
1305 this._reader.onStopRequest(request, context, statusCode);
1306 }
1307 finally {
1308 this._reader = null;
1309 }
1310 },
1311
1312 onDataAvailable:
1313 function FP_onDataAvailable(request, context, inputStream, offset, count) {
1314 this._reader.onDataAvailable(request, context, inputStream, offset, count);
1315 },
1316
1317 // nsISAXErrorHandler
1318
1319 // We only care about fatal errors. When this happens, we may have
1320 // parsed through the feed metadata and some number of entries. The
1321 // listener can still show some of that data if it wants, and we'll
1322 // set the bozo bit to indicate we were unable to parse all the way
1323 // through.
1324 fatalError: function FP_reportError() {
1325 this._result.bozo = true;
1326 //XXX need to QI to FeedProgressListener
1327 if (!this._haveSentResult)
1328 this._sendResult();
1329 },
1330
1331 // nsISAXContentHandler
1332
1333 startDocument: function FP_startDocument() {
1334 //LOG("----------");
1335 },
1336
1337 endDocument: function FP_endDocument() {
1338 if (!this._haveSentResult)
1339 this._sendResult();
1340 },
1341
1342 // The transitions defined above identify elements that contain more
1343 // than just text. For example RSS items contain many fields, and so
1344 // do Atom authors. The only commonly used elements that contain
1345 // mixed content are Atom Text Constructs of type="xhtml", which we
1346 // delegate to another handler for cleaning. That leaves a couple
1347 // different types of elements to deal with: those that should occur
1348 // only once, such as title elements, and those that can occur
1349 // multiple times, such as the RSS category element and the Atom
1350 // link element. Most of the RSS1/DC elements can occur multiple
1351 // times in theory, but in practice, the only ones that do have
1352 // analogues in Atom.
1353 //
1354 // Some elements are also groups of attributes or sub-elements,
1355 // while others are simple text fields. For the most part, we don't
1356 // have to pay explicit attention to the simple text elements,
1357 // unless we want to post-process the resulting string to transform
1358 // it into some richer object like a Date or URI.
1359 //
1360 // Elements that have more sophisticated content models still end up
1361 // being dictionaries, whether they are based on attributes like RSS
1362 // cloud, sub-elements like Atom author, or even items and
1363 // entries. These elements are treated as "containers". It's
1364 // theoretically possible for a container to have an attribute with
1365 // the same universal name as a sub-element, but none of the feed
1366 // formats allow this by default, and I don't know of any extension that
1367 // works this way.
1368 //
1369 startElement: function FP_startElement(uri, localName, qName, attributes) {
1370 this._buf = "";
1371 ++this._depth;
1372 var elementInfo;
1373
1374 //LOG("<" + localName + ">");
1375
1376 // Check for xml:base
1377 var base = attributes.getValueFromName(XMLNS, "base");
1378 if (base) {
1379 this._xmlBaseStack[this._depth] =
1380 strToURI(base, this._xmlBaseStack[this._xmlBaseStack.length - 1]);
1381 }
1382
1383 // To identify the element we're dealing with, we look up the
1384 // namespace URI in our gNamespaces dictionary, which will give us
1385 // a "canonical" prefix for a namespace URI. For example, this
1386 // allows Dublin Core "creator" elements to be consistently mapped
1387 // to "dc:creator", for easy field access by consumer code. This
1388 // strategy also happens to shorten up our state table.
1389 var key = this._prefixForNS(uri) + localName;
1390
1391 // Check to see if we need to hand this off to our XHTML handler.
1392 // The elements we're dealing with will look like this:
1393 //
1394 // <title type="xhtml">
1395 // <div xmlns="http://www.w3.org/1999/xhtml">
1396 // A title with <b>bold</b> and <i>italics</i>.
1397 // </div>
1398 // </title>
1399 //
1400 // When it returns in returnFromXHTMLHandler, the handler should
1401 // give us back a string like this:
1402 //
1403 // "A title with <b>bold</b> and <i>italics</i>."
1404 //
1405 // The Atom spec explicitly says the div is not part of the content,
1406 // and explicitly allows whitespace collapsing.
1407 //
1408 if ((this._result.version == "atom" || this._result.version == "atom03") &&
1409 this._textConstructs[key] != null) {
1410 var type = attributes.getValueFromName("","type");
1411 if (type != null && type.indexOf("xhtml") >= 0) {
1412 this._xhtmlHandler =
1413 new XHTMLHandler(this, (this._result.version == "atom"));
1414 this._reader.contentHandler = this._xhtmlHandler;
1415 return;
1416 }
1417 }
1418
1419 // Check our current state, and see if that state has a defined
1420 // transition. For example, this._trans["atom:entry"]["atom:author"]
1421 // will have one, and it tells us to add an item to our authors array.
1422 if (this._trans[this._state] && this._trans[this._state][key]) {
1423 elementInfo = this._trans[this._state][key];
1424 }
1425 else {
1426 // If we don't have a transition, hand off to extension handler
1427 this._extensionHandler = new ExtensionHandler(this);
1428 this._reader.contentHandler = this._extensionHandler;
1429 this._extensionHandler.startElement(uri, localName, qName, attributes);
1430 return;
1431 }
1432
1433 // This distinguishes wrappers like 'channel' from elements
1434 // we'd actually like to do something with (which will test true).
1435 this._handlerStack[this._depth] = elementInfo;
1436 if (elementInfo.isWrapper) {
1437 this._state = "IN_" + elementInfo.fieldName.toUpperCase();
1438 this._stack.push([this._feed, this._state]);
1439 }
1440 else if (elementInfo.feedVersion) {
1441 this._state = "IN_" + elementInfo.fieldName.toUpperCase();
1442
1443 // Check for the older RSS2 variants
1444 if (elementInfo.feedVersion == "rss2")
1445 elementInfo.feedVersion = this._findRSSVersion(attributes);
1446 else if (uri == RSS090NS)
1447 elementInfo.feedVersion = "rss090";
1448
1449 this._docVerified(elementInfo.feedVersion);
1450 this._stack.push([this._feed, this._state]);
1451 this._mapAttributes(this._feed, attributes);
1452 }
1453 else {
1454 this._state = this._processComplexElement(elementInfo, attributes);
1455 }
1456 },
1457
1458 // In the endElement handler, we decrement the stack and look
1459 // for cleanup/transition functions to execute. The second part
1460 // of the state transition works as above in startElement, but
1461 // the state we're looking for is prefixed with an underscore
1462 // to distinguish endElement events from startElement events.
1463 endElement: function FP_endElement(uri, localName, qName) {
1464 var elementInfo = this._handlerStack[this._depth];
1465 //LOG("</" + localName + ">");
1466 if (elementInfo && !elementInfo.isWrapper)
1467 this._closeComplexElement(elementInfo);
1468
1469 // cut down xml:base context
1470 if (this._xmlBaseStack.length == this._depth + 1)
1471 this._xmlBaseStack = this._xmlBaseStack.slice(0, this._depth);
1472
1473 // our new state is whatever is at the top of the stack now
1474 if (this._stack.length > 0)
1475 this._state = this._stack[this._stack.length - 1][1];
1476 this._handlerStack = this._handlerStack.slice(0, this._depth);
1477 --this._depth;
1478 },
1479
1480 // Buffer up character data. The buffer is cleared with every
1481 // opening element.
1482 characters: function FP_characters(data) {
1483 this._buf += data;
1484 },
1485 // TODO: It would be nice to check new prefixes here, and if they
1486 // don't conflict with the ones we've defined, throw them in a
1487 // dictionary to check.
1488 startPrefixMapping: function FP_startPrefixMapping(prefix, uri) {
1489 },
1490
1491 endPrefixMapping: function FP_endPrefixMapping(prefix) {
1492 },
1493
1494 processingInstruction: function FP_processingInstruction(target, data) {
1495 if (target == "xml-stylesheet") {
1496 var hrefAttribute = data.match(/href=[\"\'](.*?)[\"\']/);
1497 if (hrefAttribute && hrefAttribute.length == 2)
1498 this._result.stylesheet = strToURI(hrefAttribute[1], this._result.uri);
1499 }
1500 },
1501
1502 // end of nsISAXContentHandler
1503
1504 // Handle our more complicated elements--those that contain
1505 // attributes and child elements.
1506 _processComplexElement:
1507 function FP__processComplexElement(elementInfo, attributes) {
1508 var obj, key, prefix;
1509
1510 // If the container is an entry/item, it'll need to have its
1511 // more esoteric properties put in the 'fields' property bag.
1512 if (elementInfo.containerClass == Cc[ENTRY_CONTRACTID]) {
1513 obj = elementInfo.containerClass.createInstance(Ci.nsIFeedEntry);
1514 obj.baseURI = this._xmlBaseStack[this._xmlBaseStack.length - 1];
1515 this._mapAttributes(obj.fields, attributes);
1516 }
1517 else if (elementInfo.containerClass) {
1518 obj = elementInfo.containerClass.createInstance(Ci.nsIFeedElementBase);
1519 obj.baseURI = this._xmlBaseStack[this._xmlBaseStack.length - 1];
1520 obj.attributes = attributes; // just set the SAX attributes
1521 }
1522 else {
1523 obj = Cc[BAG_CONTRACTID].createInstance(Ci.nsIWritablePropertyBag2);
1524 this._mapAttributes(obj, attributes);
1525 }
1526
1527 // We should have a container/propertyBag that's had its
1528 // attributes processed. Now we need to attach it to its
1529 // container.
1530 var newProp;
1531
1532 // First we'll see what's on top of the stack.
1533 var container = this._stack[this._stack.length - 1][0];
1534
1535 // Check to see if it has the property
1536 var prop;
1537 try {
1538 prop = container.getProperty(elementInfo.fieldName);
1539 }
1540 catch(e) {
1541 }
1542
1543 if (elementInfo.isArray) {
1544 if (!prop) {
1545 container.setPropertyAsInterface(elementInfo.fieldName,
1546 Cc[ARRAY_CONTRACTID].
1547 createInstance(Ci.nsIMutableArray));
1548 }
1549
1550 newProp = container.getProperty(elementInfo.fieldName);
1551 // XXX This QI should not be necessary, but XPConnect seems to fly
1552 // off the handle in the browser, and loses track of the interface
1553 // on large files. Bug 335638.
1554 newProp.QueryInterface(Ci.nsIMutableArray);
1555 newProp.appendElement(obj,false);
1556
1557 // If new object is an nsIFeedContainer, we want to deal with
1558 // its member nsIPropertyBag instead.
1559 if (isIFeedContainer(obj))
1560 newProp = obj.fields;
1561
1562 }
1563 else {
1564 // If it doesn't, set it.
1565 if (!prop) {
1566 container.setPropertyAsInterface(elementInfo.fieldName,obj);
1567 }
1568 newProp = container.getProperty(elementInfo.fieldName);
1569 }
1570
1571 // make our new state name, and push the property onto the stack
1572 var newState = "IN_" + elementInfo.fieldName.toUpperCase();
1573 this._stack.push([newProp, newState, obj]);
1574 return newState;
1575 },
1576
1577 // Sometimes we need reconcile the element content with the object
1578 // model for a given feed. We use helper functions to do the
1579 // munging, but we need to identify array types here, so the munging
1580 // happens only to the last element of an array.
1581 _closeComplexElement: function FP__closeComplexElement(elementInfo) {
1582 var stateTuple = this._stack.pop();
1583 var container = stateTuple[0];
1584 var containerParent = stateTuple[2];
1585 var element = null;
1586 var isArray = isIArray(container);
1587
1588 // If it's an array and we have to post-process,
1589 // grab the last element
1590 if (isArray)
1591 element = container.queryElementAt(container.length - 1, Ci.nsISupports);
1592 else
1593 element = container;
1594
1595 // Run the post-processing function if there is one.
1596 if (elementInfo.closeFunc)
1597 element = elementInfo.closeFunc(this._buf, element);
1598
1599 // If an nsIFeedContainer was on top of the stack,
1600 // we need to normalize it
1601 if (elementInfo.containerClass == Cc[ENTRY_CONTRACTID])
1602 containerParent.normalize();
1603
1604 // If it's an array, re-set the last element
1605 if (isArray)
1606 container.replaceElementAt(element, container.length - 1, false);
1607 },
1608
1609 _prefixForNS: function FP_prefixForNS(uri) {
1610 if (!uri)
1611 return "";
1612 var prefix = gNamespaces[uri];
1613 if (prefix)
1614 return prefix + ":";
1615 if (uri.toLowerCase().indexOf("http://backend.userland.com") == 0)
1616 return "";
1617 else
1618 return null;
1619 },
1620
1621 _mapAttributes: function FP__mapAttributes(bag, attributes) {
1622 // Cycle through the attributes, and set our properties using the
1623 // prefix:localNames we find in our namespace dictionary.
1624 for (var i = 0; i < attributes.length; ++i) {
1625 var key = this._prefixForNS(attributes.getURI(i)) + attributes.getLocalName(i);
1626 var val = attributes.getValue(i);
1627 bag.setPropertyAsAString(key, val);
1628 }
1629 },
1630
1631 // Only for RSS2esque formats
1632 _findRSSVersion: function FP__findRSSVersion(attributes) {
1633 var versionAttr = attributes.getValueFromName("", "version").trim();
1634 var versions = { "0.91":"rss091",
1635 "0.92":"rss092",
1636 "0.93":"rss093",
1637 "0.94":"rss094" }
1638 if (versions[versionAttr])
1639 return versions[versionAttr];
1640 if (versionAttr.substr(0,2) != "2.")
1641 return "rssUnknown";
1642 return "rss2";
1643 },
1644
1645 // unknown element values are returned here. See startElement above
1646 // for how this works.
1647 returnFromExtHandler:
1648 function FP_returnExt(uri, localName, chars, attributes) {
1649 --this._depth;
1650
1651 // take control of the SAX events
1652 this._reader.contentHandler = this;
1653 if (localName == null && chars == null)
1654 return;
1655
1656 // we don't take random elements inside rdf:RDF
1657 if (this._state == "IN_RDF")
1658 return;
1659
1660 // Grab the top of the stack
1661 var top = this._stack[this._stack.length - 1];
1662 if (!top)
1663 return;
1664
1665 var container = top[0];
1666 // Grab the last element if it's an array
1667 if (isIArray(container)) {
1668 var contract = this._handlerStack[this._depth].containerClass;
1669 // check if it's something specific, but not an entry
1670 if (contract && contract != Cc[ENTRY_CONTRACTID]) {
1671 var el = container.queryElementAt(container.length - 1,
1672 Ci.nsIFeedElementBase);
1673 // XXX there must be a way to flatten these interfaces
1674 if (contract == Cc[PERSON_CONTRACTID])
1675 el.QueryInterface(Ci.nsIFeedPerson);
1676 else
1677 return; // don't know about this interface
1678
1679 var propName = localName;
1680 var prefix = gNamespaces[uri];
1681
1682 // synonyms
1683 if ((uri == "" ||
1684 prefix &&
1685 ((prefix.indexOf("atom") > -1) ||
1686 (prefix.indexOf("rss") > -1))) &&
1687 (propName == "url" || propName == "href"))
1688 propName = "uri";
1689
1690 try {
1691 if (el[propName] !== "undefined") {
1692 var propValue = chars;
1693 // convert URI-bearing values to an nsIURI
1694 if (propName == "uri") {
1695 var base = this._xmlBaseStack[this._xmlBaseStack.length - 1];
1696 propValue = strToURI(chars, base);
1697 }
1698 el[propName] = propValue;
1699 }
1700 }
1701 catch(e) {
1702 // ignore XPConnect errors
1703 }
1704 // the rest of the function deals with entry- and feed-level stuff
1705 return;
1706 }
1707 else {
1708 container = container.queryElementAt(container.length - 1,
1709 Ci.nsIWritablePropertyBag2);
1710 }
1711 }
1712
1713 // Make the buffer our new property
1714 var propName = this._prefixForNS(uri) + localName;
1715
1716 // But, it could be something containing HTML. If so,
1717 // we need to know about that.
1718 if (this._textConstructs[propName] != null &&
1719 this._handlerStack[this._depth].containerClass !== null) {
1720 var newProp = Cc[TEXTCONSTRUCT_CONTRACTID].
1721 createInstance(Ci.nsIFeedTextConstruct);
1722 newProp.text = chars;
1723 // Look up the default type in our table
1724 var type = this._textConstructs[propName];
1725 var typeAttribute = attributes.getValueFromName("","type");
1726 if (this._result.version == "atom" && typeAttribute != null) {
1727 type = typeAttribute;
1728 }
1729 else if (this._result.version == "atom03" && typeAttribute != null) {
1730 if (typeAttribute.toLowerCase().indexOf("xhtml") >= 0) {
1731 type = "xhtml";
1732 }
1733 else if (typeAttribute.toLowerCase().indexOf("html") >= 0) {
1734 type = "html";
1735 }
1736 else if (typeAttribute.toLowerCase().indexOf("text") >= 0) {
1737 type = "text";
1738 }
1739 }
1740
1741 // If it's rss feed-level description, it's not supposed to have html
1742 if (this._result.version.indexOf("rss") >= 0 &&
1743 this._handlerStack[this._depth].containerClass != ENTRY_CONTRACTID) {
1744 type = "text";
1745 }
1746 newProp.type = type;
1747 newProp.base = this._xmlBaseStack[this._xmlBaseStack.length - 1];
1748 container.setPropertyAsInterface(propName, newProp);
1749 }
1750 else {
1751 container.setPropertyAsAString(propName, chars);
1752 }
1753 },
1754
1755 // Sometimes, we'll hand off SAX handling duties to an XHTMLHandler
1756 // (see above) that will scrape out non-XHTML stuff, normalize
1757 // namespaces, and remove the wrapper div from Atom 1.0. When the
1758 // XHTMLHandler is done, it'll callback here.
1759 returnFromXHTMLHandler:
1760 function FP_returnFromXHTMLHandler(chars, uri, localName, qName) {
1761 // retake control of the SAX content events
1762 this._reader.contentHandler = this;
1763
1764 // Grab the top of the stack
1765 var top = this._stack[this._stack.length - 1];
1766 if (!top)
1767 return;
1768 var container = top[0];
1769
1770 // Assign the property
1771 var newProp = newProp = Cc[TEXTCONSTRUCT_CONTRACTID].
1772 createInstance(Ci.nsIFeedTextConstruct);
1773 newProp.text = chars;
1774 newProp.type = "xhtml";
1775 newProp.base = this._xmlBaseStack[this._xmlBaseStack.length - 1];
1776 container.setPropertyAsInterface(this._prefixForNS(uri) + localName,
1777 newProp);
1778
1779 // XHTML will cause us to peek too far. The XHTML handler will
1780 // send us an end element to call. RFC4287-valid feeds allow a
1781 // more graceful way to handle this. Unfortunately, we can't count
1782 // on compliance at this point.
1783 this.endElement(uri, localName, qName);
1784 },
1785
1786 // XPCOM stuff
1787 classID: FP_CLASSID,
1788 QueryInterface: XPCOMUtils.generateQI(
1789 [Ci.nsIFeedProcessor, Ci.nsISAXContentHandler, Ci.nsISAXErrorHandler,
1790 Ci.nsIStreamListener, Ci.nsIRequestObserver]
1791 )
1792 }
1793
// Constructors of the components this file implements; they are handed to
// XPCOMUtils to build the module's NSGetFactory entry point.
var components = [
  FeedProcessor,
  FeedResult,
  Feed,
  Entry,
  TextConstruct,
  Generator,
  Person
];

this.NSGetFactory = XPCOMUtils.generateNSGetFactory(components);

mercurial