/* -*- Mode: JavaScript; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
// The feed parser depends on FeedItem.js, Feed.js.
/**
 * Feed parser constructor.  Creates the XML serializer instance that the
 * parse methods use to turn DOM nodes back into markup strings.
 */
function FeedParser() {
  let serializerClass = Cc["@mozilla.org/xmlextras/xmlserializer;1"];
  this.mSerializer = serializerClass.createInstance(Ci.nsIDOMSerializer);
}
FeedParser.prototype =
{
// parseFeed() returns an array of parsed items ready for processing. It is
// currently a synchronous operation. If there is an error parsing the feed,
// parseFeed returns an empty feed in addition to calling aFeed.onParseError.
parseFeed: function (aFeed, aDOM)
{
if (!(aDOM instanceof Ci.nsIDOMXMLDocument))
{
// No xml doc.
return aFeed.onParseError(aFeed);
}
let doc = aDOM.documentElement;
if (doc.namespaceURI == FeedUtils.MOZ_PARSERERROR_NS)
{
// Gecko caught a basic parsing error.
let errStr = doc.firstChild.textContent + "\n" +
doc.firstElementChild.textContent;
FeedUtils.log.info("FeedParser.parseFeed: - " + errStr);
return aFeed.onParseError(aFeed);
}
else if (aDOM.querySelector("redirect"))
{
// Check for RSS2.0 redirect document.
let channel = aDOM.querySelector("redirect");
if (this.isPermanentRedirect(aFeed, channel, null, null))
return;
return aFeed.onParseError(aFeed);
}
else if (doc.namespaceURI == FeedUtils.RDF_SYNTAX_NS &&
doc.getElementsByTagNameNS(FeedUtils.RSS_NS, "channel")[0])
{
aFeed.mFeedType = "RSS_1.xRDF"
FeedUtils.log.debug("FeedParser.parseFeed: type:url - " +
aFeed.mFeedType +" : " +aFeed.url);
// aSource can be misencoded (XMLHttpRequest converts to UTF-8 by default),
// but the DOM is almost always right because it uses the hints in the
// XML file. This is slower, but not noticably so. Mozilla doesn't have
// the XMLHttpRequest.responseBody property that IE has, which provides
// access to the unencoded response.
let xmlString = this.mSerializer.serializeToString(doc);
return this.parseAsRSS1(aFeed, xmlString, aFeed.request.channel.URI);
}
else if (doc.namespaceURI == FeedUtils.ATOM_03_NS)
{
aFeed.mFeedType = "ATOM_0.3"
FeedUtils.log.debug("FeedParser.parseFeed: type:url - " +
aFeed.mFeedType +" : " +aFeed.url);
return this.parseAsAtom(aFeed, aDOM);
}
else if (doc.namespaceURI == FeedUtils.ATOM_IETF_NS)
{
aFeed.mFeedType = "ATOM_IETF"
FeedUtils.log.debug("FeedParser.parseFeed: type:url - " +
aFeed.mFeedType +" : " +aFeed.url);
return this.parseAsAtomIETF(aFeed, aDOM);
}
else if (doc.getElementsByTagNameNS(FeedUtils.RSS_090_NS, "channel")[0])
{
aFeed.mFeedType = "RSS_0.90"
FeedUtils.log.debug("FeedParser.parseFeed: type:url - " +
aFeed.mFeedType +" : " +aFeed.url);
return this.parseAsRSS2(aFeed, aDOM);
}
else
{
// Parse as RSS 0.9x. In theory even RSS 1.0 feeds could be parsed by
// the 0.9x parser if the RSS namespace were the default.
let rssVer = doc.localName == "rss" ? doc.getAttribute("version") : null;
if (rssVer)
aFeed.mFeedType = "RSS_" + rssVer;
else
aFeed.mFeedType = "RSS_0.9x?";
FeedUtils.log.debug("FeedParser.parseFeed: type:url - " +
aFeed.mFeedType +" : " +aFeed.url);
return this.parseAsRSS2(aFeed, aDOM);
}
},
parseAsRSS2: function (aFeed, aDOM)
{
// Get the first channel (assuming there is only one per RSS File).
let parsedItems = new Array();
let channel = aDOM.querySelector("channel");
if (!channel)
return aFeed.onParseError(aFeed);
// Usually the empty string, unless this is RSS .90.
let nsURI = channel.namespaceURI || "";
FeedUtils.log.debug("FeedParser.parseAsRSS2: channel nsURI - " + nsURI);
if (this.isPermanentRedirect(aFeed, null, channel, null))
return;
let tags = this.childrenByTagNameNS(channel, nsURI, "title");
aFeed.title = aFeed.title || this.getNodeValue(tags ? tags[0] : null);
tags = this.childrenByTagNameNS(channel, nsURI, "description");
aFeed.description = this.getNodeValueFormatted(tags ? tags[0] : null);
tags = this.childrenByTagNameNS(channel, nsURI, "link");
aFeed.link = this.validLink(this.getNodeValue(tags ? tags[0] : null));
if (!(aFeed.title || aFeed.description) || !aFeed.link)
{
FeedUtils.log.error("FeedParser.parseAsRSS2: missing mandatory element " +
"
and , or ");
return aFeed.onParseError(aFeed);
}
if (!aFeed.parseItems)
return parsedItems;
aFeed.invalidateItems();
// XXX use getElementsByTagNameNS for now; childrenByTagNameNS would be
// better, but RSS .90 is still with us.
let itemNodes = aDOM.getElementsByTagNameNS(nsURI, "item");
itemNodes = itemNodes ? itemNodes : [];
FeedUtils.log.debug("FeedParser.parseAsRSS2: items to parse - " +
itemNodes.length);
for (let itemNode of itemNodes)
{
if (!itemNode.childElementCount)
continue;
let item = new FeedItem();
item.feed = aFeed;
item.enclosures = [];
item.keywords = [];
tags = this.childrenByTagNameNS(itemNode, FeedUtils.FEEDBURNER_NS, "origLink");
let link = this.validLink(this.getNodeValue(tags ? tags[0] : null));
if (!link)
{
tags = this.childrenByTagNameNS(itemNode, nsURI, "link");
link = this.validLink(this.getNodeValue(tags ? tags[0] : null));
}
tags = this.childrenByTagNameNS(itemNode, nsURI, "guid");
let guidNode = tags ? tags[0] : null;
let guid;
let isPermaLink = false;
if (guidNode)
{
guid = this.getNodeValue(guidNode);
// isPermaLink is true if the value is "true" or if the attribute is
// not present; all other values, including "false" and "False" and
// for that matter "TRuE" and "meatcake" are false.
if (!guidNode.hasAttribute("isPermaLink") ||
guidNode.getAttribute("isPermaLink") == "true")
isPermaLink = true;
// If attribute isPermaLink is missing, it is good to check the validity
// of value as an URL to avoid linking to non-URL strings.
if (!guidNode.hasAttribute("isPermaLink"))
{
try
{
Services.io.newURI(guid, null, null);
if (Services.io.extractScheme(guid) == "tag")
isPermaLink = false;
}
catch (ex)
{
isPermaLink = false;
}
}
item.id = guid;
}
let guidLink = this.validLink(guid);
item.url = isPermaLink && guidLink ? guidLink : link ? link : null;
tags = this.childrenByTagNameNS(itemNode, nsURI, "description");
item.description = this.getNodeValueFormatted(tags ? tags[0] : null);
tags = this.childrenByTagNameNS(itemNode, nsURI, "title");
item.title = this.getNodeValue(tags ? tags[0] : null);
if (!(item.title || item.description))
{
FeedUtils.log.info("FeedParser.parseAsRSS2: missing mandatory " +
"element, either or ; skipping");
continue;
}
if (!item.id)
{
// At this point, if there is no guid, uniqueness cannot be guaranteed
// by any of link or date (optional) or title (optional unless there
// is no description). Use a big chunk of description; minimize dupes
// with url and title if present.
item.id = (item.url || item.feed.url) + "#" + item.title + "#" +
(this.stripTags(item.description ?
item.description.substr(0, 150) : null) ||
item.title);
item.id = item.id.replace(/[\n\r\t\s]+/g, " ");
}
// Escape html entities in , which are unescaped as textContent
// values. If the title is used as content, it will remain escaped; if
// it is used as the title, it will be unescaped upon store. Bug 1240603.
// The tag must follow escaping examples found in
// http://www.rssboard.org/rss-encoding-examples, i.e. single escape angle
// brackets for tags, which are removed if used as title, and double
// escape entities for presentation in title.
// Better: always use . Best: use Atom.
if (!item.title)
item.title = this.stripTags(item.description).substr(0, 150);
else
item.title = item.htmlEscape(item.title);
tags = this.childrenByTagNameNS(itemNode, nsURI, "author");
if (!tags)
tags = this.childrenByTagNameNS(itemNode, FeedUtils.DC_NS, "creator");
let author = this.getNodeValue(tags ? tags[0] : null) ||
aFeed.title;
author = this.cleanAuthorName(author);
item.author = author ? ["<" + author + ">"] : item.author;
tags = this.childrenByTagNameNS(itemNode, nsURI, "pubDate");
if (!tags || !this.getNodeValue(tags[0]))
tags = this.childrenByTagNameNS(itemNode, FeedUtils.DC_NS, "date");
item.date = this.getNodeValue(tags ? tags[0] : null) || item.date;
// If the date is invalid, users will see the beginning of the epoch
// unless we reset it here, so they'll see the current time instead.
// This is typical aggregator behavior.
if (item.date)
{
item.date = item.date.trim();
if (!FeedUtils.isValidRFC822Date(item.date))
{
// XXX Use this on the other formats as well.
item.date = this.dateRescue(item.date);
}
}
tags = this.childrenByTagNameNS(itemNode, FeedUtils.RSS_CONTENT_NS, "encoded");
item.content = this.getNodeValueFormatted(tags ? tags[0] : null);
// Handle and , which may be in a
// (if present).
tags = this.childrenByTagNameNS(itemNode, nsURI, "enclosure");
let encUrls = [];
if (tags)
for (let tag of tags)
{
let url = this.validLink(tag.getAttribute("url"));
if (url && encUrls.indexOf(url) == -1)
{
let type = this.removeUnprintableASCII(tag.getAttribute("type"));
let length = this.removeUnprintableASCII(tag.getAttribute("length"));
item.enclosures.push(new FeedEnclosure(url, type, length));
encUrls.push(url);
}
}
tags = itemNode.getElementsByTagNameNS(FeedUtils.MRSS_NS, "content");
if (tags)
for (let tag of tags)
{
let url = this.validLink(tag.getAttribute("url"));
if (url && encUrls.indexOf(url) == -1)
{
let type = this.removeUnprintableASCII(tag.getAttribute("type"));
let fileSize = this.removeUnprintableASCII(tag.getAttribute("fileSize"));
item.enclosures.push(new FeedEnclosure(url, type, fileSize));
}
}
// The tag has no specification, especially regarding
// whether more than one tag is allowed and, if so, how tags would
// relate to previously declared (and well specified) enclosure urls.
// The common usage is to include 1 origEnclosureLink, in addition to
// the specified enclosure tags for 1 enclosure. Thus, we will replace the
// first enclosure's, if found, url with the first
// url only or else add the url.
tags = this.childrenByTagNameNS(itemNode, FeedUtils.FEEDBURNER_NS, "origEnclosureLink");
let origEncUrl = this.validLink(this.getNodeValue(tags ? tags[0] : null));
if (origEncUrl)
{
if (item.enclosures.length)
item.enclosures[0].mURL = origEncUrl;
else
item.enclosures.push(new FeedEnclosure(origEncUrl));
}
// Support and autotagging.
tags = this.childrenByTagNameNS(itemNode, nsURI, "category");
if (tags)
{
for (let tag of tags)
{
let term = this.getNodeValue(tag);
term = term ? this.xmlUnescape(term.replace(/,/g, ";")) : null;
if (term && item.keywords.indexOf(term) == -1)
item.keywords.push(term);
}
}
parsedItems.push(item);
}
return parsedItems;
},
parseAsRSS1 : function(aFeed, aSource, aBaseURI)
{
let parsedItems = new Array();
// RSS 1.0 is valid RDF, so use the RDF parser/service to extract data.
// Create a new RDF data source and parse the feed into it.
let ds = Cc["@mozilla.org/rdf/datasource;1?name=in-memory-datasource"].
createInstance(Ci.nsIRDFDataSource);
let rdfparser = Cc["@mozilla.org/rdf/xml-parser;1"].
createInstance(Ci.nsIRDFXMLParser);
rdfparser.parseString(ds, aBaseURI, aSource);
// Get information about the feed as a whole.
let channel = ds.GetSource(FeedUtils.RDF_TYPE, FeedUtils.RSS_CHANNEL, true);
if (!channel)
return aFeed.onParseError(aFeed);
if (this.isPermanentRedirect(aFeed, null, channel, ds))
return;
aFeed.title = aFeed.title ||
this.getRDFTargetValue(ds, channel, FeedUtils.RSS_TITLE) ||
aFeed.url;
aFeed.description = this.getRDFTargetValueFormatted(ds, channel, FeedUtils.RSS_DESCRIPTION) ||
"";
aFeed.link = this.validLink(this.getRDFTargetValue(ds, channel, FeedUtils.RSS_LINK)) ||
aFeed.url;
if (!(aFeed.title || aFeed.description) || !aFeed.link)
{
FeedUtils.log.error("FeedParser.parseAsRSS1: missing mandatory element " +
" and , or ");
return aFeed.onParseError(aFeed);
}
if (!aFeed.parseItems)
return parsedItems;
aFeed.invalidateItems();
// Ignore the list and just get the s.
let items = ds.GetSources(FeedUtils.RDF_TYPE, FeedUtils.RSS_ITEM, true);
let index = 0;
while (items.hasMoreElements())
{
let itemResource = items.getNext().QueryInterface(Ci.nsIRDFResource);
let item = new FeedItem();
item.feed = aFeed;
// Prefer the value of the link tag to the item URI since the URI could be
// a relative URN.
let uri = itemResource.ValueUTF8;
let link = this.validLink(this.getRDFTargetValue(ds, itemResource, FeedUtils.RSS_LINK));
item.url = link || uri;
item.description = this.getRDFTargetValueFormatted(ds, itemResource,
FeedUtils.RSS_DESCRIPTION);
item.title = this.getRDFTargetValue(ds, itemResource, FeedUtils.RSS_TITLE) ||
this.getRDFTargetValue(ds, itemResource, FeedUtils.DC_SUBJECT) ||
(item.description ?
(this.stripTags(item.description).substr(0, 150)) : null);
if (!item.url || !item.title)
{
FeedUtils.log.info("FeedParser.parseAsRSS1: missing mandatory " +
"element and , or and " +
"no ; skipping");
continue;
}
item.id = item.url;
item.url = this.validLink(item.url);
let author = this.getRDFTargetValue(ds, itemResource, FeedUtils.DC_CREATOR) ||
this.getRDFTargetValue(ds, channel, FeedUtils.DC_CREATOR) ||
aFeed.title;
author = this.cleanAuthorName(author);
item.author = author ? ["<" + author + ">"] : item.author;
item.date = this.getRDFTargetValue(ds, itemResource, FeedUtils.DC_DATE) ||
item.date;
item.content = this.getRDFTargetValueFormatted(ds, itemResource,
FeedUtils.RSS_CONTENT_ENCODED);
parsedItems[index++] = item;
}
FeedUtils.log.debug("FeedParser.parseAsRSS1: items parsed - " + index);
return parsedItems;
},
parseAsAtom: function(aFeed, aDOM)
{
let parsedItems = new Array();
// Get the first channel (assuming there is only one per Atom File).
let channel = aDOM.querySelector("feed");
if (!channel)
return aFeed.onParseError(aFeed);
if (this.isPermanentRedirect(aFeed, null, channel, null))
return;
let tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_03_NS, "title");
aFeed.title = aFeed.title ||
this.stripTags(this.getNodeValue(tags ? tags[0] : null));
tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_03_NS, "tagline");
aFeed.description = this.getNodeValueFormatted(tags ? tags[0] : null);
tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_03_NS, "link");
aFeed.link = this.validLink(this.findAtomLink("alternate", tags));
if (!aFeed.title)
{
FeedUtils.log.error("FeedParser.parseAsAtom: missing mandatory element " +
"");
return aFeed.onParseError(aFeed);
}
if (!aFeed.parseItems)
return parsedItems;
aFeed.invalidateItems();
let items = this.childrenByTagNameNS(channel, FeedUtils.ATOM_03_NS, "entry");
items = items ? items : [];
FeedUtils.log.debug("FeedParser.parseAsAtom: items to parse - " +
items.length);
for (let itemNode of items)
{
if (!itemNode.childElementCount)
continue;
let item = new FeedItem();
item.feed = aFeed;
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "link");
item.url = this.validLink(this.findAtomLink("alternate", tags));
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "id");
item.id = this.getNodeValue(tags ? tags[0] : null);
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "summary");
item.description = this.getNodeValueFormatted(tags ? tags[0] : null);
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "title");
item.title = this.getNodeValue(tags ? tags[0] : null) ||
(item.description ? item.description.substr(0, 150) : null);
if (!item.title || !item.id)
{
// We're lenient about other mandatory tags, but insist on these.
FeedUtils.log.info("FeedParser.parseAsAtom: missing mandatory " +
"element , or and no ; skipping");
continue;
}
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "author");
if (!tags)
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "contributor");
if (!tags)
tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_03_NS, "author");
let authorEl = tags ? tags[0] : null;
let author = "";
if (authorEl)
{
tags = this.childrenByTagNameNS(authorEl, FeedUtils.ATOM_03_NS, "name");
let name = this.getNodeValue(tags ? tags[0] : null);
tags = this.childrenByTagNameNS(authorEl, FeedUtils.ATOM_03_NS, "email");
let email = this.getNodeValue(tags ? tags[0] : null);
if (name)
author = name + (email ? " <" + email + ">" : "");
else if (email)
author = email;
}
item.author = author || item.author || aFeed.title;
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "modified");
if (!tags || !this.getNodeValue(tags[0]))
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "issued");
if (!tags || !this.getNodeValue(tags[0]))
tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_03_NS, "created");
item.date = this.getNodeValue(tags ? tags[0] : null) || item.date;
// XXX We should get the xml:base attribute from the content tag as well
// and use it as the base HREF of the message.
// XXX Atom feeds can have multiple content elements; we should differentiate
// between them and pick the best one.
// Some Atom feeds wrap the content in a CTYPE declaration; others use
// a namespace to identify the tags as HTML; and a few are buggy and put
// HTML tags in without declaring their namespace so they look like Atom.
// We deal with the first two but not the third.
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_03_NS, "content");
let contentNode = tags ? tags[0] : null;
let content;
if (contentNode)
{
content = "";
for (let j = 0; j < contentNode.childNodes.length; j++)
{
let node = contentNode.childNodes.item(j);
if (node.nodeType == node.CDATA_SECTION_NODE)
content += node.data;
else
content += this.mSerializer.serializeToString(node);
}
if (contentNode.getAttribute("mode") == "escaped")
{
content = content.replace(/</g, "<");
content = content.replace(/>/g, ">");
content = content.replace(/&/g, "&");
}
if (content == "")
content = null;
}
item.content = content;
parsedItems.push(item);
}
return parsedItems;
},
parseAsAtomIETF: function(aFeed, aDOM)
{
let parsedItems = new Array();
// Get the first channel (assuming there is only one per Atom File).
let channel = this.childrenByTagNameNS(aDOM, FeedUtils.ATOM_IETF_NS, "feed")[0];
if (!channel)
return aFeed.onParseError(aFeed);
if (this.isPermanentRedirect(aFeed, null, channel, null))
return;
let tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_IETF_NS, "title");
aFeed.title = aFeed.title ||
this.stripTags(this.serializeTextConstruct(tags ? tags[0] : null));
tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_IETF_NS, "subtitle");
aFeed.description = this.serializeTextConstruct(tags ? tags[0] : null);
tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_IETF_NS, "link");
aFeed.link = this.findAtomLink("alternate", tags);
aFeed.link = this.validLink(aFeed.link);
if (!aFeed.title)
{
FeedUtils.log.error("FeedParser.parseAsAtomIETF: missing mandatory element " +
"");
return aFeed.onParseError(aFeed);
}
if (!aFeed.parseItems)
return parsedItems;
aFeed.invalidateItems();
let items = this.childrenByTagNameNS(channel, FeedUtils.ATOM_IETF_NS, "entry");
items = items ? items : [];
FeedUtils.log.debug("FeedParser.parseAsAtomIETF: items to parse - " +
items.length);
for (let itemNode of items)
{
if (!itemNode.childElementCount)
continue;
let item = new FeedItem();
item.feed = aFeed;
item.enclosures = [];
item.keywords = [];
tags = this.childrenByTagNameNS(itemNode, FeedUtils.FEEDBURNER_NS, "origLink");
item.url = this.validLink(this.getNodeValue(tags ? tags[0] : null));
if (!item.url)
{
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "link");
item.url = this.validLink(this.findAtomLink("alternate", tags)) ||
aFeed.link;
}
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "id");
item.id = this.getNodeValue(tags ? tags[0] : null);
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "summary");
item.description = this.serializeTextConstruct(tags ? tags[0] : null);
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "title");
item.title = this.stripTags(this.serializeTextConstruct(tags ? tags[0] : null) ||
(item.description ?
item.description.substr(0, 150) : null));
if (!item.title || !item.id)
{
// We're lenient about other mandatory tags, but insist on these.
FeedUtils.log.info("FeedParser.parseAsAtomIETF: missing mandatory " +
"element , or and no ; skipping");
continue;
}
// Support multiple authors.
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "source");
let source = tags ? tags[0] : null;
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "author");
if (!tags)
tags = this.childrenByTagNameNS(source, FeedUtils.ATOM_IETF_NS, "author");
if (!tags)
tags = this.childrenByTagNameNS(channel, FeedUtils.ATOM_IETF_NS, "author");
let authorTags = tags || [];
let authors = [];
for (let authorTag of authorTags) {
let author = "";
tags = this.childrenByTagNameNS(authorTag, FeedUtils.ATOM_IETF_NS, "name");
let name = this.getNodeValue(tags ? tags[0] : null);
tags = this.childrenByTagNameNS(authorTag, FeedUtils.ATOM_IETF_NS, "email");
let email = this.getNodeValue(tags ? tags[0] : null);
if (name) {
name = this.cleanAuthorName(name);
if (email) {
if (!email.match(/^<.*>$/)) {
email = " <" + email + ">";
}
author = name + email;
} else {
author = "<" + name + ">";
}
} else if (email) {
author = email;
}
if (author) {
authors.push(author);
}
}
if (authors.length == 0) {
tags = this.childrenByTagNameNS(channel, FeedUtils.DC_NS, "publisher");
let author = this.getNodeValue(tags ? tags[0] : null) ||
aFeed.title;
author = this.cleanAuthorName(author);
item.author = author ? ["<" + author + ">"] : item.author;
} else {
item.author = authors;
}
FeedUtils.log.trace("FeedParser.parseAsAtomIETF: author(s) - " + item.author);
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "updated");
if (!tags || !this.getNodeValue(tags[0]))
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "published");
if (!tags || !this.getNodeValue(tags[0]))
tags = this.childrenByTagNameNS(source, FeedUtils.ATOM_IETF_NS, "published");
item.date = this.getNodeValue(tags ? tags[0] : null) || item.date;
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "content");
item.content = this.serializeTextConstruct(tags ? tags[0] : null);
if (item.content)
item.xmlContentBase = tags ? tags[0].baseURI : null;
else if (item.description)
{
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "summary");
item.xmlContentBase = tags ? tags[0].baseURI : null;
}
else
item.xmlContentBase = itemNode.baseURI;
item.xmlContentBase = this.validLink(item.xmlContentBase);
// Handle (if present).
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "link");
let encUrls = [];
if (tags)
for (let tag of tags)
{
let url = tag.getAttribute("rel") == "enclosure" ?
(tag.getAttribute("href") || "").trim() : null;
url = this.validLink(url);
if (url && encUrls.indexOf(url) == -1)
{
let type = this.removeUnprintableASCII(tag.getAttribute("type"));
let length = this.removeUnprintableASCII(tag.getAttribute("length"));
let title = this.removeUnprintableASCII(tag.getAttribute("title"));
item.enclosures.push(new FeedEnclosure(url, type, length, title));
encUrls.push(url);
}
}
tags = this.childrenByTagNameNS(itemNode, FeedUtils.FEEDBURNER_NS, "origEnclosureLink");
let origEncUrl = this.validLink(this.getNodeValue(tags ? tags[0] : null));
if (origEncUrl)
{
if (item.enclosures.length)
item.enclosures[0].mURL = origEncUrl;
else
item.enclosures.push(new FeedEnclosure(origEncUrl));
}
// Handle atom threading extension, RFC4685. There may be 1 or more tags,
// and each must contain a ref attribute with 1 Message-Id equivalent
// value. This is the only attr of interest in the spec for presentation.
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_THREAD_NS, "in-reply-to");
if (tags)
{
for (let tag of tags)
{
let ref = this.removeUnprintableASCII(tag.getAttribute("ref"));
if (ref)
item.inReplyTo += item.normalizeMessageID(ref) + " ";
}
item.inReplyTo = item.inReplyTo.trimRight();
}
// Support and autotagging.
tags = this.childrenByTagNameNS(itemNode, FeedUtils.ATOM_IETF_NS, "category");
if (tags)
{
for (let tag of tags)
{
let term = this.removeUnprintableASCII(tag.getAttribute("term"));
term = term ? this.xmlUnescape(term.replace(/,/g, ";")).trim() : null;
if (term && item.keywords.indexOf(term) == -1)
item.keywords.push(term);
}
}
parsedItems.push(item);
}
return parsedItems;
},
isPermanentRedirect: function(aFeed, aRedirDocChannel, aFeedChannel, aDS)
{
// If subscribing to a new feed, do not check redirect tags.
if (!aFeed.downloadCallback || aFeed.downloadCallback.mSubscribeMode)
return false;
let tags, tagName, newUrl;
let oldUrl = aFeed.url;
// Check for RSS2.0 redirect document tag.
if (aRedirDocChannel)
{
tagName = "newLocation";
tags = this.childrenByTagNameNS(aRedirDocChannel, "", tagName);
newUrl = this.getNodeValue(tags ? tags[0] : null);
}
// Check for tag.
if (aFeedChannel)
{
tagName = "new-feed-url";
if (aDS)
{
tags = FeedUtils.rdf.GetResource(FeedUtils.ITUNES_NS + tagName);
newUrl = this.getRDFTargetValue(aDS, aFeedChannel, tags);
}
else
{
tags = this.childrenByTagNameNS(aFeedChannel, FeedUtils.ITUNES_NS, tagName);
newUrl = this.getNodeValue(tags ? tags[0] : null);
}
tagName = "itunes:" + tagName;
}
if (newUrl && newUrl != oldUrl && FeedUtils.isValidScheme(newUrl) &&
FeedUtils.changeUrlForFeed(aFeed, newUrl))
{
FeedUtils.log.info("FeedParser.isPermanentRedirect: found <" + tagName +
"> tag; updated feed url from: " + oldUrl + " to: " + newUrl +
" in folder: " + FeedUtils.getFolderPrettyPath(aFeed.folder));
aFeed.onUrlChange(aFeed, oldUrl);
return true;
}
return false;
},
serializeTextConstruct: function(textElement)
{
let content = "";
if (textElement)
{
let textType = textElement.getAttribute("type");
// Atom spec says consider it "text" if not present.
if (!textType)
textType = "text";
// There could be some strange content type we don't handle.
if (textType != "text" && textType != "html" && textType != "xhtml")
return null;
for (let j = 0; j < textElement.childNodes.length; j++)
{
let node = textElement.childNodes.item(j);
if (node.nodeType == node.CDATA_SECTION_NODE)
content += this.xmlEscape(node.data);
else
content += this.mSerializer.serializeToString(node);
}
if (textType == "html")
content = this.xmlUnescape(content);
content = content.trim();
}
// Other parts of the code depend on this being null if there's no content.
return content ? content : null;
},
/**
* Return a cleaned up author name value.
*
* @param {String} authorString - A string.
* @returns {String} - A clean string value.
*/
cleanAuthorName(authorString) {
if (!authorString) {
return "";
}
FeedUtils.log.trace("FeedParser.cleanAuthor: author1 - " + authorString);
let author = authorString.replace(/[\n\r\t]+/g, " ")
.replace(/"/g, '\\"')
.trim();
// If the name contains special chars, quote it.
if (author.match(/[<>@,"]/)) {
author = '"' + author + '"';
}
FeedUtils.log.trace("FeedParser.cleanAuthor: author2 - " + author);
return author;
},
getRDFTargetValue: function(ds, source, property)
{
let nodeValue = this.getRDFTargetValueRaw(ds, source, property);
if (!nodeValue)
return null;
nodeValue = nodeValue.replace(/[\n\r\t]+/g, " ");
return this.removeUnprintableASCII(nodeValue);
},
getRDFTargetValueFormatted: function(ds, source, property)
{
let nodeValue = this.getRDFTargetValueRaw(ds, source, property);
if (!nodeValue)
return null;
return this.removeUnprintableASCIIexCRLFTAB(nodeValue);
},
getRDFTargetValueRaw: function(ds, source, property)
{
let node = ds.GetTarget(source, property, true);
if (node)
{
try
{
node = node.QueryInterface(Ci.nsIRDFLiteral);
if (node)
return node.Value.trim();
}
catch (e)
{
// If the RDF was bogus, do nothing. Rethrow if it's some other problem.
if (!((e instanceof Ci.nsIXPCException) &&
e.result == Cr.NS_ERROR_NO_INTERFACE))
throw new Error("FeedParser.getRDFTargetValue: " + e);
}
}
return null;
},
/**
* Return a cleaned up node value. This is intended for values that are not
* multiline and not formatted. A sequence of tab or newline is converted to
* a space and unprintable ascii is removed.
*
* @param {Node} node - A DOM node.
* @return {String} - A clean string value or null.
*/
getNodeValue: function(node)
{
let nodeValue = this.getNodeValueRaw(node);
if (!nodeValue)
return null;
nodeValue = nodeValue.replace(/[\n\r\t]+/g, " ");
return this.removeUnprintableASCII(nodeValue);
},
/**
* Return a cleaned up formatted node value, meaning CR/LF/TAB are retained
* while all other unprintable ascii is removed. This is intended for values
* that are multiline and formatted, such as content or description tags.
*
* @param {Node} node - A DOM node.
* @return {String} - A clean string value or null.
*/
getNodeValueFormatted: function(node)
{
let nodeValue = this.getNodeValueRaw(node);
if (!nodeValue)
return null;
return this.removeUnprintableASCIIexCRLFTAB(nodeValue);
},
/**
* Return a raw node value, as received. This should be sanitized as
* appropriate.
*
* @param {Node} node - A DOM node.
* @return {String} - A string value or null.
*/
getNodeValueRaw: function(node)
{
if (node && node.textContent)
return node.textContent.trim();
if (node && node.firstChild)
{
let ret = "";
for (let child = node.firstChild; child; child = child.nextSibling)
{
let value = this.getNodeValueRaw(child);
if (value)
ret += value;
}
if (ret)
return ret.trim();
}
return null;
},
// Finds elements that are direct children of the first arg.
childrenByTagNameNS: function(aElement, aNamespace, aTagName)
{
if (!aElement)
return null;
let matches = aElement.getElementsByTagNameNS(aNamespace, aTagName);
let matchingChildren = new Array();
for (let match of matches)
{
if (match.parentNode == aElement)
matchingChildren.push(match)
}
return matchingChildren.length ? matchingChildren : null;
},
/**
* Ensure type tags start with http[s]://, ftp:// or magnet:
* for values stored in mail headers (content-base and remote enclosures),
* particularly to prevent data: uris, javascript, and other spoofing.
*
* @param {String} link - An intended http url string.
* @return {String} - A clean string starting with http, ftp or magnet,
* else null.
*/
validLink: function(link)
{
if (/^((https?|ftp):\/\/|magnet:)/.test(link))
return this.removeUnprintableASCII(link.trim());
return null;
},
findAtomLink: function(linkRel, linkElements)
{
if (!linkElements)
return null;
// XXX Need to check for MIME type and hreflang.
for (let alink of linkElements) {
if (alink &&
// If there's a link rel.
((alink.getAttribute("rel") && alink.getAttribute("rel") == linkRel) ||
// If there isn't, assume 'alternate'.
(!alink.getAttribute("rel") && (linkRel == "alternate"))) &&
alink.getAttribute("href"))
{
// Atom links are interpreted relative to xml:base.
try {
return Services.io.newURI(alink.baseURI, null, null).
resolve(alink.getAttribute("href"));
}
catch (ex) {}
}
}
return null;
},
/**
* Remove unprintable ascii, particularly CR/LF, for non formatted tag values.
*
* @param {String} s - String to clean.
* @return {String}
*/
removeUnprintableASCII: function(s)
{
return s ? s.replace(/[\x00-\x1F\x7F]+/g, "") : "";
},
/**
* Remove unprintable ascii, except CR/LF/TAB, for formatted tag values.
*
* @param {String} s - String to clean.
* @return {String}
*/
removeUnprintableASCIIexCRLFTAB: function(s)
{
return s ? s.replace(/[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]+/g, "") : "";
},
stripTags: function(someHTML)
{
return someHTML ? someHTML.replace(/<[^>]+>/g, "") : someHTML;
},
xmlUnescape: function(s)
{
s = s.replace(/</g, "<");
s = s.replace(/>/g, ">");
s = s.replace(/&/g, "&");
return s;
},
xmlEscape: function(s)
{
s = s.replace(/&/g, "&");
s = s.replace(/>/g, ">");
s = s.replace(/= 0 && yeardiff < 3)
// It's quite likely the correct date.
return d.toString();
}
// Could be an ISO8601/W3C date. If not, get the current time.
return FeedUtils.getValidRFC5322Date(dateString);
}
};