michael@0: /* michael@0: * Copyright (c) 2010 Arc90 Inc michael@0: * michael@0: * Licensed under the Apache License, Version 2.0 (the "License"); michael@0: * you may not use this file except in compliance with the License. michael@0: * You may obtain a copy of the License at michael@0: * michael@0: * http://www.apache.org/licenses/LICENSE-2.0 michael@0: * michael@0: * Unless required by applicable law or agreed to in writing, software michael@0: * distributed under the License is distributed on an "AS IS" BASIS, michael@0: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. michael@0: * See the License for the specific language governing permissions and michael@0: * limitations under the License. michael@0: */ michael@0: michael@0: /* michael@0: * This code is heavily based on Arc90's readability.js (1.7.1) script michael@0: * available at: http://code.google.com/p/arc90labs-readability michael@0: */ michael@0: michael@0: var Readability = function(uri, doc) { michael@0: const ENABLE_LOGGING = false; michael@0: michael@0: this._uri = uri; michael@0: this._doc = doc; michael@0: this._biggestFrame = false; michael@0: this._articleByline = null; michael@0: this._articleDir = null; michael@0: michael@0: // Start with all flags set michael@0: this._flags = this.FLAG_STRIP_UNLIKELYS | michael@0: this.FLAG_WEIGHT_CLASSES | michael@0: this.FLAG_CLEAN_CONDITIONALLY; michael@0: michael@0: // The list of pages we've parsed in this call of readability, michael@0: // for autopaging. As a key store for easier searching. michael@0: this._parsedPages = {}; michael@0: michael@0: // A list of the ETag headers of pages we've parsed, in case they happen to match, michael@0: // we'll know it's a duplicate. michael@0: this._pageETags = {}; michael@0: michael@0: // Make an AJAX request for each page and append it to the document. michael@0: this._curPageNum = 1; michael@0: michael@0: // Control whether log messages are sent to the console michael@0: if (ENABLE_LOGGING) { michael@0: this.log = function (msg) { michael@0: dump("Reader: (Readability) " + msg); michael@0: }; michael@0: } else { michael@0: this.log = function () {}; michael@0: } michael@0: } michael@0: michael@0: Readability.prototype = { michael@0: FLAG_STRIP_UNLIKELYS: 0x1, michael@0: FLAG_WEIGHT_CLASSES: 0x2, michael@0: FLAG_CLEAN_CONDITIONALLY: 0x4, michael@0: michael@0: // The number of top candidates to consider when analysing how michael@0: // tight the competition is among candidates. michael@0: N_TOP_CANDIDATES: 5, michael@0: michael@0: // The maximum number of pages to loop through before we call michael@0: // it quits and just show a link. michael@0: MAX_PAGES: 5, michael@0: michael@0: // All of the regular expressions in use within readability. michael@0: // Defined up here so we don't instantiate them repeatedly in loops. 
michael@0: REGEXPS: { michael@0: unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i, michael@0: okMaybeItsACandidate: /and|article|body|column|main|shadow/i, michael@0: positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, michael@0: negative: /hidden|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i, michael@0: extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, michael@0: byline: /byline|author|dateline|writtenby/i, michael@0: replaceFonts: /<(\/?)font[^>]*>/gi, michael@0: trim: /^\s+|\s+$/g, michael@0: normalize: /\s{2,}/g, michael@0: videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, michael@0: nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, michael@0: prevLink: /(prev|earl|old|new|<|«)/i, michael@0: whitespace: /^\s*$/ michael@0: }, michael@0: michael@0: DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ], michael@0: michael@0: /** michael@0: * Run any post-process modifications to article content as necessary. michael@0: * michael@0: * @param Element michael@0: * @return void michael@0: **/ michael@0: _postProcessContent: function(articleContent) { michael@0: // Readability cannot open relative uris so we convert them to absolute uris. michael@0: this._fixRelativeUris(articleContent); michael@0: }, michael@0: michael@0: /** michael@0: * Converts each and uri in the given element to an absolute URI. michael@0: * michael@0: * @param Element michael@0: * @return void michael@0: */ michael@0: _fixRelativeUris: function(articleContent) { michael@0: let scheme = this._uri.scheme; michael@0: let prePath = this._uri.prePath; michael@0: let pathBase = this._uri.pathBase; michael@0: michael@0: function toAbsoluteURI(uri) { michael@0: // If this is already an absolute URI, return it. michael@0: if (/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/.test(uri)) michael@0: return uri; michael@0: michael@0: // Scheme-rooted relative URI. michael@0: if (uri.substr(0, 2) == "//") michael@0: return scheme + "://" + uri.substr(2); michael@0: michael@0: // Prepath-rooted relative URI. michael@0: if (uri[0] == "/") michael@0: return prePath + uri; michael@0: michael@0: // Standard relative URI; add entire path. pathBase already includes a michael@0: // trailing "/". michael@0: return pathBase + uri; michael@0: } michael@0: michael@0: function convertRelativeURIs(tagName, propName) { michael@0: let elems = articleContent.getElementsByTagName(tagName); michael@0: for (let i = elems.length; --i >= 0;) { michael@0: let elem = elems[i]; michael@0: let relativeURI = elem.getAttribute(propName); michael@0: if (relativeURI != null) michael@0: elems[i].setAttribute(propName, toAbsoluteURI(relativeURI)); michael@0: } michael@0: } michael@0: michael@0: // Fix links. michael@0: convertRelativeURIs("a", "href"); michael@0: michael@0: // Fix images. michael@0: convertRelativeURIs("img", "src"); michael@0: }, michael@0: michael@0: /** michael@0: * Get the article title as an H1. 
michael@0: * michael@0: * @return void michael@0: **/ michael@0: _getArticleTitle: function() { michael@0: let doc = this._doc; michael@0: let curTitle = ""; michael@0: let origTitle = ""; michael@0: michael@0: try { michael@0: curTitle = origTitle = doc.title; michael@0: michael@0: // If they had an element with id "title" in their HTML michael@0: if (typeof curTitle !== "string") michael@0: curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]); michael@0: } catch(e) {} michael@0: michael@0: if (curTitle.match(/ [\|\-] /)) { michael@0: curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); michael@0: michael@0: if (curTitle.split(' ').length < 3) michael@0: curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); michael@0: } else if (curTitle.indexOf(': ') !== -1) { michael@0: curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); michael@0: michael@0: if (curTitle.split(' ').length < 3) michael@0: curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); michael@0: } else if (curTitle.length > 150 || curTitle.length < 15) { michael@0: let hOnes = doc.getElementsByTagName('h1'); michael@0: michael@0: if (hOnes.length === 1) michael@0: curTitle = this._getInnerText(hOnes[0]); michael@0: } michael@0: michael@0: curTitle = curTitle.replace(this.REGEXPS.trim, ""); michael@0: michael@0: if (curTitle.split(' ').length <= 4) michael@0: curTitle = origTitle; michael@0: michael@0: return curTitle; michael@0: }, michael@0: michael@0: /** michael@0: * Prepare the HTML document for readability to scrape it. michael@0: * This includes things like stripping javascript, CSS, and handling terrible markup. michael@0: * michael@0: * @return void michael@0: **/ michael@0: _prepDocument: function() { michael@0: let doc = this._doc; michael@0: michael@0: // In some cases a body element can't be found (if the HTML is michael@0: // totally hosed for example) so we create a new body node and michael@0: // append it to the document. michael@0: if (doc.body === null) { michael@0: let body = doc.createElement("body"); michael@0: michael@0: try { michael@0: doc.body = body; michael@0: } catch(e) { michael@0: doc.documentElement.appendChild(body); michael@0: this.log(e); michael@0: } michael@0: } michael@0: michael@0: // Remove all style tags in head michael@0: let styleTags = doc.getElementsByTagName("style"); michael@0: for (let st = 0; st < styleTags.length; st += 1) { michael@0: styleTags[st].textContent = ""; michael@0: } michael@0: michael@0: this._replaceBrs(doc.body); michael@0: michael@0: let fonts = doc.getElementsByTagName("FONT"); michael@0: for (let i = fonts.length; --i >=0;) { michael@0: this._setNodeTag(fonts[i], "SPAN"); michael@0: } michael@0: }, michael@0: michael@0: /** michael@0: * Finds the next element, starting from the given node, and ignoring michael@0: * whitespace in between. If the given node is an element, the same node is michael@0: * returned. michael@0: */ michael@0: _nextElement: function (node) { michael@0: let next = node; michael@0: while (next michael@0: && (next.nodeType != Node.ELEMENT_NODE) michael@0: && this.REGEXPS.whitespace.test(next.textContent)) { michael@0: next = next.nextSibling; michael@0: } michael@0: return next; michael@0: }, michael@0: michael@0: /** michael@0: * Replaces 2 or more successive
<br> elements with a single <p>.
   * Whitespace between <br> elements is ignored. For example:
   *   <div>foo<br>bar<br> <br><br>abc</div>
   * will become:
   *   <div>foo<br>bar<p>abc</p></div>
   */
  _replaceBrs: function (elem) {
    let brs = elem.getElementsByTagName("br");
    for (let i = 0; i < brs.length; i++) {
      let br = brs[i];
      let next = br.nextSibling;

      // Whether 2 or more <br> elements have been found and replaced with a
      // <p> block.
      let replaced = false;

      // If we find a <br> chain, remove the <br>s until we hit another element
      // or non-whitespace. This leaves behind the first <br> in the chain
      // (which will be replaced with a <p> later).
      while ((next = this._nextElement(next)) && (next.tagName == "BR")) {
        replaced = true;
        let sibling = next.nextSibling;
        next.parentNode.removeChild(next);
        next = sibling;
      }

      // If we removed a <br> chain, replace the remaining <br> with a <p>. Add
      // all sibling nodes as children of the <p> until we hit another <br>
      // chain.
      if (replaced) {
        let p = this._doc.createElement("p");
        br.parentNode.replaceChild(p, br);

        next = p.nextSibling;
        while (next) {
          // If we've hit another <br><br>, we're done adding children to this <p>.
          if (next.tagName == "BR") {
            let nextElem = this._nextElement(next);
            if (nextElem && nextElem.tagName == "BR")
              break;
          }

          // Otherwise, make this node a child of the new <p>.
          let sibling = next.nextSibling;
          p.appendChild(next);
          next = sibling;
        }
      }
    }
  },

  _setNodeTag: function (node, tag) {
    node.localName = tag.toLowerCase();
    node.tagName = tag.toUpperCase();
  },

  /**
   * Prepare the article node for display. Clean out any inline styles,
   * iframes, forms, strip extraneous <p> tags, etc.
   *
   * @param Element
   * @return void
   **/
  _prepArticle: function(articleContent) {
    this._cleanStyles(articleContent);

    // Clean out junk from the article content
    this._cleanConditionally(articleContent, "form");
    this._clean(articleContent, "object");
    this._clean(articleContent, "h1");

    // If there is only one h2, they are probably using it as a header
    // and not a subheader, so remove it since we already have a header.
    if (articleContent.getElementsByTagName('h2').length === 1)
      this._clean(articleContent, "h2");

    this._clean(articleContent, "iframe");
    this._cleanHeaders(articleContent);

    // Do these last as the previous stuff may have removed junk
    // that will affect these
    this._cleanConditionally(articleContent, "table");
    this._cleanConditionally(articleContent, "ul");
    this._cleanConditionally(articleContent, "div");

    // Remove extra paragraphs
    let articleParagraphs = articleContent.getElementsByTagName('p');
    for (let i = articleParagraphs.length - 1; i >= 0; i -= 1) {
      let imgCount = articleParagraphs[i].getElementsByTagName('img').length;
      let embedCount = articleParagraphs[i].getElementsByTagName('embed').length;
      let objectCount = articleParagraphs[i].getElementsByTagName('object').length;

      if (imgCount === 0 &&
          embedCount === 0 &&
          objectCount === 0 &&
          this._getInnerText(articleParagraphs[i], false) === '')
        articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
    }

    let brs = articleContent.getElementsByTagName("BR");
    for (let i = brs.length; --i >= 0;) {
      let br = brs[i];
      let next = this._nextElement(br.nextSibling);
      if (next && next.tagName == "P")
        br.parentNode.removeChild(br);
    }
  },

  /**
   * Initialize a node with the readability object. Also checks the
   * className/id for special names to add to its score.
michael@0: * michael@0: * @param Element michael@0: * @return void michael@0: **/ michael@0: _initializeNode: function(node) { michael@0: node.readability = {"contentScore": 0}; michael@0: michael@0: switch(node.tagName) { michael@0: case 'DIV': michael@0: node.readability.contentScore += 5; michael@0: break; michael@0: michael@0: case 'PRE': michael@0: case 'TD': michael@0: case 'BLOCKQUOTE': michael@0: node.readability.contentScore += 3; michael@0: break; michael@0: michael@0: case 'ADDRESS': michael@0: case 'OL': michael@0: case 'UL': michael@0: case 'DL': michael@0: case 'DD': michael@0: case 'DT': michael@0: case 'LI': michael@0: case 'FORM': michael@0: node.readability.contentScore -= 3; michael@0: break; michael@0: michael@0: case 'H1': michael@0: case 'H2': michael@0: case 'H3': michael@0: case 'H4': michael@0: case 'H5': michael@0: case 'H6': michael@0: case 'TH': michael@0: node.readability.contentScore -= 5; michael@0: break; michael@0: } michael@0: michael@0: node.readability.contentScore += this._getClassWeight(node); michael@0: }, michael@0: michael@0: /*** michael@0: * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is michael@0: * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. michael@0: * michael@0: * @param page a document to run upon. Needs to be a full document, complete with body. michael@0: * @return Element michael@0: **/ michael@0: _grabArticle: function (page) { michael@0: let doc = this._doc; michael@0: let isPaging = (page !== null ? true: false); michael@0: page = page ? page : this._doc.body; michael@0: let pageCacheHtml = page.innerHTML; michael@0: michael@0: // Check if any "dir" is set on the toplevel document element michael@0: this._articleDir = doc.documentElement.getAttribute("dir"); michael@0: michael@0: while (true) { michael@0: let stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); michael@0: let allElements = page.getElementsByTagName('*'); michael@0: michael@0: // First, node prepping. Trash nodes that look cruddy (like ones with the michael@0: // class name "comment", etc), and turn divs into P tags where they have been michael@0: // used inappropriately (as in, where they contain no other block level elements.) michael@0: // michael@0: // Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 michael@0: // TODO: Shouldn't this be a reverse traversal? michael@0: let node = null; michael@0: let nodesToScore = []; michael@0: michael@0: // Let each node know its index in the allElements array. michael@0: for (let i = allElements.length; --i >= 0;) { michael@0: allElements[i]._index = i; michael@0: } michael@0: michael@0: /** michael@0: * JSDOMParser returns static node lists, not live ones. When we remove michael@0: * an element from the document, we need to manually remove it - and all michael@0: * of its children - from the allElements array. 
michael@0: */ michael@0: function purgeNode(node) { michael@0: for (let i = node.childNodes.length; --i >= 0;) { michael@0: purgeNode(node.childNodes[i]); michael@0: } michael@0: if (node._index !== undefined && allElements[node._index] == node) michael@0: delete allElements[node._index]; michael@0: } michael@0: michael@0: for (let nodeIndex = 0; nodeIndex < allElements.length; nodeIndex++) { michael@0: if (!(node = allElements[nodeIndex])) michael@0: continue; michael@0: michael@0: let matchString = node.className + node.id; michael@0: if (matchString.search(this.REGEXPS.byline) !== -1 && !this._articleByline) { michael@0: this._articleByline = node.textContent; michael@0: node.parentNode.removeChild(node); michael@0: purgeNode(node); michael@0: continue; michael@0: } michael@0: michael@0: // Remove unlikely candidates michael@0: if (stripUnlikelyCandidates) { michael@0: if (matchString.search(this.REGEXPS.unlikelyCandidates) !== -1 && michael@0: matchString.search(this.REGEXPS.okMaybeItsACandidate) === -1 && michael@0: node.tagName !== "BODY") { michael@0: this.log("Removing unlikely candidate - " + matchString); michael@0: node.parentNode.removeChild(node); michael@0: purgeNode(node); michael@0: continue; michael@0: } michael@0: } michael@0: michael@0: if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") michael@0: nodesToScore[nodesToScore.length] = node; michael@0: michael@0: // Turn all divs that don't have children block level elements into p's michael@0: if (node.tagName === "DIV") { michael@0: // Sites like http://mobile.slate.com encloses each paragraph with a DIV michael@0: // element. DIVs with only a P element inside and no text content can be michael@0: // safely converted into plain P elements to avoid confusing the scoring michael@0: // algorithm with DIVs with are, in practice, paragraphs. michael@0: let pIndex = this._getSinglePIndexInsideDiv(node); michael@0: michael@0: if (pIndex >= 0 || !this._hasChildBlockElement(node)) { michael@0: if (pIndex >= 0) { michael@0: let newNode = node.childNodes[pIndex]; michael@0: node.parentNode.replaceChild(newNode, node); michael@0: purgeNode(node); michael@0: } else { michael@0: this._setNodeTag(node, "P"); michael@0: nodesToScore[nodesToScore.length] = node; michael@0: } michael@0: } else { michael@0: // EXPERIMENTAL michael@0: for (let i = 0, il = node.childNodes.length; i < il; i += 1) { michael@0: let childNode = node.childNodes[i]; michael@0: if (!childNode) michael@0: continue; michael@0: michael@0: if (childNode.nodeType === 3) { // Node.TEXT_NODE michael@0: let p = doc.createElement('p'); michael@0: p.textContent = childNode.textContent; michael@0: p.style.display = 'inline'; michael@0: p.className = 'readability-styled'; michael@0: childNode.parentNode.replaceChild(p, childNode); michael@0: } michael@0: } michael@0: } michael@0: } michael@0: } michael@0: michael@0: /** michael@0: * Loop through all paragraphs, and assign a score to them based on how content-y they look. michael@0: * Then add their score to their parent node. michael@0: * michael@0: * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. michael@0: **/ michael@0: let candidates = []; michael@0: for (let pt = 0; pt < nodesToScore.length; pt += 1) { michael@0: let parentNode = nodesToScore[pt].parentNode; michael@0: let grandParentNode = parentNode ? 
parentNode.parentNode : null; michael@0: let innerText = this._getInnerText(nodesToScore[pt]); michael@0: michael@0: if (!parentNode || typeof(parentNode.tagName) === 'undefined') michael@0: continue; michael@0: michael@0: // If this paragraph is less than 25 characters, don't even count it. michael@0: if (innerText.length < 25) michael@0: continue; michael@0: michael@0: // Initialize readability data for the parent. michael@0: if (typeof parentNode.readability === 'undefined') { michael@0: this._initializeNode(parentNode); michael@0: candidates.push(parentNode); michael@0: } michael@0: michael@0: // Initialize readability data for the grandparent. michael@0: if (grandParentNode && michael@0: typeof(grandParentNode.readability) === 'undefined' && michael@0: typeof(grandParentNode.tagName) !== 'undefined') { michael@0: this._initializeNode(grandParentNode); michael@0: candidates.push(grandParentNode); michael@0: } michael@0: michael@0: let contentScore = 0; michael@0: michael@0: // Add a point for the paragraph itself as a base. michael@0: contentScore += 1; michael@0: michael@0: // Add points for any commas within this paragraph. michael@0: contentScore += innerText.split(',').length; michael@0: michael@0: // For every 100 characters in this paragraph, add another point. Up to 3 points. michael@0: contentScore += Math.min(Math.floor(innerText.length / 100), 3); michael@0: michael@0: // Add the score to the parent. The grandparent gets half. michael@0: parentNode.readability.contentScore += contentScore; michael@0: michael@0: if (grandParentNode) michael@0: grandParentNode.readability.contentScore += contentScore / 2; michael@0: } michael@0: michael@0: // After we've calculated scores, loop through all of the possible michael@0: // candidate nodes we found and find the one with the highest score. michael@0: let topCandidates = []; michael@0: for (let c = 0, cl = candidates.length; c < cl; c += 1) { michael@0: let candidate = candidates[c]; michael@0: michael@0: // Scale the final candidates score based on link density. Good content michael@0: // should have a relatively small link density (5% or less) and be mostly michael@0: // unaffected by this operation. michael@0: let candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate)); michael@0: candidate.readability.contentScore = candidateScore; michael@0: michael@0: this.log('Candidate: ' + candidate + " (" + candidate.className + ":" + michael@0: candidate.id + ") with score " + candidateScore); michael@0: michael@0: for (let t = 0; t < this.N_TOP_CANDIDATES; t++) { michael@0: let aTopCandidate = topCandidates[t]; michael@0: michael@0: if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) { michael@0: topCandidates.splice(t, 0, candidate); michael@0: if (topCandidates.length > this.N_TOP_CANDIDATES) michael@0: topCandidates.pop(); michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: michael@0: let topCandidate = topCandidates[0] || null; michael@0: let lastTopCandidate = (topCandidates.length > 3 ? topCandidates[topCandidates.length - 1] : null); michael@0: michael@0: // If we still have no top candidate, just use the body as a last resort. michael@0: // We also have to copy the body node so it is something we can modify. 
michael@0: if (topCandidate === null || topCandidate.tagName === "BODY") { michael@0: // Move all of the page's children into topCandidate michael@0: topCandidate = doc.createElement("DIV"); michael@0: let children = page.childNodes; michael@0: for (let i = 0; i < children.length; ++i) { michael@0: topCandidate.appendChild(children[i]); michael@0: } michael@0: michael@0: page.appendChild(topCandidate); michael@0: michael@0: this._initializeNode(topCandidate); michael@0: } michael@0: michael@0: // Now that we have the top candidate, look through its siblings for content michael@0: // that might also be related. Things like preambles, content split by ads michael@0: // that we removed, etc. michael@0: let articleContent = doc.createElement("DIV"); michael@0: if (isPaging) michael@0: articleContent.id = "readability-content"; michael@0: michael@0: let siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); michael@0: let siblingNodes = topCandidate.parentNode.childNodes; michael@0: michael@0: for (let s = 0, sl = siblingNodes.length; s < sl; s += 1) { michael@0: let siblingNode = siblingNodes[s]; michael@0: let append = false; michael@0: michael@0: this.log("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : '')); michael@0: this.log("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown')); michael@0: michael@0: if (siblingNode === topCandidate) michael@0: append = true; michael@0: michael@0: let contentBonus = 0; michael@0: michael@0: // Give a bonus if sibling nodes and top candidates have the example same classname michael@0: if (siblingNode.className === topCandidate.className && topCandidate.className !== "") michael@0: contentBonus += topCandidate.readability.contentScore * 0.2; michael@0: michael@0: if (typeof siblingNode.readability !== 'undefined' && michael@0: (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold) michael@0: append = true; michael@0: michael@0: if (siblingNode.nodeName === "P") { michael@0: let linkDensity = this._getLinkDensity(siblingNode); michael@0: let nodeContent = this._getInnerText(siblingNode); michael@0: let nodeLength = nodeContent.length; michael@0: michael@0: if (nodeLength > 80 && linkDensity < 0.25) { michael@0: append = true; michael@0: } else if (nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) { michael@0: append = true; michael@0: } michael@0: } michael@0: michael@0: if (append) { michael@0: this.log("Appending node: " + siblingNode); michael@0: michael@0: // siblingNodes is a reference to the childNodes array, and michael@0: // siblingNode is removed from the array when we call appendChild() michael@0: // below. As a result, we must revisit this index since the nodes michael@0: // have been shifted. michael@0: s -= 1; michael@0: sl -= 1; michael@0: michael@0: if (siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") { michael@0: // We have a node that isn't a common block level element, like a form or td tag. michael@0: // Turn it into a div so it doesn't get filtered out later by accident. 
*/ michael@0: this.log("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); michael@0: michael@0: this._setNodeTag(siblingNode, "DIV"); michael@0: } michael@0: michael@0: // To ensure a node does not interfere with readability styles, michael@0: // remove its classnames. michael@0: siblingNode.className = ""; michael@0: michael@0: // Append sibling and subtract from our list because it removes michael@0: // the node when you append to another node. michael@0: articleContent.appendChild(siblingNode); michael@0: } michael@0: } michael@0: michael@0: // So we have all of the content that we need. Now we clean it up for presentation. michael@0: this._prepArticle(articleContent); michael@0: michael@0: if (this._curPageNum === 1) { michael@0: let div = doc.createElement("DIV"); michael@0: div.id = "readability-page-1"; michael@0: div.className = "page"; michael@0: let children = articleContent.childNodes; michael@0: for (let i = 0; i < children.length; ++i) { michael@0: div.appendChild(children[i]); michael@0: } michael@0: articleContent.appendChild(div); michael@0: } michael@0: michael@0: // Now that we've gone through the full algorithm, check to see if michael@0: // we got any meaningful content. If we didn't, we may need to re-run michael@0: // grabArticle with different flags set. This gives us a higher likelihood of michael@0: // finding the content, and the sieve approach gives us a higher likelihood of michael@0: // finding the -right- content. michael@0: if (this._getInnerText(articleContent, true).length < 500) { michael@0: page.innerHTML = pageCacheHtml; michael@0: michael@0: if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) { michael@0: this._removeFlag(this.FLAG_STRIP_UNLIKELYS); michael@0: } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) { michael@0: this._removeFlag(this.FLAG_WEIGHT_CLASSES); michael@0: } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) { michael@0: this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY); michael@0: } else { michael@0: return null; michael@0: } michael@0: } else { michael@0: if (lastTopCandidate !== null) { michael@0: // EXPERIMENTAL: Contrast ratio is how we measure the level of competition between candidates in the michael@0: // readability algorithm. This is to avoid offering reader mode on pages that are more like michael@0: // a list or directory of links with summaries. It takes the score of the last top candidate michael@0: // (see N_TOP_CANDIDATES) and checks how it compares to the top candidate's. On pages that are not michael@0: // actual articles, there will likely be many candidates with similar score (i.e. higher contrast ratio). michael@0: let contrastRatio = lastTopCandidate.readability.contentScore / topCandidate.readability.contentScore; michael@0: if (contrastRatio > 0.45) michael@0: return null; michael@0: } michael@0: michael@0: return articleContent; michael@0: } michael@0: } michael@0: }, michael@0: michael@0: /** michael@0: * Attempts to get the excerpt from these michael@0: * sources in the following order: michael@0: * - meta description tag michael@0: * - open-graph description michael@0: * - twitter cards description michael@0: * - article's first paragraph michael@0: * If no excerpt is found, an empty string will be michael@0: * returned. 
michael@0: * michael@0: * @param Element - root element of the processed version page michael@0: * @return String - excerpt of the article michael@0: **/ michael@0: _getExcerpt: function(articleContent) { michael@0: let values = {}; michael@0: let metaElements = this._doc.getElementsByTagName("meta"); michael@0: michael@0: // Match "description", or Twitter's "twitter:description" (Cards) michael@0: // in name attribute. michael@0: let namePattern = /^\s*((twitter)\s*:\s*)?description\s*$/gi; michael@0: michael@0: // Match Facebook's og:description (Open Graph) in property attribute. michael@0: let propertyPattern = /^\s*og\s*:\s*description\s*$/gi; michael@0: michael@0: // Find description tags. michael@0: for (let i = 0; i < metaElements.length; i++) { michael@0: let element = metaElements[i]; michael@0: let elementName = element.getAttribute("name"); michael@0: let elementProperty = element.getAttribute("property"); michael@0: michael@0: let name; michael@0: if (namePattern.test(elementName)) { michael@0: name = elementName; michael@0: } else if (propertyPattern.test(elementProperty)) { michael@0: name = elementProperty; michael@0: } michael@0: michael@0: if (name) { michael@0: let content = element.getAttribute("content"); michael@0: if (content) { michael@0: // Convert to lowercase and remove any whitespace michael@0: // so we can match below. michael@0: name = name.toLowerCase().replace(/\s/g, ''); michael@0: values[name] = content.trim(); michael@0: } michael@0: } michael@0: } michael@0: michael@0: if ("description" in values) { michael@0: return values["description"]; michael@0: } michael@0: michael@0: if ("og:description" in values) { michael@0: // Use facebook open graph description. michael@0: return values["og:description"]; michael@0: } michael@0: michael@0: if ("twitter:description" in values) { michael@0: // Use twitter cards description. michael@0: return values["twitter:description"]; michael@0: } michael@0: michael@0: // No description meta tags, use the article's first paragraph. michael@0: let paragraphs = articleContent.getElementsByTagName("p"); michael@0: if (paragraphs.length > 0) { michael@0: return paragraphs[0].textContent; michael@0: } michael@0: michael@0: return ""; michael@0: }, michael@0: michael@0: /** michael@0: * Removes script tags from the document. michael@0: * michael@0: * @param Element michael@0: **/ michael@0: _removeScripts: function(doc) { michael@0: let scripts = doc.getElementsByTagName('script'); michael@0: for (let i = scripts.length - 1; i >= 0; i -= 1) { michael@0: scripts[i].nodeValue=""; michael@0: scripts[i].removeAttribute('src'); michael@0: michael@0: if (scripts[i].parentNode) michael@0: scripts[i].parentNode.removeChild(scripts[i]); michael@0: } michael@0: }, michael@0: michael@0: /** michael@0: * Get child index of the only P element inside a DIV with no michael@0: * text content. Returns -1 if the DIV node contains non-empty michael@0: * text nodes or if it contains other element nodes. 
michael@0: * michael@0: * @param Element michael@0: **/ michael@0: _getSinglePIndexInsideDiv: function(e) { michael@0: let childNodes = e.childNodes; michael@0: let pIndex = -1; michael@0: michael@0: for (let i = childNodes.length; --i >= 0;) { michael@0: let node = childNodes[i]; michael@0: michael@0: if (node.nodeType === Node.ELEMENT_NODE) { michael@0: if (node.tagName !== "P") michael@0: return -1; michael@0: michael@0: if (pIndex >= 0) michael@0: return -1; michael@0: michael@0: pIndex = i; michael@0: } else if (node.nodeType == Node.TEXT_NODE && this._getInnerText(node, false)) { michael@0: return -1; michael@0: } michael@0: } michael@0: michael@0: return pIndex; michael@0: }, michael@0: michael@0: /** michael@0: * Determine whether element has any children block level elements. michael@0: * michael@0: * @param Element michael@0: */ michael@0: _hasChildBlockElement: function (e) { michael@0: let length = e.childNodes.length; michael@0: for (let i = 0; i < length; i++) { michael@0: let child = e.childNodes[i]; michael@0: if (child.nodeType != 1) michael@0: continue; michael@0: michael@0: if (this.DIV_TO_P_ELEMS.indexOf(child.tagName) !== -1 || this._hasChildBlockElement(child)) michael@0: return true; michael@0: } michael@0: return false; michael@0: }, michael@0: michael@0: /** michael@0: * Get the inner text of a node - cross browser compatibly. michael@0: * This also strips out any excess whitespace to be found. michael@0: * michael@0: * @param Element michael@0: * @return string michael@0: **/ michael@0: _getInnerText: function(e, normalizeSpaces) { michael@0: let textContent = e.textContent.replace(this.REGEXPS.trim, ""); michael@0: normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces; michael@0: michael@0: if (normalizeSpaces) { michael@0: return textContent.replace(this.REGEXPS.normalize, " "); michael@0: } else { michael@0: return textContent; michael@0: } michael@0: }, michael@0: michael@0: /** michael@0: * Get the number of times a string s appears in the node e. michael@0: * michael@0: * @param Element michael@0: * @param string - what to split on. Default is "," michael@0: * @return number (integer) michael@0: **/ michael@0: _getCharCount: function(e,s) { michael@0: s = s || ","; michael@0: return this._getInnerText(e).split(s).length - 1; michael@0: }, michael@0: michael@0: /** michael@0: * Remove the style attribute on every e and under. michael@0: * TODO: Test if getElementsByTagName(*) is faster. michael@0: * michael@0: * @param Element michael@0: * @return void michael@0: **/ michael@0: _cleanStyles: function(e) { michael@0: e = e || this._doc; michael@0: let cur = e.firstChild; michael@0: michael@0: if (!e) michael@0: return; michael@0: michael@0: // Remove any root styles, if we're able. michael@0: if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') michael@0: e.removeAttribute('style'); michael@0: michael@0: // Go until there are no more child nodes michael@0: while (cur !== null) { michael@0: if (cur.nodeType === 1) { michael@0: // Remove style attribute(s) : michael@0: if (cur.className !== "readability-styled") michael@0: cur.removeAttribute("style"); michael@0: michael@0: this._cleanStyles(cur); michael@0: } michael@0: michael@0: cur = cur.nextSibling; michael@0: } michael@0: }, michael@0: michael@0: /** michael@0: * Get the density of links as a percentage of the content michael@0: * This is the amount of text that is inside a link divided by the total text in the node. 
michael@0: * michael@0: * @param Element michael@0: * @return number (float) michael@0: **/ michael@0: _getLinkDensity: function(e) { michael@0: let links = e.getElementsByTagName("a"); michael@0: let textLength = this._getInnerText(e).length; michael@0: let linkLength = 0; michael@0: michael@0: for (let i = 0, il = links.length; i < il; i += 1) { michael@0: linkLength += this._getInnerText(links[i]).length; michael@0: } michael@0: michael@0: return linkLength / textLength; michael@0: }, michael@0: michael@0: /** michael@0: * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. michael@0: * michael@0: * @author Dan Lacy michael@0: * @return string the base url michael@0: **/ michael@0: _findBaseUrl: function() { michael@0: let uri = this._uri; michael@0: let noUrlParams = uri.path.split("?")[0]; michael@0: let urlSlashes = noUrlParams.split("/").reverse(); michael@0: let cleanedSegments = []; michael@0: let possibleType = ""; michael@0: michael@0: for (let i = 0, slashLen = urlSlashes.length; i < slashLen; i += 1) { michael@0: let segment = urlSlashes[i]; michael@0: michael@0: // Split off and save anything that looks like a file type. michael@0: if (segment.indexOf(".") !== -1) { michael@0: possibleType = segment.split(".")[1]; michael@0: michael@0: // If the type isn't alpha-only, it's probably not actually a file extension. michael@0: if (!possibleType.match(/[^a-zA-Z]/)) michael@0: segment = segment.split(".")[0]; michael@0: } michael@0: michael@0: // EW-CMS specific segment replacement. Ugly. michael@0: // Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html michael@0: if (segment.indexOf(',00') !== -1) michael@0: segment = segment.replace(',00', ''); michael@0: michael@0: // If our first or second segment has anything looking like a page number, remove it. michael@0: if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) michael@0: segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, ""); michael@0: michael@0: let del = false; michael@0: michael@0: // If this is purely a number, and it's the first or second segment, michael@0: // it's probably a page number. Remove it. michael@0: if (i < 2 && segment.match(/^\d{1,2}$/)) michael@0: del = true; michael@0: michael@0: // If this is the first segment and it's just "index", remove it. michael@0: if (i === 0 && segment.toLowerCase() === "index") michael@0: del = true; michael@0: michael@0: // If our first or second segment is smaller than 3 characters, michael@0: // and the first segment was purely alphas, remove it. michael@0: if (i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) michael@0: del = true; michael@0: michael@0: // If it's not marked for deletion, push it to cleanedSegments. michael@0: if (!del) michael@0: cleanedSegments.push(segment); michael@0: } michael@0: michael@0: // This is our final, cleaned, base article URL. michael@0: return uri.scheme + "://" + uri.host + cleanedSegments.reverse().join("/"); michael@0: }, michael@0: michael@0: /** michael@0: * Look for any paging links that may occur within the document. 
michael@0: * michael@0: * @param body michael@0: * @return object (array) michael@0: **/ michael@0: _findNextPageLink: function(elem) { michael@0: let uri = this._uri; michael@0: let possiblePages = {}; michael@0: let allLinks = elem.getElementsByTagName('a'); michael@0: let articleBaseUrl = this._findBaseUrl(); michael@0: michael@0: // Loop through all links, looking for hints that they may be next-page links. michael@0: // Things like having "page" in their textContent, className or id, or being a child michael@0: // of a node with a page-y className or id. michael@0: // michael@0: // Also possible: levenshtein distance? longest common subsequence? michael@0: // michael@0: // After we do that, assign each page a score, and michael@0: for (let i = 0, il = allLinks.length; i < il; i += 1) { michael@0: let link = allLinks[i]; michael@0: let linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ''); michael@0: michael@0: // If we've already seen this page, ignore it. michael@0: if (linkHref === "" || michael@0: linkHref === articleBaseUrl || michael@0: linkHref === uri.spec || michael@0: linkHref in this._parsedPages) { michael@0: continue; michael@0: } michael@0: michael@0: // If it's on a different domain, skip it. michael@0: if (uri.host !== linkHref.split(/\/+/g)[1]) michael@0: continue; michael@0: michael@0: let linkText = this._getInnerText(link); michael@0: michael@0: // If the linkText looks like it's not the next page, skip it. michael@0: if (linkText.match(this.REGEXPS.extraneous) || linkText.length > 25) michael@0: continue; michael@0: michael@0: // If the leftovers of the URL after removing the base URL don't contain michael@0: // any digits, it's certainly not a next page link. michael@0: let linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); michael@0: if (!linkHrefLeftover.match(/\d/)) michael@0: continue; michael@0: michael@0: if (!(linkHref in possiblePages)) { michael@0: possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref}; michael@0: } else { michael@0: possiblePages[linkHref].linkText += ' | ' + linkText; michael@0: } michael@0: michael@0: let linkObj = possiblePages[linkHref]; michael@0: michael@0: // If the articleBaseUrl isn't part of this URL, penalize this link. It could michael@0: // still be the link, but the odds are lower. michael@0: // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html michael@0: if (linkHref.indexOf(articleBaseUrl) !== 0) michael@0: linkObj.score -= 25; michael@0: michael@0: let linkData = linkText + ' ' + link.className + ' ' + link.id; michael@0: if (linkData.match(this.REGEXPS.nextLink)) michael@0: linkObj.score += 50; michael@0: michael@0: if (linkData.match(/pag(e|ing|inat)/i)) michael@0: linkObj.score += 25; michael@0: michael@0: if (linkData.match(/(first|last)/i)) { michael@0: // -65 is enough to negate any bonuses gotten from a > or » in the text, michael@0: // If we already matched on "next", last is probably fine. michael@0: // If we didn't, then it's bad. Penalize. 
michael@0: if (!linkObj.linkText.match(this.REGEXPS.nextLink)) michael@0: linkObj.score -= 65; michael@0: } michael@0: michael@0: if (linkData.match(this.REGEXPS.negative) || linkData.match(this.REGEXPS.extraneous)) michael@0: linkObj.score -= 50; michael@0: michael@0: if (linkData.match(this.REGEXPS.prevLink)) michael@0: linkObj.score -= 200; michael@0: michael@0: // If a parentNode contains page or paging or paginat michael@0: let parentNode = link.parentNode; michael@0: let positiveNodeMatch = false; michael@0: let negativeNodeMatch = false; michael@0: michael@0: while (parentNode) { michael@0: let parentNodeClassAndId = parentNode.className + ' ' + parentNode.id; michael@0: michael@0: if (!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) { michael@0: positiveNodeMatch = true; michael@0: linkObj.score += 25; michael@0: } michael@0: michael@0: if (!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(this.REGEXPS.negative)) { michael@0: // If this is just something like "footer", give it a negative. michael@0: // If it's something like "body-and-footer", leave it be. michael@0: if (!parentNodeClassAndId.match(this.REGEXPS.positive)) { michael@0: linkObj.score -= 25; michael@0: negativeNodeMatch = true; michael@0: } michael@0: } michael@0: michael@0: parentNode = parentNode.parentNode; michael@0: } michael@0: michael@0: // If the URL looks like it has paging in it, add to the score. michael@0: // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 michael@0: if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) michael@0: linkObj.score += 25; michael@0: michael@0: // If the URL contains negative values, give a slight decrease. michael@0: if (linkHref.match(this.REGEXPS.extraneous)) michael@0: linkObj.score -= 15; michael@0: michael@0: /** michael@0: * Minor punishment to anything that doesn't match our current URL. michael@0: * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points. michael@0: * Dan, can you show me a counterexample where this is necessary? michael@0: * if (linkHref.indexOf(window.location.href) !== 0) { michael@0: * linkObj.score -= 1; michael@0: * } michael@0: **/ michael@0: michael@0: // If the link text can be parsed as a number, give it a minor bonus, with a slight michael@0: // bias towards lower numbered pages. This is so that pages that might not have 'next' michael@0: // in their text can still get scored, and sorted properly by score. michael@0: let linkTextAsNumber = parseInt(linkText, 10); michael@0: if (linkTextAsNumber) { michael@0: // Punish 1 since we're either already there, or it's probably michael@0: // before what we want anyways. michael@0: if (linkTextAsNumber === 1) { michael@0: linkObj.score -= 10; michael@0: } else { michael@0: linkObj.score += Math.max(0, 10 - linkTextAsNumber); michael@0: } michael@0: } michael@0: } michael@0: michael@0: // Loop thrugh all of our possible pages from above and find our top michael@0: // candidate for the next page URL. Require at least a score of 50, which michael@0: // is a relatively high confidence that this page is the next link. 
michael@0: let topPage = null; michael@0: for (let page in possiblePages) { michael@0: if (possiblePages.hasOwnProperty(page)) { michael@0: if (possiblePages[page].score >= 50 && michael@0: (!topPage || topPage.score < possiblePages[page].score)) michael@0: topPage = possiblePages[page]; michael@0: } michael@0: } michael@0: michael@0: if (topPage) { michael@0: let nextHref = topPage.href.replace(/\/$/,''); michael@0: michael@0: this.log('NEXT PAGE IS ' + nextHref); michael@0: this._parsedPages[nextHref] = true; michael@0: return nextHref; michael@0: } else { michael@0: return null; michael@0: } michael@0: }, michael@0: michael@0: _successfulRequest: function(request) { michael@0: return (request.status >= 200 && request.status < 300) || michael@0: request.status === 304 || michael@0: (request.status === 0 && request.responseText); michael@0: }, michael@0: michael@0: _ajax: function(url, options) { michael@0: let request = new XMLHttpRequest(); michael@0: michael@0: function respondToReadyState(readyState) { michael@0: if (request.readyState === 4) { michael@0: if (this._successfulRequest(request)) { michael@0: if (options.success) michael@0: options.success(request); michael@0: } else { michael@0: if (options.error) michael@0: options.error(request); michael@0: } michael@0: } michael@0: } michael@0: michael@0: if (typeof options === 'undefined') michael@0: options = {}; michael@0: michael@0: request.onreadystatechange = respondToReadyState; michael@0: michael@0: request.open('get', url, true); michael@0: request.setRequestHeader('Accept', 'text/html'); michael@0: michael@0: try { michael@0: request.send(options.postBody); michael@0: } catch (e) { michael@0: if (options.error) michael@0: options.error(); michael@0: } michael@0: michael@0: return request; michael@0: }, michael@0: michael@0: _appendNextPage: function(nextPageLink) { michael@0: let doc = this._doc; michael@0: this._curPageNum += 1; michael@0: michael@0: let articlePage = doc.createElement("DIV"); michael@0: articlePage.id = 'readability-page-' + this._curPageNum; michael@0: articlePage.className = 'page'; michael@0: articlePage.innerHTML = '

<p class="page-separator" title="Page ' + this._curPageNum + '">&sect;</p>';

    doc.getElementById("readability-content").appendChild(articlePage);

    if (this._curPageNum > this.MAX_PAGES) {
      let nextPageMarkup = "<div style='text-align: center'><a href='" + nextPageLink + "'>View Next Page</a></div>";
"; michael@0: articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup; michael@0: return; michael@0: } michael@0: michael@0: // Now that we've built the article page DOM element, get the page content michael@0: // asynchronously and load the cleaned content into the div we created for it. michael@0: (function(pageUrl, thisPage) { michael@0: this._ajax(pageUrl, { michael@0: success: function(r) { michael@0: michael@0: // First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. michael@0: let eTag = r.getResponseHeader('ETag'); michael@0: if (eTag) { michael@0: if (eTag in this._pageETags) { michael@0: this.log("Exact duplicate page found via ETag. Aborting."); michael@0: articlePage.style.display = 'none'; michael@0: return; michael@0: } else { michael@0: this._pageETags[eTag] = 1; michael@0: } michael@0: } michael@0: michael@0: // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away. michael@0: let page = doc.createElement("DIV"); michael@0: michael@0: // Do some preprocessing to our HTML to make it ready for appending. michael@0: // - Remove any script tags. Swap and reswap newlines with a unicode michael@0: // character because multiline regex doesn't work in javascript. michael@0: // - Turn any noscript tags into divs so that we can parse them. This michael@0: // allows us to find any next page links hidden via javascript. michael@0: // - Turn all double br's into p's - was handled by prepDocument in the original view. michael@0: // Maybe in the future abstract out prepDocument to work for both the original document michael@0: // and AJAX-added pages. michael@0: let responseHtml = r.responseText.replace(/\n/g,'\uffff').replace(/.*?<\/script>/gi, ''); michael@0: responseHtml = responseHtml.replace(/\n/g,'\uffff').replace(/.*?<\/script>/gi, ''); michael@0: responseHtml = responseHtml.replace(/\uffff/g,'\n').replace(/<(\/?)noscript/gi, '<$1div'); michael@0: responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>'); michael@0: michael@0: page.innerHTML = responseHtml; michael@0: this._replaceBrs(page); michael@0: michael@0: // Reset all flags for the next page, as they will search through it and michael@0: // disable as necessary at the end of grabArticle. michael@0: this._flags = 0x1 | 0x2 | 0x4; michael@0: michael@0: let nextPageLink = this._findNextPageLink(page); michael@0: michael@0: // NOTE: if we end up supporting _appendNextPage(), we'll need to michael@0: // change this call to be async michael@0: let content = this._grabArticle(page); michael@0: michael@0: if (!content) { michael@0: this.log("No content found in page to append. Aborting."); michael@0: return; michael@0: } michael@0: michael@0: // Anti-duplicate mechanism. Essentially, get the first paragraph of our new page. michael@0: // Compare it against all of the the previous document's we've gotten. If the previous michael@0: // document contains exactly the innerHTML of this first paragraph, it's probably a duplicate. michael@0: let firstP = content.getElementsByTagName("P").length ? 
content.getElementsByTagName("P")[0] : null; michael@0: if (firstP && firstP.innerHTML.length > 100) { michael@0: for (let i = 1; i <= this._curPageNum; i += 1) { michael@0: let rPage = doc.getElementById('readability-page-' + i); michael@0: if (rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) { michael@0: this.log('Duplicate of page ' + i + ' - skipping.'); michael@0: articlePage.style.display = 'none'; michael@0: this._parsedPages[pageUrl] = true; michael@0: return; michael@0: } michael@0: } michael@0: } michael@0: michael@0: this._removeScripts(content); michael@0: michael@0: thisPage.innerHTML = thisPage.innerHTML + content.innerHTML; michael@0: michael@0: // After the page has rendered, post process the content. This delay is necessary because, michael@0: // in webkit at least, offsetWidth is not set in time to determine image width. We have to michael@0: // wait a little bit for reflow to finish before we can fix floating images. michael@0: setTimeout((function() { michael@0: this._postProcessContent(thisPage); michael@0: }).bind(this), 500); michael@0: michael@0: michael@0: if (nextPageLink) michael@0: this._appendNextPage(nextPageLink); michael@0: } michael@0: }); michael@0: }).bind(this)(nextPageLink, articlePage); michael@0: }, michael@0: michael@0: /** michael@0: * Get an elements class/id weight. Uses regular expressions to tell if this michael@0: * element looks good or bad. michael@0: * michael@0: * @param Element michael@0: * @return number (Integer) michael@0: **/ michael@0: _getClassWeight: function(e) { michael@0: if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) michael@0: return 0; michael@0: michael@0: let weight = 0; michael@0: michael@0: // Look for a special classname michael@0: if (typeof(e.className) === 'string' && e.className !== '') { michael@0: if (e.className.search(this.REGEXPS.negative) !== -1) michael@0: weight -= 25; michael@0: michael@0: if (e.className.search(this.REGEXPS.positive) !== -1) michael@0: weight += 25; michael@0: } michael@0: michael@0: // Look for a special ID michael@0: if (typeof(e.id) === 'string' && e.id !== '') { michael@0: if (e.id.search(this.REGEXPS.negative) !== -1) michael@0: weight -= 25; michael@0: michael@0: if (e.id.search(this.REGEXPS.positive) !== -1) michael@0: weight += 25; michael@0: } michael@0: michael@0: return weight; michael@0: }, michael@0: michael@0: /** michael@0: * Clean a node of all elements of type "tag". michael@0: * (Unless it's a youtube/vimeo video. People love movies.) michael@0: * michael@0: * @param Element michael@0: * @param string tag to clean michael@0: * @return void michael@0: **/ michael@0: _clean: function(e, tag) { michael@0: let targetList = e.getElementsByTagName(tag); michael@0: let isEmbed = (tag === 'object' || tag === 'embed'); michael@0: michael@0: for (let y = targetList.length - 1; y >= 0; y -= 1) { michael@0: // Allow youtube and vimeo videos through as people usually want to see those. michael@0: if (isEmbed) { michael@0: let attributeValues = ""; michael@0: for (let i = 0, il = targetList[y].attributes.length; i < il; i += 1) { michael@0: attributeValues += targetList[y].attributes[i].value + '|'; michael@0: } michael@0: michael@0: // First, check the elements attributes to see if any of them contain youtube or vimeo michael@0: if (attributeValues.search(this.REGEXPS.videos) !== -1) michael@0: continue; michael@0: michael@0: // Then check the elements inside this element for the same. 
michael@0: if (targetList[y].innerHTML.search(this.REGEXPS.videos) !== -1) michael@0: continue; michael@0: } michael@0: michael@0: targetList[y].parentNode.removeChild(targetList[y]); michael@0: } michael@0: }, michael@0: michael@0: /** michael@0: * Clean an element of all tags of type "tag" if they look fishy. michael@0: * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. michael@0: * michael@0: * @return void michael@0: **/ michael@0: _cleanConditionally: function(e, tag) { michael@0: if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) michael@0: return; michael@0: michael@0: let tagsList = e.getElementsByTagName(tag); michael@0: let curTagsLength = tagsList.length; michael@0: michael@0: // Gather counts for other typical elements embedded within. michael@0: // Traverse backwards so we can remove nodes at the same time michael@0: // without effecting the traversal. michael@0: // michael@0: // TODO: Consider taking into account original contentScore here. michael@0: for (let i = curTagsLength-1; i >= 0; i -= 1) { michael@0: let weight = this._getClassWeight(tagsList[i]); michael@0: let contentScore = 0; michael@0: michael@0: this.log("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")"); michael@0: michael@0: if (weight + contentScore < 0) { michael@0: tagsList[i].parentNode.removeChild(tagsList[i]); michael@0: } else if (this._getCharCount(tagsList[i],',') < 10) { michael@0: // If there are not very many commas, and the number of michael@0: // non-paragraph elements is more than paragraphs or other michael@0: // ominous signs, remove the element. michael@0: let p = tagsList[i].getElementsByTagName("p").length; michael@0: let img = tagsList[i].getElementsByTagName("img").length; michael@0: let li = tagsList[i].getElementsByTagName("li").length-100; michael@0: let input = tagsList[i].getElementsByTagName("input").length; michael@0: michael@0: let embedCount = 0; michael@0: let embeds = tagsList[i].getElementsByTagName("embed"); michael@0: for (let ei = 0, il = embeds.length; ei < il; ei += 1) { michael@0: if (embeds[ei].src.search(this.REGEXPS.videos) === -1) michael@0: embedCount += 1; michael@0: } michael@0: michael@0: let linkDensity = this._getLinkDensity(tagsList[i]); michael@0: let contentLength = this._getInnerText(tagsList[i]).length; michael@0: let toRemove = false; michael@0: michael@0: if (img > p) { michael@0: toRemove = true; michael@0: } else if (li > p && tag !== "ul" && tag !== "ol") { michael@0: toRemove = true; michael@0: } else if ( input > Math.floor(p/3) ) { michael@0: toRemove = true; michael@0: } else if (contentLength < 25 && (img === 0 || img > 2) ) { michael@0: toRemove = true; michael@0: } else if (weight < 25 && linkDensity > 0.2) { michael@0: toRemove = true; michael@0: } else if (weight >= 25 && linkDensity > 0.5) { michael@0: toRemove = true; michael@0: } else if ((embedCount === 1 && contentLength < 75) || embedCount > 1) { michael@0: toRemove = true; michael@0: } michael@0: michael@0: if (toRemove) michael@0: tagsList[i].parentNode.removeChild(tagsList[i]); michael@0: } michael@0: } michael@0: }, michael@0: michael@0: /** michael@0: * Clean out spurious headers from an Element. Checks things like classnames and link density. 
michael@0: * michael@0: * @param Element michael@0: * @return void michael@0: **/ michael@0: _cleanHeaders: function(e) { michael@0: for (let headerIndex = 1; headerIndex < 3; headerIndex += 1) { michael@0: let headers = e.getElementsByTagName('h' + headerIndex); michael@0: for (let i = headers.length - 1; i >= 0; i -= 1) { michael@0: if (this._getClassWeight(headers[i]) < 0 || this._getLinkDensity(headers[i]) > 0.33) michael@0: headers[i].parentNode.removeChild(headers[i]); michael@0: } michael@0: } michael@0: }, michael@0: michael@0: _flagIsActive: function(flag) { michael@0: return (this._flags & flag) > 0; michael@0: }, michael@0: michael@0: _addFlag: function(flag) { michael@0: this._flags = this._flags | flag; michael@0: }, michael@0: michael@0: _removeFlag: function(flag) { michael@0: this._flags = this._flags & ~flag; michael@0: }, michael@0: michael@0: /** michael@0: * Runs readability. michael@0: * michael@0: * Workflow: michael@0: * 1. Prep the document by removing script tags, css, etc. michael@0: * 2. Build readability's DOM tree. michael@0: * 3. Grab the article content from the current dom tree. michael@0: * 4. Replace the current DOM tree with the new one. michael@0: * 5. Read peacefully. michael@0: * michael@0: * @return void michael@0: **/ michael@0: parse: function () { michael@0: // Remove script tags from the document. michael@0: this._removeScripts(this._doc); michael@0: michael@0: // FIXME: Disabled multi-page article support for now as it michael@0: // needs more work on infrastructure. michael@0: michael@0: // Make sure this document is added to the list of parsed pages first, michael@0: // so we don't double up on the first page. michael@0: // this._parsedPages[uri.spec.replace(/\/$/, '')] = true; michael@0: michael@0: // Pull out any possible next page link first. michael@0: // let nextPageLink = this._findNextPageLink(doc.body); michael@0: michael@0: this._prepDocument(); michael@0: michael@0: let articleTitle = this._getArticleTitle(); michael@0: let articleContent = this._grabArticle(); michael@0: if (!articleContent) michael@0: return null; michael@0: michael@0: this._postProcessContent(articleContent); michael@0: michael@0: // if (nextPageLink) { michael@0: // // Append any additional pages after a small timeout so that people michael@0: // // can start reading without having to wait for this to finish processing. michael@0: // setTimeout((function() { michael@0: // this._appendNextPage(nextPageLink); michael@0: // }).bind(this), 500); michael@0: // } michael@0: michael@0: let excerpt = this._getExcerpt(articleContent); michael@0: michael@0: return { title: articleTitle, michael@0: byline: this._articleByline, michael@0: dir: this._articleDir, michael@0: content: articleContent.innerHTML, michael@0: length: articleContent.textContent.length, michael@0: excerpt: excerpt }; michael@0: } michael@0: };
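
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the original library).
// The shape of the `uri` argument below is inferred from the fields this file
// reads (spec, host, scheme, prePath, pathBase, path); the exact object and
// the hosting environment are assumptions, not a documented API.
//
//   var uri = {
//     spec:     "http://example.com/articles/post.html",
//     host:     "example.com",
//     scheme:   "http",
//     prePath:  "http://example.com",
//     pathBase: "http://example.com/articles/",
//     path:     "/articles/post.html"
//   };
//
//   var article = new Readability(uri, document).parse();
//   if (article) {
//     // article.content is an HTML string; article.excerpt comes from meta
//     // description tags or the first paragraph (see _getExcerpt above).
//     console.log(article.title, article.byline, article.length);
//   }
// ---------------------------------------------------------------------------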