1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/mobile/android/chrome/content/Readability.js Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1517 @@ 1.4 +/* 1.5 + * Copyright (c) 2010 Arc90 Inc 1.6 + * 1.7 + * Licensed under the Apache License, Version 2.0 (the "License"); 1.8 + * you may not use this file except in compliance with the License. 1.9 + * You may obtain a copy of the License at 1.10 + * 1.11 + * http://www.apache.org/licenses/LICENSE-2.0 1.12 + * 1.13 + * Unless required by applicable law or agreed to in writing, software 1.14 + * distributed under the License is distributed on an "AS IS" BASIS, 1.15 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.16 + * See the License for the specific language governing permissions and 1.17 + * limitations under the License. 1.18 + */ 1.19 + 1.20 +/* 1.21 + * This code is heavily based on Arc90's readability.js (1.7.1) script 1.22 + * available at: http://code.google.com/p/arc90labs-readability 1.23 + */ 1.24 + 1.25 +var Readability = function(uri, doc) { 1.26 + const ENABLE_LOGGING = false; 1.27 + 1.28 + this._uri = uri; 1.29 + this._doc = doc; 1.30 + this._biggestFrame = false; 1.31 + this._articleByline = null; 1.32 + this._articleDir = null; 1.33 + 1.34 + // Start with all flags set 1.35 + this._flags = this.FLAG_STRIP_UNLIKELYS | 1.36 + this.FLAG_WEIGHT_CLASSES | 1.37 + this.FLAG_CLEAN_CONDITIONALLY; 1.38 + 1.39 + // The list of pages we've parsed in this call of readability, 1.40 + // for autopaging. As a key store for easier searching. 1.41 + this._parsedPages = {}; 1.42 + 1.43 + // A list of the ETag headers of pages we've parsed, in case they happen to match, 1.44 + // we'll know it's a duplicate. 1.45 + this._pageETags = {}; 1.46 + 1.47 + // Make an AJAX request for each page and append it to the document. 
1.48 + this._curPageNum = 1; 1.49 + 1.50 + // Control whether log messages are sent to the console 1.51 + if (ENABLE_LOGGING) { 1.52 + this.log = function (msg) { 1.53 + dump("Reader: (Readability) " + msg); 1.54 + }; 1.55 + } else { 1.56 + this.log = function () {}; 1.57 + } 1.58 +} 1.59 + 1.60 +Readability.prototype = { 1.61 + FLAG_STRIP_UNLIKELYS: 0x1, 1.62 + FLAG_WEIGHT_CLASSES: 0x2, 1.63 + FLAG_CLEAN_CONDITIONALLY: 0x4, 1.64 + 1.65 + // The number of top candidates to consider when analysing how 1.66 + // tight the competition is among candidates. 1.67 + N_TOP_CANDIDATES: 5, 1.68 + 1.69 + // The maximum number of pages to loop through before we call 1.70 + // it quits and just show a link. 1.71 + MAX_PAGES: 5, 1.72 + 1.73 + // All of the regular expressions in use within readability. 1.74 + // Defined up here so we don't instantiate them repeatedly in loops. 1.75 + REGEXPS: { 1.76 + unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i, 1.77 + okMaybeItsACandidate: /and|article|body|column|main|shadow/i, 1.78 + positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, 1.79 + negative: /hidden|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i, 1.80 + extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, 1.81 + byline: /byline|author|dateline|writtenby/i, 1.82 + replaceFonts: /<(\/?)font[^>]*>/gi, 1.83 + trim: /^\s+|\s+$/g, 1.84 + normalize: /\s{2,}/g, 1.85 + videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, 1.86 + nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, 1.87 + prevLink: /(prev|earl|old|new|<|«)/i, 1.88 + whitespace: /^\s*$/ 1.89 + }, 1.90 + 1.91 + DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ], 1.92 + 
1.93 + /** 1.94 + * Run any post-process modifications to article content as necessary. 1.95 + * 1.96 + * @param Element 1.97 + * @return void 1.98 + **/ 1.99 + _postProcessContent: function(articleContent) { 1.100 + // Readability cannot open relative uris so we convert them to absolute uris. 1.101 + this._fixRelativeUris(articleContent); 1.102 + }, 1.103 + 1.104 + /** 1.105 + * Converts each <a> and <img> uri in the given element to an absolute URI. 1.106 + * 1.107 + * @param Element 1.108 + * @return void 1.109 + */ 1.110 + _fixRelativeUris: function(articleContent) { 1.111 + let scheme = this._uri.scheme; 1.112 + let prePath = this._uri.prePath; 1.113 + let pathBase = this._uri.pathBase; 1.114 + 1.115 + function toAbsoluteURI(uri) { 1.116 + // If this is already an absolute URI, return it. 1.117 + if (/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/.test(uri)) 1.118 + return uri; 1.119 + 1.120 + // Scheme-rooted relative URI. 1.121 + if (uri.substr(0, 2) == "//") 1.122 + return scheme + "://" + uri.substr(2); 1.123 + 1.124 + // Prepath-rooted relative URI. 1.125 + if (uri[0] == "/") 1.126 + return prePath + uri; 1.127 + 1.128 + // Standard relative URI; add entire path. pathBase already includes a 1.129 + // trailing "/". 1.130 + return pathBase + uri; 1.131 + } 1.132 + 1.133 + function convertRelativeURIs(tagName, propName) { 1.134 + let elems = articleContent.getElementsByTagName(tagName); 1.135 + for (let i = elems.length; --i >= 0;) { 1.136 + let elem = elems[i]; 1.137 + let relativeURI = elem.getAttribute(propName); 1.138 + if (relativeURI != null) 1.139 + elems[i].setAttribute(propName, toAbsoluteURI(relativeURI)); 1.140 + } 1.141 + } 1.142 + 1.143 + // Fix links. 1.144 + convertRelativeURIs("a", "href"); 1.145 + 1.146 + // Fix images. 1.147 + convertRelativeURIs("img", "src"); 1.148 + }, 1.149 + 1.150 + /** 1.151 + * Get the article title as an H1. 
1.152 + * 1.153 + * @return void 1.154 + **/ 1.155 + _getArticleTitle: function() { 1.156 + let doc = this._doc; 1.157 + let curTitle = ""; 1.158 + let origTitle = ""; 1.159 + 1.160 + try { 1.161 + curTitle = origTitle = doc.title; 1.162 + 1.163 + // If they had an element with id "title" in their HTML 1.164 + if (typeof curTitle !== "string") 1.165 + curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]); 1.166 + } catch(e) {} 1.167 + 1.168 + if (curTitle.match(/ [\|\-] /)) { 1.169 + curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); 1.170 + 1.171 + if (curTitle.split(' ').length < 3) 1.172 + curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); 1.173 + } else if (curTitle.indexOf(': ') !== -1) { 1.174 + curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); 1.175 + 1.176 + if (curTitle.split(' ').length < 3) 1.177 + curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); 1.178 + } else if (curTitle.length > 150 || curTitle.length < 15) { 1.179 + let hOnes = doc.getElementsByTagName('h1'); 1.180 + 1.181 + if (hOnes.length === 1) 1.182 + curTitle = this._getInnerText(hOnes[0]); 1.183 + } 1.184 + 1.185 + curTitle = curTitle.replace(this.REGEXPS.trim, ""); 1.186 + 1.187 + if (curTitle.split(' ').length <= 4) 1.188 + curTitle = origTitle; 1.189 + 1.190 + return curTitle; 1.191 + }, 1.192 + 1.193 + /** 1.194 + * Prepare the HTML document for readability to scrape it. 1.195 + * This includes things like stripping javascript, CSS, and handling terrible markup. 1.196 + * 1.197 + * @return void 1.198 + **/ 1.199 + _prepDocument: function() { 1.200 + let doc = this._doc; 1.201 + 1.202 + // In some cases a body element can't be found (if the HTML is 1.203 + // totally hosed for example) so we create a new body node and 1.204 + // append it to the document. 
1.205 + if (doc.body === null) { 1.206 + let body = doc.createElement("body"); 1.207 + 1.208 + try { 1.209 + doc.body = body; 1.210 + } catch(e) { 1.211 + doc.documentElement.appendChild(body); 1.212 + this.log(e); 1.213 + } 1.214 + } 1.215 + 1.216 + // Remove all style tags in head 1.217 + let styleTags = doc.getElementsByTagName("style"); 1.218 + for (let st = 0; st < styleTags.length; st += 1) { 1.219 + styleTags[st].textContent = ""; 1.220 + } 1.221 + 1.222 + this._replaceBrs(doc.body); 1.223 + 1.224 + let fonts = doc.getElementsByTagName("FONT"); 1.225 + for (let i = fonts.length; --i >=0;) { 1.226 + this._setNodeTag(fonts[i], "SPAN"); 1.227 + } 1.228 + }, 1.229 + 1.230 + /** 1.231 + * Finds the next element, starting from the given node, and ignoring 1.232 + * whitespace in between. If the given node is an element, the same node is 1.233 + * returned. 1.234 + */ 1.235 + _nextElement: function (node) { 1.236 + let next = node; 1.237 + while (next 1.238 + && (next.nodeType != Node.ELEMENT_NODE) 1.239 + && this.REGEXPS.whitespace.test(next.textContent)) { 1.240 + next = next.nextSibling; 1.241 + } 1.242 + return next; 1.243 + }, 1.244 + 1.245 + /** 1.246 + * Replaces 2 or more successive <br> elements with a single <p>. 1.247 + * Whitespace between <br> elements are ignored. For example: 1.248 + * <div>foo<br>bar<br> <br><br>abc</div> 1.249 + * will become: 1.250 + * <div>foo<br>bar<p>abc</p></div> 1.251 + */ 1.252 + _replaceBrs: function (elem) { 1.253 + let brs = elem.getElementsByTagName("br"); 1.254 + for (let i = 0; i < brs.length; i++) { 1.255 + let br = brs[i]; 1.256 + let next = br.nextSibling; 1.257 + 1.258 + // Whether 2 or more <br> elements have been found and replaced with a 1.259 + // <p> block. 1.260 + let replaced = false; 1.261 + 1.262 + // If we find a <br> chain, remove the <br>s until we hit another element 1.263 + // or non-whitespace. This leaves behind the first <br> in the chain 1.264 + // (which will be replaced with a <p> later). 
1.265 + while ((next = this._nextElement(next)) && (next.tagName == "BR")) { 1.266 + replaced = true; 1.267 + let sibling = next.nextSibling; 1.268 + next.parentNode.removeChild(next); 1.269 + next = sibling; 1.270 + } 1.271 + 1.272 + // If we removed a <br> chain, replace the remaining <br> with a <p>. Add 1.273 + // all sibling nodes as children of the <p> until we hit another <br> 1.274 + // chain. 1.275 + if (replaced) { 1.276 + let p = this._doc.createElement("p"); 1.277 + br.parentNode.replaceChild(p, br); 1.278 + 1.279 + next = p.nextSibling; 1.280 + while (next) { 1.281 + // If we've hit another <br><br>, we're done adding children to this <p>. 1.282 + if (next.tagName == "BR") { 1.283 + let nextElem = this._nextElement(next); 1.284 + if (nextElem && nextElem.tagName == "BR") 1.285 + break; 1.286 + } 1.287 + 1.288 + // Otherwise, make this node a child of the new <p>. 1.289 + let sibling = next.nextSibling; 1.290 + p.appendChild(next); 1.291 + next = sibling; 1.292 + } 1.293 + } 1.294 + } 1.295 + }, 1.296 + 1.297 + _setNodeTag: function (node, tag) { 1.298 + node.localName = tag.toLowerCase(); 1.299 + node.tagName = tag.toUpperCase(); 1.300 + }, 1.301 + 1.302 + /** 1.303 + * Prepare the article node for display. Clean out any inline styles, 1.304 + * iframes, forms, strip extraneous <p> tags, etc. 1.305 + * 1.306 + * @param Element 1.307 + * @return void 1.308 + **/ 1.309 + _prepArticle: function(articleContent) { 1.310 + this._cleanStyles(articleContent); 1.311 + 1.312 + // Clean out junk from the article content 1.313 + this._cleanConditionally(articleContent, "form"); 1.314 + this._clean(articleContent, "object"); 1.315 + this._clean(articleContent, "h1"); 1.316 + 1.317 + // If there is only one h2, they are probably using it as a header 1.318 + // and not a subheader, so remove it since we already have a header. 
1.319 + if (articleContent.getElementsByTagName('h2').length === 1) 1.320 + this._clean(articleContent, "h2"); 1.321 + 1.322 + this._clean(articleContent, "iframe"); 1.323 + this._cleanHeaders(articleContent); 1.324 + 1.325 + // Do these last as the previous stuff may have removed junk 1.326 + // that will affect these 1.327 + this._cleanConditionally(articleContent, "table"); 1.328 + this._cleanConditionally(articleContent, "ul"); 1.329 + this._cleanConditionally(articleContent, "div"); 1.330 + 1.331 + // Remove extra paragraphs 1.332 + let articleParagraphs = articleContent.getElementsByTagName('p'); 1.333 + for (let i = articleParagraphs.length - 1; i >= 0; i -= 1) { 1.334 + let imgCount = articleParagraphs[i].getElementsByTagName('img').length; 1.335 + let embedCount = articleParagraphs[i].getElementsByTagName('embed').length; 1.336 + let objectCount = articleParagraphs[i].getElementsByTagName('object').length; 1.337 + 1.338 + if (imgCount === 0 && 1.339 + embedCount === 0 && 1.340 + objectCount === 0 && 1.341 + this._getInnerText(articleParagraphs[i], false) === '') 1.342 + articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]); 1.343 + } 1.344 + 1.345 + let brs = articleContent.getElementsByTagName("BR"); 1.346 + for (let i = brs.length; --i >= 0;) { 1.347 + let br = brs[i]; 1.348 + let next = this._nextElement(br.nextSibling); 1.349 + if (next && next.tagName == "P") 1.350 + br.parentNode.removeChild(br); 1.351 + } 1.352 + }, 1.353 + 1.354 + /** 1.355 + * Initialize a node with the readability object. Also checks the 1.356 + * className/id for special names to add to its score. 
1.357 + * 1.358 + * @param Element 1.359 + * @return void 1.360 + **/ 1.361 + _initializeNode: function(node) { 1.362 + node.readability = {"contentScore": 0}; 1.363 + 1.364 + switch(node.tagName) { 1.365 + case 'DIV': 1.366 + node.readability.contentScore += 5; 1.367 + break; 1.368 + 1.369 + case 'PRE': 1.370 + case 'TD': 1.371 + case 'BLOCKQUOTE': 1.372 + node.readability.contentScore += 3; 1.373 + break; 1.374 + 1.375 + case 'ADDRESS': 1.376 + case 'OL': 1.377 + case 'UL': 1.378 + case 'DL': 1.379 + case 'DD': 1.380 + case 'DT': 1.381 + case 'LI': 1.382 + case 'FORM': 1.383 + node.readability.contentScore -= 3; 1.384 + break; 1.385 + 1.386 + case 'H1': 1.387 + case 'H2': 1.388 + case 'H3': 1.389 + case 'H4': 1.390 + case 'H5': 1.391 + case 'H6': 1.392 + case 'TH': 1.393 + node.readability.contentScore -= 5; 1.394 + break; 1.395 + } 1.396 + 1.397 + node.readability.contentScore += this._getClassWeight(node); 1.398 + }, 1.399 + 1.400 + /*** 1.401 + * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is 1.402 + * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. 1.403 + * 1.404 + * @param page a document to run upon. Needs to be a full document, complete with body. 1.405 + * @return Element 1.406 + **/ 1.407 + _grabArticle: function (page) { 1.408 + let doc = this._doc; 1.409 + let isPaging = (page !== null ? true: false); 1.410 + page = page ? page : this._doc.body; 1.411 + let pageCacheHtml = page.innerHTML; 1.412 + 1.413 + // Check if any "dir" is set on the toplevel document element 1.414 + this._articleDir = doc.documentElement.getAttribute("dir"); 1.415 + 1.416 + while (true) { 1.417 + let stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); 1.418 + let allElements = page.getElementsByTagName('*'); 1.419 + 1.420 + // First, node prepping. 
Trash nodes that look cruddy (like ones with the 1.421 + // class name "comment", etc), and turn divs into P tags where they have been 1.422 + // used inappropriately (as in, where they contain no other block level elements.) 1.423 + // 1.424 + // Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 1.425 + // TODO: Shouldn't this be a reverse traversal? 1.426 + let node = null; 1.427 + let nodesToScore = []; 1.428 + 1.429 + // Let each node know its index in the allElements array. 1.430 + for (let i = allElements.length; --i >= 0;) { 1.431 + allElements[i]._index = i; 1.432 + } 1.433 + 1.434 + /** 1.435 + * JSDOMParser returns static node lists, not live ones. When we remove 1.436 + * an element from the document, we need to manually remove it - and all 1.437 + * of its children - from the allElements array. 1.438 + */ 1.439 + function purgeNode(node) { 1.440 + for (let i = node.childNodes.length; --i >= 0;) { 1.441 + purgeNode(node.childNodes[i]); 1.442 + } 1.443 + if (node._index !== undefined && allElements[node._index] == node) 1.444 + delete allElements[node._index]; 1.445 + } 1.446 + 1.447 + for (let nodeIndex = 0; nodeIndex < allElements.length; nodeIndex++) { 1.448 + if (!(node = allElements[nodeIndex])) 1.449 + continue; 1.450 + 1.451 + let matchString = node.className + node.id; 1.452 + if (matchString.search(this.REGEXPS.byline) !== -1 && !this._articleByline) { 1.453 + this._articleByline = node.textContent; 1.454 + node.parentNode.removeChild(node); 1.455 + purgeNode(node); 1.456 + continue; 1.457 + } 1.458 + 1.459 + // Remove unlikely candidates 1.460 + if (stripUnlikelyCandidates) { 1.461 + if (matchString.search(this.REGEXPS.unlikelyCandidates) !== -1 && 1.462 + matchString.search(this.REGEXPS.okMaybeItsACandidate) === -1 && 1.463 + node.tagName !== "BODY") { 1.464 + this.log("Removing unlikely candidate - " + matchString); 1.465 + node.parentNode.removeChild(node); 1.466 + purgeNode(node); 
1.467 + continue; 1.468 + } 1.469 + } 1.470 + 1.471 + if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") 1.472 + nodesToScore[nodesToScore.length] = node; 1.473 + 1.474 + // Turn all divs that don't have children block level elements into p's 1.475 + if (node.tagName === "DIV") { 1.476 + // Sites like http://mobile.slate.com encloses each paragraph with a DIV 1.477 + // element. DIVs with only a P element inside and no text content can be 1.478 + // safely converted into plain P elements to avoid confusing the scoring 1.479 + // algorithm with DIVs with are, in practice, paragraphs. 1.480 + let pIndex = this._getSinglePIndexInsideDiv(node); 1.481 + 1.482 + if (pIndex >= 0 || !this._hasChildBlockElement(node)) { 1.483 + if (pIndex >= 0) { 1.484 + let newNode = node.childNodes[pIndex]; 1.485 + node.parentNode.replaceChild(newNode, node); 1.486 + purgeNode(node); 1.487 + } else { 1.488 + this._setNodeTag(node, "P"); 1.489 + nodesToScore[nodesToScore.length] = node; 1.490 + } 1.491 + } else { 1.492 + // EXPERIMENTAL 1.493 + for (let i = 0, il = node.childNodes.length; i < il; i += 1) { 1.494 + let childNode = node.childNodes[i]; 1.495 + if (!childNode) 1.496 + continue; 1.497 + 1.498 + if (childNode.nodeType === 3) { // Node.TEXT_NODE 1.499 + let p = doc.createElement('p'); 1.500 + p.textContent = childNode.textContent; 1.501 + p.style.display = 'inline'; 1.502 + p.className = 'readability-styled'; 1.503 + childNode.parentNode.replaceChild(p, childNode); 1.504 + } 1.505 + } 1.506 + } 1.507 + } 1.508 + } 1.509 + 1.510 + /** 1.511 + * Loop through all paragraphs, and assign a score to them based on how content-y they look. 1.512 + * Then add their score to their parent node. 1.513 + * 1.514 + * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. 
1.515 + **/ 1.516 + let candidates = []; 1.517 + for (let pt = 0; pt < nodesToScore.length; pt += 1) { 1.518 + let parentNode = nodesToScore[pt].parentNode; 1.519 + let grandParentNode = parentNode ? parentNode.parentNode : null; 1.520 + let innerText = this._getInnerText(nodesToScore[pt]); 1.521 + 1.522 + if (!parentNode || typeof(parentNode.tagName) === 'undefined') 1.523 + continue; 1.524 + 1.525 + // If this paragraph is less than 25 characters, don't even count it. 1.526 + if (innerText.length < 25) 1.527 + continue; 1.528 + 1.529 + // Initialize readability data for the parent. 1.530 + if (typeof parentNode.readability === 'undefined') { 1.531 + this._initializeNode(parentNode); 1.532 + candidates.push(parentNode); 1.533 + } 1.534 + 1.535 + // Initialize readability data for the grandparent. 1.536 + if (grandParentNode && 1.537 + typeof(grandParentNode.readability) === 'undefined' && 1.538 + typeof(grandParentNode.tagName) !== 'undefined') { 1.539 + this._initializeNode(grandParentNode); 1.540 + candidates.push(grandParentNode); 1.541 + } 1.542 + 1.543 + let contentScore = 0; 1.544 + 1.545 + // Add a point for the paragraph itself as a base. 1.546 + contentScore += 1; 1.547 + 1.548 + // Add points for any commas within this paragraph. 1.549 + contentScore += innerText.split(',').length; 1.550 + 1.551 + // For every 100 characters in this paragraph, add another point. Up to 3 points. 1.552 + contentScore += Math.min(Math.floor(innerText.length / 100), 3); 1.553 + 1.554 + // Add the score to the parent. The grandparent gets half. 1.555 + parentNode.readability.contentScore += contentScore; 1.556 + 1.557 + if (grandParentNode) 1.558 + grandParentNode.readability.contentScore += contentScore / 2; 1.559 + } 1.560 + 1.561 + // After we've calculated scores, loop through all of the possible 1.562 + // candidate nodes we found and find the one with the highest score. 
1.563 + let topCandidates = []; 1.564 + for (let c = 0, cl = candidates.length; c < cl; c += 1) { 1.565 + let candidate = candidates[c]; 1.566 + 1.567 + // Scale the final candidates score based on link density. Good content 1.568 + // should have a relatively small link density (5% or less) and be mostly 1.569 + // unaffected by this operation. 1.570 + let candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate)); 1.571 + candidate.readability.contentScore = candidateScore; 1.572 + 1.573 + this.log('Candidate: ' + candidate + " (" + candidate.className + ":" + 1.574 + candidate.id + ") with score " + candidateScore); 1.575 + 1.576 + for (let t = 0; t < this.N_TOP_CANDIDATES; t++) { 1.577 + let aTopCandidate = topCandidates[t]; 1.578 + 1.579 + if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) { 1.580 + topCandidates.splice(t, 0, candidate); 1.581 + if (topCandidates.length > this.N_TOP_CANDIDATES) 1.582 + topCandidates.pop(); 1.583 + break; 1.584 + } 1.585 + } 1.586 + } 1.587 + 1.588 + let topCandidate = topCandidates[0] || null; 1.589 + let lastTopCandidate = (topCandidates.length > 3 ? topCandidates[topCandidates.length - 1] : null); 1.590 + 1.591 + // If we still have no top candidate, just use the body as a last resort. 1.592 + // We also have to copy the body node so it is something we can modify. 1.593 + if (topCandidate === null || topCandidate.tagName === "BODY") { 1.594 + // Move all of the page's children into topCandidate 1.595 + topCandidate = doc.createElement("DIV"); 1.596 + let children = page.childNodes; 1.597 + for (let i = 0; i < children.length; ++i) { 1.598 + topCandidate.appendChild(children[i]); 1.599 + } 1.600 + 1.601 + page.appendChild(topCandidate); 1.602 + 1.603 + this._initializeNode(topCandidate); 1.604 + } 1.605 + 1.606 + // Now that we have the top candidate, look through its siblings for content 1.607 + // that might also be related. 
Things like preambles, content split by ads 1.608 + // that we removed, etc. 1.609 + let articleContent = doc.createElement("DIV"); 1.610 + if (isPaging) 1.611 + articleContent.id = "readability-content"; 1.612 + 1.613 + let siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); 1.614 + let siblingNodes = topCandidate.parentNode.childNodes; 1.615 + 1.616 + for (let s = 0, sl = siblingNodes.length; s < sl; s += 1) { 1.617 + let siblingNode = siblingNodes[s]; 1.618 + let append = false; 1.619 + 1.620 + this.log("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : '')); 1.621 + this.log("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown')); 1.622 + 1.623 + if (siblingNode === topCandidate) 1.624 + append = true; 1.625 + 1.626 + let contentBonus = 0; 1.627 + 1.628 + // Give a bonus if sibling nodes and top candidates have the example same classname 1.629 + if (siblingNode.className === topCandidate.className && topCandidate.className !== "") 1.630 + contentBonus += topCandidate.readability.contentScore * 0.2; 1.631 + 1.632 + if (typeof siblingNode.readability !== 'undefined' && 1.633 + (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold) 1.634 + append = true; 1.635 + 1.636 + if (siblingNode.nodeName === "P") { 1.637 + let linkDensity = this._getLinkDensity(siblingNode); 1.638 + let nodeContent = this._getInnerText(siblingNode); 1.639 + let nodeLength = nodeContent.length; 1.640 + 1.641 + if (nodeLength > 80 && linkDensity < 0.25) { 1.642 + append = true; 1.643 + } else if (nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) { 1.644 + append = true; 1.645 + } 1.646 + } 1.647 + 1.648 + if (append) { 1.649 + this.log("Appending node: " + siblingNode); 1.650 + 1.651 + // siblingNodes is a 
reference to the childNodes array, and 1.652 + // siblingNode is removed from the array when we call appendChild() 1.653 + // below. As a result, we must revisit this index since the nodes 1.654 + // have been shifted. 1.655 + s -= 1; 1.656 + sl -= 1; 1.657 + 1.658 + if (siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") { 1.659 + // We have a node that isn't a common block level element, like a form or td tag. 1.660 + // Turn it into a div so it doesn't get filtered out later by accident. */ 1.661 + this.log("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); 1.662 + 1.663 + this._setNodeTag(siblingNode, "DIV"); 1.664 + } 1.665 + 1.666 + // To ensure a node does not interfere with readability styles, 1.667 + // remove its classnames. 1.668 + siblingNode.className = ""; 1.669 + 1.670 + // Append sibling and subtract from our list because it removes 1.671 + // the node when you append to another node. 1.672 + articleContent.appendChild(siblingNode); 1.673 + } 1.674 + } 1.675 + 1.676 + // So we have all of the content that we need. Now we clean it up for presentation. 1.677 + this._prepArticle(articleContent); 1.678 + 1.679 + if (this._curPageNum === 1) { 1.680 + let div = doc.createElement("DIV"); 1.681 + div.id = "readability-page-1"; 1.682 + div.className = "page"; 1.683 + let children = articleContent.childNodes; 1.684 + for (let i = 0; i < children.length; ++i) { 1.685 + div.appendChild(children[i]); 1.686 + } 1.687 + articleContent.appendChild(div); 1.688 + } 1.689 + 1.690 + // Now that we've gone through the full algorithm, check to see if 1.691 + // we got any meaningful content. If we didn't, we may need to re-run 1.692 + // grabArticle with different flags set. This gives us a higher likelihood of 1.693 + // finding the content, and the sieve approach gives us a higher likelihood of 1.694 + // finding the -right- content. 
1.695 + if (this._getInnerText(articleContent, true).length < 500) { 1.696 + page.innerHTML = pageCacheHtml; 1.697 + 1.698 + if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) { 1.699 + this._removeFlag(this.FLAG_STRIP_UNLIKELYS); 1.700 + } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) { 1.701 + this._removeFlag(this.FLAG_WEIGHT_CLASSES); 1.702 + } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) { 1.703 + this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY); 1.704 + } else { 1.705 + return null; 1.706 + } 1.707 + } else { 1.708 + if (lastTopCandidate !== null) { 1.709 + // EXPERIMENTAL: Contrast ratio is how we measure the level of competition between candidates in the 1.710 + // readability algorithm. This is to avoid offering reader mode on pages that are more like 1.711 + // a list or directory of links with summaries. It takes the score of the last top candidate 1.712 + // (see N_TOP_CANDIDATES) and checks how it compares to the top candidate's. On pages that are not 1.713 + // actual articles, there will likely be many candidates with similar score (i.e. higher contrast ratio). 1.714 + let contrastRatio = lastTopCandidate.readability.contentScore / topCandidate.readability.contentScore; 1.715 + if (contrastRatio > 0.45) 1.716 + return null; 1.717 + } 1.718 + 1.719 + return articleContent; 1.720 + } 1.721 + } 1.722 + }, 1.723 + 1.724 + /** 1.725 + * Attempts to get the excerpt from these 1.726 + * sources in the following order: 1.727 + * - meta description tag 1.728 + * - open-graph description 1.729 + * - twitter cards description 1.730 + * - article's first paragraph 1.731 + * If no excerpt is found, an empty string will be 1.732 + * returned. 
1.733 + * 1.734 + * @param Element - root element of the processed version page 1.735 + * @return String - excerpt of the article 1.736 + **/ 1.737 + _getExcerpt: function(articleContent) { 1.738 + let values = {}; 1.739 + let metaElements = this._doc.getElementsByTagName("meta"); 1.740 + 1.741 + // Match "description", or Twitter's "twitter:description" (Cards) 1.742 + // in name attribute. 1.743 + let namePattern = /^\s*((twitter)\s*:\s*)?description\s*$/gi; 1.744 + 1.745 + // Match Facebook's og:description (Open Graph) in property attribute. 1.746 + let propertyPattern = /^\s*og\s*:\s*description\s*$/gi; 1.747 + 1.748 + // Find description tags. 1.749 + for (let i = 0; i < metaElements.length; i++) { 1.750 + let element = metaElements[i]; 1.751 + let elementName = element.getAttribute("name"); 1.752 + let elementProperty = element.getAttribute("property"); 1.753 + 1.754 + let name; 1.755 + if (namePattern.test(elementName)) { 1.756 + name = elementName; 1.757 + } else if (propertyPattern.test(elementProperty)) { 1.758 + name = elementProperty; 1.759 + } 1.760 + 1.761 + if (name) { 1.762 + let content = element.getAttribute("content"); 1.763 + if (content) { 1.764 + // Convert to lowercase and remove any whitespace 1.765 + // so we can match below. 1.766 + name = name.toLowerCase().replace(/\s/g, ''); 1.767 + values[name] = content.trim(); 1.768 + } 1.769 + } 1.770 + } 1.771 + 1.772 + if ("description" in values) { 1.773 + return values["description"]; 1.774 + } 1.775 + 1.776 + if ("og:description" in values) { 1.777 + // Use facebook open graph description. 1.778 + return values["og:description"]; 1.779 + } 1.780 + 1.781 + if ("twitter:description" in values) { 1.782 + // Use twitter cards description. 1.783 + return values["twitter:description"]; 1.784 + } 1.785 + 1.786 + // No description meta tags, use the article's first paragraph. 
1.787 + let paragraphs = articleContent.getElementsByTagName("p"); 1.788 + if (paragraphs.length > 0) { 1.789 + return paragraphs[0].textContent; 1.790 + } 1.791 + 1.792 + return ""; 1.793 + }, 1.794 + 1.795 + /** 1.796 + * Removes script tags from the document. 1.797 + * 1.798 + * @param Element 1.799 + **/ 1.800 + _removeScripts: function(doc) { 1.801 + let scripts = doc.getElementsByTagName('script'); 1.802 + for (let i = scripts.length - 1; i >= 0; i -= 1) { 1.803 + scripts[i].nodeValue=""; 1.804 + scripts[i].removeAttribute('src'); 1.805 + 1.806 + if (scripts[i].parentNode) 1.807 + scripts[i].parentNode.removeChild(scripts[i]); 1.808 + } 1.809 + }, 1.810 + 1.811 + /** 1.812 + * Get child index of the only P element inside a DIV with no 1.813 + * text content. Returns -1 if the DIV node contains non-empty 1.814 + * text nodes or if it contains other element nodes. 1.815 + * 1.816 + * @param Element 1.817 + **/ 1.818 + _getSinglePIndexInsideDiv: function(e) { 1.819 + let childNodes = e.childNodes; 1.820 + let pIndex = -1; 1.821 + 1.822 + for (let i = childNodes.length; --i >= 0;) { 1.823 + let node = childNodes[i]; 1.824 + 1.825 + if (node.nodeType === Node.ELEMENT_NODE) { 1.826 + if (node.tagName !== "P") 1.827 + return -1; 1.828 + 1.829 + if (pIndex >= 0) 1.830 + return -1; 1.831 + 1.832 + pIndex = i; 1.833 + } else if (node.nodeType == Node.TEXT_NODE && this._getInnerText(node, false)) { 1.834 + return -1; 1.835 + } 1.836 + } 1.837 + 1.838 + return pIndex; 1.839 + }, 1.840 + 1.841 + /** 1.842 + * Determine whether element has any children block level elements. 
1.843 + * 1.844 + * @param Element 1.845 + */ 1.846 + _hasChildBlockElement: function (e) { 1.847 + let length = e.childNodes.length; 1.848 + for (let i = 0; i < length; i++) { 1.849 + let child = e.childNodes[i]; 1.850 + if (child.nodeType != 1) 1.851 + continue; 1.852 + 1.853 + if (this.DIV_TO_P_ELEMS.indexOf(child.tagName) !== -1 || this._hasChildBlockElement(child)) 1.854 + return true; 1.855 + } 1.856 + return false; 1.857 + }, 1.858 + 1.859 + /** 1.860 + * Get the inner text of a node - cross browser compatibly. 1.861 + * This also strips out any excess whitespace to be found. 1.862 + * 1.863 + * @param Element 1.864 + * @return string 1.865 + **/ 1.866 + _getInnerText: function(e, normalizeSpaces) { 1.867 + let textContent = e.textContent.replace(this.REGEXPS.trim, ""); 1.868 + normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces; 1.869 + 1.870 + if (normalizeSpaces) { 1.871 + return textContent.replace(this.REGEXPS.normalize, " "); 1.872 + } else { 1.873 + return textContent; 1.874 + } 1.875 + }, 1.876 + 1.877 + /** 1.878 + * Get the number of times a string s appears in the node e. 1.879 + * 1.880 + * @param Element 1.881 + * @param string - what to split on. Default is "," 1.882 + * @return number (integer) 1.883 + **/ 1.884 + _getCharCount: function(e,s) { 1.885 + s = s || ","; 1.886 + return this._getInnerText(e).split(s).length - 1; 1.887 + }, 1.888 + 1.889 + /** 1.890 + * Remove the style attribute on every e and under. 1.891 + * TODO: Test if getElementsByTagName(*) is faster. 1.892 + * 1.893 + * @param Element 1.894 + * @return void 1.895 + **/ 1.896 + _cleanStyles: function(e) { 1.897 + e = e || this._doc; 1.898 + let cur = e.firstChild; 1.899 + 1.900 + if (!e) 1.901 + return; 1.902 + 1.903 + // Remove any root styles, if we're able. 
1.904 + if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') 1.905 + e.removeAttribute('style'); 1.906 + 1.907 + // Go until there are no more child nodes 1.908 + while (cur !== null) { 1.909 + if (cur.nodeType === 1) { 1.910 + // Remove style attribute(s) : 1.911 + if (cur.className !== "readability-styled") 1.912 + cur.removeAttribute("style"); 1.913 + 1.914 + this._cleanStyles(cur); 1.915 + } 1.916 + 1.917 + cur = cur.nextSibling; 1.918 + } 1.919 + }, 1.920 + 1.921 + /** 1.922 + * Get the density of links as a percentage of the content 1.923 + * This is the amount of text that is inside a link divided by the total text in the node. 1.924 + * 1.925 + * @param Element 1.926 + * @return number (float) 1.927 + **/ 1.928 + _getLinkDensity: function(e) { 1.929 + let links = e.getElementsByTagName("a"); 1.930 + let textLength = this._getInnerText(e).length; 1.931 + let linkLength = 0; 1.932 + 1.933 + for (let i = 0, il = links.length; i < il; i += 1) { 1.934 + linkLength += this._getInnerText(links[i]).length; 1.935 + } 1.936 + 1.937 + return linkLength / textLength; 1.938 + }, 1.939 + 1.940 + /** 1.941 + * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. 1.942 + * 1.943 + * @author Dan Lacy 1.944 + * @return string the base url 1.945 + **/ 1.946 + _findBaseUrl: function() { 1.947 + let uri = this._uri; 1.948 + let noUrlParams = uri.path.split("?")[0]; 1.949 + let urlSlashes = noUrlParams.split("/").reverse(); 1.950 + let cleanedSegments = []; 1.951 + let possibleType = ""; 1.952 + 1.953 + for (let i = 0, slashLen = urlSlashes.length; i < slashLen; i += 1) { 1.954 + let segment = urlSlashes[i]; 1.955 + 1.956 + // Split off and save anything that looks like a file type. 1.957 + if (segment.indexOf(".") !== -1) { 1.958 + possibleType = segment.split(".")[1]; 1.959 + 1.960 + // If the type isn't alpha-only, it's probably not actually a file extension. 
1.961 + if (!possibleType.match(/[^a-zA-Z]/)) 1.962 + segment = segment.split(".")[0]; 1.963 + } 1.964 + 1.965 + // EW-CMS specific segment replacement. Ugly. 1.966 + // Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html 1.967 + if (segment.indexOf(',00') !== -1) 1.968 + segment = segment.replace(',00', ''); 1.969 + 1.970 + // If our first or second segment has anything looking like a page number, remove it. 1.971 + if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) 1.972 + segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, ""); 1.973 + 1.974 + let del = false; 1.975 + 1.976 + // If this is purely a number, and it's the first or second segment, 1.977 + // it's probably a page number. Remove it. 1.978 + if (i < 2 && segment.match(/^\d{1,2}$/)) 1.979 + del = true; 1.980 + 1.981 + // If this is the first segment and it's just "index", remove it. 1.982 + if (i === 0 && segment.toLowerCase() === "index") 1.983 + del = true; 1.984 + 1.985 + // If our first or second segment is smaller than 3 characters, 1.986 + // and the first segment was purely alphas, remove it. 1.987 + if (i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) 1.988 + del = true; 1.989 + 1.990 + // If it's not marked for deletion, push it to cleanedSegments. 1.991 + if (!del) 1.992 + cleanedSegments.push(segment); 1.993 + } 1.994 + 1.995 + // This is our final, cleaned, base article URL. 1.996 + return uri.scheme + "://" + uri.host + cleanedSegments.reverse().join("/"); 1.997 + }, 1.998 + 1.999 + /** 1.1000 + * Look for any paging links that may occur within the document. 
   *
   * @param body
   * @return object (array)
   **/
  _findNextPageLink: function(elem) {
    let uri = this._uri;
    let possiblePages = {};
    let allLinks = elem.getElementsByTagName('a');
    let articleBaseUrl = this._findBaseUrl();

    // Loop through all links, looking for hints that they may be next-page links.
    // Things like having "page" in their textContent, className or id, or being a child
    // of a node with a page-y className or id.
    //
    // Also possible: levenshtein distance? longest common subsequence?
    //
    // After we do that, assign each page a score, and
    for (let i = 0, il = allLinks.length; i < il; i += 1) {
      let link = allLinks[i];
      // Strip fragments and trailing slashes so equivalent URLs compare equal.
      let linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');

      // If we've already seen this page, ignore it.
      if (linkHref === "" ||
          linkHref === articleBaseUrl ||
          linkHref === uri.spec ||
          linkHref in this._parsedPages) {
        continue;
      }

      // If it's on a different domain, skip it.
      // (splitting on runs of slashes makes index 1 the host of scheme://host/...)
      if (uri.host !== linkHref.split(/\/+/g)[1])
        continue;

      let linkText = this._getInnerText(link);

      // If the linkText looks like it's not the next page, skip it.
      if (linkText.match(this.REGEXPS.extraneous) || linkText.length > 25)
        continue;

      // If the leftovers of the URL after removing the base URL don't contain
      // any digits, it's certainly not a next page link.
      let linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
      if (!linkHrefLeftover.match(/\d/))
        continue;

      // Multiple anchors can point at the same page; accumulate their link
      // text on a single candidate entry.
      if (!(linkHref in possiblePages)) {
        possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
      } else {
        possiblePages[linkHref].linkText += ' | ' + linkText;
      }

      let linkObj = possiblePages[linkHref];

      // If the articleBaseUrl isn't part of this URL, penalize this link. It could
      // still be the link, but the odds are lower.
      // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
      if (linkHref.indexOf(articleBaseUrl) !== 0)
        linkObj.score -= 25;

      let linkData = linkText + ' ' + link.className + ' ' + link.id;
      if (linkData.match(this.REGEXPS.nextLink))
        linkObj.score += 50;

      if (linkData.match(/pag(e|ing|inat)/i))
        linkObj.score += 25;

      if (linkData.match(/(first|last)/i)) {
        // -65 is enough to negate any bonuses gotten from a > or » in the text,
        // If we already matched on "next", last is probably fine.
        // If we didn't, then it's bad. Penalize.
        if (!linkObj.linkText.match(this.REGEXPS.nextLink))
          linkObj.score -= 65;
      }

      if (linkData.match(this.REGEXPS.negative) || linkData.match(this.REGEXPS.extraneous))
        linkObj.score -= 50;

      if (linkData.match(this.REGEXPS.prevLink))
        linkObj.score -= 200;

      // If a parentNode contains page or paging or paginat
      let parentNode = link.parentNode;
      let positiveNodeMatch = false;
      let negativeNodeMatch = false;

      // Each kind of ancestor match is only scored once per link.
      while (parentNode) {
        let parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;

        if (!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
          positiveNodeMatch = true;
          linkObj.score += 25;
        }

        if (!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(this.REGEXPS.negative)) {
          // If this is just something like "footer", give it a negative.
          // If it's something like "body-and-footer", leave it be.
          if (!parentNodeClassAndId.match(this.REGEXPS.positive)) {
            linkObj.score -= 25;
            negativeNodeMatch = true;
          }
        }

        parentNode = parentNode.parentNode;
      }

      // If the URL looks like it has paging in it, add to the score.
      // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
      if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i))
        linkObj.score += 25;

      // If the URL contains negative values, give a slight decrease.
      if (linkHref.match(this.REGEXPS.extraneous))
        linkObj.score -= 15;

      /**
       * Minor punishment to anything that doesn't match our current URL.
       * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
       * Dan, can you show me a counterexample where this is necessary?
       * if (linkHref.indexOf(window.location.href) !== 0) {
       *   linkObj.score -= 1;
       * }
       **/

      // If the link text can be parsed as a number, give it a minor bonus, with a slight
      // bias towards lower numbered pages. This is so that pages that might not have 'next'
      // in their text can still get scored, and sorted properly by score.
      let linkTextAsNumber = parseInt(linkText, 10);
      if (linkTextAsNumber) {
        // Punish 1 since we're either already there, or it's probably
        // before what we want anyways.
        if (linkTextAsNumber === 1) {
          linkObj.score -= 10;
        } else {
          linkObj.score += Math.max(0, 10 - linkTextAsNumber);
        }
      }
    }

    // Loop through all of our possible pages from above and find our top
    // candidate for the next page URL. Require at least a score of 50, which
    // is a relatively high confidence that this page is the next link.
1.1142 + let topPage = null; 1.1143 + for (let page in possiblePages) { 1.1144 + if (possiblePages.hasOwnProperty(page)) { 1.1145 + if (possiblePages[page].score >= 50 && 1.1146 + (!topPage || topPage.score < possiblePages[page].score)) 1.1147 + topPage = possiblePages[page]; 1.1148 + } 1.1149 + } 1.1150 + 1.1151 + if (topPage) { 1.1152 + let nextHref = topPage.href.replace(/\/$/,''); 1.1153 + 1.1154 + this.log('NEXT PAGE IS ' + nextHref); 1.1155 + this._parsedPages[nextHref] = true; 1.1156 + return nextHref; 1.1157 + } else { 1.1158 + return null; 1.1159 + } 1.1160 + }, 1.1161 + 1.1162 + _successfulRequest: function(request) { 1.1163 + return (request.status >= 200 && request.status < 300) || 1.1164 + request.status === 304 || 1.1165 + (request.status === 0 && request.responseText); 1.1166 + }, 1.1167 + 1.1168 + _ajax: function(url, options) { 1.1169 + let request = new XMLHttpRequest(); 1.1170 + 1.1171 + function respondToReadyState(readyState) { 1.1172 + if (request.readyState === 4) { 1.1173 + if (this._successfulRequest(request)) { 1.1174 + if (options.success) 1.1175 + options.success(request); 1.1176 + } else { 1.1177 + if (options.error) 1.1178 + options.error(request); 1.1179 + } 1.1180 + } 1.1181 + } 1.1182 + 1.1183 + if (typeof options === 'undefined') 1.1184 + options = {}; 1.1185 + 1.1186 + request.onreadystatechange = respondToReadyState; 1.1187 + 1.1188 + request.open('get', url, true); 1.1189 + request.setRequestHeader('Accept', 'text/html'); 1.1190 + 1.1191 + try { 1.1192 + request.send(options.postBody); 1.1193 + } catch (e) { 1.1194 + if (options.error) 1.1195 + options.error(); 1.1196 + } 1.1197 + 1.1198 + return request; 1.1199 + }, 1.1200 + 1.1201 + _appendNextPage: function(nextPageLink) { 1.1202 + let doc = this._doc; 1.1203 + this._curPageNum += 1; 1.1204 + 1.1205 + let articlePage = doc.createElement("DIV"); 1.1206 + articlePage.id = 'readability-page-' + this._curPageNum; 1.1207 + articlePage.className = 'page'; 1.1208 + 
articlePage.innerHTML = '<p class="page-separator" title="Page ' + this._curPageNum + '">§</p>'; 1.1209 + 1.1210 + doc.getElementById("readability-content").appendChild(articlePage); 1.1211 + 1.1212 + if (this._curPageNum > this.MAX_PAGES) { 1.1213 + let nextPageMarkup = "<div style='text-align: center'><a href='" + nextPageLink + "'>View Next Page</a></div>"; 1.1214 + articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup; 1.1215 + return; 1.1216 + } 1.1217 + 1.1218 + // Now that we've built the article page DOM element, get the page content 1.1219 + // asynchronously and load the cleaned content into the div we created for it. 1.1220 + (function(pageUrl, thisPage) { 1.1221 + this._ajax(pageUrl, { 1.1222 + success: function(r) { 1.1223 + 1.1224 + // First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. 1.1225 + let eTag = r.getResponseHeader('ETag'); 1.1226 + if (eTag) { 1.1227 + if (eTag in this._pageETags) { 1.1228 + this.log("Exact duplicate page found via ETag. Aborting."); 1.1229 + articlePage.style.display = 'none'; 1.1230 + return; 1.1231 + } else { 1.1232 + this._pageETags[eTag] = 1; 1.1233 + } 1.1234 + } 1.1235 + 1.1236 + // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away. 1.1237 + let page = doc.createElement("DIV"); 1.1238 + 1.1239 + // Do some preprocessing to our HTML to make it ready for appending. 1.1240 + // - Remove any script tags. Swap and reswap newlines with a unicode 1.1241 + // character because multiline regex doesn't work in javascript. 1.1242 + // - Turn any noscript tags into divs so that we can parse them. This 1.1243 + // allows us to find any next page links hidden via javascript. 1.1244 + // - Turn all double br's into p's - was handled by prepDocument in the original view. 1.1245 + // Maybe in the future abstract out prepDocument to work for both the original document 1.1246 + // and AJAX-added pages. 
1.1247 + let responseHtml = r.responseText.replace(/\n/g,'\uffff').replace(/<script.*?>.*?<\/script>/gi, ''); 1.1248 + responseHtml = responseHtml.replace(/\n/g,'\uffff').replace(/<script.*?>.*?<\/script>/gi, ''); 1.1249 + responseHtml = responseHtml.replace(/\uffff/g,'\n').replace(/<(\/?)noscript/gi, '<$1div'); 1.1250 + responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>'); 1.1251 + 1.1252 + page.innerHTML = responseHtml; 1.1253 + this._replaceBrs(page); 1.1254 + 1.1255 + // Reset all flags for the next page, as they will search through it and 1.1256 + // disable as necessary at the end of grabArticle. 1.1257 + this._flags = 0x1 | 0x2 | 0x4; 1.1258 + 1.1259 + let nextPageLink = this._findNextPageLink(page); 1.1260 + 1.1261 + // NOTE: if we end up supporting _appendNextPage(), we'll need to 1.1262 + // change this call to be async 1.1263 + let content = this._grabArticle(page); 1.1264 + 1.1265 + if (!content) { 1.1266 + this.log("No content found in page to append. Aborting."); 1.1267 + return; 1.1268 + } 1.1269 + 1.1270 + // Anti-duplicate mechanism. Essentially, get the first paragraph of our new page. 1.1271 + // Compare it against all of the the previous document's we've gotten. If the previous 1.1272 + // document contains exactly the innerHTML of this first paragraph, it's probably a duplicate. 1.1273 + let firstP = content.getElementsByTagName("P").length ? 
content.getElementsByTagName("P")[0] : null; 1.1274 + if (firstP && firstP.innerHTML.length > 100) { 1.1275 + for (let i = 1; i <= this._curPageNum; i += 1) { 1.1276 + let rPage = doc.getElementById('readability-page-' + i); 1.1277 + if (rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) { 1.1278 + this.log('Duplicate of page ' + i + ' - skipping.'); 1.1279 + articlePage.style.display = 'none'; 1.1280 + this._parsedPages[pageUrl] = true; 1.1281 + return; 1.1282 + } 1.1283 + } 1.1284 + } 1.1285 + 1.1286 + this._removeScripts(content); 1.1287 + 1.1288 + thisPage.innerHTML = thisPage.innerHTML + content.innerHTML; 1.1289 + 1.1290 + // After the page has rendered, post process the content. This delay is necessary because, 1.1291 + // in webkit at least, offsetWidth is not set in time to determine image width. We have to 1.1292 + // wait a little bit for reflow to finish before we can fix floating images. 1.1293 + setTimeout((function() { 1.1294 + this._postProcessContent(thisPage); 1.1295 + }).bind(this), 500); 1.1296 + 1.1297 + 1.1298 + if (nextPageLink) 1.1299 + this._appendNextPage(nextPageLink); 1.1300 + } 1.1301 + }); 1.1302 + }).bind(this)(nextPageLink, articlePage); 1.1303 + }, 1.1304 + 1.1305 + /** 1.1306 + * Get an elements class/id weight. Uses regular expressions to tell if this 1.1307 + * element looks good or bad. 
   *
   * @param Element
   * @return number (Integer)
   **/
  _getClassWeight: function(e) {
    // Class/id weighting can be switched off wholesale via this flag.
    if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES))
      return 0;

    let weight = 0;

    // Look for a special classname. +/-25 per matching attribute, so the
    // result is one of -50, -25, 0, +25, +50 (class and id can cancel out).
    if (typeof(e.className) === 'string' && e.className !== '') {
      if (e.className.search(this.REGEXPS.negative) !== -1)
        weight -= 25;

      if (e.className.search(this.REGEXPS.positive) !== -1)
        weight += 25;
    }

    // Look for a special ID
    if (typeof(e.id) === 'string' && e.id !== '') {
      if (e.id.search(this.REGEXPS.negative) !== -1)
        weight -= 25;

      if (e.id.search(this.REGEXPS.positive) !== -1)
        weight += 25;
    }

    return weight;
  },

  /**
   * Clean a node of all elements of type "tag".
   * (Unless it's a youtube/vimeo video. People love movies.)
   *
   * @param Element
   * @param string tag to clean
   * @return void
   **/
  _clean: function(e, tag) {
    let targetList = e.getElementsByTagName(tag);
    let isEmbed = (tag === 'object' || tag === 'embed');

    // Traverse backwards: targetList is live and shrinks as nodes are removed.
    for (let y = targetList.length - 1; y >= 0; y -= 1) {
      // Allow youtube and vimeo videos through as people usually want to see those.
      if (isEmbed) {
        let attributeValues = "";
        for (let i = 0, il = targetList[y].attributes.length; i < il; i += 1) {
          attributeValues += targetList[y].attributes[i].value + '|';
        }

        // First, check the elements attributes to see if any of them contain youtube or vimeo
        if (attributeValues.search(this.REGEXPS.videos) !== -1)
          continue;

        // Then check the elements inside this element for the same.
        if (targetList[y].innerHTML.search(this.REGEXPS.videos) !== -1)
          continue;
      }

      targetList[y].parentNode.removeChild(targetList[y]);
    }
  },

  /**
   * Clean an element of all tags of type "tag" if they look fishy.
   * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
   *
   * @return void
   **/
  _cleanConditionally: function(e, tag) {
    if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
      return;

    let tagsList = e.getElementsByTagName(tag);
    let curTagsLength = tagsList.length;

    // Gather counts for other typical elements embedded within.
    // Traverse backwards so we can remove nodes at the same time
    // without affecting the traversal.
    //
    // TODO: Consider taking into account original contentScore here.
    for (let i = curTagsLength-1; i >= 0; i -= 1) {
      let weight = this._getClassWeight(tagsList[i]);
      // contentScore is always 0 here; it only keeps the comparison below in
      // the same shape as the main scorer (see the TODO above).
      let contentScore = 0;

      this.log("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")");

      if (weight + contentScore < 0) {
        // Negatively-classed nodes are removed outright.
        tagsList[i].parentNode.removeChild(tagsList[i]);
      } else if (this._getCharCount(tagsList[i],',') < 10) {
        // If there are not very many commas, and the number of
        // non-paragraph elements is more than paragraphs or other
        // ominous signs, remove the element.
1.1402 + let p = tagsList[i].getElementsByTagName("p").length; 1.1403 + let img = tagsList[i].getElementsByTagName("img").length; 1.1404 + let li = tagsList[i].getElementsByTagName("li").length-100; 1.1405 + let input = tagsList[i].getElementsByTagName("input").length; 1.1406 + 1.1407 + let embedCount = 0; 1.1408 + let embeds = tagsList[i].getElementsByTagName("embed"); 1.1409 + for (let ei = 0, il = embeds.length; ei < il; ei += 1) { 1.1410 + if (embeds[ei].src.search(this.REGEXPS.videos) === -1) 1.1411 + embedCount += 1; 1.1412 + } 1.1413 + 1.1414 + let linkDensity = this._getLinkDensity(tagsList[i]); 1.1415 + let contentLength = this._getInnerText(tagsList[i]).length; 1.1416 + let toRemove = false; 1.1417 + 1.1418 + if (img > p) { 1.1419 + toRemove = true; 1.1420 + } else if (li > p && tag !== "ul" && tag !== "ol") { 1.1421 + toRemove = true; 1.1422 + } else if ( input > Math.floor(p/3) ) { 1.1423 + toRemove = true; 1.1424 + } else if (contentLength < 25 && (img === 0 || img > 2) ) { 1.1425 + toRemove = true; 1.1426 + } else if (weight < 25 && linkDensity > 0.2) { 1.1427 + toRemove = true; 1.1428 + } else if (weight >= 25 && linkDensity > 0.5) { 1.1429 + toRemove = true; 1.1430 + } else if ((embedCount === 1 && contentLength < 75) || embedCount > 1) { 1.1431 + toRemove = true; 1.1432 + } 1.1433 + 1.1434 + if (toRemove) 1.1435 + tagsList[i].parentNode.removeChild(tagsList[i]); 1.1436 + } 1.1437 + } 1.1438 + }, 1.1439 + 1.1440 + /** 1.1441 + * Clean out spurious headers from an Element. Checks things like classnames and link density. 
1.1442 + * 1.1443 + * @param Element 1.1444 + * @return void 1.1445 + **/ 1.1446 + _cleanHeaders: function(e) { 1.1447 + for (let headerIndex = 1; headerIndex < 3; headerIndex += 1) { 1.1448 + let headers = e.getElementsByTagName('h' + headerIndex); 1.1449 + for (let i = headers.length - 1; i >= 0; i -= 1) { 1.1450 + if (this._getClassWeight(headers[i]) < 0 || this._getLinkDensity(headers[i]) > 0.33) 1.1451 + headers[i].parentNode.removeChild(headers[i]); 1.1452 + } 1.1453 + } 1.1454 + }, 1.1455 + 1.1456 + _flagIsActive: function(flag) { 1.1457 + return (this._flags & flag) > 0; 1.1458 + }, 1.1459 + 1.1460 + _addFlag: function(flag) { 1.1461 + this._flags = this._flags | flag; 1.1462 + }, 1.1463 + 1.1464 + _removeFlag: function(flag) { 1.1465 + this._flags = this._flags & ~flag; 1.1466 + }, 1.1467 + 1.1468 + /** 1.1469 + * Runs readability. 1.1470 + * 1.1471 + * Workflow: 1.1472 + * 1. Prep the document by removing script tags, css, etc. 1.1473 + * 2. Build readability's DOM tree. 1.1474 + * 3. Grab the article content from the current dom tree. 1.1475 + * 4. Replace the current DOM tree with the new one. 1.1476 + * 5. Read peacefully. 1.1477 + * 1.1478 + * @return void 1.1479 + **/ 1.1480 + parse: function () { 1.1481 + // Remove script tags from the document. 1.1482 + this._removeScripts(this._doc); 1.1483 + 1.1484 + // FIXME: Disabled multi-page article support for now as it 1.1485 + // needs more work on infrastructure. 1.1486 + 1.1487 + // Make sure this document is added to the list of parsed pages first, 1.1488 + // so we don't double up on the first page. 1.1489 + // this._parsedPages[uri.spec.replace(/\/$/, '')] = true; 1.1490 + 1.1491 + // Pull out any possible next page link first. 
1.1492 + // let nextPageLink = this._findNextPageLink(doc.body); 1.1493 + 1.1494 + this._prepDocument(); 1.1495 + 1.1496 + let articleTitle = this._getArticleTitle(); 1.1497 + let articleContent = this._grabArticle(); 1.1498 + if (!articleContent) 1.1499 + return null; 1.1500 + 1.1501 + this._postProcessContent(articleContent); 1.1502 + 1.1503 + // if (nextPageLink) { 1.1504 + // // Append any additional pages after a small timeout so that people 1.1505 + // // can start reading without having to wait for this to finish processing. 1.1506 + // setTimeout((function() { 1.1507 + // this._appendNextPage(nextPageLink); 1.1508 + // }).bind(this), 500); 1.1509 + // } 1.1510 + 1.1511 + let excerpt = this._getExcerpt(articleContent); 1.1512 + 1.1513 + return { title: articleTitle, 1.1514 + byline: this._articleByline, 1.1515 + dir: this._articleDir, 1.1516 + content: articleContent.innerHTML, 1.1517 + length: articleContent.textContent.length, 1.1518 + excerpt: excerpt }; 1.1519 + } 1.1520 +};