Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /* |
michael@0 | 2 | * Copyright (c) 2010 Arc90 Inc |
michael@0 | 3 | * |
michael@0 | 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
michael@0 | 5 | * you may not use this file except in compliance with the License. |
michael@0 | 6 | * You may obtain a copy of the License at |
michael@0 | 7 | * |
michael@0 | 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
michael@0 | 9 | * |
michael@0 | 10 | * Unless required by applicable law or agreed to in writing, software |
michael@0 | 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
michael@0 | 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
michael@0 | 13 | * See the License for the specific language governing permissions and |
michael@0 | 14 | * limitations under the License. |
michael@0 | 15 | */ |
michael@0 | 16 | |
michael@0 | 17 | /* |
michael@0 | 18 | * This code is heavily based on Arc90's readability.js (1.7.1) script |
michael@0 | 19 | * available at: http://code.google.com/p/arc90labs-readability |
michael@0 | 20 | */ |
michael@0 | 21 | |
/**
 * Creates a Readability extractor bound to a single document.
 *
 * @param uri  object describing where the document came from; other methods
 *             of this object read its scheme, prePath and pathBase properties.
 * @param doc  the document the article is extracted from.
 */
var Readability = function(uri, doc) {
  const ENABLE_LOGGING = false;

  this._uri = uri;
  this._doc = doc;
  this._biggestFrame = false;
  this._articleByline = null;
  this._articleDir = null;

  // Start with all flags set
  this._flags = this.FLAG_STRIP_UNLIKELYS |
                this.FLAG_WEIGHT_CLASSES |
                this.FLAG_CLEAN_CONDITIONALLY;

  // The list of pages we've parsed in this call of readability,
  // for autopaging. As a key store for easier searching.
  this._parsedPages = {};

  // A list of the ETag headers of pages we've parsed, in case they happen to match,
  // we'll know it's a duplicate.
  this._pageETags = {};

  // Number of the page currently being processed; starts at the first page.
  this._curPageNum = 1;

  // Control whether log messages are sent to the console
  if (ENABLE_LOGGING) {
    this.log = function (msg) {
      // dump() writes its argument verbatim, so terminate each message with
      // a newline to keep successive log entries on separate lines.
      dump("Reader: (Readability) " + msg + "\n");
    };
  } else {
    this.log = function () {};
  }
};
michael@0 | 56 | |
michael@0 | 57 | Readability.prototype = { |
michael@0 | 58 | FLAG_STRIP_UNLIKELYS: 0x1, |
michael@0 | 59 | FLAG_WEIGHT_CLASSES: 0x2, |
michael@0 | 60 | FLAG_CLEAN_CONDITIONALLY: 0x4, |
michael@0 | 61 | |
michael@0 | 62 | // The number of top candidates to consider when analysing how |
michael@0 | 63 | // tight the competition is among candidates. |
michael@0 | 64 | N_TOP_CANDIDATES: 5, |
michael@0 | 65 | |
michael@0 | 66 | // The maximum number of pages to loop through before we call |
michael@0 | 67 | // it quits and just show a link. |
michael@0 | 68 | MAX_PAGES: 5, |
michael@0 | 69 | |
michael@0 | 70 | // All of the regular expressions in use within readability. |
michael@0 | 71 | // Defined up here so we don't instantiate them repeatedly in loops. |
michael@0 | 72 | REGEXPS: { |
michael@0 | 73 | unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i, |
michael@0 | 74 | okMaybeItsACandidate: /and|article|body|column|main|shadow/i, |
michael@0 | 75 | positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, |
michael@0 | 76 | negative: /hidden|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i, |
michael@0 | 77 | extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, |
michael@0 | 78 | byline: /byline|author|dateline|writtenby/i, |
michael@0 | 79 | replaceFonts: /<(\/?)font[^>]*>/gi, |
michael@0 | 80 | trim: /^\s+|\s+$/g, |
michael@0 | 81 | normalize: /\s{2,}/g, |
michael@0 | 82 | videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, |
michael@0 | 83 | nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, |
michael@0 | 84 | prevLink: /(prev|earl|old|new|<|«)/i, |
michael@0 | 85 | whitespace: /^\s*$/ |
michael@0 | 86 | }, |
michael@0 | 87 | |
michael@0 | 88 | DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ], |
michael@0 | 89 | |
michael@0 | 90 | /** |
michael@0 | 91 | * Run any post-process modifications to article content as necessary. |
michael@0 | 92 | * |
michael@0 | 93 | * @param Element |
michael@0 | 94 | * @return void |
michael@0 | 95 | **/ |
michael@0 | 96 | _postProcessContent: function(articleContent) { |
michael@0 | 97 | // Readability cannot open relative uris so we convert them to absolute uris. |
michael@0 | 98 | this._fixRelativeUris(articleContent); |
michael@0 | 99 | }, |
michael@0 | 100 | |
michael@0 | 101 | /** |
michael@0 | 102 | * Converts each <a> and <img> uri in the given element to an absolute URI. |
michael@0 | 103 | * |
michael@0 | 104 | * @param Element |
michael@0 | 105 | * @return void |
michael@0 | 106 | */ |
michael@0 | 107 | _fixRelativeUris: function(articleContent) { |
michael@0 | 108 | let scheme = this._uri.scheme; |
michael@0 | 109 | let prePath = this._uri.prePath; |
michael@0 | 110 | let pathBase = this._uri.pathBase; |
michael@0 | 111 | |
michael@0 | 112 | function toAbsoluteURI(uri) { |
michael@0 | 113 | // If this is already an absolute URI, return it. |
michael@0 | 114 | if (/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/.test(uri)) |
michael@0 | 115 | return uri; |
michael@0 | 116 | |
michael@0 | 117 | // Scheme-rooted relative URI. |
michael@0 | 118 | if (uri.substr(0, 2) == "//") |
michael@0 | 119 | return scheme + "://" + uri.substr(2); |
michael@0 | 120 | |
michael@0 | 121 | // Prepath-rooted relative URI. |
michael@0 | 122 | if (uri[0] == "/") |
michael@0 | 123 | return prePath + uri; |
michael@0 | 124 | |
michael@0 | 125 | // Standard relative URI; add entire path. pathBase already includes a |
michael@0 | 126 | // trailing "/". |
michael@0 | 127 | return pathBase + uri; |
michael@0 | 128 | } |
michael@0 | 129 | |
michael@0 | 130 | function convertRelativeURIs(tagName, propName) { |
michael@0 | 131 | let elems = articleContent.getElementsByTagName(tagName); |
michael@0 | 132 | for (let i = elems.length; --i >= 0;) { |
michael@0 | 133 | let elem = elems[i]; |
michael@0 | 134 | let relativeURI = elem.getAttribute(propName); |
michael@0 | 135 | if (relativeURI != null) |
michael@0 | 136 | elems[i].setAttribute(propName, toAbsoluteURI(relativeURI)); |
michael@0 | 137 | } |
michael@0 | 138 | } |
michael@0 | 139 | |
michael@0 | 140 | // Fix links. |
michael@0 | 141 | convertRelativeURIs("a", "href"); |
michael@0 | 142 | |
michael@0 | 143 | // Fix images. |
michael@0 | 144 | convertRelativeURIs("img", "src"); |
michael@0 | 145 | }, |
michael@0 | 146 | |
michael@0 | 147 | /** |
michael@0 | 148 | * Get the article title as an H1. |
michael@0 | 149 | * |
michael@0 | 150 | * @return void |
michael@0 | 151 | **/ |
michael@0 | 152 | _getArticleTitle: function() { |
michael@0 | 153 | let doc = this._doc; |
michael@0 | 154 | let curTitle = ""; |
michael@0 | 155 | let origTitle = ""; |
michael@0 | 156 | |
michael@0 | 157 | try { |
michael@0 | 158 | curTitle = origTitle = doc.title; |
michael@0 | 159 | |
michael@0 | 160 | // If they had an element with id "title" in their HTML |
michael@0 | 161 | if (typeof curTitle !== "string") |
michael@0 | 162 | curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]); |
michael@0 | 163 | } catch(e) {} |
michael@0 | 164 | |
michael@0 | 165 | if (curTitle.match(/ [\|\-] /)) { |
michael@0 | 166 | curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); |
michael@0 | 167 | |
michael@0 | 168 | if (curTitle.split(' ').length < 3) |
michael@0 | 169 | curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); |
michael@0 | 170 | } else if (curTitle.indexOf(': ') !== -1) { |
michael@0 | 171 | curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); |
michael@0 | 172 | |
michael@0 | 173 | if (curTitle.split(' ').length < 3) |
michael@0 | 174 | curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); |
michael@0 | 175 | } else if (curTitle.length > 150 || curTitle.length < 15) { |
michael@0 | 176 | let hOnes = doc.getElementsByTagName('h1'); |
michael@0 | 177 | |
michael@0 | 178 | if (hOnes.length === 1) |
michael@0 | 179 | curTitle = this._getInnerText(hOnes[0]); |
michael@0 | 180 | } |
michael@0 | 181 | |
michael@0 | 182 | curTitle = curTitle.replace(this.REGEXPS.trim, ""); |
michael@0 | 183 | |
michael@0 | 184 | if (curTitle.split(' ').length <= 4) |
michael@0 | 185 | curTitle = origTitle; |
michael@0 | 186 | |
michael@0 | 187 | return curTitle; |
michael@0 | 188 | }, |
michael@0 | 189 | |
michael@0 | 190 | /** |
michael@0 | 191 | * Prepare the HTML document for readability to scrape it. |
michael@0 | 192 | * This includes things like stripping javascript, CSS, and handling terrible markup. |
michael@0 | 193 | * |
michael@0 | 194 | * @return void |
michael@0 | 195 | **/ |
michael@0 | 196 | _prepDocument: function() { |
michael@0 | 197 | let doc = this._doc; |
michael@0 | 198 | |
michael@0 | 199 | // In some cases a body element can't be found (if the HTML is |
michael@0 | 200 | // totally hosed for example) so we create a new body node and |
michael@0 | 201 | // append it to the document. |
michael@0 | 202 | if (doc.body === null) { |
michael@0 | 203 | let body = doc.createElement("body"); |
michael@0 | 204 | |
michael@0 | 205 | try { |
michael@0 | 206 | doc.body = body; |
michael@0 | 207 | } catch(e) { |
michael@0 | 208 | doc.documentElement.appendChild(body); |
michael@0 | 209 | this.log(e); |
michael@0 | 210 | } |
michael@0 | 211 | } |
michael@0 | 212 | |
michael@0 | 213 | // Remove all style tags in head |
michael@0 | 214 | let styleTags = doc.getElementsByTagName("style"); |
michael@0 | 215 | for (let st = 0; st < styleTags.length; st += 1) { |
michael@0 | 216 | styleTags[st].textContent = ""; |
michael@0 | 217 | } |
michael@0 | 218 | |
michael@0 | 219 | this._replaceBrs(doc.body); |
michael@0 | 220 | |
michael@0 | 221 | let fonts = doc.getElementsByTagName("FONT"); |
michael@0 | 222 | for (let i = fonts.length; --i >=0;) { |
michael@0 | 223 | this._setNodeTag(fonts[i], "SPAN"); |
michael@0 | 224 | } |
michael@0 | 225 | }, |
michael@0 | 226 | |
michael@0 | 227 | /** |
michael@0 | 228 | * Finds the next element, starting from the given node, and ignoring |
michael@0 | 229 | * whitespace in between. If the given node is an element, the same node is |
michael@0 | 230 | * returned. |
michael@0 | 231 | */ |
michael@0 | 232 | _nextElement: function (node) { |
michael@0 | 233 | let next = node; |
michael@0 | 234 | while (next |
michael@0 | 235 | && (next.nodeType != Node.ELEMENT_NODE) |
michael@0 | 236 | && this.REGEXPS.whitespace.test(next.textContent)) { |
michael@0 | 237 | next = next.nextSibling; |
michael@0 | 238 | } |
michael@0 | 239 | return next; |
michael@0 | 240 | }, |
michael@0 | 241 | |
michael@0 | 242 | /** |
michael@0 | 243 | * Replaces 2 or more successive <br> elements with a single <p>. |
michael@0 | 244 | * Whitespace between <br> elements are ignored. For example: |
michael@0 | 245 | * <div>foo<br>bar<br> <br><br>abc</div> |
michael@0 | 246 | * will become: |
michael@0 | 247 | * <div>foo<br>bar<p>abc</p></div> |
michael@0 | 248 | */ |
michael@0 | 249 | _replaceBrs: function (elem) { |
michael@0 | 250 | let brs = elem.getElementsByTagName("br"); |
michael@0 | 251 | for (let i = 0; i < brs.length; i++) { |
michael@0 | 252 | let br = brs[i]; |
michael@0 | 253 | let next = br.nextSibling; |
michael@0 | 254 | |
michael@0 | 255 | // Whether 2 or more <br> elements have been found and replaced with a |
michael@0 | 256 | // <p> block. |
michael@0 | 257 | let replaced = false; |
michael@0 | 258 | |
michael@0 | 259 | // If we find a <br> chain, remove the <br>s until we hit another element |
michael@0 | 260 | // or non-whitespace. This leaves behind the first <br> in the chain |
michael@0 | 261 | // (which will be replaced with a <p> later). |
michael@0 | 262 | while ((next = this._nextElement(next)) && (next.tagName == "BR")) { |
michael@0 | 263 | replaced = true; |
michael@0 | 264 | let sibling = next.nextSibling; |
michael@0 | 265 | next.parentNode.removeChild(next); |
michael@0 | 266 | next = sibling; |
michael@0 | 267 | } |
michael@0 | 268 | |
michael@0 | 269 | // If we removed a <br> chain, replace the remaining <br> with a <p>. Add |
michael@0 | 270 | // all sibling nodes as children of the <p> until we hit another <br> |
michael@0 | 271 | // chain. |
michael@0 | 272 | if (replaced) { |
michael@0 | 273 | let p = this._doc.createElement("p"); |
michael@0 | 274 | br.parentNode.replaceChild(p, br); |
michael@0 | 275 | |
michael@0 | 276 | next = p.nextSibling; |
michael@0 | 277 | while (next) { |
michael@0 | 278 | // If we've hit another <br><br>, we're done adding children to this <p>. |
michael@0 | 279 | if (next.tagName == "BR") { |
michael@0 | 280 | let nextElem = this._nextElement(next); |
michael@0 | 281 | if (nextElem && nextElem.tagName == "BR") |
michael@0 | 282 | break; |
michael@0 | 283 | } |
michael@0 | 284 | |
michael@0 | 285 | // Otherwise, make this node a child of the new <p>. |
michael@0 | 286 | let sibling = next.nextSibling; |
michael@0 | 287 | p.appendChild(next); |
michael@0 | 288 | next = sibling; |
michael@0 | 289 | } |
michael@0 | 290 | } |
michael@0 | 291 | } |
michael@0 | 292 | }, |
michael@0 | 293 | |
michael@0 | 294 | _setNodeTag: function (node, tag) { |
michael@0 | 295 | node.localName = tag.toLowerCase(); |
michael@0 | 296 | node.tagName = tag.toUpperCase(); |
michael@0 | 297 | }, |
michael@0 | 298 | |
michael@0 | 299 | /** |
michael@0 | 300 | * Prepare the article node for display. Clean out any inline styles, |
michael@0 | 301 | * iframes, forms, strip extraneous <p> tags, etc. |
michael@0 | 302 | * |
michael@0 | 303 | * @param Element |
michael@0 | 304 | * @return void |
michael@0 | 305 | **/ |
michael@0 | 306 | _prepArticle: function(articleContent) { |
michael@0 | 307 | this._cleanStyles(articleContent); |
michael@0 | 308 | |
michael@0 | 309 | // Clean out junk from the article content |
michael@0 | 310 | this._cleanConditionally(articleContent, "form"); |
michael@0 | 311 | this._clean(articleContent, "object"); |
michael@0 | 312 | this._clean(articleContent, "h1"); |
michael@0 | 313 | |
michael@0 | 314 | // If there is only one h2, they are probably using it as a header |
michael@0 | 315 | // and not a subheader, so remove it since we already have a header. |
michael@0 | 316 | if (articleContent.getElementsByTagName('h2').length === 1) |
michael@0 | 317 | this._clean(articleContent, "h2"); |
michael@0 | 318 | |
michael@0 | 319 | this._clean(articleContent, "iframe"); |
michael@0 | 320 | this._cleanHeaders(articleContent); |
michael@0 | 321 | |
michael@0 | 322 | // Do these last as the previous stuff may have removed junk |
michael@0 | 323 | // that will affect these |
michael@0 | 324 | this._cleanConditionally(articleContent, "table"); |
michael@0 | 325 | this._cleanConditionally(articleContent, "ul"); |
michael@0 | 326 | this._cleanConditionally(articleContent, "div"); |
michael@0 | 327 | |
michael@0 | 328 | // Remove extra paragraphs |
michael@0 | 329 | let articleParagraphs = articleContent.getElementsByTagName('p'); |
michael@0 | 330 | for (let i = articleParagraphs.length - 1; i >= 0; i -= 1) { |
michael@0 | 331 | let imgCount = articleParagraphs[i].getElementsByTagName('img').length; |
michael@0 | 332 | let embedCount = articleParagraphs[i].getElementsByTagName('embed').length; |
michael@0 | 333 | let objectCount = articleParagraphs[i].getElementsByTagName('object').length; |
michael@0 | 334 | |
michael@0 | 335 | if (imgCount === 0 && |
michael@0 | 336 | embedCount === 0 && |
michael@0 | 337 | objectCount === 0 && |
michael@0 | 338 | this._getInnerText(articleParagraphs[i], false) === '') |
michael@0 | 339 | articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]); |
michael@0 | 340 | } |
michael@0 | 341 | |
michael@0 | 342 | let brs = articleContent.getElementsByTagName("BR"); |
michael@0 | 343 | for (let i = brs.length; --i >= 0;) { |
michael@0 | 344 | let br = brs[i]; |
michael@0 | 345 | let next = this._nextElement(br.nextSibling); |
michael@0 | 346 | if (next && next.tagName == "P") |
michael@0 | 347 | br.parentNode.removeChild(br); |
michael@0 | 348 | } |
michael@0 | 349 | }, |
michael@0 | 350 | |
michael@0 | 351 | /** |
michael@0 | 352 | * Initialize a node with the readability object. Also checks the |
michael@0 | 353 | * className/id for special names to add to its score. |
michael@0 | 354 | * |
michael@0 | 355 | * @param Element |
michael@0 | 356 | * @return void |
michael@0 | 357 | **/ |
michael@0 | 358 | _initializeNode: function(node) { |
michael@0 | 359 | node.readability = {"contentScore": 0}; |
michael@0 | 360 | |
michael@0 | 361 | switch(node.tagName) { |
michael@0 | 362 | case 'DIV': |
michael@0 | 363 | node.readability.contentScore += 5; |
michael@0 | 364 | break; |
michael@0 | 365 | |
michael@0 | 366 | case 'PRE': |
michael@0 | 367 | case 'TD': |
michael@0 | 368 | case 'BLOCKQUOTE': |
michael@0 | 369 | node.readability.contentScore += 3; |
michael@0 | 370 | break; |
michael@0 | 371 | |
michael@0 | 372 | case 'ADDRESS': |
michael@0 | 373 | case 'OL': |
michael@0 | 374 | case 'UL': |
michael@0 | 375 | case 'DL': |
michael@0 | 376 | case 'DD': |
michael@0 | 377 | case 'DT': |
michael@0 | 378 | case 'LI': |
michael@0 | 379 | case 'FORM': |
michael@0 | 380 | node.readability.contentScore -= 3; |
michael@0 | 381 | break; |
michael@0 | 382 | |
michael@0 | 383 | case 'H1': |
michael@0 | 384 | case 'H2': |
michael@0 | 385 | case 'H3': |
michael@0 | 386 | case 'H4': |
michael@0 | 387 | case 'H5': |
michael@0 | 388 | case 'H6': |
michael@0 | 389 | case 'TH': |
michael@0 | 390 | node.readability.contentScore -= 5; |
michael@0 | 391 | break; |
michael@0 | 392 | } |
michael@0 | 393 | |
michael@0 | 394 | node.readability.contentScore += this._getClassWeight(node); |
michael@0 | 395 | }, |
michael@0 | 396 | |
michael@0 | 397 | /*** |
michael@0 | 398 | * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is |
michael@0 | 399 | * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. |
michael@0 | 400 | * |
michael@0 | 401 | * @param page a document to run upon. Needs to be a full document, complete with body. |
michael@0 | 402 | * @return Element |
michael@0 | 403 | **/ |
michael@0 | 404 | _grabArticle: function (page) { |
michael@0 | 405 | let doc = this._doc; |
michael@0 | 406 | let isPaging = (page !== null ? true: false); |
michael@0 | 407 | page = page ? page : this._doc.body; |
michael@0 | 408 | let pageCacheHtml = page.innerHTML; |
michael@0 | 409 | |
michael@0 | 410 | // Check if any "dir" is set on the toplevel document element |
michael@0 | 411 | this._articleDir = doc.documentElement.getAttribute("dir"); |
michael@0 | 412 | |
michael@0 | 413 | while (true) { |
michael@0 | 414 | let stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); |
michael@0 | 415 | let allElements = page.getElementsByTagName('*'); |
michael@0 | 416 | |
michael@0 | 417 | // First, node prepping. Trash nodes that look cruddy (like ones with the |
michael@0 | 418 | // class name "comment", etc), and turn divs into P tags where they have been |
michael@0 | 419 | // used inappropriately (as in, where they contain no other block level elements.) |
michael@0 | 420 | // |
michael@0 | 421 | // Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 |
michael@0 | 422 | // TODO: Shouldn't this be a reverse traversal? |
michael@0 | 423 | let node = null; |
michael@0 | 424 | let nodesToScore = []; |
michael@0 | 425 | |
michael@0 | 426 | // Let each node know its index in the allElements array. |
michael@0 | 427 | for (let i = allElements.length; --i >= 0;) { |
michael@0 | 428 | allElements[i]._index = i; |
michael@0 | 429 | } |
michael@0 | 430 | |
michael@0 | 431 | /** |
michael@0 | 432 | * JSDOMParser returns static node lists, not live ones. When we remove |
michael@0 | 433 | * an element from the document, we need to manually remove it - and all |
michael@0 | 434 | * of its children - from the allElements array. |
michael@0 | 435 | */ |
michael@0 | 436 | function purgeNode(node) { |
michael@0 | 437 | for (let i = node.childNodes.length; --i >= 0;) { |
michael@0 | 438 | purgeNode(node.childNodes[i]); |
michael@0 | 439 | } |
michael@0 | 440 | if (node._index !== undefined && allElements[node._index] == node) |
michael@0 | 441 | delete allElements[node._index]; |
michael@0 | 442 | } |
michael@0 | 443 | |
michael@0 | 444 | for (let nodeIndex = 0; nodeIndex < allElements.length; nodeIndex++) { |
michael@0 | 445 | if (!(node = allElements[nodeIndex])) |
michael@0 | 446 | continue; |
michael@0 | 447 | |
michael@0 | 448 | let matchString = node.className + node.id; |
michael@0 | 449 | if (matchString.search(this.REGEXPS.byline) !== -1 && !this._articleByline) { |
michael@0 | 450 | this._articleByline = node.textContent; |
michael@0 | 451 | node.parentNode.removeChild(node); |
michael@0 | 452 | purgeNode(node); |
michael@0 | 453 | continue; |
michael@0 | 454 | } |
michael@0 | 455 | |
michael@0 | 456 | // Remove unlikely candidates |
michael@0 | 457 | if (stripUnlikelyCandidates) { |
michael@0 | 458 | if (matchString.search(this.REGEXPS.unlikelyCandidates) !== -1 && |
michael@0 | 459 | matchString.search(this.REGEXPS.okMaybeItsACandidate) === -1 && |
michael@0 | 460 | node.tagName !== "BODY") { |
michael@0 | 461 | this.log("Removing unlikely candidate - " + matchString); |
michael@0 | 462 | node.parentNode.removeChild(node); |
michael@0 | 463 | purgeNode(node); |
michael@0 | 464 | continue; |
michael@0 | 465 | } |
michael@0 | 466 | } |
michael@0 | 467 | |
michael@0 | 468 | if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") |
michael@0 | 469 | nodesToScore[nodesToScore.length] = node; |
michael@0 | 470 | |
michael@0 | 471 | // Turn all divs that don't have children block level elements into p's |
michael@0 | 472 | if (node.tagName === "DIV") { |
michael@0 | 473 | // Sites like http://mobile.slate.com encloses each paragraph with a DIV |
michael@0 | 474 | // element. DIVs with only a P element inside and no text content can be |
michael@0 | 475 | // safely converted into plain P elements to avoid confusing the scoring |
michael@0 | 476 | // algorithm with DIVs with are, in practice, paragraphs. |
michael@0 | 477 | let pIndex = this._getSinglePIndexInsideDiv(node); |
michael@0 | 478 | |
michael@0 | 479 | if (pIndex >= 0 || !this._hasChildBlockElement(node)) { |
michael@0 | 480 | if (pIndex >= 0) { |
michael@0 | 481 | let newNode = node.childNodes[pIndex]; |
michael@0 | 482 | node.parentNode.replaceChild(newNode, node); |
michael@0 | 483 | purgeNode(node); |
michael@0 | 484 | } else { |
michael@0 | 485 | this._setNodeTag(node, "P"); |
michael@0 | 486 | nodesToScore[nodesToScore.length] = node; |
michael@0 | 487 | } |
michael@0 | 488 | } else { |
michael@0 | 489 | // EXPERIMENTAL |
michael@0 | 490 | for (let i = 0, il = node.childNodes.length; i < il; i += 1) { |
michael@0 | 491 | let childNode = node.childNodes[i]; |
michael@0 | 492 | if (!childNode) |
michael@0 | 493 | continue; |
michael@0 | 494 | |
michael@0 | 495 | if (childNode.nodeType === 3) { // Node.TEXT_NODE |
michael@0 | 496 | let p = doc.createElement('p'); |
michael@0 | 497 | p.textContent = childNode.textContent; |
michael@0 | 498 | p.style.display = 'inline'; |
michael@0 | 499 | p.className = 'readability-styled'; |
michael@0 | 500 | childNode.parentNode.replaceChild(p, childNode); |
michael@0 | 501 | } |
michael@0 | 502 | } |
michael@0 | 503 | } |
michael@0 | 504 | } |
michael@0 | 505 | } |
michael@0 | 506 | |
michael@0 | 507 | /** |
michael@0 | 508 | * Loop through all paragraphs, and assign a score to them based on how content-y they look. |
michael@0 | 509 | * Then add their score to their parent node. |
michael@0 | 510 | * |
michael@0 | 511 | * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. |
michael@0 | 512 | **/ |
michael@0 | 513 | let candidates = []; |
michael@0 | 514 | for (let pt = 0; pt < nodesToScore.length; pt += 1) { |
michael@0 | 515 | let parentNode = nodesToScore[pt].parentNode; |
michael@0 | 516 | let grandParentNode = parentNode ? parentNode.parentNode : null; |
michael@0 | 517 | let innerText = this._getInnerText(nodesToScore[pt]); |
michael@0 | 518 | |
michael@0 | 519 | if (!parentNode || typeof(parentNode.tagName) === 'undefined') |
michael@0 | 520 | continue; |
michael@0 | 521 | |
michael@0 | 522 | // If this paragraph is less than 25 characters, don't even count it. |
michael@0 | 523 | if (innerText.length < 25) |
michael@0 | 524 | continue; |
michael@0 | 525 | |
michael@0 | 526 | // Initialize readability data for the parent. |
michael@0 | 527 | if (typeof parentNode.readability === 'undefined') { |
michael@0 | 528 | this._initializeNode(parentNode); |
michael@0 | 529 | candidates.push(parentNode); |
michael@0 | 530 | } |
michael@0 | 531 | |
michael@0 | 532 | // Initialize readability data for the grandparent. |
michael@0 | 533 | if (grandParentNode && |
michael@0 | 534 | typeof(grandParentNode.readability) === 'undefined' && |
michael@0 | 535 | typeof(grandParentNode.tagName) !== 'undefined') { |
michael@0 | 536 | this._initializeNode(grandParentNode); |
michael@0 | 537 | candidates.push(grandParentNode); |
michael@0 | 538 | } |
michael@0 | 539 | |
michael@0 | 540 | let contentScore = 0; |
michael@0 | 541 | |
michael@0 | 542 | // Add a point for the paragraph itself as a base. |
michael@0 | 543 | contentScore += 1; |
michael@0 | 544 | |
michael@0 | 545 | // Add points for any commas within this paragraph. |
michael@0 | 546 | contentScore += innerText.split(',').length; |
michael@0 | 547 | |
michael@0 | 548 | // For every 100 characters in this paragraph, add another point. Up to 3 points. |
michael@0 | 549 | contentScore += Math.min(Math.floor(innerText.length / 100), 3); |
michael@0 | 550 | |
michael@0 | 551 | // Add the score to the parent. The grandparent gets half. |
michael@0 | 552 | parentNode.readability.contentScore += contentScore; |
michael@0 | 553 | |
michael@0 | 554 | if (grandParentNode) |
michael@0 | 555 | grandParentNode.readability.contentScore += contentScore / 2; |
michael@0 | 556 | } |
michael@0 | 557 | |
michael@0 | 558 | // After we've calculated scores, loop through all of the possible |
michael@0 | 559 | // candidate nodes we found and find the one with the highest score. |
michael@0 | 560 | let topCandidates = []; |
michael@0 | 561 | for (let c = 0, cl = candidates.length; c < cl; c += 1) { |
michael@0 | 562 | let candidate = candidates[c]; |
michael@0 | 563 | |
michael@0 | 564 | // Scale the final candidates score based on link density. Good content |
michael@0 | 565 | // should have a relatively small link density (5% or less) and be mostly |
michael@0 | 566 | // unaffected by this operation. |
michael@0 | 567 | let candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate)); |
michael@0 | 568 | candidate.readability.contentScore = candidateScore; |
michael@0 | 569 | |
michael@0 | 570 | this.log('Candidate: ' + candidate + " (" + candidate.className + ":" + |
michael@0 | 571 | candidate.id + ") with score " + candidateScore); |
michael@0 | 572 | |
michael@0 | 573 | for (let t = 0; t < this.N_TOP_CANDIDATES; t++) { |
michael@0 | 574 | let aTopCandidate = topCandidates[t]; |
michael@0 | 575 | |
michael@0 | 576 | if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) { |
michael@0 | 577 | topCandidates.splice(t, 0, candidate); |
michael@0 | 578 | if (topCandidates.length > this.N_TOP_CANDIDATES) |
michael@0 | 579 | topCandidates.pop(); |
michael@0 | 580 | break; |
michael@0 | 581 | } |
michael@0 | 582 | } |
michael@0 | 583 | } |
michael@0 | 584 | |
michael@0 | 585 | let topCandidate = topCandidates[0] || null; |
michael@0 | 586 | let lastTopCandidate = (topCandidates.length > 3 ? topCandidates[topCandidates.length - 1] : null); |
michael@0 | 587 | |
michael@0 | 588 | // If we still have no top candidate, just use the body as a last resort. |
michael@0 | 589 | // We also have to copy the body node so it is something we can modify. |
michael@0 | 590 | if (topCandidate === null || topCandidate.tagName === "BODY") { |
michael@0 | 591 | // Move all of the page's children into topCandidate |
michael@0 | 592 | topCandidate = doc.createElement("DIV"); |
michael@0 | 593 | let children = page.childNodes; |
michael@0 | 594 | for (let i = 0; i < children.length; ++i) { |
michael@0 | 595 | topCandidate.appendChild(children[i]); |
michael@0 | 596 | } |
michael@0 | 597 | |
michael@0 | 598 | page.appendChild(topCandidate); |
michael@0 | 599 | |
michael@0 | 600 | this._initializeNode(topCandidate); |
michael@0 | 601 | } |
michael@0 | 602 | |
michael@0 | 603 | // Now that we have the top candidate, look through its siblings for content |
michael@0 | 604 | // that might also be related. Things like preambles, content split by ads |
michael@0 | 605 | // that we removed, etc. |
michael@0 | 606 | let articleContent = doc.createElement("DIV"); |
michael@0 | 607 | if (isPaging) |
michael@0 | 608 | articleContent.id = "readability-content"; |
michael@0 | 609 | |
michael@0 | 610 | let siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); |
michael@0 | 611 | let siblingNodes = topCandidate.parentNode.childNodes; |
michael@0 | 612 | |
michael@0 | 613 | for (let s = 0, sl = siblingNodes.length; s < sl; s += 1) { |
michael@0 | 614 | let siblingNode = siblingNodes[s]; |
michael@0 | 615 | let append = false; |
michael@0 | 616 | |
michael@0 | 617 | this.log("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : '')); |
michael@0 | 618 | this.log("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown')); |
michael@0 | 619 | |
michael@0 | 620 | if (siblingNode === topCandidate) |
michael@0 | 621 | append = true; |
michael@0 | 622 | |
michael@0 | 623 | let contentBonus = 0; |
michael@0 | 624 | |
michael@0 | 625 | // Give a bonus if sibling nodes and top candidates have the example same classname |
michael@0 | 626 | if (siblingNode.className === topCandidate.className && topCandidate.className !== "") |
michael@0 | 627 | contentBonus += topCandidate.readability.contentScore * 0.2; |
michael@0 | 628 | |
michael@0 | 629 | if (typeof siblingNode.readability !== 'undefined' && |
michael@0 | 630 | (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold) |
michael@0 | 631 | append = true; |
michael@0 | 632 | |
michael@0 | 633 | if (siblingNode.nodeName === "P") { |
michael@0 | 634 | let linkDensity = this._getLinkDensity(siblingNode); |
michael@0 | 635 | let nodeContent = this._getInnerText(siblingNode); |
michael@0 | 636 | let nodeLength = nodeContent.length; |
michael@0 | 637 | |
michael@0 | 638 | if (nodeLength > 80 && linkDensity < 0.25) { |
michael@0 | 639 | append = true; |
michael@0 | 640 | } else if (nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) { |
michael@0 | 641 | append = true; |
michael@0 | 642 | } |
michael@0 | 643 | } |
michael@0 | 644 | |
michael@0 | 645 | if (append) { |
michael@0 | 646 | this.log("Appending node: " + siblingNode); |
michael@0 | 647 | |
michael@0 | 648 | // siblingNodes is a reference to the childNodes array, and |
michael@0 | 649 | // siblingNode is removed from the array when we call appendChild() |
michael@0 | 650 | // below. As a result, we must revisit this index since the nodes |
michael@0 | 651 | // have been shifted. |
michael@0 | 652 | s -= 1; |
michael@0 | 653 | sl -= 1; |
michael@0 | 654 | |
michael@0 | 655 | if (siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") { |
michael@0 | 656 | // We have a node that isn't a common block level element, like a form or td tag. |
michael@0 | 657 | // Turn it into a div so it doesn't get filtered out later by accident. |
michael@0 | 658 | this.log("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); |
michael@0 | 659 | |
michael@0 | 660 | this._setNodeTag(siblingNode, "DIV"); |
michael@0 | 661 | } |
michael@0 | 662 | |
michael@0 | 663 | // To ensure a node does not interfere with readability styles, |
michael@0 | 664 | // remove its classnames. |
michael@0 | 665 | siblingNode.className = ""; |
michael@0 | 666 | |
michael@0 | 667 | // Append sibling and subtract from our list because it removes |
michael@0 | 668 | // the node when you append to another node. |
michael@0 | 669 | articleContent.appendChild(siblingNode); |
michael@0 | 670 | } |
michael@0 | 671 | } |
michael@0 | 672 | |
michael@0 | 673 | // So we have all of the content that we need. Now we clean it up for presentation. |
michael@0 | 674 | this._prepArticle(articleContent); |
michael@0 | 675 | |
michael@0 | 676 | if (this._curPageNum === 1) { |
michael@0 | 677 | let div = doc.createElement("DIV"); |
michael@0 | 678 | div.id = "readability-page-1"; |
michael@0 | 679 | div.className = "page"; |
michael@0 | 680 | let children = articleContent.childNodes; |
michael@0 | 681 | for (let i = 0; i < children.length; ++i) { |
michael@0 | 682 | div.appendChild(children[i]); |
michael@0 | 683 | } |
michael@0 | 684 | articleContent.appendChild(div); |
michael@0 | 685 | } |
michael@0 | 686 | |
michael@0 | 687 | // Now that we've gone through the full algorithm, check to see if |
michael@0 | 688 | // we got any meaningful content. If we didn't, we may need to re-run |
michael@0 | 689 | // grabArticle with different flags set. This gives us a higher likelihood of |
michael@0 | 690 | // finding the content, and the sieve approach gives us a higher likelihood of |
michael@0 | 691 | // finding the -right- content. |
michael@0 | 692 | if (this._getInnerText(articleContent, true).length < 500) { |
michael@0 | 693 | page.innerHTML = pageCacheHtml; |
michael@0 | 694 | |
michael@0 | 695 | if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) { |
michael@0 | 696 | this._removeFlag(this.FLAG_STRIP_UNLIKELYS); |
michael@0 | 697 | } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) { |
michael@0 | 698 | this._removeFlag(this.FLAG_WEIGHT_CLASSES); |
michael@0 | 699 | } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) { |
michael@0 | 700 | this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY); |
michael@0 | 701 | } else { |
michael@0 | 702 | return null; |
michael@0 | 703 | } |
michael@0 | 704 | } else { |
michael@0 | 705 | if (lastTopCandidate !== null) { |
michael@0 | 706 | // EXPERIMENTAL: Contrast ratio is how we measure the level of competition between candidates in the |
michael@0 | 707 | // readability algorithm. This is to avoid offering reader mode on pages that are more like |
michael@0 | 708 | // a list or directory of links with summaries. It takes the score of the last top candidate |
michael@0 | 709 | // (see N_TOP_CANDIDATES) and checks how it compares to the top candidate's. On pages that are not |
michael@0 | 710 | // actual articles, there will likely be many candidates with similar score (i.e. higher contrast ratio). |
michael@0 | 711 | let contrastRatio = lastTopCandidate.readability.contentScore / topCandidate.readability.contentScore; |
michael@0 | 712 | if (contrastRatio > 0.45) |
michael@0 | 713 | return null; |
michael@0 | 714 | } |
michael@0 | 715 | |
michael@0 | 716 | return articleContent; |
michael@0 | 717 | } |
michael@0 | 718 | } |
michael@0 | 719 | }, |
michael@0 | 720 | |
michael@0 | 721 | /** |
michael@0 | 722 | * Attempts to get the excerpt from these |
michael@0 | 723 | * sources in the following order: |
michael@0 | 724 | * - meta description tag |
michael@0 | 725 | * - open-graph description |
michael@0 | 726 | * - twitter cards description |
michael@0 | 727 | * - article's first paragraph |
michael@0 | 728 | * If no excerpt is found, an empty string will be |
michael@0 | 729 | * returned. |
michael@0 | 730 | * |
michael@0 | 731 | * @param Element - root element of the processed version page |
michael@0 | 732 | * @return String - excerpt of the article |
michael@0 | 733 | **/ |
michael@0 | 734 | _getExcerpt: function(articleContent) { |
michael@0 | 735 | let values = {}; |
michael@0 | 736 | let metaElements = this._doc.getElementsByTagName("meta"); |
michael@0 | 737 | |
michael@0 | 738 | // Match "description", or Twitter's "twitter:description" (Cards) |
michael@0 | 739 | // in name attribute. |
michael@0 | 740 | let namePattern = /^\s*((twitter)\s*:\s*)?description\s*$/gi; |
michael@0 | 741 | |
michael@0 | 742 | // Match Facebook's og:description (Open Graph) in property attribute. |
michael@0 | 743 | let propertyPattern = /^\s*og\s*:\s*description\s*$/gi; |
michael@0 | 744 | |
michael@0 | 745 | // Find description tags. |
michael@0 | 746 | for (let i = 0; i < metaElements.length; i++) { |
michael@0 | 747 | let element = metaElements[i]; |
michael@0 | 748 | let elementName = element.getAttribute("name"); |
michael@0 | 749 | let elementProperty = element.getAttribute("property"); |
michael@0 | 750 | |
michael@0 | 751 | let name; |
michael@0 | 752 | if (namePattern.test(elementName)) { |
michael@0 | 753 | name = elementName; |
michael@0 | 754 | } else if (propertyPattern.test(elementProperty)) { |
michael@0 | 755 | name = elementProperty; |
michael@0 | 756 | } |
michael@0 | 757 | |
michael@0 | 758 | if (name) { |
michael@0 | 759 | let content = element.getAttribute("content"); |
michael@0 | 760 | if (content) { |
michael@0 | 761 | // Convert to lowercase and remove any whitespace |
michael@0 | 762 | // so we can match below. |
michael@0 | 763 | name = name.toLowerCase().replace(/\s/g, ''); |
michael@0 | 764 | values[name] = content.trim(); |
michael@0 | 765 | } |
michael@0 | 766 | } |
michael@0 | 767 | } |
michael@0 | 768 | |
michael@0 | 769 | if ("description" in values) { |
michael@0 | 770 | return values["description"]; |
michael@0 | 771 | } |
michael@0 | 772 | |
michael@0 | 773 | if ("og:description" in values) { |
michael@0 | 774 | // Use facebook open graph description. |
michael@0 | 775 | return values["og:description"]; |
michael@0 | 776 | } |
michael@0 | 777 | |
michael@0 | 778 | if ("twitter:description" in values) { |
michael@0 | 779 | // Use twitter cards description. |
michael@0 | 780 | return values["twitter:description"]; |
michael@0 | 781 | } |
michael@0 | 782 | |
michael@0 | 783 | // No description meta tags, use the article's first paragraph. |
michael@0 | 784 | let paragraphs = articleContent.getElementsByTagName("p"); |
michael@0 | 785 | if (paragraphs.length > 0) { |
michael@0 | 786 | return paragraphs[0].textContent; |
michael@0 | 787 | } |
michael@0 | 788 | |
michael@0 | 789 | return ""; |
michael@0 | 790 | }, |
michael@0 | 791 | |
michael@0 | 792 | /** |
michael@0 | 793 | * Removes script tags from the document. |
michael@0 | 794 | * |
michael@0 | 795 | * @param Element |
michael@0 | 796 | **/ |
michael@0 | 797 | _removeScripts: function(doc) { |
michael@0 | 798 | let scripts = doc.getElementsByTagName('script'); |
michael@0 | 799 | for (let i = scripts.length - 1; i >= 0; i -= 1) { |
michael@0 | 800 | scripts[i].nodeValue=""; |
michael@0 | 801 | scripts[i].removeAttribute('src'); |
michael@0 | 802 | |
michael@0 | 803 | if (scripts[i].parentNode) |
michael@0 | 804 | scripts[i].parentNode.removeChild(scripts[i]); |
michael@0 | 805 | } |
michael@0 | 806 | }, |
michael@0 | 807 | |
michael@0 | 808 | /** |
michael@0 | 809 | * Get child index of the only P element inside a DIV with no |
michael@0 | 810 | * text content. Returns -1 if the DIV node contains non-empty |
michael@0 | 811 | * text nodes or if it contains other element nodes. |
michael@0 | 812 | * |
michael@0 | 813 | * @param Element |
michael@0 | 814 | **/ |
michael@0 | 815 | _getSinglePIndexInsideDiv: function(e) { |
michael@0 | 816 | let childNodes = e.childNodes; |
michael@0 | 817 | let pIndex = -1; |
michael@0 | 818 | |
michael@0 | 819 | for (let i = childNodes.length; --i >= 0;) { |
michael@0 | 820 | let node = childNodes[i]; |
michael@0 | 821 | |
michael@0 | 822 | if (node.nodeType === Node.ELEMENT_NODE) { |
michael@0 | 823 | if (node.tagName !== "P") |
michael@0 | 824 | return -1; |
michael@0 | 825 | |
michael@0 | 826 | if (pIndex >= 0) |
michael@0 | 827 | return -1; |
michael@0 | 828 | |
michael@0 | 829 | pIndex = i; |
michael@0 | 830 | } else if (node.nodeType == Node.TEXT_NODE && this._getInnerText(node, false)) { |
michael@0 | 831 | return -1; |
michael@0 | 832 | } |
michael@0 | 833 | } |
michael@0 | 834 | |
michael@0 | 835 | return pIndex; |
michael@0 | 836 | }, |
michael@0 | 837 | |
michael@0 | 838 | /** |
michael@0 | 839 | * Determine whether element has any children block level elements. |
michael@0 | 840 | * |
michael@0 | 841 | * @param Element |
michael@0 | 842 | */ |
michael@0 | 843 | _hasChildBlockElement: function (e) { |
michael@0 | 844 | let length = e.childNodes.length; |
michael@0 | 845 | for (let i = 0; i < length; i++) { |
michael@0 | 846 | let child = e.childNodes[i]; |
michael@0 | 847 | if (child.nodeType != 1) |
michael@0 | 848 | continue; |
michael@0 | 849 | |
michael@0 | 850 | if (this.DIV_TO_P_ELEMS.indexOf(child.tagName) !== -1 || this._hasChildBlockElement(child)) |
michael@0 | 851 | return true; |
michael@0 | 852 | } |
michael@0 | 853 | return false; |
michael@0 | 854 | }, |
michael@0 | 855 | |
michael@0 | 856 | /** |
michael@0 | 857 | * Get the inner text of a node - cross browser compatibly. |
michael@0 | 858 | * This also strips out any excess whitespace to be found. |
michael@0 | 859 | * |
michael@0 | 860 | * @param Element |
michael@0 | 861 | * @return string |
michael@0 | 862 | **/ |
michael@0 | 863 | _getInnerText: function(e, normalizeSpaces) { |
michael@0 | 864 | let textContent = e.textContent.replace(this.REGEXPS.trim, ""); |
michael@0 | 865 | normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces; |
michael@0 | 866 | |
michael@0 | 867 | if (normalizeSpaces) { |
michael@0 | 868 | return textContent.replace(this.REGEXPS.normalize, " "); |
michael@0 | 869 | } else { |
michael@0 | 870 | return textContent; |
michael@0 | 871 | } |
michael@0 | 872 | }, |
michael@0 | 873 | |
michael@0 | 874 | /** |
michael@0 | 875 | * Get the number of times a string s appears in the node e. |
michael@0 | 876 | * |
michael@0 | 877 | * @param Element |
michael@0 | 878 | * @param string - what to split on. Default is "," |
michael@0 | 879 | * @return number (integer) |
michael@0 | 880 | **/ |
michael@0 | 881 | _getCharCount: function(e,s) { |
michael@0 | 882 | s = s || ","; |
michael@0 | 883 | return this._getInnerText(e).split(s).length - 1; |
michael@0 | 884 | }, |
michael@0 | 885 | |
michael@0 | 886 | /** |
michael@0 | 887 | * Remove the style attribute on every e and under. |
michael@0 | 888 | * TODO: Test if getElementsByTagName(*) is faster. |
michael@0 | 889 | * |
michael@0 | 890 | * @param Element |
michael@0 | 891 | * @return void |
michael@0 | 892 | **/ |
michael@0 | 893 | _cleanStyles: function(e) { |
michael@0 | 894 | e = e || this._doc; |
michael@0 | 895 | let cur = e.firstChild; |
michael@0 | 896 | |
michael@0 | 897 | if (!e) |
michael@0 | 898 | return; |
michael@0 | 899 | |
michael@0 | 900 | // Remove any root styles, if we're able. |
michael@0 | 901 | if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') |
michael@0 | 902 | e.removeAttribute('style'); |
michael@0 | 903 | |
michael@0 | 904 | // Go until there are no more child nodes |
michael@0 | 905 | while (cur !== null) { |
michael@0 | 906 | if (cur.nodeType === 1) { |
michael@0 | 907 | // Remove style attribute(s) : |
michael@0 | 908 | if (cur.className !== "readability-styled") |
michael@0 | 909 | cur.removeAttribute("style"); |
michael@0 | 910 | |
michael@0 | 911 | this._cleanStyles(cur); |
michael@0 | 912 | } |
michael@0 | 913 | |
michael@0 | 914 | cur = cur.nextSibling; |
michael@0 | 915 | } |
michael@0 | 916 | }, |
michael@0 | 917 | |
michael@0 | 918 | /** |
michael@0 | 919 | * Get the density of links as a percentage of the content |
michael@0 | 920 | * This is the amount of text that is inside a link divided by the total text in the node. |
michael@0 | 921 | * |
michael@0 | 922 | * @param Element |
michael@0 | 923 | * @return number (float) |
michael@0 | 924 | **/ |
michael@0 | 925 | _getLinkDensity: function(e) { |
michael@0 | 926 | let links = e.getElementsByTagName("a"); |
michael@0 | 927 | let textLength = this._getInnerText(e).length; |
michael@0 | 928 | let linkLength = 0; |
michael@0 | 929 | |
michael@0 | 930 | for (let i = 0, il = links.length; i < il; i += 1) { |
michael@0 | 931 | linkLength += this._getInnerText(links[i]).length; |
michael@0 | 932 | } |
michael@0 | 933 | |
michael@0 | 934 | return linkLength / textLength; |
michael@0 | 935 | }, |
michael@0 | 936 | |
michael@0 | 937 | /** |
michael@0 | 938 | * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. |
michael@0 | 939 | * Path segments are visited last-to-first, so "first segment" below means the final one in the path. |
michael@0 | 940 | * @author Dan Lacy |
michael@0 | 941 | * @return string the base url (scheme, host and cleaned path, without the query string) |
michael@0 | 942 | **/ |
michael@0 | 943 | _findBaseUrl: function() { |
michael@0 | 944 | let uri = this._uri; |
michael@0 | 945 | let noUrlParams = uri.path.split("?")[0]; |
michael@0 | 946 | let urlSlashes = noUrlParams.split("/").reverse(); |
michael@0 | 947 | let cleanedSegments = []; |
michael@0 | 948 | let possibleType = ""; |
michael@0 | 949 | |
michael@0 | 950 | for (let i = 0, slashLen = urlSlashes.length; i < slashLen; i += 1) { |
michael@0 | 951 | let segment = urlSlashes[i]; |
michael@0 | 952 | |
michael@0 | 953 | // Split off and save anything that looks like a file type. |
michael@0 | 954 | if (segment.indexOf(".") !== -1) { |
michael@0 | 955 | possibleType = segment.split(".")[1]; |
michael@0 | 956 | |
michael@0 | 957 | // Only strip it when the type is alpha-only; otherwise it's probably not a file extension. |
michael@0 | 958 | if (!possibleType.match(/[^a-zA-Z]/)) |
michael@0 | 959 | segment = segment.split(".")[0]; |
michael@0 | 960 | } |
michael@0 | 961 | |
michael@0 | 962 | // EW-CMS specific segment replacement. Ugly. |
michael@0 | 963 | // Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html |
michael@0 | 964 | if (segment.indexOf(',00') !== -1) |
michael@0 | 965 | segment = segment.replace(',00', ''); |
michael@0 | 966 | |
michael@0 | 967 | // If our first or second segment has anything looking like a page number, remove it. |
michael@0 | 968 | if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) |
michael@0 | 969 | segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, ""); |
michael@0 | 970 | |
michael@0 | 971 | let del = false; |
michael@0 | 972 | |
michael@0 | 973 | // If this is purely a number, and it's the first or second segment, |
michael@0 | 974 | // it's probably a page number. Remove it. |
michael@0 | 975 | if (i < 2 && segment.match(/^\d{1,2}$/)) |
michael@0 | 976 | del = true; |
michael@0 | 977 | |
michael@0 | 978 | // If this is the first segment and it's just "index", remove it. |
michael@0 | 979 | if (i === 0 && segment.toLowerCase() === "index") |
michael@0 | 980 | del = true; |
michael@0 | 981 | |
michael@0 | 982 | // If our first or second segment is smaller than 3 characters, |
michael@0 | 983 | // and the first segment contains no letters, remove it. |
michael@0 | 984 | if (i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) |
michael@0 | 985 | del = true; |
michael@0 | 986 | |
michael@0 | 987 | // If it's not marked for deletion, push it to cleanedSegments. |
michael@0 | 988 | if (!del) |
michael@0 | 989 | cleanedSegments.push(segment); |
michael@0 | 990 | } |
michael@0 | 991 | |
michael@0 | 992 | // This is our final, cleaned, base article URL. |
michael@0 | 993 | return uri.scheme + "://" + uri.host + cleanedSegments.reverse().join("/"); |
michael@0 | 994 | }, |
michael@0 | 995 | |
michael@0 | 996 | /** |
michael@0 | 997 | * Look for any paging links that may occur within the document. |
michael@0 | 998 | * The returned href is also recorded in this._parsedPages. |
michael@0 | 999 | * @param Element elem - node whose descendant links are scored |
michael@0 | 1000 | * @return string - href of the best candidate scoring at least 50, or null |
michael@0 | 1001 | **/ |
michael@0 | 1002 | _findNextPageLink: function(elem) { |
michael@0 | 1003 | let uri = this._uri; |
michael@0 | 1004 | let possiblePages = {}; |
michael@0 | 1005 | let allLinks = elem.getElementsByTagName('a'); |
michael@0 | 1006 | let articleBaseUrl = this._findBaseUrl(); |
michael@0 | 1007 | |
michael@0 | 1008 | // Loop through all links, looking for hints that they may be next-page links. |
michael@0 | 1009 | // Things like having "page" in their textContent, className or id, or being a child |
michael@0 | 1010 | // of a node with a page-y className or id. |
michael@0 | 1011 | // |
michael@0 | 1012 | // Also possible: levenshtein distance? longest common subsequence? |
michael@0 | 1013 | // |
michael@0 | 1014 | // After we do that, assign each page a score, and pick the best-scoring one below. |
michael@0 | 1015 | for (let i = 0, il = allLinks.length; i < il; i += 1) { |
michael@0 | 1016 | let link = allLinks[i]; |
michael@0 | 1017 | let linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ''); |
michael@0 | 1018 | |
michael@0 | 1019 | // If we've already seen this page, ignore it. |
michael@0 | 1020 | if (linkHref === "" || |
michael@0 | 1021 | linkHref === articleBaseUrl || |
michael@0 | 1022 | linkHref === uri.spec || |
michael@0 | 1023 | linkHref in this._parsedPages) { |
michael@0 | 1024 | continue; |
michael@0 | 1025 | } |
michael@0 | 1026 | |
michael@0 | 1027 | // If it's on a different domain, skip it. |
michael@0 | 1028 | if (uri.host !== linkHref.split(/\/+/g)[1]) |
michael@0 | 1029 | continue; |
michael@0 | 1030 | |
michael@0 | 1031 | let linkText = this._getInnerText(link); |
michael@0 | 1032 | |
michael@0 | 1033 | // If the linkText looks like it's not the next page, skip it. |
michael@0 | 1034 | if (linkText.match(this.REGEXPS.extraneous) || linkText.length > 25) |
michael@0 | 1035 | continue; |
michael@0 | 1036 | |
michael@0 | 1037 | // If the leftovers of the URL after removing the base URL don't contain |
michael@0 | 1038 | // any digits, it's certainly not a next page link. |
michael@0 | 1039 | let linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); |
michael@0 | 1040 | if (!linkHrefLeftover.match(/\d/)) |
michael@0 | 1041 | continue; |
michael@0 | 1042 | |
michael@0 | 1043 | if (!(linkHref in possiblePages)) { |
michael@0 | 1044 | possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref}; |
michael@0 | 1045 | } else { |
michael@0 | 1046 | possiblePages[linkHref].linkText += ' | ' + linkText; |
michael@0 | 1047 | } |
michael@0 | 1048 | |
michael@0 | 1049 | let linkObj = possiblePages[linkHref]; |
michael@0 | 1050 | |
michael@0 | 1051 | // If the articleBaseUrl isn't part of this URL, penalize this link. It could |
michael@0 | 1052 | // still be the link, but the odds are lower. |
michael@0 | 1053 | // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html |
michael@0 | 1054 | if (linkHref.indexOf(articleBaseUrl) !== 0) |
michael@0 | 1055 | linkObj.score -= 25; |
michael@0 | 1056 | |
michael@0 | 1057 | let linkData = linkText + ' ' + link.className + ' ' + link.id; |
michael@0 | 1058 | if (linkData.match(this.REGEXPS.nextLink)) |
michael@0 | 1059 | linkObj.score += 50; |
michael@0 | 1060 | |
michael@0 | 1061 | if (linkData.match(/pag(e|ing|inat)/i)) |
michael@0 | 1062 | linkObj.score += 25; |
michael@0 | 1063 | |
michael@0 | 1064 | if (linkData.match(/(first|last)/i)) { |
michael@0 | 1065 | // -65 is enough to negate any bonuses gotten from a > or » in the text, |
michael@0 | 1066 | // If we already matched on "next", last is probably fine. |
michael@0 | 1067 | // If we didn't, then it's bad. Penalize. |
michael@0 | 1068 | if (!linkObj.linkText.match(this.REGEXPS.nextLink)) |
michael@0 | 1069 | linkObj.score -= 65; |
michael@0 | 1070 | } |
michael@0 | 1071 | |
michael@0 | 1072 | if (linkData.match(this.REGEXPS.negative) || linkData.match(this.REGEXPS.extraneous)) |
michael@0 | 1073 | linkObj.score -= 50; |
michael@0 | 1074 | |
michael@0 | 1075 | if (linkData.match(this.REGEXPS.prevLink)) |
michael@0 | 1076 | linkObj.score -= 200; |
michael@0 | 1077 | |
michael@0 | 1078 | // If a parentNode contains page or paging or paginat |
michael@0 | 1079 | let parentNode = link.parentNode; |
michael@0 | 1080 | let positiveNodeMatch = false; |
michael@0 | 1081 | let negativeNodeMatch = false; |
michael@0 | 1082 | |
michael@0 | 1083 | while (parentNode) { |
michael@0 | 1084 | let parentNodeClassAndId = parentNode.className + ' ' + parentNode.id; |
michael@0 | 1085 | |
michael@0 | 1086 | if (!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) { |
michael@0 | 1087 | positiveNodeMatch = true; |
michael@0 | 1088 | linkObj.score += 25; |
michael@0 | 1089 | } |
michael@0 | 1090 | |
michael@0 | 1091 | if (!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(this.REGEXPS.negative)) { |
michael@0 | 1092 | // If this is just something like "footer", give it a negative. |
michael@0 | 1093 | // If it's something like "body-and-footer", leave it be. |
michael@0 | 1094 | if (!parentNodeClassAndId.match(this.REGEXPS.positive)) { |
michael@0 | 1095 | linkObj.score -= 25; |
michael@0 | 1096 | negativeNodeMatch = true; |
michael@0 | 1097 | } |
michael@0 | 1098 | } |
michael@0 | 1099 | |
michael@0 | 1100 | parentNode = parentNode.parentNode; |
michael@0 | 1101 | } |
michael@0 | 1102 | |
michael@0 | 1103 | // If the URL looks like it has paging in it, add to the score. |
michael@0 | 1104 | // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 |
michael@0 | 1105 | if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) |
michael@0 | 1106 | linkObj.score += 25; |
michael@0 | 1107 | |
michael@0 | 1108 | // If the URL contains negative values, give a slight decrease. |
michael@0 | 1109 | if (linkHref.match(this.REGEXPS.extraneous)) |
michael@0 | 1110 | linkObj.score -= 15; |
michael@0 | 1111 | |
michael@0 | 1112 | /** |
michael@0 | 1113 | * Minor punishment to anything that doesn't match our current URL. |
michael@0 | 1114 | * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points. |
michael@0 | 1115 | * Dan, can you show me a counterexample where this is necessary? |
michael@0 | 1116 | * if (linkHref.indexOf(window.location.href) !== 0) { |
michael@0 | 1117 | * linkObj.score -= 1; |
michael@0 | 1118 | * } |
michael@0 | 1119 | **/ |
michael@0 | 1120 | |
michael@0 | 1121 | // If the link text can be parsed as a number, give it a minor bonus, with a slight |
michael@0 | 1122 | // bias towards lower numbered pages. This is so that pages that might not have 'next' |
michael@0 | 1123 | // in their text can still get scored, and sorted properly by score. |
michael@0 | 1124 | let linkTextAsNumber = parseInt(linkText, 10); |
michael@0 | 1125 | if (linkTextAsNumber) { |
michael@0 | 1126 | // Punish 1 since we're either already there, or it's probably |
michael@0 | 1127 | // before what we want anyways. |
michael@0 | 1128 | if (linkTextAsNumber === 1) { |
michael@0 | 1129 | linkObj.score -= 10; |
michael@0 | 1130 | } else { |
michael@0 | 1131 | linkObj.score += Math.max(0, 10 - linkTextAsNumber); |
michael@0 | 1132 | } |
michael@0 | 1133 | } |
michael@0 | 1134 | } |
michael@0 | 1135 | |
michael@0 | 1136 | // Loop through all of our possible pages from above and find our top |
michael@0 | 1137 | // candidate for the next page URL. Require at least a score of 50, which |
michael@0 | 1138 | // is a relatively high confidence that this page is the next link. |
michael@0 | 1139 | let topPage = null; |
michael@0 | 1140 | for (let page in possiblePages) { |
michael@0 | 1141 | if (possiblePages.hasOwnProperty(page)) { |
michael@0 | 1142 | if (possiblePages[page].score >= 50 && |
michael@0 | 1143 | (!topPage || topPage.score < possiblePages[page].score)) |
michael@0 | 1144 | topPage = possiblePages[page]; |
michael@0 | 1145 | } |
michael@0 | 1146 | } |
michael@0 | 1147 | |
michael@0 | 1148 | if (topPage) { |
michael@0 | 1149 | let nextHref = topPage.href.replace(/\/$/,''); |
michael@0 | 1150 | |
michael@0 | 1151 | this.log('NEXT PAGE IS ' + nextHref); |
michael@0 | 1152 | this._parsedPages[nextHref] = true; |
michael@0 | 1153 | return nextHref; |
michael@0 | 1154 | } else { |
michael@0 | 1155 | return null; |
michael@0 | 1156 | } |
michael@0 | 1157 | }, |
michael@0 | 1158 | |
michael@0 | 1159 | _successfulRequest: function(request) { |
michael@0 | 1160 | return (request.status >= 200 && request.status < 300) || |
michael@0 | 1161 | request.status === 304 || |
michael@0 | 1162 | (request.status === 0 && request.responseText); |
michael@0 | 1163 | }, |
michael@0 | 1164 | |
michael@0 | 1165 | _ajax: function(url, options) { |
michael@0 | 1166 | let request = new XMLHttpRequest(); |
michael@0 | 1167 | |
michael@0 | 1168 | function respondToReadyState(readyState) { |
michael@0 | 1169 | if (request.readyState === 4) { |
michael@0 | 1170 | if (this._successfulRequest(request)) { |
michael@0 | 1171 | if (options.success) |
michael@0 | 1172 | options.success(request); |
michael@0 | 1173 | } else { |
michael@0 | 1174 | if (options.error) |
michael@0 | 1175 | options.error(request); |
michael@0 | 1176 | } |
michael@0 | 1177 | } |
michael@0 | 1178 | } |
michael@0 | 1179 | |
michael@0 | 1180 | if (typeof options === 'undefined') |
michael@0 | 1181 | options = {}; |
michael@0 | 1182 | |
michael@0 | 1183 | request.onreadystatechange = respondToReadyState; |
michael@0 | 1184 | |
michael@0 | 1185 | request.open('get', url, true); |
michael@0 | 1186 | request.setRequestHeader('Accept', 'text/html'); |
michael@0 | 1187 | |
michael@0 | 1188 | try { |
michael@0 | 1189 | request.send(options.postBody); |
michael@0 | 1190 | } catch (e) { |
michael@0 | 1191 | if (options.error) |
michael@0 | 1192 | options.error(); |
michael@0 | 1193 | } |
michael@0 | 1194 | |
michael@0 | 1195 | return request; |
michael@0 | 1196 | }, |
michael@0 | 1197 | |
michael@0 | 1198 | _appendNextPage: function(nextPageLink) { |
michael@0 | 1199 | let doc = this._doc; |
michael@0 | 1200 | this._curPageNum += 1; |
michael@0 | 1201 | |
michael@0 | 1202 | let articlePage = doc.createElement("DIV"); |
michael@0 | 1203 | articlePage.id = 'readability-page-' + this._curPageNum; |
michael@0 | 1204 | articlePage.className = 'page'; |
michael@0 | 1205 | articlePage.innerHTML = '<p class="page-separator" title="Page ' + this._curPageNum + '">§</p>'; |
michael@0 | 1206 | |
michael@0 | 1207 | doc.getElementById("readability-content").appendChild(articlePage); |
michael@0 | 1208 | |
michael@0 | 1209 | if (this._curPageNum > this.MAX_PAGES) { |
michael@0 | 1210 | let nextPageMarkup = "<div style='text-align: center'><a href='" + nextPageLink + "'>View Next Page</a></div>"; |
michael@0 | 1211 | articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup; |
michael@0 | 1212 | return; |
michael@0 | 1213 | } |
michael@0 | 1214 | |
michael@0 | 1215 | // Now that we've built the article page DOM element, get the page content |
michael@0 | 1216 | // asynchronously and load the cleaned content into the div we created for it. |
michael@0 | 1217 | (function(pageUrl, thisPage) { |
michael@0 | 1218 | this._ajax(pageUrl, { |
michael@0 | 1219 | success: function(r) { |
michael@0 | 1220 | |
michael@0 | 1221 | // First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. |
michael@0 | 1222 | let eTag = r.getResponseHeader('ETag'); |
michael@0 | 1223 | if (eTag) { |
michael@0 | 1224 | if (eTag in this._pageETags) { |
michael@0 | 1225 | this.log("Exact duplicate page found via ETag. Aborting."); |
michael@0 | 1226 | articlePage.style.display = 'none'; |
michael@0 | 1227 | return; |
michael@0 | 1228 | } else { |
michael@0 | 1229 | this._pageETags[eTag] = 1; |
michael@0 | 1230 | } |
michael@0 | 1231 | } |
michael@0 | 1232 | |
michael@0 | 1233 | // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away. |
michael@0 | 1234 | let page = doc.createElement("DIV"); |
michael@0 | 1235 | |
michael@0 | 1236 | // Do some preprocessing to our HTML to make it ready for appending. |
michael@0 | 1237 | // - Remove any script tags. Swap and reswap newlines with a unicode |
michael@0 | 1238 | // character because multiline regex doesn't work in javascript. |
michael@0 | 1239 | // - Turn any noscript tags into divs so that we can parse them. This |
michael@0 | 1240 | // allows us to find any next page links hidden via javascript. |
michael@0 | 1241 | // - Turn all double br's into p's - was handled by prepDocument in the original view. |
michael@0 | 1242 | // Maybe in the future abstract out prepDocument to work for both the original document |
michael@0 | 1243 | // and AJAX-added pages. |
michael@0 | 1244 | let responseHtml = r.responseText.replace(/\n/g,'\uffff').replace(/<script.*?>.*?<\/script>/gi, ''); |
michael@0 | 1245 | responseHtml = responseHtml.replace(/\n/g,'\uffff').replace(/<script.*?>.*?<\/script>/gi, ''); |
michael@0 | 1246 | responseHtml = responseHtml.replace(/\uffff/g,'\n').replace(/<(\/?)noscript/gi, '<$1div'); |
michael@0 | 1247 | responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>'); |
michael@0 | 1248 | |
michael@0 | 1249 | page.innerHTML = responseHtml; |
michael@0 | 1250 | this._replaceBrs(page); |
michael@0 | 1251 | |
michael@0 | 1252 | // Reset all flags for the next page, as they will search through it and |
michael@0 | 1253 | // disable as necessary at the end of grabArticle. |
michael@0 | 1254 | this._flags = 0x1 | 0x2 | 0x4; |
michael@0 | 1255 | |
michael@0 | 1256 | let nextPageLink = this._findNextPageLink(page); |
michael@0 | 1257 | |
michael@0 | 1258 | // NOTE: if we end up supporting _appendNextPage(), we'll need to |
michael@0 | 1259 | // change this call to be async |
michael@0 | 1260 | let content = this._grabArticle(page); |
michael@0 | 1261 | |
michael@0 | 1262 | if (!content) { |
michael@0 | 1263 | this.log("No content found in page to append. Aborting."); |
michael@0 | 1264 | return; |
michael@0 | 1265 | } |
michael@0 | 1266 | |
michael@0 | 1267 | // Anti-duplicate mechanism. Essentially, get the first paragraph of our new page. |
michael@0 | 1268 | // Compare it against all of the the previous document's we've gotten. If the previous |
michael@0 | 1269 | // document contains exactly the innerHTML of this first paragraph, it's probably a duplicate. |
michael@0 | 1270 | let firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null; |
michael@0 | 1271 | if (firstP && firstP.innerHTML.length > 100) { |
michael@0 | 1272 | for (let i = 1; i <= this._curPageNum; i += 1) { |
michael@0 | 1273 | let rPage = doc.getElementById('readability-page-' + i); |
michael@0 | 1274 | if (rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) { |
michael@0 | 1275 | this.log('Duplicate of page ' + i + ' - skipping.'); |
michael@0 | 1276 | articlePage.style.display = 'none'; |
michael@0 | 1277 | this._parsedPages[pageUrl] = true; |
michael@0 | 1278 | return; |
michael@0 | 1279 | } |
michael@0 | 1280 | } |
michael@0 | 1281 | } |
michael@0 | 1282 | |
michael@0 | 1283 | this._removeScripts(content); |
michael@0 | 1284 | |
michael@0 | 1285 | thisPage.innerHTML = thisPage.innerHTML + content.innerHTML; |
michael@0 | 1286 | |
michael@0 | 1287 | // After the page has rendered, post process the content. This delay is necessary because, |
michael@0 | 1288 | // in webkit at least, offsetWidth is not set in time to determine image width. We have to |
michael@0 | 1289 | // wait a little bit for reflow to finish before we can fix floating images. |
michael@0 | 1290 | setTimeout((function() { |
michael@0 | 1291 | this._postProcessContent(thisPage); |
michael@0 | 1292 | }).bind(this), 500); |
michael@0 | 1293 | |
michael@0 | 1294 | |
michael@0 | 1295 | if (nextPageLink) |
michael@0 | 1296 | this._appendNextPage(nextPageLink); |
michael@0 | 1297 | } |
michael@0 | 1298 | }); |
michael@0 | 1299 | }).bind(this)(nextPageLink, articlePage); |
michael@0 | 1300 | }, |
michael@0 | 1301 | |
michael@0 | 1302 | /** |
michael@0 | 1303 | * Get an elements class/id weight. Uses regular expressions to tell if this |
michael@0 | 1304 | * element looks good or bad. |
michael@0 | 1305 | * |
michael@0 | 1306 | * @param Element |
michael@0 | 1307 | * @return number (Integer) |
michael@0 | 1308 | **/ |
michael@0 | 1309 | _getClassWeight: function(e) { |
michael@0 | 1310 | if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) |
michael@0 | 1311 | return 0; |
michael@0 | 1312 | |
michael@0 | 1313 | let weight = 0; |
michael@0 | 1314 | |
michael@0 | 1315 | // Look for a special classname |
michael@0 | 1316 | if (typeof(e.className) === 'string' && e.className !== '') { |
michael@0 | 1317 | if (e.className.search(this.REGEXPS.negative) !== -1) |
michael@0 | 1318 | weight -= 25; |
michael@0 | 1319 | |
michael@0 | 1320 | if (e.className.search(this.REGEXPS.positive) !== -1) |
michael@0 | 1321 | weight += 25; |
michael@0 | 1322 | } |
michael@0 | 1323 | |
michael@0 | 1324 | // Look for a special ID |
michael@0 | 1325 | if (typeof(e.id) === 'string' && e.id !== '') { |
michael@0 | 1326 | if (e.id.search(this.REGEXPS.negative) !== -1) |
michael@0 | 1327 | weight -= 25; |
michael@0 | 1328 | |
michael@0 | 1329 | if (e.id.search(this.REGEXPS.positive) !== -1) |
michael@0 | 1330 | weight += 25; |
michael@0 | 1331 | } |
michael@0 | 1332 | |
michael@0 | 1333 | return weight; |
michael@0 | 1334 | }, |
michael@0 | 1335 | |
michael@0 | 1336 | /** |
michael@0 | 1337 | * Clean a node of all elements of type "tag". |
michael@0 | 1338 | * (Unless it's a youtube/vimeo video. People love movies.) |
michael@0 | 1339 | * |
michael@0 | 1340 | * @param Element |
michael@0 | 1341 | * @param string tag to clean |
michael@0 | 1342 | * @return void |
michael@0 | 1343 | **/ |
michael@0 | 1344 | _clean: function(e, tag) { |
michael@0 | 1345 | let targetList = e.getElementsByTagName(tag); |
michael@0 | 1346 | let isEmbed = (tag === 'object' || tag === 'embed'); |
michael@0 | 1347 | |
michael@0 | 1348 | for (let y = targetList.length - 1; y >= 0; y -= 1) { |
michael@0 | 1349 | // Allow youtube and vimeo videos through as people usually want to see those. |
michael@0 | 1350 | if (isEmbed) { |
michael@0 | 1351 | let attributeValues = ""; |
michael@0 | 1352 | for (let i = 0, il = targetList[y].attributes.length; i < il; i += 1) { |
michael@0 | 1353 | attributeValues += targetList[y].attributes[i].value + '|'; |
michael@0 | 1354 | } |
michael@0 | 1355 | |
michael@0 | 1356 | // First, check the elements attributes to see if any of them contain youtube or vimeo |
michael@0 | 1357 | if (attributeValues.search(this.REGEXPS.videos) !== -1) |
michael@0 | 1358 | continue; |
michael@0 | 1359 | |
michael@0 | 1360 | // Then check the elements inside this element for the same. |
michael@0 | 1361 | if (targetList[y].innerHTML.search(this.REGEXPS.videos) !== -1) |
michael@0 | 1362 | continue; |
michael@0 | 1363 | } |
michael@0 | 1364 | |
michael@0 | 1365 | targetList[y].parentNode.removeChild(targetList[y]); |
michael@0 | 1366 | } |
michael@0 | 1367 | }, |
michael@0 | 1368 | |
michael@0 | 1369 | /** |
michael@0 | 1370 | * Clean an element of all tags of type "tag" if they look fishy. |
michael@0 | 1371 | * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. |
michael@0 | 1372 | * |
michael@0 | 1373 | * @return void |
michael@0 | 1374 | **/ |
michael@0 | 1375 | _cleanConditionally: function(e, tag) { |
michael@0 | 1376 | if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) |
michael@0 | 1377 | return; |
michael@0 | 1378 | |
michael@0 | 1379 | let tagsList = e.getElementsByTagName(tag); |
michael@0 | 1380 | let curTagsLength = tagsList.length; |
michael@0 | 1381 | |
michael@0 | 1382 | // Gather counts for other typical elements embedded within. |
michael@0 | 1383 | // Traverse backwards so we can remove nodes at the same time |
michael@0 | 1384 | // without effecting the traversal. |
michael@0 | 1385 | // |
michael@0 | 1386 | // TODO: Consider taking into account original contentScore here. |
michael@0 | 1387 | for (let i = curTagsLength-1; i >= 0; i -= 1) { |
michael@0 | 1388 | let weight = this._getClassWeight(tagsList[i]); |
michael@0 | 1389 | let contentScore = 0; |
michael@0 | 1390 | |
michael@0 | 1391 | this.log("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")"); |
michael@0 | 1392 | |
michael@0 | 1393 | if (weight + contentScore < 0) { |
michael@0 | 1394 | tagsList[i].parentNode.removeChild(tagsList[i]); |
michael@0 | 1395 | } else if (this._getCharCount(tagsList[i],',') < 10) { |
michael@0 | 1396 | // If there are not very many commas, and the number of |
michael@0 | 1397 | // non-paragraph elements is more than paragraphs or other |
michael@0 | 1398 | // ominous signs, remove the element. |
michael@0 | 1399 | let p = tagsList[i].getElementsByTagName("p").length; |
michael@0 | 1400 | let img = tagsList[i].getElementsByTagName("img").length; |
michael@0 | 1401 | let li = tagsList[i].getElementsByTagName("li").length-100; |
michael@0 | 1402 | let input = tagsList[i].getElementsByTagName("input").length; |
michael@0 | 1403 | |
michael@0 | 1404 | let embedCount = 0; |
michael@0 | 1405 | let embeds = tagsList[i].getElementsByTagName("embed"); |
michael@0 | 1406 | for (let ei = 0, il = embeds.length; ei < il; ei += 1) { |
michael@0 | 1407 | if (embeds[ei].src.search(this.REGEXPS.videos) === -1) |
michael@0 | 1408 | embedCount += 1; |
michael@0 | 1409 | } |
michael@0 | 1410 | |
michael@0 | 1411 | let linkDensity = this._getLinkDensity(tagsList[i]); |
michael@0 | 1412 | let contentLength = this._getInnerText(tagsList[i]).length; |
michael@0 | 1413 | let toRemove = false; |
michael@0 | 1414 | |
michael@0 | 1415 | if (img > p) { |
michael@0 | 1416 | toRemove = true; |
michael@0 | 1417 | } else if (li > p && tag !== "ul" && tag !== "ol") { |
michael@0 | 1418 | toRemove = true; |
michael@0 | 1419 | } else if ( input > Math.floor(p/3) ) { |
michael@0 | 1420 | toRemove = true; |
michael@0 | 1421 | } else if (contentLength < 25 && (img === 0 || img > 2) ) { |
michael@0 | 1422 | toRemove = true; |
michael@0 | 1423 | } else if (weight < 25 && linkDensity > 0.2) { |
michael@0 | 1424 | toRemove = true; |
michael@0 | 1425 | } else if (weight >= 25 && linkDensity > 0.5) { |
michael@0 | 1426 | toRemove = true; |
michael@0 | 1427 | } else if ((embedCount === 1 && contentLength < 75) || embedCount > 1) { |
michael@0 | 1428 | toRemove = true; |
michael@0 | 1429 | } |
michael@0 | 1430 | |
michael@0 | 1431 | if (toRemove) |
michael@0 | 1432 | tagsList[i].parentNode.removeChild(tagsList[i]); |
michael@0 | 1433 | } |
michael@0 | 1434 | } |
michael@0 | 1435 | }, |
michael@0 | 1436 | |
michael@0 | 1437 | /** |
michael@0 | 1438 | * Clean out spurious headers from an Element. Checks things like classnames and link density. |
michael@0 | 1439 | * |
michael@0 | 1440 | * @param Element |
michael@0 | 1441 | * @return void |
michael@0 | 1442 | **/ |
michael@0 | 1443 | _cleanHeaders: function(e) { |
michael@0 | 1444 | for (let headerIndex = 1; headerIndex < 3; headerIndex += 1) { |
michael@0 | 1445 | let headers = e.getElementsByTagName('h' + headerIndex); |
michael@0 | 1446 | for (let i = headers.length - 1; i >= 0; i -= 1) { |
michael@0 | 1447 | if (this._getClassWeight(headers[i]) < 0 || this._getLinkDensity(headers[i]) > 0.33) |
michael@0 | 1448 | headers[i].parentNode.removeChild(headers[i]); |
michael@0 | 1449 | } |
michael@0 | 1450 | } |
michael@0 | 1451 | }, |
michael@0 | 1452 | |
michael@0 | 1453 | _flagIsActive: function(flag) { |
michael@0 | 1454 | return (this._flags & flag) > 0; |
michael@0 | 1455 | }, |
michael@0 | 1456 | |
michael@0 | 1457 | _addFlag: function(flag) { |
michael@0 | 1458 | this._flags = this._flags | flag; |
michael@0 | 1459 | }, |
michael@0 | 1460 | |
michael@0 | 1461 | _removeFlag: function(flag) { |
michael@0 | 1462 | this._flags = this._flags & ~flag; |
michael@0 | 1463 | }, |
michael@0 | 1464 | |
michael@0 | 1465 | /** |
michael@0 | 1466 | * Runs readability. |
michael@0 | 1467 | * |
michael@0 | 1468 | * Workflow: |
michael@0 | 1469 | * 1. Prep the document by removing script tags, css, etc. |
michael@0 | 1470 | * 2. Build readability's DOM tree. |
michael@0 | 1471 | * 3. Grab the article content from the current dom tree. |
michael@0 | 1472 | * 4. Replace the current DOM tree with the new one. |
michael@0 | 1473 | * 5. Read peacefully. |
michael@0 | 1474 | * |
michael@0 | 1475 | * @return void |
michael@0 | 1476 | **/ |
michael@0 | 1477 | parse: function () { |
michael@0 | 1478 | // Remove script tags from the document. |
michael@0 | 1479 | this._removeScripts(this._doc); |
michael@0 | 1480 | |
michael@0 | 1481 | // FIXME: Disabled multi-page article support for now as it |
michael@0 | 1482 | // needs more work on infrastructure. |
michael@0 | 1483 | |
michael@0 | 1484 | // Make sure this document is added to the list of parsed pages first, |
michael@0 | 1485 | // so we don't double up on the first page. |
michael@0 | 1486 | // this._parsedPages[uri.spec.replace(/\/$/, '')] = true; |
michael@0 | 1487 | |
michael@0 | 1488 | // Pull out any possible next page link first. |
michael@0 | 1489 | // let nextPageLink = this._findNextPageLink(doc.body); |
michael@0 | 1490 | |
michael@0 | 1491 | this._prepDocument(); |
michael@0 | 1492 | |
michael@0 | 1493 | let articleTitle = this._getArticleTitle(); |
michael@0 | 1494 | let articleContent = this._grabArticle(); |
michael@0 | 1495 | if (!articleContent) |
michael@0 | 1496 | return null; |
michael@0 | 1497 | |
michael@0 | 1498 | this._postProcessContent(articleContent); |
michael@0 | 1499 | |
michael@0 | 1500 | // if (nextPageLink) { |
michael@0 | 1501 | // // Append any additional pages after a small timeout so that people |
michael@0 | 1502 | // // can start reading without having to wait for this to finish processing. |
michael@0 | 1503 | // setTimeout((function() { |
michael@0 | 1504 | // this._appendNextPage(nextPageLink); |
michael@0 | 1505 | // }).bind(this), 500); |
michael@0 | 1506 | // } |
michael@0 | 1507 | |
michael@0 | 1508 | let excerpt = this._getExcerpt(articleContent); |
michael@0 | 1509 | |
michael@0 | 1510 | return { title: articleTitle, |
michael@0 | 1511 | byline: this._articleByline, |
michael@0 | 1512 | dir: this._articleDir, |
michael@0 | 1513 | content: articleContent.innerHTML, |
michael@0 | 1514 | length: articleContent.textContent.length, |
michael@0 | 1515 | excerpt: excerpt }; |
michael@0 | 1516 | } |
michael@0 | 1517 | }; |