/*
 * Copyright (c) 2010 Arc90 Inc
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * This code is heavily based on Arc90's readability.js (1.7.1) script
 * available at: http://code.google.com/p/arc90labs-readability
 */

var Readability = function(uri, doc) {
  const ENABLE_LOGGING = false;

  this._uri = uri;
  this._doc = doc;
  this._biggestFrame = false;
  this._articleByline = null;
  this._articleDir = null;

  // Start with all flags set.
  this._flags = this.FLAG_STRIP_UNLIKELYS |
                this.FLAG_WEIGHT_CLASSES |
                this.FLAG_CLEAN_CONDITIONALLY;

  // The list of pages we've parsed in this call of readability,
  // for autopaging. Kept as a key store for easier searching.
  this._parsedPages = {};

  // The ETag headers of pages we've parsed; if a new page's ETag matches
  // one of these, we know it's a duplicate.
  this._pageETags = {};

  // The number of the page we're on. We make an AJAX request for each
  // subsequent page and append it to the document.
  this._curPageNum = 1;

  // Control whether log messages are sent to the console.
  if (ENABLE_LOGGING) {
    this.log = function (msg) {
      dump("Reader: (Readability) " + msg);
    };
  } else {
    this.log = function () {};
  }
};
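
/*
 * Illustrative usage (a sketch, not part of the original API surface):
 * assuming `doc` is an already-parsed DOM document and `uri` is an object
 * exposing the spec, host, path, scheme, prePath and pathBase properties
 * this code reads:
 *
 *   var article = new Readability(uri, doc).parse();
 *   if (article)
 *     dump(article.title + " / " + article.excerpt);
 */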

Readability.prototype = {
  FLAG_STRIP_UNLIKELYS: 0x1,
  FLAG_WEIGHT_CLASSES: 0x2,
  FLAG_CLEAN_CONDITIONALLY: 0x4,

  // The number of top candidates to consider when analysing how
  // tight the competition is among candidates.
  N_TOP_CANDIDATES: 5,

  // The maximum number of pages to loop through before we call
  // it quits and just show a link.
  MAX_PAGES: 5,

  // All of the regular expressions in use within readability.
  // Defined up here so we don't instantiate them repeatedly in loops.
  REGEXPS: {
    unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
    okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
    positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
    negative: /hidden|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
    extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
    byline: /byline|author|dateline|writtenby/i,
    replaceFonts: /<(\/?)font[^>]*>/gi,
    trim: /^\s+|\s+$/g,
    normalize: /\s{2,}/g,
    videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
    nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
    prevLink: /(prev|earl|old|new|<|«)/i,
    whitespace: /^\s*$/
  },

  DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],

  /**
   * Run any post-process modifications to article content as necessary.
   *
   * @param Element
   * @return void
   **/
  _postProcessContent: function(articleContent) {
    // Readability cannot open relative uris so we convert them to absolute uris.
    this._fixRelativeUris(articleContent);
  },

  /**
   * Converts each <a> and <img> uri in the given element to an absolute URI.
   *
   * @param Element
   * @return void
   */
  _fixRelativeUris: function(articleContent) {
    let scheme = this._uri.scheme;
    let prePath = this._uri.prePath;
    let pathBase = this._uri.pathBase;

    function toAbsoluteURI(uri) {
      // If this is already an absolute URI, return it.
      if (/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/.test(uri))
        return uri;

      // Scheme-rooted relative URI.
      if (uri.substr(0, 2) == "//")
        return scheme + "://" + uri.substr(2);

      // Prepath-rooted relative URI.
      if (uri[0] == "/")
        return prePath + uri;

      // Standard relative URI; add entire path. pathBase already includes a
      // trailing "/".
      return pathBase + uri;
    }

    function convertRelativeURIs(tagName, propName) {
      let elems = articleContent.getElementsByTagName(tagName);
      for (let i = elems.length; --i >= 0;) {
        let elem = elems[i];
        let relativeURI = elem.getAttribute(propName);
        if (relativeURI != null)
          elems[i].setAttribute(propName, toAbsoluteURI(relativeURI));
      }
    }

    // Fix links.
    convertRelativeURIs("a", "href");

    // Fix images.
    convertRelativeURIs("img", "src");
  },
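
  // Illustrative example (hypothetical values): with scheme "http", prePath
  // "http://example.com" and pathBase "http://example.com/news/",
  // toAbsoluteURI maps "//cdn.example.com/a.png" to
  // "http://cdn.example.com/a.png", "/about" to "http://example.com/about",
  // and "img/a.png" to "http://example.com/news/img/a.png".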

  /**
   * Get the article title as an H1.
   *
   * @return string
   **/
  _getArticleTitle: function() {
    let doc = this._doc;
    let curTitle = "";
    let origTitle = "";

    try {
      curTitle = origTitle = doc.title;

      // If they had an element with id "title" in their HTML
      if (typeof curTitle !== "string")
        curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]);
    } catch(e) {}

    if (curTitle.match(/ [\|\-] /)) {
      curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');

      if (curTitle.split(' ').length < 3)
        curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
    } else if (curTitle.indexOf(': ') !== -1) {
      curTitle = origTitle.replace(/.*:(.*)/gi, '$1');

      if (curTitle.split(' ').length < 3)
        curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1');
    } else if (curTitle.length > 150 || curTitle.length < 15) {
      let hOnes = doc.getElementsByTagName('h1');

      if (hOnes.length === 1)
        curTitle = this._getInnerText(hOnes[0]);
    }

    curTitle = curTitle.replace(this.REGEXPS.trim, "");

    if (curTitle.split(' ').length <= 4)
      curTitle = origTitle;

    return curTitle;
  },
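
  // For instance (hypothetical titles, shown for illustration only): a
  // document title of "Five Words Of Headline Here | Example Site" is
  // trimmed to "Five Words Of Headline Here", and "Example Site: Five Words
  // Of Headline Here" keeps the part after the colon. Short results (four
  // words or fewer) fall back to the original title.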

  /**
   * Prepare the HTML document for readability to scrape it.
   * This includes things like stripping javascript, CSS, and handling terrible markup.
   *
   * @return void
   **/
  _prepDocument: function() {
    let doc = this._doc;

    // In some cases a body element can't be found (if the HTML is
    // totally hosed, for example), so we create a new body node and
    // append it to the document.
    if (doc.body === null) {
      let body = doc.createElement("body");

      try {
        doc.body = body;
      } catch(e) {
        doc.documentElement.appendChild(body);
        this.log(e);
      }
    }

    // Remove all style tags in head.
    let styleTags = doc.getElementsByTagName("style");
    for (let st = 0; st < styleTags.length; st += 1) {
      styleTags[st].textContent = "";
    }

    this._replaceBrs(doc.body);

    let fonts = doc.getElementsByTagName("FONT");
    for (let i = fonts.length; --i >= 0;) {
      this._setNodeTag(fonts[i], "SPAN");
    }
  },

  /**
   * Finds the next element, starting from the given node, and ignoring
   * whitespace in between. If the given node is an element, the same node is
   * returned.
   */
  _nextElement: function (node) {
    let next = node;
    while (next
        && (next.nodeType != Node.ELEMENT_NODE)
        && this.REGEXPS.whitespace.test(next.textContent)) {
      next = next.nextSibling;
    }
    return next;
  },

  /**
   * Replaces 2 or more successive <br> elements with a single <p>.
   * Whitespace between <br> elements is ignored. For example:
   *   <div>foo<br>bar<br> <br><br>abc</div>
   * will become:
   *   <div>foo<br>bar<p>abc</p></div>
   */
  _replaceBrs: function (elem) {
    let brs = elem.getElementsByTagName("br");
    for (let i = 0; i < brs.length; i++) {
      let br = brs[i];
      let next = br.nextSibling;

      // Whether 2 or more <br> elements have been found and replaced with a
      // <p> block.
      let replaced = false;

      // If we find a <br> chain, remove the <br>s until we hit another element
      // or non-whitespace. This leaves behind the first <br> in the chain
      // (which will be replaced with a <p> later).
      while ((next = this._nextElement(next)) && (next.tagName == "BR")) {
        replaced = true;
        let sibling = next.nextSibling;
        next.parentNode.removeChild(next);
        next = sibling;
      }

      // If we removed a <br> chain, replace the remaining <br> with a <p>. Add
      // all sibling nodes as children of the <p> until we hit another <br>
      // chain.
      if (replaced) {
        let p = this._doc.createElement("p");
        br.parentNode.replaceChild(p, br);

        next = p.nextSibling;
        while (next) {
          // If we've hit another <br><br>, we're done adding children to this <p>.
          if (next.tagName == "BR") {
            let nextElem = this._nextElement(next);
            if (nextElem && nextElem.tagName == "BR")
              break;
          }

          // Otherwise, make this node a child of the new <p>.
          let sibling = next.nextSibling;
          p.appendChild(next);
          next = sibling;
        }
      }
    }
  },

  _setNodeTag: function (node, tag) {
    node.localName = tag.toLowerCase();
    node.tagName = tag.toUpperCase();
  },
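
  // NOTE: assigning localName/tagName directly only works against a parser
  // whose nodes expose these as writable properties (such as the JSDOMParser
  // mentioned in _grabArticle); a standards-compliant DOM would require
  // creating a new element and moving the children across instead.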

  /**
   * Prepare the article node for display. Clean out any inline styles,
   * iframes, forms, strip extraneous <p> tags, etc.
   *
   * @param Element
   * @return void
   **/
  _prepArticle: function(articleContent) {
    this._cleanStyles(articleContent);

    // Clean out junk from the article content.
    this._cleanConditionally(articleContent, "form");
    this._clean(articleContent, "object");
    this._clean(articleContent, "h1");

    // If there is only one h2, they are probably using it as a header
    // and not a subheader, so remove it since we already have a header.
    if (articleContent.getElementsByTagName('h2').length === 1)
      this._clean(articleContent, "h2");

    this._clean(articleContent, "iframe");
    this._cleanHeaders(articleContent);

    // Do these last as the previous stuff may have removed junk
    // that will affect these.
    this._cleanConditionally(articleContent, "table");
    this._cleanConditionally(articleContent, "ul");
    this._cleanConditionally(articleContent, "div");

    // Remove extra paragraphs.
    let articleParagraphs = articleContent.getElementsByTagName('p');
    for (let i = articleParagraphs.length - 1; i >= 0; i -= 1) {
      let imgCount = articleParagraphs[i].getElementsByTagName('img').length;
      let embedCount = articleParagraphs[i].getElementsByTagName('embed').length;
      let objectCount = articleParagraphs[i].getElementsByTagName('object').length;

      if (imgCount === 0 &&
          embedCount === 0 &&
          objectCount === 0 &&
          this._getInnerText(articleParagraphs[i], false) === '')
        articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
    }

    let brs = articleContent.getElementsByTagName("BR");
    for (let i = brs.length; --i >= 0;) {
      let br = brs[i];
      let next = this._nextElement(br.nextSibling);
      if (next && next.tagName == "P")
        br.parentNode.removeChild(br);
    }
  },

  /**
   * Initialize a node with the readability object. Also checks the
   * className/id for special names to add to its score.
   *
   * @param Element
   * @return void
   **/
  _initializeNode: function(node) {
    node.readability = {"contentScore": 0};

    switch(node.tagName) {
      case 'DIV':
        node.readability.contentScore += 5;
        break;

      case 'PRE':
      case 'TD':
      case 'BLOCKQUOTE':
        node.readability.contentScore += 3;
        break;

      case 'ADDRESS':
      case 'OL':
      case 'UL':
      case 'DL':
      case 'DD':
      case 'DT':
      case 'LI':
      case 'FORM':
        node.readability.contentScore -= 3;
        break;

      case 'H1':
      case 'H2':
      case 'H3':
      case 'H4':
      case 'H5':
      case 'H6':
      case 'TH':
        node.readability.contentScore -= 5;
        break;
    }

    node.readability.contentScore += this._getClassWeight(node);
  },
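
  // Example (illustrative): a <div class="article"> starts at +5 for the
  // tag, and _getClassWeight adds +25 for the positive "article" classname,
  // for an initial contentScore of 30.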

  /***
   * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
   * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
   *
   * @param page a document to run upon. Needs to be a full document, complete with body.
   * @return Element
   **/
  _grabArticle: function (page) {
    let doc = this._doc;
    let isPaging = (page !== null ? true : false);
    page = page ? page : this._doc.body;
    let pageCacheHtml = page.innerHTML;

    // Check if any "dir" is set on the toplevel document element.
    this._articleDir = doc.documentElement.getAttribute("dir");

    while (true) {
      let stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
      let allElements = page.getElementsByTagName('*');

      // First, node prepping. Trash nodes that look cruddy (like ones with the
      // class name "comment", etc), and turn divs into P tags where they have been
      // used inappropriately (as in, where they contain no other block level elements.)
      //
      // Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
      // TODO: Shouldn't this be a reverse traversal?
      let node = null;
      let nodesToScore = [];

      // Let each node know its index in the allElements array.
      for (let i = allElements.length; --i >= 0;) {
        allElements[i]._index = i;
      }

      /**
       * JSDOMParser returns static node lists, not live ones. When we remove
       * an element from the document, we need to manually remove it - and all
       * of its children - from the allElements array.
       */
      function purgeNode(node) {
        for (let i = node.childNodes.length; --i >= 0;) {
          purgeNode(node.childNodes[i]);
        }
        if (node._index !== undefined && allElements[node._index] == node)
          delete allElements[node._index];
      }

      for (let nodeIndex = 0; nodeIndex < allElements.length; nodeIndex++) {
        if (!(node = allElements[nodeIndex]))
          continue;

        let matchString = node.className + node.id;
        if (matchString.search(this.REGEXPS.byline) !== -1 && !this._articleByline) {
          this._articleByline = node.textContent;
          node.parentNode.removeChild(node);
          purgeNode(node);
          continue;
        }

        // Remove unlikely candidates.
        if (stripUnlikelyCandidates) {
          if (matchString.search(this.REGEXPS.unlikelyCandidates) !== -1 &&
              matchString.search(this.REGEXPS.okMaybeItsACandidate) === -1 &&
              node.tagName !== "BODY") {
            this.log("Removing unlikely candidate - " + matchString);
            node.parentNode.removeChild(node);
            purgeNode(node);
            continue;
          }
        }

        if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE")
          nodesToScore[nodesToScore.length] = node;

        // Turn all divs that don't have children block level elements into p's.
        if (node.tagName === "DIV") {
          // Sites like http://mobile.slate.com enclose each paragraph in a DIV
          // element. DIVs with only a P element inside and no text content can be
          // safely converted into plain P elements to avoid confusing the scoring
          // algorithm with DIVs which are, in practice, paragraphs.
          let pIndex = this._getSinglePIndexInsideDiv(node);

          if (pIndex >= 0 || !this._hasChildBlockElement(node)) {
            if (pIndex >= 0) {
              let newNode = node.childNodes[pIndex];
              node.parentNode.replaceChild(newNode, node);
              purgeNode(node);
            } else {
              this._setNodeTag(node, "P");
              nodesToScore[nodesToScore.length] = node;
            }
          } else {
            // EXPERIMENTAL: wrap any loose text nodes inside the DIV in
            // styled <p> elements so they are preserved.
            for (let i = 0, il = node.childNodes.length; i < il; i += 1) {
              let childNode = node.childNodes[i];
              if (!childNode)
                continue;

              if (childNode.nodeType === 3) { // Node.TEXT_NODE
                let p = doc.createElement('p');
                p.textContent = childNode.textContent;
                p.style.display = 'inline';
                p.className = 'readability-styled';
                childNode.parentNode.replaceChild(p, childNode);
              }
            }
          }
        }
      }

      /**
       * Loop through all paragraphs, and assign a score to them based on how content-y they look.
       * Then add their score to their parent node.
       *
       * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
       **/
      let candidates = [];
      for (let pt = 0; pt < nodesToScore.length; pt += 1) {
        let parentNode = nodesToScore[pt].parentNode;
        let grandParentNode = parentNode ? parentNode.parentNode : null;
        let innerText = this._getInnerText(nodesToScore[pt]);

        if (!parentNode || typeof(parentNode.tagName) === 'undefined')
          continue;

        // If this paragraph is less than 25 characters, don't even count it.
        if (innerText.length < 25)
          continue;

        // Initialize readability data for the parent.
        if (typeof parentNode.readability === 'undefined') {
          this._initializeNode(parentNode);
          candidates.push(parentNode);
        }

        // Initialize readability data for the grandparent.
        if (grandParentNode &&
            typeof(grandParentNode.readability) === 'undefined' &&
            typeof(grandParentNode.tagName) !== 'undefined') {
          this._initializeNode(grandParentNode);
          candidates.push(grandParentNode);
        }

        let contentScore = 0;

        // Add a point for the paragraph itself as a base.
        contentScore += 1;

        // Add points for any commas within this paragraph.
        contentScore += innerText.split(',').length;

        // For every 100 characters in this paragraph, add another point. Up to 3 points.
        contentScore += Math.min(Math.floor(innerText.length / 100), 3);

        // Add the score to the parent. The grandparent gets half.
        parentNode.readability.contentScore += contentScore;

        if (grandParentNode)
          grandParentNode.readability.contentScore += contentScore / 2;
      }

      // After we've calculated scores, loop through all of the possible
      // candidate nodes we found and find the one with the highest score.
      let topCandidates = [];
      for (let c = 0, cl = candidates.length; c < cl; c += 1) {
        let candidate = candidates[c];

        // Scale the final candidates score based on link density. Good content
        // should have a relatively small link density (5% or less) and be mostly
        // unaffected by this operation.
        let candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate));
        candidate.readability.contentScore = candidateScore;

        this.log('Candidate: ' + candidate + " (" + candidate.className + ":" +
          candidate.id + ") with score " + candidateScore);

        for (let t = 0; t < this.N_TOP_CANDIDATES; t++) {
          let aTopCandidate = topCandidates[t];

          if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) {
            topCandidates.splice(t, 0, candidate);
            if (topCandidates.length > this.N_TOP_CANDIDATES)
              topCandidates.pop();
            break;
          }
        }
      }

      let topCandidate = topCandidates[0] || null;
      let lastTopCandidate = (topCandidates.length > 3 ? topCandidates[topCandidates.length - 1] : null);

      // If we still have no top candidate, just use the body as a last resort.
      // We also have to copy the body node so it is something we can modify.
      if (topCandidate === null || topCandidate.tagName === "BODY") {
        // Move all of the page's children into topCandidate.
        topCandidate = doc.createElement("DIV");
        let children = page.childNodes;
        for (let i = 0; i < children.length; ++i) {
          topCandidate.appendChild(children[i]);
        }

        page.appendChild(topCandidate);

        this._initializeNode(topCandidate);
      }

      // Now that we have the top candidate, look through its siblings for content
      // that might also be related. Things like preambles, content split by ads
      // that we removed, etc.
      let articleContent = doc.createElement("DIV");
      if (isPaging)
        articleContent.id = "readability-content";

      let siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
      let siblingNodes = topCandidate.parentNode.childNodes;

      for (let s = 0, sl = siblingNodes.length; s < sl; s += 1) {
        let siblingNode = siblingNodes[s];
        let append = false;

        this.log("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" +
          ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
        this.log("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));

        if (siblingNode === topCandidate)
          append = true;

        let contentBonus = 0;

        // Give a bonus if sibling nodes and top candidates have the exact same classname.
        if (siblingNode.className === topCandidate.className && topCandidate.className !== "")
          contentBonus += topCandidate.readability.contentScore * 0.2;

        if (typeof siblingNode.readability !== 'undefined' &&
            (siblingNode.readability.contentScore + contentBonus) >= siblingScoreThreshold)
          append = true;

        if (siblingNode.nodeName === "P") {
          let linkDensity = this._getLinkDensity(siblingNode);
          let nodeContent = this._getInnerText(siblingNode);
          let nodeLength = nodeContent.length;

          if (nodeLength > 80 && linkDensity < 0.25) {
            append = true;
          } else if (nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) {
            append = true;
          }
        }

        if (append) {
          this.log("Appending node: " + siblingNode);

          // siblingNodes is a reference to the childNodes array, and
          // siblingNode is removed from the array when we call appendChild()
          // below. As a result, we must revisit this index since the nodes
          // have been shifted.
          s -= 1;
          sl -= 1;

          if (siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
            // We have a node that isn't a common block level element, like a form or td tag.
            // Turn it into a div so it doesn't get filtered out later by accident.
            this.log("Altering siblingNode of " + siblingNode.nodeName + ' to div.');

            this._setNodeTag(siblingNode, "DIV");
          }

          // To ensure a node does not interfere with readability styles,
          // remove its classnames.
          siblingNode.className = "";

          // Append sibling and subtract from our list because it removes
          // the node when you append to another node.
          articleContent.appendChild(siblingNode);
        }
      }

      // So we have all of the content that we need. Now we clean it up for presentation.
      this._prepArticle(articleContent);

      if (this._curPageNum === 1) {
        let div = doc.createElement("DIV");
        div.id = "readability-page-1";
        div.className = "page";
        let children = articleContent.childNodes;
        for (let i = 0; i < children.length; ++i) {
          div.appendChild(children[i]);
        }
        articleContent.appendChild(div);
      }

      // Now that we've gone through the full algorithm, check to see if
      // we got any meaningful content. If we didn't, we may need to re-run
      // grabArticle with different flags set. This gives us a higher likelihood of
      // finding the content, and the sieve approach gives us a higher likelihood of
      // finding the -right- content.
      if (this._getInnerText(articleContent, true).length < 500) {
        page.innerHTML = pageCacheHtml;

        if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
          this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
        } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
          this._removeFlag(this.FLAG_WEIGHT_CLASSES);
        } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
          this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
        } else {
          return null;
        }
      } else {
        if (lastTopCandidate !== null) {
          // EXPERIMENTAL: Contrast ratio is how we measure the level of competition between candidates in the
          // readability algorithm. This is to avoid offering reader mode on pages that are more like
          // a list or directory of links with summaries. It takes the score of the last top candidate
          // (see N_TOP_CANDIDATES) and checks how it compares to the top candidate's. On pages that are not
          // actual articles, there will likely be many candidates with similar scores (i.e. a higher contrast ratio).
          let contrastRatio = lastTopCandidate.readability.contentScore / topCandidate.readability.contentScore;
          if (contrastRatio > 0.45)
            return null;
        }

        return articleContent;
      }
    }
  },

  /**
   * Attempts to get the excerpt from these
   * sources in the following order:
   * - meta description tag
   * - open-graph description
   * - twitter cards description
   * - article's first paragraph
   * If no excerpt is found, an empty string will be
   * returned.
   *
   * @param Element - root element of the processed version of the page
   * @return String - excerpt of the article
   **/
  _getExcerpt: function(articleContent) {
    let values = {};
    let metaElements = this._doc.getElementsByTagName("meta");

    // Match "description", or Twitter's "twitter:description" (Cards)
    // in the name attribute. (No "g" flag: a global regex keeps state in
    // lastIndex across test() calls, which would skip matches in this loop.)
    let namePattern = /^\s*((twitter)\s*:\s*)?description\s*$/i;

    // Match Facebook's og:description (Open Graph) in the property attribute.
    let propertyPattern = /^\s*og\s*:\s*description\s*$/i;

    // Find description tags.
    for (let i = 0; i < metaElements.length; i++) {
      let element = metaElements[i];
      let elementName = element.getAttribute("name");
      let elementProperty = element.getAttribute("property");

      let name;
      if (namePattern.test(elementName)) {
        name = elementName;
      } else if (propertyPattern.test(elementProperty)) {
        name = elementProperty;
      }

      if (name) {
        let content = element.getAttribute("content");
        if (content) {
          // Convert to lowercase and remove any whitespace
          // so we can match below.
          name = name.toLowerCase().replace(/\s/g, '');
          values[name] = content.trim();
        }
      }
    }

    if ("description" in values) {
      return values["description"];
    }

    if ("og:description" in values) {
      // Use Facebook's Open Graph description.
      return values["og:description"];
    }

    if ("twitter:description" in values) {
      // Use Twitter Cards description.
      return values["twitter:description"];
    }

    // No description meta tags; use the article's first paragraph.
    let paragraphs = articleContent.getElementsByTagName("p");
    if (paragraphs.length > 0) {
      return paragraphs[0].textContent;
    }

    return "";
  },
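
  // Example (hypothetical markup): <meta name="twitter:description"
  // content="..."> is stored under "twitter:description", and <meta
  // property="og:description" content="..."> under "og:description"; a plain
  // <meta name="description"> wins over both when present.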

  /**
   * Removes script tags from the document.
   *
   * @param Element
   **/
  _removeScripts: function(doc) {
    let scripts = doc.getElementsByTagName('script');
    for (let i = scripts.length - 1; i >= 0; i -= 1) {
      scripts[i].nodeValue = "";
      scripts[i].removeAttribute('src');

      if (scripts[i].parentNode)
        scripts[i].parentNode.removeChild(scripts[i]);
    }
  },

  /**
   * Get the child index of the only P element inside a DIV with no
   * text content. Returns -1 if the DIV node contains non-empty
   * text nodes or if it contains other element nodes.
   *
   * @param Element
   **/
  _getSinglePIndexInsideDiv: function(e) {
    let childNodes = e.childNodes;
    let pIndex = -1;

    for (let i = childNodes.length; --i >= 0;) {
      let node = childNodes[i];

      if (node.nodeType === Node.ELEMENT_NODE) {
        if (node.tagName !== "P")
          return -1;

        if (pIndex >= 0)
          return -1;

        pIndex = i;
      } else if (node.nodeType == Node.TEXT_NODE && this._getInnerText(node, false)) {
        return -1;
      }
    }

    return pIndex;
  },
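
  // Example (illustrative): for <div> <p>text</p> </div>, where the only
  // siblings of the <p> are whitespace text nodes, this returns the index of
  // that <p>; any second element or non-empty text node makes it return -1.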

  /**
   * Determine whether the element has any child block-level elements.
   *
   * @param Element
   */
  _hasChildBlockElement: function (e) {
    let length = e.childNodes.length;
    for (let i = 0; i < length; i++) {
      let child = e.childNodes[i];
      if (child.nodeType != 1)
        continue;

      if (this.DIV_TO_P_ELEMS.indexOf(child.tagName) !== -1 || this._hasChildBlockElement(child))
        return true;
    }
    return false;
  },

  /**
   * Get the inner text of a node in a cross-browser compatible way.
   * This also strips out any excess whitespace.
   *
   * @param Element
   * @return string
   **/
  _getInnerText: function(e, normalizeSpaces) {
    let textContent = e.textContent.replace(this.REGEXPS.trim, "");
    normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;

    if (normalizeSpaces) {
      return textContent.replace(this.REGEXPS.normalize, " ");
    } else {
      return textContent;
    }
  },

  /**
   * Get the number of times a string s appears in the node e.
   *
   * @param Element
   * @param string - what to split on. Default is ","
   * @return number (integer)
   **/
  _getCharCount: function(e, s) {
    s = s || ",";
    return this._getInnerText(e).split(s).length - 1;
  },
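
  // For example, _getCharCount(node) on a node whose text is "a, b, c"
  // returns 2 (three comma-separated pieces minus one).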

  /**
   * Remove the style attribute from e and every element under it.
   * TODO: Test if getElementsByTagName(*) is faster.
   *
   * @param Element
   * @return void
   **/
  _cleanStyles: function(e) {
    e = e || this._doc;

    if (!e)
      return;

    let cur = e.firstChild;

    // Remove any root styles, if we're able.
    if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled')
      e.removeAttribute('style');

    // Go until there are no more child nodes.
    while (cur !== null) {
      if (cur.nodeType === 1) {
        // Remove style attribute(s):
        if (cur.className !== "readability-styled")
          cur.removeAttribute("style");

        this._cleanStyles(cur);
      }

      cur = cur.nextSibling;
    }
  },

  /**
   * Get the density of links as a percentage of the content.
   * This is the amount of text that is inside a link divided by the total text in the node.
   *
   * @param Element
   * @return number (float)
   **/
  _getLinkDensity: function(e) {
    let links = e.getElementsByTagName("a");
    let textLength = this._getInnerText(e).length;
    let linkLength = 0;

    for (let i = 0, il = links.length; i < il; i += 1) {
      linkLength += this._getInnerText(links[i]).length;
    }

    return linkLength / textLength;
  },
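
  // Example (illustrative): a node with 500 characters of text, 100 of which
  // sit inside <a> tags, has a link density of 100 / 500 = 0.2.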

  /**
   * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
   *
   * @author Dan Lacy
   * @return string the base url
   **/
  _findBaseUrl: function() {
    let uri = this._uri;
    let noUrlParams = uri.path.split("?")[0];
    let urlSlashes = noUrlParams.split("/").reverse();
    let cleanedSegments = [];
    let possibleType = "";

    for (let i = 0, slashLen = urlSlashes.length; i < slashLen; i += 1) {
      let segment = urlSlashes[i];

      // Split off and save anything that looks like a file type.
      if (segment.indexOf(".") !== -1) {
        possibleType = segment.split(".")[1];

        // If the type isn't alpha-only, it's probably not actually a file extension.
        if (!possibleType.match(/[^a-zA-Z]/))
          segment = segment.split(".")[0];
      }

      // EW-CMS specific segment replacement. Ugly.
      // Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
      if (segment.indexOf(',00') !== -1)
        segment = segment.replace(',00', '');

      // If our first or second segment has anything looking like a page number, remove it.
      if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0)))
        segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");

      let del = false;

      // If this is purely a number, and it's the first or second segment,
      // it's probably a page number. Remove it.
      if (i < 2 && segment.match(/^\d{1,2}$/))
        del = true;

      // If this is the first segment and it's just "index", remove it.
      if (i === 0 && segment.toLowerCase() === "index")
        del = true;

      // If our first or second segment is smaller than 3 characters, and the
      // first segment doesn't contain any letters, remove it.
      if (i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i))
        del = true;

      // If it's not marked for deletion, push it to cleanedSegments.
      if (!del)
        cleanedSegments.push(segment);
    }

    // This is our final, cleaned, base article URL.
    return uri.scheme + "://" + uri.host + cleanedSegments.reverse().join("/");
  },
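
  // Example (hypothetical URL): "http://example.com/story/my-title/2?page=2"
  // drops the query string and the trailing numeric page segment, yielding
  // "http://example.com/story/my-title".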

  /**
   * Look for any paging links that may occur within the document.
   *
   * @param body
   * @return string | null - the next page URL, if one was found
   **/
  _findNextPageLink: function(elem) {
    let uri = this._uri;
    let possiblePages = {};
    let allLinks = elem.getElementsByTagName('a');
    let articleBaseUrl = this._findBaseUrl();

    // Loop through all links, looking for hints that they may be next-page links.
    // Things like having "page" in their textContent, className or id, or being a child
    // of a node with a page-y className or id.
    //
    // Also possible: levenshtein distance? longest common subsequence?
    //
    // After we do that, assign each page a score.
    for (let i = 0, il = allLinks.length; i < il; i += 1) {
      let link = allLinks[i];
      let linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');

      // If we've already seen this page, ignore it.
      if (linkHref === "" ||
          linkHref === articleBaseUrl ||
          linkHref === uri.spec ||
          linkHref in this._parsedPages) {
        continue;
      }

      // If it's on a different domain, skip it.
      if (uri.host !== linkHref.split(/\/+/g)[1])
        continue;

      let linkText = this._getInnerText(link);

      // If the linkText looks like it's not the next page, skip it.
      if (linkText.match(this.REGEXPS.extraneous) || linkText.length > 25)
        continue;

      // If the leftovers of the URL after removing the base URL don't contain
      // any digits, it's certainly not a next page link.
      let linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
      if (!linkHrefLeftover.match(/\d/))
        continue;

      if (!(linkHref in possiblePages)) {
        possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
      } else {
        possiblePages[linkHref].linkText += ' | ' + linkText;
      }

      let linkObj = possiblePages[linkHref];

      // If the articleBaseUrl isn't part of this URL, penalize this link. It could
      // still be the link, but the odds are lower.
      // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
      if (linkHref.indexOf(articleBaseUrl) !== 0)
        linkObj.score -= 25;

      let linkData = linkText + ' ' + link.className + ' ' + link.id;
      if (linkData.match(this.REGEXPS.nextLink))
        linkObj.score += 50;

      if (linkData.match(/pag(e|ing|inat)/i))
        linkObj.score += 25;

      if (linkData.match(/(first|last)/i)) {
        // -65 is enough to negate any bonuses gotten from a > or » in the text.
        // If we already matched on "next", last is probably fine.
        // If we didn't, then it's bad. Penalize.
        if (!linkObj.linkText.match(this.REGEXPS.nextLink))
          linkObj.score -= 65;
      }

      if (linkData.match(this.REGEXPS.negative) || linkData.match(this.REGEXPS.extraneous))
        linkObj.score -= 50;

      if (linkData.match(this.REGEXPS.prevLink))
        linkObj.score -= 200;

      // Check whether a parentNode contains page, paging or paginat.
      let parentNode = link.parentNode;
      let positiveNodeMatch = false;
      let negativeNodeMatch = false;

      while (parentNode) {
        let parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;

        if (!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
          positiveNodeMatch = true;
          linkObj.score += 25;
        }

        if (!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(this.REGEXPS.negative)) {
          // If this is just something like "footer", give it a negative.
          // If it's something like "body-and-footer", leave it be.
          if (!parentNodeClassAndId.match(this.REGEXPS.positive)) {
            linkObj.score -= 25;
            negativeNodeMatch = true;
          }
        }

        parentNode = parentNode.parentNode;
      }

      // If the URL looks like it has paging in it, add to the score.
      // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34.
      if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i))
        linkObj.score += 25;

      // If the URL contains negative values, give a slight decrease.
      if (linkHref.match(this.REGEXPS.extraneous))
        linkObj.score -= 15;

      /**
       * Minor punishment to anything that doesn't match our current URL.
       * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
       * Dan, can you show me a counterexample where this is necessary?
       * if (linkHref.indexOf(window.location.href) !== 0) {
       *   linkObj.score -= 1;
       * }
       **/

      // If the link text can be parsed as a number, give it a minor bonus, with a slight
      // bias towards lower numbered pages. This is so that pages that might not have 'next'
      // in their text can still get scored, and sorted properly by score.
      let linkTextAsNumber = parseInt(linkText, 10);
      if (linkTextAsNumber) {
        // Punish 1 since we're either already there, or it's probably
        // before what we want anyways.
        if (linkTextAsNumber === 1) {
          linkObj.score -= 10;
        } else {
          linkObj.score += Math.max(0, 10 - linkTextAsNumber);
        }
      }
    }

    // Loop through all of our possible pages from above and find our top
    // candidate for the next page URL. Require at least a score of 50, which
    // is a relatively high confidence that this page is the next link.
    let topPage = null;
    for (let page in possiblePages) {
      if (possiblePages.hasOwnProperty(page)) {
        if (possiblePages[page].score >= 50 &&
            (!topPage || topPage.score < possiblePages[page].score))
          topPage = possiblePages[page];
      }
    }

    if (topPage) {
      let nextHref = topPage.href.replace(/\/$/, '');

      this.log('NEXT PAGE IS ' + nextHref);
      this._parsedPages[nextHref] = true;
      return nextHref;
    } else {
      return null;
    }
  },

  _successfulRequest: function(request) {
    return (request.status >= 200 && request.status < 300) ||
        request.status === 304 ||
        (request.status === 0 && request.responseText);
  },

  _ajax: function(url, options) {
    let request = new XMLHttpRequest();

    function respondToReadyState(readyState) {
      if (request.readyState === 4) {
        if (this._successfulRequest(request)) {
          if (options.success)
            options.success(request);
        } else {
          if (options.error)
            options.error(request);
        }
      }
    }

    if (typeof options === 'undefined')
      options = {};

    // Bind the handler so that `this` inside respondToReadyState refers to
    // the Readability object rather than the XMLHttpRequest.
    request.onreadystatechange = respondToReadyState.bind(this);

    request.open('get', url, true);
    request.setRequestHeader('Accept', 'text/html');

    try {
      request.send(options.postBody);
    } catch (e) {
      if (options.error)
        options.error();
    }

    return request;
  },

  _appendNextPage: function(nextPageLink) {
    let doc = this._doc;
    this._curPageNum += 1;

    let articlePage = doc.createElement("DIV");
    articlePage.id = 'readability-page-' + this._curPageNum;
    articlePage.className = 'page';
    articlePage.innerHTML = '<p class="page-separator" title="Page ' + this._curPageNum + '">§</p>';

    doc.getElementById("readability-content").appendChild(articlePage);

    if (this._curPageNum > this.MAX_PAGES) {
      let nextPageMarkup = "<div style='text-align: center'><a href='" + nextPageLink + "'>View Next Page</a></div>";
      articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup;
      return;
    }

    // Now that we've built the article page DOM element, get the page content
    // asynchronously and load the cleaned content into the div we created for it.
    (function(pageUrl, thisPage) {
      this._ajax(pageUrl, {
        success: (function(r) {

          // First, check to see if we have a matching ETag in the headers - if we do, this is a duplicate page.
          let eTag = r.getResponseHeader('ETag');
          if (eTag) {
            if (eTag in this._pageETags) {
              this.log("Exact duplicate page found via ETag. Aborting.");
              articlePage.style.display = 'none';
              return;
            } else {
              this._pageETags[eTag] = 1;
            }
          }

          // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
          let page = doc.createElement("DIV");

          // Do some preprocessing to our HTML to make it ready for appending.
          // - Remove any script tags. Swap and reswap newlines with a unicode
          //   character because multiline regex doesn't work in javascript.
          // - Turn any noscript tags into divs so that we can parse them. This
          //   allows us to find any next page links hidden via javascript.
          // - Turn all double br's into p's - this was handled by prepDocument in the original view.
          //   Maybe in the future abstract out prepDocument to work for both the original document
          //   and AJAX-added pages.
          let responseHtml = r.responseText.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
          responseHtml = responseHtml.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
          responseHtml = responseHtml.replace(/\uffff/g, '\n').replace(/<(\/?)noscript/gi, '<$1div');
          responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>');

          page.innerHTML = responseHtml;
          this._replaceBrs(page);

          // Reset all flags for the next page, as they will be searched through
          // and disabled as necessary at the end of grabArticle.
          this._flags = this.FLAG_STRIP_UNLIKELYS |
                        this.FLAG_WEIGHT_CLASSES |
                        this.FLAG_CLEAN_CONDITIONALLY;

          let nextPageLink = this._findNextPageLink(page);

          // NOTE: if we end up supporting _appendNextPage(), we'll need to
          // change this call to be async.
          let content = this._grabArticle(page);

          if (!content) {
            this.log("No content found in page to append. Aborting.");
            return;
          }

          // Anti-duplicate mechanism. Essentially, get the first paragraph of our new page.
          // Compare it against all of the previous documents we've gotten. If a previous
          // document contains exactly the innerHTML of this first paragraph, it's probably a duplicate.
          let firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null;
          if (firstP && firstP.innerHTML.length > 100) {
            for (let i = 1; i <= this._curPageNum; i += 1) {
              let rPage = doc.getElementById('readability-page-' + i);
              if (rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {
                this.log('Duplicate of page ' + i + ' - skipping.');
                articlePage.style.display = 'none';
                this._parsedPages[pageUrl] = true;
                return;
              }
            }
          }

          this._removeScripts(content);

          thisPage.innerHTML = thisPage.innerHTML + content.innerHTML;

          // After the page has rendered, post process the content. This delay is necessary because,
          // in webkit at least, offsetWidth is not set in time to determine image width. We have to
          // wait a little bit for reflow to finish before we can fix floating images.
          setTimeout((function() {
            this._postProcessContent(thisPage);
          }).bind(this), 500);

          if (nextPageLink)
            this._appendNextPage(nextPageLink);
        }).bind(this)
      });
    }).bind(this)(nextPageLink, articlePage);
  },

  /**
   * Get an element's class/id weight. Uses regular expressions to tell if this
   * element looks good or bad.
   *
   * @param Element
   * @return number (Integer)
   **/
  _getClassWeight: function(e) {
    if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES))
      return 0;

    let weight = 0;

    // Look for a special classname.
    if (typeof(e.className) === 'string' && e.className !== '') {
      if (e.className.search(this.REGEXPS.negative) !== -1)
        weight -= 25;

      if (e.className.search(this.REGEXPS.positive) !== -1)
        weight += 25;
    }

    // Look for a special ID.
    if (typeof(e.id) === 'string' && e.id !== '') {
      if (e.id.search(this.REGEXPS.negative) !== -1)
        weight -= 25;

      if (e.id.search(this.REGEXPS.positive) !== -1)
        weight += 25;
    }

    return weight;
  },
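
  // Example (illustrative): a class of "sidebar" matches the negative regexp
  // (-25) and an id of "main-content" matches the positive one (+25), so an
  // element with both nets a weight of 0.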

  /**
   * Clean a node of all elements of type "tag".
   * (Unless it's a youtube/vimeo video. People love movies.)
   *
   * @param Element
   * @param string tag to clean
   * @return void
   **/
  _clean: function(e, tag) {
    let targetList = e.getElementsByTagName(tag);
    let isEmbed = (tag === 'object' || tag === 'embed');

    for (let y = targetList.length - 1; y >= 0; y -= 1) {
      // Allow youtube and vimeo videos through as people usually want to see those.
      if (isEmbed) {
        let attributeValues = "";
        for (let i = 0, il = targetList[y].attributes.length; i < il; i += 1) {
          attributeValues += targetList[y].attributes[i].value + '|';
        }

        // First, check the element's attributes to see if any of them contain youtube or vimeo.
        if (attributeValues.search(this.REGEXPS.videos) !== -1)
          continue;

        // Then check the elements inside this element for the same.
        if (targetList[y].innerHTML.search(this.REGEXPS.videos) !== -1)
          continue;
      }

      targetList[y].parentNode.removeChild(targetList[y]);
    }
  },

  /**
   * Clean an element of all tags of type "tag" if they look fishy.
   * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
   *
   * @return void
   **/
  _cleanConditionally: function(e, tag) {
    if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
      return;

    let tagsList = e.getElementsByTagName(tag);
    let curTagsLength = tagsList.length;

    // Gather counts for other typical elements embedded within.
    // Traverse backwards so we can remove nodes at the same time
    // without affecting the traversal.
    //
    // TODO: Consider taking into account original contentScore here.
    for (let i = curTagsLength - 1; i >= 0; i -= 1) {
      let weight = this._getClassWeight(tagsList[i]);
      let contentScore = 0;

      this.log("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")");

      if (weight + contentScore < 0) {
        tagsList[i].parentNode.removeChild(tagsList[i]);
      } else if (this._getCharCount(tagsList[i], ',') < 10) {
        // If there are not very many commas, and the number of
        // non-paragraph elements is more than paragraphs or other
        // ominous signs, remove the element.
        let p = tagsList[i].getElementsByTagName("p").length;
        let img = tagsList[i].getElementsByTagName("img").length;
        // The -100 offset means the list-item count only outweighs paragraphs
        // once a node contains a very large number of <li> elements.
        let li = tagsList[i].getElementsByTagName("li").length - 100;
        let input = tagsList[i].getElementsByTagName("input").length;

        let embedCount = 0;
        let embeds = tagsList[i].getElementsByTagName("embed");
        for (let ei = 0, il = embeds.length; ei < il; ei += 1) {
          if (embeds[ei].src.search(this.REGEXPS.videos) === -1)
            embedCount += 1;
        }

        let linkDensity = this._getLinkDensity(tagsList[i]);
        let contentLength = this._getInnerText(tagsList[i]).length;
        let toRemove = false;

        if (img > p) {
          toRemove = true;
        } else if (li > p && tag !== "ul" && tag !== "ol") {
          toRemove = true;
        } else if (input > Math.floor(p / 3)) {
          toRemove = true;
        } else if (contentLength < 25 && (img === 0 || img > 2)) {
          toRemove = true;
        } else if (weight < 25 && linkDensity > 0.2) {
          toRemove = true;
        } else if (weight >= 25 && linkDensity > 0.5) {
          toRemove = true;
        } else if ((embedCount === 1 && contentLength < 75) || embedCount > 1) {
          toRemove = true;
        }

        if (toRemove)
          tagsList[i].parentNode.removeChild(tagsList[i]);
      }
    }
  },

  /**
   * Clean out spurious headers from an Element. Checks things like classnames and link density.
   *
   * @param Element
   * @return void
   **/
  _cleanHeaders: function(e) {
    for (let headerIndex = 1; headerIndex < 3; headerIndex += 1) {
      let headers = e.getElementsByTagName('h' + headerIndex);
      for (let i = headers.length - 1; i >= 0; i -= 1) {
        if (this._getClassWeight(headers[i]) < 0 || this._getLinkDensity(headers[i]) > 0.33)
          headers[i].parentNode.removeChild(headers[i]);
      }
    }
  },

  _flagIsActive: function(flag) {
    return (this._flags & flag) > 0;
  },

  _addFlag: function(flag) {
    this._flags = this._flags | flag;
  },

  _removeFlag: function(flag) {
    this._flags = this._flags & ~flag;
  },
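
  // Example: with all three flags set (_flags === 0x7), calling
  // _removeFlag(this.FLAG_STRIP_UNLIKELYS) leaves 0x6, after which
  // _flagIsActive(this.FLAG_STRIP_UNLIKELYS) returns false.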

  /**
   * Runs readability.
   *
   * Workflow:
   * 1. Prep the document by removing script tags, css, etc.
   * 2. Build readability's DOM tree.
   * 3. Grab the article content from the current dom tree.
   * 4. Replace the current DOM tree with the new one.
   * 5. Read peacefully.
   *
   * @return Object with title, byline, dir, content, length and excerpt
   *         properties, or null if no article content was found.
   **/
  parse: function () {
    // Remove script tags from the document.
    this._removeScripts(this._doc);

    // FIXME: Disabled multi-page article support for now as it
    // needs more work on infrastructure.

    // Make sure this document is added to the list of parsed pages first,
    // so we don't double up on the first page.
    // this._parsedPages[uri.spec.replace(/\/$/, '')] = true;

    // Pull out any possible next page link first.
    // let nextPageLink = this._findNextPageLink(doc.body);

    this._prepDocument();

    let articleTitle = this._getArticleTitle();
    let articleContent = this._grabArticle();
    if (!articleContent)
      return null;

    this._postProcessContent(articleContent);

    // if (nextPageLink) {
    //   // Append any additional pages after a small timeout so that people
    //   // can start reading without having to wait for this to finish processing.
    //   setTimeout((function() {
    //     this._appendNextPage(nextPageLink);
    //   }).bind(this), 500);
    // }

    let excerpt = this._getExcerpt(articleContent);

    return { title: articleTitle,
             byline: this._articleByline,
             dir: this._articleDir,
             content: articleContent.innerHTML,
             length: articleContent.textContent.length,
             excerpt: excerpt };
  }
};