Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Readability-readerable.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
var REGEXPS = {
// NOTE: These two regular expressions are duplicated in
// Readability.js. Please keep both copies in sync.
unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
unlikelyCandidates: /-ad-|ai2html|banner|combx|comment|community|cover-wrap|credentials|date|hide|hidden|disqus|extra|footer|gdpr|legends|nav|paywall|meta|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|share|sharing|yom-remote|byline|topbar|article-meta|brand|tooltip/i,
okMaybeItsACandidate: /and|article|body|column|content|main|shadow|header|summary/i,
};

function isNodeVisible(node) {
Expand Down
76 changes: 56 additions & 20 deletions Readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ Readability.prototype = {
DEFAULT_N_TOP_CANDIDATES: 5,

// Element tags to score by default.
DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre,summary,article,header,main".toUpperCase().split(","),

// The default number of chars an article must have in order to return a result
DEFAULT_CHAR_THRESHOLD: 500,
Expand All @@ -122,17 +122,17 @@ Readability.prototype = {
REGEXPS: {
// NOTE: These two regular expressions are duplicated in
// Readability-readerable.js. Please keep both copies in sync.
unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
unlikelyCandidates: /-ad-|ai2html|banner|combx|comment|community|cover-wrap|credentials|date|hide|hidden|disqus|extra|footer|gdpr|legends|nav|paywall|meta|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|share|sharing|yom-remote|byline|topbar|article-meta|brand|tooltip/i,
okMaybeItsACandidate: /and|article|body|column|content|main|shadow|header|summary/i,

positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
negative: /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
byline: /byline|author|dateline|writtenby|p-author/i,
positive: /article|body|content|entry|header|hentry|h-entry|intro|intro|intro|intro|main|main-article|main-content|page|lead|leading|pagination|primary|post|text|blog|story|summary|strapline/i,
negative: /-ad-|affiliate|credentials|controls|date|desktop|hidden|nav|^hid$| hid$| hid |^hid |hide|banner|login|gate|combx|comment|com-|contact|foot|footer|footnote|gdpr|icon|^icon|icons$|icons|masthead|media|meta|paywall|nav|outbrain|promo|related|scroll|share|sharing|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|tooltip|widget|video-player|video|jw-player|jw-aspect|modal|carousel|overlay|byline|brand|disclosure|nav|logo|account|cart|dock/i,
extraneous: /print|affiliate|archive|button|comment|controls|discuss|e[\-]?mail|meta|icons|share|reply|all|login|sign|single|utility|icons|nav|video-player|jw-player|modal|video|paidcontent|carousel|overlay|social|topbar|article-meta|onetrust-consent-sdk|logo|account|cart|hamburger|traffic|weather|search/i,
byline: /byline|author|dateline|credentials|writtenby|p-author|article-author/i,
replaceFonts: /<(\/?)font[^>]*>/gi,
normalize: /\s{2,}/g,
videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i,
shareElements: /(\b|_)(share|sharedaddy|social|sharebar)(\b|_)/i,
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
prevLink: /(prev|earl|old|new|<|«)/i,
tokenize: /\W+/g,
Expand All @@ -148,7 +148,10 @@ Readability.prototype = {
jsonLdArticleTypes: /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/
},

UNLIKELY_ROLES: [ "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog" ],
UNLIKELY_ROLES: [ "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog", "nav" ],

NODES_TO_CLEAN_FIRST: ["object", "embed", "footer", "link", "aside", "nav", ".icons", ".byline", ".sub-nav", ".identity", ".logo", ".video-player", ".jw-player", ".jw-wrapper", ".video", ".byline", ".author", ".dateline", ".credentials", ".writtenby", ".p-author", ".article-author", ".navigation", ".hidden-xs", ".hidden-sm", ".brand", ".modalContent", ".noPrint", ".noprint", ".screenonly", ".breadcrumb", ".breadcrumbs", "amp-iframe", "amp-img", "amp-ad", ".advert", ".ads", ".brand", ".search", ".nav", ".user", ".users", "#onetrust-consent-sdk", "#branding", "#branding-content" ],
NODES_TO_CLEAN_SECOND: [ "iframe", "input", "textarea", "select", "button", "svg"],

DIV_TO_P_ELEMS: new Set([ "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL" ]),

Expand Down Expand Up @@ -679,11 +682,9 @@ Readability.prototype = {
// Clean out junk from the article content
this._cleanConditionally(articleContent, "form");
this._cleanConditionally(articleContent, "fieldset");
this._clean(articleContent, "object");
this._clean(articleContent, "embed");
this._clean(articleContent, "footer");
this._clean(articleContent, "link");
this._clean(articleContent, "aside");
this.NODES_TO_CLEAN_FIRST.forEach((el) => {
this._clean(articleContent, el);
});

// Clean out elements with little content that have "share" in their id/class combinations from final top candidates,
// which means we don't remove the top candidates even if they have "share".
Expand All @@ -696,11 +697,9 @@ Readability.prototype = {
});
});

this._clean(articleContent, "iframe");
this._clean(articleContent, "input");
this._clean(articleContent, "textarea");
this._clean(articleContent, "select");
this._clean(articleContent, "button");
this.NODES_TO_CLEAN_SECOND.forEach((el) => {
this._clean(articleContent, el);
});
this._cleanHeaders(articleContent);

// Do these last as the previous stuff may have removed junk
Expand All @@ -709,6 +708,13 @@ Readability.prototype = {
this._cleanConditionally(articleContent, "ul");
this._cleanConditionally(articleContent, "div");

// Scale h2-h5 down by one heading level because they are too large most of the time (intros in h2, etc.)
this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h5"]), "h6");
this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h4"]), "h5");
this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h3"]), "h4");
this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h2"]), "h3");


// replace H1 with H2 as H1 should be only title that is displayed separately
this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h1"]), "h2");

Expand Down Expand Up @@ -756,6 +762,9 @@ Readability.prototype = {

switch (node.tagName) {
case "DIV":
case "MAIN":
case "HEADER":
case "ARTICLE":
node.readability.contentScore += 5;
break;

Expand Down Expand Up @@ -826,6 +835,10 @@ Readability.prototype = {
// works by splitting both texts into words and then finding the words that are unique to the second text
// the result is given by the lower length of unique parts
_textSimilarity: function(textA, textB) {
if (!textA || !textB)
return 0;
if (Math.abs(textA.length - textB.length) > 25)
return 0;
var tokensA = textA.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean);
var tokensB = textB.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean);
if (!tokensA.length || !tokensB.length) {
Expand Down Expand Up @@ -885,6 +898,11 @@ Readability.prototype = {
return null;
}

var fullArticleText = this._doc.body.innerText;
if (fullArticleText.length) {
fullArticleText = fullArticleText.split(/[\r\n]+/).filter((el) => el.length > 50);
}

var pageCacheHtml = page.innerHTML;

while (true) {
Expand Down Expand Up @@ -1013,6 +1031,8 @@ Readability.prototype = {
* A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
**/
var candidates = [];
var elementsCounter = 0;

this._forEachNode(elementsToScore, function(elementToScore) {
if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined")
return;
Expand All @@ -1027,6 +1047,8 @@ Readability.prototype = {
if (ancestors.length === 0)
return;

elementsCounter++;

var contentScore = 0;

// Add a point for the paragraph itself as a base.
Expand All @@ -1038,6 +1060,20 @@ Readability.prototype = {
// For every 100 characters in this paragraph, add another point. Up to 3 points.
contentScore += Math.min(Math.floor(innerText.length / 100), 3);

if (innerText.length > 100 && elementsCounter < 10)
fullArticleText.forEach((el) => {
if (el.length > 5 && innerText.indexOf(el) != -1) {
var extra = Math.max(Math.max(0, 10 * (10 - elementsCounter)), 10);
// console.log('add ', extra, innerText);
contentScore += extra;
}
});

// extra score for headers
if (elementToScore.tagName && elementToScore.tagName.length == 2 && elementToScore.tagName.toLowerCase().startsWith("h")) {
contentScore += 100;
}

// Initialize and score ancestors.
this._forEachNode(ancestors, function(ancestor, level) {
if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === "undefined")
Expand Down Expand Up @@ -1546,7 +1582,7 @@ Readability.prototype = {

// get article published time
metadata.publishedTime = jsonld.datePublished ||
values["article:published_time"] || null;
values["article:published_time"] || null;

// in many sites the meta value is escaped with HTML entities,
// so here we need to unescape it
Expand Down