Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 114 additions & 6 deletions Readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,17 @@ function Readability(doc, options) {
this._disableJSONLD = !!options.disableJSONLD;
this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos;
this._linkDensityModifier = options.linkDensityModifier || 0;
/**
* If true, keep the first in-article H1/H2 that duplicates the article title
* and leave H1 tags in the extracted content. Defaults to false (strip the
* duplicate title header and normalize remaining H1 elements to H2).
* When true, also prepend clones of document `h1` nodes that lie outside the
* extracted subtree and precede the grabbed content in document order (for example
* hero headings); snapshots are taken before `_grabArticle` because extraction
* mutates the DOM. Those clones are inserted before `_postProcessContent` so they
* receive URI fixes and class cleanup.
*/
this._keepOriginalTitleHeaders = !!options.keepOriginalTitleHeaders;

// Start with all flags set
this._flags =
Expand Down Expand Up @@ -835,11 +846,13 @@ Readability.prototype = {
this._cleanConditionally(articleContent, "ul");
this._cleanConditionally(articleContent, "div");

// replace H1 with H2 as H1 should be only title that is displayed separately
this._replaceNodeTags(
this._getAllNodesWithTag(articleContent, ["h1"]),
"h2"
);
if (!this._keepOriginalTitleHeaders) {
// replace H1 with H2 as H1 should be only title that is displayed separately
this._replaceNodeTags(
this._getAllNodesWithTag(articleContent, ["h1"]),
"h2"
);
}

// Remove extra paragraphs
this._removeNodes(
Expand Down Expand Up @@ -1064,7 +1077,7 @@ Readability.prototype = {
var elementsToScore = [];
var node = this._doc.documentElement;

let shouldRemoveTitleHeader = true;
let shouldRemoveTitleHeader = !this._keepOriginalTitleHeaders;

while (node) {
if (node.tagName === "HTML") {
Expand Down Expand Up @@ -2709,6 +2722,76 @@ Readability.prototype = {
return this._textSimilarity(this._articleTitle, heading) > 0.75;
},

/**
* Assign stable preorder indices (depth-first, element-only) so we can compare what
* appeared before extracted content while `_grabArticle` still sees the original tree.
*
* @param Element root
* @param {{ i: number }} counterHolder mutable `{ i }` counter.
*/
_documentPreorderWalk(root, counterHolder) {
if (!root || root.nodeType !== this.ELEMENT_NODE) {
return;
}
this._elementPreorderIndex.set(root, counterHolder.i++);
var child = root.firstElementChild;
while (child) {
this._documentPreorderWalk(child, counterHolder);
child = child.nextElementSibling;
}
},

/**
* Prepend `h1` clones that existed elsewhere on the page before extraction (hero,
* etc.), in document order. Snapshots pair each original node with its clone because
* `_grabArticle` may remove or move originals. Only headings whose preorder index is
* strictly before the earliest preorder among nodes inside `articleContent` are kept
* ("before grabbed content").
*
* @param Element articleContent root returned by `_grabArticle`.
* @param Array<{original: Element, clone: Element, preorder?: number}> snapshots from before grab.
*/
_prependExternalH1HeadingsBeforePostProcess(articleContent, snapshots) {
if (!snapshots || !snapshots.length) {
return;
}

var minPreorderInGrabbed = Infinity;
var descendants = articleContent.querySelectorAll("*");
for (var j = 0; j < descendants.length; j++) {
var grabbedPo = this._elementPreorderIndex.get(descendants[j]);
if (grabbedPo !== undefined) {
minPreorderInGrabbed = Math.min(minPreorderInGrabbed, grabbedPo);
}
}

var fragment = this._doc.createDocumentFragment();

for (var i = 0; i < snapshots.length; i++) {
var entry = snapshots[i];
if (articleContent.contains(entry.original)) {
continue;
}
if (!this._isProbablyVisible(entry.original)) {
continue;
}
if (
entry.preorder === undefined ||
minPreorderInGrabbed === Infinity ||
entry.preorder >= minPreorderInGrabbed
) {
continue;
}
fragment.appendChild(entry.clone);
}

if (!fragment.childNodes.length) {
return;
}

articleContent.insertBefore(fragment, articleContent.firstChild);
},

_flagIsActive(flag) {
return (this._flags & flag) > 0;
},
Expand Down Expand Up @@ -2770,13 +2853,38 @@ Readability.prototype = {
this._metadata = metadata;
this._articleTitle = metadata.title;

var prefgrabH1Snapshots = null;
if (this._keepOriginalTitleHeaders) {
this._elementPreorderIndex = new WeakMap();
var preorderCounter = { i: 0 };
this._documentPreorderWalk(this._doc.documentElement, preorderCounter);

prefgrabH1Snapshots = Array.from(
this._doc.getElementsByTagName("h1"),
function (h) {
return {
original: h,
clone: h.cloneNode(true),
preorder: this._elementPreorderIndex.get(h),
};
}.bind(this)
);
}

var articleContent = this._grabArticle();
if (!articleContent) {
return null;
}

this.log("Grabbed: " + articleContent.innerHTML);

if (prefgrabH1Snapshots) {
this._prependExternalH1HeadingsBeforePostProcess(
articleContent,
prefgrabH1Snapshots
);
}

this._postProcessContent(articleContent);

// If we haven't found an excerpt in the article's metadata, use the article's
Expand Down
9 changes: 9 additions & 0 deletions index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,15 @@ export interface ReadabilityOptions<T = string> {
* Defaults to 0.
*/
linkDensityModifier?: number;
/**
* If `true`, the first in-article heading that closely matches the article
* title is kept, and H1 tags in the extracted content are not rewritten to H2.
* When `true`, also prepends clones of document `h1` elements that lie outside the
* extracted subtree **and** precede the grabbed content in document order (for example
* hero titles), captured before extraction so they still run through post-processing
* (relative URL fixes, etc.). Defaults to `false`.
*/
keepOriginalTitleHeaders?: boolean;
}

export class Readability<T = string> {
Expand Down
185 changes: 185 additions & 0 deletions test/test-keep-original-title-headers.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
/* eslint-env node, mocha */

var JSDOM = require("jsdom").JSDOM;
var chai = require("chai");
var expect = chai.expect;

var Readability = require("../index").Readability;

/**
 * Build a minimal HTML page whose `<article>` holds one heading followed by two
 * identical filler paragraphs.
 *
 * @param {string} titleText text placed in the document `<title>`.
 * @param {string} headingTag tag name used for the heading (for example "h1").
 * @param {string} headingText text placed inside the heading.
 * @returns {string} the complete HTML document as a single string.
 */
function articleHtml(titleText, headingTag, headingText) {
  var filler = [
    "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do",
    "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad",
    "minim veniam, quis nostrud exercitation ullamco laboris nisi ut",
    "aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit",
    "in voluptate velit esse cillum dolore eu fugiat nulla pariatur.",
  ].join(" ");
  var heading = `<${headingTag}>${headingText}</${headingTag}>`;
  var paragraph = `<p>${filler}</p>`;

  return (
    `<!DOCTYPE html><html><head><title>${titleText}</title></head>` +
    `<body><article>${heading}${paragraph}${paragraph}</article></body></html>`
  );
}

// End-to-end checks for the `keepOriginalTitleHeaders` option: each test builds a
// small page, runs Readability over it with the option on or off, and inspects
// the serialized article content and title.
describe("keepOriginalTitleHeaders option", function () {
  this.timeout(30000);

  var PAGE_URL = "http://example.com/article";

  // Filler paragraph text used by the hand-built pages below.
  var filler = [
    "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do",
    "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad",
    "minim veniam, quis nostrud exercitation ullamco laboris nisi ut",
    "aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit",
    "in voluptate velit esse cillum dolore eu fugiat nulla pariatur.",
  ].join(" ");

  // Page whose <main> wraps a heading-less <article>, with optional markup
  // placed immediately before and after the article.
  function pageWithMain(titleText, beforeArticle, afterArticle) {
    return [
      "<!DOCTYPE html><html><head><title>",
      titleText,
      "</title></head><body><main>",
      beforeArticle,
      "<article><p>",
      filler,
      "</p><p>",
      filler,
      "</p></article>",
      afterArticle,
      "</main></body></html>",
    ].join("");
  }

  // Hero banner carrying the page title in an H1 that sits outside the article.
  function heroSection(titleText) {
    return (
      '<section class="hero"><h1 class="hero-title">' +
      titleText +
      "</h1></section>"
    );
  }

  // Parse `html` with Readability; the option is only passed when requested so
  // the "false" cases exercise the constructor's default.
  function parsePage(html, keepHeaders) {
    var doc = new JSDOM(html, { url: PAGE_URL }).window.document;
    var reader = keepHeaders
      ? new Readability(doc, { keepOriginalTitleHeaders: true })
      : new Readability(doc);
    return reader.parse();
  }

  it("when false, removes the first heading that duplicates the title and rewrites other H1 to H2", function () {
    var titleText = "Readability Title Headers Option Test 7f3a";
    var result = parsePage(articleHtml(titleText, "h1", titleText), false);

    expect(result.content).to.not.include("<h1>");
    expect(result.content).to.not.include("<h2>" + titleText);
    expect(result.title).to.eql(titleText);
  });

  it("when true, keeps the duplicate title header as H1 and does not rewrite it to H2", function () {
    var titleText = "Readability Title Headers Option Test 7f3b";
    var result = parsePage(articleHtml(titleText, "h1", titleText), true);

    expect(result.content).to.include("<h1>" + titleText + "</h1>");
    expect(result.title).to.eql(titleText);
  });

  it("when false, rewrites a non-title H1 in the article body to H2", function () {
    var titleText = "Readability Title Headers Option Test 7f3c";
    var bodyHeading = "Distinct In Article Heading 9z2q";
    var result = parsePage(articleHtml(titleText, "h1", bodyHeading), false);

    expect(result.content).to.include("<h2>" + bodyHeading + "</h2>");
    expect(result.content).to.not.include("<h1>" + bodyHeading);
  });

  it("when true, leaves a non-title H1 in the article body as H1", function () {
    var titleText = "Readability Title Headers Option Test 7f3d";
    var bodyHeading = "Distinct In Article Heading 9z2r";
    var result = parsePage(articleHtml(titleText, "h1", bodyHeading), true);

    expect(result.content).to.include("<h1>" + bodyHeading + "</h1>");
    expect(result.content).to.not.include("<h2>" + bodyHeading);
  });

  it("when true, prepends clones of document-level H1 outside the extracted subtree (before post-processing)", function () {
    var titleText = "Readability External Hero H1 Title Option Test 9x4m";
    var source = pageWithMain(titleText, heroSection(titleText), "");
    var result = parsePage(source, true);

    expect(result.content).to.include("<h1>" + titleText + "</h1>");
    // The prepended heading must come before the extracted page wrapper.
    var headingAt = result.content.indexOf("<h1>" + titleText);
    var pageWrapperAt = result.content.indexOf('id="readability-page-1"');
    expect(headingAt).to.be.lessThan(pageWrapperAt);
    expect(result.title).to.eql(titleText);
  });

  it("when true, does not prepend H1 that appear after grabbed content in document order", function () {
    var titleText =
      "Readability Article Title After Hero Ignore Later H1 Test 9x5p";
    var sidebarHeading = "Sidebar Or Footer H1 Must Not Prepend 9x5q";
    var aside =
      "<aside><h1>" + sidebarHeading + "</h1><p>" + filler + "</p></aside>";
    var result = parsePage(pageWithMain(titleText, "", aside), true);

    expect(result.content).to.not.include(sidebarHeading);
    expect(result.title).to.eql(titleText);
  });

  it("when false, does not prepend hero H1 from outside the extracted subtree", function () {
    var titleText =
      "Readability External Hero H1 Absent When Option False Test 9x4n";
    var source = pageWithMain(titleText, heroSection(titleText), "");
    var result = parsePage(source, false);

    expect(result.content).to.not.include("<h1");
    expect(result.title).to.eql(titleText);
  });
});
8 changes: 8 additions & 0 deletions test/test-readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,14 @@ describe("Readability API", function () {
);
});

// The constructor coerces the option to a boolean: false when omitted, true when set.
it("should accept a keepOriginalTitleHeaders option", function () {
  var withoutOption = new Readability(doc);
  expect(withoutOption._keepOriginalTitleHeaders).eql(false);

  var withOption = new Readability(doc, { keepOriginalTitleHeaders: true });
  expect(withOption._keepOriginalTitleHeaders).eql(true);
});

it("should accept a allowedVideoRegex option or default it", function () {
expect(new Readability(doc)._allowedVideoRegex).eql(
Readability.prototype.REGEXPS.videos
Expand Down