Skip to content

Commit a60c59c

Browse files
committed
Adding types and using native methods and properties where available
1 parent e03143c commit a60c59c

File tree

4 files changed

+25
-40
lines changed

4 files changed

+25
-40
lines changed

src/Nodes/NodeTrait.php

Lines changed: 13 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
use fivefilters\Readability\Nodes\DOM\DOMElement;
77
use fivefilters\Readability\Nodes\DOM\DOMNode;
88
use fivefilters\Readability\Nodes\DOM\DOMText;
9-
use DOMNodeList;
109

1110
/**
1211
* @property ?DOMNode $firstChild
@@ -19,29 +18,20 @@ trait NodeTrait
1918
{
2019
/**
2120
* Content score of the node. Used to determine the value of the content.
22-
*
23-
* @var int
2421
*/
25-
public $contentScore = 0;
22+
public float $contentScore = 0.0;
2623

2724
/**
2825
* Flag for initialized status.
29-
*
30-
* @var bool
3126
*/
32-
private $initialized = false;
27+
private bool $initialized = false;
3328

3429
/**
35-
* Flag data tables.
36-
*
37-
* @var bool
30+
* Flag for data tables.
3831
*/
39-
private $readabilityDataTable = false;
32+
private bool $readabilityDataTable = false;
4033

41-
/**
42-
* @var array
43-
*/
44-
private $divToPElements = [
34+
private array $divToPElements = [
4535
'blockquote',
4636
'dl',
4737
'div',
@@ -56,10 +46,8 @@ trait NodeTrait
5646
/**
5747
* The commented out elements qualify as phrasing content but tend to be
5848
* removed by readability when put into paragraphs, so we ignore them here.
59-
*
60-
* @var array
6149
*/
62-
private $phrasing_elems = [
50+
private array $phrasing_elems = [
6351
// 'CANVAS', 'IFRAME', 'SVG', 'VIDEO',
6452
'abbr', 'audio', 'b', 'bdo', 'br', 'button', 'cite', 'code', 'data',
6553
'datalist', 'dfn', 'em', 'embed', 'i', 'img', 'input', 'kbd', 'label',
@@ -69,7 +57,7 @@ trait NodeTrait
6957
];
7058

7159
/**
72-
* initialized getter.
60+
* Getter for the initialized flag.
7361
*/
7462
public function isInitialized(): bool
7563
{
@@ -444,18 +432,18 @@ public function isPhrasingContent(): bool
444432
*/
445433
public function isProbablyVisible(): bool
446434
{
447-
return !preg_match('/display:( )?none/i', $this->getAttribute('style')) &&
435+
return !preg_match('/display:( )?none/i', $this->getAttribute('style')) &&
448436
!$this->hasAttribute('hidden') &&
449437
//check for "fallback-image" so that wikimedia math images are displayed
450-
(!$this->hasAttribute('aria-hidden') || $this->getAttribute('aria-hidden') !== 'true' || ($this->hasAttribute('class') && strpos($this->getAttribute('class'), 'fallback-image') !== false));
438+
(!$this->hasAttribute('aria-hidden') || $this->getAttribute('aria-hidden') !== 'true' || str_contains($this->getAttribute('class'), 'fallback-image'));
451439
}
452440

453441
/**
454442
* Check if node is whitespace.
455443
*/
456444
public function isWhitespace(): bool
457445
{
458-
return ($this->nodeType === XML_TEXT_NODE && mb_strlen(trim($this->textContent)) === 0) ||
446+
return ($this->nodeType === XML_TEXT_NODE && $this->isWhitespaceInElementContent()) ||
459447
($this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br');
460448
}
461449

@@ -497,17 +485,12 @@ public function shiftingAwareGetElementsByTagName(string $tag): \Generator
497485
}
498486

499487
/**
500-
* Mimics JS's firstElementChild property. PHP only has firstChild which could be any type of DOMNode. Use this
501-
* function to get the first one that is an DOMElement node.
488+
* Get the first element child, or null if there is none.
502489
*/
503490
public function getFirstElementChild(): ?DOMElement
504491
{
505-
if ($this->childNodes instanceof \Traversable) {
506-
foreach ($this->childNodes as $node) {
507-
if ($node instanceof DOMElement) {
508-
return $node;
509-
}
510-
}
492+
if ($this->nodeType === XML_ELEMENT_NODE || $this->nodeType === XML_DOCUMENT_NODE) {
493+
return $this->firstElementChild;
511494
}
512495

513496
return null;

src/Nodes/NodeUtility.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ public static function filterTextNodes(\DOMNodeList $list): DOMNodeList
157157
{
158158
$newList = new DOMNodeList();
159159
foreach ($list as $node) {
160-
if ($node->nodeType !== XML_TEXT_NODE || mb_strlen(trim($node->nodeValue))) {
160+
if ($node->nodeType !== XML_TEXT_NODE || !$node->isWhitespaceInElementContent()) {
161161
$newList->add($node);
162162
}
163163
}

src/Readability.php

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -665,7 +665,7 @@ private function getArticleTitle(): ?string
665665
$curTitle = preg_replace('/[^\|\-\\\\\/>»]*[\|\-\\\\\/>»](.*)/i', '$1', $originalTitle);
666666
$this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
667667
}
668-
} elseif (strpos($curTitle, ': ') !== false) {
668+
} elseif (str_contains($curTitle, ': ')) {
669669
// Check if we have a heading containing this exact string, so we
670670
// could assume it's the full title.
671671
$match = false;
@@ -976,7 +976,7 @@ private function checkByline(DOMNode|DOMText|DOMElement $node, string $matchStri
976976
$rel = $node->getAttribute('rel');
977977
$itemprop = $node->getAttribute("itemprop");
978978

979-
if ($rel === 'author' || ($itemprop && strpos($itemprop, 'author') !== false) || preg_match(NodeUtility::$regexps['byline'], $matchString) && $this->isValidByline($node->getTextContent(false))) {
979+
if ($rel === 'author' || ($itemprop && str_contains($itemprop, 'author')) || preg_match(NodeUtility::$regexps['byline'], $matchString) && $this->isValidByline($node->getTextContent(false))) {
980980
$this->logger->info(sprintf('[Metadata] Found article author: \'%s\'', $node->getTextContent(false)));
981981
$this->setAuthor(trim($node->getTextContent(false)));
982982

@@ -1117,7 +1117,7 @@ private function unwrapNoscriptImages(DOMDocument $dom): void
11171117
}
11181118
}
11191119

1120-
$noscript->parentNode->replaceChild($tmp->getFirstElementChild(), $prevElement);
1120+
$noscript->parentNode->replaceChild($tmp->firstElementChild, $prevElement);
11211121
}
11221122
});
11231123
}
@@ -1604,11 +1604,11 @@ public function prepArticle(DOMDocument $article): DOMDocument
16041604
// Remove single-cell tables
16051605
foreach ($article->shiftingAwareGetElementsByTagName('table') as $table) {
16061606
/** @var DOMElement $table */
1607-
$tbody = $table->hasSingleTagInsideElement('tbody') ? $table->getFirstElementChild() : $table;
1607+
$tbody = $table->hasSingleTagInsideElement('tbody') ? $table->firstElementChild : $table;
16081608
if ($tbody->hasSingleTagInsideElement('tr')) {
1609-
$row = $tbody->getFirstElementChild();
1609+
$row = $tbody->firstElementChild;
16101610
if ($row->hasSingleTagInsideElement('td')) {
1611-
$cell = $row->getFirstElementChild();
1611+
$cell = $row->firstElementChild;
16121612
$cell = NodeUtility::setNodeTag($cell, (array_reduce(iterator_to_array($cell->childNodes), function ($carry, $node) {
16131613
return $node->isPhrasingContent() && $carry;
16141614
}, true)) ? 'p' : 'div');
@@ -2064,7 +2064,7 @@ private function headerDuplicatesTitle(DOMNode|DOMText|DOMElement $node): bool
20642064
**/
20652065
public function _cleanClasses(DOMDocument|DOMText|DOMNode|DOMElement $node): void
20662066
{
2067-
if ($node->getAttribute('class') !== '') {
2067+
if ($node->hasAttribute('class')) {
20682068
$node->removeAttribute('class');
20692069
}
20702070

@@ -2088,7 +2088,7 @@ public function postProcessContent(DOMDocument $article): DOMDocument
20882088
if ($href) {
20892089
// Remove links with javascript: URIs, since
20902090
// they won't work after scripts have been removed from the page.
2091-
if (strpos($href, 'javascript:') === 0) {
2091+
if (str_starts_with($href, 'javascript:')) {
20922092
$this->logger->debug(sprintf('[PostProcess] Removing \'javascript:\' link. Content is: \'%s\'', substr($link->textContent, 0, 128)));
20932093

20942094
// if the link only contains simple text content, it can be converted to a text node

test/ConfigurationTest.php

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ private function doEqualsAsserts(Configuration $config, array $options): void
4343
$this->assertEquals($options['stripUnlikelyCandidates'], $config->getStripUnlikelyCandidates());
4444
$this->assertEquals($options['cleanConditionally'], $config->getCleanConditionally());
4545
$this->assertEquals($options['weightClasses'], $config->getWeightClasses());
46+
$this->assertEquals($options['keepClasses'], $config->getKeepClasses());
4647
$this->assertEquals($options['fixRelativeURLs'], $config->getFixRelativeURLs());
4748
$this->assertEquals($options['substituteEntities'], $config->getSubstituteEntities());
4849
$this->assertEquals($options['normalizeEntities'], $config->getNormalizeEntities());
@@ -68,7 +69,8 @@ public static function getParams(): array
6869
'substituteEntities' => true,
6970
'normalizeEntities' => true,
7071
'originalURL' => 'my.original.url',
71-
'summonCthulhu' => 'my.original.url',
72+
'summonCthulhu' => false,
73+
'keepClasses' => false,
7274
'invalidParameter' => 'invalidParameterValue'
7375
]
7476
]];

0 commit comments

Comments
 (0)