From 63b52d8c030de5d47efaef8ce7a4b6ea5775ef39 Mon Sep 17 00:00:00 2001 From: Robert Newton Date: Thu, 14 Nov 2019 13:11:43 -0800 Subject: [PATCH 01/68] allow changing of tag --- src/PHPHtmlParser/Dom/AbstractNode.php | 23 +++++++++++++++++++- tests/Node/HtmlTest.php | 30 ++++++++++++++++++-------- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index ac86f538..dd53a929 100644 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -321,6 +321,27 @@ public function getTag(): Tag return $this->tag; } + /** + * Replaces the tag for this node + * + * @param string|Tag $tag + * @return AbstractNode + * @chainable + */ + public function setTag($tag): AbstractNode + { + if (is_string($tag)) { + $tag = new Tag($tag); + } + + $this->tag = $tag; + + // clear any cache + $this->clear(); + + return $this; + } + /** * A wrapper method that simply calls the getAttribute method * on the tag of this node. @@ -512,7 +533,7 @@ abstract protected function clear(): void; * * @return boolean */ - public function isTextNode(): bool + public function isTextNode(): bool { return false; diff --git a/tests/Node/HtmlTest.php b/tests/Node/HtmlTest.php index e5642bb4..df6ca6cf 100644 --- a/tests/Node/HtmlTest.php +++ b/tests/Node/HtmlTest.php @@ -195,7 +195,7 @@ public function testOuterHtmlEmpty() ], ]); $node = new HtmlNode($a); - + $this->assertEquals("", $node->OuterHtml()); } @@ -275,9 +275,9 @@ public function testOuterHtmlWithChanges() $childa->addChild(new TextNode('link')); $this->assertEquals('
link
', $parent->outerHtml()); - + $childa->setAttribute('href', 'https://www.google.com'); - + $this->assertEquals('link', $childa->outerHtml()); } @@ -286,7 +286,7 @@ public function testText() $a = new Tag('a'); $node = new HtmlNode($a); $node->addChild(new TextNode('link')); - + $this->assertEquals('link', $node->text()); } @@ -295,7 +295,7 @@ public function testTextTwice() $a = new Tag('a'); $node = new HtmlNode($a); $node->addChild(new TextNode('link')); - + $text = $node->text(); $this->assertEquals($text, $node->text()); } @@ -312,7 +312,7 @@ public function testTextMagic() { $node = new HtmlNode('a'); $node->addChild(new TextNode('link')); - + $this->assertEquals('link', $node->text); } @@ -358,7 +358,7 @@ public function testGetAttribute() 'doubleQuote' => true, ], ]); - + $this->assertEquals('outerlink rounded', $node->getAttribute('class')); } @@ -375,7 +375,7 @@ public function testGetAttributeMagic() 'doubleQuote' => true, ], ]); - + $this->assertEquals('http://google.com', $node->href); } @@ -392,7 +392,7 @@ public function testGetAttributes() 'doubleQuote' => true, ], ]); - + $this->assertEquals('outerlink rounded', $node->getAttributes()['class']); } @@ -420,6 +420,18 @@ public function testRemoveAllAttributes() $this->assertEquals(0, count($node->getAttributes())); } + public function testSetTag() + { + $node = new HtmlNode('div'); + $this->assertEquals('
', $node->outerHtml()); + + $node->setTag('p'); + $this->assertEquals('

', $node->outerHtml()); + + $node->setTag(new Tag('span')); + $this->assertEquals('', $node->outerHtml()); + } + public function testCountable() { $div = new Tag('div'); From 8a551ccda8e777fe031c519dae7809524be4b03f Mon Sep 17 00:00:00 2001 From: Harry Merritt Date: Fri, 10 Jan 2020 21:13:31 +0000 Subject: [PATCH 02/68] Add custom headers to curl request - pass headers as an option when using loadFromUrl --- src/PHPHtmlParser/Curl.php | 7 ++++++- src/PHPHtmlParser/CurlInterface.php | 3 ++- src/PHPHtmlParser/Dom.php | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/PHPHtmlParser/Curl.php b/src/PHPHtmlParser/Curl.php index 353b00f0..8eb90451 100644 --- a/src/PHPHtmlParser/Curl.php +++ b/src/PHPHtmlParser/Curl.php @@ -15,10 +15,11 @@ class Curl implements CurlInterface * A simple curl implementation to get the content of the url. * * @param string $url + * @param array $options * @return string * @throws CurlException */ - public function get(string $url): string + public function get(string $url, array $options): string { $ch = curl_init($url); @@ -26,6 +27,10 @@ public function get(string $url): string curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); } + if (isset($options['curlHeaders'])) { + curl_setopt($ch, CURLOPT_HTTPHEADER, $options['curlHeaders']); + } + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5); curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); diff --git a/src/PHPHtmlParser/CurlInterface.php b/src/PHPHtmlParser/CurlInterface.php index 1d5d96c8..ff6ac97f 100644 --- a/src/PHPHtmlParser/CurlInterface.php +++ b/src/PHPHtmlParser/CurlInterface.php @@ -13,7 +13,8 @@ interface CurlInterface * This method should return the content of the url in a string * * @param string $url + * @param array $options * @return string */ - public function get(string $url): string; + public function get(string $url, array $options): string; } diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php 
index cafce57c..cc46aa5f 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -191,7 +191,7 @@ public function loadFromUrl(string $url, array $options = [], CurlInterface $cur // use the default curl interface $curl = new Curl; } - $content = $curl->get($url); + $content = $curl->get($url, $options); return $this->loadStr($content, $options); } From 02b2d0caa3a03d9e4829204a87194a54b5beefce Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 12 Jan 2020 20:40:54 +0000 Subject: [PATCH 03/68] Added support for php 7.4 --- .travis.yml | 1 + CHANGELOG.md | 5 +++++ README.md | 4 ++-- src/PHPHtmlParser/Dom/Tag.php | 4 ++-- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 37036761..6f7354cd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,7 @@ php: - 7.1 - 7.2 - 7.3 + - 7.4 install: - composer self-update diff --git a/CHANGELOG.md b/CHANGELOG.md index 400a20f1..5c14284b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- Added support for php 7.4 + +## 2.1.0 + ### Added - New `removeSmartyScripts` configuration setting. Defaults to true. - Added `declare(strict_types=1)` to all source files. diff --git a/README.md b/README.md index 8085a64c..f89a09d7 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ PHP Html Parser ========================== -Version 2.1.0 +Version 2.2.0 [![Build Status](https://travis-ci.org/paquettg/php-html-parser.png)](https://travis-ci.org/paquettg/php-html-parser) [![Coverage Status](https://coveralls.io/repos/paquettg/php-html-parser/badge.png)](https://coveralls.io/r/paquettg/php-html-parser) @@ -18,7 +18,7 @@ Install the latest version using composer. 
$ composer require paquettg/php-html-parser ``` -This package can be found on [packagist](https://packagist.org/packages/paquettg/php-html-parser) and is best loaded using [composer](http://getcomposer.org/). We support php 7.1, 7.2, and 7.3. +This package can be found on [packagist](https://packagist.org/packages/paquettg/php-html-parser) and is best loaded using [composer](http://getcomposer.org/). We support php 7.1, 7.2, 7.3, and 7.4. Usage ----- diff --git a/src/PHPHtmlParser/Dom/Tag.php b/src/PHPHtmlParser/Dom/Tag.php index 54a1865f..f773e9f1 100644 --- a/src/PHPHtmlParser/Dom/Tag.php +++ b/src/PHPHtmlParser/Dom/Tag.php @@ -296,11 +296,11 @@ public function getAttributes() * @param string $key * @return mixed */ - public function getAttribute(string $key) + public function getAttribute(string $key):array { $key = strtolower($key); if ( ! isset($this->attr[$key])) { - return null; + return ['value' => null, 'doubleQuote' => true]; } $value = $this->attr[$key]['value']; if (is_string($value) && ! 
is_null($this->encode)) { From e2d2d2eb72d5db0183c6960a3a3306e7b81e514d Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Thu, 22 Aug 2019 08:59:57 -0400 Subject: [PATCH 04/68] Fixed small issues with the Dom object --- src/PHPHtmlParser/Dom.php | 14 +++++++++++--- src/PHPHtmlParser/Dom/AbstractNode.php | 4 ++-- src/PHPHtmlParser/Dom/Tag.php | 2 +- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index cc46aa5f..961519a6 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -252,7 +252,14 @@ public function find(string $selector, int $nth = null) { $this->isLoaded(); - return $this->root->find($selector, $nth, $this->options->get('depthFirstSearch')); + $depthFirstSearch = $this->options->get('depthFirstSearch'); + if (is_bool($depthFirstSearch)) { + $result = $this->root->find($selector, $nth, $depthFirstSearch); + } else { + $result = $this->root->find($selector, $nth); + } + + return $result; } /** @@ -793,6 +800,7 @@ protected function detectCharset(): bool return false; } + /** @var AbstractNode $meta */ $meta = $this->root->find('meta[http-equiv=Content-Type]', 0); if (is_null($meta)) { // could not find meta tag @@ -800,8 +808,8 @@ protected function detectCharset(): bool return false; } - $content = $meta->content; - if (empty($content)) { + $content = $meta->getAttribute('content'); + if (is_null($content)) { // could not find content $this->root->propagateEncoding($encode); diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index ac86f538..3c5e23ff 100644 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -342,9 +342,9 @@ public function getAttributes(): array * on the tag of this node. 
* * @param string $key - * @return mixed + * @return string|null */ - public function getAttribute(string $key) + public function getAttribute(string $key): ?string { $attribute = $this->tag->getAttribute($key); if ( ! is_null($attribute)) { diff --git a/src/PHPHtmlParser/Dom/Tag.php b/src/PHPHtmlParser/Dom/Tag.php index f773e9f1..f95d7871 100644 --- a/src/PHPHtmlParser/Dom/Tag.php +++ b/src/PHPHtmlParser/Dom/Tag.php @@ -294,7 +294,7 @@ public function getAttributes() * Returns an attribute by the key * * @param string $key - * @return mixed + * @return array|null */ public function getAttribute(string $key):array { From 6bc74388321a5df2133e30375f3de61fd2ed5446 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Thu, 22 Aug 2019 09:09:18 -0400 Subject: [PATCH 05/68] Added more type checking to avoid strict type errors. --- src/PHPHtmlParser/Curl.php | 5 +++++ src/PHPHtmlParser/Dom/InnerNode.php | 10 ++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/PHPHtmlParser/Curl.php b/src/PHPHtmlParser/Curl.php index 8eb90451..b3e33edc 100644 --- a/src/PHPHtmlParser/Curl.php +++ b/src/PHPHtmlParser/Curl.php @@ -22,6 +22,9 @@ class Curl implements CurlInterface public function get(string $url, array $options): string { $ch = curl_init($url); + if ($ch === false) { + throw new CurlException('Curl Init return `false`.'); + } if ( ! 
ini_get('open_basedir')) { curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); @@ -42,6 +45,8 @@ public function get(string $url, array $options): string // there was a problem $error = curl_error($ch); throw new CurlException('Error retrieving "'.$url.'" ('.$error.')'); + } elseif ($content === true) { + throw new CurlException('Unexpected return value of content set to true.'); } return $content; diff --git a/src/PHPHtmlParser/Dom/InnerNode.php b/src/PHPHtmlParser/Dom/InnerNode.php index 8ae2b9f8..3ca23893 100644 --- a/src/PHPHtmlParser/Dom/InnerNode.php +++ b/src/PHPHtmlParser/Dom/InnerNode.php @@ -165,7 +165,10 @@ public function addChild(AbstractNode $child, int $before = -1): bool array_splice($children, $index, 0, [$insert]); // add the child - $this->children = array_combine($keys, $children); + $combination = array_combine($keys, $children); + if ($combination !== false) { + $this->children = $combination; + } // tell child I am the new parent $child->setParent($this); @@ -338,7 +341,10 @@ public function replaceChild(int $childId, AbstractNode $newChild): void $keys = array_keys($this->children); $index = array_search($childId, $keys, true); $keys[$index] = $newChild->id(); - $this->children = array_combine($keys, $this->children); + $combination = array_combine($keys, $this->children); + if ($combination !== false) { + $this->children = $combination; + } $this->children[$newChild->id()] = [ 'prev' => $oldChild['prev'], 'node' => $newChild, From 50a909aea06c59791d80821e72f21bd23c1858ea Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Fri, 23 Aug 2019 22:07:02 -0400 Subject: [PATCH 06/68] Added Infection as a dev dependency --- .gitattributes | 21 +++++++++++---------- .gitignore | 1 + composer.json | 3 ++- infection.json.dist | 14 ++++++++++++++ 4 files changed, 28 insertions(+), 11 deletions(-) create mode 100644 infection.json.dist diff --git a/.gitattributes b/.gitattributes index afc2bfbc..9f59affd 100644 --- a/.gitattributes +++ b/.gitattributes @@ 
-1,10 +1,11 @@ -/tests export-ignore -/.scrutinizar.yml export-ignore -/.travis.yml export-ignore -/.gitignore export-ignore -/CHANGELOG.md export-ignore -/CONTRIBUTING.md export-ignore -/LICENSE.md export-ignore -/README.md export-ignore -/phpunit.php export-ignore -/phpunit.xml export-ignore +/tests export-ignore +/.scrutinizar.yml export-ignore +/.travis.yml export-ignore +/.gitignore export-ignore +/CHANGELOG.md export-ignore +/CONTRIBUTING.md export-ignore +/LICENSE.md export-ignore +/README.md export-ignore +/phpunit.php export-ignore +/phpunit.xml export-ignore +/infection.json.dist export-ignore diff --git a/.gitignore b/.gitignore index b871be44..274cf429 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ composer.phar composer.lock +infection.log /vendor/ .idea/ *.swp diff --git a/composer.json b/composer.json index de617750..76f1f5ec 100644 --- a/composer.json +++ b/composer.json @@ -21,7 +21,8 @@ "require-dev": { "phpunit/phpunit": "^7.5.1", "mockery/mockery": "^1.2", - "php-coveralls/php-coveralls": "^2.1" + "php-coveralls/php-coveralls": "^2.1", + "infection/infection": "^0.13.4" }, "autoload": { "psr-4": { diff --git a/infection.json.dist b/infection.json.dist new file mode 100644 index 00000000..0243ccf4 --- /dev/null +++ b/infection.json.dist @@ -0,0 +1,14 @@ +{ + "timeout": 10, + "source": { + "directories": [ + "src\/PHPHtmlParser" + ] + }, + "logs": { + "text": "infection.log" + }, + "mutators": { + "@default": true + } +} \ No newline at end of file From 5c7fe62f6a90fb90940bf3f8996ed2a44acc28cd Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sat, 24 Aug 2019 21:37:53 -0400 Subject: [PATCH 07/68] Issue #115 fixed and test added --- src/PHPHtmlParser/Dom.php | 1 + src/PHPHtmlParser/Dom/HtmlNode.php | 2 +- tests/Options/StrictTest.php | 10 ++++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index 961519a6..0c833bee 100644 --- a/src/PHPHtmlParser/Dom.php +++ 
b/src/PHPHtmlParser/Dom.php @@ -748,6 +748,7 @@ protected function parseTag(): array } $this->content->skipByToken('blank'); + $tag = strtolower($tag); if ($this->content->char() == '/') { // self closing tag $node->getTag()->selfClosing(); diff --git a/src/PHPHtmlParser/Dom/HtmlNode.php b/src/PHPHtmlParser/Dom/HtmlNode.php index 1e81234e..5217bb85 100644 --- a/src/PHPHtmlParser/Dom/HtmlNode.php +++ b/src/PHPHtmlParser/Dom/HtmlNode.php @@ -201,7 +201,7 @@ protected function clear(): void $this->text = null; $this->textWithChildren = null; - if (is_null($this->parent) === false) { + if (!is_null($this->parent)) { $this->parent->clear(); } } diff --git a/tests/Options/StrictTest.php b/tests/Options/StrictTest.php index a76ded60..e7f22f0c 100644 --- a/tests/Options/StrictTest.php +++ b/tests/Options/StrictTest.php @@ -53,4 +53,14 @@ public function testConfigStrictMissingAttribute() $this->assertEquals("Tag 'p' has an attribute 'block' with out a value! (character #22)", $e->getMessage()); } } + + public function testConfigStrictBRTag() + { + $dom = new Dom; + $dom->setOptions([ + 'strict' => true, + ]); + $dom->load('
'); + $this->assertTrue(true); + } } From 566aaa2f17002b15494cf67c625dd23be858bda8 Mon Sep 17 00:00:00 2001 From: Rik van der Heijden Date: Wed, 11 Sep 2019 21:08:51 +0200 Subject: [PATCH 08/68] Add a failing test --- tests/DomTest.php | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/DomTest.php b/tests/DomTest.php index b44cbb06..733342f8 100644 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -483,4 +483,14 @@ public function testCaseSensitivity() $FooBar = $dom->find('FooBar'); $this->assertEquals('asdf', $FooBar->Attribute); } + + public function testEmptyAttribute() + { + $str = '
  • blah
  • what
'; + $dom = new Dom(); + $dom->load($str); + + $items = $dom->find('.summary .foo'); + $this->assertEquals(1, count($items)); + } } From b42ff35020023603e5bce9f0dd985762715c985a Mon Sep 17 00:00:00 2001 From: Rik van der Heijden Date: Wed, 11 Sep 2019 21:12:00 +0200 Subject: [PATCH 09/68] Fix failing test by adding null-coalescing check --- src/PHPHtmlParser/Selector/Selector.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/PHPHtmlParser/Selector/Selector.php b/src/PHPHtmlParser/Selector/Selector.php index b3aaae57..56d1478a 100644 --- a/src/PHPHtmlParser/Selector/Selector.php +++ b/src/PHPHtmlParser/Selector/Selector.php @@ -354,7 +354,7 @@ public function checkComparison(array $rule, AbstractNode $node): bool // handle multiple classes if ( ! $check && $rule['key'] == 'class') { - $nodeClasses = explode(' ', $node->getAttribute('class')); + $nodeClasses = explode(' ', $node->getAttribute('class') ?? ''); foreach ($rule['value'] as $value) { foreach ($nodeClasses as $class) { if ( ! 
empty($class)) { From 454373742045fe357ddba4cff4971f3c9a73e519 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Fri, 27 Sep 2019 21:32:12 -0400 Subject: [PATCH 10/68] Updated coverall travis configuration --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 6f7354cd..9ffb2529 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,6 +15,6 @@ script: - php vendor/bin/phpunit --coverage-clover build/logs/clover.xml after_script: - - php vendor/bin/coveralls + - travis_retry php vendor/bin/coveralls - wget https://scrutinizer-ci.com/ocular.phar - php ocular.phar code-coverage:upload --format=php-clover build/logs/clover.xml From c2cf01ac46ff09026aff48986d1c302d506ff63c Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Fri, 27 Sep 2019 21:33:51 -0400 Subject: [PATCH 11/68] Fixes #192 - Fixed documentation --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f89a09d7..18dacfbc 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ require "vendor/autoload.php"; use PHPHtmlParser\Dom; $dom = new Dom; -$dom->loadFromFile('tests/big.html'); +$dom->loadFromFile('tests/data/big.html'); $contents = $dom->find('.content-border'); echo count($contents); // 10 From 12f382f3530cfbc2bb96cb3ae20fc322575685bb Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Fri, 27 Sep 2019 21:37:44 -0400 Subject: [PATCH 12/68] Fixed #190 - Added gzip detection and decoding. 
--- composer.json | 4 +++- src/PHPHtmlParser/Dom.php | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/composer.json b/composer.json index 76f1f5ec..f68a0a5a 100644 --- a/composer.json +++ b/composer.json @@ -16,7 +16,9 @@ "require": { "php": ">=7.1", "ext-mbstring": "*", - "paquettg/string-encode": "~1.0.0" + "paquettg/string-encode": "~1.0.0", + "ext-zlib": "*", + "ext-curl": "*" }, "require-dev": { "phpunit/phpunit": "^7.5.1", diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index 0c833bee..d83ec07f 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -513,6 +513,11 @@ protected function clean(string $str): string return $str; } + $is_gzip = 0 === mb_strpos($str, "\x1f" . "\x8b" . "\x08", 0, "US-ASCII"); + if ($is_gzip) { + $str = gzdecode($str); + } + // remove white space before closing tags $str = mb_eregi_replace("'\s+>", "'>", $str); $str = mb_eregi_replace('"\s+>', '">', $str); From bad55125647fff01476d8dffd9ff62f2ce356e70 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sat, 28 Sep 2019 22:29:21 -0400 Subject: [PATCH 13/68] Fixes #116 - Added support for multiple selectors. --- .travis.yml | 1 - CHANGELOG.md | 6 + src/PHPHtmlParser/Selector/Parser.php | 22 +++- src/PHPHtmlParser/Selector/Selector.php | 152 ++++++++++++++++-------- tests/DomTest.php | 13 +- 5 files changed, 140 insertions(+), 54 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9ffb2529..20e644a7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,6 @@ php: - 7.1 - 7.2 - 7.3 - - 7.4 install: - composer self-update diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c14284b..0c766a67 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Fixed bug with multiple selectors query. 
+ +## 2.1.0 + ### Added - Added support for php 7.4 diff --git a/src/PHPHtmlParser/Selector/Parser.php b/src/PHPHtmlParser/Selector/Parser.php index 9bea98e2..ce6a59c3 100644 --- a/src/PHPHtmlParser/Selector/Parser.php +++ b/src/PHPHtmlParser/Selector/Parser.php @@ -1,4 +1,7 @@ -getTag()->name() + if ($rule['tag'] == '*' + || $rule['tag'] == $node->getTag() + ->name() ) { ++$count; if ($count == $rule['key']) { @@ -132,15 +132,14 @@ protected function seek(array $nodes, array $rule, array $options): array /** @var InnerNode $node */ foreach ($nodes as $node) { // check if we are a leaf - if ($node instanceof LeafNode || - ! $node->hasChildren() + if ($node instanceof LeafNode || !$node->hasChildren() ) { continue; } $children = []; - $child = $node->firstChild(); - while ( ! is_null($child)) { + $child = $node->firstChild(); + while (!is_null($child)) { // wild card, grab all if ($rule['tag'] == '*' && is_null($rule['key'])) { $return[] = $child; @@ -149,11 +148,11 @@ protected function seek(array $nodes, array $rule, array $options): array } $pass = $this->checkTag($rule, $child); - if ($pass && ! is_null($rule['key'])) { + if ($pass && !is_null($rule['key'])) { $pass = $this->checkKey($rule, $child); } - if ($pass && ! is_null($rule['key']) && - ! is_null($rule['value']) && $rule['value'] != '*' + if ($pass && !is_null($rule['key']) && !is_null($rule['value']) + && $rule['value'] != '*' ) { $pass = $this->checkComparison($rule, $child); } @@ -163,14 +162,15 @@ protected function seek(array $nodes, array $rule, array $options): array $return[] = $child; } else { // this child failed to be matched - if ($child instanceof InnerNode && - $child->hasChildren() + if ($child instanceof InnerNode && $child->hasChildren() ) { if ($this->depthFirst) { - if ( ! isset($options['checkGrandChildren']) || - $options['checkGrandChildren']) { + if (!isset($options['checkGrandChildren']) + || $options['checkGrandChildren'] + ) { // we have a child that failed but are not leaves. 
- $matches = $this->seek([$child], $rule, $options); + $matches = $this->seek([$child], $rule, + $options); foreach ($matches as $match) { $return[] = $match; } @@ -185,9 +185,9 @@ protected function seek(array $nodes, array $rule, array $options): array $child = $this->getNextChild($node, $child); } - if (( ! isset($options['checkGrandChildren']) || - $options['checkGrandChildren']) - && count($children) > 0 + if ((!isset($options['checkGrandChildren']) + || $options['checkGrandChildren']) + && count($children) > 0 ) { // we have children that failed but are not leaves. $matches = $this->seek($children, $rule, $options); @@ -202,15 +202,17 @@ protected function seek(array $nodes, array $rule, array $options): array /** * Attempts to match the given arguments with the given operator. - * * @param string $operator * @param string $pattern * @param string $value * @return bool */ - protected function match(string $operator, string $pattern, string $value): bool - { - $value = strtolower($value); + protected function match( + string $operator, + string $pattern, + string $value + ): bool { + $value = strtolower($value); $pattern = strtolower($pattern); switch ($operator) { case '=': @@ -218,15 +220,17 @@ protected function match(string $operator, string $pattern, string $value): bool case '!=': return $value !== $pattern; case '^=': - return preg_match('/^'.preg_quote($pattern, '/').'/', $value) == 1; + return preg_match('/^' . preg_quote($pattern, '/') . '/', + $value) == 1; case '$=': - return preg_match('/'.preg_quote($pattern, '/').'$/', $value) == 1; + return preg_match('/' . preg_quote($pattern, '/') . '$/', + $value) == 1; case '*=': if ($pattern[0] == '/') { return preg_match($pattern, $value) == 1; } - return preg_match("/".$pattern."/i", $value) == 1; + return preg_match("/" . $pattern . 
"/i", $value) == 1; } return false; @@ -235,7 +239,6 @@ protected function match(string $operator, string $pattern, string $value): bool /** * Attempts to figure out what the alteration will be for * the next element. - * * @param array $rule * @return array */ @@ -251,7 +254,6 @@ protected function alterNext(array $rule): array /** * Flattens the option array. - * * @param array $optionsArray * @return array */ @@ -269,13 +271,14 @@ protected function flattenOptions(array $optionsArray) /** * Returns the next child or null if no more children. - * * @param AbstractNode $node * @param AbstractNode $currentChild * @return AbstractNode|null */ - protected function getNextChild(AbstractNode $node, AbstractNode $currentChild) - { + protected function getNextChild( + AbstractNode $node, + AbstractNode $currentChild + ) { try { $child = null; if ($node instanceof InnerNode) { @@ -292,15 +295,14 @@ protected function getNextChild(AbstractNode $node, AbstractNode $currentChild) /** * Checks tag condition from rules against node. - * - * @param array $rule + * @param array $rule * @param AbstractNode $node * @return bool */ protected function checkTag(array $rule, AbstractNode $node): bool { - if ( ! empty($rule['tag']) && $rule['tag'] != $node->getTag()->name() && - $rule['tag'] != '*' + if (!empty($rule['tag']) && $rule['tag'] != $node->getTag()->name() + && $rule['tag'] != '*' ) { return false; } @@ -310,20 +312,39 @@ protected function checkTag(array $rule, AbstractNode $node): bool /** * Checks key condition from rules against node. - * - * @param array $rule + * @param array $rule * @param AbstractNode $node * @return bool */ protected function checkKey(array $rule, AbstractNode $node): bool { - if ($rule['noKey']) { - if ( ! 
is_null($node->getAttribute($rule['key']))) { - return false; + if (!is_array($rule['key'])) { + if ($rule['noKey']) { + if (!is_null($node->getAttribute($rule['key']))) { + return false; + } + } else { + if ($rule['key'] != 'plaintext' + && !$node->hasAttribute($rule['key']) + ) { + return false; + } } } else { - if ($rule['key'] != 'plaintext' && !$node->hasAttribute($rule['key'])) { - return false; + if ($rule['noKey']) { + foreach ($rule['key'] as $key) { + if (!is_null($node->getAttribute($key))) { + return false; + } + } + } else { + foreach ($rule['key'] as $key) { + if ($key != 'plaintext' + && !$node->hasAttribute($key) + ) { + return false; + } + } } } @@ -332,8 +353,7 @@ protected function checkKey(array $rule, AbstractNode $node): bool /** * Checks comparison condition from rules against node. - * - * @param array $rule + * @param array $rule * @param AbstractNode $node * @return bool */ @@ -342,18 +362,46 @@ public function checkComparison(array $rule, AbstractNode $node): bool if ($rule['key'] == 'plaintext') { // plaintext search $nodeValue = $node->text(); + $result = $this->checkNodeValue($nodeValue, $rule, $node); } else { // normal search - $nodeValue = $node->getAttribute($rule['key']); + if (!is_array($rule['key'])) { + $nodeValue = $node->getAttribute($rule['key']); + $result = $this->checkNodeValue($nodeValue, $rule, $node); + } else { + $result = true; + foreach ($rule['key'] as $index => $key) { + $nodeValue = $node->getAttribute($key); + $result = $result && + $this->checkNodeValue($nodeValue, $rule, $node, $index); + } + } } + return $result; + } + + /** + * @param string|null $nodeValue + * @param array $rule + * @param AbstractNode $node + * @param int|null $index + * @return bool + */ + private function checkNodeValue( + ?string $nodeValue, + array $rule, + AbstractNode $node, + ?int $index = null + ) : bool { $check = false; if (!is_array($rule['value'])) { $check = $this->match($rule['operator'], $rule['value'], $nodeValue); } // 
handle multiple classes - if ( ! $check && $rule['key'] == 'class') { + $key = $rule['key']; + if (!$check && $key == 'class') { $nodeClasses = explode(' ', $node->getAttribute('class') ?? ''); foreach ($rule['value'] as $value) { foreach ($nodeClasses as $class) { @@ -368,6 +416,8 @@ public function checkComparison(array $rule, AbstractNode $node): bool break; } } + } elseif (!$check && is_array($key)) { + $check = $this->match($rule['operator'], $rule['value'][$index], $nodeValue); } return $check; diff --git a/tests/DomTest.php b/tests/DomTest.php index 733342f8..094c39a3 100644 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -1,4 +1,6 @@ -find('.summary .foo'); $this->assertEquals(1, count($items)); } + + public function testMultipleSquareSelector() + { + $dom = new Dom(); + $dom->load(''); + + $items = $dom->find('input[type=text][name=foo][baz=fig]'); + $this->assertEquals(1, count($items)); + } } From 69c30e15093e81450e3a370765ef2fad50f80566 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 12 Jan 2020 21:20:29 +0000 Subject: [PATCH 14/68] Fixed unit tests --- CHANGELOG.md | 13 ++++++++----- tests/DomTest.php | 2 +- tests/StaticDomTest.php | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c766a67..c585d0c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,14 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -### Changed +### Added +- Added support for php 7.4. +- Added custom header support for curl request. +- Added gzip detection and decoding. +- Added additional type checking. +### Changed - Fixed bug with multiple selectors query. +- Updated documentation. +- Fixed issue with Dom object. 
-## 2.1.0 - -### Added -- Added support for php 7.4 ## 2.1.0 diff --git a/tests/DomTest.php b/tests/DomTest.php index 094c39a3..cc486457 100644 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -224,7 +224,7 @@ public function testLoadFromUrl() $curl = Mockery::mock('PHPHtmlParser\CurlInterface'); $curl->shouldReceive('get') ->once() - ->with('http://google.com') + ->with('http://google.com', []) ->andReturn(file_get_contents('tests/data/files/small.html')); $dom = new Dom; diff --git a/tests/StaticDomTest.php b/tests/StaticDomTest.php index ad2318cb..a6fba2c0 100644 --- a/tests/StaticDomTest.php +++ b/tests/StaticDomTest.php @@ -61,7 +61,7 @@ public function testLoadFromUrl() $curl = Mockery::mock('PHPHtmlParser\CurlInterface'); $curl->shouldReceive('get') ->once() - ->with('http://google.com') + ->with('http://google.com', []) ->andReturn(file_get_contents('tests/data/files/small.html')); Dom::loadFromUrl('http://google.com', [], $curl); From c8c4f23dd02191bd2ca8fbaded7e6792bd176c53 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 12 Jan 2020 21:26:57 +0000 Subject: [PATCH 15/68] Added back 7.4 to travis --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 20e644a7..9ffb2529 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,7 @@ php: - 7.1 - 7.2 - 7.3 + - 7.4 install: - composer self-update From 12b94f69637f946ca35af6554f0da67d9d176ca8 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Mon, 13 Jan 2020 00:03:24 +0000 Subject: [PATCH 16/68] Removed version from composer.json --- composer.json | 1 - 1 file changed, 1 deletion(-) diff --git a/composer.json b/composer.json index f68a0a5a..1672cfd7 100644 --- a/composer.json +++ b/composer.json @@ -1,7 +1,6 @@ { "name": "paquettg/php-html-parser", "type": "library", - "version": "2.1.0", "description": "An HTML DOM parser. It allows you to manipulate HTML. 
Find tags on an HTML page with selectors just like jQuery.", "keywords": ["html", "dom", "parser"], "homepage": "https://github.com/paquettg/php-html-parser", From 8e5735987714451424df85dae4265563fb4f2cf9 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Mon, 13 Jan 2020 02:02:32 +0000 Subject: [PATCH 17/68] Fixed issue #97 --- CHANGELOG.md | 5 +++++ src/PHPHtmlParser/Dom.php | 4 ++-- tests/DomTest.php | 16 ++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c585d0c9..b23c6998 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed +- Fixed issue with \ causing an infite loop. + +## 2.2.0 + ### Added - Added support for php 7.4. - Added custom header support for curl request. diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index d83ec07f..25c94852 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -708,7 +708,7 @@ protected function parseTag(): array case '"': $attr['doubleQuote'] = true; $this->content->fastForward(1); - $string = $this->content->copyUntil('"', true, true); + $string = $this->content->copyUntil('"', true); do { $moreString = $this->content->copyUntilUnless('"', '=>'); $string .= $moreString; @@ -720,7 +720,7 @@ protected function parseTag(): array case "'": $attr['doubleQuote'] = false; $this->content->fastForward(1); - $string = $this->content->copyUntil("'", true, true); + $string = $this->content->copyUntil("'", true); do { $moreString = $this->content->copyUntilUnless("'", '=>'); $string .= $moreString; diff --git a/tests/DomTest.php b/tests/DomTest.php index cc486457..755962cf 100644 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -142,6 +142,13 @@ public function testLoadNoValueAttribute() $this->assertEquals('
Main content here
', $dom->innerHtml); } + public function testLoadBackslashAttributeValue() + { + $dom = new Dom; + $dom->load('
Main content here
'); + $this->assertEquals('
Main content here
', $dom->innerHtml); + } + public function testLoadNoValueAttributeBefore() { $dom = new Dom; @@ -504,4 +511,13 @@ public function testMultipleSquareSelector() $items = $dom->find('input[type=text][name=foo][baz=fig]'); $this->assertEquals(1, count($items)); } + + public function testLoadGetAttributeWithBackslash() + { + $dom = new Dom(); + $dom->load('
\
demo
'); + $imgs = $dom->find('img', 0); + $this->assertEquals("/img/test.png", $imgs->getAttribute('src')); + + } } From c8e2b6dac69e366f83b9ec3a4959e31d8c146782 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Mon, 13 Jan 2020 23:45:05 +0000 Subject: [PATCH 18/68] Added phan level 1 and updated docs according to recommendation --- .gitattributes | 1 + .phan/config.php | 382 ++++++++++++++++++ composer.json | 3 +- src/PHPHtmlParser/Content.php | 24 +- src/PHPHtmlParser/Dom.php | 60 ++- src/PHPHtmlParser/Dom/AbstractNode.php | 37 +- src/PHPHtmlParser/Dom/Collection.php | 4 +- src/PHPHtmlParser/Dom/HtmlNode.php | 9 +- src/PHPHtmlParser/Dom/InnerNode.php | 50 ++- src/PHPHtmlParser/Dom/Tag.php | 6 +- src/PHPHtmlParser/Dom/TextNode.php | 15 +- .../Exceptions/LogicalException.php | 14 + src/PHPHtmlParser/Finder.php | 3 + src/PHPHtmlParser/Options.php | 28 +- src/PHPHtmlParser/Selector/Parser.php | 4 +- src/PHPHtmlParser/Selector/Selector.php | 27 +- src/PHPHtmlParser/StaticDom.php | 4 +- 17 files changed, 588 insertions(+), 83 deletions(-) create mode 100644 .phan/config.php create mode 100644 src/PHPHtmlParser/Exceptions/LogicalException.php diff --git a/.gitattributes b/.gitattributes index 9f59affd..93691f38 100644 --- a/.gitattributes +++ b/.gitattributes @@ -9,3 +9,4 @@ /phpunit.php export-ignore /phpunit.xml export-ignore /infection.json.dist export-ignore +/.phan export-ignore diff --git a/.phan/config.php b/.phan/config.php new file mode 100644 index 00000000..8c25e9d5 --- /dev/null +++ b/.phan/config.php @@ -0,0 +1,382 @@ +=7.1" + 'target_php_version' => '7.1', + + // If enabled, missing properties will be created when + // they are first seen. If false, we'll report an + // error message if there is an attempt to write + // to a class property that wasn't explicitly + // defined. + 'allow_missing_properties' => false, + + // If enabled, null can be cast to any type and any + // type can be cast to null. 
Setting this to true + // will cut down on false positives. + 'null_casts_as_any_type' => false, + + // If enabled, allow null to be cast as any array-like type. + // + // This is an incremental step in migrating away from `null_casts_as_any_type`. + // If `null_casts_as_any_type` is true, this has no effect. + 'null_casts_as_array' => false, + + // If enabled, allow any array-like type to be cast to null. + // This is an incremental step in migrating away from `null_casts_as_any_type`. + // If `null_casts_as_any_type` is true, this has no effect. + 'array_casts_as_null' => false, + + // If enabled, scalars (int, float, bool, string, null) + // are treated as if they can cast to each other. + // This does not affect checks of array keys. See `scalar_array_key_cast`. + 'scalar_implicit_cast' => false, + + // If enabled, any scalar array keys (int, string) + // are treated as if they can cast to each other. + // E.g. `array` can cast to `array` and vice versa. + // Normally, a scalar type such as int could only cast to/from int and mixed. + 'scalar_array_key_cast' => false, + + // If this has entries, scalars (int, float, bool, string, null) + // are allowed to perform the casts listed. + // + // E.g. `['int' => ['float', 'string'], 'float' => ['int'], 'string' => ['int'], 'null' => ['string']]` + // allows casting null to a string, but not vice versa. + // (subset of `scalar_implicit_cast`) + 'scalar_implicit_partial' => [], + + // If enabled, Phan will warn if **any** type in a method invocation's object + // is definitely not an object, + // or if **any** type in an invoked expression is not a callable. + // Setting this to true will introduce numerous false positives + // (and reveal some bugs). + 'strict_method_checking' => true, + + // If enabled, Phan will warn if **any** type of the object expression for a property access + // does not contain that property. 
+ 'strict_object_checking' => true, + + // If enabled, Phan will warn if **any** type in the argument's union type + // cannot be cast to a type in the parameter's expected union type. + // Setting this to true will introduce numerous false positives + // (and reveal some bugs). + 'strict_param_checking' => true, + + // If enabled, Phan will warn if **any** type in a property assignment's union type + // cannot be cast to a type in the property's declared union type. + // Setting this to true will introduce numerous false positives + // (and reveal some bugs). + 'strict_property_checking' => true, + + // If enabled, Phan will warn if **any** type in a returned value's union type + // cannot be cast to the declared return type. + // Setting this to true will introduce numerous false positives + // (and reveal some bugs). + 'strict_return_checking' => true, + + // If true, seemingly undeclared variables in the global + // scope will be ignored. + // + // This is useful for projects with complicated cross-file + // globals that you have no hope of fixing. + 'ignore_undeclared_variables_in_global_scope' => false, + + // Set this to false to emit `PhanUndeclaredFunction` issues for internal functions that Phan has signatures for, + // but aren't available in the codebase, or from Reflection. + // (may lead to false positives if an extension isn't loaded) + // + // If this is true(default), then Phan will not warn. + // + // Even when this is false, Phan will still infer return values and check parameters of internal functions + // if Phan has the signatures. + 'ignore_undeclared_functions_with_known_signatures' => false, + + // Backwards Compatibility Checking. This is slow + // and expensive, but you should consider running + // it before upgrading your version of PHP to a + // new version that has backward compatibility + // breaks. 
+ // + // If you are migrating from PHP 5 to PHP 7, + // you should also look into using + // [php7cc (no longer maintained)](https://github.com/sstalle/php7cc) + // and [php7mar](https://github.com/Alexia/php7mar), + // which have different backwards compatibility checks. + 'backward_compatibility_checks' => false, + + // If true, check to make sure the return type declared + // in the doc-block (if any) matches the return type + // declared in the method signature. + 'check_docblock_signature_return_type_match' => true, + + // If true, make narrowed types from phpdoc params override + // the real types from the signature, when real types exist. + // (E.g. allows specifying desired lists of subclasses, + // or to indicate a preference for non-nullable types over nullable types) + // + // Affects analysis of the body of the method and the param types passed in by callers. + // + // (*Requires `check_docblock_signature_param_type_match` to be true*) + 'prefer_narrowed_phpdoc_param_type' => true, + + // (*Requires `check_docblock_signature_return_type_match` to be true*) + // + // If true, make narrowed types from phpdoc returns override + // the real types from the signature, when real types exist. + // + // (E.g. allows specifying desired lists of subclasses, + // or to indicate a preference for non-nullable types over nullable types) + // + // This setting affects the analysis of return statements in the body of the method and the return types passed in by callers. + 'prefer_narrowed_phpdoc_return_type' => true, + + // If enabled, check all methods that override a + // parent method to make sure its signature is + // compatible with the parent's. + // + // This check can add quite a bit of time to the analysis. + // + // This will also check if final methods are overridden, etc. + 'analyze_signature_compatibility' => true, + + // This setting maps case-insensitive strings to union types. 
+ // + // This is useful if a project uses phpdoc that differs from the phpdoc2 standard. + // + // If the corresponding value is the empty string, + // then Phan will ignore that union type (E.g. can ignore 'the' in `@return the value`) + // + // If the corresponding value is not empty, + // then Phan will act as though it saw the corresponding UnionTypes(s) + // when the keys show up in a UnionType of `@param`, `@return`, `@var`, `@property`, etc. + // + // This matches the **entire string**, not parts of the string. + // (E.g. `@return the|null` will still look for a class with the name `the`, but `@return the` will be ignored with the below setting) + // + // (These are not aliases, this setting is ignored outside of doc comments). + // (Phan does not check if classes with these names exist) + // + // Example setting: `['unknown' => '', 'number' => 'int|float', 'char' => 'string', 'long' => 'int', 'the' => '']` + 'phpdoc_type_mapping' => [], + + // Set to true in order to attempt to detect dead + // (unreferenced) code. Keep in mind that the + // results will only be a guess given that classes, + // properties, constants and methods can be referenced + // as variables (like `$class->$property` or + // `$class->$method()`) in ways that we're unable + // to make sense of. + 'dead_code_detection' => false, + + // Set to true in order to attempt to detect unused variables. + // `dead_code_detection` will also enable unused variable detection. + // + // This has a few known false positives, e.g. for loops or branches. + 'unused_variable_detection' => true, + + // Set to true in order to attempt to detect redundant and impossible conditions. + // + // This has some false positives involving loops, + // variables set in branches of loops, and global variables. 
+ 'redundant_condition_detection' => true, + + // If enabled, Phan will act as though it's certain of real return types of a subset of internal functions, + // even if those return types aren't available in reflection (real types were taken from php 7.3 or 8.0-dev, depending on target_php_version). + // + // Note that with php 7 and earlier, php would return null or false for many internal functions if the argument types or counts were incorrect. + // As a result, enabling this setting with target_php_version 8.0 may result in false positives for `--redundant-condition-detection` when codebases also support php 7.x. + 'assume_real_types_for_internal_functions' => true, + + // If true, this runs a quick version of checks that takes less + // time at the cost of not running as thorough + // of an analysis. You should consider setting this + // to true only when you wish you had more **undiagnosed** issues + // to fix in your code base. + // + // In quick-mode the scanner doesn't rescan a function + // or a method's code block every time a call is seen. + // This means that the problem here won't be detected: + // + // ```php + // false, + + // Enable or disable support for generic templated + // class types. + 'generic_types_enabled' => true, + + // Override to hardcode existence and types of (non-builtin) globals in the global scope. + // Class names should be prefixed with `\`. + // + // (E.g. `['_FOO' => '\FooClass', 'page' => '\PageClass', 'userId' => 'int']`) + 'globals_type_map' => [], + + // The minimum severity level to report on. This can be + // set to `Issue::SEVERITY_LOW`, `Issue::SEVERITY_NORMAL` or + // `Issue::SEVERITY_CRITICAL`. Setting it to only + // critical issues is a good place to start on a big + // sloppy mature code base. + 'minimum_severity' => Issue::SEVERITY_LOW, + + // Add any issue types (such as `'PhanUndeclaredMethod'`) + // to this black-list to inhibit them from being reported. 
+ 'suppress_issue_types' => [], + + // A regular expression to match files to be excluded + // from parsing and analysis and will not be read at all. + // + // This is useful for excluding groups of test or example + // directories/files, unanalyzable files, or files that + // can't be removed for whatever reason. + // (e.g. `'@Test\.php$@'`, or `'@vendor/.*/(tests|Tests)/@'`) + 'exclude_file_regex' => '@^vendor/.*/(tests?|Tests?)/@', + + // A list of files that will be excluded from parsing and analysis + // and will not be read at all. + // + // This is useful for excluding hopelessly unanalyzable + // files that can't be removed for whatever reason. + 'exclude_file_list' => [], + + // A directory list that defines files that will be excluded + // from static analysis, but whose class and method + // information should be included. + // + // Generally, you'll want to include the directories for + // third-party code (such as "vendor/") in this list. + // + // n.b.: If you'd like to parse but not analyze 3rd + // party code, directories containing that code + // should be added to the `directory_list` as well as + // to `exclude_analysis_directory_list`. + 'exclude_analysis_directory_list' => [ + 'vendor/', + ], + + // Enable this to enable checks of require/include statements referring to valid paths. + 'enable_include_path_checks' => true, + + // The number of processes to fork off during the analysis + // phase. + 'processes' => 1, + + // List of case-insensitive file extensions supported by Phan. + // (e.g. `['php', 'html', 'htm']`) + 'analyzed_file_extensions' => [ + 'php', + ], + + // You can put paths to stubs of internal extensions in this config option. + // If the corresponding extension is **not** loaded, then Phan will use the stubs instead. + // Phan will continue using its detailed type annotations, + // but load the constants, classes, functions, and classes (and their Reflection types) + // from these stub files (doubling as valid php files). 
+ // Use a different extension from php to avoid accidentally loading these. + // The `tools/make_stubs` script can be used to generate your own stubs (compatible with php 7.0+ right now) + // + // (e.g. `['xdebug' => '.phan/internal_stubs/xdebug.phan_php']`) + 'autoload_internal_extension_signatures' => [], + + // A list of plugin files to execute. + // + // Plugins which are bundled with Phan can be added here by providing their name (e.g. `'AlwaysReturnPlugin'`) + // + // Documentation about available bundled plugins can be found [here](https://github.com/phan/phan/tree/master/.phan/plugins). + // + // Alternately, you can pass in the full path to a PHP file with the plugin's implementation (e.g. `'vendor/phan/phan/.phan/plugins/AlwaysReturnPlugin.php'`) + 'plugins' => [ + 'AlwaysReturnPlugin', + 'DollarDollarPlugin', + 'DuplicateArrayKeyPlugin', + 'DuplicateExpressionPlugin', + 'PregRegexCheckerPlugin', + 'PrintfCheckerPlugin', + 'SleepCheckerPlugin', + 'UnreachableCodePlugin', + 'UseReturnValuePlugin', + 'EmptyStatementListPlugin', + 'StrictComparisonPlugin', + 'LoopVariableReusePlugin', + ], + + // A list of directories that should be parsed for class and + // method information. After excluding the directories + // defined in `exclude_analysis_directory_list`, the remaining + // files will be statically analyzed for errors. + // + // Thus, both first-party and third-party code being used by + // your application should be included in this list. + 'directory_list' => [ + 'src/PHPHtmlParser', + 'vendor/infection/infection/src', + 'vendor/mockery/mockery/library', + 'vendor/paquettg/string-encode/src', + 'vendor/phan/phan/src/Phan', + 'vendor/php-coveralls/php-coveralls/src', + 'vendor/phpunit/phpunit/src', + ], + + // A list of individual files to include in analysis + // with a path relative to the root directory of the + // project. 
+ 'file_list' => [], +]; diff --git a/composer.json b/composer.json index 1672cfd7..e924886e 100644 --- a/composer.json +++ b/composer.json @@ -23,7 +23,8 @@ "phpunit/phpunit": "^7.5.1", "mockery/mockery": "^1.2", "php-coveralls/php-coveralls": "^2.1", - "infection/infection": "^0.13.4" + "infection/infection": "^0.13.4", + "phan/phan": "^2.4" }, "autoload": { "psr-4": { diff --git a/src/PHPHtmlParser/Content.php b/src/PHPHtmlParser/Content.php index f2c6d5d3..93b3a73b 100644 --- a/src/PHPHtmlParser/Content.php +++ b/src/PHPHtmlParser/Content.php @@ -1,6 +1,12 @@ -pos; if ( ! is_null($char)) { @@ -135,8 +141,7 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal $position = strpos($this->content, $string, $position); if ($position === false) { // reached the end - $found = true; - continue; + break; } if ($this->char($position - 1) == '\\') { @@ -157,6 +162,9 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal if ($position === false) { // could not find character, just return the remaining of the content $return = substr($this->content, $this->pos, $this->size - $this->pos); + if ($return === false) { + throw new LogicalException('Substr returned false with position '.$this->pos.'.'); + } $this->pos = $this->size; return $return; @@ -168,6 +176,9 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal } $return = substr($this->content, $this->pos, $position - $this->pos); + if ($return === false) { + throw new LogicalException('Substr returned false with position '.$this->pos.'.'); + } // set the new position $this->pos = $position; @@ -229,6 +240,9 @@ public function skip(string $string, bool $copy = false) $return = $this; if ($copy) { $return = substr($this->content, $this->pos, $len); + if ($return === false) { + throw new LogicalException('Substr returned false with position '.$this->pos.'.'); + } } // update the position diff --git a/src/PHPHtmlParser/Dom.php 
b/src/PHPHtmlParser/Dom.php index 25c94852..6b44408f 100644 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -12,6 +12,7 @@ use PHPHtmlParser\Exceptions\ParentNotFoundException; use PHPHtmlParser\Exceptions\StrictException; use PHPHtmlParser\Exceptions\UnknownChildTypeException; +use PHPHtmlParser\Exceptions\LogicalException; use stringEncode\Encode; /** @@ -167,10 +168,15 @@ public function load(string $str, array $options = []): Dom * @throws ChildNotFoundException * @throws CircularException * @throws StrictException + * @throws LogicalException */ public function loadFromFile(string $file, array $options = []): Dom { - return $this->loadStr(file_get_contents($file), $options); + $content = file_get_contents($file); + if ($content === false) { + throw new LogicalException('file_get_contents failed and returned false when trying to read "'.$file.'".'); + } + return $this->loadStr($content, $options); } /** @@ -516,11 +522,20 @@ protected function clean(string $str): string $is_gzip = 0 === mb_strpos($str, "\x1f" . "\x8b" . "\x08", 0, "US-ASCII"); if ($is_gzip) { $str = gzdecode($str); + if ($str === false) { + throw new LogicalException('gzdecode returned false. Error when trying to decode the string.'); + } } // remove white space before closing tags $str = mb_eregi_replace("'\s+>", "'>", $str); + if ($str === false) { + throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to clean single quotes.'); + } $str = mb_eregi_replace('"\s+>', '">', $str); + if ($str === false) { + throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to clean double quotes.'); + } // clean out the \n\r $replace = ' '; @@ -528,36 +543,66 @@ protected function clean(string $str): string $replace = ' '; } $str = str_replace(["\r\n", "\r", "\n"], $replace, $str); + if ($str === false) { + throw new LogicalException('str_replace returned false instead of a string. 
Error when attempting to clean input string.'); + } // strip the doctype $str = mb_eregi_replace("", '', $str); + if ($str === false) { + throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip the doctype.'); + } // strip out comments $str = mb_eregi_replace("", '', $str); + if ($str === false) { + throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip comments.'); + } // strip out cdata $str = mb_eregi_replace("", '', $str); + if ($str === false) { + throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip out cdata.'); + } // strip out "; + $dom = new Dom(); + $dom->setOptions(['cleanupInput' => false,]); + $dom->load($html); + $this->assertSame($html, $dom->root->outerHtml()); + } + public function testLoad() { $dom = new Dom; From 0689a0468f47b479e49eff02dc04cf3f757b097f Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Mon, 27 Apr 2020 02:42:30 +0000 Subject: [PATCH 28/68] Added support for PSR7 requests --- CHANGELOG.md | 6 ++++ composer.json | 5 ++- src/PHPHtmlParser/Curl.php | 54 ----------------------------- src/PHPHtmlParser/CurlInterface.php | 20 ----------- src/PHPHtmlParser/Dom.php | 30 +++++++++++----- src/PHPHtmlParser/StaticDom.php | 29 +++++++++++----- tests/DomTest.php | 24 ++++++++----- tests/StaticDomTest.php | 19 ++++++---- 8 files changed, 80 insertions(+), 107 deletions(-) delete mode 100755 src/PHPHtmlParser/Curl.php delete mode 100755 src/PHPHtmlParser/CurlInterface.php diff --git a/CHANGELOG.md b/CHANGELOG.md index d4921140..0039aa5e 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,9 +10,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- Added support for PSR7 HTTP clients and requests for URL calls. + ### Changed - Fixed issue with \ causing an infite loop. 
+### Removed +- Removed curl interface and curl implementation. + ## 2.2.0 ### Added diff --git a/composer.json b/composer.json index e924886e..8e643b31 100755 --- a/composer.json +++ b/composer.json @@ -17,7 +17,10 @@ "ext-mbstring": "*", "paquettg/string-encode": "~1.0.0", "ext-zlib": "*", - "ext-curl": "*" + "ext-curl": "*", + "php-http/httplug": "^2.1", + "php-http/guzzle6-adapter": "^2.0", + "guzzlehttp/psr7": "^1.6" }, "require-dev": { "phpunit/phpunit": "^7.5.1", diff --git a/src/PHPHtmlParser/Curl.php b/src/PHPHtmlParser/Curl.php deleted file mode 100755 index b3e33edc..00000000 --- a/src/PHPHtmlParser/Curl.php +++ /dev/null @@ -1,54 +0,0 @@ -get($url, $options); + if (is_null($request)) { + $request = new Request('GET', $url); + } + + $response = $client->sendRequest($request); + $content = $response->getBody()->getContents(); return $this->loadStr($content, $options); } diff --git a/src/PHPHtmlParser/StaticDom.php b/src/PHPHtmlParser/StaticDom.php index 0114bb70..cb70d1d1 100755 --- a/src/PHPHtmlParser/StaticDom.php +++ b/src/PHPHtmlParser/StaticDom.php @@ -1,11 +1,17 @@ loadFromUrl($url, $options, $curl); + return $dom->loadFromUrl($url, $options, $client, $request); } /** diff --git a/tests/DomTest.php b/tests/DomTest.php index 3297923e..7a2cc4ef 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -228,14 +228,22 @@ public function testLoadFileBigTwicePreserveOption() public function testLoadFromUrl() { - $curl = Mockery::mock('PHPHtmlParser\CurlInterface'); - $curl->shouldReceive('get') - ->once() - ->with('http://google.com', []) - ->andReturn(file_get_contents('tests/data/files/small.html')); - - $dom = new Dom; - $dom->loadFromUrl('http://google.com', [], $curl); + $streamMock = Mockery::mock(\Psr\Http\Message\StreamInterface::class); + $streamMock->shouldReceive('getContents') + ->once() + ->andReturn(file_get_contents('tests/data/files/small.html')); + $responseMock = Mockery::mock(\Psr\Http\Message\ResponseInterface::class); + 
$responseMock->shouldReceive('getBody') + ->once() + ->andReturn($streamMock); + $clientMock = Mockery::mock(\Psr\Http\Client\ClientInterface::class); + $clientMock->shouldReceive('sendRequest') + ->once() + ->andReturn($responseMock); + + + $dom = new Dom; + $dom->loadFromUrl('http://google.com', [], $clientMock); $this->assertEquals('VonBurgermeister', $dom->find('.post-row div .post-user font', 0)->text); } diff --git a/tests/StaticDomTest.php b/tests/StaticDomTest.php index a6fba2c0..ac9df656 100755 --- a/tests/StaticDomTest.php +++ b/tests/StaticDomTest.php @@ -58,13 +58,20 @@ public function testFindI() public function testLoadFromUrl() { - $curl = Mockery::mock('PHPHtmlParser\CurlInterface'); - $curl->shouldReceive('get') - ->once() - ->with('http://google.com', []) - ->andReturn(file_get_contents('tests/data/files/small.html')); + $streamMock = Mockery::mock(\Psr\Http\Message\StreamInterface::class); + $streamMock->shouldReceive('getContents') + ->once() + ->andReturn(file_get_contents('tests/data/files/small.html')); + $responseMock = Mockery::mock(\Psr\Http\Message\ResponseInterface::class); + $responseMock->shouldReceive('getBody') + ->once() + ->andReturn($streamMock); + $clientMock = Mockery::mock(\Psr\Http\Client\ClientInterface::class); + $clientMock->shouldReceive('sendRequest') + ->once() + ->andReturn($responseMock); - Dom::loadFromUrl('http://google.com', [], $curl); + Dom::loadFromUrl('http://google.com', [], $clientMock); $this->assertEquals('VonBurgermeister', Dom::find('.post-row div .post-user font', 0)->text); } From 1d4e3792b487387d1328f7a04bda2ae42e318770 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Mon, 27 Apr 2020 04:04:03 +0000 Subject: [PATCH 29/68] Added php_cs --- .gitattributes | 1 + .gitignore | 1 + .php_cs.dist | 148 +++++++++ composer.json | 5 +- src/PHPHtmlParser/Content.php | 77 ++--- src/PHPHtmlParser/DTO/Tag/AttributeDTO.php | 109 ++++--- src/PHPHtmlParser/Dom.php | 251 +++++++--------- 
src/PHPHtmlParser/Dom/AbstractNode.php | 189 +++++------- src/PHPHtmlParser/Dom/ArrayNode.php | 18 +- src/PHPHtmlParser/Dom/Collection.php | 59 ++-- src/PHPHtmlParser/Dom/HtmlNode.php | 52 ++-- src/PHPHtmlParser/Dom/InnerNode.php | 146 ++++----- src/PHPHtmlParser/Dom/LeafNode.php | 11 +- src/PHPHtmlParser/Dom/Tag.php | 123 ++++---- src/PHPHtmlParser/Dom/TextNode.php | 50 ++-- .../Exceptions/ChildNotFoundException.php | 10 +- .../Exceptions/CircularException.php | 9 +- .../Exceptions/CurlException.php | 9 +- .../Exceptions/EmptyCollectionException.php | 9 +- .../Exceptions/LogicalException.php | 10 +- .../Exceptions/NotLoadedException.php | 9 +- .../Exceptions/ParentNotFoundException.php | 9 +- .../Exceptions/StrictException.php | 9 +- .../Tag/AttributeNotFoundException.php | 25 +- .../Exceptions/UnknownChildTypeException.php | 9 +- src/PHPHtmlParser/Finder.php | 14 +- src/PHPHtmlParser/Options.php | 40 ++- src/PHPHtmlParser/Selector/Parser.php | 59 ++-- .../Selector/ParserInterface.php | 5 +- src/PHPHtmlParser/Selector/Selector.php | 170 +++++------ src/PHPHtmlParser/StaticDom.php | 48 ++- tests/CollectionTest.php | 45 +-- tests/ContentTest.php | 11 +- tests/DomTest.php | 166 +++++------ tests/Node/ChildrenTest.php | 80 ++--- tests/Node/HtmlTest.php | 99 +++--- tests/Node/ParentTest.php | 281 +++++++++--------- tests/Node/TagTest.php | 15 +- tests/Node/TextTest.php | 16 +- tests/Options/CleanupTest.php | 44 +-- tests/Options/PreserveLineBreaks.php | 24 +- tests/Options/StrictTest.php | 34 +-- tests/Options/WhitespaceTextNodeTest.php | 14 +- tests/OptionsTest.php | 21 +- tests/Selector/SelectorTest.php | 50 ++-- tests/StaticDomTest.php | 17 +- tests/data/MockNode.php | 14 +- 47 files changed, 1265 insertions(+), 1350 deletions(-) create mode 100644 .php_cs.dist diff --git a/.gitattributes b/.gitattributes index 93691f38..ebfea7c7 100755 --- a/.gitattributes +++ b/.gitattributes @@ -10,3 +10,4 @@ /phpunit.xml export-ignore /infection.json.dist export-ignore /.phan 
export-ignore +/.php_cs.dist export-ignore diff --git a/.gitignore b/.gitignore index 274cf429..9a550fad 100755 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ composer.lock infection.log /vendor/ .idea/ +.php_cs.cache *.swp diff --git a/.php_cs.dist b/.php_cs.dist new file mode 100644 index 00000000..56af284d --- /dev/null +++ b/.php_cs.dist @@ -0,0 +1,148 @@ +in('src') + ->in('tests') +; + +return PhpCsFixer\Config::create() + ->setRiskyAllowed(true) + ->setRules([ + 'array_indentation' => true, + 'array_syntax' => ['syntax' => 'short'], + 'binary_operator_spaces' => ['align_double_arrow'=>true], + 'blank_line_after_namespace' => true, + 'blank_line_after_opening_tag' => true, + 'blank_line_before_statement' => ['statements'=>['return']], + 'braces' => ['allow_single_line_closure' => false], + 'cast_spaces' => true, + 'class_attributes_separation' => ['elements'=>['method']], + 'class_definition' => ['single_line'=>true], + 'combine_consecutive_issets' => true, + 'concat_space' => ['spacing' => 'one'], + 'declare_equal_normalize' => true, + 'elseif' => true, + 'encoding' => true, + 'full_opening_tag' => true, + 'function_declaration' => true, + 'function_typehint_space' => true, + 'include' => true, + 'increment_style' => true, + 'indentation_type' => true, + 'line_ending' => true, + 'linebreak_after_opening_tag' => true, + 'lowercase_cast' => true, + 'lowercase_keywords' => true, + 'lowercase_static_reference' => true, + 'magic_constant_casing' => true, + 'magic_method_casing' => true, + 'mb_str_functions' => false, + 'method_argument_space' => true, + 'native_function_casing' => true, + 'native_function_invocation' => true, + 'native_function_type_declaration_casing' => true, + 'new_with_braces' => true, + 'no_blank_lines_after_class_opening' => true, + 'no_blank_lines_after_phpdoc' => true, + 'no_break_comment' => true, + 'no_closing_tag' => true, + 'no_empty_comment' => true, + 'no_empty_phpdoc' => true, + 'no_empty_statement' => true, + 
'no_extra_blank_lines' => ['tokens'=>[ + 'curly_brace_block', + 'extra', + 'parenthesis_brace_block', + 'square_brace_block', + 'throw', + 'use', + ]], + 'no_leading_import_slash' => true, + 'no_leading_namespace_whitespace' => true, + 'no_mixed_echo_print' => true, + 'no_multiline_whitespace_around_double_arrow' => true, + 'no_null_property_initialization' => true, + 'no_short_bool_cast' => true, + 'no_singleline_whitespace_before_semicolons' => true, + 'no_superfluous_elseif' => true, + 'no_spaces_after_function_name' => true, + 'no_spaces_around_offset' => true, + 'no_spaces_inside_parenthesis' => true, + 'no_superfluous_phpdoc_tags' => ['allow_mixed' => true, 'allow_unused_params' => true], + 'no_trailing_comma_in_list_call' => true, + 'no_trailing_comma_in_singleline_array' => true, + 'no_trailing_whitespace' => true, + 'no_trailing_whitespace_in_comment' => true, + 'no_unneeded_control_parentheses' => true, + 'no_unneeded_curly_braces' => ['namespaces' => true], + 'no_unused_imports' => true, + 'no_useless_else' => true, + 'no_useless_return' => true, + 'no_whitespace_before_comma_in_array' => true, + 'no_whitespace_in_blank_line' => true, + 'normalize_index_brace' => true, + 'not_operator_with_space' => false, + 'object_operator_without_whitespace' => true, + 'ordered_class_elements' => true, + 'ordered_imports' => true, + 'php_unit_fqcn_annotation' => true, + 'phpdoc_align' => ['tags' => [ + 'method', + 'param', + 'property', + 'return', + 'throws', + 'type', + 'var', + ]], + 'phpdoc_add_missing_param_annotation' => true, + 'phpdoc_annotation_without_dot' => true, + 'phpdoc_indent' => true, + 'phpdoc_inline_tag' => true, + 'phpdoc_no_access' => true, + 'phpdoc_no_alias_tag' => true, + 'phpdoc_no_package' => true, + 'phpdoc_no_useless_inheritdoc' => true, + 'phpdoc_order' => true, + 'phpdoc_return_self_reference' => true, + 'phpdoc_scalar' => true, + 'phpdoc_separation' => true, + 'phpdoc_single_line_var_spacing' => true, + 'phpdoc_summary' => true, + 
'phpdoc_to_comment' => true, + 'phpdoc_trim' => true, + 'phpdoc_trim_consecutive_blank_line_separation' => true, + 'phpdoc_types' => true, + 'phpdoc_types_order' => ['null_adjustment' => 'always_last', 'sort_algorithm' => 'none'], + 'phpdoc_var_without_name' => true, + 'return_assignment' => true, + 'return_type_declaration' => true, + 'semicolon_after_instruction' => true, + 'simplified_null_return' => true, + 'short_scalar_cast' => true, + 'single_blank_line_at_eof' => true, + 'single_blank_line_before_namespace' => true, + 'single_class_element_per_statement' => true, + 'single_import_per_statement' => true, + 'single_line_after_imports' => true, + 'single_line_comment_style' => ['comment_types' => ['hash']], + 'single_line_throw' => true, + 'single_quote' => true, + 'single_trait_insert_per_statement' => true, + 'space_after_semicolon' => ['remove_in_empty_for_expressions'=>true], + 'standardize_increment' => true, + 'standardize_not_equals' => true, + 'switch_case_semicolon_to_colon' => true, + 'switch_case_space' => true, + 'ternary_operator_spaces' => true, + 'ternary_to_null_coalescing' => true, + 'trailing_comma_in_multiline_array' => true, + 'trim_array_spaces' => true, + 'unary_operator_spaces' => true, + 'visibility_required' => true, + 'whitespace_after_comma_in_array' => true, + 'yoda_style' => false, + ]) + ->setFinder($finder) + ->setCacheFile(__DIR__.'/.php_cs.cache') +; \ No newline at end of file diff --git a/composer.json b/composer.json index 8e643b31..79258c58 100755 --- a/composer.json +++ b/composer.json @@ -15,9 +15,9 @@ "require": { "php": ">=7.1", "ext-mbstring": "*", - "paquettg/string-encode": "~1.0.0", "ext-zlib": "*", "ext-curl": "*", + "paquettg/string-encode": "~1.0.0", "php-http/httplug": "^2.1", "php-http/guzzle6-adapter": "^2.0", "guzzlehttp/psr7": "^1.6" @@ -27,7 +27,8 @@ "mockery/mockery": "^1.2", "php-coveralls/php-coveralls": "^2.1", "infection/infection": "^0.13.4", - "phan/phan": "^2.4" + "phan/phan": "^2.4", + 
"friendsofphp/php-cs-fixer": "^2.16" }, "autoload": { "psr-4": { diff --git a/src/PHPHtmlParser/Content.php b/src/PHPHtmlParser/Content.php index 93b3a73b..37415a91 100755 --- a/src/PHPHtmlParser/Content.php +++ b/src/PHPHtmlParser/Content.php @@ -1,20 +1,16 @@ -content = $content; - $this->size = strlen($content); - $this->pos = 0; + $this->size = \strlen($content); + $this->pos = 0; } /** * Returns the current position of the content. - * - * @return int */ public function getPosition(): int { @@ -72,16 +64,15 @@ public function getPosition(): int * Gets the current character we are at. * * @param ?int $char - * @return string */ public function char(?int $char = null): string { $pos = $this->pos; - if ( ! is_null($char)) { + if (!\is_null($char)) { $pos = $char; } - if ( ! isset($this->content[$pos])) { + if (!isset($this->content[$pos])) { return ''; } @@ -91,8 +82,6 @@ public function char(?int $char = null): string /** * Moves the current position forward. * - * @param int $count - * @return Content * @chainable */ public function fastForward(int $count): Content @@ -105,8 +94,6 @@ public function fastForward(int $count): Content /** * Moves the current position backward. * - * @param int $count - * @return Content * @chainable */ public function rewind(int $count): Content @@ -121,11 +108,6 @@ public function rewind(int $count): Content /** * Copy the content until we find the given string. - * - * @param string $string - * @param bool $char - * @param bool $escape - * @return string */ public function copyUntil(string $string, bool $char = false, bool $escape = false): string { @@ -136,9 +118,9 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal if ($escape) { $position = $this->pos; - $found = false; - while ( ! 
$found) { - $position = strpos($this->content, $string, $position); + $found = false; + while (!$found) { + $position = \strpos($this->content, $string, $position); if ($position === false) { // reached the end break; @@ -153,17 +135,17 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal $found = true; } } elseif ($char) { - $position = strcspn($this->content, $string, $this->pos); + $position = \strcspn($this->content, $string, $this->pos); $position += $this->pos; } else { - $position = strpos($this->content, $string, $this->pos); + $position = \strpos($this->content, $string, $this->pos); } if ($position === false) { // could not find character, just return the remaining of the content - $return = substr($this->content, $this->pos, $this->size - $this->pos); + $return = \substr($this->content, $this->pos, $this->size - $this->pos); if ($return === false) { - throw new LogicalException('Substr returned false with position '.$this->pos.'.'); + throw new LogicalException('Substr returned false with position ' . $this->pos . '.'); } $this->pos = $this->size; @@ -175,9 +157,9 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal return ''; } - $return = substr($this->content, $this->pos, $position - $this->pos); + $return = \substr($this->content, $this->pos, $position - $this->pos); if ($return === false) { - throw new LogicalException('Substr returned false with position '.$this->pos.'.'); + throw new LogicalException('Substr returned false with position ' . $this->pos . '.'); } // set the new position $this->pos = $position; @@ -189,8 +171,6 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal * Copies the content until the string is found and return it * unless the 'unless' is found in the substring. 
* - * @param string $string - * @param string $unless * @return string */ public function copyUntilUnless(string $string, string $unless) @@ -199,9 +179,9 @@ public function copyUntilUnless(string $string, string $unless) $this->fastForward(1); $foundString = $this->copyUntil($string, true, true); - $position = strcspn($foundString, $unless); - if ($position == strlen($foundString)) { - return $string.$foundString; + $position = \strcspn($foundString, $unless); + if ($position == \strlen($foundString)) { + return $string . $foundString; } // rewind changes and return nothing $this->pos = $lastPos; @@ -210,12 +190,10 @@ public function copyUntilUnless(string $string, string $unless) } /** - * Copies the content until it reaches the token string., + * Copies the content until it reaches the token string.,. * - * @param string $token - * @param bool $char - * @param bool $escape * @return string + * * @uses $this->copyUntil() */ public function copyByToken(string $token, bool $char = false, bool $escape = false) @@ -228,20 +206,18 @@ public function copyByToken(string $token, bool $char = false, bool $escape = fa /** * Skip a given set of characters. * - * @param string $string - * @param bool $copy * @return Content|string */ public function skip(string $string, bool $copy = false) { - $len = strspn($this->content, $string, $this->pos); + $len = \strspn($this->content, $string, $this->pos); // make it chainable if they don't want a copy $return = $this; if ($copy) { - $return = substr($this->content, $this->pos, $len); + $return = \substr($this->content, $this->pos, $len); if ($return === false) { - throw new LogicalException('Substr returned false with position '.$this->pos.'.'); + throw new LogicalException('Substr returned false with position ' . $this->pos . '.'); } } @@ -254,9 +230,8 @@ public function skip(string $string, bool $copy = false) /** * Skip a given token of pre-defined characters. 
* - * @param string $token - * @param bool $copy * @return Content|string + * * @uses $this->skip() */ public function skipByToken(string $token, bool $copy = false) diff --git a/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php b/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php index 44b9bb2f..489b843c 100755 --- a/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php +++ b/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php @@ -1,57 +1,52 @@ -value = $values['value']; - $this->doubleQuote = $values['doubleQuote']; - } - - /** - * @return string - */ - public function getValue(): ?string - { - return $this->value; - } - - /** - * @return bool - */ - public function isDoubleQuote(): bool - { - return $this->doubleQuote; - } - - public function htmlspecialcharsDecode(): void - { - $this->value = htmlspecialchars_decode($this->value); - } - - /** - * @param Encode $encode - * @throws Exception - */ - public function encodeValue(Encode $encode) - { - $this->value = $encode->convert($this->value); - } -} +value = $values['value']; + $this->doubleQuote = $values['doubleQuote']; + } + + public function getValue(): ?string + { + return $this->value; + } + + public function isDoubleQuote(): bool + { + return $this->doubleQuote; + } + + public function htmlspecialcharsDecode(): void + { + if (!\is_null($this->value)) { + $this->value = \htmlspecialchars_decode($this->value); + } + } + + /** + * @throws Exception + */ + public function encodeValue(Encode $encode) + { + $this->value = $encode->convert($this->value); + } +} diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index 99e5b796..b0b30a31 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -1,11 +1,11 @@ - at the end (html5 style) + * A list of tags where there should be no /> at the end (html5 style). * * @var array */ @@ -122,7 +119,6 @@ class Dom /** * Returns the inner html of the root node. 
* - * @return string * @throws ChildNotFoundException * @throws UnknownChildTypeException */ @@ -135,6 +131,7 @@ public function __toString(): string * A simple wrapper around the root node. * * @param string $name + * * @return mixed */ public function __get($name) @@ -144,9 +141,7 @@ public function __get($name) /** * Attempts to load the dom from any resource, string, file, or URL. - * @param string $str - * @param array $options - * @return Dom + * * @throws ChildNotFoundException * @throws CircularException * @throws CurlException @@ -156,11 +151,11 @@ public function load(string $str, array $options = []): Dom { AbstractNode::resetCount(); // check if it's a file - if (strpos($str, "\n") === false && is_file($str)) { + if (\strpos($str, "\n") === false && \is_file($str)) { return $this->loadFromFile($str, $options); } // check if it's a url - if (preg_match("/^https?:\/\//i", $str)) { + if (\preg_match("/^https?:\/\//i", $str)) { return $this->loadFromUrl($str, $options); } @@ -168,10 +163,8 @@ public function load(string $str, array $options = []): Dom } /** - * Loads the dom from a document file/url - * @param string $file - * @param array $options - * @return Dom + * Loads the dom from a document file/url. + * * @throws ChildNotFoundException * @throws CircularException * @throws StrictException @@ -179,32 +172,31 @@ public function load(string $str, array $options = []): Dom */ public function loadFromFile(string $file, array $options = []): Dom { - $content = file_get_contents($file); + $content = \file_get_contents($file); if ($content === false) { - throw new LogicalException('file_get_contents failed and returned false when trying to read "'.$file.'".'); + throw new LogicalException('file_get_contents failed and returned false when trying to read "' . $file . '".'); } + return $this->loadStr($content, $options); } /** * Use a curl interface implementation to attempt to load * the content from a url. 
- * @param string $url - * @param array $options + * * @param ClientInterface $client - * @param RequestInterface|null $request - * @return Dom + * * @throws ChildNotFoundException * @throws CircularException * @throws StrictException * @throws \Psr\Http\Client\ClientExceptionInterface */ - public function loadFromUrl(string $url, array $options = [], ClientInterface $client = null, RequestInterface $request = null): Dom + public function loadFromUrl(string $url, array $options = [], ?ClientInterface $client = null, ?RequestInterface $request = null): Dom { - if (is_null($client)) { + if (\is_null($client)) { $client = new Client(); } - if (is_null($request)) { + if (\is_null($request)) { $request = new Request('GET', $url); } @@ -217,25 +209,23 @@ public function loadFromUrl(string $url, array $options = [], ClientInterface $c /** * Parsers the html of the given string. Used for load(), loadFromFile(), * and loadFromUrl(). - * @param string $str - * @param array $option - * @return Dom + * * @throws ChildNotFoundException * @throws CircularException * @throws StrictException */ public function loadStr(string $str, array $option = []): Dom { - $this->options = new Options; + $this->options = new Options(); $this->options->setOptions($this->globalOptions) ->setOptions($option); - $this->rawSize = strlen($str); - $this->raw = $str; + $this->rawSize = \strlen($str); + $this->raw = $str; $html = $this->clean($str); - $this->size = strlen($str); + $this->size = \strlen($str); $this->content = new Content($html); $this->parse(); @@ -247,8 +237,6 @@ public function loadStr(string $str, array $option = []): Dom /** * Sets a global options array to be used by all load calls. * - * @param array $options - * @return Dom * @chainable */ public function setOptions(array $options): Dom @@ -260,18 +248,18 @@ public function setOptions(array $options): Dom /** * Find elements by css selector on the root node. 
- * @param string $selector - * @param int|null $nth - * @return mixed|Collection|null + * * @throws ChildNotFoundException * @throws NotLoadedException + * + * @return mixed|Collection|null */ public function find(string $selector, int $nth = null) { $this->isLoaded(); $depthFirstSearch = $this->options->get('depthFirstSearch'); - if (is_bool($depthFirstSearch)) { + if (\is_bool($depthFirstSearch)) { $result = $this->root->find($selector, $nth, $depthFirstSearch); } else { $result = $this->root->find($selector, $nth); @@ -281,12 +269,13 @@ public function find(string $selector, int $nth = null) } /** - * Find element by Id on the root node - * @param int $id - * @return bool|AbstractNode + * Find element by Id on the root node. + * * @throws ChildNotFoundException * @throws NotLoadedException * @throws ParentNotFoundException + * + * @return bool|AbstractNode */ public function findById(int $id) { @@ -300,12 +289,11 @@ public function findById(int $id) * be self closing. * * @param string|array $tag - * @return Dom * @chainable */ public function addSelfClosingTag($tag): Dom { - if ( ! is_array($tag)) { + if (!\is_array($tag)) { $tag = [$tag]; } foreach ($tag as $value) { @@ -320,15 +308,14 @@ public function addSelfClosingTag($tag): Dom * always be self closing. * * @param string|array $tag - * @return Dom * @chainable */ public function removeSelfClosingTag($tag): Dom { - if ( ! is_array($tag)) { + if (!\is_array($tag)) { $tag = [$tag]; } - $this->selfClosing = array_diff($this->selfClosing, $tag); + $this->selfClosing = \array_diff($this->selfClosing, $tag); return $this; } @@ -336,7 +323,6 @@ public function removeSelfClosingTag($tag): Dom /** * Sets the list of self closing tags to empty. 
* - * @return Dom * @chainable */ public function clearSelfClosingTags(): Dom @@ -346,17 +332,15 @@ public function clearSelfClosingTags(): Dom return $this; } - /** - * Adds a tag to the list of self closing tags that should not have a trailing slash + * Adds a tag to the list of self closing tags that should not have a trailing slash. * * @param $tag - * @return Dom * @chainable */ public function addNoSlashTag($tag): Dom { - if ( ! is_array($tag)) { + if (!\is_array($tag)) { $tag = [$tag]; } foreach ($tag as $value) { @@ -370,15 +354,14 @@ public function addNoSlashTag($tag): Dom * Removes a tag from the list of no-slash tags. * * @param $tag - * @return Dom * @chainable */ public function removeNoSlashTag($tag): Dom { - if ( ! is_array($tag)) { + if (!\is_array($tag)) { $tag = [$tag]; } - $this->noSlash = array_diff($this->noSlash, $tag); + $this->noSlash = \array_diff($this->noSlash, $tag); return $this; } @@ -386,7 +369,6 @@ public function removeNoSlashTag($tag): Dom /** * Empties the list of no-slash tags. * - * @return Dom * @chainable */ public function clearNoSlashTags(): Dom @@ -398,7 +380,7 @@ public function clearNoSlashTags(): Dom /** * Simple wrapper function that returns the first child. - * @return AbstractNode + * * @throws ChildNotFoundException * @throws NotLoadedException */ @@ -411,7 +393,7 @@ public function firstChild(): AbstractNode /** * Simple wrapper function that returns the last child. - * @return AbstractNode + * * @throws ChildNotFoundException * @throws NotLoadedException */ @@ -423,9 +405,8 @@ public function lastChild(): AbstractNode } /** - * Simple wrapper function that returns count of child elements + * Simple wrapper function that returns count of child elements. * - * @return int * @throws NotLoadedException */ public function countChildren(): int @@ -436,9 +417,8 @@ public function countChildren(): int } /** - * Get array of children + * Get array of children. 
* - * @return array * @throws NotLoadedException */ public function getChildren(): array @@ -449,9 +429,8 @@ public function getChildren(): array } /** - * Check if node have children nodes + * Check if node have children nodes. * - * @return bool * @throws NotLoadedException */ public function hasChildren(): bool @@ -464,25 +443,29 @@ public function hasChildren(): bool /** * Simple wrapper function that returns an element by the * id. + * * @param $id - * @return mixed|Collection|null + * * @throws ChildNotFoundException * @throws NotLoadedException + * + * @return mixed|Collection|null */ public function getElementById($id) { $this->isLoaded(); - return $this->find('#'.$id, 0); + return $this->find('#' . $id, 0); } /** * Simple wrapper function that returns all elements by * tag name. - * @param string $name - * @return mixed|Collection|null + * * @throws ChildNotFoundException * @throws NotLoadedException + * + * @return mixed|Collection|null */ public function getElementsByTag(string $name) { @@ -494,16 +477,17 @@ public function getElementsByTag(string $name) /** * Simple wrapper function that returns all elements by * class name. - * @param string $class - * @return mixed|Collection|null + * * @throws ChildNotFoundException * @throws NotLoadedException + * + * @return mixed|Collection|null */ public function getElementsByClass(string $class) { $this->isLoaded(); - return $this->find('.'.$class); + return $this->find('.' . $class); } /** @@ -513,16 +497,13 @@ public function getElementsByClass(string $class) */ protected function isLoaded(): void { - if (is_null($this->content)) { + if (\is_null($this->content)) { throw new NotLoadedException('Content is not loaded!'); } } /** * Cleans the html of any none-html information. - * - * @param string $str - * @return string */ protected function clean(string $str): string { @@ -531,20 +512,20 @@ protected function clean(string $str): string return $str; } - $is_gzip = 0 === mb_strpos($str, "\x1f" . "\x8b" . 
"\x08", 0, "US-ASCII"); + $is_gzip = 0 === \mb_strpos($str, "\x1f" . "\x8b" . "\x08", 0, 'US-ASCII'); if ($is_gzip) { - $str = gzdecode($str); + $str = \gzdecode($str); if ($str === false) { throw new LogicalException('gzdecode returned false. Error when trying to decode the string.'); } } // remove white space before closing tags - $str = mb_eregi_replace("'\s+>", "'>", $str); + $str = \mb_eregi_replace("'\s+>", "'>", $str); if ($str === false) { throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to clean single quotes.'); } - $str = mb_eregi_replace('"\s+>', '">', $str); + $str = \mb_eregi_replace('"\s+>', '">', $str); if ($str === false) { throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to clean double quotes.'); } @@ -554,36 +535,36 @@ protected function clean(string $str): string if ($this->options->get('preserveLineBreaks')) { $replace = ' '; } - $str = str_replace(["\r\n", "\r", "\n"], $replace, $str); + $str = \str_replace(["\r\n", "\r", "\n"], $replace, $str); if ($str === false) { throw new LogicalException('str_replace returned false instead of a string. Error when attempting to clean input string.'); } // strip the doctype - $str = mb_eregi_replace("", '', $str); + $str = \mb_eregi_replace('', '', $str); if ($str === false) { throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip the doctype.'); } // strip out comments - $str = mb_eregi_replace("", '', $str); + $str = \mb_eregi_replace('', '', $str); if ($str === false) { throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip comments.'); } // strip out cdata - $str = mb_eregi_replace("", '', $str); + $str = \mb_eregi_replace("", '', $str); if ($str === false) { throw new LogicalException('mb_eregi_replace returned false instead of a string. 
Error when attempting to strip out cdata.'); } // strip out "; $dom = new Dom(); - $dom->setOptions(['cleanupInput' => false,]); + $dom->setOptions(['cleanupInput' => false]); $dom->load($html); $this->assertSame($html, $dom->root->outerHtml()); } From edec82b2ac45135ec8fbe4a88140d2ddedf71f1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Fedor?= Date: Sat, 25 Jan 2020 19:36:43 +0100 Subject: [PATCH 31/68] Throw exception when trying to set unknown option --- .../Exceptions/UnknownOptionException.php | 13 +++++++ src/PHPHtmlParser/Options.php | 6 ++++ tests/OptionsTest.php | 35 +++++++++++++++---- 3 files changed, 48 insertions(+), 6 deletions(-) create mode 100644 src/PHPHtmlParser/Exceptions/UnknownOptionException.php diff --git a/src/PHPHtmlParser/Exceptions/UnknownOptionException.php b/src/PHPHtmlParser/Exceptions/UnknownOptionException.php new file mode 100644 index 00000000..3b139c0b --- /dev/null +++ b/src/PHPHtmlParser/Exceptions/UnknownOptionException.php @@ -0,0 +1,13 @@ + $option) { + if (!isset($this->defaults[$key])) { + throw new UnknownOptionException("Option '$option' is not recognized"); + } $this->options[$key] = $option; } diff --git a/tests/OptionsTest.php b/tests/OptionsTest.php index 03fe3ee0..90468b8d 100755 --- a/tests/OptionsTest.php +++ b/tests/OptionsTest.php @@ -2,8 +2,10 @@ declare(strict_types=1); -use PHPHtmlParser\Options; +use PHPHtmlParser\Dom; +use PHPHtmlParser\Exceptions\UnknownOptionException; use PHPUnit\Framework\TestCase; +use PHPHtmlParser\Options; class OptionsTest extends TestCase { @@ -14,26 +16,37 @@ public function testDefaultWhitespaceTextNode() $this->assertTrue($options->whitespaceTextNode); } + public function testSettingOption() + { + $options = new Options; + $options->setOptions([ + 'strict' => true, + ]); + + $this->assertTrue($options->strict); + } + public function testAddingOption() { + $this->expectException(UnknownOptionException::class); + $options = new Options(); $options->setOptions([ 
'test' => true, ]); - - $this->assertTrue($options->test); } - public function testAddingOver() + public function testOverwritingOption() { $options = new Options(); $options->setOptions([ - 'test' => false, + 'strict' => false, ])->setOptions([ - 'test' => true, + 'strict' => true, 'whitespaceTextNode' => false, ]); + $this->assertTrue($options->get('strict')); $this->assertFalse($options->get('whitespaceTextNode')); } @@ -42,4 +55,14 @@ public function testGettingNoOption() $options = new Options(); $this->assertEquals(null, $options->get('doesnotexist')); } + + public function testUnknownOptionDom() { + $dom = new Dom; + $dom->setOptions([ + 'unknown_option' => true, + ]); + + $this->expectException(UnknownOptionException::class); + $dom->load('
'); + } } From b86c1d3c5e7a6368cbc756e3eb33826fcac5d12e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Fedor?= Date: Sat, 25 Jan 2020 19:57:22 +0100 Subject: [PATCH 32/68] Fix option existence check --- src/PHPHtmlParser/Options.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/PHPHtmlParser/Options.php b/src/PHPHtmlParser/Options.php index 5b2a7b34..b0bb747e 100755 --- a/src/PHPHtmlParser/Options.php +++ b/src/PHPHtmlParser/Options.php @@ -80,8 +80,8 @@ public function __get($key) public function setOptions(array $options): Options { foreach ($options as $key => $option) { - if (!isset($this->defaults[$key])) { - throw new UnknownOptionException("Option '$option' is not recognized"); + if (!array_key_exists($key, $this->defaults)) { + throw new UnknownOptionException("Option '$key' is not recognized"); } $this->options[$key] = $option; } From 71c3758da857203423b0071350c59fb5624a504a Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 10 May 2020 23:59:09 +0000 Subject: [PATCH 33/68] Updated the way we calculate selector logic --- CHANGELOG.md | 6 +- .../Contracts/Selector/ParserInterface.php | 12 + .../Contracts/Selector/SeekerInterface.php | 17 + .../Contracts/Selector/SelectorInterface.php | 33 ++ .../Selector/ParsedSelectorCollectionDTO.php | 30 ++ .../DTO/Selector/ParsedSelectorDTO.php | 30 ++ src/PHPHtmlParser/DTO/Selector/RuleDTO.php | 96 +++++ src/PHPHtmlParser/DTO/Tag/AttributeDTO.php | 6 +- .../Discovery/ParserDiscovery.php | 25 ++ .../Discovery/SeekerDiscovery.php | 25 ++ src/PHPHtmlParser/Dom/AbstractNode.php | 14 +- .../Exceptions/UnknownOptionException.php | 9 +- src/PHPHtmlParser/Options.php | 101 +++-- src/PHPHtmlParser/Selector/Parser.php | 28 +- .../Selector/ParserInterface.php | 10 - src/PHPHtmlParser/Selector/Seeker.php | 321 ++++++++++++++++ src/PHPHtmlParser/Selector/Selector.php | 359 ++---------------- tests/OptionsTest.php | 18 +- tests/Selector/SelectorTest.php | 16 +- 19 files changed, 737 
insertions(+), 419 deletions(-) create mode 100644 src/PHPHtmlParser/Contracts/Selector/ParserInterface.php create mode 100644 src/PHPHtmlParser/Contracts/Selector/SeekerInterface.php create mode 100644 src/PHPHtmlParser/Contracts/Selector/SelectorInterface.php create mode 100644 src/PHPHtmlParser/DTO/Selector/ParsedSelectorCollectionDTO.php create mode 100644 src/PHPHtmlParser/DTO/Selector/ParsedSelectorDTO.php create mode 100644 src/PHPHtmlParser/DTO/Selector/RuleDTO.php create mode 100644 src/PHPHtmlParser/Discovery/ParserDiscovery.php create mode 100644 src/PHPHtmlParser/Discovery/SeekerDiscovery.php delete mode 100755 src/PHPHtmlParser/Selector/ParserInterface.php create mode 100644 src/PHPHtmlParser/Selector/Seeker.php diff --git a/CHANGELOG.md b/CHANGELOG.md index f765b63e..770a5d92 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,9 +5,6 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -### Changed -- Added tag attribute DTO. - ## [Unreleased] ### Added @@ -17,7 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Fixed issue with \ causing an infite loop. - CDATA should not be altered when cleanupInput is false. - +- Added tag attribute DTO. +- Cleaned up the selector logic. ### Removed - Removed curl interface and curl implementation. 
diff --git a/src/PHPHtmlParser/Contracts/Selector/ParserInterface.php b/src/PHPHtmlParser/Contracts/Selector/ParserInterface.php new file mode 100644 index 00000000..3b2477b9 --- /dev/null +++ b/src/PHPHtmlParser/Contracts/Selector/ParserInterface.php @@ -0,0 +1,12 @@ +parsedSelectorDTO[] = $value; + } + } + } + + /** + * @return ParsedSelectorDTO[] + */ + public function getParsedSelectorDTO(): array + { + return $this->parsedSelectorDTO; + } +} diff --git a/src/PHPHtmlParser/DTO/Selector/ParsedSelectorDTO.php b/src/PHPHtmlParser/DTO/Selector/ParsedSelectorDTO.php new file mode 100644 index 00000000..5424e2a7 --- /dev/null +++ b/src/PHPHtmlParser/DTO/Selector/ParsedSelectorDTO.php @@ -0,0 +1,30 @@ +rules[] = $value; + } + } + } + + /** + * @return RuleDTO[] + */ + public function getRules(): array + { + return $this->rules; + } +} diff --git a/src/PHPHtmlParser/DTO/Selector/RuleDTO.php b/src/PHPHtmlParser/DTO/Selector/RuleDTO.php new file mode 100644 index 00000000..1c336149 --- /dev/null +++ b/src/PHPHtmlParser/DTO/Selector/RuleDTO.php @@ -0,0 +1,96 @@ +tag = $values['tag']; + $this->operator = $values['operator']; + $this->key = $values['key']; + $this->value = $values['value']; + $this->noKey = $values['noKey']; + $this->alterNext = $values['alterNext']; + } + + /** + * @return string + */ + public function getTag(): string + { + return $this->tag; + } + + /** + * @return string + */ + public function getOperator(): string + { + return $this->operator; + } + + /** + * @return string|array|null + */ + public function getKey() + { + return $this->key; + } + + /** + * @return string|array|null + */ + public function getValue() + { + return $this->value; + } + + /** + * @return bool + */ + public function isNoKey(): bool + { + return $this->noKey; + } + + /** + * @return bool + */ + public function isAlterNext(): bool + { + return $this->alterNext; + } +} diff --git a/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php b/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php index 
489b843c..1f15c492 100755 --- a/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php +++ b/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php @@ -7,17 +7,17 @@ use stringEncode\Encode; use stringEncode\Exception; -class AttributeDTO +final class AttributeDTO { /** * @var ?string */ - protected $value; + private $value; /** * @var bool */ - protected $doubleQuote = true; + private $doubleQuote = true; public function __construct(array $values) { diff --git a/src/PHPHtmlParser/Discovery/ParserDiscovery.php b/src/PHPHtmlParser/Discovery/ParserDiscovery.php new file mode 100644 index 00000000..a7d3c60a --- /dev/null +++ b/src/PHPHtmlParser/Discovery/ParserDiscovery.php @@ -0,0 +1,25 @@ +setDepthFirstFind($depthFirst); $nodes = $selector->find($this); diff --git a/src/PHPHtmlParser/Exceptions/UnknownOptionException.php b/src/PHPHtmlParser/Exceptions/UnknownOptionException.php index 3b139c0b..58be8198 100644 --- a/src/PHPHtmlParser/Exceptions/UnknownOptionException.php +++ b/src/PHPHtmlParser/Exceptions/UnknownOptionException.php @@ -1,12 +1,13 @@ -options = $this->defaults; } + /** + * A magic get to call the get() method. + * + * @param string $key + * + * @return mixed + * + * @uses $this->get() + */ + public function __get($key) + { + return $this->get($key); + } + /** * The whitespaceTextNode, by default true, option tells the parser to save textnodes even if the content of the * node is empty (only whitespace). Setting it to false will ignore all whitespace only text node found in the document. - * @param bool $value + * * @return Options */ - public function setWhitespaceTextNode(bool $value): self { + public function setWhitespaceTextNode(bool $value): self + { $this->options['whitespaceTextNode'] = $value; + return $this; } /** * Strict, by default false, will throw a StrictException if it finds that the html is not strictly compliant * (all tags must have a closing tag, no attribute with out a value, etc.). 
- * @param bool $value + * * @return Options */ - public function setStrict(bool $value): self { + public function setStrict(bool $value): self + { $this->options['strict'] = $value; + return $this; } @@ -83,21 +101,25 @@ public function setStrict(bool $value): self { * The enforceEncoding, by default null, option will enforce an character set to be used for reading the content * and returning the content in that encoding. Setting it to null will trigger an attempt to figure out * the encoding from within the content of the string given instead. - * @param string|null $value + * * @return Options */ - public function setEnforceEncoding(?string $value): self { + public function setEnforceEncoding(?string $value): self + { $this->options['enforceEncoding'] = $value; + return $this; } /** * Set this to false to skip the entire clean up phase of the parser. Defaults to true. - * @param bool $value + * * @return Options */ - public function setCleanupInput(bool $value): self { + public function setCleanupInput(bool $value): self + { $this->options['cleanupInput'] = $value; + return $this; } @@ -107,11 +129,12 @@ public function setCleanupInput(bool $value): self { * * NOTE: Ignored if cleanupInit is true. * - * @param bool $value * @return Options */ - public function setRemoveScripts(bool $value): self { + public function setRemoveScripts(bool $value): self + { $this->options['removeScripts'] = $value; + return $this; } @@ -119,11 +142,13 @@ public function setRemoveScripts(bool $value): self { * Set this to false to skip removing of style tags from the document body. This might have adverse effects. Defaults to true. * * NOTE: Ignored if cleanupInit is true. 
- * @param bool $value + * * @return Options */ - public function setRemoveStyles(bool $value): self { + public function setRemoveStyles(bool $value): self + { $this->options['removeStyles'] = $value; + return $this; } @@ -132,31 +157,37 @@ public function setRemoveStyles(bool $value): self { * as part of the input clean up process. Defaults to false. * * NOTE: Ignored if cleanupInit is true. - * @param bool $value + * * @return Options */ - public function setPreserveLineBreaks(bool $value): self { + public function setPreserveLineBreaks(bool $value): self + { $this->options['preserveLineBreaks'] = $value; + return $this; } /** * Set this to false if you want to preserve whitespace inside of text nodes. It is set to true by default. - * @param bool $value + * * @return Options */ - public function setRemoveDoubleSpace(bool $value): self { + public function setRemoveDoubleSpace(bool $value): self + { $this->options['removeDoubleSpace'] = $value; + return $this; } /** * Set this to false if you want to preserve smarty script found in the html content. It is set to true by default. - * @param bool $value + * * @return Options */ - public function setRemoveSmartyScripts(bool $value): self { + public function setRemoveSmartyScripts(bool $value): self + { $this->options['removeSmartyScripts'] = $value; + return $this; } @@ -164,49 +195,40 @@ public function setRemoveSmartyScripts(bool $value): self { * By default this is set to false for legacy support. Setting this to true will change the behavior of find * to order elements by depth first. This will properly preserve the order of elements as they where in the HTML. * - * @param bool $value * @return Options + * * @deprecated This option will be removed in version 3.0.0 with the new behavior being as if it was set to true. 
*/ - public function setDepthFirstSearch(bool $value): self { + public function setDepthFirstSearch(bool $value): self + { $this->options['depthFirstSearch'] = $value; + return $this; } /** * By default this is set to false. Setting this to true will apply the php function htmlspecialchars_decode too all attribute values and text nodes. - * @param bool $value + * * @return Options */ - public function setHtmlSpecialCharsDecode(bool $value): self { + public function setHtmlSpecialCharsDecode(bool $value): self + { $this->options['htmlSpecialCharsDecode'] = $value; - return $this; - } - /** - * A magic get to call the get() method. - * - * @param string $key - * - * @return mixed - * - * @uses $this->get() - */ - public function __get($key) - { - return $this->get($key); + return $this; } /** * Sets a new options param to override the current option array. * * @chainable + * * @throws UnknownOptionException */ public function setOptions(array $options): Options { foreach ($options as $key => $option) { - if (!array_key_exists($key, $this->defaults)) { + if (!\array_key_exists($key, $this->defaults)) { throw new UnknownOptionException("Option '$key' is not recognized"); } $this->options[$key] = $option; @@ -229,11 +251,12 @@ public function get(string $key) } /** - * Return current options as array + * Return current options as array. * * @return array */ - public function asArray() { + public function asArray() + { return $this->options; } } diff --git a/src/PHPHtmlParser/Selector/Parser.php b/src/PHPHtmlParser/Selector/Parser.php index 0f987903..a70a7a5e 100755 --- a/src/PHPHtmlParser/Selector/Parser.php +++ b/src/PHPHtmlParser/Selector/Parser.php @@ -4,8 +4,13 @@ namespace PHPHtmlParser\Selector; +use PHPHtmlParser\Contracts\Selector\ParserInterface; +use PHPHtmlParser\DTO\Selector\ParsedSelectorCollectionDTO; +use PHPHtmlParser\DTO\Selector\ParsedSelectorDTO; +use PHPHtmlParser\DTO\Selector\RuleDTO; + /** - * This is the parser for the selector. 
+ * This is the default parser for the selector. */ class Parser implements ParserInterface { @@ -14,20 +19,19 @@ class Parser implements ParserInterface * * @var string */ - protected $pattern = "/([\w\-:\*>]*)(?:\#([\w\-]+)|\.([\w\.\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; + private $pattern = "/([\w\-:\*>]*)(?:\#([\w\-]+)|\.([\w\.\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; /** * Parses the selector string. */ - public function parseSelectorString(string $selector): array + public function parseSelectorString(string $selector): ParsedSelectorCollectionDTO { $selectors = []; - $matches = []; + $rules = []; \preg_match_all($this->pattern, \trim($selector) . ' ', $matches, PREG_SET_ORDER); // skip tbody - $result = []; foreach ($matches as $match) { // default values $tag = \strtolower(\trim($match[1])); @@ -88,25 +92,25 @@ public function parseSelectorString(string $selector): array $noKey = true; } - $result[] = [ + $rules[] = new RuleDTO([ 'tag' => $tag, 'key' => $key, 'value' => $value, 'operator' => $operator, 'noKey' => $noKey, 'alterNext' => $alterNext, - ]; + ]); if (isset($match[7]) && \is_string($match[7]) && \trim($match[7]) == ',') { - $selectors[] = $result; - $result = []; + $selectors[] = new ParsedSelectorDTO($rules); + $rules = []; } } // save last results - if (\count($result) > 0) { - $selectors[] = $result; + if (\count($rules) > 0) { + $selectors[] = new ParsedSelectorDTO($rules); } - return $selectors; + return new ParsedSelectorCollectionDTO($selectors); } } diff --git a/src/PHPHtmlParser/Selector/ParserInterface.php b/src/PHPHtmlParser/Selector/ParserInterface.php deleted file mode 100755 index 755966ae..00000000 --- a/src/PHPHtmlParser/Selector/ParserInterface.php +++ /dev/null @@ -1,10 +0,0 @@ -getTag() !== null && \is_numeric($rule->getKey())) { + $count = 0; + /** @var AbstractNode $node */ + foreach ($nodes as $node) { + if ($rule->getTag() == '*' + || $rule->getTag() == 
$node->getTag() + ->name() + ) { + ++$count; + if ($count == $rule->getKey()) { + // found the node we wanted + return [$node]; + } + } + } + + return []; + } + + $options = $this->flattenOptions($options); + + $return = []; + /** @var InnerNode $node */ + foreach ($nodes as $node) { + // check if we are a leaf + if ($node instanceof LeafNode || !$node->hasChildren() + ) { + continue; + } + + $children = []; + $child = $node->firstChild(); + while (!\is_null($child)) { + // wild card, grab all + if ($rule->getTag() == '*' && \is_null($rule->getKey())) { + $return[] = $child; + $child = $this->getNextChild($node, $child); + continue; + } + + $pass = $this->checkTag($rule, $child); + if ($pass && $rule->getKey() != null) { + $pass = $this->checkKey($rule, $child); + } + if ($pass && + $rule->getKey() != null && + $rule->getValue() != null && + $rule->getValue() != '*' + ) { + $pass = $this->checkComparison($rule, $child); + } + + if ($pass) { + // it passed all checks + $return[] = $child; + } else { + // this child failed to be matched + if ($child instanceof InnerNode && $child->hasChildren() + ) { + if ($depthFirst) { + if (!isset($options['checkGrandChildren']) + || $options['checkGrandChildren'] + ) { + // we have a child that failed but are not leaves. + $matches = $this->seek([$child], $rule, $options, $depthFirst); + foreach ($matches as $match) { + $return[] = $match; + } + } + } else { + // we still want to check its children + $children[] = $child; + } + } + } + + $child = $this->getNextChild($node, $child); + } + + if ((!isset($options['checkGrandChildren']) + || $options['checkGrandChildren']) + && \count($children) > 0 + ) { + // we have children that failed but are not leaves. + $matches = $this->seek($children, $rule, $options, $depthFirst); + foreach ($matches as $match) { + $return[] = $match; + } + } + } + + return $return; + } + + /** + * Checks comparison condition from rules against node. 
+ */ + private function checkComparison(RuleDTO $rule, AbstractNode $node): bool + { + if ($rule->getKey() == 'plaintext') { + // plaintext search + $nodeValue = $node->text(); + $result = $this->checkNodeValue($nodeValue, $rule, $node); + } else { + // normal search + if (!\is_array($rule->getKey())) { + $nodeValue = $node->getAttribute($rule->getKey()); + $result = $this->checkNodeValue($nodeValue, $rule, $node); + } else { + $result = true; + foreach ($rule->getKey() as $index => $key) { + $nodeValue = $node->getAttribute($key); + $result = $result && + $this->checkNodeValue($nodeValue, $rule, $node, $index); + } + } + } + + return $result; + } + + /** + * Flattens the option array. + * + * @return array + */ + private function flattenOptions(array $optionsArray) + { + $options = []; + foreach ($optionsArray as $optionArray) { + foreach ($optionArray as $key => $option) { + $options[$key] = $option; + } + } + + return $options; + } + + /** + * Returns the next child or null if no more children. + * + * @return AbstractNode|null + */ + private function getNextChild( + AbstractNode $node, + AbstractNode $currentChild + ) { + try { + $child = null; + if ($node instanceof InnerNode) { + // get next child + $child = $node->nextChild($currentChild->id()); + } + } catch (ChildNotFoundException $e) { + // no more children + unset($e); + $child = null; + } + + return $child; + } + + /** + * Checks tag condition from rules against node. + */ + private function checkTag(RuleDTO $rule, AbstractNode $node): bool + { + if (!empty($rule->getTag()) && $rule->getTag() != $node->getTag()->name() + && $rule->getTag() != '*' + ) { + return false; + } + + return true; + } + + /** + * Checks key condition from rules against node. 
+ */ + private function checkKey(RuleDTO $rule, AbstractNode $node): bool + { + if (!\is_array($rule->getKey())) { + if ($rule->isNoKey()) { + if ($node->getAttribute($rule->getKey()) !== null) { + return false; + } + } else { + if ($rule->getKey() != 'plaintext' + && !$node->hasAttribute($rule->getKey()) + ) { + return false; + } + } + } else { + if ($rule->isNoKey()) { + foreach ($rule->getKey() as $key) { + if (!\is_null($node->getAttribute($key))) { + return false; + } + } + } else { + foreach ($rule->getKey() as $key) { + if ($key != 'plaintext' + && !$node->hasAttribute($key) + ) { + return false; + } + } + } + } + + return true; + } + + private function checkNodeValue( + ?string $nodeValue, + RuleDTO $rule, + AbstractNode $node, + ?int $index = null + ): bool { + $check = false; + if ( + $rule->getValue() != null && + \is_string($rule->getValue()) + ) { + $check = $this->match($rule->getOperator(), $rule->getValue(), $nodeValue); + } + + // handle multiple classes + $key = $rule->getKey(); + if ( + !$check && + $key == 'class' && + \is_array($rule->getValue()) + ) { + $nodeClasses = \explode(' ', $node->getAttribute('class') ?? ''); + foreach ($rule->getValue() as $value) { + foreach ($nodeClasses as $class) { + if ( + !empty($class) && + \is_string($rule->getOperator()) + ) { + $check = $this->match($rule->getOperator(), $value, $class); + } + if ($check) { + break; + } + } + if (!$check) { + break; + } + } + } elseif ( + !$check && + \is_array($key) && + !\is_null($nodeValue) && + \is_string($rule->getOperator()) && + \is_string($rule->getValue()[$index]) + ) { + $check = $this->match($rule->getOperator(), $rule->getValue()[$index], $nodeValue); + } + + return $check; + } + + /** + * Attempts to match the given arguments with the given operator. 
+ */ + private function match( + string $operator, + string $pattern, + string $value + ): bool { + $value = \strtolower($value); + $pattern = \strtolower($pattern); + switch ($operator) { + case '=': + return $value === $pattern; + case '!=': + return $value !== $pattern; + case '^=': + return \preg_match('/^' . \preg_quote($pattern, '/') . '/', + $value) == 1; + case '$=': + return \preg_match('/' . \preg_quote($pattern, '/') . '$/', + $value) == 1; + case '*=': + if ($pattern[0] == '/') { + return \preg_match($pattern, $value) == 1; + } + + return \preg_match('/' . $pattern . '/i', $value) == 1; + } + + return false; + } +} diff --git a/src/PHPHtmlParser/Selector/Selector.php b/src/PHPHtmlParser/Selector/Selector.php index 21d6920d..95c47001 100755 --- a/src/PHPHtmlParser/Selector/Selector.php +++ b/src/PHPHtmlParser/Selector/Selector.php @@ -4,33 +4,51 @@ namespace PHPHtmlParser\Selector; +use PHPHtmlParser\Contracts\Selector\ParserInterface; +use PHPHtmlParser\Contracts\Selector\SeekerInterface; +use PHPHtmlParser\Contracts\Selector\SelectorInterface; +use PHPHtmlParser\Discovery\ParserDiscovery; +use PHPHtmlParser\Discovery\SeekerDiscovery; use PHPHtmlParser\Dom\AbstractNode; use PHPHtmlParser\Dom\Collection; -use PHPHtmlParser\Dom\InnerNode; -use PHPHtmlParser\Dom\LeafNode; +use PHPHtmlParser\DTO\Selector\ParsedSelectorCollectionDTO; +use PHPHtmlParser\DTO\Selector\RuleDTO; use PHPHtmlParser\Exceptions\ChildNotFoundException; /** * Class Selector. */ -class Selector +class Selector implements SelectorInterface { /** - * @var array + * @var ParsedSelectorCollectionDTO */ - protected $selectors = []; + private $ParsedSelectorCollectionDTO; /** * @var bool */ private $depthFirst = false; + /** + * @var SeekerInterface + */ + private $seeker; + /** * Constructs with the selector string. 
*/ - public function __construct(string $selector, ParserInterface $parser) + public function __construct(string $selector, ?ParserInterface $parser = null, ?SeekerInterface $seeker = null) { - $this->selectors = $parser->parseSelectorString($selector); + if ($parser == null) { + $parser = ParserDiscovery::find(); + } + if ($seeker == null) { + $seeker = SeekerDiscovery::find(); + } + + $this->ParsedSelectorCollectionDTO = $parser->parseSelectorString($selector); + $this->seeker = $seeker; } /** @@ -38,9 +56,9 @@ public function __construct(string $selector, ParserInterface $parser) * * @return array */ - public function getSelectors() + public function getParsedSelectorCollectionDTO(): ParsedSelectorCollectionDTO { - return $this->selectors; + return $this->ParsedSelectorCollectionDTO; } public function setDepthFirstFind(bool $status): void @@ -57,19 +75,19 @@ public function setDepthFirstFind(bool $status): void public function find(AbstractNode $node): Collection { $results = new Collection(); - foreach ($this->selectors as $selector) { + foreach ($this->ParsedSelectorCollectionDTO->getParsedSelectorDTO() as $selector) { $nodes = [$node]; - if (\count($selector) == 0) { + if (\count($selector->getRules()) == 0) { continue; } $options = []; - foreach ($selector as $rule) { - if ($rule['alterNext']) { + foreach ($selector->getRules() as $rule) { + if ($rule->isAlterNext()) { $options[] = $this->alterNext($rule); continue; } - $nodes = $this->seek($nodes, $rule, $options); + $nodes = $this->seeker->seek($nodes, $rule, $options, $this->depthFirst); // clear the options $options = []; } @@ -83,326 +101,17 @@ public function find(AbstractNode $node): Collection return $results; } - /** - * Checks comparison condition from rules against node. 
- */ - public function checkComparison(array $rule, AbstractNode $node): bool - { - if ($rule['key'] == 'plaintext') { - // plaintext search - $nodeValue = $node->text(); - $result = $this->checkNodeValue($nodeValue, $rule, $node); - } else { - // normal search - if (!\is_array($rule['key'])) { - $nodeValue = $node->getAttribute($rule['key']); - $result = $this->checkNodeValue($nodeValue, $rule, $node); - } else { - $result = true; - foreach ($rule['key'] as $index => $key) { - $nodeValue = $node->getAttribute($key); - $result = $result && - $this->checkNodeValue($nodeValue, $rule, $node, $index); - } - } - } - - return $result; - } - - /** - * Attempts to find all children that match the rule - * given. - * - * @throws ChildNotFoundException - */ - protected function seek(array $nodes, array $rule, array $options): array - { - // XPath index - if (\array_key_exists('tag', $rule) && \array_key_exists('key', $rule) - && \is_numeric($rule['key']) - ) { - $count = 0; - /** @var AbstractNode $node */ - foreach ($nodes as $node) { - if ($rule['tag'] == '*' - || $rule['tag'] == $node->getTag() - ->name() - ) { - ++$count; - if ($count == $rule['key']) { - // found the node we wanted - return [$node]; - } - } - } - - return []; - } - - $options = $this->flattenOptions($options); - - $return = []; - /** @var InnerNode $node */ - foreach ($nodes as $node) { - // check if we are a leaf - if ($node instanceof LeafNode || !$node->hasChildren() - ) { - continue; - } - - $children = []; - $child = $node->firstChild(); - while (!\is_null($child)) { - // wild card, grab all - if ($rule['tag'] == '*' && \is_null($rule['key'])) { - $return[] = $child; - $child = $this->getNextChild($node, $child); - continue; - } - - $pass = $this->checkTag($rule, $child); - if ($pass && !\is_null($rule['key'])) { - $pass = $this->checkKey($rule, $child); - } - if ($pass && !\is_null($rule['key']) && !\is_null($rule['value']) - && $rule['value'] != '*' - ) { - $pass = $this->checkComparison($rule, 
$child); - } - - if ($pass) { - // it passed all checks - $return[] = $child; - } else { - // this child failed to be matched - if ($child instanceof InnerNode && $child->hasChildren() - ) { - if ($this->depthFirst) { - if (!isset($options['checkGrandChildren']) - || $options['checkGrandChildren'] - ) { - // we have a child that failed but are not leaves. - $matches = $this->seek([$child], $rule, - $options); - foreach ($matches as $match) { - $return[] = $match; - } - } - } else { - // we still want to check its children - $children[] = $child; - } - } - } - - $child = $this->getNextChild($node, $child); - } - - if ((!isset($options['checkGrandChildren']) - || $options['checkGrandChildren']) - && \count($children) > 0 - ) { - // we have children that failed but are not leaves. - $matches = $this->seek($children, $rule, $options); - foreach ($matches as $match) { - $return[] = $match; - } - } - } - - return $return; - } - - /** - * Attempts to match the given arguments with the given operator. - */ - protected function match( - string $operator, - string $pattern, - string $value - ): bool { - $value = \strtolower($value); - $pattern = \strtolower($pattern); - switch ($operator) { - case '=': - return $value === $pattern; - case '!=': - return $value !== $pattern; - case '^=': - return \preg_match('/^' . \preg_quote($pattern, '/') . '/', - $value) == 1; - case '$=': - return \preg_match('/' . \preg_quote($pattern, '/') . '$/', - $value) == 1; - case '*=': - if ($pattern[0] == '/') { - return \preg_match($pattern, $value) == 1; - } - - return \preg_match('/' . $pattern . '/i', $value) == 1; - } - - return false; - } - /** * Attempts to figure out what the alteration will be for * the next element. 
*/ - protected function alterNext(array $rule): array + private function alterNext(RuleDTO $rule): array { $options = []; - if ($rule['tag'] == '>') { + if ($rule->getTag() == '>') { $options['checkGrandChildren'] = false; } return $options; } - - /** - * Flattens the option array. - * - * @return array - */ - protected function flattenOptions(array $optionsArray) - { - $options = []; - foreach ($optionsArray as $optionArray) { - foreach ($optionArray as $key => $option) { - $options[$key] = $option; - } - } - - return $options; - } - - /** - * Returns the next child or null if no more children. - * - * @return AbstractNode|null - */ - protected function getNextChild( - AbstractNode $node, - AbstractNode $currentChild - ) { - try { - $child = null; - if ($node instanceof InnerNode) { - // get next child - $child = $node->nextChild($currentChild->id()); - } - } catch (ChildNotFoundException $e) { - // no more children - unset($e); - $child = null; - } - - return $child; - } - - /** - * Checks tag condition from rules against node. - */ - protected function checkTag(array $rule, AbstractNode $node): bool - { - if (!empty($rule['tag']) && $rule['tag'] != $node->getTag()->name() - && $rule['tag'] != '*' - ) { - return false; - } - - return true; - } - - /** - * Checks key condition from rules against node. 
- */ - protected function checkKey(array $rule, AbstractNode $node): bool - { - if (!\is_array($rule['key'])) { - if ($rule['noKey']) { - if (!\is_null($node->getAttribute($rule['key']))) { - return false; - } - } else { - if ($rule['key'] != 'plaintext' - && !$node->hasAttribute($rule['key']) - ) { - return false; - } - } - } else { - if ($rule['noKey']) { - foreach ($rule['key'] as $key) { - if (!\is_null($node->getAttribute($key))) { - return false; - } - } - } else { - foreach ($rule['key'] as $key) { - if ($key != 'plaintext' - && !$node->hasAttribute($key) - ) { - return false; - } - } - } - } - - return true; - } - - private function checkNodeValue( - ?string $nodeValue, - array $rule, - AbstractNode $node, - ?int $index = null - ): bool { - $check = false; - if ( - \array_key_exists('value', $rule) && !\is_array($rule['value']) && - !\is_null($nodeValue) && - \array_key_exists('operator', $rule) && \is_string($rule['operator']) && - \array_key_exists('value', $rule) && \is_string($rule['value']) - ) { - $check = $this->match($rule['operator'], $rule['value'], $nodeValue); - } - - // handle multiple classes - $key = $rule['key']; - if ( - !$check && - $key == 'class' && - \array_key_exists('value', $rule) && \is_array($rule['value']) - ) { - $nodeClasses = \explode(' ', $node->getAttribute('class') ?? 
''); - foreach ($rule['value'] as $value) { - foreach ($nodeClasses as $class) { - if ( - !empty($class) && - \array_key_exists('operator', $rule) && \is_string($rule['operator']) - ) { - $check = $this->match($rule['operator'], $value, $class); - } - if ($check) { - break; - } - } - if (!$check) { - break; - } - } - } elseif ( - !$check && - \is_array($key) && - !\is_null($nodeValue) && - \array_key_exists('operator', $rule) && \is_string($rule['operator']) && - \array_key_exists('value', $rule) && \is_string($rule['value'][$index]) - ) { - $check = $this->match($rule['operator'], $rule['value'][$index], $nodeValue); - } - - return $check; - } } diff --git a/tests/OptionsTest.php b/tests/OptionsTest.php index ed83b177..91c62591 100755 --- a/tests/OptionsTest.php +++ b/tests/OptionsTest.php @@ -4,8 +4,8 @@ use PHPHtmlParser\Dom; use PHPHtmlParser\Exceptions\UnknownOptionException; -use PHPUnit\Framework\TestCase; use PHPHtmlParser\Options; +use PHPUnit\Framework\TestCase; class OptionsTest extends TestCase { @@ -18,7 +18,7 @@ public function testDefaultWhitespaceTextNode() public function testSettingOption() { - $options = new Options; + $options = new Options(); $options->setOptions([ 'strict' => true, ]); @@ -42,7 +42,7 @@ public function testOverwritingOption() $options->setOptions([ 'strict' => false, ])->setOptions([ - 'strict' => true, + 'strict' => true, 'whitespaceTextNode' => false, ]); @@ -56,7 +56,8 @@ public function testGettingNoOption() $this->assertEquals(null, $options->get('doesnotexist')); } - public function testSetters() { + public function testSetters() + { $options = new Options(); $options->setOptions([ @@ -79,8 +80,8 @@ public function testSetters() { $options->setStrict(true); $this->assertTrue($options->get('strict')); - $options->setEnforceEncoding("utf8"); - $this->assertEquals("utf8", $options->get('enforceEncoding')); + $options->setEnforceEncoding('utf8'); + $this->assertEquals('utf8', $options->get('enforceEncoding')); 
$options->setCleanupInput(true); $this->assertTrue($options->get('cleanupInput')); @@ -142,8 +143,9 @@ public function testSetters() { $this->assertFalse($options->get('htmlSpecialCharsDecode')); } - public function testUnknownOptionDom() { - $dom = new Dom; + public function testUnknownOptionDom() + { + $dom = new Dom(); $dom->setOptions([ 'unknown_option' => true, ]); diff --git a/tests/Selector/SelectorTest.php b/tests/Selector/SelectorTest.php index 261b3cb8..d2a12a59 100755 --- a/tests/Selector/SelectorTest.php +++ b/tests/Selector/SelectorTest.php @@ -13,29 +13,29 @@ class SelectorTest extends TestCase public function testParseSelectorStringId() { $selector = new Selector('#all', new Parser()); - $selectors = $selector->getSelectors(); - $this->assertEquals('id', $selectors[0][0]['key']); + $selectors = $selector->getParsedSelectorCollectionDTO(); + $this->assertEquals('id', $selectors->getParsedSelectorDTO()[0]->getRules()[0]->getKey()); } public function testParseSelectorStringClass() { $selector = new Selector('div.post', new Parser()); - $selectors = $selector->getSelectors(); - $this->assertEquals('class', $selectors[0][0]['key']); + $selectors = $selector->getParsedSelectorCollectionDTO(); + $this->assertEquals('class', $selectors->getParsedSelectorDTO()[0]->getRules()[0]->getKey()); } public function testParseSelectorStringAttribute() { $selector = new Selector('div[visible=yes]', new Parser()); - $selectors = $selector->getSelectors(); - $this->assertEquals('yes', $selectors[0][0]['value']); + $selectors = $selector->getParsedSelectorCollectionDTO(); + $this->assertEquals('yes', $selectors->getParsedSelectorDTO()[0]->getRules()[0]->getValue()); } public function testParseSelectorStringNoKey() { $selector = new Selector('div[!visible]', new Parser()); - $selectors = $selector->getSelectors(); - $this->assertTrue($selectors[0][0]['noKey']); + $selectors = $selector->getParsedSelectorCollectionDTO(); + 
$this->assertTrue($selectors->getParsedSelectorDTO()[0]->getRules()[0]->isNoKey()); } public function testFind() From 04321f991ba37b9b47ee22ae52dc2319c353a6b0 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Mon, 11 May 2020 00:52:31 +0000 Subject: [PATCH 34/68] fixes #82 --- CHANGELOG.md | 2 + src/PHPHtmlParser/Dom.php | 58 +- src/PHPHtmlParser/Selector/Seeker.php | 4 +- tests/DomTest.php | 71 + tests/Selector/SeekerTest.php | 25 + tests/data/files/big.html | 2 +- tests/data/files/html5.html | 2957 +++++++++++++++++++++++++ 7 files changed, 3096 insertions(+), 23 deletions(-) create mode 100644 tests/Selector/SeekerTest.php create mode 100644 tests/data/files/html5.html diff --git a/CHANGELOG.md b/CHANGELOG.md index 770a5d92..8daa5304 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,12 +10,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Added support for PSR7 HTTP clients and requests for URL calls. - Added PHAN support and fixed all issues from PHAN. +- Added support for html5 charset detection. ### Changed - Fixed issue with \ causing an infite loop. - CDATA should not be altered when cleanupInput is false. - Added tag attribute DTO. - Cleaned up the selector logic. +- Fixed issue with greedy regex for charset detection. ### Removed - Removed curl interface and curl implementation. diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index 9a980ab5..8c2ebcde 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -39,42 +39,42 @@ class Dom * * @var string */ - protected $defaultCharset = 'UTF-8'; + private $defaultCharset = 'UTF-8'; /** * The raw version of the document string. * * @var string */ - protected $raw; + private $raw; /** * The document string. * * @var Content */ - protected $content; + private $content; /** * The original file size of the document. * * @var int */ - protected $rawSize; + private $rawSize; /** * The size of the document after it is cleaned. 
* * @var int */ - protected $size; + private $size; /** * A global options array to be used by all load calls. * * @var array */ - protected $globalOptions = []; + private $globalOptions = []; /** * A persistent option object to be used for all options in the @@ -82,14 +82,14 @@ class Dom * * @var Options */ - protected $options; + private $options; /** * A list of tags which will always be self closing. * * @var array */ - protected $selfClosing = [ + private $selfClosing = [ 'area', 'base', 'basefont', @@ -114,7 +114,7 @@ class Dom * * @var array */ - protected $noSlash = []; + private $noSlash = []; /** * Returns the inner html of the root node. @@ -173,7 +173,7 @@ public function load(string $str, array $options = []): Dom */ public function loadFromFile(string $file, array $options = []): Dom { - $content = \file_get_contents($file); + $content = @\file_get_contents($file); if ($content === false) { throw new LogicalException('file_get_contents failed and returned false when trying to read "' . $file . '".'); } @@ -496,7 +496,7 @@ public function getElementsByClass(string $class) * * @throws NotLoadedException */ - protected function isLoaded(): void + private function isLoaded(): void { if (\is_null($this->content)) { throw new NotLoadedException('Content is not loaded!'); @@ -506,7 +506,7 @@ protected function isLoaded(): void /** * Cleans the html of any none-html information. 
*/ - protected function clean(string $str): string + private function clean(string $str): string { if ($this->options->get('cleanupInput') != true) { // skip entire cleanup step @@ -610,7 +610,7 @@ protected function clean(string $str): string * @throws StrictException * @throws LogicalException */ - protected function parse(): void + private function parse(): void { // add the root node $this->root = new HtmlNode('root'); @@ -679,7 +679,7 @@ protected function parse(): void * * @throws StrictException */ - protected function parseTag(): array + private function parseTag(): array { $return = [ 'status' => false, @@ -823,7 +823,7 @@ protected function parseTag(): array * * @throws ChildNotFoundException */ - protected function detectCharset(): bool + private function detectCharset(): bool { // set the default $encode = new Encode(); @@ -841,11 +841,15 @@ protected function detectCharset(): bool /** @var AbstractNode $meta */ $meta = $this->root->find('meta[http-equiv=Content-Type]', 0); - if (\is_null($meta)) { - // could not find meta tag - $this->root->propagateEncoding($encode); + if ($meta == null) { + if (!$this->detectHTML5Charset($encode)) { + // could not find meta tag + $this->root->propagateEncoding($encode); - return false; + return false; + } + + return true; } $content = $meta->getAttribute('content'); if (\is_null($content)) { @@ -855,7 +859,7 @@ protected function detectCharset(): bool return false; } $matches = []; - if (\preg_match('/charset=(.+)/', $content, $matches)) { + if (\preg_match('/charset=([^;]+)/', $content, $matches)) { $encode->from(\trim($matches[1])); $this->root->propagateEncoding($encode); @@ -867,4 +871,18 @@ protected function detectCharset(): bool return false; } + + private function detectHTML5Charset(Encode $encode): bool + { + /** @var AbstractNode|null $meta */ + $meta = $this->root->find('meta[charset]', 0); + if ($meta == null) { + return false; + } + + $encode->from(\trim($meta->getAttribute('charset'))); + 
$this->root->propagateEncoding($encode); + + return true; + } } diff --git a/src/PHPHtmlParser/Selector/Seeker.php b/src/PHPHtmlParser/Selector/Seeker.php index e9ed2484..971c40f9 100644 --- a/src/PHPHtmlParser/Selector/Seeker.php +++ b/src/PHPHtmlParser/Selector/Seeker.php @@ -314,8 +314,8 @@ private function match( } return \preg_match('/' . $pattern . '/i', $value) == 1; + default: + return false; } - - return false; } } diff --git a/tests/DomTest.php b/tests/DomTest.php index 0a50021e..ea570561 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -196,6 +196,13 @@ public function testLoadFromFileFind() $this->assertEquals('VonBurgermeister', $dom->find('.post-row div .post-user font', 0)->text); } + public function testLoadFromFileNotFound() + { + $dom = new Dom(); + $this->expectException(\PHPHtmlParser\Exceptions\LogicalException::class); + $dom->loadFromFile('tests/data/files/unkowne.html'); + } + public function testLoadUtf8() { $dom = new Dom(); @@ -531,6 +538,60 @@ public function testMultipleSquareSelector() $this->assertEquals(1, \count($items)); } + public function testNotSquareSelector() + { + $dom = new Dom(); + $dom->load(''); + + $items = $dom->find('input[type!=foo]'); + $this->assertEquals(1, \count($items)); + } + + public function testStartSquareSelector() + { + $dom = new Dom(); + $dom->load(''); + + $items = $dom->find('input[name^=f]'); + $this->assertEquals(1, \count($items)); + } + + public function testEndSquareSelector() + { + $dom = new Dom(); + $dom->load(''); + + $items = $dom->find('input[baz$=g]'); + $this->assertEquals(1, \count($items)); + } + + public function testStarSquareSelector() + { + $dom = new Dom(); + $dom->load(''); + + $items = $dom->find('input[baz*=*]'); + $this->assertEquals(1, \count($items)); + } + + public function testStarFullRegexSquareSelector() + { + $dom = new Dom(); + $dom->load(''); + + $items = $dom->find('input[baz*=/\w+/]'); + $this->assertEquals(1, \count($items)); + } + + public function 
testFailedSquareSelector() + { + $dom = new Dom(); + $dom->load(''); + + $items = $dom->find('input[baz%=g]'); + $this->assertEquals(1, \count($items)); + } + public function testLoadGetAttributeWithBackslash() { $dom = new Dom(); @@ -547,4 +608,14 @@ public function test25ChildrenFound() $children = $dom->find('#red-line-g *'); $this->assertEquals(25, \count($children)); } + + public function testHtml5PageLoad() + { + $dom = new Dom(); + $dom->loadFromFile('tests/data/files/html5.html'); + + /** @var Dom\AbstractNode $meta */ + $div = $dom->find('div.d-inline-block', 0); + $this->assertEquals('max-width: 29px', $div->getAttribute('style')); + } } diff --git a/tests/Selector/SeekerTest.php b/tests/Selector/SeekerTest.php new file mode 100644 index 00000000..4e2d9e4f --- /dev/null +++ b/tests/Selector/SeekerTest.php @@ -0,0 +1,25 @@ + 'tag', + 'key' => 1, + 'value' => null, + 'operator' => null, + 'noKey' => false, + 'alterNext' => false, + ]); + $seeker = new Seeker(); + $results = $seeker->seek([], $ruleDTO, [], false); + $this->assertCount(0, $results); + } +} diff --git a/tests/data/files/big.html b/tests/data/files/big.html index 6b5e3ee5..a26f5093 100755 --- a/tests/data/files/big.html +++ b/tests/data/files/big.html @@ -2,7 +2,7 @@ - + diff --git a/tests/data/files/html5.html b/tests/data/files/html5.html new file mode 100644 index 00000000..b2b1413d --- /dev/null +++ b/tests/data/files/html5.html @@ -0,0 +1,2957 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + Dom.php uses greedy RegEx to match charset · Issue #82 · paquettg/php-html-parser + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Skip to content + + + + + + + + + + + + + +
+ +
+ + + + +
+ + + +
+ + + + + + + + + +
+
+
+ + + + + + + + + + + + + +
+ +
+ +
+

+ + + + + / + + php-html-parser + + +

+ + +
+ +
    + + + +
  • + + + + +
  • + +
  • + +
    + +
    + + + Unwatch + + +
    + Notifications +
    +
    + + + + + + + +
    +
    +
    + +
    +
  • + +
  • +
    +
    + + +
    +
    + + +
    + +
  • + +
  • + + + Fork + + +
  • +
+ +
+ + + + + +
+ + + + + + +
+
+ + + +
+ + + + +
+ + +
+
+
+
+ + + + New issue + + +
+ +

+ + Dom.php uses greedy RegEx to match charset + + #82 +

+
+
+ +
+
+ +
+ + +
+
+
+
+ + + Open + + + +
+
+ thinkingmedia opened this issue + Jul 17, 2016 + · 0 comments + + + + + +
+
+ + +
+
+
+
+
+ + + Open + + + +
+
+

+ Dom.php uses greedy RegEx to match charset + #82 +

+ +
+ thinkingmedia opened this issue + Jul 17, 2016 + · 0 comments + + + + + +
+
+
+
+
+
+
+
+ + + +
+ +
+ Labels +
+ bug +
+
+ +
+ Projects + +
+ +
+ Milestone + +
+
+ + +

Comments

+
+
+ +
+
+ +
+ +
+ + +
+ @thinkingmedia + +
+ + +
+
+ + + +
+
+ + + + + + + + + + + + + + + + + +
+ + + + + +
+

+ Pick your reaction +

+ + + +
+ + + + + + + + + + +
+
+ +
+ + + + +
+ + + + + + Copy link + + + + + + + + + + + + Report content + + + +
+ +
+ + +
+ + + + + + +
+ +

+ + + @thinkingmedia + + + + + thinkingmedia + + + + + + commented + + + Jul 17, 2016 + + + + +

+
+ + +
+ + + + + + + + + +
+

I found an edge case where Dom.php would call detectCharset and extract an invalid charset

+

The example comes from https://duckduckgo.com/

+

They have this meta tag.

+
    <meta http-equiv="content-type" content="text/html; charset=UTF-8;charset=utf-8">
+
+

The problem is that this section of Dom.php uses a greedy regex.

+
        $matches = [];
+        if (preg_match('/charset=(.+)/', $content, $matches)) {
+            $encode->from(trim($matches[1]));
+            $this->root->propagateEncoding($encode);
+
+            return true;
+        }
+
+

So I changed it to this and it works.

+
        if (preg_match('/charset=([^;]+)/', $content, $matches)) {
+
+

I use the ; character as a terminator for the charset identifier.

+

I also noticed that the Dom.php does not support this meta tag.

+
<meta charset="utf-8" />
+
+

This is the new charset identifier for HTML5.

+
+
+ + + + +
+ +
+ +
+ +
+
+ + +
+ + +
+ +
+ +
+ + +
+ + + + + + + + + + + +
+ +
+ + + + + + + + + + + + + + + +
+ +
+ + + + + + + + + + + +
+ +
+ + + + + + + + + + + +
+ + + + + +
+ Select a reply + ctrl . +
+ + + +
+
+ +
+ +
+ + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+ + +
+ +

+ The content you are editing has changed. Please try again. +

+ + + +
+ + + + + + + + + + + + + + + + +
+
+ +
+
+
+

Nothing to preview

+
+
+ +
+ +
+ + +
+ + +
+
+ + +
+ +
+
+ +
+ + + + +
+ + +
+ + + + +
+ +
+
+ +
+
+ + + + @paquettg + paquettg + + + + + + added + the + + bug + + label + + + Jan 30, 2019 + +
+
+ + +
+
+ +
+
+ + + + @paquettg + paquettg + + + + + + added this to the 3.0.0 milestone + + + Aug 18, 2019 + +
+
+ + +
+
+ +
+
+ + + + @paquettg + paquettg + + + + added this to To do + in 3.0.0 + + + Aug 18, 2019 + +
+
+ + + + +
+ + + + + + + +
+
+ +
+ +
+ + +
+ + + +
+
+
+ @paquettg +
+ +
+ +
+ + + +
+
+ + +
+ +
+ +
+ + +
+ + + + + + + + + + + +
+ +
+ + + + + + + + + + + + + + + +
+ +
+ + + + + + + + + + + +
+ +
+ + + + + + + + + + + +
+ + + + + +
+ Select a reply + ctrl . +
+ + + +
+
+ +
+ +
+ + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+ + + + +
+ + + + + + + + + +
+
+ + + + + +
+ +
+
+ +
+
+ + +
+
+ +
+
+
+ +
+
+
+
+ Remember, contributions to this repository should follow + its + contributing guidelines. +
+ + +
+ + +
+
+ +
+ +
+ +
+ + + + + + + +
+
+ +
+ + + + Projects + + + +
+ Projects + +
+ + + + +
+
+ + + + + +
+
+ +
+
+ + 3.0.0 +
+
+ +    + +
+ +
+ +
+ +
+
+
+ + +
+ +
+ + + + +
+
+ +
+ + + + Linked pull requests + + + +
+ Link a pull request from this repository + +
+ + + + +
+
+ + + +

Successfully merging a pull request may close this issue.

+ + None yet + +
+
+ + + +
+
+ +
+ 2 participants +
+ +
+
+ + + +
+
+ + + Lock conversation + + +
+
+ +

+ Lock conversation on this issue +

+
+
+
    +
  • Other users can’t add new comments to this issue.
  • +
  • + You and other collaborators + with access + to this repository can still leave comments that others can see. +
  • +
  • You can always unlock this issue again in the future.
  • +
+ +
+
+ +
+
+ +

+ Optionally, choose a reason for locking that others can see. Learn more about when + it’s appropriate to lock conversations. +

+
+
+
+ +
+
+
+ +
+
+ + +
+ + +
+
+
+ + + Transfer issue + + + + +
+ Loading transfer form... +
+
+
+
+
+
+ +
+
+ + + Delete issue + + + +
+ + + + +
+

Are you sure you want to delete this issue?

+
+
    +
  • This cannot be undone
  • +
  • Only administrators can delete issues
  • +
  • Deletion will remove the issue from search and previous references will point to a placeholder
  • +
+
+ + +
+
+
+
+ + + +
+ + +
+
+ + +
+
+ +
+ + +
+
+ +
+
+ + +
+ + + + + + +
+ + + You can’t perform that action at this time. +
+ + + + + + + + + + + + + + + + + + + \ No newline at end of file From 9d8149016d0eb45b6695d860dae7581dbdcc4b98 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Mon, 11 May 2020 01:01:36 +0000 Subject: [PATCH 35/68] fix #103 --- CHANGELOG.md | 10 ++++---- src/PHPHtmlParser/Selector/Seeker.php | 34 +++++++++++++-------------- tests/DomTest.php | 14 ++++++++++- 3 files changed, 35 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8daa5304..1de2d167 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,9 +8,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added -- Added support for PSR7 HTTP clients and requests for URL calls. -- Added PHAN support and fixed all issues from PHAN. -- Added support for html5 charset detection. +- Support for PSR7 HTTP clients and requests for URL calls has been added. +- PHAN support and fixed all issues from PHAN has been added. +- PHP-CS-Fixer added. +- Support for html5 charset detection. +- Added the ability to match both parent and children. ### Changed - Fixed issue with \ causing an infite loop. @@ -20,7 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed issue with greedy regex for charset detection. ### Removed -- Removed curl interface and curl implementation. +- Curl interface and curl implementation has been removed. ## 2.2.0 diff --git a/src/PHPHtmlParser/Selector/Seeker.php b/src/PHPHtmlParser/Selector/Seeker.php index 971c40f9..fa101e9e 100644 --- a/src/PHPHtmlParser/Selector/Seeker.php +++ b/src/PHPHtmlParser/Selector/Seeker.php @@ -17,6 +17,7 @@ class Seeker implements SeekerInterface * Attempts to find all children that match the rule * given. 
* + * @var InnerNode[] $nodes * @throws ChildNotFoundException */ public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFirst): array @@ -24,7 +25,6 @@ public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFir // XPath index if ($rule->getTag() !== null && \is_numeric($rule->getKey())) { $count = 0; - /** @var AbstractNode $node */ foreach ($nodes as $node) { if ($rule->getTag() == '*' || $rule->getTag() == $node->getTag() @@ -44,7 +44,6 @@ public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFir $options = $this->flattenOptions($options); $return = []; - /** @var InnerNode $node */ foreach ($nodes as $node) { // check if we are a leaf if ($node instanceof LeafNode || !$node->hasChildren() @@ -77,24 +76,23 @@ public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFir if ($pass) { // it passed all checks $return[] = $child; - } else { - // this child failed to be matched - if ($child instanceof InnerNode && $child->hasChildren() - ) { - if ($depthFirst) { - if (!isset($options['checkGrandChildren']) - || $options['checkGrandChildren'] - ) { - // we have a child that failed but are not leaves. - $matches = $this->seek([$child], $rule, $options, $depthFirst); - foreach ($matches as $match) { - $return[] = $match; - } + } + // this child failed to be matched + if ($child instanceof InnerNode && $child->hasChildren() + ) { + if ($depthFirst) { + if (!isset($options['checkGrandChildren']) + || $options['checkGrandChildren'] + ) { + // we have a child that failed but are not leaves. 
+ $matches = $this->seek([$child], $rule, $options, $depthFirst); + foreach ($matches as $match) { + $return[] = $match; } - } else { - // we still want to check its children - $children[] = $child; } + } else { + // we still want to check its children + $children[] = $child; } } diff --git a/tests/DomTest.php b/tests/DomTest.php index ea570561..7c29b508 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -223,7 +223,7 @@ public function testLoadFileBig() { $dom = new Dom(); $dom->loadFromFile('tests/data/files/big.html'); - $this->assertEquals(10, \count($dom->find('.content-border'))); + $this->assertEquals(20, \count($dom->find('.content-border'))); } public function testLoadFileBigTwice() @@ -618,4 +618,16 @@ public function testHtml5PageLoad() $div = $dom->find('div.d-inline-block', 0); $this->assertEquals('max-width: 29px', $div->getAttribute('style')); } + + public function testFindAttributeInBothParentAndChild() + { + $dom = new Dom(); + $dom->load(' + +'); + + /** @var Dom\AbstractNode $meta */ + $nodes = $dom->find('[attribute]'); + $this->assertCount(2, $nodes); + } } From 4e13ad24dadd0313ed48448632e1bc317b9c780c Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Tue, 12 May 2020 02:49:20 +0000 Subject: [PATCH 36/68] Removed the depricated depthFirstSearch option --- CHANGELOG.md | 1 + .../Contracts/Selector/SeekerInterface.php | 2 +- .../Contracts/Selector/SelectorInterface.php | 2 -- src/PHPHtmlParser/Dom.php | 7 +----- src/PHPHtmlParser/Dom/AbstractNode.php | 3 +-- src/PHPHtmlParser/Options.php | 17 -------------- src/PHPHtmlParser/Selector/Seeker.php | 23 ++++++++----------- src/PHPHtmlParser/Selector/Selector.php | 12 +--------- tests/DomTest.php | 13 ----------- tests/OptionsTest.php | 7 ------ 10 files changed, 14 insertions(+), 73 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1de2d167..3cd22cd0 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ and this project adheres to [Semantic 
Versioning](https://semver.org/spec/v2.0.0 ### Removed - Curl interface and curl implementation has been removed. +- Removed support for the depth first search option. ## 2.2.0 diff --git a/src/PHPHtmlParser/Contracts/Selector/SeekerInterface.php b/src/PHPHtmlParser/Contracts/Selector/SeekerInterface.php index cca4eb54..23357795 100644 --- a/src/PHPHtmlParser/Contracts/Selector/SeekerInterface.php +++ b/src/PHPHtmlParser/Contracts/Selector/SeekerInterface.php @@ -13,5 +13,5 @@ interface SeekerInterface * * @throws ChildNotFoundException */ - public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFirst): array; + public function seek(array $nodes, RuleDTO $rule, array $options): array; } diff --git a/src/PHPHtmlParser/Contracts/Selector/SelectorInterface.php b/src/PHPHtmlParser/Contracts/Selector/SelectorInterface.php index 8eca7d1e..c1aceeb9 100644 --- a/src/PHPHtmlParser/Contracts/Selector/SelectorInterface.php +++ b/src/PHPHtmlParser/Contracts/Selector/SelectorInterface.php @@ -21,8 +21,6 @@ public function __construct(string $selector, ?ParserInterface $parser = null, ? */ public function getParsedSelectorCollectionDTO(): ParsedSelectorCollectionDTO; - public function setDepthFirstFind(bool $status): void; - /** * Attempts to find the selectors starting from the given * node object. 
diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index 8c2ebcde..fa659f67 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -259,12 +259,7 @@ public function find(string $selector, int $nth = null) { $this->isLoaded(); - $depthFirstSearch = $this->options->get('depthFirstSearch'); - if (\is_bool($depthFirstSearch)) { - $result = $this->root->find($selector, $nth, $depthFirstSearch); - } else { - $result = $this->root->find($selector, $nth); - } + $result = $this->root->find($selector, $nth); return $result; } diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 0d096904..596a3ae8 100755 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -439,13 +439,12 @@ public function ancestorByTag(string $tag): AbstractNode * * @return mixed|Collection|null */ - public function find(string $selectorString, ?int $nth = null, bool $depthFirst = false, ?SelectorInterface $selector = null) + public function find(string $selectorString, ?int $nth = null, ?SelectorInterface $selector = null) { if (\is_null($selector)) { $selector = new Selector($selectorString); } - $selector->setDepthFirstFind($depthFirst); $nodes = $selector->find($this); if ($nth !== null) { diff --git a/src/PHPHtmlParser/Options.php b/src/PHPHtmlParser/Options.php index e90e435f..b7e1cd17 100755 --- a/src/PHPHtmlParser/Options.php +++ b/src/PHPHtmlParser/Options.php @@ -18,7 +18,6 @@ * @property bool $preserveLineBreaks * @property bool $removeDoubleSpace * @property bool $removeSmartyScripts - * @property bool $depthFirstSearch * @property bool $htmlSpecialCharsDecode */ class Options @@ -38,7 +37,6 @@ class Options 'preserveLineBreaks' => false, 'removeDoubleSpace' => true, 'removeSmartyScripts' => true, - 'depthFirstSearch' => false, 'htmlSpecialCharsDecode' => false, ]; @@ -191,21 +189,6 @@ public function setRemoveSmartyScripts(bool $value): self return $this; } - /** - * By 
default this is set to false for legacy support. Setting this to true will change the behavior of find - * to order elements by depth first. This will properly preserve the order of elements as they where in the HTML. - * - * @return Options - * - * @deprecated This option will be removed in version 3.0.0 with the new behavior being as if it was set to true. - */ - public function setDepthFirstSearch(bool $value): self - { - $this->options['depthFirstSearch'] = $value; - - return $this; - } - /** * By default this is set to false. Setting this to true will apply the php function htmlspecialchars_decode too all attribute values and text nodes. * diff --git a/src/PHPHtmlParser/Selector/Seeker.php b/src/PHPHtmlParser/Selector/Seeker.php index fa101e9e..93d1bc1c 100644 --- a/src/PHPHtmlParser/Selector/Seeker.php +++ b/src/PHPHtmlParser/Selector/Seeker.php @@ -20,7 +20,7 @@ class Seeker implements SeekerInterface * @var InnerNode[] $nodes * @throws ChildNotFoundException */ - public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFirst): array + public function seek(array $nodes, RuleDTO $rule, array $options): array { // XPath index if ($rule->getTag() !== null && \is_numeric($rule->getKey())) { @@ -80,19 +80,14 @@ public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFir // this child failed to be matched if ($child instanceof InnerNode && $child->hasChildren() ) { - if ($depthFirst) { - if (!isset($options['checkGrandChildren']) - || $options['checkGrandChildren'] - ) { - // we have a child that failed but are not leaves. - $matches = $this->seek([$child], $rule, $options, $depthFirst); - foreach ($matches as $match) { - $return[] = $match; - } + if (!isset($options['checkGrandChildren']) + || $options['checkGrandChildren'] + ) { + // we have a child that failed but are not leaves. 
+ $matches = $this->seek([$child], $rule, $options); + foreach ($matches as $match) { + $return[] = $match; } - } else { - // we still want to check its children - $children[] = $child; } } @@ -104,7 +99,7 @@ public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFir && \count($children) > 0 ) { // we have children that failed but are not leaves. - $matches = $this->seek($children, $rule, $options, $depthFirst); + $matches = $this->seek($children, $rule, $options); foreach ($matches as $match) { $return[] = $match; } diff --git a/src/PHPHtmlParser/Selector/Selector.php b/src/PHPHtmlParser/Selector/Selector.php index 95c47001..4c45da01 100755 --- a/src/PHPHtmlParser/Selector/Selector.php +++ b/src/PHPHtmlParser/Selector/Selector.php @@ -25,11 +25,6 @@ class Selector implements SelectorInterface */ private $ParsedSelectorCollectionDTO; - /** - * @var bool - */ - private $depthFirst = false; - /** * @var SeekerInterface */ @@ -61,11 +56,6 @@ public function getParsedSelectorCollectionDTO(): ParsedSelectorCollectionDTO return $this->ParsedSelectorCollectionDTO; } - public function setDepthFirstFind(bool $status): void - { - $this->depthFirst = $status; - } - /** * Attempts to find the selectors starting from the given * node object. @@ -87,7 +77,7 @@ public function find(AbstractNode $node): Collection $options[] = $this->alterNext($rule); continue; } - $nodes = $this->seeker->seek($nodes, $rule, $options, $this->depthFirst); + $nodes = $this->seeker->seek($nodes, $rule, $options); // clear the options $options = []; } diff --git a/tests/DomTest.php b/tests/DomTest.php index 7c29b508..9922f17f 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -483,19 +483,6 @@ public function testFindOrder() $dom->load($str); $images = $dom->find('img'); - $this->assertEquals('', (string) $images[0]); - } - - public function testFindDepthFirstSearch() - { - $str = '

'; - $dom = new Dom(); - $dom->setOptions([ - 'depthFirstSearch' => true, - ]); - $dom->load($str); - $images = $dom->find('img'); - $this->assertEquals('', (string) $images[0]); } diff --git a/tests/OptionsTest.php b/tests/OptionsTest.php index 91c62591..899a0622 100755 --- a/tests/OptionsTest.php +++ b/tests/OptionsTest.php @@ -70,7 +70,6 @@ public function testSetters() 'preserveLineBreaks' => false, 'removeDoubleSpace' => false, 'removeSmartyScripts' => false, - 'depthFirstSearch' => false, 'htmlSpecialCharsDecode' => false, ]); @@ -101,9 +100,6 @@ public function testSetters() $options->setRemoveSmartyScripts(true); $this->assertTrue($options->get('removeSmartyScripts')); - $options->setDepthFirstSearch(true); - $this->assertTrue($options->get('depthFirstSearch')); - $options->setHtmlSpecialCharsDecode(true); $this->assertTrue($options->get('htmlSpecialCharsDecode')); @@ -136,9 +132,6 @@ public function testSetters() $options->setRemoveSmartyScripts(false); $this->assertFalse($options->get('removeSmartyScripts')); - $options->setDepthFirstSearch(false); - $this->assertFalse($options->get('depthFirstSearch')); - $options->setHtmlSpecialCharsDecode(false); $this->assertFalse($options->get('htmlSpecialCharsDecode')); } From 924a594e7df145511466939171ca6c1966cd0cc6 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Tue, 12 May 2020 02:55:03 +0000 Subject: [PATCH 37/68] Fix #187 --- CHANGELOG.md | 1 + src/PHPHtmlParser/Content.php | 2 +- src/PHPHtmlParser/Dom.php | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3cd22cd0..5d9b1bab 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added tag attribute DTO. - Cleaned up the selector logic. - Fixed issue with greedy regex for charset detection. +- Fixed bug causing infinite loops in some cases. ### Removed - Curl interface and curl implementation has been removed. 
diff --git a/src/PHPHtmlParser/Content.php b/src/PHPHtmlParser/Content.php index 0ae7e0e4..24bca182 100755 --- a/src/PHPHtmlParser/Content.php +++ b/src/PHPHtmlParser/Content.php @@ -164,7 +164,7 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal * * @return string */ - public function copyUntilUnless(string $string, string $unless) + public function copyUntilUnless(string $string, string $unless): string { $lastPos = $this->pos; $this->fastForward(1); diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index fa659f67..f772e707 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -749,7 +749,7 @@ private function parseTag(): array do { $moreString = $this->content->copyUntilUnless('"', '=>'); $string .= $moreString; - } while (!empty($moreString)); + } while (strlen($moreString) > 0 && $this->content->getPosition() < $this->size); $attr['value'] = $string; $this->content->fastForward(1); $node->getTag()->setAttribute($name, $string); @@ -760,7 +760,7 @@ private function parseTag(): array do { $moreString = $this->content->copyUntilUnless("'", '=>'); $string .= $moreString; - } while (!empty($moreString)); + } while (strlen($moreString) > 0 && $this->content->getPosition() < $this->size); $attr['value'] = $string; $this->content->fastForward(1); $node->getTag()->setAttribute($name, $string, false); From 0127b9e354e92f9c515653b8b50423e38010762c Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Tue, 12 May 2020 03:15:13 +0000 Subject: [PATCH 38/68] fixes #188 --- tests/DomTest.php | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/DomTest.php b/tests/DomTest.php index 9922f17f..47aeb6ae 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -617,4 +617,16 @@ public function testFindAttributeInBothParentAndChild() $nodes = $dom->find('[attribute]'); $this->assertCount(2, $nodes); } + + public function testLessThanCharacterInJavascript() + { + $results = (new Dom())->load('
', + [ + 'cleanupInput' => false, + 'removeScripts' => false + ])->find('body'); + $this->assertCount(1, $results); + } } From 4bb7098f3a46582dd9c5cd289fae6f0e835f2916 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Wed, 13 May 2020 14:36:50 +0000 Subject: [PATCH 39/68] Fixes #203 --- CHANGELOG.md | 1 + src/PHPHtmlParser/Dom.php | 17 ---------- src/PHPHtmlParser/Dom/AbstractNode.php | 10 ------ tests/DomTest.php | 44 ++++++++++---------------- 4 files changed, 18 insertions(+), 54 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d9b1bab..bd2dffcb 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Removed - Curl interface and curl implementation has been removed. - Removed support for the depth first search option. +- findById() method removed from Dom object. ## 2.2.0 diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index f772e707..ba9ea4ae 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -150,7 +150,6 @@ public function __get($name) */ public function load(string $str, array $options = []): Dom { - AbstractNode::resetCount(); // check if it's a file if (\strpos($str, "\n") === false && \is_file($str)) { return $this->loadFromFile($str, $options); @@ -264,22 +263,6 @@ public function find(string $selector, int $nth = null) return $result; } - /** - * Find element by Id on the root node. - * - * @throws ChildNotFoundException - * @throws NotLoadedException - * @throws ParentNotFoundException - * - * @return bool|AbstractNode - */ - public function findById(int $id) - { - $this->isLoaded(); - - return $this->root->findById($id); - } - /** * Adds the tag (or tags in an array) to the list of tags that will always * be self closing. 
diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 596a3ae8..3d67ab5c 100755 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -140,16 +140,6 @@ public function setHtmlSpecialCharsDecode($htmlSpecialCharsDecode = false): void $this->htmlSpecialCharsDecode = $htmlSpecialCharsDecode; } - /** - * Reset node counter. - * - * @return void - */ - public static function resetCount() - { - self::$count = 0; - } - /** * Returns the id of this object. */ diff --git a/tests/DomTest.php b/tests/DomTest.php index 47aeb6ae..8e800487 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -394,33 +394,6 @@ public function testHasChildren() $this->assertTrue($dom->hasChildren()); } - public function testFindByIdVar1() - { - $dom = new Dom(); - $dom->load('

Hey bro, click here
:)

'); - /** @var Dom\AbstractNode $result */ - $result = $dom->findById(4); - $this->assertEquals(4, $result->id()); - } - - public function testFindByIdVar2() - { - $dom = new Dom(); - $dom->load('

Hey bro, click here
:)

'); - /** @var Dom\AbstractNode $result */ - $result = $dom->findById(5); - $this->assertEquals(5, $result->id()); - } - - public function testFindByIdNotFountEleement() - { - $dom = new Dom(); - $dom->load('

Hey bro, click here
:)

'); - /** @var Dom\AbstractNode $result */ - $result = $dom->findById(8); - $this->assertFalse($result); - } - public function testWhitespaceInText() { $dom = new Dom(); @@ -629,4 +602,21 @@ public function testLessThanCharacterInJavascript() ])->find('body'); $this->assertCount(1, $results); } + + public function testUniqueIdForAllObjects() + { + // Create a dom which will be used as a parent/container for a paragraph + $dom1 = new \PHPHtmlParser\Dom; + $dom1->load('
A container div
'); // Resets the counter (doesn't matter here as the counter was 0 even without resetting) + $div = $dom1->firstChild(); + + // Create a paragraph outside of the first dom + $dom2 = new \PHPHtmlParser\Dom; + $dom2->load('

Our new paragraph.

'); // Resets the counter + $paragraph = $dom2->firstChild(); + + $div->addChild($paragraph); + + $this->assertEquals('A container div

Our new paragraph.

', $div->innerhtml); + } } From 4e3158c561878076a82b32143e73537a1c391fa6 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Tue, 14 Jul 2020 20:24:14 +0000 Subject: [PATCH 40/68] Added test to cover #189 --- tests/DomTest.php | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/DomTest.php b/tests/DomTest.php index 8e800487..9c3781c4 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -619,4 +619,23 @@ public function testUniqueIdForAllObjects() $this->assertEquals('A container div

Our new paragraph.

', $div->innerhtml); } + + public function testFindDescendantsOfMatch() + { + $dom = new Dom(); + $dom->load('

+ + test + testing + This is a test + italic + password123 + + another +

'); + + /** @var Dom\AbstractNode $meta */ + $nodes = $dom->find('b'); + $this->assertCount(5, $nodes); + } } From 8fccd89f73faa2fb7dd523605d927226b78d20a3 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Tue, 14 Jul 2020 20:28:33 +0000 Subject: [PATCH 41/68] Added coverage for #174 --- tests/DomTest.php | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/DomTest.php b/tests/DomTest.php index 9c3781c4..d68b0fc2 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -634,8 +634,19 @@ public function testFindDescendantsOfMatch() another

'); - /** @var Dom\AbstractNode $meta */ $nodes = $dom->find('b'); $this->assertCount(5, $nodes); } + + public function testCompatibleWithWordPressShortcode() + { + $dom = new Dom(); + $dom->load('

+[wprs_alert type="success" content="this is a short code" /] +

'); + + $node = $dom->find('p', 0); + $this->assertEquals(' [wprs_alert type="success" content="this is a short code" /] ', $node->innerHtml); + + } } From 1a1c3eb2d20069ca811ffcdce7be8a7d4f0effe6 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Tue, 14 Jul 2020 20:41:19 +0000 Subject: [PATCH 42/68] Fixed #155, removed load method call --- CHANGELOG.md | 3 +- README.md | 8 +- src/PHPHtmlParser/Dom.php | 24 ----- src/PHPHtmlParser/StaticDom.php | 25 ++--- tests/DomTest.php | 116 +++++++++++------------ tests/Node/HtmlTest.php | 4 +- tests/Node/TextTest.php | 2 +- tests/Options/CleanupTest.php | 4 +- tests/Options/PreserveLineBreaks.php | 4 +- tests/Options/StrictTest.php | 8 +- tests/Options/WhitespaceTextNodeTest.php | 4 +- tests/OptionsTest.php | 2 +- tests/StaticDomTest.php | 10 +- 13 files changed, 91 insertions(+), 123 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bd2dffcb..9862beac 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,7 +25,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Removed - Curl interface and curl implementation has been removed. - Removed support for the depth first search option. -- findById() method removed from Dom object. +- `findById()` method removed from Dom object. +- Removed `load()` method in Dom object. ## 2.2.0 diff --git a/README.md b/README.md index 54c28d04..cbd64800 100755 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ require "vendor/autoload.php"; use PHPHtmlParser\Dom; $dom = new Dom; -$dom->load('

Hey bro, click here
:)

'); +$dom->loadStr('

Hey bro, click here
:)

'); $a = $dom->find('a')[0]; echo $a->text; // "click here" ``` @@ -86,7 +86,7 @@ $dom->loadFromUrl('http://google.com'); $html = $dom->outerHtml; // or -$dom->load('http://google.com'); +$dom->loadFromUrl('http://google.com'); $html = $dom->outerHtml; // same result as the first example ``` @@ -137,11 +137,11 @@ $dom->setOptions([ 'strict' => true, // Set a global option to enable strict html parsing. ]); -$dom->load('http://google.com', [ +$dom->loadFromUrl('http://google.com', [ 'whitespaceTextNode' => false, // Only applies to this load. ]); -$dom->load('http://gmail.com'); // will not have whitespaceTextNode set to false. +$dom->loadFromUrl('http://gmail.com'); // will not have whitespaceTextNode set to false. ``` At the moment we support 8 options. diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index ba9ea4ae..1cba5050 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -15,7 +15,6 @@ use PHPHtmlParser\Exceptions\CurlException; use PHPHtmlParser\Exceptions\LogicalException; use PHPHtmlParser\Exceptions\NotLoadedException; -use PHPHtmlParser\Exceptions\ParentNotFoundException; use PHPHtmlParser\Exceptions\StrictException; use PHPHtmlParser\Exceptions\UnknownChildTypeException; use Psr\Http\Client\ClientInterface; @@ -139,29 +138,6 @@ public function __get($name) return $this->root->$name; } - /** - * Attempts to load the dom from any resource, string, file, or URL. 
- * - * @throws ChildNotFoundException - * @throws CircularException - * @throws CurlException - * @throws StrictException - * @throws LogicalException - */ - public function load(string $str, array $options = []): Dom - { - // check if it's a file - if (\strpos($str, "\n") === false && \is_file($str)) { - return $this->loadFromFile($str, $options); - } - // check if it's a url - if (\preg_match("/^https?:\/\//i", $str)) { - return $this->loadFromUrl($str, $options); - } - - return $this->loadStr($str, $options); - } - /** * Loads the dom from a document file/url. * diff --git a/src/PHPHtmlParser/StaticDom.php b/src/PHPHtmlParser/StaticDom.php index b4c3ef22..411ca3de 100755 --- a/src/PHPHtmlParser/StaticDom.php +++ b/src/PHPHtmlParser/StaticDom.php @@ -56,23 +56,6 @@ public static function mount(string $className = 'Dom', ?Dom $dom = null): bool return true; } - /** - * Creates a new dom object and calls load() on the - * new object. - * - * @throws ChildNotFoundException - * @throws CircularException - * @throws CurlException - * @throws StrictException - */ - public static function load(string $str): Dom - { - $dom = new Dom(); - self::$dom = $dom; - - return $dom->load($str); - } - /** * Creates a new dom object and calls loadFromFile() on the * new object. @@ -114,6 +97,14 @@ public static function loadFromUrl(string $url, array $options = [], ClientInter return $dom->loadFromUrl($url, $options, $client, $request); } + public static function loadStr(string $str, array $options = []): Dom + { + $dom = new Dom(); + self::$dom = $dom; + + return $dom->loadStr($str, $options); + } + /** * Sets the $dom variable to null. 
*/ diff --git a/tests/DomTest.php b/tests/DomTest.php index d68b0fc2..2a904cc5 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -20,14 +20,14 @@ public function testParsingCData() $html = ""; $dom = new Dom(); $dom->setOptions(['cleanupInput' => false]); - $dom->load($html); + $dom->loadStr($html); $this->assertSame($html, $dom->root->outerHtml()); } - public function testLoad() + public function testloadStr() { $dom = new Dom(); - $dom->load('

Hey bro, click here
:)

'); + $dom->loadStr('

Hey bro, click here
:)

'); $div = $dom->find('div', 0); $this->assertEquals('

Hey bro, click here
:)

', $div->outerHtml); } @@ -44,7 +44,7 @@ public function testNotLoaded() public function testIncorrectAccess() { $dom = new Dom(); - $dom->load('

Hey bro, click here
:)

'); + $dom->loadStr('

Hey bro, click here
:)

'); $div = $dom->find('div', 0); $this->assertEquals(null, $div->foo); } @@ -52,7 +52,7 @@ public function testIncorrectAccess() public function testLoadSelfclosingAttr() { $dom = new Dom(); - $dom->load("

baz
"); + $dom->loadStr("

baz
"); $br = $dom->find('br', 0); $this->assertEquals('
', $br->outerHtml); } @@ -60,7 +60,7 @@ public function testLoadSelfclosingAttr() public function testLoadSelfclosingAttrToString() { $dom = new Dom(); - $dom->load("

baz
"); + $dom->loadStr("

baz
"); $br = $dom->find('br', 0); $this->assertEquals('
', (string) $br); } @@ -68,7 +68,7 @@ public function testLoadSelfclosingAttrToString() public function testLoadEscapeQuotes() { $dom = new Dom(); - $dom->load('

Hey bro, click here

'); + $dom->loadStr('

Hey bro, click here

'); $div = $dom->find('div', 0); $this->assertEquals('

Hey bro, click here

', $div->outerHtml); } @@ -76,14 +76,14 @@ public function testLoadEscapeQuotes() public function testLoadNoOpeningTag() { $dom = new Dom(); - $dom->load('
PR Manager
content
'); + $dom->loadStr('
PR Manager
content
'); $this->assertEquals('content', $dom->find('.content', 0)->text); } public function testLoadNoClosingTag() { $dom = new Dom(); - $dom->load('

Hey bro, click here


'); + $dom->loadStr('

Hey bro, click here


'); $root = $dom->find('div', 0)->getParent(); $this->assertEquals('

Hey bro, click here


', $root->outerHtml); } @@ -91,7 +91,7 @@ public function testLoadNoClosingTag() public function testLoadAttributeOnSelfClosing() { $dom = new Dom(); - $dom->load('

Hey bro, click here


'); + $dom->loadStr('

Hey bro, click here


'); $br = $dom->find('br', 0); $this->assertEquals('both', $br->getAttribute('class')); } @@ -99,7 +99,7 @@ public function testLoadAttributeOnSelfClosing() public function testLoadClosingTagOnSelfClosing() { $dom = new Dom(); - $dom->load('

Hey bro, click here

'); + $dom->loadStr('

Hey bro, click here

'); $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); } @@ -108,7 +108,7 @@ public function testLoadClosingTagOnSelfClosingNoSlash() $dom = new Dom(); $dom->addNoSlashTag('br'); - $dom->load('

Hey bro, click here

'); + $dom->loadStr('

Hey bro, click here

'); $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); } @@ -116,7 +116,7 @@ public function testLoadClosingTagAddSelfClosingTag() { $dom = new Dom(); $dom->addSelfClosingTag('mytag'); - $dom->load('

Hey bro, click here

'); + $dom->loadStr('

Hey bro, click here

'); $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); } @@ -127,7 +127,7 @@ public function testLoadClosingTagAddSelfClosingTagArray() 'mytag', 'othertag', ]); - $dom->load('

Hey bro, click here

'); + $dom->loadStr('

Hey bro, click here

'); $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); } @@ -135,7 +135,7 @@ public function testLoadClosingTagRemoveSelfClosingTag() { $dom = new Dom(); $dom->removeSelfClosingTag('br'); - $dom->load('

Hey bro, click here

'); + $dom->loadStr('

Hey bro, click here

'); $this->assertEquals('

Hey bro, click here


', $dom->find('div', 0)->innerHtml); } @@ -143,35 +143,35 @@ public function testLoadClosingTagClearSelfClosingTag() { $dom = new Dom(); $dom->clearSelfClosingTags(); - $dom->load('

Hey bro, click here

'); + $dom->loadStr('

Hey bro, click here

'); $this->assertEquals('

Hey bro, click here


', $dom->find('div', 0)->innerHtml); } public function testLoadNoValueAttribute() { $dom = new Dom(); - $dom->load('
Main content here
'); + $dom->loadStr('
Main content here
'); $this->assertEquals('
Main content here
', $dom->innerHtml); } public function testLoadBackslashAttributeValue() { $dom = new Dom(); - $dom->load('
Main content here
'); + $dom->loadStr('
Main content here
'); $this->assertEquals('
Main content here
', $dom->innerHtml); } public function testLoadNoValueAttributeBefore() { $dom = new Dom(); - $dom->load('
Main content here
'); + $dom->loadStr('
Main content here
'); $this->assertEquals('
Main content here
', $dom->innerHtml); } public function testLoadUpperCase() { $dom = new Dom(); - $dom->load('

hEY BRO, CLICK HERE

'); + $dom->loadStr('

hEY BRO, CLICK HERE

'); $this->assertEquals('

hEY BRO, CLICK HERE

', $dom->find('div', 0)->innerHtml); } @@ -206,7 +206,7 @@ public function testLoadFromFileNotFound() public function testLoadUtf8() { $dom = new Dom(); - $dom->load('

Dzień

'); + $dom->loadStr('

Dzień

'); $this->assertEquals('Dzień', $dom->find('p', 0)->text); } @@ -268,56 +268,56 @@ public function testLoadFromUrl() public function testToStringMagic() { $dom = new Dom(); - $dom->load('

Hey bro, click here
:)

'); + $dom->loadStr('

Hey bro, click here
:)

'); $this->assertEquals('

Hey bro, click here
:)

', (string) $dom); } public function testGetMagic() { $dom = new Dom(); - $dom->load('

Hey bro, click here
:)

'); + $dom->loadStr('

Hey bro, click here
:)

'); $this->assertEquals('

Hey bro, click here
:)

', $dom->innerHtml); } public function testFirstChild() { $dom = new Dom(); - $dom->load('

Hey bro, click here


'); + $dom->loadStr('

Hey bro, click here


'); $this->assertEquals('

Hey bro, click here

', $dom->firstChild()->outerHtml); } public function testLastChild() { $dom = new Dom(); - $dom->load('

Hey bro, click here


'); + $dom->loadStr('

Hey bro, click here


'); $this->assertEquals('
', $dom->lastChild()->outerHtml); } public function testGetElementById() { $dom = new Dom(); - $dom->load('

Hey bro, click here


'); + $dom->loadStr('

Hey bro, click here


'); $this->assertEquals('click here', $dom->getElementById('78')->outerHtml); } public function testGetElementsByTag() { $dom = new Dom(); - $dom->load('

Hey bro, click here


'); + $dom->loadStr('

Hey bro, click here


'); $this->assertEquals('

Hey bro, click here

', $dom->getElementsByTag('p')[0]->outerHtml); } public function testGetElementsByClass() { $dom = new Dom(); - $dom->load('

Hey bro, click here


'); + $dom->loadStr('

Hey bro, click here


'); $this->assertEquals('

Hey bro, click here

', $dom->getElementsByClass('all')[0]->innerHtml); } public function testScriptCleanerScriptTag() { $dom = new Dom(); - $dom->load(' + $dom->loadStr('

.....

', [ @@ -607,12 +607,12 @@ public function testUniqueIdForAllObjects() { // Create a dom which will be used as a parent/container for a paragraph $dom1 = new \PHPHtmlParser\Dom; - $dom1->load('
A container div
'); // Resets the counter (doesn't matter here as the counter was 0 even without resetting) + $dom1->loadStr('
A container div
'); // Resets the counter (doesn't matter here as the counter was 0 even without resetting) $div = $dom1->firstChild(); // Create a paragraph outside of the first dom $dom2 = new \PHPHtmlParser\Dom; - $dom2->load('

Our new paragraph.

'); // Resets the counter + $dom2->loadStr('

Our new paragraph.

'); // Resets the counter $paragraph = $dom2->firstChild(); $div->addChild($paragraph); @@ -623,7 +623,7 @@ public function testUniqueIdForAllObjects() public function testFindDescendantsOfMatch() { $dom = new Dom(); - $dom->load('

+ $dom->loadStr('

test testing @@ -641,7 +641,7 @@ public function testFindDescendantsOfMatch() public function testCompatibleWithWordPressShortcode() { $dom = new Dom(); - $dom->load('

+ $dom->loadStr('

[wprs_alert type="success" content="this is a short code" /]

'); diff --git a/tests/Node/HtmlTest.php b/tests/Node/HtmlTest.php index 677b280e..a4db8142 100755 --- a/tests/Node/HtmlTest.php +++ b/tests/Node/HtmlTest.php @@ -500,7 +500,7 @@ public function testAncestorByTagFailure() public function testReplaceNode() { $dom = new Dom(); - $dom->load('

Hey bro, click here
:)

'); + $dom->loadStr('

Hey bro, click here
:)

'); $id = $dom->find('p')[0]->id(); $newChild = new HtmlNode('h1'); $dom->find('p')[0]->getParent()->replaceChild($id, $newChild); @@ -510,7 +510,7 @@ public function testReplaceNode() public function testTextNodeFirstChild() { $dom = new Dom(); - $dom->load('

Hey bro, click here
:)

'); + $dom->loadStr('

Hey bro, click here
:)

'); $p = $dom->find('p'); foreach ($p as $element) { $child = $element->firstChild(); diff --git a/tests/Node/TextTest.php b/tests/Node/TextTest.php index 27dd03e9..d36eddc0 100755 --- a/tests/Node/TextTest.php +++ b/tests/Node/TextTest.php @@ -57,7 +57,7 @@ public function testSetTextToTextNode() public function testSetText() { $dom = new Dom(); - $dom->load('

Hey bro, click here
:)

'); + $dom->loadStr('

Hey bro, click here
:)

'); $a = $dom->find('a')[0]; $a->firstChild()->setText('biz baz'); $this->assertEquals('

Hey bro, biz baz
:)

', (string) $dom); diff --git a/tests/Options/CleanupTest.php b/tests/Options/CleanupTest.php index 0a8a9baf..b7e5325e 100755 --- a/tests/Options/CleanupTest.php +++ b/tests/Options/CleanupTest.php @@ -76,7 +76,7 @@ public function testRemoveScriptsFalse() public function testSmartyScripts() { $dom = new Dom(); - $dom->load(' + $dom->loadStr(' aa={123} '); $this->assertEquals(' aa= ', $dom->innerHtml); @@ -88,7 +88,7 @@ public function testSmartyScriptsDisabled() $dom->setOptions([ 'removeSmartyScripts' => false, ]); - $dom->load(' + $dom->loadStr(' aa={123} '); $this->assertEquals(' aa={123} ', $dom->innerHtml); diff --git a/tests/Options/PreserveLineBreaks.php b/tests/Options/PreserveLineBreaks.php index 3df7223e..ad095a38 100755 --- a/tests/Options/PreserveLineBreaks.php +++ b/tests/Options/PreserveLineBreaks.php @@ -13,7 +13,7 @@ public function testPreserveLineBreakTrue() $dom->setOptions([ 'preserveLineBreaks' => true, ]); - $dom->load('
+ $dom->loadStr('
'); $this->assertEquals("
\n
", (string) $dom); @@ -25,7 +25,7 @@ public function testPreserveLineBreakBeforeClosingTag() $dom->setOptions([ 'preserveLineBreaks' => true, ]); - $dom->load('
loadStr('
'); $this->assertEquals('
', (string) $dom); diff --git a/tests/Options/StrictTest.php b/tests/Options/StrictTest.php index cb015981..96d457b7 100755 --- a/tests/Options/StrictTest.php +++ b/tests/Options/StrictTest.php @@ -14,7 +14,7 @@ public function testConfigStrict() $dom->setOptions([ 'strict' => true, ]); - $dom->load('

Hey you

Ya you!

'); + $dom->loadStr('

Hey you

Ya you!

'); $this->assertEquals(' ', $dom->getElementById('hey')->nextSibling()->text); } @@ -26,7 +26,7 @@ public function testConfigStrictMissingSelfClosing() ]); try { // should throw an exception - $dom->load('

Hey you


Ya you!

'); + $dom->loadStr('

Hey you


Ya you!

'); // we should not get here $this->assertTrue(false); } catch (StrictException $e) { @@ -42,7 +42,7 @@ public function testConfigStrictMissingAttribute() ]); try { // should throw an exception - $dom->load('

Hey you

Ya you!

'); + $dom->loadStr('

Hey you

Ya you!

'); // we should not get here $this->assertTrue(false); } catch (StrictException $e) { @@ -56,7 +56,7 @@ public function testConfigStrictBRTag() $dom->setOptions([ 'strict' => true, ]); - $dom->load('
'); + $dom->loadStr('
'); $this->assertTrue(true); } } diff --git a/tests/Options/WhitespaceTextNodeTest.php b/tests/Options/WhitespaceTextNodeTest.php index 541fbec0..0097f28d 100755 --- a/tests/Options/WhitespaceTextNodeTest.php +++ b/tests/Options/WhitespaceTextNodeTest.php @@ -13,7 +13,7 @@ public function testConfigGlobalNoWhitespaceTextNode() $dom->setOptions([ 'whitespaceTextNode' => false, ]); - $dom->load('

Hey you

Ya you!

'); + $dom->loadStr('

Hey you

Ya you!

'); $this->assertEquals('Ya you!', $dom->getElementById('hey')->nextSibling()->text); } @@ -23,7 +23,7 @@ public function testConfigLocalOverride() $dom->setOptions([ 'whitespaceTextNode' => false, ]); - $dom->load('

Hey you

Ya you!

', [ + $dom->loadStr('

Hey you

Ya you!

', [ 'whitespaceTextNode' => true, ]); $this->assertEquals(' ', $dom->getElementById('hey')->nextSibling()->text); diff --git a/tests/OptionsTest.php b/tests/OptionsTest.php index 899a0622..a78f508f 100755 --- a/tests/OptionsTest.php +++ b/tests/OptionsTest.php @@ -144,6 +144,6 @@ public function testUnknownOptionDom() ]); $this->expectException(UnknownOptionException::class); - $dom->load('
'); + $dom->loadStr('
'); } } diff --git a/tests/StaticDomTest.php b/tests/StaticDomTest.php index 2fb225fb..fbc1a5bd 100755 --- a/tests/StaticDomTest.php +++ b/tests/StaticDomTest.php @@ -25,16 +25,16 @@ public function testMountWithDom() $this->assertTrue($status); } - public function testLoad() + public function testloadStr() { - $dom = Dom::load('

Hey bro, click here
:)

'); + $dom = Dom::loadStr('

Hey bro, click here
:)

'); $div = $dom->find('div', 0); $this->assertEquals('

Hey bro, click here
:)

', $div->outerHtml); } public function testLoadWithFile() { - $dom = Dom::load('tests/data/files/small.html'); + $dom = Dom::loadFromFile('tests/data/files/small.html'); $this->assertEquals('VonBurgermeister', $dom->find('.post-user font', 0)->text); } @@ -47,14 +47,14 @@ public function testLoadFromFile() /** * @expectedException \PHPHtmlParser\Exceptions\NotLoadedException */ - public function testFindNoLoad() + public function testFindNoloadStr() { Dom::find('.post-user font', 0); } public function testFindI() { - Dom::load('tests/data/files/big.html'); + Dom::loadFromFile('tests/data/files/big.html'); $this->assertEquals('В кустах блестит металл
И искрится ток
Человечеству конец', Dom::find('i')[1]->innerHtml); } From e37e8ef9eda6bb44f50519b51fd80f0207f29585 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Wed, 15 Jul 2020 00:03:46 +0000 Subject: [PATCH 43/68] Fixed #187 and added tests --- src/PHPHtmlParser/Content.php | 25 +++++++++++++++---- src/PHPHtmlParser/Dom.php | 20 ++++++++++++--- .../Exceptions/ContentLengthException.php | 14 +++++++++++ tests/DomTest.php | 8 ++++++ 4 files changed, 59 insertions(+), 8 deletions(-) create mode 100644 src/PHPHtmlParser/Exceptions/ContentLengthException.php diff --git a/src/PHPHtmlParser/Content.php b/src/PHPHtmlParser/Content.php index 24bca182..66bc7794 100755 --- a/src/PHPHtmlParser/Content.php +++ b/src/PHPHtmlParser/Content.php @@ -4,6 +4,7 @@ namespace PHPHtmlParser; +use PHPHtmlParser\Exceptions\ContentLengthException; use PHPHtmlParser\Exceptions\LogicalException; /** @@ -74,14 +75,27 @@ public function char(?int $char = null): string * Moves the current position forward. * * @chainable + * @throws ContentLengthException */ public function fastForward(int $count): Content { + if (!$this->canFastForward()) { + // trying to go over the content length, throw exception + throw new ContentLengthException('Attempt to fastForward pass the length of the content.'); + } $this->pos += $count; return $this; } + /** + * Checks if we can move the position forward. + */ + public function canFastForward(): bool + { + return \strlen($this->content) > $this->pos; + } + /** * Moves the current position backward. * @@ -197,14 +211,15 @@ public function copyByToken(string $token, bool $char = false, bool $escape = fa /** * Skip a given set of characters. 
* - * @return Content|string + * @throws LogicalException */ - public function skip(string $string, bool $copy = false) + public function skip(string $string, bool $copy = false): string { $len = \strspn($this->content, $string, $this->pos); - - // make it chainable if they don't want a copy - $return = $this; + if ($len === false) { + throw new LogicalException('Strspn returned false with position ' . $this->pos . '.'); + } + $return = ''; if ($copy) { $return = \substr($this->content, $this->pos, $len); if ($return === false) { diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index 1cba5050..d23110df 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -12,6 +12,7 @@ use PHPHtmlParser\Dom\TextNode; use PHPHtmlParser\Exceptions\ChildNotFoundException; use PHPHtmlParser\Exceptions\CircularException; +use PHPHtmlParser\Exceptions\ContentLengthException; use PHPHtmlParser\Exceptions\CurlException; use PHPHtmlParser\Exceptions\LogicalException; use PHPHtmlParser\Exceptions\NotLoadedException; @@ -646,7 +647,13 @@ private function parseTag(): array } // check if this is a closing tag - if ($this->content->fastForward(1)->char() == '/') { + try { + $this->content->fastForward(1); + } catch (ContentLengthException $exception) { + // we are at the end of the file + return $return; + } + if ($this->content->char() == '/') { // end tag $tag = $this->content->fastForward(1) ->copyByToken('slash', true); @@ -683,7 +690,12 @@ private function parseTag(): array ) { $space = $this->content->skipByToken('blank', true); if (empty($space)) { - $this->content->fastForward(1); + try { + $this->content->fastForward(1); + } catch (ContentLengthException $exception) { + // reached the end of the content + break; + } continue; } @@ -764,7 +776,9 @@ private function parseTag(): array } } - $this->content->fastForward(1); + if ($this->content->canFastForward()) { + $this->content->fastForward(1); + } $return['status'] = true; $return['node'] = $node; 
diff --git a/src/PHPHtmlParser/Exceptions/ContentLengthException.php b/src/PHPHtmlParser/Exceptions/ContentLengthException.php new file mode 100644 index 00000000..83c9e771 --- /dev/null +++ b/src/PHPHtmlParser/Exceptions/ContentLengthException.php @@ -0,0 +1,14 @@ +assertEquals(' [wprs_alert type="success" content="this is a short code" /] ', $node->innerHtml); } + + public function testBrokenHtml() + { + $dom = new Dom(); + $dom->loadStr('assertEquals('', $dom->outerHtml); + } } From b58c6da6c58e9da334de20b46f602e9cb70d5095 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Wed, 15 Jul 2020 01:18:55 +0000 Subject: [PATCH 44/68] Removed magical option array --- CHANGELOG.md | 1 + composer.json | 3 +- src/PHPHtmlParser/Content.php | 22 +- src/PHPHtmlParser/Dom.php | 86 ++++--- src/PHPHtmlParser/Enum/StringToken.php | 21 ++ src/PHPHtmlParser/Options.php | 291 +++++++++++------------ src/PHPHtmlParser/Selector/Seeker.php | 3 +- src/PHPHtmlParser/StaticDom.php | 9 +- tests/ContentTest.php | 5 +- tests/DomTest.php | 30 ++- tests/Options/CleanupTest.php | 29 +-- tests/Options/PreserveLineBreaks.php | 10 +- tests/Options/StrictTest.php | 17 +- tests/Options/WhitespaceTextNodeTest.php | 13 +- tests/OptionsTest.php | 117 ++------- tests/StaticDomTest.php | 2 +- 16 files changed, 273 insertions(+), 386 deletions(-) create mode 100644 src/PHPHtmlParser/Enum/StringToken.php diff --git a/CHANGELOG.md b/CHANGELOG.md index 9862beac..05d2146f 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Cleaned up the selector logic. - Fixed issue with greedy regex for charset detection. - Fixed bug causing infinite loops in some cases. +- Refactored the way we handle options. Removed the magical option array. ### Removed - Curl interface and curl implementation has been removed. 
diff --git a/composer.json b/composer.json index 79258c58..5549a5ee 100755 --- a/composer.json +++ b/composer.json @@ -20,7 +20,8 @@ "paquettg/string-encode": "~1.0.0", "php-http/httplug": "^2.1", "php-http/guzzle6-adapter": "^2.0", - "guzzlehttp/psr7": "^1.6" + "guzzlehttp/psr7": "^1.6", + "myclabs/php-enum": "^1.7" }, "require-dev": { "phpunit/phpunit": "^7.5.1", diff --git a/src/PHPHtmlParser/Content.php b/src/PHPHtmlParser/Content.php index 66bc7794..fdb741c4 100755 --- a/src/PHPHtmlParser/Content.php +++ b/src/PHPHtmlParser/Content.php @@ -4,6 +4,7 @@ namespace PHPHtmlParser; +use PHPHtmlParser\Enum\StringToken; use PHPHtmlParser\Exceptions\ContentLengthException; use PHPHtmlParser\Exceptions\LogicalException; @@ -75,11 +76,12 @@ public function char(?int $char = null): string * Moves the current position forward. * * @chainable + * * @throws ContentLengthException */ public function fastForward(int $count): Content { - if (!$this->canFastForward()) { + if (!$this->canFastForward($count)) { // trying to go over the content length, throw exception throw new ContentLengthException('Attempt to fastForward pass the length of the content.'); } @@ -91,9 +93,9 @@ public function fastForward(int $count): Content /** * Checks if we can move the position forward. */ - public function canFastForward(): bool + public function canFastForward(int $count): bool { - return \strlen($this->content) > $this->pos; + return \strlen($this->content) >= $this->pos + $count; } /** @@ -175,8 +177,6 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal /** * Copies the content until the string is found and return it * unless the 'unless' is found in the substring. - * - * @return string */ public function copyUntilUnless(string $string, string $unless): string { @@ -197,13 +197,11 @@ public function copyUntilUnless(string $string, string $unless): string /** * Copies the content until it reaches the token string.,. 
* - * @return string - * * @uses $this->copyUntil() */ - public function copyByToken(string $token, bool $char = false, bool $escape = false) + public function copyByToken(StringToken $stringToken, bool $char = false, bool $escape = false): string { - $string = $this->$token; + $string = $stringToken->getValue(); return $this->copyUntil($string, $char, $escape); } @@ -236,13 +234,11 @@ public function skip(string $string, bool $copy = false): string /** * Skip a given token of pre-defined characters. * - * @return Content|string - * * @uses $this->skip() */ - public function skipByToken(string $token, bool $copy = false) + public function skipByToken(StringToken $skipToken, bool $copy = false): string { - $string = $this->$token; + $string = $skipToken->getValue(); return $this->skip($string, $copy); } diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index d23110df..d2db15e2 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -10,10 +10,10 @@ use PHPHtmlParser\Dom\Collection; use PHPHtmlParser\Dom\HtmlNode; use PHPHtmlParser\Dom\TextNode; +use PHPHtmlParser\Enum\StringToken; use PHPHtmlParser\Exceptions\ChildNotFoundException; use PHPHtmlParser\Exceptions\CircularException; use PHPHtmlParser\Exceptions\ContentLengthException; -use PHPHtmlParser\Exceptions\CurlException; use PHPHtmlParser\Exceptions\LogicalException; use PHPHtmlParser\Exceptions\NotLoadedException; use PHPHtmlParser\Exceptions\StrictException; @@ -72,9 +72,9 @@ class Dom /** * A global options array to be used by all load calls. 
* - * @var array + * @var ?Options */ - private $globalOptions = []; + private $globalOptions; /** * A persistent option object to be used for all options in the @@ -147,7 +147,7 @@ public function __get($name) * @throws StrictException * @throws LogicalException */ - public function loadFromFile(string $file, array $options = []): Dom + public function loadFromFile(string $file, ?Options $options = null): Dom { $content = @\file_get_contents($file); if ($content === false) { @@ -168,7 +168,7 @@ public function loadFromFile(string $file, array $options = []): Dom * @throws StrictException * @throws \Psr\Http\Client\ClientExceptionInterface */ - public function loadFromUrl(string $url, array $options = [], ?ClientInterface $client = null, ?RequestInterface $request = null): Dom + public function loadFromUrl(string $url, ?Options $options, ?ClientInterface $client = null, ?RequestInterface $request = null): Dom { if ($client === null) { $client = new Client(); @@ -191,11 +191,15 @@ public function loadFromUrl(string $url, array $options = [], ?ClientInterface $ * @throws CircularException * @throws StrictException */ - public function loadStr(string $str, array $option = []): Dom + public function loadStr(string $str, ?Options $options = null): Dom { $this->options = new Options(); - $this->options->setOptions($this->globalOptions) - ->setOptions($option); + if ($this->globalOptions !== null) { + $this->options->setFromOptions($this->globalOptions); + } + if ($options !== null) { + $this->options->setFromOptions($options); + } $this->rawSize = \strlen($str); $this->raw = $str; @@ -216,7 +220,7 @@ public function loadStr(string $str, array $option = []): Dom * * @chainable */ - public function setOptions(array $options): Dom + public function setOptions(Options $options): Dom { $this->globalOptions = $options; @@ -235,9 +239,7 @@ public function find(string $selector, int $nth = null) { $this->isLoaded(); - $result = $this->root->find($selector, $nth); - - return 
$result; + return $this->root->find($selector, $nth); } /** @@ -463,7 +465,7 @@ private function isLoaded(): void */ private function clean(string $str): string { - if ($this->options->get('cleanupInput') != true) { + if ($this->options->isCleanupInput() != true) { // skip entire cleanup step return $str; } @@ -488,7 +490,7 @@ private function clean(string $str): string // clean out the \n\r $replace = ' '; - if ($this->options->get('preserveLineBreaks')) { + if ($this->options->isPreserveLineBreaks()) { $replace = ' '; } $str = \str_replace(["\r\n", "\r", "\n"], $replace, $str); @@ -515,7 +517,7 @@ private function clean(string $str): string } // strip out "; $dom = new Dom(); - $dom->setOptions(['cleanupInput' => false]); + $dom->setOptions((new Options())->setCleanupInput(false)); $dom->loadStr($html); $this->assertSame($html, $dom->root->outerHtml()); } @@ -213,7 +214,7 @@ public function testLoadUtf8() public function testLoadFileWhitespace() { $dom = new Dom(); - $dom->setOptions(['cleanupInput' => false]); + $dom->setOptions((new Options())->setCleanupInput(false)); $dom->loadFromFile('tests/data/files/whitespace.html'); $this->assertEquals(1, \count($dom->find('.class'))); $this->assertEquals('', (string) $dom); @@ -237,7 +238,8 @@ public function testLoadFileBigTwice() public function testLoadFileBigTwicePreserveOption() { $dom = new Dom(); - $dom->loadFromFile('tests/data/files/big.html', ['preserveLineBreaks' => true]); + $dom->loadFromFile('tests/data/files/big.html', + (new Options)->setPreserveLineBreaks(true)); $post = $dom->find('.post-row', 0); $this->assertEquals( "

Журчанье воды
\nЧерно-белые тени
\nВновь на фонтане

", @@ -261,7 +263,7 @@ public function testLoadFromUrl() ->andReturn($responseMock); $dom = new Dom(); - $dom->loadFromUrl('http://google.com', [], $clientMock); + $dom->loadFromUrl('http://google.com', null, $clientMock); $this->assertEquals('VonBurgermeister', $dom->find('.post-row div .post-user font', 0)->text); } @@ -397,9 +399,7 @@ public function testHasChildren() public function testWhitespaceInText() { $dom = new Dom(); - $dom->setOptions([ - 'removeDoubleSpace' => false, - ]); + $dom->setOptions((new Options())->setRemoveDoubleSpace(false)); $dom->loadStr('
    Hello world
'); $this->assertEquals('
    Hello world
', (string) $dom); } @@ -415,7 +415,7 @@ public function testGetComplexAttribute() public function testGetComplexAttributeHtmlSpecialCharsDecode() { $dom = new Dom(); - $dom->setOptions(['htmlSpecialCharsDecode' => true]); + $dom->setOptions((new Options())->setHtmlSpecialCharsDecode(true)); $dom->loadStr('Next >'); $a = $dom->find('a', 0); $this->assertEquals('Next >', $a->innerHtml); @@ -563,7 +563,7 @@ public function testLoadGetAttributeWithBackslash() public function test25ChildrenFound() { $dom = new Dom(); - $dom->setOptions(['whitespaceTextNode' => false]); + $dom->setOptions((new Options())->setWhitespaceTextNode(false)); $dom->loadFromFile('tests/data/files/51children.html'); $children = $dom->find('#red-line-g *'); $this->assertEquals(25, \count($children)); @@ -596,22 +596,21 @@ public function testLessThanCharacterInJavascript() $results = (new Dom())->loadStr('
', - [ - 'cleanupInput' => false, - 'removeScripts' => false - ])->find('body'); + (new Options())->setCleanupInput(false) + ->setRemoveScripts(false) + )->find('body'); $this->assertCount(1, $results); } public function testUniqueIdForAllObjects() { // Create a dom which will be used as a parent/container for a paragraph - $dom1 = new \PHPHtmlParser\Dom; + $dom1 = new \PHPHtmlParser\Dom(); $dom1->loadStr('
A container div
'); // Resets the counter (doesn't matter here as the counter was 0 even without resetting) $div = $dom1->firstChild(); // Create a paragraph outside of the first dom - $dom2 = new \PHPHtmlParser\Dom; + $dom2 = new \PHPHtmlParser\Dom(); $dom2->loadStr('

Our new paragraph.

'); // Resets the counter $paragraph = $dom2->firstChild(); @@ -647,7 +646,6 @@ public function testCompatibleWithWordPressShortcode() $node = $dom->find('p', 0); $this->assertEquals(' [wprs_alert type="success" content="this is a short code" /] ', $node->innerHtml); - } public function testBrokenHtml() diff --git a/tests/Options/CleanupTest.php b/tests/Options/CleanupTest.php index b7e5325e..914078ac 100755 --- a/tests/Options/CleanupTest.php +++ b/tests/Options/CleanupTest.php @@ -3,6 +3,7 @@ declare(strict_types=1); use PHPHtmlParser\Dom; +use PHPHtmlParser\Options; use PHPUnit\Framework\TestCase; class CleanupTest extends TestCase @@ -10,9 +11,7 @@ class CleanupTest extends TestCase public function testCleanupInputTrue() { $dom = new Dom(); - $dom->setOptions([ - 'cleanupInput' => true, - ]); + $dom->setOptions((new Options())->setCleanupInput(true)); $dom->loadFromFile('tests/data/files/big.html'); $this->assertEquals(0, \count($dom->find('style'))); $this->assertEquals(0, \count($dom->find('script'))); @@ -21,9 +20,7 @@ public function testCleanupInputTrue() public function testCleanupInputFalse() { $dom = new Dom(); - $dom->setOptions([ - 'cleanupInput' => false, - ]); + $dom->setOptions((new Options())->setCleanupInput(false)); $dom->loadFromFile('tests/data/files/big.html'); $this->assertEquals(1, \count($dom->find('style'))); $this->assertEquals(22, \count($dom->find('script'))); @@ -32,9 +29,7 @@ public function testCleanupInputFalse() public function testRemoveStylesTrue() { $dom = new Dom(); - $dom->setOptions([ - 'removeStyles' => true, - ]); + $dom->setOptions((new Options())->setRemoveStyles(true)); $dom->loadFromFile('tests/data/files/big.html'); $this->assertEquals(0, \count($dom->find('style'))); } @@ -42,9 +37,7 @@ public function testRemoveStylesTrue() public function testRemoveStylesFalse() { $dom = new Dom(); - $dom->setOptions([ - 'removeStyles' => false, - ]); + $dom->setOptions((new Options())->setRemoveStyles(false)); 
$dom->loadFromFile('tests/data/files/big.html'); $this->assertEquals(1, \count($dom->find('style'))); $this->assertEquals('text/css', @@ -54,9 +47,7 @@ public function testRemoveStylesFalse() public function testRemoveScriptsTrue() { $dom = new Dom(); - $dom->setOptions([ - 'removeScripts' => true, - ]); + $dom->setOptions((new Options())->setRemoveScripts(true)); $dom->loadFromFile('tests/data/files/big.html'); $this->assertEquals(0, \count($dom->find('script'))); } @@ -64,9 +55,7 @@ public function testRemoveScriptsTrue() public function testRemoveScriptsFalse() { $dom = new Dom(); - $dom->setOptions([ - 'removeScripts' => false, - ]); + $dom->setOptions((new Options())->setRemoveScripts(false)); $dom->loadFromFile('tests/data/files/big.html'); $this->assertEquals(22, \count($dom->find('script'))); $this->assertEquals('text/javascript', @@ -85,9 +74,7 @@ public function testSmartyScripts() public function testSmartyScriptsDisabled() { $dom = new Dom(); - $dom->setOptions([ - 'removeSmartyScripts' => false, - ]); + $dom->setOptions((new Options())->setRemoveSmartyScripts(false)); $dom->loadStr(' aa={123} '); diff --git a/tests/Options/PreserveLineBreaks.php b/tests/Options/PreserveLineBreaks.php index ad095a38..be396490 100755 --- a/tests/Options/PreserveLineBreaks.php +++ b/tests/Options/PreserveLineBreaks.php @@ -3,6 +3,7 @@ declare(strict_types=1); use PHPHtmlParser\Dom; +use PHPHtmlParser\Options; use PHPUnit\Framework\TestCase; class PreserveLineBreaks extends TestCase @@ -10,9 +11,8 @@ class PreserveLineBreaks extends TestCase public function testPreserveLineBreakTrue() { $dom = new Dom(); - $dom->setOptions([ - 'preserveLineBreaks' => true, - ]); + $dom->setOptions((new Options())->setPreserveLineBreaks(true)); + $dom->loadStr('
'); @@ -22,9 +22,7 @@ public function testPreserveLineBreakTrue() public function testPreserveLineBreakBeforeClosingTag() { $dom = new Dom(); - $dom->setOptions([ - 'preserveLineBreaks' => true, - ]); + $dom->setOptions((new Options())->setPreserveLineBreaks(true)); $dom->loadStr('
'); diff --git a/tests/Options/StrictTest.php b/tests/Options/StrictTest.php index 96d457b7..709f292d 100755 --- a/tests/Options/StrictTest.php +++ b/tests/Options/StrictTest.php @@ -4,6 +4,7 @@ use PHPHtmlParser\Dom; use PHPHtmlParser\Exceptions\StrictException; +use PHPHtmlParser\Options; use PHPUnit\Framework\TestCase; class StrictTest extends TestCase @@ -11,9 +12,7 @@ class StrictTest extends TestCase public function testConfigStrict() { $dom = new Dom(); - $dom->setOptions([ - 'strict' => true, - ]); + $dom->setOptions((new Options())->setStrict(true)); $dom->loadStr('

Hey you

Ya you!

'); $this->assertEquals(' ', $dom->getElementById('hey')->nextSibling()->text); } @@ -21,9 +20,7 @@ public function testConfigStrict() public function testConfigStrictMissingSelfClosing() { $dom = new Dom(); - $dom->setOptions([ - 'strict' => true, - ]); + $dom->setOptions((new Options())->setStrict(true)); try { // should throw an exception $dom->loadStr('

Hey you


Ya you!

'); @@ -37,9 +34,7 @@ public function testConfigStrictMissingSelfClosing() public function testConfigStrictMissingAttribute() { $dom = new Dom(); - $dom->setOptions([ - 'strict' => true, - ]); + $dom->setOptions((new Options())->setStrict(true)); try { // should throw an exception $dom->loadStr('

Hey you

Ya you!

'); @@ -53,9 +48,7 @@ public function testConfigStrictMissingAttribute() public function testConfigStrictBRTag() { $dom = new Dom(); - $dom->setOptions([ - 'strict' => true, - ]); + $dom->setOptions((new Options())->setStrict(true)); $dom->loadStr('
'); $this->assertTrue(true); } diff --git a/tests/Options/WhitespaceTextNodeTest.php b/tests/Options/WhitespaceTextNodeTest.php index 0097f28d..245ef7f0 100755 --- a/tests/Options/WhitespaceTextNodeTest.php +++ b/tests/Options/WhitespaceTextNodeTest.php @@ -3,6 +3,7 @@ declare(strict_types=1); use PHPHtmlParser\Dom; +use PHPHtmlParser\Options; use PHPUnit\Framework\TestCase; class WhitespaceTextNodeTest extends TestCase @@ -10,9 +11,7 @@ class WhitespaceTextNodeTest extends TestCase public function testConfigGlobalNoWhitespaceTextNode() { $dom = new Dom(); - $dom->setOptions([ - 'whitespaceTextNode' => false, - ]); + $dom->setOptions((new Options())->setWhitespaceTextNode(false)); $dom->loadStr('

Hey you

Ya you!

'); $this->assertEquals('Ya you!', $dom->getElementById('hey')->nextSibling()->text); } @@ -20,12 +19,8 @@ public function testConfigGlobalNoWhitespaceTextNode() public function testConfigLocalOverride() { $dom = new Dom(); - $dom->setOptions([ - 'whitespaceTextNode' => false, - ]); - $dom->loadStr('

Hey you

Ya you!

', [ - 'whitespaceTextNode' => true, - ]); + $dom->setOptions((new Options())->setWhitespaceTextNode(false)); + $dom->loadStr('

Hey you

Ya you!

', (new Options())->setWhitespaceTextNode(true)); $this->assertEquals(' ', $dom->getElementById('hey')->nextSibling()->text); } } diff --git a/tests/OptionsTest.php b/tests/OptionsTest.php index a78f508f..f7406a14 100755 --- a/tests/OptionsTest.php +++ b/tests/OptionsTest.php @@ -2,8 +2,6 @@ declare(strict_types=1); -use PHPHtmlParser\Dom; -use PHPHtmlParser\Exceptions\UnknownOptionException; use PHPHtmlParser\Options; use PHPUnit\Framework\TestCase; @@ -13,137 +11,62 @@ public function testDefaultWhitespaceTextNode() { $options = new Options(); - $this->assertTrue($options->whitespaceTextNode); + $this->assertTrue($options->isWhitespaceTextNode()); } public function testSettingOption() { $options = new Options(); - $options->setOptions([ - 'strict' => true, - ]); - - $this->assertTrue($options->strict); - } - - public function testAddingOption() - { - $this->expectException(UnknownOptionException::class); + $options->setStrict(true); - $options = new Options(); - $options->setOptions([ - 'test' => true, - ]); + $this->assertTrue($options->isStrict()); } public function testOverwritingOption() { $options = new Options(); - $options->setOptions([ - 'strict' => false, - ])->setOptions([ - 'strict' => true, - 'whitespaceTextNode' => false, - ]); - - $this->assertTrue($options->get('strict')); - $this->assertFalse($options->get('whitespaceTextNode')); - } + $options->setStrict(false); + $options2 = new Options(); + $options2->setStrict(true); + $options2->setWhitespaceTextNode(false); + $options->setFromOptions($options2); - public function testGettingNoOption() - { - $options = new Options(); - $this->assertEquals(null, $options->get('doesnotexist')); + $this->assertTrue($options->isStrict()); + $this->assertFalse($options->isWhitespaceTextNode()); } public function testSetters() { $options = new Options(); - $options->setOptions([ - 'whitespaceTextNode' => false, - 'strict' => false, - 'enforceEncoding' => null, - 'cleanupInput' => false, - 'removeScripts' => false, 
- 'removeStyles' => false, - 'preserveLineBreaks' => false, - 'removeDoubleSpace' => false, - 'removeSmartyScripts' => false, - 'htmlSpecialCharsDecode' => false, - ]); - $options->setWhitespaceTextNode(true); - $this->assertTrue($options->get('whitespaceTextNode')); + $this->assertTrue($options->isWhitespaceTextNode()); $options->setStrict(true); - $this->assertTrue($options->get('strict')); + $this->assertTrue($options->isStrict()); $options->setEnforceEncoding('utf8'); - $this->assertEquals('utf8', $options->get('enforceEncoding')); + $this->assertEquals('utf8', $options->getEnforceEncoding()); $options->setCleanupInput(true); - $this->assertTrue($options->get('cleanupInput')); + $this->assertTrue($options->isCleanupInput()); $options->setRemoveScripts(true); - $this->assertTrue($options->get('removeScripts')); + $this->assertTrue($options->isRemoveScripts()); $options->setRemoveStyles(true); - $this->assertTrue($options->get('removeStyles')); + $this->assertTrue($options->isRemoveStyles()); $options->setPreserveLineBreaks(true); - $this->assertTrue($options->get('preserveLineBreaks')); + $this->assertTrue($options->isPreserveLineBreaks()); $options->setRemoveDoubleSpace(true); - $this->assertTrue($options->get('removeDoubleSpace')); + $this->assertTrue($options->isRemoveDoubleSpace()); $options->setRemoveSmartyScripts(true); - $this->assertTrue($options->get('removeSmartyScripts')); + $this->assertTrue($options->isRemoveSmartyScripts()); $options->setHtmlSpecialCharsDecode(true); - $this->assertTrue($options->get('htmlSpecialCharsDecode')); - - // now reset to false - - $options->setWhitespaceTextNode(false); - $this->assertFalse($options->get('whitespaceTextNode')); - - $options->setStrict(false); - $this->assertFalse($options->get('strict')); - - $options->setEnforceEncoding(null); - $this->assertNull($options->get('enforceEncoding')); - - $options->setCleanupInput(false); - $this->assertFalse($options->get('cleanupInput')); - - 
$options->setRemoveScripts(false); - $this->assertFalse($options->get('removeScripts')); - - $options->setRemoveStyles(false); - $this->assertFalse($options->get('removeStyles')); - - $options->setPreserveLineBreaks(false); - $this->assertFalse($options->get('preserveLineBreaks')); - - $options->setRemoveDoubleSpace(false); - $this->assertFalse($options->get('removeDoubleSpace')); - - $options->setRemoveSmartyScripts(false); - $this->assertFalse($options->get('removeSmartyScripts')); - - $options->setHtmlSpecialCharsDecode(false); - $this->assertFalse($options->get('htmlSpecialCharsDecode')); - } - - public function testUnknownOptionDom() - { - $dom = new Dom(); - $dom->setOptions([ - 'unknown_option' => true, - ]); - - $this->expectException(UnknownOptionException::class); - $dom->loadStr('
'); + $this->assertTrue($options->isHtmlSpecialCharsDecode()); } } diff --git a/tests/StaticDomTest.php b/tests/StaticDomTest.php index fbc1a5bd..73453633 100755 --- a/tests/StaticDomTest.php +++ b/tests/StaticDomTest.php @@ -73,7 +73,7 @@ public function testLoadFromUrl() ->once() ->andReturn($responseMock); - Dom::loadFromUrl('http://google.com', [], $clientMock); + Dom::loadFromUrl('http://google.com', null, $clientMock); $this->assertEquals('VonBurgermeister', Dom::find('.post-row div .post-user font', 0)->text); } } From 7bba8adf4348c42e877275a6740689b90ef5707d Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Wed, 15 Jul 2020 02:09:17 +0000 Subject: [PATCH 45/68] Cleaned up the code --- src/PHPHtmlParser/DTO/Tag/AttributeDTO.php | 4 +- src/PHPHtmlParser/DTO/TagDTO.php | 70 +++++++++ src/PHPHtmlParser/Dom.php | 166 +++------------------ src/PHPHtmlParser/Options.php | 134 +++++++++++++++++ tests/DomTest.php | 46 +----- tests/Options/NoSlashTest.php | 45 ++++++ tests/Options/SelfClosingTest.php | 45 ++++++ 7 files changed, 321 insertions(+), 189 deletions(-) create mode 100644 src/PHPHtmlParser/DTO/TagDTO.php create mode 100644 tests/Options/NoSlashTest.php create mode 100644 tests/Options/SelfClosingTest.php diff --git a/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php b/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php index 1f15c492..6ac22197 100755 --- a/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php +++ b/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php @@ -17,12 +17,12 @@ final class AttributeDTO /** * @var bool */ - private $doubleQuote = true; + private $doubleQuote; public function __construct(array $values) { $this->value = $values['value']; - $this->doubleQuote = $values['doubleQuote']; + $this->doubleQuote = $values['doubleQuote'] ?? 
true; } public function getValue(): ?string diff --git a/src/PHPHtmlParser/DTO/TagDTO.php b/src/PHPHtmlParser/DTO/TagDTO.php new file mode 100644 index 00000000..d1c365e1 --- /dev/null +++ b/src/PHPHtmlParser/DTO/TagDTO.php @@ -0,0 +1,70 @@ +status = $values['status'] ?? false; + $this->closing = $values['closing'] ?? false; + $this->node = $values['node'] ?? null; + $this->tag = $values['tag'] ?? null; + } + + /** + * @return bool + */ + public function isStatus(): bool + { + return $this->status; + } + + /** + * @return bool + */ + public function isClosing(): bool + { + return $this->closing; + } + + /** + * @return mixed + */ + public function getNode(): ?HtmlNode + { + return $this->node; + } + + /** + * @return mixed + */ + public function getTag(): ?string + { + return $this->tag; + } +} diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index d2db15e2..68c8d144 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -10,6 +10,7 @@ use PHPHtmlParser\Dom\Collection; use PHPHtmlParser\Dom\HtmlNode; use PHPHtmlParser\Dom\TextNode; +use PHPHtmlParser\DTO\TagDTO; use PHPHtmlParser\Enum\StringToken; use PHPHtmlParser\Exceptions\ChildNotFoundException; use PHPHtmlParser\Exceptions\CircularException; @@ -84,46 +85,17 @@ class Dom */ private $options; - /** - * A list of tags which will always be self closing. - * - * @var array - */ - private $selfClosing = [ - 'area', - 'base', - 'basefont', - 'br', - 'col', - 'embed', - 'hr', - 'img', - 'input', - 'keygen', - 'link', - 'meta', - 'param', - 'source', - 'spacer', - 'track', - 'wbr', - ]; - - /** - * A list of tags where there should be no /> at the end (html5 style). - * - * @var array - */ - private $noSlash = []; - /** * Returns the inner html of the root node. 
* * @throws ChildNotFoundException * @throws UnknownChildTypeException + * @throws NotLoadedException */ public function __toString(): string { + $this->isLoaded(); + return $this->root->innerHtml(); } @@ -132,10 +104,14 @@ public function __toString(): string * * @param string $name * + * @throws NotLoadedException + * * @return mixed */ public function __get($name) { + $this->isLoaded(); + return $this->root->$name; } @@ -242,100 +218,6 @@ public function find(string $selector, int $nth = null) return $this->root->find($selector, $nth); } - /** - * Adds the tag (or tags in an array) to the list of tags that will always - * be self closing. - * - * @param string|array $tag - * @chainable - */ - public function addSelfClosingTag($tag): Dom - { - if (!\is_array($tag)) { - $tag = [$tag]; - } - foreach ($tag as $value) { - $this->selfClosing[] = $value; - } - - return $this; - } - - /** - * Removes the tag (or tags in an array) from the list of tags that will - * always be self closing. - * - * @param string|array $tag - * @chainable - */ - public function removeSelfClosingTag($tag): Dom - { - if (!\is_array($tag)) { - $tag = [$tag]; - } - $this->selfClosing = \array_diff($this->selfClosing, $tag); - - return $this; - } - - /** - * Sets the list of self closing tags to empty. - * - * @chainable - */ - public function clearSelfClosingTags(): Dom - { - $this->selfClosing = []; - - return $this; - } - - /** - * Adds a tag to the list of self closing tags that should not have a trailing slash. - * - * @param $tag - * @chainable - */ - public function addNoSlashTag($tag): Dom - { - if (!\is_array($tag)) { - $tag = [$tag]; - } - foreach ($tag as $value) { - $this->noSlash[] = $value; - } - - return $this; - } - - /** - * Removes a tag from the list of no-slash tags. 
- * - * @param $tag - * @chainable - */ - public function removeNoSlashTag($tag): Dom - { - if (!\is_array($tag)) { - $tag = [$tag]; - } - $this->noSlash = \array_diff($this->noSlash, $tag); - - return $this; - } - - /** - * Empties the list of no-slash tags. - * - * @chainable - */ - public function clearNoSlashTags(): Dom - { - $this->noSlash = []; - - return $this; - } - /** * Simple wrapper function that returns the first child. * @@ -574,18 +456,18 @@ private function parse(): void $str = $this->content->copyUntil('<'); } if ($str == '') { - $info = $this->parseTag(); - if (!$info['status']) { + $tagDTO = $this->parseTag(); + if (!$tagDTO->isStatus()) { // we are done here $activeNode = null; continue; } // check if it was a closing tag - if ($info['closing']) { + if ($tagDTO->isClosing()) { $foundOpeningTag = true; $originalNode = $activeNode; - while ($activeNode->getTag()->name() != $info['tag']) { + while ($activeNode->getTag()->name() != $tagDTO->getTag()) { $activeNode = $activeNode->getParent(); if ($activeNode === null) { // we could not find opening tag @@ -600,12 +482,12 @@ private function parse(): void continue; } - if (!isset($info['node'])) { + if ($tagDTO->getNode() === null) { continue; } /** @var AbstractNode $node */ - $node = $info['node']; + $node = $tagDTO->getNode(); $activeNode->addChild($node); // check if node is self closing @@ -628,7 +510,7 @@ private function parse(): void * * @throws StrictException */ - private function parseTag(): array + private function parseTag(): TagDTO { $return = [ 'status' => false, @@ -637,7 +519,7 @@ private function parseTag(): array ]; if ($this->content->char() != '<') { // we are not at the beginning of a tag - return $return; + return new TagDTO(); } // check if this is a closing tag @@ -645,7 +527,7 @@ private function parseTag(): array $this->content->fastForward(1); } catch (ContentLengthException $exception) { // we are at the end of the file - return $return; + return new TagDTO(); } if 
($this->content->char() == '/') { // end tag @@ -657,22 +539,22 @@ private function parseTag(): array // check if this closing tag counts $tag = \strtolower($tag); - if (\in_array($tag, $this->selfClosing, true)) { + if (\in_array($tag, $this->options->getSelfClosing(), true)) { $return['status'] = true; - return $return; + return new TagDTO($return); } $return['status'] = true; $return['closing'] = true; $return['tag'] = \strtolower($tag); - return $return; + return new TagDTO($return); } $tag = \strtolower($this->content->copyByToken(StringToken::SLASH(), true)); if (\trim($tag) == '') { // no tag found, invalid < found - return $return; + return new TagDTO(); } $node = new HtmlNode($tag); $node->setHtmlSpecialCharsDecode($this->options->isHtmlSpecialCharsDecode()); @@ -754,7 +636,7 @@ private function parseTag(): array // self closing tag $node->getTag()->selfClosing(); $this->content->fastForward(1); - } elseif (\in_array($tag, $this->selfClosing, true)) { + } elseif (\in_array($tag, $this->options->getSelfClosing(), true)) { // Should be a self closing tag, check if we are strict if ($this->options->isStrict()) { $character = $this->content->getPosition(); @@ -765,7 +647,7 @@ private function parseTag(): array $node->getTag()->selfClosing(); // Should this tag use a trailing slash? - if (\in_array($tag, $this->noSlash, true)) { + if (\in_array($tag, $this->options->getNoSlash(), true)) { $node->getTag()->noTrailingSlash(); } } @@ -777,7 +659,7 @@ private function parseTag(): array $return['status'] = true; $return['node'] = $node; - return $return; + return new TagDTO($return); } /** diff --git a/src/PHPHtmlParser/Options.php b/src/PHPHtmlParser/Options.php index 9d1dfbfd..d995dbc8 100755 --- a/src/PHPHtmlParser/Options.php +++ b/src/PHPHtmlParser/Options.php @@ -88,6 +88,38 @@ class Options */ private $htmlSpecialCharsDecode = false; + /** + * A list of tags which will always be self closing. 
+ * + * @var array + */ + private $selfClosing = [ + 'area', + 'base', + 'basefont', + 'br', + 'col', + 'embed', + 'hr', + 'img', + 'input', + 'keygen', + 'link', + 'meta', + 'param', + 'source', + 'spacer', + 'track', + 'wbr', + ]; + + /** + * A list of tags where there should be no /> at the end (html5 style). + * + * @var array + */ + private $noSlash = []; + public function isWhitespaceTextNode(): bool { return $this->whitespaceTextNode; @@ -208,6 +240,106 @@ public function setHtmlSpecialCharsDecode(bool $htmlSpecialCharsDecode): Options return $this; } + public function getSelfClosing(): array + { + return $this->selfClosing; + } + + public function setSelfClosing(array $selfClosing): Options + { + $this->selfClosing = $selfClosing; + + return $this; + } + + /** + * Adds the tag to the list of tags that will always be self closing. + */ + public function addSelfClosingTag(string $tag): Options + { + $this->selfClosing[] = $tag; + + return $this; + } + + /** + * Adds the tags to the list of tags that will always be self closing. + * + * @param string[] $tags + */ + public function addSelfClosingTags(array $tags): Options + { + foreach ($tags as $tag) { + $this->selfClosing[] = $tag; + } + + return $this; + } + + /** + * Removes the tag from the list of tags that will always be self closing. + */ + public function removeSelfClosingTag(string $tag): Options + { + $tags = [$tag]; + $this->selfClosing = \array_diff($this->selfClosing, $tags); + + return $this; + } + + /** + * Sets the list of self closing tags to empty. + */ + public function clearSelfClosingTags(): Options + { + $this->selfClosing = []; + + return $this; + } + + public function getNoSlash(): array + { + return $this->noSlash; + } + + public function setNoSlash(array $noSlash): Options + { + $this->noSlash = $noSlash; + + return $this; + } + + /** + * Adds a tag to the list of self closing tags that should not have a trailing slash. 
+ */ + public function addNoSlashTag(string $tag): Options + { + $this->noSlash[] = $tag; + + return $this; + } + + /** + * Removes a tag from the list of no-slash tags. + */ + public function removeNoSlashTag(string $tag): Options + { + $tags = [$tag]; + $this->noSlash = \array_diff($this->noSlash, $tags); + + return $this; + } + + /** + * Empties the list of no-slash tags. + */ + public function clearNoSlashTags(): Options + { + $this->noSlash = []; + + return $this; + } + public function setFromOptions(Options $options): void { $this->setCleanupInput($options->isCleanupInput()); @@ -220,5 +352,7 @@ public function setFromOptions(Options $options): void $this->setRemoveStyles($options->isRemoveStyles()); $this->setStrict($options->isStrict()); $this->setWhitespaceTextNode($options->isWhitespaceTextNode()); + $this->setSelfClosing($options->getSelfClosing()); + $this->setNoSlash($options->getNoSlash()); } } diff --git a/tests/DomTest.php b/tests/DomTest.php index 81396962..96756d6c 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -104,50 +104,6 @@ public function testLoadClosingTagOnSelfClosing() $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); } - public function testLoadClosingTagOnSelfClosingNoSlash() - { - $dom = new Dom(); - $dom->addNoSlashTag('br'); - - $dom->loadStr('

Hey bro, click here

'); - $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); - } - - public function testLoadClosingTagAddSelfClosingTag() - { - $dom = new Dom(); - $dom->addSelfClosingTag('mytag'); - $dom->loadStr('

Hey bro, click here

'); - $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); - } - - public function testLoadClosingTagAddSelfClosingTagArray() - { - $dom = new Dom(); - $dom->addSelfClosingTag([ - 'mytag', - 'othertag', - ]); - $dom->loadStr('

Hey bro, click here

'); - $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); - } - - public function testLoadClosingTagRemoveSelfClosingTag() - { - $dom = new Dom(); - $dom->removeSelfClosingTag('br'); - $dom->loadStr('

Hey bro, click here

'); - $this->assertEquals('

Hey bro, click here


', $dom->find('div', 0)->innerHtml); - } - - public function testLoadClosingTagClearSelfClosingTag() - { - $dom = new Dom(); - $dom->clearSelfClosingTags(); - $dom->loadStr('

Hey bro, click here

'); - $this->assertEquals('

Hey bro, click here


', $dom->find('div', 0)->innerHtml); - } - public function testLoadNoValueAttribute() { $dom = new Dom(); @@ -239,7 +195,7 @@ public function testLoadFileBigTwicePreserveOption() { $dom = new Dom(); $dom->loadFromFile('tests/data/files/big.html', - (new Options)->setPreserveLineBreaks(true)); + (new Options())->setPreserveLineBreaks(true)); $post = $dom->find('.post-row', 0); $this->assertEquals( "

Журчанье воды
\nЧерно-белые тени
\nВновь на фонтане

", diff --git a/tests/Options/NoSlashTest.php b/tests/Options/NoSlashTest.php new file mode 100644 index 00000000..93370b56 --- /dev/null +++ b/tests/Options/NoSlashTest.php @@ -0,0 +1,45 @@ +setOptions((new Options())->addNoSlashTag('br')); + + $dom->loadStr('

Hey bro, click here

'); + $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); + } + + public function testLoadClosingTagOnSelfClosingRemoveNoSlash() + { + $dom = new Dom(); + $dom->setOptions( + (new Options()) + ->addNoSlashTag('br') + ->removeNoSlashTag('br') + ); + + $dom->loadStr('

Hey bro, click here

'); + $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); + } + + public function testLoadClosingTagOnSelfClosingClearNoSlash() + { + $dom = new Dom(); + $dom->setOptions( + (new Options()) + ->addNoSlashTag('br') + ->clearNoSlashTags() + ); + + $dom->loadStr('

Hey bro, click here

'); + $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); + } +} diff --git a/tests/Options/SelfClosingTest.php b/tests/Options/SelfClosingTest.php new file mode 100644 index 00000000..f1349821 --- /dev/null +++ b/tests/Options/SelfClosingTest.php @@ -0,0 +1,45 @@ +setOptions((new Options())->addSelfClosingTag('mytag')); + $dom->loadStr('

Hey bro, click here

'); + $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); + } + + public function testLoadClosingTagAddSelfClosingTagArray() + { + $dom = new Dom(); + $dom->setOptions((new Options())->addSelfClosingTags([ + 'mytag', + 'othertag', + ])); + $dom->loadStr('

Hey bro, click here

'); + $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); + } + + public function testLoadClosingTagRemoveSelfClosingTag() + { + $dom = new Dom(); + $dom->setOptions((new Options())->removeSelfClosingTag('br')); + $dom->loadStr('

Hey bro, click here

'); + $this->assertEquals('

Hey bro, click here


', $dom->find('div', 0)->innerHtml); + } + + public function testLoadClosingTagClearSelfClosingTag() + { + $dom = new Dom(); + $dom->setOptions((new Options())->clearSelfClosingTags()); + $dom->loadStr('

Hey bro, click here

'); + $this->assertEquals('

Hey bro, click here


', $dom->find('div', 0)->innerHtml); + } +} From 8f43f08fe9f5f4d002e4cf8124a6e48c0d582652 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Wed, 15 Jul 2020 03:13:13 +0000 Subject: [PATCH 46/68] Fixed #215 - Added support for options = new Options(); if ($this->globalOptions !== null) { - $this->options->setFromOptions($this->globalOptions); + $this->options = $this->options->setFromOptions($this->globalOptions); } if ($options !== null) { - $this->options->setFromOptions($options); + $this->options = $this->options->setFromOptions($options); } $this->rawSize = \strlen($str); @@ -194,7 +195,7 @@ public function loadStr(string $str, ?Options $options = null): Dom /** * Sets a global options array to be used by all load calls. * - * @chainable + * */ public function setOptions(Options $options): Dom { @@ -512,11 +513,7 @@ private function parse(): void */ private function parseTag(): TagDTO { - $return = [ - 'status' => false, - 'closing' => false, - 'node' => null, - ]; + $return = []; if ($this->content->char() != '<') { // we are not at the beginning of a tag return new TagDTO(); @@ -549,12 +546,20 @@ private function parseTag(): TagDTO $return['tag'] = \strtolower($tag); return new TagDTO($return); - } - - $tag = \strtolower($this->content->copyByToken(StringToken::SLASH(), true)); - if (\trim($tag) == '') { - // no tag found, invalid < found - return new TagDTO(); + } elseif ($this->content->char() == '?') { + // special setting tag + $tag = $this->content->fastForward(1) + ->copyByToken(StringToken::SLASH(), true); + $tag = (new Tag($tag)) + ->setOpening('setClosing(' ?>') + ->selfClosing(); + } else { + $tag = \strtolower($this->content->copyByToken(StringToken::SLASH(), true)); + if (\trim($tag) == '') { + // no tag found, invalid < found + return new TagDTO(); + } } $node = new HtmlNode($tag); $node->setHtmlSpecialCharsDecode($this->options->isHtmlSpecialCharsDecode()); @@ -631,23 +636,22 @@ private function parseTag(): TagDTO } 
$this->content->skipByToken(StringToken::BLANK()); - $tag = \strtolower($tag); if ($this->content->char() == '/') { // self closing tag $node->getTag()->selfClosing(); $this->content->fastForward(1); - } elseif (\in_array($tag, $this->options->getSelfClosing(), true)) { + } elseif (\in_array($node->getTag()->name(), $this->options->getSelfClosing(), true)) { // Should be a self closing tag, check if we are strict if ($this->options->isStrict()) { $character = $this->content->getPosition(); - throw new StrictException("Tag '$tag' is not self closing! (character #$character)"); + throw new StrictException("Tag '".$node->getTag()->name()."' is not self closing! (character #$character)"); } // We force self closing on this tag. $node->getTag()->selfClosing(); // Should this tag use a trailing slash? - if (\in_array($tag, $this->options->getNoSlash(), true)) { + if (\in_array($node->getTag()->name(), $this->options->getNoSlash(), true)) { $node->getTag()->noTrailingSlash(); } } diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php index 3d67ab5c..9c89b5ce 100755 --- a/src/PHPHtmlParser/Dom/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/AbstractNode.php @@ -304,7 +304,7 @@ public function getTag(): Tag * Replaces the tag for this node. * * @param string|Tag $tag - * @chainable + * */ public function setTag($tag): AbstractNode { @@ -365,7 +365,7 @@ public function hasAttribute(string $key): bool * A wrapper method that simply calls the setAttribute method * on the tag of this node. * - * @chainable + * */ public function setAttribute(string $key, ?string $value, bool $doubleQuote = true): AbstractNode { diff --git a/src/PHPHtmlParser/Dom/InnerNode.php b/src/PHPHtmlParser/Dom/InnerNode.php index 60c120d7..07f16c7d 100755 --- a/src/PHPHtmlParser/Dom/InnerNode.php +++ b/src/PHPHtmlParser/Dom/InnerNode.php @@ -205,7 +205,7 @@ public function insertAfter(AbstractNode $child, int $id): bool /** * Removes the child by id. 
* - * @chainable + * */ public function removeChild(int $id): InnerNode { diff --git a/src/PHPHtmlParser/Dom/Tag.php b/src/PHPHtmlParser/Dom/Tag.php index 8fc84874..a98590e4 100755 --- a/src/PHPHtmlParser/Dom/Tag.php +++ b/src/PHPHtmlParser/Dom/Tag.php @@ -58,6 +58,19 @@ class Tag */ private $HtmlSpecialCharsDecode = false; + /** + * What the opening of this tag will be. + * + * @var string + */ + private $opening = '<'; + + /** + * What the closing tag for self-closing elements should be. + * @var string + */ + private $closing = ' />'; + /** * Sets up the tag with a name. * @@ -79,25 +92,37 @@ public function name(): string /** * Sets the tag to be self closing. * - * @chainable + * */ public function selfClosing(): Tag { $this->selfClosing = true; - return $this; + return clone $this; + } + + public function setOpening(string $opening): Tag + { + $this->opening = $opening; + return clone $this; + } + + public function setClosing(string $closing): Tag + { + $this->closing = $closing; + return clone $this; } /** * Sets the tag to not use a trailing slash. * - * @chainable + * */ public function noTrailingSlash(): Tag { $this->trailingSlash = false; - return $this; + return clone $this; } /** @@ -131,7 +156,7 @@ public function noise(string $noise): Tag { $this->noise = $noise; - return $this; + return clone $this; } /** @@ -148,7 +173,7 @@ public function setAttribute(string $key, ?string $attributeValue, bool $doubleQ } $this->attr[\strtolower($key)] = $attributeDTO; - return $this; + return clone $this; } /** @@ -296,7 +321,7 @@ public function hasAttribute(string $key) */ public function makeOpeningTag() { - $return = '<' . $this->name; + $return = $this->opening . $this->name; // add the attributes foreach (\array_keys($this->attr) as $key) { @@ -317,7 +342,7 @@ public function makeOpeningTag() } if ($this->selfClosing && $this->trailingSlash) { - return $return . ' />'; + return $return . $this->closing; } return $return . 
'>'; diff --git a/src/PHPHtmlParser/Options.php b/src/PHPHtmlParser/Options.php index d995dbc8..ffd3c734 100755 --- a/src/PHPHtmlParser/Options.php +++ b/src/PHPHtmlParser/Options.php @@ -91,7 +91,7 @@ class Options /** * A list of tags which will always be self closing. * - * @var array + * @var string[] */ private $selfClosing = [ 'area', @@ -116,7 +116,7 @@ class Options /** * A list of tags where there should be no /> at the end (html5 style). * - * @var array + * @var string[] */ private $noSlash = []; @@ -129,7 +129,7 @@ public function setWhitespaceTextNode(bool $whitespaceTextNode): Options { $this->whitespaceTextNode = $whitespaceTextNode; - return $this; + return clone $this; } public function isStrict(): bool @@ -141,7 +141,7 @@ public function setStrict(bool $strict): Options { $this->strict = $strict; - return $this; + return clone $this; } public function getEnforceEncoding(): ?string @@ -153,7 +153,7 @@ public function setEnforceEncoding(?string $enforceEncoding): Options { $this->enforceEncoding = $enforceEncoding; - return $this; + return clone $this; } public function isCleanupInput(): bool @@ -165,7 +165,7 @@ public function setCleanupInput(bool $cleanupInput): Options { $this->cleanupInput = $cleanupInput; - return $this; + return clone $this; } public function isRemoveScripts(): bool @@ -177,7 +177,7 @@ public function setRemoveScripts(bool $removeScripts): Options { $this->removeScripts = $removeScripts; - return $this; + return clone $this; } public function isRemoveStyles(): bool @@ -189,7 +189,7 @@ public function setRemoveStyles(bool $removeStyles): Options { $this->removeStyles = $removeStyles; - return $this; + return clone $this; } public function isPreserveLineBreaks(): bool @@ -201,7 +201,7 @@ public function setPreserveLineBreaks(bool $preserveLineBreaks): Options { $this->preserveLineBreaks = $preserveLineBreaks; - return $this; + return clone $this; } public function isRemoveDoubleSpace(): bool @@ -213,7 +213,7 @@ public function 
setRemoveDoubleSpace(bool $removeDoubleSpace): Options { $this->removeDoubleSpace = $removeDoubleSpace; - return $this; + return clone $this; } public function isRemoveSmartyScripts(): bool @@ -225,7 +225,7 @@ public function setRemoveSmartyScripts(bool $removeSmartyScripts): Options { $this->removeSmartyScripts = $removeSmartyScripts; - return $this; + return clone $this; } public function isHtmlSpecialCharsDecode(): bool @@ -237,9 +237,12 @@ public function setHtmlSpecialCharsDecode(bool $htmlSpecialCharsDecode): Options { $this->htmlSpecialCharsDecode = $htmlSpecialCharsDecode; - return $this; + return clone $this; } + /** + * @return string[] + */ public function getSelfClosing(): array { return $this->selfClosing; @@ -249,7 +252,7 @@ public function setSelfClosing(array $selfClosing): Options { $this->selfClosing = $selfClosing; - return $this; + return clone $this; } /** @@ -259,7 +262,7 @@ public function addSelfClosingTag(string $tag): Options { $this->selfClosing[] = $tag; - return $this; + return clone $this; } /** @@ -273,7 +276,7 @@ public function addSelfClosingTags(array $tags): Options $this->selfClosing[] = $tag; } - return $this; + return clone $this; } /** @@ -284,7 +287,7 @@ public function removeSelfClosingTag(string $tag): Options $tags = [$tag]; $this->selfClosing = \array_diff($this->selfClosing, $tags); - return $this; + return clone $this; } /** @@ -294,19 +297,25 @@ public function clearSelfClosingTags(): Options { $this->selfClosing = []; - return $this; + return clone $this; } + /** + * @return string[] + */ public function getNoSlash(): array { return $this->noSlash; } + /** + * @param string[] $noSlash + */ public function setNoSlash(array $noSlash): Options { $this->noSlash = $noSlash; - return $this; + return clone $this; } /** @@ -316,7 +325,7 @@ public function addNoSlashTag(string $tag): Options { $this->noSlash[] = $tag; - return $this; + return clone $this; } /** @@ -327,7 +336,7 @@ public function removeNoSlashTag(string $tag): 
Options $tags = [$tag]; $this->noSlash = \array_diff($this->noSlash, $tags); - return $this; + return clone $this; } /** @@ -337,22 +346,24 @@ public function clearNoSlashTags(): Options { $this->noSlash = []; - return $this; + return clone $this; } - public function setFromOptions(Options $options): void + public function setFromOptions(Options $options): Options { - $this->setCleanupInput($options->isCleanupInput()); - $this->setEnforceEncoding($options->getEnforceEncoding()); - $this->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode()); - $this->setPreserveLineBreaks($options->isPreserveLineBreaks()); - $this->setRemoveDoubleSpace($options->isRemoveDoubleSpace()); - $this->setRemoveScripts($options->isRemoveScripts()); - $this->setRemoveSmartyScripts($options->isRemoveSmartyScripts()); - $this->setRemoveStyles($options->isRemoveStyles()); - $this->setStrict($options->isStrict()); - $this->setWhitespaceTextNode($options->isWhitespaceTextNode()); - $this->setSelfClosing($options->getSelfClosing()); - $this->setNoSlash($options->getNoSlash()); + $newOptions = $this->setCleanupInput($options->isCleanupInput()) + ->setEnforceEncoding($options->getEnforceEncoding()) + ->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode()) + ->setPreserveLineBreaks($options->isPreserveLineBreaks()) + ->setRemoveDoubleSpace($options->isRemoveDoubleSpace()) + ->setRemoveScripts($options->isRemoveScripts()) + ->setRemoveSmartyScripts($options->isRemoveSmartyScripts()) + ->setRemoveStyles($options->isRemoveStyles()) + ->setStrict($options->isStrict()) + ->setWhitespaceTextNode($options->isWhitespaceTextNode()) + ->setSelfClosing($options->getSelfClosing()) + ->setNoSlash($options->getNoSlash()); + + return $newOptions; } } diff --git a/tests/DomTest.php b/tests/DomTest.php index 96756d6c..fed044b8 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -611,4 +611,12 @@ public function testBrokenHtml() $this->assertEquals('', $dom->outerHtml); } + + public function 
testXMLOpeningToken() + { + $dom = new Dom(); + $dom->loadStr('

fun time

'); + + $this->assertEquals('

fun time

', $dom->outerHtml); + } } diff --git a/tests/Node/TagTest.php b/tests/Node/TagTest.php index 601e82eb..f14de7e2 100755 --- a/tests/Node/TagTest.php +++ b/tests/Node/TagTest.php @@ -159,8 +159,8 @@ public function testMakeOpeningTagSelfClosing() ], ]; - $tag = new Tag('div'); - $tag->selfClosing() + $tag = (new Tag('div')) + ->selfClosing() ->setAttributes($attr); $this->assertEquals('
', $tag->makeOpeningTag()); } diff --git a/tests/OptionsTest.php b/tests/OptionsTest.php index f7406a14..43dd6fb6 100755 --- a/tests/OptionsTest.php +++ b/tests/OptionsTest.php @@ -29,7 +29,7 @@ public function testOverwritingOption() $options2 = new Options(); $options2->setStrict(true); $options2->setWhitespaceTextNode(false); - $options->setFromOptions($options2); + $options = $options->setFromOptions($options2); $this->assertTrue($options->isStrict()); $this->assertFalse($options->isWhitespaceTextNode()); From a78054fec726427e723bd31fcf10f21403e5e4d7 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Wed, 15 Jul 2020 03:56:54 +0000 Subject: [PATCH 47/68] Cleaned up code --- .travis.yml | 4 ++-- src/PHPHtmlParser/Contracts/DomInterface.php | 23 ++++++++++++++++++++ src/PHPHtmlParser/Dom.php | 5 +++-- src/PHPHtmlParser/Dom/InnerNode.php | 11 ++++++++++ src/PHPHtmlParser/Selector/Seeker.php | 11 +++++----- src/PHPHtmlParser/Selector/Selector.php | 2 -- tests/Selector/SeekerTest.php | 2 +- 7 files changed, 46 insertions(+), 12 deletions(-) create mode 100644 src/PHPHtmlParser/Contracts/DomInterface.php diff --git a/.travis.yml b/.travis.yml index 9ffb2529..a7abcac9 100755 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ script: - mkdir -p build/logs - php vendor/bin/phpunit --coverage-clover build/logs/clover.xml -after_script: - - travis_retry php vendor/bin/coveralls +after_success: + - travis_retry php vendor/bin/php-coveralls -v - wget https://scrutinizer-ci.com/ocular.phar - php ocular.phar code-coverage:upload --format=php-clover build/logs/clover.xml diff --git a/src/PHPHtmlParser/Contracts/DomInterface.php b/src/PHPHtmlParser/Contracts/DomInterface.php new file mode 100644 index 00000000..b803f8f2 --- /dev/null +++ b/src/PHPHtmlParser/Contracts/DomInterface.php @@ -0,0 +1,23 @@ +root; while ($activeNode !== null) { if ($activeNode && $activeNode->tag->name() === 'script' - && $this->options->isCleanupInput() != true + && 
$this->options->isCleanupInput() !== true ) { $str = $this->content->copyUntil('children = $combination; // tell child I am the new parent @@ -300,6 +305,8 @@ public function isChild(int $id): bool /** * Removes the child with id $childId and replace it with the new child * $newChild. + * + * @throws LogicalException */ public function replaceChild(int $childId, AbstractNode $newChild): void { @@ -312,6 +319,10 @@ public function replaceChild(int $childId, AbstractNode $newChild): void $index = \array_search($childId, $keys, true); $keys[$index] = $newChild->id(); $combination = \array_combine($keys, $this->children); + if ($combination === false) { + // The number of elements for each array isn't equal or if the arrays are empty. + throw new LogicalException('array combine failed during replace child method call.'); + } $this->children = $combination; $this->children[$newChild->id()] = [ 'prev' => $oldChild['prev'], diff --git a/src/PHPHtmlParser/Selector/Seeker.php b/src/PHPHtmlParser/Selector/Seeker.php index ca92cb29..523aa42e 100644 --- a/src/PHPHtmlParser/Selector/Seeker.php +++ b/src/PHPHtmlParser/Selector/Seeker.php @@ -63,12 +63,12 @@ public function seek(array $nodes, RuleDTO $rule, array $options): array } $pass = $this->checkTag($rule, $child); - if ($pass && $rule->getKey() != null) { + if ($pass && $rule->getKey() !== null) { $pass = $this->checkKey($rule, $child); } if ($pass && - $rule->getKey() != null && - $rule->getValue() != null && + $rule->getKey() !== null && + $rule->getValue() !== null && $rule->getValue() != '*' ) { $pass = $this->checkComparison($rule, $child); @@ -238,8 +238,9 @@ private function checkNodeValue( ): bool { $check = false; if ( - $rule->getValue() != null && - \is_string($rule->getValue()) + $rule->getValue() !== null && + \is_string($rule->getValue()) && + $nodeValue !== null ) { $check = $this->match($rule->getOperator(), $rule->getValue(), $nodeValue); } diff --git a/src/PHPHtmlParser/Selector/Selector.php 
b/src/PHPHtmlParser/Selector/Selector.php index 4c45da01..7179ee1f 100755 --- a/src/PHPHtmlParser/Selector/Selector.php +++ b/src/PHPHtmlParser/Selector/Selector.php @@ -48,8 +48,6 @@ public function __construct(string $selector, ?ParserInterface $parser = null, ? /** * Returns the selectors that where found in __construct. - * - * @return array */ public function getParsedSelectorCollectionDTO(): ParsedSelectorCollectionDTO { diff --git a/tests/Selector/SeekerTest.php b/tests/Selector/SeekerTest.php index 4e2d9e4f..a5106e98 100644 --- a/tests/Selector/SeekerTest.php +++ b/tests/Selector/SeekerTest.php @@ -19,7 +19,7 @@ public function testSeekReturnEmptyArray() 'alterNext' => false, ]); $seeker = new Seeker(); - $results = $seeker->seek([], $ruleDTO, [], false); + $results = $seeker->seek([], $ruleDTO, []); $this->assertCount(0, $results); } } From 3f1f6d60e4572c43c50378b14f102bc9caf33f30 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Fri, 17 Jul 2020 03:14:48 +0000 Subject: [PATCH 48/68] Cleaned up tests --- .travis.yml | 1 - phpunit.xml | 45 +++++++------ src/PHPHtmlParser/Dom.php | 2 +- tests/Dom/LoadTest.php | 99 ++++++++++++++++++++++++++++ tests/Dom/NotLoadedTest.php | 34 ++++++++++ tests/DomTest.php | 127 ++++-------------------------------- 6 files changed, 170 insertions(+), 138 deletions(-) create mode 100644 tests/Dom/LoadTest.php create mode 100644 tests/Dom/NotLoadedTest.php diff --git a/.travis.yml b/.travis.yml index a7abcac9..25ba270f 100755 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,6 @@ language: php php: - - 7.1 - 7.2 - 7.3 - 7.4 diff --git a/phpunit.xml b/phpunit.xml index d0aa7db8..04b1d77d 100755 --- a/phpunit.xml +++ b/phpunit.xml @@ -1,26 +1,29 @@ - - - ./tests/ - - + + + ./tests/ + + - - - src - - vendor - - - + + + src + + vendor + + + diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index 10d19144..df5b344e 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -349,7 +349,7 @@ private 
function isLoaded(): void */ private function clean(string $str): string { - if ($this->options->isCleanupInput() != true) { + if (!$this->options->isCleanupInput()) { // skip entire cleanup step return $str; } diff --git a/tests/Dom/LoadTest.php b/tests/Dom/LoadTest.php new file mode 100644 index 00000000..6079d2f5 --- /dev/null +++ b/tests/Dom/LoadTest.php @@ -0,0 +1,99 @@ +loadStr('

Hey bro, click here


'); + $this->dom = $dom; + } + + public function tearDown() + { + Mockery::close(); + } + + public function testLoadEscapeQuotes() + { + $a = $this->dom->find('a', 0); + $this->assertEquals('click here', $a->outerHtml); + } + + public function testLoadNoClosingTag() + { + $p = $this->dom->find('p', 0); + $this->assertEquals('Hey bro, click here', $p->innerHtml); + } + + public function testLoadClosingTagOnSelfClosing() + { + $this->assertCount(2, $this->dom->find('br')); + } + + public function testIncorrectAccess() + { + $div = $this->dom->find('div', 0); + $this->assertEquals(null, $div->foo); + } + + public function testLoadAttributeOnSelfClosing() + { + $br = $this->dom->find('br', 1); + $this->assertEquals('both', $br->getAttribute('class')); + } + + public function testToStringMagic() + { + $this->assertEquals('

Hey bro, click here


', (string) $this->dom); + } + + public function testGetMagic() + { + $this->assertEquals('

Hey bro, click here


', $this->dom->innerHtml); + } + + public function testFirstChild() + { + $this->assertEquals('

Hey bro, click here

', $this->dom->firstChild()->outerHtml); + } + + public function testLastChild() + { + $this->assertEquals('
', $this->dom->lastChild()->outerHtml); + } + + public function testGetElementById() + { + $this->assertEquals('click here', $this->dom->getElementById('78')->outerHtml); + } + + public function testGetElementsByTag() + { + $this->assertEquals('

Hey bro, click here

', $this->dom->getElementsByTag('p')[0]->outerHtml); + } + + public function testGetElementsByClass() + { + $this->assertEquals('

Hey bro, click here

', $this->dom->getElementsByClass('all')[0]->innerHtml); + } + + public function testDeleteNode() + { + $a = $this->dom->find('a')[0]; + $a->delete(); + unset($a); + $this->assertEquals('

Hey bro,


', (string) $this->dom); + } +} diff --git a/tests/Dom/NotLoadedTest.php b/tests/Dom/NotLoadedTest.php new file mode 100644 index 00000000..a8cc42ff --- /dev/null +++ b/tests/Dom/NotLoadedTest.php @@ -0,0 +1,34 @@ +dom = $dom; + } + + public function tearDown() + { + Mockery::close(); + } + + public function testNotLoaded() + { + $this->expectException(NotLoadedException::class); + $div = $this->dom->find('div', 0); + } +} + + diff --git a/tests/DomTest.php b/tests/DomTest.php index fed044b8..fb740235 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -3,6 +3,7 @@ declare(strict_types=1); use PHPHtmlParser\Dom; +use PHPHtmlParser\Exceptions\NotLoadedException; use PHPHtmlParser\Options; use PHPUnit\Framework\TestCase; @@ -25,31 +26,6 @@ public function testParsingCData() $this->assertSame($html, $dom->root->outerHtml()); } - public function testloadStr() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here
:)

'); - $div = $dom->find('div', 0); - $this->assertEquals('

Hey bro, click here
:)

', $div->outerHtml); - } - - /** - * @expectedException \PHPHtmlParser\Exceptions\NotLoadedException - */ - public function testNotLoaded() - { - $dom = new Dom(); - $div = $dom->find('div', 0); - } - - public function testIncorrectAccess() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here
:)

'); - $div = $dom->find('div', 0); - $this->assertEquals(null, $div->foo); - } - public function testLoadSelfclosingAttr() { $dom = new Dom(); @@ -66,14 +42,6 @@ public function testLoadSelfclosingAttrToString() $this->assertEquals('
', (string) $br); } - public function testLoadEscapeQuotes() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here

'); - $div = $dom->find('div', 0); - $this->assertEquals('

Hey bro, click here

', $div->outerHtml); - } - public function testLoadNoOpeningTag() { $dom = new Dom(); @@ -81,29 +49,6 @@ public function testLoadNoOpeningTag() $this->assertEquals('content', $dom->find('.content', 0)->text); } - public function testLoadNoClosingTag() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here


'); - $root = $dom->find('div', 0)->getParent(); - $this->assertEquals('

Hey bro, click here


', $root->outerHtml); - } - - public function testLoadAttributeOnSelfClosing() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here


'); - $br = $dom->find('br', 0); - $this->assertEquals('both', $br->getAttribute('class')); - } - - public function testLoadClosingTagOnSelfClosing() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here

'); - $this->assertEquals('

Hey bro, click here

', $dom->find('div', 0)->innerHtml); - } - public function testLoadNoValueAttribute() { $dom = new Dom(); @@ -223,55 +168,6 @@ public function testLoadFromUrl() $this->assertEquals('VonBurgermeister', $dom->find('.post-row div .post-user font', 0)->text); } - public function testToStringMagic() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here
:)

'); - $this->assertEquals('

Hey bro, click here
:)

', (string) $dom); - } - - public function testGetMagic() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here
:)

'); - $this->assertEquals('

Hey bro, click here
:)

', $dom->innerHtml); - } - - public function testFirstChild() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here


'); - $this->assertEquals('

Hey bro, click here

', $dom->firstChild()->outerHtml); - } - - public function testLastChild() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here


'); - $this->assertEquals('
', $dom->lastChild()->outerHtml); - } - - public function testGetElementById() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here


'); - $this->assertEquals('click here', $dom->getElementById('78')->outerHtml); - } - - public function testGetElementsByTag() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here


'); - $this->assertEquals('

Hey bro, click here

', $dom->getElementsByTag('p')[0]->outerHtml); - } - - public function testGetElementsByClass() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here


'); - $this->assertEquals('

Hey bro, click here

', $dom->getElementsByClass('all')[0]->innerHtml); - } - public function testScriptCleanerScriptTag() { $dom = new Dom(); @@ -321,16 +217,6 @@ public function testCodeTag() $this->assertEquals('hello$foo = "bar";', (string) $dom); } - public function testDeleteNode() - { - $dom = new Dom(); - $dom->loadStr('

Hey bro, click here
:)

'); - $a = $dom->find('a')[0]; - $a->delete(); - unset($a); - $this->assertEquals('

Hey bro,
:)

', (string) $dom); - } - public function testCountChildren() { $dom = new Dom(); @@ -619,4 +505,15 @@ public function testXMLOpeningToken() $this->assertEquals('

fun time

', $dom->outerHtml); } + + /** + * Test to cover issue found in ticket #221 + */ + public function testRandomTagInMiddleOfText() + { + $dom = new Dom(); + $dom->loadStr('

Hello, this is just a test in which <55 names with some other text > should be interpreted as text

'); + + $this->assertEquals('

Hello, this is just a test in which <55 names with some other text> should be interpreted as text

', $dom->outerHtml); + } } From c487fce3c8f931a7dbbbe630666316625c7d247a Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Fri, 17 Jul 2020 04:07:21 +0000 Subject: [PATCH 49/68] Cleaned up code base --- README.md | 12 +- .../Contracts/Dom/CleanerInterface.php | 16 + .../Contracts/Dom/ParserInterface.php | 33 + .../Contracts/Selector/SelectorInterface.php | 4 +- src/PHPHtmlParser/DTO/TagDTO.php | 2 +- .../Discovery/CleanerDiscovery.php | 25 + .../Discovery/DomParserDiscovery.php | 25 + ...covery.php => SelectorParserDiscovery.php} | 2 +- src/PHPHtmlParser/Dom.php | 582 ++---------------- src/PHPHtmlParser/Dom/Cleaner.php | 108 ++++ .../Dom/{ => Node}/AbstractNode.php | 3 +- .../Dom/{ => Node}/ArrayNode.php | 2 +- .../Dom/{ => Node}/Collection.php | 2 +- src/PHPHtmlParser/Dom/{ => Node}/HtmlNode.php | 3 +- .../Dom/{ => Node}/InnerNode.php | 2 +- src/PHPHtmlParser/Dom/{ => Node}/LeafNode.php | 2 +- src/PHPHtmlParser/Dom/{ => Node}/TextNode.php | 3 +- src/PHPHtmlParser/Dom/Parser.php | 332 ++++++++++ src/PHPHtmlParser/Dom/RootAccessTrait.php | 100 +++ src/PHPHtmlParser/Dom/Tag.php | 0 src/PHPHtmlParser/Finder.php | 4 +- src/PHPHtmlParser/Selector/Seeker.php | 6 +- src/PHPHtmlParser/Selector/Selector.php | 8 +- tests/CollectionTest.php | 4 +- tests/Dom/LoadTest.php | 1 - tests/DomTest.php | 5 +- tests/Node/ChildrenTest.php | 2 +- tests/Node/HtmlTest.php | 6 +- tests/Node/ParentTest.php | 2 +- tests/Node/TextTest.php | 2 +- tests/Selector/SelectorTest.php | 2 +- tests/data/MockNode.php | 2 +- 32 files changed, 729 insertions(+), 573 deletions(-) create mode 100644 src/PHPHtmlParser/Contracts/Dom/CleanerInterface.php create mode 100644 src/PHPHtmlParser/Contracts/Dom/ParserInterface.php create mode 100644 src/PHPHtmlParser/Discovery/CleanerDiscovery.php create mode 100644 src/PHPHtmlParser/Discovery/DomParserDiscovery.php rename src/PHPHtmlParser/Discovery/{ParserDiscovery.php => SelectorParserDiscovery.php} (93%) create mode 100644 src/PHPHtmlParser/Dom/Cleaner.php 
rename src/PHPHtmlParser/Dom/{ => Node}/AbstractNode.php (99%) mode change 100755 => 100644 rename src/PHPHtmlParser/Dom/{ => Node}/ArrayNode.php (95%) mode change 100755 => 100644 rename src/PHPHtmlParser/Dom/{ => Node}/Collection.php (98%) mode change 100755 => 100644 rename src/PHPHtmlParser/Dom/{ => Node}/HtmlNode.php (98%) mode change 100755 => 100644 rename src/PHPHtmlParser/Dom/{ => Node}/InnerNode.php (99%) mode change 100755 => 100644 rename src/PHPHtmlParser/Dom/{ => Node}/LeafNode.php (76%) mode change 100755 => 100644 rename src/PHPHtmlParser/Dom/{ => Node}/TextNode.php (98%) mode change 100755 => 100644 create mode 100644 src/PHPHtmlParser/Dom/Parser.php create mode 100644 src/PHPHtmlParser/Dom/RootAccessTrait.php mode change 100755 => 100644 src/PHPHtmlParser/Dom/Tag.php diff --git a/README.md b/README.md index cbd64800..c46d9913 100755 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ You can find many examples of how to use the dom parser and any of its parts (wh ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom; +use PHPHtmlParser\Dom\Node; $dom = new Dom; $dom->loadStr('

Hey bro, click here
:)

'); @@ -46,7 +46,7 @@ You may also seamlessly load a file into the dom instead of a string, which is m ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom; +use PHPHtmlParser\Dom\Node; $dom = new Dom; $dom->loadFromFile('tests/data/big.html'); @@ -79,7 +79,7 @@ Loading a url is very similar to the way you would load the html from a file. ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom; +use PHPHtmlParser\Dom\Node; $dom = new Dom; $dom->loadFromUrl('http://google.com'); @@ -95,7 +95,7 @@ What makes the loadFromUrl method note worthy is the `PHPHtmlParser\CurlInterfac ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom; +use PHPHtmlParser\Dom\Node; use App\Services\Connector; $dom = new Dom; @@ -113,7 +113,7 @@ Loading a string directly, with out the checks in `load()` is also easily done. ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom; +use PHPHtmlParser\Dom\Node; $dom = new Dom; $dom->loadStr('String', []); @@ -130,7 +130,7 @@ You can also set parsing option that will effect the behavior of the parsing eng ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom; +use PHPHtmlParser\Dom\Node; $dom = new Dom; $dom->setOptions([ diff --git a/src/PHPHtmlParser/Contracts/Dom/CleanerInterface.php b/src/PHPHtmlParser/Contracts/Dom/CleanerInterface.php new file mode 100644 index 00000000..e2a4f111 --- /dev/null +++ b/src/PHPHtmlParser/Contracts/Dom/CleanerInterface.php @@ -0,0 +1,16 @@ +isLoaded(); + if ($domParser === null) { + $domParser = DomParserDiscovery::find(); + } + if ($domCleaner === null) { + $domCleaner = CleanerDiscovery::find(); + } - return $this->root->innerHtml(); + $this->domParser = $domParser; + $this->domCleaner = $domCleaner; } /** - * A simple wrapper around the root node. 
- * - * @param string $name + * Returns the inner html of the root node. * + * @throws ChildNotFoundException + * @throws UnknownChildTypeException * @throws NotLoadedException - * - * @return mixed */ - public function __get($name) + public function __toString(): string { $this->isLoaded(); - return $this->root->$name; + return $this->root->innerHtml(); } /** @@ -122,8 +92,9 @@ public function __get($name) * * @throws ChildNotFoundException * @throws CircularException - * @throws StrictException + * @throws Exceptions\ContentLengthException * @throws LogicalException + * @throws StrictException */ public function loadFromFile(string $file, ?Options $options = null): Dom { @@ -139,12 +110,12 @@ public function loadFromFile(string $file, ?Options $options = null): Dom * Use a curl interface implementation to attempt to load * the content from a url. * - * @param ClientInterface $client - * * @throws ChildNotFoundException * @throws CircularException + * @throws Exceptions\ContentLengthException + * @throws LogicalException * @throws StrictException - * @throws \Psr\Http\Client\ClientExceptionInterface + * @throws ClientExceptionInterface */ public function loadFromUrl(string $url, ?Options $options, ?ClientInterface $client = null, ?RequestInterface $request = null): Dom { @@ -167,28 +138,26 @@ public function loadFromUrl(string $url, ?Options $options, ?ClientInterface $cl * * @throws ChildNotFoundException * @throws CircularException + * @throws Exceptions\ContentLengthException + * @throws LogicalException * @throws StrictException */ public function loadStr(string $str, ?Options $options = null): Dom { - $this->options = new Options(); + $localOptions = new Options(); if ($this->globalOptions !== null) { - $this->options = $this->options->setFromOptions($this->globalOptions); + $localOptions = $localOptions->setFromOptions($this->globalOptions); } if ($options !== null) { - $this->options = $this->options->setFromOptions($options); + $localOptions = 
$localOptions->setFromOptions($options); } - $this->rawSize = \strlen($str); - $this->raw = $str; - - $html = $this->clean($str); + $html = $this->domCleaner->clean($str, $localOptions); - $this->size = \strlen($str); $this->content = new Content($html); - $this->parse(); - $this->detectCharset(); + $this->root = $this->domParser->parse($localOptions, $this->content, strlen($str)); + $this->domParser->detectCharset($localOptions, $this->defaultCharset, $this->root); return $this; } @@ -208,78 +177,16 @@ public function setOptions(Options $options): Dom /** * Find elements by css selector on the root node. * - * @throws ChildNotFoundException - * @throws NotLoadedException - * * @return mixed|Collection|null - */ - public function find(string $selector, int $nth = null) - { - $this->isLoaded(); - - return $this->root->find($selector, $nth); - } - - /** - * Simple wrapper function that returns the first child. - * - * @throws ChildNotFoundException * @throws NotLoadedException - */ - public function firstChild(): AbstractNode - { - $this->isLoaded(); - - return $this->root->firstChild(); - } - - /** - * Simple wrapper function that returns the last child. * * @throws ChildNotFoundException - * @throws NotLoadedException - */ - public function lastChild(): AbstractNode - { - $this->isLoaded(); - - return $this->root->lastChild(); - } - - /** - * Simple wrapper function that returns count of child elements. - * - * @throws NotLoadedException - */ - public function countChildren(): int - { - $this->isLoaded(); - - return $this->root->countChildren(); - } - - /** - * Get array of children. - * - * @throws NotLoadedException */ - public function getChildren(): array - { - $this->isLoaded(); - - return $this->root->getChildren(); - } - - /** - * Check if node have children nodes. 
- * - * @throws NotLoadedException - */ - public function hasChildren(): bool + public function find(string $selector, int $nth = null) { $this->isLoaded(); - return $this->root->hasChildren(); + return $this->root->find($selector, $nth); } /** @@ -288,10 +195,10 @@ public function hasChildren(): bool * * @param $id * - * @throws ChildNotFoundException + * @return mixed|Collection|null * @throws NotLoadedException * - * @return mixed|Collection|null + * @throws ChildNotFoundException */ public function getElementById($id) { @@ -304,10 +211,10 @@ public function getElementById($id) * Simple wrapper function that returns all elements by * tag name. * - * @throws ChildNotFoundException + * @return mixed|Collection|null * @throws NotLoadedException * - * @return mixed|Collection|null + * @throws ChildNotFoundException */ public function getElementsByTag(string $name) { @@ -320,10 +227,10 @@ public function getElementsByTag(string $name) * Simple wrapper function that returns all elements by * class name. * - * @throws ChildNotFoundException + * @return mixed|Collection|null * @throws NotLoadedException * - * @return mixed|Collection|null + * @throws ChildNotFoundException */ public function getElementsByClass(string $class) { @@ -343,395 +250,4 @@ private function isLoaded(): void throw new NotLoadedException('Content is not loaded!'); } } - - /** - * Cleans the html of any none-html information. - */ - private function clean(string $str): string - { - if (!$this->options->isCleanupInput()) { - // skip entire cleanup step - return $str; - } - - $is_gzip = 0 === \mb_strpos($str, "\x1f" . "\x8b" . "\x08", 0, 'US-ASCII'); - if ($is_gzip) { - $str = \gzdecode($str); - if ($str === false) { - throw new LogicalException('gzdecode returned false. 
Error when trying to decode the string.'); - } - } - - // remove white space before closing tags - $str = \mb_eregi_replace("'\s+>", "'>", $str); - if ($str === false) { - throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to clean single quotes.'); - } - $str = \mb_eregi_replace('"\s+>', '">', $str); - if ($str === false) { - throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to clean double quotes.'); - } - - // clean out the \n\r - $replace = ' '; - if ($this->options->isPreserveLineBreaks()) { - $replace = ' '; - } - $str = \str_replace(["\r\n", "\r", "\n"], $replace, $str); - if ($str === false) { - throw new LogicalException('str_replace returned false instead of a string. Error when attempting to clean input string.'); - } - - // strip the doctype - $str = \mb_eregi_replace('', '', $str); - if ($str === false) { - throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip the doctype.'); - } - - // strip out comments - $str = \mb_eregi_replace('', '', $str); - if ($str === false) { - throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip comments.'); - } - - // strip out cdata - $str = \mb_eregi_replace("", '', $str); - if ($str === false) { - throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip out cdata.'); - } - - // strip out + + + + + + + + + An Introduction to Custom Fields – WordPress.tv + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ + +

An Introduction to Custom Fields

+ +
+ +
+ +
+
+ +
+
+
+
+

+ 16 responses on “An Introduction to Custom Fields

+ +
    +
  1. + + Alex (Viper007Bond) + +
    + + + + + + +
    +
  2. + +
  3. + + Consciência Planetária + +
    + + + + + +
    + +

    Nice explanation!

    +

    I always wondered how these thumbnails are added to magazine layout themes and I had no idea! Indeed I was more interested on the PHP code of how making it work 😛

    +

    It seems that with custom fields we can do pratically any customization…
    +But I have 2 questions

    +

    Are custom fields limited to varchar values, or can we use any kind of data there? Does it have any size limit?

    +

    Ans what about these themes that have a stick post with a bigger image? Does it have 2 custom fields 1 for thumnail and 1 for stick image?

    +

    tnx again!

    +

    Like

    +
    +
    +
  4. + +
  5. + + Vid + +
    + + + + + +
    + +

    Hi, Scott

    +

    This is very helpful for those of us who aren’t programmers but want to maximize WordPress. Thanks for taking the time to submit this tip.

    +

    Like

    +
    +
    +
  6. + +
  7. + + Thomas Clausen + +
    + + + + + +
    + +

    Justin Tadlocks Get the Image plugin can also help you to solve this task (also without CSS stuff in the php file 😉 ).

    +

    Like

    +
    +
    +
  8. + +
  9. + + driz + +
    + + + + + +
    + +

    I would like to see a follow-up video explaining about that Custom Field Template that you told us to ignore. I know it’s created by coding in some WP hooks in functions.php as I have done it myself, but it would be cool to see your interpretation of doing this, might pick up some additional tips.

    +

    Like

    +
    +
    +
  10. + +
  11. + + Scott Ellis + +
    + + + + + +
    + +

    Thomas, that is a good plugin, Justin does great stuff but custom fileds can be used for a lot of things, images were just an easy example, hopefully viewers will find other creative uses. Not sure what you mean about all the css in the php?

    +

    Like

    +
    +
    +
  12. +
      +
    • + + Thomas Clausen + +
      + + + + + +
      + +

      I was just wondering about the styling, that you’ve got about 3 minutes into the video.

      +

      But I didn’t mean to take our eyes of off things. You’re absolutely right the custom fields can be a powerful tool. And I hadn’t really grasped how easy it is, before I saw your video. Good job.

      +

      Like

      +
      +
      +
    • + +
    + +
  13. + + Karen + +
    + + + + + +
    + +

    This is very hard to follow. I admit, I’m not a newbie, but I am not terribly advanced, so I’m not sure where the problem is. One, I think you’re moving too fast. Two, when exactly should you first see the picture appear in the custom field? After you copy the url to the value field and update? or after you go into the home.php and change the code? That is completely unclear. This is such a great topic, so I hope if I post a few questions, it will become clear how to do this. I really do appreciate that you’ve posted this. Thanks!

    +

    Like

    +
    +
    +
  14. + +
  15. + + Karen + +
    + + + + + +
    + +

    also, my home.php doesn’t have the code you show. I am working in the theme Constructor. below is all the text in the home.php file:

    +

    Like

    +
    +
    +
  16. + +
  17. + + Karen + +
    + + + + + +
    + +

    Woopsie & sorry. below is the code in my file:

    +

    Like

    +
    +
    +
  18. + +
  19. + + Scott Ellis + +
    + + + + + +
    + +

    Consciência, you could use two custom fields for that if you wanted to. It would work just fine. I’ve read about some potential performance issues if you use a lot of custom fields but haven’t experienced it myself. We use several on citycrush.com including for the post thumbnail and the image in the post after you click through. The custom field type in the DB is “longtext” maximum size 4GB.

    +

    Driz – We used a plugin for that previously but moved to adding it to the functions.php and using wp hooks so it sounds like you are doing exactly what we would.

    +

    Thomas, most of what was in there was actual logic or just spitting out the html, not css styling, we keep all of that in the stylesheet. Glad you enjoyed the video.

    +

    Karen, sorry it felt fast, if you look at an example and watch the video I’m sure you’ll pick it up quickly. It took me a couple of rounds the first time I started playing with custom fields. Justin Tadlock has a good explanation here as well: http://justintadlock.com/archives/2007/10/24/using-wordpress-custom-fields-introduction.
    +The pictures will appear on the page where your custom field spits them out once you save the image url in the appropriate custom field. FYI, you code didn’t show up so visit http://www.vsellis.com/wordpress-how-to/using-custom-fields-in-wordpress/ and leave a comment and I’ll take a closer look.

    +

    Like

    +
    +
    +
  20. + +
  21. + + Consciência Planetária + +
    + + + + + +
    + +

    Thanks for the reply!

    +

    I’d like to suggest a subject for a future tutorial.

    +

    I love Drupal’s ability to use blocks above and below main content area. It is much easier to implement than WordPress widgets.

    +

    But I know it can be done in WordPress too. I’ve seen some magazine themes that have a “horizontal sidebar” on the botton of the page, and recently I’ve also seen a premium theme that has a “top horizontal sidebar” and a “bottom horizontal sidebar”, together with standard right and left ones.

    +

    It would be great if we had a tutorial teaching how to do it!

    +

    Like

    +
    +
    +
  22. + +
  23. + + PNaw10 + +
    + + + + + +
    + +

    Hello all, just wanted to add one extra tidbit of info.

    +

    The first time you use custom fields, the “name” field is blank, so yes, you would be typing in “thumbnail” as seen in the video. But after your very first use, the “name” field will appear as a pulldown menu which displays ALL previously-used names. So you really don’t have to worry about typing it the right way every single time — just as long as you get it right the first time, you can just select it from the menu. Much faster, and it ensures you’re spelling it the same way every time.

    +

    I realize everyone will discover this on their own as they try it, but thought I’d mention it in case anyone was daunted by the prospect of having to be extra-careful about typing out those case-sensitive field names every time.

    +

    Case-sensitive is definitely important though… for one website I run, cnyradio.com , I originally used Tadlock’s “Newspaperize” theme, which used the custom keywords “thumbnail” and “image.” Later, I upgraded to a newer theme of his, but the theme was designed to seek out “Thumbnail” and “Image” with capital letters at the beginning. Rather than go through all my old posts to change the custom keywords (would have taken forever) I just changed the uppercase letters to lowercase in the theme templates.

    +

    If you want a good example of how different custom fields can help with your site design, check out cnyradio.com. It’s not as complex as the site shown in the video, but it’s (hopefully) still simple enough for newbies (like I was just 2 years ago) to understand.

    +

    My “loop” pages (home page, category pages, etc.) show 128×96 images invoked by the “thumbnail” custom field. When you click to read the full text of any post, a larger 200×150 image appears, invoked by the “image” custom field. If either field is blank or missing, then the site simply doesn’t display an image — the text takes up the entire width of the space.

    +

    Yes, it’s more work because I have to create 2 custom fields for each post, and I create 2 separate images. I do the latter for two main reasons. One, I don’t like relying on web browsers to resize images on-the-fly. Even if it looks OK on my computer, it may appear choppy on someone else’s.

    +

    Two, and more importantly, an image at 200×150 doesn’t always look so good when you simply resize it to 128×96. For example, the “fullsize” version of any mugshots I use will often include the subject’s name and a “courtesy line” to credit the photo source. But that text would be cluttered and tiny when the size is reduced, so when I make the thumbnail, I usually delete the “courtesy” line and bump up the text size of the person’s last name so it’s less cluttered and easier to read.

    +

    If anyone reading this does look at my site to see what I’m talking about, just a note that any “Picture of the Week” posts are done entirely differently. I won’t get into details, just wanted to avoid any confusion.

    +

    Like

    +
    +
    +
  24. + +
  25. + + Sarfraz Ahmed + +
    + + + + + +
    + +

    can we add custom fields to wordpress.com blogs?

    +

    Like

    +
    +
    +
  26. + + +
  27. + + votar fotos + +
    + + + + + +
    + +

    I guess never say never, huh?

    +

    Like

    +
    +
    +
  28. + +
+
+

Continue the discussion

+ + +
+ +
+
+ +
+
+

Fill in your details below or click an icon to log in:

+ +
+ +
+
+
+ Gravatar +
+ +
+ +
+ +
+
+
+ +
+
+
+ +
+
+ +
+
+
+ WordPress.com Logo +
+ +
+ + + +

+ + You are commenting using your WordPress.com account. + ( Log Out /  + Change ) + + +

+
+ +
+
+ +
+
+
+ Google photo +
+ +
+ + + +

+ + You are commenting using your Google account. + ( Log Out /  + Change ) + + +

+
+ +
+
+ +
+
+
+ Twitter picture +
+ +
+ + + +

+ + You are commenting using your Twitter account. + ( Log Out /  + Change ) + + +

+
+ +
+
+ +
+
+
+ Facebook photo +
+ +
+ + + +

+ + You are commenting using your Facebook account. + ( Log Out /  + Change ) + + +

+
+ +
+
+ + +
+ +

Connecting to %s

+
+ +
+ + + +
+

+ + + + +

+ +

+ +

+
+
+
+ +
+
Published
+

August 29, 2009

+ +

Using custom fields can be confusing to new WordPress users. Scott Ellis provides an introductory explanation of how to use custom fields for image placement and the components that go into making custom fields work from front end placement to back end utilization and code.

+

Rate this:

+
Speakers

Scott Ellis 3

Tags

Custom Fields 23

Language

English 8849

Download
+
+MP4: Low, Med
OGG: Low
+
Subtitles
Subtitle this video → +
Producer
+ + + +
+
+ + +
+ + + + + + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ + + + + + + + + + + + + + +
    + + + + + + diff --git a/tests/data/files/mvEregiReplaceFailure.html b/tests/data/files/mvEregiReplaceFailure.html new file mode 100644 index 00000000..d9a559d4 --- /dev/null +++ b/tests/data/files/mvEregiReplaceFailure.html @@ -0,0 +1,1117 @@ + + + + + + + + + + + + + + + + + + + + An Introduction to Custom Fields – WordPress.tv + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +
    +
    + + +

    An Introduction to Custom Fields

    + +
    + +
    + +
    +
    + +
    +
    +
    +
    +

    + 16 responses on “An Introduction to Custom Fields

    + +
      +
    1. + + Alex (Viper007Bond) + +
      + + + + + + +
      +
    2. + +
    3. + + Consciência Planetária + +
      + + + + + +
      + +

      Nice explanation!

      +

      I always wondered how these thumbnails are added to magazine layout themes and I had no idea! Indeed I was more interested on the PHP code of how making it work 😛

      +

      It seems that with custom fields we can do pratically any customization…
      +But I have 2 questions

      +

      Are custom fields limited to varchar values, or can we use any kind of data there? Does it have any size limit?

      +

      Ans what about these themes that have a stick post with a bigger image? Does it have 2 custom fields 1 for thumnail and 1 for stick image?

      +

      tnx again!

      +

      Like

      +
      +
      +
    4. + +
    5. + + Vid + +
      + + + + + +
      + +

      Hi, Scott

      +

      This is very helpful for those of us who aren’t programmers but want to maximize WordPress. Thanks for taking the time to submit this tip.

      +

      Like

      +
      +
      +
    6. + +
    7. + + Thomas Clausen + +
      + + + + + +
      + +

      Justin Tadlocks Get the Image plugin can also help you to solve this task (also without CSS stuff in the php file 😉 ).

      +

      Like

      +
      +
      +
    8. + +
    9. + + driz + +
      + + + + + +
      + +

      I would like to see a follow-up video explaining about that Custom Field Template that you told us to ignore. I know it’s created by coding in some WP hooks in functions.php as I have done it myself, but it would be cool to see your interpretation of doing this, might pick up some additional tips.

      +

      Like

      +
      +
      +
    10. + +
    11. + + Scott Ellis + +
      + + + + + +
      + +

      Thomas, that is a good plugin, Justin does great stuff but custom fileds can be used for a lot of things, images were just an easy example, hopefully viewers will find other creative uses. Not sure what you mean about all the css in the php?

      +

      Like

      +
      +
      +
    12. +
        +
      • + + Thomas Clausen + +
        + + + + + +
        + +

        I was just wondering about the styling, that you’ve got about 3 minutes into the video.

        +

        But I didn’t mean to take our eyes of off things. You’re absolutely right the custom fields can be a powerful tool. And I hadn’t really grasped how easy it is, before I saw your video. Good job.

        +

        Like

        +
        +
        +
      • + +
      + +
    13. + + Karen + +
      + + + + + +
      + +

      This is very hard to follow. I admit, I’m not a newbie, but I am not terribly advanced, so I’m not sure where the problem is. One, I think you’re moving too fast. Two, when exactly should you first see the picture appear in the custom field? After you copy the url to the value field and update? or after you go into the home.php and change the code? That is completely unclear. This is such a great topic, so I hope if I post a few questions, it will become clear how to do this. I really do appreciate that you’ve posted this. Thanks!

      +

      Like

      +
      +
      +
    14. + +
    15. + + Karen + +
      + + + + + +
      + +

      also, my home.php doesn’t have the code you show. I am working in the theme Constructor. below is all the text in the home.php file:

      +

      Like

      +
      +
      +
    16. + +
    17. + + Karen + +
      + + + + + +
      + +

      Woopsie & sorry. below is the code in my file:

      +

      Like

      +
      +
      +
    18. + +
    19. + + Scott Ellis + +
      + + + + + +
      + +

      Consciência, you could use two custom fields for that if you wanted to. It would work just fine. I’ve read about some potential performance issues if you use a lot of custom fields but haven’t experienced it myself. We use several on citycrush.com including for the post thumbnail and the image in the post after you click through. The custom field type in the DB is “longtext” maximum size 4GB.

      +

      Driz – We used a plugin for that previously but moved to adding it to the functions.php and using wp hooks so it sounds like you are doing exactly what we would.

      +

      Thomas, most of what was in there was actual logic or just spitting out the html, not css styling, we keep all of that in the stylesheet. Glad you enjoyed the video.

      +

      Karen, sorry it felt fast, if you look at an example and watch the video I’m sure you’ll pick it up quickly. It took me a couple of rounds the first time I started playing with custom fields. Justin Tadlock has a good explanation here as well: http://justintadlock.com/archives/2007/10/24/using-wordpress-custom-fields-introduction.
      +The pictures will appear on the page where your custom field spits them out once you save the image url in the appropriate custom field. FYI, you code didn’t show up so visit http://www.vsellis.com/wordpress-how-to/using-custom-fields-in-wordpress/ and leave a comment and I’ll take a closer look.

      +

      Like

      +
      +
      +
    20. + +
    21. + + Consciência Planetária + +
      + + + + + +
      + +

      Thanks for the reply!

      +

      I’d like to suggest a subject for a future tutorial.

      +

      I love Drupal’s ability to use blocks above and below main content area. It is much easier to implement than WordPress widgets.

      +

      But I know it can be done in WordPress too. I’ve seen some magazine themes that have a “horizontal sidebar” on the botton of the page, and recently I’ve also seen a premium theme that has a “top horizontal sidebar” and a “bottom horizontal sidebar”, together with standard right and left ones.

      +

      It would be great if we had a tutorial teaching how to do it!

      +

      Like

      +
      +
      +
    22. + +
    23. + + PNaw10 + +
      + + + + + +
      + +

      Hello all, just wanted to add one extra tidbit of info.

      +

      The first time you use custom fields, the “name” field is blank, so yes, you would be typing in “thumbnail” as seen in the video. But after your very first use, the “name” field will appear as a pulldown menu which displays ALL previously-used names. So you really don’t have to worry about typing it the right way every single time — just as long as you get it right the first time, you can just select it from the menu. Much faster, and it ensures you’re spelling it the same way every time.

      +

      I realize everyone will discover this on their own as they try it, but thought I’d mention it in case anyone was daunted by the prospect of having to be extra-careful about typing out those case-sensitive field names every time.

      +

      Case-sensitive is definitely important though… for one website I run, cnyradio.com , I originally used Tadlock’s “Newspaperize” theme, which used the custom keywords “thumbnail” and “image.” Later, I upgraded to a newer theme of his, but the theme was designed to seek out “Thumbnail” and “Image” with capital letters at the beginning. Rather than go through all my old posts to change the custom keywords (would have taken forever) I just changed the uppercase letters to lowercase in the theme templates.

      +

      If you want a good example of how different custom fields can help with your site design, check out cnyradio.com. It’s not as complex as the site shown in the video, but it’s (hopefully) still simple enough for newbies (like I was just 2 years ago) to understand.

      +

      My “loop” pages (home page, category pages, etc.) show 128×96 images invoked by the “thumbnail” custom field. When you click to read the full text of any post, a larger 200×150 image appears, invoked by the “image” custom field. If either field is blank or missing, then the site simply doesn’t display an image — the text takes up the entire width of the space.

      +

      Yes, it’s more work because I have to create 2 custom fields for each post, and I create 2 separate images. I do the latter for two main reasons. One, I don’t like relying on web browsers to resize images on-the-fly. Even if it looks OK on my computer, it may appear choppy on someone else’s.

      +

      Two, and more importantly, an image at 200×150 doesn’t always look so good when you simply resize it to 128×96. For example, the “fullsize” version of any mugshots I use will often include the subject’s name and a “courtesy line” to credit the photo source. But that text would be cluttered and tiny when the size is reduced, so when I make the thumbnail, I usually delete the “courtesy” line and bump up the text size of the person’s last name so it’s less cluttered and easier to read.

      +

      If anyone reading this does look at my site to see what I’m talking about, just a note that any “Picture of the Week” posts are done entirely differently. I won’t get into details, just wanted to avoid any confusion.

      +

      Like

      +
      +
      +
    24. + +
    25. + + Sarfraz Ahmed + +
      + + + + + +
      + +

      can we add custom fields to wordpress.com blogs?

      +

      Like

      +
      +
      +
    26. + + +
    27. + + votar fotos + +
      + + + + + +
      + +

      I guess never say never, huh?

      +

      Like

      +
      +
      +
    28. + +
    +
    +

    Continue the discussion

    + + +
    + +
    +
    + +
    +
    +

    Fill in your details below or click an icon to log in:

    + +
    + +
    +
    +
    + Gravatar +
    + +
    + +
    + +
    +
    +
    + +
    +
    +
    + +
    +
    + +
    +
    +
    + WordPress.com Logo +
    + +
    + + + +

    + + You are commenting using your WordPress.com account. + ( Log Out /  + Change ) + + +

    +
    + +
    +
    + +
    +
    +
    + Google photo +
    + +
    + + + +

    + + You are commenting using your Google account. + ( Log Out /  + Change ) + + +

    +
    + +
    +
    + +
    +
    +
    + Twitter picture +
    + +
    + + + +

    + + You are commenting using your Twitter account. + ( Log Out /  + Change ) + + +

    +
    + +
    +
    + +
    +
    +
    + Facebook photo +
    + +
    + + + +

    + + You are commenting using your Facebook account. + ( Log Out /  + Change ) + + +

    +
    + +
    +
    + + +
    + +

    Connecting to %s

    +
    + +
    + + + +
    +

    + + + + +

    + +

    + +

    +
    +
    +
    + +
    +
    Published
    +

    August 29, 2009

    + +

    Using custom fields can be confusing to new WordPress users. Scott Ellis provides an introductory explanation of how to use custom fields for image placement and the components that go into making custom fields work from front end placement to back end utilization and code.

    +

    Rate this:

    +
    Speakers

    Scott Ellis 3

    Tags

    Custom Fields 23

    Language

    English 8849

    Download
    +
    +MP4: Low, Med
    OGG: Low
    +
    Subtitles
    Subtitle this video → +
    Producer
    + + + +
    +
    + + +
    + + + + + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + + + + + + + + + + + + +
      + + + + + + From c116346b2fcc038d4ab6615f4328346a4eebf121 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 19 Jul 2020 18:28:37 +0000 Subject: [PATCH 52/68] Updated documentation --- CHANGELOG.md | 4 +- README.md | 82 ++++++++++++++++++++++----------------- tests/Dom/CleanerTest.php | 2 +- 3 files changed, 51 insertions(+), 37 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 05d2146f..25f862dc 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## 3.0.0 ### Added - Support for PSR7 HTTP clients and requests for URL calls has been added. @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - PHP-CS-Fixer added. - Support for html5 charset detection. - Added the ability to match both parent and children. +- Added character set conversion in load. ### Changed - Fixed issue with \ causing an infite loop. @@ -28,6 +29,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Removed support for the depth first search option. - `findById()` method removed from Dom object. - Removed `load()` method in Dom object. +- Removed support for php 7.1. ## 2.2.0 diff --git a/README.md b/README.md index c46d9913..32853b91 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ PHP Html Parser ========================== -Version 2.2.1 +Version 3.0.0 [![Build Status](https://travis-ci.org/paquettg/php-html-parser.png)](https://travis-ci.org/paquettg/php-html-parser) [![Coverage Status](https://coveralls.io/repos/paquettg/php-html-parser/badge.png)](https://coveralls.io/r/paquettg/php-html-parser) @@ -18,7 +18,7 @@ Install the latest version using composer. 
$ composer require paquettg/php-html-parser ``` -This package can be found on [packagist](https://packagist.org/packages/paquettg/php-html-parser) and is best loaded using [composer](http://getcomposer.org/). We support php 7.1, 7.2, 7.3, and 7.4. +This package can be found on [packagist](https://packagist.org/packages/paquettg/php-html-parser) and is best loaded using [composer](http://getcomposer.org/). We support php 7.2, 7.3, and 7.4. Usage ----- @@ -28,7 +28,7 @@ You can find many examples of how to use the dom parser and any of its parts (wh ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom\Node; +use PHPHtmlParser\Dom; $dom = new Dom; $dom->loadStr('

      Hey bro, click here
      :)

      '); @@ -46,7 +46,7 @@ You may also seamlessly load a file into the dom instead of a string, which is m ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom\Node; +use PHPHtmlParser\Dom; $dom = new Dom; $dom->loadFromFile('tests/data/big.html'); @@ -69,8 +69,6 @@ foreach ($contents as $content) This example loads the html from big.html, a real page found online, and gets all the content-border classes to process. It also shows a few things you can do with a node but it is not an exhaustive list of methods that a node has available. -Alternativly, you can always use the `load()` method to load the file. It will attempt to find the file using `file_exists` and, if successful, will call `loadFromFile()` for you. The same applies to a URL and `loadFromUrl()` method. - Loading Url ---------------- @@ -79,7 +77,7 @@ Loading a url is very similar to the way you would load the html from a file. ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom\Node; +use PHPHtmlParser\Dom; $dom = new Dom; $dom->loadFromUrl('http://google.com'); @@ -90,38 +88,36 @@ $dom->loadFromUrl('http://google.com'); $html = $dom->outerHtml; // same result as the first example ``` -What makes the loadFromUrl method note worthy is the `PHPHtmlParser\CurlInterface` parameter, an optional second parameter. By default, we use the `PHPHtmlParser\Curl` class to get the contents of the url. On the other hand, though, you can inject your own implementation of CurlInterface and we will attempt to load the url using what ever tool/settings you want, up to you. +loadFromUrl will, by default, use an implementation of the `\Psr\Http\Client\ClientInterface` to do the HTTP request and a default implementation of `\Psr\Http\Message\RequestInterface` to create the body of the request. You can easily implement your own version of either the client or request to use a custom HTTP connection when using loadFromUrl. 
```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom\Node; -use App\Services\Connector; +use PHPHtmlParser\Dom; +use App\Services\MyClient; $dom = new Dom; -$dom->loadFromUrl('http://google.com', [], new Connector); +$dom->loadFromUrl('http://google.com', null, new MyClient()); $html = $dom->outerHtml; ``` -As long as the Connector object implements the `PHPHtmlParser\CurlInterface` interface properly it will use that object to get the content of the url instead of the default `PHPHtmlParser\Curl` class. +As long as the client object implements the interface properly it will use that object to get the content of the url. Loading Strings --------------- -Loading a string directly, with out the checks in `load()` is also easily done. +Loading a string directly is also easily done. ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom\Node; +use PHPHtmlParser\Dom; $dom = new Dom; -$dom->loadStr('String', []); +$dom->loadStr('String'); $html = $dom->outerHtml; ``` -If the string is to long, depending on your file system, the `load()` method will throw a warning. If this happens you can just call the above method to bypass the `is_file()` check in the `load()` method. - Options ------- @@ -130,21 +126,24 @@ You can also set parsing option that will effect the behavior of the parsing eng ```php // Assuming you installed from Composer: require "vendor/autoload.php"; -use PHPHtmlParser\Dom\Node; +use PHPHtmlParser\Dom; +use PHPHtmlParser\Options; $dom = new Dom; -$dom->setOptions([ - 'strict' => true, // Set a global option to enable strict html parsing. -]); +$dom->setOptions( + // this is set as the global option level. + (new Options()) + ->setStrict(true) +); -$dom->loadFromUrl('http://google.com', [ - 'whitespaceTextNode' => false, // Only applies to this load. 
-]); +$dom->loadFromUrl('http://google.com', + (new Options())->setWhitespaceTextNode(false) // only applies to this load. +); $dom->loadFromUrl('http://gmail.com'); // will not have whitespaceTextNode set to false. ``` -At the moment we support 8 options. +At the moment we support 12 options. **Strict** @@ -182,15 +181,17 @@ Set this to `false` if you want to preserve whitespace inside of text nodes. It Set this to `false` if you want to preserve smarty script found in the html content. It is set to `true` by default. -**depthFirstSearch** +**htmlSpecialCharsDecode** + +By default this is set to `false`. Setting this to `true` will apply the php function `htmlspecialchars_decode` to all attribute values and text nodes. -By default this is set to `false` for legacy support. Setting this to `true` will change the behavior of find to order elements by depth first. This will properly preserve the order of elements as they where in the HTML. +**selfClosing** -This option is depricated and will be removed in version `3.0.0` with the new behavior being as if it was set to `true`. +This option contains an array of all self closing tags. These tags must be self closing and the parser will force them to be so if you have strict turned on. You can update this list with any additional tags that can be used as a self closing tag when using strict. You can also remove tags from this array or clear it out completely. -**htmlSpecialCharsDecode** +**noSlash** -By default this is set to `false`. Setting this to `true` will apply the php function `htmlspecialchars_decode` too all attribute values and text nodes. +This option contains an array of all tags that can not be self closing. The list starts off as empty but you can add elements as you wish. Static Facade ------------- @@ -200,7 +201,7 @@ You can also mount a static facade for the Dom object. 
```PHP PHPHtmlParser\StaticDom::mount(); -Dom::load('tests/big.hmtl'); +Dom::loadFromFile('tests/big.html'); $objects = Dom::find('.content-border'); ``` @@ -213,8 +214,10 @@ Modifying The Dom You can always modify the dom that was created from any loading method. To change the attribute of any node you can just call the `setAttribute` method. ```php +use PHPHtmlParser\Dom; + $dom = new Dom; -$dom->load('

      Hey bro, click here
      :)

      '); +$dom->loadStr('

      Hey bro, click here
      :)

      '); $a = $dom->find('a')[0]; $a->setAttribute('class', 'foo'); echo $a->getAttribute('class'); // "foo" @@ -223,8 +226,11 @@ echo $a->getAttribute('class'); // "foo" You may also get the `PHPHtmlParser\Dom\Tag` class directly and manipulate it as you see fit. ```php +use PHPHtmlParser\Dom; + $dom = new Dom; -$dom->load('

      Hey bro, click here
      :)

      '); +$dom->loadStr('

      Hey bro, click here
      :)

      '); +/** @var Dom\Node\AbstractNode $a */ $a = $dom->find('a')[0]; $tag = $a->getTag(); $tag->setAttribute('class', 'foo'); @@ -234,8 +240,11 @@ echo $a->getAttribute('class'); // "foo" It is also possible to remove a node from the tree. Simply call the `delete` method on any node to remove it from the tree. It is important to note that you should unset the node after removing it from the `DOM``, it will still take memory as long as it is not unset. ```php +use PHPHtmlParser\Dom; + $dom = new Dom; -$dom->load('

      Hey bro, click here
      :)

      '); +$dom->loadStr('

      Hey bro, click here
      :)

      '); +/** @var Dom\Node\AbstractNode $a */ $a = $dom->find('a')[0]; $a->delete(); unset($a); @@ -245,8 +254,11 @@ echo $dom; // '

      Hey bro,
      :)

      '); You can modify the text of `TextNode` objects easely. Please note that, if you set an encoding, the new text will be encoded using the existing encoding. ```php +use PHPHtmlParser\Dom; + $dom = new Dom; -$dom->load('

      Hey bro, click here
      :)

      '); +$dom->loadStr('

      Hey bro, click here
      :)

      '); +/** @var Dom\Node\InnerNode $a */ $a = $dom->find('a')[0]; $a->firstChild()->setText('biz baz'); echo $dom; // '

      Hey bro, biz baz
      :)

      ' diff --git a/tests/Dom/CleanerTest.php b/tests/Dom/CleanerTest.php index 8473eaff..3ff32506 100644 --- a/tests/Dom/CleanerTest.php +++ b/tests/Dom/CleanerTest.php @@ -8,7 +8,7 @@ class CleanerTest extends TestCase { - public function testLoadByURL() + public function testCleanEregiFailureFile() { $cleaner = new Cleaner(); $string = $cleaner->clean(\file_get_contents('tests/data/files/mvEregiReplaceFailure.html'), new Options(), 'utf-8'); From 382b98c8de5c6b7d1fd59831e88ddcb82f4f6fbb Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 23 Aug 2020 18:15:44 +0000 Subject: [PATCH 53/68] fixed #228 - Fixed documentation and added typew hint --- src/PHPHtmlParser/Dom/Node/AbstractNode.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/PHPHtmlParser/Dom/Node/AbstractNode.php b/src/PHPHtmlParser/Dom/Node/AbstractNode.php index 254ca31f..a2c29274 100644 --- a/src/PHPHtmlParser/Dom/Node/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/Node/AbstractNode.php @@ -154,9 +154,9 @@ public function id(): int /** * Returns the parent of node. 
* - * @return AbstractNode + * @return InnerNode */ - public function getParent() + public function getParent(): ?InnerNode { return $this->parent; } From cf0bb680b39a8ad59444d54e67462527c4ba455a Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 23 Aug 2020 18:17:12 +0000 Subject: [PATCH 54/68] Small refactor - Added private methods for DTO --- .gitattributes | 1 + .../Selector/ParsedSelectorCollectionDTO.php | 19 +++++++++++--- .../DTO/Selector/ParsedSelectorDTO.php | 19 +++++++++++--- src/PHPHtmlParser/DTO/Selector/RuleDTO.php | 26 +++++++++++-------- src/PHPHtmlParser/DTO/Tag/AttributeDTO.php | 10 ++++++- src/PHPHtmlParser/DTO/TagDTO.php | 12 ++++++++- src/PHPHtmlParser/Dom/Parser.php | 22 +++++----------- src/PHPHtmlParser/Dom/Tag.php | 8 +++--- src/PHPHtmlParser/Selector/Parser.php | 22 ++++++++-------- tests/Selector/SeekerTest.php | 16 ++++++------ 10 files changed, 95 insertions(+), 60 deletions(-) diff --git a/.gitattributes b/.gitattributes index ebfea7c7..77b544ff 100755 --- a/.gitattributes +++ b/.gitattributes @@ -1,4 +1,5 @@ /tests export-ignore +/tests linguist-documentation /.scrutinizar.yml export-ignore /.travis.yml export-ignore /.gitignore export-ignore diff --git a/src/PHPHtmlParser/DTO/Selector/ParsedSelectorCollectionDTO.php b/src/PHPHtmlParser/DTO/Selector/ParsedSelectorCollectionDTO.php index 128db514..870262cc 100644 --- a/src/PHPHtmlParser/DTO/Selector/ParsedSelectorCollectionDTO.php +++ b/src/PHPHtmlParser/DTO/Selector/ParsedSelectorCollectionDTO.php @@ -11,15 +11,26 @@ final class ParsedSelectorCollectionDTO */ private $parsedSelectorDTO = []; - public function __construct(array $values) + /** + * @param ParsedSelectorDTO[] $parsedSelectorDTOs + */ + private function __construct(array $parsedSelectorDTOs) { - foreach ($values as $value) { - if ($value instanceof ParsedSelectorDTO) { - $this->parsedSelectorDTO[] = $value; + foreach ($parsedSelectorDTOs as $parsedSelectorDTO) { + if ($parsedSelectorDTO instanceof 
ParsedSelectorDTO) { + $this->parsedSelectorDTO[] = $parsedSelectorDTO; } } } + /** + * @param ParsedSelectorDTO[] $parsedSelectorDTOs + */ + public static function makeCollection(array $parsedSelectorDTOs): ParsedSelectorCollectionDTO + { + return new ParsedSelectorCollectionDTO($parsedSelectorDTOs); + } + /** * @return ParsedSelectorDTO[] */ diff --git a/src/PHPHtmlParser/DTO/Selector/ParsedSelectorDTO.php b/src/PHPHtmlParser/DTO/Selector/ParsedSelectorDTO.php index 5424e2a7..bce0721f 100644 --- a/src/PHPHtmlParser/DTO/Selector/ParsedSelectorDTO.php +++ b/src/PHPHtmlParser/DTO/Selector/ParsedSelectorDTO.php @@ -11,15 +11,26 @@ final class ParsedSelectorDTO */ private $rules = []; - public function __construct(array $values) + /** + * @param RuleDTO[] $ruleDTOs + */ + private function __construct(array $ruleDTOs) { - foreach ($values as $value) { - if ($value instanceof RuleDTO) { - $this->rules[] = $value; + foreach ($ruleDTOs as $ruleDTO) { + if ($ruleDTO instanceof RuleDTO) { + $this->rules[] = $ruleDTO; } } } + /** + * @param RuleDTO[] $ruleDTOs + */ + public static function makeFromRules(array $ruleDTOs): ParsedSelectorDTO + { + return new ParsedSelectorDTO($ruleDTOs); + } + /** * @return RuleDTO[] */ diff --git a/src/PHPHtmlParser/DTO/Selector/RuleDTO.php b/src/PHPHtmlParser/DTO/Selector/RuleDTO.php index 1c336149..5299e3a0 100644 --- a/src/PHPHtmlParser/DTO/Selector/RuleDTO.php +++ b/src/PHPHtmlParser/DTO/Selector/RuleDTO.php @@ -36,7 +36,7 @@ final class RuleDTO */ private $alterNext; - public function __construct(array $values) + private function __construct(array $values) { $this->tag = $values['tag']; $this->operator = $values['operator']; @@ -47,16 +47,26 @@ public function __construct(array $values) } /** - * @return string + * @param string|array|null $key + * @param string|array|null $value */ + public static function makeFromPrimitives(string $tag, string $operator, $key, $value, bool $noKey, bool $alterNext): RuleDTO + { + return new RuleDTO([ + 
'tag' => $tag, + 'operator' => $operator, + 'key' => $key, + 'value' => $value, + 'noKey' => $noKey, + 'alterNext' => $alterNext, + ]); + } + public function getTag(): string { return $this->tag; } - /** - * @return string - */ public function getOperator(): string { return $this->operator; @@ -78,17 +88,11 @@ public function getValue() return $this->value; } - /** - * @return bool - */ public function isNoKey(): bool { return $this->noKey; } - /** - * @return bool - */ public function isAlterNext(): bool { return $this->alterNext; diff --git a/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php b/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php index 6ac22197..3e7e1824 100755 --- a/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php +++ b/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php @@ -19,12 +19,20 @@ final class AttributeDTO */ private $doubleQuote; - public function __construct(array $values) + private function __construct(array $values) { $this->value = $values['value']; $this->doubleQuote = $values['doubleQuote'] ?? true; } + public static function makeFromPrimitives(?string $value, bool $doubleQuote = true): AttributeDTO + { + return new AttributeDTO([ + 'value' => $value, + 'doubleQuote' => $doubleQuote, + ]); + } + public function getValue(): ?string { return $this->value; diff --git a/src/PHPHtmlParser/DTO/TagDTO.php b/src/PHPHtmlParser/DTO/TagDTO.php index e9d182db..71f0ec1c 100644 --- a/src/PHPHtmlParser/DTO/TagDTO.php +++ b/src/PHPHtmlParser/DTO/TagDTO.php @@ -28,7 +28,7 @@ final class TagDTO */ private $tag; - public function __construct(array $values = []) + private function __construct(array $values = []) { $this->status = $values['status'] ?? false; $this->closing = $values['closing'] ?? false; @@ -36,6 +36,16 @@ public function __construct(array $values = []) $this->tag = $values['tag'] ?? 
null; } + public static function makeFromPrimitives(bool $status = false, bool $closing = false, ?HtmlNode $node = null, ?string $tag = null): TagDTO + { + return new TagDTO([ + 'status' => $status, + 'closing' => $closing, + 'node' => $node, + 'tag' => $tag, + ]); + } + public function isStatus(): bool { return $this->status; diff --git a/src/PHPHtmlParser/Dom/Parser.php b/src/PHPHtmlParser/Dom/Parser.php index 0d4573f4..418e535c 100644 --- a/src/PHPHtmlParser/Dom/Parser.php +++ b/src/PHPHtmlParser/Dom/Parser.php @@ -160,10 +160,9 @@ public function detectCharset(Options $options, string $defaultCharset, Abstract */ private function parseTag(Options $options, Content $content, int $size): TagDTO { - $return = []; if ($content->char() != '<') { // we are not at the beginning of a tag - return new TagDTO(); + return TagDTO::makeFromPrimitives(); } // check if this is a closing tag @@ -171,7 +170,7 @@ private function parseTag(Options $options, Content $content, int $size): TagDTO $content->fastForward(1); } catch (ContentLengthException $exception) { // we are at the end of the file - return new TagDTO(); + return TagDTO::makeFromPrimitives(); } if ($content->char() == '/') { return $this->makeEndTag($content, $options); @@ -188,7 +187,7 @@ private function parseTag(Options $options, Content $content, int $size): TagDTO $tag = \strtolower($content->copyByToken(StringToken::SLASH(), true)); if (\trim($tag) == '') { // no tag found, invalid < found - return new TagDTO(); + return TagDTO::makeFromPrimitives(); } } $node = new HtmlNode($tag); @@ -220,10 +219,7 @@ private function parseTag(Options $options, Content $content, int $size): TagDTO $content->fastForward(1); } - $return['status'] = true; - $return['node'] = $node; - - return new TagDTO($return); + return TagDTO::makeFromPrimitives(true, false, $node); } /** @@ -249,7 +245,6 @@ private function detectHTML5Charset(Encode $encode, AbstractNode $root): bool */ private function makeEndTag(Content $content, Options 
$options): TagDTO { - $return = []; $tag = $content->fastForward(1) ->copyByToken(StringToken::SLASH(), true); // move to end of tag @@ -259,15 +254,10 @@ private function makeEndTag(Content $content, Options $options): TagDTO // check if this closing tag counts $tag = \strtolower($tag); if (\in_array($tag, $options->getSelfClosing(), true)) { - $return['status'] = true; - - return new TagDTO($return); + return TagDTO::makeFromPrimitives(true); } - $return['status'] = true; - $return['closing'] = true; - $return['tag'] = \strtolower($tag); - return new TagDTO($return); + return TagDTO::makeFromPrimitives(true, true, null, \strtolower($tag)); } /** diff --git a/src/PHPHtmlParser/Dom/Tag.php b/src/PHPHtmlParser/Dom/Tag.php index be974c1e..29b68bf7 100644 --- a/src/PHPHtmlParser/Dom/Tag.php +++ b/src/PHPHtmlParser/Dom/Tag.php @@ -163,10 +163,10 @@ public function noise(string $noise): Tag */ public function setAttribute(string $key, ?string $attributeValue, bool $doubleQuote = true): Tag { - $attributeDTO = new AttributeDTO([ - 'value' => $attributeValue, - 'doubleQuote' => $doubleQuote, - ]); + $attributeDTO = AttributeDTO::makeFromPrimitives( + $attributeValue, + $doubleQuote + ); if ($this->HtmlSpecialCharsDecode) { $attributeDTO->htmlspecialcharsDecode(); } diff --git a/src/PHPHtmlParser/Selector/Parser.php b/src/PHPHtmlParser/Selector/Parser.php index a70a7a5e..4643c467 100755 --- a/src/PHPHtmlParser/Selector/Parser.php +++ b/src/PHPHtmlParser/Selector/Parser.php @@ -92,25 +92,25 @@ public function parseSelectorString(string $selector): ParsedSelectorCollectionD $noKey = true; } - $rules[] = new RuleDTO([ - 'tag' => $tag, - 'key' => $key, - 'value' => $value, - 'operator' => $operator, - 'noKey' => $noKey, - 'alterNext' => $alterNext, - ]); + $rules[] = RuleDTO::makeFromPrimitives( + $tag, + $operator, + $key, + $value, + $noKey, + $alterNext + ); if (isset($match[7]) && \is_string($match[7]) && \trim($match[7]) == ',') { - $selectors[] = new 
ParsedSelectorDTO($rules); + $selectors[] = ParsedSelectorDTO::makeFromRules($rules); $rules = []; } } // save last results if (\count($rules) > 0) { - $selectors[] = new ParsedSelectorDTO($rules); + $selectors[] = ParsedSelectorDTO::makeFromRules($rules); } - return new ParsedSelectorCollectionDTO($selectors); + return ParsedSelectorCollectionDTO::makeCollection($selectors); } } diff --git a/tests/Selector/SeekerTest.php b/tests/Selector/SeekerTest.php index a5106e98..d9e0e824 100644 --- a/tests/Selector/SeekerTest.php +++ b/tests/Selector/SeekerTest.php @@ -10,14 +10,14 @@ class SeekerTest extends TestCase { public function testSeekReturnEmptyArray() { - $ruleDTO = new RuleDTO([ - 'tag' => 'tag', - 'key' => 1, - 'value' => null, - 'operator' => null, - 'noKey' => false, - 'alterNext' => false, - ]); + $ruleDTO = RuleDTO::makeFromPrimitives( + 'tag', + '=', + null, + null, + false, + false + ); $seeker = new Seeker(); $results = $seeker->seek([], $ruleDTO, []); $this->assertCount(0, $results); From 77a7eb18f003dba0b0d82f969e93f2896947121e Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 23 Aug 2020 18:30:20 +0000 Subject: [PATCH 55/68] fixed #229 - Added documentation to reflect read only property --- .php_cs.dist | 3 ++- src/PHPHtmlParser/Dom/Node/AbstractNode.php | 13 ++++++------- src/PHPHtmlParser/Dom/Node/ArrayNode.php | 8 ++++++++ src/PHPHtmlParser/Dom/Node/HtmlNode.php | 7 +++++++ src/PHPHtmlParser/Dom/Node/InnerNode.php | 8 ++++++++ src/PHPHtmlParser/Dom/Node/LeafNode.php | 9 +++++++++ src/PHPHtmlParser/Dom/Node/TextNode.php | 7 +++++++ tests/Node/HtmlTest.php | 2 +- 8 files changed, 48 insertions(+), 9 deletions(-) diff --git a/.php_cs.dist b/.php_cs.dist index 56af284d..2ead7195 100644 --- a/.php_cs.dist +++ b/.php_cs.dist @@ -90,6 +90,7 @@ return PhpCsFixer\Config::create() 'method', 'param', 'property', + 'property-read', 'return', 'throws', 'type', @@ -100,7 +101,7 @@ return PhpCsFixer\Config::create() 'phpdoc_indent' => true, 
'phpdoc_inline_tag' => true, 'phpdoc_no_access' => true, - 'phpdoc_no_alias_tag' => true, + 'phpdoc_no_alias_tag' => false, 'phpdoc_no_package' => true, 'phpdoc_no_useless_inheritdoc' => true, 'phpdoc_order' => true, diff --git a/src/PHPHtmlParser/Dom/Node/AbstractNode.php b/src/PHPHtmlParser/Dom/Node/AbstractNode.php index a2c29274..897445b0 100644 --- a/src/PHPHtmlParser/Dom/Node/AbstractNode.php +++ b/src/PHPHtmlParser/Dom/Node/AbstractNode.php @@ -17,13 +17,12 @@ /** * Dom node object. * - * @property string $outerhtml - * @property string $innerhtml - * @property string $text - * @property int $prev - * @property int $next - * @property Tag $tag - * @property InnerNode $parent + * @property-read string $outerhtml + * @property-read string $innerhtml + * @property-read string $innerText + * @property-read string $text + * @property-read Tag $tag + * @property-read InnerNode $parent */ abstract class AbstractNode { diff --git a/src/PHPHtmlParser/Dom/Node/ArrayNode.php b/src/PHPHtmlParser/Dom/Node/ArrayNode.php index fb8ed4c2..87e8bd51 100644 --- a/src/PHPHtmlParser/Dom/Node/ArrayNode.php +++ b/src/PHPHtmlParser/Dom/Node/ArrayNode.php @@ -7,10 +7,18 @@ use ArrayIterator; use Countable; use IteratorAggregate; +use PHPHtmlParser\Dom\Tag; /** * Dom node object which will allow users to use it as * an array. + * + * @property-read string $outerhtml + * @property-read string $innerhtml + * @property-read string $innerText + * @property-read string $text + * @property-read Tag $tag + * @property-read InnerNode $parent */ abstract class ArrayNode extends AbstractNode implements IteratorAggregate, Countable { diff --git a/src/PHPHtmlParser/Dom/Node/HtmlNode.php b/src/PHPHtmlParser/Dom/Node/HtmlNode.php index 0d78b8ff..2acb2592 100644 --- a/src/PHPHtmlParser/Dom/Node/HtmlNode.php +++ b/src/PHPHtmlParser/Dom/Node/HtmlNode.php @@ -10,6 +10,13 @@ /** * Class HtmlNode. 
+ * + * @property-read string $outerhtml + * @property-read string $innerhtml + * @property-read string $innerText + * @property-read string $text + * @property-read Tag $tag + * @property-read InnerNode $parent */ class HtmlNode extends InnerNode { diff --git a/src/PHPHtmlParser/Dom/Node/InnerNode.php b/src/PHPHtmlParser/Dom/Node/InnerNode.php index 911e10a0..448057a7 100644 --- a/src/PHPHtmlParser/Dom/Node/InnerNode.php +++ b/src/PHPHtmlParser/Dom/Node/InnerNode.php @@ -4,6 +4,7 @@ namespace PHPHtmlParser\Dom\Node; +use PHPHtmlParser\Dom\Tag; use PHPHtmlParser\Exceptions\ChildNotFoundException; use PHPHtmlParser\Exceptions\CircularException; use PHPHtmlParser\Exceptions\LogicalException; @@ -11,6 +12,13 @@ /** * Inner node of the html tree, might have children. + * + * @property-read string $outerhtml + * @property-read string $innerhtml + * @property-read string $innerText + * @property-read string $text + * @property-read Tag $tag + * @property-read InnerNode $parent */ abstract class InnerNode extends ArrayNode { diff --git a/src/PHPHtmlParser/Dom/Node/LeafNode.php b/src/PHPHtmlParser/Dom/Node/LeafNode.php index 7a2a7386..f74414a0 100644 --- a/src/PHPHtmlParser/Dom/Node/LeafNode.php +++ b/src/PHPHtmlParser/Dom/Node/LeafNode.php @@ -4,8 +4,17 @@ namespace PHPHtmlParser\Dom\Node; +use PHPHtmlParser\Dom\Tag; + /** * Class LeafNode. + * + * @property-read string $outerhtml + * @property-read string $innerhtml + * @property-read string $innerText + * @property-read string $text + * @property-read Tag $tag + * @property-read InnerNode $parent */ abstract class LeafNode extends AbstractNode { diff --git a/src/PHPHtmlParser/Dom/Node/TextNode.php b/src/PHPHtmlParser/Dom/Node/TextNode.php index a5bd934e..1c8b646c 100644 --- a/src/PHPHtmlParser/Dom/Node/TextNode.php +++ b/src/PHPHtmlParser/Dom/Node/TextNode.php @@ -9,6 +9,13 @@ /** * Class TextNode. 
+ * + * @property-read string $outerhtml + * @property-read string $innerhtml + * @property-read string $innerText + * @property-read string $text + * @property-read Tag $tag + * @property-read InnerNode $parent */ class TextNode extends LeafNode { diff --git a/tests/Node/HtmlTest.php b/tests/Node/HtmlTest.php index 153f7da5..592003b4 100755 --- a/tests/Node/HtmlTest.php +++ b/tests/Node/HtmlTest.php @@ -334,7 +334,7 @@ public function testInnerText() $node->addChild($anode); $node->addChild($span_node); - $this->assertEquals($node->innerText(), '123 456789 101112'); + $this->assertEquals($node->innerText, '123 456789 101112'); } public function testTextLookInChildrenAndNoChildren() From 93ec62003ad6ebbe3136fa897e346aad7b377a0d Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 23 Aug 2020 18:59:19 +0000 Subject: [PATCH 56/68] Version 3.0.1 --- CHANGELOG.md | 6 ++++++ README.md | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 25f862dc..ef3240dd 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## 3.0.1 + +### Changed +- Updated all DTOs to make them immutable. +- Updated documentation. 
+ ## 3.0.0 ### Added diff --git a/README.md b/README.md index 32853b91..c194c8be 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ PHP Html Parser ========================== -Version 3.0.0 +Version 3.0.1 [![Build Status](https://travis-ci.org/paquettg/php-html-parser.png)](https://travis-ci.org/paquettg/php-html-parser) [![Coverage Status](https://coveralls.io/repos/paquettg/php-html-parser/badge.png)](https://coveralls.io/r/paquettg/php-html-parser) From 2db46371c180c504a2d16e44e4800455f8a5c801 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Thu, 27 Aug 2020 10:05:23 -0400 Subject: [PATCH 57/68] Create FUNDING.yml --- .github/FUNDING.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 .github/FUNDING.yml diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 00000000..8fe59770 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,12 @@ +# These are supported funding model platforms + +# github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] +# patreon: # Replace with a single Patreon username +# open_collective: # Replace with a single Open Collective username +# ko_fi: # Replace with a single Ko-fi username +tidelift: "packagist/paquettg/php-html-parser" +# community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry +# liberapay: # Replace with a single Liberapay username +# issuehunt: # Replace with a single IssueHunt username +# otechie: # Replace with a single Otechie username +# custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] From 590a1b7ea822f31f2739277ef4402e88714ba61a Mon Sep 17 00:00:00 2001 From: Raphael Krut-Landau Date: Wed, 16 Sep 2020 11:31:25 -0400 Subject: [PATCH 58/68] Fixed a few stray typos --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index c194c8be..26f6d20b 100755 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ This 
package can be found on [packagist](https://packagist.org/packages/paquettg Usage ----- -You can find many examples of how to use the dom parser and any of its parts (which you will most likely never touch) in the tests directory. The tests are done using PHPUnit and are very small, a few lines each, and are a great place to start. Given that, I'll still be showing a few examples of how the package should be used. The following example is a very simplistic usage of the package. +You can find many examples of how to use the DOM parser and any of its parts (which you will most likely never touch) in the tests directory. The tests are done using PHPUnit and are very small, a few lines each, and are a great place to start. Given that, I'll still be showing a few examples of how the package should be used. The following example is a very simplistic usage of the package. ```php // Assuming you installed from Composer: @@ -36,12 +36,12 @@ $a = $dom->find('a')[0]; echo $a->text; // "click here" ``` -The above will output "click here". Simple no? There are many ways to get the same result from the dome, such as `$dom->getElementsbyTag('a')[0]` or `$dom->find('a', 0)` which can all be found in the tests or in the code itself. +The above will output "click here". Simple, no? There are many ways to get the same result from the DOM, such as `$dom->getElementsbyTag('a')[0]` or `$dom->find('a', 0)`, which can all be found in the tests or in the code itself. Loading Files ------------------ -You may also seamlessly load a file into the dom instead of a string, which is much more convenient and is how I except most developers will be loading the html. The following example is taken from our test and uses the "big.html" file found there. +You may also seamlessly load a file into the DOM instead of a string, which is much more convenient and is how I expect most developers will be loading the HTML. The following example is taken from our test and uses the "big.html" file found there. 
```php // Assuming you installed from Composer: @@ -67,12 +67,12 @@ foreach ($contents as $content) } ``` -This example loads the html from big.html, a real page found online, and gets all the content-border classes to process. It also shows a few things you can do with a node but it is not an exhaustive list of methods that a node has available. +This example loads the html from big.html, a real page found online, and gets all the content-border classes to process. It also shows a few things you can do with a node but it is not an exhaustive list of the methods that a node has available. -Loading Url +Loading URLs ---------------- -Loading a url is very similar to the way you would load the html from a file. +Loading a URL is very similar to the way you would load the HTML from a file. ```php // Assuming you installed from Composer: @@ -88,7 +88,7 @@ $dom->loadFromUrl('http://google.com'); $html = $dom->outerHtml; // same result as the first example ``` -loadFromUrl will, by default, use an implementation of the `\Psr\Http\Client\ClientInterface` to do the HTTP request and a default implementation of `\Psr\Http\Message\RequestInterface` to create the body of the request. You can easely implement your own version of either the client or request to use a custom HTTP connection when using loadFromUrl. +loadFromUrl will, by default, use an implementation of the `\Psr\Http\Client\ClientInterface` to do the HTTP request and a default implementation of `\Psr\Http\Message\RequestInterface` to create the body of the request. You can easily implement your own version of either the client or request to use a custom HTTP connection when using loadFromUrl. ```php // Assuming you installed from Composer: @@ -101,7 +101,7 @@ $dom->loadFromUrl('http://google.com', null, new MyClient()); $html = $dom->outerHtml; ``` -As long as the client object implements the interface properly it will use that object to get the content of the url. 
+As long as the client object implements the interface properly, it will use that object to get the content of the url. Loading Strings --------------- From 7d0468794e6f13874ecf134316ab39eb172d3455 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sat, 19 Sep 2020 19:17:01 +0000 Subject: [PATCH 59/68] Fixed #235 - Removed guzzle 6 and added guzzle 7 support --- composer.json | 3 +-- src/PHPHtmlParser/Dom.php | 2 +- src/PHPHtmlParser/StaticDom.php | 2 +- tests/DomTest.php | 7 +++++++ 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/composer.json b/composer.json index 5549a5ee..f8ca8450 100755 --- a/composer.json +++ b/composer.json @@ -19,14 +19,13 @@ "ext-curl": "*", "paquettg/string-encode": "~1.0.0", "php-http/httplug": "^2.1", - "php-http/guzzle6-adapter": "^2.0", + "guzzlehttp/guzzle": "^7.0", "guzzlehttp/psr7": "^1.6", "myclabs/php-enum": "^1.7" }, "require-dev": { "phpunit/phpunit": "^7.5.1", "mockery/mockery": "^1.2", - "php-coveralls/php-coveralls": "^2.1", "infection/infection": "^0.13.4", "phan/phan": "^2.4", "friendsofphp/php-cs-fixer": "^2.16" diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php index 0b3a0730..816b1b3b 100755 --- a/src/PHPHtmlParser/Dom.php +++ b/src/PHPHtmlParser/Dom.php @@ -5,7 +5,7 @@ namespace PHPHtmlParser; use GuzzleHttp\Psr7\Request; -use Http\Adapter\Guzzle6\Client; +use GuzzleHttp\Client; use PHPHtmlParser\Contracts\Dom\CleanerInterface; use PHPHtmlParser\Contracts\Dom\ParserInterface; use PHPHtmlParser\Contracts\DomInterface; diff --git a/src/PHPHtmlParser/StaticDom.php b/src/PHPHtmlParser/StaticDom.php index 95d01073..78950204 100755 --- a/src/PHPHtmlParser/StaticDom.php +++ b/src/PHPHtmlParser/StaticDom.php @@ -5,7 +5,7 @@ namespace PHPHtmlParser; use GuzzleHttp\Psr7\Request; -use Http\Adapter\Guzzle6\Client; +use GuzzleHttp\Client; use PHPHtmlParser\Exceptions\ChildNotFoundException; use PHPHtmlParser\Exceptions\CircularException; use PHPHtmlParser\Exceptions\NotLoadedException; diff --git 
a/tests/DomTest.php b/tests/DomTest.php index 9fbb1529..519d5594 100755 --- a/tests/DomTest.php +++ b/tests/DomTest.php @@ -525,4 +525,11 @@ public function testRandomTagInMiddleOfText() $this->assertEquals('

      Hello, this is just a test in which <55 names with some other text> should be interpreted as text

      ', $dom->outerHtml); } + + public function testHttpCall() + { + $dom = new Dom(); + $dom->loadFromUrl('http://google.com'); + $this->assertNotEmpty($dom->outerHtml); + } } From b523b1d785f65e414a4bf865b776c0d3dcacba5d Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sat, 19 Sep 2020 19:17:40 +0000 Subject: [PATCH 60/68] Added tifelift verbage --- CHANGELOG.md | 14 ++++++++++++++ README.md | 11 +++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ef3240dd..beb5ec6f 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## 3.1.0 + +- Updated to include Tidelift subscription option. +- Removed php-coverall. +- Removed Guzzle 6 Adapter. +- Added support for Guzzle 7. + ## 3.0.1 ### Changed @@ -37,6 +44,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Removed `load()` method in Dom object. - Removed support for php 7.1. +## 2.2.1 + +### Added +- Added php_cs. +- Added support for PSR7 requests. +- Added the attribute type dto. + ## 2.2.0 ### Added diff --git a/README.md b/README.md index c194c8be..b8f4b7ae 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ PHP Html Parser ========================== -Version 3.0.1 +Version 3.0.2 [![Build Status](https://travis-ci.org/paquettg/php-html-parser.png)](https://travis-ci.org/paquettg/php-html-parser) [![Coverage Status](https://coveralls.io/repos/paquettg/php-html-parser/badge.png)](https://coveralls.io/r/paquettg/php-html-parser) @@ -20,7 +20,7 @@ $ composer require paquettg/php-html-parser This package can be found on [packagist](https://packagist.org/packages/paquettg/php-html-parser) and is best loaded using [composer](http://getcomposer.org/). 
We support php 7.2, 7.3, and 7.4. -Usage +Basic Usage ----- You can find many examples of how to use the dom parser and any of its parts (which you will most likely never touch) in the tests directory. The tests are done using PHPUnit and are very small, a few lines each, and are a great place to start. Given that, I'll still be showing a few examples of how the package should be used. The following example is a very simplistic usage of the package. @@ -38,6 +38,13 @@ echo $a->text; // "click here" The above will output "click here". Simple no? There are many ways to get the same result from the dome, such as `$dom->getElementsbyTag('a')[0]` or `$dom->find('a', 0)` which can all be found in the tests or in the code itself. +Support PHP Html Parser Financially +-------------- + +Get supported PHP Html Parser and help fund the project with the [Tidelift Subscription](https://tidelift.com/subscription/pkg/packagist-paquettg-php-html-parser?utm_source=packagist-paquettg-php-html-parser&utm_medium=referral&utm_campaign=enterprise). + +Tidelift delivers commercial support and maintenance for the open source dependencies you use to build your applications. Save time, reduce risk, and improve code health, while paying the maintainers of the exact dependencies you use. 
+ Loading Files ------------------ From b200ae9894af3ba309115c120799f3f273d3e2aa Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sat, 19 Sep 2020 19:27:31 +0000 Subject: [PATCH 61/68] Updated readme versions --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5b30a95d..c89f3bdf 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ PHP Html Parser ========================== -Version 3.0.2 +Version 3.1.0 [![Build Status](https://travis-ci.org/paquettg/php-html-parser.png)](https://travis-ci.org/paquettg/php-html-parser) [![Coverage Status](https://coveralls.io/repos/paquettg/php-html-parser/badge.png)](https://coveralls.io/r/paquettg/php-html-parser) From f5c2dd9b8abd03cfe9383efe2575c7ff0d9711ea Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sat, 19 Sep 2020 19:35:11 +0000 Subject: [PATCH 62/68] Create SECURITY.md --- SECURITY.md | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 SECURITY.md diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000..3fc4dfcc --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,11 @@ +# Security Policy + +## Supported Versions + +We only support the most recent version with security fixes. + +## Reporting a Vulnerability + +If you have found any issues that might have security implications, please refer to https://tidelift.com/security + +Do not report security reports publicly. From 9851d9875721109d0f5ee07f389f673500670a16 Mon Sep 17 00:00:00 2001 From: RajaTaimur7 <72160749+RajaTaimur7@users.noreply.github.com> Date: Sat, 3 Oct 2020 00:23:08 +0500 Subject: [PATCH 63/68] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c89f3bdf..2d30b978 100755 --- a/README.md +++ b/README.md @@ -258,7 +258,7 @@ unset($a); echo $dom; // '

      Hey bro,
      :)

      '); ``` -You can modify the text of `TextNode` objects easely. Please note that, if you set an encoding, the new text will be encoded using the existing encoding. +You can modify the text of `TextNode` objects easily. Please note that, if you set an encoding, the new text will be encoded using the existing encoding. ```php use PHPHtmlParser\Dom; From 5572180311dc69ba195e7e6945570a73395ef9c9 Mon Sep 17 00:00:00 2001 From: Leon Kessler Date: Mon, 26 Oct 2020 15:55:22 +0000 Subject: [PATCH 64/68] Fixes #247 numbers in comments can cause php fatal errors. --- src/PHPHtmlParser/Dom/Tag.php | 2 ++ tests/Node/TextTest.php | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/src/PHPHtmlParser/Dom/Tag.php b/src/PHPHtmlParser/Dom/Tag.php index 29b68bf7..2aeb6aa8 100644 --- a/src/PHPHtmlParser/Dom/Tag.php +++ b/src/PHPHtmlParser/Dom/Tag.php @@ -329,6 +329,8 @@ public function makeOpeningTag() } catch (AttributeNotFoundException $e) { // attribute that was in the array not found in the array... let's continue. 
continue; + } catch (\TypeError $e) { + $val = null; } $val = $attributeDTO->getValue(); if (\is_null($val)) { diff --git a/tests/Node/TextTest.php b/tests/Node/TextTest.php index 44298fc9..ce7f0f59 100755 --- a/tests/Node/TextTest.php +++ b/tests/Node/TextTest.php @@ -4,6 +4,7 @@ use PHPHtmlParser\Dom; use PHPHtmlParser\Dom\Node\TextNode; +use PHPHtmlParser\Options; use PHPUnit\Framework\TestCase; use stringEncode\Encode; @@ -74,4 +75,14 @@ public function testSetTextEncoded() $node->setText('biz baz'); $this->assertEquals('biz baz', $node->text()); } + + public function testCommentWithNumbers() { + $dom = new Dom; + $options = new Options(); + $options->setCleanupInput(false); + $dom->setOptions($options); + $dom->loadStr(''); + $output = $dom->outerHtml; + $this->assertContains('', $output); + } } From 81341e1cfb9ce843ce50bd9b3715733ec5e5abfb Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 1 Nov 2020 19:45:49 +0000 Subject: [PATCH 65/68] Updated change log to reflect fix --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index beb5ec6f..eafc6357 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,8 +5,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## 3.1.1 + +### Changed +- Fixed issue with numbers in comments + ## 3.1.0 +### Changed - Updated to include Tidelift subscription option. - Removed php-coverall. - Removed Guzzle 6 Adapter. From ec1bc10b6acfa69cff1a7259a5abd051ad7f42d4 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 1 Nov 2020 19:50:43 +0000 Subject: [PATCH 66/68] fixed #246 Fixed php version dependency. 
--- CHANGELOG.md | 3 ++- composer.json | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eafc6357..065e5b36 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## 3.1.1 ### Changed -- Fixed issue with numbers in comments +- Fixed issue with numbers in comments. +- Updated minimum php version to correct version. ## 3.1.0 diff --git a/composer.json b/composer.json index f8ca8450..166886f7 100755 --- a/composer.json +++ b/composer.json @@ -13,7 +13,7 @@ } ], "require": { - "php": ">=7.1", + "php": ">=7.2", "ext-mbstring": "*", "ext-zlib": "*", "ext-curl": "*", From 40c335b512969bbfeb819771eabd130a40170338 Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 1 Nov 2020 20:15:40 +0000 Subject: [PATCH 67/68] fix #233 - Made comments self-closing --- CHANGELOG.md | 1 + src/PHPHtmlParser/Content.php | 16 ++++++++++++ src/PHPHtmlParser/Dom/Parser.php | 8 ++++++ src/PHPHtmlParser/Enum/StringToken.php | 2 ++ tests/Dom/CommentTest.php | 34 ++++++++++++++++++++++++++ tests/Node/TextTest.php | 10 -------- 6 files changed, 61 insertions(+), 10 deletions(-) create mode 100644 tests/Dom/CommentTest.php diff --git a/CHANGELOG.md b/CHANGELOG.md index 065e5b36..3fbf0bb4 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Fixed issue with numbers in comments. - Updated minimum php version to correct version. +- Comment tags are now self-closing when cleanup input is set to false. ## 3.1.0 diff --git a/src/PHPHtmlParser/Content.php b/src/PHPHtmlParser/Content.php index 888a6039..f1332175 100755 --- a/src/PHPHtmlParser/Content.php +++ b/src/PHPHtmlParser/Content.php @@ -72,6 +72,22 @@ public function char(?int $char = null): string return $this->content[$char ?? $this->pos] ?? 
''; } + /** + * Gets a string from the current character position. + * + * @param int $length + * @return string + */ + public function string(int $length = 1): string + { + $string = ''; + $position = $this->pos; + do { + $string .= $this->char($position++); + } while ($position < $this->pos + $length); + return $string; + } + /** * Moves the current position forward. * diff --git a/src/PHPHtmlParser/Dom/Parser.php b/src/PHPHtmlParser/Dom/Parser.php index 418e535c..7ed310cb 100644 --- a/src/PHPHtmlParser/Dom/Parser.php +++ b/src/PHPHtmlParser/Dom/Parser.php @@ -183,6 +183,14 @@ private function parseTag(Options $options, Content $content, int $size): TagDTO ->setOpening('setClosing(' ?>') ->selfClosing(); + } elseif($content->string(3) == '!--') { + // comment tag + $tag = $content->fastForward(3) + ->copyByToken(StringToken::CLOSECOMMENT(), true); + $tag = (new Tag($tag)) + ->setOpening('') + ->selfClosing(); } else { $tag = \strtolower($content->copyByToken(StringToken::SLASH(), true)); if (\trim($tag) == '') { diff --git a/src/PHPHtmlParser/Enum/StringToken.php b/src/PHPHtmlParser/Enum/StringToken.php index 6b60d520..7a209e00 100644 --- a/src/PHPHtmlParser/Enum/StringToken.php +++ b/src/PHPHtmlParser/Enum/StringToken.php @@ -11,6 +11,7 @@ * @method static StringToken EQUAL() * @method static StringToken SLASH() * @method static StringToken ATTR() + * @method static StringToken CLOSECOMMENT() */ class StringToken extends Enum { @@ -18,4 +19,5 @@ class StringToken extends Enum private const EQUAL = ' =/>'; private const SLASH = " />\r\n\t"; private const ATTR = ' >'; + private const CLOSECOMMENT = '-->'; } diff --git a/tests/Dom/CommentTest.php b/tests/Dom/CommentTest.php new file mode 100644 index 00000000..3f10696e --- /dev/null +++ b/tests/Dom/CommentTest.php @@ -0,0 +1,34 @@ +setCleanupInput(false); + $dom->loadStr('', $options); + $this->dom = $dom; + } + + public function tearDown() + { + Mockery::close(); + } + + public function testLoadCommentInnerHtml() 
+ { + $this->assertEquals('', $this->dom->innerHtml); + } +} diff --git a/tests/Node/TextTest.php b/tests/Node/TextTest.php index ce7f0f59..f94c4962 100755 --- a/tests/Node/TextTest.php +++ b/tests/Node/TextTest.php @@ -75,14 +75,4 @@ public function testSetTextEncoded() $node->setText('biz baz'); $this->assertEquals('biz baz', $node->text()); } - - public function testCommentWithNumbers() { - $dom = new Dom; - $options = new Options(); - $options->setCleanupInput(false); - $dom->setOptions($options); - $dom->loadStr(''); - $output = $dom->outerHtml; - $this->assertContains('', $output); - } } From 7c05e4192a918cb72902499d275a5b9fa7779d7e Mon Sep 17 00:00:00 2001 From: Gilles Paquette Date: Sun, 1 Nov 2020 20:33:18 +0000 Subject: [PATCH 68/68] Removed version number from readme --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 2d30b978..6889b079 100755 --- a/README.md +++ b/README.md @@ -1,8 +1,6 @@ PHP Html Parser ========================== -Version 3.1.0 - [![Build Status](https://travis-ci.org/paquettg/php-html-parser.png)](https://travis-ci.org/paquettg/php-html-parser) [![Coverage Status](https://coveralls.io/repos/paquettg/php-html-parser/badge.png)](https://coveralls.io/r/paquettg/php-html-parser) [![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/paquettg/php-html-parser/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/paquettg/php-html-parser/?branch=master)