', $parent->outerHtml());
-
+
$childa->setAttribute('href', 'https://www.google.com');
-
+
$this->assertEquals('link', $childa->outerHtml());
}
@@ -286,7 +286,7 @@ public function testText()
$a = new Tag('a');
$node = new HtmlNode($a);
$node->addChild(new TextNode('link'));
-
+
$this->assertEquals('link', $node->text());
}
@@ -295,7 +295,7 @@ public function testTextTwice()
$a = new Tag('a');
$node = new HtmlNode($a);
$node->addChild(new TextNode('link'));
-
+
$text = $node->text();
$this->assertEquals($text, $node->text());
}
@@ -312,7 +312,7 @@ public function testTextMagic()
{
$node = new HtmlNode('a');
$node->addChild(new TextNode('link'));
-
+
$this->assertEquals('link', $node->text);
}
@@ -358,7 +358,7 @@ public function testGetAttribute()
'doubleQuote' => true,
],
]);
-
+
$this->assertEquals('outerlink rounded', $node->getAttribute('class'));
}
@@ -375,7 +375,7 @@ public function testGetAttributeMagic()
'doubleQuote' => true,
],
]);
-
+
$this->assertEquals('http://google.com', $node->href);
}
@@ -392,7 +392,7 @@ public function testGetAttributes()
'doubleQuote' => true,
],
]);
-
+
$this->assertEquals('outerlink rounded', $node->getAttributes()['class']);
}
@@ -420,6 +420,18 @@ public function testRemoveAllAttributes()
$this->assertEquals(0, count($node->getAttributes()));
}
+ public function testSetTag()
+ {
+ $node = new HtmlNode('div');
+ $this->assertEquals('', $node->outerHtml());
+
+ $node->setTag('p');
+ $this->assertEquals('', $node->outerHtml());
+
+ $node->setTag(new Tag('span'));
+ $this->assertEquals('', $node->outerHtml());
+ }
+
public function testCountable()
{
$div = new Tag('div');
From 8a551ccda8e777fe031c519dae7809524be4b03f Mon Sep 17 00:00:00 2001
From: Harry Merritt
Date: Fri, 10 Jan 2020 21:13:31 +0000
Subject: [PATCH 02/68] Add custom headers to curl request - pass headers as an
option when using loadFromUrl
---
src/PHPHtmlParser/Curl.php | 7 ++++++-
src/PHPHtmlParser/CurlInterface.php | 3 ++-
src/PHPHtmlParser/Dom.php | 2 +-
3 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/src/PHPHtmlParser/Curl.php b/src/PHPHtmlParser/Curl.php
index 353b00f0..8eb90451 100644
--- a/src/PHPHtmlParser/Curl.php
+++ b/src/PHPHtmlParser/Curl.php
@@ -15,10 +15,11 @@ class Curl implements CurlInterface
* A simple curl implementation to get the content of the url.
*
* @param string $url
+ * @param array $options
* @return string
* @throws CurlException
*/
- public function get(string $url): string
+ public function get(string $url, array $options): string
{
$ch = curl_init($url);
@@ -26,6 +27,10 @@ public function get(string $url): string
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
}
+ if (isset($options['curlHeaders'])) {
+ curl_setopt($ch, CURLOPT_HTTPHEADER, $options['curlHeaders']);
+ }
+
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
diff --git a/src/PHPHtmlParser/CurlInterface.php b/src/PHPHtmlParser/CurlInterface.php
index 1d5d96c8..ff6ac97f 100644
--- a/src/PHPHtmlParser/CurlInterface.php
+++ b/src/PHPHtmlParser/CurlInterface.php
@@ -13,7 +13,8 @@ interface CurlInterface
* This method should return the content of the url in a string
*
* @param string $url
+ * @param array $options
* @return string
*/
- public function get(string $url): string;
+ public function get(string $url, array $options): string;
}
diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php
index cafce57c..cc46aa5f 100644
--- a/src/PHPHtmlParser/Dom.php
+++ b/src/PHPHtmlParser/Dom.php
@@ -191,7 +191,7 @@ public function loadFromUrl(string $url, array $options = [], CurlInterface $cur
// use the default curl interface
$curl = new Curl;
}
- $content = $curl->get($url);
+ $content = $curl->get($url, $options);
return $this->loadStr($content, $options);
}
From 02b2d0caa3a03d9e4829204a87194a54b5beefce Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Sun, 12 Jan 2020 20:40:54 +0000
Subject: [PATCH 03/68] Added support for php 7.4
---
.travis.yml | 1 +
CHANGELOG.md | 5 +++++
README.md | 4 ++--
src/PHPHtmlParser/Dom/Tag.php | 4 ++--
4 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index 37036761..6f7354cd 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,6 +4,7 @@ php:
- 7.1
- 7.2
- 7.3
+ - 7.4
install:
- composer self-update
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 400a20f1..5c14284b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
+### Added
+- Added support for php 7.4
+
+## 2.1.0
+
### Added
- New `removeSmartyScripts` configuration setting. Defaults to true.
- Added `declare(strict_types=1)` to all source files.
diff --git a/README.md b/README.md
index 8085a64c..f89a09d7 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
PHP Html Parser
==========================
-Version 2.1.0
+Version 2.2.0
[](https://travis-ci.org/paquettg/php-html-parser)
[](https://coveralls.io/r/paquettg/php-html-parser)
@@ -18,7 +18,7 @@ Install the latest version using composer.
$ composer require paquettg/php-html-parser
```
-This package can be found on [packagist](https://packagist.org/packages/paquettg/php-html-parser) and is best loaded using [composer](http://getcomposer.org/). We support php 7.1, 7.2, and 7.3.
+This package can be found on [packagist](https://packagist.org/packages/paquettg/php-html-parser) and is best loaded using [composer](http://getcomposer.org/). We support php 7.1, 7.2, 7.3, and 7.4.
Usage
-----
diff --git a/src/PHPHtmlParser/Dom/Tag.php b/src/PHPHtmlParser/Dom/Tag.php
index 54a1865f..f773e9f1 100644
--- a/src/PHPHtmlParser/Dom/Tag.php
+++ b/src/PHPHtmlParser/Dom/Tag.php
@@ -296,11 +296,11 @@ public function getAttributes()
* @param string $key
* @return mixed
*/
- public function getAttribute(string $key)
+ public function getAttribute(string $key):array
{
$key = strtolower($key);
if ( ! isset($this->attr[$key])) {
- return null;
+ return ['value' => null, 'doubleQuote' => true];
}
$value = $this->attr[$key]['value'];
if (is_string($value) && ! is_null($this->encode)) {
From e2d2d2eb72d5db0183c6960a3a3306e7b81e514d Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Thu, 22 Aug 2019 08:59:57 -0400
Subject: [PATCH 04/68] Fixed small issues with the Dom object
---
src/PHPHtmlParser/Dom.php | 14 +++++++++++---
src/PHPHtmlParser/Dom/AbstractNode.php | 4 ++--
src/PHPHtmlParser/Dom/Tag.php | 2 +-
3 files changed, 14 insertions(+), 6 deletions(-)
diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php
index cc46aa5f..961519a6 100644
--- a/src/PHPHtmlParser/Dom.php
+++ b/src/PHPHtmlParser/Dom.php
@@ -252,7 +252,14 @@ public function find(string $selector, int $nth = null)
{
$this->isLoaded();
- return $this->root->find($selector, $nth, $this->options->get('depthFirstSearch'));
+ $depthFirstSearch = $this->options->get('depthFirstSearch');
+ if (is_bool($depthFirstSearch)) {
+ $result = $this->root->find($selector, $nth, $depthFirstSearch);
+ } else {
+ $result = $this->root->find($selector, $nth);
+ }
+
+ return $result;
}
/**
@@ -793,6 +800,7 @@ protected function detectCharset(): bool
return false;
}
+ /** @var AbstractNode $meta */
$meta = $this->root->find('meta[http-equiv=Content-Type]', 0);
if (is_null($meta)) {
// could not find meta tag
@@ -800,8 +808,8 @@ protected function detectCharset(): bool
return false;
}
- $content = $meta->content;
- if (empty($content)) {
+ $content = $meta->getAttribute('content');
+ if (is_null($content)) {
// could not find content
$this->root->propagateEncoding($encode);
diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php
index ac86f538..3c5e23ff 100644
--- a/src/PHPHtmlParser/Dom/AbstractNode.php
+++ b/src/PHPHtmlParser/Dom/AbstractNode.php
@@ -342,9 +342,9 @@ public function getAttributes(): array
* on the tag of this node.
*
* @param string $key
- * @return mixed
+ * @return string|null
*/
- public function getAttribute(string $key)
+ public function getAttribute(string $key): ?string
{
$attribute = $this->tag->getAttribute($key);
if ( ! is_null($attribute)) {
diff --git a/src/PHPHtmlParser/Dom/Tag.php b/src/PHPHtmlParser/Dom/Tag.php
index f773e9f1..f95d7871 100644
--- a/src/PHPHtmlParser/Dom/Tag.php
+++ b/src/PHPHtmlParser/Dom/Tag.php
@@ -294,7 +294,7 @@ public function getAttributes()
* Returns an attribute by the key
*
* @param string $key
- * @return mixed
+ * @return array|null
*/
public function getAttribute(string $key):array
{
From 6bc74388321a5df2133e30375f3de61fd2ed5446 Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Thu, 22 Aug 2019 09:09:18 -0400
Subject: [PATCH 05/68] Added more type checking to avoid strict type errors.
---
src/PHPHtmlParser/Curl.php | 5 +++++
src/PHPHtmlParser/Dom/InnerNode.php | 10 ++++++++--
2 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/src/PHPHtmlParser/Curl.php b/src/PHPHtmlParser/Curl.php
index 8eb90451..b3e33edc 100644
--- a/src/PHPHtmlParser/Curl.php
+++ b/src/PHPHtmlParser/Curl.php
@@ -22,6 +22,9 @@ class Curl implements CurlInterface
public function get(string $url, array $options): string
{
$ch = curl_init($url);
+ if ($ch === false) {
+ throw new CurlException('Curl Init return `false`.');
+ }
if ( ! ini_get('open_basedir')) {
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
@@ -42,6 +45,8 @@ public function get(string $url, array $options): string
// there was a problem
$error = curl_error($ch);
throw new CurlException('Error retrieving "'.$url.'" ('.$error.')');
+ } elseif ($content === true) {
+ throw new CurlException('Unexpected return value of content set to true.');
}
return $content;
diff --git a/src/PHPHtmlParser/Dom/InnerNode.php b/src/PHPHtmlParser/Dom/InnerNode.php
index 8ae2b9f8..3ca23893 100644
--- a/src/PHPHtmlParser/Dom/InnerNode.php
+++ b/src/PHPHtmlParser/Dom/InnerNode.php
@@ -165,7 +165,10 @@ public function addChild(AbstractNode $child, int $before = -1): bool
array_splice($children, $index, 0, [$insert]);
// add the child
- $this->children = array_combine($keys, $children);
+ $combination = array_combine($keys, $children);
+ if ($combination !== false) {
+ $this->children = $combination;
+ }
// tell child I am the new parent
$child->setParent($this);
@@ -338,7 +341,10 @@ public function replaceChild(int $childId, AbstractNode $newChild): void
$keys = array_keys($this->children);
$index = array_search($childId, $keys, true);
$keys[$index] = $newChild->id();
- $this->children = array_combine($keys, $this->children);
+ $combination = array_combine($keys, $this->children);
+ if ($combination !== false) {
+ $this->children = $combination;
+ }
$this->children[$newChild->id()] = [
'prev' => $oldChild['prev'],
'node' => $newChild,
From 50a909aea06c59791d80821e72f21bd23c1858ea Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Fri, 23 Aug 2019 22:07:02 -0400
Subject: [PATCH 06/68] Added Infection as a dev dependency
---
.gitattributes | 21 +++++++++++----------
.gitignore | 1 +
composer.json | 3 ++-
infection.json.dist | 14 ++++++++++++++
4 files changed, 28 insertions(+), 11 deletions(-)
create mode 100644 infection.json.dist
diff --git a/.gitattributes b/.gitattributes
index afc2bfbc..9f59affd 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,10 +1,11 @@
-/tests export-ignore
-/.scrutinizar.yml export-ignore
-/.travis.yml export-ignore
-/.gitignore export-ignore
-/CHANGELOG.md export-ignore
-/CONTRIBUTING.md export-ignore
-/LICENSE.md export-ignore
-/README.md export-ignore
-/phpunit.php export-ignore
-/phpunit.xml export-ignore
+/tests export-ignore
+/.scrutinizar.yml export-ignore
+/.travis.yml export-ignore
+/.gitignore export-ignore
+/CHANGELOG.md export-ignore
+/CONTRIBUTING.md export-ignore
+/LICENSE.md export-ignore
+/README.md export-ignore
+/phpunit.php export-ignore
+/phpunit.xml export-ignore
+/infection.json.dist export-ignore
diff --git a/.gitignore b/.gitignore
index b871be44..274cf429 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
composer.phar
composer.lock
+infection.log
/vendor/
.idea/
*.swp
diff --git a/composer.json b/composer.json
index de617750..76f1f5ec 100644
--- a/composer.json
+++ b/composer.json
@@ -21,7 +21,8 @@
"require-dev": {
"phpunit/phpunit": "^7.5.1",
"mockery/mockery": "^1.2",
- "php-coveralls/php-coveralls": "^2.1"
+ "php-coveralls/php-coveralls": "^2.1",
+ "infection/infection": "^0.13.4"
},
"autoload": {
"psr-4": {
diff --git a/infection.json.dist b/infection.json.dist
new file mode 100644
index 00000000..0243ccf4
--- /dev/null
+++ b/infection.json.dist
@@ -0,0 +1,14 @@
+{
+ "timeout": 10,
+ "source": {
+ "directories": [
+ "src\/PHPHtmlParser"
+ ]
+ },
+ "logs": {
+ "text": "infection.log"
+ },
+ "mutators": {
+ "@default": true
+ }
+}
\ No newline at end of file
From 5c7fe62f6a90fb90940bf3f8996ed2a44acc28cd Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Sat, 24 Aug 2019 21:37:53 -0400
Subject: [PATCH 07/68] Issue #115 fixed and test added
---
src/PHPHtmlParser/Dom.php | 1 +
src/PHPHtmlParser/Dom/HtmlNode.php | 2 +-
tests/Options/StrictTest.php | 10 ++++++++++
3 files changed, 12 insertions(+), 1 deletion(-)
diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php
index 961519a6..0c833bee 100644
--- a/src/PHPHtmlParser/Dom.php
+++ b/src/PHPHtmlParser/Dom.php
@@ -748,6 +748,7 @@ protected function parseTag(): array
}
$this->content->skipByToken('blank');
+ $tag = strtolower($tag);
if ($this->content->char() == '/') {
// self closing tag
$node->getTag()->selfClosing();
diff --git a/src/PHPHtmlParser/Dom/HtmlNode.php b/src/PHPHtmlParser/Dom/HtmlNode.php
index 1e81234e..5217bb85 100644
--- a/src/PHPHtmlParser/Dom/HtmlNode.php
+++ b/src/PHPHtmlParser/Dom/HtmlNode.php
@@ -201,7 +201,7 @@ protected function clear(): void
$this->text = null;
$this->textWithChildren = null;
- if (is_null($this->parent) === false) {
+ if (!is_null($this->parent)) {
$this->parent->clear();
}
}
diff --git a/tests/Options/StrictTest.php b/tests/Options/StrictTest.php
index a76ded60..e7f22f0c 100644
--- a/tests/Options/StrictTest.php
+++ b/tests/Options/StrictTest.php
@@ -53,4 +53,14 @@ public function testConfigStrictMissingAttribute()
$this->assertEquals("Tag 'p' has an attribute 'block' with out a value! (character #22)", $e->getMessage());
}
}
+
+ public function testConfigStrictBRTag()
+ {
+ $dom = new Dom;
+ $dom->setOptions([
+ 'strict' => true,
+ ]);
+ $dom->load(' ');
+ $this->assertTrue(true);
+ }
}
From 566aaa2f17002b15494cf67c625dd23be858bda8 Mon Sep 17 00:00:00 2001
From: Rik van der Heijden
Date: Wed, 11 Sep 2019 21:08:51 +0200
Subject: [PATCH 08/68] Add a failing test
---
tests/DomTest.php | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/tests/DomTest.php b/tests/DomTest.php
index b44cbb06..733342f8 100644
--- a/tests/DomTest.php
+++ b/tests/DomTest.php
@@ -483,4 +483,14 @@ public function testCaseSensitivity()
$FooBar = $dom->find('FooBar');
$this->assertEquals('asdf', $FooBar->Attribute);
}
+
+ public function testEmptyAttribute()
+ {
+ $str = '
blah
what
';
+ $dom = new Dom();
+ $dom->load($str);
+
+ $items = $dom->find('.summary .foo');
+ $this->assertEquals(1, count($items));
+ }
}
From b42ff35020023603e5bce9f0dd985762715c985a Mon Sep 17 00:00:00 2001
From: Rik van der Heijden
Date: Wed, 11 Sep 2019 21:12:00 +0200
Subject: [PATCH 09/68] Fix failing test by adding null-coalescing check
---
src/PHPHtmlParser/Selector/Selector.php | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/PHPHtmlParser/Selector/Selector.php b/src/PHPHtmlParser/Selector/Selector.php
index b3aaae57..56d1478a 100644
--- a/src/PHPHtmlParser/Selector/Selector.php
+++ b/src/PHPHtmlParser/Selector/Selector.php
@@ -354,7 +354,7 @@ public function checkComparison(array $rule, AbstractNode $node): bool
// handle multiple classes
if ( ! $check && $rule['key'] == 'class') {
- $nodeClasses = explode(' ', $node->getAttribute('class'));
+ $nodeClasses = explode(' ', $node->getAttribute('class') ?? '');
foreach ($rule['value'] as $value) {
foreach ($nodeClasses as $class) {
if ( ! empty($class)) {
From 454373742045fe357ddba4cff4971f3c9a73e519 Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Fri, 27 Sep 2019 21:32:12 -0400
Subject: [PATCH 10/68] Updated coverall travis configuration
---
.travis.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.travis.yml b/.travis.yml
index 6f7354cd..9ffb2529 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -15,6 +15,6 @@ script:
- php vendor/bin/phpunit --coverage-clover build/logs/clover.xml
after_script:
- - php vendor/bin/coveralls
+ - travis_retry php vendor/bin/coveralls
- wget https://scrutinizer-ci.com/ocular.phar
- php ocular.phar code-coverage:upload --format=php-clover build/logs/clover.xml
From c2cf01ac46ff09026aff48986d1c302d506ff63c Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Fri, 27 Sep 2019 21:33:51 -0400
Subject: [PATCH 11/68] Fixes #192 - Fixed documentation
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index f89a09d7..18dacfbc 100644
--- a/README.md
+++ b/README.md
@@ -49,7 +49,7 @@ require "vendor/autoload.php";
use PHPHtmlParser\Dom;
$dom = new Dom;
-$dom->loadFromFile('tests/big.html');
+$dom->loadFromFile('tests/data/big.html');
$contents = $dom->find('.content-border');
echo count($contents); // 10
From 12f382f3530cfbc2bb96cb3ae20fc322575685bb Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Fri, 27 Sep 2019 21:37:44 -0400
Subject: [PATCH 12/68] Fixed #190 - Added gzip detection and decoding.
---
composer.json | 4 +++-
src/PHPHtmlParser/Dom.php | 5 +++++
2 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/composer.json b/composer.json
index 76f1f5ec..f68a0a5a 100644
--- a/composer.json
+++ b/composer.json
@@ -16,7 +16,9 @@
"require": {
"php": ">=7.1",
"ext-mbstring": "*",
- "paquettg/string-encode": "~1.0.0"
+ "paquettg/string-encode": "~1.0.0",
+ "ext-zlib": "*",
+ "ext-curl": "*"
},
"require-dev": {
"phpunit/phpunit": "^7.5.1",
diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php
index 0c833bee..d83ec07f 100644
--- a/src/PHPHtmlParser/Dom.php
+++ b/src/PHPHtmlParser/Dom.php
@@ -513,6 +513,11 @@ protected function clean(string $str): string
return $str;
}
+ $is_gzip = 0 === mb_strpos($str, "\x1f" . "\x8b" . "\x08", 0, "US-ASCII");
+ if ($is_gzip) {
+ $str = gzdecode($str);
+ }
+
// remove white space before closing tags
$str = mb_eregi_replace("'\s+>", "'>", $str);
$str = mb_eregi_replace('"\s+>', '">', $str);
From bad55125647fff01476d8dffd9ff62f2ce356e70 Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Sat, 28 Sep 2019 22:29:21 -0400
Subject: [PATCH 13/68] Fixes #116 - Added support for multiple selectors.
---
.travis.yml | 1 -
CHANGELOG.md | 6 +
src/PHPHtmlParser/Selector/Parser.php | 22 +++-
src/PHPHtmlParser/Selector/Selector.php | 152 ++++++++++++++++--------
tests/DomTest.php | 13 +-
5 files changed, 140 insertions(+), 54 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index 9ffb2529..20e644a7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,7 +4,6 @@ php:
- 7.1
- 7.2
- 7.3
- - 7.4
install:
- composer self-update
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5c14284b..0c766a67 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
+### Changed
+
+- Fixed bug with multiple selectors query.
+
+## 2.1.0
+
### Added
- Added support for php 7.4
diff --git a/src/PHPHtmlParser/Selector/Parser.php b/src/PHPHtmlParser/Selector/Parser.php
index 9bea98e2..ce6a59c3 100644
--- a/src/PHPHtmlParser/Selector/Parser.php
+++ b/src/PHPHtmlParser/Selector/Parser.php
@@ -1,4 +1,7 @@
-getTag()->name()
+ if ($rule['tag'] == '*'
+ || $rule['tag'] == $node->getTag()
+ ->name()
) {
++$count;
if ($count == $rule['key']) {
@@ -132,15 +132,14 @@ protected function seek(array $nodes, array $rule, array $options): array
/** @var InnerNode $node */
foreach ($nodes as $node) {
// check if we are a leaf
- if ($node instanceof LeafNode ||
- ! $node->hasChildren()
+ if ($node instanceof LeafNode || !$node->hasChildren()
) {
continue;
}
$children = [];
- $child = $node->firstChild();
- while ( ! is_null($child)) {
+ $child = $node->firstChild();
+ while (!is_null($child)) {
// wild card, grab all
if ($rule['tag'] == '*' && is_null($rule['key'])) {
$return[] = $child;
@@ -149,11 +148,11 @@ protected function seek(array $nodes, array $rule, array $options): array
}
$pass = $this->checkTag($rule, $child);
- if ($pass && ! is_null($rule['key'])) {
+ if ($pass && !is_null($rule['key'])) {
$pass = $this->checkKey($rule, $child);
}
- if ($pass && ! is_null($rule['key']) &&
- ! is_null($rule['value']) && $rule['value'] != '*'
+ if ($pass && !is_null($rule['key']) && !is_null($rule['value'])
+ && $rule['value'] != '*'
) {
$pass = $this->checkComparison($rule, $child);
}
@@ -163,14 +162,15 @@ protected function seek(array $nodes, array $rule, array $options): array
$return[] = $child;
} else {
// this child failed to be matched
- if ($child instanceof InnerNode &&
- $child->hasChildren()
+ if ($child instanceof InnerNode && $child->hasChildren()
) {
if ($this->depthFirst) {
- if ( ! isset($options['checkGrandChildren']) ||
- $options['checkGrandChildren']) {
+ if (!isset($options['checkGrandChildren'])
+ || $options['checkGrandChildren']
+ ) {
// we have a child that failed but are not leaves.
- $matches = $this->seek([$child], $rule, $options);
+ $matches = $this->seek([$child], $rule,
+ $options);
foreach ($matches as $match) {
$return[] = $match;
}
@@ -185,9 +185,9 @@ protected function seek(array $nodes, array $rule, array $options): array
$child = $this->getNextChild($node, $child);
}
- if (( ! isset($options['checkGrandChildren']) ||
- $options['checkGrandChildren'])
- && count($children) > 0
+ if ((!isset($options['checkGrandChildren'])
+ || $options['checkGrandChildren'])
+ && count($children) > 0
) {
// we have children that failed but are not leaves.
$matches = $this->seek($children, $rule, $options);
@@ -202,15 +202,17 @@ protected function seek(array $nodes, array $rule, array $options): array
/**
* Attempts to match the given arguments with the given operator.
- *
* @param string $operator
* @param string $pattern
* @param string $value
* @return bool
*/
- protected function match(string $operator, string $pattern, string $value): bool
- {
- $value = strtolower($value);
+ protected function match(
+ string $operator,
+ string $pattern,
+ string $value
+ ): bool {
+ $value = strtolower($value);
$pattern = strtolower($pattern);
switch ($operator) {
case '=':
@@ -218,15 +220,17 @@ protected function match(string $operator, string $pattern, string $value): bool
case '!=':
return $value !== $pattern;
case '^=':
- return preg_match('/^'.preg_quote($pattern, '/').'/', $value) == 1;
+ return preg_match('/^' . preg_quote($pattern, '/') . '/',
+ $value) == 1;
case '$=':
- return preg_match('/'.preg_quote($pattern, '/').'$/', $value) == 1;
+ return preg_match('/' . preg_quote($pattern, '/') . '$/',
+ $value) == 1;
case '*=':
if ($pattern[0] == '/') {
return preg_match($pattern, $value) == 1;
}
- return preg_match("/".$pattern."/i", $value) == 1;
+ return preg_match("/" . $pattern . "/i", $value) == 1;
}
return false;
@@ -235,7 +239,6 @@ protected function match(string $operator, string $pattern, string $value): bool
/**
* Attempts to figure out what the alteration will be for
* the next element.
- *
* @param array $rule
* @return array
*/
@@ -251,7 +254,6 @@ protected function alterNext(array $rule): array
/**
* Flattens the option array.
- *
* @param array $optionsArray
* @return array
*/
@@ -269,13 +271,14 @@ protected function flattenOptions(array $optionsArray)
/**
* Returns the next child or null if no more children.
- *
* @param AbstractNode $node
* @param AbstractNode $currentChild
* @return AbstractNode|null
*/
- protected function getNextChild(AbstractNode $node, AbstractNode $currentChild)
- {
+ protected function getNextChild(
+ AbstractNode $node,
+ AbstractNode $currentChild
+ ) {
try {
$child = null;
if ($node instanceof InnerNode) {
@@ -292,15 +295,14 @@ protected function getNextChild(AbstractNode $node, AbstractNode $currentChild)
/**
* Checks tag condition from rules against node.
- *
- * @param array $rule
+ * @param array $rule
* @param AbstractNode $node
* @return bool
*/
protected function checkTag(array $rule, AbstractNode $node): bool
{
- if ( ! empty($rule['tag']) && $rule['tag'] != $node->getTag()->name() &&
- $rule['tag'] != '*'
+ if (!empty($rule['tag']) && $rule['tag'] != $node->getTag()->name()
+ && $rule['tag'] != '*'
) {
return false;
}
@@ -310,20 +312,39 @@ protected function checkTag(array $rule, AbstractNode $node): bool
/**
* Checks key condition from rules against node.
- *
- * @param array $rule
+ * @param array $rule
* @param AbstractNode $node
* @return bool
*/
protected function checkKey(array $rule, AbstractNode $node): bool
{
- if ($rule['noKey']) {
- if ( ! is_null($node->getAttribute($rule['key']))) {
- return false;
+ if (!is_array($rule['key'])) {
+ if ($rule['noKey']) {
+ if (!is_null($node->getAttribute($rule['key']))) {
+ return false;
+ }
+ } else {
+ if ($rule['key'] != 'plaintext'
+ && !$node->hasAttribute($rule['key'])
+ ) {
+ return false;
+ }
}
} else {
- if ($rule['key'] != 'plaintext' && !$node->hasAttribute($rule['key'])) {
- return false;
+ if ($rule['noKey']) {
+ foreach ($rule['key'] as $key) {
+ if (!is_null($node->getAttribute($key))) {
+ return false;
+ }
+ }
+ } else {
+ foreach ($rule['key'] as $key) {
+ if ($key != 'plaintext'
+ && !$node->hasAttribute($key)
+ ) {
+ return false;
+ }
+ }
}
}
@@ -332,8 +353,7 @@ protected function checkKey(array $rule, AbstractNode $node): bool
/**
* Checks comparison condition from rules against node.
- *
- * @param array $rule
+ * @param array $rule
* @param AbstractNode $node
* @return bool
*/
@@ -342,18 +362,46 @@ public function checkComparison(array $rule, AbstractNode $node): bool
if ($rule['key'] == 'plaintext') {
// plaintext search
$nodeValue = $node->text();
+ $result = $this->checkNodeValue($nodeValue, $rule, $node);
} else {
// normal search
- $nodeValue = $node->getAttribute($rule['key']);
+ if (!is_array($rule['key'])) {
+ $nodeValue = $node->getAttribute($rule['key']);
+ $result = $this->checkNodeValue($nodeValue, $rule, $node);
+ } else {
+ $result = true;
+ foreach ($rule['key'] as $index => $key) {
+ $nodeValue = $node->getAttribute($key);
+ $result = $result &&
+ $this->checkNodeValue($nodeValue, $rule, $node, $index);
+ }
+ }
}
+ return $result;
+ }
+
+ /**
+ * @param string|null $nodeValue
+ * @param array $rule
+ * @param AbstractNode $node
+ * @param int|null $index
+ * @return bool
+ */
+ private function checkNodeValue(
+ ?string $nodeValue,
+ array $rule,
+ AbstractNode $node,
+ ?int $index = null
+ ) : bool {
$check = false;
if (!is_array($rule['value'])) {
$check = $this->match($rule['operator'], $rule['value'], $nodeValue);
}
// handle multiple classes
- if ( ! $check && $rule['key'] == 'class') {
+ $key = $rule['key'];
+ if (!$check && $key == 'class') {
$nodeClasses = explode(' ', $node->getAttribute('class') ?? '');
foreach ($rule['value'] as $value) {
foreach ($nodeClasses as $class) {
@@ -368,6 +416,8 @@ public function checkComparison(array $rule, AbstractNode $node): bool
break;
}
}
+ } elseif (!$check && is_array($key)) {
+ $check = $this->match($rule['operator'], $rule['value'][$index], $nodeValue);
}
return $check;
diff --git a/tests/DomTest.php b/tests/DomTest.php
index 733342f8..094c39a3 100644
--- a/tests/DomTest.php
+++ b/tests/DomTest.php
@@ -1,4 +1,6 @@
-find('.summary .foo');
$this->assertEquals(1, count($items));
}
+
+ public function testMultipleSquareSelector()
+ {
+ $dom = new Dom();
+ $dom->load('');
+
+ $items = $dom->find('input[type=text][name=foo][baz=fig]');
+ $this->assertEquals(1, count($items));
+ }
}
From 69c30e15093e81450e3a370765ef2fad50f80566 Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Sun, 12 Jan 2020 21:20:29 +0000
Subject: [PATCH 14/68] Fixed unit tests
---
CHANGELOG.md | 13 ++++++++-----
tests/DomTest.php | 2 +-
tests/StaticDomTest.php | 2 +-
3 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0c766a67..c585d0c9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,14 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
-### Changed
+### Added
+- Added support for php 7.4.
+- Added custom header support for curl request.
+- Added gzip detection and decoding.
+- Added additional type checking.
+### Changed
- Fixed bug with multiple selectors query.
+- Updated documentation.
+- Fixed issue with Dom object.
-## 2.1.0
-
-### Added
-- Added support for php 7.4
## 2.1.0
diff --git a/tests/DomTest.php b/tests/DomTest.php
index 094c39a3..cc486457 100644
--- a/tests/DomTest.php
+++ b/tests/DomTest.php
@@ -224,7 +224,7 @@ public function testLoadFromUrl()
$curl = Mockery::mock('PHPHtmlParser\CurlInterface');
$curl->shouldReceive('get')
->once()
- ->with('http://google.com')
+ ->with('http://google.com', [])
->andReturn(file_get_contents('tests/data/files/small.html'));
$dom = new Dom;
diff --git a/tests/StaticDomTest.php b/tests/StaticDomTest.php
index ad2318cb..a6fba2c0 100644
--- a/tests/StaticDomTest.php
+++ b/tests/StaticDomTest.php
@@ -61,7 +61,7 @@ public function testLoadFromUrl()
$curl = Mockery::mock('PHPHtmlParser\CurlInterface');
$curl->shouldReceive('get')
->once()
- ->with('http://google.com')
+ ->with('http://google.com', [])
->andReturn(file_get_contents('tests/data/files/small.html'));
Dom::loadFromUrl('http://google.com', [], $curl);
From c8c4f23dd02191bd2ca8fbaded7e6792bd176c53 Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Sun, 12 Jan 2020 21:26:57 +0000
Subject: [PATCH 15/68] Added back 7.4 to travis
---
.travis.yml | 1 +
1 file changed, 1 insertion(+)
diff --git a/.travis.yml b/.travis.yml
index 20e644a7..9ffb2529 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,6 +4,7 @@ php:
- 7.1
- 7.2
- 7.3
+ - 7.4
install:
- composer self-update
From 12b94f69637f946ca35af6554f0da67d9d176ca8 Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Mon, 13 Jan 2020 00:03:24 +0000
Subject: [PATCH 16/68] Removed version from composer.json
---
composer.json | 1 -
1 file changed, 1 deletion(-)
diff --git a/composer.json b/composer.json
index f68a0a5a..1672cfd7 100644
--- a/composer.json
+++ b/composer.json
@@ -1,7 +1,6 @@
{
"name": "paquettg/php-html-parser",
"type": "library",
- "version": "2.1.0",
"description": "An HTML DOM parser. It allows you to manipulate HTML. Find tags on an HTML page with selectors just like jQuery.",
"keywords": ["html", "dom", "parser"],
"homepage": "https://github.com/paquettg/php-html-parser",
From 8e5735987714451424df85dae4265563fb4f2cf9 Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Mon, 13 Jan 2020 02:02:32 +0000
Subject: [PATCH 17/68] Fixed issue #97
---
CHANGELOG.md | 5 +++++
src/PHPHtmlParser/Dom.php | 4 ++--
tests/DomTest.php | 16 ++++++++++++++++
3 files changed, 23 insertions(+), 2 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c585d0c9..b23c6998 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
+### Changed
+- Fixed issue with \ causing an infite loop.
+
+## 2.2.0
+
### Added
- Added support for php 7.4.
- Added custom header support for curl request.
diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php
index d83ec07f..25c94852 100644
--- a/src/PHPHtmlParser/Dom.php
+++ b/src/PHPHtmlParser/Dom.php
@@ -708,7 +708,7 @@ protected function parseTag(): array
case '"':
$attr['doubleQuote'] = true;
$this->content->fastForward(1);
- $string = $this->content->copyUntil('"', true, true);
+ $string = $this->content->copyUntil('"', true);
do {
$moreString = $this->content->copyUntilUnless('"', '=>');
$string .= $moreString;
@@ -720,7 +720,7 @@ protected function parseTag(): array
case "'":
$attr['doubleQuote'] = false;
$this->content->fastForward(1);
- $string = $this->content->copyUntil("'", true, true);
+ $string = $this->content->copyUntil("'", true);
do {
$moreString = $this->content->copyUntilUnless("'", '=>');
$string .= $moreString;
diff --git a/tests/DomTest.php b/tests/DomTest.php
index cc486457..755962cf 100644
--- a/tests/DomTest.php
+++ b/tests/DomTest.php
@@ -142,6 +142,13 @@ public function testLoadNoValueAttribute()
$this->assertEquals('
Main content here
', $dom->innerHtml);
}
+ public function testLoadBackslashAttributeValue()
+ {
+ $dom = new Dom;
+ $dom->load('
Main content here
');
+ $this->assertEquals('
Main content here
', $dom->innerHtml);
+ }
+
public function testLoadNoValueAttributeBefore()
{
$dom = new Dom;
@@ -504,4 +511,13 @@ public function testMultipleSquareSelector()
$items = $dom->find('input[type=text][name=foo][baz=fig]');
$this->assertEquals(1, count($items));
}
+
+ public function testLoadGetAttributeWithBackslash()
+ {
+ $dom = new Dom();
+ $dom->load('
');
+ $imgs = $dom->find('img', 0);
+ $this->assertEquals("/img/test.png", $imgs->getAttribute('src'));
+
+ }
}
From c8e2b6dac69e366f83b9ec3a4959e31d8c146782 Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Mon, 13 Jan 2020 23:45:05 +0000
Subject: [PATCH 18/68] Added phan level 1 and updated docs according to
recommendation
---
.gitattributes | 1 +
.phan/config.php | 382 ++++++++++++++++++
composer.json | 3 +-
src/PHPHtmlParser/Content.php | 24 +-
src/PHPHtmlParser/Dom.php | 60 ++-
src/PHPHtmlParser/Dom/AbstractNode.php | 37 +-
src/PHPHtmlParser/Dom/Collection.php | 4 +-
src/PHPHtmlParser/Dom/HtmlNode.php | 9 +-
src/PHPHtmlParser/Dom/InnerNode.php | 50 ++-
src/PHPHtmlParser/Dom/Tag.php | 6 +-
src/PHPHtmlParser/Dom/TextNode.php | 15 +-
.../Exceptions/LogicalException.php | 14 +
src/PHPHtmlParser/Finder.php | 3 +
src/PHPHtmlParser/Options.php | 28 +-
src/PHPHtmlParser/Selector/Parser.php | 4 +-
src/PHPHtmlParser/Selector/Selector.php | 27 +-
src/PHPHtmlParser/StaticDom.php | 4 +-
17 files changed, 588 insertions(+), 83 deletions(-)
create mode 100644 .phan/config.php
create mode 100644 src/PHPHtmlParser/Exceptions/LogicalException.php
diff --git a/.gitattributes b/.gitattributes
index 9f59affd..93691f38 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -9,3 +9,4 @@
/phpunit.php export-ignore
/phpunit.xml export-ignore
/infection.json.dist export-ignore
+/.phan export-ignore
diff --git a/.phan/config.php b/.phan/config.php
new file mode 100644
index 00000000..8c25e9d5
--- /dev/null
+++ b/.phan/config.php
@@ -0,0 +1,382 @@
+=7.1"
+ 'target_php_version' => '7.1',
+
+ // If enabled, missing properties will be created when
+ // they are first seen. If false, we'll report an
+ // error message if there is an attempt to write
+ // to a class property that wasn't explicitly
+ // defined.
+ 'allow_missing_properties' => false,
+
+ // If enabled, null can be cast to any type and any
+ // type can be cast to null. Setting this to true
+ // will cut down on false positives.
+ 'null_casts_as_any_type' => false,
+
+ // If enabled, allow null to be cast as any array-like type.
+ //
+ // This is an incremental step in migrating away from `null_casts_as_any_type`.
+ // If `null_casts_as_any_type` is true, this has no effect.
+ 'null_casts_as_array' => false,
+
+ // If enabled, allow any array-like type to be cast to null.
+ // This is an incremental step in migrating away from `null_casts_as_any_type`.
+ // If `null_casts_as_any_type` is true, this has no effect.
+ 'array_casts_as_null' => false,
+
+ // If enabled, scalars (int, float, bool, string, null)
+ // are treated as if they can cast to each other.
+ // This does not affect checks of array keys. See `scalar_array_key_cast`.
+ 'scalar_implicit_cast' => false,
+
+ // If enabled, any scalar array keys (int, string)
+ // are treated as if they can cast to each other.
+ // E.g. `array` can cast to `array` and vice versa.
+ // Normally, a scalar type such as int could only cast to/from int and mixed.
+ 'scalar_array_key_cast' => false,
+
+ // If this has entries, scalars (int, float, bool, string, null)
+ // are allowed to perform the casts listed.
+ //
+ // E.g. `['int' => ['float', 'string'], 'float' => ['int'], 'string' => ['int'], 'null' => ['string']]`
+ // allows casting null to a string, but not vice versa.
+ // (subset of `scalar_implicit_cast`)
+ 'scalar_implicit_partial' => [],
+
+ // If enabled, Phan will warn if **any** type in a method invocation's object
+ // is definitely not an object,
+ // or if **any** type in an invoked expression is not a callable.
+ // Setting this to true will introduce numerous false positives
+ // (and reveal some bugs).
+ 'strict_method_checking' => true,
+
+ // If enabled, Phan will warn if **any** type of the object expression for a property access
+ // does not contain that property.
+ 'strict_object_checking' => true,
+
+ // If enabled, Phan will warn if **any** type in the argument's union type
+ // cannot be cast to a type in the parameter's expected union type.
+ // Setting this to true will introduce numerous false positives
+ // (and reveal some bugs).
+ 'strict_param_checking' => true,
+
+ // If enabled, Phan will warn if **any** type in a property assignment's union type
+ // cannot be cast to a type in the property's declared union type.
+ // Setting this to true will introduce numerous false positives
+ // (and reveal some bugs).
+ 'strict_property_checking' => true,
+
+ // If enabled, Phan will warn if **any** type in a returned value's union type
+ // cannot be cast to the declared return type.
+ // Setting this to true will introduce numerous false positives
+ // (and reveal some bugs).
+ 'strict_return_checking' => true,
+
+ // If true, seemingly undeclared variables in the global
+ // scope will be ignored.
+ //
+ // This is useful for projects with complicated cross-file
+ // globals that you have no hope of fixing.
+ 'ignore_undeclared_variables_in_global_scope' => false,
+
+ // Set this to false to emit `PhanUndeclaredFunction` issues for internal functions that Phan has signatures for,
+ // but aren't available in the codebase, or from Reflection.
+ // (may lead to false positives if an extension isn't loaded)
+ //
+ // If this is true(default), then Phan will not warn.
+ //
+ // Even when this is false, Phan will still infer return values and check parameters of internal functions
+ // if Phan has the signatures.
+ 'ignore_undeclared_functions_with_known_signatures' => false,
+
+ // Backwards Compatibility Checking. This is slow
+ // and expensive, but you should consider running
+ // it before upgrading your version of PHP to a
+ // new version that has backward compatibility
+ // breaks.
+ //
+ // If you are migrating from PHP 5 to PHP 7,
+ // you should also look into using
+ // [php7cc (no longer maintained)](https://github.com/sstalle/php7cc)
+ // and [php7mar](https://github.com/Alexia/php7mar),
+ // which have different backwards compatibility checks.
+ 'backward_compatibility_checks' => false,
+
+ // If true, check to make sure the return type declared
+ // in the doc-block (if any) matches the return type
+ // declared in the method signature.
+ 'check_docblock_signature_return_type_match' => true,
+
+ // If true, make narrowed types from phpdoc params override
+ // the real types from the signature, when real types exist.
+ // (E.g. allows specifying desired lists of subclasses,
+ // or to indicate a preference for non-nullable types over nullable types)
+ //
+ // Affects analysis of the body of the method and the param types passed in by callers.
+ //
+ // (*Requires `check_docblock_signature_param_type_match` to be true*)
+ 'prefer_narrowed_phpdoc_param_type' => true,
+
+ // (*Requires `check_docblock_signature_return_type_match` to be true*)
+ //
+ // If true, make narrowed types from phpdoc returns override
+ // the real types from the signature, when real types exist.
+ //
+ // (E.g. allows specifying desired lists of subclasses,
+ // or to indicate a preference for non-nullable types over nullable types)
+ //
+ // This setting affects the analysis of return statements in the body of the method and the return types passed in by callers.
+ 'prefer_narrowed_phpdoc_return_type' => true,
+
+ // If enabled, check all methods that override a
+ // parent method to make sure its signature is
+ // compatible with the parent's.
+ //
+ // This check can add quite a bit of time to the analysis.
+ //
+ // This will also check if final methods are overridden, etc.
+ 'analyze_signature_compatibility' => true,
+
+ // This setting maps case-insensitive strings to union types.
+ //
+ // This is useful if a project uses phpdoc that differs from the phpdoc2 standard.
+ //
+ // If the corresponding value is the empty string,
+ // then Phan will ignore that union type (E.g. can ignore 'the' in `@return the value`)
+ //
+ // If the corresponding value is not empty,
+ // then Phan will act as though it saw the corresponding UnionTypes(s)
+ // when the keys show up in a UnionType of `@param`, `@return`, `@var`, `@property`, etc.
+ //
+ // This matches the **entire string**, not parts of the string.
+ // (E.g. `@return the|null` will still look for a class with the name `the`, but `@return the` will be ignored with the below setting)
+ //
+ // (These are not aliases, this setting is ignored outside of doc comments).
+ // (Phan does not check if classes with these names exist)
+ //
+ // Example setting: `['unknown' => '', 'number' => 'int|float', 'char' => 'string', 'long' => 'int', 'the' => '']`
+ 'phpdoc_type_mapping' => [],
+
+ // Set to true in order to attempt to detect dead
+ // (unreferenced) code. Keep in mind that the
+ // results will only be a guess given that classes,
+ // properties, constants and methods can be referenced
+ // as variables (like `$class->$property` or
+ // `$class->$method()`) in ways that we're unable
+ // to make sense of.
+ 'dead_code_detection' => false,
+
+ // Set to true in order to attempt to detect unused variables.
+ // `dead_code_detection` will also enable unused variable detection.
+ //
+ // This has a few known false positives, e.g. for loops or branches.
+ 'unused_variable_detection' => true,
+
+ // Set to true in order to attempt to detect redundant and impossible conditions.
+ //
+ // This has some false positives involving loops,
+ // variables set in branches of loops, and global variables.
+ 'redundant_condition_detection' => true,
+
+ // If enabled, Phan will act as though it's certain of real return types of a subset of internal functions,
+ // even if those return types aren't available in reflection (real types were taken from php 7.3 or 8.0-dev, depending on target_php_version).
+ //
+ // Note that with php 7 and earlier, php would return null or false for many internal functions if the argument types or counts were incorrect.
+ // As a result, enabling this setting with target_php_version 8.0 may result in false positives for `--redundant-condition-detection` when codebases also support php 7.x.
+ 'assume_real_types_for_internal_functions' => true,
+
+ // If true, this runs a quick version of checks that takes less
+ // time at the cost of not running as thorough
+ // of an analysis. You should consider setting this
+ // to true only when you wish you had more **undiagnosed** issues
+ // to fix in your code base.
+ //
+ // In quick-mode the scanner doesn't rescan a function
+ // or a method's code block every time a call is seen.
+ // This means that the problem here won't be detected:
+ //
+ // ```php
+ // false,
+
+ // Enable or disable support for generic templated
+ // class types.
+ 'generic_types_enabled' => true,
+
+ // Override to hardcode existence and types of (non-builtin) globals in the global scope.
+ // Class names should be prefixed with `\`.
+ //
+ // (E.g. `['_FOO' => '\FooClass', 'page' => '\PageClass', 'userId' => 'int']`)
+ 'globals_type_map' => [],
+
+ // The minimum severity level to report on. This can be
+ // set to `Issue::SEVERITY_LOW`, `Issue::SEVERITY_NORMAL` or
+ // `Issue::SEVERITY_CRITICAL`. Setting it to only
+ // critical issues is a good place to start on a big
+ // sloppy mature code base.
+ 'minimum_severity' => Issue::SEVERITY_LOW,
+
+ // Add any issue types (such as `'PhanUndeclaredMethod'`)
+ // to this black-list to inhibit them from being reported.
+ 'suppress_issue_types' => [],
+
+ // A regular expression to match files to be excluded
+ // from parsing and analysis and will not be read at all.
+ //
+ // This is useful for excluding groups of test or example
+ // directories/files, unanalyzable files, or files that
+ // can't be removed for whatever reason.
+ // (e.g. `'@Test\.php$@'`, or `'@vendor/.*/(tests|Tests)/@'`)
+ 'exclude_file_regex' => '@^vendor/.*/(tests?|Tests?)/@',
+
+ // A list of files that will be excluded from parsing and analysis
+ // and will not be read at all.
+ //
+ // This is useful for excluding hopelessly unanalyzable
+ // files that can't be removed for whatever reason.
+ 'exclude_file_list' => [],
+
+ // A directory list that defines files that will be excluded
+ // from static analysis, but whose class and method
+ // information should be included.
+ //
+ // Generally, you'll want to include the directories for
+ // third-party code (such as "vendor/") in this list.
+ //
+ // n.b.: If you'd like to parse but not analyze 3rd
+ // party code, directories containing that code
+ // should be added to the `directory_list` as well as
+ // to `exclude_analysis_directory_list`.
+ 'exclude_analysis_directory_list' => [
+ 'vendor/',
+ ],
+
+ // Enable this to enable checks of require/include statements referring to valid paths.
+ 'enable_include_path_checks' => true,
+
+ // The number of processes to fork off during the analysis
+ // phase.
+ 'processes' => 1,
+
+ // List of case-insensitive file extensions supported by Phan.
+ // (e.g. `['php', 'html', 'htm']`)
+ 'analyzed_file_extensions' => [
+ 'php',
+ ],
+
+ // You can put paths to stubs of internal extensions in this config option.
+ // If the corresponding extension is **not** loaded, then Phan will use the stubs instead.
+ // Phan will continue using its detailed type annotations,
+ // but load the constants, classes, functions, and classes (and their Reflection types)
+ // from these stub files (doubling as valid php files).
+ // Use a different extension from php to avoid accidentally loading these.
+ // The `tools/make_stubs` script can be used to generate your own stubs (compatible with php 7.0+ right now)
+ //
+ // (e.g. `['xdebug' => '.phan/internal_stubs/xdebug.phan_php']`)
+ 'autoload_internal_extension_signatures' => [],
+
+ // A list of plugin files to execute.
+ //
+ // Plugins which are bundled with Phan can be added here by providing their name (e.g. `'AlwaysReturnPlugin'`)
+ //
+ // Documentation about available bundled plugins can be found [here](https://github.com/phan/phan/tree/master/.phan/plugins).
+ //
+ // Alternately, you can pass in the full path to a PHP file with the plugin's implementation (e.g. `'vendor/phan/phan/.phan/plugins/AlwaysReturnPlugin.php'`)
+ 'plugins' => [
+ 'AlwaysReturnPlugin',
+ 'DollarDollarPlugin',
+ 'DuplicateArrayKeyPlugin',
+ 'DuplicateExpressionPlugin',
+ 'PregRegexCheckerPlugin',
+ 'PrintfCheckerPlugin',
+ 'SleepCheckerPlugin',
+ 'UnreachableCodePlugin',
+ 'UseReturnValuePlugin',
+ 'EmptyStatementListPlugin',
+ 'StrictComparisonPlugin',
+ 'LoopVariableReusePlugin',
+ ],
+
+ // A list of directories that should be parsed for class and
+ // method information. After excluding the directories
+ // defined in `exclude_analysis_directory_list`, the remaining
+ // files will be statically analyzed for errors.
+ //
+ // Thus, both first-party and third-party code being used by
+ // your application should be included in this list.
+ 'directory_list' => [
+ 'src/PHPHtmlParser',
+ 'vendor/infection/infection/src',
+ 'vendor/mockery/mockery/library',
+ 'vendor/paquettg/string-encode/src',
+ 'vendor/phan/phan/src/Phan',
+ 'vendor/php-coveralls/php-coveralls/src',
+ 'vendor/phpunit/phpunit/src',
+ ],
+
+ // A list of individual files to include in analysis
+ // with a path relative to the root directory of the
+ // project.
+ 'file_list' => [],
+];
diff --git a/composer.json b/composer.json
index 1672cfd7..e924886e 100644
--- a/composer.json
+++ b/composer.json
@@ -23,7 +23,8 @@
"phpunit/phpunit": "^7.5.1",
"mockery/mockery": "^1.2",
"php-coveralls/php-coveralls": "^2.1",
- "infection/infection": "^0.13.4"
+ "infection/infection": "^0.13.4",
+ "phan/phan": "^2.4"
},
"autoload": {
"psr-4": {
diff --git a/src/PHPHtmlParser/Content.php b/src/PHPHtmlParser/Content.php
index f2c6d5d3..93b3a73b 100644
--- a/src/PHPHtmlParser/Content.php
+++ b/src/PHPHtmlParser/Content.php
@@ -1,6 +1,12 @@
-pos;
if ( ! is_null($char)) {
@@ -135,8 +141,7 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal
$position = strpos($this->content, $string, $position);
if ($position === false) {
// reached the end
- $found = true;
- continue;
+ break;
}
if ($this->char($position - 1) == '\\') {
@@ -157,6 +162,9 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal
if ($position === false) {
// could not find character, just return the remaining of the content
$return = substr($this->content, $this->pos, $this->size - $this->pos);
+ if ($return === false) {
+ throw new LogicalException('Substr returned false with position '.$this->pos.'.');
+ }
$this->pos = $this->size;
return $return;
@@ -168,6 +176,9 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal
}
$return = substr($this->content, $this->pos, $position - $this->pos);
+ if ($return === false) {
+ throw new LogicalException('Substr returned false with position '.$this->pos.'.');
+ }
// set the new position
$this->pos = $position;
@@ -229,6 +240,9 @@ public function skip(string $string, bool $copy = false)
$return = $this;
if ($copy) {
$return = substr($this->content, $this->pos, $len);
+ if ($return === false) {
+ throw new LogicalException('Substr returned false with position '.$this->pos.'.');
+ }
}
// update the position
diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php
index 25c94852..6b44408f 100644
--- a/src/PHPHtmlParser/Dom.php
+++ b/src/PHPHtmlParser/Dom.php
@@ -12,6 +12,7 @@
use PHPHtmlParser\Exceptions\ParentNotFoundException;
use PHPHtmlParser\Exceptions\StrictException;
use PHPHtmlParser\Exceptions\UnknownChildTypeException;
+use PHPHtmlParser\Exceptions\LogicalException;
use stringEncode\Encode;
/**
@@ -167,10 +168,15 @@ public function load(string $str, array $options = []): Dom
* @throws ChildNotFoundException
* @throws CircularException
* @throws StrictException
+ * @throws LogicalException
*/
public function loadFromFile(string $file, array $options = []): Dom
{
- return $this->loadStr(file_get_contents($file), $options);
+ $content = file_get_contents($file);
+ if ($content === false) {
+ throw new LogicalException('file_get_contents failed and returned false when trying to read "'.$file.'".');
+ }
+ return $this->loadStr($content, $options);
}
/**
@@ -516,11 +522,20 @@ protected function clean(string $str): string
$is_gzip = 0 === mb_strpos($str, "\x1f" . "\x8b" . "\x08", 0, "US-ASCII");
if ($is_gzip) {
$str = gzdecode($str);
+ if ($str === false) {
+ throw new LogicalException('gzdecode returned false. Error when trying to decode the string.');
+ }
}
// remove white space before closing tags
$str = mb_eregi_replace("'\s+>", "'>", $str);
+ if ($str === false) {
+ throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to clean single quotes.');
+ }
$str = mb_eregi_replace('"\s+>', '">', $str);
+ if ($str === false) {
+ throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to clean double quotes.');
+ }
// clean out the \n\r
$replace = ' ';
@@ -528,36 +543,66 @@ protected function clean(string $str): string
$replace = '
';
}
$str = str_replace(["\r\n", "\r", "\n"], $replace, $str);
+ if ($str === false) {
+ throw new LogicalException('str_replace returned false instead of a string. Error when attempting to clean input string.');
+ }
// strip the doctype
$str = mb_eregi_replace("", '', $str);
+ if ($str === false) {
+ throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip the doctype.');
+ }
// strip out comments
$str = mb_eregi_replace("", '', $str);
+ if ($str === false) {
+ throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip comments.');
+ }
// strip out cdata
$str = mb_eregi_replace("", '', $str);
+ if ($str === false) {
+ throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip out cdata.');
+ }
// strip out ";
+ $dom = new Dom();
+ $dom->setOptions(['cleanupInput' => false,]);
+ $dom->load($html);
+ $this->assertSame($html, $dom->root->outerHtml());
+ }
+
public function testLoad()
{
$dom = new Dom;
From 0689a0468f47b479e49eff02dc04cf3f757b097f Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Mon, 27 Apr 2020 02:42:30 +0000
Subject: [PATCH 28/68] Added support for PSR7 requests
---
CHANGELOG.md | 6 ++++
composer.json | 5 ++-
src/PHPHtmlParser/Curl.php | 54 -----------------------------
src/PHPHtmlParser/CurlInterface.php | 20 -----------
src/PHPHtmlParser/Dom.php | 30 +++++++++++-----
src/PHPHtmlParser/StaticDom.php | 29 +++++++++++-----
tests/DomTest.php | 24 ++++++++-----
tests/StaticDomTest.php | 19 ++++++----
8 files changed, 80 insertions(+), 107 deletions(-)
delete mode 100755 src/PHPHtmlParser/Curl.php
delete mode 100755 src/PHPHtmlParser/CurlInterface.php
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d4921140..0039aa5e 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,9 +10,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
+### Added
+- Added support for PSR7 HTTP clients and requests for URL calls.
+
### Changed
- Fixed issue with \ causing an infinite loop.
+### Removed
+- Removed curl interface and curl implementation.
+
## 2.2.0
### Added
diff --git a/composer.json b/composer.json
index e924886e..8e643b31 100755
--- a/composer.json
+++ b/composer.json
@@ -17,7 +17,10 @@
"ext-mbstring": "*",
"paquettg/string-encode": "~1.0.0",
"ext-zlib": "*",
- "ext-curl": "*"
+ "ext-curl": "*",
+ "php-http/httplug": "^2.1",
+ "php-http/guzzle6-adapter": "^2.0",
+ "guzzlehttp/psr7": "^1.6"
},
"require-dev": {
"phpunit/phpunit": "^7.5.1",
diff --git a/src/PHPHtmlParser/Curl.php b/src/PHPHtmlParser/Curl.php
deleted file mode 100755
index b3e33edc..00000000
--- a/src/PHPHtmlParser/Curl.php
+++ /dev/null
@@ -1,54 +0,0 @@
-get($url, $options);
+ if (is_null($request)) {
+ $request = new Request('GET', $url);
+ }
+
+ $response = $client->sendRequest($request);
+ $content = $response->getBody()->getContents();
return $this->loadStr($content, $options);
}
diff --git a/src/PHPHtmlParser/StaticDom.php b/src/PHPHtmlParser/StaticDom.php
index 0114bb70..cb70d1d1 100755
--- a/src/PHPHtmlParser/StaticDom.php
+++ b/src/PHPHtmlParser/StaticDom.php
@@ -1,11 +1,17 @@
loadFromUrl($url, $options, $curl);
+ return $dom->loadFromUrl($url, $options, $client, $request);
}
/**
diff --git a/tests/DomTest.php b/tests/DomTest.php
index 3297923e..7a2cc4ef 100755
--- a/tests/DomTest.php
+++ b/tests/DomTest.php
@@ -228,14 +228,22 @@ public function testLoadFileBigTwicePreserveOption()
public function testLoadFromUrl()
{
- $curl = Mockery::mock('PHPHtmlParser\CurlInterface');
- $curl->shouldReceive('get')
- ->once()
- ->with('http://google.com', [])
- ->andReturn(file_get_contents('tests/data/files/small.html'));
-
- $dom = new Dom;
- $dom->loadFromUrl('http://google.com', [], $curl);
+ $streamMock = Mockery::mock(\Psr\Http\Message\StreamInterface::class);
+ $streamMock->shouldReceive('getContents')
+ ->once()
+ ->andReturn(file_get_contents('tests/data/files/small.html'));
+ $responseMock = Mockery::mock(\Psr\Http\Message\ResponseInterface::class);
+ $responseMock->shouldReceive('getBody')
+ ->once()
+ ->andReturn($streamMock);
+ $clientMock = Mockery::mock(\Psr\Http\Client\ClientInterface::class);
+ $clientMock->shouldReceive('sendRequest')
+ ->once()
+ ->andReturn($responseMock);
+
+
+ $dom = new Dom;
+ $dom->loadFromUrl('http://google.com', [], $clientMock);
$this->assertEquals('VonBurgermeister', $dom->find('.post-row div .post-user font', 0)->text);
}
diff --git a/tests/StaticDomTest.php b/tests/StaticDomTest.php
index a6fba2c0..ac9df656 100755
--- a/tests/StaticDomTest.php
+++ b/tests/StaticDomTest.php
@@ -58,13 +58,20 @@ public function testFindI()
public function testLoadFromUrl()
{
- $curl = Mockery::mock('PHPHtmlParser\CurlInterface');
- $curl->shouldReceive('get')
- ->once()
- ->with('http://google.com', [])
- ->andReturn(file_get_contents('tests/data/files/small.html'));
+ $streamMock = Mockery::mock(\Psr\Http\Message\StreamInterface::class);
+ $streamMock->shouldReceive('getContents')
+ ->once()
+ ->andReturn(file_get_contents('tests/data/files/small.html'));
+ $responseMock = Mockery::mock(\Psr\Http\Message\ResponseInterface::class);
+ $responseMock->shouldReceive('getBody')
+ ->once()
+ ->andReturn($streamMock);
+ $clientMock = Mockery::mock(\Psr\Http\Client\ClientInterface::class);
+ $clientMock->shouldReceive('sendRequest')
+ ->once()
+ ->andReturn($responseMock);
- Dom::loadFromUrl('http://google.com', [], $curl);
+ Dom::loadFromUrl('http://google.com', [], $clientMock);
$this->assertEquals('VonBurgermeister', Dom::find('.post-row div .post-user font', 0)->text);
}
From 1d4e3792b487387d1328f7a04bda2ae42e318770 Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Mon, 27 Apr 2020 04:04:03 +0000
Subject: [PATCH 29/68] Added php_cs
---
.gitattributes | 1 +
.gitignore | 1 +
.php_cs.dist | 148 +++++++++
composer.json | 5 +-
src/PHPHtmlParser/Content.php | 77 ++---
src/PHPHtmlParser/DTO/Tag/AttributeDTO.php | 109 ++++---
src/PHPHtmlParser/Dom.php | 251 +++++++---------
src/PHPHtmlParser/Dom/AbstractNode.php | 189 +++++-------
src/PHPHtmlParser/Dom/ArrayNode.php | 18 +-
src/PHPHtmlParser/Dom/Collection.php | 59 ++--
src/PHPHtmlParser/Dom/HtmlNode.php | 52 ++--
src/PHPHtmlParser/Dom/InnerNode.php | 146 ++++-----
src/PHPHtmlParser/Dom/LeafNode.php | 11 +-
src/PHPHtmlParser/Dom/Tag.php | 123 ++++----
src/PHPHtmlParser/Dom/TextNode.php | 50 ++--
.../Exceptions/ChildNotFoundException.php | 10 +-
.../Exceptions/CircularException.php | 9 +-
.../Exceptions/CurlException.php | 9 +-
.../Exceptions/EmptyCollectionException.php | 9 +-
.../Exceptions/LogicalException.php | 10 +-
.../Exceptions/NotLoadedException.php | 9 +-
.../Exceptions/ParentNotFoundException.php | 9 +-
.../Exceptions/StrictException.php | 9 +-
.../Tag/AttributeNotFoundException.php | 25 +-
.../Exceptions/UnknownChildTypeException.php | 9 +-
src/PHPHtmlParser/Finder.php | 14 +-
src/PHPHtmlParser/Options.php | 40 ++-
src/PHPHtmlParser/Selector/Parser.php | 59 ++--
.../Selector/ParserInterface.php | 5 +-
src/PHPHtmlParser/Selector/Selector.php | 170 +++++------
src/PHPHtmlParser/StaticDom.php | 48 ++-
tests/CollectionTest.php | 45 +--
tests/ContentTest.php | 11 +-
tests/DomTest.php | 166 +++++------
tests/Node/ChildrenTest.php | 80 ++---
tests/Node/HtmlTest.php | 99 +++---
tests/Node/ParentTest.php | 281 +++++++++---------
tests/Node/TagTest.php | 15 +-
tests/Node/TextTest.php | 16 +-
tests/Options/CleanupTest.php | 44 +--
tests/Options/PreserveLineBreaks.php | 24 +-
tests/Options/StrictTest.php | 34 +--
tests/Options/WhitespaceTextNodeTest.php | 14 +-
tests/OptionsTest.php | 21 +-
tests/Selector/SelectorTest.php | 50 ++--
tests/StaticDomTest.php | 17 +-
tests/data/MockNode.php | 14 +-
47 files changed, 1265 insertions(+), 1350 deletions(-)
create mode 100644 .php_cs.dist
diff --git a/.gitattributes b/.gitattributes
index 93691f38..ebfea7c7 100755
--- a/.gitattributes
+++ b/.gitattributes
@@ -10,3 +10,4 @@
/phpunit.xml export-ignore
/infection.json.dist export-ignore
/.phan export-ignore
+/.php_cs.dist export-ignore
diff --git a/.gitignore b/.gitignore
index 274cf429..9a550fad 100755
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,5 @@ composer.lock
infection.log
/vendor/
.idea/
+.php_cs.cache
*.swp
diff --git a/.php_cs.dist b/.php_cs.dist
new file mode 100644
index 00000000..56af284d
--- /dev/null
+++ b/.php_cs.dist
@@ -0,0 +1,148 @@
+in('src')
+ ->in('tests')
+;
+
+return PhpCsFixer\Config::create()
+ ->setRiskyAllowed(true)
+ ->setRules([
+ 'array_indentation' => true,
+ 'array_syntax' => ['syntax' => 'short'],
+ 'binary_operator_spaces' => ['align_double_arrow'=>true],
+ 'blank_line_after_namespace' => true,
+ 'blank_line_after_opening_tag' => true,
+ 'blank_line_before_statement' => ['statements'=>['return']],
+ 'braces' => ['allow_single_line_closure' => false],
+ 'cast_spaces' => true,
+ 'class_attributes_separation' => ['elements'=>['method']],
+ 'class_definition' => ['single_line'=>true],
+ 'combine_consecutive_issets' => true,
+ 'concat_space' => ['spacing' => 'one'],
+ 'declare_equal_normalize' => true,
+ 'elseif' => true,
+ 'encoding' => true,
+ 'full_opening_tag' => true,
+ 'function_declaration' => true,
+ 'function_typehint_space' => true,
+ 'include' => true,
+ 'increment_style' => true,
+ 'indentation_type' => true,
+ 'line_ending' => true,
+ 'linebreak_after_opening_tag' => true,
+ 'lowercase_cast' => true,
+ 'lowercase_keywords' => true,
+ 'lowercase_static_reference' => true,
+ 'magic_constant_casing' => true,
+ 'magic_method_casing' => true,
+ 'mb_str_functions' => false,
+ 'method_argument_space' => true,
+ 'native_function_casing' => true,
+ 'native_function_invocation' => true,
+ 'native_function_type_declaration_casing' => true,
+ 'new_with_braces' => true,
+ 'no_blank_lines_after_class_opening' => true,
+ 'no_blank_lines_after_phpdoc' => true,
+ 'no_break_comment' => true,
+ 'no_closing_tag' => true,
+ 'no_empty_comment' => true,
+ 'no_empty_phpdoc' => true,
+ 'no_empty_statement' => true,
+ 'no_extra_blank_lines' => ['tokens'=>[
+ 'curly_brace_block',
+ 'extra',
+ 'parenthesis_brace_block',
+ 'square_brace_block',
+ 'throw',
+ 'use',
+ ]],
+ 'no_leading_import_slash' => true,
+ 'no_leading_namespace_whitespace' => true,
+ 'no_mixed_echo_print' => true,
+ 'no_multiline_whitespace_around_double_arrow' => true,
+ 'no_null_property_initialization' => true,
+ 'no_short_bool_cast' => true,
+ 'no_singleline_whitespace_before_semicolons' => true,
+ 'no_superfluous_elseif' => true,
+ 'no_spaces_after_function_name' => true,
+ 'no_spaces_around_offset' => true,
+ 'no_spaces_inside_parenthesis' => true,
+ 'no_superfluous_phpdoc_tags' => ['allow_mixed' => true, 'allow_unused_params' => true],
+ 'no_trailing_comma_in_list_call' => true,
+ 'no_trailing_comma_in_singleline_array' => true,
+ 'no_trailing_whitespace' => true,
+ 'no_trailing_whitespace_in_comment' => true,
+ 'no_unneeded_control_parentheses' => true,
+ 'no_unneeded_curly_braces' => ['namespaces' => true],
+ 'no_unused_imports' => true,
+ 'no_useless_else' => true,
+ 'no_useless_return' => true,
+ 'no_whitespace_before_comma_in_array' => true,
+ 'no_whitespace_in_blank_line' => true,
+ 'normalize_index_brace' => true,
+ 'not_operator_with_space' => false,
+ 'object_operator_without_whitespace' => true,
+ 'ordered_class_elements' => true,
+ 'ordered_imports' => true,
+ 'php_unit_fqcn_annotation' => true,
+ 'phpdoc_align' => ['tags' => [
+ 'method',
+ 'param',
+ 'property',
+ 'return',
+ 'throws',
+ 'type',
+ 'var',
+ ]],
+ 'phpdoc_add_missing_param_annotation' => true,
+ 'phpdoc_annotation_without_dot' => true,
+ 'phpdoc_indent' => true,
+ 'phpdoc_inline_tag' => true,
+ 'phpdoc_no_access' => true,
+ 'phpdoc_no_alias_tag' => true,
+ 'phpdoc_no_package' => true,
+ 'phpdoc_no_useless_inheritdoc' => true,
+ 'phpdoc_order' => true,
+ 'phpdoc_return_self_reference' => true,
+ 'phpdoc_scalar' => true,
+ 'phpdoc_separation' => true,
+ 'phpdoc_single_line_var_spacing' => true,
+ 'phpdoc_summary' => true,
+ 'phpdoc_to_comment' => true,
+ 'phpdoc_trim' => true,
+ 'phpdoc_trim_consecutive_blank_line_separation' => true,
+ 'phpdoc_types' => true,
+ 'phpdoc_types_order' => ['null_adjustment' => 'always_last', 'sort_algorithm' => 'none'],
+ 'phpdoc_var_without_name' => true,
+ 'return_assignment' => true,
+ 'return_type_declaration' => true,
+ 'semicolon_after_instruction' => true,
+ 'simplified_null_return' => true,
+ 'short_scalar_cast' => true,
+ 'single_blank_line_at_eof' => true,
+ 'single_blank_line_before_namespace' => true,
+ 'single_class_element_per_statement' => true,
+ 'single_import_per_statement' => true,
+ 'single_line_after_imports' => true,
+ 'single_line_comment_style' => ['comment_types' => ['hash']],
+ 'single_line_throw' => true,
+ 'single_quote' => true,
+ 'single_trait_insert_per_statement' => true,
+ 'space_after_semicolon' => ['remove_in_empty_for_expressions'=>true],
+ 'standardize_increment' => true,
+ 'standardize_not_equals' => true,
+ 'switch_case_semicolon_to_colon' => true,
+ 'switch_case_space' => true,
+ 'ternary_operator_spaces' => true,
+ 'ternary_to_null_coalescing' => true,
+ 'trailing_comma_in_multiline_array' => true,
+ 'trim_array_spaces' => true,
+ 'unary_operator_spaces' => true,
+ 'visibility_required' => true,
+ 'whitespace_after_comma_in_array' => true,
+ 'yoda_style' => false,
+ ])
+ ->setFinder($finder)
+ ->setCacheFile(__DIR__.'/.php_cs.cache')
+;
\ No newline at end of file
diff --git a/composer.json b/composer.json
index 8e643b31..79258c58 100755
--- a/composer.json
+++ b/composer.json
@@ -15,9 +15,9 @@
"require": {
"php": ">=7.1",
"ext-mbstring": "*",
- "paquettg/string-encode": "~1.0.0",
"ext-zlib": "*",
"ext-curl": "*",
+ "paquettg/string-encode": "~1.0.0",
"php-http/httplug": "^2.1",
"php-http/guzzle6-adapter": "^2.0",
"guzzlehttp/psr7": "^1.6"
@@ -27,7 +27,8 @@
"mockery/mockery": "^1.2",
"php-coveralls/php-coveralls": "^2.1",
"infection/infection": "^0.13.4",
- "phan/phan": "^2.4"
+ "phan/phan": "^2.4",
+ "friendsofphp/php-cs-fixer": "^2.16"
},
"autoload": {
"psr-4": {
diff --git a/src/PHPHtmlParser/Content.php b/src/PHPHtmlParser/Content.php
index 93b3a73b..37415a91 100755
--- a/src/PHPHtmlParser/Content.php
+++ b/src/PHPHtmlParser/Content.php
@@ -1,20 +1,16 @@
-content = $content;
- $this->size = strlen($content);
- $this->pos = 0;
+ $this->size = \strlen($content);
+ $this->pos = 0;
}
/**
* Returns the current position of the content.
- *
- * @return int
*/
public function getPosition(): int
{
@@ -72,16 +64,15 @@ public function getPosition(): int
* Gets the current character we are at.
*
* @param ?int $char
- * @return string
*/
public function char(?int $char = null): string
{
$pos = $this->pos;
- if ( ! is_null($char)) {
+ if (!\is_null($char)) {
$pos = $char;
}
- if ( ! isset($this->content[$pos])) {
+ if (!isset($this->content[$pos])) {
return '';
}
@@ -91,8 +82,6 @@ public function char(?int $char = null): string
/**
* Moves the current position forward.
*
- * @param int $count
- * @return Content
* @chainable
*/
public function fastForward(int $count): Content
@@ -105,8 +94,6 @@ public function fastForward(int $count): Content
/**
* Moves the current position backward.
*
- * @param int $count
- * @return Content
* @chainable
*/
public function rewind(int $count): Content
@@ -121,11 +108,6 @@ public function rewind(int $count): Content
/**
* Copy the content until we find the given string.
- *
- * @param string $string
- * @param bool $char
- * @param bool $escape
- * @return string
*/
public function copyUntil(string $string, bool $char = false, bool $escape = false): string
{
@@ -136,9 +118,9 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal
if ($escape) {
$position = $this->pos;
- $found = false;
- while ( ! $found) {
- $position = strpos($this->content, $string, $position);
+ $found = false;
+ while (!$found) {
+ $position = \strpos($this->content, $string, $position);
if ($position === false) {
// reached the end
break;
@@ -153,17 +135,17 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal
$found = true;
}
} elseif ($char) {
- $position = strcspn($this->content, $string, $this->pos);
+ $position = \strcspn($this->content, $string, $this->pos);
$position += $this->pos;
} else {
- $position = strpos($this->content, $string, $this->pos);
+ $position = \strpos($this->content, $string, $this->pos);
}
if ($position === false) {
// could not find character, just return the remaining of the content
- $return = substr($this->content, $this->pos, $this->size - $this->pos);
+ $return = \substr($this->content, $this->pos, $this->size - $this->pos);
if ($return === false) {
- throw new LogicalException('Substr returned false with position '.$this->pos.'.');
+ throw new LogicalException('Substr returned false with position ' . $this->pos . '.');
}
$this->pos = $this->size;
@@ -175,9 +157,9 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal
return '';
}
- $return = substr($this->content, $this->pos, $position - $this->pos);
+ $return = \substr($this->content, $this->pos, $position - $this->pos);
if ($return === false) {
- throw new LogicalException('Substr returned false with position '.$this->pos.'.');
+ throw new LogicalException('Substr returned false with position ' . $this->pos . '.');
}
// set the new position
$this->pos = $position;
@@ -189,8 +171,6 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal
* Copies the content until the string is found and return it
* unless the 'unless' is found in the substring.
*
- * @param string $string
- * @param string $unless
* @return string
*/
public function copyUntilUnless(string $string, string $unless)
@@ -199,9 +179,9 @@ public function copyUntilUnless(string $string, string $unless)
$this->fastForward(1);
$foundString = $this->copyUntil($string, true, true);
- $position = strcspn($foundString, $unless);
- if ($position == strlen($foundString)) {
- return $string.$foundString;
+ $position = \strcspn($foundString, $unless);
+ if ($position == \strlen($foundString)) {
+ return $string . $foundString;
}
// rewind changes and return nothing
$this->pos = $lastPos;
@@ -210,12 +190,10 @@ public function copyUntilUnless(string $string, string $unless)
}
/**
- * Copies the content until it reaches the token string.,
+ * Copies the content until it reaches the token string.
*
- * @param string $token
- * @param bool $char
- * @param bool $escape
* @return string
+ *
* @uses $this->copyUntil()
*/
public function copyByToken(string $token, bool $char = false, bool $escape = false)
@@ -228,20 +206,18 @@ public function copyByToken(string $token, bool $char = false, bool $escape = fa
/**
* Skip a given set of characters.
*
- * @param string $string
- * @param bool $copy
* @return Content|string
*/
public function skip(string $string, bool $copy = false)
{
- $len = strspn($this->content, $string, $this->pos);
+ $len = \strspn($this->content, $string, $this->pos);
// make it chainable if they don't want a copy
$return = $this;
if ($copy) {
- $return = substr($this->content, $this->pos, $len);
+ $return = \substr($this->content, $this->pos, $len);
if ($return === false) {
- throw new LogicalException('Substr returned false with position '.$this->pos.'.');
+ throw new LogicalException('Substr returned false with position ' . $this->pos . '.');
}
}
@@ -254,9 +230,8 @@ public function skip(string $string, bool $copy = false)
/**
* Skip a given token of pre-defined characters.
*
- * @param string $token
- * @param bool $copy
* @return Content|string
+ *
* @uses $this->skip()
*/
public function skipByToken(string $token, bool $copy = false)
diff --git a/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php b/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php
index 44b9bb2f..489b843c 100755
--- a/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php
+++ b/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php
@@ -1,57 +1,52 @@
-value = $values['value'];
- $this->doubleQuote = $values['doubleQuote'];
- }
-
- /**
- * @return string
- */
- public function getValue(): ?string
- {
- return $this->value;
- }
-
- /**
- * @return bool
- */
- public function isDoubleQuote(): bool
- {
- return $this->doubleQuote;
- }
-
- public function htmlspecialcharsDecode(): void
- {
- $this->value = htmlspecialchars_decode($this->value);
- }
-
- /**
- * @param Encode $encode
- * @throws Exception
- */
- public function encodeValue(Encode $encode)
- {
- $this->value = $encode->convert($this->value);
- }
-}
+value = $values['value'];
+ $this->doubleQuote = $values['doubleQuote'];
+ }
+
+ public function getValue(): ?string
+ {
+ return $this->value;
+ }
+
+ public function isDoubleQuote(): bool
+ {
+ return $this->doubleQuote;
+ }
+
+ public function htmlspecialcharsDecode(): void
+ {
+ if (!\is_null($this->value)) {
+ $this->value = \htmlspecialchars_decode($this->value);
+ }
+ }
+
+ /**
+ * @throws Exception
+ */
+ public function encodeValue(Encode $encode)
+ {
+ $this->value = $encode->convert($this->value);
+ }
+}
diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php
index 99e5b796..b0b30a31 100755
--- a/src/PHPHtmlParser/Dom.php
+++ b/src/PHPHtmlParser/Dom.php
@@ -1,11 +1,11 @@
- at the end (html5 style)
+ * A list of tags where there should be no /> at the end (html5 style).
*
* @var array
*/
@@ -122,7 +119,6 @@ class Dom
/**
* Returns the inner html of the root node.
*
- * @return string
* @throws ChildNotFoundException
* @throws UnknownChildTypeException
*/
@@ -135,6 +131,7 @@ public function __toString(): string
* A simple wrapper around the root node.
*
* @param string $name
+ *
* @return mixed
*/
public function __get($name)
@@ -144,9 +141,7 @@ public function __get($name)
/**
* Attempts to load the dom from any resource, string, file, or URL.
- * @param string $str
- * @param array $options
- * @return Dom
+ *
* @throws ChildNotFoundException
* @throws CircularException
* @throws CurlException
@@ -156,11 +151,11 @@ public function load(string $str, array $options = []): Dom
{
AbstractNode::resetCount();
// check if it's a file
- if (strpos($str, "\n") === false && is_file($str)) {
+ if (\strpos($str, "\n") === false && \is_file($str)) {
return $this->loadFromFile($str, $options);
}
// check if it's a url
- if (preg_match("/^https?:\/\//i", $str)) {
+ if (\preg_match("/^https?:\/\//i", $str)) {
return $this->loadFromUrl($str, $options);
}
@@ -168,10 +163,8 @@ public function load(string $str, array $options = []): Dom
}
/**
- * Loads the dom from a document file/url
- * @param string $file
- * @param array $options
- * @return Dom
+ * Loads the dom from a document file/url.
+ *
* @throws ChildNotFoundException
* @throws CircularException
* @throws StrictException
@@ -179,32 +172,31 @@ public function load(string $str, array $options = []): Dom
*/
public function loadFromFile(string $file, array $options = []): Dom
{
- $content = file_get_contents($file);
+ $content = \file_get_contents($file);
if ($content === false) {
- throw new LogicalException('file_get_contents failed and returned false when trying to read "'.$file.'".');
+ throw new LogicalException('file_get_contents failed and returned false when trying to read "' . $file . '".');
}
+
return $this->loadStr($content, $options);
}
/**
* Use a curl interface implementation to attempt to load
* the content from a url.
- * @param string $url
- * @param array $options
+ *
* @param ClientInterface $client
- * @param RequestInterface|null $request
- * @return Dom
+ *
* @throws ChildNotFoundException
* @throws CircularException
* @throws StrictException
* @throws \Psr\Http\Client\ClientExceptionInterface
*/
- public function loadFromUrl(string $url, array $options = [], ClientInterface $client = null, RequestInterface $request = null): Dom
+ public function loadFromUrl(string $url, array $options = [], ?ClientInterface $client = null, ?RequestInterface $request = null): Dom
{
- if (is_null($client)) {
+ if (\is_null($client)) {
$client = new Client();
}
- if (is_null($request)) {
+ if (\is_null($request)) {
$request = new Request('GET', $url);
}
@@ -217,25 +209,23 @@ public function loadFromUrl(string $url, array $options = [], ClientInterface $c
/**
* Parsers the html of the given string. Used for load(), loadFromFile(),
* and loadFromUrl().
- * @param string $str
- * @param array $option
- * @return Dom
+ *
* @throws ChildNotFoundException
* @throws CircularException
* @throws StrictException
*/
public function loadStr(string $str, array $option = []): Dom
{
- $this->options = new Options;
+ $this->options = new Options();
$this->options->setOptions($this->globalOptions)
->setOptions($option);
- $this->rawSize = strlen($str);
- $this->raw = $str;
+ $this->rawSize = \strlen($str);
+ $this->raw = $str;
$html = $this->clean($str);
- $this->size = strlen($str);
+ $this->size = \strlen($str);
$this->content = new Content($html);
$this->parse();
@@ -247,8 +237,6 @@ public function loadStr(string $str, array $option = []): Dom
/**
* Sets a global options array to be used by all load calls.
*
- * @param array $options
- * @return Dom
* @chainable
*/
public function setOptions(array $options): Dom
@@ -260,18 +248,18 @@ public function setOptions(array $options): Dom
/**
* Find elements by css selector on the root node.
- * @param string $selector
- * @param int|null $nth
- * @return mixed|Collection|null
+ *
* @throws ChildNotFoundException
* @throws NotLoadedException
+ *
+ * @return mixed|Collection|null
*/
public function find(string $selector, int $nth = null)
{
$this->isLoaded();
$depthFirstSearch = $this->options->get('depthFirstSearch');
- if (is_bool($depthFirstSearch)) {
+ if (\is_bool($depthFirstSearch)) {
$result = $this->root->find($selector, $nth, $depthFirstSearch);
} else {
$result = $this->root->find($selector, $nth);
@@ -281,12 +269,13 @@ public function find(string $selector, int $nth = null)
}
/**
- * Find element by Id on the root node
- * @param int $id
- * @return bool|AbstractNode
+ * Find element by Id on the root node.
+ *
* @throws ChildNotFoundException
* @throws NotLoadedException
* @throws ParentNotFoundException
+ *
+ * @return bool|AbstractNode
*/
public function findById(int $id)
{
@@ -300,12 +289,11 @@ public function findById(int $id)
* be self closing.
*
* @param string|array $tag
- * @return Dom
* @chainable
*/
public function addSelfClosingTag($tag): Dom
{
- if ( ! is_array($tag)) {
+ if (!\is_array($tag)) {
$tag = [$tag];
}
foreach ($tag as $value) {
@@ -320,15 +308,14 @@ public function addSelfClosingTag($tag): Dom
* always be self closing.
*
* @param string|array $tag
- * @return Dom
* @chainable
*/
public function removeSelfClosingTag($tag): Dom
{
- if ( ! is_array($tag)) {
+ if (!\is_array($tag)) {
$tag = [$tag];
}
- $this->selfClosing = array_diff($this->selfClosing, $tag);
+ $this->selfClosing = \array_diff($this->selfClosing, $tag);
return $this;
}
@@ -336,7 +323,6 @@ public function removeSelfClosingTag($tag): Dom
/**
* Sets the list of self closing tags to empty.
*
- * @return Dom
* @chainable
*/
public function clearSelfClosingTags(): Dom
@@ -346,17 +332,15 @@ public function clearSelfClosingTags(): Dom
return $this;
}
-
/**
- * Adds a tag to the list of self closing tags that should not have a trailing slash
+ * Adds a tag to the list of self closing tags that should not have a trailing slash.
*
* @param $tag
- * @return Dom
* @chainable
*/
public function addNoSlashTag($tag): Dom
{
- if ( ! is_array($tag)) {
+ if (!\is_array($tag)) {
$tag = [$tag];
}
foreach ($tag as $value) {
@@ -370,15 +354,14 @@ public function addNoSlashTag($tag): Dom
* Removes a tag from the list of no-slash tags.
*
* @param $tag
- * @return Dom
* @chainable
*/
public function removeNoSlashTag($tag): Dom
{
- if ( ! is_array($tag)) {
+ if (!\is_array($tag)) {
$tag = [$tag];
}
- $this->noSlash = array_diff($this->noSlash, $tag);
+ $this->noSlash = \array_diff($this->noSlash, $tag);
return $this;
}
@@ -386,7 +369,6 @@ public function removeNoSlashTag($tag): Dom
/**
* Empties the list of no-slash tags.
*
- * @return Dom
* @chainable
*/
public function clearNoSlashTags(): Dom
@@ -398,7 +380,7 @@ public function clearNoSlashTags(): Dom
/**
* Simple wrapper function that returns the first child.
- * @return AbstractNode
+ *
* @throws ChildNotFoundException
* @throws NotLoadedException
*/
@@ -411,7 +393,7 @@ public function firstChild(): AbstractNode
/**
* Simple wrapper function that returns the last child.
- * @return AbstractNode
+ *
* @throws ChildNotFoundException
* @throws NotLoadedException
*/
@@ -423,9 +405,8 @@ public function lastChild(): AbstractNode
}
/**
- * Simple wrapper function that returns count of child elements
+ * Simple wrapper function that returns count of child elements.
*
- * @return int
* @throws NotLoadedException
*/
public function countChildren(): int
@@ -436,9 +417,8 @@ public function countChildren(): int
}
/**
- * Get array of children
+ * Get array of children.
*
- * @return array
* @throws NotLoadedException
*/
public function getChildren(): array
@@ -449,9 +429,8 @@ public function getChildren(): array
}
/**
- * Check if node have children nodes
+ * Check if node have children nodes.
*
- * @return bool
* @throws NotLoadedException
*/
public function hasChildren(): bool
@@ -464,25 +443,29 @@ public function hasChildren(): bool
/**
* Simple wrapper function that returns an element by the
* id.
+ *
* @param $id
- * @return mixed|Collection|null
+ *
* @throws ChildNotFoundException
* @throws NotLoadedException
+ *
+ * @return mixed|Collection|null
*/
public function getElementById($id)
{
$this->isLoaded();
- return $this->find('#'.$id, 0);
+ return $this->find('#' . $id, 0);
}
/**
* Simple wrapper function that returns all elements by
* tag name.
- * @param string $name
- * @return mixed|Collection|null
+ *
* @throws ChildNotFoundException
* @throws NotLoadedException
+ *
+ * @return mixed|Collection|null
*/
public function getElementsByTag(string $name)
{
@@ -494,16 +477,17 @@ public function getElementsByTag(string $name)
/**
* Simple wrapper function that returns all elements by
* class name.
- * @param string $class
- * @return mixed|Collection|null
+ *
* @throws ChildNotFoundException
* @throws NotLoadedException
+ *
+ * @return mixed|Collection|null
*/
public function getElementsByClass(string $class)
{
$this->isLoaded();
- return $this->find('.'.$class);
+ return $this->find('.' . $class);
}
/**
@@ -513,16 +497,13 @@ public function getElementsByClass(string $class)
*/
protected function isLoaded(): void
{
- if (is_null($this->content)) {
+ if (\is_null($this->content)) {
throw new NotLoadedException('Content is not loaded!');
}
}
/**
* Cleans the html of any none-html information.
- *
- * @param string $str
- * @return string
*/
protected function clean(string $str): string
{
@@ -531,20 +512,20 @@ protected function clean(string $str): string
return $str;
}
- $is_gzip = 0 === mb_strpos($str, "\x1f" . "\x8b" . "\x08", 0, "US-ASCII");
+ $is_gzip = 0 === \mb_strpos($str, "\x1f" . "\x8b" . "\x08", 0, 'US-ASCII');
if ($is_gzip) {
- $str = gzdecode($str);
+ $str = \gzdecode($str);
if ($str === false) {
throw new LogicalException('gzdecode returned false. Error when trying to decode the string.');
}
}
// remove white space before closing tags
- $str = mb_eregi_replace("'\s+>", "'>", $str);
+ $str = \mb_eregi_replace("'\s+>", "'>", $str);
if ($str === false) {
throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to clean single quotes.');
}
- $str = mb_eregi_replace('"\s+>', '">', $str);
+ $str = \mb_eregi_replace('"\s+>', '">', $str);
if ($str === false) {
throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to clean double quotes.');
}
@@ -554,36 +535,36 @@ protected function clean(string $str): string
if ($this->options->get('preserveLineBreaks')) {
$replace = '
';
}
- $str = str_replace(["\r\n", "\r", "\n"], $replace, $str);
+ $str = \str_replace(["\r\n", "\r", "\n"], $replace, $str);
if ($str === false) {
throw new LogicalException('str_replace returned false instead of a string. Error when attempting to clean input string.');
}
// strip the doctype
- $str = mb_eregi_replace("", '', $str);
+ $str = \mb_eregi_replace('', '', $str);
if ($str === false) {
throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip the doctype.');
}
// strip out comments
- $str = mb_eregi_replace("", '', $str);
+ $str = \mb_eregi_replace('', '', $str);
if ($str === false) {
throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip comments.');
}
// strip out cdata
- $str = mb_eregi_replace("", '', $str);
+ $str = \mb_eregi_replace("", '', $str);
if ($str === false) {
throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip out cdata.');
}
// strip out ";
$dom = new Dom();
- $dom->setOptions(['cleanupInput' => false,]);
+ $dom->setOptions(['cleanupInput' => false]);
$dom->load($html);
$this->assertSame($html, $dom->root->outerHtml());
}
From edec82b2ac45135ec8fbe4a88140d2ddedf71f1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Fedor?=
Date: Sat, 25 Jan 2020 19:36:43 +0100
Subject: [PATCH 31/68] Throw exception when trying to set unknown option
---
.../Exceptions/UnknownOptionException.php | 13 +++++++
src/PHPHtmlParser/Options.php | 6 ++++
tests/OptionsTest.php | 35 +++++++++++++++----
3 files changed, 48 insertions(+), 6 deletions(-)
create mode 100644 src/PHPHtmlParser/Exceptions/UnknownOptionException.php
diff --git a/src/PHPHtmlParser/Exceptions/UnknownOptionException.php b/src/PHPHtmlParser/Exceptions/UnknownOptionException.php
new file mode 100644
index 00000000..3b139c0b
--- /dev/null
+++ b/src/PHPHtmlParser/Exceptions/UnknownOptionException.php
@@ -0,0 +1,13 @@
+ $option) {
+ if (!isset($this->defaults[$key])) {
+ throw new UnknownOptionException("Option '$option' is not recognized");
+ }
$this->options[$key] = $option;
}
diff --git a/tests/OptionsTest.php b/tests/OptionsTest.php
index 03fe3ee0..90468b8d 100755
--- a/tests/OptionsTest.php
+++ b/tests/OptionsTest.php
@@ -2,8 +2,10 @@
declare(strict_types=1);
-use PHPHtmlParser\Options;
+use PHPHtmlParser\Dom;
+use PHPHtmlParser\Exceptions\UnknownOptionException;
use PHPUnit\Framework\TestCase;
+use PHPHtmlParser\Options;
class OptionsTest extends TestCase
{
@@ -14,26 +16,37 @@ public function testDefaultWhitespaceTextNode()
$this->assertTrue($options->whitespaceTextNode);
}
+ public function testSettingOption()
+ {
+ $options = new Options;
+ $options->setOptions([
+ 'strict' => true,
+ ]);
+
+ $this->assertTrue($options->strict);
+ }
+
public function testAddingOption()
{
+ $this->expectException(UnknownOptionException::class);
+
$options = new Options();
$options->setOptions([
'test' => true,
]);
-
- $this->assertTrue($options->test);
}
- public function testAddingOver()
+ public function testOverwritingOption()
{
$options = new Options();
$options->setOptions([
- 'test' => false,
+ 'strict' => false,
])->setOptions([
- 'test' => true,
+ 'strict' => true,
'whitespaceTextNode' => false,
]);
+ $this->assertTrue($options->get('strict'));
$this->assertFalse($options->get('whitespaceTextNode'));
}
@@ -42,4 +55,14 @@ public function testGettingNoOption()
$options = new Options();
$this->assertEquals(null, $options->get('doesnotexist'));
}
+
+ public function testUnknownOptionDom() {
+ $dom = new Dom;
+ $dom->setOptions([
+ 'unknown_option' => true,
+ ]);
+
+ $this->expectException(UnknownOptionException::class);
+ $dom->load('');
+ }
}
From b86c1d3c5e7a6368cbc756e3eb33826fcac5d12e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Fedor?=
Date: Sat, 25 Jan 2020 19:57:22 +0100
Subject: [PATCH 32/68] Fix option existence check
---
src/PHPHtmlParser/Options.php | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/PHPHtmlParser/Options.php b/src/PHPHtmlParser/Options.php
index 5b2a7b34..b0bb747e 100755
--- a/src/PHPHtmlParser/Options.php
+++ b/src/PHPHtmlParser/Options.php
@@ -80,8 +80,8 @@ public function __get($key)
public function setOptions(array $options): Options
{
foreach ($options as $key => $option) {
- if (!isset($this->defaults[$key])) {
- throw new UnknownOptionException("Option '$option' is not recognized");
+ if (!array_key_exists($key, $this->defaults)) {
+ throw new UnknownOptionException("Option '$key' is not recognized");
}
$this->options[$key] = $option;
}
From 71c3758da857203423b0071350c59fb5624a504a Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Sun, 10 May 2020 23:59:09 +0000
Subject: [PATCH 33/68] Updated the way we calculate selector logic
---
CHANGELOG.md | 6 +-
.../Contracts/Selector/ParserInterface.php | 12 +
.../Contracts/Selector/SeekerInterface.php | 17 +
.../Contracts/Selector/SelectorInterface.php | 33 ++
.../Selector/ParsedSelectorCollectionDTO.php | 30 ++
.../DTO/Selector/ParsedSelectorDTO.php | 30 ++
src/PHPHtmlParser/DTO/Selector/RuleDTO.php | 96 +++++
src/PHPHtmlParser/DTO/Tag/AttributeDTO.php | 6 +-
.../Discovery/ParserDiscovery.php | 25 ++
.../Discovery/SeekerDiscovery.php | 25 ++
src/PHPHtmlParser/Dom/AbstractNode.php | 14 +-
.../Exceptions/UnknownOptionException.php | 9 +-
src/PHPHtmlParser/Options.php | 101 +++--
src/PHPHtmlParser/Selector/Parser.php | 28 +-
.../Selector/ParserInterface.php | 10 -
src/PHPHtmlParser/Selector/Seeker.php | 321 ++++++++++++++++
src/PHPHtmlParser/Selector/Selector.php | 359 ++----------------
tests/OptionsTest.php | 18 +-
tests/Selector/SelectorTest.php | 16 +-
19 files changed, 737 insertions(+), 419 deletions(-)
create mode 100644 src/PHPHtmlParser/Contracts/Selector/ParserInterface.php
create mode 100644 src/PHPHtmlParser/Contracts/Selector/SeekerInterface.php
create mode 100644 src/PHPHtmlParser/Contracts/Selector/SelectorInterface.php
create mode 100644 src/PHPHtmlParser/DTO/Selector/ParsedSelectorCollectionDTO.php
create mode 100644 src/PHPHtmlParser/DTO/Selector/ParsedSelectorDTO.php
create mode 100644 src/PHPHtmlParser/DTO/Selector/RuleDTO.php
create mode 100644 src/PHPHtmlParser/Discovery/ParserDiscovery.php
create mode 100644 src/PHPHtmlParser/Discovery/SeekerDiscovery.php
delete mode 100755 src/PHPHtmlParser/Selector/ParserInterface.php
create mode 100644 src/PHPHtmlParser/Selector/Seeker.php
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f765b63e..770a5d92 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,9 +5,6 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
-### Changed
-- Added tag attribute DTO.
-
## [Unreleased]
### Added
@@ -17,7 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed
- Fixed issue with \ causing an infite loop.
- CDATA should not be altered when cleanupInput is false.
-
+- Added tag attribute DTO.
+- Cleaned up the selector logic.
### Removed
- Removed curl interface and curl implementation.
diff --git a/src/PHPHtmlParser/Contracts/Selector/ParserInterface.php b/src/PHPHtmlParser/Contracts/Selector/ParserInterface.php
new file mode 100644
index 00000000..3b2477b9
--- /dev/null
+++ b/src/PHPHtmlParser/Contracts/Selector/ParserInterface.php
@@ -0,0 +1,12 @@
+parsedSelectorDTO[] = $value;
+ }
+ }
+ }
+
+ /**
+ * @return ParsedSelectorDTO[]
+ */
+ public function getParsedSelectorDTO(): array
+ {
+ return $this->parsedSelectorDTO;
+ }
+}
diff --git a/src/PHPHtmlParser/DTO/Selector/ParsedSelectorDTO.php b/src/PHPHtmlParser/DTO/Selector/ParsedSelectorDTO.php
new file mode 100644
index 00000000..5424e2a7
--- /dev/null
+++ b/src/PHPHtmlParser/DTO/Selector/ParsedSelectorDTO.php
@@ -0,0 +1,30 @@
+rules[] = $value;
+ }
+ }
+ }
+
+ /**
+ * @return RuleDTO[]
+ */
+ public function getRules(): array
+ {
+ return $this->rules;
+ }
+}
diff --git a/src/PHPHtmlParser/DTO/Selector/RuleDTO.php b/src/PHPHtmlParser/DTO/Selector/RuleDTO.php
new file mode 100644
index 00000000..1c336149
--- /dev/null
+++ b/src/PHPHtmlParser/DTO/Selector/RuleDTO.php
@@ -0,0 +1,96 @@
+tag = $values['tag'];
+ $this->operator = $values['operator'];
+ $this->key = $values['key'];
+ $this->value = $values['value'];
+ $this->noKey = $values['noKey'];
+ $this->alterNext = $values['alterNext'];
+ }
+
+ /**
+ * @return string
+ */
+ public function getTag(): string
+ {
+ return $this->tag;
+ }
+
+ /**
+ * @return string
+ */
+ public function getOperator(): string
+ {
+ return $this->operator;
+ }
+
+ /**
+ * @return string|array|null
+ */
+ public function getKey()
+ {
+ return $this->key;
+ }
+
+ /**
+ * @return string|array|null
+ */
+ public function getValue()
+ {
+ return $this->value;
+ }
+
+ /**
+ * @return bool
+ */
+ public function isNoKey(): bool
+ {
+ return $this->noKey;
+ }
+
+ /**
+ * @return bool
+ */
+ public function isAlterNext(): bool
+ {
+ return $this->alterNext;
+ }
+}
diff --git a/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php b/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php
index 489b843c..1f15c492 100755
--- a/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php
+++ b/src/PHPHtmlParser/DTO/Tag/AttributeDTO.php
@@ -7,17 +7,17 @@
use stringEncode\Encode;
use stringEncode\Exception;
-class AttributeDTO
+final class AttributeDTO
{
/**
* @var ?string
*/
- protected $value;
+ private $value;
/**
* @var bool
*/
- protected $doubleQuote = true;
+ private $doubleQuote = true;
public function __construct(array $values)
{
diff --git a/src/PHPHtmlParser/Discovery/ParserDiscovery.php b/src/PHPHtmlParser/Discovery/ParserDiscovery.php
new file mode 100644
index 00000000..a7d3c60a
--- /dev/null
+++ b/src/PHPHtmlParser/Discovery/ParserDiscovery.php
@@ -0,0 +1,25 @@
+setDepthFirstFind($depthFirst);
$nodes = $selector->find($this);
diff --git a/src/PHPHtmlParser/Exceptions/UnknownOptionException.php b/src/PHPHtmlParser/Exceptions/UnknownOptionException.php
index 3b139c0b..58be8198 100644
--- a/src/PHPHtmlParser/Exceptions/UnknownOptionException.php
+++ b/src/PHPHtmlParser/Exceptions/UnknownOptionException.php
@@ -1,12 +1,13 @@
-options = $this->defaults;
}
+ /**
+ * A magic get to call the get() method.
+ *
+ * @param string $key
+ *
+ * @return mixed
+ *
+ * @uses $this->get()
+ */
+ public function __get($key)
+ {
+ return $this->get($key);
+ }
+
/**
* The whitespaceTextNode, by default true, option tells the parser to save textnodes even if the content of the
* node is empty (only whitespace). Setting it to false will ignore all whitespace only text node found in the document.
- * @param bool $value
+ *
* @return Options
*/
- public function setWhitespaceTextNode(bool $value): self {
+ public function setWhitespaceTextNode(bool $value): self
+ {
$this->options['whitespaceTextNode'] = $value;
+
return $this;
}
/**
* Strict, by default false, will throw a StrictException if it finds that the html is not strictly compliant
* (all tags must have a closing tag, no attribute with out a value, etc.).
- * @param bool $value
+ *
* @return Options
*/
- public function setStrict(bool $value): self {
+ public function setStrict(bool $value): self
+ {
$this->options['strict'] = $value;
+
return $this;
}
@@ -83,21 +101,25 @@ public function setStrict(bool $value): self {
* The enforceEncoding, by default null, option will enforce an character set to be used for reading the content
* and returning the content in that encoding. Setting it to null will trigger an attempt to figure out
* the encoding from within the content of the string given instead.
- * @param string|null $value
+ *
* @return Options
*/
- public function setEnforceEncoding(?string $value): self {
+ public function setEnforceEncoding(?string $value): self
+ {
$this->options['enforceEncoding'] = $value;
+
return $this;
}
/**
* Set this to false to skip the entire clean up phase of the parser. Defaults to true.
- * @param bool $value
+ *
* @return Options
*/
- public function setCleanupInput(bool $value): self {
+ public function setCleanupInput(bool $value): self
+ {
$this->options['cleanupInput'] = $value;
+
return $this;
}
@@ -107,11 +129,12 @@ public function setCleanupInput(bool $value): self {
*
* NOTE: Ignored if cleanupInit is true.
*
- * @param bool $value
* @return Options
*/
- public function setRemoveScripts(bool $value): self {
+ public function setRemoveScripts(bool $value): self
+ {
$this->options['removeScripts'] = $value;
+
return $this;
}
@@ -119,11 +142,13 @@ public function setRemoveScripts(bool $value): self {
* Set this to false to skip removing of style tags from the document body. This might have adverse effects. Defaults to true.
*
* NOTE: Ignored if cleanupInit is true.
- * @param bool $value
+ *
* @return Options
*/
- public function setRemoveStyles(bool $value): self {
+ public function setRemoveStyles(bool $value): self
+ {
$this->options['removeStyles'] = $value;
+
return $this;
}
@@ -132,31 +157,37 @@ public function setRemoveStyles(bool $value): self {
* as part of the input clean up process. Defaults to false.
*
* NOTE: Ignored if cleanupInit is true.
- * @param bool $value
+ *
* @return Options
*/
- public function setPreserveLineBreaks(bool $value): self {
+ public function setPreserveLineBreaks(bool $value): self
+ {
$this->options['preserveLineBreaks'] = $value;
+
return $this;
}
/**
* Set this to false if you want to preserve whitespace inside of text nodes. It is set to true by default.
- * @param bool $value
+ *
* @return Options
*/
- public function setRemoveDoubleSpace(bool $value): self {
+ public function setRemoveDoubleSpace(bool $value): self
+ {
$this->options['removeDoubleSpace'] = $value;
+
return $this;
}
/**
* Set this to false if you want to preserve smarty script found in the html content. It is set to true by default.
- * @param bool $value
+ *
* @return Options
*/
- public function setRemoveSmartyScripts(bool $value): self {
+ public function setRemoveSmartyScripts(bool $value): self
+ {
$this->options['removeSmartyScripts'] = $value;
+
return $this;
}
@@ -164,49 +195,40 @@ public function setRemoveSmartyScripts(bool $value): self {
* By default this is set to false for legacy support. Setting this to true will change the behavior of find
* to order elements by depth first. This will properly preserve the order of elements as they where in the HTML.
*
- * @param bool $value
* @return Options
+ *
* @deprecated This option will be removed in version 3.0.0 with the new behavior being as if it was set to true.
*/
- public function setDepthFirstSearch(bool $value): self {
+ public function setDepthFirstSearch(bool $value): self
+ {
$this->options['depthFirstSearch'] = $value;
+
return $this;
}
/**
* By default this is set to false. Setting this to true will apply the php function htmlspecialchars_decode too all attribute values and text nodes.
- * @param bool $value
+ *
* @return Options
*/
- public function setHtmlSpecialCharsDecode(bool $value): self {
+ public function setHtmlSpecialCharsDecode(bool $value): self
+ {
$this->options['htmlSpecialCharsDecode'] = $value;
- return $this;
- }
- /**
- * A magic get to call the get() method.
- *
- * @param string $key
- *
- * @return mixed
- *
- * @uses $this->get()
- */
- public function __get($key)
- {
- return $this->get($key);
+ return $this;
}
/**
* Sets a new options param to override the current option array.
*
* @chainable
+ *
* @throws UnknownOptionException
*/
public function setOptions(array $options): Options
{
foreach ($options as $key => $option) {
- if (!array_key_exists($key, $this->defaults)) {
+ if (!\array_key_exists($key, $this->defaults)) {
throw new UnknownOptionException("Option '$key' is not recognized");
}
$this->options[$key] = $option;
@@ -229,11 +251,12 @@ public function get(string $key)
}
/**
- * Return current options as array
+ * Return current options as array.
*
* @return array
*/
- public function asArray() {
+ public function asArray()
+ {
return $this->options;
}
}
diff --git a/src/PHPHtmlParser/Selector/Parser.php b/src/PHPHtmlParser/Selector/Parser.php
index 0f987903..a70a7a5e 100755
--- a/src/PHPHtmlParser/Selector/Parser.php
+++ b/src/PHPHtmlParser/Selector/Parser.php
@@ -4,8 +4,13 @@
namespace PHPHtmlParser\Selector;
+use PHPHtmlParser\Contracts\Selector\ParserInterface;
+use PHPHtmlParser\DTO\Selector\ParsedSelectorCollectionDTO;
+use PHPHtmlParser\DTO\Selector\ParsedSelectorDTO;
+use PHPHtmlParser\DTO\Selector\RuleDTO;
+
/**
- * This is the parser for the selector.
+ * This is the default parser for the selector.
*/
class Parser implements ParserInterface
{
@@ -14,20 +19,19 @@ class Parser implements ParserInterface
*
* @var string
*/
- protected $pattern = "/([\w\-:\*>]*)(?:\#([\w\-]+)|\.([\w\.\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
+ private $pattern = "/([\w\-:\*>]*)(?:\#([\w\-]+)|\.([\w\.\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
/**
* Parses the selector string.
*/
- public function parseSelectorString(string $selector): array
+ public function parseSelectorString(string $selector): ParsedSelectorCollectionDTO
{
$selectors = [];
-
$matches = [];
+ $rules = [];
\preg_match_all($this->pattern, \trim($selector) . ' ', $matches, PREG_SET_ORDER);
// skip tbody
- $result = [];
foreach ($matches as $match) {
// default values
$tag = \strtolower(\trim($match[1]));
@@ -88,25 +92,25 @@ public function parseSelectorString(string $selector): array
$noKey = true;
}
- $result[] = [
+ $rules[] = new RuleDTO([
'tag' => $tag,
'key' => $key,
'value' => $value,
'operator' => $operator,
'noKey' => $noKey,
'alterNext' => $alterNext,
- ];
+ ]);
if (isset($match[7]) && \is_string($match[7]) && \trim($match[7]) == ',') {
- $selectors[] = $result;
- $result = [];
+ $selectors[] = new ParsedSelectorDTO($rules);
+ $rules = [];
}
}
// save last results
- if (\count($result) > 0) {
- $selectors[] = $result;
+ if (\count($rules) > 0) {
+ $selectors[] = new ParsedSelectorDTO($rules);
}
- return $selectors;
+ return new ParsedSelectorCollectionDTO($selectors);
}
}
diff --git a/src/PHPHtmlParser/Selector/ParserInterface.php b/src/PHPHtmlParser/Selector/ParserInterface.php
deleted file mode 100755
index 755966ae..00000000
--- a/src/PHPHtmlParser/Selector/ParserInterface.php
+++ /dev/null
@@ -1,10 +0,0 @@
-getTag() !== null && \is_numeric($rule->getKey())) {
+ $count = 0;
+ /** @var AbstractNode $node */
+ foreach ($nodes as $node) {
+ if ($rule->getTag() == '*'
+ || $rule->getTag() == $node->getTag()
+ ->name()
+ ) {
+ ++$count;
+ if ($count == $rule->getKey()) {
+ // found the node we wanted
+ return [$node];
+ }
+ }
+ }
+
+ return [];
+ }
+
+ $options = $this->flattenOptions($options);
+
+ $return = [];
+ /** @var InnerNode $node */
+ foreach ($nodes as $node) {
+ // check if we are a leaf
+ if ($node instanceof LeafNode || !$node->hasChildren()
+ ) {
+ continue;
+ }
+
+ $children = [];
+ $child = $node->firstChild();
+ while (!\is_null($child)) {
+ // wild card, grab all
+ if ($rule->getTag() == '*' && \is_null($rule->getKey())) {
+ $return[] = $child;
+ $child = $this->getNextChild($node, $child);
+ continue;
+ }
+
+ $pass = $this->checkTag($rule, $child);
+ if ($pass && $rule->getKey() != null) {
+ $pass = $this->checkKey($rule, $child);
+ }
+ if ($pass &&
+ $rule->getKey() != null &&
+ $rule->getValue() != null &&
+ $rule->getValue() != '*'
+ ) {
+ $pass = $this->checkComparison($rule, $child);
+ }
+
+ if ($pass) {
+ // it passed all checks
+ $return[] = $child;
+ } else {
+ // this child failed to be matched
+ if ($child instanceof InnerNode && $child->hasChildren()
+ ) {
+ if ($depthFirst) {
+ if (!isset($options['checkGrandChildren'])
+ || $options['checkGrandChildren']
+ ) {
+ // we have a child that failed but is not a leaf.
+ $matches = $this->seek([$child], $rule, $options, $depthFirst);
+ foreach ($matches as $match) {
+ $return[] = $match;
+ }
+ }
+ } else {
+ // we still want to check its children
+ $children[] = $child;
+ }
+ }
+ }
+
+ $child = $this->getNextChild($node, $child);
+ }
+
+ if ((!isset($options['checkGrandChildren'])
+ || $options['checkGrandChildren'])
+ && \count($children) > 0
+ ) {
+ // we have children that failed but are not leaves.
+ $matches = $this->seek($children, $rule, $options, $depthFirst);
+ foreach ($matches as $match) {
+ $return[] = $match;
+ }
+ }
+ }
+
+ return $return;
+ }
+
+ /**
+ * Checks comparison condition from rules against node.
+ */
+ private function checkComparison(RuleDTO $rule, AbstractNode $node): bool
+ {
+ if ($rule->getKey() == 'plaintext') {
+ // plaintext search
+ $nodeValue = $node->text();
+ $result = $this->checkNodeValue($nodeValue, $rule, $node);
+ } else {
+ // normal search
+ if (!\is_array($rule->getKey())) {
+ $nodeValue = $node->getAttribute($rule->getKey());
+ $result = $this->checkNodeValue($nodeValue, $rule, $node);
+ } else {
+ $result = true;
+ foreach ($rule->getKey() as $index => $key) {
+ $nodeValue = $node->getAttribute($key);
+ $result = $result &&
+ $this->checkNodeValue($nodeValue, $rule, $node, $index);
+ }
+ }
+ }
+
+ return $result;
+ }
+
+ /**
+ * Flattens the option array.
+ *
+ * @return array
+ */
+ private function flattenOptions(array $optionsArray)
+ {
+ $options = [];
+ foreach ($optionsArray as $optionArray) {
+ foreach ($optionArray as $key => $option) {
+ $options[$key] = $option;
+ }
+ }
+
+ return $options;
+ }
+
+ /**
+ * Returns the next child or null if no more children.
+ *
+ * @return AbstractNode|null
+ */
+ private function getNextChild(
+ AbstractNode $node,
+ AbstractNode $currentChild
+ ) {
+ try {
+ $child = null;
+ if ($node instanceof InnerNode) {
+ // get next child
+ $child = $node->nextChild($currentChild->id());
+ }
+ } catch (ChildNotFoundException $e) {
+ // no more children
+ unset($e);
+ $child = null;
+ }
+
+ return $child;
+ }
+
+ /**
+ * Checks tag condition from rules against node.
+ */
+ private function checkTag(RuleDTO $rule, AbstractNode $node): bool
+ {
+ if (!empty($rule->getTag()) && $rule->getTag() != $node->getTag()->name()
+ && $rule->getTag() != '*'
+ ) {
+ return false;
+ }
+
+ return true;
+ }
+
+ /**
+ * Checks key condition from rules against node.
+ */
+ private function checkKey(RuleDTO $rule, AbstractNode $node): bool
+ {
+ if (!\is_array($rule->getKey())) {
+ if ($rule->isNoKey()) {
+ if ($node->getAttribute($rule->getKey()) !== null) {
+ return false;
+ }
+ } else {
+ if ($rule->getKey() != 'plaintext'
+ && !$node->hasAttribute($rule->getKey())
+ ) {
+ return false;
+ }
+ }
+ } else {
+ if ($rule->isNoKey()) {
+ foreach ($rule->getKey() as $key) {
+ if (!\is_null($node->getAttribute($key))) {
+ return false;
+ }
+ }
+ } else {
+ foreach ($rule->getKey() as $key) {
+ if ($key != 'plaintext'
+ && !$node->hasAttribute($key)
+ ) {
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+ }
+
+ private function checkNodeValue(
+ ?string $nodeValue,
+ RuleDTO $rule,
+ AbstractNode $node,
+ ?int $index = null
+ ): bool {
+ $check = false;
+ if (
+ $rule->getValue() != null &&
+ \is_string($rule->getValue())
+ ) {
+ $check = $this->match($rule->getOperator(), $rule->getValue(), $nodeValue);
+ }
+
+ // handle multiple classes
+ $key = $rule->getKey();
+ if (
+ !$check &&
+ $key == 'class' &&
+ \is_array($rule->getValue())
+ ) {
+ $nodeClasses = \explode(' ', $node->getAttribute('class') ?? '');
+ foreach ($rule->getValue() as $value) {
+ foreach ($nodeClasses as $class) {
+ if (
+ !empty($class) &&
+ \is_string($rule->getOperator())
+ ) {
+ $check = $this->match($rule->getOperator(), $value, $class);
+ }
+ if ($check) {
+ break;
+ }
+ }
+ if (!$check) {
+ break;
+ }
+ }
+ } elseif (
+ !$check &&
+ \is_array($key) &&
+ !\is_null($nodeValue) &&
+ \is_string($rule->getOperator()) &&
+ \is_string($rule->getValue()[$index])
+ ) {
+ $check = $this->match($rule->getOperator(), $rule->getValue()[$index], $nodeValue);
+ }
+
+ return $check;
+ }
+
+ /**
+ * Attempts to match the given arguments with the given operator.
+ */
+ private function match(
+ string $operator,
+ string $pattern,
+ string $value
+ ): bool {
+ $value = \strtolower($value);
+ $pattern = \strtolower($pattern);
+ switch ($operator) {
+ case '=':
+ return $value === $pattern;
+ case '!=':
+ return $value !== $pattern;
+ case '^=':
+ return \preg_match('/^' . \preg_quote($pattern, '/') . '/',
+ $value) == 1;
+ case '$=':
+ return \preg_match('/' . \preg_quote($pattern, '/') . '$/',
+ $value) == 1;
+ case '*=':
+ if ($pattern[0] == '/') {
+ return \preg_match($pattern, $value) == 1;
+ }
+
+ return \preg_match('/' . $pattern . '/i', $value) == 1;
+ }
+
+ return false;
+ }
+}
diff --git a/src/PHPHtmlParser/Selector/Selector.php b/src/PHPHtmlParser/Selector/Selector.php
index 21d6920d..95c47001 100755
--- a/src/PHPHtmlParser/Selector/Selector.php
+++ b/src/PHPHtmlParser/Selector/Selector.php
@@ -4,33 +4,51 @@
namespace PHPHtmlParser\Selector;
+use PHPHtmlParser\Contracts\Selector\ParserInterface;
+use PHPHtmlParser\Contracts\Selector\SeekerInterface;
+use PHPHtmlParser\Contracts\Selector\SelectorInterface;
+use PHPHtmlParser\Discovery\ParserDiscovery;
+use PHPHtmlParser\Discovery\SeekerDiscovery;
use PHPHtmlParser\Dom\AbstractNode;
use PHPHtmlParser\Dom\Collection;
-use PHPHtmlParser\Dom\InnerNode;
-use PHPHtmlParser\Dom\LeafNode;
+use PHPHtmlParser\DTO\Selector\ParsedSelectorCollectionDTO;
+use PHPHtmlParser\DTO\Selector\RuleDTO;
use PHPHtmlParser\Exceptions\ChildNotFoundException;
/**
* Class Selector.
*/
-class Selector
+class Selector implements SelectorInterface
{
/**
- * @var array
+ * @var ParsedSelectorCollectionDTO
*/
- protected $selectors = [];
+ private $ParsedSelectorCollectionDTO;
/**
* @var bool
*/
private $depthFirst = false;
+ /**
+ * @var SeekerInterface
+ */
+ private $seeker;
+
/**
* Constructs with the selector string.
*/
- public function __construct(string $selector, ParserInterface $parser)
+ public function __construct(string $selector, ?ParserInterface $parser = null, ?SeekerInterface $seeker = null)
{
- $this->selectors = $parser->parseSelectorString($selector);
+ if ($parser == null) {
+ $parser = ParserDiscovery::find();
+ }
+ if ($seeker == null) {
+ $seeker = SeekerDiscovery::find();
+ }
+
+ $this->ParsedSelectorCollectionDTO = $parser->parseSelectorString($selector);
+ $this->seeker = $seeker;
}
/**
@@ -38,9 +56,9 @@ public function __construct(string $selector, ParserInterface $parser)
*
* @return array
*/
- public function getSelectors()
+ public function getParsedSelectorCollectionDTO(): ParsedSelectorCollectionDTO
{
- return $this->selectors;
+ return $this->ParsedSelectorCollectionDTO;
}
public function setDepthFirstFind(bool $status): void
@@ -57,19 +75,19 @@ public function setDepthFirstFind(bool $status): void
public function find(AbstractNode $node): Collection
{
$results = new Collection();
- foreach ($this->selectors as $selector) {
+ foreach ($this->ParsedSelectorCollectionDTO->getParsedSelectorDTO() as $selector) {
$nodes = [$node];
- if (\count($selector) == 0) {
+ if (\count($selector->getRules()) == 0) {
continue;
}
$options = [];
- foreach ($selector as $rule) {
- if ($rule['alterNext']) {
+ foreach ($selector->getRules() as $rule) {
+ if ($rule->isAlterNext()) {
$options[] = $this->alterNext($rule);
continue;
}
- $nodes = $this->seek($nodes, $rule, $options);
+ $nodes = $this->seeker->seek($nodes, $rule, $options, $this->depthFirst);
// clear the options
$options = [];
}
@@ -83,326 +101,17 @@ public function find(AbstractNode $node): Collection
return $results;
}
- /**
- * Checks comparison condition from rules against node.
- */
- public function checkComparison(array $rule, AbstractNode $node): bool
- {
- if ($rule['key'] == 'plaintext') {
- // plaintext search
- $nodeValue = $node->text();
- $result = $this->checkNodeValue($nodeValue, $rule, $node);
- } else {
- // normal search
- if (!\is_array($rule['key'])) {
- $nodeValue = $node->getAttribute($rule['key']);
- $result = $this->checkNodeValue($nodeValue, $rule, $node);
- } else {
- $result = true;
- foreach ($rule['key'] as $index => $key) {
- $nodeValue = $node->getAttribute($key);
- $result = $result &&
- $this->checkNodeValue($nodeValue, $rule, $node, $index);
- }
- }
- }
-
- return $result;
- }
-
- /**
- * Attempts to find all children that match the rule
- * given.
- *
- * @throws ChildNotFoundException
- */
- protected function seek(array $nodes, array $rule, array $options): array
- {
- // XPath index
- if (\array_key_exists('tag', $rule) && \array_key_exists('key', $rule)
- && \is_numeric($rule['key'])
- ) {
- $count = 0;
- /** @var AbstractNode $node */
- foreach ($nodes as $node) {
- if ($rule['tag'] == '*'
- || $rule['tag'] == $node->getTag()
- ->name()
- ) {
- ++$count;
- if ($count == $rule['key']) {
- // found the node we wanted
- return [$node];
- }
- }
- }
-
- return [];
- }
-
- $options = $this->flattenOptions($options);
-
- $return = [];
- /** @var InnerNode $node */
- foreach ($nodes as $node) {
- // check if we are a leaf
- if ($node instanceof LeafNode || !$node->hasChildren()
- ) {
- continue;
- }
-
- $children = [];
- $child = $node->firstChild();
- while (!\is_null($child)) {
- // wild card, grab all
- if ($rule['tag'] == '*' && \is_null($rule['key'])) {
- $return[] = $child;
- $child = $this->getNextChild($node, $child);
- continue;
- }
-
- $pass = $this->checkTag($rule, $child);
- if ($pass && !\is_null($rule['key'])) {
- $pass = $this->checkKey($rule, $child);
- }
- if ($pass && !\is_null($rule['key']) && !\is_null($rule['value'])
- && $rule['value'] != '*'
- ) {
- $pass = $this->checkComparison($rule, $child);
- }
-
- if ($pass) {
- // it passed all checks
- $return[] = $child;
- } else {
- // this child failed to be matched
- if ($child instanceof InnerNode && $child->hasChildren()
- ) {
- if ($this->depthFirst) {
- if (!isset($options['checkGrandChildren'])
- || $options['checkGrandChildren']
- ) {
- // we have a child that failed but are not leaves.
- $matches = $this->seek([$child], $rule,
- $options);
- foreach ($matches as $match) {
- $return[] = $match;
- }
- }
- } else {
- // we still want to check its children
- $children[] = $child;
- }
- }
- }
-
- $child = $this->getNextChild($node, $child);
- }
-
- if ((!isset($options['checkGrandChildren'])
- || $options['checkGrandChildren'])
- && \count($children) > 0
- ) {
- // we have children that failed but are not leaves.
- $matches = $this->seek($children, $rule, $options);
- foreach ($matches as $match) {
- $return[] = $match;
- }
- }
- }
-
- return $return;
- }
-
- /**
- * Attempts to match the given arguments with the given operator.
- */
- protected function match(
- string $operator,
- string $pattern,
- string $value
- ): bool {
- $value = \strtolower($value);
- $pattern = \strtolower($pattern);
- switch ($operator) {
- case '=':
- return $value === $pattern;
- case '!=':
- return $value !== $pattern;
- case '^=':
- return \preg_match('/^' . \preg_quote($pattern, '/') . '/',
- $value) == 1;
- case '$=':
- return \preg_match('/' . \preg_quote($pattern, '/') . '$/',
- $value) == 1;
- case '*=':
- if ($pattern[0] == '/') {
- return \preg_match($pattern, $value) == 1;
- }
-
- return \preg_match('/' . $pattern . '/i', $value) == 1;
- }
-
- return false;
- }
-
/**
* Attempts to figure out what the alteration will be for
* the next element.
*/
- protected function alterNext(array $rule): array
+ private function alterNext(RuleDTO $rule): array
{
$options = [];
- if ($rule['tag'] == '>') {
+ if ($rule->getTag() == '>') {
$options['checkGrandChildren'] = false;
}
return $options;
}
-
- /**
- * Flattens the option array.
- *
- * @return array
- */
- protected function flattenOptions(array $optionsArray)
- {
- $options = [];
- foreach ($optionsArray as $optionArray) {
- foreach ($optionArray as $key => $option) {
- $options[$key] = $option;
- }
- }
-
- return $options;
- }
-
- /**
- * Returns the next child or null if no more children.
- *
- * @return AbstractNode|null
- */
- protected function getNextChild(
- AbstractNode $node,
- AbstractNode $currentChild
- ) {
- try {
- $child = null;
- if ($node instanceof InnerNode) {
- // get next child
- $child = $node->nextChild($currentChild->id());
- }
- } catch (ChildNotFoundException $e) {
- // no more children
- unset($e);
- $child = null;
- }
-
- return $child;
- }
-
- /**
- * Checks tag condition from rules against node.
- */
- protected function checkTag(array $rule, AbstractNode $node): bool
- {
- if (!empty($rule['tag']) && $rule['tag'] != $node->getTag()->name()
- && $rule['tag'] != '*'
- ) {
- return false;
- }
-
- return true;
- }
-
- /**
- * Checks key condition from rules against node.
- */
- protected function checkKey(array $rule, AbstractNode $node): bool
- {
- if (!\is_array($rule['key'])) {
- if ($rule['noKey']) {
- if (!\is_null($node->getAttribute($rule['key']))) {
- return false;
- }
- } else {
- if ($rule['key'] != 'plaintext'
- && !$node->hasAttribute($rule['key'])
- ) {
- return false;
- }
- }
- } else {
- if ($rule['noKey']) {
- foreach ($rule['key'] as $key) {
- if (!\is_null($node->getAttribute($key))) {
- return false;
- }
- }
- } else {
- foreach ($rule['key'] as $key) {
- if ($key != 'plaintext'
- && !$node->hasAttribute($key)
- ) {
- return false;
- }
- }
- }
- }
-
- return true;
- }
-
- private function checkNodeValue(
- ?string $nodeValue,
- array $rule,
- AbstractNode $node,
- ?int $index = null
- ): bool {
- $check = false;
- if (
- \array_key_exists('value', $rule) && !\is_array($rule['value']) &&
- !\is_null($nodeValue) &&
- \array_key_exists('operator', $rule) && \is_string($rule['operator']) &&
- \array_key_exists('value', $rule) && \is_string($rule['value'])
- ) {
- $check = $this->match($rule['operator'], $rule['value'], $nodeValue);
- }
-
- // handle multiple classes
- $key = $rule['key'];
- if (
- !$check &&
- $key == 'class' &&
- \array_key_exists('value', $rule) && \is_array($rule['value'])
- ) {
- $nodeClasses = \explode(' ', $node->getAttribute('class') ?? '');
- foreach ($rule['value'] as $value) {
- foreach ($nodeClasses as $class) {
- if (
- !empty($class) &&
- \array_key_exists('operator', $rule) && \is_string($rule['operator'])
- ) {
- $check = $this->match($rule['operator'], $value, $class);
- }
- if ($check) {
- break;
- }
- }
- if (!$check) {
- break;
- }
- }
- } elseif (
- !$check &&
- \is_array($key) &&
- !\is_null($nodeValue) &&
- \array_key_exists('operator', $rule) && \is_string($rule['operator']) &&
- \array_key_exists('value', $rule) && \is_string($rule['value'][$index])
- ) {
- $check = $this->match($rule['operator'], $rule['value'][$index], $nodeValue);
- }
-
- return $check;
- }
}
diff --git a/tests/OptionsTest.php b/tests/OptionsTest.php
index ed83b177..91c62591 100755
--- a/tests/OptionsTest.php
+++ b/tests/OptionsTest.php
@@ -4,8 +4,8 @@
use PHPHtmlParser\Dom;
use PHPHtmlParser\Exceptions\UnknownOptionException;
-use PHPUnit\Framework\TestCase;
use PHPHtmlParser\Options;
+use PHPUnit\Framework\TestCase;
class OptionsTest extends TestCase
{
@@ -18,7 +18,7 @@ public function testDefaultWhitespaceTextNode()
public function testSettingOption()
{
- $options = new Options;
+ $options = new Options();
$options->setOptions([
'strict' => true,
]);
@@ -42,7 +42,7 @@ public function testOverwritingOption()
$options->setOptions([
'strict' => false,
])->setOptions([
- 'strict' => true,
+ 'strict' => true,
'whitespaceTextNode' => false,
]);
@@ -56,7 +56,8 @@ public function testGettingNoOption()
$this->assertEquals(null, $options->get('doesnotexist'));
}
- public function testSetters() {
+ public function testSetters()
+ {
$options = new Options();
$options->setOptions([
@@ -79,8 +80,8 @@ public function testSetters() {
$options->setStrict(true);
$this->assertTrue($options->get('strict'));
- $options->setEnforceEncoding("utf8");
- $this->assertEquals("utf8", $options->get('enforceEncoding'));
+ $options->setEnforceEncoding('utf8');
+ $this->assertEquals('utf8', $options->get('enforceEncoding'));
$options->setCleanupInput(true);
$this->assertTrue($options->get('cleanupInput'));
@@ -142,8 +143,9 @@ public function testSetters() {
$this->assertFalse($options->get('htmlSpecialCharsDecode'));
}
- public function testUnknownOptionDom() {
- $dom = new Dom;
+ public function testUnknownOptionDom()
+ {
+ $dom = new Dom();
$dom->setOptions([
'unknown_option' => true,
]);
diff --git a/tests/Selector/SelectorTest.php b/tests/Selector/SelectorTest.php
index 261b3cb8..d2a12a59 100755
--- a/tests/Selector/SelectorTest.php
+++ b/tests/Selector/SelectorTest.php
@@ -13,29 +13,29 @@ class SelectorTest extends TestCase
public function testParseSelectorStringId()
{
$selector = new Selector('#all', new Parser());
- $selectors = $selector->getSelectors();
- $this->assertEquals('id', $selectors[0][0]['key']);
+ $selectors = $selector->getParsedSelectorCollectionDTO();
+ $this->assertEquals('id', $selectors->getParsedSelectorDTO()[0]->getRules()[0]->getKey());
}
public function testParseSelectorStringClass()
{
$selector = new Selector('div.post', new Parser());
- $selectors = $selector->getSelectors();
- $this->assertEquals('class', $selectors[0][0]['key']);
+ $selectors = $selector->getParsedSelectorCollectionDTO();
+ $this->assertEquals('class', $selectors->getParsedSelectorDTO()[0]->getRules()[0]->getKey());
}
public function testParseSelectorStringAttribute()
{
$selector = new Selector('div[visible=yes]', new Parser());
- $selectors = $selector->getSelectors();
- $this->assertEquals('yes', $selectors[0][0]['value']);
+ $selectors = $selector->getParsedSelectorCollectionDTO();
+ $this->assertEquals('yes', $selectors->getParsedSelectorDTO()[0]->getRules()[0]->getValue());
}
public function testParseSelectorStringNoKey()
{
$selector = new Selector('div[!visible]', new Parser());
- $selectors = $selector->getSelectors();
- $this->assertTrue($selectors[0][0]['noKey']);
+ $selectors = $selector->getParsedSelectorCollectionDTO();
+ $this->assertTrue($selectors->getParsedSelectorDTO()[0]->getRules()[0]->isNoKey());
}
public function testFind()
From 04321f991ba37b9b47ee22ae52dc2319c353a6b0 Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Mon, 11 May 2020 00:52:31 +0000
Subject: [PATCH 34/68] fixes #82
---
CHANGELOG.md | 2 +
src/PHPHtmlParser/Dom.php | 58 +-
src/PHPHtmlParser/Selector/Seeker.php | 4 +-
tests/DomTest.php | 71 +
tests/Selector/SeekerTest.php | 25 +
tests/data/files/big.html | 2 +-
tests/data/files/html5.html | 2957 +++++++++++++++++++++++++
7 files changed, 3096 insertions(+), 23 deletions(-)
create mode 100644 tests/Selector/SeekerTest.php
create mode 100644 tests/data/files/html5.html
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 770a5d92..8daa5304 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,12 +10,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Added support for PSR7 HTTP clients and requests for URL calls.
- Added PHAN support and fixed all issues from PHAN.
+- Added support for html5 charset detection.
### Changed
- Fixed issue with \ causing an infite loop.
- CDATA should not be altered when cleanupInput is false.
- Added tag attribute DTO.
- Cleaned up the selector logic.
+- Fixed issue with greedy regex for charset detection.
### Removed
- Removed curl interface and curl implementation.
diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php
index 9a980ab5..8c2ebcde 100755
--- a/src/PHPHtmlParser/Dom.php
+++ b/src/PHPHtmlParser/Dom.php
@@ -39,42 +39,42 @@ class Dom
*
* @var string
*/
- protected $defaultCharset = 'UTF-8';
+ private $defaultCharset = 'UTF-8';
/**
* The raw version of the document string.
*
* @var string
*/
- protected $raw;
+ private $raw;
/**
* The document string.
*
* @var Content
*/
- protected $content;
+ private $content;
/**
* The original file size of the document.
*
* @var int
*/
- protected $rawSize;
+ private $rawSize;
/**
* The size of the document after it is cleaned.
*
* @var int
*/
- protected $size;
+ private $size;
/**
* A global options array to be used by all load calls.
*
* @var array
*/
- protected $globalOptions = [];
+ private $globalOptions = [];
/**
* A persistent option object to be used for all options in the
@@ -82,14 +82,14 @@ class Dom
*
* @var Options
*/
- protected $options;
+ private $options;
/**
* A list of tags which will always be self closing.
*
* @var array
*/
- protected $selfClosing = [
+ private $selfClosing = [
'area',
'base',
'basefont',
@@ -114,7 +114,7 @@ class Dom
*
* @var array
*/
- protected $noSlash = [];
+ private $noSlash = [];
/**
* Returns the inner html of the root node.
@@ -173,7 +173,7 @@ public function load(string $str, array $options = []): Dom
*/
public function loadFromFile(string $file, array $options = []): Dom
{
- $content = \file_get_contents($file);
+ $content = @\file_get_contents($file);
if ($content === false) {
throw new LogicalException('file_get_contents failed and returned false when trying to read "' . $file . '".');
}
@@ -496,7 +496,7 @@ public function getElementsByClass(string $class)
*
* @throws NotLoadedException
*/
- protected function isLoaded(): void
+ private function isLoaded(): void
{
if (\is_null($this->content)) {
throw new NotLoadedException('Content is not loaded!');
@@ -506,7 +506,7 @@ protected function isLoaded(): void
/**
* Cleans the html of any none-html information.
*/
- protected function clean(string $str): string
+ private function clean(string $str): string
{
if ($this->options->get('cleanupInput') != true) {
// skip entire cleanup step
@@ -610,7 +610,7 @@ protected function clean(string $str): string
* @throws StrictException
* @throws LogicalException
*/
- protected function parse(): void
+ private function parse(): void
{
// add the root node
$this->root = new HtmlNode('root');
@@ -679,7 +679,7 @@ protected function parse(): void
*
* @throws StrictException
*/
- protected function parseTag(): array
+ private function parseTag(): array
{
$return = [
'status' => false,
@@ -823,7 +823,7 @@ protected function parseTag(): array
*
* @throws ChildNotFoundException
*/
- protected function detectCharset(): bool
+ private function detectCharset(): bool
{
// set the default
$encode = new Encode();
@@ -841,11 +841,15 @@ protected function detectCharset(): bool
/** @var AbstractNode $meta */
$meta = $this->root->find('meta[http-equiv=Content-Type]', 0);
- if (\is_null($meta)) {
- // could not find meta tag
- $this->root->propagateEncoding($encode);
+ if ($meta == null) {
+ if (!$this->detectHTML5Charset($encode)) {
+ // could not find meta tag
+ $this->root->propagateEncoding($encode);
- return false;
+ return false;
+ }
+
+ return true;
}
$content = $meta->getAttribute('content');
if (\is_null($content)) {
@@ -855,7 +859,7 @@ protected function detectCharset(): bool
return false;
}
$matches = [];
- if (\preg_match('/charset=(.+)/', $content, $matches)) {
+ if (\preg_match('/charset=([^;]+)/', $content, $matches)) {
$encode->from(\trim($matches[1]));
$this->root->propagateEncoding($encode);
@@ -867,4 +871,18 @@ protected function detectCharset(): bool
return false;
}
+
+ private function detectHTML5Charset(Encode $encode): bool
+ {
+ /** @var AbstractNode|null $meta */
+ $meta = $this->root->find('meta[charset]', 0);
+ if ($meta == null) {
+ return false;
+ }
+
+ $encode->from(\trim($meta->getAttribute('charset')));
+ $this->root->propagateEncoding($encode);
+
+ return true;
+ }
}
diff --git a/src/PHPHtmlParser/Selector/Seeker.php b/src/PHPHtmlParser/Selector/Seeker.php
index e9ed2484..971c40f9 100644
--- a/src/PHPHtmlParser/Selector/Seeker.php
+++ b/src/PHPHtmlParser/Selector/Seeker.php
@@ -314,8 +314,8 @@ private function match(
}
return \preg_match('/' . $pattern . '/i', $value) == 1;
+ default:
+ return false;
}
-
- return false;
}
}
diff --git a/tests/DomTest.php b/tests/DomTest.php
index 0a50021e..ea570561 100755
--- a/tests/DomTest.php
+++ b/tests/DomTest.php
@@ -196,6 +196,13 @@ public function testLoadFromFileFind()
$this->assertEquals('VonBurgermeister', $dom->find('.post-row div .post-user font', 0)->text);
}
+ public function testLoadFromFileNotFound()
+ {
+ $dom = new Dom();
+ $this->expectException(\PHPHtmlParser\Exceptions\LogicalException::class);
+ $dom->loadFromFile('tests/data/files/unkowne.html');
+ }
+
public function testLoadUtf8()
{
$dom = new Dom();
@@ -531,6 +538,60 @@ public function testMultipleSquareSelector()
$this->assertEquals(1, \count($items));
}
+ public function testNotSquareSelector()
+ {
+ $dom = new Dom();
+ $dom->load('');
+
+ $items = $dom->find('input[type!=foo]');
+ $this->assertEquals(1, \count($items));
+ }
+
+ public function testStartSquareSelector()
+ {
+ $dom = new Dom();
+ $dom->load('');
+
+ $items = $dom->find('input[name^=f]');
+ $this->assertEquals(1, \count($items));
+ }
+
+ public function testEndSquareSelector()
+ {
+ $dom = new Dom();
+ $dom->load('');
+
+ $items = $dom->find('input[baz$=g]');
+ $this->assertEquals(1, \count($items));
+ }
+
+ public function testStarSquareSelector()
+ {
+ $dom = new Dom();
+ $dom->load('');
+
+ $items = $dom->find('input[baz*=*]');
+ $this->assertEquals(1, \count($items));
+ }
+
+ public function testStarFullRegexSquareSelector()
+ {
+ $dom = new Dom();
+ $dom->load('');
+
+ $items = $dom->find('input[baz*=/\w+/]');
+ $this->assertEquals(1, \count($items));
+ }
+
+ public function testFailedSquareSelector()
+ {
+ $dom = new Dom();
+ $dom->load('');
+
+ $items = $dom->find('input[baz%=g]');
+ $this->assertEquals(1, \count($items));
+ }
+
public function testLoadGetAttributeWithBackslash()
{
$dom = new Dom();
@@ -547,4 +608,14 @@ public function test25ChildrenFound()
$children = $dom->find('#red-line-g *');
$this->assertEquals(25, \count($children));
}
+
+ public function testHtml5PageLoad()
+ {
+ $dom = new Dom();
+ $dom->loadFromFile('tests/data/files/html5.html');
+
+ /** @var Dom\AbstractNode $div */
+ $div = $dom->find('div.d-inline-block', 0);
+ $this->assertEquals('max-width: 29px', $div->getAttribute('style'));
+ }
}
diff --git a/tests/Selector/SeekerTest.php b/tests/Selector/SeekerTest.php
new file mode 100644
index 00000000..4e2d9e4f
--- /dev/null
+++ b/tests/Selector/SeekerTest.php
@@ -0,0 +1,25 @@
+ 'tag',
+ 'key' => 1,
+ 'value' => null,
+ 'operator' => null,
+ 'noKey' => false,
+ 'alterNext' => false,
+ ]);
+ $seeker = new Seeker();
+ $results = $seeker->seek([], $ruleDTO, [], false);
+ $this->assertCount(0, $results);
+ }
+}
diff --git a/tests/data/files/big.html b/tests/data/files/big.html
index 6b5e3ee5..a26f5093 100755
--- a/tests/data/files/big.html
+++ b/tests/data/files/big.html
@@ -2,7 +2,7 @@
-
+
diff --git a/tests/data/files/html5.html b/tests/data/files/html5.html
new file mode 100644
index 00000000..b2b1413d
--- /dev/null
+++ b/tests/data/files/html5.html
@@ -0,0 +1,2957 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Dom.php uses greedy RegEx to match charset · Issue #82 · paquettg/php-html-parser
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ You can’t perform that action at this time.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ You signed in with another tab or window. Reload to refresh your session.
+ You signed out in another tab or window. Reload to refresh your session.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
From 9d8149016d0eb45b6695d860dae7581dbdcc4b98 Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Mon, 11 May 2020 01:01:36 +0000
Subject: [PATCH 35/68] fix #103
---
CHANGELOG.md | 10 ++++----
src/PHPHtmlParser/Selector/Seeker.php | 34 +++++++++++++--------------
tests/DomTest.php | 14 ++++++++++-
3 files changed, 35 insertions(+), 23 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8daa5304..1de2d167 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,9 +8,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Added
-- Added support for PSR7 HTTP clients and requests for URL calls.
-- Added PHAN support and fixed all issues from PHAN.
-- Added support for html5 charset detection.
+- Support for PSR7 HTTP clients and requests for URL calls has been added.
+- PHAN support and fixed all issues from PHAN has been added.
+- PHP-CS-Fixer added.
+- Support for html5 charset detection.
+- Added the ability to match both parent and children.
### Changed
- Fixed issue with \ causing an infite loop.
@@ -20,7 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fixed issue with greedy regex for charset detection.
### Removed
-- Removed curl interface and curl implementation.
+- Curl interface and curl implementation has been removed.
## 2.2.0
diff --git a/src/PHPHtmlParser/Selector/Seeker.php b/src/PHPHtmlParser/Selector/Seeker.php
index 971c40f9..fa101e9e 100644
--- a/src/PHPHtmlParser/Selector/Seeker.php
+++ b/src/PHPHtmlParser/Selector/Seeker.php
@@ -17,6 +17,7 @@ class Seeker implements SeekerInterface
* Attempts to find all children that match the rule
* given.
*
+ * @var InnerNode[] $nodes
* @throws ChildNotFoundException
*/
public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFirst): array
@@ -24,7 +25,6 @@ public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFir
// XPath index
if ($rule->getTag() !== null && \is_numeric($rule->getKey())) {
$count = 0;
- /** @var AbstractNode $node */
foreach ($nodes as $node) {
if ($rule->getTag() == '*'
|| $rule->getTag() == $node->getTag()
@@ -44,7 +44,6 @@ public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFir
$options = $this->flattenOptions($options);
$return = [];
- /** @var InnerNode $node */
foreach ($nodes as $node) {
// check if we are a leaf
if ($node instanceof LeafNode || !$node->hasChildren()
@@ -77,24 +76,23 @@ public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFir
if ($pass) {
// it passed all checks
$return[] = $child;
- } else {
- // this child failed to be matched
- if ($child instanceof InnerNode && $child->hasChildren()
- ) {
- if ($depthFirst) {
- if (!isset($options['checkGrandChildren'])
- || $options['checkGrandChildren']
- ) {
- // we have a child that failed but are not leaves.
- $matches = $this->seek([$child], $rule, $options, $depthFirst);
- foreach ($matches as $match) {
- $return[] = $match;
- }
+ }
+ // this child failed to be matched
+ if ($child instanceof InnerNode && $child->hasChildren()
+ ) {
+ if ($depthFirst) {
+ if (!isset($options['checkGrandChildren'])
+ || $options['checkGrandChildren']
+ ) {
+ // we have a child that failed but are not leaves.
+ $matches = $this->seek([$child], $rule, $options, $depthFirst);
+ foreach ($matches as $match) {
+ $return[] = $match;
}
- } else {
- // we still want to check its children
- $children[] = $child;
}
+ } else {
+ // we still want to check its children
+ $children[] = $child;
}
}
diff --git a/tests/DomTest.php b/tests/DomTest.php
index ea570561..7c29b508 100755
--- a/tests/DomTest.php
+++ b/tests/DomTest.php
@@ -223,7 +223,7 @@ public function testLoadFileBig()
{
$dom = new Dom();
$dom->loadFromFile('tests/data/files/big.html');
- $this->assertEquals(10, \count($dom->find('.content-border')));
+ $this->assertEquals(20, \count($dom->find('.content-border')));
}
public function testLoadFileBigTwice()
@@ -618,4 +618,16 @@ public function testHtml5PageLoad()
$div = $dom->find('div.d-inline-block', 0);
$this->assertEquals('max-width: 29px', $div->getAttribute('style'));
}
+
+ public function testFindAttributeInBothParentAndChild()
+ {
+ $dom = new Dom();
+ $dom->load('
+
+');
+
+ /** @var Dom\AbstractNode $meta */
+ $nodes = $dom->find('[attribute]');
+ $this->assertCount(2, $nodes);
+ }
}
From 4e13ad24dadd0313ed48448632e1bc317b9c780c Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Tue, 12 May 2020 02:49:20 +0000
Subject: [PATCH 36/68] Removed the deprecated depthFirstSearch option
---
CHANGELOG.md | 1 +
.../Contracts/Selector/SeekerInterface.php | 2 +-
.../Contracts/Selector/SelectorInterface.php | 2 --
src/PHPHtmlParser/Dom.php | 7 +-----
src/PHPHtmlParser/Dom/AbstractNode.php | 3 +--
src/PHPHtmlParser/Options.php | 17 --------------
src/PHPHtmlParser/Selector/Seeker.php | 23 ++++++++-----------
src/PHPHtmlParser/Selector/Selector.php | 12 +---------
tests/DomTest.php | 13 -----------
tests/OptionsTest.php | 7 ------
10 files changed, 14 insertions(+), 73 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1de2d167..3cd22cd0 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Removed
- Curl interface and curl implementation has been removed.
+- Removed support for the depth first search option.
## 2.2.0
diff --git a/src/PHPHtmlParser/Contracts/Selector/SeekerInterface.php b/src/PHPHtmlParser/Contracts/Selector/SeekerInterface.php
index cca4eb54..23357795 100644
--- a/src/PHPHtmlParser/Contracts/Selector/SeekerInterface.php
+++ b/src/PHPHtmlParser/Contracts/Selector/SeekerInterface.php
@@ -13,5 +13,5 @@ interface SeekerInterface
*
* @throws ChildNotFoundException
*/
- public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFirst): array;
+ public function seek(array $nodes, RuleDTO $rule, array $options): array;
}
diff --git a/src/PHPHtmlParser/Contracts/Selector/SelectorInterface.php b/src/PHPHtmlParser/Contracts/Selector/SelectorInterface.php
index 8eca7d1e..c1aceeb9 100644
--- a/src/PHPHtmlParser/Contracts/Selector/SelectorInterface.php
+++ b/src/PHPHtmlParser/Contracts/Selector/SelectorInterface.php
@@ -21,8 +21,6 @@ public function __construct(string $selector, ?ParserInterface $parser = null, ?
*/
public function getParsedSelectorCollectionDTO(): ParsedSelectorCollectionDTO;
- public function setDepthFirstFind(bool $status): void;
-
/**
* Attempts to find the selectors starting from the given
* node object.
diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php
index 8c2ebcde..fa659f67 100755
--- a/src/PHPHtmlParser/Dom.php
+++ b/src/PHPHtmlParser/Dom.php
@@ -259,12 +259,7 @@ public function find(string $selector, int $nth = null)
{
$this->isLoaded();
- $depthFirstSearch = $this->options->get('depthFirstSearch');
- if (\is_bool($depthFirstSearch)) {
- $result = $this->root->find($selector, $nth, $depthFirstSearch);
- } else {
- $result = $this->root->find($selector, $nth);
- }
+ $result = $this->root->find($selector, $nth);
return $result;
}
diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php
index 0d096904..596a3ae8 100755
--- a/src/PHPHtmlParser/Dom/AbstractNode.php
+++ b/src/PHPHtmlParser/Dom/AbstractNode.php
@@ -439,13 +439,12 @@ public function ancestorByTag(string $tag): AbstractNode
*
* @return mixed|Collection|null
*/
- public function find(string $selectorString, ?int $nth = null, bool $depthFirst = false, ?SelectorInterface $selector = null)
+ public function find(string $selectorString, ?int $nth = null, ?SelectorInterface $selector = null)
{
if (\is_null($selector)) {
$selector = new Selector($selectorString);
}
- $selector->setDepthFirstFind($depthFirst);
$nodes = $selector->find($this);
if ($nth !== null) {
diff --git a/src/PHPHtmlParser/Options.php b/src/PHPHtmlParser/Options.php
index e90e435f..b7e1cd17 100755
--- a/src/PHPHtmlParser/Options.php
+++ b/src/PHPHtmlParser/Options.php
@@ -18,7 +18,6 @@
* @property bool $preserveLineBreaks
* @property bool $removeDoubleSpace
* @property bool $removeSmartyScripts
- * @property bool $depthFirstSearch
* @property bool $htmlSpecialCharsDecode
*/
class Options
@@ -38,7 +37,6 @@ class Options
'preserveLineBreaks' => false,
'removeDoubleSpace' => true,
'removeSmartyScripts' => true,
- 'depthFirstSearch' => false,
'htmlSpecialCharsDecode' => false,
];
@@ -191,21 +189,6 @@ public function setRemoveSmartyScripts(bool $value): self
return $this;
}
- /**
- * By default this is set to false for legacy support. Setting this to true will change the behavior of find
- * to order elements by depth first. This will properly preserve the order of elements as they where in the HTML.
- *
- * @return Options
- *
- * @deprecated This option will be removed in version 3.0.0 with the new behavior being as if it was set to true.
- */
- public function setDepthFirstSearch(bool $value): self
- {
- $this->options['depthFirstSearch'] = $value;
-
- return $this;
- }
-
/**
* By default this is set to false. Setting this to true will apply the php function htmlspecialchars_decode too all attribute values and text nodes.
*
diff --git a/src/PHPHtmlParser/Selector/Seeker.php b/src/PHPHtmlParser/Selector/Seeker.php
index fa101e9e..93d1bc1c 100644
--- a/src/PHPHtmlParser/Selector/Seeker.php
+++ b/src/PHPHtmlParser/Selector/Seeker.php
@@ -20,7 +20,7 @@ class Seeker implements SeekerInterface
* @var InnerNode[] $nodes
* @throws ChildNotFoundException
*/
- public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFirst): array
+ public function seek(array $nodes, RuleDTO $rule, array $options): array
{
// XPath index
if ($rule->getTag() !== null && \is_numeric($rule->getKey())) {
@@ -80,19 +80,14 @@ public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFir
// this child failed to be matched
if ($child instanceof InnerNode && $child->hasChildren()
) {
- if ($depthFirst) {
- if (!isset($options['checkGrandChildren'])
- || $options['checkGrandChildren']
- ) {
- // we have a child that failed but are not leaves.
- $matches = $this->seek([$child], $rule, $options, $depthFirst);
- foreach ($matches as $match) {
- $return[] = $match;
- }
+ if (!isset($options['checkGrandChildren'])
+ || $options['checkGrandChildren']
+ ) {
+ // we have a child that failed but are not leaves.
+ $matches = $this->seek([$child], $rule, $options);
+ foreach ($matches as $match) {
+ $return[] = $match;
}
- } else {
- // we still want to check its children
- $children[] = $child;
}
}
@@ -104,7 +99,7 @@ public function seek(array $nodes, RuleDTO $rule, array $options, bool $depthFir
&& \count($children) > 0
) {
// we have children that failed but are not leaves.
- $matches = $this->seek($children, $rule, $options, $depthFirst);
+ $matches = $this->seek($children, $rule, $options);
foreach ($matches as $match) {
$return[] = $match;
}
diff --git a/src/PHPHtmlParser/Selector/Selector.php b/src/PHPHtmlParser/Selector/Selector.php
index 95c47001..4c45da01 100755
--- a/src/PHPHtmlParser/Selector/Selector.php
+++ b/src/PHPHtmlParser/Selector/Selector.php
@@ -25,11 +25,6 @@ class Selector implements SelectorInterface
*/
private $ParsedSelectorCollectionDTO;
- /**
- * @var bool
- */
- private $depthFirst = false;
-
/**
* @var SeekerInterface
*/
@@ -61,11 +56,6 @@ public function getParsedSelectorCollectionDTO(): ParsedSelectorCollectionDTO
return $this->ParsedSelectorCollectionDTO;
}
- public function setDepthFirstFind(bool $status): void
- {
- $this->depthFirst = $status;
- }
-
/**
* Attempts to find the selectors starting from the given
* node object.
@@ -87,7 +77,7 @@ public function find(AbstractNode $node): Collection
$options[] = $this->alterNext($rule);
continue;
}
- $nodes = $this->seeker->seek($nodes, $rule, $options, $this->depthFirst);
+ $nodes = $this->seeker->seek($nodes, $rule, $options);
// clear the options
$options = [];
}
diff --git a/tests/DomTest.php b/tests/DomTest.php
index 7c29b508..9922f17f 100755
--- a/tests/DomTest.php
+++ b/tests/DomTest.php
@@ -483,19 +483,6 @@ public function testFindOrder()
$dom->load($str);
$images = $dom->find('img');
- $this->assertEquals('', (string) $images[0]);
- }
-
- public function testFindDepthFirstSearch()
- {
- $str = '
';
- $dom = new Dom();
- $dom->setOptions([
- 'depthFirstSearch' => true,
- ]);
- $dom->load($str);
- $images = $dom->find('img');
-
$this->assertEquals('', (string) $images[0]);
}
diff --git a/tests/OptionsTest.php b/tests/OptionsTest.php
index 91c62591..899a0622 100755
--- a/tests/OptionsTest.php
+++ b/tests/OptionsTest.php
@@ -70,7 +70,6 @@ public function testSetters()
'preserveLineBreaks' => false,
'removeDoubleSpace' => false,
'removeSmartyScripts' => false,
- 'depthFirstSearch' => false,
'htmlSpecialCharsDecode' => false,
]);
@@ -101,9 +100,6 @@ public function testSetters()
$options->setRemoveSmartyScripts(true);
$this->assertTrue($options->get('removeSmartyScripts'));
- $options->setDepthFirstSearch(true);
- $this->assertTrue($options->get('depthFirstSearch'));
-
$options->setHtmlSpecialCharsDecode(true);
$this->assertTrue($options->get('htmlSpecialCharsDecode'));
@@ -136,9 +132,6 @@ public function testSetters()
$options->setRemoveSmartyScripts(false);
$this->assertFalse($options->get('removeSmartyScripts'));
- $options->setDepthFirstSearch(false);
- $this->assertFalse($options->get('depthFirstSearch'));
-
$options->setHtmlSpecialCharsDecode(false);
$this->assertFalse($options->get('htmlSpecialCharsDecode'));
}
From 924a594e7df145511466939171ca6c1966cd0cc6 Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Tue, 12 May 2020 02:55:03 +0000
Subject: [PATCH 37/68] Fix #187
---
CHANGELOG.md | 1 +
src/PHPHtmlParser/Content.php | 2 +-
src/PHPHtmlParser/Dom.php | 4 ++--
3 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3cd22cd0..5d9b1bab 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added tag attribute DTO.
- Cleaned up the selector logic.
- Fixed issue with greedy regex for charset detection.
+- Fixed bug causing infinite loops in some cases.
### Removed
- Curl interface and curl implementation has been removed.
diff --git a/src/PHPHtmlParser/Content.php b/src/PHPHtmlParser/Content.php
index 0ae7e0e4..24bca182 100755
--- a/src/PHPHtmlParser/Content.php
+++ b/src/PHPHtmlParser/Content.php
@@ -164,7 +164,7 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal
*
* @return string
*/
- public function copyUntilUnless(string $string, string $unless)
+ public function copyUntilUnless(string $string, string $unless): string
{
$lastPos = $this->pos;
$this->fastForward(1);
diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php
index fa659f67..f772e707 100755
--- a/src/PHPHtmlParser/Dom.php
+++ b/src/PHPHtmlParser/Dom.php
@@ -749,7 +749,7 @@ private function parseTag(): array
do {
$moreString = $this->content->copyUntilUnless('"', '=>');
$string .= $moreString;
- } while (!empty($moreString));
+ } while (strlen($moreString) > 0 && $this->content->getPosition() < $this->size);
$attr['value'] = $string;
$this->content->fastForward(1);
$node->getTag()->setAttribute($name, $string);
@@ -760,7 +760,7 @@ private function parseTag(): array
do {
$moreString = $this->content->copyUntilUnless("'", '=>');
$string .= $moreString;
- } while (!empty($moreString));
+ } while (strlen($moreString) > 0 && $this->content->getPosition() < $this->size);
$attr['value'] = $string;
$this->content->fastForward(1);
$node->getTag()->setAttribute($name, $string, false);
From 0127b9e354e92f9c515653b8b50423e38010762c Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Tue, 12 May 2020 03:15:13 +0000
Subject: [PATCH 38/68] fixes #188
---
tests/DomTest.php | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/tests/DomTest.php b/tests/DomTest.php
index 9922f17f..47aeb6ae 100755
--- a/tests/DomTest.php
+++ b/tests/DomTest.php
@@ -617,4 +617,16 @@ public function testFindAttributeInBothParentAndChild()
$nodes = $dom->find('[attribute]');
$this->assertCount(2, $nodes);
}
+
+ public function testLessThanCharacterInJavascript()
+ {
+ $results = (new Dom())->load('',
+ [
+ 'cleanupInput' => false,
+ 'removeScripts' => false
+ ])->find('body');
+ $this->assertCount(1, $results);
+ }
}
From 4bb7098f3a46582dd9c5cd289fae6f0e835f2916 Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Wed, 13 May 2020 14:36:50 +0000
Subject: [PATCH 39/68] Fixes #203
---
CHANGELOG.md | 1 +
src/PHPHtmlParser/Dom.php | 17 ----------
src/PHPHtmlParser/Dom/AbstractNode.php | 10 ------
tests/DomTest.php | 44 ++++++++++----------------
4 files changed, 18 insertions(+), 54 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5d9b1bab..bd2dffcb 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -25,6 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Removed
- Curl interface and curl implementation has been removed.
- Removed support for the depth first search option.
+- findById() method removed from Dom object.
## 2.2.0
diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php
index f772e707..ba9ea4ae 100755
--- a/src/PHPHtmlParser/Dom.php
+++ b/src/PHPHtmlParser/Dom.php
@@ -150,7 +150,6 @@ public function __get($name)
*/
public function load(string $str, array $options = []): Dom
{
- AbstractNode::resetCount();
// check if it's a file
if (\strpos($str, "\n") === false && \is_file($str)) {
return $this->loadFromFile($str, $options);
@@ -264,22 +263,6 @@ public function find(string $selector, int $nth = null)
return $result;
}
- /**
- * Find element by Id on the root node.
- *
- * @throws ChildNotFoundException
- * @throws NotLoadedException
- * @throws ParentNotFoundException
- *
- * @return bool|AbstractNode
- */
- public function findById(int $id)
- {
- $this->isLoaded();
-
- return $this->root->findById($id);
- }
-
/**
* Adds the tag (or tags in an array) to the list of tags that will always
* be self closing.
diff --git a/src/PHPHtmlParser/Dom/AbstractNode.php b/src/PHPHtmlParser/Dom/AbstractNode.php
index 596a3ae8..3d67ab5c 100755
--- a/src/PHPHtmlParser/Dom/AbstractNode.php
+++ b/src/PHPHtmlParser/Dom/AbstractNode.php
@@ -140,16 +140,6 @@ public function setHtmlSpecialCharsDecode($htmlSpecialCharsDecode = false): void
$this->htmlSpecialCharsDecode = $htmlSpecialCharsDecode;
}
- /**
- * Reset node counter.
- *
- * @return void
- */
- public static function resetCount()
- {
- self::$count = 0;
- }
-
/**
* Returns the id of this object.
*/
diff --git a/tests/DomTest.php b/tests/DomTest.php
index 47aeb6ae..8e800487 100755
--- a/tests/DomTest.php
+++ b/tests/DomTest.php
@@ -394,33 +394,6 @@ public function testHasChildren()
$this->assertTrue($dom->hasChildren());
}
- public function testFindByIdVar1()
- {
- $dom = new Dom();
- $dom->load('
');
- /** @var Dom\AbstractNode $result */
- $result = $dom->findById(8);
- $this->assertFalse($result);
- }
-
public function testWhitespaceInText()
{
$dom = new Dom();
@@ -629,4 +602,21 @@ public function testLessThanCharacterInJavascript()
])->find('body');
$this->assertCount(1, $results);
}
+
+ public function testUniqueIdForAllObjects()
+ {
+ // Create a dom which will be used as a parent/container for a paragraph
+ $dom1 = new \PHPHtmlParser\Dom;
+ $dom1->load('
A container div
'); // Resets the counter (doesn't matter here as the counter was 0 even without resetting)
+ $div = $dom1->firstChild();
+
+ // Create a paragraph outside of the first dom
+ $dom2 = new \PHPHtmlParser\Dom;
+ $dom2->load('
Our new paragraph.
'); // Resets the counter
+ $paragraph = $dom2->firstChild();
+
+ $div->addChild($paragraph);
+
+ $this->assertEquals('A container div
Our new paragraph.
', $div->innerhtml);
+ }
}
From 4e3158c561878076a82b32143e73537a1c391fa6 Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Tue, 14 Jul 2020 20:24:14 +0000
Subject: [PATCH 40/68] Added test to cover #189
---
tests/DomTest.php | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
diff --git a/tests/DomTest.php b/tests/DomTest.php
index 8e800487..9c3781c4 100755
--- a/tests/DomTest.php
+++ b/tests/DomTest.php
@@ -619,4 +619,23 @@ public function testUniqueIdForAllObjects()
$this->assertEquals('A container div
Our new paragraph.
', $div->innerhtml);
}
+
+ public function testFindDescendantsOfMatch()
+ {
+ $dom = new Dom();
+ $dom->load('
+
+ test
+ testing
+ This is a test
+ italic
+ password123
+
+ another
+
');
$a = $dom->find('a')[0];
echo $a->text; // "click here"
```
@@ -86,7 +86,7 @@ $dom->loadFromUrl('http://google.com');
$html = $dom->outerHtml;
// or
-$dom->load('http://google.com');
+$dom->loadFromUrl('http://google.com');
$html = $dom->outerHtml; // same result as the first example
```
@@ -137,11 +137,11 @@ $dom->setOptions([
'strict' => true, // Set a global option to enable strict html parsing.
]);
-$dom->load('http://google.com', [
+$dom->loadFromUrl('http://google.com', [
'whitespaceTextNode' => false, // Only applies to this load.
]);
-$dom->load('http://gmail.com'); // will not have whitespaceTextNode set to false.
+$dom->loadFromUrl('http://gmail.com'); // will not have whitespaceTextNode set to false.
```
At the moment we support 8 options.
diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php
index ba9ea4ae..1cba5050 100755
--- a/src/PHPHtmlParser/Dom.php
+++ b/src/PHPHtmlParser/Dom.php
@@ -15,7 +15,6 @@
use PHPHtmlParser\Exceptions\CurlException;
use PHPHtmlParser\Exceptions\LogicalException;
use PHPHtmlParser\Exceptions\NotLoadedException;
-use PHPHtmlParser\Exceptions\ParentNotFoundException;
use PHPHtmlParser\Exceptions\StrictException;
use PHPHtmlParser\Exceptions\UnknownChildTypeException;
use Psr\Http\Client\ClientInterface;
@@ -139,29 +138,6 @@ public function __get($name)
return $this->root->$name;
}
- /**
- * Attempts to load the dom from any resource, string, file, or URL.
- *
- * @throws ChildNotFoundException
- * @throws CircularException
- * @throws CurlException
- * @throws StrictException
- * @throws LogicalException
- */
- public function load(string $str, array $options = []): Dom
- {
- // check if it's a file
- if (\strpos($str, "\n") === false && \is_file($str)) {
- return $this->loadFromFile($str, $options);
- }
- // check if it's a url
- if (\preg_match("/^https?:\/\//i", $str)) {
- return $this->loadFromUrl($str, $options);
- }
-
- return $this->loadStr($str, $options);
- }
-
/**
* Loads the dom from a document file/url.
*
diff --git a/src/PHPHtmlParser/StaticDom.php b/src/PHPHtmlParser/StaticDom.php
index b4c3ef22..411ca3de 100755
--- a/src/PHPHtmlParser/StaticDom.php
+++ b/src/PHPHtmlParser/StaticDom.php
@@ -56,23 +56,6 @@ public static function mount(string $className = 'Dom', ?Dom $dom = null): bool
return true;
}
- /**
- * Creates a new dom object and calls load() on the
- * new object.
- *
- * @throws ChildNotFoundException
- * @throws CircularException
- * @throws CurlException
- * @throws StrictException
- */
- public static function load(string $str): Dom
- {
- $dom = new Dom();
- self::$dom = $dom;
-
- return $dom->load($str);
- }
-
/**
* Creates a new dom object and calls loadFromFile() on the
* new object.
@@ -114,6 +97,14 @@ public static function loadFromUrl(string $url, array $options = [], ClientInter
return $dom->loadFromUrl($url, $options, $client, $request);
}
+ public static function loadStr(string $str, array $options = []): Dom
+ {
+ $dom = new Dom();
+ self::$dom = $dom;
+
+ return $dom->loadStr($str, $options);
+ }
+
/**
* Sets the $dom variable to null.
*/
diff --git a/tests/DomTest.php b/tests/DomTest.php
index d68b0fc2..2a904cc5 100755
--- a/tests/DomTest.php
+++ b/tests/DomTest.php
@@ -20,14 +20,14 @@ public function testParsingCData()
$html = "";
$dom = new Dom();
$dom->setOptions(['cleanupInput' => false]);
- $dom->load($html);
+ $dom->loadStr($html);
$this->assertSame($html, $dom->root->outerHtml());
}
- public function testLoad()
+ public function testloadStr()
{
$dom = new Dom();
- $dom->load('
');
$div = $dom->find('div', 0);
$this->assertEquals(null, $div->foo);
}
@@ -52,7 +52,7 @@ public function testIncorrectAccess()
public function testLoadSelfclosingAttr()
{
$dom = new Dom();
- $dom->load("
baz
");
+ $dom->loadStr("
baz
");
$br = $dom->find('br', 0);
$this->assertEquals(' ', $br->outerHtml);
}
@@ -60,7 +60,7 @@ public function testLoadSelfclosingAttr()
public function testLoadSelfclosingAttrToString()
{
$dom = new Dom();
- $dom->load("
baz
");
+ $dom->loadStr("
baz
");
$br = $dom->find('br', 0);
$this->assertEquals(' ', (string) $br);
}
@@ -68,7 +68,7 @@ public function testLoadSelfclosingAttrToString()
public function testLoadEscapeQuotes()
{
$dom = new Dom();
- $dom->load('
', $div->outerHtml);
}
@@ -76,14 +76,14 @@ public function testLoadEscapeQuotes()
public function testLoadNoOpeningTag()
{
$dom = new Dom();
- $dom->load('
PR Manager
content
');
+ $dom->loadStr('
PR Manager
content
');
$this->assertEquals('content', $dom->find('.content', 0)->text);
}
public function testLoadNoClosingTag()
{
$dom = new Dom();
- $dom->load('
', $root->outerHtml);
}
@@ -91,7 +91,7 @@ public function testLoadNoClosingTag()
public function testLoadAttributeOnSelfClosing()
{
$dom = new Dom();
- $dom->load('
');
$br = $dom->find('br', 0);
$this->assertEquals('both', $br->getAttribute('class'));
}
@@ -99,7 +99,7 @@ public function testLoadAttributeOnSelfClosing()
public function testLoadClosingTagOnSelfClosing()
{
$dom = new Dom();
- $dom->load('
', $dom->find('div', 0)->innerHtml);
}
@@ -206,7 +206,7 @@ public function testLoadFromFileNotFound()
public function testLoadUtf8()
{
$dom = new Dom();
- $dom->load('
Dzień
');
+ $dom->loadStr('
Dzień
');
$this->assertEquals('Dzień', $dom->find('p', 0)->text);
}
@@ -268,56 +268,56 @@ public function testLoadFromUrl()
public function testToStringMagic()
{
$dom = new Dom();
- $dom->load('
');
$this->assertEquals('click here', $dom->getElementById('78')->outerHtml);
}
public function testGetElementsByTag()
{
$dom = new Dom();
- $dom->load('
', $dom->getElementsByClass('all')[0]->innerHtml);
}
public function testScriptCleanerScriptTag()
{
$dom = new Dom();
- $dom->load('
+ $dom->loadStr('
.....
',
[
@@ -607,12 +607,12 @@ public function testUniqueIdForAllObjects()
{
// Create a dom which will be used as a parent/container for a paragraph
$dom1 = new \PHPHtmlParser\Dom;
- $dom1->load('
A container div
'); // Resets the counter (doesn't matter here as the counter was 0 even without resetting)
+ $dom1->loadStr('
A container div
'); // Resets the counter (doesn't matter here as the counter was 0 even without resetting)
$div = $dom1->firstChild();
// Create a paragraph outside of the first dom
$dom2 = new \PHPHtmlParser\Dom;
- $dom2->load('
Our new paragraph.
'); // Resets the counter
+ $dom2->loadStr('
Our new paragraph.
'); // Resets the counter
$paragraph = $dom2->firstChild();
$div->addChild($paragraph);
@@ -623,7 +623,7 @@ public function testUniqueIdForAllObjects()
public function testFindDescendantsOfMatch()
{
$dom = new Dom();
- $dom->load('
+ $dom->loadStr('
test
testing
@@ -641,7 +641,7 @@ public function testFindDescendantsOfMatch()
public function testCompatibleWithWordPressShortcode()
{
$dom = new Dom();
- $dom->load('
+ $dom->loadStr('
[wprs_alert type="success" content="this is a short code" /]
');
diff --git a/tests/Node/HtmlTest.php b/tests/Node/HtmlTest.php
index 677b280e..a4db8142 100755
--- a/tests/Node/HtmlTest.php
+++ b/tests/Node/HtmlTest.php
@@ -500,7 +500,7 @@ public function testAncestorByTagFailure()
public function testReplaceNode()
{
$dom = new Dom();
- $dom->load('
');
$id = $dom->find('p')[0]->id();
$newChild = new HtmlNode('h1');
$dom->find('p')[0]->getParent()->replaceChild($id, $newChild);
@@ -510,7 +510,7 @@ public function testReplaceNode()
public function testTextNodeFirstChild()
{
$dom = new Dom();
- $dom->load('
', (string) $dom);
diff --git a/tests/Options/CleanupTest.php b/tests/Options/CleanupTest.php
index 0a8a9baf..b7e5325e 100755
--- a/tests/Options/CleanupTest.php
+++ b/tests/Options/CleanupTest.php
@@ -76,7 +76,7 @@ public function testRemoveScriptsFalse()
public function testSmartyScripts()
{
$dom = new Dom();
- $dom->load('
+ $dom->loadStr('
aa={123}
');
$this->assertEquals(' aa= ', $dom->innerHtml);
@@ -88,7 +88,7 @@ public function testSmartyScriptsDisabled()
$dom->setOptions([
'removeSmartyScripts' => false,
]);
- $dom->load('
+ $dom->loadStr('
aa={123}
');
$this->assertEquals(' aa={123} ', $dom->innerHtml);
diff --git a/tests/Options/PreserveLineBreaks.php b/tests/Options/PreserveLineBreaks.php
index 3df7223e..ad095a38 100755
--- a/tests/Options/PreserveLineBreaks.php
+++ b/tests/Options/PreserveLineBreaks.php
@@ -13,7 +13,7 @@ public function testPreserveLineBreakTrue()
$dom->setOptions([
'preserveLineBreaks' => true,
]);
- $dom->load('
+ $dom->loadStr('
');
$this->assertEquals("
\n
", (string) $dom);
@@ -25,7 +25,7 @@ public function testPreserveLineBreakBeforeClosingTag()
$dom->setOptions([
'preserveLineBreaks' => true,
]);
- $dom->load('
loadStr('
');
$this->assertEquals('
', (string) $dom);
diff --git a/tests/Options/StrictTest.php b/tests/Options/StrictTest.php
index cb015981..96d457b7 100755
--- a/tests/Options/StrictTest.php
+++ b/tests/Options/StrictTest.php
@@ -14,7 +14,7 @@ public function testConfigStrict()
$dom->setOptions([
'strict' => true,
]);
- $dom->load('
Hey you
Ya you!
');
+ $dom->loadStr('
Hey you
Ya you!
');
$this->assertEquals(' ', $dom->getElementById('hey')->nextSibling()->text);
}
@@ -26,7 +26,7 @@ public function testConfigStrictMissingSelfClosing()
]);
try {
// should throw an exception
- $dom->load('
Hey you
Ya you!
');
+ $dom->loadStr('
Hey you
Ya you!
');
// we should not get here
$this->assertTrue(false);
} catch (StrictException $e) {
@@ -42,7 +42,7 @@ public function testConfigStrictMissingAttribute()
]);
try {
// should throw an exception
- $dom->load('
Hey you
Ya you!
');
+ $dom->loadStr('
Hey you
Ya you!
');
// we should not get here
$this->assertTrue(false);
} catch (StrictException $e) {
@@ -56,7 +56,7 @@ public function testConfigStrictBRTag()
$dom->setOptions([
'strict' => true,
]);
- $dom->load(' ');
+ $dom->loadStr(' ');
$this->assertTrue(true);
}
}
diff --git a/tests/Options/WhitespaceTextNodeTest.php b/tests/Options/WhitespaceTextNodeTest.php
index 541fbec0..0097f28d 100755
--- a/tests/Options/WhitespaceTextNodeTest.php
+++ b/tests/Options/WhitespaceTextNodeTest.php
@@ -13,7 +13,7 @@ public function testConfigGlobalNoWhitespaceTextNode()
$dom->setOptions([
'whitespaceTextNode' => false,
]);
- $dom->load('
Hey you
Ya you!
');
+ $dom->loadStr('
Hey you
Ya you!
');
$this->assertEquals('Ya you!', $dom->getElementById('hey')->nextSibling()->text);
}
@@ -23,7 +23,7 @@ public function testConfigLocalOverride()
$dom->setOptions([
'whitespaceTextNode' => false,
]);
- $dom->load('
Hey you
Ya you!
', [
+ $dom->loadStr('
Hey you
Ya you!
', [
'whitespaceTextNode' => true,
]);
$this->assertEquals(' ', $dom->getElementById('hey')->nextSibling()->text);
diff --git a/tests/OptionsTest.php b/tests/OptionsTest.php
index 899a0622..a78f508f 100755
--- a/tests/OptionsTest.php
+++ b/tests/OptionsTest.php
@@ -144,6 +144,6 @@ public function testUnknownOptionDom()
]);
$this->expectException(UnknownOptionException::class);
- $dom->load('');
+ $dom->loadStr('');
}
}
diff --git a/tests/StaticDomTest.php b/tests/StaticDomTest.php
index 2fb225fb..fbc1a5bd 100755
--- a/tests/StaticDomTest.php
+++ b/tests/StaticDomTest.php
@@ -25,16 +25,16 @@ public function testMountWithDom()
$this->assertTrue($status);
}
- public function testLoad()
+ public function testloadStr()
{
- $dom = Dom::load('
', $div->outerHtml);
}
public function testLoadWithFile()
{
- $dom = Dom::load('tests/data/files/small.html');
+ $dom = Dom::loadFromFile('tests/data/files/small.html');
$this->assertEquals('VonBurgermeister', $dom->find('.post-user font', 0)->text);
}
@@ -47,14 +47,14 @@ public function testLoadFromFile()
/**
* @expectedException \PHPHtmlParser\Exceptions\NotLoadedException
*/
- public function testFindNoLoad()
+ public function testFindNoloadStr()
{
Dom::find('.post-user font', 0);
}
public function testFindI()
{
- Dom::load('tests/data/files/big.html');
+ Dom::loadFromFile('tests/data/files/big.html');
$this->assertEquals('В кустах блестит металл И искрится ток Человечеству конец', Dom::find('i')[1]->innerHtml);
}
From e37e8ef9eda6bb44f50519b51fd80f0207f29585 Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Wed, 15 Jul 2020 00:03:46 +0000
Subject: [PATCH 43/68] Fixed #187 and added tests
---
src/PHPHtmlParser/Content.php | 25 +++++++++++++++----
src/PHPHtmlParser/Dom.php | 20 ++++++++++++---
.../Exceptions/ContentLengthException.php | 14 +++++++++++
tests/DomTest.php | 8 ++++++
4 files changed, 59 insertions(+), 8 deletions(-)
create mode 100644 src/PHPHtmlParser/Exceptions/ContentLengthException.php
diff --git a/src/PHPHtmlParser/Content.php b/src/PHPHtmlParser/Content.php
index 24bca182..66bc7794 100755
--- a/src/PHPHtmlParser/Content.php
+++ b/src/PHPHtmlParser/Content.php
@@ -4,6 +4,7 @@
namespace PHPHtmlParser;
+use PHPHtmlParser\Exceptions\ContentLengthException;
use PHPHtmlParser\Exceptions\LogicalException;
/**
@@ -74,14 +75,27 @@ public function char(?int $char = null): string
* Moves the current position forward.
*
* @chainable
+ * @throws ContentLengthException
*/
public function fastForward(int $count): Content
{
+ if (!$this->canFastForward()) {
+ // trying to go over the content length, throw exception
+ throw new ContentLengthException('Attempt to fastForward pass the length of the content.');
+ }
$this->pos += $count;
return $this;
}
+ /**
+ * Checks if we can move the position forward.
+ */
+ public function canFastForward(): bool
+ {
+ return \strlen($this->content) > $this->pos;
+ }
+
/**
* Moves the current position backward.
*
@@ -197,14 +211,15 @@ public function copyByToken(string $token, bool $char = false, bool $escape = fa
/**
* Skip a given set of characters.
*
- * @return Content|string
+ * @throws LogicalException
*/
- public function skip(string $string, bool $copy = false)
+ public function skip(string $string, bool $copy = false): string
{
$len = \strspn($this->content, $string, $this->pos);
-
- // make it chainable if they don't want a copy
- $return = $this;
+ if ($len === false) {
+ throw new LogicalException('Strspn returned false with position ' . $this->pos . '.');
+ }
+ $return = '';
if ($copy) {
$return = \substr($this->content, $this->pos, $len);
if ($return === false) {
diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php
index 1cba5050..d23110df 100755
--- a/src/PHPHtmlParser/Dom.php
+++ b/src/PHPHtmlParser/Dom.php
@@ -12,6 +12,7 @@
use PHPHtmlParser\Dom\TextNode;
use PHPHtmlParser\Exceptions\ChildNotFoundException;
use PHPHtmlParser\Exceptions\CircularException;
+use PHPHtmlParser\Exceptions\ContentLengthException;
use PHPHtmlParser\Exceptions\CurlException;
use PHPHtmlParser\Exceptions\LogicalException;
use PHPHtmlParser\Exceptions\NotLoadedException;
@@ -646,7 +647,13 @@ private function parseTag(): array
}
// check if this is a closing tag
- if ($this->content->fastForward(1)->char() == '/') {
+ try {
+ $this->content->fastForward(1);
+ } catch (ContentLengthException $exception) {
+ // we are at the end of the file
+ return $return;
+ }
+ if ($this->content->char() == '/') {
// end tag
$tag = $this->content->fastForward(1)
->copyByToken('slash', true);
@@ -683,7 +690,12 @@ private function parseTag(): array
) {
$space = $this->content->skipByToken('blank', true);
if (empty($space)) {
- $this->content->fastForward(1);
+ try {
+ $this->content->fastForward(1);
+ } catch (ContentLengthException $exception) {
+ // reached the end of the content
+ break;
+ }
continue;
}
@@ -764,7 +776,9 @@ private function parseTag(): array
}
}
- $this->content->fastForward(1);
+ if ($this->content->canFastForward()) {
+ $this->content->fastForward(1);
+ }
$return['status'] = true;
$return['node'] = $node;
diff --git a/src/PHPHtmlParser/Exceptions/ContentLengthException.php b/src/PHPHtmlParser/Exceptions/ContentLengthException.php
new file mode 100644
index 00000000..83c9e771
--- /dev/null
+++ b/src/PHPHtmlParser/Exceptions/ContentLengthException.php
@@ -0,0 +1,14 @@
+assertEquals(' [wprs_alert type="success" content="this is a short code" /] ', $node->innerHtml);
}
+
+ public function testBrokenHtml()
+ {
+ $dom = new Dom();
+ $dom->loadStr('assertEquals('', $dom->outerHtml);
+ }
}
From b58c6da6c58e9da334de20b46f602e9cb70d5095 Mon Sep 17 00:00:00 2001
From: Gilles Paquette
Date: Wed, 15 Jul 2020 01:18:55 +0000
Subject: [PATCH 44/68] Removed magical option array
---
CHANGELOG.md | 1 +
composer.json | 3 +-
src/PHPHtmlParser/Content.php | 22 +-
src/PHPHtmlParser/Dom.php | 86 ++++---
src/PHPHtmlParser/Enum/StringToken.php | 21 ++
src/PHPHtmlParser/Options.php | 291 +++++++++++------------
src/PHPHtmlParser/Selector/Seeker.php | 3 +-
src/PHPHtmlParser/StaticDom.php | 9 +-
tests/ContentTest.php | 5 +-
tests/DomTest.php | 30 ++-
tests/Options/CleanupTest.php | 29 +--
tests/Options/PreserveLineBreaks.php | 10 +-
tests/Options/StrictTest.php | 17 +-
tests/Options/WhitespaceTextNodeTest.php | 13 +-
tests/OptionsTest.php | 117 ++-------
tests/StaticDomTest.php | 2 +-
16 files changed, 273 insertions(+), 386 deletions(-)
create mode 100644 src/PHPHtmlParser/Enum/StringToken.php
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9862beac..05d2146f 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Cleaned up the selector logic.
- Fixed issue with greedy regex for charset detection.
- Fixed bug causing infinite loops in some cases.
+- Refactored the way we handle options. Removed the magical option array.
### Removed
- Curl interface and curl implementation has been removed.
diff --git a/composer.json b/composer.json
index 79258c58..5549a5ee 100755
--- a/composer.json
+++ b/composer.json
@@ -20,7 +20,8 @@
"paquettg/string-encode": "~1.0.0",
"php-http/httplug": "^2.1",
"php-http/guzzle6-adapter": "^2.0",
- "guzzlehttp/psr7": "^1.6"
+ "guzzlehttp/psr7": "^1.6",
+ "myclabs/php-enum": "^1.7"
},
"require-dev": {
"phpunit/phpunit": "^7.5.1",
diff --git a/src/PHPHtmlParser/Content.php b/src/PHPHtmlParser/Content.php
index 66bc7794..fdb741c4 100755
--- a/src/PHPHtmlParser/Content.php
+++ b/src/PHPHtmlParser/Content.php
@@ -4,6 +4,7 @@
namespace PHPHtmlParser;
+use PHPHtmlParser\Enum\StringToken;
use PHPHtmlParser\Exceptions\ContentLengthException;
use PHPHtmlParser\Exceptions\LogicalException;
@@ -75,11 +76,12 @@ public function char(?int $char = null): string
* Moves the current position forward.
*
* @chainable
+ *
* @throws ContentLengthException
*/
public function fastForward(int $count): Content
{
- if (!$this->canFastForward()) {
+ if (!$this->canFastForward($count)) {
// trying to go over the content length, throw exception
throw new ContentLengthException('Attempt to fastForward pass the length of the content.');
}
@@ -91,9 +93,9 @@ public function fastForward(int $count): Content
/**
* Checks if we can move the position forward.
*/
- public function canFastForward(): bool
+ public function canFastForward(int $count): bool
{
- return \strlen($this->content) > $this->pos;
+ return \strlen($this->content) >= $this->pos + $count;
}
/**
@@ -175,8 +177,6 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal
/**
* Copies the content until the string is found and return it
* unless the 'unless' is found in the substring.
- *
- * @return string
*/
public function copyUntilUnless(string $string, string $unless): string
{
@@ -197,13 +197,11 @@ public function copyUntilUnless(string $string, string $unless): string
/**
* Copies the content until it reaches the token string.,.
*
- * @return string
- *
* @uses $this->copyUntil()
*/
- public function copyByToken(string $token, bool $char = false, bool $escape = false)
+ public function copyByToken(StringToken $stringToken, bool $char = false, bool $escape = false): string
{
- $string = $this->$token;
+ $string = $stringToken->getValue();
return $this->copyUntil($string, $char, $escape);
}
@@ -236,13 +234,11 @@ public function skip(string $string, bool $copy = false): string
/**
* Skip a given token of pre-defined characters.
*
- * @return Content|string
- *
* @uses $this->skip()
*/
- public function skipByToken(string $token, bool $copy = false)
+ public function skipByToken(StringToken $skipToken, bool $copy = false): string
{
- $string = $this->$token;
+ $string = $skipToken->getValue();
return $this->skip($string, $copy);
}
diff --git a/src/PHPHtmlParser/Dom.php b/src/PHPHtmlParser/Dom.php
index d23110df..d2db15e2 100755
--- a/src/PHPHtmlParser/Dom.php
+++ b/src/PHPHtmlParser/Dom.php
@@ -10,10 +10,10 @@
use PHPHtmlParser\Dom\Collection;
use PHPHtmlParser\Dom\HtmlNode;
use PHPHtmlParser\Dom\TextNode;
+use PHPHtmlParser\Enum\StringToken;
use PHPHtmlParser\Exceptions\ChildNotFoundException;
use PHPHtmlParser\Exceptions\CircularException;
use PHPHtmlParser\Exceptions\ContentLengthException;
-use PHPHtmlParser\Exceptions\CurlException;
use PHPHtmlParser\Exceptions\LogicalException;
use PHPHtmlParser\Exceptions\NotLoadedException;
use PHPHtmlParser\Exceptions\StrictException;
@@ -72,9 +72,9 @@ class Dom
/**
* A global options array to be used by all load calls.
*
- * @var array
+ * @var ?Options
*/
- private $globalOptions = [];
+ private $globalOptions;
/**
* A persistent option object to be used for all options in the
@@ -147,7 +147,7 @@ public function __get($name)
* @throws StrictException
* @throws LogicalException
*/
- public function loadFromFile(string $file, array $options = []): Dom
+ public function loadFromFile(string $file, ?Options $options = null): Dom
{
$content = @\file_get_contents($file);
if ($content === false) {
@@ -168,7 +168,7 @@ public function loadFromFile(string $file, array $options = []): Dom
* @throws StrictException
* @throws \Psr\Http\Client\ClientExceptionInterface
*/
- public function loadFromUrl(string $url, array $options = [], ?ClientInterface $client = null, ?RequestInterface $request = null): Dom
+ public function loadFromUrl(string $url, ?Options $options, ?ClientInterface $client = null, ?RequestInterface $request = null): Dom
{
if ($client === null) {
$client = new Client();
@@ -191,11 +191,15 @@ public function loadFromUrl(string $url, array $options = [], ?ClientInterface $
* @throws CircularException
* @throws StrictException
*/
- public function loadStr(string $str, array $option = []): Dom
+ public function loadStr(string $str, ?Options $options = null): Dom
{
$this->options = new Options();
- $this->options->setOptions($this->globalOptions)
- ->setOptions($option);
+ if ($this->globalOptions !== null) {
+ $this->options->setFromOptions($this->globalOptions);
+ }
+ if ($options !== null) {
+ $this->options->setFromOptions($options);
+ }
$this->rawSize = \strlen($str);
$this->raw = $str;
@@ -216,7 +220,7 @@ public function loadStr(string $str, array $option = []): Dom
*
* @chainable
*/
- public function setOptions(array $options): Dom
+ public function setOptions(Options $options): Dom
{
$this->globalOptions = $options;
@@ -235,9 +239,7 @@ public function find(string $selector, int $nth = null)
{
$this->isLoaded();
- $result = $this->root->find($selector, $nth);
-
- return $result;
+ return $this->root->find($selector, $nth);
}
/**
@@ -463,7 +465,7 @@ private function isLoaded(): void
*/
private function clean(string $str): string
{
- if ($this->options->get('cleanupInput') != true) {
+ if ($this->options->isCleanupInput() != true) {
// skip entire cleanup step
return $str;
}
@@ -488,7 +490,7 @@ private function clean(string $str): string
// clean out the \n\r
$replace = ' ';
- if ($this->options->get('preserveLineBreaks')) {
+ if ($this->options->isPreserveLineBreaks()) {
$replace = '
';
}
$str = \str_replace(["\r\n", "\r", "\n"], $replace, $str);
@@ -515,7 +517,7 @@ private function clean(string $str): string
}
// strip out ";
$dom = new Dom();
- $dom->setOptions(['cleanupInput' => false]);
+ $dom->setOptions((new Options())->setCleanupInput(false));
$dom->loadStr($html);
$this->assertSame($html, $dom->root->outerHtml());
}
@@ -213,7 +214,7 @@ public function testLoadUtf8()
public function testLoadFileWhitespace()
{
$dom = new Dom();
- $dom->setOptions(['cleanupInput' => false]);
+ $dom->setOptions((new Options())->setCleanupInput(false));
$dom->loadFromFile('tests/data/files/whitespace.html');
$this->assertEquals(1, \count($dom->find('.class')));
$this->assertEquals('', (string) $dom);
@@ -237,7 +238,8 @@ public function testLoadFileBigTwice()
public function testLoadFileBigTwicePreserveOption()
{
$dom = new Dom();
- $dom->loadFromFile('tests/data/files/big.html', ['preserveLineBreaks' => true]);
+ $dom->loadFromFile('tests/data/files/big.html',
+ (new Options)->setPreserveLineBreaks(true));
$post = $dom->find('.post-row', 0);
$this->assertEquals(
"
Журчанье воды \nЧерно-белые тени \nВновь на фонтане
",
@@ -261,7 +263,7 @@ public function testLoadFromUrl()
->andReturn($responseMock);
$dom = new Dom();
- $dom->loadFromUrl('http://google.com', [], $clientMock);
+ $dom->loadFromUrl('http://google.com', null, $clientMock);
$this->assertEquals('VonBurgermeister', $dom->find('.post-row div .post-user font', 0)->text);
}
@@ -397,9 +399,7 @@ public function testHasChildren()
public function testWhitespaceInText()
{
$dom = new Dom();
- $dom->setOptions([
- 'removeDoubleSpace' => false,
- ]);
+ $dom->setOptions((new Options())->setRemoveDoubleSpace(false));
$dom->loadStr('
Hello world
');
$this->assertEquals('
Hello world
', (string) $dom);
}
@@ -415,7 +415,7 @@ public function testGetComplexAttribute()
public function testGetComplexAttributeHtmlSpecialCharsDecode()
{
$dom = new Dom();
- $dom->setOptions(['htmlSpecialCharsDecode' => true]);
+ $dom->setOptions((new Options())->setHtmlSpecialCharsDecode(true));
$dom->loadStr('Next >');
$a = $dom->find('a', 0);
$this->assertEquals('Next >', $a->innerHtml);
@@ -563,7 +563,7 @@ public function testLoadGetAttributeWithBackslash()
public function test25ChildrenFound()
{
$dom = new Dom();
- $dom->setOptions(['whitespaceTextNode' => false]);
+ $dom->setOptions((new Options())->setWhitespaceTextNode(false));
$dom->loadFromFile('tests/data/files/51children.html');
$children = $dom->find('#red-line-g *');
$this->assertEquals(25, \count($children));
@@ -596,22 +596,21 @@ public function testLessThanCharacterInJavascript()
$results = (new Dom())->loadStr('',
- [
- 'cleanupInput' => false,
- 'removeScripts' => false
- ])->find('body');
+ (new Options())->setCleanupInput(false)
+ ->setRemoveScripts(false)
+ )->find('body');
$this->assertCount(1, $results);
}
public function testUniqueIdForAllObjects()
{
// Create a dom which will be used as a parent/container for a paragraph
- $dom1 = new \PHPHtmlParser\Dom;
+ $dom1 = new \PHPHtmlParser\Dom();
$dom1->loadStr('
A container div
'); // Resets the counter (doesn't matter here as the counter was 0 even without resetting)
$div = $dom1->firstChild();
// Create a paragraph outside of the first dom
- $dom2 = new \PHPHtmlParser\Dom;
+ $dom2 = new \PHPHtmlParser\Dom();
$dom2->loadStr('
Our new paragraph.
'); // Resets the counter
$paragraph = $dom2->firstChild();
@@ -647,7 +646,6 @@ public function testCompatibleWithWordPressShortcode()
$node = $dom->find('p', 0);
$this->assertEquals(' [wprs_alert type="success" content="this is a short code" /] ', $node->innerHtml);
-
}
public function testBrokenHtml()
diff --git a/tests/Options/CleanupTest.php b/tests/Options/CleanupTest.php
index b7e5325e..914078ac 100755
--- a/tests/Options/CleanupTest.php
+++ b/tests/Options/CleanupTest.php
@@ -3,6 +3,7 @@
declare(strict_types=1);
use PHPHtmlParser\Dom;
+use PHPHtmlParser\Options;
use PHPUnit\Framework\TestCase;
class CleanupTest extends TestCase
@@ -10,9 +11,7 @@ class CleanupTest extends TestCase
public function testCleanupInputTrue()
{
$dom = new Dom();
- $dom->setOptions([
- 'cleanupInput' => true,
- ]);
+ $dom->setOptions((new Options())->setCleanupInput(true));
$dom->loadFromFile('tests/data/files/big.html');
$this->assertEquals(0, \count($dom->find('style')));
$this->assertEquals(0, \count($dom->find('script')));
@@ -21,9 +20,7 @@ public function testCleanupInputTrue()
public function testCleanupInputFalse()
{
$dom = new Dom();
- $dom->setOptions([
- 'cleanupInput' => false,
- ]);
+ $dom->setOptions((new Options())->setCleanupInput(false));
$dom->loadFromFile('tests/data/files/big.html');
$this->assertEquals(1, \count($dom->find('style')));
$this->assertEquals(22, \count($dom->find('script')));
@@ -32,9 +29,7 @@ public function testCleanupInputFalse()
public function testRemoveStylesTrue()
{
$dom = new Dom();
- $dom->setOptions([
- 'removeStyles' => true,
- ]);
+ $dom->setOptions((new Options())->setRemoveStyles(true));
$dom->loadFromFile('tests/data/files/big.html');
$this->assertEquals(0, \count($dom->find('style')));
}
@@ -42,9 +37,7 @@ public function testRemoveStylesTrue()
public function testRemoveStylesFalse()
{
$dom = new Dom();
- $dom->setOptions([
- 'removeStyles' => false,
- ]);
+ $dom->setOptions((new Options())->setRemoveStyles(false));
$dom->loadFromFile('tests/data/files/big.html');
$this->assertEquals(1, \count($dom->find('style')));
$this->assertEquals('text/css',
@@ -54,9 +47,7 @@ public function testRemoveStylesFalse()
public function testRemoveScriptsTrue()
{
$dom = new Dom();
- $dom->setOptions([
- 'removeScripts' => true,
- ]);
+ $dom->setOptions((new Options())->setRemoveScripts(true));
$dom->loadFromFile('tests/data/files/big.html');
$this->assertEquals(0, \count($dom->find('script')));
}
@@ -64,9 +55,7 @@ public function testRemoveScriptsTrue()
public function testRemoveScriptsFalse()
{
$dom = new Dom();
- $dom->setOptions([
- 'removeScripts' => false,
- ]);
+ $dom->setOptions((new Options())->setRemoveScripts(false));
$dom->loadFromFile('tests/data/files/big.html');
$this->assertEquals(22, \count($dom->find('script')));
$this->assertEquals('text/javascript',
@@ -85,9 +74,7 @@ public function testSmartyScripts()
public function testSmartyScriptsDisabled()
{
$dom = new Dom();
- $dom->setOptions([
- 'removeSmartyScripts' => false,
- ]);
+ $dom->setOptions((new Options())->setRemoveSmartyScripts(false));
$dom->loadStr('
aa={123}
');
diff --git a/tests/Options/PreserveLineBreaks.php b/tests/Options/PreserveLineBreaks.php
index ad095a38..be396490 100755
--- a/tests/Options/PreserveLineBreaks.php
+++ b/tests/Options/PreserveLineBreaks.php
@@ -3,6 +3,7 @@
declare(strict_types=1);
use PHPHtmlParser\Dom;
+use PHPHtmlParser\Options;
use PHPUnit\Framework\TestCase;
class PreserveLineBreaks extends TestCase
@@ -10,9 +11,8 @@ class PreserveLineBreaks extends TestCase
public function testPreserveLineBreakTrue()
{
$dom = new Dom();
- $dom->setOptions([
- 'preserveLineBreaks' => true,
- ]);
+ $dom->setOptions((new Options())->setPreserveLineBreaks(true));
+
$dom->loadStr('
');
@@ -22,9 +22,7 @@ public function testPreserveLineBreakTrue()
public function testPreserveLineBreakBeforeClosingTag()
{
$dom = new Dom();
- $dom->setOptions([
- 'preserveLineBreaks' => true,
- ]);
+ $dom->setOptions((new Options())->setPreserveLineBreaks(true));
$dom->loadStr('
');
diff --git a/tests/Options/StrictTest.php b/tests/Options/StrictTest.php
index 96d457b7..709f292d 100755
--- a/tests/Options/StrictTest.php
+++ b/tests/Options/StrictTest.php
@@ -4,6 +4,7 @@
use PHPHtmlParser\Dom;
use PHPHtmlParser\Exceptions\StrictException;
+use PHPHtmlParser\Options;
use PHPUnit\Framework\TestCase;
class StrictTest extends TestCase
@@ -11,9 +12,7 @@ class StrictTest extends TestCase
public function testConfigStrict()
{
$dom = new Dom();
- $dom->setOptions([
- 'strict' => true,
- ]);
+ $dom->setOptions((new Options())->setStrict(true));
$dom->loadStr('
Hey you
Ya you!
');
$this->assertEquals(' ', $dom->getElementById('hey')->nextSibling()->text);
}
@@ -21,9 +20,7 @@ public function testConfigStrict()
public function testConfigStrictMissingSelfClosing()
{
$dom = new Dom();
- $dom->setOptions([
- 'strict' => true,
- ]);
+ $dom->setOptions((new Options())->setStrict(true));
try {
// should throw an exception
$dom->loadStr('
Hey you
Ya you!
');
@@ -37,9 +34,7 @@ public function testConfigStrictMissingSelfClosing()
public function testConfigStrictMissingAttribute()
{
$dom = new Dom();
- $dom->setOptions([
- 'strict' => true,
- ]);
+ $dom->setOptions((new Options())->setStrict(true));
try {
// should throw an exception
$dom->loadStr('
Hey you
Ya you!
');
@@ -53,9 +48,7 @@ public function testConfigStrictMissingAttribute()
public function testConfigStrictBRTag()
{
$dom = new Dom();
- $dom->setOptions([
- 'strict' => true,
- ]);
+ $dom->setOptions((new Options())->setStrict(true));
$dom->loadStr(' ');
$this->assertTrue(true);
}
diff --git a/tests/Options/WhitespaceTextNodeTest.php b/tests/Options/WhitespaceTextNodeTest.php
index 0097f28d..245ef7f0 100755
--- a/tests/Options/WhitespaceTextNodeTest.php
+++ b/tests/Options/WhitespaceTextNodeTest.php
@@ -3,6 +3,7 @@
declare(strict_types=1);
use PHPHtmlParser\Dom;
+use PHPHtmlParser\Options;
use PHPUnit\Framework\TestCase;
class WhitespaceTextNodeTest extends TestCase
@@ -10,9 +11,7 @@ class WhitespaceTextNodeTest extends TestCase
public function testConfigGlobalNoWhitespaceTextNode()
{
$dom = new Dom();
- $dom->setOptions([
- 'whitespaceTextNode' => false,
- ]);
+ $dom->setOptions((new Options())->setWhitespaceTextNode(false));
$dom->loadStr('
Hey you
Ya you!
');
$this->assertEquals('Ya you!', $dom->getElementById('hey')->nextSibling()->text);
}
@@ -20,12 +19,8 @@ public function testConfigGlobalNoWhitespaceTextNode()
public function testConfigLocalOverride()
{
$dom = new Dom();
- $dom->setOptions([
- 'whitespaceTextNode' => false,
- ]);
- $dom->loadStr('
', $dom->find('div', 0)->innerHtml);
- }
-
public function testLoadNoValueAttribute()
{
$dom = new Dom();
@@ -239,7 +195,7 @@ public function testLoadFileBigTwicePreserveOption()
{
$dom = new Dom();
$dom->loadFromFile('tests/data/files/big.html',
- (new Options)->setPreserveLineBreaks(true));
+ (new Options())->setPreserveLineBreaks(true));
$post = $dom->find('.post-row', 0);
$this->assertEquals(
"
Журчанье воды \nЧерно-белые тени \nВновь на фонтане
', $div->outerHtml);
- }
-
public function testLoadNoOpeningTag()
{
$dom = new Dom();
@@ -81,29 +49,6 @@ public function testLoadNoOpeningTag()
$this->assertEquals('content', $dom->find('.content', 0)->text);
}
- public function testLoadNoClosingTag()
- {
- $dom = new Dom();
- $dom->loadStr('
', $dom->find('div', 0)->innerHtml);
- }
-
public function testLoadNoValueAttribute()
{
$dom = new Dom();
@@ -223,55 +168,6 @@ public function testLoadFromUrl()
$this->assertEquals('VonBurgermeister', $dom->find('.post-row div .post-user font', 0)->text);
}
- public function testToStringMagic()
- {
- $dom = new Dom();
- $dom->loadStr('
', $dom->getElementsByClass('all')[0]->innerHtml);
- }
-
public function testScriptCleanerScriptTag()
{
$dom = new Dom();
@@ -321,16 +217,6 @@ public function testCodeTag()
$this->assertEquals('hello$foo = "bar";', (string) $dom);
}
- public function testDeleteNode()
- {
- $dom = new Dom();
- $dom->loadStr('
', (string) $dom);
- }
-
public function testCountChildren()
{
$dom = new Dom();
@@ -619,4 +505,15 @@ public function testXMLOpeningToken()
$this->assertEquals('
fun time
', $dom->outerHtml);
}
+
+ /**
+ * Test to cover issue found in ticket #221
+ */
+ public function testRandomTagInMiddleOfText()
+ {
+ $dom = new Dom();
+ $dom->loadStr('
Hello, this is just a test in which <55 names with some other text > should be interpreted as text
');
+
+ $this->assertEquals('
Hello, this is just a test in which <55 names with some other text> should be interpreted as text55>
I always wondered how these thumbnails are added to magazine layout themes and I had no idea! Indeed I was more interested on the PHP code of how making it work 😛
+
It seems that with custom fields we can do pratically any customization…
+But I have 2 questions
+
Are custom fields limited to varchar values, or can we use any kind of data there? Does it have any size limit?
+
Ans what about these themes that have a stick post with a bigger image? Does it have 2 custom fields 1 for thumnail and 1 for stick image?
I would like to see a follow-up video explaining about that Custom Field Template that you told us to ignore. I know it’s created by coding in some WP hooks in functions.php as I have done it myself, but it would be cool to see your interpretation of doing this, might pick up some additional tips.
Thomas, that is a good plugin, Justin does great stuff but custom fileds can be used for a lot of things, images were just an easy example, hopefully viewers will find other creative uses. Not sure what you mean about all the css in the php?
I was just wondering about the styling, that you’ve got about 3 minutes into the video.
+
But I didn’t mean to take our eyes of off things. You’re absolutely right the custom fields can be a powerful tool. And I hadn’t really grasped how easy it is, before I saw your video. Good job.
This is very hard to follow. I admit, I’m not a newbie, but I am not terribly advanced, so I’m not sure where the problem is. One, I think you’re moving too fast. Two, when exactly should you first see the picture appear in the custom field? After you copy the url to the value field and update? or after you go into the home.php and change the code? That is completely unclear. This is such a great topic, so I hope if I post a few questions, it will become clear how to do this. I really do appreciate that you’ve posted this. Thanks!
Consciência, you could use two custom fields for that if you wanted to. It would work just fine. I’ve read about some potential performance issues if you use a lot of custom fields but haven’t experienced it myself. We use several on citycrush.com including for the post thumbnail and the image in the post after you click through. The custom field type in the DB is “longtext” maximum size 4GB.
+
Driz – We used a plugin for that previously but moved to adding it to the functions.php and using wp hooks so it sounds like you are doing exactly what we would.
+
Thomas, most of what was in there was actual logic or just spitting out the html, not css styling, we keep all of that in the stylesheet. Glad you enjoyed the video.
I’d like to suggest a subject for a future tutorial.
+
I love Drupal’s ability to use blocks above and below main content area. It is much easier to implement than WordPress widgets.
+
But I know it can be done in WordPress too. I’ve seen some magazine themes that have a “horizontal sidebar” on the botton of the page, and recently I’ve also seen a premium theme that has a “top horizontal sidebar” and a “bottom horizontal sidebar”, together with standard right and left ones.
+
It would be great if we had a tutorial teaching how to do it!
Hello all, just wanted to add one extra tidbit of info.
+
The first time you use custom fields, the “name” field is blank, so yes, you would be typing in “thumbnail” as seen in the video. But after your very first use, the “name” field will appear as a pulldown menu which displays ALL previously-used names. So you really don’t have to worry about typing it the right way every single time — just as long as you get it right the first time, you can just select it from the menu. Much faster, and it ensures you’re spelling it the same way every time.
+
I realize everyone will discover this on their own as they try it, but thought I’d mention it in case anyone was daunted by the prospect of having to be extra-careful about typing out those case-sensitive field names every time.
+
Case-sensitive is definitely important though… for one website I run, cnyradio.com , I originally used Tadlock’s “Newspaperize” theme, which used the custom keywords “thumbnail” and “image.” Later, I upgraded to a newer theme of his, but the theme was designed to seek out “Thumbnail” and “Image” with capital letters at the beginning. Rather than go through all my old posts to change the custom keywords (would have taken forever) I just changed the uppercase letters to lowercase in the theme templates.
+
If you want a good example of how different custom fields can help with your site design, check out cnyradio.com. It’s not as complex as the site shown in the video, but it’s (hopefully) still simple enough for newbies (like I was just 2 years ago) to understand.
+
My “loop” pages (home page, category pages, etc.) show 128×96 images invoked by the “thumbnail” custom field. When you click to read the full text of any post, a larger 200×150 image appears, invoked by the “image” custom field. If either field is blank or missing, then the site simply doesn’t display an image — the text takes up the entire width of the space.
+
Yes, it’s more work because I have to create 2 custom fields for each post, and I create 2 separate images. I do the latter for two main reasons. One, I don’t like relying on web browsers to resize images on-the-fly. Even if it looks OK on my computer, it may appear choppy on someone else’s.
+
Two, and more importantly, an image at 200×150 doesn’t always look so good when you simply resize it to 128×96. For example, the “fullsize” version of any mugshots I use will often include the subject’s name and a “courtesy line” to credit the photo source. But that text would be cluttered and tiny when the size is reduced, so when I make the thumbnail, I usually delete the “courtesy” line and bump up the text size of the person’s last name so it’s less cluttered and easier to read.
+
If anyone reading this does look at my site to see what I’m talking about, just a note that any “Picture of the Week” posts are done entirely differently. I won’t get into details, just wanted to avoid any confusion.
Using custom fields can be confusing to new WordPress users. Scott Ellis provides an introductory explanation of how to use custom fields for image placement and the components that go into making custom fields work from front end placement to back end utilization and code.
+ + + + August 29, 2009 at 3:42 am + | + + +
Some helpful links:
+http://codex.wordpress.org/Function_Reference/get_post_meta
++http://codex.wordpress.org/Function_Reference/post_meta_Function_Examples
LikeLike
++ + + + August 29, 2009 at 7:29 am + | + + +
Nice explanation!
+I always wondered how these thumbnails are added to magazine layout themes and I had no idea! Indeed I was more interested on the PHP code of how making it work 😛
+It seems that with custom fields we can do pratically any customization…
++But I have 2 questions
Are custom fields limited to varchar values, or can we use any kind of data there? Does it have any size limit?
+Ans what about these themes that have a stick post with a bigger image? Does it have 2 custom fields 1 for thumnail and 1 for stick image?
+tnx again!
+LikeLike
++ + + + August 29, 2009 at 2:32 pm + | + + +
Hi, Scott
+This is very helpful for those of us who aren’t programmers but want to maximize WordPress. Thanks for taking the time to submit this tip.
+LikeLike
++ + + + August 29, 2009 at 8:24 pm + | + + +
Justin Tadlocks Get the Image plugin can also help you to solve this task (also without CSS stuff in the php file 😉 ).
+LikeLike
++ + + + August 29, 2009 at 11:25 pm + | + + +
I would like to see a follow-up video explaining about that Custom Field Template that you told us to ignore. I know it’s created by coding in some WP hooks in functions.php as I have done it myself, but it would be cool to see your interpretation of doing this, might pick up some additional tips.
+LikeLike
++ + + + August 30, 2009 at 2:53 pm + | + + +
Thomas, that is a good plugin, Justin does great stuff but custom fileds can be used for a lot of things, images were just an easy example, hopefully viewers will find other creative uses. Not sure what you mean about all the css in the php?
+LikeLike
++-
+
+
Thomas Clausen
+
+
+
+
+
+
+
+
+ ++ + + + August 30, 2009 at 3:11 pm + | + + +
I was just wondering about the styling, that you’ve got about 3 minutes into the video.
+But I didn’t mean to take our eyes of off things. You’re absolutely right the custom fields can be a powerful tool. And I hadn’t really grasped how easy it is, before I saw your video. Good job.
+LikeLike
++ + + + August 30, 2009 at 5:10 pm + | + + +
This is very hard to follow. I admit, I’m not a newbie, but I am not terribly advanced, so I’m not sure where the problem is. One, I think you’re moving too fast. Two, when exactly should you first see the picture appear in the custom field? After you copy the url to the value field and update? or after you go into the home.php and change the code? That is completely unclear. This is such a great topic, so I hope if I post a few questions, it will become clear how to do this. I really do appreciate that you’ve posted this. Thanks!
+LikeLike
++ + + + August 30, 2009 at 5:13 pm + | + + +
also, my home.php doesn’t have the code you show. I am working in the theme Constructor. below is all the text in the home.php file:
+LikeLike
++ + + + August 30, 2009 at 5:38 pm + | + + +
Woopsie & sorry. below is the code in my file:
+LikeLike
++ + + + September 2, 2009 at 10:26 pm + | + + +
Consciência, you could use two custom fields for that if you wanted to. It would work just fine. I’ve read about some potential performance issues if you use a lot of custom fields but haven’t experienced it myself. We use several on citycrush.com including for the post thumbnail and the image in the post after you click through. The custom field type in the DB is “longtext” maximum size 4GB.
+Driz – We used a plugin for that previously but moved to adding it to the functions.php and using wp hooks so it sounds like you are doing exactly what we would.
+Thomas, most of what was in there was actual logic or just spitting out the html, not css styling, we keep all of that in the stylesheet. Glad you enjoyed the video.
+Karen, sorry it felt fast, if you look at an example and watch the video I’m sure you’ll pick it up quickly. It took me a couple of rounds the first time I started playing with custom fields. Justin Tadlock has a good explanation here as well: http://justintadlock.com/archives/2007/10/24/using-wordpress-custom-fields-introduction.
++The pictures will appear on the page where your custom field spits them out once you save the image url in the appropriate custom field. FYI, you code didn’t show up so visit http://www.vsellis.com/wordpress-how-to/using-custom-fields-in-wordpress/ and leave a comment and I’ll take a closer look.
LikeLike
++ + + + September 10, 2009 at 4:33 pm + | + + +
Thanks for the reply!
+I’d like to suggest a subject for a future tutorial.
+I love Drupal’s ability to use blocks above and below main content area. It is much easier to implement than WordPress widgets.
+But I know it can be done in WordPress too. I’ve seen some magazine themes that have a “horizontal sidebar” on the botton of the page, and recently I’ve also seen a premium theme that has a “top horizontal sidebar” and a “bottom horizontal sidebar”, together with standard right and left ones.
+It would be great if we had a tutorial teaching how to do it!
+LikeLike
++ + + + September 12, 2009 at 7:21 am + | + + +
Hello all, just wanted to add one extra tidbit of info.
+The first time you use custom fields, the “name” field is blank, so yes, you would be typing in “thumbnail” as seen in the video. But after your very first use, the “name” field will appear as a pulldown menu which displays ALL previously-used names. So you really don’t have to worry about typing it the right way every single time — just as long as you get it right the first time, you can just select it from the menu. Much faster, and it ensures you’re spelling it the same way every time.
+I realize everyone will discover this on their own as they try it, but thought I’d mention it in case anyone was daunted by the prospect of having to be extra-careful about typing out those case-sensitive field names every time.
+Case-sensitive is definitely important though… for one website I run, cnyradio.com , I originally used Tadlock’s “Newspaperize” theme, which used the custom keywords “thumbnail” and “image.” Later, I upgraded to a newer theme of his, but the theme was designed to seek out “Thumbnail” and “Image” with capital letters at the beginning. Rather than go through all my old posts to change the custom keywords (would have taken forever) I just changed the uppercase letters to lowercase in the theme templates.
+If you want a good example of how different custom fields can help with your site design, check out cnyradio.com. It’s not as complex as the site shown in the video, but it’s (hopefully) still simple enough for newbies (like I was just 2 years ago) to understand.
+My “loop” pages (home page, category pages, etc.) show 128×96 images invoked by the “thumbnail” custom field. When you click to read the full text of any post, a larger 200×150 image appears, invoked by the “image” custom field. If either field is blank or missing, then the site simply doesn’t display an image — the text takes up the entire width of the space.
+Yes, it’s more work because I have to create 2 custom fields for each post, and I create 2 separate images. I do the latter for two main reasons. One, I don’t like relying on web browsers to resize images on-the-fly. Even if it looks OK on my computer, it may appear choppy on someone else’s.
+Two, and more importantly, an image at 200×150 doesn’t always look so good when you simply resize it to 128×96. For example, the “fullsize” version of any mugshots I use will often include the subject’s name and a “courtesy line” to credit the photo source. But that text would be cluttered and tiny when the size is reduced, so when I make the thumbnail, I usually delete the “courtesy” line and bump up the text size of the person’s last name so it’s less cluttered and easier to read.
+If anyone reading this does look at my site to see what I’m talking about, just a note that any “Picture of the Week” posts are done entirely differently. I won’t get into details, just wanted to avoid any confusion.
+LikeLike
++ + + + September 25, 2009 at 1:09 pm + | + + +
can we add custom fields to wordpress.com blogs?
+LikeLike
++-
+
+
Ryan Markel
+
+
+
+
+
+
+
+
+ ++ + + + September 28, 2009 at 3:24 am + | + + +
No; you cannot.
+LikeLike
++ + + + November 8, 2009 at 5:38 am + | + + +
I guess never say never, huh?
+LikeLike
+