Skip to content

Commit 7b31b45

Browse files
committed
Added feature paquettg#164
1 parent 7f3c253 commit 7b31b45

File tree

9 files changed

+92
-16
lines changed

9 files changed

+92
-16
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1313
- Added new option `depthFirstSearch`.
1414
- Deprecated option `depthFirstSearch` and marked for removal in `3.0.0`.
1515
- Added multi class selections support.
16-
- Added case insensitive attribute matching
16+
- Added case insensitive attribute matching.
17+
- Added new option `htmlSpecialCharsDecode`.
1718

1819
### Changed
1920
- Started using a changelog.

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,10 @@ By default this is set to `false` for legacy support. Setting this to `true` wil
188188

189189
This option is deprecated and will be removed in version `3.0.0` with the new behavior being as if it was set to `true`.
190190

191+
**htmlSpecialCharsDecode**
192+
193+
By default this is set to `false`. Setting this to `true` will apply the PHP function `htmlspecialchars_decode` to all attribute values and text nodes.
194+
191195
Static Facade
192196
-------------
193197

src/PHPHtmlParser/Dom.php

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -533,6 +533,7 @@ protected function parse(): void
533533
{
534534
// add the root node
535535
$this->root = new HtmlNode('root');
536+
$this->root->setHtmlSpecialCharsDecode($this->options->htmlSpecialCharsDecode);
536537
$activeNode = $this->root;
537538
while ( ! is_null($activeNode)) {
538539
$str = $this->content->copyUntil('<');
@@ -580,6 +581,7 @@ protected function parse(): void
580581
) {
581582
// we found text we care about
582583
$textNode = new TextNode($str, $this->options->removeDoubleSpace);
584+
$textNode->setHtmlSpecialCharsDecode($this->options->htmlSpecialCharsDecode);
583585
$activeNode->addChild($textNode);
584586
}
585587
}
@@ -634,6 +636,7 @@ protected function parseTag(): array
634636
return $return;
635637
}
636638
$node = new HtmlNode($tag);
639+
$node->setHtmlSpecialCharsDecode($this->options->htmlSpecialCharsDecode);
637640

638641
// attributes
639642
while ($this->content->char() != '>' &&

src/PHPHtmlParser/Dom/AbstractNode.php

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,11 @@ abstract class AbstractNode
6565
*/
6666
protected $children = [];
6767

68+
/**
69+
* @var bool
70+
*/
71+
protected $htmlSpecialCharsDecode = false;
72+
6873
/**
6974
* Creates a unique id for this node.
7075
*/
@@ -123,6 +128,16 @@ public function __toString()
123128
return $this->outerHtml();
124129
}
125130

131+
/**
132+
* @param bool $htmlSpecialCharsDecode
133+
* @return void
134+
*/
135+
public function setHtmlSpecialCharsDecode($htmlSpecialCharsDecode = false): void
136+
{
137+
$this->htmlSpecialCharsDecode = $htmlSpecialCharsDecode;
138+
}
139+
140+
126141
/**
127142
* Reset node counter
128143
*

src/PHPHtmlParser/Dom/HtmlNode.php

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,16 @@ public function __construct($tag)
5555
parent::__construct();
5656
}
5757

58+
/**
59+
* @param bool $htmlSpecialCharsDecode
60+
* @return void
61+
*/
62+
public function setHtmlSpecialCharsDecode($htmlSpecialCharsDecode = false): void
63+
{
64+
parent::setHtmlSpecialCharsDecode($htmlSpecialCharsDecode);
65+
$this->tag->setHtmlSpecialCharsDecode($htmlSpecialCharsDecode);
66+
}
67+
5868
/**
5969
* Gets the inner html of this node.
6070
*

src/PHPHtmlParser/Dom/Tag.php

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,11 @@ class Tag
5252
*/
5353
protected $encode = null;
5454

55+
/**
56+
* @var bool
57+
*/
58+
private $HtmlSpecialCharsDecode = false;
59+
5560
/**
5661
* Sets up the tag with a name.
5762
*
@@ -142,6 +147,15 @@ public function setEncoding(Encode $encode): void
142147
$this->encode = $encode;
143148
}
144149

150+
/**
151+
* @param bool $htmlSpecialCharsDecode
152+
* @return void
153+
*/
154+
public function setHtmlSpecialCharsDecode($htmlSpecialCharsDecode = false): void
155+
{
156+
$this->HtmlSpecialCharsDecode = $htmlSpecialCharsDecode;
157+
}
158+
145159
/**
146160
* Sets the noise for this tag (if any)
147161
*
@@ -173,6 +187,9 @@ public function setAttribute(string $key, $value): Tag
173187
'doubleQuote' => true,
174188
];
175189
}
190+
if ($this->HtmlSpecialCharsDecode) {
191+
$value['value'] = htmlspecialchars_decode($value['value']);
192+
}
176193
$this->attr[$key] = $value;
177194

178195
return $this;
@@ -186,7 +203,6 @@ public function setAttribute(string $key, $value): Tag
186203
*/
187204
public function setStyleAttributeValue($attr_key, $attr_value): void
188205
{
189-
190206
$style_array = $this->getStyleAttributeArray();
191207
$style_array[$attr_key] = $attr_value;
192208

src/PHPHtmlParser/Dom/TextNode.php

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,28 +51,43 @@ public function __construct(string $text, $removeDoubleSpace = true)
5151
parent::__construct();
5252
}
5353

54+
/**
55+
* @param bool $htmlSpecialCharsDecode
56+
* @return void
57+
*/
58+
public function setHtmlSpecialCharsDecode($htmlSpecialCharsDecode = false): void
59+
{
60+
parent::setHtmlSpecialCharsDecode($htmlSpecialCharsDecode);
61+
$this->tag->setHtmlSpecialCharsDecode($htmlSpecialCharsDecode);
62+
}
63+
5464
/**
5565
* Returns the text of this node.
5666
*
5767
* @return string
5868
*/
5969
public function text(): string
6070
{
71+
if ($this->htmlSpecialCharsDecode) {
72+
$text = htmlspecialchars_decode($this->text);
73+
} else {
74+
$text = $this->text;
75+
}
6176
// convert charset
6277
if ( ! is_null($this->encode)) {
6378
if ( ! is_null($this->convertedText)) {
6479
// we already know the converted value
6580
return $this->convertedText;
6681
}
67-
$text = $this->encode->convert($this->text);
82+
$text = $this->encode->convert($text);
6883

6984
// remember the conversion
7085
$this->convertedText = $text;
7186

7287
return $text;
73-
} else {
74-
return $this->text;
7588
}
89+
90+
return $text;
7691
}
7792

7893
/**
@@ -84,7 +99,6 @@ public function text(): string
8499
public function setText(string $text): void
85100
{
86101
$this->text = $text;
87-
88102
if ( ! is_null($this->encode)) {
89103
$text = $this->encode->convert($text);
90104

src/PHPHtmlParser/Options.php

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
* @property bool removeDoubleSpace
1616
* @property bool removeSmartyScripts
1717
* @property bool depthFirstSearch
18+
* @property bool htmlSpecialCharsDecode
1819
*/
1920
class Options
2021
{
@@ -25,16 +26,17 @@ class Options
2526
* @param array
2627
*/
2728
protected $defaults = [
28-
'whitespaceTextNode' => true,
29-
'strict' => false,
30-
'enforceEncoding' => null,
31-
'cleanupInput' => true,
32-
'removeScripts' => true,
33-
'removeStyles' => true,
34-
'preserveLineBreaks' => false,
35-
'removeDoubleSpace' => true,
36-
'removeSmartyScripts' => true,
37-
'depthFirstSearch' => false,
29+
'whitespaceTextNode' => true,
30+
'strict' => false,
31+
'enforceEncoding' => null,
32+
'cleanupInput' => true,
33+
'removeScripts' => true,
34+
'removeStyles' => true,
35+
'preserveLineBreaks' => false,
36+
'removeDoubleSpace' => true,
37+
'removeSmartyScripts' => true,
38+
'depthFirstSearch' => false,
39+
'htmlSpecialCharsDecode' => false,
3840
];
3941

4042
/**

tests/DomTest.php

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,17 @@ public function testGetComplexAttribute()
404404
$this->assertEquals('?search=Fort+William&session_type=face&distance=100&uqs=119846&page=4', $href);
405405
}
406406

407+
public function testGetComplexAttributeHtmlSpecialCharsDecode()
408+
{
409+
$dom = new Dom;
410+
$dom->setOptions(['htmlSpecialCharsDecode' => true]);
411+
$dom->load('<a href="?search=Fort+William&amp;session_type=face&amp;distance=100&amp;uqs=119846&amp;page=4" class="pagination-next">Next <span class="chevron">&gt;</span></a>');
412+
$a = $dom->find('a', 0);
413+
$this->assertEquals('Next <span class="chevron">></span>', $a->innerHtml);
414+
$href = $a->href;
415+
$this->assertEquals('?search=Fort+William&session_type=face&distance=100&uqs=119846&page=4', $href);
416+
}
417+
407418
public function testGetChildrenNoChildren()
408419
{
409420
$dom = new Dom();

0 commit comments

Comments
 (0)