Skip to content

Commit 8057f8e

Browse files
committed
Added options: cleanupInput, removeScripts and removeStyles
1 parent aecc9a0 commit 8057f8e

File tree

4 files changed

+111
-21
lines changed

4 files changed

+111
-21
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
vendor
2+
composer.lock

src/PHPHtmlParser/Dom.php

Lines changed: 31 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
use stringEncode\Encode;
99

1010
class Dom {
11-
11+
1212
/**
1313
* The charset we would like the output to be in.
1414
*
@@ -43,7 +43,7 @@ class Dom {
4343
* @var int
4444
*/
4545
protected $rawSize;
46-
46+
4747
/**
4848
* The size of the document after it is cleaned.
4949
*
@@ -59,7 +59,7 @@ class Dom {
5959
protected $globalOptions = [];
6060

6161
/**
62-
* A persistent option object to be used for all options in the
62+
* A persistent option object to be used for all options in the
6363
* parsing of the file.
6464
*
6565
* @var Options
@@ -232,7 +232,7 @@ public function addSelfClosingTag($tag)
232232
}
233233
return $this;
234234
}
235-
235+
236236
/**
237237
* Removes the tag (or tags in an array) from the list of tags that will
238238
* always be self closing.
@@ -297,7 +297,7 @@ public function getElementById($id)
297297
}
298298

299299
/**
300-
* Simple wrapper function that returns all elements by
300+
* Simple wrapper function that returns all elements by
301301
* tag name.
302302
*
303303
* @param string $name
@@ -343,6 +343,12 @@ protected function isLoaded()
343343
*/
344344
protected function clean($str)
345345
{
346+
if ($this->options->get('cleanupInput') != true)
347+
{
348+
// skip entire cleanup step
349+
return $str;
350+
}
351+
346352
// clean out the \n\r
347353
$str = str_replace(["\r\n", "\r", "\n"], ' ', $str);
348354

@@ -351,24 +357,30 @@ protected function clean($str)
351357

352358
// strip out comments
353359
$str = mb_eregi_replace("<!--(.*?)-->", '', $str);
354-
360+
355361
// strip out cdata
356362
$str = mb_eregi_replace("<!\[CDATA\[(.*?)\]\]>", '', $str);
357-
363+
358364
// strip out <script> tags
359-
$str = mb_eregi_replace("<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>", '', $str);
360-
$str = mb_eregi_replace("<\s*script\s*>(.*?)<\s*/\s*script\s*>", '', $str);
361-
365+
if ($this->options->get('removeScripts') == true)
366+
{
367+
$str = mb_eregi_replace("<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>", '', $str);
368+
$str = mb_eregi_replace("<\s*script\s*>(.*?)<\s*/\s*script\s*>", '', $str);
369+
}
370+
362371
// strip out <style> tags
363-
$str = mb_eregi_replace("<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>", '', $str);
364-
$str = mb_eregi_replace("<\s*style\s*>(.*?)<\s*/\s*style\s*>", '', $str);
365-
372+
if ($this->options->get('removeStyles') == true)
373+
{
374+
$str = mb_eregi_replace("<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>", '', $str);
375+
$str = mb_eregi_replace("<\s*style\s*>(.*?)<\s*/\s*style\s*>", '', $str);
376+
}
377+
366378
// strip out preformatted tags
367379
$str = mb_eregi_replace("<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>", '', $str);
368-
380+
369381
// strip out server side scripts
370382
$str = mb_eregi_replace("(<\?)(.*?)(\?>)", '', $str);
371-
383+
372384
// strip smarty scripts
373385
$str = mb_eregi_replace("(\{\w)(.*?)(\})", '', $str);
374386

@@ -469,7 +481,7 @@ protected function parseTag()
469481
// move to end of tag
470482
$this->content->copyUntil('>');
471483
$this->content->fastForward(1);
472-
484+
473485
// check if this closing tag counts
474486
$tag = strtolower($tag);
475487
if (in_array($tag, $this->selfClosing))
@@ -570,7 +582,7 @@ protected function parseTag()
570582
}
571583
elseif (in_array($tag, $this->selfClosing))
572584
{
573-
585+
574586
// Should be a self closing tag, check if we are strict
575587
if ( $this->options->strict)
576588
{
@@ -581,7 +593,7 @@ protected function parseTag()
581593
// We force self closing on this tag.
582594
$node->getTag()->selfClosing();
583595
}
584-
596+
585597
$this->content->fastForward(1);
586598

587599
$return['status'] = true;
@@ -630,7 +642,7 @@ protected function detectCharset()
630642
$this->root->propagateEncoding($encode);
631643
return true;
632644
}
633-
645+
634646
// no charset found
635647
$this->root->propagateEncoding($encode);
636648
return false;

src/PHPHtmlParser/Options.php

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,13 +12,16 @@ class Options {
1212

1313
/**
1414
* The default options array
15-
*
15+
*
1616
* @param array
1717
*/
1818
protected $defaults = [
1919
'whitespaceTextNode' => true,
2020
'strict' => false,
2121
'enforceEncoding' => null,
22+
'cleanupInput' => true,
23+
'removeScripts' => true,
24+
'removeStyles' => true
2225
];
2326

2427
/**
@@ -76,7 +79,7 @@ public function get($key)
7679
{
7780
return $this->options[$key];
7881
}
79-
82+
8083
return null;
8184
}
8285
}

tests/Options/CleanupTest.php

Lines changed: 73 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,73 @@
1+
<?php
2+
3+
use PHPHtmlParser\Dom;
4+
5+
/**
 * Exercises the `cleanupInput`, `removeScripts` and `removeStyles` options
 * by loading the same fixture with each option toggled and counting the
 * <style> / <script> elements that remain in the parsed document.
 */
class CleanupTest extends PHPUnit_Framework_TestCase {

    /**
     * Builds a Dom configured with the given options and loads the shared
     * fixture into it.
     *
     * The fixture is located relative to this test file instead of the
     * current working directory, so the suite gives the same result no
     * matter which directory PHPUnit is started from.
     *
     * @param array $options Options passed straight to Dom::setOptions().
     * @return Dom
     */
    private function loadFixture(array $options)
    {
        $dom = new Dom;
        $dom->setOptions($options);
        $dom->loadFromFile(__DIR__.'/../files/horrible.html');

        return $dom;
    }

    /**
     * With cleanup enabled, no <style> or <script> element is expected to
     * be found in the parsed document.
     */
    public function testCleanupInputTrue()
    {
        $dom = $this->loadFixture(['cleanupInput' => true]);

        $this->assertCount(0, $dom->find('style'));
        $this->assertCount(0, $dom->find('script'));
    }

    /**
     * With cleanup disabled, exactly one <style> and one <script> element
     * are expected to be found.
     */
    public function testCleanupInputFalse()
    {
        $dom = $this->loadFixture(['cleanupInput' => false]);

        $this->assertCount(1, $dom->find('style'));
        $this->assertCount(1, $dom->find('script'));
    }

    /**
     * With `removeStyles` enabled, no <style> element is expected.
     */
    public function testRemoveStylesTrue()
    {
        $dom = $this->loadFixture(['removeStyles' => true]);

        $this->assertCount(0, $dom->find('style'));
    }

    /**
     * With `removeStyles` disabled, the single <style> element is expected
     * to be found with its `type` attribute readable.
     */
    public function testRemoveStylesFalse()
    {
        $dom = $this->loadFixture(['removeStyles' => false]);

        $styles = $dom->find('style');
        $this->assertCount(1, $styles);
        $this->assertEquals('text/css', $styles->getAttribute('type'));
    }

    /**
     * With `removeScripts` enabled, no <script> element is expected.
     */
    public function testRemoveScriptsTrue()
    {
        $dom = $this->loadFixture(['removeScripts' => true]);

        $this->assertCount(0, $dom->find('script'));
    }

    /**
     * With `removeScripts` disabled, the single <script> element is
     * expected to be found with its `type` attribute readable.
     */
    public function testRemoveScriptsFalse()
    {
        $dom = $this->loadFixture(['removeScripts' => false]);

        $scripts = $dom->find('script');
        $this->assertCount(1, $scripts);
        $this->assertEquals('text/JavaScript', $scripts->getAttribute('type'));
    }

}

0 commit comments

Comments
 (0)