Skip to content

Commit e570797

Browse files
committed
Added preserve line break option. Defaults to false.
fixes paquettg#40
1 parent 3a80041 commit e570797

File tree

5 files changed

+46
-3
lines changed

5 files changed

+46
-3
lines changed

README.md

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,12 +125,36 @@ $dom->load('http://google.com', [
125125
$dom->load('http://gmail.com'); // will not have whitespaceTextNode set to false.
126126
```
127127

128-
At the moment we support 3 options: strict, whitespaceTextNode and enforceEncoding. Strict, by default false, will throw a `StrictException` if it finds that the HTML is not strictly compliant (all tags must have a closing tag, no attribute without a value, etc.).
128+
At the moment we support 7 options.
129+
130+
**Strict**
131+
132+
Strict, by default false, will throw a `StrictException` if it finds that the HTML is not strictly compliant (all tags must have a closing tag, no attribute without a value, etc.).
133+
134+
**whitespaceTextNode**
129135

130136
The whitespaceTextNode option, by default true, tells the parser to save text nodes even if the content of the node is empty (only whitespace). Setting it to false will ignore all whitespace-only text nodes found in the document.
131137

138+
**enforceEncoding**
139+
132140
The enforceEncoding option, by default null, will enforce a character set to be used for reading the content and returning the content in that encoding. Setting it to null will trigger an attempt to figure out the encoding from within the content of the string given instead.
133141

142+
**cleanupInput**
143+
144+
Set this to `false` to skip the entire clean up phase of the parser. If this is set to `false` the next 3 options will be ignored. Defaults to `true`.
145+
146+
**removeScripts**
147+
148+
Set this to `false` to skip removing the script tags from the document body. This might have adverse effects. Defaults to `true`.
149+
150+
**removeStyles**
151+
152+
Set this to `false` to skip removing the style tags from the document body. This might have adverse effects. Defaults to `true`.
153+
154+
**preserveLineBreaks**
155+
156+
Preserves Line Breaks if set to `true`. If set to `false` line breaks are cleaned up as part of the input clean up process. Defaults to `false`.
157+
134158
Static Facade
135159
-------------
136160

src/PHPHtmlParser/Dom.php

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -366,7 +366,12 @@ protected function clean($str)
366366
}
367367

368368
// clean out the \n\r
369-
$str = str_replace(["\r\n", "\r", "\n"], ' ', $str);
369+
$replace = ' ';
370+
if ($this->options->get('preserveLineBreaks'))
371+
{
372+
$replace = '&#10';
373+
}
374+
$str = str_replace(["\r\n", "\r", "\n"], $replace, $str);
370375

371376
// strip the doctype
372377
$str = mb_eregi_replace("<!doctype(.*?)>", '', $str);

src/PHPHtmlParser/Dom/TextNode.php

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@ public function __construct($text)
3939
// remove double spaces
4040
$text = mb_ereg_replace('\s+', ' ', $text);
4141

42+
// restore line breaks
43+
$text = str_replace('&#10', "\n", $text);
44+
4245
$this->text = $text;
4346
$this->tag = new Tag('text');
4447
parent::__construct();

src/PHPHtmlParser/Options.php

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ class Options {
2222
'enforceEncoding' => null,
2323
'cleanupInput' => true,
2424
'removeScripts' => true,
25-
'removeStyles' => true
25+
'removeStyles' => true,
26+
'preserveLineBreaks' => false,
2627
];
2728

2829
/**

tests/DomTest.php

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,16 @@ public function testLoadFileBigTwice()
187187
$this->assertEquals(' <p>Журчанье воды<br /> Черно-белые тени<br /> Вновь на фонтане</p> ', $post->find('.post-message', 0)->innerHtml);
188188
}
189189

190+
public function testLoadFileBigTwicePreserveOption()
191+
{
192+
$dom = new Dom;
193+
$dom->loadFromFile('tests/files/big.html', ['preserveLineBreaks' => true]);
194+
$post = $dom->find('.post-row', 0);
195+
$this->assertEquals('<p>Журчанье воды<br />
196+
Черно-белые тени<br />
197+
Вновь на фонтане</p>', trim($post->find('.post-message', 0)->innerHtml));
198+
}
199+
190200
public function testLoadFromUrl()
191201
{
192202
$curl = Mockery::mock('PHPHtmlParser\CurlInterface');

0 commit comments

Comments
 (0)