Skip to content

Commit 04321f9

Browse files
committed
fixes paquettg#82
1 parent 71c3758 commit 04321f9

File tree

7 files changed

+3096
-23
lines changed

7 files changed

+3096
-23
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1010
### Added
1111
- Added support for PSR7 HTTP clients and requests for URL calls.
1212
- Added PHAN support and fixed all issues from PHAN.
13+
- Added support for html5 charset detection.
1314

1415
### Changed
1516
- Fixed issue with \ causing an infinite loop.
1617
- CDATA should not be altered when cleanupInput is false.
1718
- Added tag attribute DTO.
1819
- Cleaned up the selector logic.
20+
- Fixed issue with greedy regex for charset detection.
1921

2022
### Removed
2123
- Removed curl interface and curl implementation.

src/PHPHtmlParser/Dom.php

Lines changed: 38 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -39,57 +39,57 @@ class Dom
3939
*
4040
* @var string
4141
*/
42-
protected $defaultCharset = 'UTF-8';
42+
private $defaultCharset = 'UTF-8';
4343

4444
/**
4545
* The raw version of the document string.
4646
*
4747
* @var string
4848
*/
49-
protected $raw;
49+
private $raw;
5050

5151
/**
5252
* The document string.
5353
*
5454
* @var Content
5555
*/
56-
protected $content;
56+
private $content;
5757

5858
/**
5959
* The original file size of the document.
6060
*
6161
* @var int
6262
*/
63-
protected $rawSize;
63+
private $rawSize;
6464

6565
/**
6666
* The size of the document after it is cleaned.
6767
*
6868
* @var int
6969
*/
70-
protected $size;
70+
private $size;
7171

7272
/**
7373
* A global options array to be used by all load calls.
7474
*
7575
* @var array
7676
*/
77-
protected $globalOptions = [];
77+
private $globalOptions = [];
7878

7979
/**
8080
* A persistent option object to be used for all options in the
8181
* parsing of the file.
8282
*
8383
* @var Options
8484
*/
85-
protected $options;
85+
private $options;
8686

8787
/**
8888
* A list of tags which will always be self closing.
8989
*
9090
* @var array
9191
*/
92-
protected $selfClosing = [
92+
private $selfClosing = [
9393
'area',
9494
'base',
9595
'basefont',
@@ -114,7 +114,7 @@ class Dom
114114
*
115115
* @var array
116116
*/
117-
protected $noSlash = [];
117+
private $noSlash = [];
118118

119119
/**
120120
* Returns the inner html of the root node.
@@ -173,7 +173,7 @@ public function load(string $str, array $options = []): Dom
173173
*/
174174
public function loadFromFile(string $file, array $options = []): Dom
175175
{
176-
$content = \file_get_contents($file);
176+
$content = @\file_get_contents($file);
177177
if ($content === false) {
178178
throw new LogicalException('file_get_contents failed and returned false when trying to read "' . $file . '".');
179179
}
@@ -496,7 +496,7 @@ public function getElementsByClass(string $class)
496496
*
497497
* @throws NotLoadedException
498498
*/
499-
protected function isLoaded(): void
499+
private function isLoaded(): void
500500
{
501501
if (\is_null($this->content)) {
502502
throw new NotLoadedException('Content is not loaded!');
@@ -506,7 +506,7 @@ protected function isLoaded(): void
506506
/**
507507
* Cleans the html of any none-html information.
508508
*/
509-
protected function clean(string $str): string
509+
private function clean(string $str): string
510510
{
511511
if ($this->options->get('cleanupInput') != true) {
512512
// skip entire cleanup step
@@ -610,7 +610,7 @@ protected function clean(string $str): string
610610
* @throws StrictException
611611
* @throws LogicalException
612612
*/
613-
protected function parse(): void
613+
private function parse(): void
614614
{
615615
// add the root node
616616
$this->root = new HtmlNode('root');
@@ -679,7 +679,7 @@ protected function parse(): void
679679
*
680680
* @throws StrictException
681681
*/
682-
protected function parseTag(): array
682+
private function parseTag(): array
683683
{
684684
$return = [
685685
'status' => false,
@@ -823,7 +823,7 @@ protected function parseTag(): array
823823
*
824824
* @throws ChildNotFoundException
825825
*/
826-
protected function detectCharset(): bool
826+
private function detectCharset(): bool
827827
{
828828
// set the default
829829
$encode = new Encode();
@@ -841,11 +841,15 @@ protected function detectCharset(): bool
841841

842842
/** @var AbstractNode $meta */
843843
$meta = $this->root->find('meta[http-equiv=Content-Type]', 0);
844-
if (\is_null($meta)) {
845-
// could not find meta tag
846-
$this->root->propagateEncoding($encode);
844+
if ($meta == null) {
845+
if (!$this->detectHTML5Charset($encode)) {
846+
// could not find meta tag
847+
$this->root->propagateEncoding($encode);
847848

848-
return false;
849+
return false;
850+
}
851+
852+
return true;
849853
}
850854
$content = $meta->getAttribute('content');
851855
if (\is_null($content)) {
@@ -855,7 +859,7 @@ protected function detectCharset(): bool
855859
return false;
856860
}
857861
$matches = [];
858-
if (\preg_match('/charset=(.+)/', $content, $matches)) {
862+
if (\preg_match('/charset=([^;]+)/', $content, $matches)) {
859863
$encode->from(\trim($matches[1]));
860864
$this->root->propagateEncoding($encode);
861865

@@ -867,4 +871,18 @@ protected function detectCharset(): bool
867871

868872
return false;
869873
}
874+
875+
/**
 * Attempts to detect the document charset from an HTML5 style meta tag,
 * i.e. the first node matching the selector "meta[charset]".
 *
 * On success the given encoder is pointed at the detected charset and the
 * encoding is propagated from the root node. On failure nothing is
 * propagated here; the caller is expected to fall back to its default.
 *
 * @param Encode $encode the encoder to configure with the detected charset
 *
 * @return bool true when a usable charset was found and applied, false otherwise
 */
private function detectHTML5Charset(Encode $encode): bool
{
    /** @var AbstractNode|null $meta */
    $meta = $this->root->find('meta[charset]', 0);
    if ($meta == null) {
        return false;
    }

    // The attribute can be present without a usable value (e.g. <meta charset>).
    // Passing null to trim() is a TypeError under strict types, and an empty
    // charset is meaningless to the encoder, so treat both as "not detected".
    $charset = $meta->getAttribute('charset');
    if (!\is_string($charset)) {
        return false;
    }

    $charset = \trim($charset);
    if ($charset === '') {
        return false;
    }

    $encode->from($charset);
    $this->root->propagateEncoding($encode);

    return true;
}
870888
}

src/PHPHtmlParser/Selector/Seeker.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -314,8 +314,8 @@ private function match(
314314
}
315315

316316
return \preg_match('/' . $pattern . '/i', $value) == 1;
317+
default:
318+
return false;
317319
}
318-
319-
return false;
320320
}
321321
}

tests/DomTest.php

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,13 @@ public function testLoadFromFileFind()
196196
$this->assertEquals('VonBurgermeister', $dom->find('.post-row div .post-user font', 0)->text);
197197
}
198198

199+
/**
 * Loading from a path that does not exist must raise a LogicalException.
 */
public function testLoadFromFileNotFound()
{
    $document = new Dom();

    $this->expectException(\PHPHtmlParser\Exceptions\LogicalException::class);

    $document->loadFromFile('tests/data/files/unkowne.html');
}
205+
199206
public function testLoadUtf8()
200207
{
201208
$dom = new Dom();
@@ -531,6 +538,60 @@ public function testMultipleSquareSelector()
531538
$this->assertEquals(1, \count($items));
532539
}
533540

541+
/**
 * The "!=" attribute operator: an input whose type is "text" is expected
 * to be matched once by "input[type!=foo]".
 */
public function testNotSquareSelector()
{
    $document = new Dom();
    $document->load('<input name="foo" type="text" baz="fig">');

    $matches = $document->find('input[type!=foo]');

    $this->assertEquals(1, \count($matches));
}
549+
550+
/**
 * The "^=" attribute operator: name="foo" is expected to be matched once
 * by "input[name^=f]".
 */
public function testStartSquareSelector()
{
    $document = new Dom();
    $document->load('<input name="foo" type="text" baz="fig">');

    $matches = $document->find('input[name^=f]');

    $this->assertEquals(1, \count($matches));
}
558+
559+
/**
 * The "$=" attribute operator: baz="fig" is expected to be matched once
 * by "input[baz$=g]".
 */
public function testEndSquareSelector()
{
    $document = new Dom();
    $document->load('<input name="foo" type="text" baz="fig">');

    $matches = $document->find('input[baz$=g]');

    $this->assertEquals(1, \count($matches));
}
567+
568+
/**
 * The "*=" attribute operator with a bare "*" value: the single input is
 * expected to be matched once by "input[baz*=*]".
 */
public function testStarSquareSelector()
{
    $document = new Dom();
    $document->load('<input name="foo" type="text" baz="fig">');

    $matches = $document->find('input[baz*=*]');

    $this->assertEquals(1, \count($matches));
}
576+
577+
/**
 * The "*=" attribute operator with a delimited regex value: the single
 * input is expected to be matched once by "input[baz*=/\w+/]".
 */
public function testStarFullRegexSquareSelector()
{
    $document = new Dom();
    $document->load('<input name="foo" type="text" baz="fig">');

    $matches = $document->find('input[baz*=/\w+/]');

    $this->assertEquals(1, \count($matches));
}
585+
586+
/**
 * Runs a selector that uses "%=" between attribute name and value and
 * expects exactly one result for the single input element.
 *
 * NOTE(review): presumably "%=" is not a recognised operator (hence
 * "Failed" in the test name) and the selector degrades to a broader
 * match - confirm against the selector parser.
 */
public function testFailedSquareSelector()
{
    $document = new Dom();
    $document->load('<input name="foo" type="text" baz="fig">');

    $matches = $document->find('input[baz%=g]');

    $this->assertEquals(1, \count($matches));
}
594+
534595
public function testLoadGetAttributeWithBackslash()
535596
{
536597
$dom = new Dom();
@@ -547,4 +608,14 @@ public function test25ChildrenFound()
547608
$children = $dom->find('#red-line-g *');
548609
$this->assertEquals(25, \count($children));
549610
}
611+
612+
/**
 * Loads the HTML5 fixture file and checks that the first
 * "div.d-inline-block" element exposes the expected style attribute.
 */
public function testHtml5PageLoad()
{
    $dom = new Dom();
    $dom->loadFromFile('tests/data/files/html5.html');

    /** @var Dom\AbstractNode|null $div */
    $div = $dom->find('div.d-inline-block', 0);

    // Fail with a readable assertion instead of a fatal "call on null"
    // when the fixture could not be parsed into the expected element.
    $this->assertNotNull($div);
    $this->assertEquals('max-width: 29px', $div->getAttribute('style'));
}
550621
}

tests/Selector/SeekerTest.php

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
use PHPHtmlParser\DTO\Selector\RuleDTO;
6+
use PHPHtmlParser\Selector\Seeker;
7+
use PHPUnit\Framework\TestCase;
8+
9+
/**
 * Tests for PHPHtmlParser\Selector\Seeker.
 */
class SeekerTest extends TestCase
{
    /**
     * Calling seek() with an empty first argument is expected to yield a
     * result with zero entries.
     */
    public function testSeekReturnEmptyArray()
    {
        $rule = new RuleDTO([
            'tag' => 'tag',
            'key' => 1,
            'value' => null,
            'operator' => null,
            'noKey' => false,
            'alterNext' => false,
        ]);

        $found = (new Seeker())->seek([], $rule, [], false);

        $this->assertCount(0, $found);
    }
}

tests/data/files/big.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="ru">
44
<head>
5-
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
5+
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8;charset=utf-8" />
66
<meta name="generator" content="vBulletin 3.8.3" />
77
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
88

0 commit comments

Comments
 (0)