Skip to content

Commit 14eeb07

Browse files
committed
Provisionally: add back CDATA and PI nodes
1 parent d1506fb commit 14eeb07

File tree

1 file changed

+103
-1
lines changed

1 file changed

+103
-1
lines changed

src/wp-includes/html-api/class-wp-html-tag-processor.php

Lines changed: 103 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1664,6 +1664,24 @@ private function parse_next_tag() {
16641664
$this->text_starts_at = $this->token_starts_at + 2;
16651665
$this->text_length = $closer_at - $this->text_starts_at;
16661666
$this->bytes_already_parsed = $closer_at + 1;
1667+
1668+
// Identify nodes that would be CDATA if HTML had CDATA sections.
1669+
if (
1670+
$this->token_length >= 10 &&
1671+
'[' === $html[ $this->token_starts_at + 2 ] &&
1672+
'C' === $html[ $this->token_starts_at + 3 ] &&
1673+
'D' === $html[ $this->token_starts_at + 4 ] &&
1674+
'A' === $html[ $this->token_starts_at + 5 ] &&
1675+
'T' === $html[ $this->token_starts_at + 6 ] &&
1676+
'A' === $html[ $this->token_starts_at + 7 ] &&
1677+
'[' === $html[ $this->token_starts_at + 8 ] &&
1678+
']' === $html[ $closer_at - 1 ]
1679+
) {
1680+
$this->parser_state = self::STATE_CDATA_NODE;
1681+
$this->text_starts_at += 7;
1682+
$this->text_length -= 9;
1683+
}
1684+
16671685
return true;
16681686
}
16691687

@@ -1700,6 +1718,41 @@ private function parse_next_tag() {
17001718
$this->text_starts_at = $this->token_starts_at + 2;
17011719
$this->text_length = $closer_at - $this->text_starts_at;
17021720
$this->bytes_already_parsed = $closer_at + 1;
1721+
1722+
/*
1723+
* Identify nodes that would be Processing Instructions if HTML had them.
1724+
*
1725+
* XML allows for more target names, but this code only identifies
1726+
* a subset. This is more or less okay because ultimately these are
1727+
* HTML comments in the DOM and this safely supports _some_ kinds
1728+
* of PI Nodes without getting lost while parsing.
1729+
*
1730+
* This code identifies processing instruction nodes whose target
1731+
* name can be represented in single-byte UTF-8 / 7-bit ASCII.
1732+
*
1733+
* > NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
1734+
* [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
1735+
* [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
1736+
* [#x10000-#xEFFFF]
1737+
* > NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
1738+
*
1739+
* @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
1740+
*/
1741+
if ( $this->token_length >= 5 && '?' === $html[ $closer_at - 1 ] ) {
1742+
$comment_text = substr( $html, $this->token_starts_at + 2, $this->token_length - 4 );
1743+
$pi_target_length = strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_' );
1744+
1745+
if ( 0 < $pi_target_length ) {
1746+
$pi_target_length += strspn( $comment_text,'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length );
1747+
1748+
$this->parser_state = self::STATE_PI_NODE;
1749+
$this->tag_name_starts_at = $this->token_starts_at + 2;
1750+
$this->tag_name_length = $pi_target_length;
1751+
$this->text_starts_at += $pi_target_length;
1752+
$this->text_length -= $pi_target_length + 1;
1753+
}
1754+
}
1755+
17031756
return true;
17041757
}
17051758

@@ -2507,6 +2560,9 @@ public function get_token_type() {
25072560
case self::STATE_DOCTYPE:
25082561
return '#doctype';
25092562

2563+
case self::STATE_PI_NODE:
2564+
return '#processing-instruction';
2565+
25102566
default:
25112567
return $this->get_token_name();
25122568
}
@@ -2540,6 +2596,12 @@ public function get_token_name() {
25402596
case self::STATE_TEXT_NODE:
25412597
return '#text';
25422598

2599+
case self::STATE_CDATA_NODE:
2600+
return '#cdata-section';
2601+
2602+
case self::STATE_PI_NODE:
2603+
return substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length );
2604+
25432605
case self::STATE_COMMENT:
25442606
return '#comment';
25452607

@@ -2576,7 +2638,15 @@ public function get_modifiable_text() {
25762638
$at = $this->text_starts_at;
25772639
$length = $this->text_length;
25782640
$text = substr( $this->html, $at, $length );
2579-
$text = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE );
2641+
2642+
if (
2643+
self::STATE_CDATA_NODE === $this->parser_state ||
2644+
self::STATE_PI_NODE === $this->parser_state
2645+
) {
2646+
return $text;
2647+
}
2648+
2649+
$text = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE );
25802650

25812651
if ( empty( $text ) ) {
25822652
return '';
@@ -3131,6 +3201,38 @@ private function matches() {
31313201
*/
31323202
const STATE_TEXT_NODE = 'STATE_TEXT_NODE';
31333203

3204+
/**
3205+
* Parser CDATA Node State.
3206+
*
3207+
* Indicates that the parser has found a CDATA node and it's possible
3208+
* to read and modify its modifiable text. Note that in HTML there are
3209+
* no CDATA nodes outside foreign elements (SVG and MathML). Outside
3210+
* of foreign elements, they are treated as HTML comments. Nonetheless,
3211+
* the Tag Processor still recognizes them as they appear in the HTML
3212+
* stream and exposes them for inspection and modification.
3213+
*
3214+
* @since 6.5.0
3215+
*
3216+
* @access private
3217+
*/
3218+
const STATE_CDATA_NODE = 'STATE_CDATA_NODE';
3219+
3220+
/**
3221+
* Parser Processing Instruction State.
3222+
*
3223+
* Indicates that the parser has found a Processing Instruction and
3224+
* it's possible to read and modify its modifiable text. Note that in
3225+
* HTML there are no Processing Instruction nodes and they are treated
3226+
* as HTML comments. Nonetheless, the Tag Processor still recognizes
3227+
* them as they appear in the HTML stream and exposes them for
3228+
* inspection and modification.
3229+
*
3230+
* @since 6.5.0
3231+
*
3232+
* @access private
3233+
*/
3234+
const STATE_PI_NODE = 'STATE_PI_NODE';
3235+
31343236
/**
31353237
* Indicates that the parser has found an HTML comment and it's
31363238
* possible to read and modify its modifiable text.

0 commit comments

Comments
 (0)