@@ -1664,6 +1664,24 @@ private function parse_next_tag() {
16641664 $ this ->text_starts_at = $ this ->token_starts_at + 2 ;
16651665 $ this ->text_length = $ closer_at - $ this ->text_starts_at ;
16661666 $ this ->bytes_already_parsed = $ closer_at + 1 ;
1667+
1668+ // Identify nodes that would be CDATA if HTML had CDATA sections.
1669+ if (
1670+ $ this ->token_length >= 10 &&
1671+ '[ ' === $ html [ $ this ->token_starts_at + 2 ] &&
1672+ 'C ' === $ html [ $ this ->token_starts_at + 3 ] &&
1673+ 'D ' === $ html [ $ this ->token_starts_at + 4 ] &&
1674+ 'A ' === $ html [ $ this ->token_starts_at + 5 ] &&
1675+ 'T ' === $ html [ $ this ->token_starts_at + 6 ] &&
1676+ 'A ' === $ html [ $ this ->token_starts_at + 7 ] &&
1677+ '[ ' === $ html [ $ this ->token_starts_at + 8 ] &&
1678+ '] ' === $ html [ $ closer_at - 1 ]
1679+ ) {
1680+ $ this ->parser_state = self ::STATE_CDATA_NODE ;
1681+ $ this ->text_starts_at += 7 ;
1682+ $ this ->text_length -= 9 ;
1683+ }
1684+
16671685 return true ;
16681686 }
16691687
@@ -1700,6 +1718,41 @@ private function parse_next_tag() {
17001718 $ this ->text_starts_at = $ this ->token_starts_at + 2 ;
17011719 $ this ->text_length = $ closer_at - $ this ->text_starts_at ;
17021720 $ this ->bytes_already_parsed = $ closer_at + 1 ;
1721+
1722+ /*
1723+ * Identify what would be Processing Instruction nodes, were HTML to have them.
1724+ *
1725+ * XML allows for more target names, but this code only identifies
1726+ * a subset. This is more or less okay because ultimately these are
1727+ * HTML comments in the DOM and this safely supports _some_ kinds
1728+ * of PI Nodes without getting lost while parsing.
1729+ *
1730+ * This code identifies processing instruction nodes whose target
1731+ * name can be represented in single-byte UTF-8 / 7-bit ASCII.
1732+ *
1733+ * > NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
1734+ * [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
1735+ * [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
1736+ * [#x10000-#xEFFFF]
1737+ * > NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
1738+ *
1739+ * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
1740+ */
1741+ if ( $ this ->token_length >= 5 && '? ' === $ html [ $ closer_at - 1 ] ) {
1742+ $ comment_text = substr ( $ html , $ this ->token_starts_at + 2 , $ this ->token_length - 4 );
1743+ $ pi_target_length = strspn ( $ comment_text , 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_ ' );
1744+
1745+ if ( 0 < $ pi_target_length ) {
1746+ $ pi_target_length += strspn ( $ comment_text ,'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-. ' , $ pi_target_length );
1747+
1748+ $ this ->parser_state = self ::STATE_PI_NODE ;
1749+ $ this ->tag_name_starts_at = $ this ->token_starts_at + 2 ;
1750+ $ this ->tag_name_length = $ pi_target_length ;
1751+ $ this ->text_starts_at += $ pi_target_length ;
1752+ $ this ->text_length -= $ pi_target_length + 1 ;
1753+ }
1754+ }
1755+
17031756 return true ;
17041757 }
17051758
@@ -2507,6 +2560,9 @@ public function get_token_type() {
25072560 case self ::STATE_DOCTYPE :
25082561 return '#doctype ' ;
25092562
2563+ case self ::STATE_PI_NODE :
2564+ return '#processing-instruction ' ;
2565+
25102566 default :
25112567 return $ this ->get_token_name ();
25122568 }
@@ -2540,6 +2596,12 @@ public function get_token_name() {
25402596 case self ::STATE_TEXT_NODE :
25412597 return '#text ' ;
25422598
2599+ case self ::STATE_CDATA_NODE :
2600+ return '#cdata-section ' ;
2601+
2602+ case self ::STATE_PI_NODE :
2603+ return substr ( $ this ->html , $ this ->tag_name_starts_at , $ this ->tag_name_length );
2604+
25432605 case self ::STATE_COMMENT :
25442606 return '#comment ' ;
25452607
@@ -2576,7 +2638,15 @@ public function get_modifiable_text() {
25762638 $ at = $ this ->text_starts_at ;
25772639 $ length = $ this ->text_length ;
25782640 $ text = substr ( $ this ->html , $ at , $ length );
2579- $ text = html_entity_decode ( $ text , ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE );
2641+
2642+ if (
2643+ self ::STATE_CDATA_NODE === $ this ->parser_state ||
2644+ self ::STATE_PI_NODE === $ this ->parser_state
2645+ ) {
2646+ return $ text ;
2647+ }
2648+
2649+ $ text = html_entity_decode ( $ text , ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE );
25802650
25812651 if ( empty ( $ text ) ) {
25822652 return '' ;
@@ -3131,6 +3201,38 @@ private function matches() {
31313201 */
31323202 const STATE_TEXT_NODE = 'STATE_TEXT_NODE ' ;
31333203
3204+ /**
3205+ * Parser CDATA Node State.
3206+ *
3207+ * Indicates that the parser has found a CDATA node and it's possible
3208+ * to read and modify its modifiable text. Note that in HTML there are
3209+ * no CDATA nodes outside foreign elements (SVG and MathML). Outside
3210+ * of foreign elements, they are treated as HTML comments. Nonetheless,
3211+ * the Tag Processor still recognizes them as they appear in the HTML
3212+ * stream and exposes them for inspection and modification.
3213+ *
3214+ * @since 6.5.0
3215+ *
3216+ * @access private
3217+ */
3218+ const STATE_CDATA_NODE = 'STATE_CDATA_NODE ' ;
3219+
3220+ /**
3221+ * Parser Processing Instruction State.
3222+ *
3223+ * Indicates that the parser has found a Processing Instruction and
3224+ * it's possible to read and modify its modifiable text. Note that in
3225+ * HTML there are no Processing Instruction nodes and they are treated
3226+ * as HTML comments. Nonetheless, the Tag Processor still recognizes
3227+ * them as they appear in the HTML stream and exposes them for
3228+ * inspection and modification.
3229+ *
3230+ * @since 6.5.0
3231+ *
3232+ * @access private
3233+ */
3234+ const STATE_PI_NODE = 'STATE_PI_NODE ' ;
3235+
31343236 /**
31353237 * Indicates that the parser has found an HTML comment and it's
31363238 * possible to read and modify its modifiable text.
0 commit comments