Provisionally: add back CDATA and PI nodes

dmsnell · dmsnell · commit 14eeb0710428 · 2024-01-12T07:54:18.000-05:00
diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php
@@ -1664,6 +1664,24 @@ private function parse_next_tag() {
 				$this->text_starts_at       = $this->token_starts_at + 2;
 				$this->text_length          = $closer_at - $this->text_starts_at;
 				$this->bytes_already_parsed = $closer_at + 1;
+
+				// Identify CDATA lookalikes: both brackets of the closing `]]>` must be present, since the offsets below assume them.
+				if (
+					$this->token_length >= 10 &&
+					'[' === $html[ $this->token_starts_at + 2 ] &&
+					'C' === $html[ $this->token_starts_at + 3 ] &&
+					'D' === $html[ $this->token_starts_at + 4 ] &&
+					'A' === $html[ $this->token_starts_at + 5 ] &&
+					'T' === $html[ $this->token_starts_at + 6 ] &&
+					'A' === $html[ $this->token_starts_at + 7 ] &&
+					'[' === $html[ $this->token_starts_at + 8 ] &&
+					']' === $html[ $closer_at - 2 ] && ']' === $html[ $closer_at - 1 ]
+				) {
+					$this->parser_state    = self::STATE_CDATA_NODE;
+					$this->text_starts_at += 7; // Skip past the leading `[CDATA[`.
+					$this->text_length    -= 9; // Exclude `[CDATA[` (7 bytes) and the trailing `]]` (2 bytes).
+				}
+
 				return true;
 			}
 
@@ -1700,6 +1718,41 @@ private function parse_next_tag() {
 				$this->text_starts_at       = $this->token_starts_at + 2;
 				$this->text_length          = $closer_at - $this->text_starts_at;
 				$this->bytes_already_parsed = $closer_at + 1;
+
+				/*
+				 * Identify a Processing Instruction node were HTML to have them.
+				 *
+				 * XML allows for more target names, but this code only identifies
+				 * a subset. This is more or less okay because ultimately these are
+				 * HTML comments in the DOM and this safely supports _some_ kinds
+				 * of PI Nodes without getting lost while parsing.
+				 *
+				 * This code identifies processing instruction nodes whose target
+				 * name can be represented in single-byte UTF-8 / 7-bit ASCII.
+				 *
+				 * > NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
+				 *                     [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
+				 *                     [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
+				 *                     [#x10000-#xEFFFF]
+				 * > NameChar      ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
+				 *
+				 * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
+				 */
+				if ( $this->token_length >= 5 && '?' === $html[ $closer_at - 1 ] ) {
+					$comment_text     = substr( $html, $this->token_starts_at + 2, $this->token_length - 4 );
+					$pi_target_length = strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_' );
+
+					if ( 0 < $pi_target_length ) {
+						 $pi_target_length += strspn( $comment_text,'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length );
+
+						 $this->parser_state       = self::STATE_PI_NODE;
+						 $this->tag_name_starts_at = $this->token_starts_at + 2;
+						 $this->tag_name_length    = $pi_target_length;
+						 $this->text_starts_at    += $pi_target_length;
+						 $this->text_length       -= $pi_target_length + 1;
+					}
+				}
+
 				return true;
 			}
 
@@ -2507,6 +2560,9 @@ public function get_token_type() {
 			case self::STATE_DOCTYPE:
 				return '#doctype';
 
+			case self::STATE_PI_NODE:
+				return '#processing-instruction';
+
 			default:
 				return $this->get_token_name();
 		}
@@ -2540,6 +2596,12 @@ public function get_token_name() {
 			case self::STATE_TEXT_NODE:
 				return '#text';
 
+			case self::STATE_CDATA_NODE:
+				return '#cdata-section';
+
+			case self::STATE_PI_NODE:
+				return substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length );
+
 			case self::STATE_COMMENT:
 				return '#comment';
 
@@ -2576,7 +2638,15 @@ public function get_modifiable_text() {
 		$at     = $this->text_starts_at;
 		$length = $this->text_length;
 		$text   = substr( $this->html, $at, $length );
-		$text   = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE );
+
+		if (
+			self::STATE_CDATA_NODE === $this->parser_state ||
+			self::STATE_PI_NODE === $this->parser_state
+		) {
+			return $text;
+		}
+
+		$text = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE );
 
 		if ( empty( $text ) ) {
 			return '';
@@ -3131,6 +3201,38 @@ private function matches() {
 	 */
 	const STATE_TEXT_NODE = 'STATE_TEXT_NODE';
 
+	/**
+	 * Parser CDATA Node State.
+	 *
+	 * Indicates that the parser has found a CDATA node and it's possible
+	 * to read and modify its modifiable text. Note that in HTML there are
+	 * no CDATA nodes outside foreign elements (SVG and MathML). Outside
+	 * of foreign elements, they are treated as HTML comments. Nonetheless,
+	 * the Tag Processor still recognizes them as they appear in the HTML
+	 * stream and exposes them for inspection and modification.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @access private
+	 */
+	const STATE_CDATA_NODE = 'STATE_CDATA_NODE';
+
+	/**
+	 * Parser Processing Instruction State.
+	 *
+	 * Indicates that the parser has found a Processing Instruction and
+	 * it's possible to read and modify its modifiable text. Note that in
+	 * HTML there are no Processing Instruction nodes and they are treated
+	 * as HTML comments. Nonetheless, the Tag Processor still recognizes
+	 * them as they appear in the HTML stream and exposes them for
+	 * inspection and modification.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @access private
+	 */
+	const STATE_PI_NODE = 'STATE_PI_NODE';
+
 	/**
 	 * Indicates that the parser has found an HTML comment and it's
 	 * possible to read and modify its modifiable text.