|
249 | 249 | * |
250 | 250 | * ## Tokens and finer-grained processing. |
251 | 251 | * |
252 | | - * >>> Stub documentation. |
253 | | - * |
254 | 252 | * It's also possible to scan through every lexical token in |
255 | 253 | * the HTML document using the `next_token()` function. This |
256 | 254 | * alternative form takes no argument and provides no built-in |
|
261 | 259 | * $title = '(untitled)'; |
262 | 260 | * $text = ''; |
263 | 261 | * while ( $processor->next_token() ) { |
264 | | - * switch ( $processor->get_node_name() ) { |
| 262 | + * switch ( $processor->get_token_name() ) { |
265 | 263 | * case '#text': |
266 | 264 | * $text .= $processor->get_node_text(); |
267 | 265 | * break; |
268 | 266 | * |
269 | | - * case 'HR': |
| 267 | + * case 'BR': |
270 | 268 | * $text .= "\n"; |
271 | 269 | * break; |
272 | 270 | * |
|
305 | 303 | * - `TITLE` and `TEXTAREA` whose contents are treated as plaintext and then any |
306 | 304 | * character references are decoded. E.g. "1 &lt; 2 < 3" becomes "1 < 2 < 3". |
307 | 305 | * - `IFRAME`, `NOSCRIPT`, `NOEMBED`, `NOFRAMES`, `STYLE` whose contents are treated as |
308 | | - * raw plaintext and left as-si. E.g. "1 &lt; 2 < 3" remains "1 &lt; 2 < 3". |
| 306 | + * raw plaintext and left as-is. E.g. "1 &lt; 2 < 3" remains "1 &lt; 2 < 3". |
309 | 307 | * |
310 | 308 | * #### Other tokens with modifiable text. |
311 | 309 | * |
|
314 | 312 | * - `#text` nodes, whose entire token _is_ the modifiable text. |
315 | 313 | * - Comment nodes and nodes that became comments because of some syntax error. The |
316 | 314 | * text for these nodes is the portion of the comment inside of the syntax. E.g. for |
317 | | - * "<!-- comment -->" the text is " comment " (note that the spaces are part of it). |
| 315 | + * `<!-- comment -->` the text is `" comment "` (note that the spaces are part of it). |
318 | 316 | * - `CDATA` sections, whose text is the content inside of the section itself. E.g. for |
319 | | - * "<![CDATA[some content]]>" the text is "some content". |
| 317 | + * `<![CDATA[some content]]>` the text is `"some content"`. |
320 | 318 | * - "Funky comments," which are a special case of invalid closing tags whose name is |
321 | 319 | * invalid. The text for these nodes is the text that a browser would transform into |
322 | | - * an HTML when parsing. E.g. for "</%post_author>" the text is "%post_author". |
| 320 | + * an HTML comment when parsing. E.g. for `</%post_author>` the text is `%post_author`. |
323 | 321 | * |
324 | 322 | * And there are non-elements which are atomic in nature but have no modifiable text. |
325 | | - * - `DOCTYPE` nodes like "<DOCTYPE html>" which have no closing tag. |
326 | | - * - XML Processing instruction nodes like "<<?xml charset="utf8"?>". |
327 | | - * - The empty end tag "<</>" which is ignored in the browser and DOM but exposed |
| 323 | + * - `DOCTYPE` nodes like `<!DOCTYPE html>` which have no closing tag. |
| 324 | + * - XML Processing instruction nodes like `<?xml charset="utf8"?>`. |
| 325 | + * - The empty end tag `</>` which is ignored in the browser and DOM but exposed |
328 | 326 | * to the HTML API. |
329 | 327 | * |
330 | 328 | * ## Design and limitations |
@@ -849,9 +847,10 @@ public function next_token() { |
849 | 847 | } |
850 | 848 |
|
851 | 849 | /* |
852 | | - * for legacy reasons the rest of this function handles tags and their |
853 | | - * attributes. if the processor has reached the end of the document |
854 | | - * or if it matched any other token then it should return here. |
| 850 | + * For legacy reasons the rest of this function handles tags and their |
| 851 | + * attributes. If the processor has reached the end of the document |
| 852 | + * or if it matched any other token then it should return here to avoid |
| 853 | + * attempting to process tag-specific syntax. |
855 | 854 | */ |
856 | 855 | if ( |
857 | 856 | self::STATE_INCOMPLETE !== $this->parser_state && |
@@ -922,7 +921,12 @@ public function next_token() { |
922 | 921 | return true; |
923 | 922 | } |
924 | 923 |
|
925 | | - // Preserve the opening tag pointers. |
| 924 | + /* |
| 925 | + * Preserve the opening tag pointers, as these will be overwritten |
| 926 | + * when finding the closing tag. They will be reset after finding |
| 927 | + * the closing tag to point to the opening of the special atomic |
| 928 | + * tag sequence. |
| 929 | + */ |
926 | 930 | $tag_name_starts_at = $this->tag_name_starts_at; |
927 | 931 | $tag_name_length = $this->tag_name_length; |
928 | 932 | $tag_ends_at = $this->token_starts_at + $this->token_length; |
@@ -956,7 +960,7 @@ public function next_token() { |
956 | 960 |
|
957 | 961 | /* |
958 | 962 | * The values here look like they reference the opening tag but they reference |
959 | | - * the closing that instead. This is why the opening tag values were stored |
| 963 | + * the closing tag instead. This is why the opening tag values were stored |
960 | 964 | * above in a variable. It reads confusingly here, but that's because the |
961 | 965 | * functions that skip the contents have moved all the internal cursors past |
962 | 966 | * the inner content of the tag. |
@@ -1473,7 +1477,7 @@ private function parse_next_tag() { |
1473 | 1477 | $was_at = $this->bytes_already_parsed; |
1474 | 1478 | $at = $was_at; |
1475 | 1479 |
|
1476 | | - while ( false !== $at && $at <= $doc_length ) { |
| 1480 | + while ( false !== $at && $at < $doc_length ) { |
1477 | 1481 | $at = strpos( $html, '<', $at ); |
1478 | 1482 |
|
1479 | 1483 | if ( $at > $was_at ) { |
@@ -1568,7 +1572,7 @@ private function parse_next_tag() { |
1568 | 1572 | // Abruptly-closed empty comments are a sequence of dashes followed by `>`. |
1569 | 1573 | $span_of_dashes = strspn( $html, '-', $closer_at ); |
1570 | 1574 | if ( '>' === $html[ $closer_at + $span_of_dashes ] ) { |
1571 | | - // @todo This could go wrong if the closer is shorter than `<!---->` because there's no inside. |
| 1575 | + // @todo This could go wrong if the closer is shorter than `<!---->` because there's no inside content. |
1572 | 1576 | $this->parser_state = self::STATE_COMMENT; |
1573 | 1577 | $this->token_length = $closer_at + $span_of_dashes + 1 - $this->token_starts_at; |
1574 | 1578 | $this->text_starts_at = $this->token_starts_at + 4; |
|
0 commit comments