Skip to content
Closed
Changes from 1 commit
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Fix some bugs discovered while processing the single-page.html HTML…
…5 spec
  • Loading branch information
dmsnell committed Jul 21, 2022
commit 2c3a48f82533cb5e8bff6079672db4cd135a115c
123 changes: 69 additions & 54 deletions lib/experimental/class-wp-html-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -173,15 +173,14 @@ class WP_Tag_Find_Descriptor {
public static function parse( $query ) {
Copy link
Copy Markdown
Contributor

@adamziel adamziel Jul 26, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this be a constructor instead of a static method?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

probably, but at the time I wasn't sure if we'd want to create them through other means. I think there are still some questions I have about the role here. for instance, I do think if this were a construction I'd want to still call it like so

public function __construct( $query = array() ) {
    $parsed = self::parse( $query );
    $this->tag_name = $parsed->tag_name;
    …
}

and at that point I don't see what the value is in having the constructor. my original intent was to make the parse function failable, and maybe there's still a point in doing that, but for now it fails to safe defaults silently.

if this is the only way we'd want to create a descriptor then maybe we just convert it to a constructor, but I think for some purposes we might imagine creating one manually

$d = new Descriptor();
$d->tag_name = '<invalid';

$descriptor = new WP_Tag_Find_Descriptor();

$tag_name = $query['tag_name'];
$descriptor->tag_name = null !== $tag_name
? self::comparable( $tag_name )
$descriptor->tag_name = isset( $query['tag_name'] )
? self::comparable( $query['tag_name'] )
: null;

$match_offset = isset( $query['match_offset'] ) ? $query['match_offset'] : 0;
$match_offset = isset( $query['match_offset'] ) ? $query['match_offset'] : 1;
$descriptor->match_offset = is_integer( $match_offset )
? $match_offset
: 0;
: 1;

$containing_class = isset( $query['class_name'] ) ? $query['class_name'] : null;
$descriptor->class_name = is_string( $containing_class )
Expand All @@ -205,17 +204,36 @@ public static function comparable( $input ) {
return strtolower( trim( $input ) );
}

/**
* @param string $tag
* @param array<WP_HTML_Attribute_Token> $attributes
* @return 'matches'|'cannot-match'|'might-match'|'not-implemented'
*/
public function check( $tag, $attributes ) {
// @TODO Special-case h1..h6 here
if ( $this->tag_name !== null && $this->tag_name !== $tag ) {
return 'cannot-match';
}

public function needs_only_tag() {
return $this->class_name === null && $this->data_attribute === null;
}
if ( null === $this->class_name && null === $this->data_attribute ) {
return 'matches';
}

public function needs_class() {
return $this->class_name !== null;
}
if ( null === $this->data_attribute && isset( $attributes['class'] ) ) {
// @TODO: Avoid this nested if
if ( null === $attributes['class']->value ) {
return false;
}

public function needs_data_attribute() {
return $this->data_attribute !== null;
$pattern_class = preg_quote( $this->class_name, '~' );
return 1 === preg_match( "~(?:^|[\t ]){$pattern_class}(?:[\t ]|$)~", $attributes['class']->value->comparable );
}

if ( null === $this->class_name ) {
return 'not-implemented';
}

return 'might-match';
}
}

Expand Down Expand Up @@ -313,22 +331,6 @@ class WP_HTML_Scanner {
public $start_at = 0;


/**
* Whether we've already found the tag meeting our search constraints.
*
* @var bool
*/
public $did_match = false;


/**
* Whether we've reached the end of the document and failed to find our match.
*
* @var bool
*/
public $did_fail_match = false;


/**
* Byte offset in the current tag being inspected starts, includes leading `<`.
*
Expand Down Expand Up @@ -375,17 +377,24 @@ public function __construct( $input ) {
}


public function scan( WP_Tag_Find_Descriptor $descriptor, $found_already = 0 ) {
public function scan( WP_Tag_Find_Descriptor $descriptor, $found_already = 0, $tag = null ) {
if ( $found_already === $descriptor->match_offset ) {
return;
return $found_already;
return $tag;
}

$tag = $this->find_next_tag();
if ( ! $tag ) {
return false;
}

switch ( $descriptor->check( $tag->comparable, array() ) ) {
case 'cannot-match':
return $this->scan( $descriptor, $found_already, $tag );

case 'matches':
return $this->scan( $descriptor, $found_already + 1, $tag );
}

// @TODO: Are we done matching?
while ( $attribute = $this->find_next_attribute() ) {
// HTML5 says duplicate values are ignored
Expand All @@ -395,42 +404,49 @@ public function scan( WP_Tag_Find_Descriptor $descriptor, $found_already = 0 ) {

$this->attributes[ $attribute->name->comparable ] = $attribute;

// @TODO: Are we done matching?
switch ( $descriptor->check( $tag->comparable, $this->attributes ) ) {
case 'matches':
return $this->scan( $descriptor, $found_already + 1, $tag );
}
}

// @TODO: Remove this debugging call.
// var_export( [
// 'tag' => $tag,
// 'attributes' => $this->attributes
// ] );
// try to find the next tag
return $this->scan( $descriptor, $found_already, $tag );
}

/**
* @return WP_HTML_Scanner_Token|false
*/
public function find_next_tag() {
if ( 1 !== preg_match(
"~<!--(?>.*?-->)|<!\[CDATA\[(?>.*?>)|<\?(?>.*?)>|<(?P<TAG>[a-z][^\t\x{0A}\x{0C} />]*)~mui",
/*
* Unfortunately we can't try to search for only the tag name we want because that might
* lead us to skip over other tags and lose track of our place. So we need to search for
* _every_ tag and then check after we find one if it's the one we are looking for.
*/
"~<!--(?>.*?-->)|<!\[CDATA\[(?>.*?>)|<\?(?>.*?)>|<(?P<TAG>[a-z][^\t\x{0A}\x{0C} /?>]*)~mui",
$this->document,
$tag_match,
PREG_OFFSET_CAPTURE,
$this->start_at
) ) {
$this->start_at = strlen( $this->document );
$this->did_fail_match = true;
return false;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to reset the state if there's no match:

Suggested change
return false;
$this->start_at = null;
$this->tag_name = null;
$this->tag_start = null;
$this->attributes = array();
return false;

Otherwise the following code adds two attributes to the span:

$p = new WP_HTML_Processor( '<span/>' );
$p->find( ['tag_name'=>'span'] );
$p->set_attribute( 'attr-1', 'a' );
$p->find( ['tag_name'=>'div'] );
$p->set_attribute( 'attr-2', 'a' );
echo $p;

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

very good catch. I will update this

}

list( list( $full_match, $start_at ) ) = $tag_match;
list( $full_match, $start_at ) = $tag_match[0];

// Keep scanning if we found a comment or CDATA section.
if ( ! isset( $tag_match['TAG'] ) ) {
$this->start_at += strlen( $full_match );
$this->start_at = $start_at + strlen( $full_match );
return $this->find_next_tag();
}

$this->tag_start = $this->start_at;
$this->start_at = $start_at + strlen( $full_match );
$this->tag_name = WP_HTML_Scanner_Token::from_preg_match( $tag_match['TAG'] );
$this->start_at += strlen( $full_match );
$this->tag_start = $this->tag_name->start - 1; // rewind past the leading `<`
$this->attributes = array();

return true;
return $this->tag_name;
}


Expand All @@ -446,15 +462,15 @@ public function find_next_attribute() {
return false;
}
Comment on lines +599 to +713
Copy link
Copy Markdown
Contributor

@adamziel adamziel Jul 26, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With three occurences of preg_match this is mostly a matter of taste, but FYI in #42485 I used a shorthand:

$attribute_match = $this->match( '~[\t\x{0a}\x{0c}\x{0d} ]*(?P<NAME>=?[^=/>\t\x{0C} ]*)~Smiu' );

It's possible because every call to preg_match uses the exact same of arguments aside of the regexp. $this->match also throws a No_Match exception on failure. Feel free to either ignore this or use it as an inspiration.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah as noted in #42485 I was preferring to leave the duplication in to keep the preg_ functions concrete; for searchability and comprehension within the code as well as for better IDE integration.


list( list( $full_match, $start_at ) ) = $attribute_match;
list( $full_match, $start_at ) = $attribute_match[0];
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpick – this could take advantage of the token logic:

Suggested change
list( $full_match, $start_at ) = $attribute_match[0];
$full_match = WP_HTML_Scanner_Token::from_preg_match( $attribute_match[0] );
// ...
$this->start_at = $full_match->end; // end would have to be added


if ( empty( $full_match ) ) {
return false;
}

$name_token = WP_HTML_Scanner_Token::from_preg_match( $attribute_match['NAME'] );

$this->start_at += strlen( $full_match );
$this->start_at = $start_at + strlen( $full_match );
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needs to ignore the following whitespace characters around the = just as the After attribute name state
and Before attribute value state specify:

  • U+0009 CHARACTER TABULATION (tab)
  • U+000A LINE FEED (LF)
  • U+000C FORM FEED (FF)
  • U+0020 SPACE

This markup correctly defines an attribute attr with a value value:

<div attr= "value" />

However, the processor currently doesn't deal with that as expected:

$p = new WP_HTML_Processor( '<div attr = "old" />' );
$p->find( ['tag_name'=>'span'] );
$p->set_attribute( 'attr', 'updated' );
echo $p;
// <div attr="updated" = "old" />

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I couldn't remember why we had to alternations for the leading attribute name = and so I removed it, but now I think this is the very reason, to swallow the post-attribute-equal whitespace. Will update the pattern 👍

if ( '=' !== $this->document[ $this->start_at ] ) {
return new WP_HTML_Attribute_Token( $name_token, null, $name_token->start, $start_at + strlen( $full_match ) );
Copy link
Copy Markdown
Contributor

@adamziel adamziel Jul 26, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar nitpick as above – it would be nice to have an end property:

Suggested change
return new WP_HTML_Attribute_Token( $name_token, null, $name_token->start, $start_at + strlen( $full_match ) );
return new WP_HTML_Attribute_Token( $name_token, null, $name_token->start, $name_token->end );

}
Expand All @@ -465,16 +481,16 @@ public function find_next_attribute() {
switch ( $this->document[ $this->start_at ] ) {
case '"':
$this->start_at += 1;
$pattern = '~(?P<VALUE>[^"]*)"~';
$pattern = '~(?P<VALUE>[^"]*)"~miu';
break;

case "'":
$this->start_at += 1;
$pattern = "~(?P<VALUE>[^']*)'~";
$pattern = "~(?P<VALUE>[^']*)'~miu";
break;

default:
$pattern = '~(?P<VALUE>[^\t\x{0a}\x{0c}\x{0d} />]*)~';
$pattern = '~(?P<VALUE>[^\t\x{0a}\x{0c}\x{0d} >]*)~miu';
break;
}
Comment on lines +625 to +745
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: We can avoid the repetition:

Suggested change
switch ( $this->document[ $this->start_at ] ) {
case '"':
$this->start_at += 1;
$pattern = '~(?P<VALUE>[^"]*)"~Smiu';
break;
case "'":
$this->start_at += 1;
$pattern = "~(?P<VALUE>[^']*)'~Smiu";
break;
default:
$pattern = '~(?P<VALUE>[^\t\x{0a}\x{0c}\x{0d} >]*)~Smiu';
break;
}
$quote_maybe = $this->document[ $this->start_at ];
switch ( $quote_maybe ) {
case '"':
case "'":
$this->start_at += 1;
$pattern = "~(?P<VALUE>[^$quote_maybe]*)$quote_maybe~Smiu";
break;
default:
$pattern = '~(?P<VALUE>[^\t\x{0a}\x{0c}\x{0d} >]*)~Smiu';
break;
}

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was comfortable with the repetition for the purpose of searchability within the code and also of having a static PCRE pattern. I've also considered taking different branching here instead of PCRE searches but didn't get to that yet.


Expand All @@ -489,17 +505,16 @@ public function find_next_attribute() {
return new WP_HTML_Attribute_Token( $name_token, null, $name_token->start, $name_token->start + $name_token->length + 1 );
}

list( list( $full_match, $value_start_at ) ) = $value_match;
list( $full_match, $value_start_at ) = $value_match[0];
$value_token = WP_HTML_Scanner_Token::from_preg_match( $value_match['VALUE'] );
$this->start_at += strlen( $full_match );
$this->start_at = $value_start_at + strlen( $full_match );

return new WP_HTML_Attribute_Token( $name_token, $value_token, $start_at, $value_start_at + strlen( $full_match ) );
}

public function reset() {
$this->tag_name = null;
$this->attributes = array();
$this->did_match = false;
$this->tag_start = null;
}
}
Expand Down