Skip to content

Commit f50ffa9

Browse files
committed
python#13273: fix a bug that prevented HTMLParser from properly detecting some tags when strict=False.
1 parent 0b85cd0 commit f50ffa9

3 files changed

Lines changed: 38 additions & 3 deletions

File tree

Lib/html/parser.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
3131
r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
3232
attrfind_tolerant = re.compile(
33-
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
33+
r',?\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
3434
r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
3535
locatestarttagend = re.compile(r"""
3636
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
@@ -277,12 +277,11 @@ def parse_starttag(self, i):
277277
assert match, 'unexpected call to parse_starttag()'
278278
k = match.end()
279279
self.lasttag = tag = rawdata[i+1:k].lower()
280-
281280
while k < endpos:
282281
if self.strict:
283282
m = attrfind.match(rawdata, k)
284283
else:
285-
m = attrfind_tolerant.search(rawdata, k)
284+
m = attrfind_tolerant.match(rawdata, k)
286285
if not m:
287286
break
288287
attrname, rest, attrvalue = m.group(1, 2, 3)

Lib/test/test_htmlparser.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,39 @@ def test_weird_chars_in_unquoted_attribute_values(self):
373373
[('action', 'bogus|&#()value')])],
374374
collector = self.collector)
375375

376+
def test_issue13273(self):
377+
html = ('<div style="" ><b>The <a href="some_url">rain</a> '
378+
'<br /> in <span>Spain</span></b></div>')
379+
expected = [
380+
('starttag', 'div', [('style', '')]),
381+
('starttag', 'b', []),
382+
('data', 'The '),
383+
('starttag', 'a', [('href', 'some_url')]),
384+
('data', 'rain'),
385+
('endtag', 'a'),
386+
('data', ' '),
387+
('startendtag', 'br', []),
388+
('data', ' in '),
389+
('starttag', 'span', []),
390+
('data', 'Spain'),
391+
('endtag', 'span'),
392+
('endtag', 'b'),
393+
('endtag', 'div')
394+
]
395+
self._run_check(html, expected, collector=self.collector)
396+
397+
def test_issue13273_2(self):
398+
html = '<div style="", foo = "bar" ><b>The <a href="some_url">rain</a>'
399+
expected = [
400+
('starttag', 'div', [('style', ''), ('foo', 'bar')]),
401+
('starttag', 'b', []),
402+
('data', 'The '),
403+
('starttag', 'a', [('href', 'some_url')]),
404+
('data', 'rain'),
405+
('endtag', 'a'),
406+
]
407+
self._run_check(html, expected, collector=self.collector)
408+
376409
def test_unescape_function(self):
377410
p = html.parser.HTMLParser()
378411
self.assertEqual(p.unescape('&#bad;'),'&#bad;')

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ Core and Builtins
6161
Library
6262
-------
6363

64+
- Issue #13273: fix a bug that prevented HTMLParser from properly detecting some
65+
tags when strict=False.
66+
6467
- Issue #10332: multiprocessing: fix a race condition when a Pool is closed
6568
before all tasks have completed.
6669

0 commit comments

Comments
 (0)