From e05edfc076c48a4a1fd9adc9d32f41a395830eda Mon Sep 17 00:00:00 2001 From: Chad Birch Date: Thu, 11 Oct 2018 15:51:18 -0600 Subject: [PATCH 01/18] Define a full list of valid elements for HTML_TAGS Previously, the list of valid HTML tags being used by linkify was a combination of various element lists from the html5lib constants. However, these lists don't cover all of the valid HTML elements, so linkify was escaping some valid tags (including , , and more). This commit just defines a full list of valid, non-deprecated HTML elements for linkify to use instead. --- bleach/html5lib_shim.py | 122 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 111 insertions(+), 11 deletions(-) diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py index 4e6e054f..50d7d3e9 100644 --- a/bleach/html5lib_shim.py +++ b/bleach/html5lib_shim.py @@ -45,19 +45,119 @@ CHARACTERS_TYPE = constants.tokenTypes['Characters'] -#: List of HTML tags +#: List of valid HTML tags HTML_TAGS = [ - tag for namespace, tag in - ( - list(constants.scopingElements) + - list(constants.formattingElements) + - list(constants.specialElements) + - list(constants.htmlIntegrationPointElements) + - list(constants.mathmlTextIntegrationPointElements) - ) + 'a', + 'abbr', + 'address', + 'area', + 'article', + 'aside', + 'audio', + 'b', + 'base', + 'bdi', + 'bdo', + 'blockquote', + 'body', + 'br', + 'button', + 'canvas', + 'caption', + 'cite', + 'code', + 'col', + 'colgroup', + 'command', + 'data', + 'datalist', + 'dd', + 'del', + 'details', + 'dfn', + 'dialog', + 'div', + 'dl', + 'dt', + 'em', + 'embed', + 'fieldset', + 'figcaption', + 'figure', + 'footer', + 'form', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'head', + 'header', + 'hgroup', + 'hr', + 'html', + 'i', + 'iframe', + 'img', + 'input', + 'ins', + 'kbd', + 'keygen', + 'label', + 'legend', + 'li', + 'link', + 'map', + 'mark', + 'menu', + 'meta', + 'meter', + 'nav', + 'noscript', + 'object', + 'ol', + 'optgroup', + 'option', + 'output', + 
'p', + 'param', + 'pre', + 'progress', + 'q', + 'rp', + 'rt', + 'ruby', + 's', + 'samp', + 'script', + 'section', + 'select', + 'small', + 'source', + 'span', + 'strong', + 'style', + 'sub', + 'summary', + 'sup', + 'table', + 'tbody', + 'td', + 'textarea', + 'tfoot', + 'th', + 'thead', + 'time', + 'title', + 'tr', + 'track', + 'u', + 'ul', + 'var', + 'video', + 'wbr', ] -# Add tags that aren't in html5lib.constants -HTML_TAGS.extend(['abbr']) class InputStreamWithMemory(object): From 6f16fec8cff27ff665efe0b8ac22ef1cecf8c8dd Mon Sep 17 00:00:00 2001 From: jonathan vanasco Date: Tue, 16 Oct 2018 17:16:01 -0400 Subject: [PATCH 02/18] fixed docs on callback location --- docs/linkify.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/linkify.rst b/docs/linkify.rst index 6665300a..d60e17b6 100644 --- a/docs/linkify.rst +++ b/docs/linkify.rst @@ -58,7 +58,7 @@ links will be removed leaving the innerText left in its place. The default callback adds ``rel="nofollow"``. See ``bleach.callbacks`` for some included callback functions. -This defaults to ``bleach.linkify.DEFAULT_CALLBACKS``. +This defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``. .. 
autodata:: bleach.linkifier.DEFAULT_CALLBACKS From d445230222ec710542c89046103c4bad715f7c0a Mon Sep 17 00:00:00 2001 From: Chad Birch Date: Wed, 17 Oct 2018 14:54:45 -0600 Subject: [PATCH 03/18] Clarify source of HTML tag list, fix discrepancies --- bleach/html5lib_shim.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py index 50d7d3e9..63a4f25c 100644 --- a/bleach/html5lib_shim.py +++ b/bleach/html5lib_shim.py @@ -45,7 +45,8 @@ CHARACTERS_TYPE = constants.tokenTypes['Characters'] -#: List of valid HTML tags +#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17 +#: https://html.spec.whatwg.org/multipage/indices.html#elements-3 HTML_TAGS = [ 'a', 'abbr', @@ -68,7 +69,6 @@ 'code', 'col', 'colgroup', - 'command', 'data', 'datalist', 'dd', @@ -122,6 +122,7 @@ 'output', 'p', 'param', + 'picture', 'pre', 'progress', 'q', @@ -133,6 +134,7 @@ 'script', 'section', 'select', + 'slot', 'small', 'source', 'span', @@ -144,6 +146,7 @@ 'table', 'tbody', 'td', + 'template', 'textarea', 'tfoot', 'th', From 5c72a54b5c7e1286147bf20328465f3867878d2b Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 25 Oct 2018 16:05:42 -0400 Subject: [PATCH 04/18] Fix regex strings--they should be marked raw --- bleach/sanitizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 9ba4c57b..c60c26b3 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -425,7 +425,7 @@ def sanitize_uri_value(self, value, allowed_protocols): # Nix backtick, space characters, and control characters new_value = re.sub( - "[`\000-\040\177-\240\s]+", + r"[`\000-\040\177-\240\s]+", '', new_value ) @@ -574,7 +574,7 @@ def sanitize_css(self, style): style = html5lib_shim.convert_entities(style) # Drop any url values before we do anything else - style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) + style = 
re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) # The gauntlet of sanitization @@ -589,11 +589,11 @@ def sanitize_css(self, style): if not gauntlet.match(part): return '' - if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): + if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return '' clean = [] - for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style): + for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style): if not value: continue From bdb28650a2ffe7a994f59b4a958e8b8763b53b64 Mon Sep 17 00:00:00 2001 From: dave-shawley Date: Mon, 29 Oct 2018 12:47:56 -0400 Subject: [PATCH 05/18] setup.py: six >=1.9 is required. (#416) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0de45ad4..bf01a4ed 100755 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ ] install_requires = [ - 'six', + 'six>=1.9.0', # html5lib requirements 'webencodings', ] From 83738b9dd8ac5242fe1ca54059b1ca1310ac6610 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Wed, 12 Dec 2018 16:58:32 -0500 Subject: [PATCH 06/18] Add recognized_tags to Linker arguments recognized_tags lets someone specify a different set of recognized tags to the Linker when they're using it by itself. One use of this is if you're doing a clean pass, storing that, and then linkifying on demand. The recognized_tags argument lets you re-use the allowed tags from the clean step and then there aren't any additional oddities occurring in the linkify step. 
Fixes #409 --- bleach/linkifier.py | 7 +++++-- docs/linkify.rst | 21 +++++++++++---------- tests/test_linkify.py | 17 +++++++++++++++++ 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/bleach/linkifier.py b/bleach/linkifier.py index 5d815f8b..95baba14 100644 --- a/bleach/linkifier.py +++ b/bleach/linkifier.py @@ -85,7 +85,7 @@ class Linker(object): """ def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False, - url_re=URL_RE, email_re=EMAIL_RE): + url_re=URL_RE, email_re=EMAIL_RE, recognized_tags=html5lib_shim.HTML_TAGS): """Creates a Linker instance :arg list callbacks: list of callbacks to run when adjusting tag attributes; @@ -101,6 +101,9 @@ def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=Fals :arg re email_re: email matching regex + :arg list-of-strings recognized_tags: the list of tags that linkify knows about; + everything else gets escaped + :returns: linkified text as unicode """ @@ -113,7 +116,7 @@ def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=Fals # Create a parser/tokenizer that allows all HTML tags and escapes # anything not in that list. self.parser = html5lib_shim.BleachHTMLParser( - tags=html5lib_shim.HTML_TAGS, + tags=recognized_tags, strip=False, consume_entities=True, namespaceHTMLElements=False, diff --git a/docs/linkify.rst b/docs/linkify.rst index d60e17b6..b8e7884e 100644 --- a/docs/linkify.rst +++ b/docs/linkify.rst @@ -12,13 +12,14 @@ For example, you could pass in text and have all URL things converted into HTML links. It works by parsing the text as HTML and building a document tree. In this -way, it's guaranteed never to do weird things to URLs in attribute values, -can modify the value of attributes on ```` tags and can even do things -like skip ``
`` sections.
+way, you're guaranteed to get valid HTML back without weird things like
+having URLs in tag attributes getting linkified.
 
-If you plan to sanitize/clean the text and linkify it, you should do that
-in a single pass using :ref:`LinkifyFilter `. This
-is faster and it'll use the list of allowed tags from clean.
+.. note::
+
+   If you plan to sanitize/clean the text and linkify it, you should do that
+   in a single pass using :ref:`LinkifyFilter `. This
+   is faster and it'll use the list of allowed tags from clean.
 
 .. note::
 
@@ -297,8 +298,8 @@ writing callbacks that may need to behave differently if the protocol is
 Using ``bleach.linkifier.Linker``
 =================================
 
-If you're linking a lot of text and passing the same argument values or you want
-more configurability, consider using a :py:class:`bleach.linkifier.Linker`
+If you're linking a lot of text and passing the same argument values or you
+need more configurability, consider using a :py:class:`bleach.linkifier.Linker`
 instance.
 
 .. doctest::
@@ -325,8 +326,8 @@ Using ``bleach.linkifier.LinkifyFilter``
 the ``bleach.linkifier.LinkifyFilter`` when walking the tree and serializing it
 back into text.
 
-You can use this filter wherever you can use an html5lib Filter. For example, you
-could use it with ``bleach.Cleaner`` to clean and linkify in one step.
+You can use this filter wherever you can use an html5lib Filter. This lets you
+use it with ``bleach.Cleaner`` to clean and linkify in one step.
 
 For example, using all the defaults:
 
diff --git a/tests/test_linkify.py b/tests/test_linkify.py
index eeea3e32..d29a5c82 100644
--- a/tests/test_linkify.py
+++ b/tests/test_linkify.py
@@ -625,6 +625,23 @@ def test_email_re_arg():
     )
 
 
+def test_recognized_tags_arg():
+    """Verifies that recognized_tags works"""
+    # The html parser doesn't recognize "sarcasm" as a tag, so it escapes it
+    linker = Linker(recognized_tags=['p'])
+    assert (
+        linker.linkify('

http://example.com/

') == + '

http://example.com/

<sarcasm>' # noqa + ) + + # The html parser recognizes "sarcasm" as a tag and fixes it + linker = Linker(recognized_tags=['p', 'sarcasm']) + assert ( + linker.linkify('

http://example.com/

') == + '

http://example.com/

' # noqa + ) + + def test_linkify_idempotent(): dirty = 'invalid & < extra http://link.com' assert linkify(linkify(dirty)) == linkify(dirty) From 821a0ff8959b96d91f1e2b1779c9ad8b42da4173 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Wed, 12 Dec 2018 19:12:37 -0500 Subject: [PATCH 07/18] Update for 3.0.3 development --- CHANGES | 22 ++++++++++++++++++++++ bleach/__init__.py | 4 ++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/CHANGES b/CHANGES index 176dde1c..31d62dcd 100644 --- a/CHANGES +++ b/CHANGES @@ -1,6 +1,28 @@ Bleach changes ============== +Version 3.0.3 (In development) +------------------------------ + +**Security fixes** + +None + +**Backwards incompatible changes** + +None + +**Features** + +* Add ``recognized_tags`` argument to the linkify ``Linker`` class. This + fixes issues when linkifying on its own and having some tags get escaped. + It defaults to a list of HTML5 tags. Thank you, Chad Birch! (#409) + +**Bug fixes** + +* Add ``six>=1.9`` to requirements. Thank you, Dave Shawley (#416) + + Version 3.0.2 (October 11th, 2018) ---------------------------------- diff --git a/bleach/__init__.py b/bleach/__init__.py index 8ed01763..14049e2d 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -18,9 +18,9 @@ # yyyymmdd -__releasedate__ = '20181011' +__releasedate__ = '' # x.y.z or x.y.z.dev0 -- semver -__version__ = '3.0.2' +__version__ = '3.0.3' VERSION = parse_version(__version__) From 33c4a23886ed7e33995529a25247a724b1feac1c Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Fri, 28 Dec 2018 10:05:20 -0500 Subject: [PATCH 08/18] Drop invalid attribute names (#419) It's possible for the tokenizer to kick up an invalid-character-in-attribute-name error. When it does that, the BleachHTMLTokenizer should drop the attribute with the invalid name. This fixes that. 
--- bleach/__init__.py | 2 +- bleach/html5lib_shim.py | 32 ++++++++++++++++++++++++++++---- tests/test_linkify.py | 13 ++++++++++++- 3 files changed, 41 insertions(+), 6 deletions(-) diff --git a/bleach/__init__.py b/bleach/__init__.py index 14049e2d..a6445d02 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -20,7 +20,7 @@ # yyyymmdd __releasedate__ = '' # x.y.z or x.y.z.dev0 -- semver -__version__ = '3.0.3' +__version__ = '3.0.3.dev0' VERSION = parse_version(__version__) diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py index 63a4f25c..88876678 100644 --- a/bleach/html5lib_shim.py +++ b/bleach/html5lib_shim.py @@ -43,6 +43,7 @@ constants.tokenTypes['EmptyTag'] ]) CHARACTERS_TYPE = constants.tokenTypes['Characters'] +PARSEERROR_TYPE = constants.tokenTypes['ParseError'] #: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17 @@ -232,7 +233,21 @@ def __iter__(self): for token in super(BleachHTMLTokenizer, self).__iter__(): if last_error_token is not None: - if ((last_error_token['data'] == 'expected-closing-tag-but-got-char' and + if ((last_error_token['data'] == 'invalid-character-in-attribute-name' and + token['type'] in TAG_TOKEN_TYPES and + token.get('data'))): + # Remove attribute names that have ', " or < in them + # because those characters are invalid for attribute names. 
+ token['data'] = [ + item for item in token['data'] + if ('"' not in item[0] and + "'" not in item[0] and + '<' not in item[0]) + ] + last_error_token = None + yield token + + elif ((last_error_token['data'] == 'expected-closing-tag-but-got-char' and token['data'].lower().strip() not in self.parser.tags)): # We've got either a malformed tag or a pseudo-tag or # something that html5lib wants to turn into a malformed @@ -248,24 +263,33 @@ def __iter__(self): token['data'] = self.stream.get_tag() token['type'] = CHARACTERS_TYPE - # Yield the adjusted token + last_error_token = None yield token + elif token['type'] == PARSEERROR_TYPE: + # If the token is a parse error, then let the last_error_token + # go, and make token the new last_error_token + yield last_error_token + last_error_token = token + else: yield last_error_token yield token + last_error_token = None - last_error_token = None continue # If the token is a ParseError, we hold on to it so we can get the # next token and potentially fix it. - if token['type'] == constants.tokenTypes['ParseError']: + if token['type'] == PARSEERROR_TYPE: last_error_token = token continue yield token + if last_error_token: + yield last_error_token + def consumeEntity(self, allowedChar=None, fromAttribute=False): # If this tokenizer is set to consume entities, then we can let the # superclass do its thing. diff --git a/tests/test_linkify.py b/tests/test_linkify.py index d29a5c82..584a5b0d 100644 --- a/tests/test_linkify.py +++ b/tests/test_linkify.py @@ -69,6 +69,17 @@ def ft(attrs, new=False): ) +def test_invalid_attribute_names(): + """Test that "invalid-character-in-attribute-name" errors in tokenizing + result in attributes with invalid names get dropped. 
+ + """ + assert ( + linkify('') == + '' + ) + + @pytest.mark.parametrize('data,parse_email,expected', [ ( 'a james@example.com mailto', @@ -119,7 +130,7 @@ def test_email_link(data, parse_email, expected): assert linkify(data, parse_email=parse_email) == expected -@pytest.mark.parametrize('data,expected', [ +@pytest.mark.parametrize('data, expected', [ ( '"james"@example.com', '''"james"@example.com''' From 4a4c4956ff6bfd9e354b5db330f1bbd9f2828453 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Mon, 31 Dec 2018 16:15:35 -0500 Subject: [PATCH 09/18] Add note to CHANGES --- CHANGES | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES b/CHANGES index 31d62dcd..eb3dc23b 100644 --- a/CHANGES +++ b/CHANGES @@ -22,6 +22,9 @@ None * Add ``six>=1.9`` to requirements. Thank you, Dave Shawley (#416) +* Fix cases where attribute names could have invalid characters in them. + (#419) + Version 3.0.2 (October 11th, 2018) ---------------------------------- From 8d7fd48179b5020d9b1521be7b81e06648d868d3 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Wed, 12 Dec 2018 19:29:17 -0500 Subject: [PATCH 10/18] Convert & to & as a Characters token This fixes a problem in LinkifyFilter when using it with the Cleaner where the Cleaner sets up the tokenizer to not consume entities. So character entities end up in their own Entity tokens and Linkifyfilter can't match links that cross token boundaries. If there's a &, then LinkifyFilter won't match across that. This fixes that by converting & to & in the sanitizer when it's pulling out entities and putting them in separate Entity tokens. The & Characters tokens will get merged by BleachSanitizerFilter.__iter__ and & will get converted back to & in the serialier. 
Fixes #422 --- bleach/sanitizer.py | 13 ++++++++++++- tests/test_linkify.py | 4 ++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index c60c26b3..79b80f5b 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -395,7 +395,18 @@ def sanitize_characters(self, token): if part.startswith('&'): entity = html5lib_shim.match_entity(part) if entity is not None: - new_tokens.append({'type': 'Entity', 'name': entity}) + if entity == 'amp': + # LinkifyFilter can't match urls across token boundaries + # which is problematic with & since that shows up in + # querystrings all the time. This special-cases & + # and converts it to a & and sticks it in as a + # Characters token. It'll get merged with surrounding + # tokens in the BleachSanitizerfilter.__iter__ and + # escaped in the serializer. + new_tokens.append({'type': 'Characters', 'data': '&'}) + else: + new_tokens.append({'type': 'Entity', 'name': entity}) + # Length of the entity plus 2--one for & at the beginning # and and one for ; at the end remainder = part[len(entity) + 2:] diff --git a/tests/test_linkify.py b/tests/test_linkify.py index 584a5b0d..ab1c5134 100644 --- a/tests/test_linkify.py +++ b/tests/test_linkify.py @@ -694,6 +694,10 @@ def test_only_text_is_linkified(self): 'http://example.com?b=1&c=2', 'http://example.com?b=1&c=2' ), + ( + 'http://example.com?b=1&c=2', + 'http://example.com?b=1&c=2' + ), ( 'link: https://example.com/watch#anchor', 'link: https://example.com/watch#anchor' From cb156cb9054c34b817f8ed2dff92801a594b9107 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 8 Jan 2019 10:33:54 -0500 Subject: [PATCH 11/18] Fix parsing "meta" tag with encoding attribute When parsing a tag, the parser calls charEncoding and changeEncoding in the input stream, but the InputStreamWithMemory wrapper didn't have those methods. This fixes that. This also creates a new test set for BleachHTMLParser functionality. 
Fixes #431 --- CHANGES | 8 ++++- bleach/__init__.py | 2 +- bleach/html5lib_shim.py | 8 +++++ tests/test_html5lib_shim.py | 62 +++++++++++++++++++++++++++++++++++++ tests/test_linkify.py | 11 ------- 5 files changed, 78 insertions(+), 13 deletions(-) diff --git a/CHANGES b/CHANGES index eb3dc23b..4fe065e8 100644 --- a/CHANGES +++ b/CHANGES @@ -1,7 +1,7 @@ Bleach changes ============== -Version 3.0.3 (In development) +Version 3.1.0 (In development) ------------------------------ **Security fixes** @@ -25,6 +25,12 @@ None * Fix cases where attribute names could have invalid characters in them. (#419) +* Fix problems with ``LinkifyFilter`` not being able to match links + across ``&``. (#422) + +* Fix ``InputStreamWithMemory`` when the ``BleachHTMLParser`` is + parsing ``meta`` tags. (#431) + Version 3.0.2 (October 11th, 2018) ---------------------------------- diff --git a/bleach/__init__.py b/bleach/__init__.py index a6445d02..6249bf81 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -20,7 +20,7 @@ # yyyymmdd __releasedate__ = '' # x.y.z or x.y.z.dev0 -- semver -__version__ = '3.0.3.dev0' +__version__ = '3.1.0.dev0' VERSION = parse_version(__version__) diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py index 88876678..25e3e955 100644 --- a/bleach/html5lib_shim.py +++ b/bleach/html5lib_shim.py @@ -181,6 +181,14 @@ def __init__(self, inner_stream): def errors(self): return self._inner_stream.errors + @property + def charEncoding(self): + return self._inner_stream.charEncoding + + @property + def changeEncoding(self): + return self._inner_stream.changeEncoding + def char(self): c = self._inner_stream.char() # char() can return None if EOF, so ignore that diff --git a/tests/test_html5lib_shim.py b/tests/test_html5lib_shim.py index 5712d338..ce15de7e 100644 --- a/tests/test_html5lib_shim.py +++ b/tests/test_html5lib_shim.py @@ -80,3 +80,65 @@ def test_serializer(data, expected): serialized = serializer.render(walker(dom)) assert serialized == 
expected + + +@pytest.mark.parametrize('parser_args, data, expected', [ + # Make sure InputStreamWithMemory has charEncoding and changeEncoding + ( + {}, + '', + '' + ), + # Handle consume entities False--all entities are passed along and then + # escaped when serialized + ( + {'consume_entities': False}, + 'text &>"', + 'text &amp;&gt;&quot;' + ), + # Handle consume entities True--all entities are consumed and converted + # to their character equivalents and then &, <, and > are escaped when + # serialized + ( + {'consume_entities': True}, + 'text &>"', + 'text &>"' + ), + # Test that "invalid-character-in-attribute-name" errors in tokenizing + # result in attributes with invalid names getting dropped + ( + {}, + '', + '' + ), + ( + {}, + '', + '' + ) +]) +def test_bleach_html_parser(parser_args, data, expected): + args = { + 'tags': None, + 'strip': True, + 'consume_entities': True + } + args.update(parser_args) + + # Build a parser, walker, and serializer just like we do in clean() + parser = html5lib_shim.BleachHTMLParser(**args) + walker = html5lib_shim.getTreeWalker('etree') + serializer = html5lib_shim.BleachHTMLSerializer( + quote_attr_values='always', + omit_optional_tags=False, + escape_lt_in_attrs=True, + resolve_entities=False, + sanitize=False, + alphabetical_attributes=False, + ) + + # Parse, walk, and then serialize the output + dom = parser.parseFragment(data) + serialized = serializer.render(walker(dom)) + + assert serialized == expected diff --git a/tests/test_linkify.py b/tests/test_linkify.py index ab1c5134..f1211894 100644 --- a/tests/test_linkify.py +++ b/tests/test_linkify.py @@ -69,17 +69,6 @@ def ft(attrs, new=False): ) -def test_invalid_attribute_names(): - """Test that "invalid-character-in-attribute-name" errors in tokenizing - result in attributes with invalid names get dropped. 
- - """ - assert ( - linkify('') == - '' - ) - - @pytest.mark.parametrize('data,parse_email,expected', [ ( 'a james@example.com mailto', From 245c21c3cef788dbfdb380514434497866443e87 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 8 Jan 2019 12:11:15 -0500 Subject: [PATCH 12/18] Fix doctest failures This reworks the doctests to run and pass in Python 3. Fixes #357 --- docs/clean.rst | 46 +++++++++++++++++----------------- docs/linkify.rst | 64 ++++++++++++++++++++++++------------------------ 2 files changed, 55 insertions(+), 55 deletions(-) diff --git a/docs/clean.rst b/docs/clean.rst index 68178ce5..c786f2a3 100644 --- a/docs/clean.rst +++ b/docs/clean.rst @@ -63,10 +63,10 @@ For example: >>> import bleach >>> bleach.clean( - ... u'an example', + ... 'an example', ... tags=['b'], ... ) - u'<i>an example</i>' + '<i>an example</i>' The default value is a relatively conservative list found in @@ -106,12 +106,12 @@ For example: >>> import bleach >>> bleach.clean( - ... u'

blah blah blah

', + ... '

blah blah blah

', ... tags=['p'], ... attributes=['style'], ... styles=['color'], ... ) - u'

blah blah blah

' + '

blah blah blah

' As a dict @@ -135,11 +135,11 @@ and "class" for any tag (including "a" and "img"): ... } >>> bleach.clean( - ... u'an example', + ... 'an example', ... tags=['img'], ... attributes=attrs ... ) - u'an example' + 'an example' Using functions @@ -161,11 +161,11 @@ For example: ... return name[0] == 'h' >>> bleach.clean( - ... u'link', + ... 'link', ... tags=['a'], ... attributes=allow_h, ... ) - u'link' + 'link' You can also pass a callable as a value in an attributes dict and it'll run for @@ -173,7 +173,7 @@ attributes for specified tags: .. doctest:: - >>> from urlparse import urlparse + >>> from six.moves.urllib.parse import urlparse >>> import bleach >>> def allow_src(tag, name, value): @@ -185,13 +185,13 @@ attributes for specified tags: ... return False >>> bleach.clean( - ... u'an example', + ... 'an example', ... tags=['img'], ... attributes={ ... 'img': allow_src ... } ... ) - u'an example' + 'an example' .. versionchanged:: 2.0 @@ -223,12 +223,12 @@ For example, to allow users to set the color and font-weight of text: >>> styles = ['color', 'font-weight'] >>> bleach.clean( - ... u'

my html

', + ... '

my html

', ... tags=tags, ... attributes=attrs, ... styles=styles ... ) - u'

my html

' + '

my html

' Default styles are stored in ``bleach.sanitizer.ALLOWED_STYLES``. @@ -252,7 +252,7 @@ For example, this sets allowed protocols to http, https and smb: ... 'allowed protocol', ... protocols=['http', 'https', 'smb'] ... ) - u'allowed protocol' + 'allowed protocol' This adds smb to the Bleach-specified set of allowed protocols: @@ -265,7 +265,7 @@ This adds smb to the Bleach-specified set of allowed protocols: ... 'allowed protocol', ... protocols=bleach.ALLOWED_PROTOCOLS + ['smb'] ... ) - u'allowed protocol' + 'allowed protocol' Default protocols are in ``bleach.sanitizer.ALLOWED_PROTOCOLS``. @@ -284,10 +284,10 @@ and invalid markup. For example: >>> import bleach >>> bleach.clean('is not allowed') - u'<span>is not allowed</span>' + '<span>is not allowed</span>' >>> bleach.clean('is not allowed', tags=['b']) - u'<span>is not allowed</span>' + '<span>is not allowed</span>' If you would rather Bleach stripped this markup entirely, you can pass @@ -298,10 +298,10 @@ If you would rather Bleach stripped this markup entirely, you can pass >>> import bleach >>> bleach.clean('is not allowed', strip=True) - u'is not allowed' + 'is not allowed' >>> bleach.clean('is not allowed', tags=['b'], strip=True) - u'is not allowed' + 'is not allowed' Stripping comments (``strip_comments``) @@ -317,10 +317,10 @@ By default, Bleach will strip out HTML comments. To disable this behavior, set >>> html = 'my html' >>> bleach.clean(html) - u'my html' + 'my html' >>> bleach.clean(html, strip_comments=False) - u'my html' + 'my html' Using ``bleach.sanitizer.Cleaner`` @@ -353,7 +353,7 @@ Trivial Filter example: .. doctest:: >>> from bleach.sanitizer import Cleaner - >>> from html5lib.filters.base import Filter + >>> from bleach.html5lib_shim import Filter >>> class MooFilter(Filter): ... def __iter__(self): @@ -371,7 +371,7 @@ Trivial Filter example: >>> cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter]) >>> dirty = 'this is cute! ' >>> cleaner.clean(dirty) - u'this is cute! 
' + 'this is cute! ' .. Warning:: diff --git a/docs/linkify.rst b/docs/linkify.rst index b8e7884e..b5d9d20f 100644 --- a/docs/linkify.rst +++ b/docs/linkify.rst @@ -80,12 +80,12 @@ For example, you could add a ``title`` attribute to all links: >>> from bleach.linkifier import Linker >>> def set_title(attrs, new=False): - ... attrs[(None, u'title')] = u'link in user text' + ... attrs[(None, 'title')] = 'link in user text' ... return attrs ... >>> linker = Linker(callbacks=[set_title]) >>> linker.linkify('abc http://example.com def') - u'abc http://example.com def' + 'abc http://example.com def' This would set the value of the ``rel`` attribute, stomping on a previous value @@ -96,21 +96,21 @@ an external link: .. doctest:: - >>> from urlparse import urlparse + >>> from six.moves.urllib.parse import urlparse >>> from bleach.linkifier import Linker >>> def set_target(attrs, new=False): - ... p = urlparse(attrs[(None, u'href')]) + ... p = urlparse(attrs[(None, 'href')]) ... if p.netloc not in ['my-domain.com', 'other-domain.com']: - ... attrs[(None, u'target')] = u'_blank' - ... attrs[(None, u'class')] = u'external' + ... attrs[(None, 'target')] = '_blank' + ... attrs[(None, 'class')] = 'external' ... else: - ... attrs.pop((None, u'target'), None) + ... attrs.pop((None, 'target'), None) ... return attrs ... >>> linker = Linker(callbacks=[set_target]) >>> linker.linkify('abc http://example.com def') - u'abc http://example.com def' + 'abc http://example.com def' Removing Attributes @@ -127,17 +127,17 @@ sanitizing attributes.) >>> def allowed_attrs(attrs, new=False): ... """Only allow href, target, rel and title.""" ... allowed = [ - ... (None, u'href'), - ... (None, u'target'), - ... (None, u'rel'), - ... (None, u'title'), - ... u'_text', + ... (None, 'href'), + ... (None, 'target'), + ... (None, 'rel'), + ... (None, 'title'), + ... '_text', ... ] ... return dict((k, v) for k, v in attrs.items() if k in allowed) ... 
>>> linker = Linker(callbacks=[allowed_attrs]) >>> linker.linkify('link') - u'link' + 'link' Or you could remove a specific attribute, if it exists: @@ -147,15 +147,15 @@ Or you could remove a specific attribute, if it exists: >>> from bleach.linkifier import Linker >>> def remove_title(attrs, new=False): - ... attrs.pop((None, u'title'), None) + ... attrs.pop((None, 'title'), None) ... return attrs ... >>> linker = Linker(callbacks=[remove_title]) >>> linker.linkify('link') - u'link' + 'link' >>> linker.linkify('link') - u'link' + 'link' Altering Attributes @@ -177,14 +177,14 @@ Example of shortening link text: ... if not new: ... return attrs ... # _text will be the same as the URL for new links - ... text = attrs[u'_text'] + ... text = attrs['_text'] ... if len(text) > 25: - ... attrs[u'_text'] = text[0:22] + u'...' + ... attrs['_text'] = text[0:22] + '...' ... return attrs ... >>> linker = Linker(callbacks=[shorten_url]) >>> linker.linkify('http://example.com/longlonglonglonglongurl') - u'http://example.com/lon...' + 'http://example.com/lon...' Example of switching all links to go through a bouncer first: @@ -196,7 +196,7 @@ Example of switching all links to go through a bouncer first: >>> def outgoing_bouncer(attrs, new=False): ... """Send outgoing links through a bouncer.""" - ... href_key = (None, u'href') + ... href_key = (None, 'href') ... p = urlparse(attrs.get(href_key, None)) ... if p.netloc not in ['example.com', 'www.example.com', '']: ... bouncer = 'http://bn.ce/?destination=%s' @@ -205,10 +205,10 @@ Example of switching all links to go through a bouncer first: ... >>> linker = Linker(callbacks=[outgoing_bouncer]) >>> linker.linkify('http://example.com') - u'http://example.com' + 'http://example.com' >>> linker.linkify('http://foo.com') - u'http://foo.com' + 'http://foo.com' Preventing Links @@ -230,7 +230,7 @@ write the following callback: ... return attrs ... # If the TLD is '.py', make sure it starts with http: or https:. ... 
# Use _text because that's the original text - ... link_text = attrs[u'_text'] + ... link_text = attrs['_text'] ... if link_text.endswith('.py') and not link_text.startswith(('http:', 'https:')): ... # This looks like a Python file, not a URL. Don't make a link. ... return None @@ -239,10 +239,10 @@ write the following callback: ... >>> linker = Linker(callbacks=[dont_linkify_python]) >>> linker.linkify('abc http://example.com def') - u'abc http://example.com def' + 'abc http://example.com def' >>> linker.linkify('abc models.py def') - u'abc models.py def' + 'abc models.py def' .. _Crate: https://crate.io/ @@ -261,13 +261,13 @@ For example, this removes any ``mailto:`` links: >>> from bleach.linkifier import Linker >>> def remove_mailto(attrs, new=False): - ... if attrs[(None, u'href')].startswith(u'mailto:'): + ... if attrs[(None, 'href')].startswith('mailto:'): ... return None ... return attrs ... >>> linker = Linker(callbacks=[remove_mailto]) >>> linker.linkify('mail janet!') - u'mail janet!' + 'mail janet!' Skipping links in specified tag blocks (``skip_tags``) @@ -308,7 +308,7 @@ instance. >>> linker = Linker(skip_tags=['pre']) >>> linker.linkify('a b c http://example.com d e f') - u'a b c http://example.com d e f' + 'a b c http://example.com d e f' .. autoclass:: bleach.linkifier.Linker @@ -340,11 +340,11 @@ For example, using all the defaults: >>> cleaner = Cleaner(tags=['pre']) >>> cleaner.clean('
http://example.com
') - u'
http://example.com
' + '
http://example.com
' >>> cleaner = Cleaner(tags=['pre'], filters=[LinkifyFilter]) >>> cleaner.clean('
http://example.com
') - u'
http://example.com
' + '
http://example.com
' And passing parameters to ``LinkifyFilter``: @@ -362,7 +362,7 @@ And passing parameters to ``LinkifyFilter``: ... ) ... >>> cleaner.clean('
http://example.com
') - u'
http://example.com
' + '
http://example.com
' .. autoclass:: bleach.linkifier.LinkifyFilter From ad910ce30926f8698cf7c8f4ec8b32d00d0897b2 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Tue, 8 Jan 2019 12:05:35 -0500 Subject: [PATCH 13/18] Update for 3.1.0 release --- CHANGES | 6 ++++-- CONTRIBUTORS | 12 ++++++++++-- bleach/__init__.py | 4 ++-- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/CHANGES b/CHANGES index 4fe065e8..838393b3 100644 --- a/CHANGES +++ b/CHANGES @@ -1,8 +1,8 @@ Bleach changes ============== -Version 3.1.0 (In development) ------------------------------- +Version 3.1.0 (January 9th, 2019) +--------------------------------- **Security fixes** @@ -31,6 +31,8 @@ None * Fix ``InputStreamWithMemory`` when the ``BleachHTMLParser`` is parsing ``meta`` tags. (#431) +* Fix doctests. (#357) + Version 3.0.2 (October 11th, 2018) ---------------------------------- diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 5783ab17..2b0137d0 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -18,21 +18,25 @@ Contributors: - Adam Lofts - Adrian "ThiefMaster" - Alek -- Alexandre Macabies -- Alexandr N. Zamaraev - Alex Defsen - Alex Ehlke +- Alexandre Macabies +- Alexandr N. 
Zamaraev - Alireza Savand - Andreas Malecki - Andy Freeland - Antoine Leclair +- Anton Backer - Anton Kovalyov +- Chad Birch - Chris Beaven - Dan Gayle +- dave-shawley - Erik Rose - Gaurav Dadhania - Geoffrey Sneddon - Greg Guthe +- hugovk - Istvan Albert - Jaime Irurzun - James Socol @@ -49,6 +53,7 @@ Contributors: - Mark Lee - Mark Paschal - mdxs +- Nikita Sobolev - nikolas - Oh Jinkyun - Paul Craciunoiu @@ -56,8 +61,11 @@ Contributors: - Ryan Niemeyer - Sébastien Fievet - sedrubal +- Stephane Blondon +- Stu Cox - Tim Dumol - Timothy Fitz +- Vadim Kotov - Vitaly Volkov - Will Kahn-Greene - Zoltán diff --git a/bleach/__init__.py b/bleach/__init__.py index 6249bf81..9816549b 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -18,9 +18,9 @@ # yyyymmdd -__releasedate__ = '' +__releasedate__ = '20190109' # x.y.z or x.y.z.dev0 -- semver -__version__ = '3.1.0.dev0' +__version__ = '3.1.0' VERSION = parse_version(__version__) From 996cde7a2439a2323f9c4b2567c8b8449d393351 Mon Sep 17 00:00:00 2001 From: Greg Guthe Date: Thu, 13 Feb 2020 16:09:52 -0500 Subject: [PATCH 14/18] fix bug 1615315 --- bleach/html5lib_shim.py | 7 ++++++- tests/test_clean.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py index 25e3e955..169c4027 100644 --- a/bleach/html5lib_shim.py +++ b/bleach/html5lib_shim.py @@ -376,7 +376,12 @@ def __init__(self, tags, strip, consume_entities, **kwargs): self.consume_entities = consume_entities super(BleachHTMLParser, self).__init__(**kwargs) - def _parse(self, stream, innerHTML=False, container='div', scripting=False, **kwargs): + def _parse(self, stream, innerHTML=False, container='div', scripting=True, **kwargs): + # set scripting=True to parse