From e05edfc076c48a4a1fd9adc9d32f41a395830eda Mon Sep 17 00:00:00 2001
From: Chad Birch <chad.birch@gmail.com>
Date: Thu, 11 Oct 2018 15:51:18 -0600
Subject: [PATCH 01/18] Define a full list of valid elements for HTML_TAGS

Previously, the list of valid HTML tags being used by linkify was a
combination of various element lists from the html5lib constants.
However, these lists don't cover all of the valid HTML elements, so
linkify was escaping some valid tags (including <abbr>, <span>, and
more).

This commit just defines a full list of valid, non-deprecated HTML
elements for linkify to use instead.
---
 bleach/html5lib_shim.py | 122 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 111 insertions(+), 11 deletions(-)
diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py
index 4e6e054f..50d7d3e9 100644
--- a/bleach/html5lib_shim.py
+++ b/bleach/html5lib_shim.py
@@ -45,19 +45,119 @@
 CHARACTERS_TYPE = constants.tokenTypes['Characters']
 
 
-#: List of HTML tags
+#: List of valid HTML tags
 HTML_TAGS = [
-    tag for namespace, tag in
-    (
-        list(constants.scopingElements) +
-        list(constants.formattingElements) +
-        list(constants.specialElements) +
-        list(constants.htmlIntegrationPointElements) +
-        list(constants.mathmlTextIntegrationPointElements)
-    )
+    'a',
+    'abbr',
+    'address',
+    'area',
+    'article',
+    'aside',
+    'audio',
+    'b',
+    'base',
+    'bdi',
+    'bdo',
+    'blockquote',
+    'body',
+    'br',
+    'button',
+    'canvas',
+    'caption',
+    'cite',
+    'code',
+    'col',
+    'colgroup',
+    'command',
+    'data',
+    'datalist',
+    'dd',
+    'del',
+    'details',
+    'dfn',
+    'dialog',
+    'div',
+    'dl',
+    'dt',
+    'em',
+    'embed',
+    'fieldset',
+    'figcaption',
+    'figure',
+    'footer',
+    'form',
+    'h1',
+    'h2',
+    'h3',
+    'h4',
+    'h5',
+    'h6',
+    'head',
+    'header',
+    'hgroup',
+    'hr',
+    'html',
+    'i',
+    'iframe',
+    'img',
+    'input',
+    'ins',
+    'kbd',
+    'keygen',
+    'label',
+    'legend',
+    'li',
+    'link',
+    'map',
+    'mark',
+    'menu',
+    'meta',
+    'meter',
+    'nav',
+    'noscript',
+    'object',
+    'ol',
+    'optgroup',
+    'option',
+    'output',
+    'p',
+    'param',
+    'pre',
+    'progress',
+    'q',
+    'rp',
+    'rt',
+    'ruby',
+    's',
+    'samp',
+    'script',
+    'section',
+    'select',
+    'small',
+    'source',
+    'span',
+    'strong',
+    'style',
+    'sub',
+    'summary',
+    'sup',
+    'table',
+    'tbody',
+    'td',
+    'textarea',
+    'tfoot',
+    'th',
+    'thead',
+    'time',
+    'title',
+    'tr',
+    'track',
+    'u',
+    'ul',
+    'var',
+    'video',
+    'wbr',
 ]
-# Add tags that aren't in html5lib.constants
-HTML_TAGS.extend(['abbr'])
 
 
 class InputStreamWithMemory(object):

From 6f16fec8cff27ff665efe0b8ac22ef1cecf8c8dd Mon Sep 17 00:00:00 2001
From: jonathan vanasco <jonathan@2xlp.com>
Date: Tue, 16 Oct 2018 17:16:01 -0400
Subject: [PATCH 02/18] fixed docs on callback location

---
 docs/linkify.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/linkify.rst b/docs/linkify.rst
index 6665300a..d60e17b6 100644
--- a/docs/linkify.rst
+++ b/docs/linkify.rst
@@ -58,7 +58,7 @@ links will be removed leaving the innerText left in its place.
 The default callback adds ``rel="nofollow"``. See ``bleach.callbacks`` for some
 included callback functions.
 
-This defaults to ``bleach.linkify.DEFAULT_CALLBACKS``.
+This defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``.
 
 
 .. autodata:: bleach.linkifier.DEFAULT_CALLBACKS

From d445230222ec710542c89046103c4bad715f7c0a Mon Sep 17 00:00:00 2001
From: Chad Birch <chad.birch@gmail.com>
Date: Wed, 17 Oct 2018 14:54:45 -0600
Subject: [PATCH 03/18] Clarify source of HTML tag list, fix discrepancies

---
 bleach/html5lib_shim.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py
index 50d7d3e9..63a4f25c 100644
--- a/bleach/html5lib_shim.py
+++ b/bleach/html5lib_shim.py
@@ -45,7 +45,8 @@
 CHARACTERS_TYPE = constants.tokenTypes['Characters']
 
 
-#: List of valid HTML tags
+#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
+#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
 HTML_TAGS = [
     'a',
     'abbr',
@@ -68,7 +69,6 @@
     'code',
     'col',
     'colgroup',
-    'command',
     'data',
     'datalist',
     'dd',
@@ -122,6 +122,7 @@
     'output',
     'p',
     'param',
+    'picture',
     'pre',
     'progress',
     'q',
@@ -133,6 +134,7 @@
     'script',
     'section',
     'select',
+    'slot',
     'small',
     'source',
     'span',
@@ -144,6 +146,7 @@
     'table',
     'tbody',
     'td',
+    'template',
     'textarea',
     'tfoot',
     'th',

From 5c72a54b5c7e1286147bf20328465f3867878d2b Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene <willkg@mozilla.com>
Date: Thu, 25 Oct 2018 16:05:42 -0400
Subject: [PATCH 04/18] Fix regex strings--they should be marked raw

---
 bleach/sanitizer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index 9ba4c57b..c60c26b3 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -425,7 +425,7 @@ def sanitize_uri_value(self, value, allowed_protocols):
 
         # Nix backtick, space characters, and control characters
         new_value = re.sub(
-            "[`\000-\040\177-\240\s]+",
+            r"[`\000-\040\177-\240\s]+",
             '',
             new_value
         )
@@ -574,7 +574,7 @@ def sanitize_css(self, style):
         style = html5lib_shim.convert_entities(style)
 
         # Drop any url values before we do anything else
-        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
+        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
 
         # The gauntlet of sanitization
 
@@ -589,11 +589,11 @@ def sanitize_css(self, style):
             if not gauntlet.match(part):
                 return ''
 
-        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
+        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
             return ''
 
         clean = []
-        for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style):
+        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
             if not value:
                 continue
 

From bdb28650a2ffe7a994f59b4a958e8b8763b53b64 Mon Sep 17 00:00:00 2001
From: dave-shawley <daveshawley@gmail.com>
Date: Mon, 29 Oct 2018 12:47:56 -0400
Subject: [PATCH 05/18] setup.py: six >=1.9 is required. (#416)

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 0de45ad4..bf01a4ed 100755
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@
 ]
 
 install_requires = [
-    'six',
+    'six>=1.9.0',
     # html5lib requirements
     'webencodings',
 ]

From 83738b9dd8ac5242fe1ca54059b1ca1310ac6610 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene <willkg@mozilla.com>
Date: Wed, 12 Dec 2018 16:58:32 -0500
Subject: [PATCH 06/18] Add recognized_tags to Linker arguments

recognized_tags lets someone specify a different set of recognized tags
to the Linker when they're using it by itself. One use of this is if
you're doing a clean pass, storing that, and then linkifying on demand.
The recognized_tags argument lets you re-use the allowed tags from the
clean step and then there aren't any additional oddities occurring in
the linkify step.

Fixes #409
---
 bleach/linkifier.py   |  7 +++++--
 docs/linkify.rst      | 21 +++++++++++----------
 tests/test_linkify.py | 17 +++++++++++++++++
 3 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/bleach/linkifier.py b/bleach/linkifier.py
index 5d815f8b..95baba14 100644
--- a/bleach/linkifier.py
+++ b/bleach/linkifier.py
@@ -85,7 +85,7 @@ class Linker(object):
 
     """
     def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False,
-                 url_re=URL_RE, email_re=EMAIL_RE):
+                 url_re=URL_RE, email_re=EMAIL_RE, recognized_tags=html5lib_shim.HTML_TAGS):
         """Creates a Linker instance
 
         :arg list callbacks: list of callbacks to run when adjusting tag attributes;
@@ -101,6 +101,9 @@ def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=Fals
 
         :arg re email_re: email matching regex
 
+        :arg list-of-strings recognized_tags: the list of tags that linkify knows about;
+            everything else gets escaped
+
         :returns: linkified text as unicode
 
         """
@@ -113,7 +116,7 @@ def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=Fals
         # Create a parser/tokenizer that allows all HTML tags and escapes
         # anything not in that list.
         self.parser = html5lib_shim.BleachHTMLParser(
-            tags=html5lib_shim.HTML_TAGS,
+            tags=recognized_tags,
             strip=False,
             consume_entities=True,
             namespaceHTMLElements=False,
diff --git a/docs/linkify.rst b/docs/linkify.rst
index d60e17b6..b8e7884e 100644
--- a/docs/linkify.rst
+++ b/docs/linkify.rst
@@ -12,13 +12,14 @@ For example, you could pass in text and have all URL things converted into
 HTML links.
 
 It works by parsing the text as HTML and building a document tree. In this
-way, it's guaranteed never to do weird things to URLs in attribute values,
-can modify the value of attributes on ``<a>`` tags and can even do things
-like skip ``<pre>`` sections.
+way, you're guaranteed to get valid HTML back without weird things like
+having URLs in tag attributes getting linkified.
 
-If you plan to sanitize/clean the text and linkify it, you should do that
-in a single pass using :ref:`LinkifyFilter <linkify-LinkifyFilter>`. This
-is faster and it'll use the list of allowed tags from clean.
+.. note::
+
+   If you plan to sanitize/clean the text and linkify it, you should do that
+   in a single pass using :ref:`LinkifyFilter <linkify-LinkifyFilter>`. This
+   is faster and it'll use the list of allowed tags from clean.
 
 .. note::
 
@@ -297,8 +298,8 @@ writing callbacks that may need to behave differently if the protocol is
 Using ``bleach.linkifier.Linker``
 =================================
 
-If you're linking a lot of text and passing the same argument values or you want
-more configurability, consider using a :py:class:`bleach.linkifier.Linker`
+If you're linking a lot of text and passing the same argument values or you
+need more configurability, consider using a :py:class:`bleach.linkifier.Linker`
 instance.
 
 .. doctest::
@@ -325,8 +326,8 @@ Using ``bleach.linkifier.LinkifyFilter``
 the ``bleach.linkifier.LinkifyFilter`` when walking the tree and serializing it
 back into text.
 
-You can use this filter wherever you can use an html5lib Filter. For example, you
-could use it with ``bleach.Cleaner`` to clean and linkify in one step.
+You can use this filter wherever you can use an html5lib Filter. This lets you
+use it with ``bleach.Cleaner`` to clean and linkify in one step.
 
 For example, using all the defaults:
 
diff --git a/tests/test_linkify.py b/tests/test_linkify.py
index eeea3e32..d29a5c82 100644
--- a/tests/test_linkify.py
+++ b/tests/test_linkify.py
@@ -625,6 +625,23 @@ def test_email_re_arg():
     )
 
 
+def test_recognized_tags_arg():
+    """Verifies that recognized_tags works"""
+    # The html parser doesn't recognize "sarcasm" as a tag, so it escapes it
+    linker = Linker(recognized_tags=['p'])
+    assert (
+        linker.linkify('<p>http://example.com/</p><sarcasm>') ==
+        '<p><a href="http://example.com/" rel="nofollow">http://example.com/</a></p>&lt;sarcasm&gt;'  # noqa
+    )
+
+    # The html parser recognizes "sarcasm" as a tag and fixes it
+    linker = Linker(recognized_tags=['p', 'sarcasm'])
+    assert (
+        linker.linkify('<p>http://example.com/</p><sarcasm>') ==
+        '<p><a href="http://example.com/" rel="nofollow">http://example.com/</a></p><sarcasm></sarcasm>'  # noqa
+    )
+
+
 def test_linkify_idempotent():
     dirty = '<span>invalid & </span> < extra http://link.com<em>'
     assert linkify(linkify(dirty)) == linkify(dirty)

From 821a0ff8959b96d91f1e2b1779c9ad8b42da4173 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene <willkg@mozilla.com>
Date: Wed, 12 Dec 2018 19:12:37 -0500
Subject: [PATCH 07/18] Update for 3.0.3 development

---
 CHANGES            | 22 ++++++++++++++++++++++
 bleach/__init__.py |  4 ++--
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/CHANGES b/CHANGES
index 176dde1c..31d62dcd 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,6 +1,28 @@
 Bleach changes
 ==============
 
+Version 3.0.3 (In development)
+------------------------------
+
+**Security fixes**
+
+None
+
+**Backwards incompatible changes**
+
+None
+
+**Features**
+
+* Add ``recognized_tags`` argument to the linkify ``Linker`` class. This
+  fixes issues when linkifying on its own and having some tags get escaped.
+  It defaults to a list of HTML5 tags. Thank you, Chad Birch! (#409)
+
+**Bug fixes**
+
+* Add ``six>=1.9`` to requirements. Thank you, Dave Shawley (#416)
+
+
 Version 3.0.2 (October 11th, 2018)
 ----------------------------------
 
diff --git a/bleach/__init__.py b/bleach/__init__.py
index 8ed01763..14049e2d 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -18,9 +18,9 @@
 
 
 # yyyymmdd
-__releasedate__ = '20181011'
+__releasedate__ = ''
 # x.y.z or x.y.z.dev0 -- semver
-__version__ = '3.0.2'
+__version__ = '3.0.3'
 VERSION = parse_version(__version__)
 
 

From 33c4a23886ed7e33995529a25247a724b1feac1c Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene <willkg@mozilla.com>
Date: Fri, 28 Dec 2018 10:05:20 -0500
Subject: [PATCH 08/18] Drop invalid attribute names (#419)

It's possible for the tokenizer to kick up an
invalid-character-in-attribute-name error. When it does that, the
BleachHTMLTokenizer should drop the attribute with the invalid name.
This fixes that.
---
 bleach/__init__.py      |  2 +-
 bleach/html5lib_shim.py | 32 ++++++++++++++++++++++++++++----
 tests/test_linkify.py   | 13 ++++++++++++-
 3 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/bleach/__init__.py b/bleach/__init__.py
index 14049e2d..a6445d02 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -20,7 +20,7 @@
 # yyyymmdd
 __releasedate__ = ''
 # x.y.z or x.y.z.dev0 -- semver
-__version__ = '3.0.3'
+__version__ = '3.0.3.dev0'
 VERSION = parse_version(__version__)
 
 
diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py
index 63a4f25c..88876678 100644
--- a/bleach/html5lib_shim.py
+++ b/bleach/html5lib_shim.py
@@ -43,6 +43,7 @@
     constants.tokenTypes['EmptyTag']
 ])
 CHARACTERS_TYPE = constants.tokenTypes['Characters']
+PARSEERROR_TYPE = constants.tokenTypes['ParseError']
 
 
 #: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
@@ -232,7 +233,21 @@ def __iter__(self):
 
         for token in super(BleachHTMLTokenizer, self).__iter__():
             if last_error_token is not None:
-                if ((last_error_token['data'] == 'expected-closing-tag-but-got-char' and
+                if ((last_error_token['data'] == 'invalid-character-in-attribute-name' and
+                     token['type'] in TAG_TOKEN_TYPES and
+                     token.get('data'))):
+                    # Remove attribute names that have ', " or < in them
+                    # because those characters are invalid for attribute names.
+                    token['data'] = [
+                        item for item in token['data']
+                        if ('"' not in item[0] and
+                            "'" not in item[0] and
+                            '<' not in item[0])
+                    ]
+                    last_error_token = None
+                    yield token
+
+                elif ((last_error_token['data'] == 'expected-closing-tag-but-got-char' and
                      token['data'].lower().strip() not in self.parser.tags)):
                     # We've got either a malformed tag or a pseudo-tag or
                     # something that html5lib wants to turn into a malformed
@@ -248,24 +263,33 @@ def __iter__(self):
                     token['data'] = self.stream.get_tag()
                     token['type'] = CHARACTERS_TYPE
 
-                    # Yield the adjusted token
+                    last_error_token = None
                     yield token
 
+                elif token['type'] == PARSEERROR_TYPE:
+                    # If the token is a parse error, then let the last_error_token
+                    # go, and make token the new last_error_token
+                    yield last_error_token
+                    last_error_token = token
+
                 else:
                     yield last_error_token
                     yield token
+                    last_error_token = None
 
-                last_error_token = None
                 continue
 
             # If the token is a ParseError, we hold on to it so we can get the
             # next token and potentially fix it.
-            if token['type'] == constants.tokenTypes['ParseError']:
+            if token['type'] == PARSEERROR_TYPE:
                 last_error_token = token
                 continue
 
             yield token
 
+        if last_error_token:
+            yield last_error_token
+
     def consumeEntity(self, allowedChar=None, fromAttribute=False):
         # If this tokenizer is set to consume entities, then we can let the
         # superclass do its thing.
diff --git a/tests/test_linkify.py b/tests/test_linkify.py
index d29a5c82..584a5b0d 100644
--- a/tests/test_linkify.py
+++ b/tests/test_linkify.py
@@ -69,6 +69,17 @@ def ft(attrs, new=False):
     )
 
 
+def test_invalid_attribute_names():
+    """Test that "invalid-character-in-attribute-name" errors in tokenizing
+    result in attributes with invalid names get dropped.
+
+    """
+    assert (
+        linkify('<a href="http://example.com/"">') ==
+        '<a href="http://example.com/" rel="nofollow"></a>'
+    )
+
+
 @pytest.mark.parametrize('data,parse_email,expected', [
     (
         'a james@example.com mailto',
@@ -119,7 +130,7 @@ def test_email_link(data, parse_email, expected):
     assert linkify(data, parse_email=parse_email) == expected
 
 
-@pytest.mark.parametrize('data,expected', [
+@pytest.mark.parametrize('data, expected', [
     (
         '"james"@example.com',
         '''<a href='mailto:"james"@example.com'>"james"@example.com</a>'''

From 4a4c4956ff6bfd9e354b5db330f1bbd9f2828453 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene <willkg@mozilla.com>
Date: Mon, 31 Dec 2018 16:15:35 -0500
Subject: [PATCH 09/18] Add note to CHANGES

---
 CHANGES | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGES b/CHANGES
index 31d62dcd..eb3dc23b 100644
--- a/CHANGES
+++ b/CHANGES
@@ -22,6 +22,9 @@ None
 
 * Add ``six>=1.9`` to requirements. Thank you, Dave Shawley (#416)
 
+* Fix cases where attribute names could have invalid characters in them.
+  (#419)
+
 
 Version 3.0.2 (October 11th, 2018)
 ----------------------------------

From 8d7fd48179b5020d9b1521be7b81e06648d868d3 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene <willkg@mozilla.com>
Date: Wed, 12 Dec 2018 19:29:17 -0500
Subject: [PATCH 10/18] Convert &amp; to & as a Characters token

This fixes a problem in LinkifyFilter when using it with the Cleaner where
the Cleaner sets up the tokenizer to not consume entities. So character
entities end up in their own Entity tokens and LinkifyFilter can't match
links that cross token boundaries. If there's a &amp;, then LinkifyFilter
won't match across that.

This fixes that by converting &amp; to & in the sanitizer when it's pulling out
entities and putting them in separate Entity tokens. The & Characters tokens
will get merged by BleachSanitizerFilter.__iter__ and & will get converted
back to &amp; in the serializer.

Fixes #422
---
 bleach/sanitizer.py   | 13 ++++++++++++-
 tests/test_linkify.py |  4 ++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index c60c26b3..79b80f5b 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -395,7 +395,18 @@ def sanitize_characters(self, token):
             if part.startswith('&'):
                 entity = html5lib_shim.match_entity(part)
                 if entity is not None:
-                    new_tokens.append({'type': 'Entity', 'name': entity})
+                    if entity == 'amp':
+                        # LinkifyFilter can't match urls across token boundaries
+                        # which is problematic with &amp; since that shows up in
+                        # querystrings all the time. This special-cases &amp;
+                        # and converts it to a & and sticks it in as a
+                        # Characters token. It'll get merged with surrounding
+                        # tokens in the BleachSanitizerfilter.__iter__ and
+                        # escaped in the serializer.
+                        new_tokens.append({'type': 'Characters', 'data': '&'})
+                    else:
+                        new_tokens.append({'type': 'Entity', 'name': entity})
+
                     # Length of the entity plus 2--one for & at the beginning
                     # and and one for ; at the end
                     remainder = part[len(entity) + 2:]
diff --git a/tests/test_linkify.py b/tests/test_linkify.py
index 584a5b0d..ab1c5134 100644
--- a/tests/test_linkify.py
+++ b/tests/test_linkify.py
@@ -694,6 +694,10 @@ def test_only_text_is_linkified(self):
         'http://example.com?b=1&c=2',
         '<a href="http://example.com?b=1&amp;c=2">http://example.com?b=1&amp;c=2</a>'
     ),
+    (
+        'http://example.com?b=1&amp;c=2',
+        '<a href="http://example.com?b=1&amp;c=2">http://example.com?b=1&amp;c=2</a>'
+    ),
     (
         'link: https://example.com/watch#anchor',
         'link: <a href="https://example.com/watch#anchor">https://example.com/watch#anchor</a>'

From cb156cb9054c34b817f8ed2dff92801a594b9107 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene <willkg@mozilla.com>
Date: Tue, 8 Jan 2019 10:33:54 -0500
Subject: [PATCH 11/18] Fix parsing "meta" tag with encoding attribute

When parsing a <meta encoding=""> tag, the parser calls charEncoding
and changeEncoding in the input stream, but the InputStreamWithMemory
wrapper didn't have those methods. This fixes that.

This also creates a new test set for BleachHTMLParser functionality.

Fixes #431
---
 CHANGES                     |  8 ++++-
 bleach/__init__.py          |  2 +-
 bleach/html5lib_shim.py     |  8 +++++
 tests/test_html5lib_shim.py | 62 +++++++++++++++++++++++++++++++++++++
 tests/test_linkify.py       | 11 -------
 5 files changed, 78 insertions(+), 13 deletions(-)

diff --git a/CHANGES b/CHANGES
index eb3dc23b..4fe065e8 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,7 +1,7 @@
 Bleach changes
 ==============
 
-Version 3.0.3 (In development)
+Version 3.1.0 (In development)
 ------------------------------
 
 **Security fixes**
@@ -25,6 +25,12 @@ None
 * Fix cases where attribute names could have invalid characters in them.
   (#419)
 
+* Fix problems with ``LinkifyFilter`` not being able to match links
+  across ``&amp;``. (#422)
+
+* Fix ``InputStreamWithMemory`` when the ``BleachHTMLParser`` is
+  parsing ``meta`` tags. (#431)
+
 
 Version 3.0.2 (October 11th, 2018)
 ----------------------------------
diff --git a/bleach/__init__.py b/bleach/__init__.py
index a6445d02..6249bf81 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -20,7 +20,7 @@
 # yyyymmdd
 __releasedate__ = ''
 # x.y.z or x.y.z.dev0 -- semver
-__version__ = '3.0.3.dev0'
+__version__ = '3.1.0.dev0'
 VERSION = parse_version(__version__)
 
 
diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py
index 88876678..25e3e955 100644
--- a/bleach/html5lib_shim.py
+++ b/bleach/html5lib_shim.py
@@ -181,6 +181,14 @@ def __init__(self, inner_stream):
     def errors(self):
         return self._inner_stream.errors
 
+    @property
+    def charEncoding(self):
+        return self._inner_stream.charEncoding
+
+    @property
+    def changeEncoding(self):
+        return self._inner_stream.changeEncoding
+
     def char(self):
         c = self._inner_stream.char()
         # char() can return None if EOF, so ignore that
diff --git a/tests/test_html5lib_shim.py b/tests/test_html5lib_shim.py
index 5712d338..ce15de7e 100644
--- a/tests/test_html5lib_shim.py
+++ b/tests/test_html5lib_shim.py
@@ -80,3 +80,65 @@ def test_serializer(data, expected):
     serialized = serializer.render(walker(dom))
 
     assert serialized == expected
+
+
+@pytest.mark.parametrize('parser_args, data, expected', [
+    # Make sure InputStreamWithMemory has charEncoding and changeEncoding
+    (
+        {},
+        '<meta charset="utf-8">',
+        '<meta charset="utf-8">'
+    ),
+    # Handle consume entities False--all entities are passed along and then
+    # escaped when serialized
+    (
+        {'consume_entities': False},
+        'text &amp;&gt;&quot;',
+        'text &amp;amp;&amp;gt;&amp;quot;'
+    ),
+    # Handle consume entities True--all entities are consumed and converted
+    # to their character equivalents and then &, <, and > are escaped when
+    # serialized
+    (
+        {'consume_entities': True},
+        'text &amp;&gt;&quot;',
+        'text &amp;&gt;"'
+    ),
+    # Test that "invalid-character-in-attribute-name" errors in tokenizing
+    # result in attributes with invalid names getting dropped
+    (
+        {},
+        '<a href="http://example.com"">',
+        '<a href="http://example.com"></a>'
+    ),
+    (
+        {},
+        '<a href=\'http://example.com\'\'>',
+        '<a href="http://example.com"></a>'
+    )
+])
+def test_bleach_html_parser(parser_args, data, expected):
+    args = {
+        'tags': None,
+        'strip': True,
+        'consume_entities': True
+    }
+    args.update(parser_args)
+
+    # Build a parser, walker, and serializer just like we do in clean()
+    parser = html5lib_shim.BleachHTMLParser(**args)
+    walker = html5lib_shim.getTreeWalker('etree')
+    serializer = html5lib_shim.BleachHTMLSerializer(
+        quote_attr_values='always',
+        omit_optional_tags=False,
+        escape_lt_in_attrs=True,
+        resolve_entities=False,
+        sanitize=False,
+        alphabetical_attributes=False,
+    )
+
+    # Parse, walk, and then serialize the output
+    dom = parser.parseFragment(data)
+    serialized = serializer.render(walker(dom))
+
+    assert serialized == expected
diff --git a/tests/test_linkify.py b/tests/test_linkify.py
index ab1c5134..f1211894 100644
--- a/tests/test_linkify.py
+++ b/tests/test_linkify.py
@@ -69,17 +69,6 @@ def ft(attrs, new=False):
     )
 
 
-def test_invalid_attribute_names():
-    """Test that "invalid-character-in-attribute-name" errors in tokenizing
-    result in attributes with invalid names get dropped.
-
-    """
-    assert (
-        linkify('<a href="http://example.com/"">') ==
-        '<a href="http://example.com/" rel="nofollow"></a>'
-    )
-
-
 @pytest.mark.parametrize('data,parse_email,expected', [
     (
         'a james@example.com mailto',

From 245c21c3cef788dbfdb380514434497866443e87 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene <willkg@mozilla.com>
Date: Tue, 8 Jan 2019 12:11:15 -0500
Subject: [PATCH 12/18] Fix doctest failures

This reworks the doctests to run and pass in Python 3.

Fixes #357
---
 docs/clean.rst   | 46 +++++++++++++++++-----------------
 docs/linkify.rst | 64 ++++++++++++++++++++++++------------------------
 2 files changed, 55 insertions(+), 55 deletions(-)

diff --git a/docs/clean.rst b/docs/clean.rst
index 68178ce5..c786f2a3 100644
--- a/docs/clean.rst
+++ b/docs/clean.rst
@@ -63,10 +63,10 @@ For example:
    >>> import bleach
 
    >>> bleach.clean(
-   ...     u'<b><i>an example</i></b>',
+   ...     '<b><i>an example</i></b>',
    ...     tags=['b'],
    ... )
-   u'<b>&lt;i&gt;an example&lt;/i&gt;</b>'
+   '<b>&lt;i&gt;an example&lt;/i&gt;</b>'
 
 
 The default value is a relatively conservative list found in
@@ -106,12 +106,12 @@ For example:
    >>> import bleach
 
    >>> bleach.clean(
-   ...     u'<p class="foo" style="color: red; font-weight: bold;">blah blah blah</p>',
+   ...     '<p class="foo" style="color: red; font-weight: bold;">blah blah blah</p>',
    ...     tags=['p'],
    ...     attributes=['style'],
    ...     styles=['color'],
    ... )
-   u'<p style="color: red;">blah blah blah</p>'
+   '<p style="color: red;">blah blah blah</p>'
 
 
 As a dict
@@ -135,11 +135,11 @@ and "class" for any tag (including "a" and "img"):
    ... }
 
    >>> bleach.clean(
-   ...    u'<img alt="an example" width=500>',
+   ...    '<img alt="an example" width=500>',
    ...    tags=['img'],
    ...    attributes=attrs
    ... )
-   u'<img alt="an example">'
+   '<img alt="an example">'
 
 
 Using functions
@@ -161,11 +161,11 @@ For example:
    ...     return name[0] == 'h'
 
    >>> bleach.clean(
-   ...    u'<a href="http://example.com" title="link">link</a>',
+   ...    '<a href="http://example.com" title="link">link</a>',
    ...    tags=['a'],
    ...    attributes=allow_h,
    ... )
-   u'<a href="http://example.com">link</a>'
+   '<a href="http://example.com">link</a>'
 
 
 You can also pass a callable as a value in an attributes dict and it'll run for
@@ -173,7 +173,7 @@ attributes for specified tags:
 
 .. doctest::
 
-   >>> from urlparse import urlparse
+   >>> from six.moves.urllib.parse import urlparse
    >>> import bleach
 
    >>> def allow_src(tag, name, value):
@@ -185,13 +185,13 @@ attributes for specified tags:
    ...     return False
 
    >>> bleach.clean(
-   ...    u'<img src="http://example.com" alt="an example">',
+   ...    '<img src="http://example.com" alt="an example">',
    ...    tags=['img'],
    ...    attributes={
    ...        'img': allow_src
    ...    }
    ... )
-   u'<img alt="an example">'
+   '<img alt="an example">'
 
 
 .. versionchanged:: 2.0
@@ -223,12 +223,12 @@ For example, to allow users to set the color and font-weight of text:
    >>> styles = ['color', 'font-weight']
 
    >>> bleach.clean(
-   ...     u'<p style="font-weight: heavy;">my html</p>',
+   ...     '<p style="font-weight: heavy;">my html</p>',
    ...     tags=tags,
    ...     attributes=attrs,
    ...     styles=styles
    ... )
-   u'<p style="font-weight: heavy;">my html</p>'
+   '<p style="font-weight: heavy;">my html</p>'
 
 
 Default styles are stored in ``bleach.sanitizer.ALLOWED_STYLES``.
@@ -252,7 +252,7 @@ For example, this sets allowed protocols to http, https and smb:
    ...     '<a href="smb://more_text">allowed protocol</a>',
    ...     protocols=['http', 'https', 'smb']
    ... )
-   u'<a href="smb://more_text">allowed protocol</a>'
+   '<a href="smb://more_text">allowed protocol</a>'
 
 
 This adds smb to the Bleach-specified set of allowed protocols:
@@ -265,7 +265,7 @@ This adds smb to the Bleach-specified set of allowed protocols:
    ...     '<a href="smb://more_text">allowed protocol</a>',
    ...     protocols=bleach.ALLOWED_PROTOCOLS + ['smb']
    ... )
-   u'<a href="smb://more_text">allowed protocol</a>'
+   '<a href="smb://more_text">allowed protocol</a>'
 
 
 Default protocols are in ``bleach.sanitizer.ALLOWED_PROTOCOLS``.
@@ -284,10 +284,10 @@ and invalid markup. For example:
    >>> import bleach
 
    >>> bleach.clean('<span>is not allowed</span>')
-   u'&lt;span&gt;is not allowed&lt;/span&gt;'
+   '&lt;span&gt;is not allowed&lt;/span&gt;'
 
    >>> bleach.clean('<b><span>is not allowed</span></b>', tags=['b'])
-   u'<b>&lt;span&gt;is not allowed&lt;/span&gt;</b>'
+   '<b>&lt;span&gt;is not allowed&lt;/span&gt;</b>'
 
 
 If you would rather Bleach stripped this markup entirely, you can pass
@@ -298,10 +298,10 @@ If you would rather Bleach stripped this markup entirely, you can pass
    >>> import bleach
 
    >>> bleach.clean('<span>is not allowed</span>', strip=True)
-   u'is not allowed'
+   'is not allowed'
 
    >>> bleach.clean('<b><span>is not allowed</span></b>', tags=['b'], strip=True)
-   u'<b>is not allowed</b>'
+   '<b>is not allowed</b>'
 
 
 Stripping comments (``strip_comments``)
@@ -317,10 +317,10 @@ By default, Bleach will strip out HTML comments. To disable this behavior, set
    >>> html = 'my<!-- commented --> html'
 
    >>> bleach.clean(html)
-   u'my html'
+   'my html'
 
    >>> bleach.clean(html, strip_comments=False)
-   u'my<!-- commented --> html'
+   'my<!-- commented --> html'
 
 
 Using ``bleach.sanitizer.Cleaner``
@@ -353,7 +353,7 @@ Trivial Filter example:
 .. doctest::
 
    >>> from bleach.sanitizer import Cleaner
-   >>> from html5lib.filters.base import Filter
+   >>> from bleach.html5lib_shim import Filter
 
    >>> class MooFilter(Filter):
    ...     def __iter__(self):
@@ -371,7 +371,7 @@ Trivial Filter example:
    >>> cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter])
    >>> dirty = 'this is cute! <img src="http://example.com/puppy.jpg" rel="nofollow">'
    >>> cleaner.clean(dirty)
-   u'this is cute! <img rel="moo" src="moo">'
+   'this is cute! <img rel="moo" src="moo">'
 
 
 .. Warning::
diff --git a/docs/linkify.rst b/docs/linkify.rst
index b8e7884e..b5d9d20f 100644
--- a/docs/linkify.rst
+++ b/docs/linkify.rst
@@ -80,12 +80,12 @@ For example, you could add a ``title`` attribute to all links:
    >>> from bleach.linkifier import Linker
 
    >>> def set_title(attrs, new=False):
-   ...     attrs[(None, u'title')] = u'link in user text'
+   ...     attrs[(None, 'title')] = 'link in user text'
    ...     return attrs
    ...
    >>> linker = Linker(callbacks=[set_title])
    >>> linker.linkify('abc http://example.com def')
-   u'abc <a href="http://example.com" title="link in user text">http://example.com</a> def'
+   'abc <a href="http://example.com" title="link in user text">http://example.com</a> def'
 
 
 This would set the value of the ``rel`` attribute, stomping on a previous value
@@ -96,21 +96,21 @@ an external link:
 
 .. doctest::
 
-   >>> from urlparse import urlparse
+   >>> from six.moves.urllib.parse import urlparse
    >>> from bleach.linkifier import Linker
 
    >>> def set_target(attrs, new=False):
-   ...     p = urlparse(attrs[(None, u'href')])
+   ...     p = urlparse(attrs[(None, 'href')])
    ...     if p.netloc not in ['my-domain.com', 'other-domain.com']:
-   ...         attrs[(None, u'target')] = u'_blank'
-   ...         attrs[(None, u'class')] = u'external'
+   ...         attrs[(None, 'target')] = '_blank'
+   ...         attrs[(None, 'class')] = 'external'
    ...     else:
-   ...         attrs.pop((None, u'target'), None)
+   ...         attrs.pop((None, 'target'), None)
    ...     return attrs
    ...
    >>> linker = Linker(callbacks=[set_target])
    >>> linker.linkify('abc http://example.com def')
-   u'abc <a class="external" href="http://example.com" target="_blank">http://example.com</a> def'
+   'abc <a class="external" href="http://example.com" target="_blank">http://example.com</a> def'
 
 
 Removing Attributes
@@ -127,17 +127,17 @@ sanitizing attributes.)
    >>> def allowed_attrs(attrs, new=False):
    ...     """Only allow href, target, rel and title."""
    ...     allowed = [
-   ...         (None, u'href'),
-   ...         (None, u'target'),
-   ...         (None, u'rel'),
-   ...         (None, u'title'),
-   ...         u'_text',
+   ...         (None, 'href'),
+   ...         (None, 'target'),
+   ...         (None, 'rel'),
+   ...         (None, 'title'),
+   ...         '_text',
    ...     ]
    ...     return dict((k, v) for k, v in attrs.items() if k in allowed)
    ...
    >>> linker = Linker(callbacks=[allowed_attrs])
    >>> linker.linkify('<a style="font-weight: super bold;" href="http://example.com">link</a>')
-   u'<a href="http://example.com">link</a>'
+   '<a href="http://example.com">link</a>'
 
 
 Or you could remove a specific attribute, if it exists:
@@ -147,15 +147,15 @@ Or you could remove a specific attribute, if it exists:
    >>> from bleach.linkifier import Linker
 
    >>> def remove_title(attrs, new=False):
-   ...     attrs.pop((None, u'title'), None)
+   ...     attrs.pop((None, 'title'), None)
    ...     return attrs
    ...
    >>> linker = Linker(callbacks=[remove_title])
    >>> linker.linkify('<a href="http://example.com">link</a>')
-   u'<a href="http://example.com">link</a>'
+   '<a href="http://example.com">link</a>'
 
    >>> linker.linkify('<a title="bad title" href="http://example.com">link</a>')
-   u'<a href="http://example.com">link</a>'
+   '<a href="http://example.com">link</a>'
 
 
 Altering Attributes
@@ -177,14 +177,14 @@ Example of shortening link text:
    ...     if not new:
    ...         return attrs
    ...     # _text will be the same as the URL for new links
-   ...     text = attrs[u'_text']
+   ...     text = attrs['_text']
    ...     if len(text) > 25:
-   ...         attrs[u'_text'] = text[0:22] + u'...'
+   ...         attrs['_text'] = text[0:22] + '...'
    ...     return attrs
    ...
    >>> linker = Linker(callbacks=[shorten_url])
    >>> linker.linkify('http://example.com/longlonglonglonglongurl')
-   u'<a href="http://example.com/longlonglonglonglongurl">http://example.com/lon...</a>'
+   '<a href="http://example.com/longlonglonglonglongurl">http://example.com/lon...</a>'
 
 
 Example of switching all links to go through a bouncer first:
@@ -196,7 +196,7 @@ Example of switching all links to go through a bouncer first:
 
    >>> def outgoing_bouncer(attrs, new=False):
    ...     """Send outgoing links through a bouncer."""
-   ...     href_key = (None, u'href')
+   ...     href_key = (None, 'href')
    ...     p = urlparse(attrs.get(href_key, None))
    ...     if p.netloc not in ['example.com', 'www.example.com', '']:
    ...         bouncer = 'http://bn.ce/?destination=%s'
@@ -205,10 +205,10 @@ Example of switching all links to go through a bouncer first:
    ...
    >>> linker = Linker(callbacks=[outgoing_bouncer])
    >>> linker.linkify('http://example.com')
-   u'<a href="http://example.com">http://example.com</a>'
+   '<a href="http://example.com">http://example.com</a>'
 
    >>> linker.linkify('http://foo.com')
-   u'<a href="http://bn.ce/?destination=http%3A//foo.com">http://foo.com</a>'
+   '<a href="http://bn.ce/?destination=http%3A//foo.com">http://foo.com</a>'
 
 
 Preventing Links
@@ -230,7 +230,7 @@ write the following callback:
    ...         return attrs
    ...     # If the TLD is '.py', make sure it starts with http: or https:.
    ...     # Use _text because that's the original text
-   ...     link_text = attrs[u'_text']
+   ...     link_text = attrs['_text']
    ...     if link_text.endswith('.py') and not link_text.startswith(('http:', 'https:')):
    ...         # This looks like a Python file, not a URL. Don't make a link.
    ...         return None
@@ -239,10 +239,10 @@ write the following callback:
    ...
    >>> linker = Linker(callbacks=[dont_linkify_python])
    >>> linker.linkify('abc http://example.com def')
-   u'abc <a href="http://example.com">http://example.com</a> def'
+   'abc <a href="http://example.com">http://example.com</a> def'
 
    >>> linker.linkify('abc models.py def')
-   u'abc models.py def'
+   'abc models.py def'
 
 
 .. _Crate: https://crate.io/
@@ -261,13 +261,13 @@ For example, this removes any ``mailto:`` links:
    >>> from bleach.linkifier import Linker
 
    >>> def remove_mailto(attrs, new=False):
-   ...     if attrs[(None, u'href')].startswith(u'mailto:'):
+   ...     if attrs[(None, 'href')].startswith('mailto:'):
    ...         return None
    ...     return attrs
    ...
    >>> linker = Linker(callbacks=[remove_mailto])
    >>> linker.linkify('<a href="mailto:janet@example.com">mail janet!</a>')
-   u'mail janet!'
+   'mail janet!'
 
 
 Skipping links in specified tag blocks (``skip_tags``)
@@ -308,7 +308,7 @@ instance.
 
    >>> linker = Linker(skip_tags=['pre'])
    >>> linker.linkify('a b c http://example.com d e f')
-   u'a b c <a href="http://example.com" rel="nofollow">http://example.com</a> d e f'
+   'a b c <a href="http://example.com" rel="nofollow">http://example.com</a> d e f'
 
 
 .. autoclass:: bleach.linkifier.Linker
@@ -340,11 +340,11 @@ For example, using all the defaults:
 
    >>> cleaner = Cleaner(tags=['pre'])
    >>> cleaner.clean('<pre>http://example.com</pre>')
-   u'<pre>http://example.com</pre>'
+   '<pre>http://example.com</pre>'
 
    >>> cleaner = Cleaner(tags=['pre'], filters=[LinkifyFilter])
    >>> cleaner.clean('<pre>http://example.com</pre>')
-   u'<pre><a href="http://example.com">http://example.com</a></pre>'
+   '<pre><a href="http://example.com">http://example.com</a></pre>'
 
 
 And passing parameters to ``LinkifyFilter``:
@@ -362,7 +362,7 @@ And passing parameters to ``LinkifyFilter``:
    ... )
    ...
    >>> cleaner.clean('<pre>http://example.com</pre>')
-   u'<pre>http://example.com</pre>'
+   '<pre>http://example.com</pre>'
 
 
 .. autoclass:: bleach.linkifier.LinkifyFilter

From ad910ce30926f8698cf7c8f4ec8b32d00d0897b2 Mon Sep 17 00:00:00 2001
From: Will Kahn-Greene <willkg@mozilla.com>
Date: Tue, 8 Jan 2019 12:05:35 -0500
Subject: [PATCH 13/18] Update for 3.1.0 release

---
 CHANGES            |  6 ++++--
 CONTRIBUTORS       | 12 ++++++++++--
 bleach/__init__.py |  4 ++--
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/CHANGES b/CHANGES
index 4fe065e8..838393b3 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,8 +1,8 @@
 Bleach changes
 ==============
 
-Version 3.1.0 (In development)
-------------------------------
+Version 3.1.0 (January 9th, 2019)
+---------------------------------
 
 **Security fixes**
 
@@ -31,6 +31,8 @@ None
 * Fix ``InputStreamWithMemory`` when the ``BleachHTMLParser`` is
   parsing ``meta`` tags. (#431)
 
+* Fix doctests. (#357)
+
 
 Version 3.0.2 (October 11th, 2018)
 ----------------------------------
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index 5783ab17..2b0137d0 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -18,21 +18,25 @@ Contributors:
 - Adam Lofts
 - Adrian "ThiefMaster"
 - Alek
-- Alexandre Macabies
-- Alexandr N. Zamaraev
 - Alex Defsen
 - Alex Ehlke
+- Alexandre Macabies
+- Alexandr N. Zamaraev
 - Alireza Savand
 - Andreas Malecki
 - Andy Freeland
 - Antoine Leclair
+- Anton Backer
 - Anton Kovalyov
+- Chad Birch
 - Chris Beaven
 - Dan Gayle
+- dave-shawley
 - Erik Rose
 - Gaurav Dadhania
 - Geoffrey Sneddon
 - Greg Guthe
+- hugovk
 - Istvan Albert
 - Jaime Irurzun
 - James Socol
@@ -49,6 +53,7 @@ Contributors:
 - Mark Lee
 - Mark Paschal
 - mdxs
+- Nikita Sobolev
 - nikolas
 - Oh Jinkyun
 - Paul Craciunoiu
@@ -56,8 +61,11 @@ Contributors:
 - Ryan Niemeyer
 - Sébastien Fievet
 - sedrubal
+- Stephane Blondon
+- Stu Cox
 - Tim Dumol
 - Timothy Fitz
+- Vadim Kotov
 - Vitaly Volkov
 - Will Kahn-Greene
 - Zoltán
diff --git a/bleach/__init__.py b/bleach/__init__.py
index 6249bf81..9816549b 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -18,9 +18,9 @@
 
 
 # yyyymmdd
-__releasedate__ = ''
+__releasedate__ = '20190109'
 # x.y.z or x.y.z.dev0 -- semver
-__version__ = '3.1.0.dev0'
+__version__ = '3.1.0'
 VERSION = parse_version(__version__)
 
 

From 996cde7a2439a2323f9c4b2567c8b8449d393351 Mon Sep 17 00:00:00 2001
From: Greg Guthe <gguthe@mozilla.com>
Date: Thu, 13 Feb 2020 16:09:52 -0500
Subject: [PATCH 14/18] fix bug 1615315

---
 bleach/html5lib_shim.py |  7 ++++++-
 tests/test_clean.py     | 28 ++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py
index 25e3e955..169c4027 100644
--- a/bleach/html5lib_shim.py
+++ b/bleach/html5lib_shim.py
@@ -376,7 +376,12 @@ def __init__(self, tags, strip, consume_entities, **kwargs):
         self.consume_entities = consume_entities
         super(BleachHTMLParser, self).__init__(**kwargs)
 
-    def _parse(self, stream, innerHTML=False, container='div', scripting=False, **kwargs):
+    def _parse(self, stream, innerHTML=False, container='div', scripting=True, **kwargs):
+        # set scripting=True to parse <noscript> as though JS is enabled to
+        # match the expected context in browsers
+        #
+        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
+        #
         # Override HTMLParser so we can swap out the tokenizer for our own.
         self.innerHTMLMode = innerHTML
         self.container = container
diff --git a/tests/test_clean.py b/tests/test_clean.py
index 53227677..f3c00001 100644
--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -769,6 +769,34 @@ def test_nonexistent_namespace():
     assert clean('<d {c}>') == '&lt;d {c}&gt;'
 
 
+# tags that get content passed through (i.e. parsed with parseRCDataRawtext)
+_raw_tags = [
+    "title",
+    "textarea",
+    "script",
+    "style",
+    "noembed",
+    "noframes",
+    "iframe",
+    "xmp",
+]
+
+@pytest.mark.parametrize(
+    "raw_tag, data, expected",
+    [
+        (
+            raw_tag,
+            "<noscript><%s></noscript><img src=x onerror=alert(1) />" % raw_tag,
+            "<noscript><%s></noscript>&lt;img src=x onerror=alert(1) /&gt;" % raw_tag,
+        )
+        for raw_tag in _raw_tags
+    ],
+)
+def test_noscript_rawtag_(raw_tag, data, expected):
+    # refs: bug 1615315 / GHSA-q65m-pv3f-wr5r
+    assert clean(data, tags=["noscript", raw_tag]) == expected
+
+
 def get_ids_and_tests():
     """Retrieves regression tests from data/ directory
 

From 0d88dd83e425c4ba381d5b83fe61bfae5bbbd627 Mon Sep 17 00:00:00 2001
From: Greg Guthe <gguthe@mozilla.com>
Date: Thu, 13 Feb 2020 12:49:57 -0500
Subject: [PATCH 15/18] Update for v3.1.1 release

---
 CHANGES            | 37 ++++++++++++++++++++++++++++++++++++-
 bleach/__init__.py |  4 ++--
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/CHANGES b/CHANGES
index 838393b3..6cf295e1 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,6 +1,41 @@
 Bleach changes
 ==============
 
+Version 3.1.1 (February 13th, 2020)
+-----------------------------------
+
+**Security fixes**
+
+* ``bleach.clean`` behavior parsing ``noscript`` tags did not match
+  browser behavior.
+
+  Calls to ``bleach.clean`` allowing ``noscript`` and one or more of
+  the raw text tags (``title``, ``textarea``, ``script``, ``style``,
+  ``noembed``, ``noframes``, ``iframe``, and ``xmp``) were vulnerable
+  to a mutation XSS.
+
+  This security issue was confirmed in Bleach versions v2.1.4, v3.0.2,
+  and v3.1.0. Earlier versions are probably affected too.
+
+  Anyone using Bleach <=v3.1.0 is highly encouraged to upgrade.
+
+  https://bugzilla.mozilla.org/show_bug.cgi?id=1615315
+
+**Backwards incompatible changes**
+
+None
+
+**Features**
+
+None
+
+**Bug fixes**
+
+None
+
+
+
+
 Version 3.1.0 (January 9th, 2019)
 ---------------------------------
 
@@ -76,7 +111,7 @@ None
 
 * Fix ``list`` object has no attribute ``lower`` in ``clean``. (#398)
 * Fix ``abbr`` getting escaped in ``linkify``. (#400)
- 
+
 
 Version 3.0.0 (October 3rd, 2018)
 ---------------------------------
diff --git a/bleach/__init__.py b/bleach/__init__.py
index 9816549b..30f8fb84 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -18,9 +18,9 @@
 
 
 # yyyymmdd
-__releasedate__ = '20190109'
+__releasedate__ = '20200213'
 # x.y.z or x.y.z.dev0 -- semver
-__version__ = '3.1.0'
+__version__ = '3.1.1'
 VERSION = parse_version(__version__)
 
 

From e4e9e21e7aebff40c88fafa4319bba4636a602d9 Mon Sep 17 00:00:00 2001
From: Greg Guthe <gguthe@mozilla.com>
Date: Wed, 11 Mar 2020 15:52:36 -0400
Subject: [PATCH 16/18] fix bug 1621692

---
 bleach/html5lib_shim.py | 13 ++++++++++++-
 tests/test_clean.py     | 27 +++++++++++++++++++++++++--
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py
index 169c4027..dcd98a1a 100644
--- a/bleach/html5lib_shim.py
+++ b/bleach/html5lib_shim.py
@@ -533,7 +533,18 @@ def next_possible_entity(text):
 
 
 class BleachHTMLSerializer(HTMLSerializer):
-    """HTMLSerializer that undoes & -> &amp; in attributes"""
+    """HTMLSerializer that undoes & -> &amp; in attributes and sets
+    escape_rcdata to True
+    """
+
+    # per the HTMLSerializer.__init__ docstring:
+    #
+    # Whether to escape characters that need to be
+    # escaped within normal elements within rcdata elements such as
+    # style.
+    #
+    escape_rcdata = True
+
     def escape_base_amp(self, stoken):
         """Escapes just bare & in HTML attribute values"""
         # First, undo escaping of &. We need to do this because html5lib's
diff --git a/tests/test_clean.py b/tests/test_clean.py
index f3c00001..cd5360b8 100644
--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -5,7 +5,7 @@
 from bleach import clean
 from bleach.html5lib_shim import Filter
 from bleach.sanitizer import Cleaner
-
+from bleach._vendor.html5lib.constants import rcdataElements
 
 def test_clean_idempotent():
     """Make sure that applying the filter twice doesn't change anything."""
@@ -787,7 +787,7 @@ def test_nonexistent_namespace():
         (
             raw_tag,
             "<noscript><%s></noscript><img src=x onerror=alert(1) />" % raw_tag,
-            "<noscript><%s></noscript>&lt;img src=x onerror=alert(1) /&gt;" % raw_tag,
+            "<noscript>&lt;%s&gt;</noscript>&lt;img src=x onerror=alert(1) /&gt;" % raw_tag,
         )
         for raw_tag in _raw_tags
     ],
@@ -797,6 +797,29 @@ def test_noscript_rawtag_(raw_tag, data, expected):
     assert clean(data, tags=["noscript", raw_tag]) == expected
 
 
+@pytest.mark.parametrize(
+    "namespace_tag, rc_data_element_tag, data, expected",
+    [
+        (
+            namespace_tag,
+            rc_data_element_tag,
+            "<%s><%s><img src=x onerror=alert(1)>" % (namespace_tag, rc_data_element_tag),
+            "<%s><%s>&lt;img src=x onerror=alert(1)&gt;</%s></%s>" % (namespace_tag, rc_data_element_tag, rc_data_element_tag, namespace_tag),
+        )
+        for namespace_tag in ["math", "svg"]
+        # https://dev.w3.org/html5/html-author/#rcdata-elements
+        # https://html.spec.whatwg.org/index.html#parsing-html-fragments
+        # in html5lib: 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', and 'noscript'
+        for rc_data_element_tag in rcdataElements
+    ],
+)
+def test_namespace_rc_data_element_strip_false(namespace_tag, rc_data_element_tag, data, expected):
+    # refs: bug 1621692 / GHSA-m6xf-fq7q-8743
+    #
+    # browsers will pull the img out of the namespace and rc data tag resulting in XSS
+    assert clean(data, tags=[namespace_tag, rc_data_element_tag], strip=False) == expected
+
+
 def get_ids_and_tests():
     """Retrieves regression tests from data/ directory
 

From 7b625ff9f6888a08037700269fb23e3ef863b8a7 Mon Sep 17 00:00:00 2001
From: Greg Guthe <gguthe@mozilla.com>
Date: Wed, 11 Mar 2020 15:56:43 -0400
Subject: [PATCH 17/18] add wheel to requirements-dev

---
 requirements-dev.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/requirements-dev.txt b/requirements-dev.txt
index 758459aa..2e999d52 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -11,3 +11,6 @@ Sphinx
 
 # Requirements for updating package
 twine
+
+# Requirements for running setup.py bdist_wheel
+wheel

From 78a06726dd6c72a42c90c1f7a8fa5d21ebcfa587 Mon Sep 17 00:00:00 2001
From: Greg Guthe <gguthe@mozilla.com>
Date: Wed, 11 Mar 2020 16:17:57 -0400
Subject: [PATCH 18/18] Update for v3.1.2 release

---
 CHANGES            | 34 ++++++++++++++++++++++++++++++++++
 bleach/__init__.py |  4 ++--
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/CHANGES b/CHANGES
index 6cf295e1..2005da7c 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,6 +1,40 @@
 Bleach changes
 ==============
 
+Version 3.1.2 (March 11th, 2020)
+--------------------------------
+
+**Security fixes**
+
+* ``bleach.clean`` behavior parsing embedded MathML and SVG content
+  with RCDATA tags did not match browser behavior and could result in
+  a mutation XSS.
+
+  Calls to ``bleach.clean`` with ``strip=False`` and ``math`` or
+  ``svg`` tags and one or more of the RCDATA tags ``script``,
+  ``noscript``, ``style``, ``noframes``, ``iframe``, ``noembed``, or
+  ``xmp`` in the allowed tags whitelist were vulnerable to a mutation
+  XSS.
+
+  This security issue was confirmed in Bleach version v3.1.1. Earlier
+  versions are likely affected too.
+
+  Anyone using Bleach <=v3.1.1 is encouraged to upgrade.
+
+  https://bugzilla.mozilla.org/show_bug.cgi?id=1621692
+
+**Backwards incompatible changes**
+
+None
+
+**Features**
+
+None
+
+**Bug fixes**
+
+None
+
 Version 3.1.1 (February 13th, 2020)
 -----------------------------------
 
diff --git a/bleach/__init__.py b/bleach/__init__.py
index 30f8fb84..bb5a5b6d 100644
--- a/bleach/__init__.py
+++ b/bleach/__init__.py
@@ -18,9 +18,9 @@
 
 
 # yyyymmdd
-__releasedate__ = '20200213'
+__releasedate__ = '20200311'
 # x.y.z or x.y.z.dev0 -- semver
-__version__ = '3.1.1'
+__version__ = '3.1.2'
 VERSION = parse_version(__version__)