Python-Markdown · waylan · Sep 22, 2020 · Mar 7, 2019 · Mar 7, 2019 · Mar 8, 2019
diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py
@@ -16,11 +16,100 @@
 
 from . import Extension
 from ..blockprocessors import BlockProcessor
+from ..preprocessors import Preprocessor
 from .. import util
+from ..htmlparser import HTMLExtractor
+from html import parser
 import re
 import xml.etree.ElementTree as etree
 
 
+class HTMLExtractorExtra(HTMLExtractor):
+
+    def reset(self):
+        """Reset this instance.  Loses all unprocessed data."""
+        self.mdstack = []  # When markdown=1, stack contains a list of tags
+        super().reset()
+
+    def handle_starttag(self, tag, attrs):
+        attrs = dict(attrs)
+        self.stack.append(tag)
+
+        if self.at_line_start() and self.md.is_block_level(tag) and not self.inraw:
+            if not attrs.get('markdown', None) == '1':
+                # Started a new raw block
+                self.inraw = True
+                self.container_index = len(self.stack) - 1
+            if len(self.cleandoc):
+                # Insert blank line between this and previous line.
+                self.cleandoc.append('\n')
+
+        if not self.inraw and 'markdown' in attrs:
+            self.mdstack.append(tag)
+            # Remove markdown attribute and rebuild start tag.
+            attrs.pop('markdown')
+            attrs_str = ' ' + ' '.join('{}="{}"'.format(k, v) for k, v in attrs.items()) if attrs else ''
+            text = '<{}{}>'.format(tag, attrs_str)
+            self.cleandoc.append(self.md.htmlStash.store(text))
+            if tag != 'p':
+                self.cleandoc.append('\n\n')
+        else:
+            text = self.get_starttag_text()
+            if self.inraw:
+                self._cache.append(text)
+            else:
+                self.cleandoc.append(text)
+
+    def handle_endtag(self, tag):
+        # Attempt to extract actual tag from raw source text
+        start = self.line_offset + self.offset
+        m = parser.endendtag.search(self.rawdata, start)
+        if m:
+            text = self.rawdata[start:m.end()]
+        else:
+            # Failed to extract from raw data. Assume well formed and lowercase.
+            text = '</{}>'.format(tag)
+
+        if tag in self.stack:
+            while self.stack:
+                if self.stack.pop() == tag:
+                    break
+        if self.inraw and len(self.stack) <= self.container_index:
+            # End of raw block
+            self.inraw = False
+            self.stack = [] # Reset stack as it could have extraneous items in it.
+            self.container_index = -1
+            self._cache.append(text)
+            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
+            # Insert blank line between this and next line. TODO: make this conditional?
+            self.cleandoc.append('\n\n')
+            self._cache = []
+        elif self.inraw:
+            self._cache.append(text)
+        elif tag in self.mdstack:
+            # Handle closing tag of markdown=1 element
+            while self.mdstack:
+                if self.mdstack.pop() == tag:
+                    break
+            if tag != 'p':
+                self.cleandoc.append('\n\n')
+            self.cleandoc.append(self.md.htmlStash.store(text))
+            self.cleandoc.append('\n\n')
+        else:
+            self.cleandoc.append(text)
+
+
+class HtmlBlockPreprocessor(Preprocessor):
+    """Remove html blocks from the text and store them for later retrieval."""
+
+    def run(self, lines):
+        source = '\n'.join(lines)
+        parser = HTMLExtractorExtra(self.md)
+        parser.feed(source)
+        parser.close()
+        return ''.join(parser.cleandoc).split('\n')
+
+
 class MarkdownInHtmlProcessor(BlockProcessor):
     """Process Markdown Inside HTML Blocks."""
     def test(self, parent, block):
@@ -86,14 +175,14 @@ class MarkdownInHtmlExtension(Extension):
     def extendMarkdown(self, md):
         """ Register extension instances. """
 
-        # Turn on processing of markdown text within raw html
-        md.preprocessors['html_block'].markdown_in_raw = True
-        md.parser.blockprocessors.register(
-            MarkdownInHtmlProcessor(md.parser), 'markdown_block', 105
-        )
-        md.parser.blockprocessors.tag_counter = -1
-        md.parser.blockprocessors.contain_span_tags = re.compile(
-            r'^(p|h[1-6]|li|dd|dt|td|th|legend|address)$', re.IGNORECASE)
+        # Replace raw HTML preprocessor
+        md.preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20)
+        # md.parser.blockprocessors.register(
+        #     MarkdownInHtmlProcessor(md.parser), 'markdown_block', 105
+        # )
+        # md.parser.blockprocessors.tag_counter = -1
+        # md.parser.blockprocessors.contain_span_tags = re.compile(
+        #     r'^(p|h[1-6]|li|dd|dt|td|th|legend|address)$', re.IGNORECASE)
 
 
 def makeExtension(**kwargs):  # pragma: no cover

diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py
@@ -0,0 +1,175 @@
+"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2020 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+"""
+
+from html import parser
+import re
+
+# Monkeypatch HTMLParser to only accept `?>` to close Processing Instructions.
+parser.piclose = re.compile(r'\?>')
+# Monkeypatch HTMLParser to only recognize entity references with a closing semicolon.
+parser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
+# Monkeypatch HTMLParser to no longer support partial entities. We always feed it a complete block,
+# so the `incomplete` functionality is unnecessary. Because the `entityref` regex runs right before
+# `incomplete` and the two regexes are now identical, `incomplete` never matches and its logic is skipped.
+parser.incomplete = parser.entityref
+
+
+class HTMLExtractor(parser.HTMLParser):
+    """
+    Extract raw HTML from text.
+
+    The raw HTML is stored in the `htmlStash` of the Markdown instance passed
+    to `md` and the remaining text is stored in `cleandoc` as a list of strings.
+    """
+
+    def __init__(self, md, *args, **kwargs):
+        if 'convert_charrefs' not in kwargs:
+            kwargs['convert_charrefs'] = False
+        # This calls self.reset
+        super().__init__(*args, **kwargs)
+        self.md = md
+
+    def reset(self):
+        """Reset this instance.  Loses all unprocessed data."""
+        self.inraw = False
+        self.stack = []  # When inraw==True, stack contains a list of tags
+        self._cache = []
+        self.cleandoc = []
+        super().reset()
+
+    def close(self):
+        """Handle any buffered data."""
+        super().close()
+        # Handle any unclosed tags.
+        if len(self._cache):
+            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
+            self._cache = []
+
+    @property
+    def line_offset(self):
+        """Returns char index in self.rawdata for the start of the current line. """
+        if self.lineno > 1:
+            return re.match(r'([^\n]*\n){{{}}}'.format(self.lineno-1), self.rawdata).end()
+        return 0
+
+    def at_line_start(self):
+        """
+        Returns True if current position is at start of line.
+
+        Allows for up to three blank spaces at start of line.
+        """
+        if self.offset == 0:
+            return True
+        if self.offset > 3:
+            return False
+        # Confirm up to first 3 chars are whitespace
+        return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == ''
+
+    def get_endtag_text(self, tag):
+        """
+        Returns the text of the end tag.
+
+        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
+        """
+        # Attempt to extract actual tag from raw source text
+        start = self.line_offset + self.offset
+        m = parser.endendtag.search(self.rawdata, start)
+        if m:
+            return self.rawdata[start:m.end()]
+        else:
+            # Failed to extract from raw data. Assume well formed and lowercase.
+            return '</{}>'.format(tag)
+
+    def handle_starttag(self, tag, attrs):
+        attrs = dict(attrs)
+
+        if self.at_line_start() and self.md.is_block_level(tag) and not self.inraw:
+            # Started a new raw block. Prepare stack.
+            self.inraw = True
+            self.cleandoc.append('\n')
+
+        text = self.get_starttag_text()
+        if self.inraw:
+            self.stack.append(tag)
+            self._cache.append(text)
+        else:
+            self.cleandoc.append(text)
+
+    def handle_endtag(self, tag):
+        text = self.get_endtag_text(tag)
+
+        if self.inraw:
+            self._cache.append(text)
+            if tag in self.stack:
+                # Remove tag from stack
+                while self.stack:
+                    if self.stack.pop() == tag:
+                        break
+            if len(self.stack) == 0:
+                # End of raw block. Reset stack.
+                self.inraw = False
+                self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
+                # Insert blank line between this and next line.
+                self.cleandoc.append('\n\n')
+                self._cache = []
+        else:
+            self.cleandoc.append(text)
+
+    def handle_data(self, data):
+        if self.inraw:
+            self._cache.append(data)
+        else:
+            self.cleandoc.append(data)
+
+    def handle_empty_tag(self, data, is_block):
+        """ Handle empty tags (`<data>`). """
+        if self.inraw:
+            # Append this to the existing raw block
+            self._cache.append(data)
+        elif self.at_line_start() and is_block:
+            # Handle this as a standalone raw block
+            self.cleandoc.append(self.md.htmlStash.store(data))
+            # Insert blank line between this and next line.
+            self.cleandoc.append('\n\n')
+        else:
+            self.cleandoc.append(data)
+
+    def handle_startendtag(self, tag, attrs):
+        self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))
+
+    def handle_charref(self, name):
+        self.handle_empty_tag('&#{};'.format(name), is_block=False)
+
+    def handle_entityref(self, name):
+        self.handle_empty_tag('&{};'.format(name), is_block=False)
+
+    def handle_comment(self, data):
+        self.handle_empty_tag('<!--{}-->'.format(data), is_block=True)
+
+    def handle_decl(self, data):
+        self.handle_empty_tag('<!{}>'.format(data), is_block=True)
+
+    def handle_pi(self, data):
+        self.handle_empty_tag('<?{}?>'.format(data), is_block=True)
+
+    def unknown_decl(self, data):
+        end = ']]>' if data.startswith('CDATA[') else ']>'
+        self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)
diff --git a/markdown/postprocessors.py b/markdown/postprocessors.py
@@ -71,9 +71,8 @@ def run(self, text):
         for i in range(self.md.htmlStash.html_counter):
             html = self.md.htmlStash.rawHtmlBlocks[i]
             if self.isblocklevel(html):
-                replacements["<p>%s</p>" %
-                             (self.md.htmlStash.get_placeholder(i))] = \
-                    html + "\n"
+                replacements["<p>{}</p>".format(
+                    self.md.htmlStash.get_placeholder(i))] = html
             replacements[self.md.htmlStash.get_placeholder(i)] = html
 
         if replacements: