Skip to content
Merged
Show file tree
Hide file tree
Changes from 35 commits
Commits
Show all changes
67 commits
Select commit Hold shift + click to select a range
585f207
Refactor HTML Parser
waylan Mar 7, 2019
77baade
fix silly error
waylan Mar 7, 2019
d4c8951
Add some new tests
waylan Mar 8, 2019
356f5c3
More tests.
waylan Mar 8, 2019
ff0f8f2
Round out tests of valid markup.
waylan Mar 11, 2019
6efe8d5
Some cleanup and bugfixes
waylan Mar 11, 2019
e5f9ca4
Some minor tweaks
waylan Mar 11, 2019
95e8498
comments partially fixed.
waylan Mar 11, 2019
ea98546
Support 0-3 spaces of indent for raw HTML blocks
waylan Mar 13, 2019
23e41d3
Remove need to wrap raw in blank lines
waylan Mar 14, 2019
46b3a1b
More tests passing
waylan Mar 14, 2019
8a17794
All handle_* methods are now defined and tested
waylan Mar 15, 2019
845637a
Some test cleanup
waylan Mar 16, 2019
eee4e49
Monkeypatch HTMLParser piclose
waylan Mar 16, 2019
b8f70b7
unknown_decl is not a handle method
waylan Mar 16, 2019
7a8a6b5
Switch back to a preprocessor
waylan Mar 16, 2019
22151c7
Start audit of legacy tests
waylan Mar 20, 2019
a0c37e1
More legacy test audits.
waylan Mar 20, 2019
0e4a545
More test audits
waylan Mar 21, 2019
49c187d
Fix amperstand handling
waylan Mar 21, 2019
3bc2960
preserve actual closing tags
waylan Mar 21, 2019
4953272
More bugs fixed
waylan Mar 22, 2019
29cc7ba
Account for code spans at start of line.
waylan Mar 22, 2019
d09d602
Code spans at start of line 2nd attempt.
waylan Mar 24, 2019
1e16fd0
Drop py2 and cleanup after rebase.
waylan Jul 1, 2020
9fe2473
First attempt at md in raw.
waylan Jul 1, 2020
e4a8796
Support markdown=1
waylan Jul 2, 2020
1d17525
Eliminate extra blank lines.
waylan Jul 7, 2020
6b4b351
Add more tests
waylan Jul 7, 2020
c0194f3
Track index of containing tag in stack.
waylan Jul 7, 2020
23375a5
Minor tweaks.
waylan Jul 7, 2020
9ffead5
break md_in_html out into subclass of HTML parser.
waylan Jul 8, 2020
e3ff368
Only put raw tags in stack.
waylan Jul 8, 2020
c96efad
Refactor and simplify logic.
waylan Jul 9, 2020
37ff86a
Disable 'incomplete' entity handling of HTMLParser.
waylan Jul 12, 2020
f02b427
Fixed whitespace issues.
waylan Jul 13, 2020
efa36c8
Import copy of html.parser so our monkeypatches don't break user's code.
waylan Jul 13, 2020
a8145f8
Handle raw blocks in tail of previous block.
waylan Jul 14, 2020
70d2624
Account for extra whitespace on blank lines.
waylan Jul 14, 2020
335816e
Handle inline raw html in tail.
waylan Jul 14, 2020
5776e97
Update md_in_html with recent htmlparser changes.
waylan Jul 15, 2020
4888464
Add test_md_in_html.py
waylan Jul 22, 2020
aae6676
More tests
waylan Jul 27, 2020
183537f
Handle markdown=1 attrs.
waylan Jul 28, 2020
7783d48
Fix some bugs.
waylan Sep 1, 2020
cae2ef0
track mdstate down and back up nested elements.
waylan Sep 1, 2020
56111c4
fix nested multiline paragraphs.
waylan Sep 2, 2020
dda2755
Move link reference handling to block parser.
waylan Sep 3, 2020
370d601
Move abbr reference handling to block parser.
waylan Sep 8, 2020
81ac09d
Move footnote reference handling to block parser.
waylan Sep 8, 2020
6b068e3
Cleanup
waylan Sep 8, 2020
7a85397
Remove reference to comments and PIs in TreeBuilder as unused.
waylan Sep 8, 2020
42299a8
Remove other reference to comments and PIs in TreeBuilder.
waylan Sep 8, 2020
fbae484
Rewrite extension docs.
waylan Sep 9, 2020
097f52c
Fix normalization docs to match behavior.
waylan Sep 9, 2020
df14000
Update spelling dict with unclosed
waylan Sep 9, 2020
f61eb28
Address some coverage.
waylan Sep 11, 2020
2d8ce54
Ensure extension doesn't break default behavior.
waylan Sep 15, 2020
4856e86
update abbr tests
waylan Sep 15, 2020
07c9267
add basic link ref tests.
waylan Sep 15, 2020
82b97e5
flake8 cleanup
waylan Sep 15, 2020
1a0a893
footnote tests. 100% patch coverage
waylan Sep 15, 2020
46ac436
Add test for case in #1012.
waylan Sep 15, 2020
9cfbf20
Add release notes.
waylan Sep 15, 2020
1eb9fd3
Avoid duplicate tests.
waylan Sep 15, 2020
6f3b417
Fix a broken link
waylan Sep 15, 2020
15b431a
Final cleanup.
waylan Sep 16, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 97 additions & 8 deletions markdown/extensions/md_in_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,100 @@

from . import Extension
from ..blockprocessors import BlockProcessor
from ..preprocessors import Preprocessor
from .. import util
from ..htmlparser import HTMLExtractor
from html import parser
import re
import xml.etree.ElementTree as etree


class HTMLExtractorExtra(HTMLExtractor):
    """
    Extract raw HTML from text, with special handling for tags carrying a `markdown` attribute.

    A block-level tag at the start of a line opens a raw block (its text is cached and later
    stored in the `htmlStash`) unless it carries `markdown="1"`. Outside of a raw block, any tag
    with a `markdown` attribute has that attribute removed and only the tag itself is stashed;
    the text between its start and end tags is left in `cleandoc`.
    """

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.mdstack = []  # When markdown=1, stack contains a list of tags
        # Index in `self.stack` of the tag which opened the current raw block; -1 when not in one.
        self.container_index = -1
        super().reset()

    @staticmethod
    def _build_starttag(tag, attrs):
        """
        Return the text of a start tag for `tag` rebuilt from the mapping `attrs`.

        A value of `None` (a valueless attribute such as `<div hidden>`) is written as the bare
        attribute name. All other values are escaped, as the parser hands them over unescaped.
        """
        from html import escape

        parts = [tag]
        for key, value in attrs.items():
            if value is None:
                parts.append(key)
            else:
                parts.append('{}="{}"'.format(key, escape(value, quote=True)))
        return '<{}>'.format(' '.join(parts))

    def handle_starttag(self, tag, attrs):
        """Handle a start tag: open a raw block, open a `markdown` element, or pass the tag through."""
        attrs = dict(attrs)
        self.stack.append(tag)

        if self.at_line_start() and self.md.is_block_level(tag) and not self.inraw:
            if attrs.get('markdown') != '1':
                # Started a new raw block
                self.inraw = True
                self.container_index = len(self.stack) - 1
            # NOTE(review): the nesting of this check was reconstructed from a flattened listing.
            # Confirm whether it should apply to every block-level tag at line start (as here)
            # or only when a raw block is started.
            if self.cleandoc:
                # Insert blank line between this and previous line.
                self.cleandoc.append('\n')

        if not self.inraw and 'markdown' in attrs:
            self.mdstack.append(tag)
            # Remove markdown attribute and rebuild start tag.
            attrs.pop('markdown')
            self.cleandoc.append(self.md.htmlStash.store(self._build_starttag(tag, attrs)))
            if tag != 'p':
                self.cleandoc.append('\n\n')
        else:
            text = self.get_starttag_text()
            if self.inraw:
                self._cache.append(text)
            else:
                self.cleandoc.append(text)

    def handle_endtag(self, tag):
        """Handle an end tag: close a raw block, close a `markdown` element, or pass the tag through."""
        # Reuse the inherited helper rather than duplicating the raw source extraction here.
        text = self.get_endtag_text(tag)

        if tag in self.stack:
            # Drop everything down to and including the matching open tag.
            while self.stack:
                if self.stack.pop() == tag:
                    break

        if self.inraw and len(self.stack) <= self.container_index:
            # End of raw block
            self.inraw = False
            self.stack = []  # Reset stack as it could have extraneous items in it.
            self.container_index = -1
            self._cache.append(text)
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            # Insert blank line between this and next line. TODO: make this conditional??
            self.cleandoc.append('\n\n')
            self._cache = []
        elif self.inraw:
            self._cache.append(text)
        elif tag in self.mdstack:
            # Handle closing tag of markdown=1 element
            while self.mdstack:
                if self.mdstack.pop() == tag:
                    break
            if tag != 'p':
                self.cleandoc.append('\n\n')
            self.cleandoc.append(self.md.htmlStash.store(text))
            self.cleandoc.append('\n\n')
        else:
            self.cleandoc.append(text)


class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval."""

    def run(self, lines):
        """
        Feed the document through `HTMLExtractorExtra` and return what it leaves behind.

        `lines` is joined into a single string for parsing. The pieces the extractor collects
        in `cleandoc` are concatenated and split back into a list of lines.
        """
        extractor = HTMLExtractorExtra(self.md)
        extractor.feed('\n'.join(lines))
        extractor.close()
        cleaned = ''.join(extractor.cleandoc)
        return cleaned.split('\n')


class MarkdownInHtmlProcessor(BlockProcessor):
"""Process Markdown Inside HTML Blocks."""
def test(self, parent, block):
Expand Down Expand Up @@ -86,14 +175,14 @@ class MarkdownInHtmlExtension(Extension):
def extendMarkdown(self, md):
""" Register extension instances. """

# Turn on processing of markdown text within raw html
md.preprocessors['html_block'].markdown_in_raw = True
md.parser.blockprocessors.register(
MarkdownInHtmlProcessor(md.parser), 'markdown_block', 105
)
md.parser.blockprocessors.tag_counter = -1
md.parser.blockprocessors.contain_span_tags = re.compile(
r'^(p|h[1-6]|li|dd|dt|td|th|legend|address)$', re.IGNORECASE)
# Replace raw HTML preprocessor
md.preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20)
# md.parser.blockprocessors.register(
# MarkdownInHtmlProcessor(md.parser), 'markdown_block', 105
# )
# md.parser.blockprocessors.tag_counter = -1
# md.parser.blockprocessors.contain_span_tags = re.compile(
# r'^(p|h[1-6]|li|dd|dt|td|th|legend|address)$', re.IGNORECASE)


def makeExtension(**kwargs): # pragma: no cover
Expand Down
175 changes: 175 additions & 0 deletions markdown/htmlparser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
"""
Python Markdown

A Python implementation of John Gruber's Markdown.

Documentation: https://python-markdown.github.io/
GitHub: https://github.com/Python-Markdown/markdown/
PyPI: https://pypi.org/project/Markdown/

Started by Manfred Stienstra (http://www.dwerg.net/).
Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
Currently maintained by Waylan Limberg (https://github.com/waylan),
Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

Copyright 2007-2020 The Python Markdown Project (v. 1.7 and later)
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
Copyright 2004 Manfred Stienstra (the original version)

License: BSD (see LICENSE.md for details).
"""

from html import parser
import re

# The patches below change how `HTMLParser` tokenizes its input. Applied to the shared
# `html.parser` module they would affect every other user of the standard library parser in
# the same process, so load a private copy of the module and rebind `parser` to it first.
import importlib.util

_spec = importlib.util.find_spec('html.parser')
parser = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(parser)
del _spec

# Monkeypatch HTMLParser to only accept `?>` to close Processing Instructions.
parser.piclose = re.compile(r'\?>')
# Monkeypatch HTMLParser to only recognize entity references with a closing semicolon.
parser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
# Monkeypatch HTMLParser to no longer support partial entities. We are always feeding a complete block,
# so the 'incomplete' functionality is unnecessary. As the entityref regex is run right before incomplete,
# and the two regexes are the same, incomplete will simply never match and we avoid the logic within.
parser.incomplete = parser.entityref


class HTMLExtractor(parser.HTMLParser):
    """
    Extract raw HTML from text.

    The raw HTML is stored in the `htmlStash` of the Markdown instance passed
    to `md` and the remaining text is stored in `cleandoc` as a list of strings.
    """

    def __init__(self, md, *args, **kwargs):
        """
        Create an extractor bound to the Markdown instance `md`.

        All other arguments are passed through to `HTMLParser`. `convert_charrefs` is set to
        `False` unless the caller provides a value for it.
        """
        kwargs.setdefault('convert_charrefs', False)
        # This calls self.reset
        super().__init__(*args, **kwargs)
        self.md = md

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.inraw = False
        self.stack = []  # When inraw==True, stack contains a list of tags
        self._cache = []  # Pieces of the raw block currently being collected
        self.cleandoc = []  # Pieces of the document with raw blocks replaced by placeholders
        super().reset()

    def close(self):
        """Handle any buffered data."""
        super().close()
        # Handle any unclosed tags.
        if self._cache:
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            self._cache = []

    @property
    def line_offset(self):
        """
        Returns char index in self.rawdata for the start of the current line.

        If `self.lineno` points beyond the lines present in `self.rawdata`, the index of the
        start of the last available line is returned rather than raising an error.
        """
        position = 0
        # Step over one newline per preceding line. Plain `str.find` avoids building and
        # running a new regular expression over the whole buffer on every access.
        for _ in range(self.lineno - 1):
            newline = self.rawdata.find('\n', position)
            if newline == -1:
                # Fewer lines than `lineno` claims; stay at the start of the last line.
                break
            position = newline + 1
        return position

    def at_line_start(self):
        """
        Returns True if current position is at start of line.

        Allows for up to three blank spaces at start of line.
        """
        if self.offset == 0:
            return True
        if self.offset > 3:
            return False
        # Confirm up to first 3 chars are whitespace
        line_start = self.line_offset
        return self.rawdata[line_start:line_start + self.offset].strip() == ''

    def get_endtag_text(self, tag):
        """
        Returns the text of the end tag.

        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
        """
        # Attempt to extract actual tag from raw source text
        start = self.line_offset + self.offset
        m = parser.endendtag.search(self.rawdata, start)
        if m:
            return self.rawdata[start:m.end()]
        # Failed to extract from raw data. Assume well formed and lowercase.
        return '</{}>'.format(tag)

    def handle_starttag(self, tag, attrs):
        """
        Handle a start tag.

        A block-level tag at the start of a line opens a new raw block unless one is already
        open. While a raw block is open the tag is tracked on the stack and its source text is
        cached; otherwise the source text is passed straight through to `cleandoc`.
        """
        if self.at_line_start() and self.md.is_block_level(tag) and not self.inraw:
            # Started a new raw block. Prepare stack.
            self.inraw = True
            self.cleandoc.append('\n')

        text = self.get_starttag_text()
        if self.inraw:
            self.stack.append(tag)
            self._cache.append(text)
        else:
            self.cleandoc.append(text)

    def handle_endtag(self, tag):
        """
        Handle an end tag.

        Inside a raw block the tag is cached and the stack is unwound down to the matching open
        tag. Once the stack is empty the cached block is stored in the `htmlStash` and its
        placeholder is appended to `cleandoc`. Outside a raw block the tag is passed through.
        """
        text = self.get_endtag_text(tag)

        if self.inraw:
            self._cache.append(text)
            if tag in self.stack:
                # Remove tag from stack
                while self.stack:
                    if self.stack.pop() == tag:
                        break
            if not self.stack:
                # End of raw block. Reset stack.
                self.inraw = False
                self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
                # Insert blank line between this and next line.
                self.cleandoc.append('\n\n')
                self._cache = []
        else:
            self.cleandoc.append(text)

    def handle_data(self, data):
        """Append text to the open raw block if there is one, else to `cleandoc`."""
        if self.inraw:
            self._cache.append(data)
        else:
            self.cleandoc.append(data)

    def handle_empty_tag(self, data, is_block):
        """ Handle empty tags (`<data>`). """
        if self.inraw:
            # Append this to the existing raw block
            self._cache.append(data)
        elif self.at_line_start() and is_block:
            # Handle this as a standalone raw block
            self.cleandoc.append(self.md.htmlStash.store(data))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
        else:
            self.cleandoc.append(data)

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag using the source text of the tag."""
        self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

    def handle_charref(self, name):
        """Handle a character reference (`&#name;`); never treated as block-level."""
        self.handle_empty_tag('&#{};'.format(name), is_block=False)

    def handle_entityref(self, name):
        """Handle an entity reference (`&name;`); never treated as block-level."""
        self.handle_empty_tag('&{};'.format(name), is_block=False)

    def handle_comment(self, data):
        """Handle a comment, rebuilt as `<!--data-->` and treated as block-level."""
        self.handle_empty_tag('<!--{}-->'.format(data), is_block=True)

    def handle_decl(self, data):
        """Handle a declaration, rebuilt as `<!data>` and treated as block-level."""
        self.handle_empty_tag('<!{}>'.format(data), is_block=True)

    def handle_pi(self, data):
        """Handle a processing instruction, rebuilt as `<?data?>` and treated as block-level."""
        self.handle_empty_tag('<?{}?>'.format(data), is_block=True)

    def unknown_decl(self, data):
        """Handle an unrecognized declaration, closing it with `]]>` for CDATA and `]>` otherwise."""
        end = ']]>' if data.startswith('CDATA[') else ']>'
        self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)
5 changes: 2 additions & 3 deletions markdown/postprocessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,8 @@ def run(self, text):
for i in range(self.md.htmlStash.html_counter):
html = self.md.htmlStash.rawHtmlBlocks[i]
if self.isblocklevel(html):
replacements["<p>%s</p>" %
(self.md.htmlStash.get_placeholder(i))] = \
html + "\n"
replacements["<p>{}</p>".format(
self.md.htmlStash.get_placeholder(i))] = html
replacements[self.md.htmlStash.get_placeholder(i)] = html

if replacements:
Expand Down
Loading