Skip to content

Commit 59e3fa5

Browse files
committed
Filter invalid characters to avoid PCDATA errors
1 parent 744276a commit 59e3fa5

File tree

1 file changed

+11
-1
lines changed

1 file changed

+11
-1
lines changed

pdf_diff/command_line.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
from lxml import etree
55
from PIL import Image, ImageDraw, ImageOps
66

7+
from six import int2byte
8+
79
def compute_changes(pdf_fn_1, pdf_fn_2, top_margin=0):
810
# Serialize the text in the two PDFs.
911
docs = [serialize_pdf(0, pdf_fn_1, top_margin), serialize_pdf(1, pdf_fn_2, top_margin)]
@@ -54,7 +56,15 @@ def pdf_to_bboxes(pdf_index, fn, top_margin=0):
5456
"file": fn,
5557
}
5658
xml = subprocess.check_output(["pdftotext", "-bbox", fn, "/dev/stdout"])
57-
dom = etree.fromstring(xml)
59+
60+
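# Strip the ASCII control characters that XML 1.0 forbids (every code below 32 except tab, LF and CR); left in place they make the parser fail with PCDATA errors.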
61+
codes_to_avoid = [ 0, 1, 2, 3, 4, 5, 6, 7, 8,
62+
11, 12,
63+
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ]
64+
65+
cleaned_xml = bytes([x for x in xml if x not in codes_to_avoid])
66+
67+
dom = etree.fromstring(cleaned_xml)
5868
for i, page in enumerate(dom.findall(".//{http://www.w3.org/1999/xhtml}page")):
5969
pagedict = {
6070
"number": i+1,

0 commit comments

Comments
 (0)