Skip to content

Commit b8caf00

Browse files
committed
make it work entirely in Python 3
1 parent 713f7ce commit b8caf00

File tree

3 files changed

+258
-106
lines changed

3 files changed

+258
-106
lines changed

README.md

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,32 @@ Finds differences between two PDF documents:
88

99
![Example Image Output](example.png)
1010

11-
Unfortunately while I started this project in node.js, I couldn't figure out how to quickly do the rendering part in node.js and so I switched to Python where I had some similar code laying around already.
11+
The script is written in Python 3, and it relies on the `pdftotext` and `pdftoppm` command-line programs (`pdftotext` to extract the text, `pdftoppm` to rasterize the pages).
1212

1313
Installation
1414
------------
1515

16-
# for the comparison tool
16+
sudo pip3 install pillow lxml
17+
18+
# get my Python extension module for the Google Diff Match Patch library
19+
# so we can compute differences in text very quickly
20+
git clone --recursive https://github.com/JoshData/diff_match_patch-python
21+
cd diff_match_patch-python
22+
python3 setup.py build
23+
sudo python3 setup.py install
24+
25+
Running
26+
-------
27+
28+
Turn two PDFs into one large PNG image showing the differences:
29+
30+
python3 pdf-diff.py before.pdf after.pdf > test.png
31+
32+
33+
Node Version
34+
------------
35+
36+
There's also a node.js version for computing the changes that uses Mozilla's pdf.js instead of `pdftotext`. Getting pdf.js to work in node.js isn't straightforward:
1737

1838
npm install
1939

@@ -22,21 +42,14 @@ Installation
2242
node make singlefile
2343
cd ..
2444

25-
# for the renderer
26-
27-
sudo pip3 install pillow
28-
29-
Running
30-
-------
31-
3245
Compute the changes (writes a JSON file):
3346

3447
node index.js before.pdf after.pdf | grep -v "^Warning:" > changes.json
3548

3649
(Unfortunately the pdf.js library prints warnings on STDOUT, so we have to filter those out.)
3750

38-
Render the changes (turns the PDFs + JSON file into a big PNG image):
39-
40-
python3 render.py < changes.json > test.png
51+
Render the changes:
4152

53+
python3 pdf-diff.py --changes < changes.json > test.png
4254

55+
`pdftotext` gives bounding boxes for each word in the document while pdf.js only gives bounding boxes for text runs, so the granularity is not as good.

pdf-diff.py

Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
#!/usr/bin/python3
2+
3+
import sys, json, subprocess, io
4+
from lxml import etree
5+
from PIL import Image, ImageDraw, ImageOps
6+
7+
def compute_changes(pdf_fn_1, pdf_fn_2):
    """Compute the changed text boxes between two PDF files.

    pdf_fn_1 is treated as the left document (index 0) and pdf_fn_2 as
    the right document (index 1). Returns whatever process_hunks()
    produces for the diff of their serialized text.
    """
    # Serialize each PDF into its text boxes and one flat text string.
    left_boxes, left_text = serialize_pdf(0, pdf_fn_1)
    right_boxes, right_text = serialize_pdf(1, pdf_fn_2)

    # Diff the flat text, then map the diff hunks back onto the boxes.
    hunks = perform_diff(left_text, right_text)
    return process_hunks(hunks, [left_boxes, right_boxes])
16+
17+
def serialize_pdf(i, fn):
    """Flatten the text runs of a PDF into one string plus indexed boxes.

    i is the document index handed through to pdf_to_bboxes(), fn is the
    PDF file name. Returns a (boxes, text) tuple: text is the
    concatenation of every run's normalized text, and boxes is the list
    of run dicts, each annotated with "startIndex" and "textLength"
    giving the run's position inside text.
    """
    boxes = []
    pieces = []
    textlength = 0
    for run in pdf_to_bboxes(i, fn):
        # The run text comes from an XML element's .text, which is None
        # when the element is empty; treat that as an empty string
        # instead of crashing on None.strip().
        raw_text = run["text"] or ""

        # Ensure that each run ends with a space, since pdftotext
        # strips spaces between words. If we do a word-by-word diff,
        # that would be important.
        normalized_text = raw_text.strip() + " "

        run["startIndex"] = textlength
        run["textLength"] = len(normalized_text)
        boxes.append(run)
        pieces.append(normalized_text)
        textlength += len(normalized_text)

    return boxes, "".join(pieces)
37+
38+
def pdf_to_bboxes(pdf_index, fn):
    """Yield one dict per word of the PDF, with its bounding box.

    Runs `pdftotext -bbox` on fn and parses its XHTML output. Each
    yielded dict has the keys "pdf" (a dict with "index" and "file",
    shared by all words of the document), "page" (a dict with "number",
    "width" and "height", shared by all words of the page), "x", "y",
    "width", "height" and "text".

    Raises subprocess.CalledProcessError if pdftotext exits with a
    non-zero status.
    """
    xhtml = "{http://www.w3.org/1999/xhtml}"

    pdfdict = {
        "index": pdf_index,
        "file": fn,
    }

    # "-" asks pdftotext to write to standard output; unlike the
    # "/dev/stdout" device path it does not depend on the platform.
    xml = subprocess.check_output(["pdftotext", "-bbox", fn, "-"])
    dom = etree.fromstring(xml)

    for i, page in enumerate(dom.findall(".//" + xhtml + "page")):
        pagedict = {
            "number": i+1,  # page numbers are 1-based
            "width": float(page.get("width")),
            "height": float(page.get("height"))
        }
        for word in page.findall(xhtml + "word"):
            x_min = float(word.get("xMin"))
            y_min = float(word.get("yMin"))
            yield {
                "pdf": pdfdict,
                "page": pagedict,
                "x": x_min,
                "y": y_min,
                "width": float(word.get("xMax")) - x_min,
                "height": float(word.get("yMax")) - y_min,
                "text": word.text,
            }
63+
64+
def perform_diff(doc1text, doc2text):
    """Diff two strings with the diff_match_patch extension module.

    Returns the result of diff_match_patch.diff() called with no time
    limit and with the line-level pre-pass (checklines) turned off.
    """
    # Imported here so the module is only required when a diff is run.
    import diff_match_patch as dmp

    options = {"timelimit": 0, "checklines": False}
    return dmp.diff(doc1text, doc2text, **options)
71+
72+
def process_hunks(hunks, boxes):
    """Translate diff hunks into a list of changed text boxes.

    hunks is an iterable of (op, length) pairs where op is "=" (text
    common to both documents), "-" (text only in the left document) or
    "+" (text only in the right document). boxes is a two-element list
    holding the left and right documents' box lists; they are handed to
    mark_difference(), which consumes them.

    Returns a list of box dicts, with "*" markers separating groups of
    changes that were split by a region of common text. Raises
    ValueError for an unrecognized op.
    """
    positions = [0, 0]  # current character offset in each document
    changes = []

    for op, oplen in hunks:
        if op not in ("=", "-", "+"):
            raise ValueError(op)

        if op == "=":
            # Common text: nothing to mark, just advance both documents.
            positions[0] += oplen
            positions[1] += oplen

            # Leave a marker so equivalent parts can be lined up later;
            # never emit a leading marker or two markers in a row.
            if changes and changes[-1] != "*":
                changes.append("*")
            continue

        # The text exists only on one side: left for "-", right for "+".
        side = 1 if op == "+" else 0
        mark_difference(oplen, positions[side], boxes[side], changes)
        positions[side] += oplen

        # The text is absent from the other document, but mark the spot
        # where it would have been so the insertion point is visible.
        other = 1 - side
        mark_difference(1, positions[other] - 1, boxes[other], changes)
        mark_difference(0, positions[other], boxes[other], changes)

    # Drop a trailing marker; it separates nothing.
    if changes and changes[-1] == "*":
        changes.pop()

    return changes
113+
114+
def mark_difference(hunk_length, offset, boxes, changes):
    """Mark the boxes overlapping a diff hunk as changed.

    offset and hunk_length locate the hunk within the serialized text of
    one document. boxes is that document's remaining list of box dicts
    (each with "startIndex" and "textLength"), ordered by position.
    Boxes intersecting the hunk are appended to changes.

    boxes is modified in place: every box lying entirely before the hunk
    and every box marked as changed is removed from its front, so a box
    can never be marked twice. Returns None.
    """
    count = len(boxes)

    # Skip boxes whose text is entirely before this hunk.
    first = 0
    while first < count and (boxes[first]["startIndex"] + boxes[first]["textLength"]) <= offset:
        first += 1

    # Collect the boxes that intersect this hunk. We can't subdivide
    # boxes, so even though not all of the text in a box might have
    # changed, the whole box is marked as changed.
    hunk_end = offset + hunk_length
    last = first
    while last < count and boxes[last]["startIndex"] < hunk_end:
        last += 1

    changes.extend(boxes[first:last])

    # Remove the skipped and the marked boxes with a single slice delete
    # instead of repeated pop(0) calls, each of which shifts the list.
    del boxes[:last]
133+
134+
# Turns a JSON object of PDF changes into a PNG and writes it to stream.
135+
def render_changes(changes, stream):
    """Render the changed pages of both PDFs side by side as one PNG.

    changes is a list of box dicts (as produced by pdf_to_bboxes(), or
    loaded from the equivalent JSON) interleaved with "*" markers, which
    are currently ignored. Every page that contains a change is
    rasterized, each changed box is outlined in red, the pages are
    cropped to their content and stacked vertically -- left document in
    the left column, right document in the right column. The result is
    written to stream, a writable binary stream, in PNG format.
    """
    # Load all of the pages named in changes, once per document and page.
    pages = [{}, {}]
    for change in changes:
        if change == "*":
            continue  # not handled yet
        doc_pages = pages[change["pdf"]["index"]]
        number = change["page"]["number"]
        if number not in doc_pages:
            doc_pages[number] = pdftopng(change["pdf"]["file"], number)

    # Draw red boxes around changes.
    for change in changes:
        if change == "*":
            continue  # not handled yet

        im = pages[change["pdf"]["index"]][change["page"]["number"]]

        # Scale from the page's own units to pixels of the raster image.
        coords = (
            change["x"] * im.size[0]/change["page"]["width"],
            change["y"] * im.size[1]/change["page"]["height"],
            (change["x"]+change["width"]) * im.size[0]/change["page"]["width"],
            (change["y"]+change["height"]) * im.size[1]/change["page"]["height"],
        )

        ImageDraw.Draw(im).rectangle(coords, outline="red")

    # Zealous crop all of the pages. Vertical margins can be cropped
    # per page, but be sure to crop all pages of a document the same
    # horizontally.
    for idx in (0, 1):
        minx = None
        maxx = None
        width = None
        content = {}
        for pg, im in pages[idx].items():
            # .invert() requires a grayscale image. getbbox() returns
            # None when the page has no content at all; keep the whole
            # page in that case.
            bbox = ImageOps.invert(im.convert("L")).getbbox()
            if bbox is None:
                bbox = (0, 0, im.size[0], im.size[1])
            content[pg] = bbox

            # Track the horizontal extremes over all pages. Compare
            # against None explicitly: 0 is a valid left edge.
            minx = bbox[0] if minx is None else min(minx, bbox[0])
            maxx = bbox[2] if maxx is None else max(maxx, bbox[2])
            width = im.size[0] if width is None else max(width, im.size[0])

        if width is not None:
            hpad = int(.02*width)  # add back some margins
            minx = max(0, minx - hpad)
            maxx = min(width, maxx + hpad)

        # Do the crop.
        for pg, bbox in content.items():
            im = pages[idx][pg]
            vpad = int(.02*im.size[1])
            pages[idx][pg] = im.crop(
                (minx, max(0, bbox[1]-vpad), maxx, min(im.size[1], bbox[3]+vpad)))

    # Stack all of the changed pages into a final image.

    # Compute the dimensions of the final image.
    height = 0
    widths = [0, 0]
    for idx in (0, 1):
        side_height = 0
        for im in pages[idx].values():
            side_height += im.size[1]
            widths[idx] = max(widths[idx], im.size[0])
        height = max(height, side_height)

    # Paste in the pages, in page order, with a line above each one.
    img = Image.new("RGBA", (sum(widths), height))
    draw = ImageDraw.Draw(img)
    for idx in (0, 1):
        y = 0
        for pg in sorted(pages[idx]):
            pgimg = pages[idx][pg]
            img.paste(pgimg, (idx * widths[0], y))
            draw.line((0 if idx == 0 else widths[0], y, sum(widths[0:idx+1]), y), fill="black")
            y += pgimg.size[1]

    # Draw a vertical line between the two sides.
    draw.line((widths[0], 0, widths[0], height), fill="black")

    # Write it out.
    img.save(stream, "PNG")
215+
216+
# Rasterizes a page of a PDF.
217+
def pdftopng(pdffile, pagenumber, width=900):
    """Rasterize one page of a PDF and return it as an RGBA image.

    pdffile is the path of the PDF and pagenumber the page to render
    (passed to pdftoppm as both -f and -l). width is passed to
    pdftoppm's -scale-to option.

    Raises subprocess.CalledProcessError if pdftoppm exits with a
    non-zero status.
    """
    page = str(pagenumber)
    # Resolve pdftoppm through PATH, the same way pdftotext is invoked,
    # instead of assuming it is installed in /usr/bin.
    pngbytes = subprocess.check_output(
        ["pdftoppm", "-f", page, "-l", page, "-scale-to", str(width), "-png", pdffile])
    im = Image.open(io.BytesIO(pngbytes))
    return im.convert("RGBA")
221+
222+
if __name__ == "__main__":
    # Command-line entry point. Two modes:
    #   pdf-diff.py --changes < changes.json > out.png   (render only)
    #   pdf-diff.py before.pdf after.pdf > out.png       (diff and render)
    if len(sys.argv) == 2 and sys.argv[1] == "--changes":
        # to just do the rendering part
        render_changes(json.load(sys.stdin), sys.stdout.buffer)
        sys.exit(0)

    # Both PDF file names are required. Checking only for "no arguments"
    # would let a single argument through and crash on sys.argv[2].
    if len(sys.argv) < 3:
        print("Usage: python3 pdf-diff.py before.pdf after.pdf > changes.png", file=sys.stderr)
        sys.exit(1)

    changes = compute_changes(sys.argv[1], sys.argv[2])
    # The PNG is binary, so write to the underlying byte stream of stdout.
    render_changes(changes, sys.stdout.buffer)

render.py

Lines changed: 0 additions & 94 deletions
This file was deleted.

0 commit comments

Comments
 (0)