|
| 1 | +#!/usr/bin/python3 |
| 2 | + |
| 3 | +import sys, json, subprocess, io |
| 4 | +from lxml import etree |
| 5 | +from PIL import Image, ImageDraw, ImageOps |
| 6 | + |
def compute_changes(pdf_fn_1, pdf_fn_2):
    """Compute the changed text boxes between two PDF files.

    Each PDF is flattened by serialize_pdf() into a list of word boxes and
    one text string.  The two strings are diffed and the resulting hunks
    are mapped back onto the boxes by process_hunks().

    Args:
        pdf_fn_1: Path of the "before" PDF (tagged as index 0).
        pdf_fn_2: Path of the "after" PDF (tagged as index 1).

    Returns:
        The list built by process_hunks(): changed box dicts, with "*"
        markers between groups separated by unchanged text.
    """
    # Flatten both documents; the left one is always serialized first.
    left_boxes, left_text = serialize_pdf(0, pdf_fn_1)
    right_boxes, right_text = serialize_pdf(1, pdf_fn_2)

    # Diff the plain text, then translate hunks back into text boxes.
    hunks = perform_diff(left_text, right_text)
    return process_hunks(hunks, [left_boxes, right_boxes])
| 16 | + |
def serialize_pdf(i, fn):
    """Flatten the words of a PDF into positioned boxes plus one string.

    Args:
        i: Index identifying this PDF (passed through to pdf_to_bboxes()).
        fn: Path of the PDF file.

    Returns:
        A ``(boxes, text)`` tuple.  ``boxes`` is the list of word dicts
        yielded by pdf_to_bboxes(), each extended with "startIndex" and
        "textLength" locating the word inside ``text``.  ``text`` is the
        concatenation of every word followed by a single space.
    """
    runs = []
    pieces = []
    cursor = 0  # offset into the joined text where the next word starts
    for run in pdf_to_bboxes(i, fn):
        # The extracted words carry no separating whitespace, so terminate
        # each one with exactly one space to keep word boundaries visible
        # to the diff.
        piece = run["text"].strip() + " "

        run["startIndex"] = cursor
        run["textLength"] = len(piece)
        cursor += len(piece)

        runs.append(run)
        pieces.append(piece)

    return runs, "".join(pieces)
| 37 | + |
def pdf_to_bboxes(pdf_index, fn):
    """Yield one dict per word found in a PDF, with its bounding box.

    Runs ``pdftotext -bbox`` on the file and parses the XHTML it prints.

    Args:
        pdf_index: Caller-chosen index stored under ``["pdf"]["index"]``.
        fn: Path of the PDF file.

    Yields:
        Dicts with keys "pdf" (index and file, shared by all words),
        "page" (1-based number, width, height, shared by all words of the
        page), "x", "y", "width", "height" (floats taken from the word's
        xMin/yMin/xMax/yMax attributes) and "text".
    """
    # One descriptor object per document, referenced by every word.
    source = {
        "index": pdf_index,
        "file": fn,
    }

    output = subprocess.check_output(["pdftotext", "-bbox", fn, "/dev/stdout"])
    root = etree.fromstring(output)

    page_nodes = root.findall(".//{http://www.w3.org/1999/xhtml}page")
    for number, page in enumerate(page_nodes, start=1):
        # One descriptor object per page, referenced by each of its words.
        page_info = {
            "number": number,
            "width": float(page.get("width")),
            "height": float(page.get("height")),
        }
        for word in page.findall("{http://www.w3.org/1999/xhtml}word"):
            left = float(word.get("xMin"))
            top = float(word.get("yMin"))
            yield {
                "pdf": source,
                "page": page_info,
                "x": left,
                "y": top,
                "width": float(word.get("xMax")) - left,
                "height": float(word.get("yMax")) - top,
                "text": word.text,
            }
| 63 | + |
def perform_diff(doc1text, doc2text):
    """Diff two strings with the diff_match_patch module.

    Args:
        doc1text: Serialized text of the left document.
        doc2text: Serialized text of the right document.

    Returns:
        Whatever ``diff_match_patch.diff`` returns; process_hunks() consumes
        it as an iterable of ``(op, length)`` pairs with op in "=", "-", "+".
    """
    # Imported here rather than at module level, so the dependency is only
    # needed when a diff is actually computed.
    from diff_match_patch import diff

    # NOTE(review): timelimit=0 presumably disables the time limit and
    # checklines=False the line-level speedup -- confirm against the
    # diff_match_patch binding's documentation.
    options = {"timelimit": 0, "checklines": False}
    return diff(doc1text, doc2text, **options)
| 71 | + |
def process_hunks(hunks, boxes):
    """Translate diff hunks into the list of changed text boxes.

    Args:
        hunks: Iterable of ``(op, length)`` pairs where op is "=" (text
            common to both documents), "-" (only in the left document) or
            "+" (only in the right document).
        boxes: Two-item list ``[left_boxes, right_boxes]``.  The box lists
            are consumed (mutated) by mark_difference() as hunks are
            processed.

    Returns:
        A list of changed box dicts.  A "*" marker is inserted after a
        group of changes when unchanged text follows it; the list never
        starts or ends with a marker.

    Raises:
        ValueError: If a hunk has an op other than "=", "-" or "+".
    """
    positions = [0, 0]  # current text offset in the left/right document
    changes = []

    for op, oplen in hunks:
        if op not in ("=", "-", "+"):
            raise ValueError(op)

        if op == "=":
            # Shared text: nothing changed, just advance on both sides.
            positions[0] += oplen
            positions[1] += oplen

            # Separate the preceding changes from whatever follows, so
            # equivalent parts can be lined up later.
            if changes and changes[-1] != "*":
                changes.append("*")
            continue

        # Text present on one side only: "-" is the left document,
        # "+" is the right document.
        side = 1 if op == "+" else 0
        mark_difference(oplen, positions[side], boxes[side], changes)
        positions[side] += oplen

        # The other document has no such text, but mark the spot where it
        # would have been (the character just before the current position)
        # to indicate an insertion point.
        other = 1 - side
        mark_difference(1, positions[other] - 1, boxes[other], changes)
        mark_difference(0, positions[other], boxes[other], changes)

    # A trailing marker would separate the last changes from nothing.
    if changes and changes[-1] == "*":
        changes.pop()

    return changes
| 113 | + |
def mark_difference(hunk_length, offset, boxes, changes):
    """Mark the text boxes that intersect a diff hunk as changed.

    Args:
        hunk_length: Length of the hunk, in characters of serialized text.
        offset: Start of the hunk within the serialized text.
        boxes: List of box dicts ordered by "startIndex", each carrying
            "startIndex" and "textLength".  Mutated in place: every box
            that ends at or before the hunk, and every box marked as
            changed, is removed from the front of the list.
        changes: Output list; each box intersecting the hunk is appended.
    """
    hunk_end = offset + hunk_length
    total = len(boxes)
    consumed = 0

    # Skip boxes whose text lies entirely before this hunk.
    while (consumed < total
           and boxes[consumed]["startIndex"] + boxes[consumed]["textLength"] <= offset):
        consumed += 1

    # Boxes cannot be subdivided, so a box is marked as a whole even when
    # only part of its text falls inside the hunk.
    while consumed < total and boxes[consumed]["startIndex"] < hunk_end:
        changes.append(boxes[consumed])
        consumed += 1

    # Drop everything handled so far in a single operation.  Removing the
    # boxes means none can be marked twice; doing it once instead of with
    # repeated pop(0) avoids shifting the whole list for every box.
    del boxes[:consumed]
| 133 | + |
| 134 | +# Turns a JSON object of PDF changes into a PNG and writes it to stream. |
def render_changes(changes, stream):
    """Render the changed regions of both PDFs side by side as one PNG.

    Args:
        changes: Sequence of changed box dicts (keys "pdf", "page", "x",
            "y", "width", "height") as produced by process_hunks(), or the
            same structure loaded from JSON.  "*" marker entries are
            skipped.
        stream: Binary writable stream that receives the PNG data.
    """

    def content_bbox(im):
        # ImageOps.invert() is applied to a grayscale copy; white paper
        # then becomes black (zero), so getbbox() yields the extent of the
        # non-white pixels.  getbbox() returns None for an image with no
        # such pixels, in which case the full image is used.
        bbox = ImageOps.invert(im.convert("L")).getbbox()
        if bbox is None:
            return (0, 0, im.size[0], im.size[1])
        return bbox

    # The "*" alignment markers are not handled yet; draw real boxes only.
    marked = [change for change in changes if change != "*"]

    # Rasterize every page named in the changes, once per page.
    pages = [{}, {}]
    for change in marked:
        side = pages[change["pdf"]["index"]]
        number = change["page"]["number"]
        if number not in side:
            side[number] = pdftopng(change["pdf"]["file"], number)

    # Draw red boxes around changes, scaling PDF units to image pixels.
    for change in marked:
        im = pages[change["pdf"]["index"]][change["page"]["number"]]
        page = change["page"]
        coords = (
            change["x"] * im.size[0]/page["width"],
            change["y"] * im.size[1]/page["height"],
            (change["x"]+change["width"]) * im.size[0]/page["width"],
            (change["y"]+change["height"]) * im.size[1]/page["height"],
        )
        ImageDraw.Draw(im).rectangle(coords, outline="red")

    # Crop away the margins.  Each page is cropped vertically to its own
    # content, but all pages of one document share the same horizontal
    # crop so they stay aligned when stacked.
    for idx in (0, 1):
        if not pages[idx]:
            continue

        bboxes = {pg: content_bbox(im) for pg, im in pages[idx].items()}

        # Horizontal extremes over all pages: the leftmost left edge and
        # the rightmost right edge, plus a small margin, clamped to the
        # widest page.
        page_width = max(im.size[0] for im in pages[idx].values())
        hpad = int(.02*page_width)
        minx = max(0, min(bbox[0] for bbox in bboxes.values()) - hpad)
        maxx = min(page_width, max(bbox[2] for bbox in bboxes.values()) + hpad)

        for pg, im in list(pages[idx].items()):
            bbox = bboxes[pg]
            vpad = int(.02*im.size[1])
            pages[idx][pg] = im.crop(
                (minx, max(0, bbox[1]-vpad), maxx, min(im.size[1], bbox[3]+vpad)))

    # Stack all of the changed pages into the final image: left document
    # in the left column, right document in the right column.

    # Compute the dimensions of the final image.
    height = 0
    width = [0, 0]
    for idx in (0, 1):
        side_height = 0
        for im in pages[idx].values():
            side_height += im.size[1]
            width[idx] = max(width[idx], im.size[0])
        height = max(height, side_height)

    # Paste in the pages in page order, with a horizontal rule at the top
    # of each page.
    img = Image.new("RGBA", (sum(width), height))
    draw = ImageDraw.Draw(img)
    for idx in (0, 1):
        y = 0
        for pg in sorted(pages[idx]):
            pgimg = pages[idx][pg]
            img.paste(pgimg, (idx * width[0], y))
            draw.line((0 if idx == 0 else width[0], y, sum(width[0:idx+1]), y), fill="black")
            y += pgimg.size[1]

    # Draw a vertical line between the two sides.
    draw.line((width[0], 0, width[0], height), fill="black")

    # Write it out.
    img.save(stream, "PNG")
| 215 | + |
| 216 | +# Rasterizes a page of a PDF. |
def pdftopng(pdffile, pagenumber, width=900):
    """Rasterize one page of a PDF with pdftoppm and return it as an image.

    Args:
        pdffile: Path of the PDF file.
        pagenumber: 1-based number of the page to render.
        width: Value passed to pdftoppm's ``-scale-to`` option.  Per the
            pdftoppm manual this bounds the page's longer side, so a
            portrait page comes out narrower than ``width`` pixels.

    Returns:
        A PIL image in "RGBA" mode.

    Raises:
        subprocess.CalledProcessError: If pdftoppm exits with an error.
    """
    # Resolve pdftoppm through PATH, the same way pdftotext is located,
    # rather than assuming it is installed in /usr/bin.  Without an output
    # file root, pdftoppm writes the image to stdout.
    pngbytes = subprocess.check_output([
        "pdftoppm",
        "-f", str(pagenumber),
        "-l", str(pagenumber),
        "-scale-to", str(width),
        "-png",
        pdffile,
    ])
    im = Image.open(io.BytesIO(pngbytes))
    return im.convert("RGBA")
| 221 | + |
if __name__ == "__main__":
    if len(sys.argv) == 2 and sys.argv[1] == "--changes":
        # Rendering only: read a previously computed changes list as JSON
        # from stdin and write the PNG to stdout.
        render_changes(json.load(sys.stdin), sys.stdout.buffer)
        sys.exit(0)

    # Both PDF paths are required.  Checking for fewer than three argv
    # entries also covers the single-argument case, which would otherwise
    # fail with an IndexError on sys.argv[2] instead of showing the usage.
    if len(sys.argv) < 3:
        print("Usage: python3 pdf-diff.py before.pdf after.pdf > changes.png", file=sys.stderr)
        sys.exit(1)

    # Full pipeline: diff the two PDFs and write the PNG to stdout.
    changes = compute_changes(sys.argv[1], sys.argv[2])
    render_changes(changes, sys.stdout.buffer)
0 commit comments