Skip to content

Commit 0c13b88

Browse files
committed
Accept a new --top-margin command-line argument for ignoring all text above a certain point on the page, given as a percentage of the page height, so that changes in page break points don't cause page header text to appear to jump around.
1 parent ea6c5bd commit 0c13b88

File tree

1 file changed

+20
-9
lines changed

1 file changed

+20
-9
lines changed

pdf-diff.py

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,18 @@
44
from lxml import etree
55
from PIL import Image, ImageDraw, ImageOps
66

7-
def compute_changes(pdf_fn_1, pdf_fn_2):
7+
def compute_changes(pdf_fn_1, pdf_fn_2, top_margin=0):
88
# Serialize the text in the two PDFs.
9-
docs = [serialize_pdf(0, pdf_fn_1), serialize_pdf(1, pdf_fn_2)]
9+
docs = [serialize_pdf(0, pdf_fn_1, top_margin), serialize_pdf(1, pdf_fn_2, top_margin)]
1010

1111
# Compute differences between the serialized text.
1212
diff = perform_diff(docs[0][1], docs[1][1])
1313
changes = process_hunks(diff, [docs[0][0], docs[1][0]])
1414

1515
return changes
1616

17-
def serialize_pdf(i, fn):
18-
box_generator = pdf_to_bboxes(i, fn)
17+
def serialize_pdf(i, fn, top_margin):
18+
box_generator = pdf_to_bboxes(i, fn, top_margin)
1919
box_generator = mark_eol_hyphens(box_generator)
2020

2121
boxes = []
@@ -45,7 +45,7 @@ def serialize_pdf(i, fn):
4545
text = "".join(text)
4646
return boxes, text
4747

48-
def pdf_to_bboxes(pdf_index, fn):
48+
def pdf_to_bboxes(pdf_index, fn, top_margin=0):
4949
# Get the bounding boxes of text runs in the PDF.
5050
# Each text run is returned as a dict.
5151
box_index = 0
@@ -62,6 +62,9 @@ def pdf_to_bboxes(pdf_index, fn):
6262
"height": float(page.get("height"))
6363
}
6464
for word in page.findall("{http://www.w3.org/1999/xhtml}word"):
65+
if float(word.get("yMax")) < (top_margin/100.0)*float(page.get("height")):
66+
continue
67+
6568
yield {
6669
"index": box_index,
6770
"pdf": pdfdict,
@@ -435,12 +438,20 @@ def pdftopng(pdffile, pagenumber, width=900):
435438
args = sys.argv[1:]
436439

437440
styles = ["strike", "underline"]
438-
if args[0] == "--style":
439-
args.pop(0)
440-
styles = args.pop(0).split(',')
441+
top_margin = 0
442+
while True:
443+
if args[0] == "--style":
444+
args.pop(0)
445+
styles = args.pop(0).split(',')
446+
continue
447+
if args[0] == "--top-margin":
448+
args.pop(0)
449+
top_margin = float(args.pop(0))
450+
continue
451+
break
441452

442453
left_file = args.pop(0)
443454
right_file = args.pop(0)
444455

445-
changes = compute_changes(left_file, right_file)
456+
changes = compute_changes(left_file, right_file, top_margin=top_margin)
446457
render_changes(changes, styles, sys.stdout.buffer)

0 commit comments

Comments
 (0)