9
9
from lxml import etree
10
10
from PIL import Image , ImageDraw , ImageOps
11
11
12
def compute_changes(pdf_fn_1, pdf_fn_2, top_margin=0, bottom_margin=100):
    """Compute the text changes between two PDF files.

    Both PDFs are serialized with ``serialize_pdf``, the two serialized
    texts are diffed with ``perform_diff``, and the resulting hunks are
    mapped back onto the text boxes with ``process_hunks``.

    Args:
        pdf_fn_1: The first PDF, forwarded unchanged to ``serialize_pdf``.
        pdf_fn_2: The second PDF, forwarded unchanged to ``serialize_pdf``.
        top_margin: Forwarded to ``serialize_pdf``; the command-line help
            describes it as the end of the ignored top area, in percent of
            page height. Defaults to 0.
        bottom_margin: Forwarded to ``serialize_pdf``; the command-line help
            describes it as the start of the ignored bottom area, in percent
            of page height. Defaults to 100.

    Returns:
        Whatever ``process_hunks`` returns for the computed diff.
    """
    # Serialize the text in the two PDFs.
    # Each entry is the (boxes, text) pair returned by serialize_pdf.
    docs = [
        serialize_pdf(0, pdf_fn_1, top_margin, bottom_margin),
        serialize_pdf(1, pdf_fn_2, top_margin, bottom_margin),
    ]

    # Compute differences between the serialized text.
    diff = perform_diff(docs[0][1], docs[1][1])
    changes = process_hunks(diff, [docs[0][0], docs[1][0]])

    return changes
21
21
22
- def serialize_pdf (i , fn , top_margin ):
23
- box_generator = pdf_to_bboxes (i , fn , top_margin )
22
+ def serialize_pdf (i , fn , top_margin , bottom_margin ):
23
+ box_generator = pdf_to_bboxes (i , fn , top_margin , bottom_margin )
24
24
box_generator = mark_eol_hyphens (box_generator )
25
25
26
26
boxes = []
@@ -50,7 +50,7 @@ def serialize_pdf(i, fn, top_margin):
50
50
text = "" .join (text )
51
51
return boxes , text
52
52
53
- def pdf_to_bboxes (pdf_index , fn , top_margin = 0 ):
53
+ def pdf_to_bboxes (pdf_index , fn , top_margin = 0 , bottom_margin = 100 ):
54
54
# Get the bounding boxes of text runs in the PDF.
55
55
# Each text run is returned as a dict.
56
56
box_index = 0
@@ -77,6 +77,8 @@ def pdf_to_bboxes(pdf_index, fn, top_margin=0):
77
77
for word in page .findall ("{http://www.w3.org/1999/xhtml}word" ):
78
78
if float (word .get ("yMax" )) < (top_margin / 100.0 )* float (page .get ("height" )):
79
79
continue
80
+ if float (word .get ("yMin" )) > (bottom_margin / 100.0 )* float (page .get ("height" )):
81
+ continue
80
82
81
83
yield {
82
84
"index" : box_index ,
@@ -454,7 +456,9 @@ def main():
454
456
parser .add_argument ('-f' , '--format' , choices = ['png' ,'gif' ,'jpeg' ,'ppm' ,'tiff' ], default = 'png' ,
455
457
help = 'output format in which to render (default: png)' )
456
458
parser .add_argument ('-t' , '--top-margin' , metavar = 'margin' , default = 0. , type = float ,
457
- help = 'TODO (default 0.0)' )
459
+ help = 'top margin (ignored area) end in percent of page height (default 0.0)' )
460
+ parser .add_argument ('-b' , '--bottom-margin' , metavar = 'margin' , default = 100. , type = float ,
461
+ help = 'bottom margin (ignored area) begin in percent of page height (default 100.0)' )
458
462
args = parser .parse_args ()
459
463
460
464
def invalid_usage (msg ):
@@ -484,7 +488,7 @@ def invalid_usage(msg):
484
488
if len (args .files ) != 2 :
485
489
invalid_usage ('Insufficient number of files to compare; please supply exactly 2.' )
486
490
487
- changes = compute_changes (args .files [0 ], args .files [1 ], top_margin = float (args .top_margin ))
491
+ changes = compute_changes (args .files [0 ], args .files [1 ], top_margin = float (args .top_margin ), bottom_margin = float ( args . bottom_margin ) )
488
492
img = render_changes (changes , style )
489
493
img .save (sys .stdout .buffer , args .format .upper ())
490
494
0 commit comments