4
4
from lxml import etree
5
5
from PIL import Image , ImageDraw , ImageOps
6
6
7
def compute_changes(pdf_fn_1, pdf_fn_2, top_margin=0):
    """Compute the text changes between two PDF files.

    Args:
        pdf_fn_1: File name of the first PDF (serialized with index 0).
        pdf_fn_2: File name of the second PDF (serialized with index 1).
        top_margin: Forwarded unchanged to ``serialize_pdf`` for both
            documents. Defaults to 0.

    Returns:
        The result of ``process_hunks`` applied to the diff of the two
        serialized texts and the two lists of boxes.
    """
    # Serialize the text in the two PDFs. serialize_pdf returns a
    # (boxes, text) pair for each document.
    boxes_1, text_1 = serialize_pdf(0, pdf_fn_1, top_margin)
    boxes_2, text_2 = serialize_pdf(1, pdf_fn_2, top_margin)

    # Compute differences between the serialized text.
    diff = perform_diff(text_1, text_2)
    changes = process_hunks(diff, [boxes_1, boxes_2])

    return changes
16
16
17
- def serialize_pdf (i , fn ):
18
- box_generator = pdf_to_bboxes (i , fn )
17
+ def serialize_pdf (i , fn , top_margin ):
18
+ box_generator = pdf_to_bboxes (i , fn , top_margin )
19
19
box_generator = mark_eol_hyphens (box_generator )
20
20
21
21
boxes = []
@@ -45,7 +45,7 @@ def serialize_pdf(i, fn):
45
45
text = "" .join (text )
46
46
return boxes , text
47
47
48
- def pdf_to_bboxes (pdf_index , fn ):
48
+ def pdf_to_bboxes (pdf_index , fn , top_margin = 0 ):
49
49
# Get the bounding boxes of text runs in the PDF.
50
50
# Each text run is returned as a dict.
51
51
box_index = 0
@@ -62,6 +62,9 @@ def pdf_to_bboxes(pdf_index, fn):
62
62
"height" : float (page .get ("height" ))
63
63
}
64
64
for word in page .findall ("{http://www.w3.org/1999/xhtml}word" ):
65
+ if float (word .get ("yMax" )) < (top_margin / 100.0 )* float (page .get ("height" )):
66
+ continue
67
+
65
68
yield {
66
69
"index" : box_index ,
67
70
"pdf" : pdfdict ,
@@ -435,12 +438,20 @@ def pdftopng(pdffile, pagenumber, width=900):
435
438
# Command-line entry: optional flags first, then the two PDF file names.
args = sys.argv[1:]

styles = ["strike", "underline"]
top_margin = 0

# Consume leading options. Looping on `args` (rather than `while True`)
# keeps an exhausted argument list from raising IndexError on args[0].
while args:
    if args[0] == "--style":
        args.pop(0)
        if not args:
            sys.exit("--style requires a comma-separated list of styles")
        styles = args.pop(0).split(',')
        continue
    if args[0] == "--top-margin":
        args.pop(0)
        if not args:
            sys.exit("--top-margin requires a value")
        try:
            # Interpreted downstream as a percentage of the page height.
            top_margin = float(args.pop(0))
        except ValueError:
            sys.exit("--top-margin requires a numeric value")
        continue
    break

# Both positional file names are mandatory; fail with a usage message
# instead of an IndexError from pop(0) on an empty list.
if len(args) < 2:
    sys.exit(
        "usage: %s [--style STYLES] [--top-margin PERCENT] left.pdf right.pdf"
        % sys.argv[0]
    )

left_file = args.pop(0)
right_file = args.pop(0)

changes = compute_changes(left_file, right_file, top_margin=top_margin)
render_changes(changes, styles, sys.stdout.buffer)
0 commit comments