Skip to content

Commit cae3a1b

Browse files
committed
Have LZWCodec and FlateCodec return bytes data, as instructed by ISO 32000.
1 parent 0c9ceb6 commit cae3a1b

File tree

2 files changed

+50
-24
lines changed

2 files changed

+50
-24
lines changed

pypdf/filters.py

Lines changed: 43 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -34,15 +34,11 @@
3434
import math
3535
import struct
3636
import sys
37+
from io import BytesIO
3738
from sys import version_info
3839

3940
from .utils import PdfReadError, pypdfOrd, paethPredictor, PdfStreamError
4041

41-
if version_info < (3, 0):
42-
from cStringIO import StringIO
43-
else:
44-
from io import StringIO
45-
4642
try:
4743
import zlib
4844

@@ -135,9 +131,9 @@ def encode(data, decodeParms=None):
135131
def decode(data, decodeParms=None):
136132
"""
137133
:param data: flate-encoded data.
138-
:param decodeParms: a dictionary of values, understanding the
139-
"/Predictor":<int> key only
134+
:param decodeParms: a dictionary of parameter values.
140135
:return: the flate-decoded data.
136+
:rtype: bytes
141137
"""
142138
data = decompress(data)
143139
predictor = 1
@@ -156,7 +152,7 @@ def decode(data, decodeParms=None):
156152

157153
# PNG prediction:
158154
if 10 <= predictor <= 15:
159-
output = StringIO()
155+
output = BytesIO()
160156
# PNG prediction can vary from row to row
161157
row_length = columns + 1
162158
assert len(data) % row_length == 0
@@ -165,22 +161,22 @@ def decode(data, decodeParms=None):
165161
for row in range(len(data) // row_length):
166162
rowdata = [
167163
pypdfOrd(x) for x in
168-
data[(row*row_length):((row+1)*row_length)]
164+
data[(row * row_length):((row + 1) * row_length)]
169165
]
170166
filterByte = rowdata[0]
171167

172168
if filterByte == 0:
173169
pass
174170
elif filterByte == 1:
175171
for i in range(2, row_length):
176-
rowdata[i] = (rowdata[i] + rowdata[i-1]) % 256
172+
rowdata[i] = (rowdata[i] + rowdata[i - 1]) % 256
177173
elif filterByte == 2:
178174
for i in range(1, row_length):
179175
rowdata[i] = (rowdata[i] + prev_rowdata[i]) % 256
180176
elif filterByte == 3:
181177
for i in range(1, row_length):
182178
left = rowdata[i - 1] if i > 1 else 0
183-
floor = math.floor(left + prev_rowdata[i])/2
179+
floor = math.floor(left + prev_rowdata[i]) / 2
184180
rowdata[i] = (rowdata[i] + int(floor)) % 256
185181
elif filterByte == 4:
186182
for i in range(1, row_length):
@@ -194,8 +190,15 @@ def decode(data, decodeParms=None):
194190
raise PdfReadError(
195191
"Unsupported PNG filter %r" % filterByte
196192
)
193+
197194
prev_rowdata = rowdata
198-
output.write(''.join([chr(x) for x in rowdata[1:]]))
195+
196+
for d in rowdata:
197+
if sys.version_info < (3, 0):
198+
output.write(chr(d))
199+
else:
200+
output.write(bytes([d]))
201+
199202

200203
data = output.getvalue()
201204
else:
@@ -399,25 +402,28 @@ def __init__(self, data):
399402
self.data = data
400403
self.bytepos = 0
401404
self.bitpos = 0
402-
self.dict = [u""] * self.MAX_ENTRIES
405+
self.dict = [b""] * self.MAX_ENTRIES
403406
self.dictindex = None
404407
self.bitspercode = None
405408

406409
for i in range(256):
407410
if sys.version_info < (3, 0):
408-
self.dict[i] = chr(i).decode("LATIN1")
409-
else:
410411
self.dict[i] = chr(i)
412+
else:
413+
self.dict[i] = bytes([i])
411414

412415
self._resetDict()
413416

414417
def decode(self):
415418
"""
416419
The TIFF 6.0 specification explains in sufficient detail the steps to
417420
implement the LZW encode() and decode() algorithms.
421+
422+
:rtype: bytes
418423
"""
424+
# TO-DO Make return value type bytes, as instructed by ISO 32000
419425
cW = self.CLEARDICT
420-
output = u""
426+
output = b""
421427

422428
while True:
423429
pW = cW
@@ -436,11 +442,19 @@ def decode(self):
436442
else:
437443
if cW < self.dictindex:
438444
output += self.dict[cW]
439-
p = self.dict[pW] + self.dict[cW][0]
445+
446+
if sys.version_info > (3, 0):
447+
p = self.dict[pW] + bytes([self.dict[cW][0]])
448+
else:
449+
p = self.dict[pW] + self.dict[cW][0]
440450

441451
self._addCodeToTable(p)
442452
else:
443-
p = self.dict[pW] + self.dict[pW][0]
453+
if sys.version_info > (3, 0):
454+
p = self.dict[pW] + bytes([self.dict[pW][0]])
455+
else:
456+
p = self.dict[pW] + self.dict[pW][0]
457+
444458
output += p
445459
self._addCodeToTable(p)
446460

@@ -486,10 +500,21 @@ def _addCodeToTable(self, data):
486500

487501
@staticmethod
488502
def encode(data, decodeParms=None):
503+
"""
504+
:param data: ``str`` or ``bytes`` input to encode.
505+
:param decodeParms:
506+
:return: encoded LZW text.
507+
"""
489508
return LZWCodec.Encoder(data).encode()
490509

491510
@staticmethod
492511
def decode(data, decodeParms=None):
512+
"""
513+
:param data: ``bytes`` or ``str`` text to decode.
514+
:param decodeParms: a dictionary of parameter values.
515+
:return: decoded data.
516+
:rtype: bytes
517+
"""
493518
return LZWCodec.Decoder(data).decode()
494519

495520

tests/test_filters.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -273,20 +273,21 @@ def testEncodeDecode(self):
273273
string.ascii_letters, 2000 * string.ascii_letters
274274
]
275275

276+
if sys.version_info > (3, 0):
277+
for index, e in enumerate(inputs):
278+
inputs[index] = e.encode("LATIN1")
279+
276280
for f in ("Hamlet.txt", "TheHappyPrince.txt", ):
277281
with open(join(TEST_DATA_ROOT, f), "rb") as infile:
278282
# TO-DO If the number of bytes read approaches 10K, the
279283
# codec stops working correctly. This is a bug to fix!
280284
inputs.append(infile.read())
281285

282-
for t in inputs:
283-
e = LZWCodec.Encoder(t)
286+
for b in inputs:
287+
e = LZWCodec.Encoder(b)
284288
d = LZWCodec.Decoder(e.encode())
285289

286-
if isinstance(t, bytes) and sys.version_info > (3, 0):
287-
self.assertEqual(t, d.decode().encode("LATIN1"))
288-
else:
289-
self.assertEqual(t, d.decode())
290+
self.assertEqual(b, d.decode())
290291

291292

292293
class DecodeStreamDataTestCase(unittest.TestCase):

0 commit comments

Comments
 (0)