diff --git a/CHANGELOG.md b/CHANGELOG.md index 2848f68ea5..ba40f6c3bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,40 @@ +Version 3.9.0, 2023-05-21 +------------------------- + +New Features (ENH): +- Simplify metadata input (Document Information Dictionary) (#1851) +- Extend cmap compatibility to GBK_EUC_H/V (#1812) + +Bug Fixes (BUG): +- Prevent infinite loop when no character follows after a comment (#1828) +- get_contents does not return ContentStream (#1847) +- Accept XYZ destination with zoom missing (default to zoom=0.0) (#1844) +- Cope with 1 Bit images (#1815) + +Robustness (ROB): +- Handle missing /Type entry in Page tree (#1845) + +Documentation (DOC): +- Expand file size explanations (#1835) +- Add comparison with pdfplumber (#1837) +- Clarify that PyPDF2 is dead (#1827) +- Add Hunter King as Contributor for #1806 + +Maintenance (MAINT): +- Refactor internal Encryption class (#1821) +- Add R parameter to generate_values (#1820) +- Make encryption_key parameter of write_to_stream optional (#1819) +- Prepare for adding AES encryption support (#1818) + +Testing (TST): +- Parametrize test_cmap_encodings (#1823) + +Code Style (STY): +- Iterate directly over the list instead of using range (#1839) +- Minor refactorings in _encryption.py (#1822) + +[Full Changelog](https://github.com/py-pdf/pypdf/compare/3.8.1...3.9.0) + # CHANGELOG ## Version 3.8.1, 2023-04-23 diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 471a225283..5d36b9ac80 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -19,6 +19,7 @@ history and [GitHubs 'Contributors' feature](https://github.com/py-pdf/pypdf/gra * [Hale, Joseph](https://github.com/thehale) * [JianzhengLuo](https://github.com/JianzhengLuo) * [Karvonen, Harry](https://github.com/Hatell/) +* [King, Hunter](https://github.com/neversphere) * [Kotler, Mitchell](https://github.com/mitchelljkotler) * [KourFrost](https://github.com/KourFrost) * [Lightup1](https://github.com/Lightup1) diff --git a/docs/meta/comparisons.md
b/docs/meta/comparisons.md index 9c7ddc2f58..d38861495a 100644 --- a/docs/meta/comparisons.md +++ b/docs/meta/comparisons.md @@ -38,8 +38,10 @@ As pypdf is free software, there were attempts to fork it and continue the development. PyPDF3 was first released in 2018 and still receives updates. PyPDF4 has only one release from 2018. -I, Martin Thoma, the current maintainer of pypdf, hope that we can -bring the community back to one path of development. Let's see. +I (Martin Thoma, the current maintainer of pypdf and PyPDF2) hope that we can +bring the community back to one path of development. I have already deprecated PyPDF2 in +favor of pypdf, and pypdf now has more features and a cleaner interface +than PyPDF2. See [history of pypdf](history.md). [free]: https://en.wikipedia.org/wiki/Free_software [PyMuPDF]: https://pypi.org/project/PyMuPDF/ @@ -48,13 +50,17 @@ bring the community back to one path of development. Let's see. [QPDF]: https://github.com/qpdf/qpdf -## pdfminer +## pdfminer.six and pdfplumber [`pdfminer.six`](https://pypi.org/project/pdfminer.six/) is capable of extracting the [font size](https://stackoverflow.com/a/69962459/562769) / font weight (bold-ness). It has no capabilities for writing PDF files. -## pdfrw / pdfminer / pdfplumber +[`pdfplumber`](https://pypi.org/project/pdfplumber/) is a library focused on extracting data from PDF documents. Since `pdfplumber` is built on top of `pdfminer.six`, it has **no capabilities for exporting or modifying a PDF file** (see [#440 (discussions)](https://github.com/jsvine/pdfplumber/discussions/440#discussioncomment-803880)). However, `pdfplumber` can convert a PDF file into an image, [draw lines and rectangles on the image](https://github.com/jsvine/pdfplumber#drawing-methods), and save it as an image file. + +The `pdfplumber` community is active in answering questions and the library is maintained as of May 2023. + +## pdfrw / pdfrw2 I don't have experience with any of those libraries. 
Please add a comparison if you know pypdf and [`pdfrw`](https://pypi.org/project/pdfrw/)! @@ -64,8 +70,6 @@ Please be aware that there is also Then there is [`pdfrw2`](https://pypi.org/project/pdfrw2/) which doesn't have a large community behind it. -And there is also [`pdfplumber`](https://pypi.org/project/pdfplumber/) - ## Document Generation There are (Python) [tools to generate PDF documents](https://github.com/py-pdf/awesome-pdf#generators). diff --git a/docs/user/file-size.md b/docs/user/file-size.md index a7b2d3cc45..b87d3b16b5 100644 --- a/docs/user/file-size.md +++ b/docs/user/file-size.md @@ -30,7 +30,7 @@ It depends on the PDF how well this works, but we have seen an 86% file reduction (from 5.7 MB to 0.8 MB) within a real PDF. -## Remove images +## Removing Images ```python @@ -75,3 +75,13 @@ with open("out.pdf", "wb") as f: Using this method, we have seen a reduction by 70% (from 11.8 MB to 3.5 MB) with a real PDF. + +## Removing Sources + +When a page is removed from the page list, its content will still be present in the PDF file. This means that the data may still be used elsewhere. + +Simply removing a page from the page list will reduce the page count but not the file size. In order to exclude the content completely, the pages should not be added to the PDF using the PdfWriter.append() function. Instead, only the desired pages should be selected for inclusion (note: [PR #1843](https://github.com/py-pdf/pypdf/pull/1843) will add a page deletion feature). + +There can be issues with poor PDF formatting, such as when all pages are linked to the same resource. In such cases, dropping references to specific pages becomes useless because there is only one source for all pages. + +Cropping is an ineffective method for reducing the file size because it only adjusts the viewboxes and not the external parts of the source image. Therefore, the content that is no longer visible will still be present in the PDF. 
diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index e907b57a36..f0db464bfd 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -93,6 +93,8 @@ def build_char_map( "/GB-EUC-V": "gbk", # TBC "/GBpc-EUC-H": "gb2312", # TBC "/GBpc-EUC-V": "gb2312", # TBC + "/GBK-EUC-H": "gbk", # TBC + "/GBK-EUC-V": "gbk", # TBC # UCS2 in code } diff --git a/pypdf/_encryption.py b/pypdf/_encryption.py index ccdfcbc5e6..2647ed6214 100644 --- a/pypdf/_encryption.py +++ b/pypdf/_encryption.py @@ -24,7 +24,6 @@ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. - import hashlib import secrets import struct @@ -56,14 +55,6 @@ class CryptIdentity(CryptBase): pass -def _randrange(lower_inclusive: int, upper_exclusive: int) -> int: - return secrets.choice(range(lower_inclusive, upper_exclusive)) - - -def _randint(lower_inclusive: int, upper_inclusive: int) -> int: - return secrets.choice(range(lower_inclusive, upper_inclusive + 1)) - - try: from Crypto.Cipher import AES, ARC4 # type: ignore[import] from Crypto.Util.Padding import pad # type: ignore[import] @@ -83,7 +74,7 @@ def __init__(self, key: bytes) -> None: self.key = key def encrypt(self, data: bytes) -> bytes: - iv = bytes(bytearray(_randint(0, 255) for _ in range(16))) + iv = secrets.token_bytes(16) p = 16 - len(data) % 16 data += bytes(bytearray(p for _ in range(p))) aes = AES.new(self.key, AES.MODE_CBC, iv) @@ -320,7 +311,7 @@ def compute_key( u_hash.update(o_entry) u_hash.update(struct.pack("<I", P)) u_hash.update(id1_entry) - if rev >= 4 and metadata_encrypted is False: + if rev >= 4 and not metadata_encrypted: u_hash.update(b"\xff\xff\xff\xff") u_hash_digest = u_hash.digest() length = key_size // 8 @@ -739,14 +730,15 @@ def verify_perms( @staticmethod def generate_values( + R: int, user_password: bytes, owner_password: bytes, key: bytes, p: int, metadata_encrypted: bool, ) -> Dict[Any, Any]: - u_value, ue_value = 
AlgV5.compute_U_value(user_password, key) - o_value, oe_value = AlgV5.compute_O_value(owner_password, key, u_value) + u_value, ue_value = AlgV5.compute_U_value(R, user_password, key) + o_value, oe_value = AlgV5.compute_O_value(R, owner_password, key, u_value) perms = AlgV5.compute_Perms_value(key, p, metadata_encrypted) return { "/U": u_value, @@ -757,7 +749,7 @@ def generate_values( } @staticmethod - def compute_U_value(password: bytes, key: bytes) -> Tuple[bytes, bytes]: + def compute_U_value(R: int, password: bytes, key: bytes) -> Tuple[bytes, bytes]: """ Algorithm 3.8 Computing the encryption dictionary’s U (user password) and UE (user encryption key) values. @@ -775,25 +767,26 @@ def compute_U_value(password: bytes, key: bytes) -> Tuple[bytes, bytes]: as the UE key. Args: + R: password: key: Returns: A tuple (u-value, ue value) """ - random_bytes = bytes(_randrange(0, 256) for _ in range(16)) + random_bytes = secrets.token_bytes(16) val_salt = random_bytes[:8] key_salt = random_bytes[8:] - u_value = hashlib.sha256(password + val_salt).digest() + val_salt + key_salt + u_value = AlgV5.calculate_hash(R, password, val_salt, b"") + val_salt + key_salt - tmp_key = hashlib.sha256(password + key_salt).digest() + tmp_key = AlgV5.calculate_hash(R, password, key_salt, b"") iv = bytes(0 for _ in range(16)) ue_value = AES_CBC_encrypt(tmp_key, iv, key) return u_value, ue_value @staticmethod def compute_O_value( - password: bytes, key: bytes, u_value: bytes + R: int, password: bytes, key: bytes, u_value: bytes ) -> Tuple[bytes, bytes]: """ Algorithm 3.9 Computing the encryption dictionary’s O (owner password) @@ -815,6 +808,7 @@ def compute_O_value( The resulting 32-byte string is stored as the OE key. 
Args: + R: password: key: u_value: A 32-byte string, based on the user password, that shall be @@ -824,14 +818,13 @@ def compute_O_value( Returns: A tuple (O value, OE value) """ - random_bytes = bytes(_randrange(0, 256) for _ in range(16)) + random_bytes = secrets.token_bytes(16) val_salt = random_bytes[:8] key_salt = random_bytes[8:] o_value = ( - hashlib.sha256(password + val_salt + u_value).digest() + val_salt + key_salt + AlgV5.calculate_hash(R, password, val_salt, u_value) + val_salt + key_salt ) - - tmp_key = hashlib.sha256(password + key_salt + u_value).digest() + tmp_key = AlgV5.calculate_hash(R, password, key_salt, u_value[:48]) iv = bytes(0 for _ in range(16)) oe_value = AES_CBC_encrypt(tmp_key, iv, key) return o_value, oe_value @@ -869,7 +862,7 @@ def compute_Perms_value(key: bytes, p: int, metadata_encrypted: bool) -> bytes: The perms value """ b8 = b"T" if metadata_encrypted else b"F" - rr = bytes(_randrange(0, 256) for _ in range(4)) + rr = secrets.token_bytes(4) data = struct.pack(" None: # See TABLE 3.18 Entries common to all encryption dictionaries - self.algV = algV - self.algR = algR + # use same name as keys of encryption dictionaries entries + self.V = V + self.R = R + self.Length = Length # key_size + self.P = (P + 0x100000000) % 0x100000000 # maybe P < 0 self.entry = entry - self.key_size = entry.get("/Length", 40) + self.EncryptMetadata = EncryptMetadata self.id1_entry = first_id_entry self.StmF = StmF self.StrF = StrF self.EFF = EFF + self.values: EncryptionValues = values if values else EncryptionValues() - # 1 => owner password - # 2 => user password self._password_type = PasswordType.NOT_DECRYPTED self._key: Optional[bytes] = None @@ -911,6 +941,27 @@ def is_decrypted(self) -> bool: return self._password_type != PasswordType.NOT_DECRYPTED def decrypt_object(self, obj: PdfObject, idnum: int, generation: int) -> PdfObject: + # skip calculate key + if not self._is_encryption_object(obj): + return obj + + cf = self._make_crypt_filter(idnum, 
generation) + return cf.decrypt_object(obj) + + @staticmethod + def _is_encryption_object(obj: PdfObject) -> bool: + return isinstance( + obj, + ( + ByteStringObject, + TextStringObject, + StreamObject, + ArrayObject, + DictionaryObject, + ), + ) + + def _make_crypt_filter(self, idnum: int, generation: int) -> CryptFilter: """ Algorithm 1: Encryption of data using the RC4 or AES algorithms. @@ -949,21 +1000,13 @@ def decrypt_object(self, obj: PdfObject, idnum: int, generation: int) -> PdfObje 16 bytes, and the initialization vector is a 16-byte random number that is stored as the first 16 bytes of the encrypted stream or string. The output is the encrypted data to be stored in the PDF file. - - Args: - obj: - idnum: - generation: - - Returns: - The PdfObject """ pack1 = struct.pack(" PdfObje StrCrypt = self._get_crypt(self.StrF, rc4_key, aes128_key, aes256_key) efCrypt = self._get_crypt(self.EFF, rc4_key, aes128_key, aes256_key) - cf = CryptFilter(stmCrypt, StrCrypt, efCrypt) - return cf.decrypt_object(obj) + return CryptFilter(stmCrypt, StrCrypt, efCrypt) @staticmethod def _get_crypt( @@ -994,7 +1036,8 @@ def _get_crypt( else: return CryptRC4(rc4_key) - def verify(self, password: Union[bytes, str]) -> PasswordType: + @staticmethod + def _encode_password(password: Union[bytes, str]) -> bytes: if isinstance(password, str): try: pwd = password.encode("latin-1") @@ -1002,45 +1045,39 @@ def verify(self, password: Union[bytes, str]) -> PasswordType: pwd = password.encode("utf-8") else: pwd = password + return pwd - key, rc = self.verify_v4(pwd) if self.algV <= 4 else self.verify_v5(pwd) + def verify(self, password: Union[bytes, str]) -> PasswordType: + pwd = self._encode_password(password) + key, rc = self.verify_v4(pwd) if self.V <= 4 else self.verify_v5(pwd) if rc != PasswordType.NOT_DECRYPTED: self._password_type = rc self._key = key return rc def verify_v4(self, password: bytes) -> Tuple[bytes, PasswordType]: - R = cast(int, self.entry["/R"]) - P = cast(int, 
self.entry["/P"]) - P = (P + 0x100000000) % 0x100000000 # maybe < 0 - # make type(metadata_encrypted) == bool - em = self.entry.get("/EncryptMetadata") - metadata_encrypted = em.value if em else True - o_entry = cast(ByteStringObject, self.entry["/O"].get_object()).original_bytes - u_entry = cast(ByteStringObject, self.entry["/U"].get_object()).original_bytes - # verify owner password first key = AlgV4.verify_owner_password( password, - R, - self.key_size, - o_entry, - u_entry, - P, + self.R, + self.Length, + self.values.O, + self.values.U, + self.P, self.id1_entry, - metadata_encrypted, + self.EncryptMetadata, ) if key: return key, PasswordType.OWNER_PASSWORD key = AlgV4.verify_user_password( password, - R, - self.key_size, - o_entry, - u_entry, - P, + self.R, + self.Length, + self.values.O, + self.values.U, + self.P, self.id1_entry, - metadata_encrypted, + self.EncryptMetadata, ) if key: return key, PasswordType.USER_PASSWORD @@ -1048,28 +1085,21 @@ def verify_v4(self, password: bytes) -> Tuple[bytes, PasswordType]: def verify_v5(self, password: bytes) -> Tuple[bytes, PasswordType]: # TODO: use SASLprep process - o_entry = cast(ByteStringObject, self.entry["/O"].get_object()).original_bytes - u_entry = cast(ByteStringObject, self.entry["/U"].get_object()).original_bytes - oe_entry = cast(ByteStringObject, self.entry["/OE"].get_object()).original_bytes - ue_entry = cast(ByteStringObject, self.entry["/UE"].get_object()).original_bytes - # verify owner password first key = AlgV5.verify_owner_password( - self.algR, password, o_entry, oe_entry, u_entry + self.R, password, self.values.O, self.values.OE, self.values.U ) rc = PasswordType.OWNER_PASSWORD if not key: - key = AlgV5.verify_user_password(self.algR, password, u_entry, ue_entry) + key = AlgV5.verify_user_password( + self.R, password, self.values.U, self.values.UE + ) rc = PasswordType.USER_PASSWORD if not key: return b"", PasswordType.NOT_DECRYPTED # verify Perms - perms = cast(ByteStringObject, 
self.entry["/Perms"].get_object()).original_bytes - P = cast(int, self.entry["/P"]) - P = (P + 0x100000000) % 0x100000000 # maybe < 0 - metadata_encrypted = self.entry.get("/EncryptMetadata", True) - if not AlgV5.verify_perms(key, perms, P, metadata_encrypted): + if not AlgV5.verify_perms(key, self.values.Perms, self.P, self.EncryptMetadata): logger_warning("ignore '/Perms' verify failed", __name__) return key, rc @@ -1106,11 +1136,33 @@ def read(encryption_entry: DictionaryObject, first_id_entry: bytes) -> "Encrypti allowed_methods = ("/Identity", "/V2", "/AESV2", "/AESV3") if StmF not in allowed_methods: - raise NotImplementedError("StmF Method {StmF} NOT supported!") + raise NotImplementedError(f"StmF Method {StmF} NOT supported!") if StrF not in allowed_methods: raise NotImplementedError(f"StrF Method {StrF} NOT supported!") if EFF not in allowed_methods: raise NotImplementedError(f"EFF Method {EFF} NOT supported!") R = cast(int, encryption_entry["/R"]) - return Encryption(V, R, encryption_entry, first_id_entry, StmF, StrF, EFF) + P = cast(int, encryption_entry["/P"]) + Length = encryption_entry.get("/Length", 40) + EncryptMetadata = encryption_entry.get("/EncryptMetadata") + EncryptMetadata = EncryptMetadata.value if EncryptMetadata is not None else True + values = EncryptionValues() + values.O = cast(ByteStringObject, encryption_entry["/O"]).original_bytes + values.U = cast(ByteStringObject, encryption_entry["/U"]).original_bytes + values.OE = encryption_entry.get("/OE", ByteStringObject()).original_bytes + values.UE = encryption_entry.get("/UE", ByteStringObject()).original_bytes + values.Perms = encryption_entry.get("/Perms", ByteStringObject()).original_bytes + return Encryption( + V=V, + R=R, + Length=Length, + P=P, + EncryptMetadata=EncryptMetadata, + first_id_entry=first_id_entry, + values=values, + StmF=StmF, + StrF=StrF, + EFF=EFF, + entry=encryption_entry, # can be deleted? 
+ ) diff --git a/pypdf/_page.py b/pypdf/_page.py index 3f2a7e3090..4d53d1b540 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -644,13 +644,11 @@ def _content_stream_rename( stream = ContentStream(stream, pdf) for operands, _operator in stream.operations: if isinstance(operands, list): - for i in range(len(operands)): - op = operands[i] + for i, op in enumerate(operands): if isinstance(op, NameObject): operands[i] = rename.get(op, op) elif isinstance(operands, dict): - for i in operands: - op = operands[i] + for i, op in operands.items(): if isinstance(op, NameObject): operands[i] = rename.get(op, op) else: @@ -703,7 +701,11 @@ def get_contents(self) -> Optional[ContentStream]: ``/Contents`` is optional, as described in PDF Reference 7.7.3.3 """ if PG.CONTENTS in self: - return self[PG.CONTENTS].get_object() # type: ignore + try: + pdf = cast(IndirectObject, self.indirect_reference).pdf + except AttributeError: + pdf = None + return ContentStream(self[PG.CONTENTS].get_object(), pdf) else: return None @@ -821,7 +823,6 @@ def _merge_page( page2content = page2.get_contents() if page2content is not None: - page2content = ContentStream(page2content, self.pdf) rect = getattr(page2, MERGE_CROP_BOX) page2content.operations.insert( 0, @@ -957,7 +958,6 @@ def _merge_page_writer( page2content = page2.get_contents() if page2content is not None: - page2content = ContentStream(page2content, self.pdf) rect = getattr(page2, MERGE_CROP_BOX) page2content.operations.insert( 0, @@ -1493,12 +1493,7 @@ def compress_content_streams(self) -> None: """ content = self.get_contents() if content is not None: - content_obj: Any - if not isinstance(content, ContentStream): - content_obj = ContentStream(content, self.pdf) - else: - content_obj = content - content_obj = content_obj.flate_encode() + content_obj = content.flate_encode() try: content.indirect_reference.pdf._objects[ # type: ignore content.indirect_reference.idnum - 1 # type: ignore diff --git a/pypdf/_protocols.py 
b/pypdf/_protocols.py index ba6cd8a3c9..c6f2bbebde 100644 --- a/pypdf/_protocols.py +++ b/pypdf/_protocols.py @@ -33,7 +33,7 @@ def hash_value(self) -> bytes: ... def write_to_stream( - self, stream: StreamType, encryption_key: Union[None, str, bytes] + self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: ... diff --git a/pypdf/_reader.py b/pypdf/_reader.py index a09c6a4ded..3bf9909a50 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -1187,9 +1187,13 @@ def _flatten( pages = catalog["/Pages"].get_object() # type: ignore self.flattened_pages = [] - t = "/Pages" if PA.TYPE in pages: t = pages[PA.TYPE] # type: ignore + # if pdf has no type, considered as a page if /Kids is missing + elif PA.KIDS not in pages: + t = "/Page" + else: + t = "/Pages" if t == "/Pages": for attr in inheritable_page_attributes: diff --git a/pypdf/_version.py b/pypdf/_version.py index e4e78c0b9d..fcd7ddb9e4 100644 --- a/pypdf/_version.py +++ b/pypdf/_version.py @@ -1 +1 @@ -__version__ = "3.8.1" +__version__ = "3.9.0" diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 8838f64456..e3968e96c8 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -1181,6 +1181,7 @@ def _write_pdf_structure(self, stream: StreamType) -> List[int]: object_positions = [] stream.write(self.pdf_header + b"\n") stream.write(b"%\xE2\xE3\xCF\xD3\n") + for i, obj in enumerate(self._objects): obj = self._objects[i] # If the obj is None we can't write anything @@ -1230,7 +1231,7 @@ def _write_trailer(self, stream: StreamType, xref_location: int) -> None: trailer[NameObject(TK.ID)] = self._ID if hasattr(self, "_encrypt"): trailer[NameObject(TK.ENCRYPT)] = self._encrypt - trailer.write_to_stream(stream, None) + trailer.write_to_stream(stream) stream.write(b_(f"\nstartxref\n{xref_location}\n%%EOF\n")) # eof def add_metadata(self, infos: Dict[str, Any]) -> None: @@ -1242,9 +1243,13 @@ def add_metadata(self, infos: Dict[str, Any]) -> None: and each value is your new metadata. 
""" args = {} + if isinstance(infos, PdfObject): + infos = cast(DictionaryObject, infos.get_object()) for key, value in list(infos.items()): - args[NameObject(key)] = create_string_object(value) - self.get_object(self._info).update(args) # type: ignore + if isinstance(value, PdfObject): + value = value.get_object() + args[NameObject(key)] = create_string_object(str(value)) + cast(DictionaryObject, self._info.get_object()).update(args) def addMetadata(self, infos: Dict[str, Any]) -> None: # deprecated """ diff --git a/pypdf/filters.py b/pypdf/filters.py index dd0ba6ccd9..4bece9c4f0 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -652,7 +652,9 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]: and x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB ): # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes - mode: Literal["RGB", "P", "L", "RGBA"] = "RGB" + mode: Literal["1", "RGB", "P", "L", "RGBA"] = "RGB" + elif x_object_obj.get("/BitsPerComponent", 8) == 1: + mode = "1" else: mode = "P" extension = None diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index be3d71c457..457e88acac 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -146,7 +146,7 @@ def getObject(self) -> Optional["PdfObject"]: # deprecated return self.get_object() def write_to_stream( - self, stream: StreamType, encryption_key: Union[None, str, bytes] + self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: raise NotImplementedError @@ -164,7 +164,7 @@ def clone( ) def write_to_stream( - self, stream: StreamType, encryption_key: Union[None, str, bytes] + self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: stream.write(b"null") @@ -218,7 +218,7 @@ def __repr__(self) -> str: return "True" if self.value else "False" def write_to_stream( - self, stream: StreamType, encryption_key: Union[None, str, bytes] + self, stream: StreamType, encryption_key: 
Union[None, str, bytes] = None ) -> None: if self.value: stream.write(b"true") @@ -311,7 +311,7 @@ def __ne__(self, other: Any) -> bool: return not self.__eq__(other) def write_to_stream( - self, stream: StreamType, encryption_key: Union[None, str, bytes] + self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: stream.write(b_(f"{self.idnum} {self.generation} R")) @@ -397,7 +397,7 @@ def as_numeric(self) -> float: return float(self) def write_to_stream( - self, stream: StreamType, encryption_key: Union[None, str, bytes] + self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: stream.write(self.myrepr().encode("utf8")) @@ -434,7 +434,7 @@ def as_numeric(self) -> int: return int(repr(self).encode("utf8")) def write_to_stream( - self, stream: StreamType, encryption_key: Union[None, str, bytes] + self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: stream.write(repr(self).encode("utf8")) @@ -488,7 +488,7 @@ def original_bytes(self) -> bytes: return self def write_to_stream( - self, stream: StreamType, encryption_key: Union[None, str, bytes] + self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: bytearr = self if encryption_key: @@ -556,7 +556,7 @@ def get_original_bytes(self) -> bytes: raise Exception("no information about original bytes") def write_to_stream( - self, stream: StreamType, encryption_key: Union[None, str, bytes] + self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: # Try to write the string out as a PDFDocEncoding encoded string. It's # nicer to look at in the PDF file. 
Sadly, we take a performance hit @@ -570,7 +570,7 @@ def write_to_stream( bytearr = RC4_encrypt(encryption_key, bytearr) obj = ByteStringObject(bytearr) - obj.write_to_stream(stream, None) + obj.write_to_stream(stream) else: stream.write(b"(") for c in bytearr: @@ -615,7 +615,7 @@ def clone( ) def write_to_stream( - self, stream: StreamType, encryption_key: Union[None, str, bytes] + self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: stream.write(self.renumber()) # b_(renumber(self))) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 9854243229..91f59f746d 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -111,7 +111,7 @@ def items(self) -> Iterable[Any]: return enumerate(self) def write_to_stream( - self, stream: StreamType, encryption_key: Union[None, str, bytes] + self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: stream.write(b"[") for data in self: @@ -338,7 +338,7 @@ def xmpMetadata(self) -> Optional[PdfObject]: # deprecated return self.xmp_metadata def write_to_stream( - self, stream: StreamType, encryption_key: Union[None, str, bytes] + self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: stream.write(b"<<\n") for key, value in list(self.items()): @@ -783,7 +783,7 @@ def _data(self, value: Any) -> None: self.__data = value def write_to_stream( - self, stream: StreamType, encryption_key: Union[None, str, bytes] + self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: self[NameObject(SA.LENGTH)] = NumberObject(len(self._data)) DictionaryObject.write_to_stream(self, stream, encryption_key) @@ -1016,7 +1016,7 @@ def __parse_content_stream(self, stream: StreamType) -> None: # encountering a comment -- but read_object assumes that # following the comment must be the object we're trying to # read. In this case, it could be an operator instead. 
- while peek not in (b"\r", b"\n"): + while peek not in (b"\r", b"\n", b""): peek = stream.read(1) else: operands.append(read_object(stream, None, self.forced_encoding)) @@ -1106,14 +1106,14 @@ def _data(self) -> bytes: if operator == b"INLINE IMAGE": new_data.write(b"BI") dict_text = BytesIO() - operands["settings"].write_to_stream(dict_text, None) + operands["settings"].write_to_stream(dict_text) new_data.write(dict_text.getvalue()[2:-2]) new_data.write(b"ID ") new_data.write(operands["data"]) new_data.write(b"EI") else: for op in operands: - op.write_to_stream(new_data, None) + op.write_to_stream(new_data) new_data.write(b" ") new_data.write(b_(operator)) new_data.write(b"\n") @@ -1363,6 +1363,8 @@ def __init__( # from table 8.2 of the PDF 1.7 reference. if typ == "/XYZ": + if len(args) < 3: # zoom is missing + args.append(NumberObject(0.0)) ( self[NameObject(TA.LEFT)], self[NameObject(TA.TOP)], @@ -1411,7 +1413,7 @@ def getDestArray(self) -> "ArrayObject": # deprecated return self.dest_array def write_to_stream( - self, stream: StreamType, encryption_key: Union[None, str, bytes] + self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: stream.write(b"<<\n") key = NameObject("/D") diff --git a/pypdf/generic/_outline.py b/pypdf/generic/_outline.py index dcff76d6ed..e67ce02420 100644 --- a/pypdf/generic/_outline.py +++ b/pypdf/generic/_outline.py @@ -7,7 +7,7 @@ class OutlineItem(Destination): def write_to_stream( - self, stream: StreamType, encryption_key: Union[None, str, bytes] + self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: stream.write(b"<<\n") for key in [ diff --git a/pypdf/xmp.py b/pypdf/xmp.py index 23e8ad3f77..d909641136 100644 --- a/pypdf/xmp.py +++ b/pypdf/xmp.py @@ -232,7 +232,7 @@ def rdfRoot(self) -> XmlElement: # deprecated return self.rdf_root def write_to_stream( - self, stream: StreamType, encryption_key: Union[None, str, bytes] + self, stream: StreamType, encryption_key: 
Union[None, str, bytes] = None ) -> None: self.stream.write_to_stream(stream, encryption_key) diff --git a/pyproject.toml b/pyproject.toml index f0aa3b3f5c..4936bcd853 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -176,6 +176,7 @@ ignore = [ "PTH123", # `open()` should be replaced by `Path.open()` "S101", # Use of `assert` detected "SLF001", # Private member accessed + "PD011", # Use `.to_numpy()` instead of `.values` ] [tool.ruff.per-file-ignores] diff --git a/requirements/ci-3.11.txt b/requirements/ci-3.11.txt index a54dddde5e..3cec546a8c 100644 --- a/requirements/ci-3.11.txt +++ b/requirements/ci-3.11.txt @@ -50,10 +50,13 @@ pytest==7.2.2 # -r requirements/ci.in # pytest-benchmark # pytest-socket + # pytest-timeout pytest-benchmark==4.0.0 # via -r requirements/ci.in pytest-socket==0.6.0 # via -r requirements/ci.in +pytest-timeout==2.1.0 + # via -r requirements/ci.in ruff==0.0.259 # via -r requirements/ci.in typeguard==3.0.2 diff --git a/requirements/ci.in b/requirements/ci.in index 1d41d32028..2150ddf0f4 100644 --- a/requirements/ci.in +++ b/requirements/ci.in @@ -9,6 +9,7 @@ pycryptodome pytest pytest-benchmark pytest-socket +pytest-timeout typeguard types-dataclasses types-Pillow diff --git a/requirements/ci.txt b/requirements/ci.txt index a7a12e49ab..235b13e079 100644 --- a/requirements/ci.txt +++ b/requirements/ci.txt @@ -62,10 +62,13 @@ pytest==7.0.1 # -r requirements/ci.in # pytest-benchmark # pytest-socket + # pytest-timeout pytest-benchmark==3.4.1 # via -r requirements/ci.in pytest-socket==0.4.1 # via -r requirements/ci.in +pytest-timeout==2.1.0 + # via -r requirements/ci.in six==1.16.0 # via flake8-print tomli==1.2.3 diff --git a/tests/test_cmap.py b/tests/test_cmap.py index a7b1b451fe..ce91fd23cd 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -109,20 +109,28 @@ def test_ascii_charset(): @pytest.mark.enable_socket() @pytest.mark.parametrize( - ("url", "name", "page_nb"), + ("url", "name", "page_nb", "within_text"), [ ( 
"https://github.com/py-pdf/pypdf/files/9667138/cmap1370.pdf", "cmap1370.pdf", 0, + "", + ), + ( + "https://github.com/py-pdf/pypdf/files/9712729/02voc.pdf", + "02voc.pdf", + 2, + "Document delineation and character sequence decoding", ), - ("https://github.com/py-pdf/pypdf/files/9712729/02voc.pdf", "02voc.pdf", 2), ], ids=["iss1370", "iss1379"], ) -def test_text_extraction_of_specific_pages(url: str, name: str, page_nb: int): +def test_text_extraction_of_specific_pages( + url: str, name: str, page_nb: int, within_text +): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - reader.pages[page_nb].extract_text() + assert within_text in reader.pages[page_nb].extract_text() @pytest.mark.enable_socket() @@ -135,9 +143,28 @@ def test_iss1533(): @pytest.mark.enable_socket() -def test_ucs2(caplog): - url = "https://github.com/py-pdf/pypdf/files/11190189/pdf_font_garbled.pdf" - name = "tstUCS2.pdf" +@pytest.mark.parametrize( + ("url", "name", "page_index", "within_text", "caplog_text"), + [ + ( + "https://github.com/py-pdf/pypdf/files/11190189/pdf_font_garbled.pdf", + "tstUCS2.pdf", + 1, + ["2 / 12", "S0490520090001", "于博"], + "", + ), + ( + "https://github.com/py-pdf/pypdf/files/11315397/3.pdf", + "tst-GBK_EUC.pdf", + 0, + ["NJA", "中华男科学杂志"], + "Multiple definitions in dictionary at byte 0x5cb42 for key /MediaBox\n", + ), + ], +) +def test_cmap_encodings(caplog, url, name, page_index, within_text, caplog_text): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - reader.pages[1].extract_text() # no error - assert caplog.text == "" + extracted = reader.pages[page_index].extract_text() # no error + for contained in within_text: + assert contained in extracted + assert caplog_text in caplog.text diff --git a/tests/test_encryption.py b/tests/test_encryption.py index 33c86f85b5..0db7a2731a 100644 --- a/tests/test_encryption.py +++ b/tests/test_encryption.py @@ -5,7 +5,7 @@ import pypdf from pypdf import PasswordType, PdfReader -from pypdf._encryption 
import AlgV5, CryptRC4, _randint, _randrange +from pypdf._encryption import AlgV5, CryptRC4 from pypdf.errors import DependencyError, PdfReadError try: @@ -214,6 +214,7 @@ def test_alg_v5_generate_values(): return key = b"0123456789123451" values = AlgV5.generate_values( + R=4, user_password=b"foo", owner_password=b"bar", key=key, @@ -227,23 +228,3 @@ def test_alg_v5_generate_values(): "/OE": values["/OE"], "/Perms": values["/Perms"], } - - -def test_randrange_function(): - """ - _randrange() function generates a range of unique random numbers. - - This test might randomly fail in very rare cases. - """ - random_set = {_randrange(0, 10) for _ in range(1000)} - assert random_set == {0, 1, 2, 3, 4, 5, 6, 7, 8, 9} - - -def test_randint_function(): - """ - _randint() function generates a range of unique random numbers, including the upper bound. - - This test might randomly fail in very rare cases. - """ - random_set = {_randint(0, 10) for _ in range(1000)} - assert random_set == {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} diff --git a/tests/test_filters.py b/tests/test_filters.py index 57d2da179c..08e42ff268 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -290,3 +290,13 @@ def test_pa_image_extraction(): "issue-1801.png", ) assert data == images[0].data + + +@pytest.mark.enable_socket() +def test_1bit_image_extraction(): + """Cf issue #1814""" + url = "https://github.com/py-pdf/pypdf/files/11336817/grimm10.pdf" + name = "grimm10" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + for p in reader.pages: + p.images diff --git a/tests/test_generic.py b/tests/test_generic.py index dfac41c9a2..5e464460d8 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -86,7 +86,7 @@ def test_boolean_object(value, expected, tell): def test_boolean_object_write(): stream = BytesIO() boolobj = BooleanObject(None) - boolobj.write_to_stream(stream, encryption_key=None) + boolobj.write_to_stream(stream) stream.seek(0, 0) assert stream.read() == b"false" 
@@ -213,23 +213,23 @@ def test_name_object(caplog): # test write b = BytesIO() - NameObject("/hello").write_to_stream(b, None) + NameObject("/hello").write_to_stream(b) assert bytes(b.getbuffer()) == b"/hello" caplog.clear() b = BytesIO() - NameObject("hello").write_to_stream(b, None) + NameObject("hello").write_to_stream(b) assert bytes(b.getbuffer()) == b"hello" assert "Incorrect first char" in caplog.text caplog.clear() b = BytesIO() - NameObject("/DIJMAC+Arial Black#1").write_to_stream(b, None) + NameObject("/DIJMAC+Arial Black#1").write_to_stream(b) assert bytes(b.getbuffer()) == b"/DIJMAC+Arial#20Black#231" assert caplog.text == "" b = BytesIO() - NameObject("/你好世界").write_to_stream(b, None) + NameObject("/你好世界").write_to_stream(b) assert bytes(b.getbuffer()) == b"/#E4#BD#A0#E5#A5#BD#E4#B8#96#E7#95#8C" assert caplog.text == "" @@ -259,7 +259,7 @@ def test_destination_fit_v(): def test_outline_item_write_to_stream(): stream = BytesIO() oi = OutlineItem(NameObject("title"), NullObject(), Fit.fit_vertically(left=0)) - oi.write_to_stream(stream, None) + oi.write_to_stream(stream) stream.seek(0, 0) assert stream.read() == b"<<\n/Title (title)\n/Dest [ null /FitV 0.0 ]\n>>" @@ -1156,3 +1156,15 @@ def test_iss1615_1673(): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) writer = PdfWriter() writer.clone_document_from_reader(reader) + + +@pytest.mark.enable_socket() +def test_destination_withoutzoom(): + """Cf issue #1832""" + url = ( + "https://raw.githubusercontent.com/xrkk/tmpppppp/main/" + "2021%20----%20book%20-%20Security%20of%20biquitous%20Computing%20Systems.pdf" + ) + name = "2021_book_security.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader.outline diff --git a/tests/test_page.py b/tests/test_page.py index 68c0807440..a7fa503649 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -102,7 +102,13 @@ def test_page_operations(pdf_path, password): assert abs(t.ctm[4] + 100) < 0.01 assert abs(t.ctm[5] - 50) < 
0.01 - transformation = Transformation().rotate(90).scale(1).translate(1, 1).transform(Transformation((1, 0, 0, -1, 0, 0))) + transformation = ( + Transformation() + .rotate(90) + .scale(1) + .translate(1, 1) + .transform(Transformation((1, 0, 0, -1, 0, 0))) + ) page.add_transformation(transformation, expand=True) page.add_transformation((1, 0, 0, 0, 0, 0)) page.scale(2, 2) @@ -178,7 +184,10 @@ def test_transformation_equivalence2(): w.append(reader_add) height = reader_add.pages[0].mediabox.height w.pages[0].merge_transformed_page( - reader_base.pages[0], Transformation().transform(Transformation((1, 0, 0, -1, 0, height))), False, False + reader_base.pages[0], + Transformation().transform(Transformation((1, 0, 0, -1, 0, height))), + False, + False, ) # No special assert: Visual check the page has been increased and all is visible (box+graph) @@ -255,7 +264,9 @@ def test_compress_content_streams(pdf_path, password): writer = PdfWriter() if password: reader.decrypt(password) + assert isinstance(reader.pages[0].get_contents(), ContentStream) writer.clone_document_from_reader(reader) + assert isinstance(writer.pages[0].get_contents(), ContentStream) for page in writer.pages: page.compress_content_streams() @@ -321,7 +332,10 @@ def test_page_scale(): def test_add_transformation_on_page_without_contents(): page = PageObject() + assert page.get_contents() is None page.add_transformation(Transformation()) + page[NameObject("/Contents")] = ContentStream(None, None) + assert isinstance(page.get_contents(), ContentStream) @pytest.mark.enable_socket() @@ -1111,3 +1125,10 @@ def test_pages_printing(): pdf_path = RESOURCE_ROOT / "crazyones.pdf" reader = PdfReader(pdf_path) assert str(reader.pages) == "[PageObject(0)]" + + +def test_pdf_pages_missing_type(): + pdf_path = RESOURCE_ROOT / "crazyones.pdf" + reader = PdfReader(pdf_path) + del reader.trailer["/Root"]["/Pages"]["/Kids"][0].get_object()["/Type"] + reader.pages[0] diff --git a/tests/test_protocols.py 
b/tests/test_protocols.py index 8c8a6ff50c..8ca8167c07 100644 --- a/tests/test_protocols.py +++ b/tests/test_protocols.py @@ -12,4 +12,4 @@ def test_pdfobjectprotocol(): assert o._reference_clone(None, None) is None assert o.get_object() is None assert o.hash_value() is None - assert o.write_to_stream(None, None) is None + assert o.write_to_stream(None) is None diff --git a/tests/test_reader.py b/tests/test_reader.py index 8a0beb987e..f606757e9b 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1401,3 +1401,13 @@ def test_iss1756(): in_pdf = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) in_pdf.trailer["/ID"] # removed to cope with missing cryptodome during commit check : len(in_pdf.pages) + + +@pytest.mark.enable_socket() +@pytest.mark.timeout(30) +def test_iss1825(): + url = "https://github.com/py-pdf/pypdf/files/11367871/MiFO_LFO_FEIS_NOA_Published.3.pdf" + name = "iss1825.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + page = reader.pages[0] + page.extract_text() diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 7a9ddcb53d..d3eabdbc3b 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -904,14 +904,14 @@ def test_extra_test_iss1541(): cs = ContentStream(reader.pages[0]["/Contents"], None, None) cs.operations.insert(-1, ([], b"EMC")) bu = BytesIO() - cs.write_to_stream(bu, None) + cs.write_to_stream(bu) bu.seek(0) ContentStream(read_object(bu, None, None), None, None).operations cs = ContentStream(reader.pages[0]["/Contents"], None, None) cs.operations.insert(-1, ([], b"E!C")) bu = BytesIO() - cs.write_to_stream(bu, None) + cs.write_to_stream(bu) bu.seek(0) with pytest.raises(PdfReadError) as exc: ContentStream(read_object(bu, None, None), None, None).operations diff --git a/tests/test_writer.py b/tests/test_writer.py index f2c2fa192b..80a3158aad 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -192,7 +192,9 @@ def writer_operate(writer: PdfWriter) -> None: 
writer.remove_images() - writer.add_metadata({"author": "Martin Thoma"}) + writer.add_metadata(reader.metadata) + writer.add_metadata({"/Author": "Martin Thoma"}) + writer.add_metadata({"/MyCustom": 1234}) writer.add_attachment("foobar.gif", b"foobarcontent")