From 0476799bce16ed5d527c04be1a4b3ba783be50f9 Mon Sep 17 00:00:00 2001 From: Adel Haddad <26027314+adehad@users.noreply.github.com> Date: Mon, 27 Mar 2023 07:53:35 +0100 Subject: [PATCH 01/11] DOC: Add readthedocs.yml and bump docs dependencies using `tox -e deps` (#1750) --- readthedocs.yml | 14 +++++++++++++ requirements/docs.in | 2 +- requirements/docs.txt | 46 +++++++++++++++++++++---------------------- 3 files changed, 38 insertions(+), 24 deletions(-) create mode 100644 readthedocs.yml diff --git a/readthedocs.yml b/readthedocs.yml new file mode 100644 index 0000000000..82300c6f90 --- /dev/null +++ b/readthedocs.yml @@ -0,0 +1,14 @@ +--- +version: 2 +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +formats: all + +build: + os: ubuntu-22.04 + tools: + python: "3.10" + +python: + install: + - requirements: requirements/docs.txt diff --git a/requirements/docs.in b/requirements/docs.in index 6fe145949f..bbfe5118b8 100644 --- a/requirements/docs.in +++ b/requirements/docs.in @@ -1,5 +1,5 @@ sphinx sphinx_rtd_theme -myst_parser==0.16.1 +myst_parser -e . attrs # required for myst, but not automatically installed by myst diff --git a/requirements/docs.txt b/requirements/docs.txt index cb44bcd2bb..ac8a6a95ab 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,8 +1,8 @@ # -# This file is autogenerated by pip-compile with Python 3.7 -# by the following command: +# This file is autogenerated by pip-compile with python 3.6 +# To update, run: # -# pip-compile requirements/docs.in +# pip-compile --output-file=requirements/docs.txt requirements/docs.in # -e . 
# via -r requirements/docs.in @@ -10,13 +10,15 @@ alabaster==0.7.13 # via sphinx attrs==22.2.0 # via -r requirements/docs.in -babel==2.11.0 +babel==2.12.1 # via sphinx certifi==2022.12.7 # via requests -charset-normalizer==3.0.1 +charset-normalizer==3.1.0 # via requests -docutils==0.17.1 +colorama==0.4.6 + # via sphinx +docutils==0.18.1 # via # myst-parser # sphinx @@ -25,61 +27,59 @@ idna==3.4 # via requests imagesize==1.4.1 # via sphinx -importlib-metadata==6.0.0 +importlib-metadata==6.1.0 # via sphinx jinja2==3.1.2 # via # myst-parser # sphinx -markdown-it-py==2.1.0 +markdown-it-py==2.2.0 # via # mdit-py-plugins # myst-parser markupsafe==2.1.2 # via jinja2 -mdit-py-plugins==0.3.3 +mdit-py-plugins==0.3.5 # via myst-parser mdurl==0.1.2 # via markdown-it-py -myst-parser==0.16.1 +myst-parser==1.0.0 # via -r requirements/docs.in packaging==23.0 # via sphinx pygments==2.14.0 # via sphinx -pytz==2022.7.1 - # via babel pyyaml==6.0 # via myst-parser requests==2.28.2 # via sphinx snowballstemmer==2.2.0 # via sphinx -sphinx==4.5.0 +sphinx==6.1.3 # via # -r requirements/docs.in # myst-parser # sphinx-rtd-theme -sphinx-rtd-theme==1.1.1 + # sphinxcontrib-jquery +sphinx-rtd-theme==1.2.0 # via -r requirements/docs.in -sphinxcontrib-applehelp==1.0.2 +sphinxcontrib-applehelp==1.0.4 # via sphinx sphinxcontrib-devhelp==1.0.2 # via sphinx -sphinxcontrib-htmlhelp==2.0.0 +sphinxcontrib-htmlhelp==2.0.1 # via sphinx +sphinxcontrib-jquery==4.1 + # via sphinx-rtd-theme sphinxcontrib-jsmath==1.0.1 # via sphinx sphinxcontrib-qthelp==1.0.3 # via sphinx sphinxcontrib-serializinghtml==1.1.5 # via sphinx -typing-extensions==4.4.0 - # via - # importlib-metadata - # markdown-it-py - # pypdf -urllib3==1.26.14 +typing-extensions==4.5.0 + # via pypdf +urllib3==1.26.15 # via requests -zipp==3.11.0 +zipp==3.15.0 # via importlib-metadata From 1563e8e90b672226d1d0d9e0ab4af550b29a6379 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Mon, 27 Mar 2023 09:25:57 +0200 Subject: [PATCH 02/11] DOC: Pin 
myst_parser==0.16.1 and rename .readthedocs.yaml (#1752) myst_parser is pinned to that version as links are otherwise broken, see https://github.com/py-pdf/pypdf/issues/1569 Causing-commit: 0476799bce16ed5d527c04be1a4b3ba783be50f9 --- .readthedocs.yaml | 24 ++++++++++++++++++++++++ readthedocs.yml | 14 -------------- requirements/docs.in | 3 +-- requirements/docs.txt | 26 +++++++++++++------------- 4 files changed, 38 insertions(+), 29 deletions(-) create mode 100644 .readthedocs.yaml delete mode 100644 readthedocs.yml diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000000..e96b3d12e9 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,24 @@ +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details +version: 2 + + +build: + os: ubuntu-22.04 + tools: + python: "3.11" + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/conf.py + +# If using Sphinx, optionally build your docs in additional formats such as PDF +formats: all + +# Optionally declare the Python requirements required to build your docs +python: + install: + - requirements: requirements/docs.txt + - method: pip + path: . + extra_requirements: + - full diff --git a/readthedocs.yml b/readthedocs.yml deleted file mode 100644 index 82300c6f90..0000000000 --- a/readthedocs.yml +++ /dev/null @@ -1,14 +0,0 @@ ---- -version: 2 -# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details - -formats: all - -build: - os: ubuntu-22.04 - tools: - python: "3.10" - -python: - install: - - requirements: requirements/docs.txt diff --git a/requirements/docs.in b/requirements/docs.in index bbfe5118b8..58eb4813c8 100644 --- a/requirements/docs.in +++ b/requirements/docs.in @@ -1,5 +1,4 @@ sphinx sphinx_rtd_theme -myst_parser --e . 
+myst_parser==0.16.1 attrs # required for myst, but not automatically installed by myst diff --git a/requirements/docs.txt b/requirements/docs.txt index ac8a6a95ab..f681c64485 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,11 +1,9 @@ # -# This file is autogenerated by pip-compile with python 3.6 -# To update, run: +# This file is autogenerated by pip-compile with Python 3.7 +# by the following command: # -# pip-compile --output-file=requirements/docs.txt requirements/docs.in +# pip-compile requirements/docs.in # --e . - # via -r requirements/docs.in alabaster==0.7.13 # via sphinx attrs==22.2.0 @@ -16,9 +14,7 @@ certifi==2022.12.7 # via requests charset-normalizer==3.1.0 # via requests -colorama==0.4.6 - # via sphinx -docutils==0.18.1 +docutils==0.17.1 # via # myst-parser # sphinx @@ -43,19 +39,21 @@ mdit-py-plugins==0.3.5 # via myst-parser mdurl==0.1.2 # via markdown-it-py -myst-parser==1.0.0 +myst-parser==0.16.1 # via -r requirements/docs.in packaging==23.0 # via sphinx pygments==2.14.0 # via sphinx +pytz==2023.2 + # via babel pyyaml==6.0 # via myst-parser requests==2.28.2 # via sphinx snowballstemmer==2.2.0 # via sphinx -sphinx==6.1.3 +sphinx==4.5.0 # via # -r requirements/docs.in # myst-parser @@ -63,11 +61,11 @@ sphinx==6.1.3 # sphinxcontrib-jquery sphinx-rtd-theme==1.2.0 # via -r requirements/docs.in -sphinxcontrib-applehelp==1.0.4 +sphinxcontrib-applehelp==1.0.2 # via sphinx sphinxcontrib-devhelp==1.0.2 # via sphinx -sphinxcontrib-htmlhelp==2.0.1 +sphinxcontrib-htmlhelp==2.0.0 # via sphinx sphinxcontrib-jquery==4.1 # via sphinx-rtd-theme @@ -78,7 +76,9 @@ sphinxcontrib-qthelp==1.0.3 sphinxcontrib-serializinghtml==1.1.5 # via sphinx typing-extensions==4.5.0 - # via pypdf + # via + # importlib-metadata + # markdown-it-py urllib3==1.26.15 # via requests zipp==3.15.0 From 0917dfccfbed26ad940cf3f2f0e89f1f31ac2d54 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Wed, 29 Mar 2023 17:42:26 +0200 Subject: [PATCH 03/11] SEC: Warn about PDF 
encryption security (#1755) See #1754 --- docs/user/encryption-decryption.md | 5 +++++ pypdf/_writer.py | 6 ++++++ tests/test_workflows.py | 3 ++- tests/test_writer.py | 11 ++++++----- 4 files changed, 19 insertions(+), 6 deletions(-) diff --git a/docs/user/encryption-decryption.md b/docs/user/encryption-decryption.md index b95b33f86d..ff211320c6 100644 --- a/docs/user/encryption-decryption.md +++ b/docs/user/encryption-decryption.md @@ -5,6 +5,11 @@ ## Encrypt +> ⚠️ WARNING ⚠️: pypdf only implements [RC4 encryption](https://en.wikipedia.org/wiki/RC4). +> This encryption algorithm is insecure. The more modern and secure AES +> encryption is not implemented. pypdf can only decrypt, but not encrypt with +> AES. + Add a password to a PDF (encrypt it): ```python diff --git a/pypdf/_writer.py b/pypdf/_writer.py index b0ae266ebd..5501e58bba 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -1027,6 +1027,12 @@ def encrypt( 5 and 6 control annotations, 9 for form fields, 10 for extraction of text and graphics. """ + warnings.warn( + "pypdf only implements RC4 encryption so far. " + "The RC4 algorithm is insecure. Either use a library that supports " + "AES for encryption or put the PDF in an encrypted container, " + "for example an encrypted ZIP file." 
+ ) if user_pwd is not None: if user_password is not None: raise ValueError( diff --git a/tests/test_workflows.py b/tests/test_workflows.py index f8c9660bb7..f3552deb84 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -62,7 +62,8 @@ def test_basic_features(tmp_path): # encrypt your new PDF and add a password password = "secret" - writer.encrypt(password) + with pytest.warns(UserWarning, match="pypdf only implements RC4 encryption"): + writer.encrypt(password) # finally, write "output" to pypdf-output.pdf write_path = tmp_path / "pypdf-output.pdf" diff --git a/tests/test_writer.py b/tests/test_writer.py index 7eafe5cdbc..10943c5096 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -479,11 +479,12 @@ def test_encrypt(use_128bit, user_password, owner_password, pdf_file_path): orig_text = page.extract_text() writer.add_page(page) - writer.encrypt( - user_password=user_password, - owner_password=owner_password, - use_128bit=use_128bit, - ) + with pytest.warns(UserWarning, match="pypdf only implements RC4 encryption"): + writer.encrypt( + user_password=user_password, + owner_password=owner_password, + use_128bit=use_128bit, + ) # write "output" to pypdf-output.pdf with open(pdf_file_path, "wb") as output_stream: From b385ce9acd9e398710b52212b64fa4c9594fc3c3 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Fri, 31 Mar 2023 19:22:58 +0200 Subject: [PATCH 04/11] DEV: Make make_changelog.py idempotent --- make_changelog.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/make_changelog.py b/make_changelog.py index b375c8f251..59efb30df3 100644 --- a/make_changelog.py +++ b/make_changelog.py @@ -37,8 +37,11 @@ def main(changelog_path: str) -> None: new_entry = header + changes + trailer print(new_entry) - # TODO: Make idempotent - multiple calls to this script - # should not change the changelog + # Make the script idempotent by checking if the new entry is already in the changelog + if new_entry in 
changelog: + print("Changelog is already up-to-date!") + return + new_changelog = new_entry + changelog write_changelog(new_changelog, changelog_path) @@ -105,8 +108,21 @@ def get_formatted_changes(git_tag: str) -> str: grouped[commit.prefix].append({"msg": commit.message}) # Order prefixes - order = ["DEP", "ENH", "PI", "BUG", "ROB", "DOC", "DEV", "MAINT", "TST", "STY"] + order = [ + "SEC", + "DEP", + "ENH", + "PI", + "BUG", + "ROB", + "DOC", + "DEV", + "MAINT", + "TST", + "STY", + ] abbrev2long = { + "SEC": "Security", "DEP": "Deprecations", "ENH": "New Features", "BUG": "Bug Fixes", From 8146729eeb0f90478f2686f1dc395b545b49ba8c Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 6 Apr 2023 14:11:27 +0200 Subject: [PATCH 05/11] ROB: Capture UnicodeDecodeError at PdfReader.pdf_header (#1768) Fixes #1758 --- pypdf/_reader.py | 28 +++++++++++++++++----------- tests/test_reader.py | 42 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 12 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 89e7d248fb..36aa642122 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -32,7 +32,7 @@ import struct import zlib from datetime import datetime -from io import BytesIO +from io import BytesIO, UnsupportedOperation from pathlib import Path from typing import ( Any, @@ -360,7 +360,7 @@ def pdf_header(self) -> str: # but that needs a deprecation loc = self.stream.tell() self.stream.seek(0, 0) - pdf_file_version = self.stream.read(8).decode("utf-8") + pdf_file_version = self.stream.read(8).decode("utf-8", "backslashreplace") self.stream.seek(loc, 0) # return to where it was return pdf_file_version @@ -1541,19 +1541,22 @@ def read(self, stream: StreamType) -> None: def _basic_validation(self, stream: StreamType) -> None: """Ensure file is not empty. 
Read at most 5 bytes.""" - # start at the end: - stream.seek(0, os.SEEK_END) - if not stream.tell(): - raise EmptyFileError("Cannot read an empty file") - if self.strict: - stream.seek(0, os.SEEK_SET) + stream.seek(0, os.SEEK_SET) + try: header_byte = stream.read(5) - if header_byte != b"%PDF-": + except UnicodeDecodeError: + raise UnsupportedOperation("cannot read header") + if header_byte == b"": + raise EmptyFileError("Cannot read an empty file") + elif header_byte != b"%PDF-": + if self.strict: raise PdfReadError( f"PDF starts with '{header_byte.decode('utf8')}', " "but '%PDF-' expected" ) - stream.seek(0, os.SEEK_END) + else: + logger_warning(f"invalid pdf header: {header_byte}", __name__) + stream.seek(0, os.SEEK_END) def _find_eof_marker(self, stream: StreamType) -> None: """ @@ -1567,7 +1570,10 @@ def _find_eof_marker(self, stream: StreamType) -> None: line = b"" while line[:5] != b"%%EOF": if stream.tell() < HEADER_SIZE: - raise PdfReadError("EOF marker not found") + if self.strict: + raise PdfReadError("EOF marker not found") + else: + logger_warning("EOF marker not found", __name__) line = read_previous_line(stream) def _find_startxref_pos(self, stream: StreamType) -> int: diff --git a/tests/test_reader.py b/tests/test_reader.py index e2ccd6da52..967d2d1bf1 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -460,10 +460,16 @@ def test_read_empty(): assert exc.value.args[0] == "Cannot read an empty file" -def test_read_malformed_header(): +def test_read_malformed_header(caplog): with pytest.raises(PdfReadError) as exc: PdfReader(io.BytesIO(b"foo"), strict=True) assert exc.value.args[0] == "PDF starts with 'foo', but '%PDF-' expected" + caplog.clear() + try: + PdfReader(io.BytesIO(b"foo"), strict=False) + except Exception: + pass + assert caplog.messages[0].startswith("invalid pdf header") def test_read_malformed_body(): @@ -1352,3 +1358,37 @@ def test_iss1710(): name = "irbookonlinereading.pdf" in_pdf = PdfReader(BytesIO(get_pdf_from_url(url, 
name=name))) in_pdf.outline + + +def test_broken_file_header(): + pdf_data = ( + b"%%PDF-\xa0sd\n" + b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n" + b"2 0 obj << >> endobj\n" + b"3 0 obj << >> endobj\n" + b"4 0 obj << /Contents 3 0 R /CropBox [0.0 0.0 2550.0 3508.0]" + b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R" + b" /Resources << /Font << >> >>" + b" /Rotate 0 /Type /Page >> endobj\n" + b"5 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n" + b"xref 1 5\n" + b"%010d 00000 n\n" + b"%010d 00000 n\n" + b"%010d 00000 n\n" + b"%010d 00000 n\n" + b"%010d 00000 n\n" + b"trailer << %s/Root 5 0 R /Size 6 >>\n" + b"startxref %d\n" + b"%%%%EOF" + ) + with_prev_0 = True + pdf_data = pdf_data % ( + pdf_data.find(b"1 0 obj"), + pdf_data.find(b"2 0 obj"), + pdf_data.find(b"3 0 obj"), + pdf_data.find(b"4 0 obj"), + pdf_data.find(b"5 0 obj"), + b"/Prev 0 " if with_prev_0 else b"", + pdf_data.find(b"xref") - 1, + ) + PdfReader(io.BytesIO(pdf_data)) From f26388e7d85eeb4e216046c8fd70c71ab4fb5dfd Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 7 Apr 2023 15:16:32 +0200 Subject: [PATCH 06/11] ROB: Prevent loop in Cloning (#1770) An issue was occurring with object 589/0 of file https://github.com/py-pdf/pypdf/files/11138472/test.pdf, which caused a loop during cloning due to its correspondence with both the file trailer and an XObject for filled text ("test"). This behavior was not intended, and a robustness improvement has been made to prevent the loop. Please note that if you run your code, the text "test" may be hidden by the trailer object. 
Fixes #1767 --- pypdf/_protocols.py | 3 +++ pypdf/generic/_base.py | 5 ++++- pypdf/generic/_data_structures.py | 5 ++++- tests/test_writer.py | 11 +++++++++++ 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py index 85e9e0a568..ba6cd8a3c9 100644 --- a/pypdf/_protocols.py +++ b/pypdf/_protocols.py @@ -73,6 +73,9 @@ def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]: def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO]: ... + def _add_object(self, obj: Any) -> Any: + ... + @property def pages(self) -> List[Any]: ... diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index f75e66dd64..be3d71c457 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -277,7 +277,10 @@ def clone( obj = NullObject() assert isinstance(self, (IndirectObject,)) obj.indirect_reference = self - dup = obj.clone(pdf_dest, force_duplicate, ignore_fields) + dup = pdf_dest._add_object( + obj.clone(pdf_dest, force_duplicate, ignore_fields) + ) + # asserts added to prevent errors in mypy assert dup is not None assert dup.indirect_reference is not None return dup.indirect_reference diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 1fd196027c..b8aaf12d47 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -750,7 +750,10 @@ def _clone( if decoded_self is None: self.decoded_self = None else: - self.decoded_self = decoded_self.clone(pdf_dest, True, ignore_fields) # type: ignore[assignment] + self.decoded_self = cast( + "DecodedStreamObject", + decoded_self.clone(pdf_dest, force_duplicate, ignore_fields), + ) except Exception: pass super()._clone(src, pdf_dest, force_duplicate, ignore_fields) diff --git a/tests/test_writer.py b/tests/test_writer.py index 10943c5096..5066eecb65 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1292,3 +1292,14 @@ def test_iss1723(): in_pdf = 
PdfReader(BytesIO(get_pdf_from_url(url, name=name))) out_pdf = PdfWriter() out_pdf.append(in_pdf, (3, 5)) + + +@pytest.mark.enable_socket() +def test_iss1767(): + # test with a pdf which is buggy because the object 389,0 exists 3 times: + # twice to define catalog and one as an XObject inducing a loop when + # cloning + url = "https://github.com/py-pdf/pypdf/files/11138472/test.pdf" + name = "iss1723.pdf" + in_pdf = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + PdfWriter(clone_from=in_pdf) From bb2603ee73b5ef0564d25feae57483b7d1930d21 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sat, 8 Apr 2023 15:12:46 +0200 Subject: [PATCH 07/11] STY: Test names, docstrings, and parametrization (#1771) --- docs/dev/testing.md | 21 +++++ tests/test_cmap.py | 160 +++++++++++++++++++-------------------- tests/test_constants.py | 9 +++ tests/test_encryption.py | 54 ++++++++++--- tests/test_filters.py | 25 +++--- tests/test_papersizes.py | 9 ++- tests/test_reader.py | 2 +- tests/test_utils.py | 28 +++---- tests/test_xmp.py | 63 +++++++++------ 9 files changed, 223 insertions(+), 148 deletions(-) diff --git a/docs/dev/testing.md b/docs/dev/testing.md index 2e7fb7f19d..30259d4335 100644 --- a/docs/dev/testing.md +++ b/docs/dev/testing.md @@ -34,3 +34,24 @@ pyenv install 3.10.2 Then you can execute `tox` which will create a coverage report in HTML form in the end. The execution takes about 30 minutes. + + +## Docstrings in Unit tests + +The first line of a docstring in a unit test should be written in a way that +you could prefix it with "This tests ensures that ...", e.g. + +* Invalid XML in xmp_metadata is gracefully handled. +* The identity is returning its input. +* xmp_modify_date is extracted correctly. + +This way, plugins like [`pytest-testdox`](https://pypi.org/project/pytest-testdox/) +can generate really nice output when the tests are running. This looks similar +to the output of [mocha.js](https://mochajs.org/). 
+ +If the test is a regression test, write + +> This test is a regression test for issue #1234 + +If the regression test is just one parameter of other tests, then add it as +a comment for that parameter. diff --git a/tests/test_cmap.py b/tests/test_cmap.py index a371b92fe6..666d3ecfa0 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -12,28 +12,79 @@ @pytest.mark.enable_socket() @pytest.mark.slow() -def test_compute_space_width(): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/923/923406.pdf" - name = "tika-923406.pdf" - - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) +@pytest.mark.parametrize( + ("url", "name", "strict"), + [ + # compute_space_width: + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/923/923406.pdf", + "tika-923406.pdf", + False, + ), + # _parse_to_unicode_process_rg: + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/959/959173.pdf", + "tika-959173.pdf", + False, + ), + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/959/959173.pdf", + "tika-959173.pdf", + True, + ), + # issue #1718: + ( + "https://github.com/py-pdf/pypdf/files/10983477/Ballinasloe_WS.pdf", + "iss1718.pdf", + False, + ), + ], +) +def test_text_extraction_slow(caplog, url: str, name: str, strict: bool): + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)), strict=strict) for page in reader.pages: page.extract_text() + assert caplog.text == "" @pytest.mark.enable_socket() -@pytest.mark.slow() -def test_parse_to_unicode_process_rg(): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/959/959173.pdf" - name = "tika-959173.pdf" - - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - for page in reader.pages: - page.extract_text() - - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)), strict=True) +@pytest.mark.parametrize( + ("url", "name", "strict"), + [ + # bfchar_on_2_chars: issue #1293 + ( + 
"https://github.com/xyegithub/myBlog/raw/main/posts/c94b2364/paper_pdfs/ImageClassification/" + "2007%2CASurveyofImageClassificationBasedTechniques.pdf", + "ASurveyofImageClassificationBasedTechniques.pdf", + False, + ), + # L40, get_font_width_from_default + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/908/908104.pdf", + "tika-908104.pdf", + False, + ), + # multiline_bfrange / regression test for issue #1285: + ( + "https://github.com/alexanderquispe/1REI05/raw/main/reports/report_1/" + "The%20lean%20times%20in%20the%20Peruvian%20economy.pdf", + "The%20lean%20times%20in%20the%20Peruvian%20economy.pdf", + False, + ), + ( + "https://github.com/yxj-HGNwmb5kdp8ewr/yxj-HGNwmb5kdp8ewr.github.io/raw/master/files/" + "Giacalone%20Llobell%20Jaeger%20(2022)%20Food%20Qual%20Prefer.pdf", + "Giacalone.pdf", + False, + ), + ], +) +def test_text_extraction_fast(caplog, url: str, name: str, strict: bool): + """Text extraction runs without exceptions or warnings""" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)), strict=strict) for page in reader.pages: page.extract_text() + assert caplog.text == "" @pytest.mark.enable_socket() @@ -47,49 +98,6 @@ def test_parse_encoding_advanced_encoding_not_implemented(): page.extract_text() -@pytest.mark.enable_socket() -def test_get_font_width_from_default(): # L40 - url = "https://corpora.tika.apache.org/base/docs/govdocs1/908/908104.pdf" - name = "tika-908104.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - for page in reader.pages: - page.extract_text() - - -@pytest.mark.enable_socket() -def test_multiline_bfrange(): - # non regression test for iss_1285 - url = ( - "https://github.com/alexanderquispe/1REI05/raw/main/reports/report_1/" - "The%20lean%20times%20in%20the%20Peruvian%20economy.pdf" - ) - name = "tika-908104.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - for page in reader.pages: - page.extract_text() - url = ( - 
"https://github.com/yxj-HGNwmb5kdp8ewr/yxj-HGNwmb5kdp8ewr.github.io/raw/master/files/" - "Giacalone%20Llobell%20Jaeger%20(2022)%20Food%20Qual%20Prefer.pdf" - ) - name = "Giacalone.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - for page in reader.pages: - page.extract_text() - - -@pytest.mark.enable_socket() -def test_bfchar_on_2_chars(): - # iss #1293 - url = ( - "https://github.com/xyegithub/myBlog/raw/main/posts/c94b2364/paper_pdfs/ImageClassification/" - "2007%2CASurveyofImageClassificationBasedTechniques.pdf" - ) - name = "ASurveyofImageClassificationBasedTechniques.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - for page in reader.pages: - page.extract_text() - - @pytest.mark.enable_socket() def test_ascii_charset(): # iss #1312 @@ -100,19 +108,21 @@ def test_ascii_charset(): @pytest.mark.enable_socket() -def test_iss1370(): - url = "https://github.com/py-pdf/pypdf/files/9667138/cmap1370.pdf" - name = "cmap1370.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - reader.pages[0].extract_text() - - -@pytest.mark.enable_socket() -def test_iss1379(): - url = "https://github.com/py-pdf/pypdf/files/9712729/02voc.pdf" - name = "02voc.pdf" +@pytest.mark.parametrize( + ("url", "name", "page_nb"), + [ + ( + "https://github.com/py-pdf/pypdf/files/9667138/cmap1370.pdf", + "cmap1370.pdf", + 0, + ), + ("https://github.com/py-pdf/pypdf/files/9712729/02voc.pdf", "02voc.pdf", 2), + ], + ids=["iss1370", "iss1379"], +) +def test_text_extraction_of_specific_pages(url: str, name: str, page_nb: int): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - reader.pages[2].extract_text() + reader.pages[page_nb].extract_text() @pytest.mark.enable_socket() @@ -122,13 +132,3 @@ def test_iss1533(): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) reader.pages[0].extract_text() # no error assert build_char_map("/F", 200, reader.pages[0])[3]["\x01"] == "Ü" - - -@pytest.mark.enable_socket() -def 
test_iss1718(caplog): - url = "https://github.com/py-pdf/pypdf/files/10983477/Ballinasloe_WS.pdf" - name = "iss1718.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - for p in reader.pages: - _txt = p.extract_text() - assert caplog.text == "" diff --git a/tests/test_constants.py b/tests/test_constants.py index ab3166f7c0..da4f307ccc 100644 --- a/tests/test_constants.py +++ b/tests/test_constants.py @@ -6,6 +6,15 @@ def test_slash_prefix(): + """ + Naming conventions of PDF_KEYS (constant names) are followed. + + This test function validates if PDF key names follow the required pattern: + - Starts with a slash '/' + - Followed by an uppercase letter + - Contains alphanumeric characters (letters and digits) + - The attribute name should be a case-insensitive match, with underscores removed + """ pattern = re.compile(r"^\/[A-Z]+[a-zA-Z0-9]*$") for cls in PDF_KEYS: for attr in dir(cls): diff --git a/tests/test_encryption.py b/tests/test_encryption.py index e2e09de095..33c86f85b5 100644 --- a/tests/test_encryption.py +++ b/tests/test_encryption.py @@ -70,6 +70,15 @@ ], ) def test_encryption(name, requires_pycryptodome): + """ + Encrypted PDFs are handled correctly. + + This test function ensures that: + - If PyCryptodome is not available and required, a DependencyError is raised + - Encrypted PDFs are identified correctly + - Decryption works for encrypted PDFs + - Metadata is properly extracted from the decrypted PDF + """ inputfile = RESOURCE_ROOT / "encryption" / name if requires_pycryptodome and not HAS_PYCRYPTODOME: with pytest.raises(DependencyError) as exc: @@ -108,7 +117,16 @@ def test_encryption(name, requires_pycryptodome): ], ) @pytest.mark.skipif(not HAS_PYCRYPTODOME, reason="No pycryptodome") -def test_both_password(name, user_passwd, owner_passwd): +def test_pdf_with_both_passwords(name, user_passwd, owner_passwd): + """ + PDFs with both user and owner passwords are handled correctly. 
+ + This test function ensures that: + - Encrypted PDFs with both user and owner passwords are identified correctly + - Decryption works for both user and owner passwords + - The correct password type is returned after decryption + - The number of pages is correctly identified after decryption + """ inputfile = RESOURCE_ROOT / "encryption" / name ipdf = pypdf.PdfReader(inputfile) assert ipdf.is_encrypted @@ -125,9 +143,9 @@ def test_both_password(name, user_passwd, owner_passwd): ], ) @pytest.mark.skipif(not HAS_PYCRYPTODOME, reason="No pycryptodome") -def test_get_page_of_encrypted_file_new_algorithm(pdffile, password): +def test_read_page_from_encrypted_file_aes_256(pdffile, password): """ - Check if we can read a page of an encrypted file. + A page can be read from an encrypted. This is a regression test for issue 327: IndexError for get_page() of decrypted file @@ -150,7 +168,8 @@ def test_get_page_of_encrypted_file_new_algorithm(pdffile, password): ], ) @pytest.mark.skipif(not HAS_PYCRYPTODOME, reason="No pycryptodome") -def test_encryption_merge(names): +def test_merge_encrypted_pdfs(names): + """Encrypted PDFs can be merged after decryption.""" merger = pypdf.PdfMerger() files = [RESOURCE_ROOT / "encryption" / x for x in names] pdfs = [pypdf.PdfReader(x) for x in files] @@ -168,24 +187,27 @@ def test_encryption_merge(names): CryptRC4, ], ) -def test_encrypt_decrypt_class(cryptcls): +def test_encrypt_decrypt_with_cipher_class(cryptcls): + """Encryption and decryption using a cipher class work as expected.""" message = b"Hello World" key = bytes(0 for _ in range(128)) # b"secret key" crypt = cryptcls(key) assert crypt.decrypt(crypt.encrypt(message)) == message -def test_decrypt_not_decrypted_pdf(): +def test_attempt_decrypt_unencrypted_pdf(): + """Attempting to decrypt an unencrypted PDF raises a PdfReadError.""" path = RESOURCE_ROOT / "crazyones.pdf" with pytest.raises(PdfReadError) as exc: PdfReader(path, password="nonexistant") assert exc.value.args[0] == 
"Not encrypted file" -def test_generate_values(): +def test_alg_v5_generate_values(): """ - This test only checks if there is an exception. + Algorithm V5 values are generated without raising exceptions. + This test function checks if there is an exception during the value generation. It does not verify that the content is correct. """ if not HAS_PYCRYPTODOME: @@ -207,13 +229,21 @@ def test_generate_values(): } -def test_randrange(): - # This might randomly fail in very rare cases +def test_randrange_function(): + """ + _randrange() function generates a range of unique random numbers. + + This test might randomly fail in very rare cases. + """ random_set = {_randrange(0, 10) for _ in range(1000)} assert random_set == {0, 1, 2, 3, 4, 5, 6, 7, 8, 9} -def test_randint(): - # This might randomly fail in very rare cases +def test_randint_function(): + """ + _randint() function generates a range of unique random numbers, including the upper bound. + + This test might randomly fail in very rare cases. + """ random_set = {_randint(0, 10) for _ in range(1000)} assert random_set == {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} diff --git a/tests/test_filters.py b/tests/test_filters.py index 58baef1706..80bf4af0fe 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -35,8 +35,8 @@ @pytest.mark.parametrize( ("predictor", "s"), list(cartesian_product([1], filter_inputs)) ) -def test_flatedecode(predictor, s): - """Tests FlateDecode decode() and encode() methods.""" +def test_flate_decode_encode(predictor, s): + """FlateDecode encode() and decode() methods work as expected.""" codec = FlateDecode() s = s.encode() encoded = codec.encode(s) @@ -45,11 +45,13 @@ def test_flatedecode(predictor, s): def test_flatedecode_unsupported_predictor(): """ - Inputs an unsupported predictor (outside the [10, 15] range) checking that - PdfReadError() is raised. + FlateDecode raises PdfReadError for unsupported predictors. 
- Once this predictor support is updated in the future, this test case may be - removed. + Predictors outside the [10, 15] range are not supported. + + This test function checks that a PdfReadError is raised when decoding with + unsupported predictors. Once this predictor support is updated in the + future, this test case may be removed. """ codec = FlateDecode() predictors = (-10, -1, 0, 9, 16, 20, 100) @@ -63,7 +65,8 @@ def test_flatedecode_unsupported_predictor(): @pytest.mark.parametrize( "params", [ArrayObject([]), ArrayObject([{"/Predictor": 1}]), "a"] ) -def test_flatedecode_decompress_array_params(params): +def test_flate_decode_decompress_with_array_params(params): + """FlateDecode decode() method works correctly with array parameters.""" codec = FlateDecode() s = "" s = s.encode() @@ -106,7 +109,7 @@ def test_flatedecode_decompress_array_params(params): "whitespace", ], ) -def test_ascii_hex_decode(data, expected): +def test_ascii_hex_decode_method(data, expected): """ Feeds a bunch of values to ASCIIHexDecode.decode() and ensures the correct output is returned. @@ -118,8 +121,8 @@ def test_ascii_hex_decode(data, expected): assert ASCIIHexDecode.decode(data) == expected -def test_ascii_hex_decode_no_eod(): - """Ensuring an exception is raised when no EOD character is present.""" +def test_ascii_hex_decode_missing_eod(): + """ASCIIHexDecode.decode() raises error when no EOD character is present.""" with pytest.raises(PdfStreamError) as exc: ASCIIHexDecode.decode("") assert exc.value.args[0] == "Unexpected EOD in ASCIIHexDecode" @@ -146,6 +149,8 @@ def test_ascii85decode_with_overflow(): def test_ascii85decode_five_zero_bytes(): """ + ASCII85Decode handles the special case of five zero bytes correctly. 
+ From ISO 32000 (2008) §7.4.3: «As a special case, if all five bytes are 0, they shall be represented by diff --git a/tests/test_papersizes.py b/tests/test_papersizes.py index d50948bf5d..c38a2e9fb5 100644 --- a/tests/test_papersizes.py +++ b/tests/test_papersizes.py @@ -4,7 +4,8 @@ from pypdf import papersizes -def test_din_a0(): +def test_din_a0_paper_size(): + """The dimensions and area of the DIN A0 paper size are correct.""" dim = papersizes.PaperSize.A0 area_square_pixels = float(dim.width) * dim.height @@ -20,7 +21,8 @@ def test_din_a0(): @pytest.mark.parametrize("dimensions", papersizes._din_a) -def test_din_a_ratio(dimensions): +def test_din_a_aspect_ratio(dimensions): + """The aspect ratio of DIN A paper sizes is correct.""" assert abs(dimensions.height - dimensions.width * 2**0.5) <= 2.5 @@ -28,5 +30,6 @@ def test_din_a_ratio(dimensions): ("dimensions_a", "dimensions_b"), list(zip(papersizes._din_a, papersizes._din_a[1:])), ) -def test_din_a_doubling(dimensions_a, dimensions_b): +def test_din_a_size_doubling(dimensions_a, dimensions_b): + """The height of a DIN A paper size is twice the width of the next size.""" assert abs(dimensions_a.height - 2 * dimensions_b.width) <= 4 diff --git a/tests/test_reader.py b/tests/test_reader.py index 967d2d1bf1..c1c24fb460 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1191,7 +1191,7 @@ def test_outline_with_invalid_destinations(): @pytest.mark.enable_socket() def test_pdfreader_multiple_definitions(caplog): - # iss325 + """iss325""" url = "https://github.com/py-pdf/pypdf/files/9176644/multipledefs.pdf" name = "multipledefs.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) diff --git a/tests/test_utils.py b/tests/test_utils.py index cfc8d7f883..abb022db52 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -124,21 +124,6 @@ def test_paeth_predictor(left, up, upleft, expected): assert pypdf._utils.paeth_predictor(left, up, upleft) == expected -@pytest.mark.parametrize( -
("dat", "pos", "to_read"), - [ - (b"", 0, 1), - (b"a", 0, 1), - (b"abc", 0, 10), - ], -) -def test_read_block_backwards_errs(dat, pos, to_read): - with pytest.raises(PdfStreamError) as _: - s = io.BytesIO(dat) - s.seek(pos) - read_block_backwards(s, to_read) - - @pytest.mark.parametrize( ("dat", "pos", "to_read", "expected", "expected_pos"), [ @@ -149,12 +134,19 @@ def test_read_block_backwards_errs(dat, pos, to_read): (b"abc", 3, 1, b"c", 2), (b"abc", 3, 2, b"bc", 1), (b"abc", 3, 3, b"abc", 0), + (b"", 0, 1, None, 0), + (b"a", 0, 1, None, 0), + (b"abc", 0, 10, None, 0), ], ) def test_read_block_backwards(dat, pos, to_read, expected, expected_pos): s = io.BytesIO(dat) s.seek(pos) - assert read_block_backwards(s, to_read) == expected + if expected is not None: + assert read_block_backwards(s, to_read) == expected + else: + with pytest.raises(PdfStreamError): + read_block_backwards(s, to_read) assert s.tell() == expected_pos @@ -264,10 +256,12 @@ def test_escapedcode_followed_by_int(): ], ) def test_human_readable_bytes(input_int, expected_output): + """_human_readable_bytes correctly transforms the integer to a string.""" assert _human_readable_bytes(input_int) == expected_output -def test_file(): +def test_file_class(): + """File class can be instantiated and string representation is ok.""" f = File(name="image.png", data=b"") assert str(f) == "File(name=image.png, data: 0 Byte)" assert repr(f) == "File(name=image.png, data: 0 Byte, hash: 0)" diff --git a/tests/test_xmp.py b/tests/test_xmp.py index 1fc1184ab2..50555b476d 100644 --- a/tests/test_xmp.py +++ b/tests/test_xmp.py @@ -24,7 +24,8 @@ (RESOURCE_ROOT / "crazyones.pdf", False), ], ) -def test_read_xmp(src, has_xmp): +def test_read_xmp_metadata(src, has_xmp): + """Read XMP metadata from PDF files.""" reader = PdfReader(src) xmp = reader.xmp_metadata assert (xmp is None) == (not has_xmp) @@ -39,6 +40,7 @@ def test_read_xmp(src, has_xmp): def get_all_tiff(xmp: pypdf.xmp.XmpInformation): + """Return all TIFF
metadata as a dictionary.""" data = {} tiff_ns = xmp.get_nodes_in_namespace( about_uri="", namespace="http://ns.adobe.com/tiff/1.0/" @@ -51,30 +53,29 @@ def get_all_tiff(xmp: pypdf.xmp.XmpInformation): return data -def test_regression_issue774(): +def test_converter_date(): + """ + _converter_date returns the correct datetime. + + This is a regression test for issue #774. + """ date = pypdf.xmp._converter_date("2021-04-28T12:23:34.123Z") - assert date.year == 2021 - assert date.month == 4 - assert date.day == 28 - assert date.hour == 12 - assert date.minute == 23 - assert date.second == 34 - assert date.microsecond == 123000 + assert date == datetime(2021, 4, 28, 12, 23, 34, 123000) + with pytest.raises(ValueError) as exc: pypdf.xmp._converter_date("today") assert exc.value.args[0].startswith("Invalid date format") date = pypdf.xmp._converter_date("2021-04-28T12:23:01-03:00") - assert date.year == 2021 - assert date.month == 4 - assert date.day == 28 - assert date.hour == 15 - assert date.minute == 23 - assert date.second == 1 - assert date.microsecond == 0 + assert date == datetime(2021, 4, 28, 15, 23, 1) -def test_regression_issue914(): +def test_modify_date(): + """ + xmp_modify_date is extracted correctly. + + This is a regression test for issue #914. 
+ """ path = RESOURCE_ROOT / "issue-914-xmp-data.pdf" reader = PdfReader(path) assert reader.xmp_metadata.xmp_modify_date == datetime(2022, 4, 9, 15, 22, 43) @@ -84,7 +85,8 @@ def test_regression_issue914(): "x", ["a", 42, 3.141, False, True], ) -def test_identity(x): +def test_identity_function(x): + """The identity is returning its input.""" assert pypdf.xmp._identity(x) == x @@ -99,7 +101,8 @@ def test_identity(x): ) ], ) -def test_xmpmm(url, name, xmpmm_instance_id): +def test_xmpmm_instance_id(url, name, xmpmm_instance_id): + """XMPMM instance id is correctly extracted.""" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) xmp_metadata = reader.xmp_metadata assert xmp_metadata.xmpmm_instance_id == xmpmm_instance_id @@ -108,7 +111,8 @@ def test_xmpmm(url, name, xmpmm_instance_id): @pytest.mark.enable_socket() -def test_dc_description(): +def test_xmp_dc_description_extraction(): + """XMP dc_description is correctly extracted.""" url = "https://corpora.tika.apache.org/base/docs/govdocs1/953/953770.pdf" name = "tika-953770.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) @@ -123,7 +127,8 @@ def test_dc_description(): @pytest.mark.enable_socket() -def test_dc_creator(): +def test_dc_creator_extraction(): + """XMP dc_creator is correctly extracted.""" url = "https://corpora.tika.apache.org/base/docs/govdocs1/953/953770.pdf" name = "tika-953770.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) @@ -134,7 +139,8 @@ def test_dc_creator(): @pytest.mark.enable_socket() -def test_custom_properties(): +def test_custom_properties_extraction(): + """XMP custom_properties is correctly extracted.""" url = "https://corpora.tika.apache.org/base/docs/govdocs1/986/986065.pdf" name = "tika-986065.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) @@ -145,7 +151,8 @@ def test_custom_properties(): @pytest.mark.enable_socket() -def test_dc_subject(): +def test_dc_subject_extraction(): + """XMP dc_subject is correctly 
extracted.""" url = "https://corpora.tika.apache.org/base/docs/govdocs1/959/959519.pdf" name = "tika-959519.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) @@ -176,7 +183,12 @@ def test_dc_subject(): @pytest.mark.enable_socket() -def test_issue585(): +def test_invalid_xmp_information_handling(): + """ + Invalid XML in xmp_metadata is gracefully handled. + + This is a regression test for issue #585. + """ url = "https://github.com/py-pdf/pypdf/files/5536984/test.pdf" name = "pypdf-5536984.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) @@ -185,7 +197,8 @@ def test_issue585(): assert exc.value.args[0].startswith("XML in XmpInformation was invalid") -def test_getter_bag(): +def test_xmp_getter_bag_function(): + """xmp._getter_bag does not crash.""" f = pypdf.xmp._getter_bag("namespace", "name") class Tst: # to replace pdf From 117ce458c5cc3d8e2dfbdda1dbdd4d18d1e7c60e Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sat, 8 Apr 2023 15:47:53 +0200 Subject: [PATCH 08/11] MAINT: Move generation of file identifiers to a method (#1760) * Ensure that the content is used to generate the file identifiers * Ensure /ID[0] is not overwritten * MAINT: Rename PdfWriter._write_header to PdfWriter._write_pdf_structure --- pypdf/_writer.py | 47 +++++++++++++++++++++++++++++------------ tests/test_workflows.py | 2 ++ 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 5501e58bba..5f31224bc8 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -31,11 +31,10 @@ import collections import decimal import enum +import hashlib import logging import re -import secrets import struct -import time import uuid import warnings from hashlib import md5 @@ -144,6 +143,13 @@ class ObjectDeletionFlag(enum.IntFlag): ALL_ANNOTATIONS = enum.auto() +def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str: + hash = hashlib.md5() + for block in iter(lambda: stream.read(blocksize), b""): + 
hash.update(block) + return hash.hexdigest() + + class PdfWriter: """ Write a PDF file out, given pages produced by another class. @@ -974,7 +980,7 @@ def clone_document_from_reader( self.clone_reader_document_root(reader) self._info = reader.trailer[TK.INFO].clone(self).indirect_reference # type: ignore try: - self._ID = reader.trailer[TK.ID].clone(self) # type: ignore + self._ID = cast(ArrayObject, reader.trailer[TK.ID].clone(self)) # type: ignore except KeyError: pass if callable(after_page_append): @@ -998,6 +1004,26 @@ def cloneDocumentFromReader( ) self.clone_document_from_reader(reader, after_page_append) + def _compute_document_identifier_from_content(self) -> ByteStringObject: + stream = BytesIO() + self._write_pdf_structure(stream) + stream.seek(0) + return ByteStringObject(_rolling_checksum(stream).encode("utf8")) + + def generate_file_identifiers(self) -> None: + """ + Generate an identifier for the PDF that will be written. + + The only point of this is ensuring uniqueness. Reproducibility is not + required; see 14.4 "File Identifiers". 
+ """ + if hasattr(self, "_ID") and self._ID and len(self._ID) == 2: + ID_1 = self._ID[0] + else: + ID_1 = self._compute_document_identifier_from_content() + ID_2 = self._compute_document_identifier_from_content() + self._ID = ArrayObject((ID_1, ID_2)) + def encrypt( self, user_password: Optional[str] = None, @@ -1078,19 +1104,14 @@ def encrypt( V = 1 rev = 2 keylen = int(40 / 8) - secrets_generator = secrets.SystemRandom() P = permissions_flag O = ByteStringObject(_alg33(owner_password, user_password, rev, keylen)) # type: ignore[arg-type] # noqa - ID_1 = ByteStringObject(md5((repr(time.time())).encode("utf8")).digest()) - ID_2 = ByteStringObject( - md5((repr(secrets_generator.uniform(0, 1))).encode("utf8")).digest() - ) - self._ID = ArrayObject((ID_1, ID_2)) + self.generate_file_identifiers() if rev == 2: - U, key = _alg34(user_password, O, P, ID_1) + U, key = _alg34(user_password, O, P, self._ID[0]) else: assert rev == 3 - U, key = _alg35(user_password, rev, keylen, O, P, ID_1, False) # type: ignore[arg-type] + U, key = _alg35(user_password, rev, keylen, O, P, self._ID[0], False) # type: ignore[arg-type] encrypt = DictionaryObject() encrypt[NameObject(SA.FILTER)] = NameObject("/Standard") encrypt[NameObject("/V")] = NumberObject(V) @@ -1124,7 +1145,7 @@ def write_stream(self, stream: StreamType) -> None: # copying in a new copy of the page object. 
self._sweep_indirect_references(self._root) - object_positions = self._write_header(stream) + object_positions = self._write_pdf_structure(stream) xref_location = self._write_xref_table(stream, object_positions) self._write_trailer(stream) stream.write(b_(f"\nstartxref\n{xref_location}\n%%EOF\n")) # eof @@ -1159,7 +1180,7 @@ def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO]: return my_file, stream - def _write_header(self, stream: StreamType) -> List[int]: + def _write_pdf_structure(self, stream: StreamType) -> List[int]: object_positions = [] stream.write(self.pdf_header + b"\n") stream.write(b"%\xE2\xE3\xCF\xD3\n") diff --git a/tests/test_workflows.py b/tests/test_workflows.py index f3552deb84..7a9ddcb53d 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -64,6 +64,8 @@ def test_basic_features(tmp_path): password = "secret" with pytest.warns(UserWarning, match="pypdf only implements RC4 encryption"): writer.encrypt(password) + # doing it twice should not change anything + writer.encrypt(password) # finally, write "output" to pypdf-output.pdf write_path = tmp_path / "pypdf-output.pdf" From 10cc05775b481da61261e3cc0af38ef116bd0040 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 9 Apr 2023 16:33:44 +0200 Subject: [PATCH 09/11] STY: Improve language, add docstrings, fix TODOs (#1772) --- README.md | 30 +++++------ docs/user/cropping-and-transforming.md | 4 ++ pypdf/_reader.py | 4 +- pypdf/constants.py | 72 +++++++++++++++++++++++++- pypdf/filters.py | 42 ++++++++++++++- pypdf/generic/__init__.py | 3 ++ pyproject.toml | 3 +- sample-files | 2 +- tests/test_constants.py | 12 ++++- 9 files changed, 146 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 5c4c9173c4..de4799b2d5 100644 --- a/README.md +++ b/README.md @@ -21,23 +21,22 @@ from PDFs as well. 
## Installation -You can install pypdf via pip: +Install pypdf using pip: ``` pip install pypdf ``` -If you plan to use pypdf for encrypting or decrypting PDFs that use AES, you -will need to install some extra dependencies. Encryption using RC4 is supported -using the regular installation. +For using pypdf with AES encryption or decryption, install extra dependencies: ``` pip install pypdf[crypto] ``` -> **NOTE**: `pypdf>=3.1.0` improved a lot compared to `pyPdf<2.0.0` and compared to -> `PyPDF2 < 2.0.0`. Please -> read [the migration guide](https://pypdf.readthedocs.io/en/latest/user/migration-1-to-2.html). +> **NOTE**: `pypdf` 3.1.0 and above include significant improvements compared to +> previous versions. Please refer to [the migration +> guide](https://pypdf.readthedocs.io/en/latest/user/migration-1-to-2.html) for +> more information. ## Usage @@ -51,19 +50,18 @@ text = page.extract_text() ``` pypdf can do a lot more, e.g. splitting, merging, reading and creating -annotations, decrypting and encrypting, and more. +annotations, decrypting and encrypting, and more. Check out [the +documentation](https://pypdf.readthedocs.io/en/stable/) for additional usage +examples! -Please see [the documentation](https://pypdf.readthedocs.io/en/stable/) -for more usage examples! - -A lot of questions are asked and answered -[on StackOverflow](https://stackoverflow.com/questions/tagged/pypdf) -(formerly tagged with [PyPDF2](https://stackoverflow.com/questions/tagged/pypdf2)). +For questions and answers, visit +[StackOverflow](https://stackoverflow.com/questions/tagged/pypdf) +(tagged with [pypdf](https://stackoverflow.com/questions/tagged/pypdf)). ## Contributions -Maintaining pypdf is a collaborative effort. You can support pypdf by writing -documentation, helping to narrow down issues, and adding code. +Maintaining pypdf is a collaborative effort. You can support the project by +writing documentation, helping to narrow down issues, and submitting code. 
### Q&A diff --git a/docs/user/cropping-and-transforming.md b/docs/user/cropping-and-transforming.md index 2afc2a2e6b..d002ef1d8f 100644 --- a/docs/user/cropping-and-transforming.md +++ b/docs/user/cropping-and-transforming.md @@ -1,5 +1,9 @@ # Cropping and Transforming PDFs +> **Notice**: Just because content is no longer visible, it is not gone. +> Cropping works by adjusting the viewbox. That means content that was cropped +> away can still be restored. + ```python from pypdf import PdfWriter, PdfReader diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 36aa642122..3b21c29e73 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -1273,8 +1273,8 @@ def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]: This is equivalent to generic.IndirectObject(num,gen,self).get_object() Args: - num: - gen: + num: The object number of the indirect object. + gen: The generation number of the indirect object. Returns: A PdfObject diff --git a/pypdf/constants.py b/pypdf/constants.py index 9f7327adfe..d1be774079 100644 --- a/pypdf/constants.py +++ b/pypdf/constants.py @@ -306,6 +306,17 @@ class FieldDictionaryAttributes: @classmethod def attributes(cls) -> Tuple[str, ...]: + """ + Get a tuple of all the attributes present in a Field Dictionary. + + This method returns a tuple of all the attribute constants defined in + the FieldDictionaryAttributes class. These attributes correspond to the + entries that are common to all field dictionaries as specified in the + PDF 1.7 reference. + + Returns: + A tuple containing all the attribute constants. + """ return ( cls.TM, cls.T, @@ -321,6 +332,18 @@ def attributes(cls) -> Tuple[str, ...]: @classmethod def attributes_dict(cls) -> Dict[str, str]: + """ + Get a dictionary of attribute keys and their human-readable names. + + This method returns a dictionary where the keys are the attribute + constants defined in the FieldDictionaryAttributes class and the values + are their corresponding human-readable names. 
These attributes + correspond to the entries that are common to all field dictionaries as + specified in the PDF 1.7 reference. + + Returns: + A dictionary containing attribute keys and their names. + """ return { cls.FT: "Field Type", cls.Parent: "Parent", @@ -340,10 +363,33 @@ class CheckboxRadioButtonAttributes: @classmethod def attributes(cls) -> Tuple[str, ...]: + """ + Get a tuple of all the attributes present in a Field Dictionary. + + This method returns a tuple of all the attribute constants defined in + the CheckboxRadioButtonAttributes class. These attributes correspond to + the entries that are common to all field dictionaries as specified in + the PDF 1.7 reference. + + Returns: + A tuple containing all the attribute constants. + """ return (cls.Opt,) @classmethod def attributes_dict(cls) -> Dict[str, str]: + """ + Get a dictionary of attribute keys and their human-readable names. + + This method returns a dictionary where the keys are the attribute + constants defined in the CheckboxRadioButtonAttributes class and the + values are their corresponding human-readable names. These attributes + correspond to the entries that are common to all field dictionaries as + specified in the PDF 1.7 reference. + + Returns: + A dictionary containing attribute keys and their names. + """ return { cls.Opt: "Options", } @@ -381,13 +427,35 @@ class PageLayouts: class GraphicsStateParameters: - """Table 4.8 of the 1.7 reference.""" + """Table 58 – Entries in a Graphics State Parameter Dictionary""" TYPE = "/Type" # name, optional LW = "/LW" # number, optional - # TODO: Many more! 
+ LC = "/LC" # integer, optional + LJ = "/LJ" # integer, optional + ML = "/ML" # number, optional + D = "/D" # array, optional + RI = "/RI" # name, optional + OP = "/OP" + op = "/op" + OPM = "/OPM" FONT = "/Font" # array, optional + BG = "/BG" + BG2 = "/BG2" + UCR = "/UCR" + UCR2 = "/UCR2" + TR = "/TR" + TR2 = "/TR2" + HT = "/HT" + FL = "/FL" + SM = "/SM" + SA = "/SA" + BM = "/BM" S_MASK = "/SMask" # dictionary or name, optional + CA = "/CA" + ca = "/ca" + AIS = "/AIS" + TK = "/TK" class CatalogDictionary: diff --git a/pypdf/filters.py b/pypdf/filters.py index 72b4243163..086a8a2f53 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -67,6 +67,19 @@ def decompress(data: bytes) -> bytes: + """ + Decompress the given data using zlib. + + This function attempts to decompress the input data using zlib. If the + decompression fails due to a zlib error, it falls back to using a + decompression object with a larger window size. + + Args: + data: The input data to be decompressed. + + Returns: + The decompressed data. + """ try: return zlib.decompress(data) except zlib.error: @@ -195,6 +208,15 @@ def _decode_png_prediction(data: str, columns: int, rowlength: int) -> bytes: @staticmethod def encode(data: bytes) -> bytes: + """ + Compress the input data using zlib. + + Args: + data: The data to be compressed. + + Returns: + The compressed data. + """ return zlib.compress(data) @@ -376,7 +398,7 @@ def decode( group_index = b = 0 out = bytearray() for char in data: - if ord("!") <= char and char <= ord("u"): + if ord("!") <= char <= ord("u"): group_index += 1 b = b * 85 + (char - 33) if group_index == 5: @@ -536,6 +558,23 @@ def decode( def decode_stream_data(stream: Any) -> Union[str, bytes]: # utils.StreamObject + """ + Decode the stream data based on the specified filters. + + This function decodes the stream data using the filters provided in the + stream. 
It supports various filter types, including FlateDecode, + ASCIIHexDecode, LZWDecode, ASCII85Decode, DCTDecode, JPXDecode, and + CCITTFaxDecode. + + Args: + stream: The input stream object containing the data and filters. + + Returns: + The decoded stream data. + + Raises: + NotImplementedError: If an unsupported filter type is encountered. + """ filters = stream.get(SA.FILTER, ()) if isinstance(filters, IndirectObject): filters = cast(ArrayObject, filters.get_object()) @@ -580,6 +619,7 @@ def decode_stream_data(stream: Any) -> Union[str, bytes]: # utils.StreamObject def decodeStreamData(stream: Any) -> Union[str, bytes]: # deprecated + """Deprecated. Use decode_stream_data.""" deprecate_with_replacement("decodeStreamData", "decode_stream_data", "4.0.0") return decode_stream_data(stream) diff --git a/pypdf/generic/__init__.py b/pypdf/generic/__init__.py index 984bbf2c24..a26b448102 100644 --- a/pypdf/generic/__init__.py +++ b/pypdf/generic/__init__.py @@ -73,6 +73,7 @@ def readHexStringFromStream( stream: StreamType, ) -> Union["TextStringObject", "ByteStringObject"]: # deprecated + """Deprecated, use read_hex_string_from_stream.""" deprecate_with_replacement( "readHexStringFromStream", "read_hex_string_from_stream", "4.0.0" ) @@ -83,6 +84,7 @@ def readStringFromStream( stream: StreamType, forced_encoding: Union[None, str, List[str], Dict[int, str]] = None, ) -> Union["TextStringObject", "ByteStringObject"]: # deprecated + """Deprecated, use read_string_from_stream.""" deprecate_with_replacement( "readStringFromStream", "read_string_from_stream", "4.0.0" ) @@ -93,6 +95,7 @@ def createStringObject( string: Union[str, bytes], forced_encoding: Union[None, str, List[str], Dict[int, str]] = None, ) -> Union[TextStringObject, ByteStringObject]: # deprecated + """Deprecated, use create_string_object.""" deprecate_with_replacement("createStringObject", "create_string_object", "4.0.0") return create_string_object(string, forced_encoding) diff --git a/pyproject.toml 
b/pyproject.toml index fdf89e0854..933917f9db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -156,7 +156,6 @@ ignore = [ "C901", "D101", # Missing docstring in public class "D102", # Missing docstring in public method - "D103", # Missing docstring in public function "D417", # Missing argument descriptions in the docstring "FBT001", # Boolean positional arg in function definition "FBT002", # Boolean default value in function definition @@ -177,7 +176,7 @@ ignore = [ ] [tool.ruff.per-file-ignores] -"tests/*" = ["S101", "ANN001", "ANN201","D104", "S105", "S106"] +"tests/*" = ["S101", "ANN001", "ANN201","D104", "S105", "S106", "D103"] "sample-files/*" = ["D100", "INP001"] "_encryption.py" = ["S324", "S311"] "_security.py" = ["S324"] diff --git a/sample-files b/sample-files index fb7a080b35..65e2d2ca8a 160000 --- a/sample-files +++ b/sample-files @@ -1 +1 @@ -Subproject commit fb7a080b35b3553bd10221282beeda7847959e83 +Subproject commit 65e2d2ca8a137bfb1807b9991d5ca97f90365cc3 diff --git a/tests/test_constants.py b/tests/test_constants.py index da4f307ccc..62fbae7433 100644 --- a/tests/test_constants.py +++ b/tests/test_constants.py @@ -2,7 +2,7 @@ import re from typing import Callable -from pypdf.constants import PDF_KEYS +from pypdf.constants import PDF_KEYS, GraphicsStateParameters def test_slash_prefix(): @@ -18,11 +18,19 @@ def test_slash_prefix(): pattern = re.compile(r"^\/[A-Z]+[a-zA-Z0-9]*$") for cls in PDF_KEYS: for attr in dir(cls): + # Skip magic methods if attr.startswith("__") and attr.endswith("__"): continue + + # Skip methods constant_value = getattr(cls, attr) if isinstance(constant_value, Callable): continue + assert constant_value.startswith("/") - assert pattern.match(constant_value) assert attr.replace("_", "").lower() == constant_value[1:].lower() + + # There are a few exceptions that may be lowercase + if cls == GraphicsStateParameters and attr in ["ca", "op"]: + continue + assert pattern.match(constant_value) From 
ac0cb986cdb34baaae140c159b3219785e87be3b Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 9 Apr 2023 21:32:50 +0200 Subject: [PATCH 10/11] TST: Add xmp test (#1775) --- sample-files | 2 +- tests/test_utils.py | 4 ++-- tests/test_xmp.py | 20 ++++++++++++++++++++ 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/sample-files b/sample-files index 65e2d2ca8a..0c3b1d3879 160000 --- a/sample-files +++ b/sample-files @@ -1 +1 @@ -Subproject commit 65e2d2ca8a137bfb1807b9991d5ca97f90365cc3 +Subproject commit 0c3b1d3879c5cd7d913b3d931fa33b37529d7346 diff --git a/tests/test_utils.py b/tests/test_utils.py index abb022db52..3ae80bddb9 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -221,8 +221,8 @@ def test_read_block_backwards_exception(): def test_deprecation_bookmark(): @deprecation_bookmark(old_param="new_param") - def foo(old_param: int = 1, baz: int = 2) -> float: - return old_param * baz + def foo(old_param: int = 1, baz: int = 2) -> None: + pass with pytest.raises(DeprecationError) as exc: foo(old_param=12, new_param=13) diff --git a/tests/test_xmp.py b/tests/test_xmp.py index 50555b476d..cfcf021119 100644 --- a/tests/test_xmp.py +++ b/tests/test_xmp.py @@ -15,6 +15,26 @@ TESTS_ROOT = Path(__file__).parent.resolve() PROJECT_ROOT = TESTS_ROOT.parent RESOURCE_ROOT = PROJECT_ROOT / "resources" +SAMPLE_ROOT = Path(PROJECT_ROOT) / "sample-files" + + +@pytest.mark.samples() +@pytest.mark.parametrize( + "src", + [ + (SAMPLE_ROOT / "020-xmp/output_with_metadata_pymupdf.pdf"), + ], +) +def test_read_xmp_metadata_samples(src): + reader = PdfReader(src) + xmp = reader.xmp_metadata + assert xmp + assert xmp.dc_contributor == [] + assert xmp.dc_creator == ["John Doe"] + assert xmp.dc_source == "Martin Thoma" # attribute node + assert xmp.dc_description == {"x-default": "This is a text"} + assert xmp.dc_date == [datetime(1990, 4, 28, 0, 0)] + assert xmp.dc_title == {"x-default": "Sample PDF with XMP Metadata"} @pytest.mark.parametrize( From 
a876a77b3af5ecd64699cfc0b687d2657de8f526 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 9 Apr 2023 22:25:11 +0200 Subject: [PATCH 11/11] REL: 3.7.1 Security (SEC): - Warn about PDF encryption security (#1755) Robustness (ROB): - Prevent loop in Cloning (#1770) - Capture UnicodeDecodeError at PdfReader.pdf_header (#1768) Documentation (DOC): - Pin myst_parser==0.16.1 and rename .readthedocs.yaml (#1752) - Add readthedocs.yml and bump docs dependencies using `tox -e deps` (#1750) Developer Experience (DEV): - Make make_changelog.py idempotent Maintenance (MAINT): - Move generation of file identifiers to a method (#1760) Testing (TST): - Add xmp test (#1775) Code Style (STY): - Improve language, add docstrings, fix TODOs (#1772) - Test names, docstrings, and parametrization (#1771) [Full Changelog](https://github.com/py-pdf/pypdf/compare/3.7.0...3.7.1) --- CHANGELOG.md | 23 +++++++++++++++++++++++ pypdf/_version.py | 2 +- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e9f5a8fdb..319b7586a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,28 @@ # CHANGELOG +## Version 3.7.1, 2023-04-09 + +### Security (SEC) +- Warn about PDF encryption security (#1755) + +### Robustness (ROB) +- Prevent loop in Cloning (#1770) +- Capture UnicodeDecodeError at PdfReader.pdf_header (#1768) + +### Documentation (DOC) +- Add .readthedocs.yaml and bump docs dependencies using `tox -e deps` (#1750, #1752) + +### Developer Experience (DEV) +- Make make_changelog.py idempotent + +### Maintenance (MAINT) +- Move generation of file identifiers to a method (#1760) + +### Testing (TST) +- Add xmp test (#1775) + +[Full Changelog](https://github.com/py-pdf/pypdf/compare/3.7.0...3.7.1) + ## Version 3.7.0, 2023-03-26 ### Security (SEC) diff --git a/pypdf/_version.py b/pypdf/_version.py index 46f67e7f8d..975f69142a 100644 --- a/pypdf/_version.py +++ b/pypdf/_version.py @@ -1 +1 @@ -__version__ = "3.7.0" +__version__ = "3.7.1"