From 0476799bce16ed5d527c04be1a4b3ba783be50f9 Mon Sep 17 00:00:00 2001
From: Adel Haddad <26027314+adehad@users.noreply.github.com>
Date: Mon, 27 Mar 2023 07:53:35 +0100
Subject: [PATCH 01/11] DOC: Add readthedocs.yml and bump docs dependencies
 using `tox -e deps` (#1750)

---
 readthedocs.yml       | 14 +++++++++++++
 requirements/docs.in  |  2 +-
 requirements/docs.txt | 46 +++++++++++++++++++++----------------------
 3 files changed, 38 insertions(+), 24 deletions(-)
 create mode 100644 readthedocs.yml

diff --git a/readthedocs.yml b/readthedocs.yml
new file mode 100644
index 0000000000..82300c6f90
--- /dev/null
+++ b/readthedocs.yml
@@ -0,0 +1,14 @@
+---
+version: 2
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+formats: all
+
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.10"
+
+python:
+  install:
+    - requirements: requirements/docs.txt
diff --git a/requirements/docs.in b/requirements/docs.in
index 6fe145949f..bbfe5118b8 100644
--- a/requirements/docs.in
+++ b/requirements/docs.in
@@ -1,5 +1,5 @@
 sphinx
 sphinx_rtd_theme
-myst_parser==0.16.1
+myst_parser
 -e .
 attrs  # required for myst, but not automatically installed by myst
diff --git a/requirements/docs.txt b/requirements/docs.txt
index cb44bcd2bb..ac8a6a95ab 100644
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@@ -1,8 +1,8 @@
 #
-# This file is autogenerated by pip-compile with Python 3.7
-# by the following command:
+# This file is autogenerated by pip-compile with python 3.6
+# To update, run:
 #
-#    pip-compile requirements/docs.in
+#    pip-compile --output-file=requirements/docs.txt requirements/docs.in
 #
 -e .
     # via -r requirements/docs.in
@@ -10,13 +10,15 @@ alabaster==0.7.13
     # via sphinx
 attrs==22.2.0
     # via -r requirements/docs.in
-babel==2.11.0
+babel==2.12.1
     # via sphinx
 certifi==2022.12.7
     # via requests
-charset-normalizer==3.0.1
+charset-normalizer==3.1.0
     # via requests
-docutils==0.17.1
+colorama==0.4.6
+    # via sphinx
+docutils==0.18.1
     # via
     #   myst-parser
     #   sphinx
@@ -25,61 +27,59 @@ idna==3.4
     # via requests
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==6.0.0
+importlib-metadata==6.1.0
     # via sphinx
 jinja2==3.1.2
     # via
     #   myst-parser
     #   sphinx
-markdown-it-py==2.1.0
+markdown-it-py==2.2.0
     # via
     #   mdit-py-plugins
     #   myst-parser
 markupsafe==2.1.2
     # via jinja2
-mdit-py-plugins==0.3.3
+mdit-py-plugins==0.3.5
     # via myst-parser
 mdurl==0.1.2
     # via markdown-it-py
-myst-parser==0.16.1
+myst-parser==1.0.0
     # via -r requirements/docs.in
 packaging==23.0
     # via sphinx
 pygments==2.14.0
     # via sphinx
-pytz==2022.7.1
-    # via babel
 pyyaml==6.0
     # via myst-parser
 requests==2.28.2
     # via sphinx
 snowballstemmer==2.2.0
     # via sphinx
-sphinx==4.5.0
+sphinx==6.1.3
     # via
     #   -r requirements/docs.in
     #   myst-parser
     #   sphinx-rtd-theme
-sphinx-rtd-theme==1.1.1
+    #   sphinxcontrib-jquery
+sphinx-rtd-theme==1.2.0
     # via -r requirements/docs.in
-sphinxcontrib-applehelp==1.0.2
+sphinxcontrib-applehelp==1.0.4
     # via sphinx
 sphinxcontrib-devhelp==1.0.2
     # via sphinx
-sphinxcontrib-htmlhelp==2.0.0
+sphinxcontrib-htmlhelp==2.0.1
     # via sphinx
+sphinxcontrib-jquery==4.1
+    # via sphinx-rtd-theme
 sphinxcontrib-jsmath==1.0.1
     # via sphinx
 sphinxcontrib-qthelp==1.0.3
     # via sphinx
 sphinxcontrib-serializinghtml==1.1.5
     # via sphinx
-typing-extensions==4.4.0
-    # via
-    #   importlib-metadata
-    #   markdown-it-py
-    #   pypdf
-urllib3==1.26.14
+typing-extensions==4.5.0
+    # via pypdf
+urllib3==1.26.15
     # via requests
-zipp==3.11.0
+zipp==3.15.0
     # via importlib-metadata

From 1563e8e90b672226d1d0d9e0ab4af550b29a6379 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Mon, 27 Mar 2023 09:25:57 +0200
Subject: [PATCH 02/11] DOC: Pin myst_parser==0.16.1 and rename
 .readthedocs.yaml (#1752)

myst_parser is pinned to that version as links are otherwise broken,
see https://github.com/py-pdf/pypdf/issues/1569

Causing-commit: 0476799bce16ed5d527c04be1a4b3ba783be50f9
---
 .readthedocs.yaml     | 24 ++++++++++++++++++++++++
 readthedocs.yml       | 14 --------------
 requirements/docs.in  |  3 +--
 requirements/docs.txt | 26 +++++++++++++-------------
 4 files changed, 38 insertions(+), 29 deletions(-)
 create mode 100644 .readthedocs.yaml
 delete mode 100644 readthedocs.yml

diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 0000000000..e96b3d12e9
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,24 @@
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+version: 2
+
+
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.11"
+
+# Build documentation in the docs/ directory with Sphinx
+sphinx:
+   configuration: docs/conf.py
+
+# If using Sphinx, optionally build your docs in additional formats such as PDF
+formats: all
+
+# Optionally declare the Python requirements required to build your docs
+python:
+  install:
+    - requirements: requirements/docs.txt
+    - method: pip
+      path: .
+      extra_requirements:
+        - full
diff --git a/readthedocs.yml b/readthedocs.yml
deleted file mode 100644
index 82300c6f90..0000000000
--- a/readthedocs.yml
+++ /dev/null
@@ -1,14 +0,0 @@
----
-version: 2
-# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
-
-formats: all
-
-build:
-  os: ubuntu-22.04
-  tools:
-    python: "3.10"
-
-python:
-  install:
-    - requirements: requirements/docs.txt
diff --git a/requirements/docs.in b/requirements/docs.in
index bbfe5118b8..58eb4813c8 100644
--- a/requirements/docs.in
+++ b/requirements/docs.in
@@ -1,5 +1,4 @@
 sphinx
 sphinx_rtd_theme
-myst_parser
--e .
+myst_parser==0.16.1
 attrs  # required for myst, but not automatically installed by myst
diff --git a/requirements/docs.txt b/requirements/docs.txt
index ac8a6a95ab..f681c64485 100644
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@@ -1,11 +1,9 @@
 #
-# This file is autogenerated by pip-compile with python 3.6
-# To update, run:
+# This file is autogenerated by pip-compile with Python 3.7
+# by the following command:
 #
-#    pip-compile --output-file=requirements/docs.txt requirements/docs.in
+#    pip-compile requirements/docs.in
 #
--e .
-    # via -r requirements/docs.in
 alabaster==0.7.13
     # via sphinx
 attrs==22.2.0
@@ -16,9 +14,7 @@ certifi==2022.12.7
     # via requests
 charset-normalizer==3.1.0
     # via requests
-colorama==0.4.6
-    # via sphinx
-docutils==0.18.1
+docutils==0.17.1
     # via
     #   myst-parser
     #   sphinx
@@ -43,19 +39,21 @@ mdit-py-plugins==0.3.5
     # via myst-parser
 mdurl==0.1.2
     # via markdown-it-py
-myst-parser==1.0.0
+myst-parser==0.16.1
     # via -r requirements/docs.in
 packaging==23.0
     # via sphinx
 pygments==2.14.0
     # via sphinx
+pytz==2023.2
+    # via babel
 pyyaml==6.0
     # via myst-parser
 requests==2.28.2
     # via sphinx
 snowballstemmer==2.2.0
     # via sphinx
-sphinx==6.1.3
+sphinx==4.5.0
     # via
     #   -r requirements/docs.in
     #   myst-parser
@@ -63,11 +61,11 @@ sphinx==6.1.3
     #   sphinxcontrib-jquery
 sphinx-rtd-theme==1.2.0
     # via -r requirements/docs.in
-sphinxcontrib-applehelp==1.0.4
+sphinxcontrib-applehelp==1.0.2
     # via sphinx
 sphinxcontrib-devhelp==1.0.2
     # via sphinx
-sphinxcontrib-htmlhelp==2.0.1
+sphinxcontrib-htmlhelp==2.0.0
     # via sphinx
 sphinxcontrib-jquery==4.1
     # via sphinx-rtd-theme
@@ -78,7 +76,9 @@ sphinxcontrib-qthelp==1.0.3
 sphinxcontrib-serializinghtml==1.1.5
     # via sphinx
 typing-extensions==4.5.0
-    # via pypdf
+    # via
+    #   importlib-metadata
+    #   markdown-it-py
 urllib3==1.26.15
     # via requests
 zipp==3.15.0

From 0917dfccfbed26ad940cf3f2f0e89f1f31ac2d54 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Wed, 29 Mar 2023 17:42:26 +0200
Subject: [PATCH 03/11] SEC: Warn about PDF encryption security (#1755)

See #1754
---
 docs/user/encryption-decryption.md |  5 +++++
 pypdf/_writer.py                   |  6 ++++++
 tests/test_workflows.py            |  3 ++-
 tests/test_writer.py               | 11 ++++++-----
 4 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/docs/user/encryption-decryption.md b/docs/user/encryption-decryption.md
index b95b33f86d..ff211320c6 100644
--- a/docs/user/encryption-decryption.md
+++ b/docs/user/encryption-decryption.md
@@ -5,6 +5,11 @@
 
 ## Encrypt
 
+> ⚠️ WARNING ⚠️: pypdf only implements [RC4 encryption](https://en.wikipedia.org/wiki/RC4).
+> This encryption algorithm is insecure. The more modern and secure AES
+> encryption is not implemented. pypdf can only decrypt, but not encrypt with
+> AES.
+
 Add a password to a PDF (encrypt it):
 
 ```python
diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index b0ae266ebd..5501e58bba 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -1027,6 +1027,12 @@ def encrypt(
                 5 and 6 control annotations, 9 for form fields,
                 10 for extraction of text and graphics.
         """
+        warnings.warn(
+            "pypdf only implements RC4 encryption so far. "
+            "The RC4 algorithm is insecure. Either use a library that supports "
+            "AES for encryption or put the PDF in an encrypted container, "
+            "for example an encrypted ZIP file."
+        )
         if user_pwd is not None:
             if user_password is not None:
                 raise ValueError(
diff --git a/tests/test_workflows.py b/tests/test_workflows.py
index f8c9660bb7..f3552deb84 100644
--- a/tests/test_workflows.py
+++ b/tests/test_workflows.py
@@ -62,7 +62,8 @@ def test_basic_features(tmp_path):
 
     # encrypt your new PDF and add a password
     password = "secret"
-    writer.encrypt(password)
+    with pytest.warns(UserWarning, match="pypdf only implements RC4 encryption"):
+        writer.encrypt(password)
 
     # finally, write "output" to pypdf-output.pdf
     write_path = tmp_path / "pypdf-output.pdf"
diff --git a/tests/test_writer.py b/tests/test_writer.py
index 7eafe5cdbc..10943c5096 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -479,11 +479,12 @@ def test_encrypt(use_128bit, user_password, owner_password, pdf_file_path):
     orig_text = page.extract_text()
 
     writer.add_page(page)
-    writer.encrypt(
-        user_password=user_password,
-        owner_password=owner_password,
-        use_128bit=use_128bit,
-    )
+    with pytest.warns(UserWarning, match="pypdf only implements RC4 encryption"):
+        writer.encrypt(
+            user_password=user_password,
+            owner_password=owner_password,
+            use_128bit=use_128bit,
+        )
 
     # write "output" to pypdf-output.pdf
     with open(pdf_file_path, "wb") as output_stream:

From b385ce9acd9e398710b52212b64fa4c9594fc3c3 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Fri, 31 Mar 2023 19:22:58 +0200
Subject: [PATCH 04/11] DEV: Make make_changelog.py idempotent

---
 make_changelog.py | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/make_changelog.py b/make_changelog.py
index b375c8f251..59efb30df3 100644
--- a/make_changelog.py
+++ b/make_changelog.py
@@ -37,8 +37,11 @@ def main(changelog_path: str) -> None:
     new_entry = header + changes + trailer
     print(new_entry)
 
-    # TODO: Make idempotent - multiple calls to this script
-    # should not change the changelog
+    # Make the script idempotent by checking if the new entry is already in the changelog
+    if new_entry in changelog:
+        print("Changelog is already up-to-date!")
+        return
+
     new_changelog = new_entry + changelog
     write_changelog(new_changelog, changelog_path)
 
@@ -105,8 +108,21 @@ def get_formatted_changes(git_tag: str) -> str:
         grouped[commit.prefix].append({"msg": commit.message})
 
     # Order prefixes
-    order = ["DEP", "ENH", "PI", "BUG", "ROB", "DOC", "DEV", "MAINT", "TST", "STY"]
+    order = [
+        "SEC",
+        "DEP",
+        "ENH",
+        "PI",
+        "BUG",
+        "ROB",
+        "DOC",
+        "DEV",
+        "MAINT",
+        "TST",
+        "STY",
+    ]
     abbrev2long = {
+        "SEC": "Security",
         "DEP": "Deprecations",
         "ENH": "New Features",
         "BUG": "Bug Fixes",

From 8146729eeb0f90478f2686f1dc395b545b49ba8c Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Thu, 6 Apr 2023 14:11:27 +0200
Subject: [PATCH 05/11] ROB: Capture UnicodeDecodeError at PdfReader.pdf_header
 (#1768)

Fixes #1758
---
 pypdf/_reader.py     | 28 +++++++++++++++++-----------
 tests/test_reader.py | 42 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 58 insertions(+), 12 deletions(-)

diff --git a/pypdf/_reader.py b/pypdf/_reader.py
index 89e7d248fb..36aa642122 100644
--- a/pypdf/_reader.py
+++ b/pypdf/_reader.py
@@ -32,7 +32,7 @@
 import struct
 import zlib
 from datetime import datetime
-from io import BytesIO
+from io import BytesIO, UnsupportedOperation
 from pathlib import Path
 from typing import (
     Any,
@@ -360,7 +360,7 @@ def pdf_header(self) -> str:
         #       but that needs a deprecation
         loc = self.stream.tell()
         self.stream.seek(0, 0)
-        pdf_file_version = self.stream.read(8).decode("utf-8")
+        pdf_file_version = self.stream.read(8).decode("utf-8", "backslashreplace")
         self.stream.seek(loc, 0)  # return to where it was
         return pdf_file_version
 
@@ -1541,19 +1541,22 @@ def read(self, stream: StreamType) -> None:
 
     def _basic_validation(self, stream: StreamType) -> None:
         """Ensure file is not empty. Read at most 5 bytes."""
-        # start at the end:
-        stream.seek(0, os.SEEK_END)
-        if not stream.tell():
-            raise EmptyFileError("Cannot read an empty file")
-        if self.strict:
-            stream.seek(0, os.SEEK_SET)
+        stream.seek(0, os.SEEK_SET)
+        try:
             header_byte = stream.read(5)
-            if header_byte != b"%PDF-":
+        except UnicodeDecodeError:
+            raise UnsupportedOperation("cannot read header")
+        if header_byte == b"":
+            raise EmptyFileError("Cannot read an empty file")
+        elif header_byte != b"%PDF-":
+            if self.strict:
                 raise PdfReadError(
                     f"PDF starts with '{header_byte.decode('utf8')}', "
                     "but '%PDF-' expected"
                 )
-            stream.seek(0, os.SEEK_END)
+            else:
+                logger_warning(f"invalid pdf header: {header_byte}", __name__)
+        stream.seek(0, os.SEEK_END)
 
     def _find_eof_marker(self, stream: StreamType) -> None:
         """
@@ -1567,7 +1570,10 @@ def _find_eof_marker(self, stream: StreamType) -> None:
         line = b""
         while line[:5] != b"%%EOF":
             if stream.tell() < HEADER_SIZE:
-                raise PdfReadError("EOF marker not found")
+                if self.strict:
+                    raise PdfReadError("EOF marker not found")
+                else:
+                    logger_warning("EOF marker not found", __name__)
             line = read_previous_line(stream)
 
     def _find_startxref_pos(self, stream: StreamType) -> int:
diff --git a/tests/test_reader.py b/tests/test_reader.py
index e2ccd6da52..967d2d1bf1 100644
--- a/tests/test_reader.py
+++ b/tests/test_reader.py
@@ -460,10 +460,16 @@ def test_read_empty():
     assert exc.value.args[0] == "Cannot read an empty file"
 
 
-def test_read_malformed_header():
+def test_read_malformed_header(caplog):
     with pytest.raises(PdfReadError) as exc:
         PdfReader(io.BytesIO(b"foo"), strict=True)
     assert exc.value.args[0] == "PDF starts with 'foo', but '%PDF-' expected"
+    caplog.clear()
+    try:
+        PdfReader(io.BytesIO(b"foo"), strict=False)
+    except Exception:
+        pass
+    assert caplog.messages[0].startswith("invalid pdf header")
 
 
 def test_read_malformed_body():
@@ -1352,3 +1358,37 @@ def test_iss1710():
     name = "irbookonlinereading.pdf"
     in_pdf = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
     in_pdf.outline
+
+
+def test_broken_file_header():
+    pdf_data = (
+        b"%%PDF-\xa0sd\n"
+        b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n"
+        b"2 0 obj << >> endobj\n"
+        b"3 0 obj << >> endobj\n"
+        b"4 0 obj << /Contents 3 0 R /CropBox [0.0 0.0 2550.0 3508.0]"
+        b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R"
+        b" /Resources << /Font << >> >>"
+        b" /Rotate 0 /Type /Page >> endobj\n"
+        b"5 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n"
+        b"xref 1 5\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"%010d 00000 n\n"
+        b"trailer << %s/Root 5 0 R /Size 6 >>\n"
+        b"startxref %d\n"
+        b"%%%%EOF"
+    )
+    with_prev_0 = True
+    pdf_data = pdf_data % (
+        pdf_data.find(b"1 0 obj"),
+        pdf_data.find(b"2 0 obj"),
+        pdf_data.find(b"3 0 obj"),
+        pdf_data.find(b"4 0 obj"),
+        pdf_data.find(b"5 0 obj"),
+        b"/Prev 0 " if with_prev_0 else b"",
+        pdf_data.find(b"xref") - 1,
+    )
+    PdfReader(io.BytesIO(pdf_data))

From f26388e7d85eeb4e216046c8fd70c71ab4fb5dfd Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Fri, 7 Apr 2023 15:16:32 +0200
Subject: [PATCH 06/11] ROB: Prevent loop in Cloning (#1770)

An issue was occurring with object 589/0 of file https://github.com/py-pdf/pypdf/files/11138472/test.pdf, which caused a loop during cloning due to its correspondence with both the file trailer and an XObject for filled text ("test"). This behavior was not intended, and a robustness improvement has been made to prevent the loop. Please note that if you run your code, the text "test" may be hidden by the trailer object.

Fixes  #1767
---
 pypdf/_protocols.py               |  3 +++
 pypdf/generic/_base.py            |  5 ++++-
 pypdf/generic/_data_structures.py |  5 ++++-
 tests/test_writer.py              | 11 +++++++++++
 4 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py
index 85e9e0a568..ba6cd8a3c9 100644
--- a/pypdf/_protocols.py
+++ b/pypdf/_protocols.py
@@ -73,6 +73,9 @@ def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]:
     def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO]:
         ...
 
+    def _add_object(self, obj: Any) -> Any:
+        ...
+
     @property
     def pages(self) -> List[Any]:
         ...
diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py
index f75e66dd64..be3d71c457 100644
--- a/pypdf/generic/_base.py
+++ b/pypdf/generic/_base.py
@@ -277,7 +277,10 @@ def clone(
                 obj = NullObject()
                 assert isinstance(self, (IndirectObject,))
                 obj.indirect_reference = self
-            dup = obj.clone(pdf_dest, force_duplicate, ignore_fields)
+            dup = pdf_dest._add_object(
+                obj.clone(pdf_dest, force_duplicate, ignore_fields)
+            )
+        # asserts added to prevent errors in mypy
         assert dup is not None
         assert dup.indirect_reference is not None
         return dup.indirect_reference
diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
index 1fd196027c..b8aaf12d47 100644
--- a/pypdf/generic/_data_structures.py
+++ b/pypdf/generic/_data_structures.py
@@ -750,7 +750,10 @@ def _clone(
             if decoded_self is None:
                 self.decoded_self = None
             else:
-                self.decoded_self = decoded_self.clone(pdf_dest, True, ignore_fields)  # type: ignore[assignment]
+                self.decoded_self = cast(
+                    "DecodedStreamObject",
+                    decoded_self.clone(pdf_dest, force_duplicate, ignore_fields),
+                )
         except Exception:
             pass
         super()._clone(src, pdf_dest, force_duplicate, ignore_fields)
diff --git a/tests/test_writer.py b/tests/test_writer.py
index 10943c5096..5066eecb65 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -1292,3 +1292,14 @@ def test_iss1723():
     in_pdf = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
     out_pdf = PdfWriter()
     out_pdf.append(in_pdf, (3, 5))
+
+
+@pytest.mark.enable_socket()
+def test_iss1767():
+    # test with a pdf which is buggy because the object 389,0 exists 3 times:
+    # twice to define catalog and one as an XObject inducing a loop when
+    # cloning
+    url = "https://github.com/py-pdf/pypdf/files/11138472/test.pdf"
+    name = "iss1723.pdf"
+    in_pdf = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    PdfWriter(clone_from=in_pdf)

From bb2603ee73b5ef0564d25feae57483b7d1930d21 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sat, 8 Apr 2023 15:12:46 +0200
Subject: [PATCH 07/11] STY: Test names, docstrings, and parametrization
 (#1771)

---
 docs/dev/testing.md      |  21 +++++
 tests/test_cmap.py       | 160 +++++++++++++++++++--------------------
 tests/test_constants.py  |   9 +++
 tests/test_encryption.py |  54 ++++++++++---
 tests/test_filters.py    |  25 +++---
 tests/test_papersizes.py |   9 ++-
 tests/test_reader.py     |   2 +-
 tests/test_utils.py      |  28 +++----
 tests/test_xmp.py        |  63 +++++++++------
 9 files changed, 223 insertions(+), 148 deletions(-)

diff --git a/docs/dev/testing.md b/docs/dev/testing.md
index 2e7fb7f19d..30259d4335 100644
--- a/docs/dev/testing.md
+++ b/docs/dev/testing.md
@@ -34,3 +34,24 @@ pyenv install 3.10.2
 
 Then you can execute `tox` which will create a coverage report in HTML form
 in the end. The execution takes about 30 minutes.
+
+
+## Docstrings in Unit tests
+
+The first line of a docstring in a unit test should be written in a way that
+you could prefix it with "This test ensures that ...", e.g.
+
+* Invalid XML in xmp_metadata is gracefully handled.
+* The identity is returning its input.
+* xmp_modify_date is extracted correctly.
+
+This way, plugins like [`pytest-testdox`](https://pypi.org/project/pytest-testdox/)
+can generate really nice output when the tests are running. This looks similar
+to the output of [mocha.js](https://mochajs.org/).
+
+If the test is a regression test, write
+
+> This test is a regression test for issue #1234
+
+If the regression test is just one parameter of other tests, then add it as
+a comment for that parameter.
diff --git a/tests/test_cmap.py b/tests/test_cmap.py
index a371b92fe6..666d3ecfa0 100644
--- a/tests/test_cmap.py
+++ b/tests/test_cmap.py
@@ -12,28 +12,79 @@
 
 @pytest.mark.enable_socket()
 @pytest.mark.slow()
-def test_compute_space_width():
-    url = "https://corpora.tika.apache.org/base/docs/govdocs1/923/923406.pdf"
-    name = "tika-923406.pdf"
-
-    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+@pytest.mark.parametrize(
+    ("url", "name", "strict"),
+    [
+        # compute_space_width:
+        (
+            "https://corpora.tika.apache.org/base/docs/govdocs1/923/923406.pdf",
+            "tika-923406.pdf",
+            False,
+        ),
+        # _parse_to_unicode_process_rg:
+        (
+            "https://corpora.tika.apache.org/base/docs/govdocs1/959/959173.pdf",
+            "tika-959173.pdf",
+            False,
+        ),
+        (
+            "https://corpora.tika.apache.org/base/docs/govdocs1/959/959173.pdf",
+            "tika-959173.pdf",
+            True,
+        ),
+        # issue #1718:
+        (
+            "https://github.com/py-pdf/pypdf/files/10983477/Ballinasloe_WS.pdf",
+            "iss1718.pdf",
+            False,
+        ),
+    ],
+)
+def test_text_extraction_slow(caplog, url: str, name: str, strict: bool):
+    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)), strict=strict)
     for page in reader.pages:
         page.extract_text()
+    assert caplog.text == ""
 
 
 @pytest.mark.enable_socket()
-@pytest.mark.slow()
-def test_parse_to_unicode_process_rg():
-    url = "https://corpora.tika.apache.org/base/docs/govdocs1/959/959173.pdf"
-    name = "tika-959173.pdf"
-
-    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
-    for page in reader.pages:
-        page.extract_text()
-
-    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)), strict=True)
+@pytest.mark.parametrize(
+    ("url", "name", "strict"),
+    [
+        # bfchar_on_2_chars: issue #1293
+        (
+            "https://github.com/xyegithub/myBlog/raw/main/posts/c94b2364/paper_pdfs/ImageClassification/"
+            "2007%2CASurveyofImageClassificationBasedTechniques.pdf",
+            "ASurveyofImageClassificationBasedTechniques.pdf",
+            False,
+        ),
+        # L40, get_font_width_from_default
+        (
+            "https://corpora.tika.apache.org/base/docs/govdocs1/908/908104.pdf",
+            "tika-908104.pdf",
+            False,
+        ),
+        # multiline_bfrange / regression test for issue #1285:
+        (
+            "https://github.com/alexanderquispe/1REI05/raw/main/reports/report_1/"
+            "The%20lean%20times%20in%20the%20Peruvian%20economy.pdf",
+            "The%20lean%20times%20in%20the%20Peruvian%20economy.pdf",
+            False,
+        ),
+        (
+            "https://github.com/yxj-HGNwmb5kdp8ewr/yxj-HGNwmb5kdp8ewr.github.io/raw/master/files/"
+            "Giacalone%20Llobell%20Jaeger%20(2022)%20Food%20Qual%20Prefer.pdf",
+            "Giacalone.pdf",
+            False,
+        ),
+    ],
+)
+def test_text_extraction_fast(caplog, url: str, name: str, strict: bool):
+    """Text extraction runs without exceptions or warnings"""
+    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)), strict=strict)
     for page in reader.pages:
         page.extract_text()
+    assert caplog.text == ""
 
 
 @pytest.mark.enable_socket()
@@ -47,49 +98,6 @@ def test_parse_encoding_advanced_encoding_not_implemented():
             page.extract_text()
 
 
-@pytest.mark.enable_socket()
-def test_get_font_width_from_default():  # L40
-    url = "https://corpora.tika.apache.org/base/docs/govdocs1/908/908104.pdf"
-    name = "tika-908104.pdf"
-    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
-    for page in reader.pages:
-        page.extract_text()
-
-
-@pytest.mark.enable_socket()
-def test_multiline_bfrange():
-    # non regression test for iss_1285
-    url = (
-        "https://github.com/alexanderquispe/1REI05/raw/main/reports/report_1/"
-        "The%20lean%20times%20in%20the%20Peruvian%20economy.pdf"
-    )
-    name = "tika-908104.pdf"
-    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
-    for page in reader.pages:
-        page.extract_text()
-    url = (
-        "https://github.com/yxj-HGNwmb5kdp8ewr/yxj-HGNwmb5kdp8ewr.github.io/raw/master/files/"
-        "Giacalone%20Llobell%20Jaeger%20(2022)%20Food%20Qual%20Prefer.pdf"
-    )
-    name = "Giacalone.pdf"
-    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
-    for page in reader.pages:
-        page.extract_text()
-
-
-@pytest.mark.enable_socket()
-def test_bfchar_on_2_chars():
-    # iss #1293
-    url = (
-        "https://github.com/xyegithub/myBlog/raw/main/posts/c94b2364/paper_pdfs/ImageClassification/"
-        "2007%2CASurveyofImageClassificationBasedTechniques.pdf"
-    )
-    name = "ASurveyofImageClassificationBasedTechniques.pdf"
-    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
-    for page in reader.pages:
-        page.extract_text()
-
-
 @pytest.mark.enable_socket()
 def test_ascii_charset():
     # iss #1312
@@ -100,19 +108,21 @@ def test_ascii_charset():
 
 
 @pytest.mark.enable_socket()
-def test_iss1370():
-    url = "https://github.com/py-pdf/pypdf/files/9667138/cmap1370.pdf"
-    name = "cmap1370.pdf"
-    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
-    reader.pages[0].extract_text()
-
-
-@pytest.mark.enable_socket()
-def test_iss1379():
-    url = "https://github.com/py-pdf/pypdf/files/9712729/02voc.pdf"
-    name = "02voc.pdf"
+@pytest.mark.parametrize(
+    ("url", "name", "page_nb"),
+    [
+        (
+            "https://github.com/py-pdf/pypdf/files/9667138/cmap1370.pdf",
+            "cmap1370.pdf",
+            0,
+        ),
+        ("https://github.com/py-pdf/pypdf/files/9712729/02voc.pdf", "02voc.pdf", 2),
+    ],
+    ids=["iss1370", "iss1379"],
+)
+def test_text_extraction_of_specific_pages(url: str, name: str, page_nb: int):
     reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
-    reader.pages[2].extract_text()
+    reader.pages[page_nb].extract_text()
 
 
 @pytest.mark.enable_socket()
@@ -122,13 +132,3 @@ def test_iss1533():
     reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
     reader.pages[0].extract_text()  # no error
     assert build_char_map("/F", 200, reader.pages[0])[3]["\x01"] == "Ü"
-
-
-@pytest.mark.enable_socket()
-def test_iss1718(caplog):
-    url = "https://github.com/py-pdf/pypdf/files/10983477/Ballinasloe_WS.pdf"
-    name = "iss1718.pdf"
-    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
-    for p in reader.pages:
-        _txt = p.extract_text()
-    assert caplog.text == ""
diff --git a/tests/test_constants.py b/tests/test_constants.py
index ab3166f7c0..da4f307ccc 100644
--- a/tests/test_constants.py
+++ b/tests/test_constants.py
@@ -6,6 +6,15 @@
 
 
 def test_slash_prefix():
+    """
+    Naming conventions of PDF_KEYS (constant names) are followed.
+
+    This test function validates if PDF key names follow the required pattern:
+    - Starts with a slash '/'
+    - Followed by an uppercase letter
+    - Contains alphanumeric characters (letters and digits)
+    - The attribute name should be a case-insensitive match, with underscores removed
+    """
     pattern = re.compile(r"^\/[A-Z]+[a-zA-Z0-9]*$")
     for cls in PDF_KEYS:
         for attr in dir(cls):
diff --git a/tests/test_encryption.py b/tests/test_encryption.py
index e2e09de095..33c86f85b5 100644
--- a/tests/test_encryption.py
+++ b/tests/test_encryption.py
@@ -70,6 +70,15 @@
     ],
 )
 def test_encryption(name, requires_pycryptodome):
+    """
+    Encrypted PDFs are handled correctly.
+
+    This test function ensures that:
+    - If PyCryptodome is not available and required, a DependencyError is raised
+    - Encrypted PDFs are identified correctly
+    - Decryption works for encrypted PDFs
+    - Metadata is properly extracted from the decrypted PDF
+    """
     inputfile = RESOURCE_ROOT / "encryption" / name
     if requires_pycryptodome and not HAS_PYCRYPTODOME:
         with pytest.raises(DependencyError) as exc:
@@ -108,7 +117,16 @@ def test_encryption(name, requires_pycryptodome):
     ],
 )
 @pytest.mark.skipif(not HAS_PYCRYPTODOME, reason="No pycryptodome")
-def test_both_password(name, user_passwd, owner_passwd):
+def test_pdf_with_both_passwords(name, user_passwd, owner_passwd):
+    """
+    PDFs with both user and owner passwords are handled correctly.
+
+    This test function ensures that:
+    - Encrypted PDFs with both user and owner passwords are identified correctly
+    - Decryption works for both user and owner passwords
+    - The correct password type is returned after decryption
+    - The number of pages is correctly identified after decryption
+    """
     inputfile = RESOURCE_ROOT / "encryption" / name
     ipdf = pypdf.PdfReader(inputfile)
     assert ipdf.is_encrypted
@@ -125,9 +143,9 @@ def test_both_password(name, user_passwd, owner_passwd):
     ],
 )
 @pytest.mark.skipif(not HAS_PYCRYPTODOME, reason="No pycryptodome")
-def test_get_page_of_encrypted_file_new_algorithm(pdffile, password):
+def test_read_page_from_encrypted_file_aes_256(pdffile, password):
     """
-    Check if we can read a page of an encrypted file.
+    A page can be read from an encrypted file.
 
     This is a regression test for issue 327:
     IndexError for get_page() of decrypted file
@@ -150,7 +168,8 @@ def test_get_page_of_encrypted_file_new_algorithm(pdffile, password):
     ],
 )
 @pytest.mark.skipif(not HAS_PYCRYPTODOME, reason="No pycryptodome")
-def test_encryption_merge(names):
+def test_merge_encrypted_pdfs(names):
+    """Encrypted PDFs can be merged after decryption."""
     merger = pypdf.PdfMerger()
     files = [RESOURCE_ROOT / "encryption" / x for x in names]
     pdfs = [pypdf.PdfReader(x) for x in files]
@@ -168,24 +187,27 @@ def test_encryption_merge(names):
         CryptRC4,
     ],
 )
-def test_encrypt_decrypt_class(cryptcls):
+def test_encrypt_decrypt_with_cipher_class(cryptcls):
+    """Encryption and decryption using a cipher class work as expected."""
     message = b"Hello World"
     key = bytes(0 for _ in range(128))  # b"secret key"
     crypt = cryptcls(key)
     assert crypt.decrypt(crypt.encrypt(message)) == message
 
 
-def test_decrypt_not_decrypted_pdf():
+def test_attempt_decrypt_unencrypted_pdf():
+    """Attempting to decrypt an unencrypted PDF raises a PdfReadError."""
     path = RESOURCE_ROOT / "crazyones.pdf"
     with pytest.raises(PdfReadError) as exc:
         PdfReader(path, password="nonexistant")
     assert exc.value.args[0] == "Not encrypted file"
 
 
-def test_generate_values():
+def test_alg_v5_generate_values():
     """
-    This test only checks if there is an exception.
+    Algorithm V5 values are generated without raising exceptions.
 
+    This test function checks if there is an exception during the value generation.
     It does not verify that the content is correct.
     """
     if not HAS_PYCRYPTODOME:
@@ -207,13 +229,21 @@ def test_generate_values():
     }
 
 
-def test_randrange():
-    # This might randomly fail in very rare cases
+def test_randrange_function():
+    """
+    _randrange() generates every integer in the range, excluding the upper bound.
+
+    This test might randomly fail in very rare cases.
+    """
     random_set = {_randrange(0, 10) for _ in range(1000)}
     assert random_set == {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
 
 
-def test_randint():
-    # This might randomly fail in very rare cases
+def test_randint_function():
+    """
+    _randint() generates every integer in the range, including the upper bound.
+
+    This test might randomly fail in very rare cases.
+    """
     random_set = {_randint(0, 10) for _ in range(1000)}
     assert random_set == {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
diff --git a/tests/test_filters.py b/tests/test_filters.py
index 58baef1706..80bf4af0fe 100644
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -35,8 +35,8 @@
 @pytest.mark.parametrize(
     ("predictor", "s"), list(cartesian_product([1], filter_inputs))
 )
-def test_flatedecode(predictor, s):
-    """Tests FlateDecode decode() and encode() methods."""
+def test_flate_decode_encode(predictor, s):
+    """FlateDecode encode() and decode() methods work as expected."""
     codec = FlateDecode()
     s = s.encode()
     encoded = codec.encode(s)
@@ -45,11 +45,13 @@ def test_flatedecode(predictor, s):
 
 def test_flatedecode_unsupported_predictor():
     """
-    Inputs an unsupported predictor (outside the [10, 15] range) checking that
-    PdfReadError() is raised.
+    FlateDecode raises PdfReadError for unsupported predictors.
 
-    Once this predictor support is updated in the future, this test case may be
-    removed.
+    Predictors outside the [10, 15] range are not supported.
+
+    This test function checks that a PdfReadError is raised when decoding with
+    unsupported predictors. Once this predictor support is updated in the
+    future, this test case may be removed.
     """
     codec = FlateDecode()
     predictors = (-10, -1, 0, 9, 16, 20, 100)
@@ -63,7 +65,8 @@ def test_flatedecode_unsupported_predictor():
 @pytest.mark.parametrize(
     "params", [ArrayObject([]), ArrayObject([{"/Predictor": 1}]), "a"]
 )
-def test_flatedecode_decompress_array_params(params):
+def test_flate_decode_decompress_with_array_params(params):
+    """FlateDecode decode() method works correctly with array parameters."""
     codec = FlateDecode()
     s = ""
     s = s.encode()
@@ -106,7 +109,7 @@ def test_flatedecode_decompress_array_params(params):
         "whitespace",
     ],
 )
-def test_ascii_hex_decode(data, expected):
+def test_ascii_hex_decode_method(data, expected):
     """
     Feeds a bunch of values to ASCIIHexDecode.decode() and ensures the
     correct output is returned.
@@ -118,8 +121,8 @@ def test_ascii_hex_decode(data, expected):
     assert ASCIIHexDecode.decode(data) == expected
 
 
-def test_ascii_hex_decode_no_eod():
-    """Ensuring an exception is raised when no EOD character is present."""
+def test_ascii_hex_decode_missing_eod():
+    """ASCIIHexDecode.decode() raises error when no EOD character is present."""
     with pytest.raises(PdfStreamError) as exc:
         ASCIIHexDecode.decode("")
     assert exc.value.args[0] == "Unexpected EOD in ASCIIHexDecode"
@@ -146,6 +149,8 @@ def test_ascii85decode_with_overflow():
 
 def test_ascii85decode_five_zero_bytes():
     """
+    ASCII85Decode handles the special case of five zero bytes correctly.
+
     From ISO 32000 (2008) §7.4.3:
 
     «As a special case, if all five bytes are 0, they shall be represented by
diff --git a/tests/test_papersizes.py b/tests/test_papersizes.py
index d50948bf5d..c38a2e9fb5 100644
--- a/tests/test_papersizes.py
+++ b/tests/test_papersizes.py
@@ -4,7 +4,8 @@
 from pypdf import papersizes
 
 
-def test_din_a0():
+def test_din_a0_paper_size():
+    """The dimensions and area of the DIN A0 paper size are correct."""
     dim = papersizes.PaperSize.A0
     area_square_pixels = float(dim.width) * dim.height
 
@@ -20,7 +21,8 @@ def test_din_a0():
 
 
 @pytest.mark.parametrize("dimensions", papersizes._din_a)
-def test_din_a_ratio(dimensions):
+def test_din_a_aspect_ratio(dimensions):
+    """The aspect ratio of DIN A paper sizes is correct."""
     assert abs(dimensions.height - dimensions.width * 2**0.5) <= 2.5
 
 
@@ -28,5 +30,6 @@ def test_din_a_ratio(dimensions):
     ("dimensions_a", "dimensions_b"),
     list(zip(papersizes._din_a, papersizes._din_a[1:])),
 )
-def test_din_a_doubling(dimensions_a, dimensions_b):
+def test_din_a_size_doubling(dimensions_a, dimensions_b):
+    """The height of a DIN A paper size doubles when moving to the next size."""
     assert abs(dimensions_a.height - 2 * dimensions_b.width) <= 4
diff --git a/tests/test_reader.py b/tests/test_reader.py
index 967d2d1bf1..c1c24fb460 100644
--- a/tests/test_reader.py
+++ b/tests/test_reader.py
@@ -1191,7 +1191,7 @@ def test_outline_with_invalid_destinations():
 
 @pytest.mark.enable_socket()
 def test_pdfreader_multiple_definitions(caplog):
-    # iss325
+    """Regression test for issue #325."""
     url = "https://github.com/py-pdf/pypdf/files/9176644/multipledefs.pdf"
     name = "multipledefs.pdf"
     reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
diff --git a/tests/test_utils.py b/tests/test_utils.py
index cfc8d7f883..abb022db52 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -124,21 +124,6 @@ def test_paeth_predictor(left, up, upleft, expected):
     assert pypdf._utils.paeth_predictor(left, up, upleft) == expected
 
 
-@pytest.mark.parametrize(
-    ("dat", "pos", "to_read"),
-    [
-        (b"", 0, 1),
-        (b"a", 0, 1),
-        (b"abc", 0, 10),
-    ],
-)
-def test_read_block_backwards_errs(dat, pos, to_read):
-    with pytest.raises(PdfStreamError) as _:
-        s = io.BytesIO(dat)
-        s.seek(pos)
-        read_block_backwards(s, to_read)
-
-
 @pytest.mark.parametrize(
     ("dat", "pos", "to_read", "expected", "expected_pos"),
     [
@@ -149,12 +134,19 @@ def test_read_block_backwards_errs(dat, pos, to_read):
         (b"abc", 3, 1, b"c", 2),
         (b"abc", 3, 2, b"bc", 1),
         (b"abc", 3, 3, b"abc", 0),
+        (b"", 0, 1, None, 0),
+        (b"a", 0, 1, None, 0),
+        (b"abc", 0, 10, None, 0),
     ],
 )
 def test_read_block_backwards(dat, pos, to_read, expected, expected_pos):
     s = io.BytesIO(dat)
     s.seek(pos)
-    assert read_block_backwards(s, to_read) == expected
+    if expected is not None:
+        assert read_block_backwards(s, to_read) == expected
+    else:
+        with pytest.raises(PdfStreamError):
+            read_block_backwards(s, to_read)
     assert s.tell() == expected_pos
 
 
@@ -264,10 +256,12 @@ def test_escapedcode_followed_by_int():
     ],
 )
 def test_human_readable_bytes(input_int, expected_output):
+    """_human_readable_bytes correctly transforms the integer to a string."""
     assert _human_readable_bytes(input_int) == expected_output
 
 
-def test_file():
+def test_file_class():
+    """File class can be instantiated and string representation is ok."""
     f = File(name="image.png", data=b"")
     assert str(f) == "File(name=image.png, data: 0 Byte)"
     assert repr(f) == "File(name=image.png, data: 0 Byte, hash: 0)"
diff --git a/tests/test_xmp.py b/tests/test_xmp.py
index 1fc1184ab2..50555b476d 100644
--- a/tests/test_xmp.py
+++ b/tests/test_xmp.py
@@ -24,7 +24,8 @@
         (RESOURCE_ROOT / "crazyones.pdf", False),
     ],
 )
-def test_read_xmp(src, has_xmp):
+def test_read_xmp_metadata(src, has_xmp):
+    """Read XMP metadata from PDF files."""
     reader = PdfReader(src)
     xmp = reader.xmp_metadata
     assert (xmp is None) == (not has_xmp)
@@ -39,6 +40,7 @@ def test_read_xmp(src, has_xmp):
 
 
 def get_all_tiff(xmp: pypdf.xmp.XmpInformation):
+    """Return all TIFF metadata as a dictionary."""
     data = {}
     tiff_ns = xmp.get_nodes_in_namespace(
         about_uri="", namespace="http://ns.adobe.com/tiff/1.0/"
@@ -51,30 +53,29 @@ def get_all_tiff(xmp: pypdf.xmp.XmpInformation):
     return data
 
 
-def test_regression_issue774():
+def test_converter_date():
+    """
+    _converter_date returns the correct datetime.
+
+    This is a regression test for issue #774.
+    """
     date = pypdf.xmp._converter_date("2021-04-28T12:23:34.123Z")
-    assert date.year == 2021
-    assert date.month == 4
-    assert date.day == 28
-    assert date.hour == 12
-    assert date.minute == 23
-    assert date.second == 34
-    assert date.microsecond == 123000
+    assert date == datetime(2021, 4, 28, 12, 23, 34, 123000)
+
     with pytest.raises(ValueError) as exc:
         pypdf.xmp._converter_date("today")
     assert exc.value.args[0].startswith("Invalid date format")
 
     date = pypdf.xmp._converter_date("2021-04-28T12:23:01-03:00")
-    assert date.year == 2021
-    assert date.month == 4
-    assert date.day == 28
-    assert date.hour == 15
-    assert date.minute == 23
-    assert date.second == 1
-    assert date.microsecond == 0
+    assert date == datetime(2021, 4, 28, 15, 23, 1)
 
 
-def test_regression_issue914():
+def test_modify_date():
+    """
+    xmp_modify_date is extracted correctly.
+
+    This is a regression test for issue #914.
+    """
     path = RESOURCE_ROOT / "issue-914-xmp-data.pdf"
     reader = PdfReader(path)
     assert reader.xmp_metadata.xmp_modify_date == datetime(2022, 4, 9, 15, 22, 43)
@@ -84,7 +85,8 @@ def test_regression_issue914():
     "x",
     ["a", 42, 3.141, False, True],
 )
-def test_identity(x):
+def test_identity_function(x):
+    """The identity function returns its input."""
     assert pypdf.xmp._identity(x) == x
 
 
@@ -99,7 +101,8 @@ def test_identity(x):
         )
     ],
 )
-def test_xmpmm(url, name, xmpmm_instance_id):
+def test_xmpmm_instance_id(url, name, xmpmm_instance_id):
+    """XMPMM instance id is correctly extracted."""
     reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
     xmp_metadata = reader.xmp_metadata
     assert xmp_metadata.xmpmm_instance_id == xmpmm_instance_id
@@ -108,7 +111,8 @@ def test_xmpmm(url, name, xmpmm_instance_id):
 
 
 @pytest.mark.enable_socket()
-def test_dc_description():
+def test_xmp_dc_description_extraction():
+    """XMP dc_description is correctly extracted."""
     url = "https://corpora.tika.apache.org/base/docs/govdocs1/953/953770.pdf"
     name = "tika-953770.pdf"
     reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
@@ -123,7 +127,8 @@ def test_dc_description():
 
 
 @pytest.mark.enable_socket()
-def test_dc_creator():
+def test_dc_creator_extraction():
+    """XMP dc_creator is correctly extracted."""
     url = "https://corpora.tika.apache.org/base/docs/govdocs1/953/953770.pdf"
     name = "tika-953770.pdf"
     reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
@@ -134,7 +139,8 @@ def test_dc_creator():
 
 
 @pytest.mark.enable_socket()
-def test_custom_properties():
+def test_custom_properties_extraction():
+    """XMP custom_properties is correctly extracted."""
     url = "https://corpora.tika.apache.org/base/docs/govdocs1/986/986065.pdf"
     name = "tika-986065.pdf"
     reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
@@ -145,7 +151,8 @@ def test_custom_properties():
 
 
 @pytest.mark.enable_socket()
-def test_dc_subject():
+def test_dc_subject_extraction():
+    """XMP dc_subject is correctly extracted."""
     url = "https://corpora.tika.apache.org/base/docs/govdocs1/959/959519.pdf"
     name = "tika-959519.pdf"
     reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
@@ -176,7 +183,12 @@ def test_dc_subject():
 
 
 @pytest.mark.enable_socket()
-def test_issue585():
+def test_invalid_xmp_information_handling():
+    """
+    Invalid XML in xmp_metadata is gracefully handled.
+
+    This is a regression test for issue #585.
+    """
     url = "https://github.com/py-pdf/pypdf/files/5536984/test.pdf"
     name = "pypdf-5536984.pdf"
     reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
@@ -185,7 +197,8 @@ def test_issue585():
     assert exc.value.args[0].startswith("XML in XmpInformation was invalid")
 
 
-def test_getter_bag():
+def test_xmp_getter_bag_function():
+    """xmp._getter_bag does not crash."""
     f = pypdf.xmp._getter_bag("namespace", "name")
 
     class Tst:  # to replace pdf

From 117ce458c5cc3d8e2dfbdda1dbdd4d18d1e7c60e Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sat, 8 Apr 2023 15:47:53 +0200
Subject: [PATCH 08/11] MAINT: Move generation of file identifiers to a method
 (#1760)

* Ensure that the content is used to generate the file identifiers
* Ensure /ID[0] is not overwritten
* MAINT: Rename PdfWriter._write_header to PdfWriter._write_pdf_structure
---
 pypdf/_writer.py        | 47 +++++++++++++++++++++++++++++------------
 tests/test_workflows.py |  2 ++
 2 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 5501e58bba..5f31224bc8 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -31,11 +31,10 @@
 import collections
 import decimal
 import enum
+import hashlib
 import logging
 import re
-import secrets
 import struct
-import time
 import uuid
 import warnings
 from hashlib import md5
@@ -144,6 +143,13 @@ class ObjectDeletionFlag(enum.IntFlag):
     ALL_ANNOTATIONS = enum.auto()
 
 
+def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:
+    hash = hashlib.md5()
+    for block in iter(lambda: stream.read(blocksize), b""):
+        hash.update(block)
+    return hash.hexdigest()
+
+
 class PdfWriter:
     """
     Write a PDF file out, given pages produced by another class.
@@ -974,7 +980,7 @@ def clone_document_from_reader(
         self.clone_reader_document_root(reader)
         self._info = reader.trailer[TK.INFO].clone(self).indirect_reference  # type: ignore
         try:
-            self._ID = reader.trailer[TK.ID].clone(self)  # type: ignore
+            self._ID = cast(ArrayObject, reader.trailer[TK.ID].clone(self))  # type: ignore
         except KeyError:
             pass
         if callable(after_page_append):
@@ -998,6 +1004,26 @@ def cloneDocumentFromReader(
         )
         self.clone_document_from_reader(reader, after_page_append)
 
+    def _compute_document_identifier_from_content(self) -> ByteStringObject:
+        stream = BytesIO()
+        self._write_pdf_structure(stream)
+        stream.seek(0)
+        return ByteStringObject(_rolling_checksum(stream).encode("utf8"))
+
+    def generate_file_identifiers(self) -> None:
+        """
+        Generate an identifier for the PDF that will be written.
+
+        The only point of this is ensuring uniqueness. Reproducibility is not
+        required; see ISO 32000-1:2008, section 14.4 "File Identifiers".
+        """
+        if hasattr(self, "_ID") and self._ID and len(self._ID) == 2:
+            ID_1 = self._ID[0]
+        else:
+            ID_1 = self._compute_document_identifier_from_content()
+        ID_2 = self._compute_document_identifier_from_content()
+        self._ID = ArrayObject((ID_1, ID_2))
+
     def encrypt(
         self,
         user_password: Optional[str] = None,
@@ -1078,19 +1104,14 @@ def encrypt(
             V = 1
             rev = 2
             keylen = int(40 / 8)
-        secrets_generator = secrets.SystemRandom()
         P = permissions_flag
         O = ByteStringObject(_alg33(owner_password, user_password, rev, keylen))  # type: ignore[arg-type]  # noqa
-        ID_1 = ByteStringObject(md5((repr(time.time())).encode("utf8")).digest())
-        ID_2 = ByteStringObject(
-            md5((repr(secrets_generator.uniform(0, 1))).encode("utf8")).digest()
-        )
-        self._ID = ArrayObject((ID_1, ID_2))
+        self.generate_file_identifiers()
         if rev == 2:
-            U, key = _alg34(user_password, O, P, ID_1)
+            U, key = _alg34(user_password, O, P, self._ID[0])
         else:
             assert rev == 3
-            U, key = _alg35(user_password, rev, keylen, O, P, ID_1, False)  # type: ignore[arg-type]
+            U, key = _alg35(user_password, rev, keylen, O, P, self._ID[0], False)  # type: ignore[arg-type]
         encrypt = DictionaryObject()
         encrypt[NameObject(SA.FILTER)] = NameObject("/Standard")
         encrypt[NameObject("/V")] = NumberObject(V)
@@ -1124,7 +1145,7 @@ def write_stream(self, stream: StreamType) -> None:
         # copying in a new copy of the page object.
         self._sweep_indirect_references(self._root)
 
-        object_positions = self._write_header(stream)
+        object_positions = self._write_pdf_structure(stream)
         xref_location = self._write_xref_table(stream, object_positions)
         self._write_trailer(stream)
         stream.write(b_(f"\nstartxref\n{xref_location}\n%%EOF\n"))  # eof
@@ -1159,7 +1180,7 @@ def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO]:
 
         return my_file, stream
 
-    def _write_header(self, stream: StreamType) -> List[int]:
+    def _write_pdf_structure(self, stream: StreamType) -> List[int]:
         object_positions = []
         stream.write(self.pdf_header + b"\n")
         stream.write(b"%\xE2\xE3\xCF\xD3\n")
diff --git a/tests/test_workflows.py b/tests/test_workflows.py
index f3552deb84..7a9ddcb53d 100644
--- a/tests/test_workflows.py
+++ b/tests/test_workflows.py
@@ -64,6 +64,8 @@ def test_basic_features(tmp_path):
     password = "secret"
     with pytest.warns(UserWarning, match="pypdf only implements RC4 encryption"):
         writer.encrypt(password)
+        # doing it twice should not change anything
+        writer.encrypt(password)
 
     # finally, write "output" to pypdf-output.pdf
     write_path = tmp_path / "pypdf-output.pdf"

From 10cc05775b481da61261e3cc0af38ef116bd0040 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sun, 9 Apr 2023 16:33:44 +0200
Subject: [PATCH 09/11] STY: Improve language, add docstrings, fix TODOs
 (#1772)

---
 README.md                              | 30 +++++------
 docs/user/cropping-and-transforming.md |  4 ++
 pypdf/_reader.py                       |  4 +-
 pypdf/constants.py                     | 72 +++++++++++++++++++++++++-
 pypdf/filters.py                       | 42 ++++++++++++++-
 pypdf/generic/__init__.py              |  3 ++
 pyproject.toml                         |  3 +-
 sample-files                           |  2 +-
 tests/test_constants.py                | 12 ++++-
 9 files changed, 146 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index 5c4c9173c4..de4799b2d5 100644
--- a/README.md
+++ b/README.md
@@ -21,23 +21,22 @@ from PDFs as well.
 
 ## Installation
 
-You can install pypdf via pip:
+Install pypdf using pip:
 
 ```
 pip install pypdf
 ```
 
-If you plan to use pypdf for encrypting or decrypting PDFs that use AES, you
-will need to install some extra dependencies. Encryption using RC4 is supported
-using the regular installation.
+To use pypdf with AES encryption or decryption, install the extra dependencies:
 
 ```
 pip install pypdf[crypto]
 ```
 
-> **NOTE**: `pypdf>=3.1.0` improved a lot compared to `pyPdf<2.0.0` and compared to
-> `PyPDF2 < 2.0.0`. Please
-> read [the migration guide](https://pypdf.readthedocs.io/en/latest/user/migration-1-to-2.html).
+> **NOTE**: `pypdf` 3.1.0 and above include significant improvements compared to
+> previous versions. Please refer to [the migration
+> guide](https://pypdf.readthedocs.io/en/latest/user/migration-1-to-2.html) for
+> more information.
 
 ## Usage
 
@@ -51,19 +50,18 @@ text = page.extract_text()
 ```
 
 pypdf can do a lot more, e.g. splitting, merging, reading and creating
-annotations, decrypting and encrypting, and more.
+annotations, decrypting and encrypting, and more. Check out [the
+documentation](https://pypdf.readthedocs.io/en/stable/) for additional usage
+examples!
 
-Please see [the documentation](https://pypdf.readthedocs.io/en/stable/)
-for more usage examples!
-
-A lot of questions are asked and answered
-[on StackOverflow](https://stackoverflow.com/questions/tagged/pypdf)
-(formerly tagged with [PyPDF2](https://stackoverflow.com/questions/tagged/pypdf2)).
+For questions and answers, visit
+[StackOverflow](https://stackoverflow.com/questions/tagged/pypdf)
+(formerly tagged with [PyPDF2](https://stackoverflow.com/questions/tagged/pypdf2)).
 
 ## Contributions
 
-Maintaining pypdf is a collaborative effort. You can support pypdf by writing
-documentation, helping to narrow down issues, and adding code.
+Maintaining pypdf is a collaborative effort. You can support the project by
+writing documentation, helping to narrow down issues, and submitting code.
 
 ### Q&A
 
diff --git a/docs/user/cropping-and-transforming.md b/docs/user/cropping-and-transforming.md
index 2afc2a2e6b..d002ef1d8f 100644
--- a/docs/user/cropping-and-transforming.md
+++ b/docs/user/cropping-and-transforming.md
@@ -1,5 +1,9 @@
 # Cropping and Transforming PDFs
 
+> **Notice**: Just because content is no longer visible does not mean it is gone.
+> Cropping works by adjusting the viewbox. That means content that was cropped
+> away can still be restored.
+
 ```python
 from pypdf import PdfWriter, PdfReader
 
diff --git a/pypdf/_reader.py b/pypdf/_reader.py
index 36aa642122..3b21c29e73 100644
--- a/pypdf/_reader.py
+++ b/pypdf/_reader.py
@@ -1273,8 +1273,8 @@ def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]:
         This is equivalent to generic.IndirectObject(num,gen,self).get_object()
 
         Args:
-            num:
-            gen:
+            num: The object number of the indirect object.
+            gen: The generation number of the indirect object.
 
         Returns:
             A PdfObject
diff --git a/pypdf/constants.py b/pypdf/constants.py
index 9f7327adfe..d1be774079 100644
--- a/pypdf/constants.py
+++ b/pypdf/constants.py
@@ -306,6 +306,17 @@ class FieldDictionaryAttributes:
 
     @classmethod
     def attributes(cls) -> Tuple[str, ...]:
+        """
+        Get a tuple of all the attributes present in a Field Dictionary.
+
+        This method returns a tuple of all the attribute constants defined in
+        the FieldDictionaryAttributes class. These attributes correspond to the
+        entries that are common to all field dictionaries as specified in the
+        PDF 1.7 reference.
+
+        Returns:
+            A tuple containing all the attribute constants.
+        """
         return (
             cls.TM,
             cls.T,
@@ -321,6 +332,18 @@ def attributes(cls) -> Tuple[str, ...]:
 
     @classmethod
     def attributes_dict(cls) -> Dict[str, str]:
+        """
+        Get a dictionary of attribute keys and their human-readable names.
+
+        This method returns a dictionary where the keys are the attribute
+        constants defined in the FieldDictionaryAttributes class and the values
+        are their corresponding human-readable names. These attributes
+        correspond to the entries that are common to all field dictionaries as
+        specified in the PDF 1.7 reference.
+
+        Returns:
+            A dictionary containing attribute keys and their names.
+        """
         return {
             cls.FT: "Field Type",
             cls.Parent: "Parent",
@@ -340,10 +363,33 @@ class CheckboxRadioButtonAttributes:
 
     @classmethod
     def attributes(cls) -> Tuple[str, ...]:
+        """
+        Get a tuple of all the attributes present in a Field Dictionary.
+
+        This method returns a tuple of all the attribute constants defined in
+        the CheckboxRadioButtonAttributes class. These attributes correspond to
+        the entries that are common to all field dictionaries as specified in
+        the PDF 1.7 reference.
+
+        Returns:
+            A tuple containing all the attribute constants.
+        """
         return (cls.Opt,)
 
     @classmethod
     def attributes_dict(cls) -> Dict[str, str]:
+        """
+        Get a dictionary of attribute keys and their human-readable names.
+
+        This method returns a dictionary where the keys are the attribute
+        constants defined in the CheckboxRadioButtonAttributes class and the
+        values are their corresponding human-readable names. These attributes
+        correspond to the entries that are common to all field dictionaries as
+        specified in the PDF 1.7 reference.
+
+        Returns:
+            A dictionary containing attribute keys and their names.
+        """
         return {
             cls.Opt: "Options",
         }
@@ -381,13 +427,35 @@ class PageLayouts:
 
 
 class GraphicsStateParameters:
-    """Table 4.8 of the 1.7 reference."""
+    """Table 58 – Entries in a Graphics State Parameter Dictionary"""
 
     TYPE = "/Type"  # name, optional
     LW = "/LW"  # number, optional
-    # TODO: Many more!
+    LC = "/LC"  # integer, optional
+    LJ = "/LJ"  # integer, optional
+    ML = "/ML"  # number, optional
+    D = "/D"  # array, optional
+    RI = "/RI"  # name, optional
+    OP = "/OP"
+    op = "/op"
+    OPM = "/OPM"
     FONT = "/Font"  # array, optional
+    BG = "/BG"
+    BG2 = "/BG2"
+    UCR = "/UCR"
+    UCR2 = "/UCR2"
+    TR = "/TR"
+    TR2 = "/TR2"
+    HT = "/HT"
+    FL = "/FL"
+    SM = "/SM"
+    SA = "/SA"
+    BM = "/BM"
     S_MASK = "/SMask"  # dictionary or name, optional
+    CA = "/CA"
+    ca = "/ca"
+    AIS = "/AIS"
+    TK = "/TK"
 
 
 class CatalogDictionary:
diff --git a/pypdf/filters.py b/pypdf/filters.py
index 72b4243163..086a8a2f53 100644
--- a/pypdf/filters.py
+++ b/pypdf/filters.py
@@ -67,6 +67,19 @@
 
 
 def decompress(data: bytes) -> bytes:
+    """
+    Decompress the given data using zlib.
+
+    This function attempts to decompress the input data using zlib. If the
+    decompression fails due to a zlib error, it falls back to using a
+    decompression object with a larger window size.
+
+    Args:
+        data: The input data to be decompressed.
+
+    Returns:
+        The decompressed data.
+    """
     try:
         return zlib.decompress(data)
     except zlib.error:
@@ -195,6 +208,15 @@ def _decode_png_prediction(data: str, columns: int, rowlength: int) -> bytes:
 
     @staticmethod
     def encode(data: bytes) -> bytes:
+        """
+        Compress the input data using zlib.
+
+        Args:
+            data: The data to be compressed.
+
+        Returns:
+            The compressed data.
+        """
         return zlib.compress(data)
 
 
@@ -376,7 +398,7 @@ def decode(
         group_index = b = 0
         out = bytearray()
         for char in data:
-            if ord("!") <= char and char <= ord("u"):
+            if ord("!") <= char <= ord("u"):
                 group_index += 1
                 b = b * 85 + (char - 33)
                 if group_index == 5:
@@ -536,6 +558,23 @@ def decode(
 
 
 def decode_stream_data(stream: Any) -> Union[str, bytes]:  # utils.StreamObject
+    """
+    Decode the stream data based on the specified filters.
+
+    This function decodes the stream data using the filters provided in the
+    stream. It supports various filter types, including FlateDecode,
+    ASCIIHexDecode, LZWDecode, ASCII85Decode, DCTDecode, JPXDecode, and
+    CCITTFaxDecode.
+
+    Args:
+        stream: The input stream object containing the data and filters.
+
+    Returns:
+        The decoded stream data.
+
+    Raises:
+        NotImplementedError: If an unsupported filter type is encountered.
+    """
     filters = stream.get(SA.FILTER, ())
     if isinstance(filters, IndirectObject):
         filters = cast(ArrayObject, filters.get_object())
@@ -580,6 +619,7 @@ def decode_stream_data(stream: Any) -> Union[str, bytes]:  # utils.StreamObject
 
 
 def decodeStreamData(stream: Any) -> Union[str, bytes]:  # deprecated
+    """Deprecated. Use decode_stream_data."""
     deprecate_with_replacement("decodeStreamData", "decode_stream_data", "4.0.0")
     return decode_stream_data(stream)
 
diff --git a/pypdf/generic/__init__.py b/pypdf/generic/__init__.py
index 984bbf2c24..a26b448102 100644
--- a/pypdf/generic/__init__.py
+++ b/pypdf/generic/__init__.py
@@ -73,6 +73,7 @@
 def readHexStringFromStream(
     stream: StreamType,
 ) -> Union["TextStringObject", "ByteStringObject"]:  # deprecated
+    """Deprecated, use read_hex_string_from_stream."""
     deprecate_with_replacement(
         "readHexStringFromStream", "read_hex_string_from_stream", "4.0.0"
     )
@@ -83,6 +84,7 @@ def readStringFromStream(
     stream: StreamType,
     forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
 ) -> Union["TextStringObject", "ByteStringObject"]:  # deprecated
+    """Deprecated, use read_string_from_stream."""
     deprecate_with_replacement(
         "readStringFromStream", "read_string_from_stream", "4.0.0"
     )
@@ -93,6 +95,7 @@ def createStringObject(
     string: Union[str, bytes],
     forced_encoding: Union[None, str, List[str], Dict[int, str]] = None,
 ) -> Union[TextStringObject, ByteStringObject]:  # deprecated
+    """Deprecated, use create_string_object."""
     deprecate_with_replacement("createStringObject", "create_string_object", "4.0.0")
     return create_string_object(string, forced_encoding)
 
diff --git a/pyproject.toml b/pyproject.toml
index fdf89e0854..933917f9db 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -156,7 +156,6 @@ ignore = [
     "C901",
     "D101",  # Missing docstring in public class
     "D102", # Missing docstring in public method
-    "D103",  # Missing docstring in public function
     "D417",  # Missing argument descriptions in the docstring
     "FBT001", # Boolean positional arg in function definition
     "FBT002", # Boolean default value in function definition
@@ -177,7 +176,7 @@ ignore = [
 ]
 
 [tool.ruff.per-file-ignores]
-"tests/*" = ["S101", "ANN001", "ANN201","D104", "S105", "S106"]
+"tests/*" = ["S101", "ANN001", "ANN201","D104", "S105", "S106", "D103"]
 "sample-files/*" = ["D100", "INP001"]
 "_encryption.py" = ["S324", "S311"]
 "_security.py" = ["S324"]
diff --git a/sample-files b/sample-files
index fb7a080b35..65e2d2ca8a 160000
--- a/sample-files
+++ b/sample-files
@@ -1 +1 @@
-Subproject commit fb7a080b35b3553bd10221282beeda7847959e83
+Subproject commit 65e2d2ca8a137bfb1807b9991d5ca97f90365cc3
diff --git a/tests/test_constants.py b/tests/test_constants.py
index da4f307ccc..62fbae7433 100644
--- a/tests/test_constants.py
+++ b/tests/test_constants.py
@@ -2,7 +2,7 @@
 import re
 from typing import Callable
 
-from pypdf.constants import PDF_KEYS
+from pypdf.constants import PDF_KEYS, GraphicsStateParameters
 
 
 def test_slash_prefix():
@@ -18,11 +18,19 @@ def test_slash_prefix():
     pattern = re.compile(r"^\/[A-Z]+[a-zA-Z0-9]*$")
     for cls in PDF_KEYS:
         for attr in dir(cls):
+            # Skip magic methods
             if attr.startswith("__") and attr.endswith("__"):
                 continue
+
+            # Skip methods
             constant_value = getattr(cls, attr)
             if isinstance(constant_value, Callable):
                 continue
+
             assert constant_value.startswith("/")
-            assert pattern.match(constant_value)
             assert attr.replace("_", "").lower() == constant_value[1:].lower()
+
+            # There are a few exceptions that may be lowercase
+            if cls == GraphicsStateParameters and attr in ["ca", "op"]:
+                continue
+            assert pattern.match(constant_value)

From ac0cb986cdb34baaae140c159b3219785e87be3b Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sun, 9 Apr 2023 21:32:50 +0200
Subject: [PATCH 10/11] TST: Add xmp test (#1775)

---
 sample-files        |  2 +-
 tests/test_utils.py |  4 ++--
 tests/test_xmp.py   | 20 ++++++++++++++++++++
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/sample-files b/sample-files
index 65e2d2ca8a..0c3b1d3879 160000
--- a/sample-files
+++ b/sample-files
@@ -1 +1 @@
-Subproject commit 65e2d2ca8a137bfb1807b9991d5ca97f90365cc3
+Subproject commit 0c3b1d3879c5cd7d913b3d931fa33b37529d7346
diff --git a/tests/test_utils.py b/tests/test_utils.py
index abb022db52..3ae80bddb9 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -221,8 +221,8 @@ def test_read_block_backwards_exception():
 
 def test_deprecation_bookmark():
     @deprecation_bookmark(old_param="new_param")
-    def foo(old_param: int = 1, baz: int = 2) -> float:
-        return old_param * baz
+    def foo(old_param: int = 1, baz: int = 2) -> None:
+        pass
 
     with pytest.raises(DeprecationError) as exc:
         foo(old_param=12, new_param=13)
diff --git a/tests/test_xmp.py b/tests/test_xmp.py
index 50555b476d..cfcf021119 100644
--- a/tests/test_xmp.py
+++ b/tests/test_xmp.py
@@ -15,6 +15,26 @@
 TESTS_ROOT = Path(__file__).parent.resolve()
 PROJECT_ROOT = TESTS_ROOT.parent
 RESOURCE_ROOT = PROJECT_ROOT / "resources"
+SAMPLE_ROOT = Path(PROJECT_ROOT) / "sample-files"
+
+
+@pytest.mark.samples()
+@pytest.mark.parametrize(
+    "src",
+    [
+        (SAMPLE_ROOT / "020-xmp/output_with_metadata_pymupdf.pdf"),
+    ],
+)
+def test_read_xmp_metadata_samples(src):
+    reader = PdfReader(src)
+    xmp = reader.xmp_metadata
+    assert xmp
+    assert xmp.dc_contributor == []
+    assert xmp.dc_creator == ["John Doe"]
+    assert xmp.dc_source == "Martin Thoma"  # attribute node
+    assert xmp.dc_description == {"x-default": "This is a text"}
+    assert xmp.dc_date == [datetime(1990, 4, 28, 0, 0)]
+    assert xmp.dc_title == {"x-default": "Sample PDF with XMP Metadata"}
 
 
 @pytest.mark.parametrize(

From a876a77b3af5ecd64699cfc0b687d2657de8f526 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sun, 9 Apr 2023 22:25:11 +0200
Subject: [PATCH 11/11] REL: 3.7.1

Security (SEC):
-  Warn about PDF encryption security (#1755)

Robustness (ROB):
-  Prevent loop in Cloning (#1770)
-  Capture UnicodeDecodeError at PdfReader.pdf_header (#1768)

Documentation (DOC):
-  Pin myst_parser==0.16.1 and rename .readthedocs.yaml (#1752)
-  Add readthedocs.yml and bump docs dependencies using `tox -e deps` (#1750)

Developer Experience (DEV):
-  Make make_changelog.py idempotent

Maintenance (MAINT):
-  Move generation of file identifiers to a method (#1760)

Testing (TST):
-  Add xmp test (#1775)

Code Style (STY):
-  Improve language, add docstrings, fix TODOs (#1772)
-  Test names, docstrings, and parametrization (#1771)

[Full Changelog](https://github.com/py-pdf/pypdf/compare/3.7.0...3.7.1)
---
 CHANGELOG.md      | 23 +++++++++++++++++++++++
 pypdf/_version.py |  2 +-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3e9f5a8fdb..319b7586a3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,28 @@
 # CHANGELOG
 
+## Version 3.7.1, 2023-04-09
+
+### Security (SEC)
+-  Warn about PDF encryption security (#1755)
+
+### Robustness (ROB)
+-  Prevent loop in Cloning (#1770)
+-  Capture UnicodeDecodeError at PdfReader.pdf_header (#1768)
+
+### Documentation (DOC)
+-  Add .readthedocs.yaml and bump docs dependencies using `tox -e deps` (#1750, #1752)
+
+### Developer Experience (DEV)
+-  Make make_changelog.py idempotent
+
+### Maintenance (MAINT)
+-  Move generation of file identifiers to a method (#1760)
+
+### Testing (TST)
+-  Add xmp test (#1775)
+
+[Full Changelog](https://github.com/py-pdf/pypdf/compare/3.7.0...3.7.1)
+
 ## Version 3.7.0, 2023-03-26
 
 ### Security (SEC)
diff --git a/pypdf/_version.py b/pypdf/_version.py
index 46f67e7f8d..975f69142a 100644
--- a/pypdf/_version.py
+++ b/pypdf/_version.py
@@ -1 +1 @@
-__version__ = "3.7.0"
+__version__ = "3.7.1"