diff --git a/CHANGELOG.md b/CHANGELOG.md index ba40f6c3bc..a61efeeb88 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,41 +1,48 @@ -Version 3.9.0, 2023-05-21 -------------------------- +# CHANGELOG + +## Version 3.9.1, 2023-06-04 + +### Deprecations (DEP) +- Deprecate PdfMerger (#1866) + +### Bug Fixes (BUG) +- Ignore UTF-8 decode errors (#1865) + +### Robustness (ROB) +- Handle missing /Type entry in Page tree (#1859) + + +[Full Changelog](https://github.com/py-pdf/pypdf/compare/3.9.0...3.9.1) -New Features (ENH): +## Version 3.9.0, 2023-05-21 + +### New Features (ENH) - Simplify metadata input (Document Information Dictionary) (#1851) - Extend cmap compatibilty to GBK_EUC_H/V (#1812) -Bug Fixes (BUG): +### Bug Fixes (BUG) - Prevent infinite loop when no character follows after a comment (#1828) - get_contents does not return ContentStream (#1847) - Accept XYZ destination with zoom missing (default to zoom=0.0) (#1844) - Cope with 1 Bit images (#1815) -Robustness (ROB): +### Robustness (ROB) - Handle missing /Type entry in Page tree (#1845) -Documentation (DOC): +### Documentation (DOC) - Expand file size explanations (#1835) - Add comparison with pdfplumber (#1837) - Clarify that PyPDF2 is dead (#1827) - Add Hunter King as Contributor for #1806 -Maintenance (MAINT): +### Maintenance (MAINT) - Refactor internal Encryption class (#1821) - Add R parameter to generate_values (#1820) - Make encryption_key parameter of write_to_stream optional (#1819) - Prepare for adding AES enryption support (#1818) -Testing (TST): -- Parametrize test_cmap_encodings (#1823) - -Code Style (STY): -- Iterate directly over the list instead of using range (#1839) -- Minor refactorings in _encryption.py (#1822) +[Full Changelog](https://github.com/py-pdf/pypdf/compare/3.8.1...3.9.0) -[Full Changelog](https://github.com/py-pdf/pypdf/compare/3.8.1...3.8.2) - -# CHANGELOG ## Version 3.8.1, 2023-04-23 diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 5d36b9ac80..6b9f0d2850 100644 --- 
a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -34,6 +34,7 @@ history and [GitHubs 'Contributors' feature](https://github.com/py-pdf/pypdf/gra * [Pinheiro, Arthur](https://github.com/xilopaint) * [programmarchy](https://github.com/programmarchy) * [pubpub-zz](https://github.com/pubpub-zz): involved in community development +* [RitchieP](https://github.com/RitchieP) | [LinkedIn](https://www.linkedin.com/in/ritchie-p-892b31115/) | [StackOverflow](https://stackoverflow.com/users/13328625/casual-r?tab=profile) * [Rogmann, Sascha](https://github.com/srogmann) * [robbiebusinessacc](https://github.com/robbiebusinessacc) * [sietzeberends](https://github.com/sietzeberends) diff --git a/docs/meta/comparisons.md b/docs/meta/comparisons.md index d38861495a..1d921a5393 100644 --- a/docs/meta/comparisons.md +++ b/docs/meta/comparisons.md @@ -56,7 +56,7 @@ than PyPDF2. See [history of pypdf](history.md). extracting the [font size](https://stackoverflow.com/a/69962459/562769) / font weight (bold-ness). It has no capabilities for writing PDF files. -[`pdfplumber`](https://pypi.org/project/pdfplumber/) is a library focused on extracting data from PDF documents. Since `pdfplumber` is built on top of `pdfminer.six`, there are **no capabilities of exporting or modifying a PDF file** (see [#440 (discussions)](https://github.com/jsvine/pdfplumber/discussions/440#discussioncomment-803880)). However, `pdfplumber` is capable of converting a PDF file into an image, [draw lines and rectangles on the image](https://github.com/jsvine/pdfplumber#drawing-methods), and save it as an image file. +[`pdfplumber`](https://pypi.org/project/pdfplumber/) is a library focused on extracting data from PDF documents. Since `pdfplumber` is built on top of `pdfminer.six`, there are **no capabilities of exporting or modifying a PDF file** (see [#440 (discussions)](https://github.com/jsvine/pdfplumber/discussions/440#discussioncomment-803880)). 
def strip_header(md: str) -> str:
    """
    Remove the leading '# CHANGELOG' header line from a changelog.

    The header is removed only when it is the complete first non-blank line.
    ``str.lstrip("# CHANGELOG")`` must not be used for this: its argument is
    a *set of characters*, not a prefix, so on a changelog without the header
    it strips the ``## `` from the first version heading, and on a changelog
    with leading blank lines it strips nothing at all.

    Args:
        md: The Markdown content of the changelog, with or without the header.

    Returns:
        The changelog content without the header line and without
        leading/trailing whitespace. Content that does not start with the
        header is returned unchanged apart from the whitespace trimming.
    """
    header = "# CHANGELOG"
    # Trim first so that leading blank lines cannot hide the header.
    text = md.strip()
    first_line, _, remainder = text.partition("\n")
    # Compare the whole line (tolerating a trailing "\r" or spaces) so that
    # headings which merely start with the same characters are left intact.
    if first_line.strip() == header:
        text = remainder
    return text.strip()
@@ -117,6 +123,7 @@ def get_formatted_changes(git_tag: str) -> str: "ROB", "DOC", "DEV", + "CI", "MAINT", "TST", "STY", @@ -129,6 +136,7 @@ def get_formatted_changes(git_tag: str) -> str: "ROB": "Robustness", "DOC": "Documentation", "DEV": "Developer Experience", + "CI": "Continuous Integration", "MAINT": "Maintenance", "TST": "Testing", "STY": "Code Style", @@ -140,17 +148,15 @@ def get_formatted_changes(git_tag: str) -> str: for prefix in order: if prefix not in grouped: continue - output += f"\n{abbrev2long[prefix]} ({prefix}):\n" # header + output += f"\n### {abbrev2long[prefix]} ({prefix})\n" # header for commit in grouped[prefix]: output += f"- {commit['msg']}\n" del grouped[prefix] if grouped: - print("@" * 80) - output += "\nYou forgot something!:\n" + output += "\n### Other\n" for prefix in grouped: output += f"- {prefix}: {grouped[prefix]}\n" - print("@" * 80) return output @@ -193,7 +199,7 @@ def get_git_commits_since_tag(git_tag: str) -> List[Change]: stderr=subprocess.STDOUT, ) ).strip("'b\\n") - return [parse_commit_line(line) for line in commits.split("\\n")] + return [parse_commit_line(line) for line in commits.split("\\n") if line != ""] def parse_commit_line(line: str) -> Change: @@ -210,7 +216,7 @@ def parse_commit_line(line: str) -> Change: ValueError: The commit line is not well-structured """ if "\\t" not in line: - raise ValueError(f"Invalid commit line: {line}") + raise ValueError(f"Invalid commit line: '{line}'") commit_hash, rest = line.split("\\t", 1) if ":" in rest: prefix, message = rest.split(":", 1) diff --git a/pypdf/_merger.py b/pypdf/_merger.py index b98a6ea250..924f184954 100644 --- a/pypdf/_merger.py +++ b/pypdf/_merger.py @@ -46,6 +46,7 @@ from ._reader import PdfReader from ._utils import ( StrByteType, + deprecate_with_replacement, deprecation_bookmark, deprecation_with_replacement, str_, @@ -86,26 +87,16 @@ def __init__(self, pagedata: PageObject, src: PdfReader, id: int) -> None: class PdfMerger: """ - Initialize a 
``PdfMerger`` object. + Use :class:`PdfWriter` instead. - ``PdfMerger`` merges multiple PDFs into a single PDF. - It can concatenate, slice, insert, or any combination of the above. - - See the functions :meth:`merge()` (or :meth:`append()`) - and :meth:`write()` for usage information. - - Args: - strict: Determines whether user should be warned of all - problems and also causes some correctable problems to be fatal. - Defaults to ``False``. - fileobj: Output file. Can be a filename or any kind of - file-like object. + .. deprecated:: 5.0.0 """ @deprecation_bookmark(bookmarks="outline") def __init__( self, strict: bool = False, fileobj: Union[Path, StrByteType] = "" ) -> None: + deprecate_with_replacement("PdfMerger", "PdfWriter", "5.0.0") self.inputs: List[Tuple[Any, PdfReader]] = [] self.pages: List[Any] = [] self.output: Optional[PdfWriter] = PdfWriter() @@ -117,6 +108,7 @@ def __init__( def __enter__(self) -> "PdfMerger": # There is nothing to do. + deprecate_with_replacement("PdfMerger", "PdfWriter", "5.0.0") return self def __exit__( @@ -522,13 +514,13 @@ def _write_dests(self) -> None: raise RuntimeError(ERR_CLOSED_WRITER) for named_dest in self.named_dests: page_index = None - if "/Page" in named_dest: + if "/Page" in named_dest: # deprecated for page_index, page in enumerate(self.pages): # noqa: B007 if page.id == named_dest["/Page"]: named_dest[NameObject("/Page")] = page.out_pagedata break - if page_index is not None: + if page_index is not None: # deprecated self.output.add_named_destination_object(named_dest) @deprecation_bookmark(bookmarks="outline") @@ -606,7 +598,7 @@ def _associate_dests_to_pages(self, pages: List[_MergedPage]) -> None: if np.get_object() == page.pagedata.get_object(): page_index = page.id - if page_index is None: + if page_index is None: # deprecated raise ValueError( f"Unresolved named destination '{named_dest['/Title']}'" ) @@ -651,7 +643,7 @@ def find_outline_item( # oi_enum is still an inner node # (OutlineType, if recursive 
types were supported by mypy) res = self.find_outline_item(outline_item, oi_enum) # type: ignore - if res: + if res: # deprecated return [i] + res elif ( oi_enum == outline_item diff --git a/pypdf/_version.py b/pypdf/_version.py index fcd7ddb9e4..3e9a639238 100644 --- a/pypdf/_version.py +++ b/pypdf/_version.py @@ -1 +1 @@ -__version__ = "3.9.0" +__version__ = "3.9.1" diff --git a/pypdf/_writer.py b/pypdf/_writer.py index e3968e96c8..aa42c0f278 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -940,9 +940,14 @@ def _flatten( pages = cast(DictionaryObject, self._root_object["/Pages"]) self.flattened_pages = ArrayObject() assert pages is not None # hint for mypy - t = "/Pages" + if PA.TYPE in pages: - t = cast(str, pages[PA.TYPE]) + t = str(pages[PA.TYPE]) + # if pdf has no type, considered as a page if /Kids is missing + elif PA.KIDS not in pages: + t = "/Page" + else: + t = "/Pages" if t == "/Pages": for attr in inheritable_page_attributes: diff --git a/pypdf/generic/_utils.py b/pypdf/generic/_utils.py index 695736769b..03041aa553 100644 --- a/pypdf/generic/_utils.py +++ b/pypdf/generic/_utils.py @@ -104,7 +104,7 @@ def read_string_from_stream( # line break was escaped: tok = b"" else: - msg = rf"Unexpected escaped string: {tok.decode('utf8')}" + msg = f"Unexpected escaped string: {tok.decode('utf-8','ignore')}" logger_warning(msg, __name__) txt.append(tok) return create_string_object(b"".join(txt), forced_encoding) diff --git a/tests/test_encryption.py b/tests/test_encryption.py index 0db7a2731a..1cbb26fd61 100644 --- a/tests/test_encryption.py +++ b/tests/test_encryption.py @@ -168,6 +168,7 @@ def test_read_page_from_encrypted_file_aes_256(pdffile, password): ], ) @pytest.mark.skipif(not HAS_PYCRYPTODOME, reason="No pycryptodome") +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_merge_encrypted_pdfs(names): """Encrypted PDFs can be merged after decryption.""" merger = pypdf.PdfMerger() diff --git a/tests/test_generic.py 
b/tests/test_generic.py index 5e464460d8..2302767f9b 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -678,6 +678,7 @@ def test_bool_repr(tmp_path): @pytest.mark.enable_socket() @patch("pypdf._reader.logger_warning") +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_issue_997(mock_logger_warning, pdf_file_path): url = ( "https://github.com/py-pdf/pypdf/files/8908874/" diff --git a/tests/test_merger.py b/tests/test_merger.py index 0057e512d1..094df1014c 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -19,6 +19,7 @@ sys.path.append(str(PROJECT_ROOT)) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def merger_operate(merger): pdf_path = RESOURCE_ROOT / "crazyones.pdf" outline = RESOURCE_ROOT / "pdflatex-outline.pdf" @@ -156,6 +157,7 @@ def check_outline(tmp_path): tmp_filename = "dont_commit_merged.pdf" +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_merger_operations_by_traditional_usage(tmp_path): # Arrange merger = PdfMerger() @@ -183,6 +185,7 @@ def test_merger_operations_by_traditional_usage_with_writer(tmp_path): check_outline(path) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_merger_operations_by_semi_traditional_usage(tmp_path): path = tmp_path / tmp_filename @@ -207,6 +210,7 @@ def test_merger_operations_by_semi_traditional_usage_with_writer(tmp_path): check_outline(path) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_merger_operation_by_new_usage(tmp_path): path = tmp_path / tmp_filename with PdfMerger(fileobj=path) as merger: @@ -226,6 +230,7 @@ def test_merger_operation_by_new_usage_with_writer(tmp_path): check_outline(path) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_merge_page_exception(): merger = pypdf.PdfMerger() pdf_path = RESOURCE_ROOT / "crazyones.pdf" @@ -247,6 +252,7 @@ def test_merge_page_exception_with_writer(): merger.close() +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def 
test_merge_page_tuple(): merger = pypdf.PdfMerger() pdf_path = RESOURCE_ROOT / "crazyones.pdf" @@ -261,6 +267,7 @@ def test_merge_page_tuple_with_writer(): merger.close() +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_merge_write_closed_fh(): merger = pypdf.PdfMerger() pdf_path = RESOURCE_ROOT / "crazyones.pdf" @@ -312,6 +319,7 @@ def test_merge_write_closed_fh_with_writer(pdf_file_path): @pytest.mark.enable_socket() +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_trim_outline_list(pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/995/995175.pdf" name = "tika-995175.pdf" @@ -334,6 +342,7 @@ def test_trim_outline_list_with_writer(pdf_file_path): @pytest.mark.enable_socket() +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_zoom(pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/994/994759.pdf" name = "tika-994759.pdf" @@ -356,6 +365,7 @@ def test_zoom_with_writer(pdf_file_path): @pytest.mark.enable_socket() +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_zoom_xyz_no_left(pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/933/933322.pdf" name = "tika-933322.pdf" @@ -378,6 +388,7 @@ def test_zoom_xyz_no_left_with_writer(pdf_file_path): @pytest.mark.enable_socket() +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_outline_item(pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/997/997511.pdf" name = "tika-997511.pdf" @@ -402,6 +413,7 @@ def test_outline_item_with_writer(pdf_file_path): @pytest.mark.enable_socket() @pytest.mark.slow() +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_trim_outline(pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/982/982336.pdf" name = "tika-982336.pdf" @@ -426,6 +438,7 @@ def test_trim_outline_with_writer(pdf_file_path): @pytest.mark.enable_socket() @pytest.mark.slow() 
+@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test1(pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/923/923621.pdf" name = "tika-923621.pdf" @@ -450,6 +463,7 @@ def test1_with_writer(pdf_file_path): @pytest.mark.enable_socket() @pytest.mark.slow() +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_sweep_recursion1(pdf_file_path): # TODO: This test looks like an infinite loop. url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf" @@ -496,6 +510,7 @@ def test_sweep_recursion1_with_writer(pdf_file_path): ), ], ) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_sweep_recursion2(url, name, pdf_file_path): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) merger = PdfMerger() @@ -525,7 +540,7 @@ def test_sweep_recursion2(url, name, pdf_file_path): ) def test_sweep_recursion2_with_writer(url, name, pdf_file_path): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - merger = PdfMerger() + merger = PdfWriter() merger.append(reader) merger.write(pdf_file_path) merger.close() @@ -535,6 +550,7 @@ def test_sweep_recursion2_with_writer(url, name, pdf_file_path): @pytest.mark.enable_socket() +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_sweep_indirect_list_newobj_is_none(caplog, pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/906/906769.pdf" name = "tika-906769.pdf" @@ -565,6 +581,7 @@ def test_sweep_indirect_list_newobj_is_none_with_writer(caplog, pdf_file_path): @pytest.mark.enable_socket() +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_iss1145(): # issue with FitH destination with null param url = "https://github.com/py-pdf/pypdf/files/9164743/file-0.pdf" @@ -584,6 +601,7 @@ def test_iss1145_with_writer(): merger.close() +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_deprecation_bookmark_decorator_deprecationexcp(): reader = PdfReader(RESOURCE_ROOT / 
"outlines-with-invalid-destinations.pdf") merger = PdfMerger() @@ -610,6 +628,7 @@ def test_deprecation_bookmark_decorator_deprecationexcp_with_writer(): merger.merge(0, reader, import_bookmarks=True) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_deprecation_bookmark_decorator_output(): reader = PdfReader(RESOURCE_ROOT / "outlines-with-invalid-destinations.pdf") merger = PdfMerger() @@ -625,6 +644,7 @@ def test_deprecation_bookmark_decorator_output_with_writer(): @pytest.mark.enable_socket() +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_iss1344(caplog): url = "https://github.com/py-pdf/pypdf/files/9549001/input.pdf" name = "iss1344.pdf" @@ -663,3 +683,8 @@ def test_articles_with_writer(caplog): r = PdfReader(b) assert len(r.threads) == 4 assert r.threads[0].get_object()["/F"]["/P"] == r.pages[0] + + +def test_deprecate_pdfmerger(): + with pytest.warns(DeprecationWarning), PdfMerger() as merger: + merger.append(RESOURCE_ROOT / "crazyones.pdf") diff --git a/tests/test_page.py b/tests/test_page.py index a7fa503649..14ae7e5c1f 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1132,3 +1132,5 @@ def test_pdf_pages_missing_type(): reader = PdfReader(pdf_path) del reader.trailer["/Root"]["/Pages"]["/Kids"][0].get_object()["/Type"] reader.pages[0] + writer = PdfWriter(clone_from=reader) + writer.pages[0] diff --git a/tests/test_workflows.py b/tests/test_workflows.py index d3eabdbc3b..6afd7646ba 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -348,6 +348,7 @@ def test_overlay(pdf_file_path, base_path, overlay_path): ) ], ) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_merge_with_warning(tmp_path, url, name): data = BytesIO(get_pdf_from_url(url, name=name)) reader = PdfReader(data) @@ -367,6 +368,7 @@ def test_merge_with_warning(tmp_path, url, name): ) ], ) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_merge(tmp_path, url, name): data = 
BytesIO(get_pdf_from_url(url, name=name)) reader = PdfReader(data) @@ -565,6 +567,7 @@ def test_scale_rectangle_indirect_object(): page.scale(sx=2, sy=3) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_merge_output(caplog): # Arrange base = RESOURCE_ROOT / "Seige_of_Vicksburg_Sample_OCR.pdf" diff --git a/tests/test_writer.py b/tests/test_writer.py index 80a3158aad..2d59a39728 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -739,6 +739,7 @@ def test_append_pages_from_reader_append(): @pytest.mark.enable_socket() @pytest.mark.slow() +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_sweep_indirect_references_nullobject_exception(pdf_file_path): # TODO: Check this more closely... this looks weird url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" @@ -765,6 +766,7 @@ def test_sweep_indirect_references_nullobject_exception(pdf_file_path): ("https://github.com/py-pdf/pypdf/files/10715624/test.pdf", "iss1627.pdf"), ], ) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_some_appends(pdf_file_path, url, name): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) # PdfMerger