diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b627c38b6..88cb014214 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,24 @@ # CHANGELOG +## Version 3.6.0, 2023-03-18 + +### New Features (ENH) +- Extend PdfWriter.append() to PageObjects (#1704) +- Support qualified names in update_page_form_field_values (#1695) + +### Robustness (ROB) +- Tolerate streams without length field (#1717) +- Accept DictionaryObject in /D of NamedDestination (#1720) +- Widths def in cmap calls IndirectObject (#1719) + +[Full Changelog](https://github.com/py-pdf/pypdf/compare/3.5.2...3.6.0) + + ## Version 3.5.2, 2023-03-12 +⚠️ We discovered that compress_content_streams has to be applied to a page of + the PdfWriter. It must not be applied to a page of the PdfReader! + ### Bug Fixes (BUG) - compress_content_stream not readable in Adobe Acrobat (#1698) - Pass logging parameters correctly in set_need_appearances_writer (#1697) diff --git a/docs/user/file-size.md b/docs/user/file-size.md index 314595d3a2..a7b2d3cc45 100644 --- a/docs/user/file-size.md +++ b/docs/user/file-size.md @@ -63,9 +63,12 @@ reader = PdfReader("example.pdf") writer = PdfWriter() for page in reader.pages: - page.compress_content_streams() # This is CPU intensive! writer.add_page(page) +for page in writer.pages: + # ⚠️ This has to be done on the writer, not the reader! + page.compress_content_streams() # This is CPU intensive! 
+ with open("out.pdf", "wb") as f: writer.write(f) ``` diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 03669abe37..74b4f0fe12 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -382,7 +382,7 @@ def compute_space_width( w = [] while len(w) > 0: st = w[0] - second = w[1] + second = w[1].get_object() if isinstance(second, int): for x in range(st, second): w1[x] = w[2] diff --git a/pypdf/_page.py b/pypdf/_page.py index 3ccfc8c2a4..427ba48ee4 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -38,10 +38,12 @@ Iterator, List, Optional, + Sequence, Set, Tuple, Union, cast, + overload, ) from ._cmap import build_char_map, unknown_char_map @@ -2139,7 +2141,7 @@ def annotations(self, value: Optional[ArrayObject]) -> None: self[NameObject("/Annots")] = value -class _VirtualList: +class _VirtualList(Sequence): def __init__( self, length_function: Callable[[], int], @@ -2152,11 +2154,21 @@ def __init__( def __len__(self) -> int: return self.length_function() + @overload def __getitem__(self, index: int) -> PageObject: + ... + + @overload + def __getitem__(self, index: slice) -> Sequence[PageObject]: + ... 
+ + def __getitem__( + self, index: Union[int, slice] + ) -> Union[PageObject, Sequence[PageObject]]: if isinstance(index, slice): indices = range(*index.indices(len(self))) cls = type(self) - return cls(indices.__len__, lambda idx: self[indices[idx]]) # type: ignore + return cls(indices.__len__, lambda idx: self[indices[idx]]) if not isinstance(index, int): raise TypeError("sequence indices must be integers") len_self = len(self) @@ -2171,6 +2183,10 @@ def __iter__(self) -> Iterator[PageObject]: for i in range(len(self)): yield self[i] + def __str__(self) -> str: + p = [f"PageObject({i})" for i in range(self.length_function())] + return f"[{', '.join(p)}]" + def _get_fonts_walk( obj: DictionaryObject, diff --git a/pypdf/_reader.py b/pypdf/_reader.py index b8a42ba828..c742386367 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -778,6 +778,8 @@ def _get_named_destinations( else: # case where Dests is in root catalog (PDF 1.7 specs, §2 about PDF1.1 for k__, v__ in tree.items(): val = v__.get_object() + if isinstance(val, DictionaryObject): + val = val["/D"].get_object() dest = self._build_destination(k__, val) if dest is not None: retval[k__] = dest diff --git a/pypdf/_utils.py b/pypdf/_utils.py index ae3885e1fa..55cf7cb12f 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -178,10 +178,10 @@ def read_until_regex(stream: StreamType, regex: Pattern[bytes]) -> bytes: tok = stream.read(16) if not tok: return name - m = regex.search(tok) + m = regex.search(name + tok) if m is not None: - name += tok[: m.start()] - stream.seek(m.start() - len(tok), 1) + stream.seek(m.start() - (len(name) + len(tok)), 1) + name = (name + tok)[: m.start()] break name += tok return name diff --git a/pypdf/_version.py b/pypdf/_version.py index dae42b1bd2..85197cb4a0 100644 --- a/pypdf/_version.py +++ b/pypdf/_version.py @@ -1 +1 @@ -__version__ = "3.5.2" +__version__ = "3.6.0" diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 887373ccf0..fb67e2febe 100644 --- a/pypdf/_writer.py 
+++ b/pypdf/_writer.py @@ -318,7 +318,9 @@ def set_need_appearances_writer(self) -> None: try: # get the AcroForm tree if CatalogDictionary.ACRO_FORM not in self._root_object: - self._root_object[NameObject(CatalogDictionary.ACRO_FORM)] = self._add_object(DictionaryObject()) + self._root_object[ + NameObject(CatalogDictionary.ACRO_FORM) + ] = self._add_object(DictionaryObject()) need_appearances = NameObject(InteractiveFormDictEntries.NeedAppearances) self._root_object[CatalogDictionary.ACRO_FORM][need_appearances] = BooleanObject(True) # type: ignore @@ -766,6 +768,23 @@ def appendPagesFromReader( ) self.append_pages_from_reader(reader, after_page_append) + def _get_qualified_field_name(self, parent: DictionaryObject) -> Optional[str]: + if "/TM" in parent: + return cast(str, parent["/TM"]) + elif "/T" not in parent: + return None + elif "/Parent" in parent: + qualified_parent = self._get_qualified_field_name( + cast(DictionaryObject, parent["/Parent"]) + ) + if qualified_parent is not None: + return ( + qualified_parent + + "." 
+ + cast(str, parent["/T"]) + ) + return cast(str, parent["/T"]) + def update_page_form_field_values( self, page: PageObject, @@ -795,11 +814,14 @@ def update_page_form_field_values( for j in range(len(page[PG.ANNOTS])): # type: ignore writer_annot = page[PG.ANNOTS][j].get_object() # type: ignore # retrieve parent field values, if present - writer_parent_annot = {} # fallback if it's not there + writer_parent_annot = DictionaryObject() # fallback if it's not there if PG.PARENT in writer_annot: writer_parent_annot = writer_annot[PG.PARENT] for field in fields: - if writer_annot.get(FieldDictionaryAttributes.T) == field: + if ( + writer_annot.get(FieldDictionaryAttributes.T) == field + or self._get_qualified_field_name(writer_annot) == field + ): if writer_annot.get(FieldDictionaryAttributes.FT) == "/Btn": writer_annot.update( { @@ -823,7 +845,10 @@ def update_page_form_field_values( ) } ) - elif writer_parent_annot.get(FieldDictionaryAttributes.T) == field: + elif ( + writer_parent_annot.get(FieldDictionaryAttributes.T) == field + or self._get_qualified_field_name(writer_parent_annot) == field + ): writer_parent_annot.update( { NameObject(FieldDictionaryAttributes.V): TextStringObject( @@ -2486,7 +2511,12 @@ def append( str, None, PageRange, Tuple[int, int], Tuple[int, int, int], List[int] ] = None, pages: Union[ - None, PageRange, Tuple[int, int], Tuple[int, int, int], List[int] + None, + PageRange, + Tuple[int, int], + Tuple[int, int, int], + List[int], + List[PageObject], ] = None, import_outline: bool = True, excluded_fields: Optional[Union[List[str], Tuple[str, ...]]] = None, @@ -2547,7 +2577,7 @@ def merge( position: Optional[int], fileobj: Union[Path, StrByteType, PdfReader], outline_item: Optional[str] = None, - pages: Optional[PageRangeSpec] = None, + pages: Optional[Union[PageRangeSpec, List[PageObject]]] = None, import_outline: bool = True, excluded_fields: Optional[Union[List[str], Tuple[str, ...]]] = (), ) -> None: @@ -2604,8 +2634,11 @@ def merge( ) 
srcpages = {} - for i in pages: - pg = reader.pages[i] + for page in pages: + if isinstance(page, PageObject): + pg = page + else: + pg = reader.pages[page] assert pg.indirect_reference is not None if position is None: srcpages[pg.indirect_reference.idnum] = self.add_page( diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 824dc16d41..c6b6603370 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -447,7 +447,13 @@ def read_unsized_from_steam( stream.seek(-1, 1) # this is a stream object, not a dictionary if SA.LENGTH not in data: - raise PdfStreamError("Stream length not defined") + if pdf is not None and pdf.strict: + raise PdfStreamError("Stream length not defined") + else: + logger_warning( + f"Stream length not defined @pos={stream.tell()}", __name__ + ) + data[NameObject(SA.LENGTH)] = NumberObject(-1) length = data[SA.LENGTH] if isinstance(length, IndirectObject): t = stream.tell() @@ -455,7 +461,12 @@ def read_unsized_from_steam( length = pdf.get_object(length) stream.seek(t, 0) pstart = stream.tell() - data["__streamdata__"] = stream.read(length) + if length > 0: + data["__streamdata__"] = stream.read(length) + else: + data["__streamdata__"] = read_until_regex( + stream, re.compile(b"endstream") + ) e = read_non_whitespace(stream) ndstream = stream.read(8) if (e + ndstream) != b"endstream": diff --git a/stream.pdf b/stream.pdf deleted file mode 100644 index b32d3b131d..0000000000 Binary files a/stream.pdf and /dev/null differ diff --git a/tests/bench.py b/tests/bench.py index e98fd57960..d10a08b8b2 100644 --- a/tests/bench.py +++ b/tests/bench.py @@ -8,7 +8,7 @@ from pathlib import Path import pypdf -from pypdf import PdfReader, Transformation +from pypdf import PdfReader, PdfWriter, Transformation from pypdf.generic import Destination, read_string_from_stream TESTS_ROOT = Path(__file__).parent.resolve() @@ -21,11 +21,13 @@ def page_ops(pdf_path, password): pdf_path = RESOURCE_ROOT / 
pdf_path reader = PdfReader(pdf_path) + writer = PdfWriter() if password: reader.decrypt(password) page = reader.pages[0] + writer.add_page(page) op = Transformation().rotate(90).scale(1.2) page.add_transformation(op) @@ -43,6 +45,8 @@ def page_ops(pdf_path, password): page.scale(2, 2) page.scale_by(0.5) page.scale_to(100, 100) + + page = writer.pages[0] page.compress_content_streams() page.extract_text() diff --git a/tests/test_cmap.py b/tests/test_cmap.py index 438ca1f403..a371b92fe6 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -122,3 +122,13 @@ def test_iss1533(): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) reader.pages[0].extract_text() # no error assert build_char_map("/F", 200, reader.pages[0])[3]["\x01"] == "Ü" + + +@pytest.mark.enable_socket() +def test_iss1718(caplog): + url = "https://github.com/py-pdf/pypdf/files/10983477/Ballinasloe_WS.pdf" + name = "iss1718.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + for p in reader.pages: + _txt = p.extract_text() + assert caplog.text == "" diff --git a/tests/test_generic.py b/tests/test_generic.py index b8910a5c09..ef6b6eac91 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -380,17 +380,22 @@ def test_dictionaryobject_read_from_stream_stream_no_newline(): @pytest.mark.parametrize(("strict"), [(True), (False)]) -def test_dictionaryobject_read_from_stream_stream_no_stream_length(strict): - stream = BytesIO(b"<< /S /GoTo >>stream\n") +def test_dictionaryobject_read_from_stream_stream_no_stream_length(strict, caplog): + stream = BytesIO(b"<< /S /GoTo >>stream\n123456789endstream abcd") class Tst: # to replace pdf strict = False pdf = Tst() pdf.strict = strict - with pytest.raises(PdfReadError) as exc: - DictionaryObject.read_from_stream(stream, pdf) - assert exc.value.args[0] == "Stream length not defined" + if strict: + with pytest.raises(PdfReadError) as exc: + DictionaryObject.read_from_stream(stream, pdf) + assert exc.value.args[0] == 
"Stream length not defined" + else: + o = DictionaryObject.read_from_stream(stream, pdf) + assert "Stream length not defined" in caplog.text + assert o.get_data() == b"123456789" @pytest.mark.parametrize( diff --git a/tests/test_merger.py b/tests/test_merger.py index cdebead8f5..6ea070c392 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -270,7 +270,7 @@ def test_merge_write_closed_fh(): merger.close() with pytest.raises(RuntimeError) as exc: - merger.write("stream.pdf") + merger.write("test_merge_write_closed_fh.pdf") assert exc.value.args[0] == err_closed with pytest.raises(RuntimeError) as exc: diff --git a/tests/test_page.py b/tests/test_page.py index 774029a3a1..d4ebefce88 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1101,3 +1101,9 @@ def test_merge_transformed_page_into_blank(): True, True, ) + + +def test_pages_printing(): + pdf_path = RESOURCE_ROOT / "crazyones.pdf" + reader = PdfReader(pdf_path) + assert str(reader.pages) == "[PageObject(0)]" diff --git a/tests/test_reader.py b/tests/test_reader.py index 5f70ed3536..c2a01f0d3b 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1350,3 +1350,11 @@ def test_iss1689(): name = "iss1689.pdf" in_pdf = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) in_pdf.pages[0] + + +@pytest.mark.enable_socket() +def test_iss1710(): + url = "https://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf" + name = "irbookonlinereading.pdf" + in_pdf = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + in_pdf.outline diff --git a/tests/test_writer.py b/tests/test_writer.py index 9c2d95f971..ab43e41fd8 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -466,6 +466,24 @@ def test_fill_form(): Path(tmp_filename).unlink() # cleanup +def test_fill_form_with_qualified(): + reader = PdfReader(RESOURCE_ROOT / "form.pdf") + reader.add_form_topname("top") + + writer = PdfWriter() + writer.clone_document_from_reader(reader) + writer.add_page(reader.pages[0]) + 
writer.update_page_form_field_values( + writer.pages[0], {"top.foo": "filling"}, flags=1 + ) + b = BytesIO() + writer.write(b) + + reader2 = PdfReader(b) + fields = reader2.get_fields() + assert fields["top.foo"]["/V"] == "filling" + + @pytest.mark.parametrize( ("use_128bit", "user_password", "owner_password"), [(True, "userpwd", "ownerpwd"), (False, "userpwd", "ownerpwd")], @@ -987,7 +1005,9 @@ def test_reset_translation(): writer.reset_translation() writer.append(reader, (0, 10)) assert len(writer._objects) >= nb + 200 - nb = len(writer._objects) + nb = len(writer.pages) + writer.append(reader, [reader.pages[0], reader.pages[0]]) + assert len(writer.pages) == nb + 2 def test_threads_empty():