Skip to content

Commit 67b085b

Browse files
authored
ENH: Add PdfWriter.remove_objects_from_page(page: PageObject, to_delete: ObjectDeletionFlag) (#1648)
This fixes remove_text to store page contents as indirect objects, in accordance with the PDF specification. It also wipes out text in XObject forms. The same issues were fixed for remove_images(). Finally, the new method PdfWriter.remove_objects_from_page(page: PageObject, to_delete: ObjectDeletionFlag) was created. It allows more fine-grained control over what to delete and is easy to extend via the to_delete flag. Closes #1644. Closes #1650.
1 parent 44bc1fc commit 67b085b

File tree

3 files changed

+219
-119
lines changed

3 files changed

+219
-119
lines changed

pypdf/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from ._page import PageObject, Transformation
1313
from ._reader import DocumentInformation, PdfFileReader, PdfReader
1414
from ._version import __version__
15-
from ._writer import PdfFileWriter, PdfWriter
15+
from ._writer import ObjectDeletionFlag, PdfFileWriter, PdfWriter
1616
from .pagerange import PageRange, parse_filename_page_ranges
1717
from .papersizes import PaperSize
1818

@@ -21,6 +21,7 @@
2121
"PageRange",
2222
"PaperSize",
2323
"DocumentInformation",
24+
"ObjectDeletionFlag",
2425
"parse_filename_page_ranges",
2526
"PdfFileMerger", # will be removed in pypdf==4.0.0; use PdfMerger instead
2627
"PdfFileReader", # will be removed in pypdf==4.0.0; use PdfReader instead

pypdf/_writer.py

Lines changed: 155 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import codecs
3131
import collections
3232
import decimal
33+
import enum
3334
import logging
3435
import random
3536
import re
@@ -132,6 +133,15 @@
132133
ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions((2**31 - 1) - 3)
133134

134135

136+
class ObjectDeletionFlag(enum.IntFlag):
    """Bit flags naming the kinds of objects that can be removed from a page."""

    # Explicit powers of two, in declaration order, so that members can be
    # combined with `|` and tested with `&`.
    TEXT = 1
    IMAGES = 2
    LINKS = 4
    ATTACHMENTS = 8
    OBJECTS_3D = 16
    ALL_ANNOTATIONS = 32
143+
144+
135145
class PdfWriter:
136146
"""
137147
Write a PDF file out, given pages produced by another class.
@@ -1796,12 +1806,8 @@ def addNamedDestination(
17961806

17971807
def remove_links(self) -> None:
17981808
"""Remove links and annotations from this output."""
1799-
pg_dict = cast(DictionaryObject, self.get_object(self._pages))
1800-
pages = cast(ArrayObject, pg_dict[PA.KIDS])
1801-
for page in pages:
1802-
page_ref = cast(DictionaryObject, self.get_object(page))
1803-
if PG.ANNOTS in page_ref:
1804-
del page_ref[PG.ANNOTS]
1809+
for page in self.pages:
1810+
self.remove_objects_from_page(page, ObjectDeletionFlag.ALL_ANNOTATIONS)
18051811

18061812
def removeLinks(self) -> None: # deprecated
18071813
"""
@@ -1812,85 +1818,151 @@ def removeLinks(self) -> None: # deprecated
18121818
deprecation_with_replacement("removeLinks", "remove_links", "3.0.0")
18131819
return self.remove_links()
18141820

1821+
def remove_annots(self, subtypes: Optional[Union[str, Iterable[str]]]) -> None:
    """
    Remove annotations by Subtype from every page of this output.

    Args:
        subtypes: Subtype name (e.g. "/Link") or iterable of Subtype names
            to be removed. None removes all annotations.
    """
    if isinstance(subtypes, str):
        # A bare string must be wrapped: the per-page helper tests
        # `subtype in subtypes`, which on a string is substring containment
        # rather than equality.
        subtypes = (subtypes,)
    elif subtypes is not None:
        # Materialize once so that a one-shot iterable (e.g. a generator)
        # is still usable for every annotation on every page.
        subtypes = tuple(subtypes)
    for page in self.pages:
        self._remove_annots_from_page(page, subtypes)
1829+
1830+
def _remove_annots_from_page(
    self,
    page: Union[IndirectObject, PageObject, DictionaryObject],
    subtypes: Optional[Iterable[str]],
) -> None:
    """
    Delete the annotations of `page` whose Subtype is listed in `subtypes`.

    Args:
        page: Page (or reference to it) whose PG.ANNOTS array is filtered
            in place.
        subtypes: Subtype names to remove; None removes every annotation.
    """
    page_dict = cast(DictionaryObject, page.get_object())
    if PG.ANNOTS not in page_dict:
        return

    annots = cast(ArrayObject, page_dict[PG.ANNOTS])
    idx = 0
    while idx < len(annots):
        entry = annots[idx]
        annotation = cast(DictionaryObject, entry.get_object())
        keep = (
            subtypes is not None
            and cast(str, annotation["/Subtype"]) not in subtypes
        )
        if keep:
            idx += 1
            continue
        if isinstance(entry, IndirectObject):
            # Blank the stored object too, to reduce PDF size.
            self._objects[entry.idnum - 1] = NullObject()
        # Deleting shifts the next entry into `idx`, so do not advance.
        del annots[idx]
1847+
1848+
def remove_objects_from_page(
    self,
    page: Union[PageObject, DictionaryObject],
    to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]],
) -> None:
    """
    Remove objects specified by `to_delete` from the given page.

    Args:
        page: Page object to clean up
        to_delete: Objects to be deleted; can be an `ObjectDeletionFlag`
            (members may be combined with `|`) or an iterable of
            `ObjectDeletionFlag`

    Raises:
        TypeError: if `to_delete` (or one of its items) is not an
            `ObjectDeletionFlag`.
    """
    # Test for the flag type first: flag values can themselves be iterable
    # on recent Python versions, so "is it iterable" is not a usable check.
    if not isinstance(to_delete, ObjectDeletionFlag):
        for flag in to_delete:
            if not isinstance(flag, ObjectDeletionFlag):
                raise TypeError(
                    f"to_delete items must be ObjectDeletionFlag, not {type(flag).__name__}"
                )
            self.remove_objects_from_page(page, flag)
        return

    # --- annotations -------------------------------------------------
    # Every requested category is handled (no early return), so combined
    # flags such as LINKS | IMAGES act on all of their members.
    if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
        self._remove_annots_from_page(page, None)
    else:
        annot_subtypes: List[str] = []
        if to_delete & ObjectDeletionFlag.LINKS:
            annot_subtypes.append("/Link")
        if to_delete & ObjectDeletionFlag.ATTACHMENTS:
            annot_subtypes.extend(("/FileAttachment", "/Sound", "/Movie", "/Screen"))
        if to_delete & ObjectDeletionFlag.OBJECTS_3D:
            annot_subtypes.append("/3D")
        if annot_subtypes:
            self._remove_annots_from_page(page, tuple(annot_subtypes))

    # --- content streams ---------------------------------------------
    strip_images = bool(to_delete & ObjectDeletionFlag.IMAGES)
    strip_text = bool(to_delete & ObjectDeletionFlag.TEXT)
    if not (strip_images or strip_text):
        # Nothing to remove from the content streams; leave them untouched.
        return

    jump_operators = set()
    if strip_images:
        jump_operators.update(
            [b"w", b"J", b"j", b"M", b"d", b"i"]
            + [b"W", b"W*"]
            + [b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n"]
            + [b"m", b"l", b"c", b"v", b"y", b"h", b"re"]
            + [b"sh"]
        )
    if strip_text:
        jump_operators.update([b"Tj", b"TJ", b"'", b'"'])

    images: List[Any] = []  # XObject names whose Subtype is /Image
    forms: List[Any] = []  # XObject names whose Subtype is /Form

    def is_dropped(operands: Any, operator: Any) -> bool:
        """Tell whether one content-stream operation has to be removed."""
        if operator in jump_operators:
            return True
        if operator != b"Do":
            return False
        # NOTE(review): as in the original, a `Do` that invokes a form is
        # dropped when text is removed, although the form itself is also
        # cleaned below - confirm that dropping the whole form is intended.
        return (strip_images and operands[0] in images) or (
            strip_text and operands[0] in forms
        )

    def clean(content: ContentStream) -> None:
        """Filter `content.operations` in place."""
        # Rebuilding the list examines every operation exactly once; the
        # previous index loop advanced after deleting a `Do` and therefore
        # skipped the operation that followed it.
        content.operations[:] = [
            (operands, operator)
            for operands, operator in content.operations
            if not is_dropped(operands, operator)
        ]

    try:
        xobjects = cast(dict, cast(DictionaryObject, page["/Resources"])["/XObject"])
    except KeyError:
        xobjects = {}
    for name, ref in xobjects.items():
        obj = ref.get_object()
        try:
            replacement: Any = None
            subtype = obj["/Subtype"]
            if strip_images and subtype == "/Image":
                replacement = NullObject()
                images.append(name)
            if subtype == "/Form":
                forms.append(name)
                if isinstance(obj, ContentStream):
                    replacement = obj
                else:
                    replacement = ContentStream(obj, self)
                    replacement.update(obj.items())
                    # These keys are dropped from the copied dictionary.
                    # NOTE(review): presumably because they describe the
                    # original encoded stream - confirm.
                    for key in ["/Length", "/Filter", "/DecodeParms"]:
                        if key in replacement:
                            del replacement[key]
                clean(replacement)
            if replacement is not None:
                if isinstance(ref, IndirectObject):
                    self._objects[ref.idnum - 1] = replacement
                else:
                    xobjects[name] = self._add_object(replacement)
        except (TypeError, KeyError):
            # Best effort: entries without a usable /Subtype are skipped.
            pass

    if "/Contents" in page:
        content = page["/Contents"].get_object()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, page)
        clean(cast(ContentStream, content))
        # Blank the objects that held the previous contents ...
        if isinstance(page["/Contents"], ArrayObject):
            for part in cast(ArrayObject, page["/Contents"]):
                self._objects[part.idnum - 1] = NullObject()
        try:
            self._objects[
                cast(IndirectObject, page["/Contents"].indirect_reference).idnum - 1
            ] = NullObject()
        except AttributeError:
            # The previous contents had no usable indirect reference.
            pass
        # ... and register the cleaned stream as the page's new contents.
        page[NameObject("/Contents")] = self._add_object(content)
1956+
18151957
def remove_images(self, ignore_byte_string_object: bool = False) -> None:
18161958
"""
18171959
Remove images from this output.
18181960
18191961
Args:
1820-
ignore_byte_string_object: optional parameter
1821-
to ignore ByteString Objects.
1822-
"""
1823-
pg_dict = cast(DictionaryObject, self.get_object(self._pages))
1824-
pages = cast(ArrayObject, pg_dict[PA.KIDS])
1825-
jump_operators = (
1826-
b"cm",
1827-
b"w",
1828-
b"J",
1829-
b"j",
1830-
b"M",
1831-
b"d",
1832-
b"ri",
1833-
b"i",
1834-
b"gs",
1835-
b"W",
1836-
b"b",
1837-
b"s",
1838-
b"S",
1839-
b"f",
1840-
b"F",
1841-
b"n",
1842-
b"m",
1843-
b"l",
1844-
b"c",
1845-
b"v",
1846-
b"y",
1847-
b"h",
1848-
b"B",
1849-
b"Do",
1850-
b"sh",
1851-
)
1852-
for page in pages:
1853-
page_ref = cast(DictionaryObject, self.get_object(page))
1854-
if "/Contents" not in page_ref:
1855-
return
1856-
content = page_ref["/Contents"].get_object()
1857-
if not isinstance(content, ContentStream):
1858-
content = ContentStream(content, page_ref)
1859-
1860-
_operations = []
1861-
seq_graphics = False
1862-
for operands, operator in content.operations:
1863-
if operator in [b"Tj", b"'"]:
1864-
text = operands[0]
1865-
if ignore_byte_string_object and not isinstance(
1866-
text, TextStringObject
1867-
):
1868-
operands[0] = TextStringObject()
1869-
elif operator == b'"':
1870-
text = operands[2]
1871-
if ignore_byte_string_object and not isinstance(
1872-
text, TextStringObject
1873-
):
1874-
operands[2] = TextStringObject()
1875-
elif operator == b"TJ":
1876-
for i in range(len(operands[0])):
1877-
if ignore_byte_string_object and not isinstance(
1878-
operands[0][i], TextStringObject
1879-
):
1880-
operands[0][i] = TextStringObject()
1881-
1882-
if operator == b"q":
1883-
seq_graphics = True
1884-
if operator == b"Q":
1885-
seq_graphics = False
1886-
if seq_graphics and operator in jump_operators:
1887-
continue
1888-
if operator == b"re":
1889-
continue
1890-
_operations.append((operands, operator))
1891-
1892-
content.operations = _operations
1893-
page_ref.__setitem__(NameObject("/Contents"), content)
1962+
ignore_byte_string_object: obsolete
1963+
"""
1964+
for page in self.pages:
1965+
self.remove_objects_from_page(page, ObjectDeletionFlag.IMAGES)
18941966

18951967
def removeImages(self, ignoreByteStringObject: bool = False) -> None: # deprecated
18961968
"""
@@ -1906,44 +1978,10 @@ def remove_text(self, ignore_byte_string_object: bool = False) -> None:
19061978
Remove text from this output.
19071979
19081980
Args:
1909-
ignore_byte_string_object: optional parameter
1981+
ignore_byte_string_object: obsolete
19101982
"""
1911-
pg_dict = cast(DictionaryObject, self.get_object(self._pages))
1912-
pages = cast(List[IndirectObject], pg_dict[PA.KIDS])
1913-
for page in pages:
1914-
page_ref = cast(PageObject, self.get_object(page))
1915-
content = page_ref["/Contents"].get_object()
1916-
if not isinstance(content, ContentStream):
1917-
content = ContentStream(content, page_ref)
1918-
for operands, operator in content.operations:
1919-
if operator in [b"Tj", b"'"]:
1920-
text = operands[0]
1921-
if not ignore_byte_string_object:
1922-
if isinstance(text, TextStringObject):
1923-
operands[0] = TextStringObject()
1924-
else:
1925-
if isinstance(text, (TextStringObject, ByteStringObject)):
1926-
operands[0] = TextStringObject()
1927-
elif operator == b'"':
1928-
text = operands[2]
1929-
if not ignore_byte_string_object:
1930-
if isinstance(text, TextStringObject):
1931-
operands[2] = TextStringObject()
1932-
else:
1933-
if isinstance(text, (TextStringObject, ByteStringObject)):
1934-
operands[2] = TextStringObject()
1935-
elif operator == b"TJ":
1936-
for i in range(len(operands[0])):
1937-
if not ignore_byte_string_object:
1938-
if isinstance(operands[0][i], TextStringObject):
1939-
operands[0][i] = TextStringObject()
1940-
else:
1941-
if isinstance(
1942-
operands[0][i], (TextStringObject, ByteStringObject)
1943-
):
1944-
operands[0][i] = TextStringObject()
1945-
1946-
page_ref.__setitem__(NameObject("/Contents"), content)
1983+
for page in self.pages:
1984+
self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT)
19471985

19481986
def removeText(self, ignoreByteStringObject: bool = False) -> None: # deprecated
19491987
"""

0 commit comments

Comments
 (0)