3030import codecs
3131import collections
3232import decimal
33+ import enum
3334import logging
3435import random
3536import re
# Permission mask built from (2**31 - 1) - 3 == 0x7FFFFFFC: bits 2..30 are set
# and the two lowest bits are cleared.
# NOTE(review): presumably the two low bits are reserved in the PDF permission
# field and must stay 0 - confirm against the UserAccessPermissions definition.
132133ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions ((2 ** 31 - 1 ) - 3 )
133134
134135
class ObjectDeletionFlag(enum.IntFlag):
    """
    Bit flags naming the kinds of content that can be stripped from a page.

    Each member occupies its own bit, so members can be combined with ``|``
    and tested with ``&``.
    """

    TEXT = 1 << 0  # text-showing content
    IMAGES = 1 << 1  # image content
    LINKS = 1 << 2  # link annotations
    ATTACHMENTS = 1 << 3  # attachment-like annotations
    OBJECTS_3D = 1 << 4  # 3D annotations
    ALL_ANNOTATIONS = 1 << 5  # every annotation, whatever its subtype
143+
144+
135145class PdfWriter :
136146 """
137147 Write a PDF file out, given pages produced by another class.
@@ -1796,12 +1806,8 @@ def addNamedDestination(
17961806
def remove_links(self) -> None:
    """
    Remove links and annotations from this output.

    Every page of the writer is processed with
    ``ObjectDeletionFlag.ALL_ANNOTATIONS``, so all annotations are dropped,
    not only those of subtype ``/Link``.
    """
    for page in self.pages:
        self.remove_objects_from_page(page, ObjectDeletionFlag.ALL_ANNOTATIONS)
18051811
18061812 def removeLinks (self ) -> None : # deprecated
18071813 """
@@ -1812,85 +1818,151 @@ def removeLinks(self) -> None: # deprecated
18121818 deprecation_with_replacement ("removeLinks" , "remove_links" , "3.0.0" )
18131819 return self .remove_links ()
18141820
def remove_annots(self, subtypes: Optional[Union[str, Iterable[str]]]) -> None:
    """
    Remove annotations by Subtype from every page of this output.

    Args:
        subtypes: a single subtype name (e.g. ``"/Link"``), an iterable of
            subtype names, or ``None`` to remove all annotations.
    """
    # Normalise once, up front:
    # * a bare string must become a 1-tuple, otherwise the per-page
    #   ``subtype in subtypes`` test degrades to a substring match;
    # * any other iterable is materialised so that a one-shot iterator is
    #   not exhausted after the first page.
    wanted: Optional[Iterable[str]]
    if subtypes is None:
        wanted = None
    elif isinstance(subtypes, str):
        wanted = (subtypes,)
    else:
        wanted = tuple(subtypes)

    for page in self.pages:
        self._remove_annots_from_page(page, wanted)
1829+
def _remove_annots_from_page(
    self,
    page: Union[IndirectObject, PageObject, DictionaryObject],
    subtypes: Optional[Iterable[str]],
) -> None:
    """
    Remove annotations of the given subtypes from a single page.

    Args:
        page: the page (or a reference to it) whose annotation array is
            edited in place.
        subtypes: subtype names to remove; ``None`` removes every
            annotation. A bare string is treated as a single subtype name.
    """
    page = cast(DictionaryObject, page.get_object())
    if PG.ANNOTS not in page:
        return

    if isinstance(subtypes, str):
        # Avoid substring matching of ``subtype in "<string>"``.
        subtypes = (subtypes,)
    elif subtypes is not None:
        # Materialise so a one-shot iterator survives repeated ``in`` tests.
        subtypes = tuple(subtypes)

    annots = cast(ArrayObject, page[PG.ANNOTS])
    # Walk backwards so deleting an entry never shifts one not yet visited.
    for idx in range(len(annots) - 1, -1, -1):
        an = annots[idx]
        if subtypes is not None:
            obj = cast(DictionaryObject, an.get_object())
            try:
                subtype = cast(str, obj["/Subtype"])
            except KeyError:
                # No /Subtype entry: it cannot match a subtype filter, keep it.
                continue
            if subtype not in subtypes:
                continue
        if isinstance(an, IndirectObject):
            self._objects[an.idnum - 1] = NullObject()  # to reduce PDF size
        del annots[idx]
1847+
def remove_objects_from_page(
    self,
    page: Union[PageObject, DictionaryObject],
    to_delete: Union[ObjectDeletionFlag, Iterable[ObjectDeletionFlag]],
) -> None:
    """
    Remove objects specified by `to_delete` from the given page.

    Args:
        page: Page object to clean up
        to_delete: Objects to be deleted; can be a `ObjectDeletionFlag`
            (possibly several flags combined with ``|``) or a list/tuple
            of `ObjectDeletionFlag`, each of which is applied in turn.
    """
    if isinstance(to_delete, (list, tuple)):
        for to_d in to_delete:
            self.remove_objects_from_page(page, to_d)
        return
    assert isinstance(to_delete, ObjectDeletionFlag)

    # Annotation removals. These are plain ``if``s without an early return so
    # that a combined flag such as ``LINKS | IMAGES`` is honoured completely.
    if to_delete & ObjectDeletionFlag.LINKS:
        self._remove_annots_from_page(page, ("/Link",))
    if to_delete & ObjectDeletionFlag.ATTACHMENTS:
        self._remove_annots_from_page(
            page, ("/FileAttachment", "/Sound", "/Movie", "/Screen")
        )
    if to_delete & ObjectDeletionFlag.OBJECTS_3D:
        self._remove_annots_from_page(page, ("/3D",))
    if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
        self._remove_annots_from_page(page, None)

    drop_images = bool(to_delete & ObjectDeletionFlag.IMAGES)
    drop_text = bool(to_delete & ObjectDeletionFlag.TEXT)
    if not (drop_images or drop_text):
        # Only annotation flags (or no flag at all): content streams stay as-is.
        return

    image_operators = (
        (b"w", b"J", b"j", b"M", b"d", b"i")
        + (b"W", b"W*")
        + (b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n")
        + (b"m", b"l", b"c", b"v", b"y", b"h", b"re")
        + (b"sh",)
    )
    text_operators = (b"Tj", b"TJ", b"'", b'"')
    # Operators that are always stripped from a cleaned content stream.
    jump_operators = frozenset(
        (image_operators if drop_images else ())
        + (text_operators if drop_text else ())
    )

    images = []  # XObject names whose /Subtype is /Image (filled when dropping images)
    forms = []  # XObject names whose /Subtype is /Form

    def clean(content: ContentStream) -> None:
        """Delete the unwanted operations from ``content.operations`` in place."""
        idx = 0
        while idx < len(content.operations):
            operands, operator = content.operations[idx]
            # NOTE(review): with TEXT set, every ``Do`` that draws a form is
            # dropped even though the form itself was already cleaned -
            # confirm this is intended.
            drop = operator in jump_operators or (
                operator == b"Do"
                and (
                    (drop_images and operands[0] in images)
                    or (drop_text and operands[0] in forms)
                )
            )
            if drop:
                # Do not advance: the next operation has moved into ``idx``
                # and must be examined as well.
                del content.operations[idx]
            else:
                idx += 1

    try:
        d = cast(dict, cast(DictionaryObject, page["/Resources"])["/XObject"])
    except KeyError:
        d = {}
    for k, v in d.items():
        o = v.get_object()
        try:
            content: Any = None
            if drop_images and o["/Subtype"] == "/Image":
                content = NullObject()
                images.append(k)
            if o["/Subtype"] == "/Form":
                forms.append(k)
                if isinstance(o, ContentStream):
                    content = o
                else:
                    content = ContentStream(o, self)
                    content.update(o.items())
                    # These entries are removed from the rebuilt stream's dictionary.
                    for k1 in ["/Length", "/Filter", "/DecodeParms"]:
                        try:
                            del content[k1]
                        except KeyError:
                            pass
                clean(content)
            if content is not None:
                if isinstance(v, IndirectObject):
                    self._objects[v.idnum - 1] = content
                else:
                    d[k] = self._add_object(content)
        except (TypeError, KeyError):
            # Best effort: an XObject that cannot be inspected is left untouched.
            pass
    if "/Contents" in page:
        content = page["/Contents"].get_object()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, page)
        clean(cast(ContentStream, content))
        # Null out the objects the old /Contents pointed at before replacing it.
        if isinstance(page["/Contents"], ArrayObject):
            for o in cast(ArrayObject, page["/Contents"]):
                self._objects[o.idnum - 1] = NullObject()
        try:
            self._objects[
                cast(IndirectObject, page["/Contents"].indirect_reference).idnum - 1
            ] = NullObject()
        except AttributeError:
            pass
        page[NameObject("/Contents")] = self._add_object(content)
1956+
def remove_images(self, ignore_byte_string_object: bool = False) -> None:
    """
    Remove images from this output.

    Every page of the writer is processed with ``ObjectDeletionFlag.IMAGES``.

    Args:
        ignore_byte_string_object: obsolete; accepted for backward
            compatibility and not used.
    """
    for page in self.pages:
        self.remove_objects_from_page(page, ObjectDeletionFlag.IMAGES)
18941966
18951967 def removeImages (self , ignoreByteStringObject : bool = False ) -> None : # deprecated
18961968 """
def remove_text(self, ignore_byte_string_object: bool = False) -> None:
    """
    Remove text from this output.

    Every page of the writer is processed with ``ObjectDeletionFlag.TEXT``.

    Args:
        ignore_byte_string_object: obsolete; accepted for backward
            compatibility and not used.
    """
    for page in self.pages:
        self.remove_objects_from_page(page, ObjectDeletionFlag.TEXT)
19471985
19481986 def removeText (self , ignoreByteStringObject : bool = False ) -> None : # deprecated
19491987 """
0 commit comments