Skip to content

Commit 215df56

Browse files
authored
ENH: Complete attachments functions (get/add) (#1611)
Fixes #1047 Fixes #527 Fixes #169
1 parent cc32b59 commit 215df56

File tree

3 files changed

+120
-15
lines changed

3 files changed

+120
-15
lines changed

pypdf/_reader.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2130,6 +2130,79 @@ def rename_form_topname(self, name: str) -> Optional[DictionaryObject]:
21302130
interim[NameObject("/T")] = TextStringObject(name)
21312131
return interim
21322132

2133+
def _list_attachments(self) -> List[str]:
    """
    Retrieve the list of filenames of file attachments.

    The names are read from the ``/Names`` array found under
    ``/Root -> /Names -> /EmbeddedFiles`` of the document trailer.

    Returns:
        The string entries of that array, in stored order (a name that
        occurs several times is listed several times). An empty list is
        returned when any of the keys on that path is missing.
    """
    root = cast(DictionaryObject, self.trailer["/Root"])
    try:
        names_dict = cast(DictionaryObject, root["/Names"])
        embedded_files = cast(DictionaryObject, names_dict["/EmbeddedFiles"])
        entries = cast(ArrayObject, embedded_files["/Names"])
    except KeyError:
        # No embedded-files names array in this document
        return []
    # The array mixes names with other objects; only the string entries
    # are filenames.
    return [entry for entry in entries if isinstance(entry, str)]
2158+
2159+
def _get_attachments(
    self, filename: Optional[str] = None
) -> Dict[str, Union[bytes, List[bytes]]]:
    """
    Retrieve all or selected file attachments of the PDF.

    The attachments are read from the ``/Names`` array found under
    ``/Root -> /Names -> /EmbeddedFiles`` of the document trailer. Each
    string entry of that array is treated as a filename and the entry
    following it as the matching file specification.

    Args:
        filename: If None, every attachment is returned. Otherwise only
            the attachments stored under exactly this name are returned.

    Returns:
        Dictionary mapping each filename to its content. The value is a
        single bytestring when the name occurs once, and a list of
        bytestrings (in stored order) when the same name occurs several
        times. An empty dictionary is returned when the document has no
        embedded-files names array. Entries without a file specification
        or without an embedded stream (``/EF -> /F``) are skipped.
    """
    catalog = cast(DictionaryObject, self.trailer["/Root"])
    # From the catalog get the embedded file names
    try:
        names_dict = cast(DictionaryObject, catalog["/Names"])
        embedded_files = cast(DictionaryObject, names_dict["/EmbeddedFiles"])
        entries = cast(ArrayObject, embedded_files["/Names"])
    except KeyError:
        return {}

    attachments: Dict[str, Union[bytes, List[bytes]]] = {}
    entry_count = len(entries)
    for index, entry in enumerate(entries):
        if not isinstance(entry, str):
            continue
        if filename is not None and entry != filename:
            continue
        if index + 1 >= entry_count:
            # Malformed array: trailing name without a file specification
            break
        file_spec = entries[index + 1].get_object()
        try:
            data = file_spec["/EF"]["/F"].get_data()
        except KeyError:
            # File specification without an embedded stream: nothing to
            # return for it, but keep the other attachments retrievable.
            continue
        if entry not in attachments:
            attachments[entry] = data
            continue
        # Same name seen before: collect every version in a list
        previous = attachments[entry]
        if isinstance(previous, list):
            previous.append(data)
        else:
            attachments[entry] = [previous, data]
    return attachments
2205+
21332206

21342207
class PdfFileReader(PdfReader): # deprecated
21352208
def __init__(self, *args: Any, **kwargs: Any) -> None:

pypdf/_writer.py

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -660,7 +660,7 @@ def add_attachment(self, filename: str, data: Union[str, bytes]) -> None:
660660
# >>
661661

662662
ef_entry = DictionaryObject()
663-
ef_entry.update({NameObject("/F"): file_entry})
663+
ef_entry.update({NameObject("/F"): self._add_object(file_entry)})
664664

665665
filespec = DictionaryObject()
666666
filespec.update(
@@ -685,21 +685,25 @@ def add_attachment(self, filename: str, data: Union[str, bytes]) -> None:
685685
# >>
686686
# endobj
687687

688-
embedded_files_names_dictionary = DictionaryObject()
689-
embedded_files_names_dictionary.update(
690-
{
691-
NameObject(CA.NAMES): ArrayObject(
692-
[create_string_object(filename), filespec]
693-
)
694-
}
695-
)
696-
697-
embedded_files_dictionary = DictionaryObject()
698-
embedded_files_dictionary.update(
699-
{NameObject("/EmbeddedFiles"): embedded_files_names_dictionary}
688+
if CA.NAMES not in self._root_object:
689+
self._root_object[NameObject(CA.NAMES)] = self._add_object(
690+
DictionaryObject()
691+
)
692+
if "/EmbeddedFiles" not in cast(DictionaryObject, self._root_object[CA.NAMES]):
693+
embedded_files_names_dictionary = DictionaryObject(
694+
{NameObject(CA.NAMES): ArrayObject()}
695+
)
696+
cast(DictionaryObject, self._root_object[CA.NAMES])[
697+
NameObject("/EmbeddedFiles")
698+
] = self._add_object(embedded_files_names_dictionary)
699+
else:
700+
embedded_files_names_dictionary = cast(
701+
DictionaryObject,
702+
cast(DictionaryObject, self._root_object[CA.NAMES])["/EmbeddedFiles"],
703+
)
704+
cast(ArrayObject, embedded_files_names_dictionary[CA.NAMES]).extend(
705+
[create_string_object(filename), filespec]
700706
)
701-
# Update the root
702-
self._root_object.update({NameObject(CA.NAMES): embedded_files_dictionary})
703707

704708
def addAttachment(self, fname: str, fdata: Union[str, bytes]) -> None: # deprecated
705709
"""

tests/test_writer.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1194,6 +1194,34 @@ def test_iss1601():
11941194
)
11951195

11961196

1197+
def test_attachments():
    """
    Round-trip attachments through a writer and a reader.

    Checks that a document without attachments reports none, and that
    attachments added with ``add_attachment`` (including two stored under
    the same name) are listed and returned by the reader.
    """

    def reload(pdf_writer):
        # Serialize the writer into memory and parse the result again
        buffer = BytesIO()
        pdf_writer.write(buffer)
        buffer.seek(0)
        return PdfReader(buffer)

    writer = PdfWriter()
    writer.add_blank_page(100, 100)

    # No attachment added yet
    reader = reload(writer)
    assert reader._list_attachments() == []
    assert reader._get_attachments() == {}

    for name, content in (
        ("foobar.txt", b"foobarcontent"),
        ("foobar2.txt", b"foobarcontent2"),
        ("foobar2.txt", b"2nd_foobarcontent"),
    ):
        writer.add_attachment(name, content)

    reader = reload(writer)
    assert reader._list_attachments() == ["foobar.txt", "foobar2.txt", "foobar2.txt"]

    # All attachments: the duplicated name collapses into a single key
    att = reader._get_attachments()
    assert len(att) == 2
    assert att["foobar.txt"] == b"foobarcontent"

    # Selected attachment: both versions come back as a list
    att = reader._get_attachments("foobar2.txt")
    assert len(att) == 1
    assert att["foobar2.txt"] == [b"foobarcontent2", b"2nd_foobarcontent"]
1223+
1224+
11971225
@pytest.mark.external
11981226
def test_iss1614():
11991227
# test of an annotation(link) directly stored in the /Annots in the page

0 commit comments

Comments
 (0)