diff --git a/CHANGELOG.md b/CHANGELOG.md index c3fe78542e..2848f68ea5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ # CHANGELOG +## Version 3.8.1, 2023-04-23 + +### Bug Fixes (BUG) +- Convert color space before saving (#1802) + +### Documentation (DOC) +- PDF/A (#1807) +- Use append instead of add_page +- Document core mechanics of pypdf (#1783) + +[Full Changelog](https://github.com/py-pdf/pypdf/compare/3.8.0...3.8.1) + ## Version 3.8.0, 2023-04-16 ### New Features (ENH) diff --git a/docs/dev/pypdf-parsing.md b/docs/dev/pypdf-parsing.md new file mode 100644 index 0000000000..f85e79249f --- /dev/null +++ b/docs/dev/pypdf-parsing.md @@ -0,0 +1,32 @@ +# How pypdf parses PDF files + +pypdf uses {py:class}`PdfReader <pypdf.PdfReader>` to parse PDF files. +The method {py:meth}`PdfReader.read <pypdf.PdfReader.read>` shows the basic +structure of parsing: + +1. **Finding and reading the cross-reference tables / trailer**: The + cross-reference table (xref table) is a table of byte offsets that indicate + the locations of objects within the file. The trailer provides additional + information such as the root object (Catalog) and the Info object containing + metadata. +2. **Parsing the objects**: After locating the xref table and the trailer, pypdf + proceeds to parse the objects in the PDF. Objects in a PDF can be of various + types such as dictionaries, arrays, streams, and simple data types (e.g., + integers, strings). pypdf parses these objects and stores them in + {py:meth}`PdfReader.resolved_objects <pypdf.PdfReader.resolved_objects>` + via {py:meth}`cache_indirect_object <pypdf.PdfReader.cache_indirect_object>`. +3. **Decoding content streams**: The content of a PDF is typically stored in + content streams, which are sequences of PDF operators and operands. pypdf + decodes these content streams by applying filters (e.g., `FlateDecode`, + `LZWDecode`) specified in the stream's dictionary. This is only done when the + object is requested via {py:meth}`PdfReader.get_object + <pypdf.PdfReader.get_object>` in the + {py:meth}`PdfReader._get_object_from_stream + <pypdf.PdfReader._get_object_from_stream>` method. 
+ +## References + +[PDF 1.7 specification](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf): +* 7.5 File Structure +* 7.5.4 Cross-Reference Table +* 7.8 Content Streams and Resources diff --git a/docs/dev/pypdf-writing.md b/docs/dev/pypdf-writing.md new file mode 100644 index 0000000000..087a575afa --- /dev/null +++ b/docs/dev/pypdf-writing.md @@ -0,0 +1,68 @@ +# How pypdf writes PDF files + +pypdf uses {py:class}`PdfWriter <pypdf.PdfWriter>` to write PDF files. pypdf has +{py:class}`PdfObject <pypdf.generic.PdfObject>` and several subclasses with the +{py:meth}`write_to_stream <pypdf.generic.PdfObject.write_to_stream>` method. +The {py:meth}`PdfWriter.write <pypdf.PdfWriter.write>` method uses the +`write_to_stream` methods of the referenced objects. + +The {py:meth}`PdfWriter.write_stream <pypdf.PdfWriter.write_stream>` method +has the following core steps: + +1. `_sweep_indirect_references`: This step ensures that any circular references + to objects are correctly handled. It adds the object reference numbers of any + circularly referenced objects to an external reference map, so that + self-page-referencing trees can reference the correct new object location, + rather than copying in a new copy of the page object. +2. **Write the File Header and Body** with `_write_pdf_structure`: In this step, + the PDF header and objects are written to the output stream. This includes + the PDF version (e.g., %PDF-1.7) and the objects that make up the content of + the PDF, such as pages, annotations, and form fields. The locations (byte + offsets) of these objects are stored for later use in generating the xref + table. +3. **Write the Cross-Reference Table** with `_write_xref_table`: Using the stored + object locations, this step generates and writes the cross-reference table + (xref table) to the output stream. The cross-reference table contains the + byte offsets for each object in the PDF file, allowing for quick random + access to objects when reading the PDF. +4. **Write the File Trailer** with `_write_trailer`: The trailer is written to + the output stream in this step. 
The trailer contains essential information, + such as the number of objects in the PDF, the location of the root object + (Catalog), and the Info object containing metadata. The trailer also + specifies the location of the xref table. + + +## How others do it + +Looking at alternative software designs and implementations can help to improve +our choices. + +### fpdf2 + +[fpdf2](https://pypi.org/project/fpdf2/) has a [`PDFObject` class](https://github.com/PyFPDF/fpdf2/blob/master/fpdf/syntax.py) +with a `serialize` method which roughly maps to `pypdf.PdfObject.write_to_stream`. +Some other similarities include: + +* [fpdf.output.OutputProducer.bufferize](https://github.com/PyFPDF/fpdf2/blob/master/fpdf/output.py#L370-L485) vs {py:meth}`pypdf.PdfWriter.write_stream <pypdf.PdfWriter.write_stream>` +* [fpdf.syntax.Name](https://github.com/PyFPDF/fpdf2/blob/master/fpdf/syntax.py#L124) vs {py:class}`pypdf.generic.NameObject <pypdf.generic.NameObject>` +* [fpdf.syntax.build_obj_dict](https://github.com/PyFPDF/fpdf2/blob/master/fpdf/syntax.py#L222) vs {py:class}`pypdf.generic.DictionaryObject <pypdf.generic.DictionaryObject>` +* [fpdf.structure_tree.NumberTree](https://github.com/PyFPDF/fpdf2/blob/master/fpdf/structure_tree.py#L17) vs + {py:class}`pypdf.generic.TreeObject <pypdf.generic.TreeObject>` + + +### pdfrw + +[pdfrw](https://pypi.org/project/pdfrw/), in contrast, seems to work more with +the standard Python objects (bool, float, string) and not wrap them in custom +objects, if possible. 
It still has: + +* [PdfArray](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/objects/pdfarray.py#L13) +* [PdfDict](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/objects/pdfdict.py#L49) +* [PdfName](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/objects/pdfname.py#L65) +* [PdfString](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/objects/pdfstring.py#L322) +* [PdfIndirect](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/objects/pdfindirect.py#L10) + +The core classes of pdfrw are +[PdfReader](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/pdfreader.py#L26) +and +[PdfWriter](https://github.com/pmaupin/pdfrw/blob/master/pdfrw/pdfwriter.py#L224) diff --git a/docs/index.rst b/docs/index.rst index 6b0057b61b..c458cdc581 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -36,6 +36,7 @@ You can contribute to `pypdf on GitHub `_. user/streaming-data user/file-size user/pdf-version-support + user/pdfa-compliance .. toctree:: @@ -63,6 +64,8 @@ You can contribute to `pypdf on GitHub `_. dev/intro dev/pdf-format + dev/pypdf-parsing + dev/pypdf-writing dev/cmaps dev/deprecations dev/documentation diff --git a/docs/user/forms.md b/docs/user/forms.md index 4a61657aae..ff729d0596 100644 --- a/docs/user/forms.md +++ b/docs/user/forms.md @@ -21,7 +21,7 @@ writer = PdfWriter() page = reader.pages[0] fields = reader.get_fields() -writer.add_page(page) +writer.append(reader) writer.update_page_form_field_values( writer.pages[0], {"fieldname": "some filled in text"} diff --git a/docs/user/pdfa-compliance.md b/docs/user/pdfa-compliance.md new file mode 100644 index 0000000000..f7ae8d6b3b --- /dev/null +++ b/docs/user/pdfa-compliance.md @@ -0,0 +1,82 @@ +# PDF/A Compliance + +PDF/A is a specialized, ISO-standardized version of the Portable Document Format +(PDF) specifically designed for the long-term preservation and archiving of +electronic documents. 
It ensures that files remain accessible, readable, and +true to their original appearance by embedding all necessary fonts, images, and +metadata within the document itself. By adhering to strict guidelines and +minimizing dependencies on external resources or proprietary software, PDF/A +ensures the consistent and reliable reproduction of content, safeguarding it +against future technological changes and obsolescence. + +## PDF/A Versions + +* **PDF/A-1**: Based on PDF 1.4, PDF/A-1 is the first version of the standard + and is divided into two levels: PDF/A-1a (Level A, ensuring accessibility) and + PDF/A-1b (Level B, ensuring visual preservation). + * **Level B** (Basic): Ensures visual preservation and basic requirements for archiving. + * **Level A** (Accessible): Everything from level B, but includes additional + requirements for accessibility, such as tagging, Unicode character + mapping, and logical structure. +* **PDF/A-2**: Based on PDF 1.7 (ISO 32000-1), PDF/A-2 adds features and + improvements over PDF/A-1, while maintaining compatibility with PDF/A-1b + (Level B) documents. + * **Level B** (Basic): Like PDF/A-1b, but with support for PDF 1.7 features such + as transparency layers. + * **Level U** (Unicode): Ensures Unicode mapping without the full + accessibility requirements of PDF/A-1a (Level A). + * **Level A** (Accessible): Similar to PDF/A-1a. +* **PDF/A-3**: Based on PDF 1.7 (ISO 32000-1), PDF/A-3 is similar to PDF/A-2 but + allows the embedding of non-PDF/A files as attachments, enabling the archiving + of source or supplementary data alongside the PDF/A document. This is + interesting for invoices which can add XML files. +* **PDF/A-4**: Based on PDF 2.0 (ISO 32000-2), PDF/A-4 introduces new features + and improvements for better archiving and accessibility. The previous levels + are replaced by PDF/A-4f (ensuring visual preservation and allowing attachments) + and PDF/A-4e (Engineering, allows 3D content). 
+ +## PDF/A-1b + +In contrast to other PDF documents, PDF/A-1b documents must fulfill these +requirements: + +* **MarkInfo Object**: The MarkInfo object is a dictionary object within a PDF/A + file that provides information about the logical structure and tagging of the + document. The MarkInfo object indicates whether the document is tagged, + contains optional content, or has a structure tree that describes the logical + arrangement of content such as headings, paragraphs, lists, and tables. By + including the MarkInfo object, PDF/A ensures that electronic documents are + accessible to users with disabilities, such as those using screen readers or + other assistive technologies. +* **Embedded fonts**: All fonts used in the document must be embedded to ensure + consistent text rendering across different devices and systems. +* **Color Spaces**: DeviceRGB is a device-dependent color space that relies on + the specific characteristics of the output device, which can lead to + inconsistent color rendering across various devices. To achieve accurate and + consistent color representation, PDF/A requires the use of device-independent + color spaces, such as ICC-based color profiles. +* **XMP (Extensible Metadata Platform) metadata**: XMP metadata provides a + standardized and extensible way to store essential information about a + document and its properties. XMP metadata is an XML-based format embedded + directly within a PDF/A file. It contains various types of information, such + as document title, author, creation and modification dates, keywords, and + copyright information, as well as PDF/A-specific details like conformance + level and OutputIntent. + +## Validation + +[VeraPDF](https://docs.verapdf.org/install/) is the go-to PDF/A validator. 
+ +There are several online-validators which allow you to simply upload the document: + +* [pdfen.com](https://www.pdfen.com/pdf-a-validator) +* [avepdf.com](https://avepdf.com/pdfa-validation) : Gives an error report +* [pdfa.org](https://pdfa.org/pdfa-online-verification-service/) +* [visual-paradigm.com](https://online.visual-paradigm.com/de/online-pdf-editor/pdfa-validator/) - can convert the PDF to a PDF/A +* [pdf2go.com](https://www.pdf2go.com/validate-pdfa) +* [slub-dresden.de](https://www.slub-dresden.de/veroeffentlichen/dissertationen-habilitationen/elektronische-veroeffentlichung/slub-pdfa-validator) links to relevant parts in the specification. + +## pypdf and PDF/A + +At the moment, pypdf does not make any guarantees regarding PDF/A. +[Support is very welcome](https://github.com/py-pdf/pypdf/labels/is-pdf%2Fa-compliance). diff --git a/pypdf/_version.py b/pypdf/_version.py index 32a781905a..e4e78c0b9d 100644 --- a/pypdf/_version.py +++ b/pypdf/_version.py @@ -1 +1 @@ -__version__ = "3.8.0" +__version__ = "3.8.1" diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 5f31224bc8..8838f64456 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -163,8 +163,13 @@ def __init__( clone_from: Union[None, PdfReader, StrByteType, Path] = None, ) -> None: self._header = b"%PDF-1.3" - self._objects: List[PdfObject] = [] # array of indirect objects + + self._objects: List[PdfObject] = [] + """The indirect objects in the PDF.""" + self._idnum_hash: Dict[bytes, IndirectObject] = {} + """Maps hash values of indirect objects to their IndirectObject instances.""" + self._id_translated: Dict[int, Dict[int, int]] = {} # The root of our page tree node. 
@@ -198,6 +203,7 @@ def __init__( } ) self._root = self._add_object(self._root_object) + if clone_from is not None: if not isinstance(clone_from, PdfReader): clone_from = PdfReader(clone_from) @@ -1135,20 +1141,11 @@ def write_stream(self, stream: StreamType) -> None: if not self._root: self._root = self._add_object(self._root_object) - # PDF objects sometimes have circular references to their /Page objects - # inside their object tree (for example, annotations). Those will be - # indirect references to objects that we've recreated in this PDF. To - # address this problem, PageObject's store their original object - # reference number, and we add it to the external reference map before - # we sweep for indirect references. This forces self-page-referencing - # trees to reference the correct new object location, rather than - # copying in a new copy of the page object. self._sweep_indirect_references(self._root) object_positions = self._write_pdf_structure(stream) xref_location = self._write_xref_table(stream, object_positions) - self._write_trailer(stream) - stream.write(b_(f"\nstartxref\n{xref_location}\n%%EOF\n")) # eof + self._write_trailer(stream, xref_location) def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO]: """ @@ -1212,7 +1209,14 @@ def _write_xref_table(self, stream: StreamType, object_positions: List[int]) -> stream.write(b_(f"{offset:0>10} {0:0>5} n \n")) return xref_location - def _write_trailer(self, stream: StreamType) -> None: + def _write_trailer(self, stream: StreamType, xref_location: int) -> None: + """ + Write the PDF trailer to the stream. + + To quote the PDF specification: + [The] trailer [gives] the location of the cross-reference table and + of certain special objects within the body of the file. 
+ """ stream.write(b"trailer\n") trailer = DictionaryObject() trailer.update( @@ -1227,6 +1231,7 @@ def _write_trailer(self, stream: StreamType) -> None: if hasattr(self, "_encrypt"): trailer[NameObject(TK.ENCRYPT)] = self._encrypt trailer.write_to_stream(stream, None) + stream.write(b_(f"\nstartxref\n{xref_location}\n%%EOF\n")) # eof def add_metadata(self, infos: Dict[str, Any]) -> None: """ @@ -1265,6 +1270,21 @@ def _sweep_indirect_references( NullObject, ], ) -> None: + """ + Resolving any circular references to Page objects. + + Circular references to Page objects can arise when objects such as + annotations refer to their associated page. If these references are not + properly handled, the PDF file will contain multiple copies of the same + Page object. To address this problem, Page objects store their original + object reference number. This method adds the reference number of any + circularly referenced Page objects to an external reference map. This + ensures that self-referencing trees reference the correct new object + location, rather than copying in a new copy of the Page object. + + Args: + root: The root of the PDF object tree to sweep. + """ stack: Deque[ Tuple[ Any, @@ -1333,16 +1353,28 @@ def _sweep_indirect_references( def _resolve_indirect_object(self, data: IndirectObject) -> IndirectObject: """ - Resolves indirect object to this pdf indirect objects. + Resolves an indirect object to an indirect object in this PDF file. + + If the input indirect object already belongs to this PDF file, it is + returned directly. Otherwise, the object is retrieved from the input + object's PDF file using the object's ID number and generation number. If + the object cannot be found, a warning is logged and a `NullObject` is + returned. - If it is a new object then it is added to self._objects - and new idnum is given and generation is always 0. 
+ If the object is not already in this PDF file, it is added to the file's + list of objects and assigned a new ID number and generation number of 0. + The hash value of the object is then added to the `_idnum_hash` + dictionary, with the corresponding `IndirectObject` reference as the + value. Args: - data: + data: The `IndirectObject` to resolve. Returns: - The resolved indirect object + The resolved `IndirectObject` in this PDF file. + + Raises: + ValueError: If the input stream is closed. """ if hasattr(data.pdf, "stream") and data.pdf.stream.closed: raise ValueError(f"I/O operation on closed file: {data.pdf.stream.name}") diff --git a/pypdf/filters.py b/pypdf/filters.py index 086a8a2f53..dd0ba6ccd9 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -652,7 +652,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]: and x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB ): # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes - mode: Literal["RGB", "P"] = "RGB" + mode: Literal["RGB", "P", "L", "RGBA"] = "RGB" else: mode = "P" extension = None @@ -683,11 +683,29 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes]: else: img.putpalette(lookup.get_data()) img = img.convert("L" if base == ColorSpaces.DEVICE_GRAY else "RGB") + elif color_space is not None and color_space[0] == "/ICCBased": + # see Table 66 - Additional Entries Specific to an ICC Profile + # Stream Dictionary + icc_profile = color_space[1].get_object() + color_components = cast(int, icc_profile["/N"]) + alternate_colorspace = icc_profile["/Alternate"] + color_space = alternate_colorspace + mode_map = { + "/DeviceGray": "L", + "/DeviceRGB": "RGB", + "/DeviceCMYK": "RGBA", + } + mode = ( + mode_map.get(color_space) # type: ignore + or {1: "L", 3: "RGB", 4: "RGBA"}.get(color_components) + or mode + ) # type: ignore + img = Image.frombytes(mode, size, data) if G.S_MASK in x_object_obj: # add alpha channel alpha = 
Image.frombytes("L", size, x_object_obj[G.S_MASK].get_data()) img.putalpha(alpha) img_byte_arr = BytesIO() - img.save(img_byte_arr, format="PNG") + img.convert("RGBA").save(img_byte_arr, format="PNG") data = img_byte_arr.getvalue() elif x_object_obj[SA.FILTER] in ( [FT.LZW_DECODE], diff --git a/sample-files b/sample-files index d3d250321b..69b31cff6b 160000 --- a/sample-files +++ b/sample-files @@ -1 +1 @@ -Subproject commit d3d250321b01ca1194e16a814d24508897862fe8 +Subproject commit 69b31cff6b183a42f9081709ef9bee4047976f56 diff --git a/tests/test_filters.py b/tests/test_filters.py index 80bf4af0fe..57d2da179c 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -265,3 +265,28 @@ def test_issue_1737(): reader.pages[0]["/Resources"]["/XObject"]["/Im0"].get_data() reader.pages[0]["/Resources"]["/XObject"]["/Im1"].get_data() reader.pages[0]["/Resources"]["/XObject"]["/Im2"].get_data() + + +@pytest.mark.enable_socket() +def test_pa_image_extraction(): + """ + PNG images with PA mode can be extracted. 
+ + This is a regression test for issue #1801 + """ + url = "https://github.com/py-pdf/pypdf/files/11250359/test_img.pdf" + name = "issue-1801.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + + page0 = reader.pages[0] + images = page0.images + assert len(images) == 1 + assert images[0].name == "Im1.png" + + # Ensure visual appearence + data = get_pdf_from_url( + "https://user-images.githubusercontent.com/" + "1658117/232842886-9d1b0726-3a5b-430d-8464-595d919c266c.png", + "issue-1801.png", + ) + assert data == images[0].data diff --git a/tests/test_pdfa.py b/tests/test_pdfa.py new file mode 100644 index 0000000000..2ba1942477 --- /dev/null +++ b/tests/test_pdfa.py @@ -0,0 +1,57 @@ +"""Ensure that pypdf doesn't break PDF/A compliance.""" + +from io import BytesIO +from pathlib import Path +from typing import Optional + +import pytest + +from pypdf import PdfReader, PdfWriter + +TESTS_ROOT = Path(__file__).parent.resolve() +PROJECT_ROOT = TESTS_ROOT.parent +RESOURCE_ROOT = PROJECT_ROOT / "resources" +SAMPLE_ROOT = PROJECT_ROOT / "sample-files" + + +def is_pdfa1b_compliant(src: BytesIO): + """Check if a PDF is PDF/A-1b compliant.""" + + def document_information_has_analoguos_xml(src: BytesIO) -> bool: + reader = PdfReader(src) + meta = reader.metadata + xmp = reader.xmp_metadata + if not meta: + return True + if not xmp: + return False + if meta.title and not xmp.dc_title: + return meta.title == xmp.dc_title + return True + + return document_information_has_analoguos_xml(src) + + +@pytest.mark.parametrize( + ("src", "diagnostic_write_name"), + [ + (SAMPLE_ROOT / "021-pdfa/crazyones-pdfa.pdf", None), + ], +) +def test_pdfa(src: Path, diagnostic_write_name: Optional[str]): + with open(src, "rb") as fp: + data = BytesIO(fp.read()) + reader = PdfReader(src) + assert is_pdfa1b_compliant(data) + writer = PdfWriter() + writer.clone_document_from_reader(reader) + + stream = BytesIO() + writer.write(stream) + stream.seek(0) + + assert 
is_pdfa1b_compliant(stream) + if diagnostic_write_name: + with open(diagnostic_write_name, "wb") as fp: + stream.seek(0) + fp.write(stream.read())