diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml index 9c89fe9e02..8e63394677 100644 --- a/.github/workflows/github-ci.yaml +++ b/.github/workflows/github-ci.yaml @@ -69,9 +69,6 @@ jobs: - name: Install pypdf run: | pip install . - - name: Test with flake8 - run: | - flake8 . - name: Test with pytest run: | python -m coverage run --parallel-mode -m pytest tests -vv @@ -85,6 +82,39 @@ jobs: path: .coverage.* if-no-files-found: ignore + codestyle: + name: Check code style issues + runs-on: ubuntu-20.04 + steps: + - name: Checkout Code + uses: actions/checkout@v3 + with: + submodules: 'recursive' + - name: Cache Downloaded Files + id: cache-downloaded-files + uses: actions/cache@v3 + with: + path: '**/tests/pdf_cache/*' + key: cache-downloaded-files + - name: Setup Python 3.11 + uses: actions/setup-python@v4 + with: + python-version: "3.11" + cache: 'pip' + cache-dependency-path: '**/requirements/ci-3.11.txt' + - name: Upgrade pip + run: | + python -m pip install --upgrade pip + - name: Install requirements + run: | + pip install -r requirements/ci-3.11.txt + - name: Install pypdf + run: | + pip install . + - name: Test with ruff + run: | + ruff . 
+ package: name: Build & verify package runs-on: ubuntu-latest @@ -140,7 +170,7 @@ jobs: python -m coverage combine python -m coverage xml - name: Upload Coverage to Codecov - uses: codecov/codecov-action@v2 + uses: codecov/codecov-action@v3 with: token: ${{ secrets.CODECOV_TOKEN }} files: ./coverage.xml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 659b4f6fa4..9969d7f542 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,23 +10,19 @@ repos: - id: check-yaml - id: debug-statements - id: end-of-file-fixer - exclude: "resources/.*" + exclude: "resources/.*|docs/make.bat" - id: trailing-whitespace - id: mixed-line-ending args: ['--fix=lf'] + exclude: "docs/make.bat" - id: check-added-large-files args: ['--maxkb=1000'] -- repo: https://github.com/pycqa/flake8 - rev: 6.0.0 - hooks: - - id: flake8 - args: ["--ignore", "E,W,F"] # - repo: https://github.com/pre-commit/mirrors-mypy # rev: v0.942 # hooks: # - id: mypy - repo: https://github.com/pycqa/isort - rev: 5.10.1 + rev: 5.12.0 hooks: - id: isort name: isort (python) @@ -37,12 +33,22 @@ repos: - id: black args: [--target-version, py36] - repo: https://github.com/asottile/blacken-docs - rev: v1.12.1 + rev: 1.13.0 hooks: - id: blacken-docs additional_dependencies: [black==22.1.0] + exclude: "docs/user/robustness.md" +- repo: https://github.com/charliermarsh/ruff-pre-commit + rev: 'v0.0.237' + hooks: + - id: ruff - repo: https://github.com/asottile/pyupgrade rev: v3.3.1 hooks: - id: pyupgrade args: [--py36-plus] +- repo: https://github.com/pycqa/flake8 + rev: 6.0.0 + hooks: + - id: flake8 + args: ["--ignore", "E,W,F"] diff --git a/CHANGELOG.md b/CHANGELOG.md index 014decc31e..c017d827cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,35 @@ # CHANGELOG +## Version 3.4.0, 2023-02-05 + +NOTICE: pypdf changed the way it represents numbers parsed from PDF files. + pypdf<3.4.0 represented numbers as Decimal, pypdf>=3.4.0 represents them as + floats. 
Several other PDF libraries do this, as well as many PDF viewers. + We hope to fix issues with too high precision like this and get a speed boost. + In case your PDF documents rely on more than 18 decimals of precision you + should check if it still works as expected. + To clarify: This does not affect the text shown in PDF documents. It affects + numbers, e.g. when graphics are drawn on the PDF or very exact positions are + used. Typically, 5 decimals should be enough. + +### New Features (ENH) +- Enable merging forms with overlapping names (#1553) +- Add 'over' parameter to merge_transformed_page & co (#1567) + +### Bug Fixes (BUG) +- Fix getter of the PageObject.rotation property with an indirect object (#1602) +- Restore merge_transformed_page & co (#1567) +- Replace decimal by float (#1563) + +### Robustness (ROB) +- PdfWriter.remove_images: /Contents might not be in page_ref (#1598) + +### Developer Experience (DEV) +- Introduce ruff (#1586, #1609) + +### Maintenance (MAINT) +- Remove decimal (#1608) + +[Full Changelog](https://github.com/py-pdf/pypdf/compare/3.3.0...3.4.0) ## Version 3.3.0, 2023-01-22 diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 0af20cd2d8..38d6cb9fc5 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -23,6 +23,7 @@ history and [GitHubs 'Contributors' feature](https://github.com/py-pdf/pypdf/gra * [Majumder, Jonah](https://github.com/jonahmajumder) * [Manini, Lorenzo](https://github.com/lorenzomanini) * [maxbeer99](https://github.com/maxbeer99) +* [McNeil, Karen](https://github.com/karenlmcneil): Arabic Language Support * [Mérino, Antoine](https://github.com/Merinorus) * [Perrensen, Olsen](https://github.com/olsonperrensen) * [Pinheiro, Arthur](https://github.com/xilopaint) @@ -34,6 +35,7 @@ history and [GitHubs 'Contributors' feature](https://github.com/py-pdf/pypdf/gra * [Stüber, Timo](https://github.com/omit66) * [Thoma, Martin](https://github.com/MartinThoma): Maintainer of pypdf since April 2022. 
I hope to build a great community with many awesome contributors. [LinkedIn](https://www.linkedin.com/in/martin-thoma/) | [StackOverflow](https://stackoverflow.com/users/562769/martin-thoma) | [Blog](https://martin-thoma.com/) * [WevertonGomes](https://github.com/WevertonGomesCosta) +* [Wilson, Huon](https://github.com/huonw) * ztravis ## Adding a new contributor diff --git a/docs/conf.py b/docs/conf.py index 6f1ac1514e..ab960acdbf 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,8 +1,8 @@ """ Configuration file for the Sphinx documentation builder. -This file only contains a selection of the most common options. For a full -list see the documentation: +This file only contains a selection of the most common options. +For a full list see the documentation: https://www.sphinx-doc.org/en/master/usage/configuration.html """ # -- Path setup -------------------------------------------------------------- diff --git a/docs/dev/intro.md b/docs/dev/intro.md index 21b69de39d..131d708f32 100644 --- a/docs/dev/intro.md +++ b/docs/dev/intro.md @@ -80,4 +80,4 @@ The `PREFIX` can be: We need to keep an eye on performance and thus we have a few benchmarks. -See [py-pdf.github.io/pypdf/dev/bench](https://py-pdf.github.io/PyPDF2/dev/bench/) +See [py-pdf.github.io/pypdf/dev/bench](https://py-pdf.github.io/pypdf/dev/bench/) diff --git a/docs/meta/comparisons.md b/docs/meta/comparisons.md index e34ef0c672..9c7ddc2f58 100644 --- a/docs/meta/comparisons.md +++ b/docs/meta/comparisons.md @@ -27,7 +27,7 @@ PyPDF2 was merged back into `pypdf`. The development continues at `pypdf`. ## PyPDF3 and PyPDF4 Developing and maintaining open source software is extremely -time-intensive and in the case of PyPDF2 not paid at all. Having a +time-intensive and in the case of pypdf not paid at all. Having a continuous support is hard. 
pypdf was initially released in 2012 on PyPI and received releases diff --git a/docs/modules/PaperSize.rst b/docs/modules/PaperSize.rst index 0cbc36f402..8f9159b17a 100644 --- a/docs/modules/PaperSize.rst +++ b/docs/modules/PaperSize.rst @@ -11,7 +11,8 @@ Add blank page with PaperSize .. code-block:: python :linenos: - from PyPDF2 import PaperSize, PdfReader, PdfWriter + from pypdf import PaperSize, PdfReader, PdfWriter + pdf_reader = PdfReader("sample.pdf") pdf_writer = PdfWriter() pdf_writer.append_pages_from_reader(pdf_reader) @@ -23,12 +24,12 @@ Insert blank page with PaperSize ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code-block:: python :linenos: - - from PyPDF2 import PaperSize, PdfReader, PdfWriter + + from pypdf import PaperSize, PdfReader, PdfWriter + pdf_reader = PdfReader("sample.pdf") pdf_writer = PdfWriter() pdf_writer.append_pages_from_reader(pdf_reader) pdf_writer.insert_blank_page(PaperSize.A8.width, PaperSize.A8.height, 1) with open("output.pdf", "wb") as output_stream: pdf_writer.write(output_stream) - \ No newline at end of file diff --git a/docs/user/adding-pdf-annotations.md b/docs/user/adding-pdf-annotations.md index 620ea8704a..9db376375c 100644 --- a/docs/user/adding-pdf-annotations.md +++ b/docs/user/adding-pdf-annotations.md @@ -136,6 +136,7 @@ writer.add_page(page) # Add the rectangle annotation = AnnotationBuilder.ellipse( rect=(50, 550, 200, 650), +) writer.add_annotation(page_number=0, annotation=annotation) # Write the annotated file to disk diff --git a/docs/user/extract-text.md b/docs/user/extract-text.md index 0036de5500..70bfd002ed 100644 --- a/docs/user/extract-text.md +++ b/docs/user/extract-text.md @@ -52,7 +52,7 @@ page = reader.pages[3] parts = [] -def visitor_body(text, cm, tm, fontDict, fontSize): +def visitor_body(text, cm, tm, font_dict, font_size): y = tm[5] if y > 50 and y < 720: parts.append(text) @@ -104,6 +104,8 @@ Unfortunately in complicated PDF documents the coordinates given to the visitor- ## Why Text Extraction is 
hard +### Unclear Objective + Extracting text from a PDF can be pretty tricky. In several cases there is no clear answer what the expected result should look like: @@ -151,6 +153,22 @@ the way PDF stores information just makes it hard to achieve that: And finally there are issues that pypdf will deal with. If you find such a text extraction bug, please share the PDF with us so we can work on it! +### Missing Semantic Layer + +The PDF file format is all about producing the desired visual result for +printing. It was not created for parsing the content. PDF files don't contain a +semantic layer. + +Specifically, there is no information what the header, footer, page numbers, +tables, and paragraphs are. The visual appearance is there and people might +find heuristics to make educated guesses, but there is no way of being certain. + +This is a shortcoming of the PDF file format, not of pypdf. + +It would be possible to apply machine learning on PDF documents to make good +heuristics, but that will not be part of pypdf. However, pypdf could be used to +feed such a machine learning system with the relevant information. + ### Whitespaces The PDF format is meant for printing. It is not designed to be read by machines. @@ -210,7 +228,6 @@ Hence I would distinguish three types of PDF documents: in the background of the image. Hence you can copy the text, but it still looks like a scan. If you zoom in enough, you can recognize pixels. - ### Can we just always use OCR? You might now wonder if it makes sense to just always use OCR software. If the @@ -229,8 +246,6 @@ comes to characters which are easy to confuse such as `oO0ö`. pypdf also has an edge when it comes to characters which are rare, e.g. 🤰. OCR software will not be able to recognize smileys correctly. 
- - ## Attempts to prevent text extraction If people who share PDF documents want to prevent text extraction, there are diff --git a/docs/user/merging-pdfs.md b/docs/user/merging-pdfs.md index de08558424..4a7fb471bc 100644 --- a/docs/user/merging-pdfs.md +++ b/docs/user/merging-pdfs.md @@ -83,6 +83,21 @@ will insert the pages (1), (2), with page (0) before, in the middle and after ## add_page / insert_page It is recommended to use `append` or `merge` instead +## Merging forms +When merging forms, some form fields may have the same names, preventing access +to some data. + +A grouping field should be added before adding the source PDF to prevent that. +The original fields will be identified by adding the group name. + +For example, after calling `reader.add_form_topname("form1")`, the field +previously named "field1" will now be identified as "form1.field1" when calling +`reader.get_form_text_fields(True)` or `reader.get_fields()`. + +After that, you can append the input PDF completely or partially using +`writer.append` or `writer.merge`. If you insert a set of pages, only those +fields will be listed. + ## reset_translation During the cloning, if an object has been already cloned, it will not be cloned again, a pointer this previously cloned object is returned. because of that, if you add/merge a page that has diff --git a/make_changelog.py b/make_changelog.py index badd315d05..26f5a63c6c 100644 --- a/make_changelog.py +++ b/make_changelog.py @@ -16,7 +16,12 @@ class Change: def main(changelog_path: str): - """Create a changelog.""" + """ + Create a changelog. + + Args: + changelog_path: The location of the CHANGELOG file + """ changelog = get_changelog(changelog_path) git_tag = get_most_recent_git_tag() changes = get_formatted_changes(git_tag) @@ -39,27 +44,57 @@ def main(changelog_path: str): def version_bump(git_tag: str) -> str: - """Increase the patch version of the git tag by one.""" + """ + Increase the patch version of the git tag by one. 
+ + Args: + git_tag: Old version tag + + Returns: + The new version where the patch version is bumped. + """ # just assume a patch version change major, minor, patch = git_tag.split(".") return f"{major}.{minor}.{int(patch) + 1}" def get_changelog(changelog_path: str) -> str: - """Read the changelog.""" + """ + Read the changelog. + + Args: + changelog_path: Path to the CHANGELOG file + + Returns: + Data of the CHANGELOG + """ with open(changelog_path) as fh: changelog = fh.read() return changelog def write_changelog(new_changelog: str, changelog_path: str) -> None: - """Write the changelog.""" + """ + Write the changelog. + + Args: + new_changelog: Contents of the new CHANGELOG + changelog_path: Path where the CHANGELOG file is + """ with open(changelog_path, "w") as fh: fh.write(new_changelog) def get_formatted_changes(git_tag: str) -> str: - """Format the changes done since the last tag.""" + """ + Format the changes done since the last tag. + + Args: + git_tag: the reference tag + + Returns: + Changes done since git_tag + """ commits = get_git_commits_since_tag(git_tag) # Group by prefix @@ -104,8 +139,13 @@ def get_formatted_changes(git_tag: str) -> str: return output -def get_most_recent_git_tag(): - """Get the git tag most recently created.""" +def get_most_recent_git_tag() -> str: + """ + Get the git tag most recently created. + + Returns: + Most recently created git tag. + """ git_tag = str( subprocess.check_output( ["git", "describe", "--abbrev=0"], stderr=subprocess.STDOUT @@ -114,8 +154,17 @@ def get_most_recent_git_tag(): return git_tag -def get_git_commits_since_tag(git_tag) -> List[Change]: - """Get all commits since the last tag.""" +def get_git_commits_since_tag(git_tag: str) -> List[Change]: + """ + Get all commits since the last tag. + + Args: + git_tag: Reference tag from which the changes to the current commit are + fetched. + + Returns: + List of all changes since git_tag. 
+ """ commits = str( subprocess.check_output( [ @@ -131,8 +180,19 @@ def get_git_commits_since_tag(git_tag) -> List[Change]: return [parse_commit_line(line) for line in commits.split("\\n")] -def parse_commit_line(line) -> Change: - """Parse the first line of a git commit message.""" +def parse_commit_line(line: str) -> Change: + """ + Parse the first line of a git commit message. + + Args: + line: The first line of a git commit message. + + Returns: + The parsed Change object + + Raises: + ValueError: The commit line is not well-structured + """ if "\\t" not in line: raise ValueError(f"Invalid commit line: {line}") commit_hash, rest = line.split("\\t", 1) diff --git a/mutmut_config.py b/mutmut_config.py index 1c6e7c039d..56d149ac75 100644 --- a/mutmut_config.py +++ b/mutmut_config.py @@ -4,9 +4,16 @@ See https://mutmut.readthedocs.io/en/latest/ """ +from mutmut import Context -def pre_mutation(context): - """Filter what to mutate.""" + +def pre_mutation(context: Context) -> None: + """ + Filter what to mutate. + + Args: + context: A mutmut Context object + """ line = context.current_source_line.strip() if ( "_codecs" in context.filename diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 9890526ba0..f74faa19d2 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -12,14 +12,19 @@ # code freely inspired from @twiggy ; see #711 def build_char_map( font_name: str, space_width: float, obj: DictionaryObject -) -> Tuple[ - str, float, Union[str, Dict[int, str]], Dict, DictionaryObject -]: # font_type,space_width /2, encoding, cmap - """Determine information about a font. +) -> Tuple[str, float, Union[str, Dict[int, str]], Dict, DictionaryObject]: + """ + Determine information about a font. - This function returns a tuple consisting of: - font sub-type, space_width/2, encoding, map character-map, font-dictionary. 
- The font-dictionary itself is suitable for the curious.""" + Args: + font_name: + space_width: + obj: + + Returns: + Font sub-type, space_width/2, encoding, map character-map, font-dictionary. + The font-dictionary itself is suitable for the curious. + """ ft: DictionaryObject = obj["/Resources"]["/Font"][font_name] # type: ignore font_type: str = cast(str, ft["/Subtype"]) @@ -135,8 +140,7 @@ def parse_encoding( enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore if isinstance(enc, str): try: - # allready done : - # enc = NameObject.unnumber(enc.encode()).decode() + # allready done : enc = NameObject.unnumber(enc.encode()).decode() # for #xx decoding if enc in charset_encoding: encoding = charset_encoding[enc].copy() @@ -340,8 +344,8 @@ def parse_bfrange( return None if closure_found else (a, b) -def parse_bfchar(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None: - lst = [x for x in l.split(b" ") if x] +def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None: + lst = [x for x in line.split(b" ") if x] map_dict[-1] = len(lst[0]) // 2 while len(lst) > 1: map_to = "" diff --git a/pypdf/_encryption.py b/pypdf/_encryption.py index 3d331fdab0..eda6150434 100644 --- a/pypdf/_encryption.py +++ b/pypdf/_encryption.py @@ -290,17 +290,17 @@ def compute_key( value of the encryption dictionary’s Length entry. Args: - password: The encryption secret as a bytes-string - rev: The encryption revision (see PDF standard) - key_size: The size of the key in bytes - o_entry: The owner entry - P: A set of flags specifying which operations shall be permitted - when the document is opened with user access. If bit 2 is set to 1, - all other bits are ignored and all operations are permitted. - If bit 2 is set to 0, permission for operations are based on the - values of the remaining flags defined in Table 24. - id1_entry: - metadata_encrypted: A boolean indicating if the metadata is encrypted. 
+ password: The encryption secret as a bytes-string + rev: The encryption revision (see PDF standard) + key_size: The size of the key in bytes + o_entry: The owner entry + P: A set of flags specifying which operations shall be permitted + when the document is opened with user access. If bit 2 is set to 1, + all other bits are ignored and all operations are permitted. + If bit 2 is set to 0, permission for operations are based on the + values of the remaining flags defined in Table 24. + id1_entry: + metadata_encrypted: A boolean indicating if the metadata is encrypted. Returns: The u_hash digest of length key_size @@ -353,12 +353,12 @@ def compute_O_value_key(owner_password: bytes, rev: int, key_size: int) -> bytes the value of the O entry in the encryption dictionary. Args: - owner_password: - rev: The encryption revision (see PDF standard) - key_size: The size of the key in bytes + owner_password: + rev: The encryption revision (see PDF standard) + key_size: The size of the key in bytes Returns: - The RC4 key + The RC4 key """ a = _padding(owner_password) o_hash_digest = hashlib.md5(a).digest() @@ -376,12 +376,12 @@ def compute_O_value(rc4_key: bytes, user_password: bytes, rev: int) -> bytes: See :func:`compute_O_value_key`. Args: - rc4_key: - user_password: - rev: The encryption revision (see PDF standard) + rc4_key: + user_password: + rev: The encryption revision (see PDF standard) Returns: - The RC4 encrypted + The RC4 encrypted """ a = _padding(user_password) rc4_enc = RC4_encrypt(rc4_key, a) @@ -407,12 +407,12 @@ def compute_U_value(key: bytes, rev: int, id1_entry: bytes) -> bytes: encryption dictionary. Args: - key: - rev: The encryption revision (see PDF standard) - id1_entry: + key: + rev: The encryption revision (see PDF standard) + id1_entry: Returns: - The value + The value """ if rev <= 2: value = RC4_encrypt(key, _PADDING) @@ -482,21 +482,21 @@ def verify_user_password( to decrypt the document. 
Args: - user_password: The user passwort as a bytes stream - rev: The encryption revision (see PDF standard) - key_size: The size of the key in bytes - o_entry: The owner entry - u_entry: The user entry - P: A set of flags specifying which operations shall be permitted - when the document is opened with user access. If bit 2 is set to 1, - all other bits are ignored and all operations are permitted. - If bit 2 is set to 0, permission for operations are based on the - values of the remaining flags defined in Table 24. - id1_entry: - metadata_encrypted: A boolean indicating if the metadata is encrypted. + user_password: The user password as a bytes stream + rev: The encryption revision (see PDF standard) + key_size: The size of the key in bytes + o_entry: The owner entry + u_entry: The user entry + P: A set of flags specifying which operations shall be permitted + when the document is opened with user access. If bit 2 is set to 1, + all other bits are ignored and all operations are permitted. + If bit 2 is set to 0, permission for operations are based on the + values of the remaining flags defined in Table 24. + id1_entry: + metadata_encrypted: A boolean indicating if the metadata is encrypted. Returns: - The key + The key """ key = AlgV4.compute_key( user_password, rev, key_size, o_entry, P, id1_entry, metadata_encrypted @@ -544,21 +544,21 @@ def verify_owner_password( If it is correct, the password supplied is the correct owner password. Args: - owner_password: - rev: The encryption revision (see PDF standard) - key_size: The size of the key in bytes - o_entry: The owner entry - u_entry: The user entry - P: A set of flags specifying which operations shall be permitted - when the document is opened with user access. If bit 2 is set to 1, - all other bits are ignored and all operations are permitted. - If bit 2 is set to 0, permission for operations are based on the - values of the remaining flags defined in Table 24. 
- id1_entry: - metadata_encrypted: A boolean indicating if the metadata is encrypted. + owner_password: + rev: The encryption revision (see PDF standard) + key_size: The size of the key in bytes + o_entry: The owner entry + u_entry: The user entry + P: A set of flags specifying which operations shall be permitted + when the document is opened with user access. If bit 2 is set to 1, + all other bits are ignored and all operations are permitted. + If bit 2 is set to 0, permission for operations are based on the + values of the remaining flags defined in Table 24. + id1_entry: + metadata_encrypted: A boolean indicating if the metadata is encrypted. Returns: - bytes + bytes """ rc4_key = AlgV4.compute_O_value_key(owner_password, rev, key_size) @@ -629,19 +629,19 @@ def verify_owner_password( They should match the value in the P key. Args: - R: A number specifying which revision of the standard security - handler shall be used to interpret this dictionary - password: The owner password - o_value: A 32-byte string, based on both the owner and user passwords, - that shall be used in computing the encryption key and in determining - whether a valid owner password was entered - oe_value: - u_value: A 32-byte string, based on the user password, that shall be - used in determining whether to prompt the user for a password and, if so, - whether a valid user or owner password was entered. + R: A number specifying which revision of the standard security + handler shall be used to interpret this dictionary + password: The owner password + o_value: A 32-byte string, based on both the owner and user passwords, + that shall be used in computing the encryption key and in + determining whether a valid owner password was entered + oe_value: + u_value: A 32-byte string, based on the user password, that shall be + used in determining whether to prompt the user for a password and, + if so, whether a valid user or owner password was entered. 
Returns: - The key + The key """ password = password[:127] if ( @@ -662,16 +662,16 @@ def verify_user_password( See :func:`verify_owner_password`. Args: - R: A number specifying which revision of the standard security - handler shall be used to interpret this dictionary - password: The user password - u_value: A 32-byte string, based on the user password, that shall be - used in determining whether to prompt the user for a password and, if so, - whether a valid user or owner password was entered. - ue_value: + R: A number specifying which revision of the standard security + handler shall be used to interpret this dictionary + password: The user password + u_value: A 32-byte string, based on the user password, that shall be + used in determining whether to prompt the user for a password + and, if so, whether a valid user or owner password was entered. + ue_value: Returns: - bytes + bytes """ password = password[:127] if AlgV5.calculate_hash(R, password, u_value[32:40], b"") != u_value[:32]: @@ -709,17 +709,18 @@ def verify_perms( See :func:`verify_owner_password` and :func:`compute_perms_value`. Args: - key: The owner password - perms: - p: A set of flags specifying which operations shall be permitted - when the document is opened with user access. If bit 2 is set to 1, all other - bits are ignored and all operations are permitted. If bit 2 is set to 0, - permission for operations are based on the values of the remaining flags - defined in Table 24. - metadata_encrypted: + key: The owner password + perms: + p: A set of flags specifying which operations shall be permitted + when the document is opened with user access. + If bit 2 is set to 1, all other bits are ignored and all + operations are permitted. + If bit 2 is set to 0, permission for operations are based on + the values of the remaining flags defined in Table 24. 
+ metadata_encrypted: Returns: - A boolean + A boolean """ b8 = b"T" if metadata_encrypted else b"F" p1 = struct.pack(" Tuple[bytes, bytes]: """ Algorithm 3.8 Computing the encryption dictionary’s U (user password) - and UE (user encryption key) values + and UE (user encryption key) values. 1. Generate 16 random bytes of data using a strong random number generator. The first 8 bytes are the User Validation Salt. The second 8 bytes @@ -764,11 +765,11 @@ def compute_U_value(password: bytes, key: bytes) -> Tuple[bytes, bytes]: as the UE key. Args: - password: - key: + password: + key: Returns: - A tuple (u-value, ue value) + A tuple (u-value, ue value) """ random_bytes = bytes(random.randrange(0, 256) for _ in range(16)) val_salt = random_bytes[:8] @@ -804,14 +805,14 @@ def compute_O_value( The resulting 32-byte string is stored as the OE key. Args: - password: - key: - u_value: A 32-byte string, based on the user password, that shall be - used in determining whether to prompt the user for a password and, - if so, whether a valid user or owner password was entered. + password: + key: + u_value: A 32-byte string, based on the user password, that shall be + used in determining whether to prompt the user for a password + and, if so, whether a valid user or owner password was entered. Returns: - A tuple (O value, OE value) + A tuple (O value, OE value) """ random_bytes = bytes(random.randrange(0, 256) for _ in range(16)) val_salt = random_bytes[:8] @@ -828,7 +829,8 @@ def compute_O_value( @staticmethod def compute_Perms_value(key: bytes, p: int, metadata_encrypted: bool) -> bytes: """ - Algorithm 3.10 Computing the encryption dictionary’s Perms (permissions) value + Algorithm 3.10 Computing the encryption dictionary’s Perms + (permissions) value. 1. Extend the permissions (contents of the P integer) to 64 bits by setting the upper 32 bits to all 1’s. 
@@ -845,16 +847,16 @@ def compute_Perms_value(key: bytes, p: int, metadata_encrypted: bool) -> bytes: for validity when the file is opened. Args: - key: - p: A set of flags specifying which operations shall be permitted - when the document is opened with user access. If bit 2 is set to 1, - all other bits are ignored and all operations are permitted. - If bit 2 is set to 0, permission for operations are based on the - values of the remaining flags defined in Table 24. - metadata_encrypted: A boolean indicating if the metadata is encrypted. + key: + p: A set of flags specifying which operations shall be permitted + when the document is opened with user access. If bit 2 is set to 1, + all other bits are ignored and all operations are permitted. + If bit 2 is set to 0, permission for operations are based on the + values of the remaining flags defined in Table 24. + metadata_encrypted: A boolean indicating if the metadata is encrypted. Returns: - The perms value + The perms value """ b8 = b"T" if metadata_encrypted else b"F" rr = bytes(random.randrange(0, 256) for _ in range(4)) @@ -939,12 +941,12 @@ def decrypt_object(self, obj: PdfObject, idnum: int, generation: int) -> PdfObje The output is the encrypted data to be stored in the PDF file. Args: - obj: - idnum: - generation: + obj: + idnum: + generation: Returns: - The PdfObject + The PdfObject """ pack1 = struct.pack("` for usage information. Args: - strict: Determines whether user should be warned of all - problems and also causes some correctable problems to be fatal. - Defaults to ``False``. - fileobj: Output file. Can be a filename or any kind of - file-like object. + strict: Determines whether user should be warned of all + problems and also causes some correctable problems to be fatal. + Defaults to ``False``. + fileobj: Output file. Can be a filename or any kind of + file-like object. """ @deprecation_bookmark(bookmarks="outline") @@ -146,23 +146,23 @@ def merge( specified page number. 
Args: - page_number: The *page number* to insert this file. File will - be inserted after the given number. - fileobj: A File Object or an object that supports the standard - read and seek methods similar to a File Object. Could also be a - string representing a path to a PDF file. - None as an argument is deprecated. - outline_item: Optionally, you may specify an outline item - (previously referred to as a 'bookmark') to be applied at the - beginning of the included file by supplying the text of the outline item. - pages: can be a :class:`PageRange` - or a ``(start, stop[, step])`` tuple - to merge only the specified range of pages from the source - document into the output document. - Can also be a list of pages to merge. - import_outline: You may prevent the source document's - outline (collection of outline items, previously referred to as - 'bookmarks') from being imported by specifying this as ``False``. + page_number: The *page number* to insert this file. File will + be inserted after the given number. + fileobj: A File Object or an object that supports the standard + read and seek methods similar to a File Object. Could also be a + string representing a path to a PDF file. + None as an argument is deprecated. + outline_item: Optionally, you may specify an outline item + (previously referred to as a 'bookmark') to be applied at the + beginning of the included file by supplying the text of the outline item. + pages: can be a :class:`PageRange` + or a ``(start, stop[, step])`` tuple + to merge only the specified range of pages from the source + document into the output document. + Can also be a list of pages to merge. + import_outline: You may prevent the source document's + outline (collection of outline items, previously referred to as + 'bookmarks') from being imported by specifying this as ``False``. """ if position is not None: # deprecated if page_number is None: @@ -299,20 +299,20 @@ def append( position. 
Args: - fileobj: A File Object or an object that supports the standard - read and seek methods similar to a File Object. Could also be a - string representing a path to a PDF file. - outline_item: Optionally, you may specify an outline item - (previously referred to as a 'bookmark') to be applied at the - beginning of the included file by supplying the text of the outline item. - pages: can be a :class:`PageRange` - or a ``(start, stop[, step])`` tuple - to merge only the specified range of pages from the source - document into the output document. - Can also be a list of pages to append. - import_outline: You may prevent the source document's - outline (collection of outline items, previously referred to as - 'bookmarks') from being imported by specifying this as ``False``. + fileobj: A File Object or an object that supports the standard + read and seek methods similar to a File Object. Could also be a + string representing a path to a PDF file. + outline_item: Optionally, you may specify an outline item + (previously referred to as a 'bookmark') to be applied at the + beginning of the included file by supplying the text of the outline item. + pages: can be a :class:`PageRange` + or a ``(start, stop[, step])`` tuple + to merge only the specified range of pages from the source + document into the output document. + Can also be a list of pages to append. + import_outline: You may prevent the source document's + outline (collection of outline items, previously referred to as + 'bookmarks') from being imported by specifying this as ``False``. """ self.merge(len(self.pages), fileobj, outline_item, pages, import_outline) @@ -321,8 +321,8 @@ def write(self, fileobj: Union[Path, StrByteType]) -> None: Write all data that has been merged to the given output file. Args: - fileobj: Output file. Can be a filename or any kind of - file-like object. + fileobj: Output file. Can be a filename or any kind of + file-like object. 
""" if self.output is None: raise RuntimeError(ERR_CLOSED_WRITER) @@ -336,9 +336,6 @@ def write(self, fileobj: Union[Path, StrByteType]) -> None: page.out_pagedata = self.output.get_reference( pages_obj[PA.KIDS][-1].get_object() ) - # key_temp = self.output._pages.get_object()[PA.KIDS][-1].get_object() - # idnum = self.output._objects.index(key_temp) + 1 - # page.out_pagedata = IndirectObject(idnum, 0, self.output) # Once all pages are added, create outline items to point at those pages self._write_dests() @@ -364,9 +361,9 @@ def add_metadata(self, infos: Dict[str, Any]) -> None: Add custom metadata to the output. Args: - infos: a Python dictionary where each key is a field - and each value is your new metadata. - An example is ``{'/Title': 'My title'}`` + infos: a Python dictionary where each key is a field + and each value is your new metadata. + An example is ``{'/Title': 'My title'}`` """ if self.output is None: raise RuntimeError(ERR_CLOSED_WRITER) @@ -374,18 +371,18 @@ def add_metadata(self, infos: Dict[str, Any]) -> None: def addMetadata(self, infos: Dict[str, Any]) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`add_metadata` instead. - Use :meth:`add_metadata` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("addMetadata", "add_metadata") self.add_metadata(infos) def setPageLayout(self, layout: LayoutType) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`set_page_layout` instead. - Use :meth:`set_page_layout` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("setPageLayout", "set_page_layout") self.set_page_layout(layout) @@ -395,7 +392,7 @@ def set_page_layout(self, layout: LayoutType) -> None: Set the page layout. Args: - layout: The page layout to be used + layout: The page layout to be used .. 
list-table:: Valid ``layout`` arguments :widths: 50 200 @@ -421,9 +418,9 @@ def set_page_layout(self, layout: LayoutType) -> None: def setPageMode(self, mode: PagemodeType) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`set_page_mode` instead. - Use :meth:`set_page_mode` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("setPageMode", "set_page_mode", "3.0.0") self.set_page_mode(mode) @@ -433,7 +430,7 @@ def set_page_mode(self, mode: PagemodeType) -> None: Set the page mode. Args: - mode: The page mode to use. + mode: The page mode to use. .. list-table:: Valid ``mode`` arguments :widths: 50 200 @@ -490,12 +487,12 @@ def _trim_outline( Remove outline item entries that are not a part of the specified page set. Args: - pdf: - outline: - pages: + pdf: + outline: + pages: Returns: - An outline type + An outline type """ new_outline = [] prev_header_added = True @@ -693,15 +690,15 @@ def add_outline_item( Add an outline item (commonly referred to as a "Bookmark") to this PDF file. Args: - title: Title to use for this outline item. - page_number: Page number this outline item will point to. - parent: A reference to a parent outline item to create nested - outline items. - color: Color of the outline item's font as a red, green, blue tuple - from 0.0 to 1.0 - bold: Outline item font is bold - italic: Outline item font is italic - fit: The fit of the destination page. + title: Title to use for this outline item. + page_number: Page number this outline item will point to. + parent: A reference to a parent outline item to create nested + outline items. + color: Color of the outline item's font as a red, green, blue tuple + from 0.0 to 1.0 + bold: Outline item font is bold + italic: Outline item font is italic + fit: The fit of the destination page. """ if page_number is not None and pagenum is not None: raise ValueError( @@ -807,8 +804,8 @@ def add_named_destination( Add a destination to the output. 
Args: - title: Title to use - page_number: Page number this destination points at. + title: Title to use + page_number: Page number this destination points at. """ if page_number is not None and pagenum is not None: raise ValueError( diff --git a/pypdf/_page.py b/pypdf/_page.py index 0eb268da26..32de402995 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -45,7 +45,7 @@ ) from ._cmap import build_char_map, unknown_char_map -from ._protocols import PdfReaderProtocol +from ._protocols import PdfReaderProtocol, PdfWriterProtocol from ._utils import ( CompressedTransformationMatrix, File, @@ -221,7 +221,8 @@ def __init__(self, ctm: CompressedTransformationMatrix = (1, 0, 0, 1, 0, 0)): def matrix(self) -> TransformationMatrixType: """ Return the transformation matrix as a tuple of tuples in the form: - ((a, b, 0), (c, d, 0), (e, f, 1)) + + ((a, b, 0), (c, d, 0), (e, f, 1)) """ return ( (self.ctm[0], self.ctm[1], 0), @@ -314,7 +315,9 @@ def __repr__(self) -> str: return f"Transformation(ctm={self.ctm})" def apply_on( - self, pt: Union[Tuple[Decimal, Decimal], Tuple[float, float], List[float]] + self, + pt: Union[Tuple[float, float], List[float]], + as_object: bool = False, ) -> Union[Tuple[float, float], List[float]]: """ Apply the transformation matrix on the given point. 
@@ -325,9 +328,10 @@ def apply_on( Returns: A tuple or list representing the transformed point in the form (x', y') """ + typ = FloatObject if as_object else float pt1 = ( - float(pt[0]) * self.ctm[0] + float(pt[1]) * self.ctm[2] + self.ctm[4], - float(pt[0]) * self.ctm[1] + float(pt[1]) * self.ctm[3] + self.ctm[5], + typ(float(pt[0]) * self.ctm[0] + float(pt[1]) * self.ctm[2] + self.ctm[4]), + typ(float(pt[0]) * self.ctm[1] + float(pt[1]) * self.ctm[3] + self.ctm[5]), ) return list(pt1) if isinstance(pt, list) else pt1 @@ -352,13 +356,13 @@ class PageObject(DictionaryObject): def __init__( self, - pdf: Optional[PdfReaderProtocol] = None, + pdf: Union[None, PdfReaderProtocol, PdfWriterProtocol] = None, indirect_reference: Optional[IndirectObject] = None, indirect_ref: Optional[IndirectObject] = None, # deprecated ) -> None: DictionaryObject.__init__(self) - self.pdf: Optional[PdfReaderProtocol] = pdf + self.pdf: Union[None, PdfReaderProtocol, PdfWriterProtocol] = pdf if indirect_ref is not None: # deprecated warnings.warn( ( @@ -397,15 +401,15 @@ def user_unit(self) -> float: """ A read-only positive number giving the size of user space units. - It is in multiples of 1/72 inch. Hence a value of 1 means a user space - unit is 1/72 inch, and a value of 3 means that a user space unit is - 3/72 inch. + It is in multiples of 1/72 inch. Hence a value of 1 means a user + space unit is 1/72 inch, and a value of 3 means that a user + space unit is 3/72 inch. 
""" return self.get(PG.USER_UNIT, 1) @staticmethod def create_blank_page( - pdf: Optional[Any] = None, # PdfReader + pdf: Union[None, PdfReaderProtocol, PdfWriterProtocol] = None, width: Union[float, Decimal, None] = None, height: Union[float, Decimal, None] = None, ) -> "PageObject": @@ -450,14 +454,14 @@ def create_blank_page( @staticmethod def createBlankPage( - pdf: Optional[Any] = None, # PdfReader + pdf: Optional[PdfReaderProtocol] = None, width: Union[float, Decimal, None] = None, height: Union[float, Decimal, None] = None, ) -> "PageObject": # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`create_blank_page` instead. - Use :meth:`create_blank_page` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("createBlankPage", "create_blank_page", "3.0.0") return PageObject.create_blank_page(pdf, width, height) @@ -491,10 +495,10 @@ def rotation(self) -> int: The VISUAL rotation of the page. This number has to be a multiple of 90 degrees: 0, 90, 180, or 270 are - valid values. - This property does not affect ``/Contents``. + valid values. This property does not affect ``/Contents``. """ - return int(self.get(PG.ROTATE, 0)) + rotate_obj = self.get(PG.ROTATE, 0) + return rotate_obj if isinstance(rotate_obj, int) else rotate_obj.get_object() @rotation.setter def rotation(self, r: Union[int, float]) -> None: @@ -502,7 +506,8 @@ def rotation(self, r: Union[int, float]) -> None: def transfer_rotation_to_content(self) -> None: """ - Apply the rotation of the page to the content and the media/crop/... boxes. + Apply the rotation of the page to the content and the media/crop/... + boxes. It's recommended to apply this function before page merging. """ @@ -540,14 +545,13 @@ def rotate(self, angle: int) -> "PageObject": Args: angle: Angle to rotate the page. Must be an increment of 90 deg. 
+ + Returns: + The rotated PageObject """ if angle % 90 != 0: raise ValueError("Rotation angle must be a multiple of 90") - rotate_obj = self.get(PG.ROTATE, 0) - current_angle = ( - rotate_obj if isinstance(rotate_obj, int) else rotate_obj.get_object() - ) - self[NameObject(PG.ROTATE)] = NumberObject(current_angle + angle) + self[NameObject(PG.ROTATE)] = NumberObject(self.rotation + angle) return self def rotate_clockwise(self, angle: int) -> "PageObject": # deprecated @@ -556,32 +560,52 @@ def rotate_clockwise(self, angle: int) -> "PageObject": # deprecated def rotateClockwise(self, angle: int) -> "PageObject": # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`rotate_clockwise` instead. - Use :meth:`rotate_clockwise` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("rotateClockwise", "rotate", "3.0.0") return self.rotate(angle) def rotateCounterClockwise(self, angle: int) -> "PageObject": # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`rotate_clockwise` with a negative argument instead. - Use :meth:`rotate_clockwise` with a negative argument instead. + .. 
deprecated:: 1.28.0 """ deprecation_with_replacement("rotateCounterClockwise", "rotate", "3.0.0") return self.rotate(-angle) - @staticmethod def _merge_resources( - res1: DictionaryObject, res2: DictionaryObject, resource: Any + self, + res1: DictionaryObject, + res2: DictionaryObject, + resource: Any, + new_res1: bool = True, ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - new_res = DictionaryObject() - new_res.update(res1.get(resource, DictionaryObject()).get_object()) + try: + assert isinstance(self.indirect_reference, IndirectObject) + pdf = self.indirect_reference.pdf + is_pdf_writer = hasattr( + pdf, "_add_object" + ) # ---------- expect isinstance(pdf,PdfWriter) + except (AssertionError, AttributeError): + pdf = None + is_pdf_writer = False def compute_unique_key(base_key: str) -> Tuple[str, bool]: - """Find a key that either doesn't already exist or has the same - value (indicated by the bool)""" + """ + Find a key that either doesn't already exist or has the same value + (indicated by the bool) + + Args: + base_key: An index is added to this to get the computed key + + Returns: + A tuple (computed key, bool) where the boolean indicates + if there is a resource of the given computed_key with the same + value. + """ value = page2res.raw_get(base_key) # try the current key first (e.g. "foo"), but otherwise iterate # through "foo-0", "foo-1", etc. 
new_res can contain only finitely @@ -598,11 +622,16 @@ def compute_unique_key(base_key: str) -> Tuple[str, bool]: idx += 1 return computed_key, False + if new_res1: + new_res = DictionaryObject() + new_res.update(res1.get(resource, DictionaryObject()).get_object()) + else: + new_res = cast(DictionaryObject, res1[resource]) page2res = cast( DictionaryObject, res2.get(resource, DictionaryObject()).get_object() ) rename_res = {} - for key in sorted(page2res.keys()): + for key in page2res.keys(): unique_key, same_value = compute_unique_key(key) newname = NameObject(unique_key) if key != unique_key: @@ -610,14 +639,25 @@ def compute_unique_key(base_key: str) -> Tuple[str, bool]: rename_res[key] = newname if not same_value: - # the value wasn't already recorded - new_res[newname] = page2res[key] - + if is_pdf_writer: + new_res[newname] = page2res.raw_get(key).clone(pdf) + try: + new_res[newname] = new_res[key].indirect_reference + except AttributeError: + pass + else: + new_res[newname] = page2res.raw_get(key) + lst = sorted(new_res.items()) + new_res.clear() + for el in lst: + new_res[el[0]] = el[1] return new_res, rename_res @staticmethod def _content_stream_rename( - stream: ContentStream, rename: Dict[Any, Any], pdf: Any # PdfReader + stream: ContentStream, + rename: Dict[Any, Any], + pdf: Union[None, PdfReaderProtocol, PdfWriterProtocol], ) -> ContentStream: if not rename: return stream @@ -638,7 +678,9 @@ def _content_stream_rename( return stream @staticmethod - def _push_pop_gs(contents: Any, pdf: Any) -> ContentStream: # PdfReader + def _push_pop_gs( + contents: Any, pdf: Union[None, PdfReaderProtocol, PdfWriterProtocol] + ) -> ContentStream: # adds a graphics state "push" and "pop" to the beginning and end # of a content stream. This isolates it from changes such as # transformation matricies. 
@@ -649,10 +691,13 @@ def _push_pop_gs(contents: Any, pdf: Any) -> ContentStream: # PdfReader @staticmethod def _add_transformation_matrix( - contents: Any, pdf: Any, ctm: CompressedTransformationMatrix - ) -> ContentStream: # PdfReader - # adds transformation matrix at the beginning of the given - # contents stream. + contents: Any, + pdf: Union[None, PdfReaderProtocol, PdfWriterProtocol], + ctm: CompressedTransformationMatrix, + ) -> ContentStream: + """ + Add transformation matrix at the beginning of the given contents stream. + """ a, b, c, d, e, f = ctm contents = ContentStream(contents, pdf) contents.operations.insert( @@ -686,9 +731,9 @@ def get_contents(self) -> Optional[ContentStream]: def getContents(self) -> Optional[ContentStream]: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`get_contents` instead. - Use :meth:`get_contents` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("getContents", "get_contents", "3.0.0") return self.get_contents() @@ -713,9 +758,9 @@ def merge_page(self, page2: "PageObject", expand: bool = False) -> None: def mergePage(self, page2: "PageObject") -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`merge_page` instead. - Use :meth:`merge_page` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("mergePage", "merge_page", "3.0.0") return self.merge_page(page2) @@ -725,11 +770,22 @@ def _merge_page( page2: "PageObject", page2transformation: Optional[Callable[[Any], ContentStream]] = None, ctm: Optional[CompressedTransformationMatrix] = None, + over: bool = True, expand: bool = False, ) -> None: # First we work on merging the resource dictionaries. This allows us # to find out what symbols in the content streams we might need to # rename. 
+ try: + assert isinstance(self.indirect_reference, IndirectObject) + if hasattr( + self.indirect_reference.pdf, "_add_object" + ): # ---------- to detect PdfWriter + return self._merge_page_writer( + page2, page2transformation, ctm, over, expand + ) + except (AssertionError, AttributeError): + pass new_resources = DictionaryObject() rename = {} @@ -759,7 +815,7 @@ def _merge_page( RES.SHADING, RES.PROPERTIES, ): - new, newrename = PageObject._merge_resources( + new, newrename = self._merge_resources( original_resources, page2resources, res ) if new: @@ -812,7 +868,10 @@ def _merge_page( page2content, rename, self.pdf ) page2content = PageObject._push_pop_gs(page2content, self.pdf) - new_content_array.append(page2content) + if over: + new_content_array.append(page2content) + else: + new_content_array.insert(0, page2content) # if expanding the page to fit a new page, calculate the new media box size if expand: @@ -822,6 +881,146 @@ def _merge_page( self[NameObject(PG.RESOURCES)] = new_resources self[NameObject(PG.ANNOTS)] = new_annots + def _merge_page_writer( + self, + page2: "PageObject", + page2transformation: Optional[Callable[[Any], ContentStream]] = None, + ctm: Optional[CompressedTransformationMatrix] = None, + over: bool = True, + expand: bool = False, + ) -> None: + # First we work on merging the resource dictionaries. This allows us + # to find out what symbols in the content streams we might need to + # rename. 
+ assert isinstance(self.indirect_reference, IndirectObject) + pdf = self.indirect_reference.pdf + + rename = {} + original_resources = cast(DictionaryObject, self[PG.RESOURCES].get_object()) + page2resources = cast(DictionaryObject, page2[PG.RESOURCES].get_object()) + + for res in ( + RES.EXT_G_STATE, + RES.FONT, + RES.XOBJECT, + RES.COLOR_SPACE, + RES.PATTERN, + RES.SHADING, + RES.PROPERTIES, + ): + if res in page2resources: + if res not in original_resources: + original_resources[NameObject(res)] = DictionaryObject() + _, newrename = self._merge_resources( + original_resources, page2resources, res, False + ) + rename.update(newrename) + # Combine /ProcSet sets. + if RES.PROC_SET in page2resources: + if RES.PROC_SET not in original_resources: + original_resources[NameObject(RES.PROC_SET)] = ArrayObject() + arr = cast(ArrayObject, original_resources[RES.PROC_SET]) + for x in cast(ArrayObject, page2resources[RES.PROC_SET]): + if x not in arr: + arr.append(x) + arr.sort() + + if PG.ANNOTS in page2: + if PG.ANNOTS not in self: + self[NameObject(PG.ANNOTS)] = ArrayObject() + annots = cast(ArrayObject, self[PG.ANNOTS].get_object()) + if ctm is None: + trsf = Transformation() + else: + trsf = Transformation(ctm) + for a in cast(ArrayObject, page2[PG.ANNOTS]): + a = a.get_object() + aa = a.clone(pdf, ignore_fields=("/P", "/StructParent")) + r = cast(ArrayObject, a["/Rect"]) + pt1 = trsf.apply_on((r[0], r[1]), True) + pt2 = trsf.apply_on((r[2], r[3]), True) + aa[NameObject("/Rect")] = ArrayObject( + ( + min(pt1[0], pt2[0]), + min(pt1[1], pt2[1]), + max(pt1[0], pt2[0]), + max(pt1[1], pt2[1]), + ) + ) + if "/QuadPoints" in a: + q = cast(ArrayObject, a["/QuadPoints"]) + aa[NameObject("/QuadPoints")] = ArrayObject( + cast(tuple, trsf.apply_on((q[0], q[1]), True)) + + cast(tuple, trsf.apply_on((q[2], q[3]), True)) + + cast(tuple, trsf.apply_on((q[4], q[5]), True)) + + cast(tuple, trsf.apply_on((q[6], q[7]), True)) + ) + try: + aa[NameObject("/P")] = self.indirect_reference + 
annots.append(aa.indirect_reference) + except AttributeError: + pass + + new_content_array = ArrayObject() + + original_content = self.get_contents() + if original_content is not None: + new_content_array.append( + PageObject._push_pop_gs(original_content, self.pdf) + ) + + page2content = page2.get_contents() + if page2content is not None: + page2content = ContentStream(page2content, self.pdf) + rect = page2.trimbox + page2content.operations.insert( + 0, + ( + map( + FloatObject, + [ + rect.left, + rect.bottom, + rect.width, + rect.height, + ], + ), + "re", + ), + ) + page2content.operations.insert(1, ([], "W")) + page2content.operations.insert(2, ([], "n")) + if page2transformation is not None: + page2content = page2transformation(page2content) + page2content = PageObject._content_stream_rename( + page2content, rename, self.pdf + ) + page2content = PageObject._push_pop_gs(page2content, self.pdf) + if over: + new_content_array.append(page2content) + else: + new_content_array.insert(0, page2content) + + # if expanding the page to fit a new page, calculate the new media box size + if expand: + self._expand_mediabox(page2, ctm) + + if PG.CONTENTS not in self: + self[NameObject(PG.CONTENTS)] = pdf._add_object(ContentStream(None, pdf)) + ind = self.raw_get(PG.CONTENTS) + try: + if not isinstance(ind, IndirectObject): + raise KeyError + pdf._replace_object(ind, ContentStream(new_content_array, pdf)) + except KeyError: + self[NameObject(PG.CONTENTS)] = pdf._add_object( + ContentStream(new_content_array, pdf) + ) + + # self[NameObject(PG.CONTENTS)] = ContentStream(new_content_array, pdf) + # self[NameObject(PG.RESOURCES)] = new_resources + # self[NameObject(PG.ANNOTS)] = new_annots + def _expand_mediabox( self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix] ) -> None: @@ -865,82 +1064,117 @@ def _expand_mediabox( self.mediabox.lower_left = lowerleft self.mediabox.upper_right = upperright - def mergeTransformedPage( + def merge_transformed_page( self, 
page2: "PageObject", ctm: Union[CompressedTransformationMatrix, Transformation], + over: bool = True, expand: bool = False, - ) -> None: # deprecated + ) -> None: """ - mergeTransformedPage is similar to merge_page, but a transformation + merge_transformed_page is similar to merge_page, but a transformation matrix is applied to the merged stream. - :param PageObject page2: The page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param tuple ctm: a 6-element tuple containing the operands of the - transformation matrix - :param bool expand: Whether the page should be expanded to fit the dimensions + Args: + page2: The page to be merged into this one. + ctm: a 6-element tuple containing the operands of the + transformation matrix + over: set the page2 content over page1 if True(default) else under + expand: Whether the page should be expanded to fit the dimensions of the page to be merged. - - .. deprecated:: 1.28.0 - - Use :meth:`add_transformation` and :meth:`merge_page` instead. """ - deprecation_with_replacement( - "page.mergeTransformedPage(page2, ctm)", - "page2.add_transformation(ctm); page.merge_page(page2)", - "3.0.0", - ) if isinstance(ctm, Transformation): ctm = ctm.ctm - ctm = cast(CompressedTransformationMatrix, ctm) self._merge_page( page2, lambda page2Content: PageObject._add_transformation_matrix( - page2Content, page2.pdf, ctm # type: ignore[arg-type] + page2Content, page2.pdf, cast(CompressedTransformationMatrix, ctm) ), ctm, + over, expand, ) - def mergeScaledPage( - self, page2: "PageObject", scale: float, expand: bool = False + def mergeTransformedPage( + self, + page2: "PageObject", + ctm: Union[CompressedTransformationMatrix, Transformation], + expand: bool = False, ) -> None: # deprecated """ - mergeScaledPage is similar to merge_page, but the stream to be merged + deprecated + + deprecated:: 1.28.0 + + Use :meth:`merge_transformed_page` instead. 
+ """ + deprecation_with_replacement( + "page.mergeTransformedPage(page2, ctm,expand)", + "page.merge_transformed_page(page2,ctm,expand)", + "3.0.0", + ) + self.merge_transformed_page(page2, ctm, expand) + + def merge_scaled_page( + self, page2: "PageObject", scale: float, over: bool = True, expand: bool = False + ) -> None: + """ + merge_scaled_page is similar to merge_page, but the stream to be merged is scaled by applying a transformation matrix. - :param PageObject page2: The page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param float scale: The scaling factor - :param bool expand: Whether the page should be expanded to fit the + Args: + page2: The page to be merged into this one. + scale: The scaling factor + over: set the page2 content over page1 if True(default) else under + expand: Whether the page should be expanded to fit the dimensions of the page to be merged. + """ + op = Transformation().scale(scale, scale) + self.merge_transformed_page(page2, op, over, expand) + + def mergeScaledPage( + self, page2: "PageObject", scale: float, expand: bool = False + ) -> None: # deprecated + """ + deprecated .. deprecated:: 1.28.0 - Use :meth:`add_transformation` and :meth:`merge_page` instead. + Use :meth:`merge_scaled_page` instead. 
""" deprecation_with_replacement( "page.mergeScaledPage(page2, scale, expand)", - "page2.add_transformation(Transformation().scale(scale)); " - "page.merge_page(page2, expand)", + "page2.merge_scaled_page(page2, scale, expand)", "3.0.0", ) - op = Transformation().scale(scale, scale) - self.mergeTransformedPage(page2, op, expand) + self.merge_scaled_page(page2, scale, expand) - def mergeRotatedPage( - self, page2: "PageObject", rotation: float, expand: bool = False - ) -> None: # deprecated + def merge_rotated_page( + self, + page2: "PageObject", + rotation: float, + over: bool = True, + expand: bool = False, + ) -> None: """ - mergeRotatedPage is similar to merge_page, but the stream to be merged + merge_rotated_page is similar to merge_page, but the stream to be merged is rotated by applying a transformation matrix. - :param PageObject page2: the page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param float rotation: The angle of the rotation, in degrees - :param bool expand: Whether the page should be expanded to fit the + Args: + page2: The page to be merged into this one. + rotation: The angle of the rotation, in degrees + over: set the page2 content over page1 if True(default) else under + expand: Whether the page should be expanded to fit the dimensions of the page to be merged. + """ + op = Transformation().rotate(rotation) + self.merge_transformed_page(page2, op, over, expand) + + def mergeRotatedPage( + self, page2: "PageObject", rotation: float, expand: bool = False + ) -> None: # deprecated + """ + deprecated .. 
deprecated:: 1.28.0 @@ -948,30 +1182,43 @@ def mergeRotatedPage( """ deprecation_with_replacement( "page.mergeRotatedPage(page2, rotation, expand)", - "page2.add_transformation(Transformation().rotate(rotation)); " - "page.merge_page(page2, expand)", + "page2.mergeotatedPage(page2, rotation, expand)", "3.0.0", ) - op = Transformation().rotate(rotation) - self.mergeTransformedPage(page2, op, expand) + self.merge_rotated_page(page2, rotation, expand) - def mergeTranslatedPage( - self, page2: "PageObject", tx: float, ty: float, expand: bool = False - ) -> None: # deprecated + def merge_translated_page( + self, + page2: "PageObject", + tx: float, + ty: float, + over: bool = True, + expand: bool = False, + ) -> None: """ mergeTranslatedPage is similar to merge_page, but the stream to be merged is translated by applying a transformation matrix. - :param PageObject page2: the page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param float tx: The translation on X axis - :param float ty: The translation on Y axis - :param bool expand: Whether the page should be expanded to fit the + Args: + page2: the page to be merged into this one. + tx: The translation on X axis + ty: The translation on Y axis + over: set the page2 content over page1 if True(default) else under + expand: Whether the page should be expanded to fit the dimensions of the page to be merged. + """ + op = Transformation().translate(tx, ty) + self.merge_transformed_page(page2, op, over, expand) + + def mergeTranslatedPage( + self, page2: "PageObject", tx: float, ty: float, expand: bool = False + ) -> None: # deprecated + """ + deprecated .. deprecated:: 1.28.0 - Use :meth:`add_transformation` and :meth:`merge_page` instead. + Use :meth:`merge_translated_page` instead. 
""" deprecation_with_replacement( "page.mergeTranslatedPage(page2, tx, ty, expand)", @@ -979,8 +1226,7 @@ def mergeTranslatedPage( "page.merge_page(page2, expand)", "3.0.0", ) - op = Transformation().translate(tx, ty) - self.mergeTransformedPage(page2, op, expand) + self.merge_translated_page(page2, tx, ty, expand) def mergeRotatedTranslatedPage( self, @@ -991,52 +1237,32 @@ def mergeRotatedTranslatedPage( expand: bool = False, ) -> None: # deprecated """ - mergeRotatedTranslatedPage is similar to merge_page, but the stream to - be merged is rotated and translated by applying a transformation matrix. - - :param PageObject page2: the page to be merged into this one. Should be - an instance of :class:`PageObject`. - :param float tx: The translation on X axis - :param float ty: The translation on Y axis - :param float rotation: The angle of the rotation, in degrees - :param bool expand: Whether the page should be expanded to fit the - dimensions of the page to be merged. - .. deprecated:: 1.28.0 - Use :meth:`add_transformation` and :meth:`merge_page` instead. + Use :meth:`merge_transformed_page` instead. """ deprecation_with_replacement( "page.mergeRotatedTranslatedPage(page2, rotation, tx, ty, expand)", - "page2.add_transformation(Transformation().rotate(rotation).translate(tx, ty)); " - "page.merge_page(page2, expand)", + "page.merge_transformed_page(page2, Transformation().rotate(rotation).translate(tx, ty), expand);", "3.0.0", ) op = Transformation().translate(-tx, -ty).rotate(rotation).translate(tx, ty) - return self.mergeTransformedPage(page2, op, expand) + return self.merge_transformed_page(page2, op, expand) def mergeRotatedScaledPage( self, page2: "PageObject", rotation: float, scale: float, expand: bool = False ) -> None: # deprecated """ - mergeRotatedScaledPage is similar to merge_page, but the stream to be - merged is rotated and scaled by applying a transformation matrix. - - :param PageObject page2: the page to be merged into this one. 
Should be - an instance of :class:`PageObject`. - :param float rotation: The angle of the rotation, in degrees - :param float scale: The scaling factor - :param bool expand: Whether the page should be expanded to fit the - dimensions of the page to be merged. + obsolete .. deprecated:: 1.28.0 - Use :meth:`add_transformation` and :meth:`merge_page` instead. + Use :meth:`merge_transformed_page` instead. """ deprecation_with_replacement( "page.mergeRotatedScaledPage(page2, rotation, scale, expand)", - "page2.add_transformation(Transformation().rotate(rotation).scale(scale)); " - "page.merge_page(page2, expand)", + "page.merge_transformed_page(page2, Transformation()" + ".rotate(rotation).scale(scale)); page.merge_page(page2, expand)", "3.0.0", ) op = Transformation().rotate(rotation).scale(scale, scale) @@ -1051,8 +1277,8 @@ def mergeScaledTranslatedPage( expand: bool = False, ) -> None: # deprecated """ - mergeScaledTranslatedPage is similar to merge_page, but the stream to be - merged is translated and scaled by applying a transformation matrix. + mergeScaledTranslatedPage is similar to merge_page, but the stream to + be merged is translated and scaled by applying a transformation matrix. :param PageObject page2: the page to be merged into this one. Should be an instance of :class:`PageObject`. @@ -1172,17 +1398,17 @@ def addTransformation( self, ctm: CompressedTransformationMatrix ) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`add_transformation` instead. - Use :meth:`add_transformation` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("addTransformation", "add_transformation", "3.0.0") self.add_transformation(ctm) def scale(self, sx: float, sy: float) -> None: """ - Scale a page by the given factors by applying a transformation - matrix to its content and updating the page size. + Scale a page by the given factors by applying a transformation matrix + to its content and updating the page size. 
This updates the mediabox, the cropbox, and the contents of the page. @@ -1234,8 +1460,8 @@ def scale(self, sx: float, sy: float) -> None: def scale_by(self, factor: float) -> None: """ - Scale a page by the given factor by applying a transformation - matrix to its content and updating the page size. + Scale a page by the given factor by applying a transformation matrix to + its content and updating the page size. Args: factor: The scaling factor (for both X and Y axis). @@ -1244,17 +1470,17 @@ def scale_by(self, factor: float) -> None: def scaleBy(self, factor: float) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`scale_by` instead. - Use :meth:`scale_by` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("scaleBy", "scale_by", "3.0.0") self.scale(factor, factor) def scale_to(self, width: float, height: float) -> None: """ - Scale a page to the specified dimensions by applying a - transformation matrix to its content and updating the page size. + Scale a page to the specified dimensions by applying a transformation + matrix to its content and updating the page size. Args: width: The new width. @@ -1266,9 +1492,9 @@ def scale_to(self, width: float, height: float) -> None: def scaleTo(self, width: float, height: float) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`scale_to` instead. - Use :meth:`scale_to` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("scaleTo", "scale_to", "3.0.0") self.scale_to(width, height) @@ -1289,9 +1515,9 @@ def compress_content_streams(self) -> None: def compressContentStreams(self) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`compress_content_streams` instead. - Use :meth:`compress_content_streams` instead. + .. 
deprecated:: 1.28.0 """ deprecation_with_replacement( "compressContentStreams", "compress_content_streams", "3.0.0" @@ -1437,7 +1663,6 @@ def orient(m: List[float]) -> int: return 270 def current_spacewidth() -> float: - # return space_scale * _space_width * char_scale return _space_width / 1000.0 def process_operation(operator: bytes, operands: List) -> None: @@ -1450,13 +1675,9 @@ def process_operation(operator: bytes, operands: List) -> None: # Table 5.4 page 405 if operator == b"BT": tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - # tm_prev = tm_matrix output += text if visitor_text is not None: visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) - # based - # if output != "" and output[-1]!="\n": - # output += "\n" text = "" return None elif operator == b"ET": @@ -1491,7 +1712,6 @@ def process_operation(operator: bytes, operands: List) -> None: ) = cm_stack.pop() except Exception: cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - # rtl_dir = False elif operator == b"cm": output += text if visitor_text is not None: @@ -1508,7 +1728,6 @@ def process_operation(operator: bytes, operands: List) -> None: ], cm_matrix, ) - # rtl_dir = False # Table 5.2 page 398 elif operator == b"Tz": char_scale = float(operands[0]) / 100.0 @@ -1522,7 +1741,6 @@ def process_operation(operator: bytes, operands: List) -> None: if visitor_text is not None: visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) text = "" - # rtl_dir = False try: # charMapTuple: font_type, float(sp_width / 2), encoding, # map_dict, font-dictionary @@ -1614,8 +1832,7 @@ def process_operation(operator: bytes, operands: List) -> None: xx = ord(x) # fmt: off if ( - # cases where the current inserting order is - # kept (punctuation,...) + # cases where the current inserting order is kept (xx <= 0x2F) # punctuations but... or (0x3A <= xx and xx <= 0x40) # numbers (x30-39) or (0x2000 <= xx and xx <= 0x206F) # upper punctuations.. 
@@ -1629,10 +1846,8 @@ def process_operation(operator: bytes, operands: List) -> None: or (0xFE70 <= xx and xx <= 0xFEFF) or (CUSTOM_RTL_MIN <= xx and xx <= CUSTOM_RTL_MAX) ): - # print("<",xx,x) if not rtl_dir: rtl_dir = True - # print("RTL",text,"*") output += text if visitor_text is not None: visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) @@ -1642,7 +1857,6 @@ def process_operation(operator: bytes, operands: List) -> None: # print(">",xx,x,end="") if rtl_dir: rtl_dir = False - # print("LTR",text,"*") output += text if visitor_text is not None: visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) @@ -1781,7 +1995,6 @@ def process_operation(operator: bytes, operands: List) -> None: try: xobj = resources_dict["/XObject"] if xobj[operands[0]]["/Subtype"] != "/Image": # type: ignore - # output += text text = self.extract_xform_text( xobj[operands[0]], # type: ignore orientations, @@ -1922,7 +2135,12 @@ def extract_xform_text( Extract text from an XObject. Args: + xform: + orientations: space_width: force default space width (if not extracted from font (default 200) + visitor_operand_before: + visitor_operand_after: + visitor_text: Returns: The extracted text @@ -1940,9 +2158,9 @@ def extract_xform_text( def extractText(self, Tj_sep: str = "", TJ_sep: str = "") -> str: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`extract_text` instead. - Use :meth:`extract_text` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("extractText", "extract_text", "3.0.0") return self.extract_text() @@ -1961,18 +2179,16 @@ def _get_fonts(self) -> Tuple[Set[str], Set[str]]: return embedded, unembedded mediabox = _create_rectangle_accessor(PG.MEDIABOX, ()) - """ - A :class:`RectangleObject`, expressed in + """A :class:`RectangleObject`, expressed in default user space units, defining the boundaries of the physical medium on - which the page is intended to be displayed or printed. 
- """ + which the page is intended to be displayed or printed.""" @property def mediaBox(self) -> RectangleObject: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`mediabox` instead. - Use :py:attr:`mediabox` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("mediaBox", "mediabox", "3.0.0") return self.mediabox @@ -1980,9 +2196,9 @@ def mediaBox(self) -> RectangleObject: # deprecated @mediaBox.setter def mediaBox(self, value: RectangleObject) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`mediabox` instead. - Use :py:attr:`mediabox` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("mediaBox", "mediabox", "3.0.0") self.mediabox = value @@ -1990,18 +2206,21 @@ def mediaBox(self, value: RectangleObject) -> None: # deprecated cropbox = _create_rectangle_accessor("/CropBox", (PG.MEDIABOX,)) """ A :class:`RectangleObject`, expressed in - default user space units, defining the visible region of default user space. + default user space units, defining the visible region of default user + space. + When the page is displayed or printed, its contents are to be clipped (cropped) to this rectangle and then imposed on the output medium in some - implementation-defined manner. Default value: same as :attr:`mediabox`. + implementation-defined manner. Default value: same as + :attr:`mediabox`. """ @property def cropBox(self) -> RectangleObject: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`cropbox` instead. - Use :py:attr:`cropbox` instead. + .. 
deprecated:: 1.28.0 """ deprecation_with_replacement("cropBox", "cropbox", "3.0.0") return self.cropbox @@ -2012,18 +2231,16 @@ def cropBox(self, value: RectangleObject) -> None: # deprecated self.cropbox = value bleedbox = _create_rectangle_accessor("/BleedBox", ("/CropBox", PG.MEDIABOX)) - """ - A :class:`RectangleObject`, expressed in + """A :class:`RectangleObject`, expressed in default user space units, defining the region to which the contents of the - page should be clipped when output in a production environment. - """ + page should be clipped when output in a production environment.""" @property def bleedBox(self) -> RectangleObject: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`bleedbox` instead. - Use :py:attr:`bleedbox` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("bleedBox", "bleedbox", "3.0.0") return self.bleedbox @@ -2034,18 +2251,16 @@ def bleedBox(self, value: RectangleObject) -> None: # deprecated self.bleedbox = value trimbox = _create_rectangle_accessor("/TrimBox", ("/CropBox", PG.MEDIABOX)) - """ - A :class:`RectangleObject`, expressed in + """A :class:`RectangleObject`, expressed in default user space units, defining the intended dimensions of the finished - page after trimming. - """ + page after trimming.""" @property def trimBox(self) -> RectangleObject: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`trimbox` instead. - Use :py:attr:`trimbox` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("trimBox", "trimbox", "3.0.0") return self.trimbox @@ -2056,18 +2271,16 @@ def trimBox(self, value: RectangleObject) -> None: # deprecated self.trimbox = value artbox = _create_rectangle_accessor("/ArtBox", ("/CropBox", PG.MEDIABOX)) - """ - A :class:`RectangleObject`, expressed in + """A :class:`RectangleObject`, expressed in default user space units, defining the extent of the page's meaningful - content as intended by the page's creator. 
- """ + content as intended by the page's creator.""" @property def artBox(self) -> RectangleObject: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`artbox` instead. - Use :py:attr:`artbox` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("artBox", "artbox", "3.0.0") return self.artbox @@ -2138,6 +2351,16 @@ def _get_fonts_walk( emb: Optional[Set[str]] = None, ) -> Tuple[Set[str], Set[str]]: """ + Get the set of all fonts and all embedded fonts. + + Args: + obj: Page resources dictionary + fnt: font + emb: embedded fonts + + Returns: + A tuple (fnt, emb) + If there is a key called 'BaseFont', that is a font that is used in the document. If there is a key called 'FontName' and another key in the same dictionary object that is called 'FontFilex' (where x is null, 2, or 3), then that fontname is diff --git a/pypdf/_page_labels.py b/pypdf/_page_labels.py index e2baa8aeda..6b114f4b26 100644 --- a/pypdf/_page_labels.py +++ b/pypdf/_page_labels.py @@ -57,20 +57,11 @@ aa to zz for the next 26, and so on) """ -from typing import ( - Iterator, - Optional, - Tuple, -) +from typing import Iterator, Optional, Tuple from ._protocols import PdfReaderProtocol from ._utils import logger_warning - -from .generic import ( - ArrayObject, - DictionaryObject, - NumberObject, -) +from .generic import ArrayObject, DictionaryObject, NumberObject def number2uppercase_roman_numeral(num: int) -> str: diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py index 1351310fa6..85e9e0a568 100644 --- a/pypdf/_protocols.py +++ b/pypdf/_protocols.py @@ -7,9 +7,9 @@ # Python 3.8+: https://peps.python.org/pep-0586 from typing import Protocol # type: ignore[attr-defined] except ImportError: - from typing_extensions import Protocol # type: ignore[misc] + from typing_extensions import Protocol # type: ignore[assignment,misc] -from ._utils import StrByteType +from ._utils import StrByteType, StreamType class PdfObjectProtocol(Protocol): @@ -29,6 +29,14 @@ def 
_reference_clone(self, clone: Any, pdf_dest: Any) -> Any: def get_object(self) -> Optional["PdfObjectProtocol"]: ... + def hash_value(self) -> bytes: + ... + + def write_to_stream( + self, stream: StreamType, encryption_key: Union[None, str, bytes] + ) -> None: + ... + class PdfReaderProtocol(Protocol): # deprecated @property @@ -64,3 +72,11 @@ def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]: def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO]: ... + + @property + def pages(self) -> List[Any]: + ... + + @property + def pdf_header(self) -> bytes: + ... diff --git a/pypdf/_reader.py b/pypdf/_reader.py index ddca4b1a40..3aa02ac4a0 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -141,11 +141,9 @@ def _get_text(self, key: str) -> Optional[str]: def getText(self, key: str) -> Optional[str]: # deprecated """ - The text value of the specified key or None. + Use the attributes (e.g. :py:attr:`title` / :py:attr:`author`). .. deprecated:: 1.28.0 - - Use the attributes (e.g. :py:attr:`title` / :py:attr:`author`). """ deprecation_no_replacement("getText", "3.0.0") return self._get_text(key) @@ -153,10 +151,10 @@ def getText(self, key: str) -> Optional[str]: # deprecated @property def title(self) -> Optional[str]: """ - Read-only property accessing the document's **title**. + Read-only property accessing the document's title. - Returns a unicode string (``TextStringObject``) or ``None`` - if the title is not specified. + Returns a ``TextStringObject`` or ``None`` if the title is not + specified. """ return ( self._get_text(DI.TITLE) or self.get(DI.TITLE).get_object() # type: ignore @@ -172,10 +170,10 @@ def title_raw(self) -> Optional[str]: @property def author(self) -> Optional[str]: """ - Read-only property accessing the document's **author**. + Read-only property accessing the document's author. - Returns a unicode string (``TextStringObject``) or ``None`` - if the author is not specified. 
+ Returns a ``TextStringObject`` or ``None`` if the author is not + specified. """ return self._get_text(DI.AUTHOR) @@ -187,10 +185,10 @@ def author_raw(self) -> Optional[str]: @property def subject(self) -> Optional[str]: """ - Read-only property accessing the document's **subject**. + Read-only property accessing the document's subject. - Returns a unicode string (``TextStringObject``) or ``None`` - if the subject is not specified. + Returns a ``TextStringObject`` or ``None`` if the subject is not + specified. """ return self._get_text(DI.SUBJECT) @@ -202,12 +200,12 @@ def subject_raw(self) -> Optional[str]: @property def creator(self) -> Optional[str]: """ - Read-only property accessing the document's **creator**. + Read-only property accessing the document's creator. If the document was converted to PDF from another format, this is the name of the application (e.g. OpenOffice) that created the original - document from which it was converted. Returns a unicode string - (``TextStringObject``) or ``None`` if the creator is not specified. + document from which it was converted. Returns a ``TextStringObject`` or + ``None`` if the creator is not specified. """ return self._get_text(DI.CREATOR) @@ -219,12 +217,12 @@ def creator_raw(self) -> Optional[str]: @property def producer(self) -> Optional[str]: """ - Read-only property accessing the document's **producer**. + Read-only property accessing the document's producer. - If the document was converted to PDF from another format, this is - the name of the application (for example, OSX Quartz) that converted - it to PDF. Returns a unicode string (``TextStringObject``) - or ``None`` if the producer is not specified. + If the document was converted to PDF from another format, this is the + name of the application (for example, OSX Quartz) that converted it to + PDF. Returns a ``TextStringObject`` or ``None`` if the producer is not + specified. 
""" return self._get_text(DI.PRODUCER) @@ -235,9 +233,7 @@ def producer_raw(self) -> Optional[str]: @property def creation_date(self) -> Optional[datetime]: - """ - Read-only property accessing the document's **creation date**. - """ + """Read-only property accessing the document's creation date.""" text = self._get_text(DI.CREATION_DATE) if text is None: return None @@ -248,15 +244,15 @@ def creation_date_raw(self) -> Optional[str]: """ The "raw" version of creation date; can return a ``ByteStringObject``. - Typically in the format D:YYYYMMDDhhmmss[+-]hh'mm where the suffix is the - offset from UTC. + Typically in the format ``D:YYYYMMDDhhmmss[+-]hh'mm`` where the suffix + is the offset from UTC. """ return self.get(DI.CREATION_DATE) @property def modification_date(self) -> Optional[datetime]: """ - Read-only property accessing the document's **modification date**. + Read-only property accessing the document's modification date. The date and time the document was most recently modified. """ @@ -268,10 +264,11 @@ def modification_date(self) -> Optional[datetime]: @property def modification_date_raw(self) -> Optional[str]: """ - The "raw" version of modification date; can return a ``ByteStringObject``. + The "raw" version of modification date; can return a + ``ByteStringObject``. - Typically in the format D:YYYYMMDDhhmmss[+-]hh'mm where the suffix is the - offset from UTC. + Typically in the format ``D:YYYYMMDDhhmmss[+-]hh'mm`` where the suffix + is the offset from UTC. """ return self.get(DI.MOD_DATE) @@ -284,15 +281,15 @@ class PdfReader: tables are read into memory. Args: - stream: A File object or an object that supports the standard read - and seek methods similar to a File object. Could also be a - string representing a path to a PDF file. - strict: Determines whether user should be warned of all - problems and also causes some correctable problems to be fatal. - Defaults to ``False``. - password: Decrypt PDF file at initialization. 
If the - password is None, the file will not be decrypted. - Defaults to ``None`` + stream: A File object or an object that supports the standard read + and seek methods similar to a File object. Could also be a + string representing a path to a PDF file. + strict: Determines whether user should be warned of all + problems and also causes some correctable problems to be fatal. + Defaults to ``False``. + password: Decrypt PDF file at initialization. If the + password is None, the file will not be decrypted. + Defaults to ``None`` """ def __init__( @@ -367,6 +364,7 @@ def pdf_header(self) -> str: def metadata(self) -> Optional[DocumentInformation]: """ Retrieve the PDF file's document information dictionary, if it exists. + Note that some PDF files use metadata streams instead of docinfo dictionaries, and these metadata streams will not be accessed by this function. @@ -384,9 +382,9 @@ def metadata(self) -> Optional[DocumentInformation]: def getDocumentInfo(self) -> Optional[DocumentInformation]: # deprecated """ - .. deprecated:: 1.28.0 + Use the attribute :py:attr:`metadata` instead. - Use the attribute :py:attr:`metadata` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("getDocumentInfo", "metadata", "3.0.0") return self.metadata @@ -394,9 +392,9 @@ def getDocumentInfo(self) -> Optional[DocumentInformation]: # deprecated @property def documentInfo(self) -> Optional[DocumentInformation]: # deprecated """ - .. deprecated:: 1.28.0 + Use the attribute :py:attr:`metadata` instead. - Use the attribute :py:attr:`metadata` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("documentInfo", "metadata", "3.0.0") return self.metadata @@ -412,9 +410,9 @@ def xmp_metadata(self) -> Optional[XmpInformation]: def getXmpMetadata(self) -> Optional[XmpInformation]: # deprecated """ - .. deprecated:: 1.28.0 + Use the attribute :py:attr:`metadata` instead. - Use the attribute :py:attr:`xmp_metadata` instead. + .. 
deprecated:: 1.28.0 """ deprecation_with_replacement("getXmpMetadata", "xmp_metadata", "3.0.0") return self.xmp_metadata @@ -422,9 +420,9 @@ def getXmpMetadata(self) -> Optional[XmpInformation]: # deprecated @property def xmpMetadata(self) -> Optional[XmpInformation]: # deprecated """ - .. deprecated:: 1.28.0 + Use the attribute :py:attr:`xmp_metadata` instead. - Use the attribute :py:attr:`xmp_metadata` instead. + .. deprecated:: 1.28.0. """ deprecation_with_replacement("xmpMetadata", "xmp_metadata", "3.0.0") return self.xmp_metadata @@ -436,11 +434,11 @@ def _get_num_pages(self) -> int: Args: Returns: - The number of pages of the parsed PDF file + The number of pages of the parsed PDF file Raises: - PdfReadError: if file is encrypted and restrictions prevent - this action. + PdfReadError: if file is encrypted and restrictions prevent + this action. """ # Flattened pages will not work on an Encrypted PDF; # the PDF file's page count is used in this case. Otherwise, @@ -454,9 +452,9 @@ def _get_num_pages(self) -> int: def getNumPages(self) -> int: # deprecated """ - .. deprecated:: 1.28.0 + Use :code:`len(reader.pages)` instead. - Use :code:`len(reader.pages)` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("reader.getNumPages", "len(reader.pages)", "3.0.0") return self._get_num_pages() @@ -464,18 +462,18 @@ def getNumPages(self) -> int: # deprecated @property def numPages(self) -> int: # deprecated """ - .. deprecated:: 1.28.0 + Use :code:`len(reader.pages)` instead. - Use :code:`len(reader.pages)` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("reader.numPages", "len(reader.pages)", "3.0.0") return self._get_num_pages() def getPage(self, pageNumber: int) -> PageObject: # deprecated """ - .. deprecated:: 1.28.0 + Use :code:`reader.pages[page_number]` instead. - Use :code:`reader.pages[page_number]` instead. + .. 
deprecated:: 1.28.0 """ deprecation_with_replacement( "reader.getPage(pageNumber)", "reader.pages[page_number]", "3.0.0" @@ -487,14 +485,12 @@ def _get_page(self, page_number: int) -> PageObject: Retrieve a page by number from this PDF file. Args: - page_number: The page number to retrieve - (pages begin at zero) + page_number: The page number to retrieve + (pages begin at zero) Returns: - A :class:`PageObject` instance. + A :class:`PageObject` instance. """ - # ensure that we're not trying to access an encrypted PDF - # assert not self.trailer.has_key(TK.ENCRYPT) if self.flattened_pages is None: self._flatten() assert self.flattened_pages is not None, "hint for mypy" @@ -503,9 +499,9 @@ def _get_page(self, page_number: int) -> PageObject: @property def namedDestinations(self) -> Dict[str, Any]: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`named_destinations` instead. - Use :py:attr:`named_destinations` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("namedDestinations", "named_destinations", "3.0.0") return self.named_destinations @@ -533,17 +529,16 @@ def get_fields( The *tree* and *retval* parameters are for recursive use. Args: - tree: - retval: - fileobj: A file object (usually a text file) to write - a report to on all interactive form fields found. + tree: + retval: + fileobj: A file object (usually a text file) to write + a report to on all interactive form fields found. Returns: - A dictionary where each key is a field name, and each - value is a :class:`Field` object. By - default, the mapping name is used for keys. - ``None`` if form data could not be located. - + A dictionary where each key is a field name, and each + value is a :class:`Field` object. By + default, the mapping name is used for keys. + ``None`` if form data could not be located. 
""" field_attributes = FieldDictionaryAttributes.attributes_dict() field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict()) @@ -579,9 +574,9 @@ def getFields( fileobj: Optional[Any] = None, ) -> Optional[Dict[str, Any]]: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`get_fields` instead. - Use :meth:`get_fields` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("getFields", "get_fields", "3.0.0") return self.get_fields(tree, retval, fileobj) @@ -679,13 +674,15 @@ def get_form_text_fields(self, full_qualified_name: bool = False) -> Dict[str, A """ Retrieve form fields from the document with textual data. - The key is the name of the form field, the value is the content of the - field. + Args: + full_qualified_name: to get full name - If the document contains multiple form fields with the same name, the - second and following will get the suffix .2, .3, ... + Returns: + A dictionary. The key is the name of the form field, + the value is the content of the field. - full_qualified_name should be used to get full name + If the document contains multiple form fields with the same name, the + second and following will get the suffix .2, .3, ... """ def indexed_key(k: str, fields: dict) -> str: @@ -710,19 +707,12 @@ def indexed_key(k: str, fields: dict) -> str: else: ff[indexed_key(cast(str, value["/T"]), ff)] = value.get("/V") return ff - """return { - (field if full_qualified_name else formfields[field]["/T"]): formfields[ - field - ].get("/V") - for field in formfields - if formfields[field].get("/FT") == "/Tx" - }""" def getFormTextFields(self) -> Dict[str, Any]: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`get_form_text_fields` instead. - Use :meth:`get_form_text_fields` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement( "getFormTextFields", "get_form_text_fields", "3.0.0" @@ -738,11 +728,11 @@ def _get_named_destinations( Retrieve the named destinations present in the document. 
Args: - tree: - retval: + tree: + retval: Returns: - A dictionary which maps names to + A dictionary which maps names to :class:`Destinations`. """ if retval is None: @@ -789,9 +779,9 @@ def getNamedDestinations( retval: Optional[Any] = None, ) -> Dict[str, Any]: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`named_destinations` instead. - Use :py:attr:`named_destinations` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement( "getNamedDestinations", "named_destinations", "3.0.0" @@ -801,17 +791,19 @@ def getNamedDestinations( @property def outline(self) -> OutlineType: """ - Read-only property for the outline (i.e., a collection of 'outline items' - which are also known as 'bookmarks') present in the document. + Read-only property for the outline present in the document. + + (i.e., a collection of 'outline items' which are also known as + 'bookmarks') """ return self._get_outline() @property def outlines(self) -> OutlineType: # deprecated """ - .. deprecated:: 2.9.0 + Use :py:attr:`outline` instead. - Use :py:attr:`outline` instead. + .. deprecated:: 2.9.0 """ deprecation_with_replacement("outlines", "outline", "3.0.0") return self.outline @@ -861,9 +853,9 @@ def getOutlines( self, node: Optional[DictionaryObject] = None, outline: Optional[Any] = None ) -> OutlineType: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`outline` instead. - Use :py:attr:`outline` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("getOutlines", "outline", "3.0.0") return self._get_outline(node, outline) @@ -871,9 +863,12 @@ def getOutlines( @property def threads(self) -> Optional[ArrayObject]: """ - Read-only property for the list of threads see §8.3.2 from PDF 1.7 spec. - It's an array of dictionaries with "/F" and "/I" properties - or None if there are no articles. + Read-only property for the list of threads. + + See §8.3.2 from PDF 1.7 spec. 
+ + It's an array of dictionaries with "/F" and "/I" properties or + None if there are no articles. """ catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) if CO.THREADS in catalog: @@ -884,13 +879,14 @@ def threads(self) -> Optional[ArrayObject]: def _get_page_number_by_indirect( self, indirect_reference: Union[None, int, NullObject, IndirectObject] ) -> int: - """Generate _page_id2num + """ + Generate _page_id2num. Args: - indirect_reference: + indirect_reference: Returns: - The page number. + The page number. """ if self._page_id2num is None: self._page_id2num = { @@ -909,22 +905,22 @@ def _get_page_number_by_indirect( def get_page_number(self, page: PageObject) -> int: """ - Retrieve page number of a given PageObject + Retrieve page number of a given PageObject. Args: - page: The page to get page number. Should be - an instance of :class:`PageObject` + page: The page to get page number. Should be + an instance of :class:`PageObject` Returns: - The page number or -1 if page is not found + The page number or -1 if page is not found """ return self._get_page_number_by_indirect(page.indirect_reference) def getPageNumber(self, page: PageObject) -> int: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`get_page_number` instead. - Use :meth:`get_page_number` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("getPageNumber", "get_page_number", "3.0.0") return self.get_page_number(page) @@ -934,18 +930,18 @@ def get_destination_page_number(self, destination: Destination) -> int: Retrieve page number of a given Destination object. Args: - destination: The destination to get page number. + destination: The destination to get page number. Returns: - The page number or -1 if page is not found + The page number or -1 if page is not found """ return self._get_page_number_by_indirect(destination.page) def getDestinationPageNumber(self, destination: Destination) -> int: # deprecated """ - .. 
deprecated:: 1.28.0 + Use :meth:`get_destination_page_number` instead. - Use :meth:`get_destination_page_number` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement( "getDestinationPageNumber", "get_destination_page_number", "3.0.0" @@ -1043,11 +1039,11 @@ def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]: outline_item[NameObject("/C")] = ArrayObject(FloatObject(c) for c in node["/C"]) # type: ignore if "/F" in node: # specifies style characteristics bold and/or italic - # 1=italic, 2=bold, 3=both + # with 1=italic, 2=bold, 3=both outline_item[NameObject("/F")] = node["/F"] if "/Count" in node: # absolute value = num. visible children - # positive = open/unfolded, negative = closed/folded + # with positive = open/unfolded, negative = closed/folded outline_item[NameObject("/Count")] = node["/Count"] outline_item.node = node return outline_item @@ -1062,8 +1058,8 @@ def page_labels(self) -> List[str]: """ A list of labels for the pages in this document. - This property is read-only. The labels are in the order - that the pages appear in the document. + This property is read-only. The labels are in the order that the pages + appear in the document. """ return [page_index2page_label(self, i) for i in range(len(self.pages))] @@ -1097,9 +1093,9 @@ def page_layout(self) -> Optional[str]: def getPageLayout(self) -> Optional[str]: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`page_layout` instead. - Use :py:attr:`page_layout` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("getPageLayout", "page_layout", "3.0.0") return self.page_layout @@ -1107,9 +1103,9 @@ def getPageLayout(self) -> Optional[str]: # deprecated @property def pageLayout(self) -> Optional[str]: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`page_layout` instead. - Use :py:attr:`page_layout` instead. + .. 
deprecated:: 1.28.0 """ deprecation_with_replacement("pageLayout", "page_layout", "3.0.0") return self.page_layout @@ -1142,9 +1138,9 @@ def page_mode(self) -> Optional[PagemodeType]: def getPageMode(self) -> Optional[PagemodeType]: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`page_mode` instead. - Use :py:attr:`page_mode` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("getPageMode", "page_mode", "3.0.0") return self.page_mode @@ -1152,9 +1148,9 @@ def getPageMode(self) -> Optional[PagemodeType]: # deprecated @property def pageMode(self) -> Optional[PagemodeType]: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`page_mode` instead. - Use :py:attr:`page_mode` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("pageMode", "page_mode", "3.0.0") return self.page_mode @@ -1262,14 +1258,15 @@ def _get_object_from_stream( def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]: """ Used to ease development. + This is equivalent to generic.IndirectObject(num,gen,self).get_object() Args: - num: - gen: + num: + gen: Returns: - A PdfObject + A PdfObject """ return IndirectObject(num, gen, self).get_object() @@ -1407,9 +1404,9 @@ def getObject( self, indirectReference: IndirectObject ) -> Optional[PdfObject]: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`get_object` instead. - Use :meth:`get_object` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("getObject", "get_object", "3.0.0") return self.get_object(indirectReference) @@ -1444,9 +1441,9 @@ def read_object_header(self, stream: StreamType) -> Tuple[int, int]: def readObjectHeader(self, stream: StreamType) -> Tuple[int, int]: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`read_object_header` instead. - Use :meth:`read_object_header` instead. + .. 
deprecated:: 1.28.0 """ deprecation_with_replacement("readObjectHeader", "read_object_header", "3.0.0") return self.read_object_header(stream) @@ -1460,9 +1457,9 @@ def cacheGetIndirectObject( self, generation: int, idnum: int ) -> Optional[PdfObject]: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`cache_get_indirect_object` instead. - Use :meth:`cache_get_indirect_object` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement( "cacheGetIndirectObject", "cache_get_indirect_object", "3.0.0" @@ -1486,9 +1483,9 @@ def cacheIndirectObject( self, generation: int, idnum: int, obj: Optional[PdfObject] ) -> Optional[PdfObject]: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`cache_indirect_object` instead. - Use :meth:`cache_indirect_object` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("cacheIndirectObject", "cache_indirect_object") return self.cache_indirect_object(generation, idnum, obj) @@ -1559,10 +1556,10 @@ def _find_startxref_pos(self, stream: StreamType) -> int: Find startxref entry - the location of the xref table. Args: - stream: + stream: Returns: - The bytes offset + The bytes offset """ line = read_previous_line(stream) try: @@ -1849,11 +1846,11 @@ def _get_xref_issues(stream: StreamType, startxref: int) -> int: Return an int which indicates an issue. 0 means there is no issue. Args: - stream: - startxref: + stream: + startxref: Returns: - 0 means no issue, other values represent specific issues. + 0 means no issue, other values represent specific issues. 
""" stream.seek(startxref - 1, 0) # -1 to check character before line = stream.read(1) @@ -1870,11 +1867,6 @@ def _get_xref_issues(stream: StreamType, startxref: int) -> int: line += stream.read(2) # 1 char already read, +2 to check "obj" if line.lower() != b"obj": return 3 - # while stream.read(1) in b" \t\r\n": - # pass - # line = stream.read(256) # check that it is xref obj - # if b"/xref" not in line.lower(): - # return 4 return 0 def _rebuild_xref_table(self, stream: StreamType) -> None: @@ -1902,11 +1894,8 @@ def _read_xref_subsections( get_entry: Callable[[int], Union[int, Tuple[int, ...]]], used_before: Callable[[int, Union[int, Tuple[int, ...]]], bool], ) -> None: - # last_end = 0 for start, size in self._pairs(idx_pairs): # The subsections must increase - # assert start >= last_end - # last_end = start + size for num in range(start, start + size): # The first entry is the type xref_type = get_entry(0) @@ -1994,10 +1983,10 @@ def decrypt(self, password: Union[str, bytes]) -> PasswordType: this library. Args: - password: The password to match. + password: The password to match. Returns: - A `PasswordType`. + A `PasswordType`. """ if not self._encryption: raise PdfReadError("Not encrypted file") @@ -2023,6 +2012,7 @@ def decode_permissions(self, permissions_code: int) -> Dict[str, bool]: def is_encrypted(self) -> bool: """ Read-only boolean property showing whether this PDF file is encrypted. + Note that this property, if true, will remain true even after the :meth:`decrypt()` method is called. """ @@ -2030,9 +2020,9 @@ def is_encrypted(self) -> bool: def getIsEncrypted(self) -> bool: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`is_encrypted` instead. - Use :py:attr:`is_encrypted` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("getIsEncrypted", "is_encrypted", "3.0.0") return self.is_encrypted @@ -2040,9 +2030,9 @@ def getIsEncrypted(self) -> bool: # deprecated @property def isEncrypted(self) -> bool: # deprecated """ - .. 
deprecated:: 1.28.0 + Use :py:attr:`is_encrypted` instead. - Use :py:attr:`is_encrypted` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("isEncrypted", "is_encrypted", "3.0.0") return self.is_encrypted @@ -2071,6 +2061,75 @@ def xfa(self) -> Optional[Dict[str, Any]]: retval[tag] = es return retval + def add_form_topname(self, name: str) -> Optional[DictionaryObject]: + """ + Add a top level form that groups all form fields below it. + + Args: + name: text string of the "/T" Attribute of the created object + + Returns: + The created object. ``None`` means no object was created. + """ + catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) + + if "/AcroForm" not in catalog or not isinstance( + catalog["/AcroForm"], DictionaryObject + ): + return None + acroform = cast(DictionaryObject, catalog[NameObject("/AcroForm")]) + if "/Fields" not in acroform: + # TODO: :No error returns but may be extended for XFA Forms + return None + + interim = DictionaryObject() + interim[NameObject("/T")] = TextStringObject(name) + interim[NameObject("/Kids")] = acroform[NameObject("/Fields")] + self.cache_indirect_object( + 0, + max([i for (g, i) in self.resolved_objects.keys() if g == 0]) + 1, + interim, + ) + arr = ArrayObject() + arr.append(interim.indirect_reference) + acroform[NameObject("/Fields")] = arr + for o in cast(ArrayObject, interim["/Kids"]): + obj = o.get_object() + if "/Parent" in obj: + logger_warning( + f"Top Level Form Field {obj.indirect_reference} have a non-expected parent", + __name__, + ) + obj[NameObject("/Parent")] = interim.indirect_reference + return interim + + def rename_form_topname(self, name: str) -> Optional[DictionaryObject]: + """ + Rename top level form field that all form fields below it. + + Args: + name: text string of the "/T" field of the created object + + Returns: + The modified object. ``None`` means no object was modified. 
+ """ + catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) + + if "/AcroForm" not in catalog or not isinstance( + catalog["/AcroForm"], DictionaryObject + ): + return None + acroform = cast(DictionaryObject, catalog[NameObject("/AcroForm")]) + if "/Fields" not in acroform: + return None + + interim = cast( + DictionaryObject, + cast(ArrayObject, acroform[NameObject("/Fields")])[0].get_object(), + ) + interim[NameObject("/T")] = TextStringObject(name) + return interim + class PdfFileReader(PdfReader): # deprecated def __init__(self, *args: Any, **kwargs: Any) -> None: diff --git a/pypdf/_security.py b/pypdf/_security.py index b2314f32cc..fd76558297 100644 --- a/pypdf/_security.py +++ b/pypdf/_security.py @@ -66,20 +66,20 @@ def _alg32( See section 3.5.2 of the PDF 1.6 reference. Args: - password: The encryption secret as a bytes-string - rev: The encryption revision (see PDF standard) - keylen: - owner_entry: - p_entry: A set of flags specifying which operations shall be permitted + password: The encryption secret as a bytes-string + rev: The encryption revision (see PDF standard) + keylen: + owner_entry: + p_entry: A set of flags specifying which operations shall be permitted when the document is opened with user access. If bit 2 is set to 1, all other bits are ignored and all operations are permitted. If bit 2 is set to 0, permission for operations are based on the values of the remaining flags defined in Table 24. - id1_entry: - metadata_encrypt: (Default value = True) + id1_entry: + metadata_encrypt: (Default value = True) Returns: - An MD5 hash of keylen characters. + An MD5 hash of keylen characters. """ # 1. Pad or truncate the password string to exactly 32 bytes. If the # password string is more than 32 bytes long, use only its first 32 bytes; @@ -129,13 +129,13 @@ def _alg33( section 3.5.2 of the PDF 1.6 reference. 
Args: - owner_password: - user_password: - rev: The encryption revision (see PDF standard) - keylen: + owner_password: + user_password: + rev: The encryption revision (see PDF standard) + keylen: Returns: - A transformed version of the owner and the user password + A transformed version of the owner and the user password """ # steps 1 - 4 key = _alg33_1(owner_password, rev, keylen) @@ -162,15 +162,15 @@ def _alg33( def _alg33_1(password: str, rev: Literal[2, 3, 4], keylen: int) -> bytes: """ - Steps 1-4 of algorithm 3.3 + Steps 1-4 of algorithm 3.3. Args: - password: - rev: The encryption revision (see PDF standard) - keylen: + password: + rev: The encryption revision (see PDF standard) + keylen: Returns: - A transformed version of the password + A transformed version of the password """ # 1. Pad or truncate the owner password string as described in step 1 of # algorithm 3.2. If there is no owner password, use the user password @@ -205,14 +205,14 @@ def _alg34( See section 3.5.2 of the PDF 1.6 reference. Args: - password: - owner_entry: - p_entry: A set of flags specifying which operations shall be permitted + password: + owner_entry: + p_entry: A set of flags specifying which operations shall be permitted when the document is opened with user access. If bit 2 is set to 1, all other bits are ignored and all operations are permitted. If bit 2 is set to 0, permission for operations are based on the values of the remaining flags defined in Table 24. - id1_entry: + id1_entry: Returns: A Tuple (u-value, key) @@ -246,20 +246,20 @@ def _alg35( See section 3.5.2 of the PDF 1.6 reference. Args: - password: - rev: The encryption revision (see PDF standard) - keylen: - owner_entry: - p_entry: A set of flags specifying which operations shall be permitted + password: + rev: The encryption revision (see PDF standard) + keylen: + owner_entry: + p_entry: A set of flags specifying which operations shall be permitted when the document is opened with user access. 
If bit 2 is set to 1, all other bits are ignored and all operations are permitted. If bit 2 is set to 0, permission for operations are based on the values of the remaining flags defined in Table 24. - id1_entry: - metadata_encrypt: A boolean + id1_entry: + metadata_encrypt: A boolean Returns: - A tuple (value, key) + A tuple (value, key) """ # 1. Create an encryption key based on the user password string, as # described in Algorithm 3.2. diff --git a/pypdf/_utils.py b/pypdf/_utils.py index f68e0c359e..a758a1efdc 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -102,11 +102,11 @@ def read_until_whitespace(stream: StreamType, maxchars: Optional[int] = None) -> Stops upon encountering whitespace or when maxchars is reached. Args: - stream: The data stream from which was read. - maxchars: The maximum number of bytes returned; by default unlimited. + stream: The data stream from which was read. + maxchars: The maximum number of bytes returned; by default unlimited. Returns: - The data which was read. + The data which was read. """ txt = b"" while True: @@ -124,10 +124,10 @@ def read_non_whitespace(stream: StreamType) -> bytes: Find and read the next non-whitespace character (ignores whitespace). Args: - stream: The data stream from which was read. + stream: The data stream from which was read. Returns: - The data which was read. + The data which was read. """ tok = stream.read(1) while tok in WHITESPACES: @@ -137,15 +137,14 @@ def read_non_whitespace(stream: StreamType) -> bytes: def skip_over_whitespace(stream: StreamType) -> bool: """ - Similar to read_non_whitespace, but return a boolean if more than - one whitespace character was read. + Similar to read_non_whitespace, but return a boolean if more than one + whitespace character was read. Args: - stream: The data stream from which was read. + stream: The data stream from which was read. Returns: - True if more than one whitespace was skipped, - otherwise return False. 
+ True if more than one whitespace was skipped, otherwise return False. """ tok = WHITESPACES[0] cnt = 0 @@ -169,10 +168,10 @@ def read_until_regex(stream: StreamType, regex: Pattern[bytes]) -> bytes: Treats EOF on the underlying stream as the end of the token to be matched. Args: - regex: re.Pattern + regex: re.Pattern Returns: - The read bytes. + The read bytes. """ name = b"" while True: @@ -196,11 +195,11 @@ def read_block_backwards(stream: StreamType, to_read: int) -> bytes: read. Args: - stream: - to_read: + stream: + to_read: Returns: - The data which was read. + The data which was read. """ if stream.tell() < to_read: raise PdfStreamError("Could not read malformed PDF file") @@ -223,10 +222,10 @@ def read_previous_line(stream: StreamType) -> bytes: or, if no such byte is found, at the beginning of the stream. Args: - stream: StreamType: + stream: StreamType: Returns: - The data which was read. + The data which was read. """ line_content = [] found_crlf = False @@ -350,7 +349,6 @@ def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]: def hexencode(b: bytes) -> bytes: - coder = getencoder("hex_codec") coded = coder(b) # type: ignore return coded[0] @@ -388,32 +386,24 @@ def deprecation(msg: str) -> None: def deprecate_with_replacement( old_name: str, new_name: str, removed_in: str = "3.0.0" ) -> None: - """ - Raise an exception that a feature will be removed, but has a replacement. - """ + """Raise an exception that a feature will be removed, but has a replacement.""" deprecate(DEPR_MSG.format(old_name, new_name, removed_in), 4) def deprecation_with_replacement( old_name: str, new_name: str, removed_in: str = "3.0.0" ) -> None: - """ - Raise an exception that a feature was already removed, but has a replacement. 
- """ + """Raise an exception that a feature was already removed, but has a replacement.""" deprecation(DEPR_MSG_HAPPENED.format(old_name, removed_in, new_name)) def deprecate_no_replacement(name: str, removed_in: str = "3.0.0") -> None: - """ - Raise an exception that a feature will be removed without replacement. - """ + """Raise an exception that a feature will be removed without replacement.""" deprecate(DEPR_MSG_NO_REPLACEMENT.format(name, removed_in), 4) def deprecation_no_replacement(name: str, removed_in: str = "3.0.0") -> None: - """ - Raise an exception that a feature was already removed without replacement. - """ + """Raise an exception that a feature was already removed without replacement.""" deprecation(DEPR_MSG_NO_REPLACEMENT_HAPPENED.format(name, removed_in)) @@ -460,6 +450,12 @@ def rename_kwargs( # type: ignore ): """ Helper function to deprecate arguments. + + Args: + func_name: Name of the function to be deprecated + kwargs: + aliases: + fail: """ for old_term, new_term in aliases.items(): diff --git a/pypdf/_version.py b/pypdf/_version.py index 88c513ea36..903a158add 100644 --- a/pypdf/_version.py +++ b/pypdf/_version.py @@ -1 +1 @@ -__version__ = "3.3.0" +__version__ = "3.4.0" diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 571df35894..f6cd581af5 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -245,13 +245,27 @@ def get_object( def getObject(self, ido: Union[int, IndirectObject]) -> PdfObject: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`get_object` instead. - Use :meth:`get_object` instead. + .. 
deprecated:: 1.28.0 """ deprecation_with_replacement("getObject", "get_object", "3.0.0") return self.get_object(ido) + def _replace_object( + self, + indirect_reference: Union[int, IndirectObject], + obj: PdfObject, + ) -> PdfObject: + if isinstance(indirect_reference, IndirectObject): + assert indirect_reference.pdf == self + indirect_reference = indirect_reference.idnum + self._objects[indirect_reference - 1] = obj + return self._objects[indirect_reference - 1] + if indirect_reference.pdf != self: + raise ValueError("pdf must be self") + return self._objects[indirect_reference.idnum - 1] # type: ignore + def _add_page( self, page: PageObject, @@ -273,7 +287,6 @@ def _add_page( except Exception: pass page = cast("PageObject", page_org.clone(self, False, excluded_keys)) - # page_ind = self._add_object(page) if page_org.pdf is not None: other = page_org.pdf.pdf_header if isinstance(other, str): @@ -314,18 +327,19 @@ def add_page( ) -> PageObject: """ Add a page to this PDF file. - Recommended for advanced usage including the adequate excluded_keys + + Recommended for advanced usage including the adequate excluded_keys. The page is usually acquired from a :class:`PdfReader` instance. Args: - page: The page to add to the document. Should be - an instance of :class:`PageObject` - excluded_keys: + page: The page to add to the document. Should be + an instance of :class:`PageObject` + excluded_keys: Returns: - The added PageObject. + The added PageObject. """ return self._add_page(page, list.append, excluded_keys) @@ -335,9 +349,9 @@ def addPage( excluded_keys: Iterable[str] = (), ) -> PageObject: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`add_page` instead. - Use :meth:`add_page` instead. + .. deprecated:: 1.28.0. """ deprecation_with_replacement("addPage", "add_page", "3.0.0") return self.add_page(page, excluded_keys) @@ -353,12 +367,12 @@ def insert_page( :class:`PdfReader` instance. Args: - page: The page to add to the document. 
- index: Position at which the page will be inserted. - excluded_keys: + page: The page to add to the document. + index: Position at which the page will be inserted. + excluded_keys: Returns: - The added PageObject. + The added PageObject. """ return self._add_page(page, lambda kids, p: kids.insert(index, p)) @@ -369,9 +383,9 @@ def insertPage( excluded_keys: Iterable[str] = (), ) -> PageObject: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`insert_page` instead. - Use :meth:`insert_page` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("insertPage", "insert_page", "3.0.0") return self.insert_page(page, index, excluded_keys) @@ -383,11 +397,11 @@ def get_page( Retrieve a page by number from this PDF file. Args: - page_number: The page number to retrieve - (pages begin at zero) + page_number: The page number to retrieve + (pages begin at zero) Returns: - The page at the index given by *page_number* + The page at the index given by *page_number* """ if pageNumber is not None: # deprecated if page_number is not None: @@ -404,9 +418,9 @@ def get_page( def getPage(self, pageNumber: int) -> PageObject: # deprecated """ - .. deprecated:: 1.28.0 + Use :code:`writer.pages[page_number]` instead. - Use :code:`writer.pages[page_number]` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("getPage", "writer.pages[page_number]", "3.0.0") return self.get_page(pageNumber) @@ -417,9 +431,9 @@ def _get_num_pages(self) -> int: def getNumPages(self) -> int: # deprecated """ - .. deprecated:: 1.28.0 + Use :code:`len(writer.pages)` instead. - Use :code:`len(writer.pages)` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("getNumPages", "len(writer.pages)", "3.0.0") return self._get_num_pages() @@ -433,60 +447,61 @@ def add_blank_page( self, width: Optional[float] = None, height: Optional[float] = None ) -> PageObject: """ - Append a blank page to this PDF file and returns it. 
If no page size - is specified, use the size of the last page. + Append a blank page to this PDF file and returns it. + + If no page size is specified, use the size of the last page. Args: - width: The width of the new page expressed in default user - space units. - height: The height of the new page expressed in default - user space units. + width: The width of the new page expressed in default user + space units. + height: The height of the new page expressed in default + user space units. Returns: - The newly appended page + The newly appended page Raises: - PageSizeNotDefinedError: if width and height are not defined - and previous page does not exist. + PageSizeNotDefinedError: if width and height are not defined + and previous page does not exist. """ page = PageObject.create_blank_page(self, width, height) - self.add_page(page) - return page + return self.add_page(page) def addBlankPage( self, width: Optional[float] = None, height: Optional[float] = None ) -> PageObject: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`add_blank_page` instead. - Use :meth:`add_blank_page` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("addBlankPage", "add_blank_page", "3.0.0") return self.add_blank_page(width, height) def insert_blank_page( self, - width: Optional[decimal.Decimal] = None, - height: Optional[decimal.Decimal] = None, + width: Optional[Union[float, decimal.Decimal]] = None, + height: Optional[Union[float, decimal.Decimal]] = None, index: int = 0, ) -> PageObject: """ - Insert a blank page to this PDF file and returns it. If no page size - is specified, use the size of the last page. + Insert a blank page to this PDF file and returns it. + + If no page size is specified, use the size of the last page. Args: - width: The width of the new page expressed in default user - space units. - height: The height of the new page expressed in default - user space units. - index: Position to add the page. 
+ width: The width of the new page expressed in default user + space units. + height: The height of the new page expressed in default + user space units. + index: Position to add the page. Returns: - The newly appended page + The newly appended page Raises: - PageSizeNotDefinedError: if width and height are not defined - and previous page does not exist. + PageSizeNotDefinedError: if width and height are not defined + and previous page does not exist. """ if width is None or height is None and (self._get_num_pages() - 1) >= index: oldpage = self.pages[index] @@ -498,14 +513,14 @@ def insert_blank_page( def insertBlankPage( self, - width: Optional[decimal.Decimal] = None, - height: Optional[decimal.Decimal] = None, + width: Optional[Union[float, decimal.Decimal]] = None, + height: Optional[Union[float, decimal.Decimal]] = None, index: int = 0, ) -> PageObject: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`insertBlankPage` instead. - Use :meth:`insertBlankPage` instead. + .. deprecated:: 1.28.0. """ deprecation_with_replacement("insertBlankPage", "insert_blank_page", "3.0.0") return self.insert_blank_page(width, height, index) @@ -515,9 +530,12 @@ def open_destination( self, ) -> Union[None, Destination, TextStringObject, ByteStringObject]: """ - Property to access the opening destination (``/OpenAction`` entry in the - PDF catalog). - it returns `None` if the entry does not exist is not set. + Property to access the opening destination (``/OpenAction`` entry in + the PDF catalog). it returns `None` if the entry does not exist is not + set. + + Raises: + Exception: If a destination is invalid """ if "/OpenAction" not in self._root_object: return None @@ -560,7 +578,7 @@ def add_js(self, javascript: str) -> None: Add Javascript which will launch upon opening this PDF. Args: - javascript: Your Javascript. + javascript: Your Javascript. 
>>> output.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") # Example: This will launch the print window when the PDF is opened. @@ -573,7 +591,6 @@ def add_js(self, javascript: str) -> None: names[NameObject("/JavaScript")] = DictionaryObject( {NameObject("/Names"): ArrayObject()} ) - # cast(DictionaryObject, names[NameObject("/JavaScript")])[NameObject("/Names")] = ArrayObject() js_list = cast( ArrayObject, cast(DictionaryObject, names["/JavaScript"])["/Names"] ) @@ -593,9 +610,9 @@ def add_js(self, javascript: str) -> None: def addJS(self, javascript: str) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`add_js` instead. - Use :meth:`add_js` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("addJS", "add_js", "3.0.0") return self.add_js(javascript) @@ -609,8 +626,8 @@ def add_attachment(self, filename: str, data: Union[str, bytes]) -> None: Section 7.11.3 Args: - filename: The filename to display. - data: The data in the file. + filename: The filename to display. + data: The data in the file. """ # We need three entries: # * The file's data @@ -686,9 +703,9 @@ def add_attachment(self, filename: str, data: Union[str, bytes]) -> None: def addAttachment(self, fname: str, fdata: Union[str, bytes]) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`add_attachment` instead. - Use :meth:`add_attachment` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("addAttachment", "add_attachment", "3.0.0") return self.add_attachment(fname, fdata) @@ -705,14 +722,15 @@ def append_pages_from_reader( `append` should be prefered. Args: - reader: a PdfReader object from which to copy page - annotations to this writer object. The writer's annots - will then be updated - after_page_append: - Callback function that is invoked after each page is appended to - the writer. Signature includes a reference to the appended page - (delegates to append_pages_from_reader). 
The single parameter of the - callback is a reference to the page just appended to the document. + reader: a PdfReader object from which to copy page + annotations to this writer object. The writer's annots + will then be updated + after_page_append: + Callback function that is invoked after each page is appended to + the writer. Signature includes a reference to the appended page + (delegates to append_pages_from_reader). The single parameter of + the callback is a reference to the page just appended to the + document. """ # Get page count from writer and reader reader_num_pages = len(reader.pages) @@ -730,9 +748,9 @@ def appendPagesFromReader( after_page_append: Optional[Callable[[PageObject], None]] = None, ) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`append_pages_from_reader` instead. - Use :meth:`append_pages_from_reader` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement( "appendPagesFromReader", "append_pages_from_reader", "3.0.0" @@ -752,13 +770,13 @@ def update_page_form_field_values( If the field links to a parent object, add the information to the parent. Args: - page: Page reference from PDF writer where the - annotations and field data will be updated. - fields: a Python dictionary of field names (/T) and text - values (/V) - flags: An integer (0 to 7). The first bit sets ReadOnly, the - second bit sets Required, the third bit sets NoExport. See - PDF Reference Table 8.70 for details. + page: Page reference from PDF writer where the + annotations and field data will be updated. + fields: a Python dictionary of field names (/T) and text + values (/V) + flags: An integer (0 to 7). The first bit sets ReadOnly, the + second bit sets Required, the third bit sets NoExport. See + PDF Reference Table 8.70 for details. 
""" self.set_need_appearances_writer() # Iterate through pages, update field values @@ -812,9 +830,9 @@ def updatePageFormFieldValues( flags: FieldFlag = OPTIONAL_READ_WRITE_FIELD, ) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`update_page_form_field_values` instead. - Use :meth:`update_page_form_field_values` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement( "updatePageFormFieldValues", "update_page_form_field_values", "3.0.0" @@ -824,11 +842,11 @@ def updatePageFormFieldValues( def clone_reader_document_root(self, reader: PdfReader) -> None: """ Copy the reader document root to the writer and all sub elements, - including pages, threads, outlines,... - For partial insertion, `append` should be considered. + including pages, threads, outlines,... For partial insertion, `append` + should be considered. Args: - reader: PdfReader from the document root should be copied. + reader: PdfReader from the document root should be copied. """ self._root_object = cast(DictionaryObject, reader.trailer[TK.ROOT].clone(self)) self._root = self._root_object.indirect_reference # type: ignore[assignment] @@ -841,9 +859,9 @@ def clone_reader_document_root(self, reader: PdfReader) -> None: def cloneReaderDocumentRoot(self, reader: PdfReader) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`clone_reader_document_root` instead. - Use :meth:`clone_reader_document_root` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement( "cloneReaderDocumentRoot", "clone_reader_document_root", "3.0.0" @@ -898,17 +916,18 @@ def clone_document_from_reader( after_page_append: Optional[Callable[[PageObject], None]] = None, ) -> None: """ - Create a copy (clone) of a document from a PDF file reader - cloning section '/Root' and '/Info' and '/ID' of the pdf + Create a copy (clone) of a document from a PDF file reader cloning + section '/Root' and '/Info' and '/ID' of the pdf. 
Args: - reader: PDF file reader instance from which the clone - should be created. - after_page_append: - Callback function that is invoked after each page is appended to - the writer. Signature includes a reference to the appended page - (delegates to append_pages_from_reader). The single parameter of the - callback is a reference to the page just appended to the document. + reader: PDF file reader instance from which the clone + should be created. + after_page_append: + Callback function that is invoked after each page is appended to + the writer. Signature includes a reference to the appended page + (delegates to append_pages_from_reader). The single parameter of + the callback is a reference to the page just appended to the + document. """ self.clone_reader_document_root(reader) self._info = reader.trailer[TK.INFO].clone(self).indirect_reference # type: ignore @@ -928,9 +947,9 @@ def cloneDocumentFromReader( after_page_append: Optional[Callable[[PageObject], None]] = None, ) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`clone_document_from_reader` instead. - Use :meth:`clone_document_from_reader` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement( "cloneDocumentFromReader", "clone_document_from_reader", "3.0.0" @@ -950,21 +969,21 @@ def encrypt( Encrypt this PDF file with the PDF Standard encryption handler. Args: - user_password: The password which allows for opening - and reading the PDF file with the restrictions provided. - owner_password: The password which allows for - opening the PDF files without any restrictions. By default, - the owner password is the same as the user password. - use_128bit: flag as to whether to use 128bit - encryption. When false, 40bit encryption will be used. By default, - this flag is on. - permissions_flag: permissions as described in - TABLE 3.20 of the PDF 1.7 specification. A bit value of 1 means the - permission is grantend. Hence an integer value of -1 will set all - flags. 
- Bit position 3 is for printing, 4 is for modifying content, 5 and 6 - control annotations, 9 for form fields, 10 for extraction of - text and graphics. + user_password: The password which allows for opening + and reading the PDF file with the restrictions provided. + owner_password: The password which allows for + opening the PDF files without any restrictions. By default, + the owner password is the same as the user password. + use_128bit: flag as to whether to use 128bit + encryption. When false, 40bit encryption will be used. + By default, this flag is on. + permissions_flag: permissions as described in + TABLE 3.20 of the PDF 1.7 specification. A bit value of 1 means + the permission is grantend. + Hence an integer value of -1 will set all flags. + Bit position 3 is for printing, 4 is for modifying content, + 5 and 6 control annotations, 9 for form fields, + 10 for extraction of text and graphics. """ if user_pwd is not None: if user_password is not None: @@ -1012,7 +1031,7 @@ def encrypt( rev = 2 keylen = int(40 / 8) P = permissions_flag - O = ByteStringObject(_alg33(owner_password, user_password, rev, keylen)) # type: ignore[arg-type] + O = ByteStringObject(_alg33(owner_password, user_password, rev, keylen)) # type: ignore[arg-type] # noqa ID_1 = ByteStringObject(md5((repr(time.time())).encode("utf8")).digest()) ID_2 = ByteStringObject(md5((repr(random.random())).encode("utf8")).digest()) self._ID = ArrayObject((ID_1, ID_2)) @@ -1064,10 +1083,10 @@ def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO]: Write the collection of pages added to this object out as a PDF file. Args: - stream: An object to write the file to. The object can support - the write method and the tell method, similar to a file object, or - be a file path, just like the fileobj, just named it stream to keep - existing workflow. + stream: An object to write the file to. 
The object can support + the write method and the tell method, similar to a file object, or + be a file path, just like the fileobj, just named it stream to keep + existing workflow. Returns: A tuple (bool, IO) @@ -1142,8 +1161,8 @@ def add_metadata(self, infos: Dict[str, Any]) -> None: Add custom metadata to the output. Args: - infos: a Python dictionary where each key is a field - and each value is your new metadata. + infos: a Python dictionary where each key is a field + and each value is your new metadata. """ args = {} for key, value in list(infos.items()): @@ -1152,9 +1171,9 @@ def add_metadata(self, infos: Dict[str, Any]) -> None: def addMetadata(self, infos: Dict[str, Any]) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`add_metadata` instead. - Use :meth:`add_metadata` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("addMetadata", "add_metadata", "3.0.0") self.add_metadata(infos) @@ -1249,10 +1268,10 @@ def _resolve_indirect_object(self, data: IndirectObject) -> IndirectObject: and new idnum is given and generation is always 0. Args: - data: + data: Returns: - The resolved indirect object + The resolved indirect object """ if hasattr(data.pdf, "stream") and data.pdf.stream.closed: raise ValueError(f"I/O operation on closed file: {data.pdf.stream.name}") @@ -1293,9 +1312,9 @@ def get_reference(self, obj: PdfObject) -> IndirectObject: def getReference(self, obj: PdfObject) -> IndirectObject: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`get_reference` instead. - Use :meth:`get_reference` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("getReference", "get_reference", "3.0.0") return self.get_reference(obj) @@ -1336,7 +1355,9 @@ def get_threads_root(self) -> ArrayObject: @property def threads(self) -> ArrayObject: """ - Read-only property for the list of threads see §8.3.2 from PDF 1.7 spec + Read-only property for the list of threads. + + See §8.3.2 from PDF 1.7 spec. 
Each element is a dictionaries with ``/F`` and ``/I`` keys. """ @@ -1344,9 +1365,9 @@ def threads(self) -> ArrayObject: def getOutlineRoot(self) -> TreeObject: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`get_outline_root` instead. - Use :meth:`get_outline_root` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("getOutlineRoot", "get_outline_root", "3.0.0") return self.get_outline_root() @@ -1388,9 +1409,9 @@ def get_named_dest_root(self) -> ArrayObject: def getNamedDestRoot(self) -> ArrayObject: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`get_named_dest_root` instead. - Use :meth:`get_named_dest_root` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("getNamedDestRoot", "get_named_dest_root", "3.0.0") return self.get_named_dest_root() @@ -1439,9 +1460,9 @@ def add_bookmark_destination( parent: Union[None, TreeObject, IndirectObject] = None, ) -> IndirectObject: # deprecated """ - .. deprecated:: 2.9.0 + Use :meth:`add_outline_item_destination` instead. - Use :meth:`add_outline_item_destination` instead. + .. deprecated:: 2.9.0 """ deprecation_with_replacement( "add_bookmark_destination", "add_outline_item_destination", "3.0.0" @@ -1452,9 +1473,9 @@ def addBookmarkDestination( self, dest: PageObject, parent: Optional[TreeObject] = None ) -> IndirectObject: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`add_outline_item_destination` instead. - Use :meth:`add_outline_item_destination` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement( "addBookmarkDestination", "add_outline_item_destination", "3.0.0" @@ -1488,9 +1509,9 @@ def add_bookmark_dict( self, outline_item: OutlineItemType, parent: Optional[TreeObject] = None ) -> IndirectObject: # deprecated """ - .. deprecated:: 2.9.0 + Use :meth:`add_outline_item_dict` instead. - Use :meth:`add_outline_item_dict` instead. + .. 
deprecated:: 2.9.0 """ deprecation_with_replacement( "add_bookmark_dict", "add_outline_item_dict", "3.0.0" @@ -1502,9 +1523,9 @@ def addBookmarkDict( self, outline_item: OutlineItemType, parent: Optional[TreeObject] = None ) -> IndirectObject: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`add_outline_item_dict` instead. - Use :meth:`add_outline_item_dict` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement( "addBookmarkDict", "add_outline_item_dict", "3.0.0" @@ -1527,20 +1548,19 @@ def add_outline_item( Add an outline item (commonly referred to as a "Bookmark") to the PDF file. Args: - title: Title to use for this outline item. - page_number: Page number this outline item will point to. - parent: A reference to a parent outline item to create nested - outline items. - parent: A reference to a parent outline item to create nested - outline items. - color: Color of the outline item's font as a red, green, blue tuple - from 0.0 to 1.0 or as a Hex String (#RRGGBB) - bold: Outline item font is bold - italic: Outline item font is italic - fit: The fit of the destination page. + title: Title to use for this outline item. + page_number: Page number this outline item will point to. + parent: A reference to a parent outline item to create nested + outline items. + before: + color: Color of the outline item's font as a red, green, blue tuple + from 0.0 to 1.0 or as a Hex String (#RRGGBB) + bold: Outline item font is bold + italic: Outline item font is italic + fit: The fit of the destination page. Returns: - The added outline item as an indirect object. + The added outline item as an indirect object. """ page_ref: Union[None, NullObject, IndirectObject, NumberObject] if isinstance(italic, Fit): # it means that we are on the old params @@ -1604,9 +1624,9 @@ def add_bookmark( *args: ZoomArgType, ) -> IndirectObject: # deprecated """ - .. deprecated:: 2.9.0 + Use :meth:`add_outline_item` instead. - Use :meth:`add_outline_item` instead. + .. 
deprecated:: 2.9.0 """ deprecation_with_replacement("add_bookmark", "add_outline_item", "3.0.0") return self.add_outline_item( @@ -1631,9 +1651,9 @@ def addBookmark( *args: ZoomArgType, ) -> IndirectObject: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`add_outline_item` instead. - Use :meth:`add_outline_item` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("addBookmark", "add_outline_item", "3.0.0") return self.add_outline_item( @@ -1702,9 +1722,9 @@ def addNamedDestinationObject( self, dest: Destination ) -> IndirectObject: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`add_named_destination_object` instead. - Use :meth:`add_named_destination_object` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement( "addNamedDestinationObject", "add_named_destination_object", "3.0.0" @@ -1757,9 +1777,9 @@ def addNamedDestination( self, title: str, pagenum: int ) -> IndirectObject: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`add_named_destination` instead. - Use :meth:`add_named_destination` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement( "addNamedDestination", "add_named_destination", "3.0.0" @@ -1777,9 +1797,9 @@ def remove_links(self) -> None: def removeLinks(self) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`remove_links` instead. - Use :meth:`remove_links` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("removeLinks", "remove_links", "3.0.0") return self.remove_links() @@ -1789,8 +1809,8 @@ def remove_images(self, ignore_byte_string_object: bool = False) -> None: Remove images from this output. Args: - ignore_byte_string_object: optional parameter - to ignore ByteString Objects. + ignore_byte_string_object: optional parameter + to ignore ByteString Objects. 
""" pg_dict = cast(DictionaryObject, self.get_object(self._pages)) pages = cast(ArrayObject, pg_dict[PA.KIDS]) @@ -1823,6 +1843,8 @@ def remove_images(self, ignore_byte_string_object: bool = False) -> None: ) for page in pages: page_ref = cast(DictionaryObject, self.get_object(page)) + if "/Contents" not in page_ref: + return content = page_ref["/Contents"].get_object() if not isinstance(content, ContentStream): content = ContentStream(content, page_ref) @@ -1864,9 +1886,9 @@ def remove_images(self, ignore_byte_string_object: bool = False) -> None: def removeImages(self, ignoreByteStringObject: bool = False) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`remove_images` instead. - Use :meth:`remove_images` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("removeImages", "remove_images", "3.0.0") return self.remove_images(ignoreByteStringObject) @@ -1876,7 +1898,7 @@ def remove_text(self, ignore_byte_string_object: bool = False) -> None: Remove text from this output. Args: - ignore_byte_string_object: optional parameter + ignore_byte_string_object: optional parameter """ pg_dict = cast(DictionaryObject, self.get_object(self._pages)) pages = cast(List[IndirectObject], pg_dict[PA.KIDS]) @@ -1917,9 +1939,9 @@ def remove_text(self, ignore_byte_string_object: bool = False) -> None: def removeText(self, ignoreByteStringObject: bool = False) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`remove_text` instead. - Use :meth:`remove_text` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("removeText", "remove_text", "3.0.0") return self.remove_text(ignoreByteStringObject) @@ -1934,17 +1956,19 @@ def add_uri( ) -> None: """ Add an URI from a rectangular area to the specified page. + This uses the basic structure of :meth:`add_link` Args: - page_number: index of the page on which to place the URI action. - uri: URI of resource to link to. 
- rect: :class:`RectangleObject` or - array of four integers specifying the clickable rectangular area - ``[xLL, yLL, xUR, yUR]``, or string in the form ``"[ xLL yLL xUR yUR ]"``. - border: if provided, an array describing border-drawing - properties. See the PDF spec for details. No border will be - drawn if this argument is omitted. + page_number: index of the page on which to place the URI action. + uri: URI of resource to link to. + rect: :class:`RectangleObject` or + array of four integers specifying the clickable rectangular area + ``[xLL, yLL, xUR, yUR]``, or string in the form + ``"[ xLL yLL xUR yUR ]"``. + border: if provided, an array describing border-drawing + properties. See the PDF spec for details. No border will be + drawn if this argument is omitted. """ if pagenum is not None: warnings.warn( @@ -2008,9 +2032,9 @@ def addURI( border: Optional[ArrayObject] = None, ) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`add_uri` instead. - Use :meth:`add_uri` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("addURI", "add_uri", "3.0.0") return self.add_uri(pagenum, uri, rect, border) @@ -2056,9 +2080,9 @@ def addLink( *args: ZoomArgType, ) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`add_link` instead. - Use :meth:`add_link` instead. + .. deprecated:: 1.28.0 """ deprecate_with_replacement( "addLink", "add_annotation(AnnotationBuilder.link(...))", "4.0.0" @@ -2083,9 +2107,9 @@ def _get_page_layout(self) -> Optional[LayoutType]: def getPageLayout(self) -> Optional[LayoutType]: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`page_layout` instead. - Use :py:attr:`page_layout` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("getPageLayout", "page_layout", "3.0.0") return self._get_page_layout() @@ -2095,7 +2119,7 @@ def _set_page_layout(self, layout: Union[NameObject, LayoutType]) -> None: Set the page layout. Args: - str: layout: The page layout to be used. 
+ layout: The page layout to be used. .. list-table:: Valid ``layout`` arguments :widths: 50 200 @@ -2129,7 +2153,7 @@ def set_page_layout(self, layout: LayoutType) -> None: Set the page layout. Args: - layout: The page layout to be used + layout: The page layout to be used .. list-table:: Valid ``layout`` arguments :widths: 50 200 @@ -2153,9 +2177,9 @@ def set_page_layout(self, layout: LayoutType) -> None: def setPageLayout(self, layout: LayoutType) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`page_layout` instead. - Use :py:attr:`page_layout` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement( "writer.setPageLayout(val)", "writer.page_layout = val", "3.0.0" @@ -2194,9 +2218,9 @@ def page_layout(self, layout: LayoutType) -> None: @property def pageLayout(self) -> Optional[LayoutType]: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`page_layout` instead. - Use :py:attr:`page_layout` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("pageLayout", "page_layout", "3.0.0") return self.page_layout @@ -2204,9 +2228,9 @@ def pageLayout(self) -> Optional[LayoutType]: # deprecated @pageLayout.setter def pageLayout(self, layout: LayoutType) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`page_layout` instead. - Use :py:attr:`page_layout` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("pageLayout", "page_layout", "3.0.0") self.page_layout = layout @@ -2228,18 +2252,18 @@ def _get_page_mode(self) -> Optional[PagemodeType]: def getPageMode(self) -> Optional[PagemodeType]: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`page_mode` instead. - Use :py:attr:`page_mode` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("getPageMode", "page_mode", "3.0.0") return self._get_page_mode() def set_page_mode(self, mode: PagemodeType) -> None: """ - .. deprecated:: 1.28.0 + Use :py:attr:`page_mode` instead. - Use :py:attr:`page_mode` instead. + .. 
deprecated:: 1.28.0 """ if isinstance(mode, NameObject): mode_name: NameObject = mode @@ -2253,9 +2277,9 @@ def set_page_mode(self, mode: PagemodeType) -> None: def setPageMode(self, mode: PagemodeType) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`page_mode` instead. - Use :py:attr:`page_mode` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement( "writer.setPageMode(val)", "writer.page_mode = val", "3.0.0" @@ -2292,9 +2316,9 @@ def page_mode(self, mode: PagemodeType) -> None: @property def pageMode(self) -> Optional[PagemodeType]: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`page_mode` instead. - Use :py:attr:`page_mode` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("pageMode", "page_mode", "3.0.0") return self.page_mode @@ -2302,9 +2326,9 @@ def pageMode(self) -> Optional[PagemodeType]: # deprecated @pageMode.setter def pageMode(self, mode: PagemodeType) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :py:attr:`page_mode` instead. - Use :py:attr:`page_mode` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("pageMode", "page_mode", "3.0.0") self.page_mode = mode @@ -2341,10 +2365,10 @@ def clean_page(self, page: Union[PageObject, IndirectObject]) -> PageObject: (required for names/dests list) Args: - page: + page: Returns: - The cleaned PageObject + The cleaned PageObject """ page = cast("PageObject", page.get_object()) for a in page.get("/Annots", []): @@ -2414,23 +2438,23 @@ def append( position. Args: - fileobj: A File Object or an object that supports the standard - read and seek methods similar to a File Object. Could also be a - string representing a path to a PDF file. - outline_item: Optionally, you may specify a string to build an outline - (aka 'bookmark') to identify the - beginning of the included file. 
- pages: Can be a :class:`PageRange` - or a ``(start, stop[, step])`` tuple - or a list of pages to be processed - to merge only the specified range of pages from the source - document into the output document. - import_outline: You may prevent the source document's - outline (collection of outline items, previously referred to as - 'bookmarks') from being imported by specifying this as ``False``. - excluded_fields: Provide the list of fields/keys to be ignored - if ``/Annots`` is part of the list, the annotation will be ignored - if ``/B`` is part of the list, the articles will be ignored + fileobj: A File Object or an object that supports the standard + read and seek methods similar to a File Object. Could also be a + string representing a path to a PDF file. + outline_item: Optionally, you may specify a string to build an + outline (aka 'bookmark') to identify the beginning of the + included file. + pages: Can be a :class:`PageRange` + or a ``(start, stop[, step])`` tuple + or a list of pages to be processed + to merge only the specified range of pages from the source + document into the output document. + import_outline: You may prevent the source document's + outline (collection of outline items, previously referred to as + 'bookmarks') from being imported by specifying this as ``False``. 
+ excluded_fields: Provide the list of fields/keys to be ignored + if ``/Annots`` is part of the list, the annotation will be ignored + if ``/B`` is part of the list, the articles will be ignored """ if excluded_fields is None: excluded_fields = () @@ -2440,10 +2464,22 @@ def append( excluded_fields = import_outline import_outline = pages pages = outline_item - self.merge(None, fileobj, None, pages, import_outline, excluded_fields) + self.merge( + None, + fileobj, + None, + pages, + import_outline, + excluded_fields, + ) else: # if isinstance(outline_item,str): self.merge( - None, fileobj, outline_item, pages, import_outline, excluded_fields + None, + fileobj, + outline_item, + pages, + import_outline, + excluded_fields, ) @deprecation_bookmark(bookmark="outline_item", import_bookmarks="import_outline") @@ -2461,25 +2497,28 @@ def merge( specified page number. Args: - position: The *page number* to insert this file. File will - be inserted after the given number. - fileobj: A File Object or an object that supports the standard - read and seek methods similar to a File Object. Could also be a - string representing a path to a PDF file. - outline_item: Optionally, you may specify a string to build an outline - (aka 'bookmark') to identify the - beginning of the included file. - pages: can be a :class:`PageRange` - or a ``(start, stop[, step])`` tuple - or a list of pages to be processed - to merge only the specified range of pages from the source - document into the output document. - import_outline: You may prevent the source document's - outline (collection of outline items, previously referred to as - 'bookmarks') from being imported by specifying this as ``False``. - excluded_fields: provide the list of fields/keys to be ignored - if ``/Annots`` is part of the list, the annotation will be ignored - if ``/B`` is part of the list, the articles will be ignored + position: The *page number* to insert this file. File will + be inserted after the given number. 
+ fileobj: A File Object or an object that supports the standard + read and seek methods similar to a File Object. Could also be a + string representing a path to a PDF file. + outline_item: Optionally, you may specify a string to build an outline + (aka 'bookmark') to identify the + beginning of the included file. + pages: can be a :class:`PageRange` + or a ``(start, stop[, step])`` tuple + or a list of pages to be processed + to merge only the specified range of pages from the source + document into the output document. + import_outline: You may prevent the source document's + outline (collection of outline items, previously referred to as + 'bookmarks') from being imported by specifying this as ``False``. + excluded_fields: provide the list of fields/keys to be ignored + if ``/Annots`` is part of the list, the annotation will be ignored + if ``/B`` is part of the list, the articles will be ignored + + Raises: + TypeError: The pages attribute is not configured properly """ if isinstance(fileobj, PdfReader): reader = fileobj @@ -2525,7 +2564,6 @@ def merge( ) # need for the outline processing below for dest in reader._namedDests.values(): arr = dest.dest_array - # try: if isinstance(dest["/Page"], NullObject): pass # self.add_named_destination_array(dest["/Title"],arr) elif dest["/Page"].indirect_reference.idnum in srcpages: @@ -2533,8 +2571,6 @@ def merge( dest["/Page"].indirect_reference.idnum ].indirect_reference self.add_named_destination_array(dest["/Title"], arr) - # except Exception as e: - # logger_warning(f"can not insert {dest} : {e.msg}",__name__) outline_item_typ: TreeObject if outline_item is not None: @@ -2567,6 +2603,34 @@ def merge( pag[NameObject("/Annots")] = lst self.clean_page(pag) + if "/AcroForm" in cast(DictionaryObject, reader.trailer["/Root"]): + if "/AcroForm" not in self._root_object: + self._root_object[NameObject("/AcroForm")] = ( + cast( + DictionaryObject, + cast(DictionaryObject, reader.trailer["/Root"])["/AcroForm"], + ) + 
.clone(self, False, ("/Fields",)) + .indirect_reference + ) + arr = ArrayObject() + else: + arr = cast( + ArrayObject, + cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"], + ) + trslat = self._id_translated[id(reader)] + for f in reader.trailer["/Root"]["/AcroForm"]["/Fields"]: # type: ignore + try: + ind = IndirectObject(trslat[f.idnum], 0, self) + if ind not in arr: + arr.append(ind) + except KeyError: + pass + cast(DictionaryObject, self._root_object["/AcroForm"])[ + NameObject("/Fields") + ] = arr + if "/B" not in excluded_fields: self.add_filtered_articles("", srcpages, reader) @@ -2579,15 +2643,15 @@ def _add_articles_thread( reader: PdfReader, ) -> IndirectObject: """ - Clone the thread with only the applicable articles + Clone the thread with only the applicable articles. Args: - thread: - pages: - reader: + thread: + pages: + reader: Returns: - The added thread as an indirect reference + The added thread as an indirect reference """ nthread = thread.clone( self, force_duplicate=True, ignore_fields=("/F",) @@ -2643,12 +2707,12 @@ def add_filtered_articles( reader: PdfReader, ) -> None: """ - Add articles matching the defined criteria + Add articles matching the defined criteria. Args: - fltr: - pages: - reader: + fltr: + pages: + reader: """ if isinstance(fltr, str): fltr = re.compile(fltr) @@ -2673,8 +2737,6 @@ def _get_cloned_page( return None if isinstance(page, int): _i = reader.pages[page].indirect_reference - # elif isinstance(page, PageObject): - # _i = page.indirect_reference elif isinstance(page, DictionaryObject) and page.get("/Type", "") == "/Page": _i = page.indirect_reference elif isinstance(page, IndirectObject): @@ -2745,15 +2807,19 @@ def _get_filtered_outline( Extract outline item entries that are part of the specified page set. Args: - node: - pages: - reader: + node: + pages: + reader: Returns: - A list of destination objects. + A list of destination objects. 
""" new_outline = [] + if node is None: + node = NullObject() node = node.get_object() + if isinstance(node, NullObject): + node = DictionaryObject() if node.get("/Type", "") == "/Outlines" or "/Title" not in node: node = node.get("/First", None) if node is not None: @@ -2784,10 +2850,6 @@ def _clone_outline(self, dest: Destination) -> TreeObject: if not isinstance(dest["/Page"], NullObject): if dest.node is not None and "/A" in dest.node: n_ol[NameObject("/A")] = dest.node["/A"].clone(self) - # elif "/D" in dest.node: - # n_ol[NameObject("/Dest")] = dest.node["/D"].clone(self) - # elif "/Dest" in dest.node: - # n_ol[NameObject("/Dest")] = dest.node["/Dest"].clone(self) else: n_ol[NameObject("/Dest")] = dest.dest_array # TODO: /SE @@ -2808,7 +2870,7 @@ def _insert_filtered_outline( ) -> None: for dest in outlines: # TODO : can be improved to keep A and SE entries (ignored for the moment) - # np=self.add_outline_item_destination(dest,parent,before) + # with np=self.add_outline_item_destination(dest,parent,before) if dest.get("/Type", "") == "/Outlines" or "/Title" not in dest: np = parent else: @@ -2817,7 +2879,7 @@ def _insert_filtered_outline( self._insert_filtered_outline(dest.childs, np, None) def close(self) -> None: - """To match the functions from Merger""" + """To match the functions from Merger.""" return # @deprecation_bookmark(bookmark="outline_item") @@ -2867,12 +2929,13 @@ def reset_translation( self, reader: Union[None, PdfReader, IndirectObject] = None ) -> None: """ - reset the translation table between reader and the writer object. - late cloning will create new independent objects + Reset the translation table between reader and the writer object. + + Late cloning will create new independent objects. Args: - reader: PdfReader or IndirectObject refering a PdfReader object. - if set to None or omitted, all tables will be reset. + reader: PdfReader or IndirectObject refering a PdfReader object. + if set to None or omitted, all tables will be reset. 
""" if reader is None: self._id_translated = {} @@ -2947,10 +3010,11 @@ def _set_page_label( ) -> None: """ Set a page label to a range of pages. - Page indexes must be given starting from 0. - Labels must have a style, a prefix or both. - If to a range is not assigned any page label a decimal label starting - from 1 is applied. + + Page indexes must be given + starting from 0. Labels must have a style, a prefix or both. If to a + range is not assigned any page label a decimal label starting from 1 is + applied. Args: page_index_from: page index of the beginning of the range starting from 0 @@ -3048,13 +3112,8 @@ def _create_outline_item( if color: if isinstance(color, str): color = hex_to_rgb(color) - prec = decimal.Decimal("1.00000") outline_item.update( - { - NameObject("/C"): ArrayObject( - [FloatObject(decimal.Decimal(c).quantize(prec)) for c in color] - ) - } + {NameObject("/C"): ArrayObject([FloatObject(c) for c in color])} ) if italic or bold: format_flag = 0 diff --git a/pypdf/constants.py b/pypdf/constants.py index cd8cc1ecf4..9ecdf0e821 100644 --- a/pypdf/constants.py +++ b/pypdf/constants.py @@ -43,14 +43,14 @@ class EncryptionDictAttributes: """ R = "/R" # number, required; revision of the standard security handler - O = "/O" # 32-byte string, required + O = "/O" # 32-byte string, required # noqa U = "/U" # 32-byte string, required P = "/P" # integer flag, required; permitted operations ENCRYPT_METADATA = "/EncryptMetadata" # boolean flag, optional class UserAccessPermissions(IntFlag): - """TABLE 3.20 User access permissions""" + """TABLE 3.20 User access permissions.""" R1 = 1 R2 = 2 @@ -145,7 +145,7 @@ class PageAttributes: class FileSpecificationDictionaryEntries: - """TABLE 3.41 Entries in a file specification dictionary""" + """TABLE 3.41 Entries in a file specification dictionary.""" Type = "/Type" FS = "/FS" # The name of the file system to be used to interpret this file specification @@ -261,7 +261,7 @@ class GoToActionArguments: class 
AnnotationDictionaryAttributes: - """TABLE 8.15 Entries common to all annotation dictionaries""" + """TABLE 8.15 Entries common to all annotation dictionaries.""" Type = "/Type" Subtype = "/Subtype" @@ -334,7 +334,7 @@ def attributes_dict(cls) -> Dict[str, str]: class CheckboxRadioButtonAttributes: - """TABLE 8.76 Field flags common to all field types""" + """TABLE 8.76 Field flags common to all field types.""" Opt = "/Opt" # Options, Optional @@ -350,7 +350,7 @@ def attributes_dict(cls) -> Dict[str, str]: class FieldFlag(IntFlag): - """TABLE 8.70 Field flags common to all field types""" + """TABLE 8.70 Field flags common to all field types.""" READ_ONLY = 1 REQUIRED = 2 @@ -424,7 +424,7 @@ class CatalogDictionary: class OutlineFontFlag(IntFlag): - """A class used as an enumerable flag for formatting an outline font""" + """A class used as an enumerable flag for formatting an outline font.""" italic = 1 bold = 2 diff --git a/pypdf/filters.py b/pypdf/filters.py index c3e4d0321a..ac20e5928d 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -194,10 +194,8 @@ def encode(data: bytes) -> bytes: class ASCIIHexDecode: - """ - The ASCIIHexDecode filter decodes data that has been encoded in ASCII - hexadecimal form into a base-7 ASCII format. - """ + """The ASCIIHexDecode filter decodes data that has been encoded in ASCII + hexadecimal form into a base-7 ASCII format.""" @staticmethod def decode( @@ -246,8 +244,11 @@ def decode( class LZWDecode: - """Taken from: - http://www.java2s.com/Open-Source/Java-Document/PDF/PDF-Renderer/com/sun/pdfview/decode/LZWDecode.java.htm + """ + Taken from: + + http://www.java2s.com/Open-Source/Java-Document/PDF/PDF- + Renderer/com/sun/pdfview/decode/LZWDecode.java.htm """ class Decoder: diff --git a/pypdf/generic/_annotations.py b/pypdf/generic/_annotations.py index 26dfd63daa..2416b0a572 100644 --- a/pypdf/generic/_annotations.py +++ b/pypdf/generic/_annotations.py @@ -37,14 +37,14 @@ def text( Add text annotation. 
Args: - rect: array of four integers ``[xLL, yLL, xUR, yUR]`` - specifying the clickable rectangular area - text: The text that is added to the document - open: - flags: + rect: array of four integers ``[xLL, yLL, xUR, yUR]`` + specifying the clickable rectangular area + text: The text that is added to the document + open: + flags: Returns: - A dictionary object representing the annotation. + A dictionary object representing the annotation. """ # TABLE 8.23 Additional entries specific to a text annotation text_obj = DictionaryObject( @@ -75,19 +75,20 @@ def free_text( Add text in a rectangle to a page. Args: - text: Text to be added - rect: array of four integers ``[xLL, yLL, xUR, yUR]`` - specifying the clickable rectangular area - font: Name of the Font, e.g. 'Helvetica' - bold: Print the text in bold - italic: Print the text in italic - font_size: How big the text will be, e.g. '14pt' - font_color: Hex-string for the color, e.g. cdcdcd - border_color: Hex-string for the border color, e.g. cdcdcd - background_color: Hex-string for the background of the annotation, e.g. cdcdcd + text: Text to be added + rect: array of four integers ``[xLL, yLL, xUR, yUR]`` + specifying the clickable rectangular area + font: Name of the Font, e.g. 'Helvetica' + bold: Print the text in bold + italic: Print the text in italic + font_size: How big the text will be, e.g. '14pt' + font_color: Hex-string for the color, e.g. cdcdcd + border_color: Hex-string for the border color, e.g. cdcdcd + background_color: Hex-string for the background of the annotation, + e.g. cdcdcd Returns: - A dictionary object representing the annotation. + A dictionary object representing the annotation. """ font_str = "font: " if bold is True: @@ -133,16 +134,16 @@ def line( Draw a line on the PDF. 
Args: - p1: First point - p2: Second point - rect: array of four integers ``[xLL, yLL, xUR, yUR]`` - specifying the clickable rectangular area - text: Text to be displayed as the line annotation - title_bar: Text to be displayed in the title bar of the - annotation; by convention this is the name of the author + p1: First point + p2: Second point + rect: array of four integers ``[xLL, yLL, xUR, yUR]`` + specifying the clickable rectangular area + text: Text to be displayed as the line annotation + title_bar: Text to be displayed in the title bar of the + annotation; by convention this is the name of the author Returns: - A dictionary object representing the annotation. + A dictionary object representing the annotation. """ line_obj = DictionaryObject( { @@ -187,13 +188,13 @@ def rectangle( This method uses the /Square annotation type of the PDF format. Args: - rect: array of four integers ``[xLL, yLL, xUR, yUR]`` - specifying the clickable rectangular area - interiour_color: None or hex-string for the color, e.g. cdcdcd - If None is used, the interiour is transparent. + rect: array of four integers ``[xLL, yLL, xUR, yUR]`` + specifying the clickable rectangular area + interiour_color: None or hex-string for the color, e.g. cdcdcd + If None is used, the interiour is transparent. Returns: - A dictionary object representing the annotation. + A dictionary object representing the annotation. """ square_obj = DictionaryObject( { @@ -221,13 +222,13 @@ def ellipse( This method uses the /Circle annotation type of the PDF format. Args: - rect: array of four integers ``[xLL, yLL, xUR, yUR]`` specifying - the bounding box of the ellipse - interiour_color: None or hex-string for the color, e.g. cdcdcd - If None is used, the interiour is transparent. + rect: array of four integers ``[xLL, yLL, xUR, yUR]`` specifying + the bounding box of the ellipse + interiour_color: None or hex-string for the color, e.g. cdcdcd + If None is used, the interiour is transparent. 
Returns: - A dictionary object representing the annotation. + A dictionary object representing the annotation. """ ellipse_obj = DictionaryObject( { @@ -288,22 +289,22 @@ def link( An internal link requires the target_page_index, fit, and fit args. Args: - rect: array of four integers ``[xLL, yLL, xUR, yUR]`` - specifying the clickable rectangular area - border: if provided, an array describing border-drawing - properties. See the PDF spec for details. No border will be - drawn if this argument is omitted. - - horizontal corner radius, - - vertical corner radius, and - - border width - - Optionally: Dash - url: Link to a website (if you want to make an external link) - target_page_index: index of the page to which the link should go - (if you want to make an internal link) - fit: Page fit or 'zoom' option. + rect: array of four integers ``[xLL, yLL, xUR, yUR]`` + specifying the clickable rectangular area + border: if provided, an array describing border-drawing + properties. See the PDF spec for details. No border will be + drawn if this argument is omitted. + - horizontal corner radius, + - vertical corner radius, and + - border width + - Optionally: Dash + url: Link to a website (if you want to make an external link) + target_page_index: index of the page to which the link should go + (if you want to make an internal link) + fit: Page fit or 'zoom' option. Returns: - A dictionary object representing the annotation. + A dictionary object representing the annotation. """ from ..types import BorderArrayType diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index a84ce1841e..cc4fe59224 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -26,10 +26,10 @@ # POSSIBILITY OF SUCH DAMAGE. 
import codecs -import decimal import hashlib import re from binascii import unhexlify +from math import log10 from typing import Any, Callable, List, Optional, Tuple, Union, cast from .._codecs import _pdfdoc_encoding_rev @@ -97,10 +97,10 @@ def _reference_clone( self, clone: Any, pdf_dest: PdfWriterProtocol ) -> PdfObjectProtocol: """ - reference the object within the _objects of pdf_dest only if - indirect_reference attribute exists (which means the objects - was already identified in xref/xobjstm) - if object has been already referenced do nothing + Reference the object within the _objects of pdf_dest only if + indirect_reference attribute exists (which means the objects was + already identified in xref/xobjstm) if object has been already + referenced do nothing. Args: clone: @@ -152,7 +152,7 @@ def clone( force_duplicate: bool = False, ignore_fields: Union[Tuple[str, ...], List[str], None] = (), ) -> "NullObject": - """clone object into pdf_dest""" + """Clone object into pdf_dest.""" return cast("NullObject", self._reference_clone(NullObject(), pdf_dest)) def write_to_stream( @@ -192,7 +192,7 @@ def clone( force_duplicate: bool = False, ignore_fields: Union[Tuple[str, ...], List[str], None] = (), ) -> "BooleanObject": - """clone object into pdf_dest""" + """Clone object into pdf_dest.""" return cast( "BooleanObject", self._reference_clone(BooleanObject(self.value), pdf_dest) ) @@ -251,7 +251,7 @@ def clone( force_duplicate: bool = False, ignore_fields: Union[Tuple[str, ...], List[str], None] = (), ) -> "IndirectObject": - """clone object into pdf_dest""" + """Clone object into pdf_dest.""" if self.pdf == pdf_dest and not force_duplicate: # Already duplicated and no extra duplication required return self @@ -339,17 +339,20 @@ def readFromStream( return IndirectObject.read_from_stream(stream, pdf) -class FloatObject(decimal.Decimal, PdfObject): +class FloatObject(float, PdfObject): def __new__( - cls, value: Union[str, Any] = "0", context: Optional[Any] = None + 
cls, value: Union[str, Any] = "0.0", context: Optional[Any] = None ) -> "FloatObject": try: - return decimal.Decimal.__new__(cls, str_(value), context) - except Exception: + value = float(str_(value)) + return float.__new__(cls, value) + except Exception as e: # If this isn't a valid decimal (happens in malformed PDFs) # fallback to 0 - logger_warning(f"FloatObject ({value}) invalid; use 0.0 instead", __name__) - return decimal.Decimal.__new__(cls, "0.0") + logger_warning( + f"{e} : FloatObject ({value}) invalid; use 0.0 instead", __name__ + ) + return float.__new__(cls, 0.0) def clone( self, @@ -357,25 +360,26 @@ def clone( force_duplicate: bool = False, ignore_fields: Union[Tuple[str, ...], List[str], None] = (), ) -> "FloatObject": - """clone object into pdf_dest""" + """Clone object into pdf_dest.""" return cast("FloatObject", self._reference_clone(FloatObject(self), pdf_dest)) + def myrepr(self) -> str: + if self == 0: + return "0.0" + nb = int(log10(abs(self))) + s = f"{self:.{max(1,16-nb)}f}".rstrip("0").rstrip(".") + return s + def __repr__(self) -> str: - if self == self.to_integral(): - # If this is an integer, format it with no decimal place. - return str(self.quantize(decimal.Decimal(1))) - else: - # Otherwise, format it with a decimal place, taking care to - # remove any extraneous trailing zeros. 
- return f"{self:f}".rstrip("0") + return self.myrepr() # repr(float(self)) def as_numeric(self) -> float: - return float(repr(self).encode("utf8")) + return float(self) def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] ) -> None: - stream.write(repr(self).encode("utf8")) + stream.write(self.myrepr().encode("utf8")) def writeToStream( self, stream: StreamType, encryption_key: Union[None, str, bytes] @@ -400,7 +404,7 @@ def clone( force_duplicate: bool = False, ignore_fields: Union[Tuple[str, ...], List[str], None] = (), ) -> "NumberObject": - """clone object into pdf_dest""" + """Clone object into pdf_dest.""" return cast("NumberObject", self._reference_clone(NumberObject(self), pdf_dest)) def as_numeric(self) -> int: @@ -435,6 +439,7 @@ def readFromStream( class ByteStringObject(bytes, PdfObject): """ Represents a string object where the text encoding could not be determined. + This occurs quite often, as the PDF spec doesn't provide an alternate way to represent strings -- for example, the encryption data stored in files (like /O) is clearly not text, but is still stored in a "String" object. @@ -446,7 +451,7 @@ def clone( force_duplicate: bool = False, ignore_fields: Union[Tuple[str, ...], List[str], None] = (), ) -> "ByteStringObject": - """clone object into pdf_dest""" + """Clone object into pdf_dest.""" return cast( "ByteStringObject", self._reference_clone(ByteStringObject(bytes(self)), pdf_dest), @@ -478,10 +483,11 @@ def writeToStream( class TextStringObject(str, PdfObject): """ - Represents a string object that has been decoded into a real unicode string. + A string object that has been decoded into a real unicode string. + If read from a PDF document, this string appeared to match the - PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to - occur. + PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding + to occur. 
""" def clone( @@ -490,7 +496,7 @@ def clone( force_duplicate: bool = False, ignore_fields: Union[Tuple[str, ...], List[str], None] = (), ) -> "TextStringObject": - """clone object into pdf_dest""" + """Clone object into pdf_dest.""" obj = TextStringObject(self) obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding obj.autodetect_utf16 = self.autodetect_utf16 @@ -501,12 +507,10 @@ def clone( @property def original_bytes(self) -> bytes: - """ - It is occasionally possible that a text string object gets created where + """It is occasionally possible that a text string object gets created where a byte string object was expected due to the autodetection mechanism -- if that occurs, this "original_bytes" property can be used to - back-calculate what the original encoded bytes were. - """ + back-calculate what the original encoded bytes were.""" return self.get_original_bytes() def get_original_bytes(self) -> bytes: @@ -575,7 +579,7 @@ def clone( force_duplicate: bool = False, ignore_fields: Union[Tuple[str, ...], List[str], None] = (), ) -> "NameObject": - """clone object into pdf_dest""" + """Clone object into pdf_dest.""" return cast("NameObject", self._reference_clone(NameObject(self), pdf_dest)) def write_to_stream( diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 3df0ae14bd..bb2e028d27 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -34,7 +34,7 @@ from io import BytesIO from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, cast -from .._protocols import PdfWriterProtocol +from .._protocols import PdfReaderProtocol, PdfWriterProtocol from .._utils import ( WHITESPACES, StreamType, @@ -82,7 +82,7 @@ def clone( force_duplicate: bool = False, ignore_fields: Union[Tuple[str, ...], List[str], None] = (), ) -> "ArrayObject": - """clone object into pdf_dest""" + """Clone object into pdf_dest.""" try: if self.indirect_reference.pdf == pdf_dest and not force_duplicate: 
# type: ignore return self @@ -91,8 +91,6 @@ def clone( arr = cast("ArrayObject", self._reference_clone(ArrayObject(), pdf_dest)) for data in self: if isinstance(data, StreamObject): - # if not hasattr(data, "indirect_reference"): - # data.indirect_reference = None dup = data._reference_clone( data.clone(pdf_dest, force_duplicate, ignore_fields), pdf_dest ) @@ -104,10 +102,7 @@ def clone( return cast("ArrayObject", arr) def items(self) -> Iterable[Any]: - """ - Emulate DictionaryObject.items for a list - (index, object) - """ + """Emulate DictionaryObject.items for a list (index, object).""" return enumerate(self) def write_to_stream( @@ -128,9 +123,9 @@ def writeToStream( @staticmethod def read_from_stream( stream: StreamType, - pdf: Any, + pdf: Optional[PdfReaderProtocol], forced_encoding: Union[None, str, List[str], Dict[int, str]] = None, - ) -> "ArrayObject": # PdfReader + ) -> "ArrayObject": arr = ArrayObject() tmp = stream.read(1) if tmp != b"[": @@ -152,7 +147,7 @@ def read_from_stream( @staticmethod def readFromStream( - stream: StreamType, pdf: Any # PdfReader + stream: StreamType, pdf: PdfReaderProtocol ) -> "ArrayObject": # deprecated deprecation_with_replacement("readFromStream", "read_from_stream", "3.0.0") return ArrayObject.read_from_stream(stream, pdf) @@ -165,7 +160,7 @@ def clone( force_duplicate: bool = False, ignore_fields: Union[Tuple[str, ...], List[str], None] = (), ) -> "DictionaryObject": - """clone object into pdf_dest""" + """Clone object into pdf_dest.""" try: if self.indirect_reference.pdf == pdf_dest and not force_duplicate: # type: ignore return self @@ -189,13 +184,13 @@ def _clone( ignore_fields: Union[Tuple[str, ...], List[str]], ) -> None: """ - Update the object from src + Update the object from src. 
Args: - src: "DictionaryObject": - pdf_dest: - force_duplicate: - ignore_fields: + src: "DictionaryObject": + pdf_dest: + force_duplicate: + ignore_fields: """ # First check if this is a chain list, we need to loop to prevent recur if ( @@ -276,8 +271,8 @@ def __getitem__(self, key: Any) -> PdfObject: @property def xmp_metadata(self) -> Optional[PdfObject]: """ - Retrieve XMP (Extensible Metadata Platform) data relevant to the - this object, if available. + Retrieve XMP (Extensible Metadata Platform) data relevant to the this + object, if available. Stability: Added in v1.12, will exist for all future v1.x releases. @@ -285,7 +280,6 @@ def xmp_metadata(self) -> Optional[PdfObject]: Returns a {@link #xmp.XmpInformation XmlInformation} instance that can be used to access XMP metadata from the document. Can also return None if no metadata was found on the document root. - """ from ..xmp import XmpInformation @@ -303,9 +297,9 @@ def getXmpMetadata( self, ) -> Optional[PdfObject]: # deprecated """ - .. deprecated:: 1.28.3 + Use :meth:`xmp_metadata` instead. - Use :meth:`xmp_metadata` instead. + .. deprecated:: 1.28.3 """ deprecation_with_replacement("getXmpMetadata", "xmp_metadata", "3.0.0") return self.xmp_metadata @@ -313,9 +307,9 @@ def getXmpMetadata( @property def xmpMetadata(self) -> Optional[PdfObject]: # deprecated """ - .. deprecated:: 1.28.3 + Use :meth:`xmp_metadata` instead. - Use :meth:`xmp_metadata` instead. + .. 
deprecated:: 1.28.3 """ deprecation_with_replacement("xmpMetadata", "xmp_metadata", "3.0.0") return self.xmp_metadata @@ -340,12 +334,12 @@ def writeToStream( @staticmethod def read_from_stream( stream: StreamType, - pdf: Any, # PdfReader + pdf: Optional[PdfReaderProtocol], forced_encoding: Union[None, str, List[str], Dict[int, str]] = None, ) -> "DictionaryObject": def get_next_obj_pos( - p: int, p1: int, rem_gens: List[int], pdf: Any - ) -> int: # PdfReader + p: int, p1: int, rem_gens: List[int], pdf: PdfReaderProtocol + ) -> int: loc = pdf.xref[rem_gens[0]] for o in loc: if p1 > loc[o] and p < loc[o]: @@ -355,7 +349,9 @@ def get_next_obj_pos( else: return get_next_obj_pos(p, p1, rem_gens[1:], pdf) - def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes: # PdfReader + def read_unsized_from_steam( + stream: StreamType, pdf: PdfReaderProtocol + ) -> bytes: # we are just pointing at beginning of the stream eon = get_next_obj_pos(stream.tell(), 2**32, list(pdf.xref), pdf) - 1 curr = stream.tell() @@ -435,6 +431,7 @@ def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes: # PdfReader length = data[SA.LENGTH] if isinstance(length, IndirectObject): t = stream.tell() + assert pdf is not None # hint for mypy length = pdf.get_object(length) stream.seek(t, 0) pstart = stream.tell() @@ -454,7 +451,7 @@ def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes: # PdfReader if end == b"endstream": # we found it by looking back one character further. 
data["__streamdata__"] = data["__streamdata__"][:-1] - elif not pdf.strict: + elif pdf is not None and not pdf.strict: stream.seek(pstart, 0) data["__streamdata__"] = read_unsized_from_steam(stream, pdf) pos = stream.tell() @@ -475,7 +472,7 @@ def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes: # PdfReader @staticmethod def readFromStream( - stream: StreamType, pdf: Any # PdfReader + stream: StreamType, pdf: PdfReaderProtocol ) -> "DictionaryObject": # deprecated deprecation_with_replacement("readFromStream", "read_from_stream", "3.0.0") return DictionaryObject.read_from_stream(stream, pdf) @@ -532,7 +529,6 @@ def inc_parent_counter( child_obj = child.get_object() child = child.indirect_reference # get_reference(child_obj) - # assert isinstance(child, IndirectObject) prev: Optional[DictionaryObject] if "/First" not in self: # no child yet @@ -583,10 +579,10 @@ def _remove_node_from_tree( Adjust the pointers of the linked list and tree node count. Args: - prev: - prev_ref: - cur: - last: + prev: + prev_ref: + cur: + last: """ next_ref = cur.get(NameObject("/Next"), None) if prev is None: @@ -657,9 +653,7 @@ def remove_child(self, child: Any) -> None: _reset_node_tree_relationship(child_obj) def remove_from_tree(self) -> None: - """ - remove the object from the tree it is in - """ + """Remove the object from the tree it is in.""" if NameObject("/Parent") not in self: raise ValueError("Removed child does not appear to be a tree item") else: @@ -689,7 +683,7 @@ def _reset_node_tree_relationship(child_obj: Any) -> None: This resets the nodes attributes in respect to that tree. Args: - child_obj: + child_obj: """ del child_obj[NameObject("/Parent")] if NameObject("/Next") in child_obj: @@ -714,10 +708,10 @@ def _clone( Update the object from src. 
Args: - src: - pdf_dest: - force_duplicate: - ignore_fields: + src: + pdf_dest: + force_duplicate: + ignore_fields: """ self._data = cast("StreamObject", src)._data try: @@ -883,8 +877,8 @@ def __init__( self.pdf = pdf # The inner list has two elements: - # [0] : List - # [1] : str + # Element 0: List + # Element 1: str self.operations: List[Tuple[Any, Any]] = [] # stream may be a StreamObject or an ArrayObject containing @@ -916,12 +910,12 @@ def clone( Clone object into pdf_dest. Args: - pdf_dest: - force_duplicate: - ignore_fields: + pdf_dest: + force_duplicate: + ignore_fields: Returns: - The cloned ContentStream + The cloned ContentStream """ try: if self.indirect_reference.pdf == pdf_dest and not force_duplicate: # type: ignore @@ -948,16 +942,16 @@ def _clone( Update the object from src. Args: - src: - pdf_dest: - force_duplicate: - ignore_fields: + src: + pdf_dest: + force_duplicate: + ignore_fields: """ self.pdf = pdf_dest self.operations = list(cast("ContentStream", src).operations) self.forced_encoding = cast("ContentStream", src).forced_encoding - # no need to call DictionaryObjection or any - # super(DictionaryObject,self)._clone(src, pdf_dest, force_duplicate, ignore_fields) + # no need to call DictionaryObjection or anything + # like super(DictionaryObject,self)._clone(src, pdf_dest, force_duplicate, ignore_fields) return def __parse_content_stream(self, stream: StreamType) -> None: @@ -1095,7 +1089,7 @@ def _data(self, value: Union[str, bytes]) -> None: def read_object( stream: StreamType, - pdf: Any, # PdfReader + pdf: Optional[PdfReaderProtocol], forced_encoding: Union[None, str, List[str], Dict[int, str]] = None, ) -> Union[PdfObject, int, str, ContentStream]: tok = stream.read(1) @@ -1106,7 +1100,6 @@ def read_object( # hexadecimal string OR dictionary peek = stream.read(2) stream.seek(-2, 1) # reset to start - if peek == b"<<": return DictionaryObject.read_from_stream(stream, pdf, forced_encoding) else: @@ -1138,6 +1131,7 @@ def read_object( 
peek = stream.read(20) stream.seek(-len(peek), 1) # reset to start if IndirectPattern.match(peek) is not None: + assert pdf is not None # hint for mypy return IndirectObject.read_from_stream(stream, pdf) else: return NumberObject.read_from_stream(stream) @@ -1178,9 +1172,9 @@ def field_type(self) -> Optional[NameObject]: @property def fieldType(self) -> Optional[NameObject]: # deprecated """ - .. deprecated:: 1.28.3 + Use :py:attr:`field_type` instead. - Use :py:attr:`field_type` instead. + .. deprecated:: 1.28.3 """ deprecation_with_replacement("fieldType", "field_type", "3.0.0") return self.field_type @@ -1208,9 +1202,9 @@ def alternate_name(self) -> Optional[str]: @property def altName(self) -> Optional[str]: # deprecated """ - .. deprecated:: 1.28.3 + Use :py:attr:`alternate_name` instead. - Use :py:attr:`alternate_name` instead. + .. deprecated:: 1.28.3 """ deprecation_with_replacement("altName", "alternate_name", "3.0.0") return self.alternate_name @@ -1218,8 +1212,9 @@ def altName(self) -> Optional[str]: # deprecated @property def mapping_name(self) -> Optional[str]: """ - Read-only property accessing the mapping name of this field. This - name is used by pypdf as a key in the dictionary returned by + Read-only property accessing the mapping name of this field. + + This name is used by pypdf as a key in the dictionary returned by :meth:`get_fields()` """ return self.get(FieldDictionaryAttributes.TM) @@ -1227,26 +1222,25 @@ def mapping_name(self) -> Optional[str]: @property def mappingName(self) -> Optional[str]: # deprecated """ - .. deprecated:: 1.28.3 + Use :py:attr:`mapping_name` instead. - Use :py:attr:`mapping_name` instead. + .. deprecated:: 1.28.3 """ deprecation_with_replacement("mappingName", "mapping_name", "3.0.0") return self.mapping_name @property def flags(self) -> Optional[int]: - """ - Read-only property accessing the field flags, specifying various - characteristics of the field (see Table 8.70 of the PDF 1.7 reference). 
- """ + """Read-only property accessing the field flags, specifying various + characteristics of the field (see Table 8.70 of the PDF 1.7 reference).""" return self.get(FieldDictionaryAttributes.Ff) @property def value(self) -> Optional[Any]: """ - Read-only property accessing the value of this field. Format - varies based on field type. + Read-only property accessing the value of this field. + + Format varies based on field type. """ return self.get(FieldDictionaryAttributes.V) @@ -1258,9 +1252,9 @@ def default_value(self) -> Optional[Any]: @property def defaultValue(self) -> Optional[Any]: # deprecated """ - .. deprecated:: 1.28.3 + Use :py:attr:`default_value` instead. - Use :py:attr:`default_value` instead. + .. deprecated:: 1.28.3 """ deprecation_with_replacement("defaultValue", "default_value", "3.0.0") return self.default_value @@ -1269,17 +1263,18 @@ def defaultValue(self) -> Optional[Any]: # deprecated def additional_actions(self) -> Optional[DictionaryObject]: """ Read-only property accessing the additional actions dictionary. - This dictionary defines the field's behavior in response to trigger events. - See Section 8.5.2 of the PDF 1.7 reference. + + This dictionary defines the field's behavior in response to trigger + events. See Section 8.5.2 of the PDF 1.7 reference. """ return self.get(FieldDictionaryAttributes.AA) @property def additionalActions(self) -> Optional[DictionaryObject]: # deprecated """ - .. deprecated:: 1.28.3 + Use :py:attr:`additional_actions` instead. - Use :py:attr:`additional_actions` instead. + .. deprecated:: 1.28.3 """ deprecation_with_replacement("additionalActions", "additional_actions", "3.0.0") return self.additional_actions @@ -1288,17 +1283,17 @@ def additionalActions(self) -> Optional[DictionaryObject]: # deprecated class Destination(TreeObject): """ A class representing a destination within a PDF file. + See section 8.2.1 of the PDF 1.6 reference. Args: - title: Title of this destination. 
- page: Reference to the page of this destination. Should - be an instance of :class:`IndirectObject`. - fit: How the destination is displayed. + title: Title of this destination. + page: Reference to the page of this destination. Should + be an instance of :class:`IndirectObject`. + fit: How the destination is displayed. Raises: - PdfReadError: If destination type is invalid. - + PdfReadError: If destination type is invalid. """ node: Optional[ @@ -1362,9 +1357,9 @@ def dest_array(self) -> "ArrayObject": def getDestArray(self) -> "ArrayObject": # deprecated """ - .. deprecated:: 1.28.3 + Use :py:attr:`dest_array` instead. - Use :py:attr:`dest_array` instead. + .. deprecated:: 1.28.3 """ deprecation_with_replacement("getDestArray", "dest_array", "3.0.0") return self.dest_array @@ -1430,20 +1425,25 @@ def bottom(self) -> Optional[FloatObject]: @property def color(self) -> Optional["ArrayObject"]: - """Read-only property accessing the color in (R, G, B) with values 0.0-1.0""" + """Read-only property accessing the color in (R, G, B) with values 0.0-1.0.""" return self.get( "/C", ArrayObject([FloatObject(0), FloatObject(0), FloatObject(0)]) ) @property def font_format(self) -> Optional[OutlineFontFlag]: - """Read-only property accessing the font type. 1=italic, 2=bold, 3=both""" + """ + Read-only property accessing the font type. + + 1=italic, 2=bold, 3=both + """ return self.get("/F", 0) @property def outline_count(self) -> Optional[int]: """ Read-only property accessing the outline count. 
+ positive = expanded negative = collapsed absolute value = number of visible descendents at all levels diff --git a/pypdf/generic/_fit.py b/pypdf/generic/_fit.py index e8ecaa76dd..571770afd5 100644 --- a/pypdf/generic/_fit.py +++ b/pypdf/generic/_fit.py @@ -21,7 +21,7 @@ def xyz( zoom: Optional[float] = None, ) -> "Fit": """ - Display the page designated by page, with the coordinates ( left , top ) + Display the page designated by page, with the coordinates (left , top) positioned at the upper-left corner of the window and the contents of the page magnified by the factor zoom. @@ -31,12 +31,12 @@ def xyz( A zoom value of 0 has the same meaning as a null value. Args: - left: - top: - zoom: + left: + top: + zoom: Returns: - The created fit object. + The created fit object. """ return Fit(fit_type="/XYZ", fit_args=(left, top, zoom)) @@ -45,9 +45,11 @@ def fit(cls) -> "Fit": """ Display the page designated by page, with its contents magnified just enough to fit the entire page within the window both horizontally and - vertically. If the required horizontal and vertical magnification - factors are different, use the smaller of the two, centering the page - within the window in the other dimension. + vertically. + + If the required horizontal and vertical magnification factors are + different, use the smaller of the two, centering the page within the + window in the other dimension. """ return Fit(fit_type="/Fit") @@ -63,10 +65,10 @@ def fit_horizontally(cls, top: Optional[float] = None) -> "Fit": parameter is to be retained unchanged. Args: - top: + top: Returns: - The created fit object. + The created fit object. 
""" return Fit(fit_type="/FitH", fit_args=(top,)) @@ -85,7 +87,7 @@ def fit_rectangle( """ Display the page designated by page , with its contents magnified just enough to fit the rectangle specified by the coordinates - left , bottom , right , and top entirely within the window + left, bottom, right, and top entirely within the window both horizontally and vertically. If the required horizontal and vertical magnification factors are @@ -96,62 +98,64 @@ def fit_rectangle( behavior. Args: - left: - bottom: - right: - top: + left: + bottom: + right: + top: Returns: - The created fit object. + The created fit object. """ return Fit(fit_type="/FitR", fit_args=(left, bottom, right, top)) @classmethod def fit_box(cls) -> "Fit": """ - Display the page designated by page , with its contents magnified - just enough to fit its bounding box entirely within the window both - horizontally and vertically. If the required horizontal and vertical - magnification factors are different, use the smaller of the two, - centering the bounding box within the window in the other dimension. + Display the page designated by page , with its contents magnified just + enough to fit its bounding box entirely within the window both + horizontally and vertically. + + If the required horizontal and vertical magnification factors are + different, use the smaller of the two, centering the bounding box + within the window in the other dimension. """ return Fit(fit_type="/FitB") @classmethod def fit_box_horizontally(cls, top: Optional[float] = None) -> "Fit": """ - Display the page designated by page , with the vertical coordinate - top positioned at the top edge of the window and the contents of the - page magnified just enough to fit the entire width of its bounding box + Display the page designated by page , with the vertical coordinate top + positioned at the top edge of the window and the contents of the page + magnified just enough to fit the entire width of its bounding box within the window. 
A null value for top specifies that the current value of that parameter is to be retained unchanged. Args: - top: + top: Returns: - The created fit object. + The created fit object. """ return Fit(fit_type="/FitBH", fit_args=(top,)) @classmethod def fit_box_vertically(cls, left: Optional[float] = None) -> "Fit": """ - Display the page designated by page , with the horizontal coordinate - left positioned at the left edge of the window and the contents of - the page magnified just enough to fit the entire height of its - bounding box within the window. + Display the page designated by page, with the horizontal coordinate + left positioned at the left edge of the window and the contents of the + page magnified just enough to fit the entire height of its bounding box + within the window. A null value for left specifies that the current value of that parameter is to be retained unchanged. Args: - left: + left: Returns: - The created fit object. + The created fit object. """ return Fit(fit_type="/FitBV", fit_args=(left,)) diff --git a/pypdf/generic/_rectangle.py b/pypdf/generic/_rectangle.py index 786b88ebde..7747ba5ac6 100644 --- a/pypdf/generic/_rectangle.py +++ b/pypdf/generic/_rectangle.py @@ -1,4 +1,3 @@ -import decimal from typing import Any, List, Tuple, Union from .._utils import deprecation_no_replacement, deprecation_with_replacement @@ -8,12 +7,15 @@ class RectangleObject(ArrayObject): """ - This class is used to represent *page boxes* in pypdf. These boxes include: - * :attr:`artbox ` - * :attr:`bleedbox ` - * :attr:`cropbox ` - * :attr:`mediabox ` - * :attr:`trimbox ` + This class is used to represent *page boxes* in pypdf. 
+ + These boxes include: + + * :attr:`artbox ` + * :attr:`bleedbox ` + * :attr:`cropbox ` + * :attr:`mediabox ` + * :attr:`trimbox ` """ def __init__( @@ -113,7 +115,7 @@ def getLowerRight_y(self) -> FloatObject: # deprecated return self.bottom @property - def lower_left(self) -> Tuple[decimal.Decimal, decimal.Decimal]: + def lower_left(self) -> Tuple[float, float]: """ Property to read and modify the lower left coordinate of this box in (x,y) form. @@ -125,7 +127,7 @@ def lower_left(self, value: List[Any]) -> None: self[0], self[1] = (self._ensure_is_number(x) for x in value) @property - def lower_right(self) -> Tuple[decimal.Decimal, decimal.Decimal]: + def lower_right(self) -> Tuple[float, float]: """ Property to read and modify the lower right coordinate of this box in (x,y) form. @@ -137,7 +139,7 @@ def lower_right(self, value: List[Any]) -> None: self[2], self[1] = (self._ensure_is_number(x) for x in value) @property - def upper_left(self) -> Tuple[decimal.Decimal, decimal.Decimal]: + def upper_left(self) -> Tuple[float, float]: """ Property to read and modify the upper left coordinate of this box in (x,y) form. @@ -149,7 +151,7 @@ def upper_left(self, value: List[Any]) -> None: self[0], self[3] = (self._ensure_is_number(x) for x in value) @property - def upper_right(self) -> Tuple[decimal.Decimal, decimal.Decimal]: + def upper_right(self) -> Tuple[float, float]: """ Property to read and modify the upper right coordinate of this box in (x,y) form. 
@@ -162,25 +164,25 @@ def upper_right(self, value: List[Any]) -> None: def getLowerLeft( self, - ) -> Tuple[decimal.Decimal, decimal.Decimal]: # deprecated + ) -> Tuple[float, float]: # deprecated deprecation_with_replacement("getLowerLeft", "lower_left", "3.0.0") return self.lower_left def getLowerRight( self, - ) -> Tuple[decimal.Decimal, decimal.Decimal]: # deprecated + ) -> Tuple[float, float]: # deprecated deprecation_with_replacement("getLowerRight", "lower_right", "3.0.0") return self.lower_right def getUpperLeft( self, - ) -> Tuple[decimal.Decimal, decimal.Decimal]: # deprecated + ) -> Tuple[float, float]: # deprecated deprecation_with_replacement("getUpperLeft", "upper_left", "3.0.0") return self.upper_left def getUpperRight( self, - ) -> Tuple[decimal.Decimal, decimal.Decimal]: # deprecated + ) -> Tuple[float, float]: # deprecated deprecation_with_replacement("getUpperRight", "upper_right", "3.0.0") return self.upper_right @@ -201,65 +203,57 @@ def setUpperRight(self, value: Tuple[float, float]) -> None: # deprecated self[2], self[3] = (self._ensure_is_number(x) for x in value) @property - def width(self) -> decimal.Decimal: + def width(self) -> float: return self.right - self.left - def getWidth(self) -> decimal.Decimal: # deprecated + def getWidth(self) -> float: # deprecated deprecation_with_replacement("getWidth", "width", "3.0.0") return self.width @property - def height(self) -> decimal.Decimal: + def height(self) -> float: return self.top - self.bottom - def getHeight(self) -> decimal.Decimal: # deprecated + def getHeight(self) -> float: # deprecated deprecation_with_replacement("getHeight", "height", "3.0.0") return self.height @property - def lowerLeft(self) -> Tuple[decimal.Decimal, decimal.Decimal]: # deprecated + def lowerLeft(self) -> Tuple[float, float]: # deprecated deprecation_with_replacement("lowerLeft", "lower_left", "3.0.0") return self.lower_left @lowerLeft.setter - def lowerLeft( - self, value: Tuple[decimal.Decimal, decimal.Decimal] 
- ) -> None: # deprecated + def lowerLeft(self, value: Tuple[float, float]) -> None: # deprecated deprecation_with_replacement("lowerLeft", "lower_left", "3.0.0") self.lower_left = value @property - def lowerRight(self) -> Tuple[decimal.Decimal, decimal.Decimal]: # deprecated + def lowerRight(self) -> Tuple[float, float]: # deprecated deprecation_with_replacement("lowerRight", "lower_right", "3.0.0") return self.lower_right @lowerRight.setter - def lowerRight( - self, value: Tuple[decimal.Decimal, decimal.Decimal] - ) -> None: # deprecated + def lowerRight(self, value: Tuple[float, float]) -> None: # deprecated deprecation_with_replacement("lowerRight", "lower_right", "3.0.0") self.lower_right = value @property - def upperLeft(self) -> Tuple[decimal.Decimal, decimal.Decimal]: # deprecated + def upperLeft(self) -> Tuple[float, float]: # deprecated deprecation_with_replacement("upperLeft", "upper_left", "3.0.0") return self.upper_left @upperLeft.setter - def upperLeft( - self, value: Tuple[decimal.Decimal, decimal.Decimal] - ) -> None: # deprecated + def upperLeft(self, value: Tuple[float, float]) -> None: # deprecated deprecation_with_replacement("upperLeft", "upper_left", "3.0.0") self.upper_left = value @property - def upperRight(self) -> Tuple[decimal.Decimal, decimal.Decimal]: # deprecated + def upperRight(self) -> Tuple[float, float]: # deprecated deprecation_with_replacement("upperRight", "upper_right", "3.0.0") return self.upper_right @upperRight.setter - def upperRight( - self, value: Tuple[decimal.Decimal, decimal.Decimal] - ) -> None: # deprecated + def upperRight(self, value: Tuple[float, float]) -> None: # deprecated deprecation_with_replacement("upperRight", "upper_right", "3.0.0") self.upper_right = value diff --git a/pypdf/generic/_utils.py b/pypdf/generic/_utils.py index 1643422ab8..24a77f8554 100644 --- a/pypdf/generic/_utils.py +++ b/pypdf/generic/_utils.py @@ -118,15 +118,14 @@ def create_string_object( Create a ByteStringObject or a 
TextStringObject from a string to represent the string. Args: - string: - forced_encoding: + string: The data being used + forced_encoding: Typically None, or an encoding string Returns: - A ByteStringObject + A ByteStringObject Raises: - TypeError: If string is not of type str or bytes. - + TypeError: If string is not of type str or bytes. """ if isinstance(string, str): return TextStringObject(string) diff --git a/pypdf/pagerange.py b/pypdf/pagerange.py index 7c6061f288..66169e61e0 100644 --- a/pypdf/pagerange.py +++ b/pypdf/pagerange.py @@ -31,7 +31,6 @@ class PageRange: - to_slice() gives the equivalent slice. - str() and repr() allow printing. - indices(n) is like slice.indices(n). - """ def __init__(self, arg: Union[slice, "PageRange", str]) -> None: @@ -43,6 +42,7 @@ def __init__(self, arg: Union[slice, "PageRange", str]) -> None: where the brackets indicate optional ints. Remember, page indices start with zero. Page range expression examples: + : all pages. -1 last page. 22 just the 23rd page. :-1 all but the last page. 0:3 the first three pages. -2 second-to-last page. @@ -82,10 +82,10 @@ def valid(input: Any) -> bool: True if input is a valid initializer for a PageRange. Args: - input: A possible PageRange string or a PageRange object. + input: A possible PageRange string or a PageRange object. Returns: - True, if the ``input`` is a valid PageRange. + True, if the ``input`` is a valid PageRange. """ return isinstance(input, (slice, PageRange)) or ( isinstance(input, str) and bool(re.match(PAGE_RANGE_RE, input)) @@ -114,16 +114,16 @@ def __repr__(self) -> str: def indices(self, n: int) -> Tuple[int, int, int]: """ - Assuming a sequence of length n, calculate the start and stop - indices, and the stride length of the PageRange. + Assuming a sequence of length n, calculate the start and stop indices, + and the stride length of the PageRange. See help(slice.indices). Args: - n: the length of the list of pages to choose from. 
+ n: the length of the list of pages to choose from. Returns: - Arguments for range() + Arguments for range() """ return self._slice.indices(n) @@ -160,12 +160,12 @@ def parse_filename_page_ranges( Given a list of filenames and page ranges, return a list of (filename, page_range) pairs. Args: - args: A list where the first element is a filename. The other elements are - filenames, page-range expressions, slice objects, or PageRange objects. - A filename not followed by a page range indicates all pages of the file. + args: A list where the first element is a filename. The other elements are + filenames, page-range expressions, slice objects, or PageRange objects. + A filename not followed by a page range indicates all pages of the file. Returns: - A list of (filename, page_range) pairs. + A list of (filename, page_range) pairs. """ pairs: List[Tuple[str, PageRange]] = [] pdf_filename = None diff --git a/pypdf/xmp.py b/pypdf/xmp.py index aaae96a2ba..53955d56c8 100644 --- a/pypdf/xmp.py +++ b/pypdf/xmp.py @@ -37,26 +37,24 @@ PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/" XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/" -# What is the PDFX namespace, you might ask? I might ask that too. It's -# a completely undocumented namespace used to place "custom metadata" +# What is the PDFX namespace, you might ask? +# It's documented here: https://github.com/adobe/xmp-docs/raw/master/XMPSpecifications/XMPSpecificationPart3.pdf +# This namespace is used to place "custom metadata" # properties, which are arbitrary metadata properties with no semantic or -# documented meaning. Elements in the namespace are key/value-style storage, +# documented meaning. +# +# Elements in the namespace are key/value-style storage, # where the element name is the key and the content is the value. The keys # are transformed into valid XML identifiers by substituting an invalid # identifier character with \u2182 followed by the unicode hex ID of the # original character. 
A key like "my car" is therefore "my\u21820020car". # -# \u2182, in case you're wondering, is the unicode character -# \u{ROMAN NUMERAL TEN THOUSAND}, a straightforward and obvious choice for -# escaping characters. +# \u2182 is the unicode character \u{ROMAN NUMERAL TEN THOUSAND} # -# Intentional users of the pdfx namespace should be shot on sight. A +# The pdfx namespace should be avoided. A # custom data schema and sensical XML elements could be used instead, as is -# suggested by Adobe's own documentation on XMP (under "Extensibility of -# Schemas"). -# -# Information presented here on the /pdfx/ schema is a result of limited -# reverse engineering, and does not constitute a full specification. +# suggested by Adobe's own documentation on XMP under "Extensibility of +# Schemas". PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/" iso8601 = re.compile( @@ -215,7 +213,6 @@ class XmpInformation(PdfObject): Raises: PdfReadError: if XML is invalid - """ def __init__(self, stream: ContentStream) -> None: @@ -244,9 +241,9 @@ def writeToStream( self, stream: StreamType, encryption_key: Union[None, str, bytes] ) -> None: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`write_to_stream` instead. - Use :meth:`write_to_stream` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("writeToStream", "write_to_stream", "3.0.0") self.write_to_stream(stream, encryption_key) @@ -263,9 +260,9 @@ def getElement( self, aboutUri: str, namespace: str, name: str ) -> Iterator[Any]: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`get_element` instead. - Use :meth:`get_element` instead. + .. deprecated:: 1.28.0 """ deprecation_with_replacement("getElement", "get_element", "3.0.0") return self.get_element(aboutUri, namespace, name) @@ -285,9 +282,9 @@ def getNodesInNamespace( self, aboutUri: str, namespace: str ) -> Iterator[Any]: # deprecated """ - .. deprecated:: 1.28.0 + Use :meth:`get_nodes_in_namespace` instead. - Use :meth:`get_nodes_in_namespace` instead. 
+ .. deprecated:: 1.28.0 """ deprecation_with_replacement( "getNodesInNamespace", "get_nodes_in_namespace", "3.0.0" @@ -303,107 +300,79 @@ def _get_text(self, element: XmlElement) -> str: dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor")) """ - Contributors to the resource (other than the authors). An unsorted - array of names. + Contributors to the resource (other than the authors). + + An unsorted array of names. """ dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage")) - """ - Text describing the extent or scope of the resource. - """ + """Text describing the extent or scope of the resource.""" dc_creator = property(_getter_seq(DC_NAMESPACE, "creator")) - """ - A sorted array of names of the authors of the resource, listed in order - of precedence. - """ + """A sorted array of names of the authors of the resource, listed in order + of precedence.""" dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date)) """ A sorted array of dates (datetime.datetime instances) of significance to - the resource. The dates and times are in UTC. + the resource. + + The dates and times are in UTC. """ dc_description = property(_getter_langalt(DC_NAMESPACE, "description")) - """ - A language-keyed dictionary of textual descriptions of the content of the - resource. - """ + """A language-keyed dictionary of textual descriptions of the content of the + resource.""" dc_format = property(_getter_single(DC_NAMESPACE, "format")) - """ - The mime-type of the resource. - """ + """The mime-type of the resource.""" dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier")) - """ - Unique identifier of the resource. - """ + """Unique identifier of the resource.""" dc_language = property(_getter_bag(DC_NAMESPACE, "language")) - """ - An unordered array specifying the languages used in the resource. 
- """ + """An unordered array specifying the languages used in the resource.""" dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher")) - """ - An unordered array of publisher names. - """ + """An unordered array of publisher names.""" dc_relation = property(_getter_bag(DC_NAMESPACE, "relation")) - """ - An unordered array of text descriptions of relationships to other - documents. - """ + """An unordered array of text descriptions of relationships to other + documents.""" dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights")) - """ - A language-keyed dictionary of textual descriptions of the rights the - user has to this resource. - """ + """A language-keyed dictionary of textual descriptions of the rights the + user has to this resource.""" dc_source = property(_getter_single(DC_NAMESPACE, "source")) - """ - Unique identifier of the work from which this resource was derived. - """ + """Unique identifier of the work from which this resource was derived.""" dc_subject = property(_getter_bag(DC_NAMESPACE, "subject")) - """ - An unordered array of descriptive phrases or keywrods that specify the - topic of the content of the resource. - """ + """An unordered array of descriptive phrases or keywords that specify the + topic of the content of the resource.""" dc_title = property(_getter_langalt(DC_NAMESPACE, "title")) - """ - A language-keyed dictionary of the title of the resource. - """ + """A language-keyed dictionary of the title of the resource.""" dc_type = property(_getter_bag(DC_NAMESPACE, "type")) - """ - An unordered array of textual descriptions of the document type. - """ + """An unordered array of textual descriptions of the document type.""" pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords")) - """ - An unformatted text string representing document keywords.
- """ + """An unformatted text string representing document keywords.""" pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion")) - """ - The PDF file version, for example 1.0, 1.3. - """ + """The PDF file version, for example 1.0, 1.3.""" pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer")) - """ - The name of the tool that created the PDF document. - """ + """The name of the tool that created the PDF document.""" xmp_create_date = property( _getter_single(XMP_NAMESPACE, "CreateDate", _converter_date) ) """ - The date and time the resource was originally created. The date and - time are returned as a UTC datetime.datetime object. + The date and time the resource was originally created. + + The date and time are returned as a UTC datetime.datetime object. """ @property @@ -420,8 +389,9 @@ def xmp_createDate(self, value: datetime.datetime) -> None: # deprecated _getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date) ) """ - The date and time the resource was last modified. The date and time - are returned as a UTC datetime.datetime object. + The date and time the resource was last modified. + + The date and time are returned as a UTC datetime.datetime object. """ @property @@ -467,9 +437,7 @@ def xmp_creatorTool(self, value: str) -> None: # deprecated self.xmp_creator_tool = value xmpmm_document_id = property(_getter_single(XMPMM_NAMESPACE, "DocumentID")) - """ - The common identifier for all versions and renditions of this resource. - """ + """The common identifier for all versions and renditions of this resource.""" @property def xmpmm_documentId(self) -> str: # deprecated @@ -482,10 +450,8 @@ def xmpmm_documentId(self, value: str) -> None: # deprecated self.xmpmm_document_id = value xmpmm_instance_id = property(_getter_single(XMPMM_NAMESPACE, "InstanceID")) - """ - An identifier for a specific incarnation of a document, updated each - time a file is saved. 
- """ + """An identifier for a specific incarnation of a document, updated each + time a file is saved.""" @property def xmpmm_instanceId(self) -> str: # deprecated diff --git a/pyproject.toml b/pyproject.toml index 8a12f7e453..5603da5b7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,3 +100,8 @@ exclude_lines = [ [tool.ruff] line-length = 120 + +[tool.docformatter] +pre-summary-newline = true +wrap-summaries = 0 +wrap-descriptions = 0 diff --git a/requirements/ci-3.11.txt b/requirements/ci-3.11.txt index 0b48f36451..eae901cb3b 100644 --- a/requirements/ci-3.11.txt +++ b/requirements/ci-3.11.txt @@ -56,6 +56,8 @@ pytest==7.2.0 # pytest-benchmark pytest-benchmark==4.0.0 # via -r requirements/ci.in +ruff==0.0.241 + # via -r requirements/ci.in typeguard==2.13.3 # via -r requirements/ci.in types-dataclasses==0.6.6 diff --git a/requirements/ci.in b/requirements/ci.in index aa27ae1c1d..40ecd20713 100644 --- a/requirements/ci.in +++ b/requirements/ci.in @@ -5,9 +5,9 @@ flake8-bugbear flake8-print mypy pillow +pycryptodome pytest pytest-benchmark -pycryptodome typeguard -types-Pillow types-dataclasses +types-Pillow diff --git a/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf b/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf index 0e9633ac16..365637b94a 100644 Binary files a/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf and b/resources/Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf differ diff --git a/resources/indirect-rotation.pdf b/resources/indirect-rotation.pdf new file mode 100644 index 0000000000..4890882cf2 Binary files /dev/null and b/resources/indirect-rotation.pdf differ diff --git a/sample-files b/sample-files index 0fe84b30ed..372294b066 160000 --- a/sample-files +++ b/sample-files @@ -1 +1 @@ -Subproject commit 0fe84b30ed33ff3daa9293e44349b8618f135699 +Subproject commit 372294b066cd3fbb4fb12fd9000ef935a2a86fcf diff --git a/setup.cfg b/setup.cfg index 9b09288301..75d6794cde 100644 --- a/setup.cfg +++ 
b/setup.cfg @@ -53,3 +53,7 @@ tests_dir = tests/ [tool:check-wheel-contents] package = ./pypdf + +[darglint] +enable=DAR201 +ignore=DAR002 diff --git a/stream.pdf b/stream.pdf new file mode 100644 index 0000000000..b32d3b131d Binary files /dev/null and b/stream.pdf differ diff --git a/tests/__init__.py b/tests/__init__.py index 8ce2a14b38..9a5866aacb 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -17,6 +17,9 @@ def get_pdf_from_url(url: str, name: str) -> bytes: Args: url: location of the PDF file name: unique name across all files + + Returns: + Read PDF as bytes """ if url.startswith("file://"): with open(url[7:].replace("\\", "/"), "rb") as fp: @@ -45,6 +48,12 @@ def _strip_position(line: str) -> str: becomes Xref table not zero-indexed. + + Args: + line: the original line + + Returns: + A line with stripped position """ line = ".py".join(line.split(".py:")[1:]) line = " ".join(line.split(" ")[1:]) diff --git a/tests/bench.py b/tests/bench.py index a58cb38402..64a9525169 100644 --- a/tests/bench.py +++ b/tests/bench.py @@ -137,6 +137,7 @@ def test_read_string_from_stream_performance(benchmark): """ This test simulates reading an embedded base64 image of 256kb. It should be faster than a second, even on ancient machines. + Runs < 100ms on a 2019 notebook. Takes 10 seconds prior to #1350. 
""" benchmark(read_string_from_stream_performance) diff --git a/tests/test_encryption.py b/tests/test_encryption.py index 6400a7adc5..68b5b64d82 100644 --- a/tests/test_encryption.py +++ b/tests/test_encryption.py @@ -20,7 +20,7 @@ @pytest.mark.parametrize( - ("name", "requres_pycryptodome"), + ("name", "requires_pycryptodome"), [ # unencrypted pdf ("unencrypted.pdf", False), @@ -68,9 +68,9 @@ ("r6-owner-password.pdf", True), ], ) -def test_encryption(name, requres_pycryptodome): +def test_encryption(name, requires_pycryptodome): inputfile = RESOURCE_ROOT / "encryption" / name - if requres_pycryptodome and not HAS_PYCRYPTODOME: + if requires_pycryptodome and not HAS_PYCRYPTODOME: with pytest.raises(DependencyError) as exc: ipdf = pypdf.PdfReader(inputfile) ipdf.decrypt("asdfzxcv") diff --git a/tests/test_filters.py b/tests/test_filters.py index c43b2aa4c3..ab0505b37c 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -35,9 +35,7 @@ ("predictor", "s"), list(cartesian_product([1], filter_inputs)) ) def test_FlateDecode(predictor, s): - """ - Tests FlateDecode decode() and encode() methods. - """ + """Tests FlateDecode decode() and encode() methods.""" codec = FlateDecode() s = s.encode() encoded = codec.encode(s) @@ -46,9 +44,11 @@ def test_FlateDecode(predictor, s): def test_FlateDecode_unsupported_predictor(): """ - Inputs an unsupported predictor (outside the [10, 15] range) checking - that PdfReadError() is raised. Once this predictor support is updated - in the future, this test case may be removed. + Inputs an unsupported predictor (outside the [10, 15] range) checking that + PdfReadError() is raised. + + Once this predictor support is updated in the future, this test case may be + removed. """ codec = FlateDecode() predictors = (-10, -1, 0, 9, 16, 20, 100) @@ -109,16 +109,16 @@ def test_ASCIIHexDecode(data, expected): """ Feeds a bunch of values to ASCIIHexDecode.decode() and ensures the correct output is returned. 
+ TODO What is decode() supposed to do for such inputs as ">>", ">>>" or any other not terminated by ">"? (For the latter case, an exception is currently raised.) """ - assert ASCIIHexDecode.decode(data) == expected def test_ASCIIHexDecode_no_eod(): - """Ensuring an exception is raised when no EOD character is present""" + """Ensuring an exception is raised when no EOD character is present.""" with pytest.raises(PdfStreamError) as exc: ASCIIHexDecode.decode("") assert exc.value.args[0] == "Unexpected EOD in ASCIIHexDecode" @@ -146,9 +146,10 @@ def test_ASCII85Decode_with_overflow(): def test_ASCII85Decode_five_zero_bytes(): """ From ISO 32000 (2008) §7.4.3: - «As a special case, if all five bytes are 0, they shall be represented - by the character with code 122 (z) instead of by five exclamation - points (!!!!!).» + + «As a special case, if all five bytes are 0, they shall be represented by + the character with code 122 (z) instead of by five exclamation points + (!!!!!).» """ inputs = ("z", "zz", "zzz") exp_outputs = ( diff --git a/tests/test_generic.py b/tests/test_generic.py index acdb3408e8..044608b30e 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -261,7 +261,7 @@ def test_outline_item_write_to_stream(): oi = OutlineItem(NameObject("title"), NullObject(), Fit.fit_vertically(left=0)) oi.write_to_stream(stream, None) stream.seek(0, 0) - assert stream.read() == b"<<\n/Title (title)\n/Dest [ null /FitV 0 ]\n>>" + assert stream.read() == b"<<\n/Title (title)\n/Dest [ null /FitV 0.0 ]\n>>" def test_encode_pdfdocencoding_keyerror(): @@ -464,7 +464,6 @@ def test_TextStringObject_autodetect_utf16(): def test_remove_child_not_in_tree(): - tree = TreeObject() with pytest.raises(ValueError) as exc: tree.remove_child(ChildDummy()) @@ -472,7 +471,6 @@ def test_remove_child_not_in_tree(): def test_remove_child_not_in_that_tree(): - tree = TreeObject() tree.indirect_reference = NullObject() child = TreeObject() @@ -562,7 +560,6 @@ def 
test_remove_child_found_in_tree(): assert len([el for el in tree.children()]) == 3 # Remove middle child - # tree.remove_child(child4) child4.remove_from_tree() assert tree[NameObject("/Count")] == 2 assert len([el for el in tree.children()]) == 2 @@ -972,32 +969,32 @@ def test_create_string_object_force(): @pytest.mark.parametrize( ("value", "expected"), [ - ("0.000000", "0"), - ("0.0", "0"), + ("0.000000", "0.0"), + ("0.0", "0.0"), ("1.0", "1"), ("0.123000", "0.123"), ("0.000123000", "0.000123"), - ("0.0", "0"), - ("0", "0"), + ("0.0", "0.0"), + ("0", "0.0"), ("1", "1"), ("1.0", "1"), ("1.01", "1.01"), ("1.010", "1.01"), - ("0000.0000", "0"), + ("0000.0000", "0.0"), ("0.10101010", "0.1010101"), ("50000000000", "50000000000"), - ("99900000000000000123", "99900000000000000123"), - ("99900000000000000123.456000", "99900000000000000123.456"), + ("99900000000000000123", "99900000000000000000"), + ("99900000000000000123.456000", "99900000000000000000"), ("0.00000000000000000000123", "0.00000000000000000000123"), ("0.00000000000000000000123000", "0.00000000000000000000123"), - ( - "50032481330523882508234.00000000000000000000123000", - "50032481330523882508234.00000000000000000000123", - ), - ( - "928457298572093487502198745102973402987412908743.75249875981374981237498213740000", - "928457298572093487502198745102973402987412908743.7524987598137498123749821374", - ), + # ( + # "50032481330523882508234.00000000000000000000123000", + # "50032481330523882508234.00000000000000000000123", + # ), + # ( + # "928457298572093487502198745102973402987412908743.75249875981374981237498213740000", + # "928457298572093487502198745102973402987412908743.7524987598137498123749821374", + # ), ], ) def test_float_object_decimal_to_string(value, expected): @@ -1005,9 +1002,6 @@ def test_float_object_decimal_to_string(value, expected): def test_cloning(caplog): - # pdf_path = RESOURCE_ROOT / "crazyones.pdf" - # reader = PdfReader(pdf_path) - # page = reader.pages[0] writer = PdfWriter() with 
pytest.raises(Exception) as exc: PdfObject().clone(writer) diff --git a/tests/test_merger.py b/tests/test_merger.py index ee0453c56e..d8cd9573eb 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -49,9 +49,6 @@ def merger_operate(merger): r = pypdf.PdfReader(pdf_path) merger.append(r, outline_item="foo", pages=list(range(len(r.pages)))) - # PdfReader object with List: - # merger.append(pypdf.PdfReader(pdf_path), outline_item="foo") - # File handle with open(pdf_path, "rb") as fh: merger.append(fh) @@ -306,36 +303,14 @@ def test_merge_write_closed_fh_with_writer(): pdf_path = RESOURCE_ROOT / "crazyones.pdf" merger.append(pdf_path) - # err_closed = "close() was called and thus the writer cannot be used anymore" - merger.close() - # with pytest.raises(RuntimeError) as exc: - merger.write("stream.pdf") - # assert exc.value.args[0] == err_closed - - # with pytest.raises(RuntimeError) as exc: + merger.write("stream1.pdf") merger.add_metadata({"author": "Martin Thoma"}) - # assert exc.value.args[0] == err_closed - - # with pytest.raises(RuntimeError) as exc: merger.set_page_layout("/SinglePage") - # assert exc.value.args[0] == err_closed - - # with pytest.raises(RuntimeError) as exc: merger.set_page_mode("/UseNone") - # assert exc.value.args[0] == err_closed - - # with pytest.raises(RuntimeError) as exc: - # merger._write_outline() - # assert exc.value.args[0] == err_closed - - # with pytest.raises(RuntimeError) as exc: merger.add_outline_item("An outline item", 0) - # assert exc.value.args[0] == err_closed - # with pytest.raises(RuntimeError) as exc: - # merger._write_dests() - # assert exc.value.args[0] == err_closed + os.unlink("stream1.pdf") @pytest.mark.external diff --git a/tests/test_page.py b/tests/test_page.py index 1914636a2e..47919286a5 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -82,8 +82,8 @@ def test_page_operations(pdf_path, password): """ This test just checks if the operation throws an exception. 
- This should be done way more thoroughly: It should be checked if the - output is as expected. + This should be done way more thoroughly: It should be checked if the output + is as expected. """ if pdf_path.startswith("http"): pdf_path = BytesIO(get_pdf_from_url(pdf_path, pdf_path.split("/")[-1])) @@ -148,6 +148,41 @@ def test_transformation_equivalence(): ) +def test_transformation_equivalence2(): + pdf_path = RESOURCE_ROOT / "labeled-edges-center-image.pdf" + reader_base = PdfReader(pdf_path) + + pdf_path = RESOURCE_ROOT / "box.pdf" + reader_add = PdfReader(pdf_path) + + w = PdfWriter() + w.append(reader_base) + w.pages[0].merge_transformed_page( + reader_add.pages[0], Transformation().scale(2).rotate(-45), False, False + ) + w.pages[0].merge_transformed_page( + reader_add.pages[0], Transformation().scale(2).translate(100, 100), True, False + ) + # No special assert: the test should be visual in a viewer; two boxes with an arrow, rotated and translated + + w = PdfWriter() + w.append(reader_add) + w.pages[0].merge_transformed_page( + reader_base.pages[0], Transformation(), True, True + ) + # No special assert: Visually check that the page has been enlarged and everything is visible (box+graph) + + pdf_path = RESOURCE_ROOT / "commented-xmp.pdf" + reader_comments = PdfReader(pdf_path) + + w = PdfWriter() + w.append(reader_base) + w.pages[0].merge_transformed_page( + reader_comments.pages[0], Transformation().rotate(-15), True, True + ) + # No special assert: Visually check that the overlay has its comments at the correct position + + def test_get_user_unit_property(): pdf_path = RESOURCE_ROOT / "crazyones.pdf" reader = PdfReader(pdf_path) @@ -246,6 +281,14 @@ def test_page_rotation(): ) +def test_page_indirect_rotation(): + reader = PdfReader(RESOURCE_ROOT / "indirect-rotation.pdf") + page = reader.pages[0] + + # test rotation + assert page.rotation == 0 + + def test_page_scale(): op = Transformation() with pytest.raises(ValueError) as exc: @@ -311,7 +354,8 @@ def test_iss_1142(): reader =
PdfReader(BytesIO(get_pdf_from_url(url, name=name))) txt = reader.pages[3].extract_text() # The following text is contained in two different cells: - # assert txt.find("有限公司郑州分公司") > 0 + assert txt.find("有限公司") > 0 + assert txt.find("郑州分公司") > 0 # 有限公司 = limited company # 郑州分公司 = branch office in Zhengzhou # First cell (see page 4/254): @@ -382,12 +426,12 @@ def test_extract_text_visitor_callbacks(): This test uses GeoBase_NHNC1_Data_Model_UML_EN.pdf. It extracts the labels of package-boxes in Figure 2. It extracts the texts in table "REVISION HISTORY". - """ import logging class PositionedText: - """Specify a text with coordinates, font-dictionary and font-size. + """ + Specify a text with coordinates, font-dictionary and font-size. The font-dictionary may be None in case of an unknown font. """ @@ -401,9 +445,11 @@ def __init__(self, text, x, y, font_dict, font_size) -> None: self.font_size = font_size def get_base_font(self) -> str: - """Gets the base font of the text. + """ + Gets the base font of the text. - Return UNKNOWN in case of an unknown font.""" + Return UNKNOWN in case of an unknown font. + """ if (self.font_dict is None) or "/BaseFont" not in self.font_dict: return "UNKNOWN" return self.font_dict["/BaseFont"] @@ -432,7 +478,8 @@ def extract_text_and_rectangles( Extracts texts and rectangles of a page of type pypdf._page.PageObject. This function supports simple coordinate transformations only. - The optional rect_filter-lambda can be used to filter wanted rectangles. + The optional rect_filter-lambda can be used to filter wanted + rectangles. rect_filter has Rectangle as argument and must return a boolean. 
It returns a tuple containing a list of extracted texts and @@ -894,7 +941,7 @@ def test_merge_page_reproducible_with_proc_set(): @pytest.mark.parametrize( - ("page1", "page2", "expected_result", "expected_renames"), + ("apage1", "apage2", "expected_result", "expected_renames"), [ # simple cases: pytest.param({}, {}, {}, {}, id="no resources"), @@ -951,29 +998,25 @@ def test_merge_page_reproducible_with_proc_set(): ), ], ) -def test_merge_resources(page1, page2, expected_result, expected_renames): - # Arrange - page1 = DictionaryObject( - { - PG.RESOURCES: DictionaryObject( - {NameObject(k): NameObject(v) for k, v in page1.items()} - ) - } - ) - page2 = DictionaryObject( - { - PG.RESOURCES: DictionaryObject( - {NameObject(k): NameObject(v) for k, v in page2.items()} - ) - } - ) +def test_merge_resources(apage1, apage2, expected_result, expected_renames): + for new_res in (False, True): + # Arrange + page1 = PageObject() + page1[NameObject(PG.RESOURCES)] = DictionaryObject() + for k, v in apage1.items(): + page1[PG.RESOURCES][NameObject(k)] = NameObject(v) - # Act - result, renames = PageObject._merge_resources(page1, page2, PG.RESOURCES) + page2 = PageObject() + page2[NameObject(PG.RESOURCES)] = DictionaryObject() + for k, v in apage2.items(): + page2[PG.RESOURCES][NameObject(k)] = NameObject(v) - # Assert - assert result == expected_result - assert renames == expected_renames + # Act + result, renames = page1._merge_resources(page1, page2, PG.RESOURCES, new_res) + + # Assert + assert result == expected_result + assert renames == expected_renames def test_merge_page_resources_smoke_test(): @@ -1041,3 +1084,20 @@ def test_merge_page_resources_smoke_test(): if name in (b"page1-contents", b"page2-contents") ] assert relevant_operations == expected_operations + + +def test_merge_transformed_page_into_blank(): + url = "https://github.com/py-pdf/pypdf/files/10540507/visitcard.pdf" + name = "visitcard.pdf" + r = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + w = 
PdfWriter() + w.add_blank_page(100, 100) + for x in range(4): + for y in range(7): + w.pages[0].merge_translated_page( + r.pages[0], + x * r.pages[0].trimbox[2], + y * r.pages[0].trimbox[3], + True, + True, + ) diff --git a/tests/test_reader.py b/tests/test_reader.py index b8a115877a..228ac3baec 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -619,10 +619,8 @@ def test_get_destination_page_number(): def test_do_not_get_stuck_on_large_files_without_start_xref(): - """ - Tests for the absence of a DoS bug, where a large file without an startxref - mark would cause the library to hang for minutes to hours - """ + """Tests for the absence of a DoS bug, where a large file without an + startxref mark would cause the library to hang for minutes to hours.""" start_time = time.time() broken_stream = BytesIO(b"\0" * 5 * 1000 * 1000) with pytest.raises(PdfReadError): @@ -640,7 +638,6 @@ def test_decrypt_when_no_id(): https://github.com/py-pdf/pypdf/issues/608 """ - with open(RESOURCE_ROOT / "encrypted_doc_no_id.pdf", "rb") as inputfile: ipdf = PdfReader(inputfile) ipdf.decrypt("") @@ -661,7 +658,7 @@ def test_reader_properties(): [True, False], ) def test_issue604(caplog, strict): - """Test with invalid destinations""" # todo + """Test with invalid destinations.""" # todo with open(RESOURCE_ROOT / "issue-604.pdf", "rb") as f: pdf = None outline = None @@ -823,6 +820,34 @@ def test_read_form_416(): assert len(fields) > 0 +def test_form_topname_with_and_without_acroform(caplog): + r = PdfReader(RESOURCE_ROOT / "crazyones.pdf") + r.add_form_topname("no") + r.rename_form_topname("renamed") + assert "/AcroForm" not in r.trailer["/Root"] + r.trailer["/Root"][NameObject("/AcroForm")] = DictionaryObject() + r.add_form_topname("toto") + r.rename_form_topname("renamed") + assert len(r.get_fields()) == 0 + + r = PdfReader(RESOURCE_ROOT / "form.pdf") + r.add_form_topname("top") + flds = r.get_fields() + assert "top" in flds + assert "top.foo" in flds + 
r.rename_form_topname("renamed") + flds = r.get_fields() + assert "renamed" in flds + assert "renamed.foo" in flds + + r = PdfReader(RESOURCE_ROOT / "form.pdf") + r.get_fields()["foo"].indirect_reference.get_object()[ + NameObject("/Parent") + ] = DictionaryObject() + r.add_form_topname("top") + assert "have a non-expected parent" in caplog.text + + @pytest.mark.external def test_extract_text_xref_issue_2(caplog): # pdf/0264cf510015b2a4b395a15cb23c001e.pdf @@ -877,7 +902,7 @@ def test_get_fields(): @pytest.mark.external def test_get_full_qualified_fields(): - url = "https://github.com/py-pdf/PyPDF2/files/10142389/fields_with_dots.pdf" + url = "https://github.com/py-pdf/pypdf/files/10142389/fields_with_dots.pdf" name = "fields_with_dots.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) fields = reader.get_form_text_fields(True) @@ -1299,6 +1324,7 @@ def test_page_labels(src, page_labels): assert PdfReader(src).page_labels[:max_indices] == page_labels[:max_indices] +@pytest.mark.external def test_iss1559(): url = "https://github.com/py-pdf/pypdf/files/10441992/default.pdf" name = "iss1559.pdf" diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 0e7e8c381b..fa9d7992f7 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -80,10 +80,11 @@ def test_dropdown_items(): def test_PdfReaderFileLoad(): """ - Test loading and parsing of a file. Extract text of the file and compare to expected - textual output. Expected outcome: file loads, text matches expected. - """ + Test loading and parsing of a file. + Extract text of the file and compare to expected textual output. Expected + outcome: file loads, text matches expected. + """ with open(RESOURCE_ROOT / "crazyones.pdf", "rb") as inputfile: # Load PDF file from file reader = PdfReader(inputfile) @@ -112,7 +113,6 @@ def test_PdfReaderJpegImage(): Expected outcome: file loads, image matches expected. 
""" - with open(RESOURCE_ROOT / "jpeg.pdf", "rb") as inputfile: # Load PDF file from file reader = PdfReader(inputfile) @@ -887,6 +887,30 @@ def test_tounicode_is_identity(): @pytest.mark.external +def test_append_forms(): + # from #1538 + writer = PdfWriter() + + url = "https://github.com/py-pdf/pypdf/files/10367412/pdfa.pdf" + name = "form_a.pdf" + reader1 = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader1.add_form_topname("form_a") + writer.append(reader1) + + url = "https://github.com/py-pdf/pypdf/files/10367413/pdfb.pdf" + name = "form_b.pdf" + reader2 = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader2.add_form_topname("form_b") + writer.append(reader2) + + b = BytesIO() + writer.write(b) + reader = PdfReader(b) + assert len(reader.get_form_text_fields()) == len( + reader1.get_form_text_fields() + ) + len(reader2.get_form_text_fields()) + + def test_extra_test_iss1541(): url = "https://github.com/py-pdf/pypdf/files/10418158/tst_iss1541.pdf" name = "tst_iss1541.pdf" diff --git a/tests/test_writer.py b/tests/test_writer.py index df94af6e19..41b35296ce 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -5,10 +5,11 @@ import pytest -from pypdf import PageObject, PdfMerger, PdfReader, PdfWriter +from pypdf import PageObject, PdfMerger, PdfReader, PdfWriter, Transformation from pypdf.errors import DeprecationError, PageSizeNotDefinedError from pypdf.generic import ( ArrayObject, + ContentStream, Fit, IndirectObject, NameObject, @@ -91,9 +92,12 @@ def cat1(p): assert len(reader2.outline) == 2 -def writer_operate(writer): +def writer_operate(writer: PdfWriter) -> None: """ To test the writer that initialized by each of the four usages. 
+ + Args: + writer: A PdfWriter object """ pdf_path = RESOURCE_ROOT / "crazyones.pdf" pdf_outline_path = RESOURCE_ROOT / "pdflatex-outline.pdf" @@ -167,8 +171,7 @@ def writer_operate(writer): writer.insert_blank_page(width=100, height=100) writer.insert_blank_page() # without parameters - # TODO: This gives "KeyError: '/Contents'" - is that a bug? - # writer.removeImages() + writer.remove_images() writer.add_metadata({"author": "Martin Thoma"}) @@ -672,7 +675,6 @@ def test_add_link(): def test_io_streams(): """This is the example from the docs ("Streaming data").""" - filepath = RESOURCE_ROOT / "pdflatex-outline.pdf" with open(filepath, "rb") as fh: bytes_stream = BytesIO(fh.read()) @@ -702,9 +704,7 @@ def test_regression_issue670(): def test_issue301(): - """ - Test with invalid stream length object - """ + """Test with invalid stream length object.""" with open(RESOURCE_ROOT / "issue-301.pdf", "rb") as f: reader = PdfReader(f) writer = PdfWriter() @@ -714,7 +714,7 @@ def test_issue301(): def test_append_pages_from_reader_append(): - """use append_pages_from_reader with a callable""" + """Use append_pages_from_reader with a callable.""" with open(RESOURCE_ROOT / "issue-301.pdf", "rb") as f: reader = PdfReader(f) writer = PdfWriter() @@ -860,7 +860,7 @@ def test_colors_in_outline_item(): reader = PdfReader(SAMPLE_ROOT / "004-pdflatex-4-pages/pdflatex-4-pages.pdf") writer = PdfWriter() writer.clone_document_from_reader(reader) - purple_rgb = (0.50196, 0, 0.50196) + purple_rgb = (0.5019607843137255, 0.0, 0.5019607843137255) writer.add_outline_item("First Outline Item", page_number=2, color="800080") writer.add_outline_item("Second Outline Item", page_number=3, color="#800080") writer.add_outline_item("Third Outline Item", page_number=4, color=purple_rgb) @@ -986,6 +986,8 @@ def test_append_without_annots_and_articles(): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) writer = PdfWriter() writer.append(reader, None, (0, 10), True, ["/B"]) + 
writer.reset_translation() + writer.append(reader, (0, 10), True, ["/B"]) assert writer.threads == [] writer = PdfWriter() writer.append(reader, None, (0, 10), True, ["/Annots"]) @@ -1150,3 +1152,26 @@ def test_set_page_label(): writer.write(target) os.remove(target) # comment to see result + + +def test_iss1601(): + url = "https://github.com/py-pdf/pypdf/files/10579503/badges-38.pdf" + name = "badge-38.pdf" + in_pdf = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + out_pdf = PdfWriter() + page_1 = out_pdf.add_blank_page( + in_pdf.pages[0].mediabox[2], in_pdf.pages[0].mediabox[3] + ) + page_1.merge_transformed_page(in_pdf.pages[0], Transformation()) + assert ( + ContentStream(in_pdf.pages[0].get_contents(), in_pdf).get_data() + in page_1.get_contents().get_data() + ) + page_1 = out_pdf.add_blank_page( + in_pdf.pages[0].mediabox[2], in_pdf.pages[0].mediabox[3] + ) + page_1.merge_page(in_pdf.pages[0]) + assert ( + ContentStream(in_pdf.pages[0].get_contents(), in_pdf).get_data() + in page_1.get_contents().get_data() + ) diff --git a/tests/test_xmp.py b/tests/test_xmp.py index c80ca23bf0..6c3449a9e0 100644 --- a/tests/test_xmp.py +++ b/tests/test_xmp.py @@ -184,23 +184,24 @@ def test_issue585(): assert exc.value.args[0].startswith("XML in XmpInformation was invalid") -# def test_getter_bag(): -# f = pypdf.xmp._getter_bag("namespace", "name") -# class Tst: # to replace pdf -# strict = False - -# reader = PdfReader(RESOURCE_ROOT / "commented-xmp.pdf") -# xmp_info = reader.xmp_metadata -# # -# # -# # - -# # -# # me -# # -# # -# # - -# assert xmp_info is not None -# f(xmp_info) +def test_getter_bag(): + f = pypdf.xmp._getter_bag("namespace", "name") + + class Tst: # to replace pdf + strict = False + + reader = PdfReader(RESOURCE_ROOT / "commented-xmp.pdf") + xmp_info = reader.xmp_metadata + # + # + # + + # + # me + # + # + # + + assert xmp_info is not None + f(xmp_info)