diff --git a/.gitignore b/.gitignore index 409d2d1b60..1ff1b76519 100644 --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,8 @@ coverage.xml # Docs docs/_build/ +.cspell/ + # Files generated by some of the scripts dont_commit_*.pdf pypdf-output.pdf diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0646cad6c0..b4def409e7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,12 +21,6 @@ repos: # rev: v0.942 # hooks: # - id: mypy -- repo: https://github.com/pycqa/isort - rev: 5.12.0 - hooks: - - id: isort - name: isort (python) - additional_dependencies: [toml] - repo: https://github.com/psf/black rev: 23.1.0 hooks: @@ -39,7 +33,7 @@ repos: additional_dependencies: [black==22.1.0] exclude: "docs/user/robustness.md" - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: 'v0.0.252' + rev: 'v0.0.254' hooks: - id: ruff - repo: https://github.com/asottile/pyupgrade diff --git a/CHANGELOG.md b/CHANGELOG.md index 7daad0b7c6..94a65fdd0d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ # CHANGELOG +## Version 3.5.1, 2023-03-05 + +### Robustness (ROB) +- Some attributes not copied in DictionaryObject._clone (#1635) +- Allow merging multiple time pages with annots (#1624) + +### Testing (TST) +- Replace pytest.mark.external by enable_socket (#1657) + +[Full Changelog](https://github.com/py-pdf/pypdf/compare/3.5.0...3.5.1) + ## Version 3.5.0, 2023-02-26 ### New Features (ENH) diff --git a/docs/dev/testing.md b/docs/dev/testing.md index 283faa8c3d..2e7fb7f19d 100644 --- a/docs/dev/testing.md +++ b/docs/dev/testing.md @@ -8,10 +8,10 @@ pypdf makes use of the following pytest markers: * `slow`: Tests that require more than 5 seconds * `samples`: Tests that require the [the `sample-files` git submodule](https://github.com/py-pdf/sample-files) to be initialized. As of October 2022, this is about 25 MB. -* `external`: Tests that download PDF documents. They are stored locally and thus only need to be downloaded once. As of October 2022, this is about 200 MB. +* `enable_socket`: Tests that download PDF documents. They are stored locally and thus only need to be downloaded once. As of October 2022, this is about 200 MB. -You can disable them by `pytest -m "not external"` or `pytest -m "not samples"`. -You can even disable all of them: `pytest -m "not external" -m "not samples" -m "not slow"`. +You can disable them by `pytest -m "not enable_socket"` or `pytest -m "not samples"`. +You can even disable all of them: `pytest -m "not enable_socket" -m "not samples" -m "not slow"`. Please note that this reduces test coverage. The CI will always test all files. diff --git a/docs/user/extract-text.md b/docs/user/extract-text.md index 70bfd002ed..37a2ca7997 100644 --- a/docs/user/extract-text.md +++ b/docs/user/extract-text.md @@ -37,7 +37,7 @@ If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold". **Caveat**: In complicated documents the calculated positions might be wrong. The function provided in argument visitor_operand_before has four arguments: -operand, operand-arguments, current transformation matrix and text matrix. +operator, operand-arguments, current transformation matrix and text matrix. ### Example 1: Ignore header and footer diff --git a/make_changelog.py b/make_changelog.py index 2eea38b534..b375c8f251 100644 --- a/make_changelog.py +++ b/make_changelog.py @@ -2,7 +2,7 @@ import subprocess from dataclasses import dataclass -from datetime import datetime +from datetime import datetime, timezone from typing import List @@ -29,7 +29,7 @@ def main(changelog_path: str) -> None: print(changes) new_version = version_bump(git_tag) - today = datetime.now() + today = datetime.now(tz=timezone.utc) header = f"Version {new_version}, {today:%Y-%m-%d}\n" header = header + "-" * (len(header) - 1) + "\n" url = f"https://github.com/py-pdf/pypdf/compare/{git_tag}...{new_version}" diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index f74faa19d2..03669abe37 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -56,7 +56,7 @@ def build_char_map( space_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])] except Exception: pass - # I conside the space_code is available on one byte + # I consider the space_code is available on one byte if isinstance(space_code, str): try: # one byte sp = space_code.encode("charmap")[0] @@ -140,7 +140,7 @@ def parse_encoding( enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore if isinstance(enc, str): try: - # allready done : enc = NameObject.unnumber(enc.encode()).decode() + # already done : enc = NameObject.unnumber(enc.encode()).decode() # for #xx decoding if enc in charset_encoding: encoding = charset_encoding[enc].copy() diff --git a/pypdf/_encryption.py b/pypdf/_encryption.py index 7acf1b7cc2..b199a744b2 100644 --- a/pypdf/_encryption.py +++ b/pypdf/_encryption.py @@ -186,8 +186,8 @@ def decrypt_object(self, obj: PdfObject) -> PdfObject: elif isinstance(obj, StreamObject): obj._data = self.stmCrypt.decrypt(obj._data) elif isinstance(obj, DictionaryObject): - for dictkey, value in list(obj.items()): - obj[dictkey] = self.decrypt_object(value) + for key, value in obj.items(): + obj[key] = self.decrypt_object(value) elif isinstance(obj, ArrayObject): for i in range(len(obj)): obj[i] = self.decrypt_object(obj[i]) @@ -484,7 +484,7 @@ def verify_user_password( to decrypt the document. Args: - user_password: The user passwort as a bytes stream + user_password: The user password as a bytes stream rev: The encryption revision (see PDF standard) key_size: The size of the key in bytes o_entry: The owner entry @@ -990,7 +990,7 @@ def verify(self, password: Union[bytes, str]) -> PasswordType: if isinstance(password, str): try: pwd = password.encode("latin-1") - except Exception: # noqa + except Exception: pwd = password.encode("utf-8") else: pwd = password diff --git a/pypdf/_merger.py b/pypdf/_merger.py index b07d971c76..f6cbbf2da3 100644 --- a/pypdf/_merger.py +++ b/pypdf/_merger.py @@ -51,9 +51,8 @@ str_, ) from ._writer import PdfWriter -from .constants import GoToActionArguments +from .constants import GoToActionArguments, TypArguments, TypFitArguments from .constants import PagesAttributes as PA -from .constants import TypArguments, TypFitArguments from .generic import ( PAGE_FIT, ArrayObject, @@ -183,7 +182,7 @@ def merge( ) if page_number is None: # deprecated - # The paremter is only marked as Optional as long as + # The parameter is only marked as Optional as long as # position is not fully deprecated raise ValueError("page_number may not be None") if fileobj is None: # deprecated @@ -272,8 +271,8 @@ def _create_stream( fileobj.stream.seek(orig_tell) elif hasattr(fileobj, "seek") and hasattr(fileobj, "read"): fileobj.seek(0) - filecontent = fileobj.read() - stream = BytesIO(filecontent) + file_content = fileobj.read() + stream = BytesIO(file_content) else: raise NotImplementedError( "PdfMerger.merge requires an object that PdfReader can parse. " @@ -522,14 +521,14 @@ def _write_dests(self) -> None: if self.output is None: raise RuntimeError(ERR_CLOSED_WRITER) for named_dest in self.named_dests: - pageno = None + page_index = None if "/Page" in named_dest: - for pageno, page in enumerate(self.pages): # noqa: B007 + for page_index, page in enumerate(self.pages): # noqa: B007 if page.id == named_dest["/Page"]: named_dest[NameObject("/Page")] = page.out_pagedata break - if pageno is not None: + if page_index is not None: self.output.add_named_destination_object(named_dest) @deprecation_bookmark(bookmarks="outline") @@ -597,7 +596,7 @@ def _write_outline_item_on_page( def _associate_dests_to_pages(self, pages: List[_MergedPage]) -> None: for named_dest in self.named_dests: - pageno = None + page_index = None np = named_dest["/Page"] if isinstance(np, NumberObject): @@ -605,13 +604,13 @@ def _associate_dests_to_pages(self, pages: List[_MergedPage]) -> None: for page in pages: if np.get_object() == page.pagedata.get_object(): - pageno = page.id + page_index = page.id - if pageno is None: + if page_index is None: raise ValueError( f"Unresolved named destination '{named_dest['/Title']}'" ) - named_dest[NameObject("/Page")] = NumberObject(pageno) + named_dest[NameObject("/Page")] = NumberObject(page_index) @deprecation_bookmark(bookmarks="outline") def _associate_outline_items_to_pages( @@ -625,7 +624,7 @@ def _associate_outline_items_to_pages( self._associate_outline_items_to_pages(pages, outline_item) continue - pageno = None + page_index = None outline_item_page = outline_item["/Page"] if isinstance(outline_item_page, NumberObject): @@ -633,10 +632,10 @@ def _associate_outline_items_to_pages( for p in pages: if outline_item_page.get_object() == p.pagedata.get_object(): - pageno = p.id + page_index = p.id - if pageno is not None: - outline_item[NameObject("/Page")] = NumberObject(pageno) + if page_index is not None: + outline_item[NameObject("/Page")] = NumberObject(page_index) @deprecation_bookmark(bookmark="outline_item") def find_outline_item( diff --git a/pypdf/_page.py b/pypdf/_page.py index 684cbfc954..b1df061696 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -633,7 +633,7 @@ def compute_unique_key(base_key: str) -> Tuple[str, bool]: DictionaryObject, res2.get(resource, DictionaryObject()).get_object() ) rename_res = {} - for key in page2res.keys(): + for key in page2res: unique_key, same_value = compute_unique_key(key) newname = NameObject(unique_key) if key != unique_key: @@ -935,7 +935,11 @@ def _merge_page_writer( trsf = Transformation(ctm) for a in cast(ArrayObject, page2[PG.ANNOTS]): a = a.get_object() - aa = a.clone(pdf, ignore_fields=("/P", "/StructParent")) + aa = a.clone( + pdf, + ignore_fields=("/P", "/StructParent", "/Parent"), + force_duplicate=True, + ) r = cast(ArrayObject, a["/Rect"]) pt1 = trsf.apply_on((r[0], r[1]), True) pt2 = trsf.apply_on((r[2], r[3]), True) @@ -955,6 +959,10 @@ def _merge_page_writer( + cast(tuple, trsf.apply_on((q[4], q[5]), True)) + cast(tuple, trsf.apply_on((q[6], q[7]), True)) ) + try: + aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference + except KeyError: + pass try: aa[NameObject("/P")] = self.indirect_reference annots.append(aa.indirect_reference) @@ -1892,9 +1900,9 @@ def process_operation(operator: bytes, operands: List) -> None: elif ( abs(delta_y) < f * 0.3 and abs(delta_x) > current_spacewidth() * f * 15 + and (output + text)[-1] != " " ): - if (output + text)[-1] != " ": - text += " " + text += " " elif orientation == 180: if delta_y > 0.8 * f: if (output + text)[-1] != "\n": @@ -1911,9 +1919,9 @@ def process_operation(operator: bytes, operands: List) -> None: elif ( abs(delta_y) < f * 0.3 and abs(delta_x) > current_spacewidth() * f * 15 + and (output + text)[-1] != " " ): - if (output + text)[-1] != " ": - text += " " + text += " " elif orientation == 90: if delta_x > 0.8 * f: if (output + text)[-1] != "\n": @@ -1930,9 +1938,9 @@ def process_operation(operator: bytes, operands: List) -> None: elif ( abs(delta_x) < f * 0.3 and abs(delta_y) > current_spacewidth() * f * 15 + and (output + text)[-1] != " " ): - if (output + text)[-1] != " ": - text += " " + text += " " elif orientation == 270: if delta_x < -0.8 * f: if (output + text)[-1] != "\n": @@ -1949,9 +1957,9 @@ def process_operation(operator: bytes, operands: List) -> None: elif ( abs(delta_x) < f * 0.3 and abs(delta_y) > current_spacewidth() * f * 15 + and (output + text)[-1] != " " ): - if (output + text)[-1] != " ": - text += " " + text += " " except Exception: pass @@ -1974,13 +1982,12 @@ def process_operation(operator: bytes, operands: List) -> None: for op in operands[0]: if isinstance(op, (str, bytes)): process_operation(b"Tj", [op]) - if isinstance(op, (int, float, NumberObject, FloatObject)): - if ( - (abs(float(op)) >= _space_width) - and (len(text) > 0) - and (text[-1] != " ") - ): - process_operation(b"Tj", [" "]) + if isinstance(op, (int, float, NumberObject, FloatObject)) and ( + (abs(float(op)) >= _space_width) + and (len(text) > 0) + and (text[-1] != " ") + ): + process_operation(b"Tj", [" "]) elif operator == b"Do": output += text if visitor_text is not None: @@ -2048,7 +2055,7 @@ def extract_text( see function set_custom_rtl Additionally you can provide visitor-methods to get informed on all - operands and all text-objects. + operations and all text-objects. For example in some PDF files this can be useful to parse tables. Args: @@ -2060,11 +2067,11 @@ def extract_text( 270 (turned Right) space_width: force default space width if not extracted from font (default: 200) - visitor_operand_before: function to be called before processing an operand. - It has four arguments: operand, operand-arguments, + visitor_operand_before: function to be called before processing an operation. + It has four arguments: operator, operand-arguments, current transformation matrix and text matrix. - visitor_operand_after: function to be called after processing an operand. - It has four arguments: operand, operand-arguments, + visitor_operand_after: function to be called after processing an operation. + It has four arguments: operator, operand-arguments, current transformation matrix and text matrix. visitor_text: function to be called when extracting some text at some position. It has five arguments: text, current transformation matrix, @@ -2377,11 +2384,11 @@ def _get_fonts_walk( fontkeys = ("/FontFile", "/FontFile2", "/FontFile3") if "/BaseFont" in obj: fnt.add(cast(str, obj["/BaseFont"])) - if "/FontName" in obj: - if [x for x in fontkeys if x in obj]: # test to see if there is FontFile - emb.add(cast(str, obj["/FontName"])) + if "/FontName" in obj and [x for x in fontkeys if x in obj]: + # the list comprehension ensures there is FontFile + emb.add(cast(str, obj["/FontName"])) - for key in obj.keys(): + for key in obj: _get_fonts_walk(cast(DictionaryObject, obj[key]), fnt, emb) return fnt, emb # return the sets for each page diff --git a/pypdf/_reader.py b/pypdf/_reader.py index a04cffe779..387265a829 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -67,10 +67,13 @@ ) from .constants import CatalogAttributes as CA from .constants import CatalogDictionary as CD -from .constants import CheckboxRadioButtonAttributes +from .constants import ( + CheckboxRadioButtonAttributes, + FieldDictionaryAttributes, + GoToActionArguments, +) from .constants import Core as CO from .constants import DocumentInformationAttributes as DI -from .constants import FieldDictionaryAttributes, GoToActionArguments from .constants import PageAttributes as PG from .constants import PagesAttributes as PA from .constants import TrailerKeys as TK @@ -342,9 +345,8 @@ def __init__( # raise if password provided raise WrongPasswordError("Wrong password") self._override_encryption = False - else: - if password is not None: - raise PdfReadError("Not encrypted file") + elif password is not None: + raise PdfReadError("Not encrypted file") @property def pdf_header(self) -> str: @@ -433,8 +435,6 @@ def _get_num_pages(self) -> int: """ Calculate the number of pages in this PDF file. - Args: - Returns: The number of pages of the parsed PDF file @@ -1437,7 +1437,7 @@ def read_object_header(self, stream: StreamType) -> Tuple[int, int]: stream.seek(-1, 1) # although it's not used, it might still be necessary to read - _obj = stream.read(3) # noqa: F841 + _obj = stream.read(3) read_non_whitespace(stream) stream.seek(-1, 1) @@ -1522,7 +1522,7 @@ def read(self, stream: StreamType) -> None: continue xref_k = sorted( xref_entry.keys() - ) # must ensure ascendant to prevent damange + ) # must ensure ascendant to prevent damage for id in xref_k: stream.seek(xref_entry[id], 0) try: @@ -1600,10 +1600,10 @@ def _read_standard_xref_table(self, stream: StreamType) -> None: raise PdfReadError("xref table read error") read_non_whitespace(stream) stream.seek(-1, 1) - firsttime = True # check if the first time looking at the xref table + first_time = True # check if the first time looking at the xref table while True: num = cast(int, read_object(stream, self)) - if firsttime and num != 0: + if first_time and num != 0: self.xref_index = num if self.strict: logger_warning( @@ -1612,7 +1612,7 @@ def _read_standard_xref_table(self, stream: StreamType) -> None: ) # if table not zero indexed, could be due to error from when PDF was created # which will lead to mismatched indices later on, only warned and corrected if self.strict==True - firsttime = False + first_time = False read_non_whitespace(stream) stream.seek(-1, 1) size = cast(int, read_object(stream, self)) @@ -1647,7 +1647,7 @@ def _read_standard_xref_table(self, stream: StreamType) -> None: offset, generation = int(offset_b), int(generation_b) except Exception: - # if something wrong occured + # if something wrong occurred if hasattr(stream, "getbuffer"): buf = bytes(stream.getbuffer()) # type: ignore else: @@ -1695,8 +1695,8 @@ def _read_standard_xref_table(self, stream: StreamType) -> None: num += 1 read_non_whitespace(stream) stream.seek(-1, 1) - trailertag = stream.read(7) - if trailertag != b"trailer": + trailer_tag = stream.read(7) + if trailer_tag != b"trailer": # more xrefs! stream.seek(-7, 1) else: @@ -2105,7 +2105,7 @@ def add_form_topname(self, name: str) -> Optional[DictionaryObject]: interim[NameObject("/Kids")] = acroform[NameObject("/Fields")] self.cache_indirect_object( 0, - max([i for (g, i) in self.resolved_objects.keys() if g == 0]) + 1, + max([i for (g, i) in self.resolved_objects if g == 0]) + 1, interim, ) arr = ArrayObject() diff --git a/pypdf/_security.py b/pypdf/_security.py index fd76558297..5ede2a35d1 100644 --- a/pypdf/_security.py +++ b/pypdf/_security.py @@ -54,7 +54,7 @@ def _alg32( password: str, rev: Literal[2, 3, 4], - keylen: int, + key_length: int, owner_entry: ByteStringObject, p_entry: int, id1_entry: ByteStringObject, @@ -68,7 +68,7 @@ def _alg32( Args: password: The encryption secret as a bytes-string rev: The encryption revision (see PDF standard) - keylen: + key_length: owner_entry: p_entry: A set of flags specifying which operations shall be permitted when the document is opened with user access. If bit 2 is set to 1, all other @@ -79,7 +79,7 @@ def _alg32( metadata_encrypt: (Default value = True) Returns: - An MD5 hash of keylen characters. + An MD5 hash of key_length characters. """ # 1. Pad or truncate the password string to exactly 32 bytes. If the # password string is more than 32 bytes long, use only its first 32 bytes; @@ -113,16 +113,16 @@ def _alg32( # /Length entry. if rev >= 3: for _ in range(50): - md5_hash = md5(md5_hash[:keylen]).digest() + md5_hash = md5(md5_hash[:key_length]).digest() # 9. Set the encryption key to the first n bytes of the output from the # final MD5 hash, where n is always 5 for revision 2 but, for revision 3 or # greater, depends on the value of the encryption dictionary's /Length # entry. - return md5_hash[:keylen] + return md5_hash[:key_length] def _alg33( - owner_password: str, user_password: str, rev: Literal[2, 3, 4], keylen: int + owner_password: str, user_password: str, rev: Literal[2, 3, 4], key_length: int ) -> bytes: """ Implementation of algorithm 3.3 of the PDF standard security handler, @@ -132,13 +132,13 @@ def _alg33( owner_password: user_password: rev: The encryption revision (see PDF standard) - keylen: + key_length: Returns: A transformed version of the owner and the user password """ # steps 1 - 4 - key = _alg33_1(owner_password, rev, keylen) + key = _alg33_1(owner_password, rev, key_length) # 5. Pad or truncate the user password string as described in step 1 of # algorithm 3.2. user_password_bytes = b_((user_password + str_(_encryption_padding))[:32]) @@ -160,14 +160,14 @@ def _alg33( return val -def _alg33_1(password: str, rev: Literal[2, 3, 4], keylen: int) -> bytes: +def _alg33_1(password: str, rev: Literal[2, 3, 4], key_length: int) -> bytes: """ Steps 1-4 of algorithm 3.3. Args: password: rev: The encryption revision (see PDF standard) - keylen: + key_length: Returns: A transformed version of the password @@ -189,7 +189,7 @@ def _alg33_1(password: str, rev: Literal[2, 3, 4], keylen: int) -> bytes: # from the final MD5 hash, where n is always 5 for revision 2 but, for # revision 3 or greater, depends on the value of the encryption # dictionary's /Length entry. - key = md5_hash[:keylen] + key = md5_hash[:key_length] return key @@ -220,8 +220,8 @@ def _alg34( # 1. Create an encryption key based on the user password string, as # described in algorithm 3.2. rev: Literal[2] = 2 - keylen = 5 - key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry) + key_length = 5 + key = _alg32(password, rev, key_length, owner_entry, p_entry, id1_entry) # 2. Encrypt the 32-byte padding string shown in step 1 of algorithm 3.2, # using an RC4 encryption function with the encryption key from the # preceding step. @@ -234,7 +234,7 @@ def _alg34( def _alg35( password: str, rev: Literal[2, 3, 4], - keylen: int, + key_length: int, owner_entry: ByteStringObject, p_entry: int, id1_entry: ByteStringObject, @@ -248,7 +248,7 @@ def _alg35( Args: password: rev: The encryption revision (see PDF standard) - keylen: + key_length: owner_entry: p_entry: A set of flags specifying which operations shall be permitted when the document is opened with user access. If bit 2 is set to 1, all other @@ -263,7 +263,7 @@ def _alg35( """ # 1. Create an encryption key based on the user password string, as # described in Algorithm 3.2. - key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry) + key = _alg32(password, rev, key_length, owner_entry, p_entry, id1_entry) # 2. Initialize the MD5 hash function and pass the 32-byte padding string # shown in step 1 of Algorithm 3.2 as input to this function. m = md5() diff --git a/pypdf/_version.py b/pypdf/_version.py index dcbfb52f61..0c11babd07 100644 --- a/pypdf/_version.py +++ b/pypdf/_version.py @@ -1 +1 @@ -__version__ = "3.5.0" +__version__ = "3.5.1" diff --git a/pypdf/_writer.py b/pypdf/_writer.py index dbc99a9372..34b8a23319 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -73,24 +73,25 @@ deprecation_with_replacement, logger_warning, ) -from .constants import AnnotationDictionaryAttributes -from .constants import CatalogAttributes as CA -from .constants import CatalogDictionary -from .constants import Core as CO -from .constants import EncryptionDictAttributes as ED from .constants import ( + AnnotationDictionaryAttributes, + CatalogDictionary, FieldDictionaryAttributes, FieldFlag, FileSpecificationDictionaryEntries, GoToActionArguments, InteractiveFormDictEntries, + PageLabelStyle, + TypFitArguments, + UserAccessPermissions, ) +from .constants import CatalogAttributes as CA +from .constants import Core as CO +from .constants import EncryptionDictAttributes as ED from .constants import PageAttributes as PG -from .constants import PageLabelStyle from .constants import PagesAttributes as PA from .constants import StreamAttributes as SA from .constants import TrailerKeys as TK -from .constants import TypFitArguments, UserAccessPermissions from .generic import ( PAGE_FIT, AnnotationBuilder, @@ -1242,13 +1243,12 @@ def _sweep_indirect_references( grant_parents + [parent] if parent is not None else [], ) ) - elif isinstance(data, IndirectObject): - if data.pdf != self: - data = self._resolve_indirect_object(data) + elif isinstance(data, IndirectObject) and data.pdf != self: + data = self._resolve_indirect_object(data) - if str(data) not in discovered: - discovered.append(str(data)) - stack.append((data.get_object(), None, None, [])) + if str(data) not in discovered: + discovered.append(str(data)) + stack.append((data.get_object(), None, None, [])) # Check if data has a parent and if it is a dict or # an array update the value @@ -2969,13 +2969,12 @@ def find_outline_item( or o.get("/Title", None) == outline_item ): return [i] - else: - if "/First" in o: - res = self.find_outline_item( - outline_item, cast(OutlineType, o["/First"]) - ) - if res: - return ([i] if "/Title" in o else []) + res + elif "/First" in o: + res = self.find_outline_item( + outline_item, cast(OutlineType, o["/First"]) + ) + if res: + return ([i] if "/Title" in o else []) + res if "/Next" in o: i += 1 o = cast(TreeObject, o["/Next"]) diff --git a/pypdf/filters.py b/pypdf/filters.py index 036a72b632..a81e8134e2 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -38,16 +38,7 @@ import struct import zlib from io import BytesIO -from typing import Any, Dict, Optional, Tuple, Union, cast - -from .generic import ArrayObject, DictionaryObject, IndirectObject, NameObject - -try: - from typing import Literal # type: ignore[attr-defined] -except ImportError: - # PEP 586 introduced typing.Literal with Python 3.8 - # For older Python versions, the backport typing_extensions is necessary: - from typing_extensions import Literal # type: ignore[misc] +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union, cast from ._utils import b_, deprecate_with_replacement, ord_, paeth_predictor from .constants import CcittFaxDecodeParameters as CCITT @@ -59,6 +50,15 @@ from .constants import LzwFilterParameters as LZW from .constants import StreamAttributes as SA from .errors import PdfReadError, PdfStreamError +from .generic import ArrayObject, DictionaryObject, IndirectObject, NameObject + +if TYPE_CHECKING: + try: + from typing import Literal # type: ignore[attr-defined] + except ImportError: + # PEP 586 introduced typing.Literal with Python 3.8 + # For older Python versions, the backport typing_extensions is necessary: + from typing_extensions import Literal # type: ignore[misc] def decompress(data: bytes) -> bytes: @@ -202,7 +202,7 @@ class ASCIIHexDecode: @staticmethod def decode( data: str, - decode_parms: Union[None, ArrayObject, DictionaryObject] = None, # noqa: F841 + decode_parms: Union[None, ArrayObject, DictionaryObject] = None, **kwargs: Any, ) -> str: """ diff --git a/pypdf/generic/_annotations.py b/pypdf/generic/_annotations.py index 145a33f26a..68e31a2508 100644 --- a/pypdf/generic/_annotations.py +++ b/pypdf/generic/_annotations.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Tuple, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union from ._base import ( BooleanObject, @@ -315,7 +315,8 @@ def link( Returns: A dictionary object representing the annotation. """ - from ..types import BorderArrayType + if TYPE_CHECKING: + from ..types import BorderArrayType is_external = url is not None is_internal = target_page_index is not None diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index fc7e848f44..f75e66dd64 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -94,7 +94,7 @@ def clone( raise Exception("clone PdfObject") def _reference_clone( - self, clone: Any, pdf_dest: PdfWriterProtocol + self, clone: Any, pdf_dest: PdfWriterProtocol, force_duplicate: bool = False ) -> PdfObjectProtocol: """ Reference the object within the _objects of pdf_dest only if @@ -110,25 +110,31 @@ def _reference_clone( The clone """ try: - if clone.indirect_reference.pdf == pdf_dest: + if not force_duplicate and clone.indirect_reference.pdf == pdf_dest: return clone except Exception: pass - if hasattr(self, "indirect_reference"): + # if hasattr(clone, "indirect_reference"): + try: ind = self.indirect_reference - i = len(pdf_dest._objects) + 1 - if ind is not None: - if id(ind.pdf) not in pdf_dest._id_translated: - pdf_dest._id_translated[id(ind.pdf)] = {} - if ind.idnum in pdf_dest._id_translated[id(ind.pdf)]: - obj = pdf_dest.get_object( - pdf_dest._id_translated[id(ind.pdf)][ind.idnum] - ) - assert obj is not None - return obj - pdf_dest._id_translated[id(ind.pdf)][ind.idnum] = i - pdf_dest._objects.append(clone) - clone.indirect_reference = IndirectObject(i, 0, pdf_dest) + except AttributeError: + return clone + i = len(pdf_dest._objects) + 1 + if ind is not None: + if id(ind.pdf) not in pdf_dest._id_translated: + pdf_dest._id_translated[id(ind.pdf)] = {} + if ( + not force_duplicate + and ind.idnum in pdf_dest._id_translated[id(ind.pdf)] + ): + obj = pdf_dest.get_object( + pdf_dest._id_translated[id(ind.pdf)][ind.idnum] + ) + assert obj is not None + return obj + pdf_dest._id_translated[id(ind.pdf)][ind.idnum] = i + pdf_dest._objects.append(clone) + clone.indirect_reference = IndirectObject(i, 0, pdf_dest) return clone def get_object(self) -> Optional["PdfObject"]: @@ -153,7 +159,9 @@ def clone( ignore_fields: Union[Tuple[str, ...], List[str], None] = (), ) -> "NullObject": """Clone object into pdf_dest.""" - return cast("NullObject", self._reference_clone(NullObject(), pdf_dest)) + return cast( + "NullObject", self._reference_clone(NullObject(), pdf_dest, force_duplicate) + ) def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] @@ -194,7 +202,8 @@ def clone( ) -> "BooleanObject": """Clone object into pdf_dest.""" return cast( - "BooleanObject", self._reference_clone(BooleanObject(self.value), pdf_dest) + "BooleanObject", + self._reference_clone(BooleanObject(self.value), pdf_dest, force_duplicate), ) def __eq__(self, __o: object) -> bool: @@ -366,7 +375,10 @@ def clone( ignore_fields: Union[Tuple[str, ...], List[str], None] = (), ) -> "FloatObject": """Clone object into pdf_dest.""" - return cast("FloatObject", self._reference_clone(FloatObject(self), pdf_dest)) + return cast( + "FloatObject", + self._reference_clone(FloatObject(self), pdf_dest, force_duplicate), + ) def myrepr(self) -> str: if self == 0: @@ -410,7 +422,10 @@ def clone( ignore_fields: Union[Tuple[str, ...], List[str], None] = (), ) -> "NumberObject": """Clone object into pdf_dest.""" - return cast("NumberObject", self._reference_clone(NumberObject(self), pdf_dest)) + return cast( + "NumberObject", + self._reference_clone(NumberObject(self), pdf_dest, force_duplicate), + ) def as_numeric(self) -> int: return int(repr(self).encode("utf8")) @@ -459,7 +474,9 @@ def clone( """Clone object into pdf_dest.""" return cast( "ByteStringObject", - self._reference_clone(ByteStringObject(bytes(self)), pdf_dest), + self._reference_clone( + ByteStringObject(bytes(self)), pdf_dest, force_duplicate + ), ) @property @@ -505,7 +522,9 @@ def clone( obj = TextStringObject(self) obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding obj.autodetect_utf16 = self.autodetect_utf16 - return cast("TextStringObject", self._reference_clone(obj, pdf_dest)) + return cast( + "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate) + ) autodetect_pdfdocencoding = False autodetect_utf16 = False @@ -587,7 +606,10 @@ def clone( ignore_fields: Union[Tuple[str, ...], List[str], None] = (), ) -> "NameObject": """Clone object into pdf_dest.""" - return cast("NameObject", self._reference_clone(NameObject(self), pdf_dest)) + return cast( + "NameObject", + self._reference_clone(NameObject(self), pdf_dest, force_duplicate), + ) def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] @@ -623,7 +645,7 @@ def unnumber(sin: bytes) -> bytes: sin = sin[:i] + unhexlify(sin[i + 1 : i + 3]) + sin[i + 3 :] i = sin.find(b"#", i + 1) except ValueError: - # if the 2 characters after # can not be converted to hexa + # if the 2 characters after # can not be converted to hex # we change nothing and carry on i = i + 1 return sin diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index e60b821610..824dc16d41 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -50,9 +50,9 @@ from ..constants import ( CheckboxRadioButtonAttributes, FieldDictionaryAttributes, + OutlineFontFlag, ) from ..constants import FilterTypes as FT -from ..constants import OutlineFontFlag from ..constants import StreamAttributes as SA from ..constants import TypArguments as TA from ..constants import TypFitArguments as TF @@ -88,11 +88,16 @@ def clone( return self except Exception: pass - arr = cast("ArrayObject", self._reference_clone(ArrayObject(), pdf_dest)) + arr = cast( + "ArrayObject", + self._reference_clone(ArrayObject(), pdf_dest, force_duplicate), + ) for data in self: if isinstance(data, StreamObject): dup = data._reference_clone( - data.clone(pdf_dest, force_duplicate, ignore_fields), pdf_dest + data.clone(pdf_dest, force_duplicate, ignore_fields), + pdf_dest, + force_duplicate, ) arr.append(dup.indirect_reference) elif hasattr(data, "clone"): @@ -137,8 +142,8 @@ def read_from_stream( tok = stream.read(1) stream.seek(-1, 1) # check for array ending - peekahead = stream.read(1) - if peekahead == b"]": + peek_ahead = stream.read(1) + if peek_ahead == b"]": break stream.seek(-1, 1) # read and append obj @@ -168,7 +173,8 @@ def clone( pass d__ = cast( - "DictionaryObject", self._reference_clone(self.__class__(), pdf_dest) + "DictionaryObject", + self._reference_clone(self.__class__(), pdf_dest, force_duplicate), ) if ignore_fields is None: ignore_fields = [] @@ -193,12 +199,18 @@ def _clone( ignore_fields: """ # First check if this is a chain list, we need to loop to prevent recur - if ( - ("/Next" not in ignore_fields and "/Next" in src) - or ("/Prev" not in ignore_fields and "/Prev" in src) - ) or ( - ("/N" not in ignore_fields and "/N" in src) - or ("/V" not in ignore_fields and "/V" in src) + if any( + field not in ignore_fields + and field in src + and isinstance(src.raw_get(field), IndirectObject) + and isinstance(src[field], DictionaryObject) + and ( + src.get("/Type", None) is None + or cast(DictionaryObject, src[field]).get("/Type", None) is None + or src.get("/Type", None) + == cast(DictionaryObject, src[field]).get("/Type", None) + ) + for field in ["/Next", "/Prev", "/N", "/V"] ): ignore_fields = list(ignore_fields) for lst in (("/Next", "/Prev"), ("/N", "/V")): @@ -208,6 +220,15 @@ def _clone( k in src and k not in self and isinstance(src.raw_get(k), IndirectObject) + and isinstance(src[k], DictionaryObject) + # IF need to go further the idea is to check + # that the types are the same: + and ( + src.get("/Type", None) is None + or cast(DictionaryObject, src[k]).get("/Type", None) is None + or src.get("/Type", None) + == cast(DictionaryObject, src[k]).get("/Type", None) + ) ): cur_obj: Optional["DictionaryObject"] = cast( "DictionaryObject", src[k] @@ -216,7 +237,9 @@ def _clone( while cur_obj is not None: clon = cast( "DictionaryObject", - cur_obj._reference_clone(cur_obj.__class__(), pdf_dest), + cur_obj._reference_clone( + cur_obj.__class__(), pdf_dest, force_duplicate + ), ) objs.append((cur_obj, clon)) assert prev_obj is not None @@ -230,7 +253,7 @@ def _clone( except Exception: cur_obj = None for s, c in objs: - c._clone(s, pdf_dest, force_duplicate, ignore_fields + [k]) + c._clone(s, pdf_dest, force_duplicate, ignore_fields) for k, v in src.items(): if k not in ignore_fields: @@ -240,13 +263,12 @@ def _clone( vv = v.clone(pdf_dest, force_duplicate, ignore_fields) assert vv.indirect_reference is not None self[k.clone(pdf_dest)] = vv.indirect_reference # type: ignore[attr-defined] - else: - if k not in self: - self[NameObject(k)] = ( - v.clone(pdf_dest, force_duplicate, ignore_fields) - if hasattr(v, "clone") - else v - ) + elif k not in self: + self[NameObject(k)] = ( + v.clone(pdf_dest, force_duplicate, ignore_fields) + if hasattr(v, "clone") + else v + ) def raw_get(self, key: Any) -> Any: return dict.__getitem__(self, key) @@ -421,10 +443,8 @@ def read_unsized_from_steam( eol = stream.read(1) if eol not in (b"\n", b"\r"): raise PdfStreamError("Stream data must be followed by a newline") - if eol == b"\r": - # read \n after - if stream.read(1) != b"\n": - stream.seek(-1, 1) + if eol == b"\r" and stream.read(1) != b"\n": + stream.seek(-1, 1) # this is a stream object, not a dictionary if SA.LENGTH not in data: raise PdfStreamError("Stream length not defined") @@ -924,7 +944,10 @@ def clone( pass d__ = cast( - "ContentStream", self._reference_clone(self.__class__(None, None), pdf_dest) + "ContentStream", + self._reference_clone( + self.__class__(None, None), pdf_dest, force_duplicate + ), ) if ignore_fields is None: ignore_fields = [] @@ -1064,23 +1087,23 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: @property def _data(self) -> bytes: - newdata = BytesIO() + new_data = BytesIO() for operands, operator in self.operations: if operator == b"INLINE IMAGE": - newdata.write(b"BI") - dicttext = BytesIO() - operands["settings"].write_to_stream(dicttext, None) - newdata.write(dicttext.getvalue()[2:-2]) - newdata.write(b"ID ") - newdata.write(operands["data"]) - newdata.write(b"EI") + new_data.write(b"BI") + dict_text = BytesIO() + operands["settings"].write_to_stream(dict_text, None) + new_data.write(dict_text.getvalue()[2:-2]) + new_data.write(b"ID ") + new_data.write(operands["data"]) + new_data.write(b"EI") else: for op in operands: - op.write_to_stream(newdata, None) - newdata.write(b" ") - newdata.write(b_(operator)) - newdata.write(b"\n") - return newdata.getvalue() + op.write_to_stream(new_data, None) + new_data.write(b" ") + new_data.write(b_(operator)) + new_data.write(b"\n") + return new_data.getvalue() @_data.setter def _data(self, value: Union[str, bytes]) -> None: @@ -1308,7 +1331,7 @@ class Destination(TreeObject): node: Optional[ DictionaryObject ] = None # node provide access to the original Object - childs: List[Any] = [] # used in PdfWriter + childs: List[Any] = [] # used in PdfWriter - TODO: should be children def __init__( self, @@ -1339,12 +1362,12 @@ def __init__( self[NameObject(TA.TOP)], ) = args elif typ in [TF.FIT_H, TF.FIT_BH]: - try: # Prefered to be more robust not only to null parameters + try: # Preferred to be more robust not only to null parameters (self[NameObject(TA.TOP)],) = args except Exception: (self[NameObject(TA.TOP)],) = (NullObject(),) elif typ in [TF.FIT_V, TF.FIT_BV]: - try: # Prefered to be more robust not only to null parameters + try: # Preferred to be more robust not only to null parameters (self[NameObject(TA.LEFT)],) = args except Exception: (self[NameObject(TA.LEFT)],) = (NullObject(),) @@ -1455,6 +1478,6 @@ def outline_count(self) -> Optional[int]: positive = expanded negative = collapsed - absolute value = number of visible descendents at all levels + absolute value = number of visible descendants at all levels """ return self.get("/Count", None) diff --git a/pypdf/generic/_utils.py b/pypdf/generic/_utils.py index 24a77f8554..695736769b 100644 --- a/pypdf/generic/_utils.py +++ b/pypdf/generic/_utils.py @@ -90,7 +90,7 @@ def read_string_from_stream( if b"0" <= ntok and ntok <= b"7": tok += ntok else: - stream.seek(-1, 1) # ntok has to be analysed + stream.seek(-1, 1) # ntok has to be analyzed break tok = b_(chr(int(tok, base=8))) elif tok in b"\n\r": diff --git a/pypdf/xmp.py b/pypdf/xmp.py index 53955d56c8..024c484411 100644 --- a/pypdf/xmp.py +++ b/pypdf/xmp.py @@ -18,9 +18,8 @@ Union, cast, ) -from xml.dom.minidom import Document +from xml.dom.minidom import Document, parseString from xml.dom.minidom import Element as XmlElement -from xml.dom.minidom import parseString from xml.parsers.expat import ExpatError from ._utils import ( diff --git a/pyproject.toml b/pyproject.toml index ac0af9f0a2..685aa98043 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,11 +56,12 @@ package = "./pypdf" exclude = [".github/*", "docs/*", "resources/*", "sample-files/*", "sample-files/.github/*", "sample-files/.gitignore", "sample-files/.pre-commit-config.yaml", "requirements/*", "tests/*", ".flake8", ".gitignore", ".gitmodules", ".pylintrc", "tox.ini", "make_changelog.py", "mutmut-test.sh", ".pre-commit-config.yaml", ".gitblame-ignore-revs", "Makefile", "mutmut_config.py"] [tool.pytest.ini_options] +addopts = "--disable-socket" filterwarnings = ["error"] markers = [ "slow: Test which require more than a second", "samples: Tests which use files from https://github.com/py-pdf/sample-files", - "external: Tests which need to download files" + "enable_socket: Tests which need to download files" ] testpaths = ["tests"] norecursedirs = ["tests/pdf_cache"] @@ -102,10 +103,7 @@ exclude_lines = [ line-length = 120 select = ["ALL"] ignore = [ - "D212", "D407", "D408", "D409", "D404", # First word of the docstring should not be "This" - "D406", # Section name should end with a newline ("Returns") - "I001", # Not compatible with isort ... maybe we should drop isort # I would like to have it, but there are a few annoying exceptions: "D401", # First line of docstring should be in imperative mood - false positives "ERA001", @@ -115,38 +113,79 @@ ignore = [ "D415", # First line should end with a period # Introduces bugs "RUF001", "RUF002", "RUF005", + "ARG", + "DTZ001", # The use of `datetime.datetime()` without `tzinfo` is necessary # Personal preference - "D105", # Missing docstring in magic method - "D106", # Missing docstring in public nested class - "D107", # Missing docstring in `__init__` - "D203", # one-blank-line-before-class - "COM812", # yes, they make the diff smaller + "D406", # Section name should end with a newline ("Returns") + "D212", # I want multiline-docstrings to start at the second line + "D407", # google-style docstrings don't have dashses + "N806", # Variable `NO` in function should be lowercase + "N814", # Camelcase `PageAttributes` imported as constant `PG` + "N817", # CamelCase `PagesAttributes` imported as acronym `PA` "ANN101", # annotating 'self' seems weird (at least before 3.11) "ANN102", # Missing type annotation for `cls` in classmethod "ANN204", # Missing return type annotation for special method `__init__` "ANN401", # Dynamically typed expressions (typing.Any) are disallowed + "BLE", # we want to capture Exception sometimes + "COM812", # yes, they make the diff smaller + "D105", # Missing docstring in magic method + "D106", # Missing docstring in public nested class + "D107", # Missing docstring in `__init__` + "D203", # one-blank-line-before-class + "EM", # exception messages + "G004", # f-string in logging statement + "RET", + "S110", # `try`-`except`-`pass` detected, consider logging the exception + "SIM105", # contextlib.suppress + "SIM108", # don't enforce ternary operators + "SIM300", # yoda conditions + "TID252", # we want relative imports + "TRY", # I don't know what this is about # As long as we are not on Python 3.9+ "UP035", # PEP 585 + # As long as we are not on Python 3.10+ + "UP038", # Use `X | Y` in `isinstance` call instead of `(X, Y)` - PEP 604 # As long as we are not on Python 3.11+ "UP006", "UP007", # for the moment, fix it later: - "D417", # Missing argument descriptions in the docstring + "A", # Variable is shadowing a built-in + "B904", # Within an `except` clause, raise exceptions with + "B905", # `zip()` without an explicit `strict=` parameter + "C901", "D101", # Missing docstring in public class "D102", # Missing docstring in public method "D103", # Missing docstring in public function - "D100", # Missing docstring in public module - "SLF", - "PTH", "PLR", "FBT", "PT", "N", "ARG", "TRY", "S", "EM", "RET", "TID", - "C901", "PGH", "DTZ", "TCH", "B", "G", "BLE", "SIM", "E", "INP", "A", "RUF", - "PLW", "PLE" + "D417", # Missing argument descriptions in the docstring + "FBT001", # Boolean positional arg in function definition + "FBT002", # Boolean default value in function definition + "FBT003", # Boolean positional value in function call + "PGH", # Use specific error messages + "PLE", # too many arguments for logging + "PLR0911", # Too many return statements + "PLR0912", # Too many branches + "PLR0913", # Too many arguments to function call + "PLR0915", # Too many statements + "PLR2004", # Magic value + "PLW", # global variables + "PT011", # `pytest.raises(ValueError)` is too broad, set the `match` + "PT012", # `pytest.raises()` block should contain a single simple statement + "PTH123", # `open()` should be replaced by `Path.open()` + "S101", # Use of `assert` detected + "SLF001", # Private member accessed ] [tool.ruff.per-file-ignores] -"tests/*" = ["S101", "ANN001", "ANN201","D104"] +"tests/*" = ["S101", "ANN001", "ANN201","D104", "S105", "S106"] +"sample-files/*" = ["D100", "INP001"] +"_encryption.py" = ["S324"] +"_security.py" = ["S324"] +"_writer.py" = ["S324"] "make_changelog.py" = ["T201"] "json_consistency.py" = ["T201"] "tests/test_workflows.py" = ["T201"] -"sample-files/*" = ["D100"] +"docs/conf.py" = ["PTH", "INP001"] +# We first need to deprecate old stuff: +"pypdf/*" = ["N802", "N803"] [tool.docformatter] pre-summary-newline = true diff --git a/requirements/ci-3.11.txt b/requirements/ci-3.11.txt index 96f43fec1e..94f4832ab4 100644 --- a/requirements/ci-3.11.txt +++ b/requirements/ci-3.11.txt @@ -1,6 +1,6 @@ # -# This file is autogenerated by pip-compile with python 3.11 -# To update, run: +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: # # pip-compile --output-file=requirements/ci-3.11.txt requirements/ci.in # @@ -54,9 +54,12 @@ pytest==7.2.0 # via # -r requirements/ci.in # pytest-benchmark + # pytest-socket pytest-benchmark==4.0.0 # via -r requirements/ci.in -ruff==0.0.252 +pytest-socket==0.6.0 + # via -r requirements/ci.in +ruff==0.0.254 # via -r requirements/ci.in typeguard==2.13.3 # via -r requirements/ci.in diff --git a/requirements/ci.in b/requirements/ci.in index 40ecd20713..1d41d32028 100644 --- a/requirements/ci.in +++ b/requirements/ci.in @@ -8,6 +8,7 @@ pillow pycryptodome pytest pytest-benchmark +pytest-socket typeguard types-dataclasses types-Pillow diff --git a/requirements/ci.txt b/requirements/ci.txt index 08b0fe670e..a7a12e49ab 100644 --- a/requirements/ci.txt +++ b/requirements/ci.txt @@ -61,8 +61,11 @@ pytest==7.0.1 # via # -r requirements/ci.in # pytest-benchmark + # pytest-socket pytest-benchmark==3.4.1 # via -r requirements/ci.in +pytest-socket==0.4.1 + # via -r requirements/ci.in six==1.16.0 # via flake8-print tomli==1.2.3 diff --git a/sample-files b/sample-files index 372294b066..fb7a080b35 160000 --- a/sample-files +++ b/sample-files @@ -1 +1 @@ -Subproject commit 372294b066cd3fbb4fb12fd9000ef935a2a86fcf +Subproject commit fb7a080b35b3553bd10221282beeda7847959e83 diff --git a/tests/__init__.py b/tests/__init__.py index b601d5fb35..304a60c107 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,6 +1,6 @@ -import os import ssl import urllib.request +from pathlib import Path from typing import List from urllib.error import HTTPError @@ -25,17 +25,17 @@ def get_pdf_from_url(url: str, name: str) -> bytes: if url.startswith("file://"): with open(url[7:].replace("\\", "/"), "rb") as fp: return fp.read() - cache_dir = os.path.join(os.path.dirname(__file__), "pdf_cache") - if not os.path.exists(cache_dir): - os.mkdir(cache_dir) - cache_path = os.path.join(cache_dir, name) - if not os.path.exists(cache_path): + cache_dir = Path(__file__).parent / "pdf_cache" + if not cache_dir.exists(): + cache_dir.mkdir() + cache_path = cache_dir / name + if not cache_path.exists(): ssl._create_default_https_context = ssl._create_unverified_context cpt = 3 while cpt > 0: try: - with urllib.request.urlopen(url) as response, open( - cache_path, "wb" + with urllib.request.urlopen(url) as response, cache_path.open( + "wb" ) as out_file: out_file.write(response.read()) cpt = 0 diff --git a/tests/bench.py b/tests/bench.py index 64a9525169..e98fd57960 100644 --- a/tests/bench.py +++ b/tests/bench.py @@ -1,3 +1,9 @@ +""" +Benchmark the speed of pypdf. + +The results are on https://py-pdf.github.io/pypdf/dev/bench/ +Please keep in mind that the variance is high. +""" from io import BytesIO from pathlib import Path diff --git a/tests/test_cmap.py b/tests/test_cmap.py index 38f745413e..438ca1f403 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -1,3 +1,4 @@ +"""Test the pypdf_cmap module.""" from io import BytesIO import pytest @@ -9,8 +10,8 @@ from . import get_pdf_from_url -@pytest.mark.external -@pytest.mark.slow +@pytest.mark.enable_socket() +@pytest.mark.slow() def test_compute_space_width(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/923/923406.pdf" name = "tika-923406.pdf" @@ -20,8 +21,8 @@ def test_compute_space_width(): page.extract_text() -@pytest.mark.external -@pytest.mark.slow +@pytest.mark.enable_socket() +@pytest.mark.slow() def test_parse_to_unicode_process_rg(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/959/959173.pdf" name = "tika-959173.pdf" @@ -35,7 +36,7 @@ def test_parse_to_unicode_process_rg(): page.extract_text() -@pytest.mark.external +@pytest.mark.enable_socket() def test_parse_encoding_advanced_encoding_not_implemented(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/957/957144.pdf" name = "tika-957144.pdf" @@ -46,7 +47,7 @@ def test_parse_encoding_advanced_encoding_not_implemented(): page.extract_text() -@pytest.mark.external +@pytest.mark.enable_socket() def test_get_font_width_from_default(): # L40 url = "https://corpora.tika.apache.org/base/docs/govdocs1/908/908104.pdf" name = "tika-908104.pdf" @@ -55,7 +56,7 @@ def test_get_font_width_from_default(): # L40 page.extract_text() -@pytest.mark.external +@pytest.mark.enable_socket() def test_multiline_bfrange(): # non regression test for iss_1285 url = ( @@ -76,7 +77,7 @@ def test_multiline_bfrange(): page.extract_text() -@pytest.mark.external +@pytest.mark.enable_socket() def test_bfchar_on_2_chars(): # iss #1293 url = ( @@ -89,7 +90,7 @@ def test_bfchar_on_2_chars(): page.extract_text() -@pytest.mark.external +@pytest.mark.enable_socket() def test_ascii_charset(): # iss #1312 url = "https://github.com/py-pdf/pypdf/files/9472500/main.pdf" @@ -98,7 +99,7 @@ def test_ascii_charset(): assert "/a" not in reader.pages[0].extract_text() -@pytest.mark.external +@pytest.mark.enable_socket() def test_iss1370(): url = "https://github.com/py-pdf/pypdf/files/9667138/cmap1370.pdf" name = "cmap1370.pdf" @@ -106,7 +107,7 @@ def test_iss1370(): reader.pages[0].extract_text() -@pytest.mark.external +@pytest.mark.enable_socket() def test_iss1379(): url = "https://github.com/py-pdf/pypdf/files/9712729/02voc.pdf" name = "02voc.pdf" @@ -114,7 +115,7 @@ def test_iss1379(): reader.pages[2].extract_text() -@pytest.mark.external +@pytest.mark.enable_socket() def test_iss1533(): url = "https://github.com/py-pdf/pypdf/files/10376149/iss1533.pdf" name = "iss1533.pdf" diff --git a/tests/test_constants.py b/tests/test_constants.py index 03adf3f798..ab3166f7c0 100644 --- a/tests/test_constants.py +++ b/tests/test_constants.py @@ -1,3 +1,4 @@ +"""Test the pypdf.constants module.""" import re from typing import Callable diff --git a/tests/test_encryption.py b/tests/test_encryption.py index 68b5b64d82..d5c1216eef 100644 --- a/tests/test_encryption.py +++ b/tests/test_encryption.py @@ -1,3 +1,4 @@ +"""Test the pypdf.encryption module.""" from pathlib import Path import pytest diff --git a/tests/test_filters.py b/tests/test_filters.py index ab0505b37c..1d2dbd1909 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -1,3 +1,4 @@ +"""Test the pypdf.filters module.""" import string import sys from io import BytesIO @@ -34,7 +35,7 @@ @pytest.mark.parametrize( ("predictor", "s"), list(cartesian_product([1], filter_inputs)) ) -def test_FlateDecode(predictor, s): +def test_flatedecode(predictor, s): """Tests FlateDecode decode() and encode() methods.""" codec = FlateDecode() s = s.encode() @@ -42,7 +43,7 @@ def test_FlateDecode(predictor, s): assert codec.decode(encoded, DictionaryObject({"/Predictor": predictor})) == s -def test_FlateDecode_unsupported_predictor(): +def test_flatedecode_unsupported_predictor(): """ Inputs an unsupported predictor (outside the [10, 15] range) checking that PdfReadError() is raised. @@ -62,7 +63,7 @@ def test_FlateDecode_unsupported_predictor(): @pytest.mark.parametrize( "params", [ArrayObject([]), ArrayObject([{"/Predictor": 1}]), "a"] ) -def test_FlateDecode_decompress_array_params(params): +def test_flatedecode_decompress_array_params(params): codec = FlateDecode() s = "" s = s.encode() @@ -105,7 +106,7 @@ def test_FlateDecode_decompress_array_params(params): "whitespace", ], ) -def test_ASCIIHexDecode(data, expected): +def test_ascii_hex_decode(data, expected): """ Feeds a bunch of values to ASCIIHexDecode.decode() and ensures the correct output is returned. @@ -117,7 +118,7 @@ def test_ASCIIHexDecode(data, expected): assert ASCIIHexDecode.decode(data) == expected -def test_ASCIIHexDecode_no_eod(): +def test_ascii_hex_decode_no_eod(): """Ensuring an exception is raised when no EOD character is present.""" with pytest.raises(PdfStreamError) as exc: ASCIIHexDecode.decode("") @@ -125,7 +126,7 @@ def test_ASCIIHexDecode_no_eod(): @pytest.mark.xfail() -def test_ASCII85Decode_with_overflow(): +def test_ascii85decode_with_overflow(): inputs = ( v + "~>" for v in "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0e\x0f" @@ -143,7 +144,7 @@ def test_ASCII85Decode_with_overflow(): assert exc.value.args[0] == "" -def test_ASCII85Decode_five_zero_bytes(): +def test_ascii85decode_five_zero_bytes(): """ From ISO 32000 (2008) §7.4.3: @@ -164,7 +165,7 @@ def test_ASCII85Decode_five_zero_bytes(): assert ASCII85Decode.decode(i) == expected -def test_CCITParameters(): +def test_ccitparameters(): parms = CCITParameters() assert parms.K == 0 # zero is the default according to page 78 assert parms.group == 3 @@ -177,12 +178,12 @@ def test_CCITParameters(): (ArrayObject([{"/K": 1}, {"/Columns": 13}]), 1), ], ) -def test_CCIT_get_parameters(parameters, expected_k): +def test_ccitt_get_parameters(parameters, expected_k): parmeters = CCITTFaxDecode._get_parameters(parameters=parameters, rows=0) assert parmeters.K == expected_k -def test_CCITTFaxDecode(): +def test_ccitt_fax_decode(): data = b"" parameters = DictionaryObject( {"/K": NumberObject(-1), "/Columns": NumberObject(17)} @@ -201,7 +202,7 @@ def test_CCITTFaxDecode(): ) -@pytest.mark.external +@pytest.mark.enable_socket() @patch("pypdf._reader.logger_warning") def test_decompress_zlib_error(mock_logger_warning): url = "https://corpora.tika.apache.org/base/docs/govdocs1/952/952445.pdf" @@ -214,18 +215,18 @@ def test_decompress_zlib_error(mock_logger_warning): ) -@pytest.mark.external +@pytest.mark.enable_socket() def test_lzw_decode_neg1(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/921/921632.pdf" name = "tika-921632.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + page = reader.pages[47] with pytest.raises(PdfReadError) as exc: - for page in reader.pages: - page.extract_text() + page.extract_text() assert exc.value.args[0] == "Missed the stop code in LZWDecode!" -@pytest.mark.external +@pytest.mark.enable_socket() def test_issue_399(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/976/976970.pdf" name = "tika-976970.pdf" @@ -233,7 +234,7 @@ def test_issue_399(): reader.pages[1].extract_text() -@pytest.mark.external +@pytest.mark.enable_socket() def test_image_without_imagemagic(): with patch.dict(sys.modules): sys.modules["PIL"] = None diff --git a/tests/test_generic.py b/tests/test_generic.py index 4e73769ed7..52b656ca71 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -1,4 +1,4 @@ -import os +"""Test the pypdf.generic module.""" from io import BytesIO from pathlib import Path from unittest.mock import patch @@ -132,53 +132,53 @@ def test_indirect_object_premature(value): assert exc.value.args[0] == "Stream has ended unexpectedly" -def test_readHexStringFromStream(): +def test_read_hex_string_from_stream(): stream = BytesIO(b"a1>") assert read_hex_string_from_stream(stream) == "\x10" -def test_readHexStringFromStream_exception(): +def test_read_hex_string_from_stream_exception(): stream = BytesIO(b"") with pytest.raises(PdfStreamError) as exc: read_hex_string_from_stream(stream) assert exc.value.args[0] == "Stream has ended unexpectedly" -def test_readStringFromStream_exception(): +def test_read_string_from_stream_exception(): stream = BytesIO(b"x") with pytest.raises(PdfStreamError) as exc: read_string_from_stream(stream) assert exc.value.args[0] == "Stream has ended unexpectedly" -def test_readStringFromStream_not_in_escapedict_no_digit(): +def test_read_string_from_stream_not_in_escapedict_no_digit(): stream = BytesIO(b"x\\y") with pytest.raises(PdfReadError) as exc: read_string_from_stream(stream) assert exc.value.args[0] == "Stream has ended unexpectedly" -def test_readStringFromStream_multichar_eol(): +def test_read_string_from_stream_multichar_eol(): stream = BytesIO(b"x\\\n )") assert read_string_from_stream(stream) == " " -def test_readStringFromStream_multichar_eol2(): +def test_read_string_from_stream_multichar_eol2(): stream = BytesIO(b"x\\\n\n)") assert read_string_from_stream(stream) == "" -def test_readStringFromStream_excape_digit(): +def test_read_string_from_stream_excape_digit(): stream = BytesIO(b"x\\1a )") assert read_string_from_stream(stream) == "\x01a " -def test_readStringFromStream_excape_digit2(): +def test_read_string_from_stream_excape_digit2(): stream = BytesIO(b"(hello \\1\\2\\3\\4)") assert read_string_from_stream(stream) == "hello \x01\x02\x03\x04" -def test_NameObject(caplog): +def test_name_object(caplog): stream = BytesIO(b"x") with pytest.raises(PdfReadError) as exc: NameObject.read_from_stream(stream, None) @@ -299,7 +299,7 @@ def test_read_object_comment(): assert out == 1 -def test_ByteStringObject(): +def test_bytestringobject(): bo = ByteStringObject("stream", encoding="utf-8") stream = BytesIO(b"") bo.write_to_stream(stream, encryption_key="foobar") @@ -307,52 +307,52 @@ def test_ByteStringObject(): assert stream.read() == b"<1cdd628b972e>" # TODO: how can we verify this? -def test_DictionaryObject_key_is_no_pdfobject(): +def test_dictionaryobject_key_is_no_pdfobject(): do = DictionaryObject({NameObject("/S"): NameObject("/GoTo")}) with pytest.raises(ValueError) as exc: do["foo"] = NameObject("/GoTo") assert exc.value.args[0] == "key must be PdfObject" -def test_DictionaryObject_xmp_meta(): +def test_dictionaryobject_xmp_meta(): do = DictionaryObject({NameObject("/S"): NameObject("/GoTo")}) assert do.xmp_metadata is None -def test_DictionaryObject_value_is_no_pdfobject(): +def test_dictionaryobject_value_is_no_pdfobject(): do = DictionaryObject({NameObject("/S"): NameObject("/GoTo")}) with pytest.raises(ValueError) as exc: do[NameObject("/S")] = "/GoTo" assert exc.value.args[0] == "value must be PdfObject" -def test_DictionaryObject_setdefault_key_is_no_pdfobject(): +def test_dictionaryobject_setdefault_key_is_no_pdfobject(): do = DictionaryObject({NameObject("/S"): NameObject("/GoTo")}) with pytest.raises(ValueError) as exc: do.setdefault("foo", NameObject("/GoTo")) assert exc.value.args[0] == "key must be PdfObject" -def test_DictionaryObject_setdefault_value_is_no_pdfobject(): +def test_dictionaryobject_setdefault_value_is_no_pdfobject(): do = DictionaryObject({NameObject("/S"): NameObject("/GoTo")}) with pytest.raises(ValueError) as exc: do.setdefault(NameObject("/S"), "/GoTo") assert exc.value.args[0] == "value must be PdfObject" -def test_DictionaryObject_setdefault_value(): +def test_dictionaryobject_setdefault_value(): do = DictionaryObject({NameObject("/S"): NameObject("/GoTo")}) do.setdefault(NameObject("/S"), NameObject("/GoTo")) -def test_DictionaryObject_read_from_stream(): +def test_dictionaryobject_read_from_stream(): stream = BytesIO(b"<< /S /GoTo >>") pdf = None out = DictionaryObject.read_from_stream(stream, pdf) assert out.get_object() == {NameObject("/S"): NameObject("/GoTo")} -def test_DictionaryObject_read_from_stream_broken(): +def test_dictionaryobject_read_from_stream_broken(): stream = BytesIO(b"< /S /GoTo >>") pdf = None with pytest.raises(PdfReadError) as exc: @@ -363,7 +363,7 @@ def test_DictionaryObject_read_from_stream_broken(): ) -def test_DictionaryObject_read_from_stream_unexpected_end(): +def test_dictionaryobject_read_from_stream_unexpected_end(): stream = BytesIO(b"<< \x00/S /GoTo") pdf = None with pytest.raises(PdfStreamError) as exc: @@ -371,7 +371,7 @@ def test_DictionaryObject_read_from_stream_unexpected_end(): assert exc.value.args[0] == "Stream has ended unexpectedly" -def test_DictionaryObject_read_from_stream_stream_no_newline(): +def test_dictionaryobject_read_from_stream_stream_no_newline(): stream = BytesIO(b"<< /S /GoTo >>stream") pdf = None with pytest.raises(PdfReadError) as exc: @@ -380,7 +380,7 @@ def test_DictionaryObject_read_from_stream_stream_no_newline(): @pytest.mark.parametrize(("strict"), [(True), (False)]) -def test_DictionaryObject_read_from_stream_stream_no_stream_length(strict): +def test_dictionaryobject_read_from_stream_stream_no_stream_length(strict): stream = BytesIO(b"<< /S /GoTo >>stream\n") class Tst: # to replace pdf @@ -403,7 +403,7 @@ class Tst: # to replace pdf (False, 10, False), ], ) -def test_DictionaryObject_read_from_stream_stream_stream_valid( +def test_dictionaryobject_read_from_stream_stream_stream_valid( strict, length, should_fail ): stream = BytesIO(b"<< /S /GoTo /Length %d >>stream\nBT /F1\nendstream\n" % length) @@ -423,7 +423,7 @@ class Tst: # to replace pdf assert should_fail ^ (exc.value.args[0] == "__ALLGOOD__") -def test_RectangleObject(): +def test_rectangleobject(): ro = RectangleObject((1, 2, 3, 4)) assert ro.lower_left == (1, 2) assert ro.lower_right == (3, 2) @@ -450,14 +450,14 @@ def test_RectangleObject(): assert ro.upper_right == (14, 18) -def test_TextStringObject_exc(): +def test_textstringobject_exc(): tso = TextStringObject("foo") with pytest.raises(Exception) as exc: tso.get_original_bytes() assert exc.value.args[0] == "no information about original bytes" -def test_TextStringObject_autodetect_utf16(): +def test_textstringobject_autodetect_utf16(): tso = TextStringObject("foo") tso.autodetect_utf16 = True assert tso.get_original_bytes() == b"\xfe\xff\x00f\x00o\x00o" @@ -583,7 +583,7 @@ def test_remove_child_in_tree(): tree.empty_tree() -@pytest.mark.external +@pytest.mark.enable_socket() def test_dict_read_from_stream(caplog): url = "https://corpora.tika.apache.org/base/docs/govdocs1/984/984877.pdf" name = "tika-984877.pdf" @@ -597,7 +597,7 @@ def test_dict_read_from_stream(caplog): ) -@pytest.mark.external +@pytest.mark.enable_socket() def test_parse_content_stream_peek_percentage(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/985/985770.pdf" name = "tika-985770.pdf" @@ -607,7 +607,7 @@ def test_parse_content_stream_peek_percentage(): page.extract_text() -@pytest.mark.external +@pytest.mark.enable_socket() def test_read_inline_image_no_has_q(): # pdf/df7e1add3156af17a372bc165e47a244.pdf url = "https://corpora.tika.apache.org/base/docs/govdocs1/998/998719.pdf" @@ -618,7 +618,7 @@ def test_read_inline_image_no_has_q(): page.extract_text() -@pytest.mark.external +@pytest.mark.enable_socket() def test_read_inline_image_loc_neg_1(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/935/935066.pdf" name = "tika-935066.pdf" @@ -628,8 +628,8 @@ def test_read_inline_image_loc_neg_1(): page.extract_text() -@pytest.mark.slow -@pytest.mark.external +@pytest.mark.slow() +@pytest.mark.enable_socket() def test_text_string_write_to_stream(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924562.pdf" name = "tika-924562.pdf" @@ -639,7 +639,7 @@ def test_text_string_write_to_stream(): page.compress_content_streams() -@pytest.mark.external +@pytest.mark.enable_socket() def test_name_object_read_from_stream_unicode_error(): # L588 url = "https://corpora.tika.apache.org/base/docs/govdocs1/974/974966.pdf" name = "tika-974966.pdf" @@ -649,7 +649,7 @@ def test_name_object_read_from_stream_unicode_error(): # L588 page.extract_text() -@pytest.mark.external +@pytest.mark.enable_socket() def test_bool_repr(tmp_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/932/932449.pdf" name = "tika-932449.pdf" @@ -669,7 +669,7 @@ def test_bool_repr(tmp_path): ) -@pytest.mark.external +@pytest.mark.enable_socket() @patch("pypdf._reader.logger_warning") def test_issue_997(mock_logger_warning): url = ( @@ -700,7 +700,7 @@ def test_issue_997(mock_logger_warning): merger.close() # cleanup - os.remove(merged_filename) + Path(merged_filename).unlink() def test_annotation_builder_free_text(): @@ -743,7 +743,7 @@ def test_annotation_builder_free_text(): with open(target, "wb") as fp: writer.write(fp) - os.remove(target) # comment this out for manual inspection + Path(target).unlink() # comment this out for manual inspection def test_annotation_builder_polygon(): @@ -771,7 +771,7 @@ def test_annotation_builder_polygon(): with open(target, "wb") as fp: writer.write(fp) - os.remove(target) # comment this out for manual inspection + Path(target).unlink() # comment this out for manual inspection def test_annotation_builder_line(): @@ -796,7 +796,7 @@ def test_annotation_builder_line(): with open(target, "wb") as fp: writer.write(fp) - os.remove(target) # comment this out for manual inspection + Path(target).unlink() # comment this out for manual inspection def test_annotation_builder_square(): @@ -823,7 +823,7 @@ def test_annotation_builder_square(): with open(target, "wb") as fp: writer.write(fp) - os.remove(target) # comment this out for manual inspection + Path(target).unlink() # comment this out for manual inspection def test_annotation_builder_circle(): @@ -851,7 +851,7 @@ def test_annotation_builder_circle(): with open(target, "wb") as fp: writer.write(fp) - os.remove(target) # comment this out for manual inspection + Path(target).unlink() # comment this out for manual inspection def test_annotation_builder_link(): @@ -909,7 +909,7 @@ def test_annotation_builder_link(): with open(target, "wb") as fp: writer.write(fp) - os.remove(target) # comment this out for manual inspection + Path(target).unlink() # comment this out for manual inspection def test_annotation_builder_text(): @@ -933,10 +933,10 @@ def test_annotation_builder_text(): with open(target, "wb") as fp: writer.write(fp) - os.remove(target) # comment this out for manual inspection + Path(target).unlink() # comment this out for manual inspection -def test_CheckboxRadioButtonAttributes_opt(): +def test_checkboxradiobuttonattributes_opt(): assert "/Opt" in CheckboxRadioButtonAttributes.attributes_dict() @@ -1046,7 +1046,7 @@ def test_cloning(caplog): assert isinstance(obj21.get("/Test2"), IndirectObject) -@pytest.mark.external +@pytest.mark.enable_socket() def test_append_with_indirectobject_not_pointing(caplog): """ reported in #1631 @@ -1059,3 +1059,29 @@ def test_append_with_indirectobject_not_pointing(caplog): writer = PdfWriter() writer.append(reader) assert "Object 43 0 not defined." in caplog.text + + +@pytest.mark.enable_socket() +def test_iss1615_1673(): + """ + test cases where /N is not indicating chains of objects + test also where /N,... are not part of chains + """ + # #1615 + url = "https://github.com/py-pdf/pypdf/files/10671366/graph_letter.pdf" + name = "graph_letter.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + writer = PdfWriter() + writer.append(reader) + assert ( + "/N" + in writer.pages[0]["/Annots"][0] + .get_object()["/AP"]["/N"]["/Resources"]["/ColorSpace"]["/Cs1"][1] + .get_object() + ) + # #1673 + url = "https://github.com/py-pdf/pypdf/files/10848750/budgeting-loan-form-sf500.pdf" + name = "budgeting-loan-form-sf500.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + writer = PdfWriter() + writer.clone_document_from_reader(reader) diff --git a/tests/test_javascript.py b/tests/test_javascript.py index a438ace745..37edb218a5 100644 --- a/tests/test_javascript.py +++ b/tests/test_javascript.py @@ -1,3 +1,4 @@ +"""Test topics around the usage of JavaScript in PDF documents.""" from pathlib import Path from typing import Any diff --git a/tests/test_merger.py b/tests/test_merger.py index d8cd9573eb..cdebead8f5 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -1,4 +1,4 @@ -import os +"""Test the pypdf._merger module.""" import sys from io import BytesIO from pathlib import Path @@ -191,7 +191,7 @@ def test_merger_operations_by_semi_traditional_usage(tmp_path): merger.write(path) # Act # Assert - assert os.path.isfile(path) + assert Path(path).is_file() check_outline(path) @@ -203,7 +203,7 @@ def test_merger_operations_by_semi_traditional_usage_with_writer(tmp_path): merger.write(path) # Act # Assert - assert os.path.isfile(path) + assert Path(path).is_file() check_outline(path) @@ -212,7 +212,7 @@ def test_merger_operation_by_new_usage(tmp_path): with PdfMerger(fileobj=path) as merger: merger_operate(merger) # Assert - assert os.path.isfile(path) + assert Path(path).is_file() check_outline(path) @@ -222,7 +222,7 @@ def test_merger_operation_by_new_usage_with_writer(tmp_path): merger_operate(merger) # Assert - assert os.path.isfile(path) + assert Path(path).is_file() check_outline(path) @@ -310,10 +310,10 @@ def test_merge_write_closed_fh_with_writer(): merger.set_page_mode("/UseNone") merger.add_outline_item("An outline item", 0) - os.unlink("stream1.pdf") + Path("stream1.pdf").unlink() -@pytest.mark.external +@pytest.mark.enable_socket() def test_trim_outline_list(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/995/995175.pdf" name = "tika-995175.pdf" @@ -324,10 +324,10 @@ def test_trim_outline_list(): merger.close() # cleanup - os.remove("tmp-merger-do-not-commit.pdf") + Path("tmp-merger-do-not-commit.pdf").unlink() -@pytest.mark.external +@pytest.mark.enable_socket() def test_trim_outline_list_with_writer(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/995/995175.pdf" name = "tika-995175.pdf" @@ -338,10 +338,10 @@ def test_trim_outline_list_with_writer(): merger.close() # cleanup - os.remove("tmp-merger-do-not-commit.pdf") + Path("tmp-merger-do-not-commit.pdf").unlink() -@pytest.mark.external +@pytest.mark.enable_socket() def test_zoom(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/994/994759.pdf" name = "tika-994759.pdf" @@ -352,10 +352,10 @@ def test_zoom(): merger.close() # cleanup - os.remove("tmp-merger-do-not-commit.pdf") + Path("tmp-merger-do-not-commit.pdf").unlink() -@pytest.mark.external +@pytest.mark.enable_socket() def test_zoom_with_writer(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/994/994759.pdf" name = "tika-994759.pdf" @@ -366,10 +366,10 @@ def test_zoom_with_writer(): merger.close() # cleanup - os.remove("tmp-merger-do-not-commit.pdf") + Path("tmp-merger-do-not-commit.pdf").unlink() -@pytest.mark.external +@pytest.mark.enable_socket() def test_zoom_xyz_no_left(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/933/933322.pdf" name = "tika-933322.pdf" @@ -380,10 +380,10 @@ def test_zoom_xyz_no_left(): merger.close() # cleanup - os.remove("tmp-merger-do-not-commit.pdf") + Path("tmp-merger-do-not-commit.pdf").unlink() -@pytest.mark.external +@pytest.mark.enable_socket() def test_zoom_xyz_no_left_with_writer(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/933/933322.pdf" name = "tika-933322.pdf" @@ -394,10 +394,10 @@ def test_zoom_xyz_no_left_with_writer(): merger.close() # cleanup - os.remove("tmp-merger-do-not-commit.pdf") + Path("tmp-merger-do-not-commit.pdf").unlink() -@pytest.mark.external +@pytest.mark.enable_socket() def test_outline_item(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/997/997511.pdf" name = "tika-997511.pdf" @@ -408,11 +408,11 @@ def test_outline_item(): merger.close() # cleanup - os.remove("tmp-merger-do-not-commit.pdf") + Path("tmp-merger-do-not-commit.pdf").unlink() -@pytest.mark.external -@pytest.mark.slow +@pytest.mark.enable_socket() +@pytest.mark.slow() def test_outline_item_with_writer(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/997/997511.pdf" name = "tika-997511.pdf" @@ -423,11 +423,11 @@ def test_outline_item_with_writer(): merger.close() # cleanup - os.remove("tmp-merger-do-not-commit.pdf") + Path("tmp-merger-do-not-commit.pdf").unlink() -@pytest.mark.external -@pytest.mark.slow +@pytest.mark.enable_socket() +@pytest.mark.slow() def test_trim_outline(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/982/982336.pdf" name = "tika-982336.pdf" @@ -438,11 +438,11 @@ def test_trim_outline(): merger.close() # cleanup - os.remove("tmp-merger-do-not-commit.pdf") + Path("tmp-merger-do-not-commit.pdf").unlink() -@pytest.mark.external -@pytest.mark.slow +@pytest.mark.enable_socket() +@pytest.mark.slow() def test_trim_outline_with_writer(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/982/982336.pdf" name = "tika-982336.pdf" @@ -453,11 +453,11 @@ def test_trim_outline_with_writer(): merger.close() # cleanup - os.remove("tmp-merger-do-not-commit.pdf") + Path("tmp-merger-do-not-commit.pdf").unlink() -@pytest.mark.external -@pytest.mark.slow +@pytest.mark.enable_socket() +@pytest.mark.slow() def test1(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/923/923621.pdf" name = "tika-923621.pdf" @@ -468,11 +468,11 @@ def test1(): merger.close() # cleanup - os.remove("tmp-merger-do-not-commit.pdf") + Path("tmp-merger-do-not-commit.pdf").unlink() -@pytest.mark.external -@pytest.mark.slow +@pytest.mark.enable_socket() +@pytest.mark.slow() def test1_with_writer(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/923/923621.pdf" name = "tika-923621.pdf" @@ -483,11 +483,11 @@ def test1_with_writer(): merger.close() # cleanup - os.remove("tmp-merger-do-not-commit.pdf") + Path("tmp-merger-do-not-commit.pdf").unlink() -@pytest.mark.external -@pytest.mark.slow +@pytest.mark.enable_socket() +@pytest.mark.slow() def test_sweep_recursion1(): # TODO: This test looks like an infinite loop. url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf" @@ -502,11 +502,11 @@ def test_sweep_recursion1(): reader2.pages # cleanup - os.remove("tmp-merger-do-not-commit.pdf") + Path("tmp-merger-do-not-commit.pdf").unlink() -@pytest.mark.external -@pytest.mark.slow +@pytest.mark.enable_socket() +@pytest.mark.slow() def test_sweep_recursion1_with_writer(): # TODO: This test looks like an infinite loop. url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf" @@ -521,11 +521,11 @@ def test_sweep_recursion1_with_writer(): reader2.pages # cleanup - os.remove("tmp-merger-do-not-commit.pdf") + Path("tmp-merger-do-not-commit.pdf").unlink() -@pytest.mark.external -@pytest.mark.slow +@pytest.mark.enable_socket() +@pytest.mark.slow() @pytest.mark.parametrize( ("url", "name"), [ @@ -551,11 +551,11 @@ def test_sweep_recursion2(url, name): reader2.pages # cleanup - os.remove("tmp-merger-do-not-commit.pdf") + Path("tmp-merger-do-not-commit.pdf").unlink() -@pytest.mark.external -@pytest.mark.slow +@pytest.mark.enable_socket() +@pytest.mark.slow() @pytest.mark.parametrize( ("url", "name"), [ @@ -581,10 +581,10 @@ def test_sweep_recursion2_with_writer(url, name): reader2.pages # cleanup - os.remove("tmp-merger-do-not-commit.pdf") + Path("tmp-merger-do-not-commit.pdf").unlink() -@pytest.mark.external +@pytest.mark.enable_socket() def test_sweep_indirect_list_newobj_is_none(caplog): url = "https://corpora.tika.apache.org/base/docs/govdocs1/906/906769.pdf" name = "tika-906769.pdf" @@ -599,10 +599,10 @@ def test_sweep_indirect_list_newobj_is_none(caplog): reader2.pages # cleanup - os.remove("tmp-merger-do-not-commit.pdf") + Path("tmp-merger-do-not-commit.pdf").unlink() -@pytest.mark.external +@pytest.mark.enable_socket() def test_sweep_indirect_list_newobj_is_none_with_writer(caplog): url = "https://corpora.tika.apache.org/base/docs/govdocs1/906/906769.pdf" name = "tika-906769.pdf" @@ -617,10 +617,10 @@ def test_sweep_indirect_list_newobj_is_none_with_writer(caplog): reader2.pages # cleanup - os.remove("tmp-merger-do-not-commit.pdf") + Path("tmp-merger-do-not-commit.pdf").unlink() -@pytest.mark.external +@pytest.mark.enable_socket() def test_iss1145(): # issue with FitH destination with null param url = "https://github.com/py-pdf/pypdf/files/9164743/file-0.pdf" @@ -630,7 +630,7 @@ def test_iss1145(): merger.close() -@pytest.mark.external +@pytest.mark.enable_socket() def test_iss1145_with_writer(): # issue with FitH destination with null param url = "https://github.com/py-pdf/pypdf/files/9164743/file-0.pdf" @@ -680,7 +680,7 @@ def test_deprecation_bookmark_decorator_output_with_writer(): merger.merge(0, reader, import_bookmarks=True) -@pytest.mark.external +@pytest.mark.enable_socket() def test_iss1344(caplog): url = "https://github.com/py-pdf/pypdf/files/9549001/input.pdf" name = "iss1344.pdf" @@ -695,7 +695,7 @@ def test_iss1344(caplog): assert r.threads is None -@pytest.mark.external +@pytest.mark.enable_socket() def test_iss1344_with_writer(caplog): url = "https://github.com/py-pdf/pypdf/files/9549001/input.pdf" name = "iss1344.pdf" @@ -708,7 +708,7 @@ def test_iss1344_with_writer(caplog): assert "adresse où le malade peut être visité" in p.extract_text() -@pytest.mark.external +@pytest.mark.enable_socket() def test_articles_with_writer(caplog): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" name = "924666.pdf" diff --git a/tests/test_page.py b/tests/test_page.py index d0a4fb6227..4213a4b885 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1,5 +1,5 @@ +"""Test the pypdf._page module.""" import json -import os import random from copy import deepcopy from io import BytesIO @@ -33,7 +33,7 @@ def get_all_sample_files(): meta_file = SAMPLE_ROOT / "files.json" - if not os.path.isfile(meta_file): + if not Path(meta_file).is_file(): return {"data": []} with open(meta_file) as fp: data = fp.read() @@ -44,7 +44,7 @@ def get_all_sample_files(): all_files_meta = get_all_sample_files() -@pytest.mark.samples +@pytest.mark.samples() @pytest.mark.parametrize( "meta", [m for m in all_files_meta["data"] if not m["encrypted"]], @@ -61,8 +61,8 @@ def test_read(meta): assert len(reader.pages) == meta["pages"] -@pytest.mark.samples -@pytest.mark.external +@pytest.mark.samples() +@pytest.mark.enable_socket() @pytest.mark.parametrize( ("pdf_path", "password"), [ @@ -180,6 +180,11 @@ def test_transformation_equivalence2(): w.pages[0].merge_transformed_page( reader_comments.pages[0], Transformation().rotate(-15), True, True ) + nb_annots1 = len(w.pages[0]["/Annots"]) + w.pages[0].merge_transformed_page( + reader_comments.pages[0], Transformation().rotate(-30), True, True + ) + assert len(w.pages[0]["/Annots"]) == 2 * nb_annots1 # No special assert: Visual check the overlay has its comments at the good position @@ -191,14 +196,14 @@ def test_get_user_unit_property(): def compare_dict_objects(d1, d2): assert sorted(d1.keys()) == sorted(d2.keys()) - for k in d1.keys(): - if isinstance(d1[k], DictionaryObject): - compare_dict_objects(d1[k], d2[k]) + for key in d1: + if isinstance(d1[key], DictionaryObject): + compare_dict_objects(d1[key], d2[key]) else: - assert d1[k] == d2[k] + assert d1[key] == d2[key] -@pytest.mark.slow +@pytest.mark.slow() def test_page_transformations(): pdf_path = RESOURCE_ROOT / "crazyones.pdf" reader = PdfReader(pdf_path) @@ -273,12 +278,10 @@ def test_page_rotation(): # test transfer_rotate_to_content page.rotation -= 90 page.transfer_rotation_to_content() - assert ( - abs(float(page.mediabox.left) - 0) < 0.1 - and abs(float(page.mediabox.bottom) - 0) < 0.1 - and abs(float(page.mediabox.right) - 792) < 0.1 - and abs(float(page.mediabox.top) - 612) < 0.1 - ) + assert abs(float(page.mediabox.left) - 0) < 0.1 + assert abs(float(page.mediabox.bottom) - 0) < 0.1 + assert abs(float(page.mediabox.right) - 792) < 0.1 + assert abs(float(page.mediabox.top) - 612) < 0.1 def test_page_indirect_rotation(): @@ -330,7 +333,7 @@ def test_multi_language(): set_custom_rtl(-1, -1, []) # to prevent further errors -@pytest.mark.external +@pytest.mark.enable_socket() def test_extract_text_single_quote_op(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/964/964029.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name="tika-964029.pdf"))) @@ -338,7 +341,7 @@ def test_extract_text_single_quote_op(): page.extract_text() -@pytest.mark.external +@pytest.mark.enable_socket() def test_no_ressources_on_text_extract(): url = "https://github.com/py-pdf/pypdf/files/9428434/TelemetryTX_EM.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name="tika-964029.pdf"))) @@ -346,7 +349,7 @@ def test_no_ressources_on_text_extract(): page.extract_text() -@pytest.mark.external +@pytest.mark.enable_socket() def test_iss_1142(): # check fix for problem of context save/restore (q/Q) url = "https://github.com/py-pdf/pypdf/files/9150656/ST.2019.PDF" @@ -364,8 +367,8 @@ def test_iss_1142(): assert txt.find("郑州分公司") > 0 -@pytest.mark.external -@pytest.mark.slow +@pytest.mark.enable_socket() +@pytest.mark.slow() @pytest.mark.parametrize( ("url", "name"), [ @@ -397,8 +400,8 @@ def test_extract_text_page_pdf(url, name): page.extract_text() -@pytest.mark.external -@pytest.mark.slow +@pytest.mark.enable_socket() +@pytest.mark.slow() def test_extract_text_page_pdf_impossible_decode_xform(caplog): url = "https://corpora.tika.apache.org/base/docs/govdocs1/972/972962.pdf" name = "tika-972962.pdf" @@ -409,8 +412,8 @@ def test_extract_text_page_pdf_impossible_decode_xform(caplog): assert warn_msgs == [""] # text extraction recognise no text -@pytest.mark.external -@pytest.mark.slow +@pytest.mark.enable_socket() +@pytest.mark.slow() def test_extract_text_operator_t_star(): # L1266, L1267 url = "https://corpora.tika.apache.org/base/docs/govdocs1/967/967943.pdf" name = "tika-967943.pdf" @@ -682,19 +685,19 @@ def filter_first_table(r) -> bool: reader = PdfReader(RESOURCE_ROOT / "Sample_Td-matrix.pdf") page_td_model = reader.pages[0] # We store the translations of the Td-executions. - list_Td = [] + list_td = [] def visitor_td(op, args, cm, tm) -> None: if op == b"Td": - list_Td.append((tm[4], tm[5])) + list_td.append((tm[4], tm[5])) page_td_model.extract_text(visitor_operand_after=visitor_td) - assert len(list_Td) == 4 + assert len(list_td) == 4 # Check the translations of the four Td-executions. - assert list_Td[0] == (210.0, 110.0) - assert list_Td[1] == (410.0, 110.0) - assert list_Td[2] == (210.0, 210.0) - assert list_Td[3] == (410.0, 210.0) + assert list_td[0] == (210.0, 110.0) + assert list_td[1] == (410.0, 110.0) + assert list_td[2] == (210.0, 210.0) + assert list_td[3] == (410.0, 210.0) @pytest.mark.parametrize( @@ -852,10 +855,10 @@ def test_annotation_setter(): writer.write(fp) # Cleanup - os.remove(target) # remove for testing + Path(target).unlink() # remove for testing -@pytest.mark.external +@pytest.mark.enable_socket() @pytest.mark.xfail(reason="#1091") def test_text_extraction_issue_1091(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/966/966635.pdf" @@ -867,7 +870,7 @@ def test_text_extraction_issue_1091(): page.extract_text() -@pytest.mark.external +@pytest.mark.enable_socket() def test_empyt_password_1088(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/941/941536.pdf" name = "tika-941536.pdf" @@ -876,17 +879,17 @@ def test_empyt_password_1088(): len(reader.pages) -@pytest.mark.external +@pytest.mark.enable_socket() def test_old_habibi(): # this habibi has som multiple characters associated with the h reader = PdfReader(SAMPLE_ROOT / "015-arabic/habibi.pdf") txt = reader.pages[0].extract_text() # very odd file - assert ( - "habibi" in txt and "حَبيبي" in txt - ) # extract from acrobat reader "حَبيبي habibi􀀃􀏲􀎒􀏴􀎒􀎣􀋴 + # extract from acrobat reader "حَبيبي habibi􀀃􀏲􀎒􀏴􀎒􀎣􀋴 + assert "habibi" in txt + assert "حَبيبي" in txt -@pytest.mark.samples +@pytest.mark.samples() def test_read_link_annotation(): reader = PdfReader(SAMPLE_ROOT / "016-libre-office-link/libre-office-link.pdf") assert len(reader.pages[0].annotations) == 1 @@ -916,7 +919,7 @@ def test_read_link_annotation(): assert annot == expected -@pytest.mark.external +@pytest.mark.enable_socket() def test_no_resources(): url = "https://github.com/py-pdf/pypdf/files/9572045/108.pdf" name = "108.pdf" @@ -1089,7 +1092,7 @@ def test_merge_page_resources_smoke_test(): assert relevant_operations == expected_operations -@pytest.mark.external +@pytest.mark.enable_socket() def test_merge_transformed_page_into_blank(): url = "https://github.com/py-pdf/pypdf/files/10768334/badges_3vjrh_7LXDZ_1-1.pdf" name = "badges_3vjrh_7LXDZ_1.pdf" diff --git a/tests/test_page_labels.py b/tests/test_page_labels.py index c8d4fa5982..a0cc873551 100644 --- a/tests/test_page_labels.py +++ b/tests/test_page_labels.py @@ -1,3 +1,4 @@ +"""Test the pypdf._page_labels module.""" from io import BytesIO import pytest @@ -67,7 +68,7 @@ def test_number2uppercase_letter(): number2uppercase_letter(-1) -@pytest.mark.external +@pytest.mark.enable_socket() def test_index2label(caplog): url = "https://github.com/py-pdf/pypdf/files/10773829/waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf" name = "waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf" diff --git a/tests/test_pagerange.py b/tests/test_pagerange.py index 816d664624..a90c9cdbf2 100644 --- a/tests/test_pagerange.py +++ b/tests/test_pagerange.py @@ -1,3 +1,4 @@ +"""Test the pypdf.pagerange module.""" import pytest from pypdf.pagerange import PageRange, ParseError, parse_filename_page_ranges @@ -81,7 +82,7 @@ def test_parse_filename_page_ranges_err(): @pytest.mark.parametrize( - "a, b, expected", + ("a", "b", "expected"), [ (PageRange(slice(0, 5)), PageRange(slice(2, 10)), slice(0, 10)), (PageRange(slice(0, 5)), PageRange(slice(2, 3)), slice(0, 5)), @@ -96,7 +97,7 @@ def test_addition(a, b, expected): @pytest.mark.parametrize( - "a, b", + ("a", "b"), [ (PageRange(slice(0, 5)), PageRange(slice(7, 10))), (PageRange(slice(7, 10)), PageRange(slice(0, 5))), diff --git a/tests/test_papersizes.py b/tests/test_papersizes.py index d546b923e7..d50948bf5d 100644 --- a/tests/test_papersizes.py +++ b/tests/test_papersizes.py @@ -1,3 +1,4 @@ +"""Test the pypdf.papersizes module.""" import pytest from pypdf import papersizes diff --git a/tests/test_protocols.py b/tests/test_protocols.py index 01c15a83a6..8c8a6ff50c 100644 --- a/tests/test_protocols.py +++ b/tests/test_protocols.py @@ -1,12 +1,13 @@ +"""Test the pypdf._protocols module.""" from pypdf._protocols import PdfObjectProtocol -class iPdfObjectProtocol(PdfObjectProtocol): +class IPdfObjectProtocol(PdfObjectProtocol): pass def test_pdfobjectprotocol(): - o = iPdfObjectProtocol() + o = IPdfObjectProtocol() assert o.clone(None, False, None) is None assert o._reference_clone(None, None) is None assert o.get_object() is None diff --git a/tests/test_reader.py b/tests/test_reader.py index 35a51af993..102b425a09 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1,5 +1,5 @@ +"""Test the pypdf._reader module.""" import io -import os import time from io import BytesIO from pathlib import Path @@ -110,7 +110,7 @@ def test_read_metadata(pdf_path, expected): assert metadict["/Title"] == docinfo.title -@pytest.mark.samples +@pytest.mark.samples() @pytest.mark.parametrize( "pdf_path", [SAMPLE_ROOT / "017-unreadable-meta-data/unreadablemetadata.pdf"] ) @@ -179,7 +179,7 @@ def test_get_outline(src, outline_elements): assert len(outline) == outline_elements -@pytest.mark.samples +@pytest.mark.samples() @pytest.mark.parametrize( ("src", "expected_images"), [ @@ -226,7 +226,7 @@ def test_get_images(src, expected_images): ) finally: try: - os.remove(fn) + Path(fn).unlink() except Exception: pass @@ -417,7 +417,7 @@ def test_get_form(src, expected, expected_get_fields): ] # cleanup - os.remove("tmp-fields-report.txt") + Path("tmp-fields-report.txt").unlink() @pytest.mark.parametrize( @@ -633,7 +633,7 @@ def test_do_not_get_stuck_on_large_files_without_start_xref(): assert parse_duration < 60 -@pytest.mark.external +@pytest.mark.enable_socket() def test_decrypt_when_no_id(): """ Decrypt an encrypted file that's missing the 'ID' value in its trailer. @@ -743,7 +743,7 @@ def test_convert_to_int_error(): assert exc.value.args[0] == "invalid size in convert_to_int" -def test_convertToInt_deprecated(): +def test_converttoint_deprecated(): msg = ( "convertToInt is deprecated and was removed in pypdf 3.0.0. " "Use convert_to_int instead." @@ -755,7 +755,7 @@ def test_convertToInt_deprecated(): assert convertToInt(b"\x01", 8) == 1 -@pytest.mark.external +@pytest.mark.enable_socket() def test_iss925(): url = "https://github.com/py-pdf/pypdf/files/8796328/1.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name="iss925.pdf"))) @@ -811,7 +811,7 @@ def test_read_not_binary_mode(caplog): assert normalize_warnings(caplog.text) == [msg] -@pytest.mark.external +@pytest.mark.enable_socket() @pytest.mark.skipif(not HAS_PYCRYPTODOME, reason="No pycryptodome") def test_read_form_416(): url = ( @@ -850,7 +850,7 @@ def test_form_topname_with_and_without_acroform(caplog): assert "have a non-expected parent" in caplog.text -@pytest.mark.external +@pytest.mark.enable_socket() def test_extract_text_xref_issue_2(caplog): # pdf/0264cf510015b2a4b395a15cb23c001e.pdf url = "https://corpora.tika.apache.org/base/docs/govdocs1/981/981961.pdf" @@ -861,8 +861,8 @@ def test_extract_text_xref_issue_2(caplog): assert normalize_warnings(caplog.text) == [msg] -@pytest.mark.external -@pytest.mark.slow +@pytest.mark.enable_socket() +@pytest.mark.slow() def test_extract_text_xref_issue_3(caplog): # pdf/0264cf510015b2a4b395a15cb23c001e.pdf url = "https://corpora.tika.apache.org/base/docs/govdocs1/977/977774.pdf" @@ -873,7 +873,7 @@ def test_extract_text_xref_issue_3(caplog): assert normalize_warnings(caplog.text) == [msg] -@pytest.mark.external +@pytest.mark.enable_socket() def test_extract_text_pdf15(): # pdf/0264cf510015b2a4b395a15cb23c001e.pdf url = "https://corpora.tika.apache.org/base/docs/govdocs1/976/976030.pdf" @@ -882,7 +882,7 @@ def test_extract_text_pdf15(): page.extract_text() -@pytest.mark.external +@pytest.mark.enable_socket() def test_extract_text_xref_table_21_bytes_clrf(): # pdf/0264cf510015b2a4b395a15cb23c001e.pdf url = "https://corpora.tika.apache.org/base/docs/govdocs1/956/956939.pdf" @@ -891,7 +891,7 @@ def test_extract_text_xref_table_21_bytes_clrf(): page.extract_text() -@pytest.mark.external +@pytest.mark.enable_socket() def test_get_fields(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/972/972486.pdf" name = "tika-972486.pdf" @@ -902,7 +902,7 @@ def test_get_fields(): assert dict(fields["c1-1"]) == ({"/FT": "/Btn", "/T": "c1-1"}) -@pytest.mark.external +@pytest.mark.enable_socket() def test_get_full_qualified_fields(): url = "https://github.com/py-pdf/pypdf/files/10142389/fields_with_dots.pdf" name = "fields_with_dots.pdf" @@ -922,7 +922,7 @@ def test_get_full_qualified_fields(): assert fields["customer.name"]["/T"] == "name" -@pytest.mark.external +@pytest.mark.enable_socket() @pytest.mark.filterwarnings("ignore::pypdf.errors.PdfReadWarning") def test_get_fields_read_else_block(): # covers also issue 1089 @@ -931,7 +931,7 @@ def test_get_fields_read_else_block(): PdfReader(BytesIO(get_pdf_from_url(url, name=name))) -@pytest.mark.external +@pytest.mark.enable_socket() def test_get_fields_read_else_block2(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/914/914902.pdf" name = "tika-914902.pdf" @@ -940,7 +940,7 @@ def test_get_fields_read_else_block2(): assert fields is None -@pytest.mark.external +@pytest.mark.enable_socket() @pytest.mark.filterwarnings("ignore::pypdf.errors.PdfReadWarning") def test_get_fields_read_else_block3(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/957/957721.pdf" @@ -948,7 +948,7 @@ def test_get_fields_read_else_block3(): PdfReader(BytesIO(get_pdf_from_url(url, name=name))) -@pytest.mark.external +@pytest.mark.enable_socket() def test_metadata_is_none(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/963/963692.pdf" name = "tika-963692.pdf" @@ -956,7 +956,7 @@ def test_metadata_is_none(): assert reader.metadata is None -@pytest.mark.external +@pytest.mark.enable_socket() def test_get_fields_read_write_report(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/909/909655.pdf" name = "tika-909655.pdf" @@ -966,7 +966,7 @@ def test_get_fields_read_write_report(): assert fields # cleanup - os.remove("tmp-fields-report.txt") + Path("tmp-fields-report.txt").unlink() @pytest.mark.parametrize( @@ -981,7 +981,7 @@ def test_xfa(src): assert reader.xfa is None -@pytest.mark.external +@pytest.mark.enable_socket() def test_xfa_non_empty(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/942/942050.pdf" name = "tika-942050.pdf" @@ -997,7 +997,7 @@ def test_xfa_non_empty(): @pytest.mark.parametrize( - "src,pdf_header", + ("src", "pdf_header"), [ (RESOURCE_ROOT / "attachment.pdf", "%PDF-1.5"), (RESOURCE_ROOT / "crazyones.pdf", "%PDF-1.5"), @@ -1009,7 +1009,7 @@ def test_header(src, pdf_header): assert reader.pdf_header == pdf_header -@pytest.mark.external +@pytest.mark.enable_socket() def test_outline_color(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf" name = "tika-924546.pdf" @@ -1017,7 +1017,7 @@ def test_outline_color(): assert reader.outline[0].color == [0, 0, 1] -@pytest.mark.external +@pytest.mark.enable_socket() def test_outline_font_format(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf" name = "tika-924546.pdf" @@ -1038,7 +1038,7 @@ def get_outline_property(outline, attribute_name: str): return results -@pytest.mark.samples +@pytest.mark.samples() def test_outline_title_issue_1121(): reader = PdfReader(SAMPLE_ROOT / "014-outlines/mistitled_outlines_example.pdf") @@ -1085,7 +1085,7 @@ def test_outline_title_issue_1121(): ] -@pytest.mark.samples +@pytest.mark.samples() def test_outline_count(): reader = PdfReader(SAMPLE_ROOT / "014-outlines/mistitled_outlines_example.pdf") @@ -1144,7 +1144,7 @@ def test_outline_missing_title(caplog): assert reader.outline[0]["/Title"] == "" -@pytest.mark.external +@pytest.mark.enable_socket() def test_named_destination(): # 1st case : the named_dest are stored directly as a dictionnary, PDF1.1 style url = "https://github.com/py-pdf/pypdf/files/9197028/lorem_ipsum.pdf" @@ -1163,7 +1163,7 @@ def test_named_destination(): # TODO : case to be added -@pytest.mark.external +@pytest.mark.enable_socket() def test_outline_with_missing_named_destination(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/913/913678.pdf" name = "tika-913678.pdf" @@ -1172,7 +1172,7 @@ def test_outline_with_missing_named_destination(): assert reader.outline[1][0].title.startswith("Report for 2002AZ3B: Microbial") -@pytest.mark.external +@pytest.mark.enable_socket() def test_outline_with_empty_action(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf" name = "tika-924546.pdf" @@ -1189,8 +1189,8 @@ def test_outline_with_invalid_destinations(): assert len(reader.outline) == 9 -@pytest.mark.external -def test_PdfReaderMultipleDefinitions(caplog): +@pytest.mark.enable_socket() +def test_pdfreader_multiple_definitions(caplog): # iss325 url = "https://github.com/py-pdf/pypdf/files/9176644/multipledefs.pdf" name = "multipledefs.pdf" @@ -1215,7 +1215,7 @@ def test_get_page_number_by_indirect(): reader._get_page_number_by_indirect(1) -@pytest.mark.external +@pytest.mark.enable_socket() def test_corrupted_xref_table(): # issue #1292 url = "https://github.com/py-pdf/pypdf/files/9444747/BreezeManual.orig.pdf" @@ -1228,7 +1228,7 @@ def test_corrupted_xref_table(): reader.pages[0].extract_text() -@pytest.mark.external +@pytest.mark.enable_socket() def test_reader(caplog): # iss #1273 url = "https://github.com/py-pdf/pypdf/files/9464742/shiv_resume.pdf" @@ -1246,7 +1246,7 @@ def test_reader(caplog): assert caplog.text == "" -@pytest.mark.external +@pytest.mark.enable_socket() def test_zeroing_xref(): # iss #328 url = ( @@ -1258,7 +1258,7 @@ def test_zeroing_xref(): len(reader.pages) -@pytest.mark.external +@pytest.mark.enable_socket() def test_thread(): url = ( "https://github.com/py-pdf/pypdf/files/9066120/" @@ -1274,7 +1274,7 @@ def test_thread(): assert len(reader.threads) >= 1 -@pytest.mark.external +@pytest.mark.enable_socket() def test_build_outline_item(caplog): url = "https://github.com/py-pdf/pypdf/files/9464742/shiv_resume.pdf" name = "shiv_resume.pdf" @@ -1302,7 +1302,7 @@ def test_build_outline_item(caplog): assert "Unexpected destination 2" in exc.value.args[0] -@pytest.mark.samples +@pytest.mark.samples() @pytest.mark.parametrize( ("src", "page_labels"), [ @@ -1326,7 +1326,7 @@ def test_page_labels(src, page_labels): assert PdfReader(src).page_labels[:max_indices] == page_labels[:max_indices] -@pytest.mark.external +@pytest.mark.enable_socket() def test_iss1559(): url = "https://github.com/py-pdf/pypdf/files/10441992/default.pdf" name = "iss1559.pdf" @@ -1335,7 +1335,7 @@ def test_iss1559(): p.extract_text() -@pytest.mark.external +@pytest.mark.enable_socket() def test_iss1652(): # test of an annotation(link) directly stored in the /Annots in the page url = "https://github.com/py-pdf/pypdf/files/10818844/tt.pdf" diff --git a/tests/test_security.py b/tests/test_security.py index 272d812a92..2240c6a2ef 100644 --- a/tests/test_security.py +++ b/tests/test_security.py @@ -1,3 +1,4 @@ +"""Test the pypdf._security module.""" from pypdf._security import _alg32 from pypdf.generic import ByteStringObject @@ -8,7 +9,7 @@ def test_alg32_metadata_encrypt(): _alg32( "a", rev=3, - keylen=3, + key_length=3, owner_entry=ByteStringObject(b""), p_entry=0, id1_entry=ByteStringObject(b""), @@ -24,7 +25,7 @@ def test_alg32_no_metadata_encrypt(): _alg32( "a", rev=3, - keylen=3, + key_length=3, owner_entry=ByteStringObject(b""), p_entry=0, id1_entry=ByteStringObject(b""), diff --git a/tests/test_utils.py b/tests/test_utils.py index 4e2f85afa1..cfc8d7f883 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,5 +1,5 @@ +"""Test the pypdf._utils module.""" import io -import os from pathlib import Path import pytest @@ -84,7 +84,7 @@ def test_matrix_multiply(a, b, expected): def test_mark_location(): stream = io.BytesIO(b"abde" * 6000) mark_location(stream) - os.remove("pypdf_pdfLocation.txt") # cleanup + Path("pypdf_pdfLocation.txt").unlink() # cleanup def test_hex_str(): @@ -238,7 +238,7 @@ def foo(old_param: int = 1, baz: int = 2) -> float: assert exc.value.args[0] == expected_msg -@pytest.mark.external +@pytest.mark.enable_socket() def test_escapedcode_followed_by_int(): # iss #1294 url = ( diff --git a/tests/test_workflows.py b/tests/test_workflows.py index e625829ff4..ce01335068 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -5,7 +5,6 @@ """ import binascii -import os import sys from io import BytesIO from pathlib import Path @@ -78,7 +77,7 @@ def test_dropdown_items(): assert "/Opt" in fields["Nationality"] -def test_PdfReaderFileLoad(): +def test_pdfreader_file_load(): """ Test loading and parsing of a file. @@ -106,7 +105,7 @@ def test_PdfReaderFileLoad(): ) -def test_PdfReaderJpegImage(): +def test_pdfreader_jpeg_image(): """ Test loading and parsing of a file. Extract the image of the file and compare to expected textual output. @@ -179,8 +178,8 @@ def test_rotate_45(): assert exc.value.args[0] == "Rotation angle must be a multiple of 90" -@pytest.mark.external -@pytest.mark.slow +@pytest.mark.enable_socket() +@pytest.mark.slow() @pytest.mark.parametrize( ("enable", "url", "pages"), [ @@ -261,7 +260,7 @@ def test_extract_textbench(enable, url, pages, print_result=False): pass -@pytest.mark.slow +@pytest.mark.slow() def test_orientations(): p = PdfReader(RESOURCE_ROOT / "test Orient.pdf").pages[0] with pytest.warns(DeprecationWarning): @@ -302,8 +301,8 @@ def test_orientations(): ), f"extract_text({req}) => {rst}" -@pytest.mark.samples -@pytest.mark.external +@pytest.mark.samples() +@pytest.mark.enable_socket() @pytest.mark.parametrize( ("base_path", "overlay_path"), [ @@ -335,11 +334,11 @@ def test_overlay(base_path, overlay_path): writer.write(fp) # Cleanup - os.remove("dont_commit_overlay.pdf") # remove for manual inspection + Path("dont_commit_overlay.pdf").unlink() # remove for manual inspection -@pytest.mark.external -@pytest.mark.slow +@pytest.mark.enable_socket() +@pytest.mark.slow() @pytest.mark.parametrize( ("url", "name"), [ @@ -358,7 +357,7 @@ def test_merge_with_warning(tmp_path, url, name): merger.write(tmp_path / "tmp.merged.pdf") -@pytest.mark.external +@pytest.mark.enable_socket() @pytest.mark.parametrize( ("url", "name"), [ @@ -376,7 +375,7 @@ def test_merge(tmp_path, url, name): merger.write(tmp_path / "tmp.merged.pdf") -@pytest.mark.external +@pytest.mark.enable_socket() @pytest.mark.parametrize( ("url", "name"), [ @@ -392,7 +391,7 @@ def test_get_metadata(url, name): reader.metadata -@pytest.mark.external +@pytest.mark.enable_socket() @pytest.mark.parametrize( ("url", "name", "strict", "exception"), [ @@ -478,7 +477,7 @@ def test_extract_text(url, name, strict, exception): assert ex_info.value.args[0] == exc_text -@pytest.mark.external +@pytest.mark.enable_socket() @pytest.mark.parametrize( ("url", "name"), [ @@ -500,8 +499,8 @@ def test_compress_raised(url, name): page.compress_content_streams() -@pytest.mark.external -@pytest.mark.slow +@pytest.mark.enable_socket() +@pytest.mark.slow() @pytest.mark.parametrize( ("url", "name", "strict"), [ @@ -531,7 +530,7 @@ def test_compress(url, name, strict): page.compress_content_streams() -@pytest.mark.external +@pytest.mark.enable_socket() @pytest.mark.parametrize( ("url", "name"), [ @@ -552,7 +551,7 @@ def test_get_fields_warns(tmp_path, caplog, url, name): assert normalize_warnings(caplog.text) == ["Object 2 0 not defined."] -@pytest.mark.external +@pytest.mark.enable_socket() @pytest.mark.parametrize( ("url", "name"), [ @@ -572,7 +571,7 @@ def test_get_fields_no_warning(tmp_path, url, name): assert len(retrieved_fields) == 10 -@pytest.mark.external +@pytest.mark.enable_socket() def test_scale_rectangle_indirect_object(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/999/999944.pdf" name = "tika-999944.pdf" @@ -605,15 +604,15 @@ def test_merge_output(caplog): expected_data = fp.read() if actual != expected_data: # See https://github.com/pytest-dev/pytest/issues/9124 - assert ( - False - ), f"len(actual) = {len(actual):,} vs len(expected) = {len(expected_data):,}" + pytest.fail( + f"len(actual) = {len(actual):,} vs len(expected) = {len(expected_data):,}" + ) # Cleanup merger.close() -@pytest.mark.external +@pytest.mark.enable_socket() @pytest.mark.parametrize( ("url", "name"), [ @@ -667,7 +666,7 @@ def test_image_extraction(url, name): images_extracted = [] root = Path("extracted-images") if not root.exists(): - os.mkdir(root) + root.mkdir() for page in reader.pages: for image in page.images: @@ -680,11 +679,11 @@ def test_image_extraction(url, name): do_cleanup = True # set this to False for manual inspection if do_cleanup: for filepath in images_extracted: - if os.path.exists(filepath): - os.remove(filepath) + if Path(filepath).exists(): + Path(filepath).unlink() -@pytest.mark.external +@pytest.mark.enable_socket() def test_image_extraction_strict(): # Emits log messages url = "https://corpora.tika.apache.org/base/docs/govdocs1/914/914102.pdf" @@ -695,7 +694,7 @@ def test_image_extraction_strict(): images_extracted = [] root = Path("extracted-images") if not root.exists(): - os.mkdir(root) + root.mkdir() for page in reader.pages: for image in page.images: @@ -708,11 +707,11 @@ def test_image_extraction_strict(): do_cleanup = True # set this to False for manual inspection if do_cleanup: for filepath in images_extracted: - if os.path.exists(filepath): - os.remove(filepath) + if Path(filepath).exists(): + Path(filepath).unlink() -@pytest.mark.external +@pytest.mark.enable_socket() @pytest.mark.parametrize( ("url", "name"), [ @@ -729,7 +728,7 @@ def test_image_extraction2(url, name): images_extracted = [] root = Path("extracted-images") if not root.exists(): - os.mkdir(root) + root.mkdir() for page in reader.pages: for image in page.images: @@ -742,11 +741,11 @@ def test_image_extraction2(url, name): do_cleanup = True # set this to False for manual inspection if do_cleanup: for filepath in images_extracted: - if os.path.exists(filepath): - os.remove(filepath) + if Path(filepath).exists(): + Path(filepath).unlink() -@pytest.mark.external +@pytest.mark.enable_socket() @pytest.mark.parametrize( ("url", "name"), [ @@ -766,7 +765,7 @@ def test_get_outline(url, name): reader.outline -@pytest.mark.external +@pytest.mark.enable_socket() @pytest.mark.parametrize( ("url", "name"), [ @@ -786,7 +785,7 @@ def test_get_xfa(url, name): reader.xfa -@pytest.mark.external +@pytest.mark.enable_socket() @pytest.mark.parametrize( ("url", "name", "strict"), [ @@ -819,7 +818,7 @@ def test_get_fonts(url, name, strict): page._get_fonts() -@pytest.mark.external +@pytest.mark.enable_socket() @pytest.mark.parametrize( ("url", "name", "strict"), [ @@ -877,7 +876,7 @@ def test_get_xmp(url, name, strict): xmp_info.custom_properties -@pytest.mark.external +@pytest.mark.enable_socket() def test_tounicode_is_identity(): url = "https://github.com/py-pdf/pypdf/files/9998335/FP_Thesis.pdf" name = "FP_Thesis.pdf" @@ -886,7 +885,7 @@ def test_tounicode_is_identity(): reader.pages[0].extract_text() -@pytest.mark.external +@pytest.mark.enable_socket() def test_append_forms(): # from #1538 writer = PdfWriter() @@ -911,7 +910,7 @@ def test_append_forms(): ) + len(reader2.get_form_text_fields()) -@pytest.mark.external +@pytest.mark.enable_socket() def test_extra_test_iss1541(): url = "https://github.com/py-pdf/pypdf/files/10418158/tst_iss1541.pdf" name = "tst_iss1541.pdf" @@ -944,7 +943,7 @@ def test_extra_test_iss1541(): assert exc.value.args[0] == "Unexpected end of stream" -@pytest.mark.external +@pytest.mark.enable_socket() def test_fields_returning_stream(): """This problem was reported in #424""" url = "https://github.com/mstamy2/PyPDF2/files/1948267/Simple.form.pdf" diff --git a/tests/test_writer.py b/tests/test_writer.py index 372ed7e014..f28c172a07 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1,4 +1,4 @@ -import os +"""Test the pypdf._writer module.""" import re from io import BytesIO from pathlib import Path @@ -218,7 +218,7 @@ def test_writer_operations_by_traditional_usage(write_data_here, needs_cleanup): writer.write(output_stream) if needs_cleanup: - os.remove(write_data_here) + Path(write_data_here).unlink() @pytest.mark.parametrize( @@ -242,7 +242,7 @@ def test_writer_operations_by_semi_traditional_usage(write_data_here, needs_clea writer.write(output_stream) if needs_cleanup: - os.remove(write_data_here) + Path(write_data_here).unlink() @pytest.mark.parametrize( @@ -263,7 +263,7 @@ def test_writer_operations_by_semi_new_traditional_usage( writer.write(write_data_here) if needs_cleanup: - os.remove(write_data_here) + Path(write_data_here).unlink() @pytest.mark.parametrize( @@ -280,14 +280,14 @@ def test_writer_operation_by_new_usage(write_data_here, needs_cleanup): writer_operate(writer) if needs_cleanup: - os.remove(write_data_here) + Path(write_data_here).unlink() @pytest.mark.parametrize( - ("input_path",), + "input_path", [ - ("side-by-side-subfig.pdf",), - ("reportlab-inline-image.pdf",), + "side-by-side-subfig.pdf", + "reportlab-inline-image.pdf", ], ) def test_remove_images(input_path): @@ -312,14 +312,14 @@ def test_remove_images(input_path): assert "Lorem ipsum dolor sit amet" in extracted_text # Cleanup - os.remove(tmp_filename) + Path(tmp_filename).unlink() @pytest.mark.parametrize( - ("input_path",), + "input_path", [ - ("side-by-side-subfig.pdf",), - ("reportlab-inline-image.pdf",), + "side-by-side-subfig.pdf", + "reportlab-inline-image.pdf", ], ) def test_remove_text(input_path): @@ -338,7 +338,7 @@ def test_remove_text(input_path): writer.write(output_stream) # Cleanup - os.remove(tmp_filename) + Path(tmp_filename).unlink() def test_remove_text_all_operators(): @@ -404,7 +404,7 @@ def test_remove_text_all_operators(): writer.write(output_stream) # Cleanup - os.remove(tmp_filename) + Path(tmp_filename).unlink() def test_write_metadata(): @@ -433,7 +433,7 @@ def test_write_metadata(): assert metadata.get("/Title") == "The Crazy Ones" # Cleanup - os.remove(tmp_filename) + Path(tmp_filename).unlink() def test_fill_form(): @@ -463,7 +463,7 @@ def test_fill_form(): with open(tmp_filename, "wb") as output_stream: writer.write(output_stream) - os.remove(tmp_filename) # cleanup + Path(tmp_filename).unlink() # cleanup @pytest.mark.parametrize( @@ -519,7 +519,7 @@ def test_encrypt(use_128bit, user_password, owner_password): assert new_text == orig_text # Cleanup - os.remove(tmp_filename) + Path(tmp_filename).unlink() def test_add_outline_item(): @@ -540,7 +540,7 @@ def test_add_outline_item(): writer.write(output_stream) # Cleanup - os.remove(tmp_filename) + Path(tmp_filename).unlink() def test_add_named_destination(): @@ -579,7 +579,7 @@ def test_add_named_destination(): writer.write(output_stream) # Cleanup - os.remove(tmp_filename) + Path(tmp_filename).unlink() def test_add_uri(): @@ -620,7 +620,7 @@ def test_add_uri(): writer.write(output_stream) # Cleanup - os.remove(tmp_filename) + Path(tmp_filename).unlink() def test_add_link(): @@ -672,7 +672,7 @@ def test_add_link(): writer.write(output_stream) # Cleanup - os.remove(tmp_filename) + Path(tmp_filename).unlink() def test_io_streams(): @@ -702,7 +702,7 @@ def test_regression_issue670(): writer.write(f_pdf) # cleanup - os.remove(tmp_file) + Path(tmp_file).unlink() def test_issue301(): @@ -725,8 +725,8 @@ def test_append_pages_from_reader_append(): writer.write(o) -@pytest.mark.external -@pytest.mark.slow +@pytest.mark.enable_socket() +@pytest.mark.slow() def test_sweep_indirect_references_nullobject_exception(): # TODO: Check this more closely... this looks weird url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" @@ -734,14 +734,15 @@ def test_sweep_indirect_references_nullobject_exception(): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) merger = PdfMerger() merger.append(reader) - merger.write("tmp-merger-do-not-commit.pdf") + tmp_file = "tmp-merger-do-not-commit.pdf" + merger.write(tmp_file) # cleanup - os.remove("tmp-merger-do-not-commit.pdf") + Path(tmp_file).unlink() -@pytest.mark.external -@pytest.mark.slow +@pytest.mark.enable_socket() +@pytest.mark.slow() @pytest.mark.parametrize( ("url", "name"), [ @@ -758,16 +759,17 @@ def test_sweep_indirect_references_nullobject_exception(): ) def test_some_appends(url, name): reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + tmp_file = "tmp-merger-do-not-commit.pdf" # PdfMerger merger = PdfMerger() merger.append(reader) - merger.write("tmp-merger-do-not-commit.pdf") + merger.write(tmp_file) # PdfWriter merger = PdfWriter() merger.append(reader) - merger.write("tmp-merger-do-not-commit.pdf") + merger.write(tmp_file) # cleanup - os.remove("tmp-merger-do-not-commit.pdf") + Path(tmp_file).unlink() def test_pdf_header(): @@ -806,8 +808,8 @@ def test_write_dict_stream_object(): page_object[NameObject("/Test")] = stream_object page_object = writer.add_page(page_object) - - with open("tmp-writer-do-not-commit.pdf", "wb") as fp: + tmp_file = "tmp-writer-do-not-commit.pdf" + with open(tmp_file, "wb") as fp: writer.write(fp) for k, v in page_object.items(): @@ -817,7 +819,7 @@ def test_write_dict_stream_object(): assert str(v.get_object()) == str(stream_object) break else: - assert False, "/Test not found" + pytest.fail("/Test not found") # Check that every key in _idnum_hash is correct objects_hash = [o.hash_value() for o in writer._objects] @@ -825,7 +827,7 @@ def test_write_dict_stream_object(): assert v.pdf == writer assert k in objects_hash, "Missing %s" % v - os.remove("tmp-writer-do-not-commit.pdf") + Path(tmp_file).unlink() def test_add_single_annotation(): @@ -857,7 +859,7 @@ def test_add_single_annotation(): writer.write(fp) # Cleanup - os.remove(target) # comment out for testing + Path(target).unlink() # comment out for testing def test_deprecation_bookmark_decorator(): @@ -873,7 +875,7 @@ def test_deprecation_bookmark_decorator(): writer.add_outline_item_dict(bookmark=outline_item) -@pytest.mark.samples +@pytest.mark.samples() def test_colors_in_outline_item(): reader = PdfReader(SAMPLE_ROOT / "004-pdflatex-4-pages/pdflatex-4-pages.pdf") writer = PdfWriter() @@ -893,10 +895,10 @@ def test_colors_in_outline_item(): assert [str(c) for c in outline_item.color] == [str(p) for p in purple_rgb] # Cleanup - os.remove(target) # comment out for testing + Path(target).unlink() # comment out for testing -@pytest.mark.samples +@pytest.mark.samples() def test_write_empty_stream(): reader = PdfReader(SAMPLE_ROOT / "004-pdflatex-4-pages/pdflatex-4-pages.pdf") writer = PdfWriter() @@ -948,7 +950,7 @@ def test_startup_dest(): pdf_file_writer.open_destination = None -@pytest.mark.external +@pytest.mark.enable_socket() def test_iss471(): url = "https://github.com/py-pdf/pypdf/files/9139245/book.pdf" name = "book_471.pdf" @@ -961,7 +963,7 @@ def test_iss471(): ) -@pytest.mark.external +@pytest.mark.enable_socket() def test_reset_translation(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" name = "tika-924666.pdf" @@ -997,7 +999,7 @@ def test_threads_empty(): assert thr == thr2 -@pytest.mark.external +@pytest.mark.enable_socket() def test_append_without_annots_and_articles(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" name = "tika-924666.pdf" @@ -1016,7 +1018,7 @@ def test_append_without_annots_and_articles(): assert len(writer.threads) >= 1 -@pytest.mark.external +@pytest.mark.enable_socket() def test_append_multiple(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" name = "tika-924666.pdf" @@ -1031,7 +1033,7 @@ def test_append_multiple(): assert pages[-1] not in pages[0:-1] # page not repeated -@pytest.mark.samples +@pytest.mark.samples() def test_set_page_label(): src = RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf" # File without labels target = "pypdf-output.pdf" @@ -1127,7 +1129,7 @@ def test_set_page_label(): ): writer.set_page_label(0, 5, "/r", start=-1) - os.remove(target) + Path(target).unlink() src = ( SAMPLE_ROOT / "009-pdflatex-geotopo/GeoTopo.pdf" @@ -1151,7 +1153,7 @@ def test_set_page_label(): writer.write(target) assert PdfReader(target).page_labels[: len(expected)] == expected - os.remove(target) + Path(target).unlink() # Tests prefix and start. src = RESOURCE_ROOT / "issue-604.pdf" # File without page labels @@ -1169,10 +1171,10 @@ def test_set_page_label(): writer.set_page_label(31, 39, "/D", prefix="HURT-") writer.write(target) - os.remove(target) # comment to see result + Path(target).unlink() # comment to see result -@pytest.mark.external +@pytest.mark.enable_socket() def test_iss1601(): url = "https://github.com/py-pdf/pypdf/files/10579503/badges-38.pdf" name = "badge-38.pdf" @@ -1241,7 +1243,7 @@ def test_attachments(): assert reader.attachments["foobar2.txt"][1] == b"2nd_foobarcontent" -@pytest.mark.external +@pytest.mark.enable_socket() def test_iss1614(): # test of an annotation(link) directly stored in the /Annots in the page url = "https://github.com/py-pdf/pypdf/files/10669995/broke.pdf" @@ -1256,6 +1258,7 @@ def test_iss1614(): out_pdf.append(in_pdf) +@pytest.mark.enable_socket() def test_new_removes(): # test of an annotation(link) directly stored in the /Annots in the page url = "https://github.com/py-pdf/pypdf/files/10807951/tt.pdf" diff --git a/tests/test_xmp.py b/tests/test_xmp.py index 353491ac11..1fc1184ab2 100644 --- a/tests/test_xmp.py +++ b/tests/test_xmp.py @@ -1,3 +1,4 @@ +"""Test the pypdf.xmp module.""" from datetime import datetime from io import BytesIO from pathlib import Path @@ -87,7 +88,7 @@ def test_identity(x): assert pypdf.xmp._identity(x) == x -@pytest.mark.external +@pytest.mark.enable_socket() @pytest.mark.parametrize( ("url", "name", "xmpmm_instance_id"), [ @@ -106,7 +107,7 @@ def test_xmpmm(url, name, xmpmm_instance_id): assert xmp_metadata.xmpmm_instance_id == xmpmm_instance_id -@pytest.mark.external +@pytest.mark.enable_socket() def test_dc_description(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/953/953770.pdf" name = "tika-953770.pdf" @@ -121,7 +122,7 @@ def test_dc_description(): } -@pytest.mark.external +@pytest.mark.enable_socket() def test_dc_creator(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/953/953770.pdf" name = "tika-953770.pdf" @@ -132,7 +133,7 @@ def test_dc_creator(): assert xmp_metadata.dc_creator == ["U.S. Fish and Wildlife Service"] -@pytest.mark.external +@pytest.mark.enable_socket() def test_custom_properties(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/986/986065.pdf" name = "tika-986065.pdf" @@ -143,7 +144,7 @@ def test_custom_properties(): assert xmp_metadata.custom_properties == {"Style": "Searchable Image (Exact)"} -@pytest.mark.external +@pytest.mark.enable_socket() def test_dc_subject(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/959/959519.pdf" name = "tika-959519.pdf" @@ -174,7 +175,7 @@ def test_dc_subject(): ] -@pytest.mark.external +@pytest.mark.enable_socket() def test_issue585(): url = "https://github.com/py-pdf/pypdf/files/5536984/test.pdf" name = "pypdf-5536984.pdf"