diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md index 4878a3c1e7..05284f39be 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.md +++ b/.github/ISSUE_TEMPLATE/bug-report.md @@ -34,8 +34,8 @@ better. Let us know if we may add them to our tests! ## Traceback -This is the complete Traceback I see: +This is the complete traceback I see: ``` -# TODO: Your Traceback goes here (if applicable) +# TODO: Your traceback goes here (if applicable) ``` diff --git a/.github/workflows/benchmark.yaml b/.github/workflows/benchmark.yaml index 96389177b7..98dc86ed92 100644 --- a/.github/workflows/benchmark.yaml +++ b/.github/workflows/benchmark.yaml @@ -21,7 +21,7 @@ jobs: with: submodules: 'recursive' - name: Setup Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install requirements (Python 3) diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml index 8b52544e1e..a6157a484a 100644 --- a/.github/workflows/github-ci.yaml +++ b/.github/workflows/github-ci.yaml @@ -26,7 +26,7 @@ jobs: with: submodules: 'recursive' - name: Setup Python (3.11+) - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: 3.12 # latest stable python allow-prereleases: true @@ -80,14 +80,14 @@ jobs: path: '**/tests/pdf_cache/*' key: cache-downloaded-files - name: Setup Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 if: matrix.python-version == '3.6' || matrix.python-version == '3.7' || matrix.python-version == '3.8' || matrix.python-version == '3.9' || matrix.python-version == '3.10' with: python-version: ${{ matrix.python-version }} cache: 'pip' cache-dependency-path: '**/requirements/ci.txt' - name: Setup Python (3.11+) - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 if: matrix.python-version == '3.11' || matrix.python-version == '3.12' with: python-version: ${{ matrix.python-version }} @@ -143,7 
+143,7 @@ jobs: with: submodules: 'recursive' - name: Setup Python 3.11 - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.11" cache: 'pip' @@ -171,7 +171,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: python-version: ${{env.PYTHON_LATEST}} @@ -194,7 +194,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: # Use latest Python, so it understands all syntax. python-version: ${{env.PYTHON_LATEST}} diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 9fb135d8a8..3ca2426d63 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -20,7 +20,7 @@ jobs: uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: 3.x diff --git a/CHANGELOG.md b/CHANGELOG.md index 6102d43b09..0f52bf2707 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # CHANGELOG +## Version 3.17.2, 2023-12-10 + +### Bug Fixes (BUG) +- Cope with deflated images with CMYK Black Only (#2322) +- Handle indirect objects as parameters for CCITTFaxDecode (#2307) +- Check words length in _cmap type1_alternative function (#2310) + +### Robustness (ROB) +- Relax flate decoding for too many lookup values (#2331) +- Let _build_destination skip in case of missing /D key (#2018) + +[Full Changelog](https://github.com/py-pdf/pypdf/compare/3.17.1...3.17.2) + ## Version 3.17.1, 2023-11-14 ### Bug Fixes (BUG) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 18fd750463..7fffed4d65 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -15,6 +15,7 @@ history and [GitHubs 'Contributors' feature](https://github.com/py-pdf/pypdf/gra * [ArkieCoder](https://github.com/ArkieCoder) * [Clauss, Christian](https://github.com/cclauss) * [DL6ER](https://github.com/DL6ER) +* [Duy, Phan Thanh](https://github.com/zuypt) * 
[ediamondscience](https://github.com/ediamondscience) * [Ermeson, Felipe](https://github.com/FelipeErmeson) * [Freitag, François](https://github.com/francoisfreitag) diff --git a/docs/dev/intro.md b/docs/dev/intro.md index 7d10901ab6..6765b4a44a 100644 --- a/docs/dev/intro.md +++ b/docs/dev/intro.md @@ -59,10 +59,13 @@ BODY The `PREFIX` can be: +* `SEC`: Security improvements. Typically a fix for an infinite loop that was possible. * `BUG`: A bug was fixed. Likely there is one or multiple issues. Then write in the `BODY`: `Closes #123` where 123 is the issue number on GitHub. It would be absolutely amazing if you could write a regression test in those cases. That is a test that would fail without the fix. + A bug is always an issue for pypdf users - test code or CI that was fixed is + not considered a bug here. * `ENH`: A new feature! Describe in the body what it can be used for. * `DEP`: A deprecation - either marking something as "this is going to be removed" or actually removing it. @@ -75,7 +78,21 @@ The `PREFIX` can be: * `MAINT`: Quite a lot of different stuff. Performance improvements are for sure the most interesting changes in here. Refactorings as well. * `STY`: A style change. Something that makes pypdf code more consistent. - Typically a small change. + Typically a small change. It could also be better error messages for + end users. + +The prefix is used to generate the CHANGELOG. Every PR must have exactly one - +if you feel like several match, take the top one from this list that matches for +your PR. + +## Pull Request Size + +Smaller Pull Requests (PRs) are preferred as it's typically easier to merge +them. For example, if you have some typos, a few code-style changes, a new +feature, and a bug-fix, that could be 3 or 4 PRs. + +A PR must be complete. That means if you introduce a new feature it must be +finished within the PR and have a test for that feature. 
## Benchmarks diff --git a/docs/user/forms.md b/docs/user/forms.md index ff729d0596..3cc29ee00a 100644 --- a/docs/user/forms.md +++ b/docs/user/forms.md @@ -8,6 +8,9 @@ from pypdf import PdfReader reader = PdfReader("form.pdf") fields = reader.get_form_text_fields() fields == {"key": "value", "key2": "value2"} + +# You can also get all fields: +fields = reader.get_fields() ``` ## Filling out forms diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 2706007c27..f221d2d23d 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -6,13 +6,19 @@ from ._codecs import adobe_glyphs, charset_encoding from ._utils import b_, logger_warning from .errors import PdfReadWarning -from .generic import DecodedStreamObject, DictionaryObject, IndirectObject, NullObject, StreamObject +from .generic import ( + DecodedStreamObject, + DictionaryObject, + IndirectObject, + NullObject, + StreamObject, +) # code freely inspired from @twiggy ; see #711 def build_char_map( font_name: str, space_width: float, obj: DictionaryObject -) -> Tuple[str, float, Union[str, Dict[int, str]], Dict, DictionaryObject]: +) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any], DictionaryObject]: """ Determine information about a font. @@ -34,7 +40,7 @@ def build_char_map( def build_char_map_from_dict( space_width: float, ft: DictionaryObject -) -> Tuple[str, float, Union[str, Dict[int, str]], Dict]: +) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]]: """ Determine information about a font. 
@@ -489,7 +495,7 @@ def type1_alternative( for li in lines: if li.startswith(b"dup"): words = [_w for _w in li.split(b" ") if _w != b""] - if words[3] != b"put": + if len(words) > 3 and words[3] != b"put": continue try: i = int(words[1]) diff --git a/pypdf/_encryption.py b/pypdf/_encryption.py index 2cc857f167..c000a2a2a7 100644 --- a/pypdf/_encryption.py +++ b/pypdf/_encryption.py @@ -759,7 +759,7 @@ class PasswordType(IntEnum): OWNER_PASSWORD = 2 -class EncryptAlgorithm(tuple, Enum): # noqa: SLOT001 +class EncryptAlgorithm(tuple, Enum): # type: ignore # noqa: SLOT001 # V, R, Length RC4_40 = (1, 2, 40) RC4_128 = (2, 3, 128) @@ -1144,7 +1144,7 @@ def read(encryption_entry: DictionaryObject, first_id_entry: bytes) -> "Encrypti def make( alg: EncryptAlgorithm, permissions: int, first_id_entry: bytes ) -> "Encryption": - alg_ver, alg_rev, key_bits = cast(tuple, alg) + alg_ver, alg_rev, key_bits = alg stm_filter, str_filter, ef_filter = "/V2", "/V2", "/V2" diff --git a/pypdf/_page.py b/pypdf/_page.py index e8216e9d04..120e15a193 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -296,6 +296,16 @@ def rotate(self, rotation: float) -> "Transformation": def __repr__(self) -> str: return f"Transformation(ctm={self.ctm})" + @overload + def apply_on(self, pt: List[float], as_object: bool = False) -> List[float]: + ... + + @overload + def apply_on( + self, pt: Tuple[float, float], as_object: bool = False + ) -> Tuple[float, float]: + ... 
+ def apply_on( self, pt: Union[Tuple[float, float], List[float]], @@ -1232,10 +1242,10 @@ def _merge_page_writer( if "/QuadPoints" in a: q = cast(ArrayObject, a["/QuadPoints"]) aa[NameObject("/QuadPoints")] = ArrayObject( - cast(tuple, trsf.apply_on((q[0], q[1]), True)) - + cast(tuple, trsf.apply_on((q[2], q[3]), True)) - + cast(tuple, trsf.apply_on((q[4], q[5]), True)) - + cast(tuple, trsf.apply_on((q[6], q[7]), True)) + trsf.apply_on((q[0], q[1]), True) + + trsf.apply_on((q[2], q[3]), True) + + trsf.apply_on((q[4], q[5]), True) + + trsf.apply_on((q[6], q[7]), True) ) try: aa["/Popup"][NameObject("/Parent")] = aa.indirect_reference @@ -1936,7 +1946,7 @@ def _extract_text( def current_spacewidth() -> float: return _space_width / 1000.0 - def process_operation(operator: bytes, operands: List) -> None: + def process_operation(operator: bytes, operands: List[Any]) -> None: nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap nonlocal orientations, rtl_dir, visitor_text, output, text @@ -2479,7 +2489,7 @@ def annotations(self, value: Optional[ArrayObject]) -> None: self[NameObject("/Annots")] = value -class _VirtualList(Sequence): +class _VirtualList(Sequence[PageObject]): def __init__( self, length_function: Callable[[], int], @@ -2664,7 +2674,7 @@ def process_font(f: DictionaryObject) -> None: return fnt, emb # return the sets for each page -class _VirtualListImages(Sequence): +class _VirtualListImages(Sequence[ImageFile]): def __init__( self, ids_function: Callable[[], List[Union[str, List[str]]]], diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py index de0fa78ce4..fdb8a01b58 100644 --- a/pypdf/_protocols.py +++ b/pypdf/_protocols.py @@ -70,7 +70,7 @@ class PdfWriterProtocol(Protocol): # deprecated def get_object(self, indirect_reference: Any) -> Optional[PdfObjectProtocol]: ... 
- def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO]: + def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]: ... def _add_object(self, obj: Any) -> Any: diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 7357fd1a76..365cb0bae9 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -721,7 +721,7 @@ def get_form_text_fields(self, full_qualified_name: bool = False) -> Dict[str, A second and following will get the suffix .2, .3, ... """ - def indexed_key(k: str, fields: dict) -> str: + def indexed_key(k: str, fields: Dict[Any, Any]) -> str: if k not in fields: return k else: @@ -804,8 +804,11 @@ def _get_named_destinations( except IndexError: break i += 1 - if isinstance(value, DictionaryObject) and "/D" in value: - value = value["/D"] + if isinstance(value, DictionaryObject): + if "/D" in value: + value = value["/D"] + else: + continue dest = self._build_destination(key, value) # type: ignore if dest is not None: retval[key] = dest @@ -813,7 +816,10 @@ def _get_named_destinations( for k__, v__ in tree.items(): val = v__.get_object() if isinstance(val, DictionaryObject): - val = val["/D"].get_object() + if "/D" in val: + val = val["/D"].get_object() + else: + continue dest = self._build_destination(k__, val) if dest is not None: retval[k__] = dest @@ -2292,7 +2298,7 @@ def _get_attachments( return attachments -class LazyDict(Mapping): +class LazyDict(Mapping[Any, Any]): def __init__(self, *args: Any, **kw: Any) -> None: self._raw_dict = dict(*args, **kw) diff --git a/pypdf/_utils.py b/pypdf/_utils.py index 9613cd1b02..0e3e7ebabb 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -453,7 +453,7 @@ def logger_warning(msg: str, src: str) -> None: logging.getLogger(src).warning(msg) -def deprecation_bookmark(**aliases: str) -> Callable: +def deprecation_bookmark(**aliases: str) -> Callable[..., Any]: """ Decorator for deprecated term "bookmark". 
@@ -462,7 +462,7 @@ def deprecation_bookmark(**aliases: str) -> Callable: outline = a collection of outline items. """ - def decoration(func: Callable) -> Any: + def decoration(func: Callable[..., Any]) -> Any: @functools.wraps(func) def wrapper(*args: Any, **kwargs: Any) -> Any: rename_kwargs(func.__name__, kwargs, aliases, fail=True) diff --git a/pypdf/_version.py b/pypdf/_version.py index 7e84c71b24..4874dc2b76 100644 --- a/pypdf/_version.py +++ b/pypdf/_version.py @@ -1 +1 @@ -__version__ = "3.17.1" +__version__ = "3.17.2" diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 10cc9342a9..e4db6e32e7 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -885,7 +885,7 @@ def _update_text_field(self, field: DictionaryObject) -> None: if font_name not in dr: # ...or AcroForm dictionary dr = cast( - dict, + Dict[Any, Any], cast(DictionaryObject, self._root_object["/AcroForm"]).get("/DR", {}), ) if isinstance(dr, IndirectObject): # pragma: no cover @@ -1353,7 +1353,7 @@ def write_stream(self, stream: StreamType) -> None: xref_location = self._write_xref_table(stream, object_positions) self._write_trailer(stream, xref_location) - def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO]: + def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]: """ Write the collection of pages added to this object out as a PDF file. 
@@ -2240,7 +2240,10 @@ def clean_forms( # to prevent infinite looping return [], [] # pragma: no cover try: - d = cast(dict, cast(DictionaryObject, elt["/Resources"])["/XObject"]) + d = cast( + Dict[Any, Any], + cast(DictionaryObject, elt["/Resources"])["/XObject"], + ) except KeyError: d = {} images = [] @@ -2793,7 +2796,7 @@ def add_annotation( # Internal link annotations need the correct object type for the # destination if to_add.get("/Subtype") == "/Link" and "/Dest" in to_add: - tmp = cast(dict, to_add[NameObject("/Dest")]) + tmp = cast(Dict[Any, Any], to_add[NameObject("/Dest")]) dest = Destination( NameObject("/LinkName"), tmp["target_page_index"], @@ -3029,7 +3032,7 @@ def merge( for dest in reader._namedDests.values(): arr = dest.dest_array if "/Names" in self._root_object and dest["/Title"] in cast( - list, + List[Any], cast( DictionaryObject, cast(DictionaryObject, self._root_object["/Names"])["/Dests"], @@ -3188,7 +3191,9 @@ def _add_articles_thread( def add_filtered_articles( self, - fltr: Union[Pattern, str], # thread entry from the reader's array of threads + fltr: Union[ + Pattern[Any], str + ], # thread entry from the reader's array of threads pages: Dict[int, PageObject], reader: PdfReader, ) -> None: @@ -3245,7 +3250,7 @@ def _insert_filtered_annotations( ) -> List[Destination]: outlist = ArrayObject() if isinstance(annots, IndirectObject): - annots = cast("List", annots.get_object()) + annots = cast("List[Any]", annots.get_object()) for an in annots: ano = cast("DictionaryObject", an.get_object()) if ( diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 87bff19b96..a390357dd5 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -4,7 +4,7 @@ from io import BytesIO from typing import Any, List, Tuple, Union, cast -from ._utils import logger_warning +from ._utils import WHITESPACES, logger_warning from .constants import ColorSpaces from .errors import PdfReadError from .generic import ( @@ 
-91,10 +91,18 @@ def _get_imagemode( ) return mode2, True elif color_space[0] == "/DeviceN": + original_color_space = color_space color_components = len(color_space[1]) color_space = color_space[2] if isinstance(color_space, IndirectObject): # pragma: no cover color_space = color_space.get_object() + if color_space == "/DeviceCMYK" and color_components == 1: + if original_color_space[1][0] != "/Black": + logger_warning( + f"Color {original_color_space[1][0]} converted to Gray. Please share PDF with pypdf dev team", + __name__, + ) + return "L", True mode2, invert_color = _get_imagemode( color_space, color_components, prev_mode, depth + 1 ) @@ -187,7 +195,13 @@ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes: else: if img.mode == "1": # Two values ("high" and "low"). - assert len(lookup) == 2 * nb, len(lookup) + expected_count = 2 * nb + if len(lookup) != expected_count: + if len(lookup) < expected_count: + raise PdfReadError(f"Not enough lookup values: Expected {expected_count}, got {len(lookup)}.") + lookup = lookup[:expected_count] + if not all(_value in WHITESPACES for _value in lookup[expected_count:]): + raise PdfReadError(f"Too many lookup values: Expected {expected_count}, got {len(lookup)}.") colors_arr = [lookup[:nb], lookup[nb:]] arr = b"".join( [ diff --git a/pypdf/filters.py b/pypdf/filters.py index b6d198f55b..d1c06a3418 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -322,7 +322,7 @@ def decode( **kwargs: Any, ) -> bytes: """ - Decode an ASCII-Hex encoded data stream. + Decode a run length encoded data stream. 
Args: data: a bytes sequence of length/data @@ -558,23 +558,27 @@ class CCITTFaxDecode: @staticmethod def _get_parameters( - parameters: Union[None, ArrayObject, DictionaryObject], rows: int + parameters: Union[None, ArrayObject, DictionaryObject, IndirectObject], + rows: int, ) -> CCITParameters: # TABLE 3.9 Optional parameters for the CCITTFaxDecode filter k = 0 columns = 1728 if parameters: - if isinstance(parameters, ArrayObject): - for decode_parm in parameters: + parameters_unwrapped = cast( + Union[ArrayObject, DictionaryObject], parameters.get_object() + ) + if isinstance(parameters_unwrapped, ArrayObject): + for decode_parm in parameters_unwrapped: if CCITT.COLUMNS in decode_parm: columns = decode_parm[CCITT.COLUMNS] if CCITT.K in decode_parm: k = decode_parm[CCITT.K] else: - if CCITT.COLUMNS in parameters: - columns = parameters[CCITT.COLUMNS] # type: ignore - if CCITT.K in parameters: - k = parameters[CCITT.K] # type: ignore + if CCITT.COLUMNS in parameters_unwrapped: + columns = parameters_unwrapped[CCITT.COLUMNS] # type: ignore + if CCITT.K in parameters_unwrapped: + k = parameters_unwrapped[CCITT.K] # type: ignore return CCITParameters(k, columns, rows) @@ -777,8 +781,8 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, alpha = None filters = x_object_obj.get(SA.FILTER, [None]) lfilters = filters[-1] if isinstance(filters, list) else filters - if lfilters == FT.FLATE_DECODE: - img, image_format, extension, invert_color = _handle_flate( + if lfilters in (FT.FLATE_DECODE, FT.RUN_LENGTH_DECODE): + img, image_format, extension, _ = _handle_flate( size, data, mode, @@ -820,15 +824,14 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, ".png", False, ) - # CMYK image and other colorspaces without decode # requires reverting scale (cf p243,2§ last sentence) decode = x_object_obj.get( IA.DECODE, ([1.0, 0.0] * len(img.getbands())) if ( - (img.mode == "CMYK" or (invert_color and img.mode == "L")) - 
and lfilters in (FT.DCT_DECODE, FT.JPX_DECODE) + (img.mode == "CMYK" and lfilters in (FT.DCT_DECODE, FT.JPX_DECODE)) + or (invert_color and img.mode == "L") ) else None, ) diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index a50bb5faff..667d675266 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -727,7 +727,7 @@ def readFromStream( def encode_pdfdocencoding(unicode_string: str) -> bytes: - retval = b"" + retval = bytearray() for c in unicode_string: try: retval += b_(chr(_pdfdoc_encoding_rev[c])) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 4ee13a13b0..d9853a2d1e 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -87,7 +87,7 @@ IndirectPattern = re.compile(rb"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]") -class ArrayObject(list, PdfObject): +class ArrayObject(List[Any], PdfObject): def clone( self, pdf_dest: PdfWriterProtocol, @@ -174,7 +174,7 @@ def readFromStream( return ArrayObject.read_from_stream(stream, pdf) -class DictionaryObject(dict, PdfObject): +class DictionaryObject(Dict[Any, Any], PdfObject): def clone( self, pdf_dest: PdfWriterProtocol, @@ -420,7 +420,7 @@ def get_next_obj_pos( else: return get_next_obj_pos(p, p1, rem_gens[1:], pdf) - def read_unsized_from_steam( + def read_unsized_from_stream( stream: StreamType, pdf: PdfReaderProtocol ) -> bytes: # we are just pointing at beginning of the stream @@ -535,7 +535,7 @@ def read_unsized_from_steam( data["__streamdata__"] = data["__streamdata__"][:-1] elif pdf is not None and not pdf.strict: stream.seek(pstart, 0) - data["__streamdata__"] = read_unsized_from_steam(stream, pdf) + data["__streamdata__"] = read_unsized_from_stream(stream, pdf) pos = stream.tell() else: stream.seek(pos, 0) @@ -631,7 +631,7 @@ def insert_child( child: Any, before: Any, pdf: PdfWriterProtocol, - inc_parent_counter: Optional[Callable] = None, + inc_parent_counter: Optional[Callable[..., Any]] = None, ) -> IndirectObject: if 
inc_parent_counter is None: inc_parent_counter = self.inc_parent_counter_default diff --git a/pyproject.toml b/pyproject.toml index 6d3021745e..7ab3c384d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -225,6 +225,7 @@ wrap-descriptions = 0 show_error_codes = true ignore_missing_imports = true check_untyped_defs = true +disallow_any_generics = true disallow_untyped_defs = true disallow_incomplete_defs = true warn_redundant_casts = true diff --git a/requirements/ci-3.11.txt b/requirements/ci-3.11.txt index 33143f5cd3..f382fe2b94 100644 --- a/requirements/ci-3.11.txt +++ b/requirements/ci-3.11.txt @@ -69,6 +69,8 @@ pytest-timeout==2.1.0 # via -r requirements/ci.in pytest-xdist==3.3.1 # via -r requirements/ci.in +pyyaml==6.0.1 + # via -r requirements/ci.in ruff==0.0.290 # via -r requirements/ci.in typeguard==4.1.2 diff --git a/requirements/ci.in b/requirements/ci.in index ff071d125c..4c14acc41e 100644 --- a/requirements/ci.in +++ b/requirements/ci.in @@ -17,3 +17,4 @@ pytest-cov typeguard types-dataclasses types-Pillow +pyyaml diff --git a/requirements/ci.txt b/requirements/ci.txt index 0d28144261..ebb121aa68 100644 --- a/requirements/ci.txt +++ b/requirements/ci.txt @@ -83,6 +83,8 @@ pytest-timeout==2.1.0 # via -r requirements/ci.in pytest-xdist==3.0.2 # via -r requirements/ci.in +pyyaml==6.0.1 + # via -r requirements/ci.in six==1.16.0 # via flake8-print tomli==1.2.3 diff --git a/tests/__init__.py b/tests/__init__.py index c9f4dc3ddd..d81f2c94cc 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,13 +1,16 @@ +import concurrent.futures import ssl import urllib.request from pathlib import Path -from typing import List +from typing import Dict, List, Optional from urllib.error import HTTPError +import yaml + from pypdf.generic import DictionaryObject, IndirectObject -def get_data_from_url(url: str, name: str) -> bytes: +def get_data_from_url(url: Optional[str] = None, name: Optional[str] = None) -> bytes: """ Download a File from a URL and return its 
contents. @@ -22,28 +25,33 @@ def get_data_from_url(url: str, name: str) -> bytes: Returns: Read File as bytes """ - if url.startswith("file://"): - with open(url[7:].replace("\\", "/"), "rb") as fp: - return fp.read() + if name is None: + raise ValueError("A name must always be specified") + cache_dir = Path(__file__).parent / "pdf_cache" if not cache_dir.exists(): cache_dir.mkdir() cache_path = cache_dir / name - if not cache_path.exists(): - ssl._create_default_https_context = ssl._create_unverified_context - cpt = 3 - while cpt > 0: - try: - with urllib.request.urlopen( # noqa: S310 - url - ) as response, cache_path.open("wb") as out_file: - out_file.write(response.read()) - cpt = 0 - except HTTPError as e: - if cpt > 0: - cpt -= 1 - else: - raise e + + if url is not None: + if url.startswith("file://"): + with open(url[7:].replace("\\", "/"), "rb") as fp: + return fp.read() + if not cache_path.exists(): + ssl._create_default_https_context = ssl._create_unverified_context + cpt = 3 + while cpt > 0: + try: + with urllib.request.urlopen( # noqa: S310 + url + ) as response, cache_path.open("wb") as out_file: + out_file.write(response.read()) + cpt = 0 + except HTTPError as e: + if cpt > 0: + cpt -= 1 + else: + raise e with open(cache_path, "rb") as fp: data = fp.read() return data @@ -106,12 +114,32 @@ def is_sublist(child_list, parent_list): return is_sublist(child_list, parent_list[1:]) +def read_yaml_to_list_of_dicts(yaml_file: Path) -> List[Dict[str, str]]: + with open(yaml_file) as yaml_input: + data = yaml.safe_load(yaml_input) + return data + + def download_test_pdfs(): """ Run this before the tests are executed to ensure you have everything locally. This is especially important to avoid pytest timeouts. 
""" - pdfs = [("https://arxiv.org/pdf/2201.00214.pdf", "2201.00214.pdf")] - for url, name in pdfs: - get_data_from_url(url, name=name) + pdfs = read_yaml_to_list_of_dicts(Path(__file__).parent / "example_files.yaml") + + with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor: + futures = [ + executor.submit(get_data_from_url, pdf["url"], name=pdf["local_filename"]) + for pdf in pdfs + ] + concurrent.futures.wait(futures) + + +def test_csv_consistency(): + pdfs = read_yaml_to_list_of_dicts(Path(__file__).parent / "example_files.csv") + # Ensure the names are unique + assert len(pdfs) == len({pdf["name"] for pdf in pdfs}) + + # Ensure the urls are unique + assert len(pdfs) == len({pdf["url"] for pdf in pdfs}) diff --git a/tests/example_files.yaml b/tests/example_files.yaml new file mode 100644 index 0000000000..f12a784449 --- /dev/null +++ b/tests/example_files.yaml @@ -0,0 +1,112 @@ +- local_filename: 2201.00214.pdf + url: https://arxiv.org/pdf/2201.00214.pdf +- local_filename: ASurveyofImageClassificationBasedTechniques.pdf + url: https://raw.githubusercontent.com/xyegithub/myBlog/12127c712ac2008782616c743224b187a4069477/posts/c94b2364/paper_pdfs/ImageClassification/2007%2CASurveyofImageClassificationBasedTechniques.pdf +- local_filename: Giacalone.pdf + url: https://github.com/yxj-HGNwmb5kdp8ewr/yxj-HGNwmb5kdp8ewr.github.io/raw/master/files/Giacalone%20Llobell%20Jaeger%20(2022)%20Food%20Qual%20Prefer.pdf +- local_filename: iss1718.pdf + url: https://github.com/py-pdf/pypdf/files/10983477/Ballinasloe_WS.pdf +- local_filename: iss2077.pdf + url: https://github.com/py-pdf/pypdf/files/12309492/example_134.pdf +- local_filename: pdf_font_garbled.pdf + url: https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf +- local_filename: The%20lean%20times%20in%20the%20Peruvian%20economy.pdf + url: https://github.com/alexanderquispe/1REI05/raw/main/reports/report_1/The%20lean%20times%20in%20the%20Peruvian%20economy.pdf +- local_filename: 
tika-908104.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/908/908104.pdf +- local_filename: tika-923406.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/923/923406.pdf +- local_filename: tika-955562.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/955/955562.pdf +- local_filename: tika-959173.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/959/959173.pdf +- local_filename: waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf + url: https://github.com/py-pdf/pypdf/files/10773829/waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf +- local_filename: tika-957144.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/957/957144.pdf +- local_filename: ascii charset.pdf + url: https://github.com/py-pdf/pypdf/files/9472500/main.pdf +- local_filename: cmap1370.pdf + url: https://github.com/py-pdf/pypdf/files/9667138/cmap1370.pdf +- local_filename: 02voc.pdf + url: https://github.com/py-pdf/pypdf/files/9712729/02voc.pdf +- local_filename: iss1533.pdf + url: https://github.com/py-pdf/pypdf/files/10376149/iss1533.pdf +- local_filename: tstUCS2.pdf + url: https://github.com/py-pdf/pypdf/files/11190189/pdf_font_garbled.pdf +- local_filename: tst-GBK_EUC.pdf + url: https://github.com/py-pdf/pypdf/files/11315397/3.pdf +- local_filename: math_latex.pdf + url: https://github.com/py-pdf/pypdf/files/12163370/math-in-text-created-via-latex.pdf +- local_filename: unixxx_glyphs.pdf + url: https://arxiv.org/pdf/2201.00021.pdf +- local_filename: TextAttack_paper.pdf + url: https://arxiv.org/pdf/2005.05909.pdf +- local_filename: iss2173.pdf + url: https://github.com/py-pdf/pypdf/files/12552700/tt.pdf +- local_filename: iss2290.pdf + url: https://github.com/py-pdf/pypdf/files/13452885/example.pdf +- local_filename: NewJersey.pdf + url: https://github.com/py-pdf/pypdf/files/12090692/New.Jersey.Coinbase.staking.securities.charges.2023-0606_Coinbase-Penalty-and-C-D.pdf +- local_filename: tika-952445.pdf + url: 
https://corpora.tika.apache.org/base/docs/govdocs1/952/952445.pdf +- local_filename: tika-921632.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/921/921632.pdf +- local_filename: tika-976970.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/976/976970.pdf +- local_filename: tika-914102.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/914/914102.pdf +- local_filename: iss1737.pdf + url: https://github.com/py-pdf/pypdf/files/11068604/tt1.pdf +- local_filename: issue-1801.pdf + url: https://github.com/py-pdf/pypdf/files/11250359/test_img.pdf +- local_filename: tika-924546.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf +- local_filename: tika-924546.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf +- local_filename: issue-1801.png + url: https://user-images.githubusercontent.com/1658117/232842886-9d1b0726-3a5b-430d-8464-595d919c266c.png +- local_filename: grimm10 + url: https://github.com/py-pdf/pypdf/files/11336817/grimm10.pdf +- local_filename: labeled-edges-center-image.png + url: https://user-images.githubusercontent.com/4083478/236685544-a1940b06-fb42-4bb1-b589-1e4ad429d68e.png +- local_filename: pdf_font_garbled.pdf + url: https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf +- local_filename: watermark1.png + url: https://user-images.githubusercontent.com/4083478/236793172-09340aef-3440-4c8a-af85-a91cdad27d46.png +- local_filename: tika-977609.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/977/977609.pdf +- local_filename: tifimage.png + url: https://user-images.githubusercontent.com/4083478/236793166-288b4b59-dee3-49fd-a04e-410aab06199a.png +- local_filename: tika-972174.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/972/972174.pdf +- local_filename: tika-972174_p0-im0.png + url: https://user-images.githubusercontent.com/4083478/238288207-b77dd38c-34b4-4f4f-810a-bf9db7ca0414.png +- local_filename: Vitocal.pdf + url: 
https://github.com/py-pdf/pypdf/files/11962229/DB-5368770_Vitocal_200-G.pdf +- local_filename: VitocalImage.png + url: https://user-images.githubusercontent.com/4083478/251283945-38c5b92c-cf94-473c-bb57-a51b74fc39be.jpg +- local_filename: cmyk_deflate.pdf + url: https://github.com/py-pdf/pypdf/files/12078533/cmyk2.pdf +- local_filename: cmyk_deflate.tif + url: https://github.com/py-pdf/pypdf/files/12078556/cmyk.tif.txt +- local_filename: o1whh9b3.pdf + url: https://github.com/py-pdf/pypdf/files/11578953/USC.EMBA.-.Pre-Season.and.Theme.I.pdf +- local_filename: selbst.72916.pdf + url: https://www.selbst.de/paidcontent/dl/64733/72916 +- local_filename: iss1912.pdf + url: https://github.com/py-pdf/pypdf/files/11845099/GeoTopo-komprimiert.pdf +- local_filename: calRGB.pdf + url: https://github.com/py-pdf/pypdf/files/12061061/tt.pdf +- local_filename: 2023USDC.pdf + url: https://github.com/py-pdf/pypdf/files/12090523/2023.USDC_Circle.Examination.Report.May.2023.pdf +- local_filename: iss1982_im1.png + url: https://github.com/py-pdf/pypdf/files/12144094/im1.png.txt +- local_filename: iss1982_im2.png + url: https://github.com/py-pdf/pypdf/files/12144093/im2.png.txt +- local_filename: tika-972174.pdf + url: https://corpora.tika.apache.org/base/docs/govdocs1/972/972174.pdf +- local_filename: usa.png + url: https://github.com/py-pdf/pypdf/assets/4083478/56c93021-33cd-4387-ae13-5cbe7e673f42 +- local_filename: paid.pdf + url: https://github.com/py-pdf/pypdf/files/12050253/tt.pdf diff --git a/tests/test_cmap.py b/tests/test_cmap.py index 09cac1bb59..3c11508950 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -17,24 +17,24 @@ [ # compute_space_width: ( - "https://corpora.tika.apache.org/base/docs/govdocs1/923/923406.pdf", + None, "tika-923406.pdf", False, ), # _parse_to_unicode_process_rg: ( - "https://corpora.tika.apache.org/base/docs/govdocs1/959/959173.pdf", + None, "tika-959173.pdf", False, ), ( - "https://corpora.tika.apache.org/base/docs/govdocs1/959/959173.pdf", 
+ None, "tika-959173.pdf", True, ), # issue #1718: ( - "https://github.com/py-pdf/pypdf/files/10983477/Ballinasloe_WS.pdf", + None, "iss1718.pdf", False, ), @@ -53,27 +53,24 @@ def test_text_extraction_slow(caplog, url: str, name: str, strict: bool): [ # bfchar_on_2_chars: issue #1293 ( - "https://raw.githubusercontent.com/xyegithub/myBlog/12127c712ac2008782616c743224b187a4069477/posts/" - "c94b2364/paper_pdfs/ImageClassification/2007%2CASurveyofImageClassificationBasedTechniques.pdf", + None, "ASurveyofImageClassificationBasedTechniques.pdf", False, ), # L40, get_font_width_from_default ( - "https://corpora.tika.apache.org/base/docs/govdocs1/908/908104.pdf", + None, "tika-908104.pdf", False, ), # multiline_bfrange / regression test for issue #1285: ( - "https://github.com/alexanderquispe/1REI05/raw/main/reports/report_1/" - "The%20lean%20times%20in%20the%20Peruvian%20economy.pdf", + None, "The%20lean%20times%20in%20the%20Peruvian%20economy.pdf", False, ), ( - "https://github.com/yxj-HGNwmb5kdp8ewr/yxj-HGNwmb5kdp8ewr.github.io/raw/master/files/" - "Giacalone%20Llobell%20Jaeger%20(2022)%20Food%20Qual%20Prefer.pdf", + None, "Giacalone.pdf", False, ), @@ -89,10 +86,7 @@ def test_text_extraction_fast(caplog, url: str, name: str, strict: bool): @pytest.mark.enable_socket() def test_parse_encoding_advanced_encoding_not_implemented(): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/957/957144.pdf" - name = "tika-957144.pdf" - - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="tika-957144.pdf"))) with pytest.warns(PdfReadWarning, match="Advanced encoding .* not implemented yet"): for page in reader.pages: page.extract_text() @@ -100,10 +94,8 @@ def test_parse_encoding_advanced_encoding_not_implemented(): @pytest.mark.enable_socket() def test_ascii_charset(): - # iss #1312 - url = "https://github.com/py-pdf/pypdf/files/9472500/main.pdf" - name = "ascii charset.pdf" - reader = 
PdfReader(BytesIO(get_data_from_url(url, name=name))) + # Issue #1312 + reader = PdfReader(BytesIO(get_data_from_url(name="ascii charset.pdf"))) assert "/a" not in reader.pages[0].extract_text() @@ -112,13 +104,13 @@ def test_ascii_charset(): ("url", "name", "page_nb", "within_text"), [ ( - "https://github.com/py-pdf/pypdf/files/9667138/cmap1370.pdf", + None, "cmap1370.pdf", 0, "", ), ( - "https://github.com/py-pdf/pypdf/files/9712729/02voc.pdf", + None, "02voc.pdf", 2, "Document delineation and character sequence decoding", @@ -135,9 +127,7 @@ def test_text_extraction_of_specific_pages( @pytest.mark.enable_socket() def test_iss1533(): - url = "https://github.com/py-pdf/pypdf/files/10376149/iss1533.pdf" - name = "iss1533.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="iss1533.pdf"))) reader.pages[0].extract_text() # no error assert build_char_map("/F", 200, reader.pages[0])[3]["\x01"] == "Ü" @@ -147,14 +137,14 @@ def test_iss1533(): ("url", "name", "page_index", "within_text", "caplog_text"), [ ( - "https://github.com/py-pdf/pypdf/files/11190189/pdf_font_garbled.pdf", + None, "tstUCS2.pdf", 1, ["2 / 12", "S0490520090001", "于博"], "", ), ( - "https://github.com/py-pdf/pypdf/files/11315397/3.pdf", + None, "tst-GBK_EUC.pdf", 0, ["NJA", "中华男科学杂志"], @@ -172,9 +162,7 @@ def test_cmap_encodings(caplog, url, name, page_index, within_text, caplog_text) @pytest.mark.enable_socket() def test_latex(): - url = "https://github.com/py-pdf/pypdf/files/12163370/math-in-text-created-via-latex.pdf" - name = "math_latex.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="math_latex.pdf"))) txt = reader.pages[0].extract_text() # no error for pat in ("α", "β", "γ", "ϕ", "φ", "ℏ", "∫", "∂", "·", "×"): assert pat in txt @@ -183,9 +171,7 @@ def test_latex(): @pytest.mark.enable_socket() def test_unixxx_glyphs(): - url = 
"https://arxiv.org/pdf/2201.00021.pdf" - name = "unixxx_glyphs.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="unixxx_glyphs.pdf"))) txt = reader.pages[0].extract_text() # no error for pat in ("闫耀庭", "龚龑", "张江水", "1′′.2"): assert pat in txt @@ -195,18 +181,22 @@ def test_unixxx_glyphs(): def test_cmap_compute_space_width(): # issue 2137 # original file URL: - url = "https://arxiv.org/pdf/2005.05909.pdf" + # url = "https://arxiv.org/pdf/2005.05909.pdf" # URL from github issue is too long to pass code stype check, use original arxiv URL instead # url = "https://github.com/py-pdf/pypdf/files/12489914/Morris.et.al.-.2020.-.TextAttack.A.Framework.for.Adversarial.Attacks.Data.Augmentation.and.Adversarial.Training.in.NLP.pdf" - name = "TextAttack_paper.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="TextAttack_paper.pdf"))) reader.pages[0].extract_text() # no error @pytest.mark.enable_socket() def test_tabs_in_cmap(): """Issue #2173""" - url = "https://github.com/py-pdf/pypdf/files/12552700/tt.pdf" - name = "iss2173.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="iss2173.pdf"))) + reader.pages[0].extract_text() + + +@pytest.mark.enable_socket() +def test_ignoring_non_put_entries(): + """Issue #2290""" + reader = PdfReader(BytesIO(get_data_from_url(name="iss2290.pdf"))) reader.pages[0].extract_text() diff --git a/tests/test_filters.py b/tests/test_filters.py index 00a548ab00..873e7a957d 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -142,9 +142,7 @@ def test_decode_ahx(): See #1979 Gray Image in CMYK : requiring reverse """ - url = "https://github.com/py-pdf/pypdf/files/12090692/New.Jersey.Coinbase.staking.securities.charges.2023-0606_Coinbase-Penalty-and-C-D.pdf" - name = "NewJersey.pdf" - reader = 
PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="NewJersey.pdf"))) for p in reader.pages: _ = list(p.images.keys()) @@ -231,9 +229,7 @@ def test_ccitt_fax_decode(): @pytest.mark.enable_socket() @patch("pypdf._reader.logger_warning") def test_decompress_zlib_error(mock_logger_warning): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/952/952445.pdf" - name = "tika-952445.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="tika-952445.pdf"))) for page in reader.pages: page.extract_text() mock_logger_warning.assert_called_with( @@ -243,9 +239,7 @@ def test_decompress_zlib_error(mock_logger_warning): @pytest.mark.enable_socket() def test_lzw_decode_neg1(): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/921/921632.pdf" - name = "tika-921632.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="tika-921632.pdf"))) page = reader.pages[47] with pytest.raises(PdfReadError) as exc: page.extract_text() @@ -254,17 +248,13 @@ def test_lzw_decode_neg1(): @pytest.mark.enable_socket() def test_issue_399(): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/976/976970.pdf" - name = "tika-976970.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="tika-976970.pdf"))) reader.pages[1].extract_text() @pytest.mark.enable_socket() def test_image_without_pillow(tmp_path): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/914/914102.pdf" name = "tika-914102.pdf" - _ = get_data_from_url(url, name=name) pdf_path = Path(__file__).parent / "pdf_cache" / name pdf_path_str = str(pdf_path.resolve()).replace("\\", "/") @@ -304,9 +294,7 @@ def test_image_without_pillow(tmp_path): @pytest.mark.enable_socket() def test_issue_1737(): - url = 
"https://github.com/py-pdf/pypdf/files/11068604/tt1.pdf" - name = "iss1737.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="iss1737.pdf"))) reader.pages[0]["/Resources"]["/XObject"]["/Im0"].get_data() reader.pages[0]["/Resources"]["/XObject"]["/Im1"].get_data() reader.pages[0]["/Resources"]["/XObject"]["/Im2"].get_data() @@ -319,9 +307,7 @@ def test_pa_image_extraction(): This is a regression test for issue #1801 """ - url = "https://github.com/py-pdf/pypdf/files/11250359/test_img.pdf" - name = "issue-1801.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="issue-1801.pdf"))) page0 = reader.pages[0] images = page0.images @@ -329,20 +315,14 @@ def test_pa_image_extraction(): assert images[0].name == "Im1.png" # Ensure visual appearence - data = get_data_from_url( - "https://user-images.githubusercontent.com/" - "1658117/232842886-9d1b0726-3a5b-430d-8464-595d919c266c.png", - "issue-1801.png", - ) + data = get_data_from_url(name="issue-1801.png") assert data == images[0].data @pytest.mark.enable_socket() def test_1bit_image_extraction(): """Cf issue #1814""" - url = "https://github.com/py-pdf/pypdf/files/11336817/grimm10.pdf" - name = "grimm10" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="grimm10"))) for p in reader.pages: p.images @@ -352,9 +332,9 @@ def test_png_transparency_reverse(): """Cf issue #1599""" pdf_path = RESOURCE_ROOT / "labeled-edges-center-image.pdf" reader = PdfReader(pdf_path) - url_png = "https://user-images.githubusercontent.com/4083478/236685544-a1940b06-fb42-4bb1-b589-1e4ad429d68e.png" - name_png = "labeled-edges-center-image.png" - _refimg = Image.open(BytesIO(get_data_from_url(url_png, name=name_png))) + _refimg = Image.open( + BytesIO(get_data_from_url(name="labeled-edges-center-image.png")) + ) data = reader.pages[0].images[0] 
_img = Image.open(BytesIO(data.data)) assert ".jp2" in data.name @@ -364,12 +344,8 @@ def test_png_transparency_reverse(): @pytest.mark.enable_socket() def test_iss1787(): """Cf issue #1787""" - url = "https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf" - name = "pdf_font_garbled.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - url_png = "https://user-images.githubusercontent.com/4083478/236793172-09340aef-3440-4c8a-af85-a91cdad27d46.png" - name_png = "watermark1.png" - refimg = Image.open(BytesIO(get_data_from_url(url_png, name=name_png))) + reader = PdfReader(BytesIO(get_data_from_url(name="pdf_font_garbled.pdf"))) + refimg = Image.open(BytesIO(get_data_from_url(name="watermark1.png"))) data = reader.pages[0].images[0] img = Image.open(BytesIO(data.data)) assert ".png" in data.name @@ -385,12 +361,8 @@ def test_iss1787(): @pytest.mark.enable_socket() def test_tiff_predictor(): """Decode Tiff Predictor 2 Images""" - url = "https://corpora.tika.apache.org/base/docs/govdocs1/977/977609.pdf" - name = "tika-977609.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - url_png = "https://user-images.githubusercontent.com/4083478/236793166-288b4b59-dee3-49fd-a04e-410aab06199a.png" - name_png = "tifimage.png" - refimg = Image.open(BytesIO(get_data_from_url(url_png, name=name_png))) + reader = PdfReader(BytesIO(get_data_from_url(name="tika-977609.pdf"))) + refimg = Image.open(BytesIO(get_data_from_url(name="tifimage.png"))) data = reader.pages[0].images[0] img = Image.open(BytesIO(data.data)) assert ".png" in data.name @@ -400,15 +372,11 @@ def test_tiff_predictor(): @pytest.mark.enable_socket() def test_rgba(): """Decode rgb with transparency""" - url = "https://corpora.tika.apache.org/base/docs/govdocs1/972/972174.pdf" - name = "tika-972174.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - url_png = 
"https://user-images.githubusercontent.com/4083478/238288207-b77dd38c-34b4-4f4f-810a-bf9db7ca0414.png" - name_png = "tika-972174_p0-im0.png" + reader = PdfReader(BytesIO(get_data_from_url(name="tika-972174.pdf"))) data = reader.pages[0].images[0] assert ".jp2" in data.name similarity = image_similarity( - data.image, BytesIO(get_data_from_url(url_png, name=name_png)) + data.image, BytesIO(get_data_from_url(name="tika-972174_p0-im0.png")) ) assert similarity > 0.99 @@ -421,23 +389,15 @@ def test_cmyk(): from Crypto.Cipher import AES # noqa: F401 except ImportError: return # the file is encrypted - url = "https://github.com/py-pdf/pypdf/files/11962229/DB-5368770_Vitocal_200-G.pdf" - name = "Vitocal.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - url_png = "https://user-images.githubusercontent.com/4083478/251283945-38c5b92c-cf94-473c-bb57-a51b74fc39be.jpg" - name_png = "VitocalImage.png" - refimg = BytesIO(get_data_from_url(url_png, name=name_png)) + reader = PdfReader(BytesIO(get_data_from_url(name="Vitocal.pdf"))) + refimg = BytesIO(get_data_from_url(name="VitocalImage.png")) data = reader.pages[1].images[0] assert data.image.mode == "CMYK" assert ".jpg" in data.name assert image_similarity(data.image, refimg) > 0.99 # deflate - url = "https://github.com/py-pdf/pypdf/files/12078533/cmyk2.pdf" - name = "cmyk_deflate.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - url_png = "https://github.com/py-pdf/pypdf/files/12078556/cmyk.tif.txt" - name_png = "cmyk_deflate.tif" - refimg = BytesIO(get_data_from_url(url_png, name=name_png)) + reader = PdfReader(BytesIO(get_data_from_url(name="cmyk_deflate.pdf"))) + refimg = BytesIO(get_data_from_url(name="cmyk_deflate.tif")) data = reader.pages[0].images[0] assert data.image.mode == "CMYK" assert ".tif" in data.name @@ -447,9 +407,7 @@ def test_cmyk(): @pytest.mark.enable_socket() def test_iss1863(): """Test doc from iss1863""" - url = 
"https://github.com/py-pdf/pypdf/files/11578953/USC.EMBA.-.Pre-Season.and.Theme.I.pdf" - name = "o1whh9b3.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="o1whh9b3.pdf"))) for p in reader.pages: for i in p.images: i.name @@ -457,9 +415,7 @@ def test_iss1863(): @pytest.mark.enable_socket() def test_read_images(): - url = "https://www.selbst.de/paidcontent/dl/64733/72916" - name = "selbst.72916.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="selbst.72916.pdf"))) page = reader.pages[0] for _ in page.images: pass @@ -467,9 +423,7 @@ def test_read_images(): @pytest.mark.enable_socket() def test_cascaded_filters_images(): - url = "https://github.com/py-pdf/pypdf/files/11845099/GeoTopo-komprimiert.pdf" - name = "iss1912.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="iss1912.pdf"))) # for focus, analyse the page 23 for p in reader.pages: for i in p.images: @@ -478,40 +432,28 @@ def test_cascaded_filters_images(): @pytest.mark.enable_socket() def test_calrgb(): - url = "https://github.com/py-pdf/pypdf/files/12061061/tt.pdf" - name = "calRGB.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="calRGB.pdf"))) reader.pages[0].images[0] @pytest.mark.enable_socket() def test_index_lookup(): """The lookup is provided as an str and bytes""" - url = "https://github.com/py-pdf/pypdf/files/12090523/2023.USDC_Circle.Examination.Report.May.2023.pdf" - name = "2023USDC.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="2023USDC.pdf"))) # TextStringObject Lookup - url_png = "https://github.com/py-pdf/pypdf/files/12144094/im1.png.txt" - name_png = "iss1982_im1.png" - refimg = BytesIO(get_data_from_url(url_png, name=name_png)) + 
refimg = BytesIO(get_data_from_url(name="iss1982_im1.png")) data = reader.pages[0].images[-1] assert data.image.mode == "RGB" assert image_similarity(data.image, refimg) > 0.999 # ByteStringObject Lookup - url_png = "https://github.com/py-pdf/pypdf/files/12144093/im2.png.txt" - name_png = "iss1982_im2.png" - refimg = BytesIO(get_data_from_url(url_png, name=name_png)) + refimg = BytesIO(get_data_from_url(name="iss1982_im2.png")) data = reader.pages[-1].images[-1] assert data.image.mode == "RGB" assert image_similarity(data.image, refimg) > 0.999 # indexed CMYK images # currently with a TODO as we convert to RBG the palette - url = "https://corpora.tika.apache.org/base/docs/govdocs1/972/972174.pdf" - name = "tika-972174.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - url_png = "https://github.com/py-pdf/pypdf/assets/4083478/56c93021-33cd-4387-ae13-5cbe7e673f42" - name_png = "usa.png" - refimg = Image.open(BytesIO(get_data_from_url(url_png, name=name_png))) + reader = PdfReader(BytesIO(get_data_from_url(name="tika-972174.pdf"))) + refimg = Image.open(BytesIO(get_data_from_url(name="usa.png"))) data = reader.pages[0].images["/Im3"] # assert data.image.mode == "PA" but currently "RGBA" assert image_similarity(data.image, refimg) > 0.999 @@ -520,9 +462,7 @@ def test_index_lookup(): @pytest.mark.enable_socket() def test_2bits_image(): """From #1954, test with 2bits image. 
TODO: 4bits also""" - url = "https://github.com/py-pdf/pypdf/files/12050253/tt.pdf" - name = "paid.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="paid.pdf"))) url_png = "https://user-images.githubusercontent.com/4083478/253568117-ca95cc85-9dea-4145-a5e0-032f1c1aa322.png" name_png = "Paid.png" refimg = BytesIO(get_data_from_url(url_png, name=name_png)) @@ -649,3 +589,12 @@ def test_flate_decode_with_image_mode_1(): reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) for image in reader.pages[7].images: _ = image + + +@pytest.mark.enable_socket() +def test_flate_decode_with_image_mode_1__whitespace_at_end_of_lookup(): + """From #2331""" + url = "https://github.com/py-pdf/pypdf/files/13611048/out1.pdf" + name = "issue2331.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader.pages[0].images[0] diff --git a/tests/test_images.py b/tests/test_images.py index 9e03c9f352..3e9e8a034e 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -105,9 +105,8 @@ def test_image_similarity_mid(): @pytest.mark.enable_socket() def test_image_new_property(): - url = "https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf" name = "pdf_font_garbled.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name=name))) assert reader.pages[0].images.keys() == [ "/I0", "/I1", @@ -215,8 +214,22 @@ def test_image_extraction(src, page_index, image_key, expected): @pytest.mark.timeout(30) def test_loop_in_image_keys(): """Cf #2077""" - url = "https://github.com/py-pdf/pypdf/files/12309492/example_134.pdf" - name = "iss2077.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="iss2077.pdf"))) reader.pages[0]["/Resources"]["/XObject"][NameObject("/toto")] = NullObject() reader.pages[0].images.keys() + + +@pytest.mark.enable_socket() +def 
test_devicen_cmyk_black_only(): + """Cf #2321""" + url = "https://github.com/py-pdf/pypdf/files/13501846/Addressing_Adversarial_Attacks.pdf" + name = "iss2321.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + url = "https://github.com/py-pdf/pypdf/assets/4083478/cc2dabc1-86e6-4179-a8a4-2b0efea124be" + name = "iss2321_img0.pdf" + img = Image.open(BytesIO(get_data_from_url(url, name=name))) + assert image_similarity(reader.pages[5].images[0].image, img) >= 0.99 + url = "https://github.com/py-pdf/pypdf/assets/4083478/6b64a949-42be-40d5-9eea-95707f350d89" + name = "iss2321_img1.pdf" + img = Image.open(BytesIO(get_data_from_url(url, name=name))) + assert image_similarity(reader.pages[10].images[0].image, img) >= 0.99 diff --git a/tests/test_page.py b/tests/test_page.py index 8df25ad465..1c388c426a 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -129,9 +129,11 @@ def test_page_operations(pdf_path, password): (175, 680, 844), (45, 994, 994), (-80, 888, 742), - ] + ], ) -def test_mediabox_expansion_after_rotation(angle: float, expected_width: int, expected_height: int): +def test_mediabox_expansion_after_rotation( + angle: float, expected_width: int, expected_height: int +): """ Mediabox dimensions after rotation at a non-right angle with expension are correct. 
diff --git a/tests/test_page_labels.py b/tests/test_page_labels.py index 8b2e11b0da..1eb6f6aab4 100644 --- a/tests/test_page_labels.py +++ b/tests/test_page_labels.py @@ -70,9 +70,8 @@ def test_number2uppercase_letter(): @pytest.mark.enable_socket() def test_index2label(caplog): - url = "https://github.com/py-pdf/pypdf/files/10773829/waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf" name = "waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf" - r = PdfReader(BytesIO(get_data_from_url(url, name=name))) + r = PdfReader(BytesIO(get_data_from_url(name=name))) assert index2label(r, 1) == "ii" assert index2label(r, 9) == "6" # very silly data to get test cover diff --git a/tests/test_reader.py b/tests/test_reader.py index b252e48f90..555a3b2fe7 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1027,17 +1027,13 @@ def test_header(src, pdf_header): @pytest.mark.enable_socket() def test_outline_color(): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf" - name = "tika-924546.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="tika-924546.pdf"))) assert reader.outline[0].color == [0, 0, 1] @pytest.mark.enable_socket() def test_outline_font_format(): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf" - name = "tika-924546.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(name="tika-924546.pdf"))) assert reader.outline[0].font_format == 2 diff --git a/tests/test_xmp.py b/tests/test_xmp.py index e01e5c6fde..f864a9df9d 100644 --- a/tests/test_xmp.py +++ b/tests/test_xmp.py @@ -118,7 +118,7 @@ def test_identity_function(x): ("url", "name", "xmpmm_instance_id"), [ ( - "https://corpora.tika.apache.org/base/docs/govdocs1/955/955562.pdf", + None, "tika-955562.pdf", "uuid:ca96e032-c2af-49bd-a71c-95889bafbf1d", )