Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion packages/paper-qa-pypdf/src/paperqa_pypdf/reader.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -320,7 +320,15 @@ def parse_pdf_to_pages( # noqa: PLR0912
# Re-encode as PNG because the image may be in a
# format LLM providers reject (e.g. JPEG2000)
buf = io.BytesIO()
pil_image.save(buf, format="PNG")
try:
pil_image.save(buf, format="PNG")
except OSError as exc:
if "cannot write mode" not in str(exc):
raise # Don't swallow unrelated IO errors
# PNG doesn't support all color modes (e.g. CMYK
# from print-oriented PDFs), so fall back to RGB
buf = io.BytesIO() # Reset after partial write
pil_image.convert("RGB").save(buf, format="PNG")
data = buf.getvalue()
media_metadata = {
"type": "picture",
Expand Down
17 changes: 11 additions & 6 deletions packages/paper-qa-pypdf/tests/test_paperqa_pypdf.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -344,16 +344,20 @@ def test_clustering() -> None:


@pytest.mark.parametrize(
"img_format",
("img_mode", "img_format", "expected_mode"),
[
pytest.param("BMP", id="non_png_re_encodes"),
pytest.param("PNG", id="png_passthrough"),
pytest.param("RGB", "BMP", "RGB", id="non_png_re_encodes"),
pytest.param("RGB", "PNG", "RGB", id="png_passthrough"),
pytest.param("CMYK", "TIFF", "RGB", id="cmyk_converts_to_rgb"),
pytest.param("L", "BMP", "L", id="grayscale_preserves_mode"),
],
)
def test_individual_mode_outputs_png(img_format: str) -> None:
# Form an image in the input format
def test_individual_mode_outputs_png(
img_mode: str, img_format: str, expected_mode: str
) -> None:
# Form an image in the input format (and mode)
raw_buf = io.BytesIO()
Image.new("RGB", (4, 4), "red").save(raw_buf, format=img_format)
Image.new(img_mode, (4, 4)).save(raw_buf, format=img_format)
raw_bytes = raw_buf.getvalue()
mock_img_obj = SimpleNamespace(
image=Image.open(io.BytesIO(raw_bytes)), data=raw_bytes
Expand All @@ -377,6 +381,7 @@ def test_individual_mode_outputs_png(img_format: str) -> None:
result_image = Image.open(io.BytesIO(media.data))
assert result_image.format == "PNG"
assert result_image.size == (4, 4)
assert result_image.mode == expected_mode


class TestMediaMode:
Expand Down
Loading