Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Fixed CMYK images crashing PNG encoding in PyPDF reader
PNG doesn't support all color modes (e.g. CMYK from print-oriented
PDFs). When Pillow raises an OSError containing "cannot write mode"
while saving as PNG, fall back to converting the image to RGB and
re-encoding it as PNG. Any other OSError is re-raised unchanged.

Made-with: Cursor
  • Loading branch information
jamesbraza committed Mar 4, 2026
commit a0c25caa85bc67c2623297ad0dd6f0ab0fe07b69
10 changes: 9 additions & 1 deletion packages/paper-qa-pypdf/src/paperqa_pypdf/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,15 @@ def parse_pdf_to_pages( # noqa: PLR0912
# Re-encode as PNG because the image may be in a
# format LLM providers reject (e.g. JPEG2000)
buf = io.BytesIO()
pil_image.save(buf, format="PNG")
try:
pil_image.save(buf, format="PNG")
except OSError as exc:
if "cannot write mode" not in str(exc):
raise # Don't swallow unrelated IO errors
# PNG doesn't support all color modes (e.g. CMYK
# from print-oriented PDFs), so fall back to RGB
buf = io.BytesIO() # Reset after partial write
pil_image.convert("RGB").save(buf, format="PNG")
data = buf.getvalue()
media_metadata = {
"type": "picture",
Expand Down
17 changes: 11 additions & 6 deletions packages/paper-qa-pypdf/tests/test_paperqa_pypdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,16 +344,20 @@ def test_clustering() -> None:


@pytest.mark.parametrize(
"img_format",
("img_mode", "img_format", "expected_mode"),
[
pytest.param("BMP", id="non_png_re_encodes"),
pytest.param("PNG", id="png_passthrough"),
pytest.param("RGB", "BMP", "RGB", id="non_png_re_encodes"),
pytest.param("RGB", "PNG", "RGB", id="png_passthrough"),
pytest.param("CMYK", "TIFF", "RGB", id="cmyk_converts_to_rgb"),
pytest.param("L", "BMP", "L", id="grayscale_preserves_mode"),
],
)
def test_individual_mode_outputs_png(img_format: str) -> None:
# Form an image in the input format
def test_individual_mode_outputs_png(
img_mode: str, img_format: str, expected_mode: str
) -> None:
# Form an image in the input format (and mode)
raw_buf = io.BytesIO()
Image.new("RGB", (4, 4), "red").save(raw_buf, format=img_format)
Image.new(img_mode, (4, 4)).save(raw_buf, format=img_format)
raw_bytes = raw_buf.getvalue()
mock_img_obj = SimpleNamespace(
image=Image.open(io.BytesIO(raw_bytes)), data=raw_bytes
Expand All @@ -377,6 +381,7 @@ def test_individual_mode_outputs_png(img_format: str) -> None:
result_image = Image.open(io.BytesIO(media.data))
assert result_image.format == "PNG"
assert result_image.size == (4, 4)
assert result_image.mode == expected_mode


class TestMediaMode:
Expand Down
Loading