Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion packages/paper-qa-pymupdf/src/paperqa_pymupdf/reader.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import re

import pymupdf
from paperqa.types import ParsedMedia, ParsedMetadata, ParsedText
Expand Down Expand Up @@ -34,6 +35,14 @@ def setup_pymupdf_python_logging() -> None:
"yres",
}

# On 9/14/2025, a stripped `pymupdf.table.Table.to_markdown` call returned:
# '|Col1|Col2|Col3|Col4|Col5|Col6|Col7|Col8|\n|---|---|---|---|---|---|---|---|\n||\x02\x03<br>|\x04\x05\x06\x07\x08<br> <br>|\x07\x08\x08<br>\n\x08<br>\x0e\x0f<br>\x17\x18\x18\x08<br>|\x02<br>\x0c\x10<br>\x11<br>\x19\r\x02\x1a\x00\x01\x02\x03<br>|\x11<br>\x12\x06\x05<br>\x0e\x13\x14\x15<br>\x04\x05\x06\x07<br>|\x05\x08<br>\x0c\x10<br>\x12\x06\x05<br>\x0e\x16\x13<br>|\x05\x08<br>\x0c\x10<br>\x12\x06\x05<br>\x0e\x16\x13<br>|' # noqa: E501, W505
# This garbage led to `asyncpg==0.30.0` with a PostgreSQL 15 DB throwing:
# > asyncpg.exceptions.CharacterNotInRepertoireError:
# > invalid byte sequence for encoding "UTF8": 0x00
# Thus, this regex exists to reject table markdown exports containing invalid
# chars (currently only the null byte, \x00)
_INVALID_MD_CHARS = re.compile(r"\x00")


def parse_pdf_to_pages(
path: str | os.PathLike,
Expand Down Expand Up @@ -140,11 +149,16 @@ def parse_pdf_to_pages(
# Capture tables
for table_i, table in enumerate(t for t in page.find_tables()):
pix = page.get_pixmap(clip=table.bbox, dpi=image_dpi)
raw_md = table.to_markdown().strip()
media.append(
ParsedMedia(
index=table_i,
data=pix.tobytes(),
text=table.to_markdown().strip(),
# If the markdown contains invalid control characters
# that'd trigger encoding errors later, drop the markdown
text=(
None if _INVALID_MD_CHARS.search(raw_md) else raw_md
),
info={"bbox": tuple(table.bbox), "type": "table"}
| {a: getattr(pix, a) for a in PYMUPDF_PIXMAP_ATTRS},
)
Expand Down
33 changes: 31 additions & 2 deletions packages/paper-qa-pymupdf/tests/test_paperqa_pymupdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
from pathlib import Path
from typing import cast
from unittest.mock import MagicMock, patch

import pymupdf
import pytest
Expand Down Expand Up @@ -139,8 +140,25 @@ def test_page_size_limit_denial() -> None:


def test_table_parsing() -> None:
spy_to_markdown = MagicMock(side_effect=pymupdf.table.Table.to_markdown)
zeroth_raw_table_text = ""

def custom_to_markdown(self, clean=False, fill_empty=True) -> str:
md = spy_to_markdown(self, clean=clean, fill_empty=fill_empty)
if spy_to_markdown.call_count == 1:
nonlocal zeroth_raw_table_text
zeroth_raw_table_text = md
return ( # NOTE: this text has a null byte, which we want to filter
"|Col1|Col2|Col3|Col4|Col5|Col6|Col7|Col8|"
"\n|---|---|---|---|---|---|---|---|"
"\n||\x02\x03<br>|\x04\x05\x06\x07\x08<br>"
" <br>|\x07\x08\x08<br>\n\x08<br>\x0e\x0f<br>\x17\x18\x18\x08<br>|\x02<br>\x0c\x10<br>\x11<br>\x19\r\x02\x1a\x00\x01\x02\x03<br>|\x11<br>\x12\x06\x05<br>\x0e\x13\x14\x15<br>\x04\x05\x06\x07<br>|\x05\x08<br>\x0c\x10<br>\x12\x06\x05<br>\x0e\x16\x13<br>|\x05\x08<br>\x0c\x10<br>\x12\x06\x05<br>\x0e\x16\x13<br>|" # noqa: E501
)
return md

filepath = STUB_DATA_DIR / "influence.pdf"
parsed_text = parse_pdf_to_pages(filepath)
with patch.object(pymupdf.table.Table, "to_markdown", custom_to_markdown):
parsed_text = parse_pdf_to_pages(filepath)
assert isinstance(parsed_text.content, dict)
assert all(
t and t[0] != "\n" and t[-1] != "\n" for t in parsed_text.content.values()
Expand All @@ -151,6 +169,17 @@ def test_table_parsing() -> None:
for i, pagenum_media in parsed_text.content.items()
if isinstance(pagenum_media, tuple)
}
all_tables = {k: v for k, v in all_tables.items() if v}
assert (
sum(len(tables) for tables in all_tables.values()) >= 2
), "Expected a few tables to be parsed"
), "Expected a few tables to be parsed for assertions to work"
zeroth_media, *_ = next(iter(all_tables.values()))
assert zeroth_media.text is None, "Expected null byte to be filtered"
assert zeroth_raw_table_text == (
"|Gap Size (mm)|Ununited|Uncertain|United|"
"\n|---|---|---|---|"
"\n|**1.0**|1/5 (20%)|1/5 (20%)|3/5 (60%)|"
"\n|**1.5**|3/7 (43%)|2/7 (29%)|2/7 (29%)|"
"\n|**2.0** <br>|3/6 (50%)|2/6 (33%)|1/6 (17%)|"
"\n\n" # NOTE: this is before strip, so there can be trailing whitespace
)