Future-House · jamesbraza · Sep 15, 2025 · Sep 15, 2025
diff --git a/packages/paper-qa-pymupdf/src/paperqa_pymupdf/reader.py b/packages/paper-qa-pymupdf/src/paperqa_pymupdf/reader.py
@@ -1,4 +1,5 @@
 import os
+import re
 
 import pymupdf
 from paperqa.types import ParsedMedia, ParsedMetadata, ParsedText
@@ -34,6 +35,14 @@ def setup_pymupdf_python_logging() -> None:
     "yres",
 }
 
+# On 9/14/2025, a stripped `pymupdf.table.Table.to_markdown` call returned:
+# '|Col1|Col2|Col3|Col4|Col5|Col6|Col7|Col8|\n|---|---|---|---|---|---|---|---|\n||\x02\x03<br>|\x04\x05\x06\x07\x08<br> <br>|\x07\x08\x08<br>\n\x08<br>\x0e\x0f<br>\x17\x18\x18\x08<br>|\x02<br>\x0c\x10<br>\x11<br>\x19\r\x02\x1a\x00\x01\x02\x03<br>|\x11<br>\x12\x06\x05<br>\x0e\x13\x14\x15<br>\x04\x05\x06\x07<br>|\x05\x08<br>\x0c\x10<br>\x12\x06\x05<br>\x0e\x16\x13<br>|\x05\x08<br>\x0c\x10<br>\x12\x06\x05<br>\x0e\x16\x13<br>|'  # noqa: E501, W505
+# This garbage led to `asyncpg==0.30.0` with a PostgreSQL 15 DB throwing:
+# > asyncpg.exceptions.CharacterNotInRepertoireError:
+# > invalid byte sequence for encoding "UTF8": 0x00
+# Thus, this regex exists to deny table markdown exports containing null bytes
+_INVALID_MD_CHARS = re.compile(r"\x00")
+
 
 def parse_pdf_to_pages(
     path: str | os.PathLike,
@@ -140,11 +149,16 @@ def parse_pdf_to_pages(
                     # Capture tables
                     for table_i, table in enumerate(t for t in page.find_tables()):
                         pix = page.get_pixmap(clip=table.bbox, dpi=image_dpi)
+                        raw_md = table.to_markdown().strip()
                         media.append(
                             ParsedMedia(
                                 index=table_i,
                                 data=pix.tobytes(),
-                                text=table.to_markdown().strip(),
+                                # If the markdown contains invalid control characters
+                                # that'd trigger encoding errors later, drop the markdown
+                                text=(
+                                    None if _INVALID_MD_CHARS.search(raw_md) else raw_md
+                                ),
                                 info={"bbox": tuple(table.bbox), "type": "table"}
                                 | {a: getattr(pix, a) for a in PYMUPDF_PIXMAP_ATTRS},
                             )

diff --git a/packages/paper-qa-pymupdf/tests/test_paperqa_pymupdf.py b/packages/paper-qa-pymupdf/tests/test_paperqa_pymupdf.py
@@ -2,6 +2,7 @@
 import json
 from pathlib import Path
 from typing import cast
+from unittest.mock import MagicMock, patch
 
 import pymupdf
 import pytest
@@ -139,8 +140,25 @@ def test_page_size_limit_denial() -> None:
 
 
 def test_table_parsing() -> None:
+    spy_to_markdown = MagicMock(side_effect=pymupdf.table.Table.to_markdown)
+    zeroth_raw_table_text = ""
+
+    def custom_to_markdown(self, clean=False, fill_empty=True) -> str:
+        md = spy_to_markdown(self, clean=clean, fill_empty=fill_empty)
+        if spy_to_markdown.call_count == 1:
+            nonlocal zeroth_raw_table_text
+            zeroth_raw_table_text = md
+            return (  # NOTE: this text has a null byte, which we want to filter
+                "|Col1|Col2|Col3|Col4|Col5|Col6|Col7|Col8|"
+                "\n|---|---|---|---|---|---|---|---|"
+                "\n||\x02\x03<br>|\x04\x05\x06\x07\x08<br>"
+                " <br>|\x07\x08\x08<br>\n\x08<br>\x0e\x0f<br>\x17\x18\x18\x08<br>|\x02<br>\x0c\x10<br>\x11<br>\x19\r\x02\x1a\x00\x01\x02\x03<br>|\x11<br>\x12\x06\x05<br>\x0e\x13\x14\x15<br>\x04\x05\x06\x07<br>|\x05\x08<br>\x0c\x10<br>\x12\x06\x05<br>\x0e\x16\x13<br>|\x05\x08<br>\x0c\x10<br>\x12\x06\x05<br>\x0e\x16\x13<br>|"  # noqa: E501
+            )
+        return md
+
     filepath = STUB_DATA_DIR / "influence.pdf"
-    parsed_text = parse_pdf_to_pages(filepath)
+    with patch.object(pymupdf.table.Table, "to_markdown", custom_to_markdown):
+        parsed_text = parse_pdf_to_pages(filepath)
     assert isinstance(parsed_text.content, dict)
     assert all(
         t and t[0] != "\n" and t[-1] != "\n" for t in parsed_text.content.values()
@@ -151,6 +169,17 @@ def test_table_parsing() -> None:
         for i, pagenum_media in parsed_text.content.items()
         if isinstance(pagenum_media, tuple)
     }
+    all_tables = {k: v for k, v in all_tables.items() if v}
     assert (
         sum(len(tables) for tables in all_tables.values()) >= 2
-    ), "Expected a few tables to be parsed"
+    ), "Expected a few tables to be parsed for assertions to work"
+    zeroth_media, *_ = next(iter(all_tables.values()))
+    assert zeroth_media.text is None, "Expected null byte to be filtered"
+    assert zeroth_raw_table_text == (
+        "|Gap Size (mm)|Ununited|Uncertain|United|"
+        "\n|---|---|---|---|"
+        "\n|**1.0**|1/5 (20%)|1/5 (20%)|3/5 (60%)|"
+        "\n|**1.5**|3/7  (43%)|2/7  (29%)|2/7 (29%)|"
+        "\n|**2.0** <br>|3/6 (50%)|2/6 (33%)|1/6 (17%)|"
+        "\n\n"  # NOTE: this is before strip, so there can be trailing whitespace
+    )