From 84077df6d61cd6c3e5b99006212e339591180662 Mon Sep 17 00:00:00 2001 From: James Braza Date: Wed, 29 Oct 2025 12:02:49 -0700 Subject: [PATCH 1/2] Updated reader defaults to match current Settings --- src/paperqa/readers.py | 4 ++-- tests/test_paperqa.py | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/paperqa/readers.py b/src/paperqa/readers.py index 0fb8e7649..2e3c45d2e 100644 --- a/src/paperqa/readers.py +++ b/src/paperqa/readers.py @@ -345,8 +345,8 @@ async def read_doc( # noqa: PLR0912 doc: Doc, parsed_text_only: bool = False, include_metadata: bool = False, - chunk_chars: int = 3000, - overlap: int = 100, + chunk_chars: int = 5000, + overlap: int = 250, multimodal_enricher: Callable[[ParsedText], Awaitable[str]] | None = None, parse_pdf: PDFParserFn | None = None, **parser_kwargs, diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py index ef38c09bf..e3e483158 100644 --- a/tests/test_paperqa.py +++ b/tests/test_paperqa.py @@ -1378,6 +1378,8 @@ async def test_chunk_metadata_reader( parsed_text_only=False, # noqa: FURB120 include_metadata=True, parse_pdf=pdf_parser, + chunk_chars=3000, + overlap=100, ) assert metadata.name assert "pdf" in metadata.name @@ -1418,6 +1420,8 @@ async def test_chunk_metadata_reader( Doc(docname="foo", citation="Foo et al, 2002", dockey="1"), parsed_text_only=False, # noqa: FURB120 include_metadata=True, + chunk_chars=3000, + overlap=100, ) # NOTE the use of tiktoken changes the actual char and overlap counts assert metadata.name @@ -1443,6 +1447,8 @@ async def test_chunk_metadata_reader( path=code_input, doc=Doc(docname="foo", citation="Foo et al, 2002", dockey="1"), include_metadata=True, + chunk_chars=3000, + overlap=100, ) assert metadata.name assert "txt" in metadata.name From 944451b825e11337f8d244c2fea3bf65bcedf23b Mon Sep 17 00:00:00 2001 From: James Braza Date: Wed, 29 Oct 2025 12:04:14 -0700 Subject: [PATCH 2/2] Exposed reader_config setting, with deprecations of old settings and tests --- README.md | 1 + src/paperqa/docs.py | 8 ++---- src/paperqa/settings.py | 42 ++++++++++++++++++++++++++++++ tests/test_paperqa.py | 57 ++++++++++++++++++++++++++++++++++++++++- 4 files changed, 101 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index bccfb51a4..7c83252a1 100644 --- a/README.md +++ b/README.md @@ -944,6 +944,7 @@ will return much faster than the first query and we'll be certain the authors ma | `parsing.pdfs_use_block_parsing` | `False` | Opt-in flag for block-based PDF parsing over text-based PDF parsing. | | `parsing.use_doc_details` | `True` | Whether to get metadata details for docs. | | `parsing.overlap` | `250` | Characters to overlap chunks. | +| `parsing.reader_config` | `dict` | Optional keyword arguments for the document reader. | | `parsing.multimodal` | `True` | Control to parse both text and media from applicable documents, as well as potentially enriching them with text descriptions. | | `parsing.defer_embedding` | `False` | Whether to defer embedding until summarization. | | `parsing.parse_pdf` | `paperqa_pypdf.parse_pdf_to_pages` | Function to parse PDF files. | diff --git a/src/paperqa/docs.py b/src/paperqa/docs.py index 695030aad..f788f2714 100644 --- a/src/paperqa/docs.py +++ b/src/paperqa/docs.py @@ -278,10 +278,7 @@ async def aadd( # noqa: PLR0912 texts = await read_doc( path, Doc(docname="", citation="", dockey=dockey), # Fake doc - chunk_chars=parse_config.chunk_size, - overlap=parse_config.overlap, page_size_limit=parse_config.page_size_limit, - use_block_parsing=parse_config.pdfs_use_block_parsing, parse_images=False, # Peeking is text only # We only use the first chunk, so let's peek just enough pages for that. # Usually pages 1 - 2 give that, @@ -289,6 +286,7 @@ async def aadd( # noqa: PLR0912 # we read pages 1 - 3 to be safe page_range=(1, 3), parse_pdf=parse_config.parse_pdf, + **parse_config.reader_config, ) if not texts or not texts[0].text.strip(): raise ValueError(f"Could not read document {path}. Is it empty?") @@ -397,13 +395,11 @@ async def aadd( # noqa: PLR0912 texts, metadata = await read_doc( path, doc, - chunk_chars=parse_config.chunk_size, - overlap=parse_config.overlap, page_size_limit=parse_config.page_size_limit, - use_block_parsing=parse_config.pdfs_use_block_parsing, parse_pdf=parse_config.parse_pdf, include_metadata=True, **multimodal_kwargs, + **parse_config.reader_config, ) # loose check to see if document was loaded if metadata.name != "image" and ( diff --git a/src/paperqa/settings.py b/src/paperqa/settings.py index 7e25da63a..f7ece0e20 100644 --- a/src/paperqa/settings.py +++ b/src/paperqa/settings.py @@ -242,6 +242,11 @@ class ParsingSettings(BaseModel): overlap: int = Field( default=250, description="Number of characters to overlap chunks." ) + reader_config: dict[str, Any] = Field( + default_factory=dict, + description="Optional keyword arguments for the document reader.", + examples=[{"dpi": 300}], + ) multimodal: bool | MultimodalOptions = Field( default=MultimodalOptions.ON_WITH_ENRICHMENT, description=( @@ -342,6 +347,43 @@ class ParsingSettings(BaseModel): description="Prompt template for enriching media.", ) + @model_validator(mode="after") + def _deprecated_field(self) -> Self: + if ( + self.pdfs_use_block_parsing + != type(self).model_fields["pdfs_use_block_parsing"].default + ): + warnings.warn( + "The 'pdfs_use_block_parsing' field is deprecated" + " and will be removed in version 6." + " Use 'use_block_parsing' parameter in 'reader_config' instead.", + category=DeprecationWarning, + stacklevel=2, + ) + if "use_block_parsing" not in self.reader_config: + self.reader_config["use_block_parsing"] = self.pdfs_use_block_parsing + if self.chunk_size != type(self).model_fields["chunk_size"].default: + warnings.warn( + "The 'chunk_size' field is deprecated" + " and will be removed in version 6." + " Use 'chunk_chars' parameter in 'reader_config' instead.", + category=DeprecationWarning, + stacklevel=2, + ) + if "chunk_chars" not in self.reader_config: + self.reader_config["chunk_chars"] = self.chunk_size + if self.overlap != type(self).model_fields["overlap"].default: + warnings.warn( + "The 'overlap' field is deprecated" + " and will be removed in version 6." + " Use 'overlap' parameter in 'reader_config' instead.", + category=DeprecationWarning, + stacklevel=2, + ) + if "overlap" not in self.reader_config: + self.reader_config["overlap"] = self.overlap + return self + @property def should_parse_and_enrich_media(self) -> tuple[bool, bool]: """Get if the settings indicate to parse and also enrich media.""" diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py index e3e483158..b31049dae 100644 --- a/tests/test_paperqa.py +++ b/tests/test_paperqa.py @@ -1187,7 +1187,7 @@ async def test_pdf_reader_w_no_match_doc_details(stub_data_dir: Path) -> None: async def test_pdf_reader_w_no_chunks(stub_data_dir: Path) -> None: settings = Settings.from_name("debug") assert settings.parsing.defer_embedding, "Test relies on deferred embedding" - settings.parsing.chunk_size = 0 # Leads to one chunk = entire text + settings.parsing.reader_config["chunk_chars"] = 0 # Have one chunk = entire text # don't want to shove whole document into llm to get citation or embedding settings.parsing.use_doc_details = False settings.summary_llm = "gpt-4o-mini" # context window needs to fit our one chunk @@ -3053,3 +3053,58 @@ async def test_timeout_resilience() -> None: context, llm_results = await map_fxn_summary(**kw) assert context is None assert not llm_results + + +def test_reader_params_deprecation_warnings(recwarn: pytest.WarningsRecorder) -> None: + """Test that deprecated settings trigger warnings and are migrated to reader_config.""" + with pytest.warns(DeprecationWarning, match="chunk_size.*deprecated"): + settings1 = Settings(parsing=ParsingSettings(chunk_size=2000)) + assert settings1.parsing.reader_config["chunk_chars"] == 2000 + with pytest.warns(DeprecationWarning, match="overlap.*deprecated"): + settings2 = Settings(parsing=ParsingSettings(overlap=50)) + assert settings2.parsing.reader_config["overlap"] == 50 + with pytest.warns(DeprecationWarning, match="pdfs_use_block_parsing.*deprecated"): + settings3 = Settings(parsing=ParsingSettings(pdfs_use_block_parsing=True)) + assert settings3.parsing.reader_config["use_block_parsing"] + with pytest.warns(DeprecationWarning, match="chunk_size.*deprecated"): + settings4 = Settings( + parsing=ParsingSettings( + chunk_size=4000, reader_config={"chunk_chars": 2000} + ) + ) + assert ( + settings4.parsing.reader_config["chunk_chars"] == 2000 + ), "Expected reader_config to win out" + + _ = Settings(parsing=ParsingSettings()) + assert not [ + w for w in recwarn if issubclass(w.category, DeprecationWarning) + ], "Expected clean settings to have no warnings" + + +@pytest.mark.asyncio +async def test_reader_config_propagation(stub_data_dir: Path) -> None: + settings = Settings( + parsing=ParsingSettings( + reader_config={"chunk_chars": 2000, "overlap": 50, "dpi": 144} + ) + ) + + docs = Docs() + with ( + patch( + "paperqa.docs.read_doc", side_effect=RuntimeError("sentinel") + ) as mock_read_doc, + pytest.raises(RuntimeError, match="sentinel"), + ): + await docs.aadd( + stub_data_dir / "paper.pdf", + citation="Wellawatte et al, XAI Review, 2023", # Skip citation inference + doi="10.1021/acs.jctc.2c01235", # Skip DOI inference + title="A Perspective on Explanations of Molecular Prediction Models", # Skip title inference + settings=settings, + ) + mock_read_doc.assert_awaited_once() + assert mock_read_doc.call_args.kwargs["chunk_chars"] == 2000 + assert mock_read_doc.call_args.kwargs["overlap"] == 50 + assert mock_read_doc.call_args.kwargs["dpi"] == 144