Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -944,6 +944,7 @@ will return much faster than the first query and we'll be certain the authors ma
| `parsing.pdfs_use_block_parsing` | `False` | Opt-in flag for block-based PDF parsing over text-based PDF parsing. |
| `parsing.use_doc_details` | `True` | Whether to get metadata details for docs. |
| `parsing.overlap` | `250` | Characters to overlap chunks. |
| `parsing.reader_config` | `dict` | Optional keyword arguments for the document reader. |
| `parsing.multimodal` | `True` | Control to parse both text and media from applicable documents, as well as potentially enriching them with text descriptions. |
| `parsing.defer_embedding` | `False` | Whether to defer embedding until summarization. |
| `parsing.parse_pdf` | `paperqa_pypdf.parse_pdf_to_pages` | Function to parse PDF files. |
Expand Down
8 changes: 2 additions & 6 deletions src/paperqa/docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,17 +278,15 @@ async def aadd( # noqa: PLR0912
texts = await read_doc(
path,
Doc(docname="", citation="", dockey=dockey), # Fake doc
chunk_chars=parse_config.chunk_size,
overlap=parse_config.overlap,
page_size_limit=parse_config.page_size_limit,
use_block_parsing=parse_config.pdfs_use_block_parsing,
parse_images=False, # Peeking is text only
# We only use the first chunk, so let's peek just enough pages for that.
# Usually pages 1 - 2 give that,
# but in the event page 2 is blank (true for some PDFs),
# we read pages 1 - 3 to be safe
page_range=(1, 3),
parse_pdf=parse_config.parse_pdf,
**parse_config.reader_config,
Comment thread
jamesbraza marked this conversation as resolved.
)
if not texts or not texts[0].text.strip():
raise ValueError(f"Could not read document {path}. Is it empty?")
Expand Down Expand Up @@ -397,13 +395,11 @@ async def aadd( # noqa: PLR0912
texts, metadata = await read_doc(
path,
doc,
chunk_chars=parse_config.chunk_size,
overlap=parse_config.overlap,
page_size_limit=parse_config.page_size_limit,
use_block_parsing=parse_config.pdfs_use_block_parsing,
parse_pdf=parse_config.parse_pdf,
include_metadata=True,
**multimodal_kwargs,
**parse_config.reader_config,
)
# loose check to see if document was loaded
if metadata.name != "image" and (
Expand Down
4 changes: 2 additions & 2 deletions src/paperqa/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,8 +345,8 @@ async def read_doc( # noqa: PLR0912
doc: Doc,
parsed_text_only: bool = False,
include_metadata: bool = False,
chunk_chars: int = 3000,
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These were lagging our defaults in Settings, so this re-syncs them

overlap: int = 100,
chunk_chars: int = 5000,
overlap: int = 250,
multimodal_enricher: Callable[[ParsedText], Awaitable[str]] | None = None,
parse_pdf: PDFParserFn | None = None,
**parser_kwargs,
Expand Down
42 changes: 42 additions & 0 deletions src/paperqa/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,11 @@ class ParsingSettings(BaseModel):
overlap: int = Field(
default=250, description="Number of characters to overlap chunks."
)
reader_config: dict[str, Any] = Field(
default_factory=dict,
description="Optional keyword arguments for the document reader.",
examples=[{"dpi": 300}],
)
multimodal: bool | MultimodalOptions = Field(
default=MultimodalOptions.ON_WITH_ENRICHMENT,
description=(
Expand Down Expand Up @@ -342,6 +347,43 @@ class ParsingSettings(BaseModel):
description="Prompt template for enriching media.",
)

@model_validator(mode="after")
def _deprecated_field(self) -> Self:
    """Warn about deprecated reader fields and mirror them into reader_config.

    For each of 'pdfs_use_block_parsing', 'chunk_size' and 'overlap', a value
    that differs from the field's declared default emits a DeprecationWarning
    and is copied into 'reader_config' under the matching reader keyword,
    unless 'reader_config' already has that key.

    Returns:
        This settings instance, as required for an "after" model validator.
    """
    declared_fields = type(self).model_fields
    # Deprecated settings field -> keyword name expected inside reader_config.
    legacy_to_reader_kwarg = (
        ("pdfs_use_block_parsing", "use_block_parsing"),
        ("chunk_size", "chunk_chars"),
        ("overlap", "overlap"),
    )
    for legacy_name, reader_kwarg in legacy_to_reader_kwarg:
        legacy_value = getattr(self, legacy_name)
        if legacy_value == declared_fields[legacy_name].default:
            # Field left at its default: nothing to warn about or migrate.
            continue
        warnings.warn(
            f"The '{legacy_name}' field is deprecated"
            " and will be removed in version 6."
            f" Use '{reader_kwarg}' parameter in 'reader_config' instead.",
            category=DeprecationWarning,
            stacklevel=2,
        )
        # A key already present in reader_config wins over the deprecated field.
        self.reader_config.setdefault(reader_kwarg, legacy_value)
    return self

@property
def should_parse_and_enrich_media(self) -> tuple[bool, bool]:
"""Get if the settings indicate to parse and also enrich media."""
Expand Down
63 changes: 62 additions & 1 deletion tests/test_paperqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -1187,7 +1187,7 @@ async def test_pdf_reader_w_no_match_doc_details(stub_data_dir: Path) -> None:
async def test_pdf_reader_w_no_chunks(stub_data_dir: Path) -> None:
settings = Settings.from_name("debug")
assert settings.parsing.defer_embedding, "Test relies on deferred embedding"
settings.parsing.chunk_size = 0 # Leads to one chunk = entire text
settings.parsing.reader_config["chunk_chars"] = 0 # Have one chunk = entire text
# don't want to shove whole document into llm to get citation or embedding
settings.parsing.use_doc_details = False
settings.summary_llm = "gpt-4o-mini" # context window needs to fit our one chunk
Expand Down Expand Up @@ -1378,6 +1378,8 @@ async def test_chunk_metadata_reader(
parsed_text_only=False, # noqa: FURB120
include_metadata=True,
parse_pdf=pdf_parser,
chunk_chars=3000,
overlap=100,
)
assert metadata.name
assert "pdf" in metadata.name
Expand Down Expand Up @@ -1418,6 +1420,8 @@ async def test_chunk_metadata_reader(
Doc(docname="foo", citation="Foo et al, 2002", dockey="1"),
parsed_text_only=False, # noqa: FURB120
include_metadata=True,
chunk_chars=3000,
overlap=100,
)
# NOTE the use of tiktoken changes the actual char and overlap counts
assert metadata.name
Expand All @@ -1443,6 +1447,8 @@ async def test_chunk_metadata_reader(
path=code_input,
doc=Doc(docname="foo", citation="Foo et al, 2002", dockey="1"),
include_metadata=True,
chunk_chars=3000,
overlap=100,
)
assert metadata.name
assert "txt" in metadata.name
Expand Down Expand Up @@ -3047,3 +3053,58 @@ async def test_timeout_resilience() -> None:
context, llm_results = await map_fxn_summary(**kw)
assert context is None
assert not llm_results


def test_reader_params_deprecation_warnings(recwarn: pytest.WarningsRecorder) -> None:
    """Check deprecated parsing fields warn and are copied into reader_config."""
    # chunk_size is mirrored under the reader's 'chunk_chars' keyword.
    with pytest.warns(DeprecationWarning, match="chunk_size.*deprecated"):
        chunk_size_settings = Settings(parsing=ParsingSettings(chunk_size=2000))
    migrated = chunk_size_settings.parsing.reader_config
    assert migrated["chunk_chars"] == 2000

    # overlap keeps the same name inside reader_config.
    with pytest.warns(DeprecationWarning, match="overlap.*deprecated"):
        overlap_settings = Settings(parsing=ParsingSettings(overlap=50))
    migrated = overlap_settings.parsing.reader_config
    assert migrated["overlap"] == 50

    # pdfs_use_block_parsing is mirrored under 'use_block_parsing'.
    with pytest.warns(DeprecationWarning, match="pdfs_use_block_parsing.*deprecated"):
        block_parsing_settings = Settings(
            parsing=ParsingSettings(pdfs_use_block_parsing=True)
        )
    migrated = block_parsing_settings.parsing.reader_config
    assert migrated["use_block_parsing"]

    # When both the deprecated field and reader_config are given,
    # the warning still fires but the reader_config value is kept.
    conflicting_parsing_kwargs = {
        "chunk_size": 4000,
        "reader_config": {"chunk_chars": 2000},
    }
    with pytest.warns(DeprecationWarning, match="chunk_size.*deprecated"):
        conflicting_settings = Settings(
            parsing=ParsingSettings(**conflicting_parsing_kwargs)
        )
    migrated = conflicting_settings.parsing.reader_config
    assert migrated["chunk_chars"] == 2000, "Expected reader_config to win out"

    # Default settings must not emit any deprecation warning.
    _ = Settings(parsing=ParsingSettings())
    recorded_deprecations = [
        warning
        for warning in recwarn
        if issubclass(warning.category, DeprecationWarning)
    ]
    assert not recorded_deprecations, "Expected clean settings to have no warnings"


@pytest.mark.asyncio
async def test_reader_config_propagation(stub_data_dir: Path) -> None:
    """Check reader_config entries are forwarded as keyword arguments to read_doc."""
    expected_reader_kwargs = {"chunk_chars": 2000, "overlap": 50, "dpi": 144}
    settings = Settings(
        parsing=ParsingSettings(reader_config=dict(expected_reader_kwargs))
    )
    docs = Docs()

    # Replace read_doc with a mock that raises, so aadd stops right after the
    # call whose keyword arguments are inspected below.
    read_doc_patch = patch("paperqa.docs.read_doc", side_effect=RuntimeError("sentinel"))
    with read_doc_patch as mock_read_doc, pytest.raises(RuntimeError, match="sentinel"):
        await docs.aadd(
            stub_data_dir / "paper.pdf",
            citation="Wellawatte et al, XAI Review, 2023",  # Skip citation inference
            doi="10.1021/acs.jctc.2c01235",  # Skip DOI inference
            title="A Perspective on Explanations of Molecular Prediction Models",  # Skip title inference
            settings=settings,
        )

    mock_read_doc.assert_awaited_once()
    forwarded_kwargs = mock_read_doc.call_args.kwargs
    for kwarg_name, expected_value in expected_reader_kwargs.items():
        assert forwarded_kwargs[kwarg_name] == expected_value