Skip to content
Prev Previous commit
Next Next commit
Integrated media enrichment into settings and Docs.aadd, with tests
Integrated media enrichment into settings and Docs.aadd
  • Loading branch information
jamesbraza committed Oct 29, 2025
commit 346f39d39d80ef96e6fa4af2967119676d3f9464
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -926,13 +926,18 @@ will return much faster than the first query and we'll be certain the authors ma
| `parsing.pdfs_use_block_parsing` | `False` | Opt-in flag for block-based PDF parsing over text-based PDF parsing. |
| `parsing.use_doc_details` | `True` | Whether to get metadata details for docs. |
| `parsing.overlap` | `250` | Characters to overlap chunks. |
| `parsing.multimodal` | `True` | Flag to parse both text and images from applicable documents. |
| `parsing.multimodal` | `MultimodalOptions.ON_WITH_ENRICHMENT` | Controls parsing of both text and media from applicable documents, as well as potentially enriching the media with text descriptions. |
| `parsing.defer_embedding` | `False` | Whether to defer embedding until summarization. |
| `parsing.parse_pdf` | `paperqa_pypdf.parse_pdf_to_pages` | Function to parse PDF files. |
| `parsing.configure_pdf_parser` | No-op | Callable to configure the PDF parser within `parse_pdf`, useful for behaviors such as enabling logging. |
| `parsing.chunking_algorithm` | `ChunkingOptions.SIMPLE_OVERLAP` | Algorithm for chunking. |
| `parsing.doc_filters` | `None` | Optional filters for allowed documents. |
| `parsing.use_human_readable_clinical_trials` | `False` | Parse clinical trial JSONs into readable text. |
| `parsing.enrichment_llm` | `"gpt-4o-2024-11-20"` | LLM for media enrichment. |
| `parsing.enrichment_llm_config` | `None` | Optional configuration for `enrichment_llm`. |
| `parsing.enrichment_page_radius` | `1` | Page radius for context text in enrichment. |
| `parsing.enrichment_prompt` | `media_enrichment_prompt_template` | Prompt template for enriching media. |
| `parsing.enrichment_description_length` | `"about 150 words"` | Plain text stating the desired length of an enriched media's description. |
| `prompt.summary` | `summary_prompt` | Template for summarizing text, must contain variables matching `summary_prompt`. |
| `prompt.qa` | `qa_prompt` | Template for QA, must contain variables matching `qa_prompt`. |
| `prompt.select` | `select_paper_prompt` | Template for selecting papers, must contain variables matching `select_paper_prompt`. |
Expand Down
8 changes: 7 additions & 1 deletion src/paperqa/docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,16 +387,22 @@ async def aadd( # noqa: PLR0912
doc, **(query_kwargs | kwargs)
)

parse_images, enrich_media = parse_config.should_parse_and_enrich_media
multimodal_kwargs: dict[str, Any] = {"parse_images": parse_images}
if enrich_media:
multimodal_kwargs["multimodal_enricher"] = (
all_settings.make_media_enricher()
)
texts, metadata = await read_doc(
path,
doc,
chunk_chars=parse_config.chunk_size,
overlap=parse_config.overlap,
page_size_limit=parse_config.page_size_limit,
use_block_parsing=parse_config.pdfs_use_block_parsing,
parse_images=parse_config.multimodal,
parse_pdf=parse_config.parse_pdf,
include_metadata=True,
**multimodal_kwargs,
)
# loose check to see if document was loaded
if metadata.name != "image" and (
Expand Down
36 changes: 36 additions & 0 deletions src/paperqa/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,39 @@
EMPTY_CONTEXTS = len(CONTEXT_OUTER_PROMPT.format(context_str="", valid_keys="").strip())
CONTEXT_INNER_PROMPT_NOT_DETAILED = "{name}: {text}"
CONTEXT_INNER_PROMPT = f"{CONTEXT_INNER_PROMPT_NOT_DETAILED}\nFrom {{citation}}"

# For reference, here's Docling's image description prompt:
# https://github.com/docling-project/docling/blob/v2.55.1/docling/datamodel/pipeline_options.py#L214-L216
# Prompt sent (with the media attached) to the enrichment LLM.
# Placeholders: {context_text} (possibly-empty co-located page text, must end with
# its own trailing separator when non-empty) and {description_length}
# (plain-text target length, e.g. "about 150 words").
media_enrichment_prompt_template = (
    "You are analyzing an image or table from a scientific document."
    " Provide a detailed description that will be used to answer questions about its content."
    " Focus on key elements, data, relationships, and scientific insights visible in the image."
    " It's especially important to document referential information such as"
    " figure/table numbers, labels, plot colors, or legends."
    "\n\nText co-located with the media may be associated with"
    " other media or unrelated content,"
    " so do not just blindly quote referential information."
    " The smaller the image, the more likely co-located text is unrelated."
    " To restate, often the co-located text is several pages of content,"
    # Fixed grammar: "relevant to accompanying" -> "relevant to the accompanying"
    " so only use aspects relevant to the accompanying image or table."
    # Fixed grammar: "Here's a few failure mode" -> "Here are a few failure modes"
    "\n\nHere are a few failure modes with possible resolutions:"
    "\n- The media was a logo or icon, so the text is unrelated."
    " In this case, describe the media as a logo or icon,"
    " and do not mention other unrelated surrounding text."
    "\n- The media was display type, so the text is probably unrelated."
    " The display type can be spread over several lines."
    " In this case, describe the media as display type,"
    " and do not mention other unrelated surrounding text."
    "\n- The media is a margin box or design element, so the text is unrelated."
    " In this case, describe the media as decorative,"
    " and do not mention other unrelated surrounding text."
    "\n- The media came from a bad PDF read, so it's garbled."
    " In this case, describe the media as garbled, state why it's considered garbled,"
    " and do not mention other unrelated surrounding text."
    "\n- The media is a subfigure or a subtable."
    " In this case, make sure to only detail the subfigure or subtable,"
    " not the entire figure or table."
    " Do not mention other unrelated surrounding text."
    "\n\n{context_text}Describe the media ({description_length}),"  # Allow for empty context_text
    " or if uncertain on a description please state why:"
)
188 changes: 180 additions & 8 deletions src/paperqa/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
import pathlib
import warnings
from collections import defaultdict
from collections.abc import Callable, Mapping, Sequence
from enum import StrEnum
from collections.abc import Awaitable, Callable, Mapping, Sequence
from enum import IntEnum, StrEnum
from itertools import starmap
from pydoc import locate
from typing import Any, ClassVar, Protocol, Self, TypeAlias, cast, runtime_checkable

import anyio
from aviary.core import Tool, ToolSelector
from aviary.core import Message, Tool, ToolSelector
from lmi import (
CommonLLMNames,
EmbeddingModel,
Expand Down Expand Up @@ -49,6 +50,7 @@
default_system_prompt,
env_reset_prompt,
env_system_prompt,
media_enrichment_prompt_template,
qa_prompt,
select_paper_prompt,
structured_citation_prompt,
Expand All @@ -57,7 +59,7 @@
summary_prompt,
)
from paperqa.readers import PDFParserFn
from paperqa.types import Context
from paperqa.types import Context, ParsedMedia, ParsedText
from paperqa.utils import hexdigest, pqa_directory

# TODO: move to actual EnvironmentState
Expand Down Expand Up @@ -196,6 +198,15 @@ def default_pdf_parser_configurator() -> None:
setup_pymupdf_python_logging()


class MultimodalOptions(IntEnum):
    """Tri-state control for media parsing and LLM-based media enrichment.

    IntEnum so members interoperate with plain bools: OFF (0) is falsy,
    ON_WITH_ENRICHMENT (1) is truthy and compares equal to True.
    """

    # Text-only PDF reads
    OFF = 0  # Falsey
    ON_WITH_ENRICHMENT = 1  # Default
    # Without image enrichment, multimodal will miss retrieval of certain images,
    # but it costs less money (no enrichment LLM calls)
    ON_WITHOUT_ENRICHMENT = 2


class ParsingSettings(BaseModel):
"""Settings relevant for parsing and chunking documents."""

Expand Down Expand Up @@ -226,11 +237,14 @@ class ParsingSettings(BaseModel):
overlap: int = Field(
default=250, description="Number of characters to overlap chunks."
)
multimodal: bool = Field(
default=True,
multimodal: bool | MultimodalOptions = Field(
default=MultimodalOptions.ON_WITH_ENRICHMENT,
description=(
"Parse both text and images (if applicable to a given document),"
" or disable to parse just text."
"Controls on parsing images/tables (if applicable to a given document)."
" Setting false or off will parse only text,"
" setting true or 'on with enrichment' will parse media and use"
" the enrichment LLM to generate descriptions of the media,"
" setting 'on without enrichment' will parse media without enrichment."
),
)
citation_prompt: str = Field(
Expand Down Expand Up @@ -291,6 +305,53 @@ class ParsingSettings(BaseModel):
default=False,
description="Parse clinical trial JSONs into human readable text.",
)
enrichment_llm: str = Field(
# NOTE: from CapArena (https://arxiv.org/abs/2503.12329),
# GPT-4o was the best image captioning model as of spring 2025
# NOTE: claude-haiku-4-5-20251001 recurringly failed to describe display type
# to be display type, so its captioning ability isn't good enough yet
default=CommonLLMNames.GPT_4O.value,
description="LLM for media enrichment (e.g. generating descriptions).",
)
enrichment_llm_config: dict | None = Field(
default=None,
description=(
"Optional configuration for the enrichment_llm model. More specifically, it's"
" a LiteLLM Router configuration to pass to LiteLLMModel, must have"
" `model_list` key (corresponding to model_list inputs here:"
" https://docs.litellm.ai/docs/routing), and can optionally include a"
" router_kwargs key with router kwargs as values."
),
)
enrichment_page_radius: int = Field(
default=1, # Default is 1 because figures are usually +/- 1 page in LaTeX
ge=-1,
description=(
"Page radius for context text in enrichment. "
"-1 means all pages, 0 means current page only, "
"1+ means a radius of pages around the current page."
),
)
enrichment_prompt: str = Field(
default=media_enrichment_prompt_template,
description="Prompt template for enriching media.",
)
enrichment_description_length: str = Field(
default="about 150 words",
description=(
"Plain text stating the desired length of an enriched media's description."
),
)

@property
def should_parse_and_enrich_media(self) -> tuple[bool, bool]:
"""Get if the settings indicate to parse and also enrich media."""
if (
isinstance(self.multimodal, bool)
or self.multimodal != MultimodalOptions.ON_WITHOUT_ENRICHMENT
):
return bool(self.multimodal), bool(self.multimodal)
return True, False


class _FormatDict(dict): # noqa: FURB189
Expand Down Expand Up @@ -954,6 +1015,15 @@ def get_agent_llm(self) -> LiteLLMModel:
def get_embedding_model(self) -> EmbeddingModel:
return embedding_model_factory(self.embedding, **(self.embedding_config or {}))

def get_enrichment_llm(self) -> LiteLLMModel:
return LiteLLMModel(
name=self.parsing.enrichment_llm,
config=self.parsing.enrichment_llm_config
or make_default_litellm_model_list_settings(
self.parsing.enrichment_llm, self.temperature
),
)

def make_aviary_tool_selector(self, agent_type: str | type) -> ToolSelector | None:
"""Attempt to convert the input agent type to an aviary ToolSelector."""
if agent_type is ToolSelector or (
Expand Down Expand Up @@ -1043,6 +1113,108 @@ async def make_ldp_agent(
)
raise NotImplementedError(f"Didn't yet handle agent type {agent_type}.")

    def make_media_enricher(self) -> Callable[[ParsedText], Awaitable[str]]:
        """Create an enricher function from settings.

        The returned async callable closes over this Settings instance, so
        later changes to parsing.enrichment_* settings affect future calls.
        """

        async def enrich_media_with_llm(parsed_text: ParsedText) -> str:
            """Enrich media in parsed text with LLM-generated descriptions.

            Mutates each ParsedMedia in-place by setting
            media.info["enriched_description"].

            Raises:
                ValueError: If the parsed text contains no media
                    (no page maps to a (text, media) tuple).

            Returns:
                A summary string of the enrichment.
            """
            if not isinstance(parsed_text.content, dict) or not any(
                isinstance(c, tuple) for c in parsed_text.content.values()
            ):
                raise ValueError(
                    "Media enrichment requires media to be in the parsed text."
                )
            # Narrowing cast: pages map to either plain text or (text, media)
            text_content = cast(
                dict[str, str | tuple[str, list[ParsedMedia]]], parsed_text.content
            )

            # Collect all media with their page numbers
            # NOTE: we could deduplicate media here across pages,
            # but this introduces a bunch of complexity:
            # - Do we enrich using text surrounding the earlier or later image?
            # - Or do we enrich using text surrounding all images,
            # and risk confusing the LLM if the texts aren't similar?
            # Given these risks, we just enrich extra times
            media_to_enrich: list[tuple[str, ParsedMedia]] = [
                (page_num, media)
                for page_num, page_contents in text_content.items()
                if isinstance(page_contents, tuple)
                for media in page_contents[1]
                if not media.info.get("enriched_description")  # Don't clobber prior
            ]
            llm = self.get_enrichment_llm()
            radius = self.parsing.enrichment_page_radius

            async def enrich_single_media(
                page_num: int | str, media: ParsedMedia
            ) -> None:
                """Enrich a single media item with LLM-generated description."""
                if radius == -1:  # All pages
                    # Concatenate every page's text, ordered numerically
                    # NOTE(review): assumes page keys are int-castable strings
                    # — confirm against the PDF readers' page keying
                    context_text: str = "\n\n".join(
                        (
                            (
                                pg_contents
                                if isinstance(pg_contents, str)
                                else pg_contents[0]
                            )
                            for _, pg_contents in sorted(
                                text_content.items(), key=lambda x: int(x[0])
                            )
                        )
                    )
                    radius_msg: str = "all pages"
                else:  # Specific page radius
                    page_texts: list[str] = []
                    # NOTE(review): assumes 1-based, contiguous page numbering
                    # (max page == len(text_content)) — verify for sparse reads
                    for pg_int in range(
                        max(1, int(page_num) - radius),
                        min(len(text_content), int(page_num) + radius) + 1,
                    ):
                        # Use get so we're tolerant to missing pages here
                        page_content = text_content.get(str(pg_int))
                        if page_content:
                            page_texts.append(
                                page_content
                                if isinstance(page_content, str)
                                else page_content[0]
                            )
                    context_text = "\n\n".join(page_texts)
                    radius_msg = (
                        f"a radius of {'1 page' if radius == 1 else f'{radius} pages'}"
                    )

                # Empty context_text collapses the preamble entirely,
                # which the prompt template explicitly allows for
                prompt = self.parsing.enrichment_prompt.format(
                    context_text=(
                        f"Here is the co-located text from {radius_msg}:\n\n{context_text}\n\n"
                        if context_text
                        else ""
                    ),
                    description_length=self.parsing.enrichment_description_length,
                )
                result = await llm.call_single(
                    messages=[
                        Message.create_message(
                            text=prompt, images=[media.to_image_url()]
                        )
                    ]
                )
                # Only store a non-empty description; a falsy result leaves
                # the media eligible for enrichment on a future call
                if result.text:
                    media.info["enriched_description"] = result.text.strip()

            # Enrich all media concurrently; one LLM call per media item
            await asyncio.gather(*list(starmap(enrich_single_media, media_to_enrich)))
            # Counts every enriched media, including ones enriched before
            # this call (the "don't clobber" filter above skipped them)
            count_enriched = sum(
                bool(media.info.get("enriched_description"))
                for page_num, page_contents in parsed_text.content.items()
                if isinstance(page_contents, tuple)
                for media in page_contents[1]
            )
            return f"enriched={count_enriched}|radius={radius}"

        return enrich_media_with_llm

def adjust_tools_for_agent_llm(self, tools: list[Tool]) -> None:
"""In-place adjust tool attributes or schemae to match agent LLM-specifics."""
# This was originally made for Gemini 1.5 Flash not supporting empty tool args
Expand Down
19 changes: 19 additions & 0 deletions tests/test_paperqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -1597,6 +1597,25 @@ async def validate(data: bytes) -> None: # noqa: RUF029
)


@pytest.mark.parametrize(
    ("multimodal_option", "expected"),
    [
        (False, (False, False)),
        (True, (True, True)),
        (MultimodalOptions.OFF, (False, False)),
        (MultimodalOptions.ON_WITH_ENRICHMENT, (True, True)),
        (MultimodalOptions.ON_WITHOUT_ENRICHMENT, (True, False)),
    ],
)
def test_should_parse_and_enrich_media(
    multimodal_option: bool | MultimodalOptions, expected: tuple[bool, bool]
) -> None:
    """Check (parse media, enrich media) flags for each multimodal setting."""
    parsing_settings = ParsingSettings(multimodal=multimodal_option)
    assert parsing_settings.should_parse_and_enrich_media == expected


@pytest.mark.asyncio
async def test_code() -> None:
settings = Settings.from_name("fast")
Expand Down