Skip to content
Prev Previous commit
Next Next commit
Integrated media enrichment into settings and Docs.aadd, with tests
Integrated media enrichment into settings and Docs.aadd
  • Loading branch information
jamesbraza committed Oct 29, 2025
commit 346f39d39d80ef96e6fa4af2967119676d3f9464
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -926,13 +926,18 @@ will return much faster than the first query and we'll be certain the authors ma
| `parsing.pdfs_use_block_parsing` | `False` | Opt-in flag for block-based PDF parsing over text-based PDF parsing. |
| `parsing.use_doc_details` | `True` | Whether to get metadata details for docs. |
| `parsing.overlap` | `250` | Characters to overlap chunks. |
| `parsing.multimodal` | `True` | Flag to parse both text and images from applicable documents. |
| `parsing.multimodal` | `MultimodalOptions.ON_WITH_ENRICHMENT` | Controls parsing of both text and media from applicable documents, as well as potentially enriching the media with text descriptions. |
| `parsing.defer_embedding` | `False` | Whether to defer embedding until summarization. |
| `parsing.parse_pdf` | `paperqa_pypdf.parse_pdf_to_pages` | Function to parse PDF files. |
| `parsing.configure_pdf_parser` | No-op | Callable to configure the PDF parser within `parse_pdf`, useful for behaviors such as enabling logging. |
| `parsing.chunking_algorithm` | `ChunkingOptions.SIMPLE_OVERLAP` | Algorithm for chunking. |
| `parsing.doc_filters` | `None` | Optional filters for allowed documents. |
| `parsing.use_human_readable_clinical_trials` | `False` | Parse clinical trial JSONs into readable text. |
| `parsing.enrichment_llm` | `"gpt-4o-2024-11-20"` | LLM for media enrichment. |
| `parsing.enrichment_llm_config` | `None` | Optional configuration for `enrichment_llm`. |
| `parsing.enrichment_page_radius` | `1` | Page radius for context text in enrichment. |
| `parsing.enrichment_prompt` | `media_enrichment_prompt_template` | Prompt template for enriching media. |
| `parsing.enrichment_description_length` | `"about 150 words"` | Plain text stating the desired length of an enriched media's description. |
| `prompt.summary` | `summary_prompt` | Template for summarizing text, must contain variables matching `summary_prompt`. |
| `prompt.qa` | `qa_prompt` | Template for QA, must contain variables matching `qa_prompt`. |
| `prompt.select` | `select_paper_prompt` | Template for selecting papers, must contain variables matching `select_paper_prompt`. |
Expand Down
8 changes: 7 additions & 1 deletion src/paperqa/docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,16 +387,22 @@ async def aadd( # noqa: PLR0912
doc, **(query_kwargs | kwargs)
)

parse_images, enrich_media = parse_config.should_parse_and_enrich_media
multimodal_kwargs: dict[str, Any] = {"parse_images": parse_images}
if enrich_media:
multimodal_kwargs["multimodal_enricher"] = (
all_settings.make_media_enricher()
)
texts, metadata = await read_doc(
path,
doc,
chunk_chars=parse_config.chunk_size,
overlap=parse_config.overlap,
page_size_limit=parse_config.page_size_limit,
use_block_parsing=parse_config.pdfs_use_block_parsing,
parse_images=parse_config.multimodal,
parse_pdf=parse_config.parse_pdf,
include_metadata=True,
**multimodal_kwargs,
)
# loose check to see if document was loaded
if metadata.name != "image" and (
Expand Down
36 changes: 36 additions & 0 deletions src/paperqa/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,39 @@
EMPTY_CONTEXTS = len(CONTEXT_OUTER_PROMPT.format(context_str="", valid_keys="").strip())
CONTEXT_INNER_PROMPT_NOT_DETAILED = "{name}: {text}"
CONTEXT_INNER_PROMPT = f"{CONTEXT_INNER_PROMPT_NOT_DETAILED}\nFrom {{citation}}"

# For reference, here's Docling's image description prompt:
# https://github.com/docling-project/docling/blob/v2.55.1/docling/datamodel/pipeline_options.py#L214-L216
# Prompt sent (with the media attached) to the enrichment LLM.
# Placeholders: {context_text} (possibly-empty co-located page text, must end with
# its own trailing separator when non-empty) and {description_length}
# (plain-text target length, e.g. "about 150 words").
media_enrichment_prompt_template = (
    "You are analyzing an image or table from a scientific document."
    " Provide a detailed description that will be used to answer questions about its content."
    " Focus on key elements, data, relationships, and scientific insights visible in the image."
    " It's especially important to document referential information such as"
    " figure/table numbers, labels, plot colors, or legends."
    "\n\nText co-located with the media may be associated with"
    " other media or unrelated content,"
    " so do not just blindly quote referential information."
    " The smaller the image, the more likely co-located text is unrelated."
    " To restate, often the co-located text is several pages of content,"
    # Fixed grammar: "relevant to accompanying" -> "relevant to the accompanying"
    " so only use aspects relevant to the accompanying image or table."
    # Fixed grammar: "Here's a few failure mode" -> "Here are a few failure modes"
    "\n\nHere are a few failure modes with possible resolutions:"
    "\n- The media was a logo or icon, so the text is unrelated."
    " In this case, describe the media as a logo or icon,"
    " and do not mention other unrelated surrounding text."
    "\n- The media was display type, so the text is probably unrelated."
    " The display type can be spread over several lines."
    " In this case, describe the media as display type,"
    " and do not mention other unrelated surrounding text."
    "\n- The media is a margin box or design element, so the text is unrelated."
    " In this case, describe the media as decorative,"
    " and do not mention other unrelated surrounding text."
    "\n- The media came from a bad PDF read, so it's garbled."
    " In this case, describe the media as garbled, state why it's considered garbled,"
    " and do not mention other unrelated surrounding text."
    "\n- The media is a subfigure or a subtable."
    " In this case, make sure to only detail the subfigure or subtable,"
    " not the entire figure or table."
    " Do not mention other unrelated surrounding text."
    "\n\n{context_text}Describe the media ({description_length}),"  # Allow for empty context_text
    " or if uncertain on a description please state why:"
)
188 changes: 180 additions & 8 deletions src/paperqa/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
import pathlib
import warnings
from collections import defaultdict
from collections.abc import Callable, Mapping, Sequence
from enum import StrEnum
from collections.abc import Awaitable, Callable, Mapping, Sequence
from enum import IntEnum, StrEnum
from itertools import starmap
from pydoc import locate
from typing import Any, ClassVar, Protocol, Self, TypeAlias, cast, runtime_checkable

import anyio
from aviary.core import Tool, ToolSelector
from aviary.core import Message, Tool, ToolSelector
from lmi import (
CommonLLMNames,
EmbeddingModel,
Expand Down Expand Up @@ -49,6 +50,7 @@
default_system_prompt,
env_reset_prompt,
env_system_prompt,
media_enrichment_prompt_template,
qa_prompt,
select_paper_prompt,
structured_citation_prompt,
Expand All @@ -57,7 +59,7 @@
summary_prompt,
)
from paperqa.readers import PDFParserFn
from paperqa.types import Context
from paperqa.types import Context, ParsedMedia, ParsedText
from paperqa.utils import hexdigest, pqa_directory

# TODO: move to actual EnvironmentState
Expand Down Expand Up @@ -196,6 +198,15 @@ def default_pdf_parser_configurator() -> None:
setup_pymupdf_python_logging()


class MultimodalOptions(IntEnum):
    """Tri-state control for media parsing and LLM-based media enrichment.

    IntEnum so members interoperate with plain bools: OFF (0) is falsy,
    ON_WITH_ENRICHMENT (1) is truthy and compares equal to True.
    """

    # Text-only PDF reads
    OFF = 0  # Falsey
    ON_WITH_ENRICHMENT = 1  # Default
    # Without image enrichment, multimodal will miss retrieval of certain images,
    # but it costs less money (no enrichment LLM calls)
    ON_WITHOUT_ENRICHMENT = 2


class ParsingSettings(BaseModel):
"""Settings relevant for parsing and chunking documents."""

Expand Down Expand Up @@ -226,11 +237,14 @@ class ParsingSettings(BaseModel):
overlap: int = Field(
default=250, description="Number of characters to overlap chunks."
)
multimodal: bool = Field(
default=True,
multimodal: bool | MultimodalOptions = Field(
default=MultimodalOptions.ON_WITH_ENRICHMENT,
description=(
"Parse both text and images (if applicable to a given document),"
" or disable to parse just text."
"Controls on parsing images/tables (if applicable to a given document)."
" Setting false or off will parse only text,"
" setting true or 'on with enrichment' will parse media and use"
" the enrichment LLM to generate descriptions of the media,"
" setting 'on without enrichment' will parse media without enrichment."
),
)
citation_prompt: str = Field(
Expand Down Expand Up @@ -291,6 +305,53 @@ class ParsingSettings(BaseModel):
default=False,
description="Parse clinical trial JSONs into human readable text.",
)
enrichment_llm: str = Field(
# NOTE: from CapArena (https://arxiv.org/abs/2503.12329),
# GPT-4o was the best image captioning model as of spring 2025
# NOTE: claude-haiku-4-5-20251001 recurringly failed to describe display type
# to be display type, so its captioning ability isn't good enough yet
default=CommonLLMNames.GPT_4O.value,
description="LLM for media enrichment (e.g. generating descriptions).",
)
enrichment_llm_config: dict | None = Field(
default=None,
description=(
"Optional configuration for the enrichment_llm model. More specifically, it's"
" a LiteLLM Router configuration to pass to LiteLLMModel, must have"
" `model_list` key (corresponding to model_list inputs here:"
" https://docs.litellm.ai/docs/routing), and can optionally include a"
" router_kwargs key with router kwargs as values."
),
)
enrichment_page_radius: int = Field(
default=1, # Default is 1 because figures are usually +/- 1 page in LaTeX
ge=-1,
description=(
"Page radius for context text in enrichment. "
"-1 means all pages, 0 means current page only, "
"1+ means a radius of pages around the current page."
),
)
enrichment_prompt: str = Field(
default=media_enrichment_prompt_template,
description="Prompt template for enriching media.",
)
enrichment_description_length: str = Field(
default="about 150 words",
description=(
"Plain text stating the desired length of an enriched media's description."
),
)

@property
def should_parse_and_enrich_media(self) -> tuple[bool, bool]:
"""Get if the settings indicate to parse and also enrich media."""
if (
isinstance(self.multimodal, bool)
or self.multimodal != MultimodalOptions.ON_WITHOUT_ENRICHMENT
):
return bool(self.multimodal), bool(self.multimodal)
return True, False


class _FormatDict(dict): # noqa: FURB189
Expand Down Expand Up @@ -954,6 +1015,15 @@ def get_agent_llm(self) -> LiteLLMModel:
def get_embedding_model(self) -> EmbeddingModel:
return embedding_model_factory(self.embedding, **(self.embedding_config or {}))

def get_enrichment_llm(self) -> LiteLLMModel:
return LiteLLMModel(
name=self.parsing.enrichment_llm,
config=self.parsing.enrichment_llm_config
or make_default_litellm_model_list_settings(
self.parsing.enrichment_llm, self.temperature
),
)

def make_aviary_tool_selector(self, agent_type: str | type) -> ToolSelector | None:
"""Attempt to convert the input agent type to an aviary ToolSelector."""
if agent_type is ToolSelector or (
Expand Down Expand Up @@ -1043,6 +1113,108 @@ async def make_ldp_agent(
)
raise NotImplementedError(f"Didn't yet handle agent type {agent_type}.")

    def make_media_enricher(self) -> Callable[[ParsedText], Awaitable[str]]:
        """Create an enricher function from settings.

        The returned async callable closes over this Settings instance, so
        later changes to parsing.enrichment_* settings affect future calls.
        """

        async def enrich_media_with_llm(parsed_text: ParsedText) -> str:
            """Enrich media in parsed text with LLM-generated descriptions.

            Mutates each ParsedMedia in-place by setting
            media.info["enriched_description"].

            Raises:
                ValueError: If the parsed text contains no media
                    (no page maps to a (text, media) tuple).

            Returns:
                A summary string of the enrichment.
            """
            if not isinstance(parsed_text.content, dict) or not any(
                isinstance(c, tuple) for c in parsed_text.content.values()
            ):
                raise ValueError(
                    "Media enrichment requires media to be in the parsed text."
                )
            # Narrowing cast: pages map to either plain text or (text, media)
            text_content = cast(
                dict[str, str | tuple[str, list[ParsedMedia]]], parsed_text.content
            )

            # Collect all media with their page numbers
            # NOTE: we could deduplicate media here across pages,
            # but this introduces a bunch of complexity:
            # - Do we enrich using text surrounding the earlier or later image?
            # - Or do we enrich using text surrounding all images,
            # and risk confusing the LLM if the texts aren't similar?
            # Given these risks, we just enrich extra times
            media_to_enrich: list[tuple[str, ParsedMedia]] = [
                (page_num, media)
                for page_num, page_contents in text_content.items()
                if isinstance(page_contents, tuple)
                for media in page_contents[1]
                if not media.info.get("enriched_description")  # Don't clobber prior
            ]
            llm = self.get_enrichment_llm()
            radius = self.parsing.enrichment_page_radius

            async def enrich_single_media(
                page_num: int | str, media: ParsedMedia
            ) -> None:
                """Enrich a single media item with LLM-generated description."""
                if radius == -1:  # All pages
                    # Concatenate every page's text, ordered numerically
                    # NOTE(review): assumes page keys are int-castable strings
                    # — confirm against the PDF readers' page keying
                    context_text: str = "\n\n".join(
                        (
                            (
                                pg_contents
                                if isinstance(pg_contents, str)
                                else pg_contents[0]
                            )
                            for _, pg_contents in sorted(
                                text_content.items(), key=lambda x: int(x[0])
                            )
                        )
                    )
                    radius_msg: str = "all pages"
                else:  # Specific page radius
                    page_texts: list[str] = []
                    # NOTE(review): assumes 1-based, contiguous page numbering
                    # (max page == len(text_content)) — verify for sparse reads
                    for pg_int in range(
                        max(1, int(page_num) - radius),
                        min(len(text_content), int(page_num) + radius) + 1,
                    ):
                        # Use get so we're tolerant to missing pages here
                        page_content = text_content.get(str(pg_int))
                        if page_content:
                            page_texts.append(
                                page_content
                                if isinstance(page_content, str)
                                else page_content[0]
                            )
                    context_text = "\n\n".join(page_texts)
                    radius_msg = (
                        f"a radius of {'1 page' if radius == 1 else f'{radius} pages'}"
                    )

                # Empty context_text collapses the preamble entirely,
                # which the prompt template explicitly allows for
                prompt = self.parsing.enrichment_prompt.format(
                    context_text=(
                        f"Here is the co-located text from {radius_msg}:\n\n{context_text}\n\n"
                        if context_text
                        else ""
                    ),
                    description_length=self.parsing.enrichment_description_length,
                )
                result = await llm.call_single(
                    messages=[
                        Message.create_message(
                            text=prompt, images=[media.to_image_url()]
                        )
                    ]
                )
                # Only store a non-empty description; a falsy result leaves
                # the media eligible for enrichment on a future call
                if result.text:
                    media.info["enriched_description"] = result.text.strip()

            # Enrich all media concurrently; one LLM call per media item
            await asyncio.gather(*list(starmap(enrich_single_media, media_to_enrich)))
            # Counts every enriched media, including ones enriched before
            # this call (the "don't clobber" filter above skipped them)
            count_enriched = sum(
                bool(media.info.get("enriched_description"))
                for page_num, page_contents in parsed_text.content.items()
                if isinstance(page_contents, tuple)
                for media in page_contents[1]
            )
            return f"enriched={count_enriched}|radius={radius}"

        return enrich_media_with_llm

def adjust_tools_for_agent_llm(self, tools: list[Tool]) -> None:
"""In-place adjust tool attributes or schemae to match agent LLM-specifics."""
# This was originally made for Gemini 1.5 Flash not supporting empty tool args
Expand Down
19 changes: 19 additions & 0 deletions tests/test_paperqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -1597,6 +1597,25 @@ async def validate(data: bytes) -> None: # noqa: RUF029
)


@pytest.mark.parametrize(
    ("multimodal_option", "expected"),
    [
        (False, (False, False)),
        (True, (True, True)),
        (MultimodalOptions.OFF, (False, False)),
        (MultimodalOptions.ON_WITH_ENRICHMENT, (True, True)),
        (MultimodalOptions.ON_WITHOUT_ENRICHMENT, (True, False)),
    ],
)
def test_should_parse_and_enrich_media(
    multimodal_option: bool | MultimodalOptions, expected: tuple[bool, bool]
) -> None:
    """Check (parse media, enrich media) flags for each multimodal setting."""
    parsing_settings = ParsingSettings(multimodal=multimodal_option)
    assert parsing_settings.should_parse_and_enrich_media == expected


@pytest.mark.asyncio
async def test_code() -> None:
settings = Settings.from_name("fast")
Expand Down