Skip to content

Commit 5004eb0

Browse files
authored
Supporting media enrichment in embeddings (#1143)
1 parent 7175ddd commit 5004eb0

13 files changed

+11099
-8312
lines changed

README.md

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -747,6 +747,24 @@ Depending on the source document, the same image can appear multiple times
747747
Thus, clients should consider media databases
748748
to have a many-to-many relationship with chunks.
749749

750+
Since PaperQA's evidence gathering process centers on text-based retrieval,
751+
it's possible relevant image(s) or table(s) aren't retrieved
752+
because their associated text content is irrelevant.
753+
For a concrete example, imagine the figure in a paper has a terse caption
754+
and is placed one page after relevant main-text discussion.
755+
To solve this problem, PaperQA supports media enrichment at document read-time.
756+
Basically, after reading in the PDF,
757+
the `parsing.enrichment_llm` is given the `parsing.enrichment_prompt`
758+
and co-located text to generate a synthetic caption for every image/table.
759+
The synthetic captions are used to shift the embeddings of each text chunk,
760+
but are kept separate from the actual source text.
761+
This way evidence gathering can fetch relevant images/tables
762+
without risk of polluting contextual summaries with LLM-generated captions.
763+
764+
If you want multimodal PDF reading, but do not want enrichment
765+
(since it adds one LLM prompt per media item at read-time),
766+
enrichment can be disabled by setting `parsing.multimodal` to `ON_WITHOUT_ENRICHMENT`.
767+
750768
When creating contextual summaries on a given chunk (a `Text`),
751769
the summary LLM is passed both the chunk's text and the chunk's associated media,
752770
but the output contextual summary itself remains text-only.
@@ -926,13 +944,17 @@ will return much faster than the first query and we'll be certain the authors ma
926944
| `parsing.pdfs_use_block_parsing` | `False` | Opt-in flag for block-based PDF parsing over text-based PDF parsing. |
927945
| `parsing.use_doc_details` | `True` | Whether to get metadata details for docs. |
928946
| `parsing.overlap` | `250` | Characters to overlap chunks. |
929-
| `parsing.multimodal` | `True` | Flag to parse both text and images from applicable documents. |
947+
| `parsing.multimodal` | `True` | Controls parsing of both text and media from applicable documents, as well as potentially enriching the media with text descriptions. |
930948
| `parsing.defer_embedding` | `False` | Whether to defer embedding until summarization. |
931949
| `parsing.parse_pdf` | `paperqa_pypdf.parse_pdf_to_pages` | Function to parse PDF files. |
932950
| `parsing.configure_pdf_parser` | No-op | Callable to configure the PDF parser within `parse_pdf`, useful for behaviors such as enabling logging. |
933951
| `parsing.chunking_algorithm` | `ChunkingOptions.SIMPLE_OVERLAP` | Algorithm for chunking. |
934952
| `parsing.doc_filters` | `None` | Optional filters for allowed documents. |
935953
| `parsing.use_human_readable_clinical_trials` | `False` | Parse clinical trial JSONs into readable text. |
954+
| `parsing.enrichment_llm` | `"gpt-4o-2024-11-20"` | LLM for media enrichment. |
955+
| `parsing.enrichment_llm_config` | `None` | Optional configuration for `enrichment_llm`. |
956+
| `parsing.enrichment_page_radius` | `1` | Page radius for context text in enrichment. |
957+
| `parsing.enrichment_prompt` | `image_enrichment_prompt_template` | Prompt template for enriching media. |
936958
| `prompt.summary` | `summary_prompt` | Template for summarizing text, must contain variables matching `summary_prompt`. |
937959
| `prompt.qa` | `qa_prompt` | Template for QA, must contain variables matching `qa_prompt`. |
938960
| `prompt.select` | `select_paper_prompt` | Template for selecting papers, must contain variables matching `select_paper_prompt`. |

src/paperqa/core.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -232,17 +232,19 @@ async def _map_fxn_summary( # noqa: PLR0912
232232
cleaned_text = text.text.strip("\n") or "(no text)"
233233
if summary_llm_model and prompt_templates:
234234
unique_media = list(dict.fromkeys(text.media)) # Preserve order
235-
media_text: list[str] = [m.text for m in unique_media if m.text]
235+
table_texts: list[str] = [
236+
m.text for m in unique_media if m.info.get("type") == "table" and m.text
237+
]
236238
data = {
237239
"question": question,
238240
"citation": citation,
239241
"text": (
240242
text_with_tables_prompt_template.format(
241243
text=cleaned_text,
242244
citation=citation,
243-
tables="\n\n".join(media_text),
245+
tables="\n\n".join(table_texts),
244246
)
245-
if media_text
247+
if table_texts
246248
else cleaned_text
247249
),
248250
} | (extra_prompt_data or {})

src/paperqa/docs.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import asyncio
34
import json
45
import logging
56
import os
@@ -387,16 +388,22 @@ async def aadd( # noqa: PLR0912
387388
doc, **(query_kwargs | kwargs)
388389
)
389390

391+
parse_images, enrich_media = parse_config.should_parse_and_enrich_media
392+
multimodal_kwargs: dict[str, Any] = {"parse_images": parse_images}
393+
if enrich_media:
394+
multimodal_kwargs["multimodal_enricher"] = (
395+
all_settings.make_media_enricher()
396+
)
390397
texts, metadata = await read_doc(
391398
path,
392399
doc,
393400
chunk_chars=parse_config.chunk_size,
394401
overlap=parse_config.overlap,
395402
page_size_limit=parse_config.page_size_limit,
396403
use_block_parsing=parse_config.pdfs_use_block_parsing,
397-
parse_images=parse_config.multimodal,
398404
parse_pdf=parse_config.parse_pdf,
399405
include_metadata=True,
406+
**multimodal_kwargs,
400407
)
401408
# loose check to see if document was loaded
402409
if metadata.name != "image" and (
@@ -480,7 +487,16 @@ async def aadd_texts(
480487
if embedding_model and texts[0].embedding is None:
481488
for t, t_embedding in zip(
482489
texts,
483-
await embedding_model.embed_documents(texts=[t.text for t in texts]),
490+
await embedding_model.embed_documents(
491+
texts=await asyncio.gather(
492+
*(
493+
t.get_embeddable_text(
494+
all_settings.parsing.should_parse_and_enrich_media[1]
495+
)
496+
for t in texts
497+
)
498+
)
499+
),
484500
strict=True,
485501
):
486502
t.embedding = t_embedding
@@ -534,14 +550,20 @@ def delete(
534550
self.deleted_dockeys.add(dockey)
535551
self.texts = list(filter(lambda x: x.doc.dockey != dockey, self.texts))
536552

537-
async def _build_texts_index(self, embedding_model: EmbeddingModel) -> None:
553+
async def _build_texts_index(
554+
self, embedding_model: EmbeddingModel, with_enrichment: bool = False
555+
) -> None:
538556
texts = [t for t in self.texts if t not in self.texts_index]
539557
# For any embeddings we are supposed to lazily embed, embed them now
540558
to_embed = [t for t in texts if t.embedding is None]
541559
if to_embed:
542560
for t, t_embedding in zip(
543561
to_embed,
544-
await embedding_model.embed_documents(texts=[t.text for t in to_embed]),
562+
await embedding_model.embed_documents(
563+
texts=await asyncio.gather(
564+
*(t.get_embeddable_text(with_enrichment) for t in to_embed)
565+
)
566+
),
545567
strict=True,
546568
):
547569
t.embedding = t_embedding
@@ -563,7 +585,10 @@ async def retrieve_texts(
563585
# TODO: should probably happen elsewhere
564586
self.texts_index.mmr_lambda = settings.texts_index_mmr_lambda
565587

566-
await self._build_texts_index(embedding_model)
588+
await self._build_texts_index(
589+
embedding_model,
590+
with_enrichment=settings.parsing.should_parse_and_enrich_media[1],
591+
)
567592
_k = k + len(self.deleted_dockeys)
568593
matches: list[Text] = cast(
569594
"list[Text]",

src/paperqa/prompts.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,3 +162,39 @@
162162
EMPTY_CONTEXTS = len(CONTEXT_OUTER_PROMPT.format(context_str="", valid_keys="").strip())
163163
CONTEXT_INNER_PROMPT_NOT_DETAILED = "{name}: {text}"
164164
CONTEXT_INNER_PROMPT = f"{CONTEXT_INNER_PROMPT_NOT_DETAILED}\nFrom {{citation}}"
165+
166+
# For reference, here's Docling's image description prompt:
167+
# https://github.com/docling-project/docling/blob/v2.55.1/docling/datamodel/pipeline_options.py#L214-L216
168+
media_enrichment_prompt_template = (
169+
"You are analyzing an image or table from a scientific document."
170+
" Provide a detailed description that will be used to answer questions about its content."
171+
" Focus on key elements, data, relationships, and scientific insights visible in the image."
172+
" It's especially important to document referential information such as"
173+
" figure/table numbers, labels, plot colors, or legends."
174+
"\n\nText co-located with the media may be associated with"
175+
" other media or unrelated content,"
176+
" so do not just blindly quote referential information."
177+
" The smaller the image, the more likely co-located text is unrelated."
178+
" To restate, often the co-located text is several pages of content,"
179+
" so only use aspects relevant to accompanying image or table."
180+
"\n\nHere's a few failure mode with possible resolutions:"
181+
"\n- The media was a logo or icon, so the text is unrelated."
182+
" In this case, briefly describe the media as a logo or icon,"
183+
" and do not mention other unrelated surrounding text."
184+
"\n- The media was display type, so the text is probably unrelated."
185+
" The display type can be spread over several lines."
186+
" In this case, briefly describe the media as display type,"
187+
" and do not mention other unrelated surrounding text."
188+
"\n- The media is a margin box or design element, so the text is unrelated."
189+
" In this case, briefly describe the media as decorative,"
190+
" and do not mention other unrelated surrounding text."
191+
"\n- The media came from a bad PDF read, so it's garbled."
192+
" In this case, describe the media as garbled, state why it's considered garbled,"
193+
" and do not mention other unrelated surrounding text."
194+
"\n- The media is a subfigure or a subtable."
195+
" In this case, make sure to only detail the subfigure or subtable,"
196+
" not the entire figure or table."
197+
" Do not mention other unrelated surrounding text."
198+
"\n\n{context_text}Describe the media," # Allow for empty context_text
199+
" or if uncertain on a description please state why:"
200+
)

src/paperqa/readers.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,8 @@ def chunk_code_text(
275275

276276

277277
IMAGE_EXTENSIONS = tuple({".png", ".jpg", ".jpeg"})
278+
# When HTML reader supports images, add here
279+
ENRICHMENT_EXTENSIONS = tuple({".pdf", *IMAGE_EXTENSIONS})
278280

279281

280282
@overload
@@ -285,6 +287,7 @@ async def read_doc(
285287
include_metadata: Literal[True],
286288
chunk_chars: int = ...,
287289
overlap: int = ...,
290+
multimodal_enricher: Callable[[ParsedText], Awaitable] | None = ...,
288291
parse_pdf: PDFParserFn | None = ...,
289292
**parser_kwargs,
290293
) -> ParsedText: ...
@@ -296,6 +299,7 @@ async def read_doc(
296299
include_metadata: Literal[False] = ...,
297300
chunk_chars: int = ...,
298301
overlap: int = ...,
302+
multimodal_enricher: Callable[[ParsedText], Awaitable] | None = ...,
299303
parse_pdf: PDFParserFn | None = ...,
300304
**parser_kwargs,
301305
) -> ParsedText: ...
@@ -307,6 +311,7 @@ async def read_doc(
307311
include_metadata: Literal[True],
308312
chunk_chars: int = ...,
309313
overlap: int = ...,
314+
multimodal_enricher: Callable[[ParsedText], Awaitable] | None = ...,
310315
parse_pdf: PDFParserFn | None = ...,
311316
**parser_kwargs,
312317
) -> tuple[list[Text], ParsedMetadata]: ...
@@ -318,6 +323,7 @@ async def read_doc(
318323
include_metadata: Literal[False] = ...,
319324
chunk_chars: int = ...,
320325
overlap: int = ...,
326+
multimodal_enricher: Callable[[ParsedText], Awaitable] | None = ...,
321327
parse_pdf: PDFParserFn | None = ...,
322328
**parser_kwargs,
323329
) -> list[Text]: ...
@@ -329,6 +335,8 @@ async def read_doc(
329335
include_metadata: Literal[True],
330336
chunk_chars: int = ...,
331337
overlap: int = ...,
338+
image_enrichment_pages: int | bool = ...,
339+
multimodal_enricher: Callable[[ParsedText], Awaitable] | None = ...,
332340
parse_pdf: PDFParserFn | None = ...,
333341
**parser_kwargs,
334342
) -> tuple[list[Text], ParsedMetadata]: ...
@@ -339,6 +347,7 @@ async def read_doc( # noqa: PLR0912
339347
include_metadata: bool = False,
340348
chunk_chars: int = 3000,
341349
overlap: int = 100,
350+
multimodal_enricher: Callable[[ParsedText], Awaitable[str]] | None = None,
342351
parse_pdf: PDFParserFn | None = None,
343352
**parser_kwargs,
344353
) -> list[Text] | ParsedText | tuple[list[Text], ParsedMetadata]:
@@ -351,6 +360,8 @@ async def read_doc( # noqa: PLR0912
351360
include_metadata: Opt-in flag to include metadata about the chunking algorithm.
352361
chunk_chars: size of chunks
353362
overlap: size of overlap between chunks
363+
multimodal_enricher: Optional function to enrich the parsed text
364+
and return a hashable string summary before chunking.
354365
parse_pdf: Optional function to parse PDF files (if you're parsing a PDF).
355366
parser_kwargs: Keyword arguments to pass to the used parsing function.
356367
"""
@@ -380,6 +391,13 @@ async def read_doc( # noqa: PLR0912
380391
if parsed_text_only:
381392
return parsed_text
382393

394+
# Enrich upon full parsed text before chunking, since enrichment
395+
# may view adjacent pages (and not getting cut off on chunk boundaries)
396+
if str_path.endswith(ENRICHMENT_EXTENSIONS) and multimodal_enricher:
397+
enrichment_summary: str = f"|{await multimodal_enricher(parsed_text)}"
398+
else:
399+
enrichment_summary = ""
400+
383401
# next chunk the parsed text
384402

385403
if chunk_chars == 0:
@@ -389,7 +407,10 @@ async def read_doc( # noqa: PLR0912
389407
chunk_metadata = ChunkMetadata(
390408
size=0,
391409
overlap=0,
392-
name=f"paper-qa={pqa_version}|algorithm=none|reduction=cl100k_base",
410+
name=(
411+
f"paper-qa={pqa_version}|algorithm=none"
412+
f"|reduction=cl100k_base{enrichment_summary}"
413+
),
393414
)
394415
elif str_path.endswith(".pdf"):
395416
chunked_text = chunk_pdf(
@@ -400,7 +421,7 @@ async def read_doc( # noqa: PLR0912
400421
overlap=overlap,
401422
name=(
402423
f"paper-qa={pqa_version}|algorithm=overlap-pdf"
403-
f"|size={chunk_chars}|overlap={overlap}"
424+
f"|size={chunk_chars}|overlap={overlap}{enrichment_summary}"
404425
),
405426
)
406427
elif str_path.endswith(IMAGE_EXTENSIONS):
@@ -410,7 +431,7 @@ async def read_doc( # noqa: PLR0912
410431
chunk_metadata = ChunkMetadata(
411432
size=0,
412433
overlap=0,
413-
name=f"paper-qa={pqa_version}|algorithm=none",
434+
name=f"paper-qa={pqa_version}|algorithm=none{enrichment_summary}",
414435
)
415436
elif str_path.endswith((".txt", ".html")):
416437
chunked_text = chunk_text(
@@ -421,7 +442,7 @@ async def read_doc( # noqa: PLR0912
421442
overlap=overlap,
422443
name=(
423444
f"paper-qa={pqa_version}|algorithm=overlap-text|reduction=cl100k_base"
424-
f"|size={chunk_chars}|overlap={overlap}"
445+
f"|size={chunk_chars}|overlap={overlap}{enrichment_summary}"
425446
),
426447
)
427448
else:
@@ -433,7 +454,7 @@ async def read_doc( # noqa: PLR0912
433454
overlap=overlap,
434455
name=(
435456
f"paper-qa={pqa_version}|algorithm=overlap-code|reduction=cl100k_base"
436-
f"|size={chunk_chars}|overlap={overlap}"
457+
f"|size={chunk_chars}|overlap={overlap}{enrichment_summary}"
437458
),
438459
)
439460

0 commit comments

Comments
 (0)