Removing irrelevant contexts, instead of filtering them out for 'Best evidence(s)'
Future-House · jamesbraza · Sep 27, 2025 · Sep 26, 2025 · Sep 26, 2025 · Sep 26, 2025
commit e64d5aac2d751f30b3ca3f6a4608466bd5c952a3
diff --git a/src/paperqa/agents/tools.py b/src/paperqa/agents/tools.py
@@ -252,13 +252,12 @@ async def gather_evidence(self, question: str, state: EnvironmentState) -> str:
         status = state.status
         logger.info(status)
         # only show top n contexts for this particular question to the agent
-        # only show context above score 0, because 0 is a sentinel for irrelevance
-        sorted_relevant_contexts = sorted(
-            [
+        sorted_contexts = sorted(
+            (
                 c
                 for c in state.session.contexts
-                if ((c.question is None or c.question == question) and c.score > 0)
-            ],
+                if c.question is None or c.question == question
+            ),
             key=lambda x: x.score,
             reverse=True,
         )
@@ -267,7 +266,7 @@ async def gather_evidence(self, question: str, state: EnvironmentState) -> str:
             [
                 f"{n + 1}. {sc.context}\n"
                 for n, sc in enumerate(
-                    sorted_relevant_contexts[: self.settings.agent.agent_evidence_n]
+                    sorted_contexts[: self.settings.agent.agent_evidence_n]
                 )
             ]
         )

diff --git a/src/paperqa/docs.py b/src/paperqa/docs.py
@@ -682,7 +682,8 @@ async def aget_evidence(
             for r in llm_results:
                 session.add_tokens(r)
 
-        session.contexts += [c for c, _ in results if c is not None]
+        # Filter out failed context creations or irrelevant contexts
+        session.contexts += [c for c, _ in results if c is not None and c.score > 0]
         return session
 
     def query(

diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py
@@ -6,7 +6,6 @@
 import os
 import pathlib
 import pickle
-import random
 import re
 import sys
 from collections.abc import AsyncIterable, Sequence
@@ -1077,17 +1076,21 @@ async def test_unrelated_context(
     assert await docs.aadd(
         stub_data_dir / "bates.txt", "WikiMedia Foundation, 2023, Accessed now"
     )
+    assert docs.texts, "Test requires at least one text"
     session = await docs.aget_evidence(
         "What do scientist estimate as the planetary composition of Jupyter?",
         settings=agent_test_settings,
     )
-    assert session.contexts, "Test relies on some contexts being added"
+    session.contexts.append(  # Give a context so the rest of the test can run
+        Context(
+            context="George Washington is a founding father",
+            question="What do scientist estimate as the planetary composition of Jupyter?",
+            text=docs.texts[0],
+            score=1,
+        )
+    )
     for c in session.contexts:
         assert c.score <= 2, "Expected contexts to be considered irrelevant"
-        if c.score <= 0:
-            # Now, let's trick the system into thinking the context
-            # was at least somewhat relevant
-            c.score = random.randint(1, 2)
     session = await docs.aquery(session, settings=agent_test_settings)
     assert unsure_sentinel in session.answer
 
@@ -1652,6 +1655,8 @@ async def test_images_corrupt(stub_data_dir: Path) -> None:
     )
     assert districts_docname, "Expected successful image addition"
     (districts_doc,) = (d for d in docs.docs.values() if d.docname == districts_docname)
+    (districts_text,) = docs.texts
+    assert not districts_text.text, "Test expects no text content from image addition"
     for media in (t.media for t in docs.texts if t.doc == districts_doc and t.media):
         for m in media:
             # Validate the image, then chop the image in half (breaking it), and
@@ -1669,27 +1674,13 @@ async def test_images_corrupt(stub_data_dir: Path) -> None:
 
     # By suppressing the use of images, we can actually gather evidence now
     settings.answer.evidence_text_only_fallback = True
-    # The answer will be garbage, but let's make sure we didn't claim to use images
     session = await docs.aget_evidence(
         "What districts neighbor the Western Addition?", settings=settings
     )
-    assert session.contexts, "Test relies on some contexts being added"
-    for c in session.contexts:
-        assert c.score <= 2, "Expected contexts to be considered irrelevant"
-        if c.score <= 0:
-            # Now, let's trick the system into thinking the context
-            # was at least somewhat relevant
-            c.score = random.randint(1, 2)
-    await docs.aquery(session, settings=settings)
-    assert session.used_contexts
-    assert session.cost > 0
-    contexts_used = [
-        c
-        for c in session.contexts
-        if c.id in session.used_contexts and c.text.doc == districts_doc
-    ]
-    assert contexts_used
-    assert all(not bool(c.used_images) for c in contexts_used)  # type: ignore[attr-defined]
+    assert (
+        not session.contexts
+    ), "Expected no contexts to be made from a bad image that has no text"
+    assert session.cost > 0, "Expected some costs to have been incurred in our attempt"
 
 
 def test_zotero() -> None: