Skip to content
Merged
Prev Previous commit
Next Next commit
Retrying context creation once, and then abandoning if it fails twice
  • Loading branch information
jamesbraza committed Sep 10, 2025
commit 54b3559bd413d99db61b00d9cf9ce5e54df7cb07
108 changes: 87 additions & 21 deletions src/paperqa/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,11 +106,7 @@ def fraction_replacer(match: re.Match) -> str:
"relevance_score": int(score_match.group(1)),
}

raise ValueError(
f"Failed to parse JSON from text {text!r}. Your model may not be capable of"
" supporting JSON output or our parsing technique could use some work. Try"
" a different model or specify `Settings(prompts={'use_json': False})`"
) from e
raise ValueError(f"Failed to load JSON from text {text!r}.") from e

# Handling incorrect key names for "relevance_score"
for key in list(data.keys()):
Expand All @@ -130,7 +126,15 @@ def fraction_replacer(match: re.Match) -> str:
return data


async def map_fxn_summary(
class LLMBadContextJSONError(ValueError):
    """Raised when the LLM's context output cannot be parsed as usable JSON.

    Callers treat this as retryable: the stringified failure message can be
    fed back to the LLM to guide a subsequent attempt.
    """

    def __init__(self, message: str, llm_results: list[LLMResult]) -> None:
        super().__init__(message)
        # Keep the partial LLM results attached so token/cost accounting
        # survives across retry attempts.
        self.llm_results = llm_results


async def _map_fxn_summary( # noqa: PLR0912
text: Text,
question: str,
summary_llm_model: LLMModel | None,
Expand All @@ -140,6 +144,7 @@ async def map_fxn_summary(
callbacks: Sequence[Callable[[str], None]] | None = None,
skip_citation_strip: bool = False,
evidence_text_only_fallback: bool = False,
_prior_attempt: LLMBadContextJSONError | None = None,
) -> tuple[Context, list[LLMResult]]:
"""Parses the given text and returns a context object with the parser and prompt runner.

Expand All @@ -160,14 +165,25 @@ async def map_fxn_summary(
skip_citation_strip: Optional skipping of citation stripping, if you want to keep in the context.
evidence_text_only_fallback: Opt-in flag to allow retrying context creation
without media in the completion.
_prior_attempt: Optional failure from a prior attempt, for LLM result tracking.

Returns:
A two-tuple of the made Context, and any LLM results made along the way.
"""
llm_results = []
if _prior_attempt is not None:
llm_results = _prior_attempt.llm_results
append_msgs = [
Message(
content=(
"In a prior attempt, we failed with this failure message:"
f" {_prior_attempt!s}."
)
)
]
else:
llm_results, append_msgs = [], []
extras: dict[str, Any] = {}
citation = text.name + ": " + text.doc.formatted_citation
successfully_has_score = False
used_text_only_fallback = False

# Strip newlines in case chunking led to blank lines,
Expand Down Expand Up @@ -201,6 +217,7 @@ async def map_fxn_summary(
else None
),
),
*append_msgs,
],
callbacks=callbacks,
name="evidence:" + text.name,
Expand All @@ -218,42 +235,69 @@ async def map_fxn_summary(
messages=[
Message(role="system", content=system_prompt),
Message(content=message_prompt),
*append_msgs,
],
callbacks=callbacks,
name="evidence:" + text.name,
)
used_text_only_fallback = True
llm_results.append(llm_result)
context = llm_result.text or ""
result_data = parser(context) if parser else {}
try:
result_data = parser(context) if parser else {}
except ValueError as exc:
raise LLMBadContextJSONError(
f"Failed to parse JSON from context {context!r} due to: {exc}",
llm_results=llm_results,
) from exc
if result_data:
try:
context = result_data.pop("summary")
score = (
result_data.pop("relevance_score")
if "relevance_score" in result_data
else extract_score(context)
)
successfully_has_score = True
try:
score = (
result_data.pop("relevance_score")
if "relevance_score" in result_data
else extract_score(context)
)
except ValueError as exc:
raise LLMBadContextJSONError(
f"Successfully parsed JSON and extracted 'summary' key,"
f" but then failed to extract score from context {context!r} due to: {exc}",
llm_results=llm_results,
) from exc
# just in case question was present
result_data.pop("question", None)
extras = result_data
except KeyError:
successfully_has_score = False
except KeyError: # No summary key, so extract from LLM result
try:
score = extract_score(context)
except ValueError as exc:
raise LLMBadContextJSONError(
f"Successfully parsed JSON but it had no 'summary' key."
f" Then, the failover to extract score from raw context {context!r}"
f" failed due to: {exc}",
llm_results=llm_results,
) from exc
else:
try:
score = extract_score(context)
except ValueError as exc:
raise LLMBadContextJSONError(
f"Extracting score from raw context {context!r}"
f" failed due to: {exc}",
llm_results=llm_results,
) from exc
else:
llm_results.append(LLMResult(model="", date=""))
context = cleaned_text
# If we don't assign scores, just default to 5.
# why 5? Because we filter out 0s in another place
# and 5/10 is the other default I could come up with
score = 5
successfully_has_score = True
# remove citations that collide with our grounded citations (for the answer LLM)
if not skip_citation_strip:
context = strip_citations(context)

if not successfully_has_score:
score = extract_score(context)
if used_text_only_fallback:
extras["used_images"] = False

Expand All @@ -269,8 +313,30 @@ async def map_fxn_summary(
doc=text.doc.model_dump(exclude={"embedding"}),
**text.model_dump(exclude={"embedding", "doc"}),
),
score=score, # pylint: disable=possibly-used-before-assignment
score=score,
**extras,
),
llm_results,
)


async def map_fxn_summary(**kwargs) -> tuple[Context | None, list[LLMResult]]:
    """Create a summary Context, retrying once on bad JSON before abandoning.

    Delegates to `_map_fxn_summary`. On a first `LLMBadContextJSONError`, the
    failure is passed back in as `_prior_attempt` so the LLM can see what went
    wrong. On a second failure the context is abandoned: we log and return
    `None` plus the accumulated LLM results (so cost tracking still works).

    Raises:
        ValueError: If the internal-only `_prior_attempt` kwarg is supplied.
    """
    if "_prior_attempt" in kwargs:
        raise ValueError(
            "_prior_attempt is reserved for internal use only, don't specify it."
        )
    try:
        return await _map_fxn_summary(**kwargs)
    except LLMBadContextJSONError as first_failure:
        # Retry once, informing the LLM of the earlier failure's message.
        # NOTE: the retry stays nested in this handler so that a second
        # failure's logged traceback keeps the implicit exception chain.
        try:
            return await _map_fxn_summary(**kwargs, _prior_attempt=first_failure)
        except LLMBadContextJSONError as second_failure:
            logger.exception(
                "Failed twice to create a context, abandoning it."
                " Your model may not be capable of supporting JSON output"
                " or our parsing technique could use some work. Try"
                " a different model or specify `Settings(prompts={'use_json': False})`."
                " Or, feel free to just ignore this message, as many contexts are"
                " concurrently made and we're not attached to any one given context."
            )
            # Surface the accumulated results so callers can still add tokens.
            return None, second_failure.llm_results
2 changes: 1 addition & 1 deletion src/paperqa/docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -682,7 +682,7 @@ async def aget_evidence(
for r in llm_results:
session.add_tokens(r)

session.contexts += [r for r, _ in results]
session.contexts += [c for c, _ in results if c is not None]
return session

def query(
Expand Down