Compare retrieved context with golden context
alekszievr committed Mar 6, 2025
commit b7f9e166358ffe0e2d1030554a1c0222c21f627a
@@ -29,13 +29,16 @@ async def question_answering_non_parallel(
         retrieval_context = await retriever.get_context(query_text)
         search_results = await retriever.get_completion(query_text, retrieval_context)
 
-        answers.append(
-            {
-                "question": query_text,
-                "answer": search_results[0],
-                "golden_answer": correct_answer,
-                "retrieval_context": retrieval_context,
-            }
-        )
+        answer = {
+            "question": query_text,
+            "answer": search_results[0],
+            "golden_answer": correct_answer,
+            "retrieval_context": retrieval_context,
+        }
+
+        if "golden_context" in instance:
+            answer["golden_context"] = instance["golden_context"]
+
+        answers.append(answer)
 
     return answers
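With this change, each entry in answers only carries a "golden_context" key when the benchmark instance provides one. For illustration, a sketch of the shape of a single entry (the values are made up):

    {
        "question": "Who wrote The Master and Margarita?",
        "answer": "Mikhail Bulgakov",
        "golden_answer": "Mikhail Bulgakov",
        "retrieval_context": "...retrieved passage...",
        "golden_context": "...ground-truth passage...",  # only if present in the instance
    }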
16 changes: 12 additions & 4 deletions cognee/eval_framework/corpus_builder/corpus_builder_executor.py
@@ -28,14 +28,22 @@ def __init__(
         self.questions = None
         self.task_getter = task_getter
 
-    def load_corpus(self, limit: Optional[int] = None) -> Tuple[List[Dict], List[str]]:
-        self.raw_corpus, self.questions = self.adapter.load_corpus(limit=limit)
+    def load_corpus(
+        self, limit: Optional[int] = None, load_golden_context: bool = False
+    ) -> Tuple[List[Dict], List[str]]:
+        self.raw_corpus, self.questions = self.adapter.load_corpus(
+            limit=limit, load_golden_context=load_golden_context
+        )
         return self.raw_corpus, self.questions
 
     async def build_corpus(
-        self, limit: Optional[int] = None, chunk_size=1024, chunker=TextChunker
+        self,
+        limit: Optional[int] = None,
+        chunk_size=1024,
+        chunker=TextChunker,
+        load_golden_context: bool = False,
     ) -> List[str]:
-        self.load_corpus(limit=limit)
+        self.load_corpus(limit=limit, load_golden_context=load_golden_context)
         await self.run_cognee(chunk_size=chunk_size, chunker=chunker)
         return self.questions
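For orientation, a hedged usage sketch of the widened build_corpus signature; the constructor arguments below are assumptions, since this view only shows part of __init__:

    # Sketch under assumed constructor arguments; not taken from this commit.
    corpus_builder = CorpusBuilderExecutor(benchmark="HotPotQA", task_getter=task_getter)
    questions = await corpus_builder.build_corpus(
        limit=10,
        chunk_size=1024,
        chunker=TextChunker,
        load_golden_context=True,  # new flag, forwarded to the benchmark adapter
    )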
5 changes: 4 additions & 1 deletion cognee/eval_framework/corpus_builder/run_corpus_builder.py
@@ -47,7 +47,10 @@ async def run_corpus_builder(params: dict, chunk_size=1024, chunker=TextChunker)
         task_getter=task_getter,
     )
     questions = await corpus_builder.build_corpus(
-        limit=params.get("number_of_samples_in_corpus"), chunk_size=chunk_size, chunker=chunker
+        limit=params.get("number_of_samples_in_corpus"),
+        chunk_size=chunk_size,
+        chunker=chunker,
+        load_golden_context=params.get("evaluating_contexts"),
     )
     with open(params["questions_path"], "w", encoding="utf-8") as f:
         json.dump(questions, f, ensure_ascii=False, indent=4)
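Golden-context loading is keyed off the same "evaluating_contexts" flag the evaluation side uses, so a params dict along these lines turns on both (any other keys the pipeline requires are omitted here):

    params = {
        "number_of_samples_in_corpus": 10,
        "questions_path": "questions.json",
        "evaluating_contexts": True,  # also enables load_golden_context above
    }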
3 changes: 3 additions & 0 deletions cognee/eval_framework/evaluation/deep_eval_adapter.py
@@ -4,6 +4,7 @@
 from cognee.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
 from cognee.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
 from cognee.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
+from cognee.eval_framework.evaluation.metrics.context_match import ContextMatchMetric
 from typing import Any, Dict, List
 from deepeval.metrics import ContextualRelevancyMetric
@@ -15,6 +16,7 @@ def __init__(self):
             "EM": ExactMatchMetric(),
             "f1": F1ScoreMetric(),
             "contextual_relevancy": ContextualRelevancyMetric(),
+            "context_match": ContextMatchMetric(),
         }
 
     async def evaluate_answers(
@@ -32,6 +34,7 @@ async def evaluate_answers(
                 actual_output=answer["answer"],
                 expected_output=answer["golden_answer"],
                 retrieval_context=[answer["retrieval_context"]],
+                context=[answer["golden_context"]] if "golden_context" in answer else None,
             )
             metric_results = {}
             for metric in evaluator_metrics:
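The ContextMatchMetric implementation itself is not shown in this commit view. As rough orientation only, a minimal sketch of what a deepeval-style custom metric comparing retrieved context against golden context could look like, assuming a simple token-overlap score (the real metric in cognee/eval_framework/evaluation/metrics/context_match.py may differ):

    from deepeval.metrics import BaseMetric
    from deepeval.test_case import LLMTestCase


    class ContextMatchMetric(BaseMetric):
        # Hypothetical sketch: token-level recall of golden context in retrieved context.
        def __init__(self, threshold: float = 0.5):
            self.threshold = threshold

        def measure(self, test_case: LLMTestCase) -> float:
            retrieved = set(" ".join(test_case.retrieval_context or []).split())
            golden = set(" ".join(test_case.context or []).split())
            self.score = len(golden & retrieved) / len(golden) if golden else 0.0
            self.success = self.score >= self.threshold
            return self.score

        async def a_measure(self, test_case: LLMTestCase) -> float:
            return self.measure(test_case)

        def is_successful(self) -> bool:
            return self.success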
1 change: 1 addition & 0 deletions cognee/eval_framework/evaluation/evaluation_executor.py
@@ -23,5 +23,6 @@ def __init__(
     async def execute(self, answers: List[Dict[str, str]], evaluator_metrics: Any) -> Any:
         if self.evaluate_contexts:
             evaluator_metrics.append("contextual_relevancy")
+            evaluator_metrics.append("context_match")
         metrics = await self.eval_adapter.evaluate_answers(answers, evaluator_metrics)
         return metrics