
Commit 7b5bd78

Feat: evaluate retrieved context against golden context [cog-1481] (#619)
## Description

- Compare retrieved context to golden context using deepeval's summarization metric
- Display the fields relevant to each metric on the metrics dashboard

Example output:

![image](https://github.com/user-attachments/assets/9facf716-b2ab-4573-bfdf-7b343d2a57c5)

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

## Summary by CodeRabbit

- **New Features**
  - Enhanced context handling in answer generation and corpus building to include golden context details.
  - Introduced a new context coverage metric for deeper evaluation insights.
  - Upgraded the evaluation dashboard to present metric details dynamically.
  - Added a new parameter to support loading golden context in corpus-loading methods.
- **Bug Fixes**
  - Improved clarity in how answers are structured and appended during answer generation.
1 parent ac01565 commit 7b5bd78

File tree

9 files changed: +115 −41 lines changed


cognee/eval_framework/answer_generation/answer_generation_executor.py

Lines changed: 11 additions & 8 deletions
```diff
@@ -29,13 +29,16 @@ async def question_answering_non_parallel(
         retrieval_context = await retriever.get_context(query_text)
         search_results = await retriever.get_completion(query_text, retrieval_context)

-        answers.append(
-            {
-                "question": query_text,
-                "answer": search_results[0],
-                "golden_answer": correct_answer,
-                "retrieval_context": retrieval_context,
-            }
-        )
+        answer = {
+            "question": query_text,
+            "answer": search_results[0],
+            "golden_answer": correct_answer,
+            "retrieval_context": retrieval_context,
+        }
+
+        if "golden_context" in instance:
+            answer["golden_context"] = instance["golden_context"]
+
+        answers.append(answer)

     return answers
```
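
The appended record keeps its previous shape and only gains a `golden_context` key when the benchmark instance provides one. A rough sketch of the result, with illustrative values borrowed from the dummy adapter below:

```python
# Illustrative record produced for one benchmark instance that carries a golden
# context; without it, the last key is simply absent and downstream behaviour
# is unchanged.
answer = {
    "question": "Is Neo4j supported by cognee?",
    "answer": "Yes, Neo4j is one of the supported graph databases.",
    "golden_answer": "Yes",
    "retrieval_context": "Neo4j is a graph database supported by cognee",
    "golden_context": "Cognee supports Neo4j and NetworkX",
}
```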

cognee/eval_framework/benchmark_adapters/dummy_adapter.py

Lines changed: 11 additions & 8 deletions
```diff
@@ -5,18 +5,21 @@


 class DummyAdapter(BaseBenchmarkAdapter):
     def load_corpus(
-        self, limit: Optional[int] = None, seed: int = 42
+        self, limit: Optional[int] = None, seed: int = 42, load_golden_context: bool = False
     ) -> tuple[list[str], list[dict[str, Any]]]:
         corpus_list = [
             "The cognee is an AI memory engine that supports different vector and graph databases",
             "Neo4j is a graph database supported by cognee",
         ]
-        question_answer_pairs = [
-            {
-                "answer": "Yes",
-                "question": "Is Neo4j supported by cognee?",
-                "type": "dummy",
-            }
-        ]
+        qa_pair = {
+            "answer": "Yes",
+            "question": "Is Neo4j supported by cognee?",
+            "type": "dummy",
+        }
+
+        if load_golden_context:
+            qa_pair["golden_context"] = "Cognee supports Neo4j and NetworkX"
+
+        question_answer_pairs = [qa_pair]

         return corpus_list, question_answer_pairs
```
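
A quick usage sketch of the new flag, assuming `DummyAdapter` can be constructed without arguments (its constructor is not part of this diff):

```python
from cognee.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter

adapter = DummyAdapter()
corpus, qa_pairs = adapter.load_corpus(load_golden_context=True)

# qa_pairs[0] now carries the golden context alongside the answer:
# {"answer": "Yes", "question": "Is Neo4j supported by cognee?",
#  "type": "dummy", "golden_context": "Cognee supports Neo4j and NetworkX"}
```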

cognee/eval_framework/corpus_builder/corpus_builder_executor.py

Lines changed: 12 additions & 4 deletions
```diff
@@ -28,14 +28,22 @@ def __init__(
         self.questions = None
         self.task_getter = task_getter

-    def load_corpus(self, limit: Optional[int] = None) -> Tuple[List[Dict], List[str]]:
-        self.raw_corpus, self.questions = self.adapter.load_corpus(limit=limit)
+    def load_corpus(
+        self, limit: Optional[int] = None, load_golden_context: bool = False
+    ) -> Tuple[List[Dict], List[str]]:
+        self.raw_corpus, self.questions = self.adapter.load_corpus(
+            limit=limit, load_golden_context=load_golden_context
+        )
         return self.raw_corpus, self.questions

     async def build_corpus(
-        self, limit: Optional[int] = None, chunk_size=1024, chunker=TextChunker
+        self,
+        limit: Optional[int] = None,
+        chunk_size=1024,
+        chunker=TextChunker,
+        load_golden_context: bool = False,
     ) -> List[str]:
-        self.load_corpus(limit=limit)
+        self.load_corpus(limit=limit, load_golden_context=load_golden_context)
         await self.run_cognee(chunk_size=chunk_size, chunker=chunker)
         return self.questions
```
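
A minimal call sketch for the widened `build_corpus` signature, assuming a configured `CorpusBuilderExecutor` instance is already available (its constructor arguments are outside this hunk):

```python
from typing import List

async def build_eval_questions(corpus_builder, evaluating_contexts: bool) -> List[str]:
    # `corpus_builder` is assumed to be a configured CorpusBuilderExecutor; the flag
    # is forwarded so the adapter can attach golden_context to each QA pair.
    return await corpus_builder.build_corpus(
        limit=5,
        chunk_size=1024,
        load_golden_context=evaluating_contexts,
    )
```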

cognee/eval_framework/corpus_builder/run_corpus_builder.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -47,7 +47,10 @@ async def run_corpus_builder(params: dict, chunk_size=1024, chunker=TextChunker)
         task_getter=task_getter,
     )
     questions = await corpus_builder.build_corpus(
-        limit=params.get("number_of_samples_in_corpus"), chunk_size=chunk_size, chunker=chunker
+        limit=params.get("number_of_samples_in_corpus"),
+        chunk_size=chunk_size,
+        chunker=chunker,
+        load_golden_context=params.get("evaluating_contexts"),
     )
     with open(params["questions_path"], "w", encoding="utf-8") as f:
         json.dump(questions, f, ensure_ascii=False, indent=4)
```
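
For reference, a pared-down sketch of the `params` dict that exercises this path; only the keys read in this hunk are shown, and a real eval config contains more:

```python
# Hypothetical, minimal slice of the eval parameters used above.
params = {
    "number_of_samples_in_corpus": 5,
    "evaluating_contexts": True,  # forwarded as load_golden_context
    "questions_path": "questions_output.json",
}
```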

cognee/eval_framework/evaluation/deep_eval_adapter.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -4,6 +4,7 @@
 from cognee.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
 from cognee.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
 from cognee.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
+from cognee.eval_framework.evaluation.metrics.context_coverage import ContextCoverageMetric
 from typing import Any, Dict, List
 from deepeval.metrics import ContextualRelevancyMetric

@@ -15,6 +16,7 @@ def __init__(self):
             "EM": ExactMatchMetric(),
             "f1": F1ScoreMetric(),
             "contextual_relevancy": ContextualRelevancyMetric(),
+            "context_coverage": ContextCoverageMetric(),
         }

     async def evaluate_answers(
@@ -32,6 +34,7 @@ async def evaluate_answers(
             actual_output=answer["answer"],
             expected_output=answer["golden_answer"],
             retrieval_context=[answer["retrieval_context"]],
+            context=[answer["golden_context"]] if "golden_context" in answer else None,
         )
         metric_results = {}
         for metric in evaluator_metrics:
```
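
For a record with a golden context, the adapter's mapping amounts to the following sketch; the record values are made up, and `input` is assumed to be the question, which sits just above this hunk:

```python
from deepeval.test_case import LLMTestCase

answer = {
    "question": "Is Neo4j supported by cognee?",
    "answer": "Yes",
    "golden_answer": "Yes",
    "retrieval_context": "Neo4j is a graph database supported by cognee",
    "golden_context": "Cognee supports Neo4j and NetworkX",
}

test_case = LLMTestCase(
    input=answer["question"],
    actual_output=answer["answer"],
    expected_output=answer["golden_answer"],
    retrieval_context=[answer["retrieval_context"]],
    # stays None for benchmarks without golden contexts, so existing runs are unaffected
    context=[answer["golden_context"]] if "golden_context" in answer else None,
)
```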

cognee/eval_framework/evaluation/evaluation_executor.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -23,5 +23,6 @@ def __init__(
     async def execute(self, answers: List[Dict[str, str]], evaluator_metrics: Any) -> Any:
         if self.evaluate_contexts:
             evaluator_metrics.append("contextual_relevancy")
+            evaluator_metrics.append("context_coverage")
         metrics = await self.eval_adapter.evaluate_answers(answers, evaluator_metrics)
         return metrics
```

cognee/eval_framework/evaluation/metrics/context_coverage.py

Lines changed: 50 additions & 0 deletions

```diff
@@ -0,0 +1,50 @@
+from deepeval.metrics import SummarizationMetric
+from deepeval.test_case import LLMTestCase
+from deepeval.metrics.summarization.schema import ScoreType
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.utils import get_or_create_event_loop
+
+
+class ContextCoverageMetric(SummarizationMetric):
+    def measure(
+        self,
+        test_case,
+        _show_indicator: bool = True,
+    ) -> float:
+        mapped_test_case = LLMTestCase(
+            input=test_case.context[0],
+            actual_output=test_case.retrieval_context[0],
+        )
+        self.assessment_questions = None
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(self, _show_indicator=_show_indicator):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                return loop.run_until_complete(
+                    self.a_measure(mapped_test_case, _show_indicator=False)
+                )
+            else:
+                self.coverage_verdicts = self._generate_coverage_verdicts(mapped_test_case)
+                self.alignment_verdicts = []
+                self.score = self._calculate_score(ScoreType.COVERAGE)
+                self.reason = self._generate_reason()
+                self.success = self.score >= self.threshold
+                return self.score
+
+    async def a_measure(
+        self,
+        test_case,
+        _show_indicator: bool = True,
+    ) -> float:
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+        ):
+            self.coverage_verdicts = await self._a_generate_coverage_verdicts(test_case)
+            self.alignment_verdicts = []
+            self.score = self._calculate_score(ScoreType.COVERAGE)
+            self.reason = await self._a_generate_reason()
+            self.success = self.score >= self.threshold
+            return self.score
```
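
A usage sketch for the new metric, assuming deepeval is installed and an LLM provider is configured (the metric calls the model to generate coverage verdicts); the test-case values are illustrative:

```python
from deepeval.test_case import LLMTestCase

from cognee.eval_framework.evaluation.metrics.context_coverage import ContextCoverageMetric

# measure() remaps context[0] -> input and retrieval_context[0] -> actual_output
# before delegating to SummarizationMetric's coverage scoring, so the score
# reflects how well the retrieved context covers the golden context.
metric = ContextCoverageMetric()
case = LLMTestCase(
    input="Is Neo4j supported by cognee?",   # required by LLMTestCase, not used for scoring here
    actual_output="Yes",                     # required by LLMTestCase, not used for scoring here
    retrieval_context=["Neo4j is a graph database supported by cognee"],
    context=["Cognee supports Neo4j and NetworkX"],
)

score = metric.measure(case)
print(score, metric.reason)
```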

cognee/eval_framework/metrics_dashboard.py

Lines changed: 17 additions & 19 deletions
```diff
@@ -3,6 +3,12 @@
 from typing import Dict, List, Tuple
 from collections import defaultdict

+metrics_fields = {
+    "contextual_relevancy": ["question", "retrieval_context"],
+    "context_coverage": ["question", "retrieval_context", "golden_context"],
+}
+default_metrics_fields = ["question", "answer", "golden_answer"]
+

 def create_distribution_plots(metrics_data: Dict[str, List[float]]) -> List[str]:
     """Create distribution histogram plots for each metric."""
@@ -59,38 +65,30 @@ def generate_details_html(metrics_data: List[Dict]) -> List[str]:
         for metric, values in entry["metrics"].items():
             if metric not in metric_details:
                 metric_details[metric] = []
+            current_metrics_fields = metrics_fields.get(metric, default_metrics_fields)
             metric_details[metric].append(
-                {
-                    "question": entry["question"],
-                    "answer": entry["answer"],
-                    "golden_answer": entry["golden_answer"],
+                {key: entry[key] for key in current_metrics_fields}
+                | {
                     "reason": values.get("reason", ""),
                     "score": values["score"],
                 }
            )

    for metric, details in metric_details.items():
+        formatted_column_names = [key.replace("_", " ").title() for key in details[0].keys()]
        details_html.append(f"<h3>{metric} Details</h3>")
-        details_html.append("""
+        details_html.append(f"""
            <table class="metric-table">
                <tr>
-                    <th>Question</th>
-                    <th>Answer</th>
-                    <th>Golden Answer</th>
-                    <th>Reason</th>
-                    <th>Score</th>
+                    {"".join(f"<th>{col}</th>" for col in formatted_column_names)}
                </tr>
        """)
        for item in details:
-            details_html.append(
-                f"<tr>"
-                f"<td>{item['question']}</td>"
-                f"<td>{item['answer']}</td>"
-                f"<td>{item['golden_answer']}</td>"
-                f"<td>{item['reason']}</td>"
-                f"<td>{item['score']}</td>"
-                f"</tr>"
-            )
+            details_html.append(f"""
+                <tr>
+                    {"".join(f"<td>{value}</td>" for value in item.values())}
+                </tr>
+            """)
        details_html.append("</table>")
    return details_html
```
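
A small sketch of how the per-metric field lists drive the row construction; the `entry` and `values` dicts are made-up stand-ins for one dashboard record, and the dict union (`|`) requires Python 3.9+:

```python
metrics_fields = {
    "contextual_relevancy": ["question", "retrieval_context"],
    "context_coverage": ["question", "retrieval_context", "golden_context"],
}
default_metrics_fields = ["question", "answer", "golden_answer"]

entry = {
    "question": "Is Neo4j supported by cognee?",
    "answer": "Yes",
    "golden_answer": "Yes",
    "retrieval_context": "Neo4j is a graph database supported by cognee",
    "golden_context": "Cognee supports Neo4j and NetworkX",
}
values = {"score": 0.9, "reason": "The retrieved context covers the golden context."}

# Pick the fields registered for the metric, falling back to the defaults,
# then merge in the score and reason for that row.
fields = metrics_fields.get("context_coverage", default_metrics_fields)
row = {key: entry[key] for key in fields} | {
    "reason": values.get("reason", ""),
    "score": values["score"],
}
# -> table columns: Question, Retrieval Context, Golden Context, Reason, Score
```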

cognee/tests/unit/eval_framework/deepeval_adapter_test.py

Lines changed: 6 additions & 1 deletion
```diff
@@ -5,7 +5,12 @@

 with patch.dict(
     sys.modules,
-    {"deepeval": MagicMock(), "deepeval.metrics": MagicMock(), "deepeval.test_case": MagicMock()},
+    {
+        "deepeval": MagicMock(),
+        "deepeval.metrics": MagicMock(),
+        "deepeval.test_case": MagicMock(),
+        "cognee.eval_framework.evaluation.metrics.context_coverage": MagicMock(),
+    },
 ):
     from cognee.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter

```