Merged. Changes from all commits (53 commits):
468de67  refactor: utils folder in retrieval (lxobr, Feb 21, 2025)
d789dd0  feat: add base_retriever.py (lxobr, Feb 21, 2025)
49c2355  feat: add retriever classes (lxobr, Feb 21, 2025)
7619df2  fix: include generate_completion function (lxobr, Feb 21, 2025)
5a5eb5e  feat: add search comparison script, compare summaries (lxobr, Feb 21, 2025)
8f0cbee  feat: enable context dumping (lxobr, Feb 24, 2025)
beacdea  fix: improve context getting and completion (lxobr, Feb 24, 2025)
4b71081  feat: add all searches and context comparisons (lxobr, Feb 24, 2025)
7631b11  Merge branch 'dev' into feat/COG-1365-unify-retrievers (lxobr, Feb 24, 2025)
62f8ac3  Update cognee/tasks/completion/query_completion.py (lxobr, Feb 24, 2025)
58c7eaf  feat: context dumping error handling (lxobr, Feb 24, 2025)
afd5ca4  feat: expose aggregate metrics, enable saving (lxobr, Feb 24, 2025)
8bf5aae  feat: add modal example (lxobr, Feb 24, 2025)
fd7f837  delete: metrics_dashboard.py (lxobr, Feb 24, 2025)
416eed1  fix: dashboard generation (lxobr, Feb 24, 2025)
d9fcb12  feat: add get_golden_context flag (lxobr, Feb 25, 2025)
36dbdf7  feat: implement get_golden_context for hotpot_qa (lxobr, Feb 25, 2025)
c07cf22  chore: added todos (lxobr, Feb 25, 2025)
2ef174a  chore: added a todo (lxobr, Feb 25, 2025)
5910fb7  Merge branch 'dev' into feat/COG-1365-unify-retrievers (lxobr, Feb 25, 2025)
65784e1  Merge branch 'dev' into feat/COG-1364-golden-contexts (lxobr, Feb 25, 2025)
bdaea29  feat: simplify twowikimultihop, get golden context (lxobr, Feb 25, 2025)
32d5829  feat: add golden context to musique_adapter.py (lxobr, Feb 25, 2025)
ec3b753  Merge branch 'dev' into feat/COG-1331-modal-run-eval (lxobr, Feb 25, 2025)
2f70de4  fix: update tests (lxobr, Feb 27, 2025)
3d0b839  Merge branch 'dev' into feat/COG-1365-unify-retrievers (lxobr, Feb 27, 2025)
4903d7e  feat: update code retriever (lxobr, Feb 27, 2025)
e98c12e  refactor: rename variables (lxobr, Feb 27, 2025)
af5d7c6  Merge branch 'dev' into feat/COG-1364-golden-contexts (lxobr, Feb 27, 2025)
0ece58a  refactor: add metadata_field_name property (lxobr, Feb 27, 2025)
cb0fccd  Merge remote-tracking branch 'origin/feat/COG-1331-modal-run-eval' in… (alekszievr, Feb 27, 2025)
1eb5e71  Merge remote-tracking branch 'origin/feat/COG-1364-golden-contexts' i… (alekszievr, Feb 27, 2025)
30927d7  First render. (soobrosa, Feb 27, 2025)
b02231d  Small fixes. (soobrosa, Feb 27, 2025)
2d90221  coderabbit don't be smart (soobrosa, Feb 27, 2025)
68a4584  Merge branch 'dev' into feat/cog-1366-add-context-evaluation (alekszievr, Feb 27, 2025)
8521fa8  Merge remote-tracking branch 'origin/feature/cog-1403-transition-to-n… (alekszievr, Feb 27, 2025)
3906bf5  Merge branch 'dev' into feat/cog-1366-add-context-evaluation (alekszievr, Feb 27, 2025)
aae1237  Calculate context relevancy score (alekszievr, Feb 27, 2025)
4cffd4b  Adjust dashboard tests (alekszievr, Feb 27, 2025)
2e2beb3  Adjust answer generation test (alekszievr, Feb 27, 2025)
7a574e3  adjust deepeval adapter test (alekszievr, Feb 27, 2025)
a17a5c8  Fix type hinting (alekszievr, Feb 27, 2025)
3f10725  Merge branch 'dev' into feat/cog-1366-add-context-evaluation (alekszievr, Feb 27, 2025)
df5ba7b  Merge branch 'dev' into feat/cog-1366-add-context-evaluation (alekszievr, Feb 27, 2025)
4c09877  Merge branch 'dev' into feat/cog-1366-add-context-evaluation (alekszievr, Mar 3, 2025)
d9b007a  ruff format (alekszievr, Mar 3, 2025)
5691a1f  fix (alekszievr, Mar 3, 2025)
634a7fa  fix: add comment to new param (borisarzentar, Mar 3, 2025)
3453ede  Merge branch 'dev' into feat/cog-1366-add-context-evaluation (borisarzentar, Mar 3, 2025)
5b9a64d  Merge branch 'dev' into feat/cog-1366-add-context-evaluation (alekszievr, Mar 5, 2025)
6b2b6f2  Pass system prompt in question answering (alekszievr, Mar 5, 2025)
1a3371e  Adjust tests (alekszievr, Mar 5, 2025)
@@ -1,42 +1,40 @@
-import cognee
-from typing import List, Dict, Callable, Awaitable
-from cognee.api.v1.search import SearchType
-
-question_answering_engine_options: Dict[str, Callable[[str, str], Awaitable[List[str]]]] = {
-    "cognee_graph_completion": lambda query, system_prompt_path: cognee.search(
-        query_type=SearchType.GRAPH_COMPLETION,
-        query_text=query,
-        system_prompt_path=system_prompt_path,
-    ),
-    "cognee_completion": lambda query, system_prompt_path: cognee.search(
-        query_type=SearchType.COMPLETION, query_text=query, system_prompt_path=system_prompt_path
-    ),
-    "graph_summary_completion": lambda query, system_prompt_path: cognee.search(
-        query_type=SearchType.GRAPH_SUMMARY_COMPLETION,
-        query_text=query,
-        system_prompt_path=system_prompt_path,
-    ),
+from typing import List, Dict
+from cognee.modules.retrieval.completion_retriever import CompletionRetriever
+from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever
+from cognee.modules.retrieval.graph_summary_completion_retriever import (
+    GraphSummaryCompletionRetriever,
+)
+
+from cognee.modules.retrieval.base_retriever import BaseRetriever
+
+
+retriever_options: Dict[str, BaseRetriever] = {
+    "cognee_graph_completion": GraphCompletionRetriever,
+    "cognee_completion": CompletionRetriever,
+    "graph_summary_completion": GraphSummaryCompletionRetriever,
 }


 class AnswerGeneratorExecutor:
     async def question_answering_non_parallel(
         self,
         questions: List[Dict[str, str]],
-        answer_resolver: Callable[[str], Awaitable[List[str]]],
+        retriever: BaseRetriever,
     ) -> List[Dict[str, str]]:
         answers = []
         for instance in questions:
             query_text = instance["question"]
             correct_answer = instance["answer"]

-            search_results = await answer_resolver(query_text)
+            retrieval_context = await retriever.get_context(query_text)
+            search_results = await retriever.get_completion(query_text, retrieval_context)

             answers.append(
                 {
                     "question": query_text,
                     "answer": search_results[0],
                     "golden_answer": correct_answer,
+                    "retrieval_context": retrieval_context,
                 }
             )
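The hunk above (judging by the import in the next file, this is cognee/eval_framework/answer_generation/answer_generation_executor.py) swaps the per-engine lambda table for retriever classes that share one interface. Below is a minimal sketch of that two-step flow, assuming BaseRetriever exposes async get_context and get_completion exactly as the executor calls them; the question string and prompt path are placeholders:

```python
import asyncio

from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever


async def answer_one_question(question: str, system_prompt_path: str) -> dict:
    # Instantiate the retriever the same way retriever_options[...] is used above.
    retriever = GraphCompletionRetriever(system_prompt_path=system_prompt_path)

    # Fetch the context first, then complete against it, so the retrieval
    # context can be stored next to the answer for later context evaluation.
    retrieval_context = await retriever.get_context(question)
    completions = await retriever.get_completion(question, retrieval_context)

    return {
        "question": question,
        "answer": completions[0],
        "retrieval_context": retrieval_context,
    }


# Example (placeholder values):
# asyncio.run(answer_one_question("Who wrote Hamlet?", "system_prompt.txt"))
```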
@@ -3,7 +3,7 @@
 from typing import List
 from cognee.eval_framework.answer_generation.answer_generation_executor import (
     AnswerGeneratorExecutor,
-    question_answering_engine_options,
+    retriever_options,
 )
 from cognee.infrastructure.files.storage import LocalStorage
 from cognee.infrastructure.databases.relational.get_relational_engine import (
@@ -48,9 +48,7 @@ async def run_question_answering(
     answer_generator = AnswerGeneratorExecutor()
     answers = await answer_generator.question_answering_non_parallel(
         questions=questions,
-        answer_resolver=lambda query: question_answering_engine_options[params["qa_engine"]](
-            query, system_prompt
-        ),
+        retriever=retriever_options[params["qa_engine"]](system_prompt_path=system_prompt),
     )
     with open(params["answers_path"], "w", encoding="utf-8") as f:
         json.dump(answers, f, ensure_ascii=False, indent=4)
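For reference, this is roughly how the updated call site selects and builds a retriever; the params values below are placeholders, only the mapping and the system_prompt_path keyword come from the diff:

```python
from cognee.eval_framework.answer_generation.answer_generation_executor import retriever_options

params = {"qa_engine": "cognee_graph_completion"}  # placeholder config
system_prompt = "system_prompt.txt"                # placeholder prompt path

# One retriever instance replaces the old answer_resolver lambda.
retriever = retriever_options[params["qa_engine"]](system_prompt_path=system_prompt)
```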
cognee/eval_framework/eval_config.py (2 additions, 0 deletions)
@@ -18,6 +18,7 @@ class EvalConfig(BaseSettings):

     # Evaluation params
     evaluating_answers: bool = True
+    evaluating_contexts: bool = True
     evaluation_engine: str = "DeepEval" # Options: 'DeepEval' (uses deepeval_model), 'DirectLLM' (uses default llm from .env)
     evaluation_metrics: List[str] = [
         "correctness",
@@ -51,6 +52,7 @@ def to_dict(self) -> dict:
             "answering_questions": self.answering_questions,
             "qa_engine": self.qa_engine,
             "evaluating_answers": self.evaluating_answers,
+            "evaluating_contexts": self.evaluating_contexts,  # Controls whether context evaluation should be performed
             "evaluation_engine": self.evaluation_engine,
             "evaluation_metrics": self.evaluation_metrics,
             "calculate_metrics": self.calculate_metrics,
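A short illustration of the new flag, assuming EvalConfig behaves like an ordinary pydantic BaseSettings (so the field can also be set from the environment); only the field and method names are taken from the diff:

```python
from cognee.eval_framework.eval_config import EvalConfig

# Toggle context evaluation next to answer evaluation.
config = EvalConfig(evaluating_answers=True, evaluating_contexts=True)

params = config.to_dict()
assert params["evaluating_contexts"] is True  # later forwarded to EvaluationExecutor
```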
cognee/eval_framework/evaluation/deep_eval_adapter.py (3 additions, 0 deletions)
@@ -5,6 +5,7 @@
 from cognee.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
 from cognee.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
 from typing import Any, Dict, List
+from deepeval.metrics import ContextualRelevancyMetric


 class DeepEvalAdapter(BaseEvalAdapter):
@@ -13,6 +14,7 @@ def __init__(self):
             "correctness": self.g_eval_correctness(),
             "EM": ExactMatchMetric(),
             "f1": F1ScoreMetric(),
+            "contextual_relevancy": ContextualRelevancyMetric(),
         }

     async def evaluate_answers(
@@ -29,6 +31,7 @@ async def evaluate_answers(
                 input=answer["question"],
                 actual_output=answer["answer"],
                 expected_output=answer["golden_answer"],
+                retrieval_context=[answer["retrieval_context"]],
             )
             metric_results = {}
             for metric in evaluator_metrics:
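Each LLMTestCase now carries a retrieval_context, which is what DeepEval's ContextualRelevancyMetric scores. A standalone sketch of that pattern using deepeval's public API (the strings are invented, and running it requires a judge model configured for deepeval):

```python
from deepeval.test_case import LLMTestCase
from deepeval.metrics import ContextualRelevancyMetric

test_case = LLMTestCase(
    input="What does the eval framework measure?",
    actual_output="It scores generated answers and their retrieval contexts.",
    expected_output="Answer quality and context relevancy.",
    retrieval_context=["The eval framework scores answer correctness and context relevancy."],
)

metric = ContextualRelevancyMetric()
metric.measure(test_case)  # calls the configured LLM judge
print(metric.score, metric.reason)
```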
cognee/eval_framework/evaluation/evaluation_executor.py (8 additions, 1 deletion)
@@ -3,7 +3,11 @@


 class EvaluationExecutor:
-    def __init__(self, evaluator_engine: Union[str, EvaluatorAdapter, Any] = "DeepEval") -> None:
+    def __init__(
+        self,
+        evaluator_engine: Union[str, EvaluatorAdapter, Any] = "DeepEval",
+        evaluate_contexts: bool = False,
+    ) -> None:
         if isinstance(evaluator_engine, str):
             try:
                 adapter_enum = EvaluatorAdapter(evaluator_engine)
@@ -14,7 +18,10 @@ def __init__(self, evaluator_engine: Union[str, EvaluatorAdapter, Any] = "DeepEv
             self.eval_adapter = evaluator_engine.adapter_class()
         else:
             self.eval_adapter = evaluator_engine
+        self.evaluate_contexts = evaluate_contexts

     async def execute(self, answers: List[Dict[str, str]], evaluator_metrics: Any) -> Any:
+        if self.evaluate_contexts:
+            evaluator_metrics.append("contextual_relevancy")
         metrics = await self.eval_adapter.evaluate_answers(answers, evaluator_metrics)
         return metrics
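Putting the pieces together: when evaluate_contexts is true, the executor appends "contextual_relevancy" to whatever metric list it receives before delegating to the adapter. A hedged usage sketch, with answer records shaped like the AnswerGeneratorExecutor output above (the record contents are placeholders):

```python
import asyncio

from cognee.eval_framework.evaluation.evaluation_executor import EvaluationExecutor


async def main():
    evaluator = EvaluationExecutor(evaluator_engine="DeepEval", evaluate_contexts=True)

    answers = [  # placeholder record in the shape produced by AnswerGeneratorExecutor
        {
            "question": "What is cognee?",
            "answer": "A memory engine for AI applications.",
            "golden_answer": "An AI memory engine.",
            "retrieval_context": "cognee builds knowledge graphs to serve as AI memory.",
        }
    ]

    # "contextual_relevancy" is added internally because evaluate_contexts=True.
    metrics = await evaluator.execute(answers=answers, evaluator_metrics=["correctness", "f1"])
    print(metrics)


# asyncio.run(main())
```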
cognee/eval_framework/evaluation/run_evaluation_module.py (4 additions, 1 deletion)
@@ -42,7 +42,10 @@ async def execute_evaluation(params: dict) -> None:
         raise ValueError(f"Error decoding JSON from {params['answers_path']}: {e}")

     logging.info(f"Loaded {len(answers)} answers from {params['answers_path']}")
-    evaluator = EvaluationExecutor(evaluator_engine=params["evaluation_engine"])
+    evaluator = EvaluationExecutor(
+        evaluator_engine=params["evaluation_engine"],
+        evaluate_contexts=params["evaluating_contexts"],
+    )
     metrics = await evaluator.execute(
         answers=answers, evaluator_metrics=params["evaluation_metrics"]
     )
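Since execute_evaluation reads both settings from the params dict, the EvalConfig example above flows straight through; a minimal wiring sketch under the same assumptions:

```python
from cognee.eval_framework.eval_config import EvalConfig
from cognee.eval_framework.evaluation.evaluation_executor import EvaluationExecutor

params = EvalConfig(evaluating_contexts=True).to_dict()

evaluator = EvaluationExecutor(
    evaluator_engine=params["evaluation_engine"],     # "DeepEval" by default
    evaluate_contexts=params["evaluating_contexts"],  # appends "contextual_relevancy"
)
```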
cognee/tests/unit/eval_framework/answer_generation_test.py (8 additions, 4 deletions)
@@ -11,21 +11,25 @@ async def test_answer_generation():
     limit = 1
     corpus_list, qa_pairs = DummyAdapter().load_corpus(limit=limit)

-    mock_answer_resolver = AsyncMock()
-    mock_answer_resolver.side_effect = lambda query: ["mock_answer"]
+    mock_retriever = AsyncMock()
+    mock_retriever.get_context = AsyncMock(return_value="Mocked retrieval context")
+    mock_retriever.get_completion = AsyncMock(return_value=["Mocked answer"])

     answer_generator = AnswerGeneratorExecutor()
     answers = await answer_generator.question_answering_non_parallel(
-        questions=qa_pairs, answer_resolver=mock_answer_resolver
+        questions=qa_pairs,
+        retriever=mock_retriever,
     )

+    mock_retriever.get_context.assert_any_await(qa_pairs[0]["question"])
+
     assert len(answers) == len(qa_pairs)
     assert answers[0]["question"] == qa_pairs[0]["question"], (
         "AnswerGeneratorExecutor is passing the question incorrectly"
     )
     assert answers[0]["golden_answer"] == qa_pairs[0]["answer"], (
         "AnswerGeneratorExecutor is passing the golden answer incorrectly"
     )
-    assert answers[0]["answer"] == "mock_answer", (
+    assert answers[0]["answer"] == "Mocked answer", (
         "AnswerGeneratorExecutor is passing the generated answer incorrectly"
     )