Merged
47 commits
74f02b3
chore: moves eval_framework inside the cognee lib
hajdul88 Feb 19, 2025
725ebc5
feat: adds returns in order to get the metrics in dreamify
hajdul88 Feb 19, 2025
5cac85b
chore: moves dashboard creation outside of run_evaluation due to the …
hajdul88 Feb 19, 2025
dd7a1aa
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 19, 2025
d070a40
fix: updates eval framework test with the new directory
hajdul88 Feb 20, 2025
ab646da
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 20, 2025
d91b9bc
feat: outsourcing chunksize and chunker adapter to the eval framework
hajdul88 Feb 20, 2025
a30927d
fix resolves merge conflict
hajdul88 Feb 20, 2025
19c1229
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 20, 2025
462bac2
fix: fixes import in unit test
hajdul88 Feb 20, 2025
6a37833
fix: fixes typing for unit tests
hajdul88 Feb 20, 2025
59e03de
chore: deletes duplicated unit tests
hajdul88 Feb 20, 2025
a93ce2b
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 20, 2025
dbbb254
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 21, 2025
9cf36bd
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 21, 2025
6d1e430
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 25, 2025
f66ace8
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 25, 2025
0f5138b
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 27, 2025
67f1829
fix: fixes import
hajdul88 Feb 27, 2025
7dc476d
fix: Fixes eval pipeline after conflicts and dev changes
hajdul88 Feb 27, 2025
5460230
fix: fixes eval unit tests
hajdul88 Feb 27, 2025
afbbead
chore: finishes logging message
hajdul88 Feb 27, 2025
6ec9d59
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 27, 2025
e71a7ba
fix: fixes tests coming from dev branch
hajdul88 Feb 27, 2025
da4cdee
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 27, 2025
f0efc7a
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 27, 2025
cc1462c
feat: outsources system prompt as search function parameter
hajdul88 Feb 27, 2025
91d507a
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 28, 2025
69a5a10
feat: implements prompt param outsourcing for autooptimizer
hajdul88 Feb 28, 2025
4ae22de
fix: fixing unit test with the new parameter in eval
hajdul88 Feb 28, 2025
1ac0d93
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Mar 3, 2025
27bdd4d
feat: adds directllm adapter to evaluators
hajdul88 Mar 3, 2025
cead5c7
fix: updates import path in direct llm adapter
hajdul88 Mar 3, 2025
b23cc69
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Mar 3, 2025
cccbd7d
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Mar 3, 2025
8b54d20
fix: Fixes unit test
hajdul88 Mar 3, 2025
fe4a401
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Mar 3, 2025
0d3959c
fixes import
hajdul88 Mar 3, 2025
64be044
feat: fixes tests
hajdul88 Mar 3, 2025
284425b
adds return value to run_evaluation
hajdul88 Mar 3, 2025
baccca9
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Mar 3, 2025
ddf9720
fix: adding return value to evaluation module
hajdul88 Mar 3, 2025
93d2953
Merge branch 'feature/cog-1312-integrating-evaluation-framework-into-…
hajdul88 Mar 3, 2025
9ecfb04
adds comments based on meeting
hajdul88 Mar 3, 2025
1ba190e
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Mar 3, 2025
b12d37d
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Mar 3, 2025
53aed24
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Mar 3, 2025
2 changes: 1 addition & 1 deletion .github/workflows/test_eval_framework.yml
@@ -14,7 +14,7 @@ jobs:
run_eval_framework_test:
uses: ./.github/workflows/reusable_python_example.yml
with:
example-location: ./evals/eval_framework/run_eval.py
example-location: ./cognee/eval_framework/run_eval.py
secrets:
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
7 changes: 4 additions & 3 deletions cognee/api/v1/cognify/cognify_v2.py
@@ -112,8 +112,8 @@ def generate_dataset_name(dataset_name: str) -> str:
return dataset_name.replace(".", "_").replace(" ", "_")


async def get_default_tasks(
user: User = None, graph_model: BaseModel = KnowledgeGraph
async def get_default_tasks( # TODO: Find out a better way to do this (Boris's comment)
user: User = None, graph_model: BaseModel = KnowledgeGraph, chunk_size=1024, chunker=TextChunker
) -> list[Task]:
if user is None:
user = await get_default_user()
@@ -126,7 +126,8 @@ async def get_default_tasks(
Task(
extract_chunks_from_documents,
max_chunk_tokens=get_max_chunk_tokens(),
chunker=TextChunker,
chunker=chunker,
chunk_size=chunk_size,
), # Extract text chunks based on the document type.
Task(
extract_graph_from_data, graph_model=graph_model, task_config={"batch_size": 10}
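The change above exposes chunk_size and chunker as parameters of get_default_tasks instead of hard-coding TextChunker. A minimal usage sketch under the assumption that the environment is already configured for cognee (default user, databases); the values shown are the defaults from the diff:

from cognee.api.v1.cognify.cognify_v2 import get_default_tasks
from cognee.modules.chunking.TextChunker import TextChunker


async def build_default_pipeline():
    # Override the previously hard-coded chunking configuration when building
    # the default cognify task list.
    return await get_default_tasks(chunk_size=512, chunker=TextChunker)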
5 changes: 4 additions & 1 deletion cognee/api/v1/search/search_v2.py
@@ -12,6 +12,7 @@ async def search(
query_type: SearchType = SearchType.GRAPH_COMPLETION,
user: User = None,
datasets: Union[list[str], str, None] = None,
system_prompt_path: str = "answer_simple_question.txt",
) -> list:
# We use lists from now on for datasets
if isinstance(datasets, str):
@@ -23,6 +24,8 @@
if user is None:
raise UserNotFoundError

filtered_search_results = await search_function(query_text, query_type, datasets, user)
filtered_search_results = await search_function(
query_text, query_type, datasets, user, system_prompt_path=system_prompt_path
)

return filtered_search_results
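With system_prompt_path now forwarded through cognee.search, callers can select the answer prompt per query. A short sketch based only on the signature shown above; the file name is the default from the diff, and any custom prompt file is assumed to be resolvable by cognee's prompt loader:

import cognee
from cognee.api.v1.search import SearchType


async def ask(question: str) -> list:
    # system_prompt_path defaults to "answer_simple_question.txt"; pass a
    # different prompt file name to change how completions are generated.
    return await cognee.search(
        query_type=SearchType.GRAPH_COMPLETION,
        query_text=question,
        system_prompt_path="answer_simple_question.txt",
    )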
File renamed without changes.
@@ -3,20 +3,19 @@
from cognee.api.v1.search import SearchType

question_answering_engine_options: Dict[str, Callable[[str], Awaitable[List[str]]]] = {
"cognee_graph_completion": lambda query: cognee.search(
query_type=SearchType.GRAPH_COMPLETION, query_text=query
"cognee_graph_completion": lambda query, system_prompt_path: cognee.search(
query_type=SearchType.GRAPH_COMPLETION,
query_text=query,
system_prompt_path=system_prompt_path,
),
"cognee_completion": lambda query: cognee.search(
query_type=SearchType.COMPLETION, query_text=query
"cognee_completion": lambda query, system_prompt_path: cognee.search(
query_type=SearchType.COMPLETION, query_text=query, system_prompt_path=system_prompt_path
),
"cognee_summaries": lambda query: cognee.search(
query_type=SearchType.SUMMARIES, query_text=query
"graph_summary_completion": lambda query, system_prompt_path: cognee.search(
query_type=SearchType.GRAPH_SUMMARY_COMPLETION,
query_text=query,
system_prompt_path=system_prompt_path,
),
"cognee_insights": lambda query: cognee.search(
query_type=SearchType.INSIGHTS, query_text=query
),
"cognee_chunks": lambda query: cognee.search(query_type=SearchType.CHUNKS, query_text=query),
"cognee_code": lambda query: cognee.search(query_type=SearchType.CODE, query_text=query),
}


@@ -25,13 +24,14 @@ async def question_answering_non_parallel(
self,
questions: List[Dict[str, str]],
answer_resolver: Callable[[str], Awaitable[List[str]]],
system_prompt: str = "answer_simple_question.txt",
) -> List[Dict[str, str]]:
answers = []
for instance in questions:
query_text = instance["question"]
correct_answer = instance["answer"]

search_results = await answer_resolver(query_text)
search_results = await answer_resolver(query_text, system_prompt)

answers.append(
{
@@ -1,6 +1,7 @@
import logging
import json
from evals.eval_framework.answer_generation.answer_generation_executor import (
from typing import List
from cognee.eval_framework.answer_generation.answer_generation_executor import (
AnswerGeneratorExecutor,
question_answering_engine_options,
)
@@ -30,7 +31,9 @@ async def create_and_insert_answers_table(questions_payload):
await session.commit()


async def run_question_answering(params: dict) -> None:
async def run_question_answering(
params: dict, system_prompt="answer_simple_question.txt"
) -> List[dict]:
if params.get("answering_questions"):
logging.info("Question answering started...")
try:
@@ -46,9 +49,17 @@
answers = await answer_generator.question_answering_non_parallel(
questions=questions,
answer_resolver=question_answering_engine_options[params["qa_engine"]],
system_prompt=system_prompt,
)
with open(params["answers_path"], "w", encoding="utf-8") as f:
json.dump(answers, f, ensure_ascii=False, indent=4)

await create_and_insert_answers_table(answers)
logging.info("Question answering End...")

return answers
else:
logging.info(
"The question answering module was not executed as answering_questions is not enabled"
)
return []
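Taken together, the resolver lambdas now accept (query, system_prompt_path) and run_question_answering forwards the prompt to the executor. A hedged sketch of driving the step directly; the engine key and method signature come from the hunks above, while the no-argument AnswerGeneratorExecutor() constructor is an assumption, since its instantiation is collapsed in this diff:

from cognee.eval_framework.answer_generation.answer_generation_executor import (
    AnswerGeneratorExecutor,
    question_answering_engine_options,
)


async def answer_questions(questions: list[dict]) -> list[dict]:
    executor = AnswerGeneratorExecutor()  # assumed no-arg constructor
    # Each question dict is expected to carry "question" and "answer" keys,
    # matching the loop in question_answering_non_parallel.
    return await executor.question_answering_non_parallel(
        questions=questions,
        answer_resolver=question_answering_engine_options["cognee_graph_completion"],
        system_prompt="answer_simple_question.txt",
    )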
@@ -1,10 +1,10 @@
from enum import Enum
from typing import Type

from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
from evals.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
from evals.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
from evals.eval_framework.benchmark_adapters.twowikimultihop_adapter import TwoWikiMultihopAdapter
from cognee.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
from cognee.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
from cognee.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
from cognee.eval_framework.benchmark_adapters.twowikimultihop_adapter import TwoWikiMultihopAdapter


class BenchmarkAdapter(Enum):
@@ -1,12 +1,12 @@
from typing import Optional
from typing import Optional, Any

from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
from cognee.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter


class DummyAdapter(BaseBenchmarkAdapter):
def load_corpus(
self, limit: Optional[int] = None, seed: int = 42
) -> tuple[list[str], list[dict[str, str]]]:
) -> tuple[list[str], list[dict[str, Any]]]:
corpus_list = [
"The cognee is an AI memory engine that supports different vector and graph databases",
"Neo4j is a graph database supported by cognee",
@@ -3,7 +3,7 @@
import json
import random
from typing import Optional, Any, List, Tuple
from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
from cognee.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter


class HotpotQAAdapter(BaseBenchmarkAdapter):
@@ -6,7 +6,7 @@

import gdown

from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
from cognee.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter


class MusiqueQAAdapter(BaseBenchmarkAdapter):
@@ -3,7 +3,7 @@
import json
import random
from typing import Optional, Any, List, Tuple
from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
from cognee.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter


class TwoWikiMultihopAdapter(HotpotQAAdapter):
@@ -2,8 +2,9 @@
import logging
from typing import Optional, Tuple, List, Dict, Union, Any, Callable, Awaitable

from evals.eval_framework.benchmark_adapters.benchmark_adapters import BenchmarkAdapter
from evals.eval_framework.corpus_builder.task_getters.TaskGetters import TaskGetters
from cognee.eval_framework.corpus_builder.task_getters.TaskGetters import TaskGetters
from cognee.eval_framework.benchmark_adapters.benchmark_adapters import BenchmarkAdapter
from cognee.modules.chunking.TextChunker import TextChunker
from cognee.modules.pipelines.tasks.Task import Task
from cognee.shared.utils import setup_logging

@@ -31,18 +32,20 @@ def load_corpus(self, limit: Optional[int] = None) -> Tuple[List[Dict], List[str
self.raw_corpus, self.questions = self.adapter.load_corpus(limit=limit)
return self.raw_corpus, self.questions

async def build_corpus(self, limit: Optional[int] = None) -> List[str]:
async def build_corpus(
self, limit: Optional[int] = None, chunk_size=1024, chunker=TextChunker
) -> List[str]:
self.load_corpus(limit=limit)
await self.run_cognee()
await self.run_cognee(chunk_size=chunk_size, chunker=chunker)
return self.questions

async def run_cognee(self) -> None:
async def run_cognee(self, chunk_size=1024, chunker=TextChunker) -> None:
setup_logging(logging.ERROR)

await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata=True)

await cognee.add(self.raw_corpus)

tasks = await self.task_getter()
tasks = await self.task_getter(chunk_size=chunk_size, chunker=TextChunker)
await cognee.cognify(tasks=tasks)
@@ -1,14 +1,19 @@
import logging
import json
from typing import List

from unstructured.chunking.dispatch import chunk

from cognee.infrastructure.files.storage import LocalStorage
from evals.eval_framework.corpus_builder.corpus_builder_executor import CorpusBuilderExecutor
from cognee.eval_framework.corpus_builder.corpus_builder_executor import CorpusBuilderExecutor
from cognee.modules.data.models.questions_base import QuestionsBase
from cognee.modules.data.models.questions_data import Questions
from cognee.infrastructure.databases.relational.get_relational_engine import (
get_relational_engine,
get_relational_config,
)
from evals.eval_framework.corpus_builder.task_getters.TaskGetters import TaskGetters
from cognee.modules.chunking.TextChunker import TextChunker
from cognee.eval_framework.corpus_builder.task_getters.TaskGetters import TaskGetters


async def create_and_insert_questions_table(questions_payload):
@@ -28,7 +33,7 @@ async def create_and_insert_questions_table(questions_payload):
await session.commit()


async def run_corpus_builder(params: dict) -> None:
async def run_corpus_builder(params: dict, chunk_size=1024, chunker=TextChunker) -> List[dict]:
if params.get("building_corpus_from_scratch"):
logging.info("Corpus Builder started...")

@@ -42,11 +47,13 @@ task_getter=task_getter,
task_getter=task_getter,
)
questions = await corpus_builder.build_corpus(
limit=params.get("number_of_samples_in_corpus")
limit=params.get("number_of_samples_in_corpus"), chunk_size=chunk_size, chunker=chunker
)
with open(params["questions_path"], "w", encoding="utf-8") as f:
json.dump(questions, f, ensure_ascii=False, indent=4)

await create_and_insert_questions_table(questions_payload=questions)

logging.info("Corpus Builder End...")

return questions
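A hedged sketch of calling the corpus-building step with the outsourced chunking parameters. Only the params keys visible in this hunk are used; the module path and any further required keys (for example benchmark and task-getter selection) are assumptions, since those lines are collapsed above:

from cognee.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder  # assumed module path
from cognee.modules.chunking.TextChunker import TextChunker


async def build_eval_corpus() -> list[dict]:
    params = {
        "building_corpus_from_scratch": True,
        "number_of_samples_in_corpus": 10,
        "questions_path": "questions.json",
        # Benchmark and task-getter keys are also read by this step but are
        # collapsed in the diff above, so they are omitted here.
    }
    # chunk_size and chunker are forwarded to build_corpus and, from there,
    # to the cognify task list.
    return await run_corpus_builder(params, chunk_size=1024, chunker=TextChunker)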
@@ -2,7 +2,7 @@
from typing import Callable, Awaitable, List
from cognee.api.v1.cognify.cognify_v2 import get_default_tasks
from cognee.modules.pipelines.tasks.Task import Task
from evals.eval_framework.corpus_builder.task_getters.get_cascade_graph_tasks import (
from cognee.eval_framework.corpus_builder.task_getters.get_cascade_graph_tasks import (
get_cascade_graph_tasks,
)

@@ -0,0 +1,14 @@
from cognee.api.v1.cognify.cognify_v2 import get_default_tasks
from typing import List
from cognee.eval_framework.corpus_builder.task_getters.base_task_getter import BaseTaskGetter
from cognee.modules.pipelines.tasks.Task import Task
from cognee.infrastructure.llm import get_max_chunk_tokens
from cognee.modules.chunking.TextChunker import TextChunker


class DefaultTaskGetter(BaseTaskGetter):
"""Default task getter that retrieves tasks using the standard get_default_tasks function."""

async def get_tasks(self, chunk_size=1024, chunker=TextChunker) -> List[Task]:
"""Retrieve default tasks asynchronously."""
return await get_default_tasks(chunk_size=chunk_size, chunker=chunker)
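A short usage sketch of the new DefaultTaskGetter, mirroring how run_cognee in the corpus builder calls its task getter before cognify; the import path is an assumption, since the new file's location is not shown in this view:

from cognee.eval_framework.corpus_builder.task_getters.default_task_getter import DefaultTaskGetter  # assumed module path
from cognee.modules.chunking.TextChunker import TextChunker


async def get_pipeline_tasks():
    getter = DefaultTaskGetter()
    # Forwards the chunking configuration straight to get_default_tasks.
    return await getter.get_tasks(chunk_size=1024, chunker=TextChunker)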
@@ -1,9 +1,9 @@
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from evals.eval_framework.eval_config import EvalConfig
from evals.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
from evals.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
from evals.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
from cognee.eval_framework.eval_config import EvalConfig
from cognee.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
from cognee.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
from cognee.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
from typing import Any, Dict, List


@@ -1,9 +1,9 @@
from typing import Any, Dict, List
from pydantic import BaseModel
from cognee.infrastructure.llm.get_llm_client import get_llm_client
from evals.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
from cognee.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt
from evals.eval_framework.eval_config import EvalConfig
from cognee.eval_framework.eval_config import EvalConfig


class CorrectnessEvaluation(BaseModel):
@@ -1,5 +1,5 @@
from typing import List, Dict, Any, Union
from evals.eval_framework.evaluation.evaluator_adapters import EvaluatorAdapter
from cognee.eval_framework.evaluation.evaluator_adapters import EvaluatorAdapter


class EvaluationExecutor:
@@ -1,7 +1,7 @@
from enum import Enum
from typing import Type
from evals.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter
from evals.eval_framework.evaluation.direct_llm_eval_adapter import DirectLLMEvalAdapter
from cognee.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter
from cognee.eval_framework.evaluation.direct_llm_eval_adapter import DirectLLMEvalAdapter


class EvaluatorAdapter(Enum):
Empty file.
@@ -1,8 +1,9 @@
import logging
import json
from evals.eval_framework.evaluation.evaluation_executor import EvaluationExecutor
from evals.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
from evals.eval_framework.analysis.dashboard_generator import create_dashboard
from typing import List
from cognee.eval_framework.evaluation.evaluation_executor import EvaluationExecutor
from cognee.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
from cognee.eval_framework.analysis.dashboard_generator import create_dashboard
from cognee.infrastructure.files.storage import LocalStorage
from cognee.infrastructure.databases.relational.get_relational_engine import (
get_relational_engine,
@@ -50,13 +51,14 @@

await create_and_insert_metrics_table(metrics)
logging.info("Evaluation completed")
return metrics


async def run_evaluation(params: dict) -> None:
async def run_evaluation(params: dict) -> List[dict]:
"""Run each step of the evaluation pipeline based on configuration flags."""
# Step 1: Evaluate answers if requested
if params.get("evaluating_answers"):
await execute_evaluation(params)
metrics = await execute_evaluation(params)
else:
logging.info("Skipping evaluation as evaluating_answers is False")

@@ -67,18 +69,7 @@
json_data=params["metrics_path"], aggregate_output_path=params["aggregate_metrics_path"]
)
logging.info("Metrics calculation completed")
return metrics
else:
logging.info("Skipping metrics calculation as calculate_metrics is False")

# Step 3: Generate dashboard if requested
if params.get("dashboard"):
logging.info("Generating dashboard...")
create_dashboard(
metrics_path=params["metrics_path"],
aggregate_metrics_path=params["aggregate_metrics_path"],
output_file=params["dashboard_path"],
benchmark=params["benchmark"],
)
logging.info(f"Dashboard generated at {params['dashboard_path']}")
else:
logging.info("Skipping dashboard generation as dashboard is False")
return []
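A hedged sketch of the evaluation step after this change. Dashboard generation has been moved out of run_evaluation (see the commit log), so the function now returns the metrics list. The params keys are the ones read in the hunks above; the import path is an assumption based on the package layout used elsewhere in this PR:

from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation  # assumed module path


async def evaluate_answers() -> list[dict]:
    params = {
        "evaluating_answers": True,
        "calculate_metrics": True,
        "metrics_path": "metrics.json",
        "aggregate_metrics_path": "aggregate_metrics.json",
    }
    # Returns the metrics list when both steps are enabled; an empty list otherwise.
    metrics = await run_evaluation(params)
    return metrics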