Merged
Changes from 4 commits
47 commits
74f02b3
chore: moves eval_framework inside the cognee lib
hajdul88 Feb 19, 2025
725ebc5
feat: adds returns in order to get the metrics in dreamify
hajdul88 Feb 19, 2025
5cac85b
chore: moves dashboard creation outside of run_evaluation due to the …
hajdul88 Feb 19, 2025
dd7a1aa
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 19, 2025
d070a40
fix: updates eval framework test with the new directory
hajdul88 Feb 20, 2025
ab646da
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 20, 2025
d91b9bc
feat: outsourcing chunksize and chunker adapter to the eval framework
hajdul88 Feb 20, 2025
a30927d
fix resolves merge conflict
hajdul88 Feb 20, 2025
19c1229
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 20, 2025
462bac2
fix: fixes import in unit test
hajdul88 Feb 20, 2025
6a37833
fix: fixes typing for unit tests
hajdul88 Feb 20, 2025
59e03de
chore: deletes duplicated unit tests
hajdul88 Feb 20, 2025
a93ce2b
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 20, 2025
dbbb254
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 21, 2025
9cf36bd
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 21, 2025
6d1e430
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 25, 2025
f66ace8
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 25, 2025
0f5138b
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 27, 2025
67f1829
fix: fixes import
hajdul88 Feb 27, 2025
7dc476d
fix: Fixes eval pipeline after conflicts and dev changes
hajdul88 Feb 27, 2025
5460230
fix: fixes eval unit tests
hajdul88 Feb 27, 2025
afbbead
chore: finishes logging message
hajdul88 Feb 27, 2025
6ec9d59
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 27, 2025
e71a7ba
fix: fixes tests coming from dev branch
hajdul88 Feb 27, 2025
da4cdee
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 27, 2025
f0efc7a
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 27, 2025
cc1462c
feat: outsources system prompt as search function parameter
hajdul88 Feb 27, 2025
91d507a
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Feb 28, 2025
69a5a10
feat: implements prompt param outsourcing for autooptimizer
hajdul88 Feb 28, 2025
4ae22de
fix: fixing unit test with the new parameter in eval
hajdul88 Feb 28, 2025
1ac0d93
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Mar 3, 2025
27bdd4d
feat: adds directllm adapter to evaluators
hajdul88 Mar 3, 2025
cead5c7
fix: updates import path in direct llm adapter
hajdul88 Mar 3, 2025
b23cc69
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Mar 3, 2025
cccbd7d
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Mar 3, 2025
8b54d20
fix: Fixes unit test
hajdul88 Mar 3, 2025
fe4a401
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Mar 3, 2025
0d3959c
fixes import
hajdul88 Mar 3, 2025
64be044
feat: fixes tests
hajdul88 Mar 3, 2025
284425b
adds return value to run_evaluation
hajdul88 Mar 3, 2025
baccca9
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Mar 3, 2025
ddf9720
fix: adding return value to evaluation module
hajdul88 Mar 3, 2025
93d2953
Merge branch 'feature/cog-1312-integrating-evaluation-framework-into-…
hajdul88 Mar 3, 2025
9ecfb04
adds comments based on meeting
hajdul88 Mar 3, 2025
1ba190e
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Mar 3, 2025
b12d37d
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Mar 3, 2025
53aed24
Merge branch 'dev' into feature/cog-1312-integrating-evaluation-frame…
hajdul88 Mar 3, 2025
File renamed without changes.
@@ -1,6 +1,7 @@
 import logging
 import json
-from evals.eval_framework.answer_generation.answer_generation_executor import (
+from typing import List
+from cognee.eval_framework.answer_generation.answer_generation_executor import (
     AnswerGeneratorExecutor,
     question_answering_engine_options,
 )
@@ -30,7 +31,7 @@ async def create_and_insert_answers_table(questions_payload):
         await session.commit()


-async def run_question_answering(params: dict) -> None:
+async def run_question_answering(params: dict) -> List[dict]:
     if params.get("answering_questions"):
         logging.info("Question answering started...")
         try:
@@ -52,3 +53,7 @@ async def run_question_answering(params: dict) -> None:

         await create_and_insert_answers_table(answers)
         logging.info("Question answering End...")
+
+        return answers
+    else:
+        logging.info("The question answering module ")
@@ -1,10 +1,10 @@
 from enum import Enum
 from typing import Type

-from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
-from evals.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
-from evals.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
-from evals.eval_framework.benchmark_adapters.twowikimultihop_adapter import TwoWikiMultihopAdapter
+from cognee.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
+from cognee.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
+from cognee.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
+from cognee.eval_framework.benchmark_adapters.twowikimultihop_adapter import TwoWikiMultihopAdapter


 class BenchmarkAdapter(Enum):
@@ -1,6 +1,6 @@
-from typing import Optional, Union, Any, LiteralString
+from typing import Optional, Union, LiteralString

-from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
+from cognee.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter


 class DummyAdapter(BaseBenchmarkAdapter):
@@ -3,7 +3,7 @@
 import json
 import random
 from typing import Optional, Union, Any, LiteralString
-from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
+from cognee.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter


 class HotpotQAAdapter(BaseBenchmarkAdapter):
@@ -1,12 +1,12 @@
 import os
 import json
 import random
-from typing import Optional, Union, Any, LiteralString
+from typing import Optional, Any
 import zipfile

 import gdown

-from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
+from cognee.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter


 class MusiqueQAAdapter(BaseBenchmarkAdapter):
@@ -3,7 +3,7 @@
 import json
 import random
 from typing import Optional, Union, Any, LiteralString
-from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
+from cognee.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter


 class TwoWikiMultihopAdapter(BaseBenchmarkAdapter):
@@ -2,9 +2,9 @@
 import logging
 from typing import Optional, Tuple, List, Dict, Union, Any

-from evals.eval_framework.benchmark_adapters.benchmark_adapters import BenchmarkAdapter
-from evals.eval_framework.corpus_builder.task_getters.task_getters import TaskGetters
-from evals.eval_framework.corpus_builder.task_getters.base_task_getter import BaseTaskGetter
+from cognee.eval_framework.benchmark_adapters.benchmark_adapters import BenchmarkAdapter
+from cognee.eval_framework.corpus_builder.task_getters.task_getters import TaskGetters
+from cognee.eval_framework.corpus_builder.task_getters.base_task_getter import BaseTaskGetter
 from cognee.shared.utils import setup_logging

@@ -1,7 +1,8 @@
 import logging
 import json
+from typing import List
 from cognee.infrastructure.files.storage import LocalStorage
-from evals.eval_framework.corpus_builder.corpus_builder_executor import CorpusBuilderExecutor
+from cognee.eval_framework.corpus_builder.corpus_builder_executor import CorpusBuilderExecutor
 from cognee.modules.data.models.questions_base import QuestionsBase
 from cognee.modules.data.models.questions_data import Questions
 from cognee.infrastructure.databases.relational.get_relational_engine import (
@@ -27,7 +28,7 @@ async def create_and_insert_questions_table(questions_payload):
         await session.commit()


-async def run_corpus_builder(params: dict) -> None:
+async def run_corpus_builder(params: dict) -> List[dict]:
     if params.get("building_corpus_from_scratch"):
         logging.info("Corpus Builder started...")
         corpus_builder = CorpusBuilderExecutor(
@@ -43,3 +44,5 @@ async def run_corpus_builder(params: dict) -> List[dict]:
         await create_and_insert_questions_table(questions_payload=questions)

         logging.info("Corpus Builder End...")
+
+    return questions
@@ -1,6 +1,6 @@
 from cognee.api.v1.cognify.cognify_v2 import get_default_tasks
 from typing import List
-from evals.eval_framework.corpus_builder.task_getters.base_task_getter import BaseTaskGetter
+from cognee.eval_framework.corpus_builder.task_getters.base_task_getter import BaseTaskGetter
 from cognee.modules.pipelines.tasks.Task import Task

@@ -1,6 +1,6 @@
 from enum import Enum
 from typing import Type
-from evals.eval_framework.corpus_builder.task_getters.default_task_getter import DefaultTaskGetter
+from cognee.eval_framework.corpus_builder.task_getters.default_task_getter import DefaultTaskGetter


 class TaskGetters(Enum):
@@ -1,9 +1,9 @@
 from deepeval.metrics import GEval
 from deepeval.test_case import LLMTestCase, LLMTestCaseParams
-from evals.eval_framework.eval_config import EvalConfig
-from evals.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
-from evals.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
-from evals.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
+from cognee.eval_framework.eval_config import EvalConfig
+from cognee.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
+from cognee.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
+from cognee.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
 from typing import Any, Dict, List

@@ -1,5 +1,5 @@
 from typing import List, Dict, Any, Union
-from evals.eval_framework.evaluation.evaluator_adapters import EvaluatorAdapter
+from cognee.eval_framework.evaluation.evaluator_adapters import EvaluatorAdapter


 class EvaluationExecutor:
@@ -1,6 +1,6 @@
 from enum import Enum
 from typing import Type
-from evals.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter
+from cognee.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter


 class EvaluatorAdapter(Enum):
Empty file.
@@ -1,7 +1,7 @@
 import logging
 import json
-from evals.eval_framework.evaluation.evaluation_executor import EvaluationExecutor
-from evals.eval_framework.metrics_dashboard import generate_metrics_dashboard
+from typing import List
+from cognee.eval_framework.evaluation.evaluation_executor import EvaluationExecutor
 from cognee.infrastructure.files.storage import LocalStorage
 from cognee.infrastructure.databases.relational.get_relational_engine import (
     get_relational_engine,
@@ -28,7 +28,7 @@ async def create_and_insert_metrics_table(questions_payload):
         await session.commit()


-async def run_evaluation(params: dict) -> None:
+async def run_evaluation(params: dict) -> List[dict]:
     if params.get("evaluating_answers"):
         logging.info("Evaluation started...")
         try:
@@ -51,9 +51,4 @@ async def run_evaluation(params: dict) -> None:

         logging.info("Evaluation End...")

-    if params.get("dashboard"):
-        generate_metrics_dashboard(
-            json_data=params["metrics_path"],
-            output_file=params["dashboard_path"],
-            benchmark=params["benchmark"],
-        )
+    return metrics
@@ -1,13 +1,14 @@
 import logging
 import asyncio
 from cognee.shared.utils import setup_logging
-from evals.eval_framework.eval_config import EvalConfig
+from cognee.eval_framework.eval_config import EvalConfig

-from evals.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
-from evals.eval_framework.answer_generation.run_question_answering_module import (
+from cognee.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
+from cognee.eval_framework.answer_generation.run_question_answering_module import (
     run_question_answering,
 )
-from evals.eval_framework.evaluation.run_evaluation_module import run_evaluation
+from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
+from cognee.eval_framework.metrics_dashboard import generate_metrics_dashboard

 # Configure logging
 setup_logging(logging.INFO)
@@ -31,6 +32,13 @@ async def main():
     # Metrics calculation + dashboard
     await run_evaluation(eval_params)

+    if eval_params.get("dashboard"):
+        generate_metrics_dashboard(
+            json_data=eval_params["metrics_path"],
+            output_file=eval_params["dashboard_path"],
+            benchmark=eval_params["benchmark"],
+        )
+

 if __name__ == "__main__":
     loop = asyncio.new_event_loop()
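Taken together, these changes relocate the eval framework under the cognee package, make each stage (corpus building, question answering, evaluation) return its payload as a List[dict] instead of only persisting it, and move dashboard rendering out of run_evaluation into the caller. Below is a minimal sketch of how the refactored pipeline might be driven end to end. It is an illustration, not code from this PR: EvalConfig().to_dict() is an assumed accessor (the diff elides how eval_params is built), only the params keys visible in the diff are used, and asyncio.run stands in for the explicit event-loop handling shown above.

import asyncio

from cognee.eval_framework.eval_config import EvalConfig
from cognee.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
from cognee.eval_framework.answer_generation.run_question_answering_module import (
    run_question_answering,
)
from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
from cognee.eval_framework.metrics_dashboard import generate_metrics_dashboard


async def main():
    # Assumed accessor: the PR does not show how eval_params is derived from EvalConfig.
    eval_params = EvalConfig().to_dict()

    # Each stage now returns its results (List[dict]) in addition to writing
    # them to the relational store, so callers can chain or inspect them.
    questions = await run_corpus_builder(eval_params)
    answers = await run_question_answering(eval_params)
    metrics = await run_evaluation(eval_params)

    # Dashboard generation now lives with the caller rather than inside
    # run_evaluation, so metrics can be consumed without rendering HTML.
    if eval_params.get("dashboard"):
        generate_metrics_dashboard(
            json_data=eval_params["metrics_path"],
            output_file=eval_params["dashboard_path"],
            benchmark=eval_params["benchmark"],
        )


if __name__ == "__main__":
    asyncio.run(main())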
Empty file.
Empty file.
@@ -1,9 +1,8 @@
 import pytest
 import random
-from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
-from evals.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
-from evals.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
-from evals.eval_framework.benchmark_adapters.twowikimultihop_adapter import TwoWikiMultihopAdapter
+from cognee.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
+from cognee.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
+from cognee.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
+from cognee.eval_framework.benchmark_adapters.twowikimultihop_adapter import TwoWikiMultihopAdapter


 ADAPTER_CLASSES = [