11 changes: 4 additions & 7 deletions Dockerfile_modal
@@ -18,15 +18,12 @@ RUN apt-get update && apt-get install -y \

WORKDIR /app


ENV PYTHONPATH=/app
WORKDIR /app
COPY pyproject.toml poetry.lock /app/

COPY pyproject.toml poetry.lock README.md /app/

RUN pip install poetry

RUN poetry install --all-extras --no-root --without dev
RUN poetry config virtualenvs.create false

RUN poetry install --extras neo4j --extras qdrant --no-root

COPY cognee/ /app/cognee
COPY README.md /app/README.md
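
With Poetry's virtualenv disabled, the neo4j and qdrant extras land in the system interpreter. A minimal sanity-check sketch one could run inside the built image (not part of the PR; the import names are assumptions about what those extras provide):

# Hedged sketch (not part of the PR): confirm the extras resolved into the
# system interpreter that "poetry config virtualenvs.create false" targets.
# Import names ("neo4j", "qdrant_client") are assumptions.
import importlib.util

for module in ("neo4j", "qdrant_client", "cognee"):
    status = "ok" if importlib.util.find_spec(module) else "MISSING"
    print(f"{module}: {status}")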
4 changes: 3 additions & 1 deletion cognee/eval_framework/eval_config.py
@@ -1,6 +1,6 @@
from functools import lru_cache
from pydantic_settings import BaseSettings, SettingsConfigDict
from typing import List
from typing import List, Optional


class EvalConfig(BaseSettings):
@@ -43,6 +43,7 @@ class EvalConfig(BaseSettings):
dashboard_path: str = "dashboard.html"
direct_llm_system_prompt: str = "direct_llm_eval_system.txt"
direct_llm_eval_prompt: str = "direct_llm_eval_prompt.txt"
instance_filter: Optional[List[str]] = None

model_config = SettingsConfigDict(env_file=".env", extra="allow")

@@ -68,6 +69,7 @@ def to_dict(self) -> dict:
"task_getter_type": self.task_getter_type,
"direct_llm_system_prompt": self.direct_llm_system_prompt,
"direct_llm_eval_prompt": self.direct_llm_eval_prompt,
"instance_filter": self.instance_filter,
}
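
A quick sketch of how the new instance_filter field could be supplied and how it propagates through to_dict; the instance ids below are hypothetical placeholders, not values from the PR:

# Hedged sketch: the instance ids are hypothetical placeholders.
from cognee.eval_framework.eval_config import EvalConfig

config = EvalConfig(
    benchmark="HotPotQA",
    instance_filter=["5a8b57f25542995d1e6f1371", "5a8c7595554299585d9e36b6"],
)
params = config.to_dict()
assert params["instance_filter"] == config.instance_filter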


2 changes: 1 addition & 1 deletion cognee/eval_framework/metrics_dashboard.py
@@ -167,4 +167,4 @@ def create_dashboard(
with open(output_file, "w", encoding="utf-8") as f:
f.write(dashboard_html)

return output_file
return dashboard_html
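
With create_dashboard now returning the rendered HTML rather than the output path, a caller can persist the page somewhere other than the local output_file, which is exactly what modal_run_eval.py does with the Modal volume. A rough usage sketch (paths are illustrative):

# Hedged sketch: paths are illustrative.
from cognee.eval_framework.metrics_dashboard import create_dashboard

html_output = create_dashboard(
    metrics_path="metrics.json",
    aggregate_metrics_path="aggregate_metrics.json",
    output_file="dashboard.html",
    benchmark="HotPotQA",
)
with open("/data/dashboard.html", "w") as f:  # e.g. a mounted Modal volume
    f.write(html_output)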
85 changes: 56 additions & 29 deletions cognee/eval_framework/modal_run_eval.py
@@ -1,17 +1,20 @@
import modal
import os
import json
import asyncio
import datetime
import hashlib
import json
from cognee.shared.logging_utils import get_logger
from cognee.eval_framework.eval_config import EvalConfig
from cognee.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
from cognee.eval_framework.answer_generation.run_question_answering_module import (
run_question_answering,
)
from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
from cognee.eval_framework.metrics_dashboard import create_dashboard

logger = get_logger()
vol = modal.Volume.from_name("evaluation_dashboard_results", create_if_missing=True)


def read_and_combine_metrics(eval_params: dict) -> dict:
@@ -46,32 +49,54 @@ def read_and_combine_metrics(eval_params: dict) -> dict:
"OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
}
)
.poetry_install_from_file(poetry_pyproject_toml="pyproject.toml")
.pip_install("protobuf", "h2", "deepeval", "gdown", "plotly")
)


@app.function(image=image, concurrency_limit=2, timeout=1800, retries=1)
@app.function(image=image, concurrency_limit=10, timeout=86400, volumes={"/data": vol})
async def modal_run_eval(eval_params=None):
"""Runs evaluation pipeline and returns combined metrics results."""
if eval_params is None:
eval_params = EvalConfig().to_dict()

version_name = "baseline"
benchmark_name = os.environ.get("BENCHMARK", eval_params.get("benchmark", "benchmark"))
timestamp = datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")

answers_filename = (
f"{version_name}_{benchmark_name}_{timestamp}_{eval_params.get('answers_path')}"
)
html_filename = (
f"{version_name}_{benchmark_name}_{timestamp}_{eval_params.get('dashboard_path')}"
)

logger.info(f"Running evaluation with params: {eval_params}")

# Run the evaluation pipeline
await run_corpus_builder(eval_params)
await run_corpus_builder(eval_params, instance_filter=eval_params.get("instance_filter"))
await run_question_answering(eval_params)
await run_evaluation(eval_params)

# Early return if metrics calculation wasn't requested
if not eval_params.get("evaluating_answers") or not eval_params.get("calculate_metrics"):
logger.info(
"Skipping metrics collection as either evaluating_answers or calculate_metrics is False"
answers = await run_evaluation(eval_params)

with open("/data/" + answers_filename, "w") as f:
json.dump(answers, f, ensure_ascii=False, indent=4)
vol.commit()

if eval_params.get("dashboard"):
logger.info("Generating dashboard...")
html_output = create_dashboard(
metrics_path=eval_params["metrics_path"],
aggregate_metrics_path=eval_params["aggregate_metrics_path"],
output_file=eval_params["dashboard_path"],
benchmark=eval_params["benchmark"],
)
return None

return read_and_combine_metrics(eval_params)
with open("/data/" + html_filename, "w") as f:
f.write(html_output)
vol.commit()

logger.info("Evaluation set finished...")

return True
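
Results now land in the evaluation_dashboard_results volume instead of being returned to the caller. One way to pull them back down afterwards, assuming Modal's Volume.listdir/read_file API; the filename is a hypothetical instance of the {version}_{benchmark}_{timestamp}_{answers_path} pattern built above:

# Hedged retrieval sketch: assumes Modal's Volume API; filename is hypothetical.
import json
import modal

vol = modal.Volume.from_name("evaluation_dashboard_results")
for entry in vol.listdir("/"):
    print(entry.path)

raw = b"".join(vol.read_file("baseline_HotPotQA_20250101T120000Z_answers.json"))
answers = json.loads(raw)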


@app.local_entrypoint()
@@ -80,37 +105,39 @@ async def main():
configs = [
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=2,
number_of_samples_in_corpus=10,
benchmark="HotPotQA",
qa_engine="cognee_graph_completion",
building_corpus_from_scratch=True,
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=True,
),
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=10,
benchmark="TwoWikiMultiHop",
qa_engine="cognee_graph_completion",
building_corpus_from_scratch=True,
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=False,
dashboard=True,
),
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=10,
benchmark="Musique",
qa_engine="cognee_graph_completion",
building_corpus_from_scratch=True,
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=False,
dashboard=True,
),
]

# Run evaluations in parallel with different configurations
modal_tasks = [modal_run_eval.remote.aio(config.to_dict()) for config in configs]
results = await asyncio.gather(*modal_tasks)

# Filter out None results and save combined results
results = [r for r in results if r is not None]
if results:
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = f"combined_results_{timestamp}.json"

with open(output_file, "w") as f:
json.dump(results, f, indent=2)

logger.info(f"Completed parallel evaluation runs. Results saved to {output_file}")
else:
logger.info("No metrics were collected from any of the evaluation runs")
await asyncio.gather(*modal_tasks)
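
The entrypoint now fans the three benchmark configs out with asyncio.gather and relies on the volume for outputs rather than collecting results locally. For a one-off run, the same Modal function can be invoked from inside the local entrypoint with a single config, including the new instance_filter knob (a sketch; the instance id is a hypothetical placeholder):

# Hedged sketch: single-config invocation from within the local entrypoint;
# the instance id is a hypothetical placeholder.
single_config = EvalConfig(
    task_getter_type="Default",
    number_of_samples_in_corpus=10,
    benchmark="HotPotQA",
    qa_engine="cognee_graph_completion",
    building_corpus_from_scratch=True,
    answering_questions=True,
    evaluating_answers=True,
    calculate_metrics=True,
    dashboard=True,
    instance_filter=["5a8b57f25542995d1e6f1371"],
)
await modal_run_eval.remote.aio(single_config.to_dict())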