Merged
53 commits
468de67
refactor: utils folder in retrieval
lxobr Feb 21, 2025
d789dd0
feat: add base_retriever.py
lxobr Feb 21, 2025
49c2355
feat: add retriever classes
lxobr Feb 21, 2025
7619df2
fix: include generate_completion function
lxobr Feb 21, 2025
5a5eb5e
feat: add search comparison script, compare summaries
lxobr Feb 21, 2025
8f0cbee
feat: enable context dumping
lxobr Feb 24, 2025
beacdea
fix: improve context getting and completion
lxobr Feb 24, 2025
4b71081
feat: add all searches and context comparisons
lxobr Feb 24, 2025
7631b11
Merge branch 'dev' into feat/COG-1365-unify-retrievers
lxobr Feb 24, 2025
62f8ac3
Update cognee/tasks/completion/query_completion.py
lxobr Feb 24, 2025
58c7eaf
feat: context dumping error handling
lxobr Feb 24, 2025
afd5ca4
feat: expose aggregate metrics, enable saving
lxobr Feb 24, 2025
8bf5aae
feat: add modal example
lxobr Feb 24, 2025
fd7f837
delete: metrics_dashboard.py
lxobr Feb 24, 2025
416eed1
fix: dashboard generation
lxobr Feb 24, 2025
d9fcb12
feat: add get_golden_context flag
lxobr Feb 25, 2025
36dbdf7
feat: implement get_golden_context for hotpot_qa
lxobr Feb 25, 2025
c07cf22
chore: added todos
lxobr Feb 25, 2025
2ef174a
chore: added a todo
lxobr Feb 25, 2025
5910fb7
Merge branch 'dev' into feat/COG-1365-unify-retrievers
lxobr Feb 25, 2025
65784e1
Merge branch 'dev' into feat/COG-1364-golden-contexts
lxobr Feb 25, 2025
bdaea29
feat: simplify twowikimultihop, get golden context
lxobr Feb 25, 2025
32d5829
feat: add golden context to musique_adapter.py
lxobr Feb 25, 2025
ec3b753
Merge branch 'dev' into feat/COG-1331-modal-run-eval
lxobr Feb 25, 2025
2f70de4
fix: update tests
lxobr Feb 27, 2025
3d0b839
Merge branch 'dev' into feat/COG-1365-unify-retrievers
lxobr Feb 27, 2025
4903d7e
feat: update code retriever
lxobr Feb 27, 2025
e98c12e
refactor: rename variables
lxobr Feb 27, 2025
af5d7c6
Merge branch 'dev' into feat/COG-1364-golden-contexts
lxobr Feb 27, 2025
0ece58a
refactor: add metadata_field_name property
lxobr Feb 27, 2025
cb0fccd
Merge remote-tracking branch 'origin/feat/COG-1331-modal-run-eval' in…
alekszievr Feb 27, 2025
1eb5e71
Merge remote-tracking branch 'origin/feat/COG-1364-golden-contexts' i…
alekszievr Feb 27, 2025
30927d7
First render.
soobrosa Feb 27, 2025
b02231d
Small fixes.
soobrosa Feb 27, 2025
2d90221
coderabbit don't be smart
soobrosa Feb 27, 2025
68a4584
Merge branch 'dev' into feat/cog-1366-add-context-evaluation
alekszievr Feb 27, 2025
8521fa8
Merge remote-tracking branch 'origin/feature/cog-1403-transition-to-n…
alekszievr Feb 27, 2025
3906bf5
Merge branch 'dev' into feat/cog-1366-add-context-evaluation
alekszievr Feb 27, 2025
aae1237
Calculate context relevancy score
alekszievr Feb 27, 2025
4cffd4b
Adjust dashboard tests
alekszievr Feb 27, 2025
2e2beb3
Adjust answer generation test
alekszievr Feb 27, 2025
7a574e3
adjust deepeval adapter test
alekszievr Feb 27, 2025
a17a5c8
Fix type hinting
alekszievr Feb 27, 2025
3f10725
Merge branch 'dev' into feat/cog-1366-add-context-evaluation
alekszievr Feb 27, 2025
df5ba7b
Merge branch 'dev' into feat/cog-1366-add-context-evaluation
alekszievr Feb 27, 2025
4c09877
Merge branch 'dev' into feat/cog-1366-add-context-evaluation
alekszievr Mar 3, 2025
d9b007a
ruff format
alekszievr Mar 3, 2025
5691a1f
fix
alekszievr Mar 3, 2025
634a7fa
fix: add comment to new param
borisarzentar Mar 3, 2025
3453ede
Merge branch 'dev' into feat/cog-1366-add-context-evaluation
borisarzentar Mar 3, 2025
5b9a64d
Merge branch 'dev' into feat/cog-1366-add-context-evaluation
alekszievr Mar 5, 2025
6b2b6f2
Pass system prompt in question answering
alekszievr Mar 5, 2025
1a3371e
Adjust tests
alekszievr Mar 5, 2025
Empty file.
@@ -1,50 +1,12 @@
import json
from collections import defaultdict
import plotly.graph_objects as go
import numpy as np


def bootstrap_ci(scores, num_samples=10000, confidence_level=0.95):
means = []
n = len(scores)
for _ in range(num_samples):
sample = np.random.choice(scores, size=n, replace=True)
means.append(np.mean(sample))

lower_bound = np.percentile(means, (1 - confidence_level) / 2 * 100)
upper_bound = np.percentile(means, (1 + confidence_level) / 2 * 100)
return np.mean(scores), lower_bound, upper_bound


def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html", benchmark=""):
try:
with open(json_data, "r", encoding="utf-8") as f:
data = json.load(f)
except FileNotFoundError:
raise FileNotFoundError(f"Could not find the file: {json_data}")
except json.JSONDecodeError as e:
raise ValueError(f"Error decoding JSON from {json_data}: {e}")

metrics_data = defaultdict(list)
metric_details = defaultdict(list)
from typing import Dict, List, Tuple
from collections import defaultdict

for entry in data:
for metric, values in entry["metrics"].items():
score = values["score"]
metrics_data[metric].append(score)
if "reason" in values:
metric_details[metric].append(
{
"question": entry["question"],
"answer": entry["answer"],
"golden_answer": entry["golden_answer"],
"reason": values["reason"],
"score": score,
}
)

def create_distribution_plots(metrics_data: Dict[str, List[float]]) -> List[str]:
"""Create distribution histogram plots for each metric."""
figures = []

for metric, scores in metrics_data.items():
fig = go.Figure()
fig.add_trace(go.Histogram(x=scores, name=metric, nbinsx=10, marker_color="#1f77b4"))
@@ -57,13 +19,11 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
template="seaborn",
)
figures.append(fig.to_html(full_html=False))
return figures

ci_results = {}
for metric, scores in metrics_data.items():
mean_score, lower, upper = bootstrap_ci(scores)
ci_results[metric] = (mean_score, lower, upper)

# Bar chart with confidence intervals
def create_ci_plot(ci_results: Dict[str, Tuple[float, float, float]]) -> str:
"""Create confidence interval bar plot."""
fig = go.Figure()
for metric, (mean_score, lower, upper) in ci_results.items():
fig.add_trace(
@@ -86,9 +46,29 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
yaxis_title="Score",
template="seaborn",
)
figures.append(fig.to_html(full_html=False))
return fig.to_html(full_html=False)


def generate_details_html(metrics_data: List[Dict]) -> List[str]:
"""Generate HTML for detailed metric information."""
details_html = []
metric_details = {}

# Organize metrics by type
for entry in metrics_data:
for metric, values in entry["metrics"].items():
if metric not in metric_details:
metric_details[metric] = []
metric_details[metric].append(
{
"question": entry["question"],
"answer": entry["answer"],
"golden_answer": entry["golden_answer"],
"reason": values.get("reason", ""),
"score": values["score"],
}
)

for metric, details in metric_details.items():
details_html.append(f"<h3>{metric} Details</h3>")
details_html.append("""
@@ -112,8 +92,14 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
f"</tr>"
)
details_html.append("</table>")
return details_html


html_template = f"""
def get_dashboard_html_template(
figures: List[str], details_html: List[str], benchmark: str = ""
) -> str:
"""Generate the complete HTML dashboard template."""
return f"""
<!DOCTYPE html>
<html>
<head>
@@ -132,7 +118,7 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
<h1>LLM Evaluation Metrics Dashboard {benchmark}</h1>

<h2>Metrics Distribution</h2>
{"".join([f'<div class="chart">{fig}</div>' for fig in figures[: len(metrics_data)]])}
{"".join([f'<div class="chart">{fig}</div>' for fig in figures[:-1]])}

<h2>95% confidence interval for all the metrics</h2>
<div class="chart">{figures[-1]}</div>
@@ -143,6 +129,44 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
</html>
"""


def create_dashboard(
metrics_path: str,
aggregate_metrics_path: str,
output_file: str = "dashboard_with_ci.html",
benchmark: str = "",
) -> str:
"""Create and save the dashboard with all visualizations."""
# Read metrics files
with open(metrics_path, "r") as f:
metrics_data = json.load(f)
with open(aggregate_metrics_path, "r") as f:
aggregate_data = json.load(f)

# Extract data for visualizations
metrics_by_type = defaultdict(list)
for entry in metrics_data:
for metric, values in entry["metrics"].items():
metrics_by_type[metric].append(values["score"])

# Generate visualizations
distribution_figures = create_distribution_plots(metrics_by_type)
ci_plot = create_ci_plot(
{
metric: (data["mean"], data["ci_lower"], data["ci_upper"])
for metric, data in aggregate_data.items()
}
)

# Combine all figures
figures = distribution_figures + [ci_plot]

# Generate HTML components
details_html = generate_details_html(metrics_data)
dashboard_html = get_dashboard_html_template(figures, details_html, benchmark)

# Write to file
with open(output_file, "w", encoding="utf-8") as f:
f.write(html_template)
f.write(dashboard_html)

return output_file
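
Taken on its own, the refactored builder can be smoke-tested with a call like the hedged sketch below; the input paths mirror the EvalConfig defaults shown later in this diff, and the benchmark label is purely illustrative.

from evals.eval_framework.analysis.dashboard_generator import create_dashboard

# A minimal sketch (not part of the diff): both JSON inputs are assumed to
# exist from an earlier evaluation run; the benchmark label only appears in
# the page title.
report_path = create_dashboard(
    metrics_path="metrics_output.json",
    aggregate_metrics_path="aggregate_metrics.json",
    output_file="dashboard.html",
    benchmark="HotPotQA",
)
print(f"Dashboard written to {report_path}")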
92 changes: 92 additions & 0 deletions evals/eval_framework/analysis/metrics_calculator.py
@@ -0,0 +1,92 @@
import json
from collections import defaultdict
import numpy as np
from typing import Dict, List, Tuple


def bootstrap_ci(scores, num_samples=10000, confidence_level=0.95):
"""Calculate bootstrap confidence intervals for a list of scores."""
means = []
n = len(scores)
for _ in range(num_samples):
sample = np.random.choice(scores, size=n, replace=True)
means.append(np.mean(sample))

lower_bound = np.percentile(means, (1 - confidence_level) / 2 * 100)
upper_bound = np.percentile(means, (1 + confidence_level) / 2 * 100)
return np.mean(scores), lower_bound, upper_bound


def load_metrics_data(json_file_path: str) -> List[Dict]:
"""Load metrics data from JSON file."""
try:
with open(json_file_path, "r", encoding="utf-8") as f:
return json.load(f)
except FileNotFoundError:
raise FileNotFoundError(f"Could not find the file: {json_file_path}")
except json.JSONDecodeError as e:
raise ValueError(f"Error decoding JSON from {json_file_path}: {e}")


def extract_metrics_and_details(
data: List[Dict],
) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]]]:
"""Extract metrics scores and details from evaluation data."""
metrics_data = defaultdict(list)
metric_details = defaultdict(list)

for entry in data:
for metric, values in entry["metrics"].items():
score = values["score"]
metrics_data[metric].append(score)
if "reason" in values:
metric_details[metric].append(
{
"question": entry["question"],
"answer": entry["answer"],
"golden_answer": entry["golden_answer"],
"reason": values["reason"],
"score": score,
}
)

return metrics_data, metric_details


def save_aggregate_metrics(
metrics_data: Dict[str, List[float]],
ci_results: Dict[str, Tuple[float, float, float]],
output_path: str,
) -> None:
"""Save aggregated metrics and confidence intervals to file."""
aggregate_data = {
metric: {
"scores": scores,
"mean": ci_results[metric][0],
"ci_lower": ci_results[metric][1],
"ci_upper": ci_results[metric][2],
}
for metric, scores in metrics_data.items()
}

with open(output_path, "w", encoding="utf-8") as f:
json.dump(aggregate_data, f, indent=4)


def calculate_metrics_statistics(
json_data: str, aggregate_output_path: str
) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]], Dict[str, Tuple[float, float, float]]]:
"""Calculate metrics statistics and save aggregated results."""
data = load_metrics_data(json_data)
metrics_data, metric_details = extract_metrics_and_details(data)

# Calculate confidence intervals
ci_results = {}
for metric, scores in metrics_data.items():
mean_score, lower, upper = bootstrap_ci(scores)
ci_results[metric] = (mean_score, lower, upper)

# Save aggregate metrics
save_aggregate_metrics(metrics_data, ci_results, aggregate_output_path)

return metrics_data, metric_details, ci_results
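
The new module can also be exercised outside the pipeline; the hedged sketch below assumes only what is visible in this file, namely the public function and its return shape.

from evals.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics

# Assumes metrics_output.json was written by the evaluation step; the call
# bootstraps 95% confidence intervals and saves the aggregate file that the
# dashboard step consumes.
metrics_data, metric_details, ci_results = calculate_metrics_statistics(
    json_data="metrics_output.json",
    aggregate_output_path="aggregate_metrics.json",
)
for metric, (mean_score, lower, upper) in ci_results.items():
    print(f"{metric}: mean={mean_score:.3f}, 95% CI [{lower:.3f}, {upper:.3f}]")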
6 changes: 6 additions & 0 deletions evals/eval_framework/eval_config.py
@@ -22,13 +22,17 @@ class EvalConfig(BaseSettings):
evaluation_metrics: List[str] = ["correctness", "EM", "f1"]
deepeval_model: str = "gpt-4o-mini"

# Metrics params
calculate_metrics: bool = True

# Visualization
dashboard: bool = True

# file paths
questions_path: str = "questions_output.json"
answers_path: str = "answers_output.json"
metrics_path: str = "metrics_output.json"
aggregate_metrics_path: str = "aggregate_metrics.json"
dashboard_path: str = "dashboard.html"

model_config = SettingsConfigDict(env_file=".env", extra="allow")
@@ -43,10 +47,12 @@ def to_dict(self) -> dict:
"evaluating_answers": self.evaluating_answers,
"evaluation_engine": self.evaluation_engine,
"evaluation_metrics": self.evaluation_metrics,
"calculate_metrics": self.calculate_metrics,
"dashboard": self.dashboard,
"questions_path": self.questions_path,
"answers_path": self.answers_path,
"metrics_path": self.metrics_path,
"aggregate_metrics_path": self.aggregate_metrics_path,
"dashboard_path": self.dashboard_path,
"deepeval_model": self.deepeval_model,
"task_getter_type": self.task_getter_type,
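
A short, hedged sketch of how the new flags might be toggled programmatically, assuming the usual pydantic BaseSettings keyword-override behaviour implied by SettingsConfigDict above and defaults (or .env values) for the settings outside this hunk.

from evals.eval_framework.eval_config import EvalConfig

# Keyword overrides layer on top of values read from .env; the two new fields
# gate the metrics-aggregation and dashboard steps independently of answer
# evaluation.
config = EvalConfig(calculate_metrics=True, dashboard=False)
params = config.to_dict()
print(params["metrics_path"], params["aggregate_metrics_path"])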
69 changes: 47 additions & 22 deletions evals/eval_framework/evaluation/run_evaluation_module.py
@@ -1,7 +1,8 @@
import logging
import json
from evals.eval_framework.evaluation.evaluation_executor import EvaluationExecutor
from evals.eval_framework.metrics_dashboard import generate_metrics_dashboard
from evals.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
from evals.eval_framework.analysis.dashboard_generator import create_dashboard
from cognee.infrastructure.files.storage import LocalStorage
from cognee.infrastructure.databases.relational.get_relational_engine import (
get_relational_engine,
@@ -28,32 +29,56 @@ async def create_and_insert_metrics_table(questions_payload):
await session.commit()


async def execute_evaluation(params: dict) -> None:
"""Execute the evaluation step and save results."""
logging.info("Evaluation started...")
try:
with open(params["answers_path"], "r", encoding="utf-8") as f:
answers = json.load(f)
except FileNotFoundError:
raise FileNotFoundError(f"Could not find the file: {params['answers_path']}")
except json.JSONDecodeError as e:
raise ValueError(f"Error decoding JSON from {params['answers_path']}: {e}")

logging.info(f"Loaded {len(answers)} answers from {params['answers_path']}")
evaluator = EvaluationExecutor(evaluator_engine=params["evaluation_engine"])
metrics = await evaluator.execute(
answers=answers, evaluator_metrics=params["evaluation_metrics"]
)
with open(params["metrics_path"], "w", encoding="utf-8") as f:
json.dump(metrics, f, ensure_ascii=False, indent=4)

await create_and_insert_metrics_table(metrics)
logging.info("Evaluation completed")


async def run_evaluation(params: dict) -> None:
"""Run each step of the evaluation pipeline based on configuration flags."""
# Step 1: Evaluate answers if requested
if params.get("evaluating_answers"):
logging.info("Evaluation started...")
try:
with open(params["answers_path"], "r", encoding="utf-8") as f:
answers = json.load(f)
except FileNotFoundError:
raise FileNotFoundError(f"Could not find the file: {params['answers_path']}")
except json.JSONDecodeError as e:
raise ValueError(f"Error decoding JSON from {params['answers_path']}: {e}")

logging.info(f"Loaded {len(answers)} answers from {params['answers_path']}")
evaluator = EvaluationExecutor(evaluator_engine=params["evaluation_engine"])
metrics = await evaluator.execute(
answers=answers, evaluator_metrics=params["evaluation_metrics"]
)
with open(params["metrics_path"], "w", encoding="utf-8") as f:
json.dump(metrics, f, ensure_ascii=False, indent=4)

await create_and_insert_metrics_table(metrics)
await execute_evaluation(params)
else:
logging.info("Skipping evaluation as evaluating_answers is False")

logging.info("Evaluation End...")
# Step 2: Calculate metrics if requested
if params.get("calculate_metrics"):
logging.info("Calculating metrics statistics...")
calculate_metrics_statistics(
json_data=params["metrics_path"], aggregate_output_path=params["aggregate_metrics_path"]
)
logging.info("Metrics calculation completed")
else:
logging.info("Skipping metrics calculation as calculate_metrics is False")

# Step 3: Generate dashboard if requested
if params.get("dashboard"):
generate_metrics_dashboard(
json_data=params["metrics_path"],
logging.info("Generating dashboard...")
create_dashboard(
metrics_path=params["metrics_path"],
aggregate_metrics_path=params["aggregate_metrics_path"],
output_file=params["dashboard_path"],
benchmark=params["benchmark"],
)
logging.info(f"Dashboard generated at {params['dashboard_path']}")
else:
logging.info("Skipping dashboard generation as dashboard is False")
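
To close the loop, a hedged end-to-end sketch of driving the three gated steps from the config; everything here describes an entry script outside this PR rather than code in the diff.

import asyncio

from evals.eval_framework.eval_config import EvalConfig
from evals.eval_framework.evaluation.run_evaluation_module import run_evaluation

# Assumes a configured relational database for create_and_insert_metrics_table,
# and that to_dict() carries every key run_evaluation reads; "benchmark" is not
# visible in the to_dict() hunk above, so it is set defensively here.
params = EvalConfig().to_dict()
params.setdefault("benchmark", "HotPotQA")  # assumed key, illustrative value
asyncio.run(run_evaluation(params))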