Merged
53 commits
468de67
refactor: utils folder in retrieval
lxobr Feb 21, 2025
d789dd0
feat: add base_retriever.py
lxobr Feb 21, 2025
49c2355
feat: add retriever classes
lxobr Feb 21, 2025
7619df2
fix: include generate_completion function
lxobr Feb 21, 2025
5a5eb5e
feat: add search comparison script, compare summaries
lxobr Feb 21, 2025
8f0cbee
feat: enable context dumping
lxobr Feb 24, 2025
beacdea
fix: improve context getting and completion
lxobr Feb 24, 2025
4b71081
feat: add all searches and context comparisons
lxobr Feb 24, 2025
7631b11
Merge branch 'dev' into feat/COG-1365-unify-retrievers
lxobr Feb 24, 2025
62f8ac3
Update cognee/tasks/completion/query_completion.py
lxobr Feb 24, 2025
58c7eaf
feat: context dumping error handling
lxobr Feb 24, 2025
afd5ca4
feat: expose aggregate metrics, enable saving
lxobr Feb 24, 2025
8bf5aae
feat: add modal example
lxobr Feb 24, 2025
fd7f837
delete: metrics_dashboard.py
lxobr Feb 24, 2025
416eed1
fix: dashboard generation
lxobr Feb 24, 2025
d9fcb12
feat: add get_golden_context flag
lxobr Feb 25, 2025
36dbdf7
feat: implement get_golden_context for hotpot_qa
lxobr Feb 25, 2025
c07cf22
chore: added todos
lxobr Feb 25, 2025
2ef174a
chore: added a todo
lxobr Feb 25, 2025
5910fb7
Merge branch 'dev' into feat/COG-1365-unify-retrievers
lxobr Feb 25, 2025
65784e1
Merge branch 'dev' into feat/COG-1364-golden-contexts
lxobr Feb 25, 2025
bdaea29
feat: simplify twowikimultihop, get golden context
lxobr Feb 25, 2025
32d5829
feat: add golden context to musique_adapter.py
lxobr Feb 25, 2025
ec3b753
Merge branch 'dev' into feat/COG-1331-modal-run-eval
lxobr Feb 25, 2025
2f70de4
fix: update tests
lxobr Feb 27, 2025
3d0b839
Merge branch 'dev' into feat/COG-1365-unify-retrievers
lxobr Feb 27, 2025
4903d7e
feat: update code retriever
lxobr Feb 27, 2025
e98c12e
refactor: rename variables
lxobr Feb 27, 2025
af5d7c6
Merge branch 'dev' into feat/COG-1364-golden-contexts
lxobr Feb 27, 2025
0ece58a
refactor: add metadata_field_name property
lxobr Feb 27, 2025
cb0fccd
Merge remote-tracking branch 'origin/feat/COG-1331-modal-run-eval' in…
alekszievr Feb 27, 2025
1eb5e71
Merge remote-tracking branch 'origin/feat/COG-1364-golden-contexts' i…
alekszievr Feb 27, 2025
30927d7
First render.
soobrosa Feb 27, 2025
b02231d
Small fixes.
soobrosa Feb 27, 2025
2d90221
coderabbit don't be smart
soobrosa Feb 27, 2025
68a4584
Merge branch 'dev' into feat/cog-1366-add-context-evaluation
alekszievr Feb 27, 2025
8521fa8
Merge remote-tracking branch 'origin/feature/cog-1403-transition-to-n…
alekszievr Feb 27, 2025
3906bf5
Merge branch 'dev' into feat/cog-1366-add-context-evaluation
alekszievr Feb 27, 2025
aae1237
Calculate context relevancy score
alekszievr Feb 27, 2025
4cffd4b
Adjust dashboard tests
alekszievr Feb 27, 2025
2e2beb3
Adjust answer generation test
alekszievr Feb 27, 2025
7a574e3
adjust deepeval adapter test
alekszievr Feb 27, 2025
a17a5c8
Fix type hinting
alekszievr Feb 27, 2025
3f10725
Merge branch 'dev' into feat/cog-1366-add-context-evaluation
alekszievr Feb 27, 2025
df5ba7b
Merge branch 'dev' into feat/cog-1366-add-context-evaluation
alekszievr Feb 27, 2025
4c09877
Merge branch 'dev' into feat/cog-1366-add-context-evaluation
alekszievr Mar 3, 2025
d9b007a
ruff format
alekszievr Mar 3, 2025
5691a1f
fix
alekszievr Mar 3, 2025
634a7fa
fix: add comment to new param
borisarzentar Mar 3, 2025
3453ede
Merge branch 'dev' into feat/cog-1366-add-context-evaluation
borisarzentar Mar 3, 2025
5b9a64d
Merge branch 'dev' into feat/cog-1366-add-context-evaluation
alekszievr Mar 5, 2025
6b2b6f2
Pass system prompt in question answering
alekszievr Mar 5, 2025
1a3371e
Adjust tests
alekszievr Mar 5, 2025
Empty file.
@@ -1,50 +1,12 @@
import json
from collections import defaultdict
import plotly.graph_objects as go
import numpy as np


def bootstrap_ci(scores, num_samples=10000, confidence_level=0.95):
means = []
n = len(scores)
for _ in range(num_samples):
sample = np.random.choice(scores, size=n, replace=True)
means.append(np.mean(sample))

lower_bound = np.percentile(means, (1 - confidence_level) / 2 * 100)
upper_bound = np.percentile(means, (1 + confidence_level) / 2 * 100)
return np.mean(scores), lower_bound, upper_bound


def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html", benchmark=""):
try:
with open(json_data, "r", encoding="utf-8") as f:
data = json.load(f)
except FileNotFoundError:
raise FileNotFoundError(f"Could not find the file: {json_data}")
except json.JSONDecodeError as e:
raise ValueError(f"Error decoding JSON from {json_data}: {e}")

metrics_data = defaultdict(list)
metric_details = defaultdict(list)
from typing import Dict, List, Tuple
from collections import defaultdict

for entry in data:
for metric, values in entry["metrics"].items():
score = values["score"]
metrics_data[metric].append(score)
if "reason" in values:
metric_details[metric].append(
{
"question": entry["question"],
"answer": entry["answer"],
"golden_answer": entry["golden_answer"],
"reason": values["reason"],
"score": score,
}
)

def create_distribution_plots(metrics_data: Dict[str, List[float]]) -> List[str]:
"""Create distribution histogram plots for each metric."""
figures = []

for metric, scores in metrics_data.items():
fig = go.Figure()
fig.add_trace(go.Histogram(x=scores, name=metric, nbinsx=10, marker_color="#1f77b4"))
@@ -57,13 +19,11 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
template="seaborn",
)
figures.append(fig.to_html(full_html=False))
return figures

ci_results = {}
for metric, scores in metrics_data.items():
mean_score, lower, upper = bootstrap_ci(scores)
ci_results[metric] = (mean_score, lower, upper)

# Bar chart with confidence intervals
def create_ci_plot(ci_results: Dict[str, Tuple[float, float, float]]) -> str:
"""Create confidence interval bar plot."""
fig = go.Figure()
for metric, (mean_score, lower, upper) in ci_results.items():
fig.add_trace(
@@ -86,9 +46,29 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
yaxis_title="Score",
template="seaborn",
)
figures.append(fig.to_html(full_html=False))
return fig.to_html(full_html=False)


def generate_details_html(metrics_data: List[Dict]) -> List[str]:
"""Generate HTML for detailed metric information."""
details_html = []
metric_details = {}

# Organize metrics by type
for entry in metrics_data:
for metric, values in entry["metrics"].items():
if metric not in metric_details:
metric_details[metric] = []
metric_details[metric].append(
{
"question": entry["question"],
"answer": entry["answer"],
"golden_answer": entry["golden_answer"],
"reason": values.get("reason", ""),
"score": values["score"],
}
)

for metric, details in metric_details.items():
details_html.append(f"<h3>{metric} Details</h3>")
details_html.append("""
@@ -112,8 +92,14 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
f"</tr>"
)
details_html.append("</table>")
return details_html


html_template = f"""
def get_dashboard_html_template(
figures: List[str], details_html: List[str], benchmark: str = ""
) -> str:
"""Generate the complete HTML dashboard template."""
return f"""
<!DOCTYPE html>
<html>
<head>
@@ -132,7 +118,7 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
<h1>LLM Evaluation Metrics Dashboard {benchmark}</h1>

<h2>Metrics Distribution</h2>
{"".join([f'<div class="chart">{fig}</div>' for fig in figures[: len(metrics_data)]])}
{"".join([f'<div class="chart">{fig}</div>' for fig in figures[:-1]])}

<h2>95% confidence interval for all the metrics</h2>
<div class="chart">{figures[-1]}</div>
@@ -143,6 +129,44 @@ def generate_metrics_dashboard(json_data, output_file="dashboard_with_ci.html",
</html>
"""


def create_dashboard(
metrics_path: str,
aggregate_metrics_path: str,
output_file: str = "dashboard_with_ci.html",
benchmark: str = "",
) -> str:
"""Create and save the dashboard with all visualizations."""
# Read metrics files
with open(metrics_path, "r") as f:
metrics_data = json.load(f)
with open(aggregate_metrics_path, "r") as f:
aggregate_data = json.load(f)

# Extract data for visualizations
metrics_by_type = defaultdict(list)
for entry in metrics_data:
for metric, values in entry["metrics"].items():
metrics_by_type[metric].append(values["score"])

# Generate visualizations
distribution_figures = create_distribution_plots(metrics_by_type)
ci_plot = create_ci_plot(
{
metric: (data["mean"], data["ci_lower"], data["ci_upper"])
for metric, data in aggregate_data.items()
}
)

# Combine all figures
figures = distribution_figures + [ci_plot]

# Generate HTML components
details_html = generate_details_html(metrics_data)
dashboard_html = get_dashboard_html_template(figures, details_html, benchmark)

# Write to file
with open(output_file, "w", encoding="utf-8") as f:
f.write(html_template)
f.write(dashboard_html)

return output_file
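
Taken on its own, the refactored builder can be smoke-tested with a call like the hedged sketch below; the input paths mirror the EvalConfig defaults shown later in this diff, and the benchmark label is purely illustrative.

from evals.eval_framework.analysis.dashboard_generator import create_dashboard

# A minimal sketch (not part of the diff): both JSON inputs are assumed to
# exist from an earlier evaluation run; the benchmark label only appears in
# the page title.
report_path = create_dashboard(
    metrics_path="metrics_output.json",
    aggregate_metrics_path="aggregate_metrics.json",
    output_file="dashboard.html",
    benchmark="HotPotQA",
)
print(f"Dashboard written to {report_path}")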
92 changes: 92 additions & 0 deletions evals/eval_framework/analysis/metrics_calculator.py
@@ -0,0 +1,92 @@
import json
from collections import defaultdict
import numpy as np
from typing import Dict, List, Tuple


def bootstrap_ci(scores, num_samples=10000, confidence_level=0.95):
"""Calculate bootstrap confidence intervals for a list of scores."""
means = []
n = len(scores)
for _ in range(num_samples):
sample = np.random.choice(scores, size=n, replace=True)
means.append(np.mean(sample))

lower_bound = np.percentile(means, (1 - confidence_level) / 2 * 100)
upper_bound = np.percentile(means, (1 + confidence_level) / 2 * 100)
return np.mean(scores), lower_bound, upper_bound


def load_metrics_data(json_file_path: str) -> List[Dict]:
"""Load metrics data from JSON file."""
try:
with open(json_file_path, "r", encoding="utf-8") as f:
return json.load(f)
except FileNotFoundError:
raise FileNotFoundError(f"Could not find the file: {json_file_path}")
except json.JSONDecodeError as e:
raise ValueError(f"Error decoding JSON from {json_file_path}: {e}")


def extract_metrics_and_details(
data: List[Dict],
) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]]]:
"""Extract metrics scores and details from evaluation data."""
metrics_data = defaultdict(list)
metric_details = defaultdict(list)

for entry in data:
for metric, values in entry["metrics"].items():
score = values["score"]
metrics_data[metric].append(score)
if "reason" in values:
metric_details[metric].append(
{
"question": entry["question"],
"answer": entry["answer"],
"golden_answer": entry["golden_answer"],
"reason": values["reason"],
"score": score,
}
)

return metrics_data, metric_details


def save_aggregate_metrics(
metrics_data: Dict[str, List[float]],
ci_results: Dict[str, Tuple[float, float, float]],
output_path: str,
) -> None:
"""Save aggregated metrics and confidence intervals to file."""
aggregate_data = {
metric: {
"scores": scores,
"mean": ci_results[metric][0],
"ci_lower": ci_results[metric][1],
"ci_upper": ci_results[metric][2],
}
for metric, scores in metrics_data.items()
}

with open(output_path, "w", encoding="utf-8") as f:
json.dump(aggregate_data, f, indent=4)


def calculate_metrics_statistics(
json_data: str, aggregate_output_path: str
) -> Tuple[Dict[str, List[float]], Dict[str, List[Dict]], Dict[str, Tuple[float, float, float]]]:
"""Calculate metrics statistics and save aggregated results."""
data = load_metrics_data(json_data)
metrics_data, metric_details = extract_metrics_and_details(data)

# Calculate confidence intervals
ci_results = {}
for metric, scores in metrics_data.items():
mean_score, lower, upper = bootstrap_ci(scores)
ci_results[metric] = (mean_score, lower, upper)

# Save aggregate metrics
save_aggregate_metrics(metrics_data, ci_results, aggregate_output_path)

return metrics_data, metric_details, ci_results
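
The new module can also be exercised outside the pipeline; the hedged sketch below assumes only what is visible in this file, namely the public function and its return shape.

from evals.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics

# Assumes metrics_output.json was written by the evaluation step; the call
# bootstraps 95% confidence intervals and saves the aggregate file that the
# dashboard step consumes.
metrics_data, metric_details, ci_results = calculate_metrics_statistics(
    json_data="metrics_output.json",
    aggregate_output_path="aggregate_metrics.json",
)
for metric, (mean_score, lower, upper) in ci_results.items():
    print(f"{metric}: mean={mean_score:.3f}, 95% CI [{lower:.3f}, {upper:.3f}]")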
6 changes: 6 additions & 0 deletions evals/eval_framework/eval_config.py
@@ -22,13 +22,17 @@ class EvalConfig(BaseSettings):
evaluation_metrics: List[str] = ["correctness", "EM", "f1"]
deepeval_model: str = "gpt-4o-mini"

# Metrics params
calculate_metrics: bool = True

# Visualization
dashboard: bool = True

# file paths
questions_path: str = "questions_output.json"
answers_path: str = "answers_output.json"
metrics_path: str = "metrics_output.json"
aggregate_metrics_path: str = "aggregate_metrics.json"
dashboard_path: str = "dashboard.html"

model_config = SettingsConfigDict(env_file=".env", extra="allow")
@@ -43,10 +47,12 @@ def to_dict(self) -> dict:
"evaluating_answers": self.evaluating_answers,
"evaluation_engine": self.evaluation_engine,
"evaluation_metrics": self.evaluation_metrics,
"calculate_metrics": self.calculate_metrics,
"dashboard": self.dashboard,
"questions_path": self.questions_path,
"answers_path": self.answers_path,
"metrics_path": self.metrics_path,
"aggregate_metrics_path": self.aggregate_metrics_path,
"dashboard_path": self.dashboard_path,
"deepeval_model": self.deepeval_model,
"task_getter_type": self.task_getter_type,
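
A short, hedged sketch of how the new flags might be toggled programmatically, assuming the usual pydantic BaseSettings keyword-override behaviour implied by SettingsConfigDict above and defaults (or .env values) for the settings outside this hunk.

from evals.eval_framework.eval_config import EvalConfig

# Keyword overrides layer on top of values read from .env; the two new fields
# gate the metrics-aggregation and dashboard steps independently of answer
# evaluation.
config = EvalConfig(calculate_metrics=True, dashboard=False)
params = config.to_dict()
print(params["metrics_path"], params["aggregate_metrics_path"])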
69 changes: 47 additions & 22 deletions evals/eval_framework/evaluation/run_evaluation_module.py
@@ -1,7 +1,8 @@
import logging
import json
from evals.eval_framework.evaluation.evaluation_executor import EvaluationExecutor
from evals.eval_framework.metrics_dashboard import generate_metrics_dashboard
from evals.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
from evals.eval_framework.analysis.dashboard_generator import create_dashboard
from cognee.infrastructure.files.storage import LocalStorage
from cognee.infrastructure.databases.relational.get_relational_engine import (
get_relational_engine,
@@ -28,32 +29,56 @@ async def create_and_insert_metrics_table(questions_payload):
await session.commit()


async def execute_evaluation(params: dict) -> None:
"""Execute the evaluation step and save results."""
logging.info("Evaluation started...")
try:
with open(params["answers_path"], "r", encoding="utf-8") as f:
answers = json.load(f)
except FileNotFoundError:
raise FileNotFoundError(f"Could not find the file: {params['answers_path']}")
except json.JSONDecodeError as e:
raise ValueError(f"Error decoding JSON from {params['answers_path']}: {e}")

logging.info(f"Loaded {len(answers)} answers from {params['answers_path']}")
evaluator = EvaluationExecutor(evaluator_engine=params["evaluation_engine"])
metrics = await evaluator.execute(
answers=answers, evaluator_metrics=params["evaluation_metrics"]
)
with open(params["metrics_path"], "w", encoding="utf-8") as f:
json.dump(metrics, f, ensure_ascii=False, indent=4)

await create_and_insert_metrics_table(metrics)
logging.info("Evaluation completed")


async def run_evaluation(params: dict) -> None:
"""Run each step of the evaluation pipeline based on configuration flags."""
# Step 1: Evaluate answers if requested
if params.get("evaluating_answers"):
logging.info("Evaluation started...")
try:
with open(params["answers_path"], "r", encoding="utf-8") as f:
answers = json.load(f)
except FileNotFoundError:
raise FileNotFoundError(f"Could not find the file: {params['answers_path']}")
except json.JSONDecodeError as e:
raise ValueError(f"Error decoding JSON from {params['answers_path']}: {e}")

logging.info(f"Loaded {len(answers)} answers from {params['answers_path']}")
evaluator = EvaluationExecutor(evaluator_engine=params["evaluation_engine"])
metrics = await evaluator.execute(
answers=answers, evaluator_metrics=params["evaluation_metrics"]
)
with open(params["metrics_path"], "w", encoding="utf-8") as f:
json.dump(metrics, f, ensure_ascii=False, indent=4)

await create_and_insert_metrics_table(metrics)
await execute_evaluation(params)
else:
logging.info("Skipping evaluation as evaluating_answers is False")

logging.info("Evaluation End...")
# Step 2: Calculate metrics if requested
if params.get("calculate_metrics"):
logging.info("Calculating metrics statistics...")
calculate_metrics_statistics(
json_data=params["metrics_path"], aggregate_output_path=params["aggregate_metrics_path"]
)
logging.info("Metrics calculation completed")
else:
logging.info("Skipping metrics calculation as calculate_metrics is False")

# Step 3: Generate dashboard if requested
if params.get("dashboard"):
generate_metrics_dashboard(
json_data=params["metrics_path"],
logging.info("Generating dashboard...")
create_dashboard(
metrics_path=params["metrics_path"],
aggregate_metrics_path=params["aggregate_metrics_path"],
output_file=params["dashboard_path"],
benchmark=params["benchmark"],
)
logging.info(f"Dashboard generated at {params['dashboard_path']}")
else:
logging.info("Skipping dashboard generation as dashboard is False")
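
To close the loop, a hedged end-to-end sketch of driving the three gated steps from the config; everything here describes an entry script outside this PR rather than code in the diff.

import asyncio

from evals.eval_framework.eval_config import EvalConfig
from evals.eval_framework.evaluation.run_evaluation_module import run_evaluation

# Assumes a configured relational database for create_and_insert_metrics_table,
# and that to_dict() carries every key run_evaluation reads; "benchmark" is not
# visible in the to_dict() hunk above, so it is set defensively here.
params = EvalConfig().to_dict()
params.setdefault("benchmark", "HotPotQA")  # assumed key, illustrative value
asyncio.run(run_evaluation(params))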