Merged

Commits (53):
- 468de67 refactor: utils folder in retrieval (lxobr, Feb 21, 2025)
- d789dd0 feat: add base_retriever.py (lxobr, Feb 21, 2025)
- 49c2355 feat: add retriever classes (lxobr, Feb 21, 2025)
- 7619df2 fix: include generate_completion function (lxobr, Feb 21, 2025)
- 5a5eb5e feat: add search comparison script, compare summaries (lxobr, Feb 21, 2025)
- 8f0cbee feat: enable context dumping (lxobr, Feb 24, 2025)
- beacdea fix: improve context getting and completion (lxobr, Feb 24, 2025)
- 4b71081 feat: add all searches and context comparisons (lxobr, Feb 24, 2025)
- 7631b11 Merge branch 'dev' into feat/COG-1365-unify-retrievers (lxobr, Feb 24, 2025)
- 62f8ac3 Update cognee/tasks/completion/query_completion.py (lxobr, Feb 24, 2025)
- 58c7eaf feat: context dumping error handling (lxobr, Feb 24, 2025)
- afd5ca4 feat: expose aggregate metrics, enable saving (lxobr, Feb 24, 2025)
- 8bf5aae feat: add modal example (lxobr, Feb 24, 2025)
- fd7f837 delete: metrics_dashboard.py (lxobr, Feb 24, 2025)
- 416eed1 fix: dashboard generation (lxobr, Feb 24, 2025)
- d9fcb12 feat: add get_golden_context flag (lxobr, Feb 25, 2025)
- 36dbdf7 feat: implement get_golden_context for hotpot_qa (lxobr, Feb 25, 2025)
- c07cf22 chore: added todos (lxobr, Feb 25, 2025)
- 2ef174a chore: added a todo (lxobr, Feb 25, 2025)
- 5910fb7 Merge branch 'dev' into feat/COG-1365-unify-retrievers (lxobr, Feb 25, 2025)
- 65784e1 Merge branch 'dev' into feat/COG-1364-golden-contexts (lxobr, Feb 25, 2025)
- bdaea29 feat: simplify twowikimultihop, get golden context (lxobr, Feb 25, 2025)
- 32d5829 feat: add golden context to musique_adapter.py (lxobr, Feb 25, 2025)
- ec3b753 Merge branch 'dev' into feat/COG-1331-modal-run-eval (lxobr, Feb 25, 2025)
- 2f70de4 fix: update tests (lxobr, Feb 27, 2025)
- 3d0b839 Merge branch 'dev' into feat/COG-1365-unify-retrievers (lxobr, Feb 27, 2025)
- 4903d7e feat: update code retriever (lxobr, Feb 27, 2025)
- e98c12e refactor: rename variables (lxobr, Feb 27, 2025)
- af5d7c6 Merge branch 'dev' into feat/COG-1364-golden-contexts (lxobr, Feb 27, 2025)
- 0ece58a refactor: add metadata_field_name property (lxobr, Feb 27, 2025)
- cb0fccd Merge remote-tracking branch 'origin/feat/COG-1331-modal-run-eval' in… (alekszievr, Feb 27, 2025)
- 1eb5e71 Merge remote-tracking branch 'origin/feat/COG-1364-golden-contexts' i… (alekszievr, Feb 27, 2025)
- 30927d7 First render. (soobrosa, Feb 27, 2025)
- b02231d Small fixes. (soobrosa, Feb 27, 2025)
- 2d90221 coderabbit don't be smart (soobrosa, Feb 27, 2025)
- 68a4584 Merge branch 'dev' into feat/cog-1366-add-context-evaluation (alekszievr, Feb 27, 2025)
- 8521f8a Merge remote-tracking branch 'origin/feature/cog-1403-transition-to-n… (alekszievr, Feb 27, 2025)
- 3906bf5 Merge branch 'dev' into feat/cog-1366-add-context-evaluation (alekszievr, Feb 27, 2025)
- aae1237 Calculate context relevancy score (alekszievr, Feb 27, 2025)
- 4cffd4b Adjust dashboard tests (alekszievr, Feb 27, 2025)
- 2e2beb3 Adjust answer generation test (alekszievr, Feb 27, 2025)
- 7a574e3 adjust deepeval adapter test (alekszievr, Feb 27, 2025)
- a17a5c8 Fix type hinting (alekszievr, Feb 27, 2025)
- 3f10725 Merge branch 'dev' into feat/cog-1366-add-context-evaluation (alekszievr, Feb 27, 2025)
- df5ba7b Merge branch 'dev' into feat/cog-1366-add-context-evaluation (alekszievr, Feb 27, 2025)
- 4c09877 Merge branch 'dev' into feat/cog-1366-add-context-evaluation (alekszievr, Mar 3, 2025)
- d9b007a ruff format (alekszievr, Mar 3, 2025)
- 5691a1f fix (alekszievr, Mar 3, 2025)
- 634a7fa fix: add comment to new param (borisarzentar, Mar 3, 2025)
- 3453ede Merge branch 'dev' into feat/cog-1366-add-context-evaluation (borisarzentar, Mar 3, 2025)
- 5b9a64d Merge branch 'dev' into feat/cog-1366-add-context-evaluation (alekszievr, Mar 5, 2025)
- 6b2b6f2 Pass system prompt in question answering (alekszievr, Mar 5, 2025)
- 1a3371e Adjust tests (alekszievr, Mar 5, 2025)
evals/eval_framework/benchmark_adapters/base_benchmark_adapter.py
@@ -4,5 +4,7 @@

 class BaseBenchmarkAdapter(ABC):
     @abstractmethod
-    def load_corpus(self, limit: Optional[int] = None, seed: int = 42) -> List[str]:
+    def load_corpus(
+        self, limit: Optional[int] = None, seed: int = 42, load_golden_context: bool = False
+    ) -> List[str]:
         pass
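The abstract `load_corpus` now threads a `load_golden_context` flag through every adapter. Below is a minimal sketch of how a concrete subclass might satisfy the new signature; the `ToyAdapter` and its item fields are hypothetical, not part of this PR. Note that the concrete adapters in this PR return a `(corpus_list, question_answer_pairs)` tuple even though the base annotation still reads `List[str]`.

```python
from typing import Any, List, Optional, Tuple

from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter


class ToyAdapter(BaseBenchmarkAdapter):
    """Hypothetical adapter over an in-memory list, illustrating the new flag."""

    def __init__(self, items: List[dict]):
        self.items = items

    def load_corpus(
        self, limit: Optional[int] = None, seed: int = 42, load_golden_context: bool = False
    ) -> Tuple[List[str], List[dict[str, Any]]]:
        # seed is unused in this toy sketch; real adapters use it for sampling.
        corpus_list, question_answer_pairs = [], []
        for item in self.items[:limit]:  # limit=None keeps everything
            corpus_list.append(item["text"])
            qa_pair = {"question": item["question"], "answer": item["answer"]}
            if load_golden_context:
                # For toy data, the whole source text serves as the "golden" context.
                qa_pair["golden_context"] = item["text"]
            question_answer_pairs.append(qa_pair)
        return corpus_list, question_answer_pairs
```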
evals/eval_framework/benchmark_adapters/hotpot_qa_adapter.py (51 additions, 13 deletions)
@@ -2,7 +2,7 @@
 import os
 import json
 import random
-from typing import Optional, Any
+from typing import Optional, Any, List, Tuple
 from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter

@@ -14,9 +14,55 @@ class HotpotQAAdapter(BaseBenchmarkAdapter):
         # distractor test: "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json" delete file after changing the url
     }

+    def __init__(self):
+        super().__init__()
+        self.metadata_field_name = "level"
+
+    def _is_valid_supporting_fact(self, sentences: List[str], sentence_idx: Any) -> bool:
+        """Validates if a supporting fact index is valid for the given sentences."""
+        return sentences and isinstance(sentence_idx, int) and 0 <= sentence_idx < len(sentences)
+
+    def _get_golden_context(self, item: dict[str, Any]) -> str:
+        """Extracts and formats the golden context from supporting facts."""
+        # Create a mapping of title to sentences for easy lookup
+        context_dict = {title: sentences for (title, sentences) in item["context"]}
+
+        # Get all supporting facts in order
+        golden_contexts = []
+        for title, sentence_idx in item["supporting_facts"]:
+            sentences = context_dict.get(title, [])
+            if not self._is_valid_supporting_fact(sentences, sentence_idx):
+                continue
+            golden_contexts.append(f"{title}: {sentences[sentence_idx]}")
+
+        return "\n".join(golden_contexts)
+
+    def _process_item(
+        self,
+        item: dict[str, Any],
+        corpus_list: List[str],
+        question_answer_pairs: List[dict[str, Any]],
+        load_golden_context: bool = False,
+    ) -> None:
+        """Processes a single item and adds it to the corpus and QA pairs."""
+        for title, sentences in item["context"]:
+            corpus_list.append(" ".join(sentences))
+
+        qa_pair = {
+            "question": item["question"],
+            "answer": item["answer"].lower(),
+            self.metadata_field_name: item[self.metadata_field_name],
+        }
+
+        if load_golden_context:
+            qa_pair["golden_context"] = self._get_golden_context(item)
+
+        question_answer_pairs.append(qa_pair)
+
     def load_corpus(
-        self, limit: Optional[int] = None, seed: int = 42
-    ) -> tuple[list[str], list[dict[str, Any]]]:
+        self, limit: Optional[int] = None, seed: int = 42, load_golden_context: bool = False
+    ) -> Tuple[List[str], List[dict[str, Any]]]:
+        """Loads and processes the HotpotQA corpus, optionally with golden context."""
         filename = self.dataset_info["filename"]

         if os.path.exists(filename):
@@ -36,16 +82,8 @@ def load_corpus(

         corpus_list = []
         question_answer_pairs = []
-
         for item in corpus_json:
-            for title, sentences in item["context"]:
-                corpus_list.append(" ".join(sentences))
-
-            question_answer_pairs.append(
-                {
-                    "question": item["question"],
-                    "answer": item["answer"].lower(),
-                    "level": item["level"],
-                }
-            )
+            self._process_item(item, corpus_list, question_answer_pairs, load_golden_context)

         return corpus_list, question_answer_pairs
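To see what the new HotpotQA helpers produce, here is a sketch using a fabricated item (titles and sentences are illustrative, not taken from the dataset); an out-of-range supporting fact is silently skipped by `_is_valid_supporting_fact`.

```python
item = {
    "context": [
        ["Scott Derrickson", ["Scott Derrickson is an American director.", "He was born in 1966."]],
        ["Ed Wood", ["Ed Wood was an American filmmaker."]],
    ],
    # The third fact points past the end of the "Ed Wood" sentences and is skipped.
    "supporting_facts": [["Scott Derrickson", 0], ["Ed Wood", 0], ["Ed Wood", 5]],
}

adapter = HotpotQAAdapter()
print(adapter._get_golden_context(item))
# Scott Derrickson: Scott Derrickson is an American director.
# Ed Wood: Ed Wood was an American filmmaker.

# Loading a sampled corpus with golden contexts attached to each QA pair
# (downloads the HotpotQA file on first run, so network access is assumed):
corpus, qa_pairs = adapter.load_corpus(limit=10, seed=42, load_golden_context=True)
print(qa_pairs[0]["level"])           # HotpotQA's metadata_field_name
print(qa_pairs[0]["golden_context"])  # present only when load_golden_context=True
```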
evals/eval_framework/benchmark_adapters/musique_adapter.py (53 additions, 37 deletions)
@@ -1,7 +1,7 @@
 import os
 import json
 import random
-from typing import Optional, Any
+from typing import Optional, Any, List
 import zipfile

 import gdown
@@ -10,38 +10,71 @@

 class MusiqueQAAdapter(BaseBenchmarkAdapter):
-    """
-    Adapter to load and process the Musique QA dataset from a local .jsonl file.
-    Optionally downloads and unzips the dataset if it does not exist locally.
-    """
+    """Adapter for the Musique QA dataset with local file loading and optional download."""

     dataset_info = {
         # Name of the final file we want to load
         "filename": "data/musique_ans_v1.0_dev.jsonl",
         # A Google Drive URL (or share link) to the ZIP containing this file
         "download_url": "https://drive.google.com/file/d/1tGdADlNjWFaHLeZZGShh2IRcpO6Lv24h/view?usp=sharing",
         # The name of the ZIP archive we expect after downloading
         "zip_filename": "musique_v1.0.zip",
     }

+    def _get_golden_context(self, item: dict[str, Any]) -> str:
+        """Extracts golden context from question decomposition and supporting paragraphs."""
+        golden_context = []
+        paragraphs = item.get("paragraphs", [])
+
+        # Process each decomposition step
+        for step in item.get("question_decomposition", []):
+            # Add the supporting paragraph if available
+            support_idx = step.get("paragraph_support_idx")
+            if isinstance(support_idx, int) and 0 <= support_idx < len(paragraphs):
+                para = paragraphs[support_idx]
+                golden_context.append(f"{para['title']}: {para['paragraph_text']}")
+
+            # Add the step's question and answer
+            golden_context.append(f"Q: {step['question']}")
+            golden_context.append(f"A: {step['answer']}")
+            golden_context.append("")  # Empty line between steps
+
+        return "\n".join(golden_context)

+    def _process_item(
+        self,
+        item: dict[str, Any],
+        corpus_list: List[str],
+        question_answer_pairs: List[dict[str, Any]],
+        load_golden_context: bool = False,
+    ) -> None:
+        """Processes a single item and adds it to the corpus and QA pairs."""
+        # Add paragraphs to corpus
+        paragraphs = item.get("paragraphs", [])
+        for paragraph in paragraphs:
+            corpus_list.append(paragraph["paragraph_text"])
+
+        # Create QA pair
+        qa_pair = {
+            "id": item.get("id", ""),
+            "question": item.get("question", ""),
+            "answer": item.get("answer", "").lower()
+            if isinstance(item.get("answer"), str)
+            else item.get("answer"),
+        }
+
+        if load_golden_context:
+            qa_pair["golden_context"] = self._get_golden_context(item)
+
+        question_answer_pairs.append(qa_pair)

     def load_corpus(
         self,
         limit: Optional[int] = None,
         seed: int = 42,
+        load_golden_context: bool = False,
         auto_download: bool = True,
     ) -> tuple[list[str], list[dict[str, Any]]]:
-        """
-        Loads the Musique QA dataset.
-
-        :param limit: If set, randomly sample 'limit' items.
-        :param seed: Random seed for sampling.
-        :param auto_download: If True, attempt to download + unzip the dataset
-            from Google Drive if the .jsonl file is not present locally.
-        :return: (corpus_list, question_answer_pairs)
-        """
+        """Loads and processes the Musique QA dataset."""
         target_filename = self.dataset_info["filename"]

-        # 1. Ensure the file is locally available; optionally download if missing
         if not os.path.exists(target_filename):
             if auto_download:
                 self._musique_download_file()
@@ -62,29 +95,12 @@ def load_corpus(
         question_answer_pairs = []

         for item in data:
-            # Each 'paragraphs' is a list of dicts; we can concatenate their 'paragraph_text'
-            paragraphs = item.get("paragraphs", [])
-            for paragraph in paragraphs:
-                corpus_list.append(paragraph["paragraph_text"])
-
-            question = item.get("question", "")
-            answer = item.get("answer", "")
-
-            question_answer_pairs.append(
-                {
-                    "id": item.get("id", ""),
-                    "question": question,
-                    "answer": answer.lower() if isinstance(answer, str) else answer,
-                }
-            )
+            self._process_item(item, corpus_list, question_answer_pairs, load_golden_context)

         return corpus_list, question_answer_pairs

     def _musique_download_file(self) -> None:
-        """
-        Download and unzip the Musique dataset if not already present locally.
-        Uses gdown for Google Drive links.
-        """
+        """Downloads and unzips the Musique dataset if not present locally."""
         url = self.dataset_info["download_url"]
         zip_filename = self.dataset_info["zip_filename"]
         target_filename = self.dataset_info["filename"]
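A sketch of the Musique golden-context format under the same assumptions (the item below is fabricated for illustration): each decomposition step contributes its supporting paragraph plus a Q/A line, with a blank line between steps.

```python
item = {
    "paragraphs": [
        {"title": "Paris", "paragraph_text": "Paris is the capital of France."},
        {"title": "France", "paragraph_text": "France is a country in Europe."},
    ],
    "question_decomposition": [
        {"question": "What is the capital of France?", "answer": "Paris", "paragraph_support_idx": 0},
        {"question": "Which continent is France in?", "answer": "Europe", "paragraph_support_idx": 1},
    ],
}

adapter = MusiqueQAAdapter()
print(adapter._get_golden_context(item))
# Paris: Paris is the capital of France.
# Q: What is the capital of France?
# A: Paris
#
# France: France is a country in Europe.
# Q: Which continent is France in?
# A: Europe

# First run downloads and unzips the dataset via gdown (network access assumed):
corpus, qa_pairs = adapter.load_corpus(limit=5, load_golden_context=True, auto_download=True)
```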
evals/eval_framework/benchmark_adapters/twowikimultihop_adapter.py (15 additions, 36 deletions)
@@ -2,48 +2,27 @@
 import os
 import json
 import random
-from typing import Optional, Any
-from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
+from typing import Optional, Any, List, Tuple
+from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter


-class TwoWikiMultihopAdapter(BaseBenchmarkAdapter):
+class TwoWikiMultihopAdapter(HotpotQAAdapter):
     dataset_info = {
         "filename": "2wikimultihop_dev.json",
-        "URL": "https://huggingface.co/datasets/voidful/2WikiMultihopQA/resolve/main/dev.json",
+        "url": "https://huggingface.co/datasets/voidful/2WikiMultihopQA/resolve/main/dev.json",
     }

-    def load_corpus(
-        self, limit: Optional[int] = None, seed: int = 42
-    ) -> tuple[list[str], list[dict[str, Any]]]:
-        filename = self.dataset_info["filename"]
+    def __init__(self):
+        super().__init__()
+        self.metadata_field_name = "type"

-        if os.path.exists(filename):
-            with open(filename, "r", encoding="utf-8") as f:
-                corpus_json = json.load(f)
-        else:
-            response = requests.get(self.dataset_info["URL"])
-            response.raise_for_status()
-            corpus_json = response.json()
+    def _get_golden_context(self, item: dict[str, Any]) -> str:
+        """Extracts and formats the golden context from supporting facts and adds evidence if available."""
+        golden_context = super()._get_golden_context(item)

-            with open(filename, "w", encoding="utf-8") as f:
-                json.dump(corpus_json, f, ensure_ascii=False, indent=4)
+        if "evidences" in item:
+            golden_context += "\nEvidence fact triplets:"
+            for subject, relation, obj in item["evidences"]:
+                golden_context += f"\n • {subject} - {relation} - {obj}"

-        if limit is not None and 0 < limit < len(corpus_json):
-            random.seed(seed)
-            corpus_json = random.sample(corpus_json, limit)
-
-        corpus_list = []
-        question_answer_pairs = []
-        for dict in corpus_json:
-            for title, sentences in dict["context"]:
-                corpus_list.append(" ".join(sentences))
-
-            question_answer_pairs.append(
-                {
-                    "question": dict["question"],
-                    "answer": dict["answer"].lower(),
-                    "type": dict["type"],
-                }
-            )
-
-        return corpus_list, question_answer_pairs
+        return golden_context
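For 2WikiMultihop, which now reuses the HotpotQA logic and appends evidence triplets, a sketch with another fabricated item:

```python
item = {
    "context": [["Inception", ["Inception is a 2010 film.", "It was directed by Christopher Nolan."]]],
    "supporting_facts": [["Inception", 1]],
    "evidences": [["Inception", "director", "Christopher Nolan"]],
}

adapter = TwoWikiMultihopAdapter()
print(adapter._get_golden_context(item))
# Inception: It was directed by Christopher Nolan.
# Evidence fact triplets:
#  • Inception - director - Christopher Nolan
```

Because the adapter now subclasses `HotpotQAAdapter`, `load_corpus` and `_process_item` are inherited; only `dataset_info`, the metadata field name (`"type"` instead of `"level"`), and the evidence-augmented golden context differ.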