ZeroEval is an evaluation, A/B testing, and monitoring platform for AI products. This SDK lets you create datasets, run AI/LLM experiments, and trace multimodal workloads.
Issues? Email us at [email protected]
• Dataset API – versioned, queryable, text or multimodal (images, audio, video, URLs).
• Experiment engine – run tasks + custom evaluators locally or in the cloud.
• Observability – hierarchical Session → Trace → Span tracing; live dashboard (sketched below).
• Python CLI – zeroeval run …, zeroeval setup for frictionless onboarding.
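A minimal, hypothetical sketch of the Session → Trace → Span hierarchy; the @ze.span decorator below is an assumption rather than an API documented in this README (see the tracing guide for the exact interface):
import zeroeval as ze

ze.init()

# Assumed decorator: each decorated function becomes a span, and nested
# calls appear as child spans under the same trace in the dashboard.
@ze.span(name="retrieve_docs")
def retrieve_docs(query):
    return ["doc-1", "doc-2"]

@ze.span(name="answer")
def answer(query):
    docs = retrieve_docs(query)  # child span of "answer"
    return f"Answered '{query}' using {len(docs)} documents"

answer("What is ZeroEval?")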
pip install zeroeval # Core SDK only
Install with specific integrations:
pip install zeroeval[openai] # For OpenAI integration
pip install zeroeval[gemini] # For Google Gemini integration
pip install zeroeval[langchain] # For LangChain integration
pip install zeroeval[langgraph] # For LangGraph integration
pip install zeroeval[all] # Install all integrations
The SDK automatically detects and instruments installed integrations. No additional configuration needed!
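For example, with the OpenAI extra installed, a regular OpenAI call is captured as soon as ze.init() has run (a minimal sketch; the prompt and model choice are illustrative):
import zeroeval as ze
import openai

ze.init()  # no integration-specific configuration needed

# Standard OpenAI usage; the installed integration instruments this call
# automatically and the request appears in the ZeroEval dashboard.
client = openai.OpenAI()
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "Say hello"}],
)
print(response.choices[0].message.content)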
- One-off interactive setup (recommended):
  zeroeval setup # or: poetry run zeroeval setup
  Your API key will be automatically saved to your shell configuration file (e.g., ~/.zshrc, ~/.bashrc). Best practice is to also store it in a .env file in your project root.
- Or set it in code each time:
  import zeroeval as ze
  ze.init(api_key="YOUR_API_KEY")
# quickstart.py
import zeroeval as ze
ze.init() # uses ZEROEVAL_API_KEY env var
# 1. Create dataset
ds = ze.Dataset(
name="gsm8k_sample",
data=[
{"question": "What is 6 times 7?", "answer": "42"},
{"question": "What is 10 plus 7?", "answer": "17"}
]
)
# 2. Define task
@ze.task(outputs=["prediction"])
def solve(row):
    # Your LLM logic here
    response = llm_call(row["question"])
    return {"prediction": response}
# 3. Define evaluation
@ze.evaluation(mode="dataset", outputs=["accuracy"])
def accuracy(answer_col, prediction_col):
    correct = sum(a == p for a, p in zip(answer_col, prediction_col))
    return {"accuracy": correct / len(answer_col)}
# 4. Run and evaluate
run = ds.run(solve, workers=8).score([accuracy], answer="answer")
print(f"Accuracy: {run.metrics['accuracy']:.2%}")For a fully-worked multimodal example, visit the docs: https://docs.zeroeval.com/multimodal-datasets (coming soon)
# Create dataset from list
cities = ze.Dataset(
"Cities",
data=[
{"name": "Paris", "population": 2_165_000},
{"name": "Berlin", "population": 3_769_000}
],
description="Example tabular dataset"
)
cities.push()
# Load dataset from CSV
ds = ze.Dataset("/path/to/data.csv")
ds.push() # Creates new version if dataset exists
mm = ze.Dataset(
    "Medical_Xray_Dataset",
    data=[{"patient_id": "P001", "symptoms": "Cough"}],
    description="Symptoms + chest X-ray"
)
mm.add_image(row_index=0, column_name="chest_xray", image_path="sample_images/p001.jpg")
mm.add_audio(row_index=0, column_name="verbal_notes", audio_path="notes/p001.wav")
mm.add_media_url(row_index=0, column_name="external_scan", media_url="https://example.com/scan.jpg", media_type="image")
mm.push()
# Load dataset directly from CSV file
dataset = ze.Dataset("data.csv")
Datasets support Python's iteration protocol:
# Basic iteration
for row in dataset:
    print(row.name, row.score)
# With enumerate
for i, row in enumerate(dataset):
print(f"Row {i}: {row.name}")
# List comprehensions
high_scores = [row for row in dataset if row.score > 90]
# Single item access (returns DotDict)
first_row = dataset[0]
last_row = dataset[-1]
# Slicing (returns new Dataset)
top_10 = dataset[:10]
bottom_5 = dataset[-5:]
middle = dataset[10:20]
# Sliced datasets can be processed independently
subset = dataset[:100]
results = subset.run(my_task)
subset.push() # Upload subset as new dataset
All rows support dot notation for cleaner code:
# Instead of row["column_name"]
value = row.column_name
# Works in tasks too
@ze.task(outputs=["length"])
def get_length(row):
return {"length": len(row.text)}import zeroeval as ze
ze.init()
# Pull dataset
dataset = ze.Dataset.pull("Capitals")
# Define task
@ze.task(outputs=["prediction"])
def uppercase_task(row):
return {"prediction": row["input"].upper()}
# Define evaluation
@ze.evaluation(mode="row", outputs=["exact_match"])
def exact_match(output, prediction):
return {"exact_match": int(output.upper() == prediction)}
@ze.evaluation(mode="dataset", outputs=["accuracy"])
def accuracy(exact_match_col):
return {"accuracy": sum(exact_match_col) / len(exact_match_col)}
# Run experiment
run = dataset.run(uppercase_task, workers=8)
run = run.score([exact_match, accuracy], output="output")
print(f"Accuracy: {run.metrics['accuracy']:.2%}")Advanced options:
# Multiple runs with ensemble
run = dataset.run(task, repeats=5, ensemble="majority", on="prediction")
# Pass@k evaluation
run = dataset.run(task, repeats=10, ensemble="pass@k", on="prediction", k=3)
# Custom aggregator
def best_length(values):
    return max(values, key=len) if values else ""
run = dataset.run(task, repeats=3, ensemble=best_length, on="prediction")
A shortened version (full listing in the docs):
import zeroeval as ze, openai, base64
from pathlib import Path
ze.init()
client = openai.OpenAI() # assumes env var OPENAI_API_KEY
def img_to_data_uri(path):
    data = Path(path).read_bytes()
    b64 = base64.b64encode(data).decode()
    return f"data:image/jpeg;base64,{b64}"
# Pull multimodal dataset
dataset = ze.Dataset.pull("Medical_Xray_Dataset")
# Define task
@ze.task(outputs=["diagnosis"])
def diagnose(row):
    messages = [
        {"role": "user", "content": [
            {"type": "text", "text": "Patient: " + row["symptoms"]},
            {"type": "image_url", "image_url": {"url": img_to_data_uri(row["chest_xray"])}}
        ]}
    ]
    response = client.chat.completions.create(model="gpt-4o-mini", messages=messages)
    return {"diagnosis": response.choices[0].message.content}
# Define evaluation
@ze.evaluation(mode="row", outputs=["contains_keyword"])
def check_keywords(diagnosis, expected_keywords):
    keywords = expected_keywords.lower().split(',')
    diagnosis_lower = diagnosis.lower()
    found = any(kw.strip() in diagnosis_lower for kw in keywords)
    return {"contains_keyword": int(found)}
# Run and evaluate
run = dataset.run(diagnose, workers=4)
run = run.score([check_keywords], expected_keywords="expected_keywords")
Track user feedback on prompt completions to improve your prompts with DSPy optimization:
import zeroeval as ze
# Initialize client
ze.init()
# Send positive feedback
feedback = ze.send_feedback(
prompt_slug="customer-support",
completion_id="completion-uuid-123",
thumbs_up=True,
reason="Excellent response, very helpful"
)
# Send negative feedback with expected output
feedback = ze.send_feedback(
prompt_slug="customer-support",
completion_id="completion-uuid-456",
thumbs_up=False,
reason="Response was too formal",
expected_output="Should be more casual and friendly",
metadata={"user_id": "user-789", "source": "production"}
)
send_feedback accepts the following parameters:
- prompt_slug (str, required) – The slug of the prompt
- completion_id (str, required) – UUID of the completion to provide feedback on
- thumbs_up (bool, required) – True for positive feedback, False for negative
- reason (str, optional) – Explanation of the feedback
- expected_output (str, optional) – Description of what the expected output should be. This field is automatically used by ZeroEval for tuning datasets and DSPy prompt optimization to create stronger training examples.
- metadata (dict, optional) – Additional metadata to attach to the feedback
Feedback submitted via send_feedback is automatically linked to the prompt version used for the completion. When you provide both reason and expected_output, ZeroEval creates stronger training examples for DSPy optimization:
- reason helps the optimizer understand what makes a response good or bad
- expected_output provides a concrete example of the ideal response, which DSPy uses to generate improved prompts
If the completion was traced with a span_id, the feedback is mirrored to your tuning datasets automatically, making it available for prompt optimization runs in the ZeroEval platform.
• Streaming responses – streaming guide: https://docs.zeroeval.com/streaming (coming soon)
• Deep observability – tracing guide: https://docs.zeroeval.com/tracing (coming soon)
• Framework integrations – see INTEGRATIONS.md for automatic OpenAI, LangChain, and LangGraph tracing
zeroeval setup # one-time API key config (auto-saves to shell config)
zeroeval run my_script.py # run a Python script that uses ZeroEval
poetry run pytest # run the test suite