update e2e test of qa built-in evaluation flow
qusongms committed Mar 25, 2024
commit 4d0c54ead0d0776310ae0fd39319a4f0f9601d43
@@ -20,7 +20,10 @@ def parse_single_sample(response: dict) -> list:
 
     # get content harm metric_value
     if 'label' in harm_response:
-        metric_value = harm_response['label']
+        try:
+            metric_value = int(harm_response['label'])
+        except Exception:
+            metric_value = harm_response['label']
     else:
         metric_value = np.nan
 
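The hunk above makes the content-harm label numeric when the service returns a number-like string, while still tolerating non-numeric output. A minimal sketch of that fallback behavior, under the assumption that labels arrive as strings (the helper name and sample payloads below are illustrative, not part of the SDK):

    # Illustrative only: how the try/except around int() behaves for a
    # numeric label versus a non-numeric one.
    def coerce_label(harm_response: dict):
        if 'label' in harm_response:
            try:
                return int(harm_response['label'])   # e.g. "2" -> 2
            except Exception:
                return harm_response['label']        # e.g. "n/a" stays a string
        return float('nan')                          # no label at all

    assert coerce_label({'label': '2'}) == 2
    assert coerce_label({'label': 'n/a'}) == 'n/a'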
@@ -21,10 +21,10 @@
 @pytest.mark.usefixtures("recorded_test")
 class TestEvaluate(AzureRecordedTestCase):
 
-    def test_evaluate_built_in_metrics(self, e2e_openai_api_base, e2e_openai_api_key, e2e_openai_completion_deployment_name, tmpdir):
+    def test_evaluate_built_in_metrics(self, ai_client, e2e_openai_api_base, e2e_openai_api_key, e2e_openai_completion_deployment_name, tmpdir):
         test_data = [
             {"context": "Some are reported as not having been wanted at all.",
-             "question": "",
+             "question": "are all reported as being wanted?",
              "answer": "All are reported as being completely and fully wanted."
              },
             {"question": "How do you log a model?",
@@ -35,12 +35,13 @@ def test_evaluate_built_in_metrics(self, e2e_openai_api_base, e2e_openai_api_key
 
         with tmpdir.as_cwd():
             output_path = tmpdir + "/evaluation_output"
+            tracking_uri = ai_client.tracking_uri
 
             result = evaluate( # This will log metric/artifacts using mlflow
                 evaluation_name="rag-chat-1",
                 data=test_data,
                 task_type="qa",
-                metrics_list=["gpt_groundedness"],
+                metrics_list=["gpt_groundedness", "gpt_relevance"],
                 model_config={
                     "api_version": "2023-07-01-preview",
                     "api_base": e2e_openai_api_base,
@@ -54,16 +55,20 @@ def test_evaluate_built_in_metrics(self, e2e_openai_api_base, e2e_openai_api_key
                     "y_pred": "answer",
                     "y_test": "truth",
                 },
+                tracking_uri=tracking_uri,
                 output_path=output_path
             )
 
             metrics_summary = result.metrics_summary
             tabular_result = pd.read_json(os.path.join(output_path, "eval_results.jsonl"), lines=True)
 
             assert "gpt_groundedness" in metrics_summary.keys()
             assert metrics_summary.get("gpt_groundedness") == np.nanmean(tabular_result["gpt_groundedness"])
-            assert tabular_result["gpt_groundedness"][0] in [1,2]
-            assert tabular_result["gpt_groundedness"][1] in [5, 4]
+            assert "gpt_relevance" in metrics_summary.keys()
+            assert metrics_summary.get("gpt_relevance") == np.nanmean(tabular_result["gpt_relevance"])
+            assert tabular_result["gpt_groundedness"][0] in [1, 2]
+            assert tabular_result["gpt_groundedness"][1] in ["n/a"]
+            assert tabular_result["gpt_relevance"][0] in [1, 2]
+            assert tabular_result["gpt_relevance"][1] in [4, 5]
 
     def test_evaluate_built_in_metrics_missing_model_config(self, tmpdir):
         test_data = [
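The assertions added in the hunk above check each summary value against np.nanmean of the corresponding per-row column, so rows the evaluator could not score do not drag the aggregate down. A small, self-contained illustration of that aggregation (the sample scores are made up):

    import numpy as np

    # Per-row relevance-style scores; NaN marks a row that could not be scored.
    row_scores = [4.0, 5.0, np.nan]
    assert np.nanmean(row_scores) == 4.5   # NaN rows are ignored in the summary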
@@ -85,7 +90,7 @@ def test_evaluate_built_in_metrics_missing_model_config(self, tmpdir):
                 evaluation_name="rag-chat-1",
                 data=test_data,
                 task_type="qa",
-                metrics_list=["gpt_groundedness"],
+                metrics_list=["gpt_relevance"],
                 data_mapping={
                     "question": "question",
                     "context": "context",
@@ -115,7 +120,7 @@ def test_evaluate_built_in_content_safety_metrics(self, ai_client, tmpdir):
                 evaluation_name="rag-chat-1",
                 data=test_data,
                 task_type="qa",
-                metrics_list=["violence"],
+                metrics_list=["hate_unfairness"],
                 data_mapping={
                     "question": "question",
                     "context": "context",
@@ -129,10 +134,10 @@ def test_evaluate_built_in_content_safety_metrics(self, ai_client, tmpdir):
             metrics_summary = result.metrics_summary
             tabular_result = pd.read_json(os.path.join(output_path, "eval_results.jsonl"), lines=True)
 
-            assert "violence_defect_rate" in metrics_summary.keys()
-            assert tabular_result["violence"][0] in ["Safe"]
-            assert tabular_result["violence_score"][0] in [0.0]
-            assert tabular_result["violence_reasoning"] is not None
+            assert "hate_unfairness_defect_rate" in metrics_summary.keys()
+            assert tabular_result["hate_unfairness"][0] in ["Very low"]
+            assert tabular_result["hate_unfairness_score"][0] in [0.0]
+            assert tabular_result["hate_unfairness_reason"] is not None
 
 
     def test_duplicate_metrics_name(self, e2e_openai_api_base, e2e_openai_api_key, e2e_openai_completion_deployment_name, tmpdir):
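The content-safety assertions above expect a severity label, a numeric score, and a reason per row, plus a defect rate in the summary. A quick way to eyeball that output after a run might look like the sketch below; the path and column names simply mirror what the test asserts and are not an SDK guarantee:

    import pandas as pd

    # Inspect the tabular results the test reads back from the output folder.
    df = pd.read_json("evaluation_output/eval_results.jsonl", lines=True)
    print(df[["hate_unfairness", "hate_unfairness_score", "hate_unfairness_reason"]].head())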
@@ -144,7 +149,7 @@ def test_duplicate_metrics_name(self, e2e_openai_api_base, e2e_openai_api_key, e
         ]
 
         from azure.ai.generative.evaluate.metrics import PromptMetric
-        custom_prompt_metric = PromptMetric.from_template(path="test_template.jinja2", name="gpt_groundedness")
+        custom_prompt_metric = PromptMetric.from_template(path="test_template.jinja2", name="gpt_relevance")
 
         with pytest.raises(Exception) as ex:
             output_path = tmpdir + "/evaluation_output"
@@ -153,7 +158,7 @@ def test_duplicate_metrics_name(self, e2e_openai_api_base, e2e_openai_api_key, e
                 evaluation_name="rag-chat-1",
                 data=test_data,
                 task_type="qa",
-                metrics_list=["gpt_groundedness", custom_prompt_metric],
+                metrics_list=["gpt_relevance", custom_prompt_metric],
                 model_config={
                     "api_version": "2023-07-01-preview",
                     "api_base": e2e_openai_api_base,
@@ -308,7 +313,7 @@ def test_missing_data(self, e2e_openai_api_base, e2e_openai_api_key, e2e_openai_
                 evaluation_name="rag-chat-1",
                 data=data_file,
                 task_type="qa",
-                metrics_list=[custom_prompt_metric, "gpt_groundedness"],
+                metrics_list=[custom_prompt_metric, "gpt_relevance"],
                 model_config={
                     "api_version": "2023-07-01-preview",
                     "api_base": "base", #e2e_openai_api_base,