update e2e test of qa built-in evaluation flow
qusongms committed Mar 25, 2024
commit 4d0c54ead0d0776310ae0fd39319a4f0f9601d43
@@ -20,7 +20,10 @@ def parse_single_sample(response: dict) -> list:
 
     # get content harm metric_value
     if 'label' in harm_response:
-        metric_value = harm_response['label']
+        try:
+            metric_value = int(harm_response['label'])
+        except Exception:
+            metric_value = harm_response['label']
     else:
         metric_value = np.nan
 
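The hunk above makes the content-harm label numeric when the service returns a number-like string, while still tolerating non-numeric output. A minimal sketch of that fallback behavior, under the assumption that labels arrive as strings (the helper name and sample payloads below are illustrative, not part of the SDK):

    # Illustrative only: how the try/except around int() behaves for a
    # numeric label versus a non-numeric one.
    def coerce_label(harm_response: dict):
        if 'label' in harm_response:
            try:
                return int(harm_response['label'])   # e.g. "2" -> 2
            except Exception:
                return harm_response['label']        # e.g. "n/a" stays a string
        return float('nan')                          # no label at all

    assert coerce_label({'label': '2'}) == 2
    assert coerce_label({'label': 'n/a'}) == 'n/a'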
@@ -21,10 +21,10 @@
 @pytest.mark.usefixtures("recorded_test")
 class TestEvaluate(AzureRecordedTestCase):
 
-    def test_evaluate_built_in_metrics(self, e2e_openai_api_base, e2e_openai_api_key, e2e_openai_completion_deployment_name, tmpdir):
+    def test_evaluate_built_in_metrics(self, ai_client, e2e_openai_api_base, e2e_openai_api_key, e2e_openai_completion_deployment_name, tmpdir):
         test_data = [
             {"context": "Some are reported as not having been wanted at all.",
-             "question": "",
+             "question": "are all reported as being wanted?",
              "answer": "All are reported as being completely and fully wanted."
              },
             {"question": "How do you log a model?",
@@ -35,12 +35,13 @@ def test_evaluate_built_in_metrics(self, e2e_openai_api_base, e2e_openai_api_key
 
         with tmpdir.as_cwd():
             output_path = tmpdir + "/evaluation_output"
+            tracking_uri = ai_client.tracking_uri
 
             result = evaluate( # This will log metric/artifacts using mlflow
                 evaluation_name="rag-chat-1",
                 data=test_data,
                 task_type="qa",
-                metrics_list=["gpt_groundedness"],
+                metrics_list=["gpt_groundedness", "gpt_relevance"],
                 model_config={
                     "api_version": "2023-07-01-preview",
                     "api_base": e2e_openai_api_base,
@@ -54,16 +55,20 @@ def test_evaluate_built_in_metrics(self, e2e_openai_api_base, e2e_openai_api_key
                     "y_pred": "answer",
                     "y_test": "truth",
                 },
+                tracking_uri=tracking_uri,
                 output_path=output_path
             )
 
             metrics_summary = result.metrics_summary
             tabular_result = pd.read_json(os.path.join(output_path, "eval_results.jsonl"), lines=True)
 
             assert "gpt_groundedness" in metrics_summary.keys()
             assert metrics_summary.get("gpt_groundedness") == np.nanmean(tabular_result["gpt_groundedness"])
-            assert tabular_result["gpt_groundedness"][0] in [1,2]
-            assert tabular_result["gpt_groundedness"][1] in [5, 4]
+            assert "gpt_relevance" in metrics_summary.keys()
+            assert metrics_summary.get("gpt_relevance") == np.nanmean(tabular_result["gpt_relevance"])
+            assert tabular_result["gpt_groundedness"][0] in [1, 2]
+            assert tabular_result["gpt_groundedness"][1] in ["n/a"]
+            assert tabular_result["gpt_relevance"][0] in [1, 2]
+            assert tabular_result["gpt_relevance"][1] in [4, 5]
 
     def test_evaluate_built_in_metrics_missing_model_config(self, tmpdir):
         test_data = [
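The assertions added in the hunk above check each summary value against np.nanmean of the corresponding per-row column, so rows the evaluator could not score do not drag the aggregate down. A small, self-contained illustration of that aggregation (the sample scores are made up):

    import numpy as np

    # Per-row relevance-style scores; NaN marks a row that could not be scored.
    row_scores = [4.0, 5.0, np.nan]
    assert np.nanmean(row_scores) == 4.5   # NaN rows are ignored in the summary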
@@ -85,7 +90,7 @@ def test_evaluate_built_in_metrics_missing_model_config(self, tmpdir):
                 evaluation_name="rag-chat-1",
                 data=test_data,
                 task_type="qa",
-                metrics_list=["gpt_groundedness"],
+                metrics_list=["gpt_relevance"],
                 data_mapping={
                     "question": "question",
                     "context": "context",
@@ -115,7 +120,7 @@ def test_evaluate_built_in_content_safety_metrics(self, ai_client, tmpdir):
                 evaluation_name="rag-chat-1",
                 data=test_data,
                 task_type="qa",
-                metrics_list=["violence"],
+                metrics_list=["hate_unfairness"],
                 data_mapping={
                     "question": "question",
                     "context": "context",
@@ -129,10 +134,10 @@ def test_evaluate_built_in_content_safety_metrics(self, ai_client, tmpdir):
             metrics_summary = result.metrics_summary
             tabular_result = pd.read_json(os.path.join(output_path, "eval_results.jsonl"), lines=True)
 
-            assert "violence_defect_rate" in metrics_summary.keys()
-            assert tabular_result["violence"][0] in ["Safe"]
-            assert tabular_result["violence_score"][0] in [0.0]
-            assert tabular_result["violence_reasoning"] is not None
+            assert "hate_unfairness_defect_rate" in metrics_summary.keys()
+            assert tabular_result["hate_unfairness"][0] in ["Very low"]
+            assert tabular_result["hate_unfairness_score"][0] in [0.0]
+            assert tabular_result["hate_unfairness_reason"] is not None
 
 
     def test_duplicate_metrics_name(self, e2e_openai_api_base, e2e_openai_api_key, e2e_openai_completion_deployment_name, tmpdir):
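The content-safety assertions above expect a severity label, a numeric score, and a reason per row, plus a defect rate in the summary. A quick way to eyeball that output after a run might look like the sketch below; the path and column names simply mirror what the test asserts and are not an SDK guarantee:

    import pandas as pd

    # Inspect the tabular results the test reads back from the output folder.
    df = pd.read_json("evaluation_output/eval_results.jsonl", lines=True)
    print(df[["hate_unfairness", "hate_unfairness_score", "hate_unfairness_reason"]].head())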
@@ -144,7 +149,7 @@ def test_duplicate_metrics_name(self, e2e_openai_api_base, e2e_openai_api_key, e
         ]
 
         from azure.ai.generative.evaluate.metrics import PromptMetric
-        custom_prompt_metric = PromptMetric.from_template(path="test_template.jinja2", name="gpt_groundedness")
+        custom_prompt_metric = PromptMetric.from_template(path="test_template.jinja2", name="gpt_relevance")
 
         with pytest.raises(Exception) as ex:
             output_path = tmpdir + "/evaluation_output"
@@ -153,7 +158,7 @@ def test_duplicate_metrics_name(self, e2e_openai_api_base, e2e_openai_api_key, e
                 evaluation_name="rag-chat-1",
                 data=test_data,
                 task_type="qa",
-                metrics_list=["gpt_groundedness", custom_prompt_metric],
+                metrics_list=["gpt_relevance", custom_prompt_metric],
                 model_config={
                     "api_version": "2023-07-01-preview",
                     "api_base": e2e_openai_api_base,
@@ -308,7 +313,7 @@ def test_missing_data(self, e2e_openai_api_base, e2e_openai_api_key, e2e_openai_
                 evaluation_name="rag-chat-1",
                 data=data_file,
                 task_type="qa",
-                metrics_list=[custom_prompt_metric, "gpt_groundedness"],
+                metrics_list=[custom_prompt_metric, "gpt_relevance"],
                 model_config={
                     "api_version": "2023-07-01-preview",
                     "api_base": "base", #e2e_openai_api_base,