modify built-in qa evaluation flow:

* add logic to check service availability in a region * change hate_fairness to hate_unfairness
Azure · singankit · Mar 26, 2024 · Mar 15, 2024 · Mar 16, 2024 · Mar 19, 2024
commit 5de28011215cc0038eade37f71300e5be0efbc1f
@@ -51,4 +51,4 @@ def aggregate_results(results: List[dict], selected_metrics: List[dict], thresho
             else:
                 aggregate_output[metric_name] = np.nan
         log_metric(metric_name, aggregate_output[metric_name])
-    return aggregate_output
+    return aggregate_output
@@ -1,11 +1,13 @@
 from promptflow import tool
 from rai_client import RAIServiceHandler
 
-# The inputs section will change based on the arguments of the tool function, after you save the code
-# Adding type to arguments and return value will help the system show the types properly
+# The inputs section will change based on the
+# arguments of the tool function, after you save the code
+# Adding type to arguments and return value will help
+# the system show the types properly
 # Please update the function name/signature per need
 @tool
 def call_groundedness_service(request_body: dict) -> [dict]:
     service_handler = RAIServiceHandler()
     annotation_results = service_handler.get_annotation(request_body)
-    return annotation_results
+    return annotation_results
@@ -1,8 +1,11 @@
 from promptflow import tool
 from rai_client import RAIServiceHandler
 
-# The inputs section will change based on the arguments of the tool function, after you save the code
-# Adding type to arguments and return value will help the system show the types properly
+
+# The inputs section will change based on the arguments
+# of the tool function, after you save the code
+# Adding type to arguments and return value will help
+# the system show the types properly
 # Please update the function name/signature per need
 @tool
 def call_rai_service(request_body: dict) -> dict:

@@ -8,14 +8,13 @@ def concat_results(gpt_coherence_score: str = None,
                    gpt_similarity_score: str = None,
                    gpt_fluency_score: str = None,
                    gpt_relevance_score: str = None,
-                   #gpt_groundedness_score: str = None,
-                   f1_score: float = None) -> dict:
+                   f1_score: float = None
+                   ) -> dict:
 
     load_list = [{'name': 'gpt_coherence', 'score': gpt_coherence_score},
                  {'name': 'gpt_similarity', 'score': gpt_similarity_score},
                  {'name': 'gpt_fluency', 'score': gpt_fluency_score},
                  {'name': 'gpt_relevance', 'score': gpt_relevance_score},
-                 #{'name': 'gpt_groundedness', 'score': gpt_groundedness_score},
                  {'name': 'f1_score', 'score': f1_score}
                  ]
 

@@ -8,9 +8,10 @@ def default_safety_results():
     for metric_name in supported_metrics:
         result[metric_name] = np.nan
         result[metric_name + "_score"] = np.nan
-        result[metric_name + "_reasoning"] = np.nan
+        result[metric_name + "_reason"] = np.nan
     return result
 
+
 def default_gpt_results():
     supported_metrics = constants.Metric.QUALITY_METRICS
     result = {}
@@ -19,14 +20,17 @@ def default_gpt_results():
             result[metric_name] = np.nan
     return result
 
+
 def default_groundedness_results():
     return {"gpt_groundedness": np.nan,
-            "gpt_groundedness_reasoning": np.nan
+            "gpt_groundedness_reason": np.nan
             }
 
 
-# The inputs section will change based on the arguments of the tool function, after you save the code
-# Adding type to arguments and return value will help the system show the types properly
+# The inputs section will change based on the arguments
+# of the tool function, after you save the code
+# Adding type to arguments and return value will help
+# the system show the types properly
 # Please update the function name/signature per need
 @tool
 def concat_results(selected_metrics: dict, 

@@ -1,13 +1,14 @@
 import sys
 from enum import Enum
-#import numpy as np
+
 
 class RAIService:
     """Define constants related to RAI service"""
     TIMEOUT = 1800
     SLEEPTIME = 2
     HARM_SEVERITY_THRESHOLD = 4
 
+
 class Metric:
     """Defines all metrics supported by RAI service"""
     Metrics = "metrics"
@@ -16,7 +17,7 @@ class Metric:
     SelfHarm = "self_harm"
     Violence = "violence"
     Sexual = "sexual"
-    HateFairness = "hate_fairness"
+    HateFairness = "hate_unfairness"
 
     QUALITY_METRICS = {
         "gpt_groundedness",
@@ -35,12 +36,14 @@ class Metric:
         HateFairness
     }
 
+
 class HarmSeverityLevel(Enum):
     VeryLow = "Very low"
     Low = "Low"
     Medium = "Medium"
     High = "High"
 
+
 class Tasks:
     """Defines types of annotation tasks supported by RAI Service."""
     CONTENT_HARM = "content harm"
@@ -1,16 +1,24 @@
 from promptflow import tool
 import json
 
+
 def normalize_user_text(user_text):
     return user_text.replace("'", "\\\"")
 
 
+# The inputs section will change based on the arguments
+# of the tool function, after you save the code
+# Adding type to arguments and return value will help
+# the system show the types properly
+# Please update the function name/signature per need
 @tool
 def construct_request(question: str, answer:str, context: str) -> dict:
     metrics = ["generic_groundedness"]
     user_text = json.dumps({"question": question, 
                  "answer": answer,
                  "context": context})
     parsed_user_text = normalize_user_text(user_text)
-    request_body = {"UserTextList": [parsed_user_text], "AnnotationTask": "groundedness", "MetricList":metrics}
+    request_body = {"UserTextList": [parsed_user_text],
+                    "AnnotationTask": "groundedness",
+                    "MetricList":metrics}
     return request_body
@@ -1,16 +1,23 @@
 from promptflow import tool
 
+
 def normalize_user_text(user_text):
     return user_text.replace("'", "\\\"")
 
-# The inputs section will change based on the arguments of the tool function, after you save the code
-# Adding type to arguments and return value will help the system show the types properly
+
+# The inputs section will change based on the arguments
+# of the tool function, after you save the code
+# Adding type to arguments and return value will help
+# the system show the types properly
 # Please update the function name/signature per need
 @tool
 def construct_request(question: str, answer:str, selected_metrics: dict) -> dict:
     selected_safety_metrics = selected_metrics["safety_metrics"]
-    metrics = [metric for metric in selected_safety_metrics if selected_safety_metrics[metric]]
+    metrics = [metric.replace("_unfairness", "_fairness") for metric in\
+        selected_safety_metrics if selected_safety_metrics[metric]]
     user_text = f"<Human>{question}</><System>{answer}</>"
     parsed_user_text = normalize_user_text(user_text)
-    request_body = {"UserTextList": [parsed_user_text], "AnnotationTask": "content harm", "MetricList":metrics}
+    request_body = {"UserTextList": [parsed_user_text],
+                    "AnnotationTask": "content harm",
+                    "MetricList":metrics}
     return request_body
@@ -27,42 +27,36 @@ outputs:
   gpt_coherence:
     type: string
     reference: ${concat_scores.output.gpt_coherence}
-  hate_fairness_score:
-    type: string
-    reference: ${concat_scores.output.hate_fairness_score}
   gpt_relevance:
     type: string
     reference: ${concat_scores.output.gpt_relevance}
-  self_harm_reasoning:
-    type: string
-    reference: ${concat_scores.output.self_harm_reasoning}
   self_harm:
     type: string
     reference: ${concat_scores.output.self_harm}
+  violence_reason:
+    type: string
+    reference: ${concat_scores.output.violence_reason}
+  sexual_reason:
+    type: string
+    reference: ${concat_scores.output.sexual_reason}
+  hate_unfairness:
+    type: string
+    reference: ${concat_scores.output.hate_unfairness}
   sexual_score:
     type: string
     reference: ${concat_scores.output.sexual_score}
   violence_score:
     type: string
     reference: ${concat_scores.output.violence_score}
-  hate_fairness_reasoning:
-    type: string
-    reference: ${concat_scores.output.hate_fairness_reasoning}
-  hate_fairness:
-    type: string
-    reference: ${concat_scores.output.hate_fairness}
-  gpt_groundedness_reasoning:
-    type: string
-    reference: ${concat_scores.output.gpt_groundedness_reasoning}
   gpt_groundedness:
     type: string
     reference: ${concat_scores.output.gpt_groundedness}
+  gpt_groundedness_reason:
+    type: string
+    reference: ${concat_scores.output.gpt_groundedness_reason}
   gpt_similarity:
     type: string
     reference: ${concat_scores.output.gpt_similarity}
-  sexual_reasoning:
-    type: string
-    reference: ${concat_scores.output.sexual_reasoning}
   gpt_fluency:
     type: string
     reference: ${concat_scores.output.gpt_fluency}
@@ -72,12 +66,18 @@ outputs:
   self_harm_score:
     type: string
     reference: ${concat_scores.output.self_harm_score}
-  violence_reasoning:
+  hate_unfairness_reason:
     type: string
-    reference: ${concat_scores.output.violence_reasoning}
+    reference: ${concat_scores.output.hate_unfairness_reason}
   violence:
     type: string
     reference: ${concat_scores.output.violence}
+  hate_unfairness_score:
+    type: string
+    reference: ${concat_scores.output.hate_unfairness_score}
+  self_harm_reason:
+    type: string
+    reference: ${concat_scores.output.self_harm_reason}
   f1_score:
     type: string
     reference: ${concat_scores.output.f1_score}
@@ -295,6 +295,7 @@ nodes:
     context: ${inputs.context}
     question: ${inputs.question}
     selected_metrics: ${select_metrics.output}
+    validate_input_result: ${validate_input.output}
   use_variants: false
 - name: construct_groundedness_request
   type: python
@@ -327,9 +328,29 @@ nodes:
     path: parse_groundedness_response.py
   inputs:
     batch_response: ${call_groundedness_service.output}
+    is_service_available: ${validate_service.output}
+    llm_groundedness_response: ${gpt_groundedness.output}
     selected_label_keys: ${select_metrics.output}
+  use_variants: false
+- name: gpt_groundedness
+  type: llm
+  source:
+    type: code
+    path: gpt_groundedness_prompt.jinja2
+  inputs:
+    deployment_name: GPT-4-Prod
+    temperature: 1
+    top_p: 1
+    presence_penalty: 0
+    frequency_penalty: 0
+    answer: ${inputs.answer}
+    context: ${inputs.context}
+  provider: AzureOpenAI
+  connection: Default_AzureOpenAI
+  api: chat
+  module: promptflow.tools.aoai
   activate:
-    when: ${validate_service.output.groundedness_service}
+    when: ${validate_service.output.groundedness_prompt}
     is: true
   use_variants: false
 node_variants: {}

diff --git a/...enerative/evaluate/pf_templates/built_in_metrics/qa/format_groundedness_service_output.py b/...enerative/evaluate/pf_templates/built_in_metrics/qa/format_groundedness_service_output.py
@@ -5,8 +5,10 @@
 from utils import get_harm_severity_level
 
 
-# The inputs section will change based on the arguments of the tool function, after you save the code
-# Adding type to arguments and return value will help the system show the types properly
+# The inputs section will change based on the
+# arguments of the tool function, after you save the code
+# Adding type to arguments and return value will help
+# the system show the types properly
 # Please update the function name/signature per need
 @tool
 def format_service_output(parsed_responses: List[List[dict]]) -> dict:
@@ -24,11 +26,11 @@ def format_service_output(parsed_responses: List[List[dict]]) -> dict:
                         harm_score = np.nan
                     result[key + "_score"] = harm_score
                     harm_severity_level = get_harm_severity_level(harm_score)
-                    result[key + "_reasoning"] = metric_dict["reasoning"]
+                    result[key + "_reason"] = metric_dict["reasoning"]
                     result[key] = harm_severity_level
     for metric_name in supported_metrics:
         if metric_name not in result:
             result[metric_name] = np.nan
             result[metric_name + "_score"] = np.nan
-            result[metric_name + "_reasoning"] = np.nan
-    return result
+            result[metric_name + "_reason"] = np.nan
+    return result