update built-in qa evaluation flow

* add flight control to flow input * code flake8 cleaning
Azure · singankit · Mar 26, 2024 · Mar 15, 2024 · Mar 16, 2024 · Mar 19, 2024
commit f9f104c80e3a7ad6da711dcfe6a41db50b77f3eb
@@ -6,25 +6,31 @@
 
 
 @tool
-def aggregate_results(results: List[dict], selected_metrics: List[dict], thresholds: List[int]) -> dict:
+def aggregate_results(results: List[dict],
+                      selected_metrics: List[dict],
+                      thresholds: List[int]) -> dict:
     if selected_metrics:
-        selected_safety_metrics = filter_metrics(selected_metrics[0]["safety_metrics"])
-        selected_quality_metrics = filter_metrics(selected_metrics[0]["quality_metrics"])
+        selected_safety_metrics = filter_metrics(
+            selected_metrics[0]["safety_metrics"])
+        selected_quality_metrics = filter_metrics(
+            selected_metrics[0]["quality_metrics"])
     else:
         selected_safety_metrics = []
         selected_quality_metrics = []
 
     if thresholds != [] and thresholds is not None:
         threshold = np.float16(thresholds[0])
     else:
-        threshold = np.float16(RAIService.HARM_SEVERITY_THRESHOLD)
+        threshold = np.float16(
+            RAIService.HARM_SEVERITY_THRESHOLD)
 
     aggregate_results = {}
     for result in results:
         if not result:
             continue
         for name in result.keys():
-            if name in selected_quality_metrics or name in selected_safety_metrics:
+            if name in selected_quality_metrics \
+                    or name in selected_safety_metrics:
                 if name not in aggregate_results.keys():
                     aggregate_results[name] = []
                 metric_value = result[name]
@@ -47,7 +53,8 @@ def aggregate_results(results: List[dict], selected_metrics: List[dict], thresho
             if name in selected_quality_metrics:
                 aggregate_output[metric_name] = round(np.nanmean(values), 2)
             elif name in selected_safety_metrics:
-                aggregate_output[metric_name] = round(np.sum(values >= threshold) / len(values), 2)
+                aggregate_output[metric_name] = round(
+                    np.sum(values >= threshold) / len(values), 2)
             else:
                 aggregate_output[metric_name] = np.nan
         log_metric(metric_name, aggregate_output[metric_name])

@@ -1,11 +1,7 @@
 from promptflow import tool
 from rai_client import RAIServiceHandler
 
-# The inputs section will change based on the
-# arguments of the tool function, after you save the code
-# Adding type to arguments and return value will help
-# the system show the types properly
-# Please update the function name/signature per need
+
 @tool
 def call_groundedness_service(request_body: dict) -> [dict]:
     service_handler = RAIServiceHandler()

@@ -2,14 +2,8 @@
 from rai_client import RAIServiceHandler
 
 
-# The inputs section will change based on the arguments
-# of the tool function, after you save the code
-# Adding type to arguments and return value will help
-# the system show the types properly
-# Please update the function name/signature per need
 @tool
 def call_rai_service(request_body: dict) -> dict:
     service_handler = RAIServiceHandler()
     annotation_results = service_handler.get_annotation(request_body)
     return annotation_results
-
@@ -27,7 +27,9 @@ def concat_results(gpt_coherence_score: str = None,
                 score = float(item["score"])
             except Exception as e:
                 score = np.nan
-                errors.append({"name": item["name"], "msg":   str(e), "data": item["score"]})
+                errors.append({"name": item["name"],
+                               "msg": str(e),
+                               "data": item["score"]})
         else:
             if item['score']:
                 try:
@@ -39,15 +41,19 @@ def concat_results(gpt_coherence_score: str = None,
                         score = np.nan
                 except Exception as e:
                     score = np.nan
-                    errors.append({"name": item["name"], "msg":   str(e), "data": item["score"]})
+                    errors.append({"name": item["name"],
+                                   "msg": str(e),
+                                   "data": item["score"]})
             else:
                 score = np.nan
-        score_list.append({"name": item["name"], "score": score})
+        score_list.append({"name": item["name"],
+                           "score": score})
 
     variant_level_result = {}
     for item in score_list:
         item_name = str(item["name"])
         variant_level_result[item_name] = item["score"]
         if 'gpt' in item_name:
-            variant_level_result[item_name + '_pass_rate'] = 1 if item["score"] > 3 else 0
+            variant_level_result[item_name + '_pass_rate'] = 1 \
+                if item["score"] > 3 else 0
     return variant_level_result
@@ -2,6 +2,7 @@
 import constants
 import numpy as np
 
+
 def default_safety_results():
     supported_metrics = constants.Metric.CONTENT_HARM_METRICS
     result = {}
@@ -27,16 +28,11 @@ def default_groundedness_results():
             }
 
 
-# The inputs section will change based on the arguments
-# of the tool function, after you save the code
-# Adding type to arguments and return value will help
-# the system show the types properly
-# Please update the function name/signature per need
 @tool
-def concat_results(selected_metrics: dict, 
-        quality_results: dict = None, 
-        safety_results: dict = None,
-        groundedness_results: dict = None) -> dict:
+def concat_results(selected_metrics: dict,
+                   quality_results: dict = None,
+                   safety_results: dict = None,
+                   groundedness_results: dict = None) -> dict:
     if quality_results:
         concated_results = quality_results.copy()
     else:
@@ -51,4 +47,3 @@ def concat_results(selected_metrics: dict,
         concated_results.update(default_safety_results())
 
     return concated_results
-
@@ -1,4 +1,3 @@
-import sys
 from enum import Enum
 
 
@@ -22,9 +21,9 @@ class Metric:
     QUALITY_METRICS = {
         "gpt_groundedness",
         "gpt_similarity",
-        "gpt_fluency", 
+        "gpt_fluency",
         "gpt_coherence",
-        "gpt_relevance", 
+        "gpt_relevance",
         "f1_score"
         }
 
@@ -46,4 +45,4 @@ class HarmSeverityLevel(Enum):
 
 class Tasks:
     """Defines types of annotation tasks supported by RAI Service."""
-    CONTENT_HARM = "content harm"
+    CONTENT_HARM = "content harm"
@@ -6,19 +6,16 @@ def normalize_user_text(user_text):
     return user_text.replace("'", "\\\"")
 
 
-# The inputs section will change based on the arguments
-# of the tool function, after you save the code
-# Adding type to arguments and return value will help
-# the system show the types properly
-# Please update the function name/signature per need
 @tool
-def construct_request(question: str, answer:str, context: str) -> dict:
+def construct_request(question: str,
+                      answer: str,
+                      context: str) -> dict:
     metrics = ["generic_groundedness"]
-    user_text = json.dumps({"question": question, 
-                 "answer": answer,
-                 "context": context})
+    user_text = json.dumps({"question": question,
+                            "answer": answer,
+                            "context": context})
     parsed_user_text = normalize_user_text(user_text)
     request_body = {"UserTextList": [parsed_user_text],
                     "AnnotationTask": "groundedness",
-                    "MetricList":metrics}
+                    "MetricList": metrics}
     return request_body
@@ -5,19 +5,17 @@ def normalize_user_text(user_text):
     return user_text.replace("'", "\\\"")
 
 
-# The inputs section will change based on the arguments
-# of the tool function, after you save the code
-# Adding type to arguments and return value will help
-# the system show the types properly
-# Please update the function name/signature per need
 @tool
-def construct_request(question: str, answer:str, selected_metrics: dict) -> dict:
+def construct_request(question: str,
+                      answer: str,
+                      selected_metrics: dict) -> dict:
     selected_safety_metrics = selected_metrics["safety_metrics"]
-    metrics = [metric.replace("_unfairness", "_fairness") for metric in\
-        selected_safety_metrics if selected_safety_metrics[metric]]
+    metrics = [metric.replace("_unfairness", "_fairness") for metric in
+               selected_safety_metrics if selected_safety_metrics[metric]]
     user_text = f"<Human>{question}</><System>{answer}</>"
     parsed_user_text = normalize_user_text(user_text)
     request_body = {"UserTextList": [parsed_user_text],
                     "AnnotationTask": "content harm",
-                    "MetricList":metrics}
+                    "MetricList": metrics,
+                    "PromptVersion": "0.2"}
     return request_body
@@ -33,7 +33,8 @@ def remove_punctuation(text):
         def lower(text):
             return text.lower()
 
-        return white_space_fix(remove_articles(remove_punctuation(lower(text))))
+        return white_space_fix(
+            remove_articles(remove_punctuation(lower(text))))
     prediction_tokens = normalize_text(answer)
     reference_tokens = normalize_text(ground_truth)
     tokenizer = QASplitTokenizer()

@@ -23,6 +23,10 @@ inputs:
     type: int
     default: 4
     is_chat_input: false
+  groundedness_service_flight:
+    type: bool
+    default: true
+    is_chat_input: false
 outputs:
   gpt_coherence:
     type: string
@@ -293,6 +297,7 @@ nodes:
   inputs:
     answer: ${inputs.answer}
     context: ${inputs.context}
+    flight: ${inputs.groundedness_service_flight}
     question: ${inputs.question}
     selected_metrics: ${select_metrics.output}
     validate_input_result: ${validate_input.output}
@@ -330,7 +335,6 @@ nodes:
     batch_response: ${call_groundedness_service.output}
     is_service_available: ${validate_service.output}
     llm_groundedness_response: ${gpt_groundedness.output}
-    selected_label_keys: ${select_metrics.output}
   use_variants: false
 - name: gpt_groundedness
   type: llm

@@ -4,19 +4,17 @@
 import re
 
 
-def parse_single_sample(response: dict, selected_metrics: dict) -> list:
-    selected_label_keys = selected_metrics["quality_metrics"]
+def parse_single_sample(response: dict) -> list:
     parsed_response = []
     for key in response:
-        #if selected_label_keys[key]:
         harm_type = key.replace("generic", "gpt")
         parsed_harm_response = {}
         try:
             harm_response = eval(response[key])
-        except:
+        except Exception:
             harm_response = response[key]
         if harm_response != "" and isinstance(harm_response, dict):
-            ### check if "output" is one key in harm_response
+            # check if "output" is one key in harm_response
             if "output" in harm_response:
                 harm_response = harm_response["output"]
 
@@ -25,7 +23,7 @@ def parse_single_sample(response: dict, selected_metrics: dict) -> list:
                 metric_value = harm_response['label']
             else:
                 metric_value = np.nan
-            
+
             # get reasoning
             if "reasoning" in harm_response:
                 reasoning = harm_response['reasoning']
@@ -40,7 +38,8 @@ def parse_single_sample(response: dict, selected_metrics: dict) -> list:
             else:
                 metric_value = np.nan
             reasoning = harm_response
-        elif harm_response != "" and (isinstance(harm_response, int) or isinstance(harm_response, float)):
+        elif harm_response != "" and (isinstance(harm_response, int)
+                                      or isinstance(harm_response, float)):
             if harm_response >= 0 and harm_response <= 7:
                 metric_value = harm_response
             else:
@@ -55,8 +54,8 @@ def parse_single_sample(response: dict, selected_metrics: dict) -> list:
     return parsed_response
 
 
-def parse_groundedness_llm_response(llm_groundedness_response = None) -> dict:
-    item = {'name': 'gpt_groundedness', 
+def parse_groundedness_llm_response(llm_groundedness_response=None) -> dict:
+    item = {'name': 'gpt_groundedness',
             'score': llm_groundedness_response}
     if item['score']:
         try:
@@ -66,36 +65,26 @@ def parse_groundedness_llm_response(llm_groundedness_response = None) -> dict:
                 score = float(match.group())
             else:
                 score = np.nan
-        except Exception as e:
+        except Exception:
             score = np.nan
-            errors.append({
-                "name": item["name"], 
-                "msg":   str(e), "data": item["score"]})
     else:
         score = np.nan
     return {"gpt_groundedness": score,
             "gpt_groundedness_reason": np.nan}
-
 
 
-# The inputs section will change based on the arguments
-# of the tool function, after you save the code
-# Adding type to arguments and return value will help
-# the system show the types properly
-# Please update the function name/signature per need
 @tool
-def parse_response(selected_label_keys: dict,
-                   is_service_available: dict,
+def parse_response(is_service_available: dict,
                    llm_groundedness_response: dict = None,
                    batch_response: List[dict] = None):
     parsed_single_sample_response = None
     if is_service_available["groundedness_service"]:
         if batch_response:
             single_sample_response = batch_response[0]
             parsed_single_sample_response = parse_single_sample(
-                single_sample_response,
-                selected_label_keys)[0]
+                single_sample_response)[0]
     else:
-        parsed_single_sample_response = parse_groundedness_llm_response(llm_groundedness_response)
+        parsed_single_sample_response = \
+            parse_groundedness_llm_response(llm_groundedness_response)
 
     return parsed_single_sample_response