Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
update built-in qa evaluation flow
* add flight control to flow input
* code flake8 cleaning
  • Loading branch information
qusongms committed Mar 22, 2024
commit f9f104c80e3a7ad6da711dcfe6a41db50b77f3eb
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,31 @@


@tool
def aggregate_results(results: List[dict], selected_metrics: List[dict], thresholds: List[int]) -> dict:
def aggregate_results(results: List[dict],
selected_metrics: List[dict],
thresholds: List[int]) -> dict:
if selected_metrics:
selected_safety_metrics = filter_metrics(selected_metrics[0]["safety_metrics"])
selected_quality_metrics = filter_metrics(selected_metrics[0]["quality_metrics"])
selected_safety_metrics = filter_metrics(
selected_metrics[0]["safety_metrics"])
selected_quality_metrics = filter_metrics(
selected_metrics[0]["quality_metrics"])
else:
selected_safety_metrics = []
selected_quality_metrics = []

if thresholds != [] and thresholds is not None:
threshold = np.float16(thresholds[0])
else:
threshold = np.float16(RAIService.HARM_SEVERITY_THRESHOLD)
threshold = np.float16(
RAIService.HARM_SEVERITY_THRESHOLD)

aggregate_results = {}
for result in results:
if not result:
continue
for name in result.keys():
if name in selected_quality_metrics or name in selected_safety_metrics:
if name in selected_quality_metrics \
or name in selected_safety_metrics:
if name not in aggregate_results.keys():
aggregate_results[name] = []
metric_value = result[name]
Expand All @@ -47,7 +53,8 @@ def aggregate_results(results: List[dict], selected_metrics: List[dict], thresho
if name in selected_quality_metrics:
aggregate_output[metric_name] = round(np.nanmean(values), 2)
elif name in selected_safety_metrics:
aggregate_output[metric_name] = round(np.sum(values >= threshold) / len(values), 2)
aggregate_output[metric_name] = round(
np.sum(values >= threshold) / len(values), 2)
else:
aggregate_output[metric_name] = np.nan
log_metric(metric_name, aggregate_output[metric_name])
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
from promptflow import tool
from rai_client import RAIServiceHandler

# The inputs section will change based on the
# arguments of the tool function, after you save the code
# Adding type to arguments and return value will help
# the system show the types properly
# Please update the function name/signature per need

@tool
def call_groundedness_service(request_body: dict) -> [dict]:
service_handler = RAIServiceHandler()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,8 @@
from rai_client import RAIServiceHandler


# The inputs section will change based on the arguments
# of the tool function, after you save the code
# Adding type to arguments and return value will help
# the system show the types properly
# Please update the function name/signature per need
@tool
def call_rai_service(request_body: dict) -> dict:
service_handler = RAIServiceHandler()
annotation_results = service_handler.get_annotation(request_body)
return annotation_results

Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ def concat_results(gpt_coherence_score: str = None,
score = float(item["score"])
except Exception as e:
score = np.nan
errors.append({"name": item["name"], "msg": str(e), "data": item["score"]})
errors.append({"name": item["name"],
"msg": str(e),
"data": item["score"]})
else:
if item['score']:
try:
Expand All @@ -39,15 +41,19 @@ def concat_results(gpt_coherence_score: str = None,
score = np.nan
except Exception as e:
score = np.nan
errors.append({"name": item["name"], "msg": str(e), "data": item["score"]})
errors.append({"name": item["name"],
"msg": str(e),
"data": item["score"]})
else:
score = np.nan
score_list.append({"name": item["name"], "score": score})
score_list.append({"name": item["name"],
"score": score})

variant_level_result = {}
for item in score_list:
item_name = str(item["name"])
variant_level_result[item_name] = item["score"]
if 'gpt' in item_name:
variant_level_result[item_name + '_pass_rate'] = 1 if item["score"] > 3 else 0
variant_level_result[item_name + '_pass_rate'] = 1 \
if item["score"] > 3 else 0
return variant_level_result
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import constants
import numpy as np


def default_safety_results():
supported_metrics = constants.Metric.CONTENT_HARM_METRICS
result = {}
Expand All @@ -27,16 +28,11 @@ def default_groundedness_results():
}


# The inputs section will change based on the arguments
# of the tool function, after you save the code
# Adding type to arguments and return value will help
# the system show the types properly
# Please update the function name/signature per need
@tool
def concat_results(selected_metrics: dict,
quality_results: dict = None,
safety_results: dict = None,
groundedness_results: dict = None) -> dict:
def concat_results(selected_metrics: dict,
quality_results: dict = None,
safety_results: dict = None,
groundedness_results: dict = None) -> dict:
if quality_results:
concated_results = quality_results.copy()
else:
Expand All @@ -51,4 +47,3 @@ def concat_results(selected_metrics: dict,
concated_results.update(default_safety_results())

return concated_results

Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import sys
from enum import Enum


Expand All @@ -22,9 +21,9 @@ class Metric:
QUALITY_METRICS = {
"gpt_groundedness",
"gpt_similarity",
"gpt_fluency",
"gpt_fluency",
"gpt_coherence",
"gpt_relevance",
"gpt_relevance",
"f1_score"
}

Expand All @@ -46,4 +45,4 @@ class HarmSeverityLevel(Enum):

class Tasks:
"""Defines types of annotation tasks supported by RAI Service."""
CONTENT_HARM = "content harm"
CONTENT_HARM = "content harm"
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,16 @@ def normalize_user_text(user_text):
return user_text.replace("'", "\\\"")


# The inputs section will change based on the arguments
# of the tool function, after you save the code
# Adding type to arguments and return value will help
# the system show the types properly
# Please update the function name/signature per need
@tool
def construct_request(question: str, answer:str, context: str) -> dict:
def construct_request(question: str,
answer: str,
context: str) -> dict:
metrics = ["generic_groundedness"]
user_text = json.dumps({"question": question,
"answer": answer,
"context": context})
user_text = json.dumps({"question": question,
"answer": answer,
"context": context})
parsed_user_text = normalize_user_text(user_text)
request_body = {"UserTextList": [parsed_user_text],
"AnnotationTask": "groundedness",
"MetricList":metrics}
"MetricList": metrics}
return request_body
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,17 @@ def normalize_user_text(user_text):
return user_text.replace("'", "\\\"")


# The inputs section will change based on the arguments
# of the tool function, after you save the code
# Adding type to arguments and return value will help
# the system show the types properly
# Please update the function name/signature per need
@tool
def construct_request(question: str, answer:str, selected_metrics: dict) -> dict:
def construct_request(question: str,
answer: str,
selected_metrics: dict) -> dict:
selected_safety_metrics = selected_metrics["safety_metrics"]
metrics = [metric.replace("_unfairness", "_fairness") for metric in\
selected_safety_metrics if selected_safety_metrics[metric]]
metrics = [metric.replace("_unfairness", "_fairness") for metric in
selected_safety_metrics if selected_safety_metrics[metric]]
user_text = f"<Human>{question}</><System>{answer}</>"
parsed_user_text = normalize_user_text(user_text)
request_body = {"UserTextList": [parsed_user_text],
"AnnotationTask": "content harm",
"MetricList":metrics}
"MetricList": metrics,
"PromptVersion": "0.2"}
return request_body
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ def remove_punctuation(text):
def lower(text):
return text.lower()

return white_space_fix(remove_articles(remove_punctuation(lower(text))))
return white_space_fix(
remove_articles(remove_punctuation(lower(text))))
prediction_tokens = normalize_text(answer)
reference_tokens = normalize_text(ground_truth)
tokenizer = QASplitTokenizer()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ inputs:
type: int
default: 4
is_chat_input: false
groundedness_service_flight:
type: bool
default: true
is_chat_input: false
outputs:
gpt_coherence:
type: string
Expand Down Expand Up @@ -293,6 +297,7 @@ nodes:
inputs:
answer: ${inputs.answer}
context: ${inputs.context}
flight: ${inputs.groundedness_service_flight}
question: ${inputs.question}
selected_metrics: ${select_metrics.output}
validate_input_result: ${validate_input.output}
Expand Down Expand Up @@ -330,7 +335,6 @@ nodes:
batch_response: ${call_groundedness_service.output}
is_service_available: ${validate_service.output}
llm_groundedness_response: ${gpt_groundedness.output}
selected_label_keys: ${select_metrics.output}
use_variants: false
- name: gpt_groundedness
type: llm
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,17 @@
import re


def parse_single_sample(response: dict, selected_metrics: dict) -> list:
selected_label_keys = selected_metrics["quality_metrics"]
def parse_single_sample(response: dict) -> list:
parsed_response = []
for key in response:
#if selected_label_keys[key]:
harm_type = key.replace("generic", "gpt")
parsed_harm_response = {}
try:
harm_response = eval(response[key])
except:
except Exception:
harm_response = response[key]
if harm_response != "" and isinstance(harm_response, dict):
### check if "output" is one key in harm_response
# check if "output" is one key in harm_response
if "output" in harm_response:
harm_response = harm_response["output"]

Expand All @@ -25,7 +23,7 @@ def parse_single_sample(response: dict, selected_metrics: dict) -> list:
metric_value = harm_response['label']
else:
metric_value = np.nan

# get reasoning
if "reasoning" in harm_response:
reasoning = harm_response['reasoning']
Expand All @@ -40,7 +38,8 @@ def parse_single_sample(response: dict, selected_metrics: dict) -> list:
else:
metric_value = np.nan
reasoning = harm_response
elif harm_response != "" and (isinstance(harm_response, int) or isinstance(harm_response, float)):
elif harm_response != "" and (isinstance(harm_response, int)
or isinstance(harm_response, float)):
if harm_response >= 0 and harm_response <= 7:
metric_value = harm_response
else:
Expand All @@ -55,8 +54,8 @@ def parse_single_sample(response: dict, selected_metrics: dict) -> list:
return parsed_response


def parse_groundedness_llm_response(llm_groundedness_response = None) -> dict:
item = {'name': 'gpt_groundedness',
def parse_groundedness_llm_response(llm_groundedness_response=None) -> dict:
item = {'name': 'gpt_groundedness',
'score': llm_groundedness_response}
if item['score']:
try:
Expand All @@ -66,36 +65,26 @@ def parse_groundedness_llm_response(llm_groundedness_response = None) -> dict:
score = float(match.group())
else:
score = np.nan
except Exception as e:
except Exception:
score = np.nan
errors.append({
"name": item["name"],
"msg": str(e), "data": item["score"]})
else:
score = np.nan
return {"gpt_groundedness": score,
"gpt_groundedness_reason": np.nan}



# The inputs section will change based on the arguments
# of the tool function, after you save the code
# Adding type to arguments and return value will help
# the system show the types properly
# Please update the function name/signature per need
@tool
def parse_response(selected_label_keys: dict,
is_service_available: dict,
def parse_response(is_service_available: dict,
llm_groundedness_response: dict = None,
batch_response: List[dict] = None):
parsed_single_sample_response = None
if is_service_available["groundedness_service"]:
if batch_response:
single_sample_response = batch_response[0]
parsed_single_sample_response = parse_single_sample(
single_sample_response,
selected_label_keys)[0]
single_sample_response)[0]
else:
parsed_single_sample_response = parse_groundedness_llm_response(llm_groundedness_response)
parsed_single_sample_response = \
parse_groundedness_llm_response(llm_groundedness_response)

return parsed_single_sample_response
Loading