Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
modify built-in qa evaluation flow:
* add logic to check service availability in a region
* change hate_fairness to hate_unfairness
  • Loading branch information
qusongms committed Mar 20, 2024
commit 5de28011215cc0038eade37f71300e5be0efbc1f
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,4 @@ def aggregate_results(results: List[dict], selected_metrics: List[dict], thresho
else:
aggregate_output[metric_name] = np.nan
log_metric(metric_name, aggregate_output[metric_name])
return aggregate_output
return aggregate_output
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from promptflow import tool
from rai_client import RAIServiceHandler

# The inputs section will change based on the arguments of the tool function, after you save the code
# Adding type to arguments and return value will help the system show the types properly
# The inputs section will change based on the
# arguments of the tool function, after you save the code
# Adding type to arguments and return value will help
# the system show the types properly
# Please update the function name/signature per need
@tool
def call_groundedness_service(request_body: dict) -> [dict]:
service_handler = RAIServiceHandler()
annotation_results = service_handler.get_annotation(request_body)
return annotation_results
return annotation_results
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
from promptflow import tool
from rai_client import RAIServiceHandler

# The inputs section will change based on the arguments of the tool function, after you save the code
# Adding type to arguments and return value will help the system show the types properly

# The inputs section will change based on the arguments
# of the tool function, after you save the code
# Adding type to arguments and return value will help
# the system show the types properly
# Please update the function name/signature per need
@tool
def call_rai_service(request_body: dict) -> dict:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,13 @@ def concat_results(gpt_coherence_score: str = None,
gpt_similarity_score: str = None,
gpt_fluency_score: str = None,
gpt_relevance_score: str = None,
#gpt_groundedness_score: str = None,
f1_score: float = None) -> dict:
f1_score: float = None
) -> dict:

load_list = [{'name': 'gpt_coherence', 'score': gpt_coherence_score},
{'name': 'gpt_similarity', 'score': gpt_similarity_score},
{'name': 'gpt_fluency', 'score': gpt_fluency_score},
{'name': 'gpt_relevance', 'score': gpt_relevance_score},
#{'name': 'gpt_groundedness', 'score': gpt_groundedness_score},
{'name': 'f1_score', 'score': f1_score}
]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@ def default_safety_results():
for metric_name in supported_metrics:
result[metric_name] = np.nan
result[metric_name + "_score"] = np.nan
result[metric_name + "_reasoning"] = np.nan
result[metric_name + "_reason"] = np.nan
return result


def default_gpt_results():
supported_metrics = constants.Metric.QUALITY_METRICS
result = {}
Expand All @@ -19,14 +20,17 @@ def default_gpt_results():
result[metric_name] = np.nan
return result


def default_groundedness_results():
return {"gpt_groundedness": np.nan,
"gpt_groundedness_reasoning": np.nan
"gpt_groundedness_reason": np.nan
}


# The inputs section will change based on the arguments of the tool function, after you save the code
# Adding type to arguments and return value will help the system show the types properly
# The inputs section will change based on the arguments
# of the tool function, after you save the code
# Adding type to arguments and return value will help
# the system show the types properly
# Please update the function name/signature per need
@tool
def concat_results(selected_metrics: dict,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import sys
from enum import Enum
#import numpy as np


class RAIService:
"""Define constants related to RAI service"""
TIMEOUT = 1800
SLEEPTIME = 2
HARM_SEVERITY_THRESHOLD = 4


class Metric:
"""Defines all metrics supported by RAI service"""
Metrics = "metrics"
Expand All @@ -16,7 +17,7 @@ class Metric:
SelfHarm = "self_harm"
Violence = "violence"
Sexual = "sexual"
HateFairness = "hate_fairness"
HateFairness = "hate_unfairness"

QUALITY_METRICS = {
"gpt_groundedness",
Expand All @@ -35,12 +36,14 @@ class Metric:
HateFairness
}


class HarmSeverityLevel(Enum):
VeryLow = "Very low"
Low = "Low"
Medium = "Medium"
High = "High"


class Tasks:
"""Defines types of annotation tasks supported by RAI Service."""
CONTENT_HARM = "content harm"
Original file line number Diff line number Diff line change
@@ -1,16 +1,24 @@
from promptflow import tool
import json


def normalize_user_text(user_text):
return user_text.replace("'", "\\\"")


# The inputs section will change based on the arguments
# of the tool function, after you save the code
# Adding type to arguments and return value will help
# the system show the types properly
# Please update the function name/signature per need
@tool
def construct_request(question: str, answer:str, context: str) -> dict:
metrics = ["generic_groundedness"]
user_text = json.dumps({"question": question,
"answer": answer,
"context": context})
parsed_user_text = normalize_user_text(user_text)
request_body = {"UserTextList": [parsed_user_text], "AnnotationTask": "groundedness", "MetricList":metrics}
request_body = {"UserTextList": [parsed_user_text],
"AnnotationTask": "groundedness",
"MetricList":metrics}
return request_body
Original file line number Diff line number Diff line change
@@ -1,16 +1,23 @@
from promptflow import tool


def normalize_user_text(user_text):
return user_text.replace("'", "\\\"")

# The inputs section will change based on the arguments of the tool function, after you save the code
# Adding type to arguments and return value will help the system show the types properly

# The inputs section will change based on the arguments
# of the tool function, after you save the code
# Adding type to arguments and return value will help
# the system show the types properly
# Please update the function name/signature per need
@tool
def construct_request(question: str, answer:str, selected_metrics: dict) -> dict:
selected_safety_metrics = selected_metrics["safety_metrics"]
metrics = [metric for metric in selected_safety_metrics if selected_safety_metrics[metric]]
metrics = [metric.replace("_unfairness", "_fairness") for metric in\
selected_safety_metrics if selected_safety_metrics[metric]]
user_text = f"<Human>{question}</><System>{answer}</>"
parsed_user_text = normalize_user_text(user_text)
request_body = {"UserTextList": [parsed_user_text], "AnnotationTask": "content harm", "MetricList":metrics}
request_body = {"UserTextList": [parsed_user_text],
"AnnotationTask": "content harm",
"MetricList":metrics}
return request_body
Original file line number Diff line number Diff line change
Expand Up @@ -27,42 +27,36 @@ outputs:
gpt_coherence:
type: string
reference: ${concat_scores.output.gpt_coherence}
hate_fairness_score:
type: string
reference: ${concat_scores.output.hate_fairness_score}
gpt_relevance:
type: string
reference: ${concat_scores.output.gpt_relevance}
self_harm_reasoning:
type: string
reference: ${concat_scores.output.self_harm_reasoning}
self_harm:
type: string
reference: ${concat_scores.output.self_harm}
violence_reason:
type: string
reference: ${concat_scores.output.violence_reason}
sexual_reason:
type: string
reference: ${concat_scores.output.sexual_reason}
hate_unfairness:
type: string
reference: ${concat_scores.output.hate_unfairness}
sexual_score:
type: string
reference: ${concat_scores.output.sexual_score}
violence_score:
type: string
reference: ${concat_scores.output.violence_score}
hate_fairness_reasoning:
type: string
reference: ${concat_scores.output.hate_fairness_reasoning}
hate_fairness:
type: string
reference: ${concat_scores.output.hate_fairness}
gpt_groundedness_reasoning:
type: string
reference: ${concat_scores.output.gpt_groundedness_reasoning}
gpt_groundedness:
type: string
reference: ${concat_scores.output.gpt_groundedness}
gpt_groundedness_reason:
type: string
reference: ${concat_scores.output.gpt_groundedness_reason}
gpt_similarity:
type: string
reference: ${concat_scores.output.gpt_similarity}
sexual_reasoning:
type: string
reference: ${concat_scores.output.sexual_reasoning}
gpt_fluency:
type: string
reference: ${concat_scores.output.gpt_fluency}
Expand All @@ -72,12 +66,18 @@ outputs:
self_harm_score:
type: string
reference: ${concat_scores.output.self_harm_score}
violence_reasoning:
hate_unfairness_reason:
type: string
reference: ${concat_scores.output.violence_reasoning}
reference: ${concat_scores.output.hate_unfairness_reason}
violence:
type: string
reference: ${concat_scores.output.violence}
hate_unfairness_score:
type: string
reference: ${concat_scores.output.hate_unfairness_score}
self_harm_reason:
type: string
reference: ${concat_scores.output.self_harm_reason}
f1_score:
type: string
reference: ${concat_scores.output.f1_score}
Expand Down Expand Up @@ -295,6 +295,7 @@ nodes:
context: ${inputs.context}
question: ${inputs.question}
selected_metrics: ${select_metrics.output}
validate_input_result: ${validate_input.output}
use_variants: false
- name: construct_groundedness_request
type: python
Expand Down Expand Up @@ -327,9 +328,29 @@ nodes:
path: parse_groundedness_response.py
inputs:
batch_response: ${call_groundedness_service.output}
is_service_available: ${validate_service.output}
llm_groundedness_response: ${gpt_groundedness.output}
selected_label_keys: ${select_metrics.output}
use_variants: false
- name: gpt_groundedness
type: llm
source:
type: code
path: gpt_groundedness_prompt.jinja2
inputs:
deployment_name: GPT-4-Prod
temperature: 1
top_p: 1
presence_penalty: 0
frequency_penalty: 0
answer: ${inputs.answer}
context: ${inputs.context}
provider: AzureOpenAI
connection: Default_AzureOpenAI
api: chat
module: promptflow.tools.aoai
activate:
when: ${validate_service.output.groundedness_service}
when: ${validate_service.output.groundedness_prompt}
is: true
use_variants: false
node_variants: {}
Expand Down

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
from utils import get_harm_severity_level


# The inputs section will change based on the arguments of the tool function, after you save the code
# Adding type to arguments and return value will help the system show the types properly
# The inputs section will change based on the
# arguments of the tool function, after you save the code
# Adding type to arguments and return value will help
# the system show the types properly
# Please update the function name/signature per need
@tool
def format_service_output(parsed_responses: List[List[dict]]) -> dict:
Expand All @@ -24,11 +26,11 @@ def format_service_output(parsed_responses: List[List[dict]]) -> dict:
harm_score = np.nan
result[key + "_score"] = harm_score
harm_severity_level = get_harm_severity_level(harm_score)
result[key + "_reasoning"] = metric_dict["reasoning"]
result[key + "_reason"] = metric_dict["reasoning"]
result[key] = harm_severity_level
for metric_name in supported_metrics:
if metric_name not in result:
result[metric_name] = np.nan
result[metric_name + "_score"] = np.nan
result[metric_name + "_reasoning"] = np.nan
return result
result[metric_name + "_reason"] = np.nan
return result
Loading