@@ -1,7 +1,7 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
-# pylint: disable=E0611
+# pylint: skip-file

import logging
import pandas as pd
@@ -1,6 +1,8 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
# pylint: skip-file

import logging

from os import path
@@ -17,7 +19,7 @@

NODE_LIST_BY_TASK = {
"qa": ["gpt_coherence", "gpt_similarity", "gpt_relevance", "gpt_fluency", "gpt_groundedness"],
"chat": ["evaluate_chat_rag", "evaluate_coherence_fluency"],
"chat": ["evaluate_chat_rag", "evaluate_coherence_fluency", "fallback_groundedness_evaluation"],
}


@@ -136,6 +138,9 @@ def calculate_metrics(self) -> Dict:
if col.replace("outputs.", "").startswith(metric):
is_col_to_delete = False
break
# keep the column "evaluation_per_turn" in the output
if "evaluation_per_turn" in col:
is_col_to_delete = False
if is_col_to_delete:
columns_to_drop.append(col)
result_df.drop(columns_to_drop, axis=1, inplace=True)
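A minimal, self-contained sketch of the column filter above; the DataFrame, metric name, and the outer loop around the shown lines are assumptions for illustration, and only the keep-"evaluation_per_turn" rule comes from this change:

import pandas as pd

# Hypothetical evaluation output; column names are illustrative only.
result_df = pd.DataFrame({
    "inputs.question": ["q1"],
    "outputs.gpt_coherence": [4],
    "outputs.evaluation_per_turn": [{"gpt_coherence": {"score": [4]}}],
    "outputs.debug_info": ["..."],
})
selected_metrics = ["gpt_coherence"]

columns_to_drop = []
for col in result_df.columns:
    if not col.startswith("outputs."):
        continue  # assumed: only output columns are candidates for dropping
    is_col_to_delete = True
    for metric in selected_metrics:
        if col.replace("outputs.", "").startswith(metric):
            is_col_to_delete = False
            break
    # keep the column "evaluation_per_turn" in the output
    if "evaluation_per_turn" in col:
        is_col_to_delete = False
    if is_col_to_delete:
        columns_to_drop.append(col)
result_df.drop(columns_to_drop, axis=1, inplace=True)

print(list(result_df.columns))
# ['inputs.question', 'outputs.gpt_coherence', 'outputs.evaluation_per_turn']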
@@ -2,6 +2,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
# pylint: skip-file

import os.path
import json
import pathlib
@@ -7,16 +7,16 @@

@tool
def aggregate_variants_results(results: List[dict], metrics: List[dict],
-                               thresholds = None):
-    if metrics:
+                               thresholds=None):
+    if metrics:
rag_metrics = filter_metrics(metrics[0]["rag_metrics"])
non_rag_metrics = filter_metrics(metrics[0]["non_rag_metrics"])
safety_metrics = filter_metrics(metrics[0]["safety_metrics"])
else:
rag_metrics = []
non_rag_metrics = []
safety_metrics = []
-    selected_quality_metrics = rag_metrics + non_rag_metrics
+    selected_quality_metrics = rag_metrics + non_rag_metrics

# set threshold for safety metric defect_rate
if thresholds != [] and thresholds is not None:
@@ -51,7 +51,8 @@ def aggregate_variants_results(results: List[dict], metrics: List[dict],
if name in selected_quality_metrics:
aggregate_output[metric_name] = round(np.nanmean(values), 2)
elif name in safety_metrics:
-            aggregate_output[metric_name] = round(np.sum(values >= threshold) / len(values), 2)
+            aggregate_output[metric_name] = round(
+                np.sum(values >= threshold) / len(values), 2)
else:
aggregate_output[metric_name] = np.nan
log_metric(metric_name, aggregate_output[metric_name])
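A small worked example of the safety aggregation above; the scores and threshold are made up, and using HARM_SEVERITY_THRESHOLD = 4 from constants.py as the default is an assumption:

import numpy as np

# Illustrative per-row severity scores for one safety metric.
values = np.array([0, 1, 4, 6, 2])
threshold = 4  # e.g. the HARM_SEVERITY_THRESHOLD defined in constants.py

# Defect rate: fraction of rows whose severity meets or exceeds the threshold.
defect_rate = round(np.sum(values >= threshold) / len(values), 2)
print(defect_rate)  # 0.4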
@@ -0,0 +1,15 @@
from promptflow import tool
from rai_client import RAIServiceHandler


@tool
def call_groundedness_service(request_bodies: list[dict]) -> [dict]:
service_handler = RAIServiceHandler()
annotation_results = []
for request_body in request_bodies:
try:
annotation_result = service_handler.get_annotation(request_body)
except Exception:
annotation_result = []
annotation_results += annotation_result
return annotation_results
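A small usage sketch of the node above; StubHandler stands in for rai_client.RAIServiceHandler (whose implementation is not shown in this diff) purely to illustrate that a failing request contributes an empty result instead of aborting the whole batch:

# Hypothetical stand-in for rai_client.RAIServiceHandler, for illustration only.
class StubHandler:
    def get_annotation(self, request_body):
        if "boom" in request_body["UserTextList"][0]:
            raise RuntimeError("service error")
        return [{"generic_groundedness": 5}]


def call_groundedness_service_like(request_bodies, handler):
    # Same per-request try/except pattern as the tool above.
    annotation_results = []
    for request_body in request_bodies:
        try:
            annotation_result = handler.get_annotation(request_body)
        except Exception:
            annotation_result = []  # a failed turn is skipped, not fatal
        annotation_results += annotation_result
    return annotation_results


print(call_groundedness_service_like(
    [{"UserTextList": ["ok"]}, {"UserTextList": ["boom"]}],
    StubHandler()))
# [{'generic_groundedness': 5}]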
@@ -1,76 +1,9 @@
 from promptflow import tool
-from mlflow.utils.rest_utils import http_request
-import time
-from utils import get_cred
-from constants import RAIService
+from rai_client import RAIServiceHandler


-def submit_annotation(cred, request_body):
-    try:
-        response = http_request(
-            host_creds=cred,
-            endpoint="/submitannotation",
-            method="POST",
-            json=request_body,
-        )
-        if response.status_code != 202:
-            print("Fail evaluating '%s' with error message: %s", request_body["UserTextList"], response.text)
-            response.raise_for_status()
-    except AttributeError as e:
-        response = None
-        print("Fail evaluating '%s' with error message: %s", request_body["UserTextList"], e)
-    if response is not None:
-        json_obj = response.json()
-    else:
-        json_obj = {}
-    return json_obj
-
-def check_status(cred, request_id):
-    try:
-        response = http_request(
-            host_creds = cred,
-            endpoint="/operations/" + request_id,
-            method="GET"
-        )
-    except AttributeError as e:
-        response = None
-    return response
-
-def retrieve_annotation_result(cred, submitannotation_response):
-    request_id = submitannotation_response["location"].split("/")[-1]
-    annotation_result = None
-    start = time.time()
-    time_elapsed = 0
-    request_count = 1
-    while True and time_elapsed <= RAIService.TIMEOUT:
-        try:
-            request_status = check_status(cred, request_id)
-        except Exception:
-            request_status = None
-        if request_status:
-            request_status_code = request_status.status_code
-            if request_status_code == 200:
-                annotation_result = request_status.json()
-                break
-        else:
-            print("Failed to retrieve the status of RequestID: %s" % request_id)
-        request_count += 1
-        sleep_time = RAIService.SLEEPTIME ** request_count
-        time.sleep(sleep_time)
-        time_elapsed = time.time() - start
-
-    if time_elapsed > RAIService.TIMEOUT:
-        raise TimeoutError("Request times out after %d seconds", RAIService.TIMEOUT)
-
-    return annotation_result
-
-# The inputs section will change based on the arguments of the tool function, after you save the code
-# Adding type to arguments and return value will help the system show the types properly
-# Please update the function name/signature per need
 @tool
 def call_rai_service(request_body: dict) -> dict:
-    cred = get_cred()
-    submitannotation_response = submit_annotation(cred, request_body)
-    annotation_result = retrieve_annotation_result(cred, submitannotation_response)
+    service_handler = RAIServiceHandler()
+    annotation_result = service_handler.get_annotation(request_body)
     return annotation_result
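The submit-and-poll logic removed above now lives in rai_client.RAIServiceHandler, which is not part of this diff. The sketch below is only a guess at its shape, reconstructed from the deleted code; everything except the get_annotation name is an assumption:

import time

from mlflow.utils.rest_utils import http_request
from utils import get_cred  # assumed local helper, as in the removed code
from constants import RAIService


class RAIServiceHandler:
    """Hypothetical sketch: submit an annotation request, then poll for the result."""

    def __init__(self):
        self.cred = get_cred()

    def _submit(self, request_body):
        response = http_request(
            host_creds=self.cred,
            endpoint="/submitannotation",
            method="POST",
            json=request_body,
        )
        response.raise_for_status()
        return response.json()

    def get_annotation(self, request_body):
        request_id = self._submit(request_body)["location"].split("/")[-1]
        start = time.time()
        request_count = 1
        while time.time() - start <= RAIService.TIMEOUT:
            response = http_request(
                host_creds=self.cred,
                endpoint="/operations/" + request_id,
                method="GET",
            )
            if response.status_code == 200:
                return response.json()
            request_count += 1
            time.sleep(RAIService.SLEEPTIME ** request_count)
        raise TimeoutError(
            "Request timed out after %d seconds" % RAIService.TIMEOUT)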

@@ -1,67 +1,91 @@
from promptflow import tool
import numpy as np
import constants

-def format_rag_results(rag_results: dict, supported_metrics):
+
+def format_rag_results(rag_results: dict,
+                       selected_metrics: dict,
+                       num_turns: int):
result_per_chat = {}
result_per_turn = {}
supported_metrics = selected_metrics["rag_metrics"]
if rag_results:
for metric, value in rag_results['artifacts'].items():
try:
-                result_per_chat[metric] = rag_results['metrics']["mean_" + metric]
-                result_per_turn[metric] = {"reason": value['reason'], "score": value['score_per_turn']}
+                result_per_chat[metric] = round(
+                    rag_results['metrics']["mean_" + metric],
+                    2)
+                result_per_turn[metric] = {"reason": value['reason'][0],
+                                           "score": value['score_per_turn'][0]}
except KeyError:
result_per_chat[metric] = np.nan
-                result_per_turn[metric] = np.nan
+                result_per_turn[metric] = {"score": [np.nan] * int(num_turns)}
for metric in supported_metrics:
if metric not in result_per_turn:
result_per_chat[metric] = np.nan
result_per_turn[metric] = np.nan
return {"results_per_turn": result_per_turn, "results_per_chat": result_per_chat}
return {"results_per_turn": result_per_turn,
"results_per_chat": result_per_chat}


-def format_non_rag_results(non_rag_results: dict, supported_metrics):
+def format_non_rag_results(non_rag_results: dict,
+                           selected_metrics: dict,
+                           num_turns: int):
result_per_chat = {}
result_per_turn = {}
supported_metrics = selected_metrics["non_rag_metrics"]
if non_rag_results:
for metric in non_rag_results['artifacts']:
try:
-                result_per_chat[metric] = non_rag_results['metrics']['mean_' + metric]
-            except:
+                result_per_chat[metric] = round(
+                    non_rag_results['metrics']['mean_' + metric],
+                    2)
+                result_per_turn[metric] = {
+                    "score": non_rag_results['artifacts'][metric]}
+            except Exception:
                 result_per_chat[metric] = np.nan
-        result_per_turn = non_rag_results['artifacts']
+                result_per_turn[metric] = {
+                    "score": [np.nan] * int(num_turns)}

for metric in supported_metrics:
if metric not in result_per_turn:
result_per_turn[metric] = np.nan
result_per_chat[metric] = np.nan
return {"results_per_turn": result_per_turn, "results_per_chat": result_per_chat}
return {"results_per_turn": result_per_turn,
"results_per_chat": result_per_chat}

-def format_safety_results(safety_results: dict, supported_metrics):
+
+def format_safety_results(safety_results: dict, selected_metrics):
result_per_chat = {}
supported_metrics = selected_metrics["safety_metrics"]
if safety_results:
result_per_chat = safety_results
for metric in supported_metrics:
if metric not in result_per_chat:
result_per_chat[metric] = np.nan
result_per_chat[metric + "_reasoning"] = np.nan
result_per_chat[metric + "_reason"] = np.nan
result_per_chat[metric + "_score"] = np.nan
return result_per_chat

# The inputs section will change based on the arguments of the tool function, after you save the code
# Adding type to arguments and return value will help the system show the types properly
# Please update the function name/signature per need

@tool
-def concatenate_metrics(rag_results: dict, non_rag_results: dict,
-                        safety_results: dict,
-                        selected_metrics: dict) -> dict:
-    formatted_rag = format_rag_results(rag_results, selected_metrics['rag_metrics'])
-    formatted_non_rag = format_non_rag_results(non_rag_results, selected_metrics['non_rag_metrics'])
-    formatted_safety = format_safety_results(safety_results, selected_metrics['safety_metrics'])
+def concatenate_metrics(rag_results: dict, non_rag_results: dict,
+                        safety_results: dict,
+                        groundedness_results: list[dict],
+                        selected_metrics: dict,
+                        chat_validation: dict) -> dict:
+    num_turns = chat_validation["num_turns"]
+    formatted_rag = format_rag_results(rag_results,
+                                       selected_metrics,
+                                       num_turns)
+    formatted_non_rag = format_non_rag_results(non_rag_results,
+                                               selected_metrics,
+                                               num_turns)
+    formatted_safety = format_safety_results(safety_results,
+                                             selected_metrics)
results = {}
for key in ["results_per_turn", "results_per_chat"]:
result_concat = formatted_rag[key].copy()
result_concat.update(formatted_non_rag[key])
result_concat.update(groundedness_results[key])
if key == "results_per_chat":
result_concat.update(formatted_safety)
results[key] = result_concat
-    return results
+    return results
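For orientation, the merged structure looks roughly like this; the metric names and values are invented, and the per-chat safety entries simply follow the metric/_reason/_score pattern from format_safety_results above:

# Illustrative shape only.
example_output = {
    "results_per_turn": {
        "gpt_retrieval_score": {"reason": ["..."], "score": [4.0, 5.0]},
        "gpt_coherence": {"score": [5.0, 4.0]},
        "gpt_groundedness": {"score": [5.0, 3.0]},
    },
    "results_per_chat": {
        "gpt_retrieval_score": 4.5,
        "gpt_coherence": 4.5,
        "gpt_groundedness": 4.0,
        "violence": "Very low",
        "violence_reason": "...",
        "violence_score": 0,
    },
}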
@@ -1,39 +1,45 @@
from azureml.metrics import constants
from enum import Enum


class RAIService:
"""Define constants related to RAI service"""
-    TIMEOUT = 1800
+    TIMEOUT = 120
SLEEPTIME = 2
HARM_SEVERITY_THRESHOLD = 4


class Metric:
"""Defines all metrics supported by RAI service"""
# Content harm
SelfHarm = "self_harm"
Violence = "violence"
Sexual = "sexual"
HateFairness = "hate_fairness"
HateUnfairness = "hate_unfairness"


class Tasks:
"""Defines types of annotation tasks supported by RAI Service."""
CONTENT_HARM = "content harm"


class HarmSeverityLevel(Enum):
-    Safe = 0
-    Low = 1
-    Medium = 2
-    High = 3
+    VeryLow = "Very low"
+    Low = "Low"
+    Medium = "Medium"
+    High = "High"


# Content harm metric set
CONTENT_HARM_METRICS = {
Metric.SelfHarm,
Metric.Violence,
Metric.Sexual,
-    Metric.HateFairness
+    Metric.HateUnfairness
}


RAG_EVALUATION_SET = constants.Metric.RAG_EVALUATION_SET
-NON_RAG_EVALUATION_SET = {constants.Metric.GPTCoherence, constants.Metric.GPTFluency}
+NON_RAG_EVALUATION_SET = {constants.Metric.GPTCoherence,
+                          constants.Metric.GPTFluency}
QUALITY_METRIC_SET = RAG_EVALUATION_SET | NON_RAG_EVALUATION_SET
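One plausible consumer of the new string-valued enum, shown only for context; the 0-7 score bucketing below is an assumption about how a numeric harm score might map onto these levels, not something this diff establishes:

from constants import HarmSeverityLevel


def to_severity_level(harm_score: int) -> str:
    """Hypothetical mapping from a 0-7 harm score to a severity label."""
    buckets = {
        HarmSeverityLevel.VeryLow: (0, 1),
        HarmSeverityLevel.Low: (2, 3),
        HarmSeverityLevel.Medium: (4, 5),
        HarmSeverityLevel.High: (6, 7),
    }
    for level, (low, high) in buckets.items():
        if low <= harm_score <= high:
            return level.value
    return HarmSeverityLevel.High.value


print(to_severity_level(1))  # "Very low"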
@@ -0,0 +1,42 @@
from promptflow import tool
import json


def normalize_user_text(user_text):
return user_text.replace("'", "\\\"")


def construct_single_request(question: str,
answer: str,
context: dict = None) -> dict:
metrics = ["generic_groundedness"]
user_text = json.dumps({
"question": question,
"answer": answer,
"context": context})
parsed_user_text = normalize_user_text(user_text)
request_body = {"UserTextList": [parsed_user_text],
"AnnotationTask": "groundedness",
"MetricList": metrics}
return request_body


@tool
def construct_groundedness_requests(parsed_chat: dict) -> str:
num_turns = len(parsed_chat["questions"])
request_bodies = []
for i in range(num_turns):
question = parsed_chat["questions"][i]
answer = parsed_chat["answers"][i]
try:
retrieved_documents = eval(
parsed_chat["retrieved_documents"][i])
except Exception:
retrieved_documents = [
parsed_chat["retrieved_documents"][i]]
context = {"citations": retrieved_documents}
request = construct_single_request(question,
answer,
context)
request_bodies.append(request)
return request_bodies
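For illustration, a single-turn parsed_chat (the texts are invented) yields one request body shaped as follows; this simply traces the function above by hand:

parsed_chat = {
    "questions": ["What is the capital of France?"],
    "answers": ["Paris."],
    "retrieved_documents": ['["Paris is the capital of France."]'],
}

request_bodies = construct_groundedness_requests(parsed_chat)
# [{
#     "UserTextList": ['{"question": "What is the capital of France?", '
#                      '"answer": "Paris.", '
#                      '"context": {"citations": ["Paris is the capital of France."]}}'],
#     "AnnotationTask": "groundedness",
#     "MetricList": ["generic_groundedness"],
# }]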