2 changes: 1 addition & 1 deletion sdk/evaluation/azure-ai-evaluation/assets.json
@@ -2,5 +2,5 @@
"AssetsRepo": "Azure/azure-sdk-assets",
"AssetsRepoPrefixPath": "python",
"TagPrefix": "python/evaluation/azure-ai-evaluation",
"Tag": "python/evaluation/azure-ai-evaluation_daf1ed16fc"
"Tag": "python/evaluation/azure-ai-evaluation_043418c052"
}
@@ -6,6 +6,8 @@
import math
import re
import time
import json
import html
from ast import literal_eval
from typing import Dict, List, Optional, Union, cast
from urllib.parse import urlparse
@@ -38,10 +40,38 @@

USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
"DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
Tasks.GROUNDEDNESS: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
}


def get_formatted_template(data: dict, annotation_task: str) -> str:
"""Given the task and input data, produce a formatted string that will serve as the main
payload for the RAI service. Requires specific per-task logic.

:param data: The data to incorporate into the payload.
:type data: dict
:param annotation_task: The annotation task to use. This determines the template to use.
:type annotation_task: str
:return: The formatted string, based on the data and task template.
:rtype: str
"""
# Template class doesn't play nice with json dumping/loading, just handle groundedness'
# JSON format manually.
# Template was: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
if annotation_task == Tasks.GROUNDEDNESS:
as_dict = {
"question": data.get("query", ""),
"answer": data.get("response", ""),
"context": data.get("context", ""),
}
return json.dumps(as_dict)
as_dict = {
"query": html.escape(data.get("query", "")),
"response": html.escape(data.get("response", "")),
}
user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**as_dict)
return user_text.replace("'", '\\"')


def get_common_headers(token: str) -> Dict:
"""Get common headers for the HTTP request

@@ -161,8 +191,7 @@ async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str,
:return: The operation ID.
:rtype: str
"""
user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**data)
normalized_user_text = user_text.replace("'", '\\"')
normalized_user_text = get_formatted_template(data, annotation_task)
payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)

url = rai_svc_url + "/submitannotation"
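For reference, a minimal standalone sketch (not part of the diff) of what the new get_formatted_template helper produces on its two code paths. The literal "groundedness" string stands in for whatever value Tasks.GROUNDEDNESS actually holds, and the sample inputs are made up:

```python
# Hypothetical reproduction of the helper's two payload shapes, for illustration only.
import html
import json
from string import Template

DEFAULT_TEMPLATE = Template("<Human>{$query}</><System>{$response}</>")

def format_payload_sketch(data: dict, annotation_task: str) -> str:
    if annotation_task == "groundedness":  # stand-in for Tasks.GROUNDEDNESS
        # Groundedness payloads are plain JSON, so json.dumps handles the escaping.
        return json.dumps(
            {
                "question": data.get("query", ""),
                "answer": data.get("response", ""),
                "context": data.get("context", ""),
            }
        )
    # Every other task uses the lightweight markup template with HTML-escaped values.
    escaped = {
        "query": html.escape(data.get("query", "")),
        "response": html.escape(data.get("response", "")),
    }
    return DEFAULT_TEMPLATE.substitute(**escaped).replace("'", '\\"')

sample = {"query": "Is <this> grounded?", "response": "Yes & no", "context": "Some context."}
print(format_payload_sketch(sample, "groundedness"))
# {"question": "Is <this> grounded?", "answer": "Yes & no", "context": "Some context."}
print(format_payload_sketch(sample, "DEFAULT"))
# <Human>{Is &lt;this&gt; grounded?}</><System>{Yes &amp; no}</>
```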
@@ -92,7 +92,7 @@ def __init__(
azure_ai_project,
**kwargs,
):
self._passing_score = 3 # TODO update once the binarization PR is merged
self._passing_score = 5 # TODO update once the binarization PR is merged
self._output_prefix = "groundedness_pro"
super().__init__(
eval_metric=EvaluationMetrics.GROUNDEDNESS,
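For context, a hedged sketch of how a passing-score threshold like this is commonly consumed; the evaluator's actual binarization logic is deferred per the TODO above, and the 1-5 groundedness pro scale assumed here is not shown in this diff:

```python
# Hypothetical usage of a passing-score threshold; not the evaluator's real logic.
PASSING_SCORE = 5  # assumed 1-5 scale, matching the updated default above

def is_passing(groundedness_pro_score: int, passing_score: int = PASSING_SCORE) -> bool:
    # A result passes only when it meets or exceeds the configured threshold.
    return groundedness_pro_score >= passing_score

assert is_passing(5)
assert not is_passing(4)
```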
2 changes: 1 addition & 1 deletion sdk/evaluation/azure-ai-evaluation/tests/conftest.py
@@ -203,7 +203,7 @@ def simple_conversation():
return {
"messages": [
{
"content": "What is the capital of France?",
"content": "What is the capital of France?`''\"</>{}{{]",
"role": "user",
"context": "Customer wants to know the capital of France",
},
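For reference, a small sketch (not part of the PR) showing how html.escape rewrites the new special-character suffix that this fixture, and the .jsonl samples below, now carry:

```python
# Demonstrates the escaping applied to the fixture content on the non-groundedness path.
import html

fixture_content = "What is the capital of France?`''\"</>{}{{]"
print(html.escape(fixture_content))
# What is the capital of France?`&#x27;&#x27;&quot;&lt;/&gt;{}{{]
```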
@@ -1,3 +1,3 @@
{"query":"How do you create a run?","context":"AML API only","response":"To create a run using the Azure Machine Learning API, you first need to create an Experiment. Once you have an experiment, you can create a Run object that is associated with that experiment. Here is some Python code that demonstrates this process:\n\n```\nfrom azureml.core import Experiment, Run\nfrom azureml.core.workspace import Workspace\n\n# Define workspace and experiment\nws = Workspace.from_config()\nexp = Experiment(workspace=ws, name='my_experiment')\n\n# Create a new run\nrun = exp.start_logging()\n```\n\nIn this code, the `from_config()` method reads the configuration file that you created when you set up your Azure Machine Learning workspace. The `Experiment` constructor creates an Experiment object that is associated with your workspace, and the `start_logging()` method creates a new Run object that is associated with the Experiment. Now you can use the `run` object to log metrics, upload files, and track other information related to your machine learning experiment.", "ground_truth":"Paris is the capital of France."}
{"query":"How do you log a model?","context":"Logging can be done using any OSS Sdk","response":"There are a few ways to log models in Azure Machine Learning. \n\nOne way is to use the `register_model()` method of the `Run` object. The `register_model()` method logs a model file in the Azure Machine Learning service workspace and makes it available for deployment. Here's an example:\n\n```python\nfrom azureml.core import Model\n\nmodel_path = '.\/outputs\/my_model.pkl'\nmodel = Model.register(workspace=ws, model_path=model_path, model_name='my_model')\n```\n\nThis code registers the model file located at `model_path` to the Azure Machine Learning service workspace with the name `my_model`. \n\nAnother way to log a model is to save it as an output of a `Run`. If your model generation code is part of a script or Jupyter notebook that runs as an Azure Machine Learning experiment, you can save the model file as an output of the `Run` object. Here's an example:\n\n```python\nfrom sklearn.linear_model import LogisticRegression\nfrom azureml.core.run import Run\n\n# Initialize a run object\nrun = Run.get_context()\n\n# Train your model\nX_train, y_train = ...\nclf = LogisticRegression().fit(X_train, y_train)\n\n# Save the model to the Run object's outputs directory\nmodel_path = 'outputs\/model.pkl'\njoblib.dump(value=clf, filename=model_path)\n\n# Log the model as a run artifact\nrun.upload_file(name=model_path, path_or_stream=model_path)\n```\n\nIn this code, `Run.get_context()` retrieves the current run context object, which you can use to track metadata and metrics for the run. After training your model, you can use `joblib.dump()` to save the model to a file, and then log the file as an artifact of the run using `run.upload_file()`.","ground_truth":"Paris is the capital of France."}
{"query":"What is the capital of France?","context":"France is in Europe","response":"Paris is the capital of France.", "ground_truth":"Paris is the capital of France."}
{"query":"What is the capital of France?`''\"</>{}{{]","context":"France is in Europe","response":"Paris is the capital of France.", "ground_truth":"Paris is the capital of France."}
@@ -1,2 +1,2 @@
{"conversation" : {"context" : "", "messages": [{"content": "What shape has 3 sides", "role" :"user", "context": null}, {"content": "A triangle", "role" :"assistant", "context": "The answer is a triangle."}, {"content": "Next, what shape has 4 sides", "role" :"user", "context": null}, {"content": "A square", "role" :"assistant", "context": "The answer is a square."}]}}
{"conversation" : {"context" : "User wants to know about state capitals", "messages": [{"content": "What is the capital of Hawaii", "role" :"user", "context": "User wants to know the capital of Hawaii"}, {"content": "Honolulu", "role" :"assistant", "context": "The answer is a Honolulu."}, {"content": "Ok, what is the capital of Massachusetts", "role" :"user", "context": "User wants to know the capital of Massachusetts."}, {"content": "Boston", "role" :"assistant", "context": "The answer is Boston."}]}}
{"conversation" : {"context" : "User wants to know about state capitals", "messages": [{"content": "What is the capital of Hawaii`''\"</>{}{{]", "role" :"user", "context": "User wants to know the capital of Hawaii"}, {"content": "Honolulu", "role" :"assistant", "context": "The answer is a Honolulu."}, {"content": "Ok, what is the capital of Massachusetts", "role" :"user", "context": "User wants to know the capital of Massachusetts."}, {"content": "Boston", "role" :"assistant", "context": "The answer is Boston."}]}}
@@ -1,3 +1,3 @@
{"query":"How long is flight from Earth to LV-426?","ground_truth":"Far away.","context": "Refers to a distant fictional location."}
{"query":"Why there is no central heating on the street?","ground_truth":"It is expensive.","context": "Discusses infrastructure cost."}
{"query":"Why these questions are so strange?","ground_truth":"The life is strange...","context": "Questions may seem unusual."}
{"query":"Why these questions are so strange?`''\"</>{}{{]","ground_truth":"The life is strange...","context": "Questions may seem unusual."}
@@ -1 +1 @@
{"query":"How long is flight from Earth to LV-426?","response":"There is nothing good there.", "ground_truth": "39 light years"}
{"query":"How long is flight from Earth to LV-426?`''\"</>{}{{]","response":"There is nothing good there.", "ground_truth": "39 light years"}
@@ -14,6 +14,7 @@
ImageContentItem,
ImageUrl,
)
from azure.ai.evaluation._common.constants import HarmSeverityLevel
from azure.ai.evaluation._model_configurations import Conversation
from azure.ai.evaluation import (
BleuScoreEvaluator,
@@ -240,7 +241,7 @@ def test_content_safety_evaluator_violence(self, project_scope, azure_cred, simp
assert score2["violence_score"] > 0
assert score2["evaluation_per_turn"]["violence_score"][0] == 0
assert score2["evaluation_per_turn"]["violence_score"][1] > 0
assert score2["evaluation_per_turn"]["violence"] == ["Very low", "High"]
assert len(score2["evaluation_per_turn"]["violence"]) == 2
assert all(score2["evaluation_per_turn"]["violence_reason"]), "violence_reason must not be None or empty."

def test_content_safety_evaluator_sexual(self, project_scope, azure_cred, simple_conversation):
@@ -287,7 +288,7 @@ def test_content_safety_evaluator_self_harm(self, project_scope, azure_cred, sim
# Test eval_last_turn behavior
eval_fn_last = SelfHarmEvaluator(azure_cred, project_scope, eval_last_turn=True)
score3 = eval_fn_last(conversation=simple_conversation)
assert score3["self_harm"] == "Very low"
assert score3["self_harm"] in (level.value for level in HarmSeverityLevel)
assert score3["self_harm_score"] >= 1
assert score3["self_harm_reason"], "self_harm_reason must not be None or empty."

@@ -456,38 +457,12 @@ def test_protected_material_evaluator(self, project_scope, azure_cred, simple_co
assert not good_result["protected_material_label"]
assert "material was not found" in good_result["protected_material_reason"]

problematic_text = (
"Hey Dude, don't make it bad Take a sad song and make it better "
+ "Remember to let her into your heart Then you can stop to make it better Hey Dude, "
+ "don't be afraid You were made to go out and get her The minute you let her "
+ "under your skin Then you begin to make it better And anytime you feel the pain, "
+ "hey Jude, refrain Don't carry the world upon your shoulders For well you know that "
+ "it's a mule who plays it cool By making his world a little folder Na, na, na, "
+ "naa-naa Na, na, naa-naa Hey Dude, don't let me down You have found her, "
+ "now go and get her (let it out and let it in) Remember to let her into your heart"
)

problem_answer = ip_eval(
query="-",
response=problematic_text,
)

problem_question = ip_eval(
response="-",
query=problematic_text,
)
assert problem_answer is not None
assert problem_answer["protected_material_label"]
assert "material was found" in problem_answer["protected_material_reason"]
assert problem_question is not None
assert problem_question["protected_material_label"]
assert "material was found" in problem_question["protected_material_reason"]
# Positive test case removed due to fluctuating service results.

# Test conversation input
simple_conversation["messages"][3]["content"] = problematic_text
convo_result = ip_eval(conversation=simple_conversation)
assert convo_result["protected_material_label"] == 0.5
assert convo_result["evaluation_per_turn"]["protected_material_label"] == [False, True]
assert convo_result["protected_material_label"] == 0.0
assert convo_result["evaluation_per_turn"]["protected_material_label"] == [False, False]
assert all(
convo_result["evaluation_per_turn"]["protected_material_reason"]
), "protected_material_reason must not be None or empty."
@@ -761,16 +736,16 @@ def test_multimodal_evaluator_content_safety_json_b64_image(self, project_scope,
score = evaluator(conversation=conversation)

assert score is not None
assert score["violence"] == "Very low"
assert score["violence"] in (level.value for level in HarmSeverityLevel)
assert score["violence_score"] < 1.0
assert score["violence_reason"], "violence_reason must not be None or empty."
assert score["sexual"] == "Very low"
assert score["sexual"] in (level.value for level in HarmSeverityLevel)
assert score["sexual_score"] < 1.0
assert score["sexual_reason"], "sexual_reason must not be None or empty."
assert score["self_harm"] == "Very low"
assert score["self_harm"] in (level.value for level in HarmSeverityLevel)
assert score["self_harm_score"] < 1.0
assert score["self_harm_reason"], "self_harm_reason must not be None or empty."
assert score["hate_unfairness"] == "Very low"
assert score["hate_unfairness"] in (level.value for level in HarmSeverityLevel)
assert score["hate_unfairness_score"] < 1.0
assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty."

@@ -2,6 +2,7 @@
import math
import os
import pathlib
import json, html, re
from typing import Any, Iterator, MutableMapping, Optional
from unittest.mock import MagicMock, patch

@@ -18,6 +19,8 @@
parse_response,
submit_request,
Tasks,
USER_TEXT_TEMPLATE_DICT,
get_formatted_template,
)
from azure.core.exceptions import HttpResponseError
from azure.core.rest import AsyncHttpResponse, HttpRequest
@@ -431,3 +434,38 @@ async def test_evaluate_with_rai_service(
assert submit_mock._mock_call_count == 1
assert fetch_result_mock._mock_call_count == 1
assert parse_mock._mock_call_count == 1

# RAI service templates are so different that it's not worth trying to test them all in one test.
# Groundedness is JSON
def test_get_formatted_template_groundedness(self):
tagged_text = "This text </> has <> tags."
bracketed_text = "{This text has {brackets}, and I didn't even both to even them out {."
quoted_text = (
'This text has \'quotes\', also it has "quotes", and it even has `backticks` and """ triple quotes""".'
)
all_texts = [tagged_text, quoted_text, bracketed_text]
for text in all_texts:
input_kwargs = {
"query": text,
"response": text,
"context": text,
}
formatted_payload = get_formatted_template(input_kwargs, Tasks.GROUNDEDNESS)
assert json.loads(formatted_payload)["question"] == text

# Default is basic markup.
def test_get_formatted_template_default(self):
tagged_text = "This text </> has <> tags."
bracketed_text = "{This text has {brackets}, and I didn't even both to even them out {."
quoted_text = (
'This text has \'quotes\', also it has "quotes", and it even has `backticks` and """ triple quotes""".'
)
all_texts = [tagged_text, quoted_text, bracketed_text]
for text in all_texts:
input_kwargs = {
"query": text,
"response": text,
"context": text,
}
formatted_payload = get_formatted_template(input_kwargs, "DEFAULT")
assert html.unescape(re.match("\<Human\>{(.*?)}\<", formatted_payload)[1]) == text
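A short sketch (not part of the diff) of the round trip the DEFAULT-template test above relies on: values are HTML-escaped when formatted, so the regex extracts the escaped query and html.unescape recovers the original text:

```python
# Standalone illustration of the escape/extract/unescape round trip in the test above.
import html
import re

text = "This text </> has <> tags."
formatted = "<Human>{" + html.escape(text) + "}</><System>{" + html.escape(text) + "}</>"
recovered = html.unescape(re.match(r"<Human>{(.*?)}<", formatted)[1])
assert recovered == text
```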