
Commit 1480f19

Merge pull request #18 from YusakuNo1/azure_ai_evaluation
Introduce Azure AI Evaluation SDK
2 parents 62261f9 + fc35b75 commit 1480f19

1 file changed: 149 additions, 0 deletions
"""This sample demonstrates how to use the Azure AI Foundry SDK to evaluate a model from the GitHub model catalog.
It uses your endpoint and key, and the calls are synchronous.

If you have Azure credentials, you can also run the risk and safety evaluators from Azure AI.

Azure Evaluation SDK: https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/evaluate-sdk
"""

import os
import json
from pathlib import Path

from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
from azure.ai import evaluation
from azure.ai.evaluation import RougeType, evaluate
from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential

token = os.environ["GITHUB_TOKEN"]

# Target model is the model to be evaluated.
target_model_name = "Mistral-small"
target_model_endpoint = "https://models.inference.ai.azure.com"
# Judge model is the model used to evaluate the target model's responses.
judge_model_name = "gpt-4o-mini"
judge_model_endpoint = "https://models.inference.ai.azure.com"

evaluation_name = "GitHub models evaluation"
eval_data_file = Path("./eval_data.jsonl")
eval_result_file_perf_and_quality = Path("./eval_result_perf_and_quality.json")
eval_result_file_risk_and_safety = Path("./eval_result_risk_and_safety.json")
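
# A minimal sketch of how this sample is typically run (shell commands shown as comments;
# the script filename is illustrative, not part of this commit):
#
#   export GITHUB_TOKEN=<a GitHub token that can call the GitHub model catalog>
#   # Only needed for run_risk_and_safety_evaluators_with_azure():
#   export AZURE_SUBSCRIPTION_ID=... AZURE_RESOURCE_GROUP_NAME=... AZURE_PROJECT_NAME=...
#   python evaluation_sample.py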


def generate_eval_data():
    eval_data_queries = [{
        "query": "What is the capital of France?",
        "ground_truth": "Paris",
    }, {
        "query": "Where is Wineglass Bay?",
        "ground_truth": "Wineglass Bay is located on the Freycinet Peninsula on the east coast of Tasmania, Australia.",
    }]

    # A single client is enough for all queries against the target model.
    client = ChatCompletionsClient(
        endpoint=target_model_endpoint,
        credential=AzureKeyCredential(token),
    )

    with eval_data_file.open("w") as f:
        for i, eval_data_query in enumerate(eval_data_queries):
            context = "You are a geography teacher."
            response = client.complete(
                messages=[
                    SystemMessage(content=context),
                    UserMessage(content=eval_data_query["query"]),
                ],
                model=target_model_name,
                temperature=1.,
                max_tokens=1000,
                top_p=1.
            )
            result = response.choices[0].message.content

            eval_data = {
                "id": str(i + 1),  # unique id per record
                "description": "Evaluate the model",
                "query": eval_data_query["query"],
                "context": context,
                "response": result,
                "ground_truth": eval_data_query["ground_truth"],
            }
            f.write(json.dumps(eval_data) + "\n")
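
# For reference, each line written to eval_data.jsonl is one flat JSON record; the
# "response" value below is illustrative (the real one comes from the target model):
#
#   {"id": "1", "description": "Evaluate the model", "query": "What is the capital of France?",
#    "context": "You are a geography teacher.", "response": "The capital of France is Paris.",
#    "ground_truth": "Paris"}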


def run_perf_and_quality_evaluators():
    # Configuration for the judge model used by the AI-assisted evaluators below.
    model_config = {
        "azure_endpoint": judge_model_endpoint,
        "azure_deployment": judge_model_name,
        "api_key": token,
    }

    evaluators = {
        # Metric-based evaluators: compare the response with the ground truth directly.
        "BleuScoreEvaluator": evaluation.BleuScoreEvaluator(),
        "F1ScoreEvaluator": evaluation.F1ScoreEvaluator(),
        "GleuScoreEvaluator": evaluation.GleuScoreEvaluator(),
        "MeteorScoreEvaluator": evaluation.MeteorScoreEvaluator(),
        "RougeScoreEvaluator": evaluation.RougeScoreEvaluator(rouge_type=RougeType.ROUGE_L),
        # AI-assisted evaluators: call the judge model to grade each response.
        "CoherenceEvaluator": evaluation.CoherenceEvaluator(model_config=model_config),
        "FluencyEvaluator": evaluation.FluencyEvaluator(model_config=model_config),
        "GroundednessEvaluator": evaluation.GroundednessEvaluator(model_config=model_config),
        "QAEvaluator": evaluation.QAEvaluator(model_config=model_config, _parallel=False),
        "RelevanceEvaluator": evaluation.RelevanceEvaluator(model_config=model_config),
        "RetrievalEvaluator": evaluation.RetrievalEvaluator(model_config=model_config),
        "SimilarityEvaluator": evaluation.SimilarityEvaluator(model_config=model_config),
    }

    eval_results = evaluate(
        data=eval_data_file,
        evaluators=evaluators,
        evaluation_name=evaluation_name,
        target=None,
        output_path=eval_result_file_perf_and_quality,
    )
    print(json.dumps(eval_results, indent=4))
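
# A minimal sketch of calling a single evaluator directly instead of batching through
# evaluate(); evaluators are plain callables (the risk-and-safety function below relies on
# the same protocol). The texts are illustrative, and the exact keyword arguments depend
# on the azure-ai-evaluation version in use:
#
#   relevance = evaluation.RelevanceEvaluator(model_config=model_config)
#   score = relevance(query="What is the capital of France?",
#                     response="The capital of France is Paris.")
#   print(score)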


def run_risk_and_safety_evaluators_with_azure():
    azure_ai_project = {
        "subscription_id": os.environ.get("AZURE_SUBSCRIPTION_ID"),
        "resource_group_name": os.environ.get("AZURE_RESOURCE_GROUP_NAME"),
        "project_name": os.environ.get("AZURE_PROJECT_NAME"),
    }
    credential = DefaultAzureCredential()
    evaluators = {
        "ContentSafetyEvaluator": evaluation.ContentSafetyEvaluator(azure_ai_project=azure_ai_project, credential=credential),
        "HateUnfairnessEvaluator": evaluation.HateUnfairnessEvaluator(azure_ai_project=azure_ai_project, credential=credential),
        "SelfHarmEvaluator": evaluation.SelfHarmEvaluator(azure_ai_project=azure_ai_project, credential=credential),
        "SexualEvaluator": evaluation.SexualEvaluator(azure_ai_project=azure_ai_project, credential=credential),
        "ViolenceEvaluator": evaluation.ViolenceEvaluator(azure_ai_project=azure_ai_project, credential=credential),
        "ProtectedMaterialEvaluator": evaluation.ProtectedMaterialEvaluator(azure_ai_project=azure_ai_project, credential=credential),
        "IndirectAttackEvaluator": evaluation.IndirectAttackEvaluator(azure_ai_project=azure_ai_project, credential=credential),
        "GroundednessProEvaluator": evaluation.GroundednessProEvaluator(azure_ai_project=azure_ai_project, credential=credential),
    }

    risk_and_safety_result_dict = {}
    with eval_data_file.open("r") as f:
        for line in f:
            eval_data = json.loads(line)
            record_scores = {}
            for name, evaluator in evaluators.items():
                if name != "GroundednessProEvaluator":
                    score = evaluator(query=eval_data["query"], response=eval_data["response"])
                else:
                    # GroundednessProEvaluator also needs the grounding context.
                    score = evaluator(query=eval_data["query"], response=eval_data["response"], context=eval_data["context"])
                print(f"{name}: {score}")
                record_scores[name] = score
            # Key by record id so the scores from every record are kept.
            risk_and_safety_result_dict[eval_data["id"]] = record_scores

    with eval_result_file_risk_and_safety.open("w") as f:
        f.write(json.dumps(risk_and_safety_result_dict, indent=4))
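
# Note: DefaultAzureCredential resolves credentials from the environment (for example an
# `az login` session, a managed identity, or service-principal environment variables), and
# the AZURE_SUBSCRIPTION_ID / AZURE_RESOURCE_GROUP_NAME / AZURE_PROJECT_NAME variables
# above must point at an existing Azure AI project for these evaluators to run.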


if __name__ == "__main__":
    # Generate evaluation data with the GitHub model catalog and save it to a file.
    generate_eval_data()

    # Run the performance and quality evaluators with the GitHub model catalog.
    run_perf_and_quality_evaluators()

    # Uncomment the following line once Azure credentials are configured to run the
    # risk and safety evaluators from Azure AI.
    # run_risk_and_safety_evaluators_with_azure()
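
# Running the sample produces eval_data.jsonl and eval_result_perf_and_quality.json in the
# working directory, plus eval_result_risk_and_safety.json when the Azure step is enabled.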
