Changes from all commits
36 commits
3eb40a8
Update Tool Call Accuracy to output unified format
m7md7sien Apr 14, 2026
d3c4092
Update tests
m7md7sien Apr 15, 2026
d076d5c
Merge branch 'main' into mohessie/unify_output/tool_call_accuracy
m7md7sien Apr 15, 2026
5032e26
reformatting
m7md7sien Apr 15, 2026
a525806
Refactor not applicable result method calls
m7md7sien Apr 15, 2026
f454ee3
Fix test assertions for new unified output format and apply black for…
Copilot Apr 15, 2026
aa848fe
Rename tool_call_accuracy reasoning output to reason and update skipp…
Copilot Apr 16, 2026
83576b4
Merge branch 'main' into mohessie/unify_output/tool_call_accuracy
m7md7sien Apr 16, 2026
99deb99
Fix tool call accuracy test for skipped output schema (#46356)
Copilot Apr 16, 2026
1893bc8
Merge branch 'main' into mohessie/unify_output/tool_call_accuracy
m7md7sien Apr 19, 2026
d821299
Standradize Output Scheme
m7md7sien Apr 19, 2026
c489326
Merge branch 'mohessie/unify_output/tool_call_accuracy' into mohessie…
m7md7sien Apr 19, 2026
c94d4e7
Add explicit _KEY_PREFIX/_RESULT_KEY
m7md7sien Apr 21, 2026
f873be8
add missing evaluators to init
m7md7sien Apr 21, 2026
e91a4c7
Align evaluator unit tests with new unified output schema
m7md7sien Apr 21, 2026
55b365f
Update recordings tag to solve e2e tests
m7md7sien Apr 21, 2026
8726b93
Run formatting
m7md7sien Apr 21, 2026
73becf1
Align evaluator unit tests with unified output schema and refresh rec…
m7md7sien Apr 21, 2026
b4740de
Merge branch 'main' into mohessie/standardize_output_schema
m7md7sien Apr 21, 2026
c357054
Restore legacy `_result` and bare evaluator-name keys for backward co…
m7md7sien Apr 21, 2026
d7f459b
resolve conflict
m7md7sien Apr 21, 2026
5cb47ec
Refresh azure-ai-evaluation test recordings for standardized evaluato…
m7md7sien Apr 22, 2026
d91cbf6
Merge branch 'main' into mohessie/standardize_output_schema
m7md7sien Apr 22, 2026
bcef3b2
Merge branches 'mohessie/standardize_output_schema' and 'mohessie/sta…
m7md7sien Apr 22, 2026
b14f476
Merge branch 'main' into mohessie/standardize_output_schema
m7md7sien Apr 22, 2026
76cdaf5
Update multimodal test assertion for new schema and refresh recording…
m7md7sien Apr 22, 2026
519c97c
Remove unused label assignment in navigation efficiency
m7md7sien Apr 22, 2026
0012f79
Merge branch 'main' into mohessie/standardize_output_schema
m7md7sien May 11, 2026
5031bf2
update _return_not_applicable_result
m7md7sien May 11, 2026
8fd256d
Return "not_applicable" instead of "pass"
m7md7sien May 11, 2026
9bc6470
update evaluators
m7md7sien May 11, 2026
0cca651
Fix error
m7md7sien May 11, 2026
3a50a21
Add results back
m7md7sien May 11, 2026
0de6927
undo unrelated change
m7md7sien May 11, 2026
1eecdbd
undo key_prefix change
m7md7sien May 11, 2026
36e0cbb
Revert `_evaluate.py` changes from #46436 on `mohessie/standardize_ou…
Copilot May 11, 2026
2 changes: 1 addition & 1 deletion sdk/evaluation/azure-ai-evaluation/assets.json
Original file line number Diff line number Diff line change
@@ -3,4 +3,4 @@
"AssetsRepoPrefixPath": "python",
"TagPrefix": "python/evaluation/azure-ai-evaluation",
"Tag": "python/evaluation/azure-ai-evaluation_0748353c8d"
}
}
Original file line number Diff line number Diff line change
@@ -6,9 +6,9 @@
from typing_extensions import overload, override

from azure.ai.evaluation._common.utils import nltk_tokenize
from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING

from azure.ai.evaluation._evaluators._common import EvaluatorBase
from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING


class BleuScoreEvaluator(EvaluatorBase):
@@ -87,9 +87,14 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
binary_result = score <= self._threshold

return {
"bleu": score,
"bleu_score": score,
"bleu_passed": binary_result,
"bleu_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
"bleu_reason": None,
"bleu_status": "completed",
"bleu_threshold": self._threshold,
"bleu_properties": None,
}

@overload # type: ignore
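The return dict above shows the unified per-metric output convention this PR rolls out for the BLEU evaluator: a `<metric>_score` value plus `_passed`, `_result`, `_reason`, `_status`, `_threshold`, and `_properties` companions. A minimal sketch of what a caller might see, with made-up values rather than anything taken from a real run:

# Illustrative only: shape of the unified BLEU output implied by the diff above.
# The score/threshold values are invented; the pass/fail direction follows the
# evaluator's own threshold check.
example_bleu_output = {
    "bleu_score": 0.42,          # computed BLEU score
    "bleu_passed": True,         # whether the score satisfied the threshold check
    "bleu_result": "pass",       # EVALUATION_PASS_FAIL_MAPPING[binary_result]
    "bleu_reason": None,         # metric-based evaluator, no LLM reasoning
    "bleu_status": "completed",  # "completed" vs. "skipped"
    "bleu_threshold": 0.5,       # the evaluator's configured threshold
    "bleu_properties": None,     # no extra metadata for BLEU
}
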
Original file line number Diff line number Diff line change
@@ -10,7 +10,7 @@ model:
presence_penalty: 0
frequency_penalty: 0
response_format:
type: text
type: json_object

inputs:
query:
@@ -89,11 +89,12 @@ RESPONSE: {{response}}


# Tasks
## Please provide your assessment Score for the previous RESPONSE in relation to the QUERY based on the Definitions above. Your output should include the following information:
- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:".
- **Explanation**: a very short explanation of why you think the input Data should get that Score.
- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions.
## Please provide your assessment for the previous RESPONSE in relation to the QUERY based on the Definitions above.
Your output must be a valid JSON object with exactly these keys:
- reason: a string explaining your thought process and assessment. Start with "Let's think step by step:". When status is "skipped", explain why evaluation was skipped.
- score: an integer value between 1 and 5 based on the level definitions above. The score you give MUST be an integer score (i.e., 1, 2...) based on the levels of the definitions. Set to null when status is "skipped".
- status: a string indicating the evaluation status. Must be one of:
- "completed": evaluation was performed normally.
- "skipped": evaluation was not performed because the QUERY or RESPONSE is empty or not provided. When skipped, set score to null.


## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your Score</S2>.
# Output
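
Switching `response_format` to `json_object` means the model is asked for a single JSON object instead of the old `<S0>/<S1>/<S2>` tags. A hedged sketch of the two shapes the new instructions describe, with invented field values, written as Python dicts:

# Illustrative only: the "completed" and "skipped" shapes described by the updated prompt.
completed_example = {
    "reason": "Let's think step by step: the response addresses the query directly ...",
    "score": 4,            # integer 1-5 per the level definitions
    "status": "completed",
}

skipped_example = {
    "reason": "Let's think step by step: the RESPONSE is empty, so evaluation was skipped.",
    "score": None,         # null when status is "skipped"
    "status": "skipped",
}
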
Original file line number Diff line number Diff line change
@@ -619,35 +619,43 @@ async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], Aggrega
for eval_input in eval_input_list:
result = await self._do_eval(eval_input)
# logic to determine threshold pass/fail
# if it wasn't computed in _do_eval
try:
for key in list(result.keys()):
if key.endswith("_score") and "rouge" not in key:
score_value = result[key]
base_key = key[:-6] # Remove "_score" suffix
result_key = f"{base_key}_result"
threshold_key = f"{base_key}_threshold"
threshold_value = (
self._threshold.get(base_key) if isinstance(self._threshold, dict) else self._threshold
)
if not isinstance(threshold_value, (int, float)):
raise EvaluationException(
"Threshold value must be a number.",
internal_message=str(threshold_value),
target=ErrorTarget.EVALUATE,
category=ErrorCategory.INVALID_VALUE,
keys = list(result.keys())
contains_result_key = any(key.endswith("_result") for key in keys)
contains_threshold_key = any(key.endswith("_threshold") for key in keys)
if not contains_result_key or not contains_threshold_key:
for key in keys:
if key.endswith("_score"):
score_value = result[key]
base_key = key[:-6] # Remove "_score" suffix
result_key = f"{base_key}_result"
threshold_key = f"{base_key}_threshold"
threshold_value = (
self._threshold.get(base_key) if isinstance(self._threshold, dict) else self._threshold
)

result[threshold_key] = threshold_value
if self._higher_is_better:
if float(score_value) >= threshold_value:
result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
else:
result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
else:
if float(score_value) <= threshold_value:
result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
else:
result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
if not isinstance(threshold_value, (int, float)):
raise EvaluationException(
"Threshold value must be a number.",
internal_message=str(threshold_value),
target=ErrorTarget.EVALUATE,
category=ErrorCategory.INVALID_VALUE,
)

if not contains_threshold_key:
result[threshold_key] = threshold_value

if not contains_result_key:
if self._higher_is_better:
if float(score_value) >= threshold_value:
result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
else:
result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
else:
if float(score_value) <= threshold_value:
result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
else:
result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
except Exception as e:
logger.warning(f"Error calculating binary result: {e}")
per_turn_results.append(result)
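The reworked block above back-fills `<metric>_result` and `<metric>_threshold` only when `_do_eval` did not already emit them, instead of always recomputing. A standalone sketch of that rule under the same assumptions (single numeric threshold, a higher-is-better flag); the function name is illustrative, and the real logic stays inline in `_real_call`:

# Illustrative sketch: back-fill result/threshold fields for each "<metric>_score" key.
def backfill_threshold_fields(result: dict, threshold: float, higher_is_better: bool) -> dict:
    keys = list(result.keys())
    has_result = any(k.endswith("_result") for k in keys)
    has_threshold = any(k.endswith("_threshold") for k in keys)
    if has_result and has_threshold:
        return result  # evaluator already produced the unified fields
    for key in keys:
        if key.endswith("_score"):
            base = key[: -len("_score")]
            score = float(result[key])
            if not has_threshold:
                result[f"{base}_threshold"] = threshold
            if not has_result:
                passed = score >= threshold if higher_is_better else score <= threshold
                result[f"{base}_result"] = "pass" if passed else "fail"
    return result
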
Original file line number Diff line number Diff line change
@@ -2,6 +2,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

import json
import math
import re
import os
@@ -201,7 +202,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t

# Check for intermediate response
if _is_intermediate_response(eval_input.get("response")):
return self._not_applicable_result(
return self._return_not_applicable_result(
"Intermediate response. Please provide the agent's final response for evaluation.",
self._threshold,
)
@@ -216,59 +217,83 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)

score = math.nan
reason = ""
llm_properties = {}

if prompty_output_dict:
llm_output = prompty_output_dict.get("llm_output", "")
input_token_count = prompty_output_dict.get("input_token_count", 0)
output_token_count = prompty_output_dict.get("output_token_count", 0)
total_token_count = prompty_output_dict.get("total_token_count", 0)
finish_reason = prompty_output_dict.get("finish_reason", "")
model_id = prompty_output_dict.get("model_id", "")
sample_input = prompty_output_dict.get("sample_input", "")
sample_output = prompty_output_dict.get("sample_output", "")
# Parse out score and reason from evaluators known to possess them.
if self._result_key in PROMPT_BASED_REASON_EVALUATORS:
score, reason = parse_quality_evaluator_reason_score(llm_output)
binary_result = self._get_binary_result(score)
return {
self._result_key: float(score),
f"gpt_{self._result_key}": float(score),
f"{self._result_key}_reason": reason,
f"{self._result_key}_result": binary_result,
f"{self._result_key}_threshold": self._threshold,
f"{self._result_key}_prompt_tokens": input_token_count,
f"{self._result_key}_completion_tokens": output_token_count,
f"{self._result_key}_total_tokens": total_token_count,
f"{self._result_key}_finish_reason": finish_reason,
f"{self._result_key}_model": model_id,
f"{self._result_key}_sample_input": sample_input,
f"{self._result_key}_sample_output": sample_output,
}
match = re.search(r"\d", llm_output)
if match:
score = float(match.group())
binary_result = self._get_binary_result(score)

# Parse JSON output from LLM
parsed_output = None
if isinstance(llm_output, dict):
parsed_output = llm_output
elif isinstance(llm_output, str):
try:
parsed_output = json.loads(llm_output)
except (json.JSONDecodeError, TypeError):
parsed_output = None

if parsed_output and isinstance(parsed_output, dict):
# Handle skipped status from LLM
llm_status = parsed_output.get("status", "completed")
if llm_status == "skipped":
skip_reason = parsed_output.get("reason", "")
return self._return_not_applicable_result(skip_reason, self._threshold)

score = parsed_output.get("score", math.nan)
reason = parsed_output.get("reason", "")
llm_properties = parsed_output.get("properties", {}) or {}
else:
# Fallback: try to parse legacy XML format or extract digit
if isinstance(llm_output, str) and self._result_key in PROMPT_BASED_REASON_EVALUATORS:
score, reason = parse_quality_evaluator_reason_score(llm_output)
elif isinstance(llm_output, str):
match = re.search(r"\d", llm_output)
if match:
score = float(match.group())

score = float(score) if score is not None else math.nan
score_result = self._get_binary_result(score)

llm_properties.update(self._get_token_metadata(prompty_output_dict))

return {
self._result_key: float(score),
f"gpt_{self._result_key}": float(score),
f"{self._result_key}_result": binary_result,
self._result_key: score,
f"{self._result_key}_score": score,
f"{self._result_key}_passed": score_result == "pass",
f"{self._result_key}_result": score_result,
f"{self._result_key}_reason": reason,
f"{self._result_key}_status": "completed",
f"{self._result_key}_threshold": self._threshold,
f"{self._result_key}_prompt_tokens": input_token_count,
f"{self._result_key}_completion_tokens": output_token_count,
f"{self._result_key}_total_tokens": total_token_count,
f"{self._result_key}_finish_reason": finish_reason,
f"{self._result_key}_model": model_id,
f"{self._result_key}_sample_input": sample_input,
f"{self._result_key}_sample_output": sample_output,
f"{self._result_key}_properties": llm_properties,
}

binary_result = self._get_binary_result(score)
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.EVALUATE,
)

@staticmethod
def _get_token_metadata(prompty_output: Dict) -> Dict:
"""Extract token usage and model metadata from the prompty output dict.

:param prompty_output: The raw output dictionary from the prompty flow.
:type prompty_output: Dict
:return: A dictionary with token counts, finish reason, model, and sample I/O.
:rtype: Dict
"""
return {
"prompt_tokens": prompty_output.get("input_token_count", 0),
"completion_tokens": prompty_output.get("output_token_count", 0),
"total_tokens": prompty_output.get("total_token_count", 0),
"finish_reason": prompty_output.get("finish_reason", ""),
"model": prompty_output.get("model_id", ""),
"sample_input": prompty_output.get("sample_input", ""),
"sample_output": prompty_output.get("sample_output", ""),
}
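
The `_do_eval` changes above try the new JSON contract first (a dict output, or a string that parses as JSON), honor a model-reported "skipped" status, and only then fall back to legacy parsing. A condensed sketch of that precedence; `parse_llm_output` is a hypothetical helper, and the `PROMPT_BASED_REASON_EVALUATORS` tag-parsing branch is omitted for brevity:

import json
import math
import re

def parse_llm_output(llm_output):
    """Illustrative: return (score, reason, status) following the precedence in the diff above."""
    parsed = None
    if isinstance(llm_output, dict):
        parsed = llm_output
    elif isinstance(llm_output, str):
        try:
            parsed = json.loads(llm_output)
        except (json.JSONDecodeError, TypeError):
            parsed = None

    if isinstance(parsed, dict):
        if parsed.get("status", "completed") == "skipped":
            return None, parsed.get("reason", ""), "skipped"
        return parsed.get("score", math.nan), parsed.get("reason", ""), "completed"

    # Legacy fallback: take the first digit found in the raw text.
    if isinstance(llm_output, str):
        match = re.search(r"\d", llm_output)
        if match:
            return float(match.group()), "", "completed"
    return math.nan, "", "completed"
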

@staticmethod
def _get_built_in_tool_definition(tool_name: str):
"""Get the definition for the built-in tool."""
@@ -401,45 +426,6 @@ def _extract_needed_tool_definitions(

return needed_tool_definitions

def _not_applicable_result(
self, error_message: str, threshold: Union[int, float], has_details: bool = False
) -> Dict[str, Union[str, int, float, Dict]]:
"""Return a result indicating that the evaluation is not applicable.

When evaluation cannot be performed (e.g., no tool calls, missing definitions),
this returns the threshold value as the score with a "pass" result.

:param error_message: The error message explaining why evaluation is not applicable.
:type error_message: str
:param threshold: The threshold value for the evaluator, used as the score.
:type threshold: Union[int, float]
:param has_details: Whether to include an empty details field in the result.
:type has_details: bool
:return: A dictionary containing the result of the evaluation.
:rtype: Dict[str, Union[str, float, Dict]]
"""
# If no tool calls were made or tool call type is not supported, return threshold as score with pass result
result = {
self._result_key: threshold,
f"{self._result_key}_result": "pass",
f"{self._result_key}_threshold": threshold,
f"{self._result_key}_reason": f"Not applicable: {error_message}",
f"{self._result_key}_prompt_tokens": 0,
f"{self._result_key}_completion_tokens": 0,
f"{self._result_key}_total_tokens": 0,
f"{self._result_key}_finish_reason": "",
f"{self._result_key}_model": "",
f"{self._result_key}_sample_input": "",
f"{self._result_key}_sample_output": "",
}

# Add empty details field if requested
if has_details:
result[f"{self._result_key}_details"] = {}

return result

# TODO: After all evaluator outputs are updated, we can remove the _not_applicable_result method and replace calls to it with _return_not_applicable_result, which returns a "skipped" status instead of "pass" to avoid confusion.
def _return_not_applicable_result(
self, error_message: str, threshold: Union[int, float]
) -> Dict[str, Union[str, float, Dict, None]]:
@@ -455,9 +441,7 @@ def _return_not_applicable_result(
return {
f"{self._result_key}": None,
f"{self._result_key}_score": None,
# TODO: Return "not_applicable" instead of "pass" once the
# evaluation service accepts it as a valid result value.
f"{self._result_key}_result": "pass",
f"{self._result_key}_result": "not_applicable",
f"{self._result_key}_passed": None,
f"{self._result_key}_reason": f"Not applicable: {error_message}",
f"{self._result_key}_status": "skipped",
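For a result key such as "tool_call_accuracy", the updated `_return_not_applicable_result` now reports a skipped outcome along these lines (the key name and reason text are illustrative, and keys beyond those visible in the diff are omitted):

# Illustrative only: shape of a skipped / not-applicable result per the diff above.
example_not_applicable = {
    "tool_call_accuracy": None,
    "tool_call_accuracy_score": None,
    "tool_call_accuracy_result": "not_applicable",
    "tool_call_accuracy_passed": None,
    "tool_call_accuracy_reason": "Not applicable: No tool calls found in the response.",
    "tool_call_accuracy_status": "skipped",
}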