Merged
18 commits
3eb40a8
Update Tool Call Accuracy to output unified format
m7md7sien Apr 14, 2026
d3c4092
Update tests
m7md7sien Apr 15, 2026
d076d5c
Merge branch 'main' into mohessie/unify_output/tool_call_accuracy
m7md7sien Apr 15, 2026
5032e26
reformatting
m7md7sien Apr 15, 2026
a525806
Refactor not applicable result method calls
m7md7sien Apr 15, 2026
f454ee3
Fix test assertions for new unified output format and apply black for…
Copilot Apr 15, 2026
aa848fe
Rename tool_call_accuracy reasoning output to reason and update skipp…
Copilot Apr 16, 2026
83576b4
Merge branch 'main' into mohessie/unify_output/tool_call_accuracy
m7md7sien Apr 16, 2026
99deb99
Fix tool call accuracy test for skipped output schema (#46356)
Copilot Apr 16, 2026
1893bc8
Merge branch 'main' into mohessie/unify_output/tool_call_accuracy
m7md7sien Apr 19, 2026
1a7f191
Merge branch 'main' into mohessie/unify_output/tool_call_accuracy
m7md7sien Apr 21, 2026
da132f2
Add back backward-compatible base result keys for tool call accuracy …
Copilot Apr 21, 2026
adff374
Merge branch 'main' into mohessie/unify_output/tool_call_accuracy
m7md7sien Apr 21, 2026
3d2aaa1
Update documentation to state deprecate 'gpt_' prefix
m7md7sien Apr 23, 2026
56da1d6
Rename `_result` value from `not_applicable` to `pass` in `_return_no…
Copilot Apr 23, 2026
a288559
Add TODO for pass in _return_not_applicable_result
m7md7sien Apr 26, 2026
4db82df
Add back gpt_ key for backward compatibility.
m7md7sien Apr 26, 2026
7c1243a
Merge branch 'main' into mohessie/unify_output/tool_call_accuracy
m7md7sien Apr 26, 2026
@@ -438,3 +438,29 @@ def _not_applicable_result(
result[f"{self._result_key}_details"] = {}

return result

# TODO: After all evaluator outputs are updated, we can remove the _not_applicable_result method and replace calls to it with _return_not_applicable_result, which returns a "skipped" status instead of "pass" to avoid confusion.
def _return_not_applicable_result(
self, error_message: str, threshold: Union[int, float]
) -> Dict[str, Union[str, float, Dict, None]]:
"""Return a result indicating that the tool call is not applicable for evaluation.

:param error_message: The error message indicating why the evaluation is not applicable.
:type error_message: str
:param threshold: The threshold value for the evaluation.
:type threshold: Union[int, float]
:return: A dictionary containing the result of the evaluation.
:rtype: Dict[str, Union[str, float, Dict, None]]
"""
return {
f"{self._result_key}": None,
f"{self._result_key}_score": None,
# TODO: Return "not_applicable" instead of "pass" once the
# evaluation service accepts it as a valid result value.
f"{self._result_key}_result": "pass",
f"{self._result_key}_passed": None,
f"{self._result_key}_reason": f"Not applicable: {error_message}",
f"{self._result_key}_status": "skipped",
f"{self._result_key}_threshold": threshold,
f"{self._result_key}_properties": None,
}
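
For orientation, here is a minimal sketch of the dictionary this method produces, assuming a hypothetical result key of `tool_call_accuracy`, the missing-tool-definitions message, and a threshold of 3 (all illustrative, not taken from a real run):

```python
# Illustrative shape of _return_not_applicable_result's output;
# "tool_call_accuracy" stands in for self._result_key, and the values are assumed.
not_applicable_result = {
    "tool_call_accuracy": None,
    "tool_call_accuracy_score": None,
    # "pass" is a stopgap until the evaluation service accepts "not_applicable".
    "tool_call_accuracy_result": "pass",
    "tool_call_accuracy_passed": None,
    "tool_call_accuracy_reason": "Not applicable: Tool definitions for all tool calls must be provided.",
    "tool_call_accuracy_status": "skipped",
    "tool_call_accuracy_threshold": 3,
    "tool_call_accuracy_properties": None,
}
```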
@@ -66,10 +66,11 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):

.. note::

The output field "details" has been renamed to "tool_call_accuracy_details" for clarity.
The output field "details" has been renamed to "tool_call_accuracy_properties" for clarity.

To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
To align with our support of a diverse set of models,
an output key with "_score" suffix instead of the `gpt_` prefix has been added.
To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.

"""
@@ -86,7 +86,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
_TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided."
_INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5."

_LLM_SCORE_KEY = "tool_calls_success_level"
_LLM_SCORE_KEY = "score"

_validator: ValidatorInterface

@@ -230,10 +231,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t

# Check for intermediate response
if _is_intermediate_response(eval_input.get("response")):
return self._not_applicable_result(
return self._return_not_applicable_result(
"Intermediate response. Please provide the agent's final response for evaluation.",
self.threshold,
has_details=True,
)

# Preprocess messages if they are lists
@@ -256,6 +256,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
if isinstance(llm_output, dict):
# Handle skipped status from LLM
llm_status = llm_output.get("status", "completed")
if llm_status == "skipped":
reason = llm_output.get("reason", "")
return self._return_not_applicable_result(reason, self.threshold)

score = llm_output.get(self._LLM_SCORE_KEY, None)
if not score or not check_score_is_valid(
score,
@@ -271,23 +277,32 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
)

# Format the output
reason = llm_output.get("chain_of_thought", "")
reason = llm_output.get("reason", "")
score = float(score)
score_result = "pass" if score >= self.threshold else "fail"
llm_properties = llm_output.get("properties", {}) or {}
llm_properties.update(
{
"prompt_tokens": prompty_output_dict.get("input_token_count", 0),
"completion_tokens": prompty_output_dict.get("output_token_count", 0),
"total_tokens": prompty_output_dict.get("total_token_count", 0),
"finish_reason": prompty_output_dict.get("finish_reason", ""),
"model": prompty_output_dict.get("model_id", ""),
"sample_input": prompty_output_dict.get("sample_input", ""),
"sample_output": prompty_output_dict.get("sample_output", ""),
}
)
response_dict = {
self._result_key: score,
# The "gpt_" prefixed key is maintained for backwards compatibility but is deprecated.
f"gpt_{self._result_key}": score,
f"{self._result_key}_score": score,
f"{self._result_key}_result": score_result,
f"{self._result_key}_threshold": self._threshold,
f"{self._result_key}_passed": score_result == "pass",
f"{self._result_key}_reason": reason,
f"{self._result_key}_details": llm_output.get("details", {}),
f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
f"{self._result_key}_status": "completed",
f"{self._result_key}_threshold": self._threshold,
f"{self._result_key}_properties": llm_properties,
}
return response_dict
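
To make the reshaped output concrete, the following sketch shows roughly what a completed evaluation now returns, with token counts and model metadata folded into the single `_properties` dictionary rather than exposed as individual top-level keys (all values are invented for illustration):

```python
# Illustrative unified result for a completed evaluation; numbers and model id are made up.
completed_result = {
    "tool_call_accuracy": 5.0,
    "gpt_tool_call_accuracy": 5.0,        # deprecated alias, kept for backwards compatibility
    "tool_call_accuracy_score": 5.0,
    "tool_call_accuracy_result": "pass",
    "tool_call_accuracy_passed": True,
    "tool_call_accuracy_reason": "Let's think step by step: ...",
    "tool_call_accuracy_status": "completed",
    "tool_call_accuracy_threshold": 3,
    "tool_call_accuracy_properties": {
        "tool_calls_made_by_agent": 1,
        "correct_tool_calls_made_by_agent": 1,
        "prompt_tokens": 1200,
        "completion_tokens": 85,
        "total_tokens": 1285,
        "finish_reason": "stop",
        "model": "<model-id>",   # hypothetical model identifier
        "sample_input": "...",
        "sample_output": "...",
    },
}
```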

@@ -314,7 +329,7 @@ async def _real_call(self, **kwargs):
eval_input = self._convert_kwargs_to_eval_input(**kwargs)
if isinstance(eval_input, dict) and eval_input.get("error_message"):
# If there is an error message, return not applicable result
return self._not_applicable_result(eval_input.get("error_message"), self.threshold, has_details=True)
return self._return_not_applicable_result(eval_input.get("error_message"), self.threshold)
# Do the evaluation
result = await self._do_eval(eval_input)
# Return the result
@@ -54,6 +54,16 @@ Evaluate based on these factors:

**Tool Assessment**: Focus solely on appropriate use of available tools, not on capabilities beyond what tools can provide.

## Status: Skipped
Before performing any evaluation, check for the following conditions. If ANY are true, return `status: "skipped"` immediately without scoring:
1. **No tool calls to evaluate**: The TOOL CALLS TO BE EVALUATED section is empty (tool calls appearing only in the CONVERSATION section do not count).
2. **Missing tool definitions**: Any tool call in TOOL CALLS TO BE EVALUATED references a tool that is not present in the TOOL DEFINITIONS.

When skipped, return:
```json
{"reason": "<explain why evaluation was skipped>", "score": null, "status": "skipped", "properties": null}
```


# Ratings
## [Tool Call Accuracy: 1] (Irrelevant)
@@ -139,10 +149,13 @@ TOOL DEFINITIONS: {{tool_definitions}}

# Tasks
## Please provide your evaluation for the assistant RESPONSE in relation to the user QUERY and tool definitions based on the Definitions and examples above.
Your output should consist only of a JSON object, as provided in the examples, that has the following keys:
- chain_of_thought: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'.
- tool_calls_success_level: a integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level.
- details: a dictionary that contains the following keys:
Your output should consist only of a JSON object that has the following keys:
- reason: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'. When status is "skipped", explain why the evaluation was skipped.
- score: an integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level. Set to null when status is "skipped".
- status: a string indicating the evaluation status. Must be one of:
- "completed": tool calls were present, tool definitions were available, and evaluation was performed.
- "skipped": evaluation was not performed because there were no tool calls to evaluate, or tool definitions were missing for the tool calls. When skipped, set score to null and properties to null.
- properties: a dictionary that contains the following keys:
- tool_calls_made_by_agent: total number of tool calls made by the agent
- correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent
- per_tool_call_details: a list of dictionaries, each containing:
@@ -163,4 +176,4 @@ Your output should consist only of a JSON object, as provided in the examples, t
- tool_name: name of the tool
- missing_count: number of missing calls for this query

# Output
# Output
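
For contrast with the skipped example in the template, a completed response from the model is expected to look roughly like the following, shown here as the Python dict the evaluator parses it into (values invented; the full `properties` schema is the one listed in the template above):

```python
# Illustrative completed-status LLM output after JSON parsing; values are invented.
llm_output = {
    "reason": "Let's think step by step: the agent called the correct tool with valid parameters.",
    "score": 5,
    "status": "completed",
    "properties": {
        "tool_calls_made_by_agent": 1,
        "correct_tool_calls_made_by_agent": 1,
        "per_tool_call_details": [],
    },
}
```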
@@ -66,9 +66,9 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config):
}
],
)
assert (
result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
)
assert result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_score"] is None
assert result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_result"] == "pass"
assert result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_status"] == "skipped"
assert (
"not applicable" in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"].lower()
and ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
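
A complementary test for the completed path (hypothetical; not part of this diff) could assert the unified keys in the same style:

```python
# Hypothetical assertions against a completed-evaluation result; not part of this PR.
key = ToolCallAccuracyEvaluator._RESULT_KEY
assert result[f"{key}_status"] == "completed"
assert result[f"{key}_result"] in ("pass", "fail")
assert result[f"{key}_score"] == result[key]
assert isinstance(result[f"{key}_properties"], dict)
```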