sdk/evaluation/azure-ai-evaluation/CHANGELOG.md (1 change: 1 addition, 0 deletions)
@@ -14,6 +14,7 @@

### Bugs Fixed

- `TaskNavigationEfficiencyEvaluator` now accepts JSON-stringified `response` and `ground_truth` inputs (e.g., from data pipelines that serialize list/tuple inputs to strings). String inputs are parsed as JSON; on parse failure the original value is preserved so downstream validation surfaces the error as before.
- Fixed error blame attribution in `_get_single_run_results` to perform a case-insensitive comparison when checking the AOAI error code for `UserError`, ensuring failed evaluation runs are correctly classified as user errors regardless of server-side casing (a sketch of the idea follows this list).
- Fixed the `deflection_rate` evaluator labeling every result "pass" regardless of the actual score: the inverse metric adjustment was overriding the evaluator's correct string labels.
- Fixed `evaluate()` raising `EvaluationException: (InternalError) unhashable type: 'list'` when an evaluator emitted a list value under a `_result`-suffixed column. Binary aggregation now skips such columns with a warning instead of aborting the entire run.
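
To make the second fix above concrete: the check reduces to comparing the error code case-insensitively. A minimal sketch, with illustrative names only (the real change lives in `_get_single_run_results`, which is not part of this diff):

```python
# Hypothetical sketch of case-insensitive error-code matching; the function
# name and signature are illustrative, not the SDK's actual code.
def is_user_error(error_code: object) -> bool:
    """Treat 'UserError' as a user error regardless of server-side casing."""
    return isinstance(error_code, str) and error_code.lower() == "usererror"

assert is_user_error("UserError")
assert is_user_error("USERERROR")
assert not is_user_error(None)
```

The diff below adds the JSON parsing described in the first bullet: first in the task navigation input validator, then in the evaluator itself, with a test at the end.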
@@ -5,6 +5,7 @@
Validator for task navigation inputs (actions and expected_actions).
"""

import json
from typing import Any, Dict, Optional
from typing_extensions import override
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
@@ -229,6 +230,15 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
        Raises:
            EvaluationException: If validation fails.
        """
        # If response or ground_truth is a string, try to parse it as JSON
        for key in ("response", "ground_truth"):
            value = eval_input.get(key)
            if isinstance(value, str):
                try:
                    eval_input[key] = json.loads(value)
                except (ValueError, TypeError):
                    # Not valid JSON: keep the original value so the existing
                    # validation below surfaces the error as before.
                    pass

        # Validate response
        response = eval_input.get("response")
        error = self._validate_response(response)
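
Pulled out of the diff for clarity, the added behavior is a parse-or-preserve step. A minimal standalone sketch (`_maybe_parse_json` is an illustrative name, not a helper in this PR):

```python
import json

def _maybe_parse_json(value):
    """Return json.loads(value) for strings that parse as JSON;
    otherwise return the value unchanged."""
    if isinstance(value, str):
        try:
            return json.loads(value)
        except (ValueError, TypeError):
            pass  # not JSON: preserve the original for downstream validation
    return value

assert _maybe_parse_json('["search", "analyze"]') == ["search", "analyze"]
assert _maybe_parse_json("not json") == "not json"  # preserved, not dropped
assert _maybe_parse_json(["already", "parsed"]) == ["already", "parsed"]
```
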
@@ -263,6 +263,15 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, Dict[str, float]]]:
        :return: The evaluation result.
        :rtype: Dict[str, Union[float, str, Dict[str, float]]]
        """
        # If response or ground_truth is a string, try to parse it as JSON
        for key in ("response", "ground_truth"):
            value = eval_input.get(key)
            if isinstance(value, str):
                try:
                    eval_input[key] = json.loads(value)
                except (ValueError, TypeError):
                    # Not valid JSON: leave the value as-is; validation will
                    # report it as before.
                    pass

        response = eval_input["response"]
        ground_truth = eval_input["ground_truth"]
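
Note that the same normalization appears in both `validate_eval_input` and `_do_eval`; presumably this keeps JSON-stringified inputs working whether a caller goes through the validating entry point or reaches the evaluation directly, at the cost of a small duplication.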

@@ -1,4 +1,5 @@
import pytest
import json
from azure.ai.evaluation._evaluators._task_navigation_efficiency import (
    _TaskNavigationEfficiencyEvaluator,
    _TaskNavigationEfficiencyMatchingMode,
@@ -37,6 +38,33 @@ def test_exact_match_scenario(self):
        assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0
        assert result["task_navigation_efficiency_properties"]["f1_score"] == 1.0

    def test_json_stringified_valid_inputs(self):
        """Test that JSON-stringified response and ground_truth are parsed and evaluated correctly."""
        evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)

        response = [
            {
                "role": "assistant",
                "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}],
            },
            {
                "role": "assistant",
                "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "analyze", "arguments": {}}],
            },
            {
                "role": "assistant",
                "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "report", "arguments": {}}],
            },
        ]
        ground_truth = ["search", "analyze", "report"]

        result = evaluator(response=json.dumps(response), ground_truth=json.dumps(ground_truth))
        assert result["task_navigation_efficiency_passed"] is True
        assert result["task_navigation_efficiency_result"] == "pass"
        assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0
        assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0
        assert result["task_navigation_efficiency_properties"]["f1_score"] == 1.0
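
    # A hypothetical companion test, not part of this PR: per the changelog,
    # a string that fails JSON parsing is preserved, so the existing response
    # validation should raise as before. Assumes EvaluationException (imported
    # from azure.ai.evaluation._exceptions) is the error surfaced.
    def test_json_stringified_invalid_input_is_preserved(self):
        """Sketch: a non-JSON string falls through to normal validation."""
        from azure.ai.evaluation._exceptions import EvaluationException

        evaluator = _TaskNavigationEfficiencyEvaluator(
            matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
        )
        with pytest.raises(EvaluationException):
            evaluator(response="not valid json", ground_truth='["search"]')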

    def test_in_order_match_with_extra_steps(self):
        """Test when agent has extra steps but maintains order."""
        evaluator = _TaskNavigationEfficiencyEvaluator(