@@ -26,7 +26,6 @@
 
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
-    EVALUATION_PASS_FAIL_MAPPING,
     EvaluationMetrics,
     DefaultOpenEncoding,
     Prefixes,
@@ -251,7 +250,7 @@ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
     """
     Aggregate binary output results (pass/fail) from evaluation dataframe.
 
-    For each evaluator, calculates the proportion of "pass" results.
+    For each evaluator, calculates the proportion of passed results.
 
     :param df: The dataframe of evaluation results.
     :type df: ~pandas.DataFrame
@@ -260,12 +259,12 @@ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
     """
     results = {}
 
-    # Find all columns that end with "_result"
-    result_columns = [col for col in df.columns if col.startswith("outputs.") and col.endswith("_result")]
+    # Find all columns that end with "_passed"
+    passed_columns = [col for col in df.columns if col.startswith("outputs.") and col.endswith("_passed")]
 
-    for col in result_columns:
+    for col in passed_columns:
         # Extract the evaluator name from the column name
-        # (outputs.<evaluator>.<metric>_result)
+        # (outputs.<evaluator>.<metric>_passed)
         parts = col.split(".")
         evaluator_name = None
         if len(parts) >= 3:
@@ -292,7 +291,7 @@ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
 
         # Calculate the proportion of EVALUATION_PASS_FAIL_MAPPING[True] results
        total_rows = len(df)
-        pass_count = value_counts.get(EVALUATION_PASS_FAIL_MAPPING[True], 0)
+        pass_count = df[col].sum()
         proportion = pass_count / total_rows if total_rows > 0 else 0.0
 
         # Set the result with the evaluator name as the key
@@ -384,10 +383,10 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     token_count_cols = _get_token_count_columns_to_exclude(df)
     handled_columns.extend(token_count_cols)
 
-    # Exclude threshold and result columns from aggregation
+    # Exclude threshold and passed columns from aggregation
     # These are per-row metadata, not metrics to be averaged
-    threshold_and_result_cols = [col for col in df.columns if col.endswith("_threshold") or col.endswith("_result")]
-    handled_columns.extend(threshold_and_result_cols)
+    threshold_and_passed_cols = [col for col in df.columns if col.endswith("_threshold") or col.endswith("_passed")]
+    handled_columns.extend(threshold_and_passed_cols)
 
     # For rest of metrics, we will calculate mean
     df.drop(columns=handled_columns, inplace=True)
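
For illustration, a minimal, self-contained sketch (not the SDK's actual code) of the aggregation that _aggregation_binary_output performs after this change: each boolean "outputs.<evaluator>.<metric>_passed" column is summed directly, so the string lookup through EVALUATION_PASS_FAIL_MAPPING is no longer needed. The sample column names and the result-key format are assumptions; the diff only states that results are keyed by evaluator name.

    from typing import Dict

    import pandas as pd


    def aggregate_passed_columns(df: pd.DataFrame) -> Dict[str, float]:
        # Mirrors the logic in the diff above: each "_passed" column is boolean,
        # so summing it counts the passed rows (True counts as 1).
        results: Dict[str, float] = {}
        passed_columns = [
            col for col in df.columns
            if col.startswith("outputs.") and col.endswith("_passed")
        ]
        for col in passed_columns:
            parts = col.split(".")  # outputs.<evaluator>.<metric>_passed
            if len(parts) >= 3:
                evaluator_name = parts[1]  # keyed by evaluator name, per the diff's comment
                total_rows = len(df)
                pass_count = int(df[col].sum())
                results[evaluator_name] = pass_count / total_rows if total_rows > 0 else 0.0
        return results


    # Toy input with an assumed evaluator/metric name:
    df = pd.DataFrame({"outputs.relevance.relevance_passed": [True, True, False, True]})
    print(aggregate_passed_columns(df))  # {'relevance': 0.75}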
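
Likewise, a sketch of the exclusion step in _aggregate_metrics under the same assumptions: "_threshold" and "_passed" columns are per-row metadata, so they are dropped before the remaining numeric columns are averaged. The toy column names below are illustrative only.

    import pandas as pd

    df = pd.DataFrame({
        "outputs.relevance.relevance": [4.0, 5.0, 2.0, 4.0],
        "outputs.relevance.relevance_threshold": [3, 3, 3, 3],
        "outputs.relevance.relevance_passed": [True, True, False, True],
    })

    # Drop per-row metadata columns, then average what remains.
    metadata_cols = [c for c in df.columns if c.endswith("_threshold") or c.endswith("_passed")]
    metrics = df.drop(columns=metadata_cols).mean(numeric_only=True).to_dict()
    print(metrics)  # {'outputs.relevance.relevance': 3.75}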