@@ -26,7 +26,6 @@
 
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
-    EVALUATION_PASS_FAIL_MAPPING,
     EvaluationMetrics,
     DefaultOpenEncoding,
     Prefixes,
@@ -251,7 +250,7 @@ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
     """
     Aggregate binary output results (pass/fail) from evaluation dataframe.
 
-    For each evaluator, calculates the proportion of "pass" results.
+    For each evaluator, calculates the proportion of passed results.
 
     :param df: The dataframe of evaluation results.
     :type df: ~pandas.DataFrame
@@ -260,12 +259,12 @@ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
     """
     results = {}
 
-    # Find all columns that end with "_result"
-    result_columns = [col for col in df.columns if col.startswith("outputs.") and col.endswith("_result")]
+    # Find all columns that end with "_passed"
+    passed_columns = [col for col in df.columns if col.startswith("outputs.") and col.endswith("_passed")]
 
-    for col in result_columns:
+    for col in passed_columns:
         # Extract the evaluator name from the column name
-        # (outputs.<evaluator>.<metric>_result)
+        # (outputs.<evaluator>.<metric>_passed)
         parts = col.split(".")
         evaluator_name = None
         if len(parts) >= 3:
@@ -292,7 +291,7 @@ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
 
         # Calculate the proportion of EVALUATION_PASS_FAIL_MAPPING[True] results
        total_rows = len(df)
-        pass_count = value_counts.get(EVALUATION_PASS_FAIL_MAPPING[True], 0)
+        pass_count = df[col].sum()
         proportion = pass_count / total_rows if total_rows > 0 else 0.0
 
         # Set the result with the evaluator name as the key
@@ -384,10 +383,10 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     token_count_cols = _get_token_count_columns_to_exclude(df)
     handled_columns.extend(token_count_cols)
 
-    # Exclude threshold and result columns from aggregation
+    # Exclude threshold and passed columns from aggregation
     # These are per-row metadata, not metrics to be averaged
-    threshold_and_result_cols = [col for col in df.columns if col.endswith("_threshold") or col.endswith("_result")]
-    handled_columns.extend(threshold_and_result_cols)
+    threshold_and_passed_cols = [col for col in df.columns if col.endswith("_threshold") or col.endswith("_passed")]
+    handled_columns.extend(threshold_and_passed_cols)
 
     # For rest of metrics, we will calculate mean
     df.drop(columns=handled_columns, inplace=True)
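
For illustration, a minimal, self-contained sketch (not the SDK's actual code) of the aggregation that _aggregation_binary_output performs after this change: each boolean "outputs.<evaluator>.<metric>_passed" column is summed directly, so the string lookup through EVALUATION_PASS_FAIL_MAPPING is no longer needed. The sample column names and the result-key format are assumptions; the diff only states that results are keyed by evaluator name.

    from typing import Dict

    import pandas as pd


    def aggregate_passed_columns(df: pd.DataFrame) -> Dict[str, float]:
        # Mirrors the logic in the diff above: each "_passed" column is boolean,
        # so summing it counts the passed rows (True counts as 1).
        results: Dict[str, float] = {}
        passed_columns = [
            col for col in df.columns
            if col.startswith("outputs.") and col.endswith("_passed")
        ]
        for col in passed_columns:
            parts = col.split(".")  # outputs.<evaluator>.<metric>_passed
            if len(parts) >= 3:
                evaluator_name = parts[1]  # keyed by evaluator name, per the diff's comment
                total_rows = len(df)
                pass_count = int(df[col].sum())
                results[evaluator_name] = pass_count / total_rows if total_rows > 0 else 0.0
        return results


    # Toy input with an assumed evaluator/metric name:
    df = pd.DataFrame({"outputs.relevance.relevance_passed": [True, True, False, True]})
    print(aggregate_passed_columns(df))  # {'relevance': 0.75}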
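
Likewise, a sketch of the exclusion step in _aggregate_metrics under the same assumptions: "_threshold" and "_passed" columns are per-row metadata, so they are dropped before the remaining numeric columns are averaged. The toy column names below are illustrative only.

    import pandas as pd

    df = pd.DataFrame({
        "outputs.relevance.relevance": [4.0, 5.0, 2.0, 4.0],
        "outputs.relevance.relevance_threshold": [3, 3, 3, 3],
        "outputs.relevance.relevance_passed": [True, True, False, True],
    })

    # Drop per-row metadata columns, then average what remains.
    metadata_cols = [c for c in df.columns if c.endswith("_threshold") or c.endswith("_passed")]
    metrics = df.drop(columns=metadata_cols).mean(numeric_only=True).to_dict()
    print(metrics)  # {'outputs.relevance.relevance': 3.75}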