#!/bin/bash
# eval.sh — MCP-unique benchmark evaluator for CCX-compliance-124
# Exit-code-first (SWE-Factory pattern):
#   exit 0 — agent produced useful output (composite score > 0)
#   exit 1 — total failure (composite score == 0 or missing answer)
#
# Writes /logs/verifier/reward.txt with the composite score [0.0, 1.0]
# and /logs/verifier/validation_result.json with canonical verifier semantics.

set -euo pipefail

readonly TASK_ID="CCX-compliance-124"

# Resolve configurable paths from the environment, falling back to the
# container defaults. ${VAR:-default} substitutes when VAR is unset OR
# empty and is safe under `set -u`, so no printenv/test chains are needed.
TASK_WORKDIR="${TASK_WORKDIR:-/workspace}"
TASK_REPO_ROOT="${TASK_REPO_ROOT:-${VERIFY_REPO:-$TASK_WORKDIR}}"
TASK_OUTPUT="${TASK_OUTPUT:-$TASK_WORKDIR/answer.json}"

ANSWER_PATH="$TASK_OUTPUT"
readonly TASK_SPEC_PATH="/tests/task_spec.json"
readonly ORACLE_CHECKS="/tests/oracle_checks.py"
readonly REWARD_PATH="/logs/verifier/reward.txt"
readonly VALIDATION_RESULT="/logs/verifier/validation_result.json"
readonly VALIDATION_RESULT_SCHEMA="validation_result.v1alpha1"
readonly SCORER_FAMILY="oracle_checks"
readonly PASS_THRESHOLD="0.0"

mkdir -p /logs/verifier
# Emit a canonical failure validation_result.json (reward pinned to 0).
#   $1 — machine-readable failure code
#   $2 — human-readable failure message
#   $3 — pipeline stage ("output_validation", "verifier_runtime", "scoring")
# Reads globals: VALIDATION_RESULT, TASK_OUTPUT, VALIDATION_RESULT_SCHEMA,
# SCORER_FAMILY, PASS_THRESHOLD.
write_validation_failure() {
  local failure_code="$1"
  local failure_message="$2"
  local failure_stage="$3"
  python3 - "$VALIDATION_RESULT" "$failure_code" "$failure_message" "$failure_stage" \
    "$TASK_OUTPUT" "$VALIDATION_RESULT_SCHEMA" "$SCORER_FAMILY" "$PASS_THRESHOLD" <<'PYEOF'
import json
import sys

(
    result_file,
    fail_code,
    fail_message,
    fail_stage,
    contract_path,
    schema,
    family,
    threshold_raw,
) = sys.argv[1:]

# Output-validation failures are attributed to the agent ("invalid_output");
# any other stage is a fault of the verifier itself ("verifier_error").
document = {
    "schema_version": schema,
    "status": "invalid_output" if fail_stage == "output_validation" else "verifier_error",
    "scorable": False,
    "scorer_family": family,
    "reward": 0.0,
    "pass_threshold": float(threshold_raw),
    "passed": False,
    "output_contract": {
        "mode": "answer_json_native",
        "primary_path": contract_path,
        "required_artifact": True,
    },
    "sub_scores": {},
    "failure": {
        "code": fail_code,
        "message": fail_message,
        "stage": fail_stage,
    },
}
with open(result_file, "w") as handle:
    json.dump(document, handle, indent=2)
PYEOF
}
75+
# Score the answer with /tests/oracle_checks.py, write the canonical
# validation_result.json, and print the composite score (4 decimal places)
# to stdout for the caller to capture.
# Reads globals: ORACLE_CHECKS, ANSWER_PATH, TASK_SPEC_PATH, VALIDATION_RESULT,
# TASK_OUTPUT, VALIDATION_RESULT_SCHEMA, SCORER_FAMILY, PASS_THRESHOLD.
run_oracle_validation() {
  python3 - "$ORACLE_CHECKS" "$ANSWER_PATH" "$TASK_SPEC_PATH" "$VALIDATION_RESULT" \
    "$TASK_OUTPUT" "$VALIDATION_RESULT_SCHEMA" "$SCORER_FAMILY" "$PASS_THRESHOLD" <<'PYEOF'
import importlib.util
import json
import sys

(
    checks_path,
    answer_file,
    spec_file,
    result_file,
    contract_path,
    schema,
    family,
    threshold_raw,
) = sys.argv[1:]

# Import the oracle module directly from its file path (it is not on sys.path).
module_spec = importlib.util.spec_from_file_location("oracle_checks", checks_path)
if module_spec is None or module_spec.loader is None:
    raise RuntimeError(f"Failed to load oracle checks module from {checks_path}")
oracle = importlib.util.module_from_spec(module_spec)
module_spec.loader.exec_module(oracle)

outcome = oracle.run_all_checks(answer_file, spec_file)
threshold = float(threshold_raw)

# Headline metric key per check type; types not listed here fall through to
# special-cased or generic handling in primary_score().
HEADLINE_KEY = {
    "symbol_resolution": "recall",
    "dependency_chain": "chain_recall",
    "provenance": "provenance_score",
    "keyword_presence": "keyword_recall",
    "test_ratio": "ratio",
}


def primary_score(check_type, result):
    """Extract the single headline number [0, 1] for one check result."""
    if check_type == "file_set_match":
        return float(result.get("weighted_f1", result.get("f1", 0.0)))
    if check_type == "json_schema_match":
        return 1.0 if result.get("valid") else 0.0
    key = HEADLINE_KEY.get(check_type)
    if key is not None:
        return float(result.get(key, 0.0))
    raw = result.get("score", 0.0)
    if isinstance(raw, bool):
        return 1.0 if raw else 0.0
    return float(raw)


contract = {
    "mode": "answer_json_native",
    "primary_path": contract_path,
    "required_artifact": True,
}

if "error" in outcome:
    # The oracle itself failed: unscorable, reward pinned to zero.
    composite = 0.0
    document = {
        "schema_version": schema,
        "status": "verifier_error",
        "scorable": False,
        "scorer_family": family,
        "reward": 0.0,
        "pass_threshold": threshold,
        "passed": False,
        "output_contract": contract,
        "sub_scores": {},
        "failure": {
            "code": "oracle_checks_error",
            "message": str(outcome["error"]),
            "stage": "scoring",
        },
        "details": {
            "oracle_checks": outcome,
        },
        "composite_score": 0.0,
        "checks": {},
        "error": outcome["error"],
    }
else:
    per_check = outcome.get("checks", {})
    sub_scores = {}
    for check_type, result in per_check.items():
        value = round(primary_score(check_type, result), 4)
        sub_scores[check_type] = {
            "score": value,
            "passed": value > 0.0,
        }

    composite = round(float(outcome.get("composite_score", 0.0)), 4)
    document = {
        "schema_version": schema,
        "status": "scored",
        "scorable": True,
        "scorer_family": family,
        "reward": composite,
        "pass_threshold": threshold,
        "passed": composite > threshold,
        "output_contract": contract,
        "sub_scores": sub_scores,
        "failure": None,
        "details": {
            "oracle_checks": outcome,
        },
        "composite_score": composite,
        "checks": per_check,
    }

with open(result_file, "w") as handle:
    json.dump(document, handle, indent=2)

print(f"{composite:.4f}")
PYEOF
}
190+
echo "=== CCX-compliance-124 evaluator ==="
echo "Task spec: $TASK_SPEC_PATH"
echo "Answer: $ANSWER_PATH"

# The agent must have produced answer.json at the contracted path.
if [[ ! -f "$ANSWER_PATH" ]]; then
  echo "ERROR: answer.json not found at $ANSWER_PATH"
  echo "0.0" > "$REWARD_PATH"
  write_validation_failure "missing_required_output" "answer.json not found at $ANSWER_PATH" "output_validation"
  exit 1
fi

# Validate answer is valid JSON
if ! python3 -c "import json; json.load(open('$ANSWER_PATH'))" 2>/dev/null; then
  echo "ERROR: answer.json is not valid JSON"
  echo "0.0" > "$REWARD_PATH"
  write_validation_failure "invalid_answer_json" "answer.json is not valid JSON" "output_validation"
  exit 1
fi

echo "answer.json found and valid JSON"

# Without the oracle module the verifier cannot score anything.
if [[ ! -f "$ORACLE_CHECKS" ]]; then
  echo "ERROR: oracle_checks.py not found at $ORACLE_CHECKS"
  echo "0.0" > "$REWARD_PATH"
  write_validation_failure "missing_oracle_checks" "oracle_checks.py not found at $ORACLE_CHECKS" "verifier_runtime"
  exit 1
fi

echo "Running oracle checks..."
# `|| true` keeps `set -e` from aborting here; a bad/partial SCORE is
# caught by the numeric validation below instead.
SCORE="$(run_oracle_validation)" || true

# Validate score is a number
if ! printf '%s\n' "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then
  echo "ERROR: oracle_checks.py did not return a valid score: $SCORE"
  echo "0.0" > "$REWARD_PATH"
  write_validation_failure "invalid_verifier_score" "oracle_checks.py did not return a valid score: $SCORE" "scoring"
  exit 1
fi