Skip to content

Commit 75ea114

Browse files
authored
feat:Add test verification workflow and safer generation (#3)
* Add test verification workflow and safer generation * Fix source resolution type narrowing * Address Copilot review feedback
1 parent 8d23598 commit 75ea114

23 files changed

Lines changed: 1113 additions & 251 deletions

.claude-plugin/plugin.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "autocode",
3-
"version": "0.6.0",
3+
"version": "0.7.0",
44
"description": "Claude Code plugin for competitive programming problem-setting workflows.",
55
"author": {
66
"name": "SummerOneTwo",

CHANGELOG.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,27 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [0.7.0] - 2026-04-27
9+
10+
### Features
11+
12+
- **source_path 直接编译**: 当使用 `source_path` 参数时,直接从原始文件编译,不再覆盖到标准位置。标准位置仍保留副本以供其他工具使用。所有构建工具返回 `canonical_path`(标准位置副本)和 `source_path`(实际编译源)。
13+
- **resolve_source() 公共函数**: 提取 5 个构建工具中的源码解析逻辑到 `mixins.py` 的 `resolve_source()` 函数和 `ResolvedSource` 数据类,消除约 100 行重复代码。
14+
- **name 参数**: `solution_build` 和 `solution_run` 新增 `name` 参数,支持自定义文件名(如 `name="brute_force"` 替代默认 `brute`)。
15+
- **sol_name / brute_name**: `stress_test_run` 新增 `sol_name` 和 `brute_name` 参数,支持查找自定义命名的解法二进制文件。
16+
- **output_dir 参数**: `problem_generate_tests` 新增 `output_dir` 参数,可指定测试数据输出目录(默认 `problem_dir/tests`)。
17+
- **extra_args 参数**: `stress_test_run`、`generator_run` 以及 `problem_generate_tests` 的 `test_configs` 新增 `extra_args` 参数,支持传递自定义命令行参数给 generator。协议扩展为 `gen.exe <seed> <type> <n_min> <n_max> <t_min> <t_max> [extra_args...]`
18+
- **types 参数**: `stress_test_run` 新增 `types` 参数,支持在对拍中循环使用多种生成策略(如 `["1","2","3","4"]`)。
19+
- **problem_verify_tests 工具**: 新增测试数据验证工具,检查文件配对、答案一致性(重新运行 sol)、validator 验证、无空文件等。
20+
- **stress_test_run 统计信息**: 对拍通过/失败时返回详细统计,包括 sol/brute 运行时间分布、N 值分布、最慢轮次等。
21+
- **构建结果透明度**: 所有构建工具返回 `binary_size` 和 `canonical_path`;`source_path` 返回实际编译源文件路径。
22+
23+
### Improvements
24+
25+
- **smart mode 文档**: `problem_generate_tests` 的 `constraints` 参数说明更明确,返回 `effective_test_configs` 展示实际使用的配置。
26+
- **workflow_guard 自定义命名**: `infer_state()` 支持自定义解法文件名(前缀匹配),新增 `tests_verified` 状态字段。
27+
- **工作流步骤更新**: 新增 `problem_verify_tests(passed)` 步骤,位于 `problem_generate_tests` 和 `problem_pack_polygon` 之间。
28+
829
## [0.6.0] - 2026-04-25
930

1031
### Features

CLAUDE.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ AutoCode/
6666
| stress_test_run | 压力测试 |
6767
| problem_create | 初始化题目 |
6868
| problem_generate_tests | 生成测试数据 |
69+
| problem_verify_tests | 验证测试数据质量 |
6970
| problem_validate | 验证题面样例 |
7071
| problem_pack_polygon | 打包为 Polygon 格式 |
7172

@@ -102,7 +103,8 @@ AutoCode/
102103
6. 运行压力测试 (`stress_test_run`, completed_rounds == total_rounds)
103104
7. 按需构建检查器 (`checker_build`, accuracy >= 0.9)
104105
8. 生成测试数据 (`problem_generate_tests`, generated_test_count > 0)
105-
9. 打包 Polygon (`problem_pack_polygon`)
106+
9. 验证测试数据 (`problem_verify_tests`, passed)
107+
10. 打包 Polygon (`problem_pack_polygon`)
106108

107109
该顺序会被 [hooks/hooks.json](/c:/userProgram/program/AutoCode/hooks/hooks.json) 和 [scripts/workflow_guard.py](/c:/userProgram/program/AutoCode/scripts/workflow_guard.py) 实际强制执行。
108110

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "autocode-mcp"
3-
version = "0.6.0"
3+
version = "0.7.0"
44
description = "MCP Server for competitive programming problem creation, based on AutoCode paper"
55
readme = "README.md"
66
requires-python = ">=3.10"

scripts/workflow_guard.py

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,12 @@ def state_file(problem_dir: str) -> Path:
3636

3737
def infer_state(problem_dir: str) -> dict[str, Any]:
3838
root = Path(problem_dir)
39+
solutions_dir = root / "solutions"
3940
return {
4041
"problem_dir": str(root),
4142
"created": root.exists() and (root / "files").exists() and (root / "solutions").exists(),
42-
"sol_built": (root / "solutions" / "sol.cpp").exists() or any(root.glob("solutions/sol.*")),
43-
"brute_built": (root / "solutions" / "brute.cpp").exists() or any(root.glob("solutions/brute.*")),
43+
"sol_built": _has_solution(solutions_dir, "sol"),
44+
"brute_built": _has_solution(solutions_dir, "brute"),
4445
"validator_ready": (root / "files" / "val.cpp").exists() or any(root.glob("files/val.*")),
4546
"validator_accuracy": None,
4647
"generator_built": (root / "files" / "gen.cpp").exists() or any(root.glob("files/gen.*")),
@@ -54,10 +55,25 @@ def infer_state(problem_dir: str) -> dict[str, Any]:
5455
"validation_passed": False,
5556
"tests_generated": any((root / "tests").glob("*.in")) if (root / "tests").exists() else False,
5657
"generated_test_count": len(list((root / "tests").glob("*.in"))) if (root / "tests").exists() else 0,
58+
"tests_verified": False,
5759
"packaged": (root / "problem.xml").exists(),
5860
}
5961

6062

63+
def _has_solution(solutions_dir: Path, prefix: str) -> bool:
64+
"""检查 solutions/ 下是否有指定前缀的解法文件(支持自定义命名)。"""
65+
if not solutions_dir.exists():
66+
return False
67+
# 精确匹配(如 sol.cpp, brute.cpp)
68+
if (solutions_dir / f"{prefix}.cpp").exists():
69+
return True
70+
# 前缀匹配(如 brute_force.cpp)
71+
for f in solutions_dir.iterdir():
72+
if f.is_file() and f.stem.startswith(prefix) and f.suffix == ".cpp":
73+
return True
74+
return False
75+
76+
6177
def load_state(problem_dir: str) -> dict[str, Any]:
6278
path = state_file(problem_dir)
6379
if path.exists():
@@ -120,7 +136,7 @@ def pre_tool(payload: dict[str, Any]) -> int:
120136
"checker_build": "必须先通过 stress_test_run(completed_rounds == total_rounds),再构建 checker。",
121137
"problem_validate": "必须先通过 stress_test_run(completed_rounds == total_rounds),再进行验证。",
122138
"problem_generate_tests": "必须先通过 problem_validate(验证通过),才能生成最终测试数据。",
123-
"problem_pack_polygon": "必须先生成最终测试数据,并且生成数量 > 0,再进行打包。",
139+
"problem_pack_polygon": "必须先生成最终测试数据并通过 problem_verify_tests(passed),再进行打包。",
124140
}
125141

126142
tool_input = payload.get("tool_input", {})
@@ -169,6 +185,7 @@ def pre_tool(payload: dict[str, Any]) -> int:
169185

170186
if short_name == "problem_pack_polygon" and not (
171187
state["tests_generated"] and state.get("generated_test_count", 0) > 0
188+
and state.get("tests_verified", False)
172189
):
173190
deny(reasons["problem_pack_polygon"])
174191
return 0
@@ -194,6 +211,12 @@ def post_tool(payload: dict[str, Any]) -> int:
194211
save_state(problem_dir, state)
195212
return 0
196213

214+
if short_name == "problem_verify_tests" and not success:
215+
state = load_state(problem_dir)
216+
state["tests_verified"] = False
217+
save_state(problem_dir, state)
218+
return 0
219+
197220
if not success:
198221
return 0
199222

@@ -229,6 +252,9 @@ def post_tool(payload: dict[str, Any]) -> int:
229252
generated_tests = data.get("generated_tests", [])
230253
state["tests_generated"] = bool(generated_tests)
231254
state["generated_test_count"] = len(generated_tests)
255+
state["tests_verified"] = False
256+
elif short_name == "problem_verify_tests":
257+
state["tests_verified"] = bool(data.get("passed", False))
232258
elif short_name == "problem_pack_polygon":
233259
state["packaged"] = True
234260

@@ -244,7 +270,8 @@ def session_start() -> int:
244270
"stress_test_run(completed_rounds == total_rounds) -> "
245271
"checker_build if needed (accuracy >= 0.9) -> "
246272
"problem_validate(validation_passed) -> "
247-
"problem_generate_tests(generated_test_count > 0) -> problem_pack_polygon. "
273+
"problem_generate_tests(generated_test_count > 0) -> "
274+
"problem_verify_tests(passed) -> problem_pack_polygon. "
248275
"If a hook blocks a step, complete the missing prerequisite instead of retrying blindly."
249276
)
250277
print(

src/autocode_mcp/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"""
77
import os
88

9-
__version__ = "0.6.0"
9+
__version__ = "0.7.0"
1010

1111
# 获取 templates 目录路径(包内目录)
1212
_PACKAGE_DIR = os.path.dirname(__file__)

src/autocode_mcp/server.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""
22
MCP Server 入口。
33
4-
提供 15 个原子工具,基于 AutoCode 论文框架。
4+
提供 17 个原子工具,基于 AutoCode 论文框架。
55
"""
66

77
from __future__ import annotations
@@ -35,6 +35,7 @@
3535
from .tools.problem import ProblemCreateTool, ProblemGenerateTestsTool, ProblemPackPolygonTool
3636
from .tools.solution import SolutionBuildTool, SolutionRunTool
3737
from .tools.stress_test import StressTestRunTool
38+
from .tools.test_verify import ProblemVerifyTestsTool
3839
from .tools.validation import ProblemValidateTool
3940
from .tools.validator import ValidatorBuildTool, ValidatorSelectTool
4041

@@ -67,6 +68,7 @@ def register_all_tools() -> None:
6768
# Problem 工具组
6869
register_tool(ProblemCreateTool())
6970
register_tool(ProblemGenerateTestsTool())
71+
register_tool(ProblemVerifyTestsTool())
7072
register_tool(ProblemPackPolygonTool())
7173
register_tool(ProblemValidateTool())
7274

src/autocode_mcp/tools/checker.py

Lines changed: 21 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from ..utils.compiler import run_binary_with_args
1212
from ..utils.platform import get_exe_extension
1313
from .base import Tool, ToolResult
14-
from .mixins import BuildToolMixin
14+
from .mixins import BuildToolMixin, resolve_source
1515

1616

1717
class CheckerBuildTool(Tool, BuildToolMixin):
@@ -91,58 +91,45 @@ async def execute(
9191
compiler: str = "g++",
9292
) -> ToolResult:
9393
"""执行 Checker 构建。"""
94-
# 解析源代码:source_path 优先于 code
95-
source_dir = None
96-
if source_path:
97-
if not os.path.isabs(source_path):
98-
source_path = os.path.join(problem_dir, source_path)
99-
if not os.path.exists(source_path):
100-
return ToolResult.fail(f"Source file not found: {source_path}")
101-
try:
102-
with open(source_path, encoding="utf-8") as f:
103-
code = f.read()
104-
except UnicodeDecodeError:
105-
try:
106-
with open(source_path, encoding="latin-1") as f:
107-
code = f.read()
108-
except Exception as e:
109-
return ToolResult.fail(f"Failed to read source file: {e}")
110-
source_dir = os.path.dirname(os.path.abspath(source_path))
111-
elif code is None:
112-
return ToolResult.fail("Either 'code' or 'source_path' must be provided")
94+
resolved, err = resolve_source(problem_dir, code, source_path)
95+
if err is not None:
96+
return err
97+
assert resolved is not None
11398

11499
os.makedirs(problem_dir, exist_ok=True)
115-
116-
# 保存到 files/ 子目录
117100
files_dir = os.path.join(problem_dir, "files")
118101
os.makedirs(files_dir, exist_ok=True)
119102

120-
# 保存代码
121-
source_path = os.path.join(files_dir, "checker.cpp")
103+
canonical_path = os.path.join(files_dir, "checker.cpp")
122104
try:
123-
with open(source_path, "w", encoding="utf-8") as f:
124-
f.write(code)
105+
with open(canonical_path, "w", encoding="utf-8") as f:
106+
f.write(resolved.code)
125107
except Exception as e:
126108
return ToolResult.fail(f"Failed to save code: {str(e)}")
127109

128-
# 编译
129110
binary_path = os.path.join(files_dir, f"checker{get_exe_extension()}")
130111

131-
include_dirs = [source_dir] if source_dir else None
132-
compile_result = await self.build(source_path, binary_path, compiler=compiler, include_dirs=include_dirs)
112+
compile_source = resolved.original_source_path or canonical_path
113+
include_dirs = [resolved.include_dir] if resolved.include_dir else None
114+
compile_result = await self.build(compile_source, binary_path, compiler=compiler, include_dirs=include_dirs)
133115

134116
if not compile_result.success:
135117
return ToolResult.fail(
136118
f"Compilation failed: {compile_result.error}",
137-
source_path=source_path,
119+
source_path=compile_source,
120+
canonical_path=canonical_path,
138121
compile_log=compile_result.stderr,
139122
)
140123

124+
binary_size = os.path.getsize(binary_path) if os.path.exists(binary_path) else 0
125+
141126
# 如果没有测试场景,直接返回成功
142127
if not test_scenarios:
143128
return ToolResult.ok(
144-
source_path=source_path,
129+
source_path=compile_source,
130+
canonical_path=canonical_path,
145131
binary_path=binary_path,
132+
binary_size=binary_size,
146133
compile_log=compile_result.stderr,
147134
message="Checker built successfully (no test scenarios provided)",
148135
)
@@ -214,8 +201,10 @@ async def execute(
214201
accuracy = correct_count / total if total > 0 else 0
215202

216203
return ToolResult.ok(
217-
source_path=source_path,
204+
source_path=compile_source,
205+
canonical_path=canonical_path,
218206
binary_path=binary_path,
207+
binary_size=binary_size,
219208
compile_log=compile_result.stderr,
220209
test_results=test_results,
221210
correct_count=correct_count,

0 commit comments

Comments
 (0)