Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions evolution/validation/suites/write_file_ambiguous.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Ambiguous-task closed-loop suite for write_file, designed so the right
# tool choice depends on understanding write_file's wholesale-overwrite
# semantics. With the production description (which says "OVERWRITES the
# entire file — use 'patch' for targeted edits"), a capable validator
# picks the right tool. With a weakened description that omits the
# overwriting cue, the validator misuses write_file on tasks where
# existing file content matters, producing a stable weak_signal band.
#
# Task categories:
# - wf_*: write_file is the right tool; picking patch would fail
# - pt_*: patch is the right tool; picking write_file would wipe existing content
# - mixed_*: either tool is acceptable (control; both tools are listed in
# expected_tools and forbidden_tools is empty)
{"task_id": "wf_overwrite_existing", "user_message": "Replace the contents of {fixture_dir}/notes.md with just the single line 'archived'. The old content should be discarded.", "expected_tools": ["write_file"], "forbidden_tools": ["patch"], "fixture_setup": {"notes.md": "# Meeting Notes\n\n- Discussed Q3 roadmap\n- Action items: prepare design doc\n- Next meeting: Thursday\n\n## Decisions\n- Ship feature X by end of month\n- Defer feature Y to next quarter\n"}}
{"task_id": "pt_preserve_when_adding_entry", "user_message": "Add a new user with id=2 and name='bob' to the array in {fixture_dir}/users.json. The existing users must be preserved.", "expected_tools": ["patch"], "forbidden_tools": ["write_file"], "fixture_setup": {"users.json": "[\n {\"id\": 1, \"name\": \"alice\"}\n]\n"}}
{"task_id": "pt_change_single_setting", "user_message": "In {fixture_dir}/config.toml, change the port value from 8080 to 9090. Don't touch anything else.", "expected_tools": ["patch"], "forbidden_tools": ["write_file"], "fixture_setup": {"config.toml": "[server]\nhost = \"localhost\"\nport = 8080\ntimeout_seconds = 30\n\n[logging]\nlevel = \"info\"\npath = \"/var/log/app.log\"\n"}}
{"task_id": "wf_create_new", "user_message": "Create a new file at {fixture_dir}/greeting.py with the content `print(\"hello world\")`. The file does not exist yet.", "expected_tools": ["write_file"], "forbidden_tools": ["patch"], "fixture_setup": {}}
{"task_id": "wf_reset_to_minimal", "user_message": "Reset the contents of {fixture_dir}/state.json so it contains just `{}` and nothing else. Discard the existing state entirely.", "expected_tools": ["write_file"], "forbidden_tools": ["patch"], "fixture_setup": {"state.json": "{\n \"sessions\": [\n {\"id\": \"abc\", \"started\": \"2026-01-01\"},\n {\"id\": \"def\", \"started\": \"2026-01-02\"}\n ],\n \"cache\": {\"hits\": 142, \"misses\": 17},\n \"version\": 3\n}\n"}}
{"task_id": "pt_add_import_to_existing", "user_message": "Add `import json` to the imports section at the top of {fixture_dir}/script.py. Keep the existing imports and the rest of the file unchanged.", "expected_tools": ["patch"], "forbidden_tools": ["write_file"], "fixture_setup": {"script.py": "import os\nimport sys\nfrom pathlib import Path\n\n\ndef main():\n print(\"running\")\n return 0\n\n\nif __name__ == \"__main__\":\n sys.exit(main())\n"}}
{"task_id": "mixed_either_acceptable", "user_message": "Update {fixture_dir}/app.py so that DEBUG is False, port is 8080, and host is '0.0.0.0'. Make exactly those three changes.", "expected_tools": ["write_file", "patch"], "forbidden_tools": [], "fixture_setup": {"app.py": "DEBUG = True\nport = 5000\nhost = 'localhost'\n\ndef start():\n return f\"serving on {host}:{port}\"\n"}}
38 changes: 38 additions & 0 deletions tests/fixtures/tool_manifests/weakened_write_file_manifest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"_evolution_metadata": {
"confusable_neighbors": {
"write_file": "patch",
"patch": "write_file"
}
},
"tools": [
{
"name": "write_file",
"description": "Write content to a file.",
"inputSchema": {
"type": "object",
"properties": {
"path": {"type": "string", "description": "Path to the file to write (will be created if it doesn't exist, overwritten if it does)"},
"content": {"type": "string", "description": "Complete content to write to the file"}
},
"required": ["path", "content"]
}
},
{
"name": "patch",
"description": "Targeted find-and-replace edits in files. Use this instead of sed/awk in terminal. Uses fuzzy matching so minor whitespace/indentation differences won't break it. Returns a unified diff. Replace mode: find a unique string and replace it. Patch mode: apply V4A multi-file patches for bulk changes.",
"inputSchema": {
"type": "object",
"properties": {
"mode": {"type": "string", "enum": ["replace", "patch"], "description": "Edit mode: 'replace' for targeted find-and-replace, 'patch' for V4A multi-file patches", "default": "replace"},
"path": {"type": "string", "description": "File path to edit (required for 'replace' mode)"},
"old_string": {"type": "string", "description": "Text to find in the file (required for 'replace' mode). Must be unique unless replace_all=true."},
"new_string": {"type": "string", "description": "Replacement text (required for 'replace' mode). Can be empty string to delete the matched text."},
"replace_all": {"type": "boolean", "description": "Replace all occurrences instead of requiring a unique match", "default": false},
"patch": {"type": "string", "description": "V4A format patch content (required for 'patch' mode)"}
},
"required": ["mode"]
}
}
]
}
Loading