Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 65 additions & 4 deletions codec_agent_plan.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,7 @@ def _qwen_chat(user_prompt: str, system_prompt: str = "",
Rules:
- Output ONLY valid JSON. No prose before or after.
- skills_needed MUST be skill names from the user-supplied registry list. Never invent skill names.
Common confusions to avoid: there is NO `file_read` skill (use `file_ops`, which reads, writes, appends, lists). There is NO `fetch_url` (use `web_fetch`). There is NO `read_file` (use `file_ops`). If you can't find an exact match in the registry list, pick the closest match — never invent.
- write_paths default to ~/.codec/agents/{agent_id}/artifacts/** unless the project explicitly requires writing elsewhere.
- destructive_ops list any irreversible operations (deletes, payments, sending emails on user's behalf). They will require additional consent at runtime.
- estimated_duration_minutes is your best honest guess.
Expand Down Expand Up @@ -406,12 +407,72 @@ def draft_plan(agent_id: str, description: str, registry=None,
raise PlanValidationError(f"plan schema invalid: {e}")

ok, missing = validate_plan_skills(plan, registry=registry)
if not ok:
if ok:
return plan

# PR #41: plan-time hallucination retry. Mirror of the execution-time
# retry shipped in PR #35 (codec_agent_runner._build_correction_nudge).
# Real-world Qwen drift hits both layers — at execution it picks
# `fetch_url` instead of `web_fetch`; at planning it picks `file_read`
# instead of `file_ops`. Same cure: re-prompt ONCE with an explicit
# closed-world correction, fail hard if the second draft still misses.
log.info("[%s] plan referenced unknown skills %s; retrying with correction nudge",
agent_id, missing)
correction = (
f"\n\nYour previous draft referenced these skills which DO NOT EXIST "
f"in the registry: {sorted(missing)}.\n"
f"You MUST pick from this exact list — do not invent names, do not "
f"add suffixes (no _v2, no _read, no _write versions of unrelated "
f"skills). The full allowed set is:\n"
f" {', '.join(available_skills)}\n\n"
f"Common confusions:\n"
f" - Need to read a file? Use `file_ops` (it reads, writes, "
f"appends, lists). There is NO `file_read` skill.\n"
f" - Need to fetch a URL? Use `web_fetch`. There is NO `fetch_url`.\n"
f" - Need to search files? Use `file_search`.\n\n"
f"Re-emit the entire JSON plan with valid skill names only."
)
retry_prompt = user_prompt + correction
try:
raw2 = _qwen_chat(retry_prompt, _PLAN_SYSTEM_PROMPT)
except (QwenUnavailableError, ConnectionError, OSError, RuntimeError) as e:
# If the retry call itself fails, surface the ORIGINAL validation
# error — that's more diagnostic than "qwen flaked on retry".
raise PlanValidationError(
f"plan references unknown skills: {missing}"
f"plan references unknown skills: {missing} "
f"(retry failed: {e})"
)

return plan
raw2 = raw2.strip()
if raw2.startswith("```"):
raw2 = re.sub(r"^```(?:json)?\s*", "", raw2)
raw2 = re.sub(r"\s*```\s*$", "", raw2)
try:
d2 = json.loads(raw2)
except json.JSONDecodeError as e:
raise PlanValidationError(
f"plan references unknown skills: {missing} "
f"(retry returned non-JSON: {e})"
)
if d2.get("too_vague"):
raise PlanValidationError("too_vague: description needs clarification")
d2.setdefault("schema", PLAN_SCHEMA_VERSION)
d2.setdefault("agent_id", agent_id)
for cp in d2.get("checkpoints", []):
cp.setdefault("id", _stable_checkpoint_id(cp))
try:
plan2 = plan_from_dict(d2)
except (KeyError, ValueError, TypeError) as e:
raise PlanValidationError(f"retry plan schema invalid: {e}")
ok2, missing2 = validate_plan_skills(plan2, registry=registry)
if not ok2:
# Second miss — give up. Surface BOTH attempts so the user can see
# the model is consistently confused (e.g. truly unfixable phrasing).
raise PlanValidationError(
f"plan references unknown skills after retry: "
f"first={sorted(missing)}, second={sorted(missing2)}"
)
log.info("[%s] retry succeeded; using corrected plan", agent_id)
return plan2


def _stable_checkpoint_id(cp_dict: Dict[str, Any]) -> str:
Expand Down
107 changes: 107 additions & 0 deletions tests/test_agent_plan.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,113 @@ def raise_connection(*a, **k):
)


# ─────────────────────────────────────────────────────────────────────────────
# PR #41 — Plan-time hallucination retry (3 tests)
# ─────────────────────────────────────────────────────────────────────────────

def _make_plan_response(skill_name):
    """Return a minimal, valid-shape plan serialized as a JSON string.

    The plan contains a single checkpoint. ``skill_name`` is used both as
    that checkpoint's only ``skills_needed`` entry and as the only entry of
    ``permission_manifest["skills"]``, so one argument controls whether the
    drafted plan references a real or a hallucinated skill. Used by the
    plan-time retry tests in this module.
    """
    checkpoint = {
        "title": "scan",
        "description": "list and read",
        "skills_needed": [skill_name],
        "expected_output": "index.md written",
        "step_budget": 30,
    }
    permission_manifest = {
        "read_paths": ["~/codec-repo/docs/**"],
        "write_paths": ["~/codec-repo/docs/index.md"],
        "network_domains": [],
        "skills": [skill_name],
        "destructive_ops": [],
    }
    return json.dumps({
        "goals": ["read markdown files"],
        "checkpoints": [checkpoint],
        "permission_manifest": permission_manifest,
        "estimated_duration_minutes": 5,
        "assumptions": [],
    })


def test_draft_plan_retries_on_hallucinated_skill_then_succeeds(monkeypatch):
    """A hallucinated skill in the first draft triggers one corrected retry.

    Real reproducer from 2026-05-04 09:58: user asked CODEC to read
    markdown files; Qwen drafted a plan with skill `file_read` (does not
    exist; correct skill is `file_ops`). PR #41 retries ONCE with a
    correction nudge — second draft picks `file_ops` and succeeds.

    Besides counting calls, this test inspects the prompt sent on the
    retry, so it fails if the retry merely re-sends the original prompt
    without the correction text.
    """
    import codec_agent_plan as cap

    # Every prompt handed to the stubbed model, in call order.
    prompts = []

    def fake_qwen(prompt, *a, **k):
        prompts.append(prompt)
        if len(prompts) == 1:
            return _make_plan_response("file_read")  # hallucinated
        return _make_plan_response("file_ops")  # corrected

    monkeypatch.setattr(cap, "_qwen_chat", fake_qwen)

    fake_registry = MagicMock()
    fake_registry.names.return_value = ["file_ops", "file_search"]

    plan = cap.draft_plan(
        agent_id="agent_test",
        description="Read markdown files in docs and create an index",
        registry=fake_registry,
    )

    # Exactly one retry happened and the corrected plan was used.
    assert len(prompts) == 2
    assert plan.checkpoints[0].skills_needed == ["file_ops"]

    # The retry prompt must carry the closed-world correction nudge and
    # name the skill that was rejected in the first draft.
    retry_prompt = prompts[1]
    assert "DO NOT EXIST" in retry_prompt
    assert "file_read" in retry_prompt


def test_draft_plan_retry_also_fails_raises_with_both_attempts(monkeypatch):
    """Two hallucinated drafts in a row must raise, naming both misses.

    When the retry draft ALSO references an unknown skill, the raised
    PlanValidationError is expected to mention both attempts, so the user
    can tell the model is consistently confused rather than having had a
    one-off transient miss.
    """
    import codec_agent_plan as cap

    # Scripted model output: each call consumes the next canned draft.
    scripted_drafts = iter((
        _make_plan_response("file_read"),  # first miss
        _make_plan_response("read_file"),  # second miss (different bad name)
    ))

    def scripted_qwen(*args, **kwargs):
        return next(scripted_drafts)

    monkeypatch.setattr(cap, "_qwen_chat", scripted_qwen)

    registry_stub = MagicMock()
    registry_stub.names.return_value = ["file_ops"]

    with pytest.raises(cap.PlanValidationError) as raised:
        cap.draft_plan(
            agent_id="agent_test",
            description="some project",
            registry=registry_stub,
        )

    message = str(raised.value)
    assert "after retry" in message
    assert "file_read" in message  # first attempt
    assert "read_file" in message  # second attempt


def test_draft_plan_retry_qwen_unavailable_surfaces_original_error(monkeypatch):
    """A transport failure on the retry call surfaces the ORIGINAL error.

    If the retry call itself fails (Qwen flakes between attempts), the
    raised PlanValidationError must still name the unknown skill from the
    first draft — that's more diagnostic than 'qwen flaked on retry' —
    while also noting why the retry did not complete.
    """
    import codec_agent_plan as cap

    calls = {"n": 0}

    def flaky(*a, **k):
        calls["n"] += 1
        if calls["n"] == 1:
            return _make_plan_response("file_read")
        raise ConnectionError("qwen died between attempts")

    monkeypatch.setattr(cap, "_qwen_chat", flaky)

    fake_registry = MagicMock()
    fake_registry.names.return_value = ["file_ops"]

    with pytest.raises(cap.PlanValidationError) as exc_info:
        cap.draft_plan(
            agent_id="agent_test",
            description="x",
            registry=fake_registry,
        )

    msg = str(exc_info.value)
    assert "file_read" in msg  # original missing skill present
    assert "retry failed" in msg  # cause noted
    # The underlying exception text is carried through for diagnosis.
    assert "qwen died between attempts" in msg
    # One initial draft plus exactly one retry — no further attempts after
    # the retry call raised.
    assert calls["n"] == 2


# ─────────────────────────────────────────────────────────────────────────────
# Task 7 — Vague-description clarifying loop (Q3) (2 tests)
# ─────────────────────────────────────────────────────────────────────────────
Expand Down
Loading