Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .harness/eval.governance.jsonl
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
{"benchmark":"swebench","findings":[{"category":"correctness","reasons":["ERROR: Missing required plugins: pytest-bdd, pytest-benchmark, pytest-instafail, pytest-mock, pytest-qt, pytest-rerunfailures\n\n"],"summary":"1/1 F2P tests still failing — agent patch incomplete","tests":["['tests/unit/utils/test_qtlog.py::TestHideQtWarning::test_unfiltered', 'tests/unit/utils/test_qtlog.py::TestHideQtWarning::test_filtered[Hello]', 'tests/unit/utils/test_qtlog.py::TestHideQtWarning::test_filtered[Hello World]', 'tests/unit/utils/test_qtlog.py::TestHideQtWarning::test_filtered[ Hello World ]']"]},{"category":"infrastructure","failed_count":52,"reasons":["ERROR: Missing required plugins: pytest-bdd, pytest-benchmark, pytest-instafail, pytest-mock, pytest-qt, pytest-rerunfailures\n\n"],"summary":"52/52 P2P tests failed — likely environment/setup issue, not selective regression","total_count":52}],"instance_id":"instance_qutebrowser__qutebrowser-f91ace96223cac8161c16dd061907e138fe85111-v059c6fdc75567943479b23ebca7c07b5e9a7f34c","metrics":{"duration_s":0},"resolved":false,"skill":"fractal","source":"eval","split":"pro","status":"correctness","timestamp":"2026-03-18T15:00:35.992843231+00:00","work_id":"eval-instance_qutebrowser__qutebrowser-f91ace96223cac8161c16dd061907e138fe85111-v059c6fdc75567943479b23ebca7c07b5e9a7f34c-fractal-1773846035"}
1 change: 0 additions & 1 deletion .harness/harness.governance.jsonl
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
{"work_id": "harness-1773756154", "source": "harness", "timestamp": "2026-03-17T14:02:34Z", "status": "passed", "agent_command": "echo no changes", "rework_items": [], "static_failures": [], "metrics": {"attempt_count": 1, "duration_s": 0, "base_ref": "315fc084c5710efc73a5bdce5c8b0700d4305d98"}}
11 changes: 10 additions & 1 deletion cli/src/eval/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,16 @@ pub fn execute(opts: RunOptions) -> Result<()> {
);

println!();
println!("━━━ Done ━━━");

// Exit non-zero when the eval is not resolved so the harness (or any
// parent process) can detect failure without parsing log files.
let resolved = verdict.as_ref().map_or(false, |v| v.resolved);
if !resolved {
println!("━━━ Done (not resolved) ━━━");
std::process::exit(1);
}

println!("━━━ Done (resolved) ━━━");
Ok(())
}

Expand Down
20 changes: 16 additions & 4 deletions cli/src/harness/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ pub fn execute(config: RunConfig) -> Result<()> {
// --- Governance loop ---
let mut attempt: u32 = 0;
let mut status = "running".to_string();
let mut last_agent_exit: i32 = 0;
let feedback_file = run_dir.join("feedback.md");
let mut all_rework_items: Vec<Value> = Vec::new();
let mut all_static_failures: Vec<String> = Vec::new();
Expand All @@ -88,11 +89,12 @@ pub fn execute(config: RunConfig) -> Result<()> {
{feedback}\n\n\
Re-read the original task and fix the issues above.\n"
);
run_agent_with_stdin(&config.agent_cmd, &workdir, &rework_input, &agent_log)?
run_agent_with_stdin(&config.agent_cmd, &workdir, &rework_input, &agent_log, &repo_root)?
} else {
run_agent(&config.agent_cmd, &workdir, &agent_log)?
run_agent(&config.agent_cmd, &workdir, &agent_log, &repo_root)?
};

last_agent_exit = agent_exit;
log_info(&config, &format!("Agent finished (exit code: {agent_exit})"));

// --- Observe: what changed? ---
Expand Down Expand Up @@ -383,6 +385,14 @@ pub fn execute(config: RunConfig) -> Result<()> {
// =================================================================
// Finalize — record governance log
// =================================================================

// Agent exit code is authoritative: if the agent reports failure (e.g.
// eval resolved=false), governance approval alone doesn't override that.
if status == "passed" && last_agent_exit != 0 {
log_info(&config, &format!("Agent exited with code {last_agent_exit} — overriding governance pass"));
status = "error".to_string();
}

let end_time = Utc::now();
let duration = (end_time - start_time).num_seconds();

Expand Down Expand Up @@ -504,23 +514,25 @@ fn git_rev_parse(workdir: &Path, refspec: &str) -> Option<String> {
})
}

/// Runs the agent command to completion and returns its exit code.
///
/// `cmd[0]` is the program and `cmd[1..]` are its arguments. The child runs
/// with `workdir` as its current directory, and both its stdout and stderr are
/// written to the file at `log_path`. `repo_root` is exported to the child as
/// the `SYNODIC_ROOT` environment variable.
///
/// NOTE(review): two signature lines follow because this text appears to be a
/// rendered diff; the second one (taking `repo_root`) looks like the
/// post-change signature — confirm against the real file.
///
/// # Errors
///
/// Returns an error if the log file cannot be created, if its handle cannot be
/// duplicated, or if the process cannot be started (reported with the context
/// "failed to run agent: <program>").
///
/// # Panics
///
/// Panics if `cmd` is empty, because `cmd[0]` and `cmd[1..]` index the slice
/// directly.
fn run_agent(cmd: &[String], workdir: &Path, log_path: &Path) -> Result<i32> {
fn run_agent(cmd: &[String], workdir: &Path, log_path: &Path, repo_root: &Path) -> Result<i32> {
// `File::create` truncates an existing file, so each call starts a fresh log.
let log_file = fs::File::create(log_path)?;
let status = Command::new(&cmd[0])
.args(&cmd[1..])
.current_dir(workdir)
.env("SYNODIC_ROOT", repo_root)
// stdout gets a duplicated handle; stderr takes ownership of the original,
// so both streams land in the same log file.
.stdout(log_file.try_clone()?)
.stderr(log_file)
// `status()` blocks until the child exits.
.status()
.with_context(|| format!("failed to run agent: {}", cmd[0]))?;
// `code()` is `None` when no exit code is available (e.g. the process was
// terminated by a signal on Unix); that case is reported as exit code 1.
Ok(status.code().unwrap_or(1))
}

fn run_agent_with_stdin(cmd: &[String], workdir: &Path, input: &str, log_path: &Path) -> Result<i32> {
fn run_agent_with_stdin(cmd: &[String], workdir: &Path, input: &str, log_path: &Path, repo_root: &Path) -> Result<i32> {
let log_file = fs::File::create(log_path)?;
let mut child = Command::new(&cmd[0])
.args(&cmd[1..])
.current_dir(workdir)
.env("SYNODIC_ROOT", repo_root)
.stdin(std::process::Stdio::piped())
.stdout(log_file.try_clone()?)
.stderr(log_file)
Expand Down
9 changes: 9 additions & 0 deletions cli/src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,16 @@ use std::path::{Path, PathBuf};
use std::process::Command;

/// Walk up from CWD to find the repo root (contains `.harness/` or `.git`).
///
/// Respects `SYNODIC_ROOT` env var — set by `harness run` so that eval
/// subprocesses write governance logs to the correct project, not the testbed.
pub fn find_repo_root() -> anyhow::Result<PathBuf> {
if let Ok(root) = std::env::var("SYNODIC_ROOT") {
let p = PathBuf::from(&root);
if p.is_dir() {
return Ok(p);
}
}
let mut dir = std::env::current_dir()?;
loop {
if dir.join(".harness").is_dir() || dir.join(".git").exists() {
Expand Down
Loading