codervisor · tikazyq · Mar 18, 2026 · Mar 18, 2026 · Mar 18, 2026 · Mar 18, 2026
diff --git a/.harness/eval.governance.jsonl b/.harness/eval.governance.jsonl
@@ -1 +0,0 @@
-{"benchmark":"swebench","findings":[{"category":"correctness","reasons":["ERROR: Missing required plugins: pytest-bdd, pytest-benchmark, pytest-instafail, pytest-mock, pytest-qt, pytest-rerunfailures\n\n"],"summary":"1/1 F2P tests still failing — agent patch incomplete","tests":["['tests/unit/utils/test_qtlog.py::TestHideQtWarning::test_unfiltered', 'tests/unit/utils/test_qtlog.py::TestHideQtWarning::test_filtered[Hello]', 'tests/unit/utils/test_qtlog.py::TestHideQtWarning::test_filtered[Hello World]', 'tests/unit/utils/test_qtlog.py::TestHideQtWarning::test_filtered[  Hello World  ]']"]},{"category":"infrastructure","failed_count":52,"reasons":["ERROR: Missing required plugins: pytest-bdd, pytest-benchmark, pytest-instafail, pytest-mock, pytest-qt, pytest-rerunfailures\n\n"],"summary":"52/52 P2P tests failed — likely environment/setup issue, not selective regression","total_count":52}],"instance_id":"instance_qutebrowser__qutebrowser-f91ace96223cac8161c16dd061907e138fe85111-v059c6fdc75567943479b23ebca7c07b5e9a7f34c","metrics":{"duration_s":0},"resolved":false,"skill":"fractal","source":"eval","split":"pro","status":"correctness","timestamp":"2026-03-18T15:00:35.992843231+00:00","work_id":"eval-instance_qutebrowser__qutebrowser-f91ace96223cac8161c16dd061907e138fe85111-v059c6fdc75567943479b23ebca7c07b5e9a7f34c-fractal-1773846035"}

diff --git a/.harness/harness.governance.jsonl b/.harness/harness.governance.jsonl
@@ -1 +0,0 @@
-{"work_id": "harness-1773756154", "source": "harness", "timestamp": "2026-03-17T14:02:34Z", "status": "passed", "agent_command": "echo no changes", "rework_items": [], "static_failures": [], "metrics": {"attempt_count": 1, "duration_s": 0, "base_ref": "315fc084c5710efc73a5bdce5c8b0700d4305d98"}}

diff --git a/cli/src/eval/run.rs b/cli/src/eval/run.rs
@@ -302,7 +302,16 @@ pub fn execute(opts: RunOptions) -> Result<()> {
     );
 
     println!();
-    println!("━━━ Done ━━━");
+
+    // Exit non-zero when the eval is not resolved (a missing verdict counts as
+    // unresolved) so the harness (or any parent process) can detect failure without parsing log files.
+    let resolved = verdict.as_ref().map_or(false, |v| v.resolved);
+    if !resolved {
+        println!("━━━ Done (not resolved) ━━━");
+        std::process::exit(1);
+    }
+
+    println!("━━━ Done (resolved) ━━━");
     Ok(())
 }
 

diff --git a/cli/src/harness/run.rs b/cli/src/harness/run.rs
@@ -64,6 +64,7 @@ pub fn execute(config: RunConfig) -> Result<()> {
     // --- Governance loop ---
     let mut attempt: u32 = 0;
     let mut status = "running".to_string();
+    let mut last_agent_exit: i32 = 0;
     let feedback_file = run_dir.join("feedback.md");
     let mut all_rework_items: Vec<Value> = Vec::new();
     let mut all_static_failures: Vec<String> = Vec::new();
@@ -88,11 +89,12 @@ pub fn execute(config: RunConfig) -> Result<()> {
                  {feedback}\n\n\
                  Re-read the original task and fix the issues above.\n"
             );
-            run_agent_with_stdin(&config.agent_cmd, &workdir, &rework_input, &agent_log)?
+            run_agent_with_stdin(&config.agent_cmd, &workdir, &rework_input, &agent_log, &repo_root)?
         } else {
-            run_agent(&config.agent_cmd, &workdir, &agent_log)?
+            run_agent(&config.agent_cmd, &workdir, &agent_log, &repo_root)?
         };
 
+        last_agent_exit = agent_exit;
         log_info(&config, &format!("Agent finished (exit code: {agent_exit})"));
 
         // --- Observe: what changed? ---
@@ -383,6 +385,14 @@ pub fn execute(config: RunConfig) -> Result<()> {
     // =================================================================
     // Finalize — record governance log
     // =================================================================
+
+    // A non-zero exit code from the last agent run vetoes a governance pass: if the
+    // agent reports failure (e.g. eval resolved=false), the status is recorded as "error".
+    if status == "passed" && last_agent_exit != 0 {
+        log_info(&config, &format!("Agent exited with code {last_agent_exit} — overriding governance pass"));
+        status = "error".to_string();
+    }
+
     let end_time = Utc::now();
     let duration = (end_time - start_time).num_seconds();
 
@@ -504,23 +514,25 @@ fn git_rev_parse(workdir: &Path, refspec: &str) -> Option<String> {
         })
 }
 
-fn run_agent(cmd: &[String], workdir: &Path, log_path: &Path) -> Result<i32> {
+fn run_agent(cmd: &[String], workdir: &Path, log_path: &Path, repo_root: &Path) -> Result<i32> {
     let log_file = fs::File::create(log_path)?;
     let status = Command::new(&cmd[0])
         .args(&cmd[1..])
         .current_dir(workdir)
+        .env("SYNODIC_ROOT", repo_root)
         .stdout(log_file.try_clone()?)
         .stderr(log_file)
         .status()
         .with_context(|| format!("failed to run agent: {}", cmd[0]))?;
     Ok(status.code().unwrap_or(1))
 }
 
-fn run_agent_with_stdin(cmd: &[String], workdir: &Path, input: &str, log_path: &Path) -> Result<i32> {
+fn run_agent_with_stdin(cmd: &[String], workdir: &Path, input: &str, log_path: &Path, repo_root: &Path) -> Result<i32> {
     let log_file = fs::File::create(log_path)?;
     let mut child = Command::new(&cmd[0])
         .args(&cmd[1..])
         .current_dir(workdir)
+        .env("SYNODIC_ROOT", repo_root)
         .stdin(std::process::Stdio::piped())
         .stdout(log_file.try_clone()?)
         .stderr(log_file)

diff --git a/cli/src/util.rs b/cli/src/util.rs
@@ -2,7 +2,16 @@ use std::path::{Path, PathBuf};
 use std::process::Command;
 
 /// Walk up from CWD to find the repo root (contains `.harness/` or `.git`).
+///
+/// Respects the `SYNODIC_ROOT` env var — set by `harness run` so that eval subprocesses write governance logs to the
+/// correct project, not the testbed. If it is unset or does not name an existing directory, the walk-up search is used.
 pub fn find_repo_root() -> anyhow::Result<PathBuf> {
+    if let Ok(root) = std::env::var("SYNODIC_ROOT") {
+        let p = PathBuf::from(&root);
+        if p.is_dir() {
+            return Ok(p);
+        }
+    }
     let mut dir = std::env::current_dir()?;
     loop {
         if dir.join(".harness").is_dir() || dir.join(".git").exists() {
Original file line number	Diff line number	Diff line change
		@@ -1 +0,0 @@
		{"benchmark":"swebench","findings":[{"category":"correctness","reasons":["ERROR: Missing required plugins: pytest-bdd, pytest-benchmark, pytest-instafail, pytest-mock, pytest-qt, pytest-rerunfailures\n\n"],"summary":"1/1 F2P tests still failing — agent patch incomplete","tests":["['tests/unit/utils/test_qtlog.py::TestHideQtWarning::test_unfiltered', 'tests/unit/utils/test_qtlog.py::TestHideQtWarning::test_filtered[Hello]', 'tests/unit/utils/test_qtlog.py::TestHideQtWarning::test_filtered[Hello World]', 'tests/unit/utils/test_qtlog.py::TestHideQtWarning::test_filtered[ Hello World ]']"]},{"category":"infrastructure","failed_count":52,"reasons":["ERROR: Missing required plugins: pytest-bdd, pytest-benchmark, pytest-instafail, pytest-mock, pytest-qt, pytest-rerunfailures\n\n"],"summary":"52/52 P2P tests failed — likely environment/setup issue, not selective regression","total_count":52}],"instance_id":"instance_qutebrowser__qutebrowser-f91ace96223cac8161c16dd061907e138fe85111-v059c6fdc75567943479b23ebca7c07b5e9a7f34c","metrics":{"duration_s":0},"resolved":false,"skill":"fractal","source":"eval","split":"pro","status":"correctness","timestamp":"2026-03-18T15:00:35.992843231+00:00","work_id":"eval-instance_qutebrowser__qutebrowser-f91ace96223cac8161c16dd061907e138fe85111-v059c6fdc75567943479b23ebca7c07b5e9a7f34c-fractal-1773846035"}
Original file line number	Diff line number	Diff line change
		@@ -1 +0,0 @@
		{"work_id": "harness-1773756154", "source": "harness", "timestamp": "2026-03-17T14:02:34Z", "status": "passed", "agent_command": "echo no changes", "rework_items": [], "static_failures": [], "metrics": {"attempt_count": 1, "duration_s": 0, "base_ref": "315fc084c5710efc73a5bdce5c8b0700d4305d98"}}