OpenHands · xingyaoww · May 7, 2026 · May 8, 2026 · May 8, 2026 · May 8, 2026
diff --git a/benchmarks/evoclaw/README.md b/benchmarks/evoclaw/README.md
@@ -0,0 +1,20 @@
+# EvoClaw
+
+This benchmark entrypoint runs OpenHands against EvoClaw repositories through the
+standard OpenHands benchmarks SDK path:
+
+1. discover EvoClaw repo directories from `--data-root`,
+2. build/start an OpenHands agent-server workspace from each EvoClaw base image,
+3. upload the EvoClaw task queue and SRS files into the workspace,
+4. run `Agent`/`Conversation` with the normal fake-user evaluation loop,
+5. emit the resulting git patch and conversation trajectory.
+
+```bash
+uv run evoclaw-infer .llm_config/example.json \
+  --data-root /path/to/EvoClaw-data \
+  --repos navidrome \
+  --n-limit 1
+```
+
+This is currently an inference-only harness. It intentionally does not reimplement
+EvoClaw's milestone DAG grader inside this repo.
diff --git a/benchmarks/evoclaw/__init__.py b/benchmarks/evoclaw/__init__.py
@@ -0,0 +1 @@
+"""EvoClaw benchmark integration."""
diff --git a/benchmarks/evoclaw/config.py b/benchmarks/evoclaw/config.py
@@ -0,0 +1,14 @@
+"""Defaults for EvoClaw inference."""
+
+INFER_DEFAULTS = {  # default settings for EvoClaw inference runs
+    "dataset": "evoclaw",
+    "split": "test",
+    "max_iterations": 3000,  # NOTE(review): presumably a per-instance agent step cap — confirm against the consumer
+    "instance_timeout": 18000,  # presumably seconds (18000 s = 5 h) — TODO confirm unit
+    "num_workers": 1,
+    "n_critic_runs": 1,
+    "workspace": "docker",
+    "enable_condenser": True,
+    "condenser_max_size": 100,  # NOTE(review): unit not shown here (events? messages?) — confirm
+    "condenser_keep_first": 4,  # NOTE(review): presumably leading items preserved when condensing — confirm
+}
diff --git a/benchmarks/evoclaw/prompts/default.j2 b/benchmarks/evoclaw/prompts/default.j2
@@ -0,0 +1,9 @@
+We need to modify the repository in /testbed to complete the EvoClaw task queue.
+
+Task queue:
+{{ task_queue_path }}
+
+Requirements files are available under:
+{{ srs_dir }}
+
+For each listed milestone, read its SRS file, implement the requested behavior in /testbed, and run the relevant tests when practical. If all listed milestones are complete, use the finish tool.