TIGER-AI-Lab · Perry2004 · May 23, 2026 · May 23, 2026 · May 23, 2026 · May 23, 2026
diff --git a/.github/workflows/check-docker-build.yml → .github/workflows/check-harness-build.yml b/.github/workflows/check-docker-build.yml → .github/workflows/check-harness-build.yml
@@ -1,20 +1,20 @@
-name: check-docker-build
+name: check-harness-build
 
 on:
   pull_request:
     branches: [ main ]
     paths:
       - "src/clawbench/runtime/**"
-      - ".github/workflows/check-docker-build.yml"
+      - ".github/workflows/check-harness-build.yml"
   push:
     branches: [ main ]
     paths:
       - "src/clawbench/runtime/**"
-      - ".github/workflows/check-docker-build.yml"
+      - ".github/workflows/check-harness-build.yml"
   workflow_dispatch:
 
 jobs:
-  docker-build:
+  harness-build:
     runs-on: ubuntu-latest
     env:
       DOCKER_BUILD_PARALLELISM: "3"
@@ -24,6 +24,50 @@ jobs:
         with:
           fetch-depth: 0
 
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.11"
+
+      - name: Install harness registry validator
+        run: python -m pip install "jsonschema==4.26.0" "PyYAML==6.0.2"
+
+      - name: Validate harness registry
+        shell: bash
+        run: |
+          set -euo pipefail
+          python - <<'PY'
+          import json
+          import sys
+          from pathlib import Path
+
+          import yaml
+          from jsonschema import Draft202012Validator
+
+          harness_root = Path("src/clawbench/runtime/harnesses")
+          registry_path = harness_root / "harnesses.yaml"
+          schema_path = harness_root / "harness.schema.json"
+
+          schema = json.loads(schema_path.read_text())
+          registry = yaml.safe_load(registry_path.read_text())
+          validator = Draft202012Validator(schema)
+
+          errors = [
+              "/" + "/".join(str(part) for part in error.path) + f": {error.message}"
+              for error in sorted(validator.iter_errors(registry), key=lambda item: list(item.path))
+          ]
+          if errors:
+              print("Harness registry schema validation failed:")
+              print("\n".join(errors))
+              raise SystemExit(1)
+
+          sys.path.insert(0, str(Path("src")))
+          from clawbench.runner.run_support.harness_registry import load_harness_registry
+
+          load_harness_registry(registry_path)
+          print(f"Validated {registry_path} against {schema_path}")
+          PY
+
       - name: Set up Docker
         uses: docker/setup-docker-action@v5.1.0
         with:
@@ -34,16 +78,16 @@ jobs:
               }
             }
 
-      - name: Restore Docker build cache
+      - name: Restore harness build cache
         id: docker-cache
         uses: actions/cache@v5.0.5
         with:
           path: /tmp/.buildx-cache
-          key: ${{ runner.os }}-docker-buildx-${{ github.sha }}
+          key: ${{ runner.os }}-harness-buildx-${{ github.sha }}
           restore-keys: |
-            ${{ runner.os }}-docker-buildx-
+            ${{ runner.os }}-harness-buildx-
 
-      - name: Inspect restored Docker build cache
+      - name: Inspect restored harness build cache
         shell: bash
         run: |
           set -euo pipefail
@@ -52,7 +96,7 @@ jobs:
             find /tmp/.buildx-cache -mindepth 1 -maxdepth 1 -type d -print | sort
             du -sh /tmp/.buildx-cache || true
           else
-            echo "No Docker build cache restored."
+            echo "No harness build cache restored."
           fi
 
       - name: Select harness Dockerfiles
@@ -147,10 +191,10 @@ jobs:
 
           echo "Selected ${#selected_dockerfiles[@]} harness Dockerfile(s):"
           printf '  %s\n' "${selected_dockerfiles[@]}"
-          echo "::notice title=Docker build plan::Selected ${#selected_dockerfiles[@]} harness Dockerfile(s)"
+          echo "::notice title=Harness build plan::Selected ${#selected_dockerfiles[@]} harness Dockerfile(s)"
 
           {
-            echo "## Docker build plan"
+            echo "## Harness build plan"
             echo
             echo "- Event: \`${{ github.event_name }}\`"
             echo "- Selection mode: \`$([[ "${build_all}" == "true" ]] && echo all || echo changed)\`"
@@ -166,7 +210,7 @@ jobs:
             done
           } >> "${GITHUB_STEP_SUMMARY}"
 
-      - name: Build Docker images
+      - name: Build harness Docker images
         shell: bash
         run: |
           set -euo pipefail
@@ -175,15 +219,15 @@ jobs:
           base_dockerfile="${harness_root}/base/Dockerfile.base"
           cache_root="/tmp/.buildx-cache"
           cache_update="/tmp/.buildx-cache-update"
-          status_root="/tmp/clawbench-docker-build-status"
+          status_root="/tmp/clawbench-harness-build-status"
           parallelism="${DOCKER_BUILD_PARALLELISM:-3}"
 
           rm -rf "${cache_update}" "${status_root}"
           mkdir -p "${cache_root}" "${cache_update}" "${status_root}"
 
           {
             echo
-            echo "## Docker build results"
+            echo "## Harness build results"
             echo
             echo "- Restored cache key: \`${{ steps.docker-cache.outputs.cache-matched-key || 'none' }}\`"
             echo "- Exact cache hit: \`${{ steps.docker-cache.outputs.cache-hit || 'false' }}\`"

diff --git a/README.md b/README.md
@@ -801,7 +801,7 @@ If an agent encounters a CAPTCHA, it must attempt it. We have seen cases where f
 The repo default is `openclaw`, but leaderboard rows include their harness explicitly. V1 results used OpenClaw; newer runs may use Hermes or other supported harnesses. Use the `harness` column when comparing models, because model and harness changes are separate experimental axes.
 
 **Is ClawBench tightly coupled to OpenClaw?**
-No. OpenClaw is the default harness, but ClawBench supports interchangeable harnesses listed in `src/clawbench/runtime/harnesses/`.
+No. OpenClaw is the default harness, but ClawBench supports interchangeable harnesses listed in `src/clawbench/runtime/harnesses/harnesses.yaml`.
 
 **Can ClawBench evaluate CLI agents?**
 Yes. ClawBench is a browser-task benchmark, but CLI and coding-agent harnesses can drive the same instrumented Chromium session using native tools or MCPs.
@@ -816,7 +816,7 @@ Yes. New model runs can be submitted or requested through the contribution flow
 The runner uses a hardened container with a request interceptor that blocks purchases, account creation, outbound email sends, and similar irreversible actions by default. Tasks that need to *simulate* those actions (e.g., "add to cart and checkout") terminate at the last reversible step. You can relax the interceptor per-task if your research requires it.
 
 **Can I contribute new tasks or harnesses?**
-Yes. V1 tasks live in `test-cases/v1/`; V2 tasks live in `test-cases/v2/`; Lite tasks live in `test-cases/v1-lite/`. See `CONTRIBUTING.md` for the task schema and validation flow.
+Yes. V1 tasks live in `test-cases/v1/`; V2 tasks live in `test-cases/v2/`; Lite tasks live in `test-cases/v1-lite/`. Harness definitions live in `src/clawbench/runtime/harnesses/harnesses.yaml`. See `CONTRIBUTING.md` for the task schema and validation flow.
 
 **How does ClawBench relate to HarnessBench?**
 Same scoring pipeline, orthogonal axis. ClawBench fixes the harness and varies the model; HarnessBench fixes the model and varies the harness. They share the V1 153-task corpus, the five-layer recording, and the agentic evaluator — so numbers are directly comparable.

diff --git a/src/README.md b/src/README.md
@@ -58,6 +58,8 @@ src/
     setup.sh                     # Local extension launch helper
     README.md
   harnesses/
+    harnesses.yaml               # Central harness registry: images, Dockerfiles, scripts
+    harness.schema.json          # JSON Schema for harnesses.yaml
     base/
       Dockerfile.base            # Shared Chromium, Xvfb, noVNC, server, extension image
       entrypoint.sh              # Shared container startup logic
@@ -140,7 +142,9 @@ Model entries are validated against `models/model.schema.json`.
 13. Optionally uploads the run to HuggingFace.
 14. Removes the container, deletes the disposable email, and removes the temporary personal info directory.
 
-Check `clawbench/runtime/harnesses/` for the currently supported harnesses.
+Check `clawbench/runtime/harnesses/harnesses.yaml` for the supported harnesses
+and their Dockerfile, setup script, run script, helper-file, and
+agent-message source mappings.
 
 Use `--harness <name>` to select one. The default is `openclaw`.
 

diff --git a/src/clawbench/runner/run.py b/src/clawbench/runner/run.py
@@ -340,7 +340,11 @@ def handle_sigint(sig, frame):
             step(f"Agent running (max {task['time_limit']}min)")
 
         phase = "waiting_for_container"
-        docker_wait(container, model_cfg=None if args.human else model_cfg)
+        docker_wait(
+            container,
+            model_cfg=None if args.human else model_cfg,
+            harness=None if args.human else args.harness,
+        )
 
         phase = "container_logs"
         step("Container logs")

diff --git a/src/clawbench/runner/run_support/config.py b/src/clawbench/runner/run_support/config.py
@@ -7,31 +7,56 @@
 
 import yaml
 
+from clawbench.runner.run_support.harness_registry import (
+    HARNESS_REGISTRY,
+    HARNESS_REGISTRY_YAML,
+    HarnessRegistry,
+    load_harness_registry,
+)
 from clawbench.utils.paths import (
     ASSET_ROOT,
     WORKSPACE_ROOT,
     bundled_path,
     workspace_path,
 )
 
-HARNESSES = (
-    "openclaw",
-    "opencode",
-    "claude-code",
-    "claude-code-chrome-extension",
-    "codex",
-    "browser-use",
-    "claw-code",
-    "hermes",
-    "pi",
-)
-DEFAULT_HARNESS = "openclaw"
-BASE_IMAGE = "clawbench-base"
+__all__ = [
+    "ASSET_ROOT",
+    "BASE_IMAGE",
+    "DEFAULT_HARNESS",
+    "ENGINE",
+    "HARNESS_REGISTRY",
+    "HARNESS_REGISTRY_YAML",
+    "HARNESSES",
+    "IMAGE",
+    "MODELS_YAML",
+    "WORKSPACE_ROOT",
+    "HarnessRegistry",
+    "harness_image",
+    "load_dotenv",
+    "load_harness_registry",
+    "load_model_config",
+    "load_models_yaml",
+    "load_runtime_env",
+    "resolve_task_file",
+    "resolve_test_case_dir",
+    "resolve_test_case_path",
+]
+
+
+HARNESSES = HARNESS_REGISTRY.harnesses
+DEFAULT_HARNESS = HARNESS_REGISTRY.default
+BASE_IMAGE = HARNESS_REGISTRY.base_image
 
 
 def harness_image(harness: str) -> str:
     """Return the docker image tag for a given harness name."""
-    return f"clawbench-{harness}"
+    try:
+        return HARNESS_REGISTRY.harness_images[harness]
+    except KeyError as e:
+        raise ValueError(
+            f"Unknown harness {harness!r}; expected one of {list(HARNESSES)}"
+        ) from e
 
 
 # Kept for back-compat with old callers / scripts that imported IMAGE.