facebookresearch · yurekami · May 22, 2026
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -1,4 +1,4 @@
-# Contributing to __________
+# Contributing to ProgramBench
 We want to make contributing to this project as easy and transparent as
 possible.
 
@@ -35,5 +35,5 @@ outlined on that page and do not file a public issue.
 * ...
 
 ## License
-By contributing to __________, you agree that your contributions will be licensed
+By contributing to ProgramBench, you agree that your contributions will be licensed
 under the LICENSE file in the root directory of this source tree.
diff --git a/docs/README.md b/docs/README.md
@@ -8,7 +8,7 @@
 ## Inference
 
 Please use the images with tag `task_cleanroom` from `https://hub.docker.com/orgs/programbench/repositories`.
-E.g., to solve the task `ffmpeg__ffmpeg.360a402`, use the followoing image:
+E.g., to solve the task `ffmpeg__ffmpeg.360a402`, use the following image:
 
 ```
 https://hub.docker.com/repository/docker/programbench/ffmpeg_1776_ffmpeg.360a402/tags/task_cleanroom/
@@ -41,9 +41,9 @@ We expect to release our baseline system in `mini-swe-agent` this week.
 
 ## Evaluation
 
-Evaluation your agent run is the main function performed by the `ProgramBench` repository.
+Evaluating your agent run is the main function performed by the `ProgramBench` repository.
 
-After following the installation instructions from the [README](https://github.com/SWE-agent/ProgramBench#installation), you can run the evaluation with:
+After following the installation instructions from the [README](../README.md#quickstart), you can run the evaluation with:
 
 ```
 uv run programbench eval /path/to/my-amazing-agent-run

diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -0,0 +1,47 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Smoke tests for CLI subcommands.
+
+Each test runs the CLI app through ``CliRunner`` with ``--help`` and checks
+the exit code plus a few expected words in the captured output.
+"""
+
+from typer.testing import CliRunner
+
+from programbench.cli.main import app
+
+runner = CliRunner()
+
+
+def test_top_level_help():
+    """Top-level ``--help`` exits with 0 and mentions eval, blob and info."""
+    outcome = runner.invoke(app, ["--help"])
+    assert outcome.exit_code == 0
+    for subcommand in ("eval", "blob", "info"):
+        assert subcommand in outcome.output
+
+
+def test_info_help():
+    """``info --help`` exits with 0 and mentions run-dir (or run_dir)."""
+    outcome = runner.invoke(app, ["info", "--help"])
+    assert outcome.exit_code == 0
+    help_text = outcome.output.lower()
+    assert any(spelling in help_text for spelling in ("run-dir", "run_dir"))
+
+
+def test_blob_help():
+    """``blob --help`` exits with 0 and mentions sync."""
+    outcome = runner.invoke(app, ["blob", "--help"])
+    assert outcome.exit_code == 0
+    assert "sync" in outcome.output
+
+
+def test_blob_sync_help():
+    """``blob sync --help`` exits with 0 and mentions "instance"."""
+    outcome = runner.invoke(app, ["blob", "sync", "--help"])
+    assert outcome.exit_code == 0
+    assert "instance" in outcome.output.lower()
diff --git a/tests/test_eval_extras.py b/tests/test_eval_extras.py
@@ -0,0 +1,126 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Tests for eval functions not covered by test_eval.py."""
+
+import pytest
+
+from programbench.eval.eval import (
+    EvaluationResult,
+    TestBranchError,
+    TestResult,
+    count_testcases,
+)
+from programbench.eval.eval_batch import _can_reprocess
+
+
+JUNIT_XML_THREE_CASES = """\
+<?xml version="1.0" encoding="utf-8"?>
+<testsuites>
+  <testsuite name="pytest" tests="3">
+    <testcase classname="t" name="a" time="0.01"/>
+    <testcase classname="t" name="b" time="0.02"/>
+    <testcase classname="t" name="c" time="0.03"/>
+  </testsuite>
+</testsuites>
+"""
+
+
+def _results_read(branch):
+    """Build a ``results_read`` log entry with return code 0 for *branch*."""
+    return {"step": "results_read", "branch": branch, "returncode": 0, "output": "<xml/>"}
+
+
+class TestCountTestcases:
+    """``count_testcases`` on empty, blank, malformed and well-formed XML."""
+
+    @pytest.mark.parametrize(
+        ("xml", "expected"),
+        [
+            ("", 0),
+            ("   \n  ", 0),
+            ("<not valid xml>", 0),
+            (JUNIT_XML_THREE_CASES, 3),
+        ],
+    )
+    def test_counts(self, xml, expected):
+        """The count for *xml* equals *expected*."""
+        counted = count_testcases(xml)
+        assert counted == expected
+
+
+class TestEvaluationResultSummarize:
+    """Substring checks on the text returned by ``EvaluationResult.summarize``."""
+
+    def test_clean_run(self):
+        """Two passing tests: the summary contains "100", "2/2" and the branch name."""
+        passing = [
+            TestResult(name=test_name, branch="b1", status="passed", extra={})
+            for test_name in ("t1", "t2")
+        ]
+        summary = EvaluationResult(test_results=passing, solution_branch="submission").summarize()
+        for fragment in ("100", "2/2", "submission"):
+            assert fragment in summary
+
+    def test_with_error_code(self):
+        """The error code and its details both appear in the summary."""
+        errored = EvaluationResult(error_code="compile_failed", error_details="gcc not found")
+        summary = errored.summarize()
+        for fragment in ("compile_failed", "gcc not found"):
+            assert fragment in summary
+
+    def test_with_branch_errors(self):
+        """A branch with a recorded error is named in the summary."""
+        timeout = TestBranchError(error_code="timeout", error_details="")
+        evaluation = EvaluationResult(
+            test_results=[TestResult(name="t1", branch="b1", status="passed", extra={})],
+            test_branch_errors={"b2": [timeout]},
+        )
+        summary = evaluation.summarize()
+        assert "b2" in summary
+
+    def test_with_system_errors(self):
+        """One ``system_error`` test result is reported as ``system_errors=1``."""
+        broken = TestResult(name="t1", branch="b1", status="system_error", extra={})
+        summary = EvaluationResult(test_results=[broken]).summarize()
+        assert "system_errors=1" in summary
+
+    def test_with_warnings(self):
+        """One warning is reported as ``warnings=1``."""
+        summary = EvaluationResult(warnings=["something unexpected"]).summarize()
+        assert "warnings=1" in summary
+
+
+class TestCanReprocess:
+    """Cases for ``_can_reprocess`` on hand-built ``EvaluationResult`` objects."""
+
+    def test_error_code_is_reprocessable(self):
+        """A result carrying only an error code is accepted."""
+        errored = EvaluationResult(error_code="compile_failed")
+        assert _can_reprocess(errored)
+
+    def test_all_branches_tagged_in_log(self):
+        """Accepted when every test branch has a ``results_read`` log entry."""
+        evaluation = EvaluationResult(
+            test_branches=["b1", "b2"],
+            log=[_results_read(branch) for branch in ("b1", "b2")],
+        )
+        assert _can_reprocess(evaluation)
+
+    def test_missing_branch_in_log_not_reprocessable(self):
+        """Rejected when ``b2`` has neither a log entry nor a recorded error."""
+        evaluation = EvaluationResult(test_branches=["b1", "b2"], log=[_results_read("b1")])
+        assert not _can_reprocess(evaluation)
+
+    def test_branch_with_error_excluded_from_check(self):
+        """Accepted when the only branch missing from the log has a recorded error."""
+        failure = TestBranchError(error_code="fail", error_details="")
+        evaluation = EvaluationResult(
+            test_branches=["b1", "b2"],
+            test_branch_errors={"b2": [failure]},
+            log=[_results_read("b1")],
+        )
+        assert _can_reprocess(evaluation)
diff --git a/tests/test_instance_filters.py b/tests/test_instance_filters.py
@@ -0,0 +1,72 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Tests for ``filter_instances``."""
+
+from programbench.utils.instance_filters import filter_instances
+
+
+def _inst(iid: str, branches: dict | None = None) -> dict:
+    """Build a minimal instance dict; a falsy *branches* becomes ``{}``."""
+    record = {"instance_id": iid}
+    record["branches"] = branches if branches else {}
+    return record
+
+
+def _ids(instances) -> list:
+    """Return the ``instance_id`` of each instance, in order."""
+    return [entry["instance_id"] for entry in instances]
+
+
+INSTANCES = [
+    _inst("alpha__foo.abc", {"b1": {"tests": ["t1"]}}),
+    _inst("beta__bar.def", {"b2": {"tests": ["t2"]}}),
+    _inst("gamma__baz.ghi"),
+]
+
+
+class TestFilterInstances:
+    """Cases for ``filter_instances`` run against the module-level ``INSTANCES``."""
+
+    def test_no_filters_returns_all(self):
+        """With no options, the result compares equal to the input list."""
+        unfiltered = filter_instances(INSTANCES)
+        assert unfiltered == INSTANCES
+
+    def test_regex_filter(self):
+        """``filter_spec="alpha.*"`` keeps only the alpha instance."""
+        kept = filter_instances(INSTANCES, filter_spec="alpha.*")
+        assert _ids(kept) == ["alpha__foo.abc"]
+
+    def test_regex_filter_no_match(self):
+        """A ``filter_spec`` that matches nothing yields an empty list."""
+        kept = filter_instances(INSTANCES, filter_spec="nonexistent")
+        assert kept == []
+
+    def test_slice_spec(self):
+        """``slice_spec="0:2"`` keeps the first two instances, in order."""
+        kept = filter_instances(INSTANCES, slice_spec="0:2")
+        assert _ids(kept) == ["alpha__foo.abc", "beta__bar.def"]
+
+    def test_slice_from_end(self):
+        """``slice_spec="-1:"`` keeps only the last instance."""
+        kept = filter_instances(INSTANCES, slice_spec="-1:")
+        assert _ids(kept) == ["gamma__baz.ghi"]
+
+    def test_has_test_branch(self):
+        """``has_test_branch=True`` keeps the two instances with non-empty ``branches``."""
+        kept = filter_instances(INSTANCES, has_test_branch=True)
+        assert _ids(kept) == ["alpha__foo.abc", "beta__bar.def"]
+
+    def test_filter_and_slice_combined(self):
+        """``filter_spec`` plus ``slice_spec="0:1"`` leaves only the alpha instance."""
+        kept = filter_instances(INSTANCES, filter_spec="(alpha|beta).*", slice_spec="0:1")
+        assert _ids(kept) == ["alpha__foo.abc"]
+
+    def test_shuffle_is_deterministic(self):
+        """Two ``shuffle=True`` calls on the same input return the same order."""
+        first, second = (_ids(filter_instances(INSTANCES, shuffle=True)) for _ in range(2))
+        assert first == second