Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Contributing to __________
# Contributing to ProgramBench
We want to make contributing to this project as easy and transparent as
possible.

Expand Down Expand Up @@ -35,5 +35,5 @@ outlined on that page and do not file a public issue.
* ...

## License
By contributing to __________, you agree that your contributions will be licensed
By contributing to ProgramBench, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.
6 changes: 3 additions & 3 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
## Inference

Please use the images with tag `task_cleanroom` from `https://hub.docker.com/orgs/programbench/repositories`.
E.g., to solve the task `ffmpeg__ffmpeg.360a402`, use the followoing image:
E.g., to solve the task `ffmpeg__ffmpeg.360a402`, use the following image:

```
https://hub.docker.com/repository/docker/programbench/ffmpeg_1776_ffmpeg.360a402/tags/task_cleanroom/
Expand Down Expand Up @@ -41,9 +41,9 @@ We expect to release our baseline system in `mini-swe-agent` this week.

## Evaluation

Evaluation your agent run is the main function performed by the `ProgramBench` repository.
Evaluating your agent run is the main function performed by the `ProgramBench` repository.

After following the installation instructions from the [README](https://github.com/SWE-agent/ProgramBench#installation), you can run the evaluation with:
After following the installation instructions from the [README](../README.md#quickstart), you can run the evaluation with:

```
uv run programbench eval /path/to/my-amazing-agent-run
Expand Down
39 changes: 39 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""Smoke tests for CLI subcommands."""

from typer.testing import CliRunner

from programbench.cli.main import app

# Module-level Typer test runner; every test below invokes `app` through it.
runner = CliRunner()


def test_top_level_help():
    """Top-level ``--help`` exits with code 0 and mentions each subcommand."""
    outcome = runner.invoke(app, ["--help"])
    assert outcome.exit_code == 0
    help_text = outcome.output
    for subcommand in ("eval", "blob", "info"):
        assert subcommand in help_text


def test_info_help():
    """``info --help`` exits with code 0 and mentions the run directory argument."""
    outcome = runner.invoke(app, ["info", "--help"])
    assert outcome.exit_code == 0
    # Accept either spelling of the argument name, case-insensitively.
    lowered = outcome.output.lower()
    assert any(spelling in lowered for spelling in ("run-dir", "run_dir"))


def test_blob_help():
    """``blob --help`` exits with code 0 and lists the ``sync`` subcommand."""
    cli_args = ["blob", "--help"]
    outcome = runner.invoke(app, cli_args)
    assert outcome.exit_code == 0
    help_text = outcome.output
    assert "sync" in help_text


def test_blob_sync_help():
    """``blob sync --help`` exits with code 0 and mentions "instance"."""
    cli_args = ["blob", "sync", "--help"]
    outcome = runner.invoke(app, cli_args)
    assert outcome.exit_code == 0
    # Case-insensitive match on the help text.
    lowered = outcome.output.lower()
    assert "instance" in lowered
115 changes: 115 additions & 0 deletions tests/test_eval_extras.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""Tests for eval functions not covered by test_eval.py."""

import pytest

from programbench.eval.eval import (
EvaluationResult,
TestBranchError,
TestResult,
count_testcases,
)
from programbench.eval.eval_batch import _can_reprocess


# JUnit-style XML fixture: one <testsuite> holding exactly three <testcase>
# elements; used as the "valid report" input for count_testcases.
JUNIT_XML_THREE_CASES = """\
<?xml version="1.0" encoding="utf-8"?>
<testsuites>
<testsuite name="pytest" tests="3">
<testcase classname="t" name="a" time="0.01"/>
<testcase classname="t" name="b" time="0.02"/>
<testcase classname="t" name="c" time="0.03"/>
</testsuite>
</testsuites>
"""


class TestCountTestcases:
    """Parametrized checks of ``count_testcases`` on empty, blank, malformed and valid XML."""

    # (input text, expected number of test cases)
    _CASES = [
        ("", 0),
        (" \n ", 0),
        ("<not valid xml>", 0),
        (JUNIT_XML_THREE_CASES, 3),
    ]

    @pytest.mark.parametrize("xml,expected", _CASES)
    def test_counts(self, xml, expected):
        counted = count_testcases(xml)
        assert counted == expected


class TestEvaluationResultSummarize:
    """Checks on the text returned by ``EvaluationResult.summarize()``."""

    def test_clean_run(self):
        # Two passing tests on the same branch.
        passing = [
            TestResult(name=test_name, branch="b1", status="passed", extra={})
            for test_name in ("t1", "t2")
        ]
        evaluation = EvaluationResult(test_results=passing, solution_branch="submission")
        summary = evaluation.summarize()
        for fragment in ("100", "2/2", "submission"):
            assert fragment in summary

    def test_with_error_code(self):
        evaluation = EvaluationResult(error_code="compile_failed", error_details="gcc not found")
        summary = evaluation.summarize()
        for fragment in ("compile_failed", "gcc not found"):
            assert fragment in summary

    def test_with_branch_errors(self):
        only_pass = TestResult(name="t1", branch="b1", status="passed", extra={})
        timed_out = TestBranchError(error_code="timeout", error_details="")
        evaluation = EvaluationResult(
            test_results=[only_pass],
            test_branch_errors={"b2": [timed_out]},
        )
        summary = evaluation.summarize()
        assert "b2" in summary

    def test_with_system_errors(self):
        broken = TestResult(name="t1", branch="b1", status="system_error", extra={})
        evaluation = EvaluationResult(test_results=[broken])
        summary = evaluation.summarize()
        assert "system_errors=1" in summary

    def test_with_warnings(self):
        evaluation = EvaluationResult(warnings=["something unexpected"])
        summary = evaluation.summarize()
        assert "warnings=1" in summary


class TestCanReprocess:
    """Checks for which ``EvaluationResult`` objects ``_can_reprocess`` accepts."""

    @staticmethod
    def _results_read(branch):
        """Return a ``results_read`` log entry for ``branch`` with return code 0."""
        return {"step": "results_read", "branch": branch, "returncode": 0, "output": "<xml/>"}

    def test_error_code_is_reprocessable(self):
        failed = EvaluationResult(error_code="compile_failed")
        assert _can_reprocess(failed)

    def test_all_branches_tagged_in_log(self):
        # Every listed branch has a matching log entry.
        entries = [self._results_read(name) for name in ("b1", "b2")]
        evaluation = EvaluationResult(test_branches=["b1", "b2"], log=entries)
        assert _can_reprocess(evaluation)

    def test_missing_branch_in_log_not_reprocessable(self):
        # "b2" is listed as a test branch but has no log entry.
        evaluation = EvaluationResult(
            test_branches=["b1", "b2"],
            log=[self._results_read("b1")],
        )
        assert not _can_reprocess(evaluation)

    def test_branch_with_error_excluded_from_check(self):
        # "b2" has no log entry but is recorded as a branch error.
        branch_failure = TestBranchError(error_code="fail", error_details="")
        evaluation = EvaluationResult(
            test_branches=["b1", "b2"],
            test_branch_errors={"b2": [branch_failure]},
            log=[self._results_read("b1")],
        )
        assert _can_reprocess(evaluation)
51 changes: 51 additions & 0 deletions tests/test_instance_filters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from programbench.utils.instance_filters import filter_instances


def _inst(iid: str, branches: dict | None = None) -> dict:
return {"instance_id": iid, "branches": branches or {}}


# Shared fixture: two instances that each declare one branch with a test,
# followed by one instance built with no branches.
INSTANCES = [
    _inst("alpha__foo.abc", {"b1": {"tests": ["t1"]}}),
    _inst("beta__bar.def", {"b2": {"tests": ["t2"]}}),
    _inst("gamma__baz.ghi"),
]


class TestFilterInstances:
    """Exercises ``filter_instances`` against the module-level ``INSTANCES`` fixture."""

    @staticmethod
    def _ids(instances):
        """Collect the ``instance_id`` of every record, preserving order."""
        return [record["instance_id"] for record in instances]

    def test_no_filters_returns_all(self):
        unfiltered = filter_instances(INSTANCES)
        assert unfiltered == INSTANCES

    def test_regex_filter(self):
        matched = filter_instances(INSTANCES, filter_spec="alpha.*")
        assert self._ids(matched) == ["alpha__foo.abc"]

    def test_regex_filter_no_match(self):
        matched = filter_instances(INSTANCES, filter_spec="nonexistent")
        assert matched == []

    def test_slice_spec(self):
        sliced = filter_instances(INSTANCES, slice_spec="0:2")
        expected_ids = ["alpha__foo.abc", "beta__bar.def"]
        assert self._ids(sliced) == expected_ids

    def test_slice_from_end(self):
        sliced = filter_instances(INSTANCES, slice_spec="-1:")
        assert self._ids(sliced) == ["gamma__baz.ghi"]

    def test_has_test_branch(self):
        with_branches = filter_instances(INSTANCES, has_test_branch=True)
        assert self._ids(with_branches) == ["alpha__foo.abc", "beta__bar.def"]

    def test_filter_and_slice_combined(self):
        selected = filter_instances(INSTANCES, filter_spec="(alpha|beta).*", slice_spec="0:1")
        assert self._ids(selected) == ["alpha__foo.abc"]

    def test_shuffle_is_deterministic(self):
        # Two independent shuffled calls must yield the same ordering.
        first_order = self._ids(filter_instances(INSTANCES, shuffle=True))
        second_order = self._ids(filter_instances(INSTANCES, shuffle=True))
        assert first_order == second_order