Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions configs/eval/human-debug.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
save_results = true

[[eval]]
env_id = "primeintellect/wordle"
num_examples = 1
rollouts_per_example = 1
human_debug = true
17 changes: 17 additions & 0 deletions docs/evaluation.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ The `--max-retries` flag enables automatic retry with exponential backoff when r
| `--verbose` | `-v` | false | Enable debug logging |
| `--tui` | `-u` | false | Use alternate screen mode (TUI) for display |
| `--debug` | `-d` | false | Disable Rich display; use normal logging and tqdm progress |
| `--human-debug` | — | false | Use interactive human input for model responses (text-only) |
| `--save-results` | `-s` | false | Save results to disk |
| `--resume [PATH]` | `-R` | — | Resume from a previous run (auto-detect latest matching incomplete run if PATH omitted) |
| `--state-columns` | `-C` | — | Extra state columns to save (comma-separated) |
Expand All @@ -166,6 +167,21 @@ Results are saved to `./outputs/evals/{env_id}--{model}/{run_id}/`, containing:
- `results.jsonl` — rollout outputs, one per line
- `metadata.json` — evaluation configuration and aggregate metrics

### Human Debug Mode

Use `--human-debug` to replace API model calls with terminal-entered responses:

```bash
prime eval run my-env --human-debug -n 3 -r 1 -s
```

In this mode:
- Responses are entered interactively in the CLI and ended with `:wq` on its own line
- Only text responses are supported (tool calls are not supported)
- Exactly one eval config is supported per run
- Execution is forced to sequential + independent scoring (`max_concurrent=1`, `independent_scoring=true`)
- TUI display is disabled automatically to avoid stdin/display conflicts

### Resuming Evaluations

Long-running evaluations can be interrupted and resumed using checkpointing. When `--save-results` is enabled, results are saved incrementally after each completed group of rollouts. Use `--resume` to continue from where you left off. Pass a path to resume a specific run, or omit the path to auto-detect the latest incomplete matching run.
Expand Down Expand Up @@ -290,6 +306,7 @@ Each `[[eval]]` section must contain an `env_id` field. All other fields are opt
| `extra_env_kwargs` | table | Arguments passed to environment constructor |
| `model` | string | Model to evaluate |
| `endpoint_id` | string | Endpoint registry id (requires TOML `endpoints_path`) |
| `human_debug` | boolean | Use interactive human-entered model responses (single eval only) |

Example with `env_args`:

Expand Down
56 changes: 56 additions & 0 deletions tests/test_eval_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def _run_cli(
"max_retries": 0,
"tui": False,
"debug": False,
"human_debug": False,
"heartbeat_url": None,
}
base_args.update(overrides)
Expand Down Expand Up @@ -116,6 +117,35 @@ def test_cli_single_env_id(monkeypatch, run_cli):
assert configs[0].env_id == "env1"


def test_cli_human_debug_sets_flag(monkeypatch, run_cli):
    """--human-debug on the CLI is carried through to the eval config."""
    overrides = {"env_id_or_config": "env1", "human_debug": True}
    captured = run_cli(monkeypatch, overrides)

    assert captured["configs"][0].human_debug is True


def test_cli_human_debug_forces_safe_runtime(monkeypatch, run_cli):
    """human_debug overrides concurrency and scoring to the safe sequential setup."""
    overrides = {
        "env_id_or_config": "env1",
        "human_debug": True,
        "max_concurrent": 8,
        "independent_scoring": False,
    }
    captured = run_cli(monkeypatch, overrides)

    cfg = captured["configs"][0]
    assert cfg.independent_scoring is True
    assert cfg.max_concurrent == 1


def test_cli_sampling_args_precedence_over_flags(monkeypatch, run_cli):
"""sampling_args JSON takes precedence over individual flags."""
captured = run_cli(
Expand Down Expand Up @@ -469,6 +499,15 @@ def test_load_toml_config_single_eval():
assert result[0]["env_id"] == "env1"


def test_load_toml_config_accepts_human_debug():
    """`human_debug` inside an [[eval]] table survives TOML parsing.

    Writes the config, closes the file before reading it back (required on
    Windows, where an open NamedTemporaryFile cannot be reopened), and removes
    the temp file afterwards so test runs don't leak files.
    """
    with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
        f.write('[[eval]]\nenv_id = "env1"\nhuman_debug = true\n')
        config_path = Path(f.name)
    try:
        result = load_toml_config(config_path)
        assert len(result) == 1
        assert result[0]["human_debug"] is True
    finally:
        config_path.unlink(missing_ok=True)


def test_repo_eval_example_configs_are_valid():
"""Bundled example configs should parse with the current eval config schema."""
config_paths = sorted(Path("configs/eval").glob("*.toml"))
Expand Down Expand Up @@ -569,6 +608,23 @@ def test_cli_multi_env_via_toml_config(monkeypatch, run_cli):
assert configs[1].env_id == "env2"


def test_cli_human_debug_rejects_multi_eval_config(monkeypatch, run_cli):
    """Top-level `human_debug` combined with multiple [[eval]] tables must fail.

    The config file is closed before the CLI reads it (required on Windows)
    and unlinked afterwards so test runs don't leak temp files.
    """
    with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
        f.write(
            'human_debug = true\n[[eval]]\nenv_id = "env1"\n[[eval]]\nenv_id = "env2"\n'
        )
        config_path = Path(f.name)
    try:
        with pytest.raises(
            ValueError, match="human_debug mode only supports a single evaluation"
        ):
            run_cli(
                monkeypatch,
                {
                    "env_id_or_config": str(config_path),
                },
            )
    finally:
        config_path.unlink(missing_ok=True)


def test_cli_toml_ignores_cli_args(monkeypatch, run_cli):
"""TOML config ignores CLI args, uses defaults for unspecified values."""
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
Expand Down
95 changes: 95 additions & 0 deletions tests/test_human_cli_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import builtins

import pytest

from verifiers.clients.human_cli_client import HumanCLIClient
from verifiers.errors import ModelError
from verifiers.types import SystemMessage, Tool, UserMessage


@pytest.mark.asyncio
async def test_human_cli_client_returns_multiline_response(monkeypatch):
    """Lines typed before the sentinel become one newline-joined response."""
    typed = ["first line", "second line", ":wq"]
    monkeypatch.setattr(builtins, "input", lambda: typed.pop(0))

    result = await HumanCLIClient().get_response(
        prompt=[UserMessage(content="hello")],
        model="test-model",
        sampling_args={},
    )

    assert result.message.content == "first line\nsecond line"
    assert result.message.finish_reason == "stop"
    assert result.message.tool_calls is None
    assert result.model == "test-model"


@pytest.mark.asyncio
async def test_human_cli_client_reprompts_on_empty_response(monkeypatch):
    """An empty first entry is rejected and the human is prompted again."""
    typed = ["", ":wq", "final answer", ":wq"]
    monkeypatch.setattr(builtins, "input", lambda: typed.pop(0))

    result = await HumanCLIClient().get_response(
        prompt=[UserMessage(content="hello")],
        model="test-model",
        sampling_args={},
    )

    assert result.message.content == "final answer"


@pytest.mark.asyncio
async def test_human_cli_client_rejects_tool_calls(monkeypatch):
    """Passing tools raises ModelError (the client is text-only)."""
    typed = ["answer", ":wq"]
    monkeypatch.setattr(builtins, "input", lambda: typed.pop(0))

    tool = Tool(
        name="my_tool",
        description="test tool",
        parameters={"type": "object", "properties": {}},
    )
    with pytest.raises(ModelError, match="text-only"):
        await HumanCLIClient().get_response(
            prompt=[UserMessage(content="hello")],
            model="test-model",
            sampling_args={},
            tools=[tool],
        )


@pytest.mark.asyncio
async def test_human_cli_client_propagates_keyboard_interrupt(monkeypatch):
    """Ctrl-C during input must not be swallowed by the client."""

    def fake_input():
        raise KeyboardInterrupt

    monkeypatch.setattr(builtins, "input", fake_input)

    with pytest.raises(KeyboardInterrupt):
        await HumanCLIClient().get_response(
            prompt=[UserMessage(content="hello")],
            model="test-model",
            sampling_args={},
        )


@pytest.mark.asyncio
async def test_human_cli_client_renders_prompt_without_crashing(monkeypatch):
    """A system + user prompt renders to the console and input still works."""
    typed = ["done", ":wq"]
    monkeypatch.setattr(builtins, "input", lambda: typed.pop(0))

    prompt = [
        SystemMessage(content="You are helpful"),
        UserMessage(content="Solve this"),
    ]
    result = await HumanCLIClient().get_response(
        prompt=prompt,
        model="test-model",
        sampling_args={},
    )

    assert result.message.content == "done"
2 changes: 2 additions & 0 deletions verifiers/clients/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient
from verifiers.clients.client import Client
from verifiers.clients.human_cli_client import HumanCLIClient
from verifiers.clients.openai_chat_completions_client import OpenAIChatCompletionsClient
from verifiers.clients.openai_chat_completions_token_client import (
OpenAIChatCompletionsTokenClient,
Expand Down Expand Up @@ -35,5 +36,6 @@ def resolve_client(client_or_config: Client | ClientConfig) -> Client:
"OpenAICompletionsClient",
"OpenAIChatCompletionsClient",
"OpenAIChatCompletionsTokenClient",
"HumanCLIClient",
"Client",
]
111 changes: 111 additions & 0 deletions verifiers/clients/human_cli_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import time
import uuid

from rich.console import Console

from verifiers.clients.client import Client
from verifiers.errors import EmptyModelResponseError, ModelError
from verifiers.types import (
ClientConfig,
Messages,
Response,
ResponseMessage,
SamplingArgs,
Tool,
)
from verifiers.utils.logging_utils import format_messages


class HumanCLIClient(Client[None, Messages, Response, Tool]):
    """Client that captures assistant responses from a human in the terminal.

    Used by the eval `human_debug` mode: instead of calling a model API, the
    rendered prompt is printed to the console and the operator types the
    assistant turn, terminated by ``sentinel`` on its own line. Text-only —
    tool calls are rejected with a ``ModelError``.
    """

    def __init__(self, sentinel: str = ":wq") -> None:
        # Line that terminates (and is excluded from) a human-entered response.
        self.sentinel = sentinel
        self._console = Console()
        # No underlying API client; the base class receives a None config.
        super().__init__(None)

    def setup_client(self, config: ClientConfig) -> None:
        # Nothing to set up — there is no network client behind this class.
        return None

    async def to_native_tool(self, tool: Tool) -> Tool:
        # Identity conversion: tools are rejected in get_response() anyway.
        return tool

    async def to_native_prompt(self, messages: Messages) -> tuple[Messages, dict]:
        # Identity conversion: prompts are rendered as-is to the console.
        return messages, {}

    async def get_native_response(
        self,
        prompt: Messages,
        model: str,
        sampling_args: SamplingArgs,
        tools: list[Tool] | None = None,
        **kwargs,
    ) -> Response:
        # get_response() is overridden below, so the native path is unused.
        raise NotImplementedError(
            "HumanCLIClient.get_native_response is not used. Call get_response()."
        )

    async def raise_from_native_response(self, response: Response) -> None:
        return None

    async def from_native_response(self, response: Response) -> Response:
        return response

    async def close(self) -> None:
        # No resources to release.
        return None

    def _read_human_response(self, prompt: Messages) -> str:
        """Render *prompt* and block until a non-empty response is entered.

        Reads stdin line-by-line until the sentinel; whitespace-only entries
        are rejected and the human is re-prompted. Raises
        EmptyModelResponseError on EOF; KeyboardInterrupt propagates.
        """
        self._console.rule("Human Debug")
        self._console.print(format_messages(prompt))
        self._console.print(
            f"\nEnter assistant response. End input with `{self.sentinel}` on its own line."
        )

        while True:
            lines: list[str] = []
            while True:
                try:
                    line = input()
                except EOFError as e:
                    raise EmptyModelResponseError(
                        "Reached EOF while waiting for human input"
                    ) from e

                if line.strip() == self.sentinel:
                    break
                lines.append(line)

            # Join once; accept only if the entry has non-whitespace content,
            # but return the text verbatim (blank lines preserved).
            response_text = "\n".join(lines)
            if response_text.strip():
                return response_text

            self._console.print("Empty response. Please enter a non-empty response.")

    async def get_response(
        self,
        prompt: Messages,
        model: str,
        sampling_args: SamplingArgs,
        tools: list[Tool] | None = None,
        **kwargs,
    ) -> Response:
        """Return a Response whose content is typed by a human operator.

        Raises ModelError if tools are supplied — human debug is text-only.
        sampling_args and kwargs are accepted for interface compatibility but
        ignored.
        """
        if tools:
            raise ModelError(
                "Human debug mode is text-only and does not support tool calls."
            )

        content = self._read_human_response(prompt)
        return Response(
            id=f"human-{uuid.uuid4().hex}",
            created=int(time.time()),
            model=model,
            usage=None,  # no token accounting for human input
            message=ResponseMessage(
                content=content,
                reasoning_content=None,
                finish_reason="stop",
                is_truncated=False,
                tokens=None,
                tool_calls=None,
            ),
        )
Loading
Loading