Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions configs/eval/human-debug.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
save_results = true

[[eval]]
env_id = "primeintellect/wordle"
num_examples = 1
rollouts_per_example = 1
human_debug = true
17 changes: 17 additions & 0 deletions docs/evaluation.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ The `--max-retries` flag enables automatic retry with exponential backoff when r
| `--verbose` | `-v` | false | Enable debug logging |
| `--tui` | `-u` | false | Use alternate screen mode (TUI) for display |
| `--debug` | `-d` | false | Disable Rich display; use normal logging and tqdm progress |
| `--human-debug` | — | false | Use interactive human input for model responses (text-only) |
| `--save-results` | `-s` | false | Save results to disk |
| `--resume [PATH]` | `-R` | — | Resume from a previous run (auto-detect latest matching incomplete run if PATH omitted) |
| `--state-columns` | `-C` | — | Extra state columns to save (comma-separated) |
Expand All @@ -166,6 +167,21 @@ Results are saved to `./outputs/evals/{env_id}--{model}/{run_id}/`, containing:
- `results.jsonl` — rollout outputs, one per line
- `metadata.json` — evaluation configuration and aggregate metrics

### Human Debug Mode

Use `--human-debug` to replace API model calls with terminal-entered responses:

```bash
prime eval run my-env --human-debug -n 3 -r 1 -s
```

In this mode:
- Responses are entered interactively in the CLI and ended with `:wq` on its own line
- Only text responses are supported (tool calls are not supported)
- Exactly one eval config is supported per run
- Execution is forced to sequential + independent scoring (`max_concurrent=1`, `independent_scoring=true`)
- TUI display is disabled automatically to avoid stdin/display conflicts

### Resuming Evaluations

Long-running evaluations can be interrupted and resumed using checkpointing. When `--save-results` is enabled, results are saved incrementally after each completed group of rollouts. Use `--resume` to continue from where you left off. Pass a path to resume a specific run, or omit the path to auto-detect the latest incomplete matching run.
Expand Down Expand Up @@ -290,6 +306,7 @@ Each `[[eval]]` section must contain an `env_id` field. All other fields are opt
| `extra_env_kwargs` | table | Arguments passed to environment constructor |
| `model` | string | Model to evaluate |
| `endpoint_id` | string | Endpoint registry id (requires TOML `endpoints_path`) |
| `human_debug` | boolean | Use interactive human-entered model responses (single eval only) |

Example with `env_args`:

Expand Down
56 changes: 56 additions & 0 deletions tests/test_eval_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def _run_cli(
"max_retries": 0,
"tui": False,
"debug": False,
"human_debug": False,
"heartbeat_url": None,
}
base_args.update(overrides)
Expand Down Expand Up @@ -116,6 +117,35 @@ def test_cli_single_env_id(monkeypatch, run_cli):
assert configs[0].env_id == "env1"


def test_cli_human_debug_sets_flag(monkeypatch, run_cli):
    """--human-debug on the CLI is carried through to the eval config."""
    overrides = {"env_id_or_config": "env1", "human_debug": True}
    captured = run_cli(monkeypatch, overrides)

    assert captured["configs"][0].human_debug is True


def test_cli_human_debug_forces_safe_runtime(monkeypatch, run_cli):
    """human_debug overrides concurrency and scoring to the safe sequential setup."""
    overrides = {
        "env_id_or_config": "env1",
        "human_debug": True,
        "max_concurrent": 8,
        "independent_scoring": False,
    }
    captured = run_cli(monkeypatch, overrides)

    cfg = captured["configs"][0]
    assert cfg.independent_scoring is True
    assert cfg.max_concurrent == 1


def test_cli_sampling_args_precedence_over_flags(monkeypatch, run_cli):
"""sampling_args JSON takes precedence over individual flags."""
captured = run_cli(
Expand Down Expand Up @@ -469,6 +499,15 @@ def test_load_toml_config_single_eval():
assert result[0]["env_id"] == "env1"


def test_load_toml_config_accepts_human_debug():
    """`human_debug` inside an [[eval]] table survives TOML parsing.

    Writes the config, closes the file before reading it back (required on
    Windows, where an open NamedTemporaryFile cannot be reopened), and removes
    the temp file afterwards so test runs don't leak files.
    """
    with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
        f.write('[[eval]]\nenv_id = "env1"\nhuman_debug = true\n')
        config_path = Path(f.name)
    try:
        result = load_toml_config(config_path)
        assert len(result) == 1
        assert result[0]["human_debug"] is True
    finally:
        config_path.unlink(missing_ok=True)


def test_repo_eval_example_configs_are_valid():
"""Bundled example configs should parse with the current eval config schema."""
config_paths = sorted(Path("configs/eval").glob("*.toml"))
Expand Down Expand Up @@ -569,6 +608,23 @@ def test_cli_multi_env_via_toml_config(monkeypatch, run_cli):
assert configs[1].env_id == "env2"


def test_cli_human_debug_rejects_multi_eval_config(monkeypatch, run_cli):
    """Top-level `human_debug` combined with multiple [[eval]] tables must fail.

    The config file is closed before the CLI reads it (required on Windows)
    and unlinked afterwards so test runs don't leak temp files.
    """
    with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
        f.write(
            'human_debug = true\n[[eval]]\nenv_id = "env1"\n[[eval]]\nenv_id = "env2"\n'
        )
        config_path = Path(f.name)
    try:
        with pytest.raises(
            ValueError, match="human_debug mode only supports a single evaluation"
        ):
            run_cli(
                monkeypatch,
                {
                    "env_id_or_config": str(config_path),
                },
            )
    finally:
        config_path.unlink(missing_ok=True)


def test_cli_toml_ignores_cli_args(monkeypatch, run_cli):
"""TOML config ignores CLI args, uses defaults for unspecified values."""
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
Expand Down
95 changes: 95 additions & 0 deletions tests/test_human_cli_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import builtins

import pytest

from verifiers.clients.human_cli_client import HumanCLIClient
from verifiers.errors import ModelError
from verifiers.types import SystemMessage, Tool, UserMessage


@pytest.mark.asyncio
async def test_human_cli_client_returns_multiline_response(monkeypatch):
    """Lines typed before the sentinel become one newline-joined response."""
    typed = ["first line", "second line", ":wq"]
    monkeypatch.setattr(builtins, "input", lambda: typed.pop(0))

    result = await HumanCLIClient().get_response(
        prompt=[UserMessage(content="hello")],
        model="test-model",
        sampling_args={},
    )

    assert result.message.content == "first line\nsecond line"
    assert result.message.finish_reason == "stop"
    assert result.message.tool_calls is None
    assert result.model == "test-model"


@pytest.mark.asyncio
async def test_human_cli_client_reprompts_on_empty_response(monkeypatch):
    """An empty first entry is rejected and the human is prompted again."""
    typed = ["", ":wq", "final answer", ":wq"]
    monkeypatch.setattr(builtins, "input", lambda: typed.pop(0))

    result = await HumanCLIClient().get_response(
        prompt=[UserMessage(content="hello")],
        model="test-model",
        sampling_args={},
    )

    assert result.message.content == "final answer"


@pytest.mark.asyncio
async def test_human_cli_client_rejects_tool_calls(monkeypatch):
    """Passing tools raises ModelError (the client is text-only)."""
    typed = ["answer", ":wq"]
    monkeypatch.setattr(builtins, "input", lambda: typed.pop(0))

    tool = Tool(
        name="my_tool",
        description="test tool",
        parameters={"type": "object", "properties": {}},
    )
    with pytest.raises(ModelError, match="text-only"):
        await HumanCLIClient().get_response(
            prompt=[UserMessage(content="hello")],
            model="test-model",
            sampling_args={},
            tools=[tool],
        )


@pytest.mark.asyncio
async def test_human_cli_client_propagates_keyboard_interrupt(monkeypatch):
    """Ctrl-C during input must not be swallowed by the client."""

    def fake_input():
        raise KeyboardInterrupt

    monkeypatch.setattr(builtins, "input", fake_input)

    with pytest.raises(KeyboardInterrupt):
        await HumanCLIClient().get_response(
            prompt=[UserMessage(content="hello")],
            model="test-model",
            sampling_args={},
        )


@pytest.mark.asyncio
async def test_human_cli_client_renders_prompt_without_crashing(monkeypatch):
    """A system + user prompt renders to the console and input still works."""
    typed = ["done", ":wq"]
    monkeypatch.setattr(builtins, "input", lambda: typed.pop(0))

    prompt = [
        SystemMessage(content="You are helpful"),
        UserMessage(content="Solve this"),
    ]
    result = await HumanCLIClient().get_response(
        prompt=prompt,
        model="test-model",
        sampling_args={},
    )

    assert result.message.content == "done"
2 changes: 2 additions & 0 deletions verifiers/clients/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient
from verifiers.clients.client import Client
from verifiers.clients.human_cli_client import HumanCLIClient
from verifiers.clients.openai_chat_completions_client import OpenAIChatCompletionsClient
from verifiers.clients.openai_chat_completions_token_client import (
OpenAIChatCompletionsTokenClient,
Expand Down Expand Up @@ -35,5 +36,6 @@ def resolve_client(client_or_config: Client | ClientConfig) -> Client:
"OpenAICompletionsClient",
"OpenAIChatCompletionsClient",
"OpenAIChatCompletionsTokenClient",
"HumanCLIClient",
"Client",
]
111 changes: 111 additions & 0 deletions verifiers/clients/human_cli_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import time
import uuid

from rich.console import Console

from verifiers.clients.client import Client
from verifiers.errors import EmptyModelResponseError, ModelError
from verifiers.types import (
ClientConfig,
Messages,
Response,
ResponseMessage,
SamplingArgs,
Tool,
)
from verifiers.utils.logging_utils import format_messages


class HumanCLIClient(Client[None, Messages, Response, Tool]):
    """Client that captures assistant responses from a human in the terminal.

    Used by the eval `human_debug` mode: instead of calling a model API, the
    rendered prompt is printed to the console and the operator types the
    assistant turn, terminated by ``sentinel`` on its own line. Text-only —
    tool calls are rejected with a ``ModelError``.
    """

    def __init__(self, sentinel: str = ":wq") -> None:
        # Line that terminates (and is excluded from) a human-entered response.
        self.sentinel = sentinel
        self._console = Console()
        # No underlying API client; the base class receives a None config.
        super().__init__(None)

    def setup_client(self, config: ClientConfig) -> None:
        # Nothing to set up — there is no network client behind this class.
        return None

    async def to_native_tool(self, tool: Tool) -> Tool:
        # Identity conversion: tools are rejected in get_response() anyway.
        return tool

    async def to_native_prompt(self, messages: Messages) -> tuple[Messages, dict]:
        # Identity conversion: prompts are rendered as-is to the console.
        return messages, {}

    async def get_native_response(
        self,
        prompt: Messages,
        model: str,
        sampling_args: SamplingArgs,
        tools: list[Tool] | None = None,
        **kwargs,
    ) -> Response:
        # get_response() is overridden below, so the native path is unused.
        raise NotImplementedError(
            "HumanCLIClient.get_native_response is not used. Call get_response()."
        )

    async def raise_from_native_response(self, response: Response) -> None:
        return None

    async def from_native_response(self, response: Response) -> Response:
        return response

    async def close(self) -> None:
        # No resources to release.
        return None

    def _read_human_response(self, prompt: Messages) -> str:
        """Render *prompt* and block until a non-empty response is entered.

        Reads stdin line-by-line until the sentinel; whitespace-only entries
        are rejected and the human is re-prompted. Raises
        EmptyModelResponseError on EOF; KeyboardInterrupt propagates.
        """
        self._console.rule("Human Debug")
        self._console.print(format_messages(prompt))
        self._console.print(
            f"\nEnter assistant response. End input with `{self.sentinel}` on its own line."
        )

        while True:
            lines: list[str] = []
            while True:
                try:
                    line = input()
                except EOFError as e:
                    raise EmptyModelResponseError(
                        "Reached EOF while waiting for human input"
                    ) from e

                if line.strip() == self.sentinel:
                    break
                lines.append(line)

            # Join once; accept only if the entry has non-whitespace content,
            # but return the text verbatim (blank lines preserved).
            response_text = "\n".join(lines)
            if response_text.strip():
                return response_text

            self._console.print("Empty response. Please enter a non-empty response.")

    async def get_response(
        self,
        prompt: Messages,
        model: str,
        sampling_args: SamplingArgs,
        tools: list[Tool] | None = None,
        **kwargs,
    ) -> Response:
        """Return a Response whose content is typed by a human operator.

        Raises ModelError if tools are supplied — human debug is text-only.
        sampling_args and kwargs are accepted for interface compatibility but
        ignored.
        """
        if tools:
            raise ModelError(
                "Human debug mode is text-only and does not support tool calls."
            )

        content = self._read_human_response(prompt)
        return Response(
            id=f"human-{uuid.uuid4().hex}",
            created=int(time.time()),
            model=model,
            usage=None,  # no token accounting for human input
            message=ResponseMessage(
                content=content,
                reasoning_content=None,
                finish_reason="stop",
                is_truncated=False,
                tokens=None,
                tool_calls=None,
            ),
        )
Loading
Loading