Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
6ba7d8e
add MultiAgentEnv for turn-based multi-agent environments
nph4rd Feb 7, 2026
adb72a3
rename Actor to Agent, add Protocol abstraction
nph4rd Feb 8, 2026
bfc556e
require Protocol in MultiAgentEnv, simplify docstrings
nph4rd Feb 8, 2026
9d09ab9
update docstrings
nph4rd Feb 8, 2026
ba862ea
add multi-agent reward functions for heterogeneous rewards
nph4rd Feb 25, 2026
ef4a911
compute per-agent advantages for multi-agent rewards
nph4rd Feb 25, 2026
aad512b
include all rewards in per-agent rewards for multi-agent training
nph4rd Feb 25, 2026
6bd9130
add opponent-conditioned baselines for multi-agent advantage estimation
nph4rd Feb 26, 2026
c3dfd97
add debug logging for opponent-conditioned baselines
nph4rd Feb 26, 2026
02607e1
add trajectory structure debug
nph4rd Feb 26, 2026
f80b271
debug extras and state keys
nph4rd Feb 26, 2026
d6fd5ce
remove opponent-conditioned baselines for comparison test
nph4rd Feb 27, 2026
2980b02
add per-agent baselines for multi-agent advantage computation
nph4rd Mar 4, 2026
1db9aa0
debug: log multiagent func detection
nph4rd Mar 6, 2026
336a61f
fix score_rollout to support multi-agent reward functions
nph4rd Mar 6, 2026
9b33792
add diagnostic logging to score_rollout multiagent path
nph4rd Mar 6, 2026
9a4597b
add game state logging to score_rollout diagnostics
nph4rd Mar 6, 2026
6fa6fe0
add trajectory length and completion flags to diagnostics
nph4rd Mar 6, 2026
f615667
log error details in multiagent rollout and score_rollout diagnostics
nph4rd Mar 6, 2026
cb09b1d
normalize messages from build_agent_prompt before storing in trajectory
nph4rd Mar 6, 2026
e6c8df8
remove diagnostic logging
nph4rd Mar 6, 2026
3a3e71c
add per-agent reward metrics for multi-agent environments
nph4rd Mar 7, 2026
48583bf
add per-agent model routing for multi-policy lora training
nph4rd Mar 8, 2026
615187b
point textarena to fork with kuhn poker fixes
nph4rd Mar 9, 2026
5a0cd7d
fix get_free_port_pair macos compatibility with zmq bind
nph4rd Mar 30, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,9 @@ rl = [
[tool.uv]
preview = true

[tool.uv.sources]
textarena = { git = "https://github.com/nph4rd/TextArena.git", branch = "fix/kuhn-poker-phantom-ante" }

[tool.uv.extra-build-dependencies]
flash-attn = [{ requirement = "torch", match-runtime = true }]

Expand Down
11 changes: 11 additions & 0 deletions verifiers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,12 @@
"MCPEnv",
"BrowserEnv",
"OpenEnvEnv",
"Agent",
"Protocol",
"RoundRobinProtocol",
"Environment",
"MultiTurnEnv",
"MultiAgentEnv",
"SingleTurnEnv",
"PythonEnv",
"SandboxEnv",
Expand Down Expand Up @@ -110,6 +114,10 @@
"SingleTurnEnv": "verifiers.envs.singleturn_env:SingleTurnEnv",
"StatefulToolEnv": "verifiers.envs.stateful_tool_env:StatefulToolEnv",
"ToolEnv": "verifiers.envs.tool_env:ToolEnv",
"Agent": "verifiers.envs.agent:Agent",
"Protocol": "verifiers.envs.protocol:Protocol",
"RoundRobinProtocol": "verifiers.envs.protocol:RoundRobinProtocol",
"MultiAgentEnv": "verifiers.envs.multiagent_env:MultiAgentEnv",
"EnvGroup": "verifiers.envs.env_group:EnvGroup",
"JudgeRubric": "verifiers.rubrics.judge_rubric:JudgeRubric",
"load_environment": "verifiers.utils.env_utils:load_environment",
Expand Down Expand Up @@ -173,8 +181,11 @@ def __getattr__(name: str):
OpenAIChatCompletionsTokenClient,
)
from .clients.openai_completions_client import OpenAICompletionsClient # noqa: F401
from .envs.agent import Agent # noqa: F401
from .envs.protocol import Protocol, RoundRobinProtocol # noqa: F401
from .envs.env_group import EnvGroup # noqa: F401
from .envs.environment import Environment # noqa: F401
from .envs.multiagent_env import MultiAgentEnv # noqa: F401
from .envs.experimental.cli_agent_env import CliAgentEnv # noqa: F401
from .envs.experimental.gym_env import GymEnv # noqa: F401
from .envs.experimental.harbor_env import HarborEnv # noqa: F401
Expand Down
35 changes: 35 additions & 0 deletions verifiers/envs/agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""
Agent: A participant in multi-agent environments.

Contains agent metadata (id, system prompt, trainability).
"""

from dataclasses import dataclass


@dataclass
class Agent:
    """
    An agent in a multi-agent environment.

    Identity is defined by ``id`` alone: two agents with the same ``id``
    compare equal and hash identically, regardless of their other fields.

    Fields:
        id: Unique identifier for this agent (e.g., "player_0", "guesser")
        system_prompt: The agent's specific instructions
        is_trainable: Whether to compute gradients for this agent's actions
    """

    id: str
    system_prompt: str = ""
    is_trainable: bool = True

    # NOTE: __eq__, __hash__ and __repr__ are written out explicitly below;
    # @dataclass does not replace methods already defined in the class body.

    def __hash__(self) -> int:
        """Hash on ``id`` only, consistent with ``__eq__``.

        The dataclass is not frozen, so ``id`` should not be reassigned
        while the agent is stored in a set or used as a dict key.
        """
        return hash(self.id)

    def __eq__(self, other: object) -> bool:
        """Return True when ``other`` is an Agent with the same ``id``.

        For non-Agent operands, return ``NotImplemented`` so Python can try
        the other operand's reflected comparison; if that is also
        unsupported, ``==`` still evaluates to False.
        """
        if not isinstance(other, Agent):
            return NotImplemented
        return self.id == other.id

    def __repr__(self) -> str:
        """Return a short representation: the id plus trainable/frozen."""
        trainable_str = "trainable" if self.is_trainable else "frozen"
        return f"Agent(id={self.id!r}, {trainable_str})"
12 changes: 11 additions & 1 deletion verifiers/envs/env_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,11 +278,19 @@ async def run_rollout( # type: ignore[override]
max_retries: int = 0,
state_columns: list[str] | None = None,
env_client: EnvClient | None = None,
actor_models: dict[str, str] | None = None,
) -> vf.RolloutOutput:
env = self.get_env_for_task(input["task"])
env_client = env_client or env.env_client or self.env_client
return await env.run_rollout(
input, client, model, sampling_args, max_retries, state_columns, env_client
input,
client,
model,
sampling_args,
max_retries,
state_columns,
env_client,
actor_models=actor_models,
)

@final
Expand All @@ -295,6 +303,7 @@ async def run_group( # type: ignore[override]
max_retries: int = 0,
state_columns: list[str] | None = None,
env_client: EnvClient | None = None,
actor_models: dict[str, str] | None = None,
) -> list[vf.RolloutOutput]:
env = self.get_env_for_task(group_inputs[0]["task"])
env_client = env_client or env.env_client or self.env_client
Expand All @@ -306,6 +315,7 @@ async def run_group( # type: ignore[override]
max_retries,
state_columns,
env_client,
actor_models=actor_models,
)

@final
Expand Down
10 changes: 9 additions & 1 deletion verifiers/envs/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from verifiers.utils.eval_utils import filter_inputs
from verifiers.utils.path_utils import is_valid_eval_results_path
from verifiers.utils.thread_utils import scale_executors
from verifiers.utils.worker_utils import get_free_port_pair
from verifiers.utils.worker_utils import get_free_port_pair, release_reserved_ports
from verifiers.workers.client.zmq_env_client import ZMQEnvClient
from verifiers.workers.server.zmq_env_server import ZMQEnvServer

Expand Down Expand Up @@ -687,6 +687,7 @@ async def run_rollout(
max_retries: int = 0,
state_columns: list[str] | None = None,
env_client: EnvClient | None = None,
actor_models: dict[str, str] | None = None,
) -> RolloutOutput:
"""Generate and, optionally, score a rollout."""

Expand All @@ -707,6 +708,7 @@ async def run_rollout(
sampling_args,
max_retries,
state_columns,
actor_models=actor_models,
)

resolved_client = resolve_client(client)
Expand Down Expand Up @@ -742,6 +744,7 @@ async def run_group(
max_retries: int = 0,
state_columns: list[str] | None = None,
env_client: EnvClient | None = None,
actor_models: dict[str, str] | None = None,
**kwargs,
) -> list[RolloutOutput]:
"""Generate and, optionally, score one group."""
Expand All @@ -763,6 +766,7 @@ async def run_group(
sampling_args,
max_retries,
state_columns,
actor_models=actor_models,
)

resolved_client = resolve_client(client)
Expand Down Expand Up @@ -1282,6 +1286,10 @@ async def start_server(
"""
address = address or f"tcp://127.0.0.1:{get_free_port_pair()}"
extra_env_kwargs = extra_env_kwargs or {}
# Release the port-reservation sockets so the subprocess can bind.
# With "spawn" context the subprocess doesn't inherit FDs, and on
# macOS ZMQ cannot bind over a held SO_REUSEADDR socket.
release_reserved_ports()
# Use spawn to avoid inheriting file descriptors (e.g. sockets) from
# the parent process, which has caused hangs when multiple env server
# subprocesses share the same fds.
Expand Down
Loading