Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion assets/lab/environments/AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -598,7 +598,7 @@ Verifiers defines a hierarchy of error types under `vf.Error`:
- `vf.ModelError` — errors from model interactions (e.g., `vf.EmptyModelResponseError`)
- `vf.OverlongPromptError` — prompt exceeds model context length
- `vf.ToolError` — tool-related errors (`vf.ToolParseError`, `vf.ToolCallError`)
- `vf.InfraError` — infrastructure errors (e.g., `vf.SandboxError`)
- `vf.InfraError` — infrastructure errors (e.g., `vf.SandboxError`, `vf.TunnelError`)

When a `vf.Error` is raised during a rollout, it is automatically caught and stored in `state["error"]`, triggering the built-in `has_error` stop condition at the next check. This allows rollouts to terminate gracefully rather than crashing.

Expand Down
2 changes: 1 addition & 1 deletion environments/AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -598,7 +598,7 @@ Verifiers defines a hierarchy of error types under `vf.Error`:
- `vf.ModelError` — errors from model interactions (e.g., `vf.EmptyModelResponseError`)
- `vf.OverlongPromptError` — prompt exceeds model context length
- `vf.ToolError` — tool-related errors (`vf.ToolParseError`, `vf.ToolCallError`)
- `vf.InfraError` — infrastructure errors (e.g., `vf.SandboxError`)
- `vf.InfraError` — infrastructure errors (e.g., `vf.SandboxError`, `vf.TunnelError`)

When a `vf.Error` is raised during a rollout, it is automatically caught and stored in `state["error"]`, triggering the built-in `has_error` stop condition at the next check. This allows rollouts to terminate gracefully rather than crashing.

Expand Down
44 changes: 44 additions & 0 deletions tests/test_eval_cli.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import argparse
import importlib
import os
import sys
import tempfile
import time
from pathlib import Path
Expand Down Expand Up @@ -117,6 +119,48 @@ def test_cli_single_env_id(monkeypatch, run_cli):
assert configs[0].env_id == "env1"


def test_get_env_eval_defaults_for_package_module(tmp_path: Path, monkeypatch):
    """Eval defaults are read from pyproject.toml inside an installed package dir."""
    # Unique module name per run so sys.modules / importer caches cannot collide.
    module_name = f"pkg_env_{time.time_ns()}"
    env_id = module_name.replace("_", "-")

    pkg_root = tmp_path / module_name
    pkg_root.mkdir()
    (pkg_root / "__init__.py").write_text("", encoding="utf-8")
    toml_body = "[tool.verifiers.eval]\nnum_examples = 20\nrollouts_per_example = 6\n"
    (pkg_root / "pyproject.toml").write_text(toml_body, encoding="utf-8")

    monkeypatch.syspath_prepend(str(tmp_path))
    importlib.invalidate_caches()
    try:
        result = vf_eval.get_env_eval_defaults(env_id)
    finally:
        # Drop the synthetic package so later tests never observe it.
        sys.modules.pop(module_name, None)

    assert result == {"num_examples": 20, "rollouts_per_example": 6}


def test_get_env_eval_defaults_for_single_file_module(tmp_path: Path, monkeypatch):
    """Eval defaults resolve for a single-file module via its parent directory."""
    # Unique module name per run so sys.modules / importer caches cannot collide.
    module_name = f"single_file_env_{time.time_ns()}"
    env_id = module_name.replace("_", "-")

    module_src = "def load_environment():\n return None\n"
    (tmp_path / f"{module_name}.py").write_text(module_src, encoding="utf-8")
    toml_body = "[tool.verifiers.eval]\nnum_examples = 20\nrollouts_per_example = 6\n"
    (tmp_path / "pyproject.toml").write_text(toml_body, encoding="utf-8")

    monkeypatch.syspath_prepend(str(tmp_path))
    importlib.invalidate_caches()
    try:
        result = vf_eval.get_env_eval_defaults(env_id)
    finally:
        # Drop the synthetic module so later tests never observe it.
        sys.modules.pop(module_name, None)

    assert result == {"num_examples": 20, "rollouts_per_example": 6}


def test_cli_sampling_args_precedence_over_flags(monkeypatch, run_cli):
"""sampling_args JSON takes precedence over individual flags."""
captured = run_cli(
Expand Down
2 changes: 1 addition & 1 deletion verifiers/rubrics/experimental/hybrid_math_rubric.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from math_verify import parse, verify # type: ignore[unresolved-import]
from math_verify import parse, verify
from openai import AsyncOpenAI

import verifiers as vf
Expand Down
26 changes: 19 additions & 7 deletions verifiers/scripts/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import argparse
import asyncio
import importlib.resources
import importlib.util
import json
import logging
from pathlib import Path
Expand Down Expand Up @@ -89,7 +89,7 @@


def get_env_eval_defaults(env_id: str) -> dict[str, Any]:
"""Get eval config defaults from environment package's pyproject.toml.
"""Get eval config defaults from the environment module's pyproject.toml.

Returns dict with 'num_examples' and 'rollouts_per_example' keys if found,
otherwise returns empty dict. All errors are silently handled.
Expand All @@ -98,12 +98,24 @@ def get_env_eval_defaults(env_id: str) -> dict[str, Any]:
module_name = env_id.replace("-", "_").split("/")[-1]

try:
# read pyproject.toml from installed package
package_ref = importlib.resources.files(module_name)
pyproject_file = package_ref / "pyproject.toml"
spec = importlib.util.find_spec(module_name)
if spec is None:
raise ModuleNotFoundError(module_name)

if spec.submodule_search_locations:
base_dir = Path(next(iter(spec.submodule_search_locations)))
elif spec.origin:
base_dir = Path(spec.origin).parent
else:
logger.debug(
f"Could not determine module path for {module_name}; skipping eval defaults"
)
return defaults

pyproject_file = base_dir / "pyproject.toml"

if not pyproject_file.is_file():
logger.debug(f"pyproject.toml not found in installed package {module_name}")
logger.debug(f"pyproject.toml not found for installed module {module_name}")
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Skill docs missing for workflow change

Low Severity

This PR changes evaluation workflow behavior in verifiers/scripts/eval.py by altering how get_env_eval_defaults resolves pyproject.toml for environments, but no corresponding skills/*/SKILL.md updates are included. That violates the Skills Updates rule for user-facing workflow changes in verifiers/scripts/*.py.

Fix in Cursor Fix in Web

Triggered by project rule: BugBot Instructions

return defaults

with pyproject_file.open("rb") as f:
Expand All @@ -124,7 +136,7 @@ def get_env_eval_defaults(env_id: str) -> dict[str, Any]:
f"Loaded eval defaults from {module_name} pyproject.toml: {defaults}"
)
except ModuleNotFoundError:
logger.debug(f"Package {module_name} not installed")
logger.debug(f"Module {module_name} not installed")
except Exception as e:
logger.debug(
f"Could not load eval defaults from {module_name} pyproject.toml: {e}"
Expand Down
Loading