3 changes: 1 addition & 2 deletions docs/source/api_reference.md
@@ -60,7 +60,7 @@ class ProcessedOutputs(BaseModel):
    rewards: list[float]
```

-`GenerateOutputs.metadata` captures run-level context (environment + arguments, model + sampling configuration, summary statistics, and the resolved save path) so downstream tooling can reproduce or resume evaluations without guessing defaults.
+`GenerateOutputs.metadata` captures run-level context (environment + arguments, environment + library versions, model + sampling configuration, summary statistics, and the resolved save path) so downstream tooling can reproduce or resume evaluations without guessing defaults.

### State Dictionary

@@ -287,4 +287,3 @@ def reset_for_rollout(self, prompt: Messages, answer: str, info: Info | None) ->
    }
    return state
```
-
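For context on what this metadata makes possible, here is a minimal sketch of downstream tooling reading the recorded versions back out of a saved run. The results-file path and layout are assumptions; the field names match `GenerateMetadata` as extended in this PR.

```python
import json
from pathlib import Path

# Hypothetical save location; a real run records its resolved save path
# in the metadata itself.
payload = json.loads(Path("outputs/evals/results.json").read_text())

meta = payload["metadata"]
# env_version and verifiers_version are new in this PR and may be None
# when no version can be resolved.
print(f"env: {meta['env_id']} @ {meta.get('env_version') or 'unknown'}")
print(f"verifiers: {meta.get('verifiers_version') or 'unknown'}")
print(f"model: {meta['model']} via {meta['base_url']}")
```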
2 changes: 1 addition & 1 deletion docs/source/training.md
@@ -228,4 +228,4 @@ However, the best way to improve training is ensuring appropriate task difficulty

- Explore [Environments](environments.md) to create custom tasks
- Review [Components](components.md) for advanced patterns
-- See the [examples directory](https://github.com/PrimeIntellect-ai/verifiers/tree/main/examples) on GitHub for complete training scripts
+- See the [examples directory](https://github.com/PrimeIntellect-ai/verifiers/tree/main/examples) on GitHub for complete training scripts
7 changes: 6 additions & 1 deletion tests/test_environment.py
@@ -7,7 +7,7 @@
from datasets import Dataset
from openai.types.chat.chat_completion import Choice

-from verifiers import Environment, Parser, Rubric, ThinkParser
+from verifiers import Environment, Parser, Rubric, ThinkParser, __version__ as vf_version
from verifiers.types import (
    GenerateMetadata,
    GenerateOutputs,
@@ -76,6 +76,8 @@ def _make_metadata(
    return GenerateMetadata(
        env_id="test-env",
        env_args={},
+        env_version="test-env-version",
+        verifiers_version="test-verifiers-version",
        model="test-model",
        base_url="http://localhost",
        num_examples=num_examples,
@@ -420,6 +422,7 @@ def reward_b(**kwargs):
                weights=[0.5, 0.5],
            ),
        )
+        env.version = "test-env-v1"

        results = await env.generate(
            inputs=env.get_dataset(n=2),
@@ -434,6 +437,8 @@ def reward_b(**kwargs):
        assert "reward_b" in results["metadata"]["avg_metrics"]
        assert results["metadata"]["avg_metrics"]["reward_a"] == 1.0
        assert results["metadata"]["avg_metrics"]["reward_b"] == 0.5
+        assert results["metadata"]["env_version"] == "test-env-v1"
+        assert results["metadata"]["verifiers_version"] == vf_version

    @pytest.mark.asyncio
    async def test_generate_metadata_without_scoring(self, mock_openai_client):
2 changes: 2 additions & 0 deletions tests/test_environment_extra.py
@@ -88,6 +88,8 @@ def _make_metadata(
    return GenerateMetadata(
        env_id="dummy-env",
        env_args={},
+        env_version="dummy-env-version",
+        verifiers_version="test-verifiers-version",
        model="test-model",
        base_url="http://localhost",
        num_examples=num_examples,
2 changes: 2 additions & 0 deletions tests/test_eval_cli.py
@@ -10,6 +10,8 @@ def _make_metadata(config) -> GenerateMetadata:
    return GenerateMetadata(
        env_id=config.env_id,
        env_args=config.env_args,
+        env_version="test-env-version",
+        verifiers_version="test-verifiers-version",
        model=config.model,
        base_url=config.client_config.api_base_url,
        num_examples=config.num_examples,
2 changes: 2 additions & 0 deletions tests/test_eval_utils.py
@@ -15,6 +15,8 @@ def _make_metadata(
    return GenerateMetadata(
        env_id="test-env",
        env_args={},
+        env_version="test-env-version",
+        verifiers_version="test-verifiers-version",
        model="test-model",
        base_url="http://localhost",
        num_examples=num_examples,
34 changes: 34 additions & 0 deletions verifiers/envs/environment.py
@@ -53,6 +53,38 @@
pass


+def _get_verifiers_version() -> str | None:
+    try:
+        return pkg_version("verifiers")
+    except PackageNotFoundError:
+        try:
+            from verifiers import __version__
+
+            return __version__
+        except Exception:
+            return None
+
+
+def _get_env_version(env: "Environment") -> str | None:
+    # Instance override
+    if getattr(env, "version", None):
+        return str(getattr(env, "version"))
+
+    env_module = inspect.getmodule(env.__class__)
+    if env_module and hasattr(env_module, "__version__"):
+        version = getattr(env_module, "__version__")
+        if version:
+            return str(version)
+
+    top_level_module = env.__class__.__module__.split(".")[0]
+    try:
+        return pkg_version(top_level_module)
+    except PackageNotFoundError:
+        pass
+
+    return None
+
+
class Environment(ABC):
    """
    Base class for all environments.
@@ -600,6 +632,8 @@ def _prepare_rollout_results(
        metadata = GenerateMetadata(
            env_id=self.env_id,
            env_args=self.env_args,
+            env_version=_get_env_version(self),
+            verifiers_version=_get_verifiers_version(),
            model=model,
            base_url=str(client.base_url) if hasattr(client, "base_url") else "",
            num_examples=num_unique_examples,
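The precedence in `_get_env_version` is worth spelling out: an explicit `version` attribute on the environment instance wins, then a `__version__` attribute on the module defining the class, then the installed-package version for the class's top-level package, and finally `None`. Both helpers assume `inspect`, `pkg_version`, and `PackageNotFoundError` (from `importlib.metadata`) are already imported at the top of the module. Below is a self-contained sketch of the same lookup order, using a hypothetical `DemoEnv` in place of a real `Environment` subclass.

```python
import inspect
from importlib.metadata import PackageNotFoundError, version as pkg_version


class DemoEnv:
    """Hypothetical stand-in for an Environment subclass."""


def resolve_env_version(env) -> str | None:
    # 1. Instance override set directly on the object.
    if getattr(env, "version", None):
        return str(env.version)
    # 2. __version__ on the module that defines the class.
    module = inspect.getmodule(env.__class__)
    if module is not None and getattr(module, "__version__", None):
        return str(module.__version__)
    # 3. Installed-package metadata for the class's top-level package.
    try:
        return pkg_version(env.__class__.__module__.split(".")[0])
    except PackageNotFoundError:
        return None  # 4. Nothing resolvable.


env = DemoEnv()
print(resolve_env_version(env))  # None when run as a script: no override,
                                 # no module __version__, no installed package
env.version = "0.3.1"
print(resolve_env_version(env))  # "0.3.1" -- the instance attribute wins
```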
2 changes: 2 additions & 0 deletions verifiers/types.py
@@ -142,6 +142,8 @@ class GenerateMetadata(TypedDict):

    env_id: str
    env_args: dict
+    env_version: str | None
+    verifiers_version: str | None
    model: str
    base_url: str
    num_examples: int
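Since both new keys are typed `str | None`, consumers should treat a missing version as an expected case rather than an error. A small defensive-consumption sketch (placeholder values; `GenerateMetadata` is a `TypedDict`, so at runtime it is a plain dict):

```python
def describe_run(metadata: dict) -> str:
    # Either field may be None when resolution failed at generate() time.
    env_version = metadata.get("env_version") or "unknown"
    vf_version = metadata.get("verifiers_version") or "unknown"
    return f"{metadata['env_id']} @ {env_version} (verifiers {vf_version})"


print(describe_run({
    "env_id": "example-env",      # placeholder values, not real run output
    "env_args": {},
    "env_version": None,          # resolution can legitimately fail
    "verifiers_version": "0.1.0",
}))
# -> example-env @ unknown (verifiers 0.1.0)
```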