10 changes: 10 additions & 0 deletions README.md
@@ -29,6 +29,16 @@ the phase. At the end of each phase, we evaluate all the checkpoints in order to
Once training is complete and we have picked the best checkpoint from the output of the final phase, we can run the full-scale evaluation suite, which runs MT-Bench, MMLU,
MT-Bench Branch, and MMLU Branch.

### Leaderboard Evaluation

If you want to run the full Open LLM Leaderboard v2 evaluation suite, we provide an optional dependency package for the leaderboard tasks. This adds benchmarks such as GPQA, IFEVAL, BBH, MMLU-PRO, MUSR, and MATH-HARD.

To install the optional leaderboard dependencies, use:

```bash
pip install instructlab-eval[leaderboard]
```
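
Once installed, the leaderboard tasks are exposed through the `LeaderboardV2Evaluator` class. The snippet below is a minimal sketch of invoking it directly from Python; the model path is a placeholder, and the printed `overall_score` key matches the result format used by the example scripts in this repository:

```python
from instructlab.eval.leaderboard import LeaderboardV2Evaluator

# Placeholder path; point this at your own checkpoint or a Hugging Face model ID.
evaluator = LeaderboardV2Evaluator(model_path="path/to/your/checkpoint")
results = evaluator.run()
print(results["overall_score"])
```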

## Methods of Evaluation

Below are more in-depth explanations of the suite of benchmarks we are using as methods for evaluation of models.
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -42,6 +42,7 @@ issues = "https://github.com/instructlab/eval/issues"
"mmlu_branch" = "instructlab.eval.mmlu:MMLUBranchEvaluator"
"mt_bench" = "instructlab.eval.mt_bench:MTBenchEvaluator"
"mt_bench_branch" = "instructlab.eval.mt_bench:MTBenchBranchEvaluator"
"leaderboard_v2" = "instructlab.eval.leaderboard:LeaderboardV2Evaluator"

[tool.setuptools_scm]
version_file = "src/instructlab/eval/_version.py"
@@ -53,6 +54,7 @@ package-dir = {"" = "src"}

[tool.setuptools.dynamic]
dependencies = {file = ["requirements.txt"]}
optional-dependencies = {leaderboard = {file = ["requirements-leaderboard.txt"]}}

[tool.setuptools.packages.find]
where = ["src"]
10 changes: 10 additions & 0 deletions requirements-leaderboard.txt
@@ -0,0 +1,10 @@
lm-eval[ifeval,vllm,math,sentencepiece]>=0.4.4

# vLLM 0.8.3 + torch 2.6.0 doesn't work when running vLLM on granite-3.1-8b-instruct
vllm<=0.7.3
torch<=2.5.1

# XXX(osilkin): We use StrEnum in leaderboard, but Python 3.10 doesn't have it in the
# standard library (it was added in 3.11), so we install the strenum backport package.
strenum>=0.4.15; python_version < '3.11'
typing-extensions>=4.0.0; python_version < '3.11'
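
The `strenum` and `typing-extensions` pins above only apply on Python 3.10, where the standard library lacks `StrEnum`. A conditional import along these lines is the usual pattern (a sketch; the exact import in the leaderboard module may differ):

```python
import sys

if sys.version_info >= (3, 11):
    from enum import StrEnum  # stdlib StrEnum, added in Python 3.11
else:
    from strenum import StrEnum  # backport pinned in requirements-leaderboard.txt
```
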
1 change: 1 addition & 0 deletions requirements.txt
@@ -8,6 +8,7 @@ transformers
accelerate
pandas
pandas-stubs
# Base lm-eval dependency
lm-eval>=0.4.4
httpx
ragas>=0.2.11
84 changes: 84 additions & 0 deletions scripts/evaluate_best_checkpoint.py
@@ -0,0 +1,84 @@
#!/usr/bin/env python3

"""
Example usage:
    python scripts/evaluate_best_checkpoint.py \
        /path/to/checkpoint_dir \
        --output-file /path/to/output_file
"""

# Standard
from pathlib import Path
from typing import Optional
import json

# Third Party
import typer

app = typer.Typer()


@app.command()
def main(
    input_dir: Path = typer.Argument(..., help="Input directory to process"),
    output_file: Optional[Path] = typer.Option(None, help="Optional output file path"),
):
    """
    Process files in the input directory and optionally save results to an output file.
    """
    if not input_dir.exists():
        typer.echo(f"Error: Input directory '{input_dir}' does not exist")
        raise typer.Exit(1)

    if not input_dir.is_dir():
        typer.echo(f"Error: '{input_dir}' is not a directory")
        raise typer.Exit(1)

    checkpoint_dirs = list(input_dir.glob("hf_format/samples_*"))
    typer.echo(f"Found {len(checkpoint_dirs)} checkpoint directories")

    if not checkpoint_dirs:
        typer.echo(
            f"No checkpoint directories found in the input directory: {input_dir}"
        )
        raise typer.Exit(1)

    typer.echo("importing LeaderboardV2Evaluator, this may take a while...")
    # First Party
    from instructlab.eval.leaderboard import LeaderboardV2Evaluator

    checkpoint_results = {}
    for checkpoint in checkpoint_dirs:
        typer.echo(f"Processing checkpoint: {checkpoint}")
        ckpt_output_file = checkpoint / "leaderboard_results.json"
        # NOTE: num_gpus is hardcoded for an 8-GPU node; adjust for your hardware
        evaluator = LeaderboardV2Evaluator(
            model_path=str(checkpoint), output_file=ckpt_output_file, num_gpus=8
        )
        result = evaluator.run()
        checkpoint_results[checkpoint.name] = result
        typer.echo(f"Checkpoint {checkpoint.name} results: {result['overall_score']}")

    # Sort checkpoints by score, best first
    sorted_checkpoints = sorted(
        checkpoint_results.items(), key=lambda x: x[1]["overall_score"], reverse=True
    )
    typer.echo("Sorted checkpoints by score:")
    for checkpoint_name, result in sorted_checkpoints:
        typer.echo(f"{'=' * 100}")
        typer.echo(f"Checkpoint: {checkpoint_name}")
        typer.echo(json.dumps(result, indent=2))

    typer.echo(f"{'=' * 100}")
    typer.echo(f"Best checkpoint: {sorted_checkpoints[0][0]}")

    if output_file:
        typer.echo(f"Output will be saved to: {output_file}")
        with open(output_file, "w") as f:
            json.dump(checkpoint_results, f, indent=2)

    typer.echo("Processing complete!")


if __name__ == "__main__":
    app()
22 changes: 22 additions & 0 deletions scripts/test_leaderboard.py
@@ -0,0 +1,22 @@
#!/usr/bin/env python
# SPDX-License-Identifier: Apache-2.0

# NOTE: This script requires the leaderboard optional dependencies.
# Install with: pip install instructlab-eval[leaderboard]

# Standard
import json

# First Party
from instructlab.eval.leaderboard import LeaderboardV2Evaluator

if __name__ == "__main__":
    evaluator = LeaderboardV2Evaluator(
        model_path="ibm-granite/granite-3.1-8b-base",
        eval_config={
            # the base model has no chat template, so don't apply one
            "apply_chat_template": False,
        },
    )
    results = evaluator.run()
    print("got results from leaderboard v2")
    print(json.dumps(results, indent=2))