Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 51 additions & 1 deletion every_eval_ever/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,33 @@ def _cmd_convert_lm_eval(args: argparse.Namespace) -> int:
return 0


def _cmd_convert_lighteval(args: argparse.Namespace) -> int:
    """Convert lighteval results found at ``args.log_path`` and write them out.

    Args:
        args: Parsed CLI namespace. This function reads ``log_path``,
            ``output_dir``, ``inference_engine`` and
            ``inference_engine_version`` from it, and hands the whole
            namespace to ``_common_metadata``.

    Returns:
        ``0`` after every converted log has been written and reported.

    Raises:
        FileNotFoundError: If ``args.log_path`` is neither a file nor a
            directory.
    """
    from every_eval_ever.converters.lighteval.adapter import LightEvalAdapter

    adapter = LightEvalAdapter()
    metadata = _common_metadata(args)
    # Engine overrides are optional: only truthy values are added to metadata.
    for key in ('inference_engine', 'inference_engine_version'):
        override = getattr(args, key)
        if override:
            metadata[key] = override

    # Pick the adapter entry point matching the kind of path we were given.
    log_path = Path(args.log_path)
    if log_path.is_file():
        transform = adapter.transform_from_file
    elif log_path.is_dir():
        transform = adapter.transform_from_directory
    else:
        raise FileNotFoundError(f'Path is not a file or directory: {log_path}')
    converted = transform(log_path, metadata)

    destination = Path(args.output_dir)
    for entry in converted:
        # Every written log gets its own freshly generated UUID.
        written = _write_log(entry, destination, eval_uuid=str(uuid.uuid4()))
        print(written)

    print(f'Converted {len(converted)} evaluation log(s).')
    return 0


def _cmd_convert_inspect(args: argparse.Namespace) -> int:
from every_eval_ever.converters.inspect.adapter import (
InspectAIAdapter,
Expand Down Expand Up @@ -241,6 +268,7 @@ def build_parser() -> argparse.ArgumentParser:
epilog=(
'Examples:\n'
' every_eval_ever convert lm_eval --log_path results.json --output_dir data\n'
' every_eval_ever convert lighteval --log_path results_run_dir --output_dir data\n'
' every_eval_ever convert inspect --log_path inspect_log.json --output_dir data\n'
' every_eval_ever convert helm --log_path helm_run_dir --output_dir data'
),
Expand Down Expand Up @@ -298,7 +326,13 @@ def build_parser() -> argparse.ArgumentParser:
dest='source', required=True
)

for source in ['lm_eval', 'inspect', 'helm', 'alpaca_eval']:
for source in [
'lm_eval',
'lighteval',
'inspect',
'helm',
'alpaca_eval',
]:
source_parser = convert_subparsers.add_parser(
source,
help=f'Convert {source} logs',
Expand Down Expand Up @@ -385,6 +419,20 @@ def build_parser() -> argparse.ArgumentParser:
help='Inference engine version to record in model_info.inference_engine.version.',
)

if source == 'lighteval':
source_parser.add_argument(
'--inference_engine',
'--inference-engine',
default=None,
help='Override inferred inference engine (e.g. vllm, transformers).',
)
source_parser.add_argument(
'--inference_engine_version',
'--inference-engine-version',
default=None,
help='Inference engine version to record in model_info.inference_engine.version.',
)

return parser


Expand Down Expand Up @@ -415,6 +463,8 @@ def main(argv: list[str] | None = None) -> int:
if args.command == 'convert':
if args.source == 'lm_eval':
return _cmd_convert_lm_eval(args)
if args.source == 'lighteval':
return _cmd_convert_lighteval(args)
if args.source == 'inspect':
return _cmd_convert_inspect(args)
if args.source == 'helm':
Expand Down
1 change: 1 addition & 0 deletions every_eval_ever/converters/common/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class SupportedLibrary(Enum):
LM_EVAL = 'lm-evaluation-harness'
INSPECT_AI = 'inspect-ai'
HELM = 'helm'
LIGHTEVAL = 'lighteval'
CUSTOM = 'custom'


Expand Down
1 change: 1 addition & 0 deletions every_eval_ever/converters/lighteval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""lighteval adapter for every_eval_ever."""
142 changes: 142 additions & 0 deletions every_eval_ever/converters/lighteval/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
"""CLI for converting lighteval output to every_eval_ever format."""

import argparse
import json
import sys
import uuid
from pathlib import Path

from .adapter import LightEvalAdapter


def main():
parser = argparse.ArgumentParser(
description='Convert lighteval output to every_eval_ever format'
)
parser.add_argument(
'--log_path',
type=str,
required=True,
help='Path to results JSON file or directory containing results files',
)
parser.add_argument(
'--output_dir',
type=str,
default='data',
help='Output directory for converted files',
)
parser.add_argument(
'--source_organization_name',
type=str,
default='',
help='Name of the organization that ran the evaluation',
)
parser.add_argument(
'--evaluator_relationship',
type=str,
default='first_party',
choices=['first_party', 'third_party', 'collaborative', 'other'],
help='Relationship of the evaluator to the model',
)
parser.add_argument(
'--source_organization_url',
type=str,
default=None,
help='URL of the source organization',
)
parser.add_argument(
'--source_organization_logo_url',
type=str,
default=None,
help='Logo of the source organization',
)
parser.add_argument(
'--inference_engine',
type=str,
default=None,
help="Override inference engine name (e.g. 'vllm', 'transformers'). "
'Auto-detected from provider when possible.',
)
parser.add_argument(
'--inference_engine_version',
type=str,
default=None,
help="Inference engine version (e.g. '0.6.0'). "
'Not available from lighteval logs, so must be provided manually.',
)
parser.add_argument(
'--eval_library_name',
type=str,
default='lighteval',
help='Name of the evaluation library (e.g. inspect_ai, lm_eval, helm, lighteval)',
)
parser.add_argument(
'--eval_library_version',
type=str,
default='unknown',
help='Version of the evaluation library. It should be extracted in the adapter if available in the evaluation log.',
)

args = parser.parse_args()

adapter = LightEvalAdapter()
output_dir = Path(args.output_dir)

metadata_args = {
'source_organization_name': args.source_organization_name,
'evaluator_relationship': args.evaluator_relationship,
'source_organization_url': args.source_organization_url,
'eval_library_name': args.eval_library_name,
'eval_library_version': args.eval_library_version,
}
if args.inference_engine:
metadata_args['inference_engine'] = args.inference_engine
if args.inference_engine_version:
metadata_args['inference_engine_version'] = (
args.inference_engine_version
)

log_path = Path(args.log_path)

if log_path.is_file():
logs = adapter.transform_from_file(log_path, metadata_args)
elif log_path.is_dir():
logs = adapter.transform_from_directory(log_path, metadata_args)
else:
print(f'Error: {log_path} is not a file or directory', file=sys.stderr)
sys.exit(1)

for log in logs:
# Organize as: output_dir/{evaluation_name}/{developer}/{model_name}/{uuid}.json
# Use the first evaluation result's name (before any /filter suffix) as the task name
if log.evaluation_results:
eval_name = log.evaluation_results[0].evaluation_name.split('/')[0]
else:
eval_name = 'unknown'

model_parts = log.model_info.id.split('/')
if len(model_parts) >= 2:
developer = model_parts[0]
model_name = '/'.join(model_parts[1:])
else:
developer = 'unknown'
model_name = log.model_info.id

out_path = output_dir / eval_name / developer / model_name
out_path.mkdir(parents=True, exist_ok=True)

eval_uuid = str(uuid.uuid4())
out_file = out_path / f'{eval_uuid}.json'

with open(out_file, 'w') as f:
json.dump(
log.model_dump(mode='json', exclude_none=True), f, indent=2
)

print(f' {out_file}')

print(f'\nConverted {len(logs)} evaluation log(s).')


if __name__ == '__main__':
main()
Loading