This project implements stepwise Direct Preference Optimization (DPO) for math problem solving using the PRM800K dataset.
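For orientation, training optimizes the standard DPO objective over preference pairs built from PRM800K's per-step ratings, so that a preferred solution step is ranked above a dispreferred one. A minimal sketch of the pairwise loss (hypothetical names, not the repo's actual implementation):

```python
import torch.nn.functional as F

def dpo_loss(policy_chosen_logps, policy_rejected_logps,
             ref_chosen_logps, ref_rejected_logps, beta=0.1):
    """Pairwise DPO loss over one step-level preference pair.

    Each argument is the summed log-probability of the chosen/rejected step
    under the policy or the frozen reference model (tensor inputs assumed).
    """
    chosen_margin = beta * (policy_chosen_logps - ref_chosen_logps)
    rejected_margin = beta * (policy_rejected_logps - ref_rejected_logps)
    # -log(sigmoid(.)) pushes the policy toward the preferred step.
    return -F.logsigmoid(chosen_margin - rejected_margin).mean()
```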
```bash
git clone <your-repo-url>
cd RewardChain
pip install -r requirements.txt
```
```bash
export PYTHONPATH="$(pwd)/src:${PYTHONPATH:-}"

python src/scripts/process_data.py --split train --output_dir data/processed/prm800k --max_samples 1000
python src/scripts/process_data.py --split test --output_dir data/processed/prm800k --max_samples 100
```
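Each processed file should contain one preference pair per line; a quick way to peek at the output, assuming `prompt`/`chosen`/`rejected`-style fields (the exact field names are an assumption, check `process_data.py`):

```python
import json

# Inspect the first processed training example (field names are assumptions).
with open("data/processed/prm800k/train.jsonl") as f:
    example = json.loads(f.readline())

print(example.keys())                   # e.g. prompt / chosen / rejected
print(example.get("prompt", "")[:200])  # problem statement plus prior steps
```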
```bash
python src/scripts/train.py \
  --train_data data/processed/prm800k/train.jsonl \
  --output_dir ./dpo_model \
  --model_name microsoft/DialoGPT-medium
```
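Once training finishes, `./dpo_model` should hold a Hugging Face-style checkpoint; a quick generation smoke test, assuming the trainer saves with `save_pretrained` (an assumption about the training script):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the freshly trained checkpoint (assumes save_pretrained-style output).
tokenizer = AutoTokenizer.from_pretrained("./dpo_model")
model = AutoModelForCausalLM.from_pretrained("./dpo_model")

prompt = "Problem: What is 12 * 15?\nStep 1:"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64, do_sample=False)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```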
```bash
python src/scripts/evaluate.py \
  --model_path ./dpo_model \
  --test_data data/processed/prm800k/test.jsonl \
  --output_path ./evaluation_results.json
```
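The evaluation script writes its summary to `evaluation_results.json`; which metrics it reports depends on the script, so a generic way to read them back:

```python
import json

with open("evaluation_results.json") as f:
    results = json.load(f)

# Print whatever top-level metrics the evaluation script reported.
for key, value in results.items():
    print(f"{key}: {value}")
```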