Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 125 additions & 1 deletion example/gpt2/main.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <filesystem>
#include <format>
#include <limits>
#include <memory>
#include <optional>
#include <unordered_map>
Expand All @@ -10,6 +13,7 @@
#include "glog/logging.h"

#include "infini_train/include/autocast.h"
#include "infini_train/include/checkpoint.h"
#include "infini_train/include/core/runtime/device_guard.h"
#include "infini_train/include/dataloader.h"
#include "infini_train/include/device.h"
Expand Down Expand Up @@ -75,6 +79,14 @@ DEFINE_uint32(virtual_pipeline_parallel, 1, "Number of chunks in PP stage.");

// precision
DEFINE_string(dtype, "float32", "precision used in training (float32/bfloat16)");
// checkpointing: periodic save cadence, resume source, storage root, retention and on-disk format
DEFINE_uint32(save_steps, 0, "save checkpoint every N steps; 0 disables saving");
DEFINE_string(resume_from, "", "checkpoint directory to resume from");
DEFINE_string(checkpoint_dir, "./checkpoints", "root directory used to store checkpoints");
DEFINE_uint32(max_checkpoint_keep, 3, "max number of checkpoint steps to keep");
DEFINE_bool(save_optimizer_state, true, "whether optimizer state is persisted in checkpoints");
DEFINE_string(checkpoint_format, "bin", "checkpoint format: bin|pth");
DEFINE_bool(use_llmc_checkpoint_io, false,
"whether to use GPT2 LLMC model.bin callback for checkpoint save/load when format=bin");
// precision check
DEFINE_string(
precision_check, "",
Expand Down Expand Up @@ -198,6 +210,8 @@ void Train(const nn::parallel::Rank &rank) {
} else {
model = GPT2::FromPretrained(kStrToModelType.at(FLAGS_model));
}
auto llmc_model = std::dynamic_pointer_cast<GPT2>(model);
CHECK(llmc_model != nullptr) << "Failed to cast model to GPT2 for LLMC checkpoint I/O.";

model->To(device);

Expand Down Expand Up @@ -311,6 +325,7 @@ void Train(const nn::parallel::Rank &rank) {
}

auto train_iter = train_loader.begin();
size_t saved_data_batch_idx = train_iter.BatchIndex();
std::shared_ptr<nn::Module> loss_fn
= (tp_world_size > 1) ? std::static_pointer_cast<nn::Module>(
std::make_shared<VocabParallelCrossEntropyLoss>(model_config.original_vocab_size))
Expand All @@ -320,9 +335,100 @@ void Train(const nn::parallel::Rank &rank) {

auto impl = core::GetDeviceGuardImpl(device.type());

int start_step = 0;
float best_loss = std::numeric_limits<float>::infinity();
if (!FLAGS_resume_from.empty()) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

建议把主流程中恢复、保存、清理旧的Checkpoint提成公共函数,尽量让主流程简洁,另外各个训练入口可以复用。

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

参数如果太多可以用struct整合在一起

std::filesystem::path resume_dir = FLAGS_resume_from;
if (rank.IsParallel()) {
const auto rank_dir = resume_dir / std::format("rank_{:06d}", rank.GlobalRank());
if (std::filesystem::exists(rank_dir)) {
resume_dir = rank_dir;
}
}

TrainerState state;
CheckpointLoadOptions load_options;
load_options.load_optimizer_state = true;
if (FLAGS_use_llmc_checkpoint_io) {
load_options.model_bin_loader = [](nn::Module *target_model, const std::filesystem::path &model_path) {
auto loaded_model = GPT2::FromLLMC(model_path.string());
target_model->LoadStateDict(loaded_model->StateDict());
};
}
Checkpoint::Load(resume_dir, model.get(), optimizer.get(), &state, load_options);
start_step = static_cast<int>(state.global_step);
best_loss = state.best_loss;
if (state.data_batch_stride != static_cast<int64_t>(ddp_world_size) && rank.IsMainRank()) {
LOG(WARNING) << std::format("Checkpoint data_batch_stride {} mismatches current ddp_world_size {}. "
"Proceeding with recorded data_batch_idx {}.",
state.data_batch_stride, ddp_world_size, state.data_batch_idx);
}
saved_data_batch_idx = static_cast<size_t>(std::max<int64_t>(state.data_batch_idx, 0));
train_iter = train_loader.IteratorAtBatchIndex(saved_data_batch_idx);
if (rank.IsMainRank()) {
LOG(INFO) << std::format(
"Resume training from step {} with best_loss {:.6f}, last_lr {:.3e}, data_batch_idx {}",
state.global_step, state.best_loss, state.last_lr, state.data_batch_idx);
LOG(INFO) << std::format("Checkpoint model I/O mode during resume: {}",
FLAGS_use_llmc_checkpoint_io ? "llmc-callback" : "native-state-dict");
}
}

LOG(INFO) << "start training";

for (int step = 0; step < FLAGS_num_iteration + 1; ++step) {
// Writes one checkpoint of the current model/optimizer to `save_dir` and, on the main rank,
// optionally prunes old per-step checkpoints under FLAGS_checkpoint_dir.
//
// save_dir:               destination directory handed to Checkpoint::Save.
// global_step:            step number recorded in the saved TrainerState.
// prune_step_checkpoints: when true, the main rank deletes the oldest "checkpoint_step_*"
//                         directories until at most FLAGS_max_checkpoint_keep remain.
auto save_checkpoint = [&](const std::filesystem::path &save_dir, int64_t global_step,
                           bool prune_step_checkpoints) {
    const auto ckpt_start = std::chrono::high_resolution_clock::now();

    TrainerState state;
    state.global_step = global_step;
    state.data_batch_idx = saved_data_batch_idx;
    state.data_batch_stride = ddp_world_size;
    state.best_loss = best_loss;
    // Records the configured learning rate flag, not a value queried from the optimizer.
    state.last_lr = FLAGS_learning_rate;
    // NOTE(review): optimizer type is hard-coded here — confirm it matches the optimizer in use.
    state.optimizer_type = "SGD";
    state.checkpoint_format = FLAGS_checkpoint_format;
    state.ddp_size = ddp_world_size;
    state.tp_size = tp_world_size;
    state.sp_size = sp_world_size;
    state.pp_size = pp_world_size;

    CheckpointOptions options;
    options.format = FLAGS_checkpoint_format;
    options.save_optimizer_state = FLAGS_save_optimizer_state;
    if (FLAGS_use_llmc_checkpoint_io) {
        // Route model serialization through GPT2::SaveAsLLMC instead of the default writer.
        options.model_bin_writer = [&](const nn::Module &, const std::filesystem::path &model_path) {
            llmc_model->SaveAsLLMC(model_path.string());
        };
    }
    Checkpoint::Save(save_dir, *model, *optimizer, state, options);

    const auto ckpt_end = std::chrono::high_resolution_clock::now();
    const double ckpt_ms = std::chrono::duration<double, std::milli>(ckpt_end - ckpt_start).count();

    // Logging and pruning are done by the main rank only.
    if (!rank.IsMainRank()) {
        return;
    }
    LOG(INFO) << std::format("Checkpoint saved at: {} ({:.2f} ms)", save_dir.string(), ckpt_ms);

    if (!prune_step_checkpoints) {
        return;
    }
    const auto root = std::filesystem::path(FLAGS_checkpoint_dir);
    if (!std::filesystem::exists(root)) {
        return;
    }

    std::vector<std::filesystem::path> ckpts;
    for (const auto &entry : std::filesystem::directory_iterator(root)) {
        if (entry.is_directory() && entry.path().filename().string().starts_with("checkpoint_step_")) {
            ckpts.push_back(entry.path());
        }
    }

    // Order oldest -> newest. Step directories are named with "{:06d}", which is only a
    // minimum width: from step 1'000'000 on the name gets longer, and a plain lexicographic
    // sort would place "checkpoint_step_1000000" before "checkpoint_step_999999" and delete
    // the newest checkpoint. Comparing by name length first keeps the order numeric.
    std::sort(ckpts.begin(), ckpts.end(), [](const std::filesystem::path &lhs, const std::filesystem::path &rhs) {
        const std::string lhs_name = lhs.filename().string();
        const std::string rhs_name = rhs.filename().string();
        if (lhs_name.size() != rhs_name.size()) {
            return lhs_name.size() < rhs_name.size();
        }
        return lhs_name < rhs_name;
    });

    // NOTE(review): with FLAGS_max_checkpoint_keep == 0 this removes every step checkpoint,
    // including the one just written — confirm that is the intended meaning of 0.
    const size_t excess = ckpts.size() > FLAGS_max_checkpoint_keep ? ckpts.size() - FLAGS_max_checkpoint_keep : 0;
    for (size_t i = 0; i < excess; ++i) {
        std::filesystem::remove_all(ckpts[i]);
    }
};

for (int step = start_step; step < FLAGS_num_iteration + 1; ++step) {
// Reset precision check counters at start of each iteration for file overwrite
utils::PrecisionChecker::ResetCounters();

Expand Down Expand Up @@ -372,6 +478,7 @@ void Train(const nn::parallel::Rank &rank) {
// if we are trying to overfit a single batch, we reset the loader here by commenting out the line below
// TODO(dcj): support dataloader.reset() later
++train_iter;
saved_data_batch_idx = train_iter.BatchIndex();
x = std::make_shared<Tensor>(x->To(device));
y = std::make_shared<Tensor>(y->To(device));

Expand Down Expand Up @@ -401,6 +508,7 @@ void Train(const nn::parallel::Rank &rank) {
// if we are trying to overfit a single batch, we reset the loader here by commenting out the line below
// TODO(dcj): support dataloader.reset() later
++train_iter;
saved_data_batch_idx = train_iter.BatchIndex();
x = std::make_shared<Tensor>(x->To(device));
y = std::make_shared<Tensor>(y->To(device));

Expand All @@ -413,6 +521,8 @@ void Train(const nn::parallel::Rank &rank) {
lossf = static_cast<const float *>(lossf_tensor->To(Device()).DataPtr())[0];
}

best_loss = std::min(best_loss, lossf);

const auto iter_end = std::chrono::high_resolution_clock::now();
const double duration_us = std::chrono::duration<double, std::micro>(iter_end - iter_start).count();
const double tps = FLAGS_total_batch_size / (duration_us / 1e6);
Expand All @@ -435,8 +545,22 @@ void Train(const nn::parallel::Rank &rank) {
}
}
}

if (FLAGS_save_steps > 0 && (step + 1) % FLAGS_save_steps == 0) {
std::filesystem::path step_dir
= std::filesystem::path(FLAGS_checkpoint_dir) / std::format("checkpoint_step_{:06d}", step + 1);
if (rank.IsParallel()) {
step_dir /= std::format("rank_{:06d}", rank.GlobalRank());
}
save_checkpoint(step_dir, step + 1, true);
}
}

std::filesystem::path final_dir = std::filesystem::path(FLAGS_checkpoint_dir) / "checkpoint_final";
if (rank.IsParallel()) {
final_dir /= std::format("rank_{:06d}", rank.GlobalRank());
}
save_checkpoint(final_dir, FLAGS_num_iteration, false);
// Save LoRA weights if enabled and path specified
if (lora_enabled && !FLAGS_lora_save_path.empty()) {
LOG(INFO) << "Saving LoRA weights to: " << FLAGS_lora_save_path;
Expand Down
108 changes: 108 additions & 0 deletions example/gpt2/net.cc
Original file line number Diff line number Diff line change
Expand Up @@ -719,4 +719,112 @@ std::shared_ptr<GPT2> GPT2::FromLLMC(const std::string &filepath) {
return local_gpt2;
}

// Writes this model's parameters to `filepath` as an LLMC-style binary model file:
// a 256-entry int32 header followed by every parameter tensor as raw float32 data.
//
// Only supported when tensor-parallel size and pipeline-parallel size are both 1 (CHECKed).
// Any failure (cannot open the file, missing tensor in the state dict, write error) aborts
// through a CHECK whose message names the file and, where applicable, the tensor.
//
// NOTE(review): the tensor order below is presumably the one GPT2::FromLLMC reads back —
// keep the two in sync.
void GPT2::SaveAsLLMC(const std::string &filepath) const {
    CHECK_EQ(nn::parallel::global::GetTensorParallelSize(), 1) << "SaveAsLLMC currently supports TP=1 only.";
    CHECK_EQ(nn::parallel::global::GetPipelineParallelSize(), 1) << "SaveAsLLMC currently supports PP=1 only.";

    std::ofstream ofs(filepath, std::ios::binary);
    CHECK(ofs.is_open()) << "Failed to open model file for write: " << filepath;

    // Header: 256 int32 slots, unused slots stay zero.
    std::vector<int32_t> header(256, 0);
    header[0] = kHeaderMagic;
    header[1] = kHeaderFP32Version;
    header[2] = static_cast<int32_t>(config_.block_size);
    header[3] = static_cast<int32_t>(config_.original_vocab_size);
    header[4] = static_cast<int32_t>(config_.n_layer);
    header[5] = static_cast<int32_t>(config_.n_head);
    header[6] = static_cast<int32_t>(config_.n_embd);
    header[7] = static_cast<int32_t>(config_.vocab_size);
    ofs.write(reinterpret_cast<const char *>(header.data()),
              static_cast<std::streamsize>(header.size() * sizeof(int32_t)));
    CHECK(ofs.good()) << "Failed to write model header: " << filepath;

    const auto state_dict = StateDict();

    // Looks up `name` in the state dict, brings the tensor to the default Device() as float32
    // and appends its raw bytes to the file. Fails fast so the error names the tensor involved.
    auto write_tensor_fp32 = [&](const std::string &name) {
        CHECK(state_dict.contains(name)) << "Missing tensor in GPT2 state_dict: " << name;
        Tensor cpu = state_dict.at(name)->To(Device());
        if (cpu.Dtype() != DataType::kFLOAT32) {
            cpu = cpu.To(DataType::kFLOAT32);
        }
        const auto bytes = static_cast<std::streamsize>(cpu.SizeInBytes());
        ofs.write(reinterpret_cast<const char *>(cpu.DataPtr()), bytes);
        CHECK(ofs.good()) << "Failed to write tensor " << name << " to model file: " << filepath;
    };

    // Writes "<transformer>.<h>.<idx>.<layer>.<param>" for every block, in block order.
    auto write_block_param = [&](const auto &layer, const auto &param) {
        for (int idx = 0; idx < config_.n_layer; ++idx) {
            write_tensor_fp32(std::format("{}.{}.{}.{}.{}", kTransformerLayerName, GPT2Chunk::kHLayerName, idx,
                                          layer, param));
        }
    };

    // Writes "<transformer>.<h>.<idx>.<module>.<layer>.<param>" for every block, in block order.
    auto write_block_submodule_param = [&](const auto &module, const auto &layer, const auto &param) {
        for (int idx = 0; idx < config_.n_layer; ++idx) {
            write_tensor_fp32(std::format("{}.{}.{}.{}.{}.{}", kTransformerLayerName, GPT2Chunk::kHLayerName, idx,
                                          module, layer, param));
        }
    };

    // transformer.wte.weight
    write_tensor_fp32(std::format("{}.{}.{}", kTransformerLayerName, GPT2FirstStage::kWTELayerName,
                                  nn::parallel::VocabParallelEmbedding::kParamWeightName));

    // transformer.wpe.weight
    write_tensor_fp32(std::format("{}.{}.{}", kTransformerLayerName, GPT2FirstStage::kWPELayerName,
                                  nn::Embedding::kParamWeightName));

    // Per-block parameters are grouped by parameter kind: each call below emits that
    // parameter for all n_layer blocks before the next kind starts.

    // ln_1 weight / bias
    write_block_param(Block::kLn1LayerName, nn::LayerNorm::kParamWeightName);
    write_block_param(Block::kLn1LayerName, nn::LayerNorm::kParamBiasName);

    // attn.c_attn weight / bias
    write_block_submodule_param(Block::kAttnLayerName, CausalSelfAttention::kCAttnLayerName,
                                nn::parallel::ColumnParallelLinear::kParamWeightName);
    write_block_submodule_param(Block::kAttnLayerName, CausalSelfAttention::kCAttnLayerName,
                                nn::parallel::ColumnParallelLinear::kParamBiasName);

    // attn.c_proj weight / bias
    write_block_submodule_param(Block::kAttnLayerName, CausalSelfAttention::kCProjLayerName,
                                nn::parallel::RowParallelLinear::kParamWeightName);
    write_block_submodule_param(Block::kAttnLayerName, CausalSelfAttention::kCProjLayerName,
                                nn::parallel::RowParallelLinear::kParamBiasName);

    // ln_2 weight / bias
    write_block_param(Block::kLn2LayerName, nn::LayerNorm::kParamWeightName);
    write_block_param(Block::kLn2LayerName, nn::LayerNorm::kParamBiasName);

    // mlp.c_fc weight / bias
    write_block_submodule_param(Block::kMlpLayerName, MLP::kCFcLayerName,
                                nn::parallel::ColumnParallelLinear::kParamWeightName);
    write_block_submodule_param(Block::kMlpLayerName, MLP::kCFcLayerName,
                                nn::parallel::ColumnParallelLinear::kParamBiasName);

    // mlp.c_proj weight / bias
    write_block_submodule_param(Block::kMlpLayerName, MLP::kCProjLayerName,
                                nn::parallel::RowParallelLinear::kParamWeightName);
    write_block_submodule_param(Block::kMlpLayerName, MLP::kCProjLayerName,
                                nn::parallel::RowParallelLinear::kParamBiasName);

    // transformer.ln_f weight / bias
    write_tensor_fp32(
        std::format("{}.{}.{}", kTransformerLayerName, GPT2LastStage::kLnFLayerName, nn::LayerNorm::kParamWeightName));
    write_tensor_fp32(
        std::format("{}.{}.{}", kTransformerLayerName, GPT2LastStage::kLnFLayerName, nn::LayerNorm::kParamBiasName));

    ofs.flush();
    CHECK(ofs.good()) << "Failed to flush model file: " << filepath;
}

// Returns the number of entries in stage_info_.layer_ranges_per_chunk.
int GPT2::GetChunkSize() const {
    const auto chunk_count = stage_info_.layer_ranges_per_chunk.size();
    return static_cast<int>(chunk_count);
}
1 change: 1 addition & 0 deletions example/gpt2/net.h
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ class GPT2 : public infini_train::nn::CloneableModule<GPT2> {

static std::shared_ptr<GPT2> FromPretrained(ModelType model_type);
static std::shared_ptr<GPT2> FromLLMC(const std::string &filepath);
void SaveAsLLMC(const std::string &filepath) const;

int GetChunkSize() const;

Expand Down
Loading
Loading