449 changes: 391 additions & 58 deletions deepmd/dpmodel/utils/learning_rate.py

Large diffs are not rendered by default.
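The rewritten `learning_rate.py` is not rendered above, but the call sites changed elsewhere in this PR (`BaseLR(**lr_params)` fed a `num_steps` key, plus `.value(step)` and `.start_lr`) imply a schedule interface along these lines. A hypothetical sketch only, not the PR's actual code; everything beyond the names used at the call sites is an assumption:

```python
# Hypothetical sketch: the real deepmd/dpmodel/utils/learning_rate.py is not
# rendered in this diff. Only value(), start_lr, and num_steps are grounded in
# the call sites; the warmup handling is an assumption based on its removal
# from the trainers below.
import numpy as np


class BaseLR:
    """Exponential decay with a linear warmup folded into value()."""

    def __init__(
        self,
        start_lr: float,
        stop_lr: float,
        decay_steps: int,
        num_steps: int,
        warmup_steps: int = 0,  # assumed: warmup now lives in the schedule
        **kwargs,
    ) -> None:
        self.start_lr = start_lr
        self.stop_lr = stop_lr
        self.warmup_steps = warmup_steps
        self.decay_steps = decay_steps
        # Decay rate chosen so the lr reaches stop_lr by the end of training.
        self.decay_rate = np.exp(
            np.log(stop_lr / start_lr) / ((num_steps - warmup_steps) / decay_steps)
        )

    def value(self, step: int) -> float:
        """Learning rate at a global step, warmup ramp included."""
        if step < self.warmup_steps:
            return self.start_lr * step / self.warmup_steps
        decayed = self.start_lr * self.decay_rate ** (
            (step - self.warmup_steps) // self.decay_steps
        )
        return max(decayed, self.stop_lr)
```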

20 changes: 4 additions & 16 deletions deepmd/pd/train/training.py
@@ -239,7 +239,7 @@ def get_sample():
             return get_sample

         def get_lr(lr_params: dict[str, Any]) -> BaseLR:
-            lr_params["stop_steps"] = self.num_steps - self.warmup_steps
+            lr_params["num_steps"] = self.num_steps
             lr_schedule = BaseLR(**lr_params)
             return lr_schedule

@@ -387,11 +387,7 @@ def get_lr(lr_params: dict[str, Any]) -> BaseLR:
         )

         # Learning rate
-        self.warmup_steps = training_params.get("warmup_steps", 0)
         self.gradient_max_norm = training_params.get("gradient_max_norm", 0.0)
-        assert self.num_steps - self.warmup_steps > 0 or self.warmup_steps == 0, (
-            "Warm up steps must be less than total training steps!"
-        )
         if self.multi_task and config.get("learning_rate_dict", None) is not None:
             self.lr_exp = {}
             for model_key in self.model_keys:
@@ -580,18 +576,13 @@ def single_model_finetune(

         # TODO add lr warmups for multitask
         # author: iProzd
-        def warm_up_linear(step, warmup_steps):
-            if step < warmup_steps:
-                return step / warmup_steps
-            else:
-                return self.lr_exp.value(step - warmup_steps) / self.lr_exp.start_lr
-
         # TODO add optimizers for multitask
         # author: iProzd
         if self.opt_type == "Adam":
             self.scheduler = paddle.optimizer.lr.LambdaDecay(
                 learning_rate=self.lr_exp.start_lr,
-                lr_lambda=lambda step: warm_up_linear(step, self.warmup_steps),
+                lr_lambda=lambda step: self.lr_exp.value(step + self.start_step)
+                / self.lr_exp.start_lr,
             )
             self.optimizer = paddle.optimizer.Adam(
                 learning_rate=self.scheduler, parameters=self.wrapper.parameters()
@@ -755,10 +746,7 @@ def step(_step_id, task_key="Default") -> None:
                     fout1.flush()
             if self.opt_type == "Adam":
                 cur_lr = self.scheduler.get_lr()
-                if _step_id < self.warmup_steps:
-                    pref_lr = _lr.start_lr
-                else:
-                    pref_lr = cur_lr
+                pref_lr = cur_lr

             # disable synchronization in forward-backward manually
             # as derivatives exist in model forward
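With warmup folded into the schedule, the Paddle trainer's `LambdaDecay` multiplier reduces to `value(step) / start_lr`, and the `+ self.start_step` offset keeps a restarted run on the same curve. A standalone sketch of that wiring, reusing the hypothetical `BaseLR` sketched earlier in this review (not the PR's actual class):

```python
import paddle

# BaseLR here is the hypothetical sketch from earlier in this review.
model = paddle.nn.Linear(4, 4)
lr_exp = BaseLR(start_lr=1e-3, stop_lr=1e-8, decay_steps=500, num_steps=10000)
start_step = 0  # nonzero when restarting from a checkpoint

scheduler = paddle.optimizer.lr.LambdaDecay(
    learning_rate=lr_exp.start_lr,
    # Multiplier relative to start_lr; the scheduler's own counter restarts at
    # zero, so the explicit start_step offset realigns it with the global step.
    lr_lambda=lambda step: lr_exp.value(step + start_step) / lr_exp.start_lr,
)
optimizer = paddle.optimizer.Adam(
    learning_rate=scheduler, parameters=model.parameters()
)
```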
4 changes: 3 additions & 1 deletion deepmd/pd/utils/utils.py
@@ -27,6 +27,7 @@

 from .env import (
     DEVICE,
+    GLOBAL_NP_FLOAT_PRECISION,
 )
 from .env import PRECISION_DICT as PD_PRECISION_DICT

@@ -239,7 +240,8 @@ def to_numpy_array(
 ):
     if xx is None:
         return None
-    assert xx is not None
+    if isinstance(xx, (float, int)):
+        return np.array(xx, dtype=GLOBAL_NP_FLOAT_PRECISION)
     # Create a reverse mapping of PD_PRECISION_DICT
     reverse_precision_dict = {v: k for k, v in PD_PRECISION_DICT.items()}
     # Use the reverse mapping to find keys with the desired value
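`to_numpy_array` on the Paddle side now accepts bare Python scalars (such as a learning-rate value being logged) instead of tripping the removed assert. Expected behavior, assuming the default double-precision build where `GLOBAL_NP_FLOAT_PRECISION` is `np.float64`:

```python
import numpy as np

from deepmd.pd.utils.utils import to_numpy_array

lr = to_numpy_array(1e-3)  # scalars become 0-d arrays at the global precision
assert isinstance(lr, np.ndarray) and lr.dtype == np.float64
assert to_numpy_array(None) is None  # None still passes through unchanged
```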
41 changes: 6 additions & 35 deletions deepmd/pt/train/training.py
@@ -279,7 +279,7 @@ def get_sample() -> Any:
             return get_sample

         def get_lr(lr_params: dict[str, Any]) -> BaseLR:
-            lr_params["stop_steps"] = self.num_steps - self.warmup_steps
+            lr_params["num_steps"] = self.num_steps
             lr_schedule = BaseLR(**lr_params)
             return lr_schedule

@@ -437,27 +437,7 @@ def get_lr(lr_params: dict[str, Any]) -> BaseLR:
         )

         # Learning rate
-        warmup_steps = training_params.get("warmup_steps", None)
-        warmup_ratio = training_params.get("warmup_ratio", None)
-        if warmup_steps is not None:
-            self.warmup_steps = warmup_steps
-        elif warmup_ratio is not None:
-            if not 0 <= warmup_ratio < 1:
-                raise ValueError(f"warmup_ratio must be in [0, 1), got {warmup_ratio}")
-            self.warmup_steps = int(warmup_ratio * self.num_steps)
-            if self.warmup_steps == 0 and warmup_ratio > 0:
-                log.warning(
-                    f"warmup_ratio {warmup_ratio} results in 0 warmup steps "
-                    f"due to truncation. Consider using a larger ratio or "
-                    f"specify warmup_steps directly."
-                )
-        else:
-            self.warmup_steps = 0
-        self.warmup_start_factor = training_params.get("warmup_start_factor", 0.0)
         self.gradient_max_norm = training_params.get("gradient_max_norm", 0.0)
-        assert self.num_steps - self.warmup_steps > 0 or self.warmup_steps == 0, (
-            "Warm up steps must be less than total training steps!"
-        )
         if self.multi_task and config.get("learning_rate_dict", None) is not None:
             self.lr_exp = {}
             for model_key in self.model_keys:
@@ -702,14 +682,6 @@ def single_model_finetune(

         # TODO add lr warmups for multitask
         # author: iProzd
-        def warm_up_linear(step: int, warmup_steps: int) -> float:
-            if step < warmup_steps:
-                return self.warmup_start_factor + (1.0 - self.warmup_start_factor) * (
-                    step / warmup_steps
-                )
-            else:
-                return self.lr_exp.value(step - warmup_steps) / self.lr_exp.start_lr
-
         # TODO add optimizers for multitask
         # author: iProzd
         if self.opt_type in ["Adam", "AdamW"]:
@@ -730,7 +702,8 @@ def warm_up_linear(step: int, warmup_steps: int) -> float:
                 self.optimizer.load_state_dict(optimizer_state_dict)
             self.scheduler = torch.optim.lr_scheduler.LambdaLR(
                 self.optimizer,
-                lambda step: warm_up_linear(step + self.start_step, self.warmup_steps),
+                lambda step: self.lr_exp.value(step + self.start_step)
+                / self.lr_exp.start_lr,
             )
         elif self.opt_type == "LKF":
             self.optimizer = LKFOptimizer(
@@ -768,7 +741,8 @@ def warm_up_linear(step: int, warmup_steps: int) -> float:
                 self.optimizer.load_state_dict(optimizer_state_dict)
             self.scheduler = torch.optim.lr_scheduler.LambdaLR(
                 self.optimizer,
-                lambda step: warm_up_linear(step + self.start_step, self.warmup_steps),
+                lambda step: self.lr_exp.value(step + self.start_step)
+                / self.lr_exp.start_lr,
             )
         else:
             raise ValueError(f"Not supported optimizer type '{self.opt_type}'")
@@ -883,10 +857,7 @@ def step(_step_id: int, task_key: str = "Default") -> None:
                     fout1.flush()
             if self.opt_type in ["Adam", "AdamW", "AdaMuon", "HybridMuon"]:
                 cur_lr = self.scheduler.get_last_lr()[0]
-                if _step_id < self.warmup_steps:
-                    pref_lr = _lr.start_lr
-                else:
-                    pref_lr = cur_lr
+                pref_lr = cur_lr
                 model_pred, loss, more_loss = self.wrapper(
                     **input_dict, cur_lr=pref_lr, label=label_dict, task_key=task_key
                 )
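The torch side mirrors the Paddle change. Worth noting why the `+ self.start_step` offset matters: `LambdaLR` feeds the lambda its own internal counter, which starts at zero for a freshly constructed scheduler even when the optimizer state was restored. A minimal sketch with the hypothetical `BaseLR` from earlier:

```python
import torch

# BaseLR here is the hypothetical sketch from earlier in this review.
model = torch.nn.Linear(4, 4)
lr_exp = BaseLR(start_lr=1e-3, stop_lr=1e-8, decay_steps=500, num_steps=10000)
start_step = 2000  # e.g. restored from a checkpoint trained for 2000 steps

optimizer = torch.optim.Adam(model.parameters(), lr=lr_exp.start_lr)
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer,
    # Without the offset, a restarted run would replay the schedule (and any
    # warmup) from step 0 instead of continuing from global step 2000.
    lambda step: lr_exp.value(step + start_step) / lr_exp.start_lr,
)
print(scheduler.get_last_lr()[0])  # lr at global step 2000, not start_lr
```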
9 changes: 7 additions & 2 deletions deepmd/pt/utils/utils.py
@@ -16,6 +16,7 @@

 from .env import (
     DEVICE,
+    GLOBAL_NP_FLOAT_PRECISION,
 )
 from .env import PRECISION_DICT as PT_PRECISION_DICT

@@ -227,18 +228,22 @@ def to_numpy_array(xx: None) -> None: ...


 def to_numpy_array(
-    xx: torch.Tensor | None,
+    xx: torch.Tensor | np.ndarray | float | None,
 ) -> np.ndarray | None:
     if xx is None:
         return None
-    assert xx is not None
+    if isinstance(xx, (float, int)):
+        return np.array(xx, dtype=GLOBAL_NP_FLOAT_PRECISION)
+    if isinstance(xx, np.ndarray):
+        return xx.astype(GLOBAL_NP_FLOAT_PRECISION)
     # Create a reverse mapping of PT_PRECISION_DICT
     reverse_precision_dict = {v: k for k, v in PT_PRECISION_DICT.items()}
     # Use the reverse mapping to find keys with the desired value
     prec = reverse_precision_dict.get(xx.dtype, None)
     prec = NP_PRECISION_DICT.get(prec, None)
     if prec is None:
         raise ValueError(f"unknown precision {xx.dtype}")
+    assert isinstance(xx, torch.Tensor)
     if xx.dtype == torch.bfloat16:
         # https://github.com/pytorch/pytorch/issues/109873
         xx = xx.float()
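The torch variant goes a step further than the Paddle one: numpy arrays are cast to the global precision and returned directly, and a final `assert isinstance(xx, torch.Tensor)` narrows the type before the existing bfloat16 workaround. For instance, assuming the default double-precision build:

```python
import numpy as np
import torch

from deepmd.pt.utils.utils import to_numpy_array

out = to_numpy_array(np.ones(3, dtype=np.float32))
assert out.dtype == np.float64  # assuming GLOBAL_NP_FLOAT_PRECISION is float64

# bfloat16 tensors still take the intermediate .float() cast, since numpy has
# no native bfloat16 (pytorch/pytorch#109873)
out = to_numpy_array(torch.ones(3, dtype=torch.bfloat16))
assert isinstance(out, np.ndarray)
```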
2 changes: 1 addition & 1 deletion deepmd/tf/fit/dipole.py
@@ -388,7 +388,7 @@ def get_loss(self, loss: dict, lr) -> Loss:
         ----------
         loss : dict
             the loss dict
-        lr : LearningRateExp
+        lr : LearningRateSchedule
             the learning rate

         Returns
2 changes: 1 addition & 1 deletion deepmd/tf/fit/dos.py
@@ -655,7 +655,7 @@ def get_loss(self, loss: dict, lr) -> Loss:
         ----------
         loss : dict
             the loss dict
-        lr : LearningRateExp
+        lr : LearningRateSchedule
             the learning rate

         Returns
2 changes: 1 addition & 1 deletion deepmd/tf/fit/ener.py
@@ -856,7 +856,7 @@ def get_loss(self, loss: dict, lr) -> Loss:
         ----------
         loss : dict
             The loss function parameters.
-        lr : LearningRateExp
+        lr : LearningRateSchedule
             The learning rate.

         Returns
2 changes: 1 addition & 1 deletion deepmd/tf/fit/fitting.py
@@ -73,7 +73,7 @@ def get_loss(self, loss: dict, lr) -> Loss:
         ----------
         loss : dict
             the loss dict
-        lr : LearningRateExp
+        lr : LearningRateSchedule
             the learning rate

         Returns
2 changes: 1 addition & 1 deletion deepmd/tf/fit/polar.py
@@ -863,7 +863,7 @@ def get_loss(self, loss: dict, lr) -> Loss:
         ----------
         loss : dict
             the loss dict
-        lr : LearningRateExp
+        lr : LearningRateSchedule
             the learning rate

         Returns
52 changes: 32 additions & 20 deletions deepmd/tf/train/trainer.py
@@ -4,6 +4,9 @@
 import os
 import shutil
 import time
+from typing import (
+    Any,
+)

 import google.protobuf.message
 import numpy as np
@@ -52,7 +55,7 @@
     load_graph_def,
 )
 from deepmd.tf.utils.learning_rate import (
-    LearningRateExp,
+    LearningRateSchedule,
 )
 from deepmd.tf.utils.sess import (
     run_sess,
@@ -100,21 +103,18 @@ def _init_param(self, jdata) -> None:
         self.model = Model(**model_param)
         self.fitting = self.model.get_fitting()

-        def get_lr_and_coef(lr_param):
+        def get_lr_and_coef(
+            lr_param: dict[str, Any],
+        ) -> tuple[LearningRateSchedule, float]:
             scale_by_worker = lr_param.get("scale_by_worker", "linear")
             if scale_by_worker == "linear":
                 scale_lr_coef = float(self.run_opt.world_size)
             elif scale_by_worker == "sqrt":
                 scale_lr_coef = np.sqrt(self.run_opt.world_size).real
             else:
                 scale_lr_coef = 1.0
-            lr_type = lr_param.get("type", "exp")
-            if lr_type == "exp":
-                lr = LearningRateExp(
-                    lr_param["start_lr"], lr_param["stop_lr"], lr_param["decay_steps"]
-                )
-            else:
-                raise RuntimeError("unknown learning_rate type " + lr_type)
+            lr_params = {k: v for k, v in lr_param.items() if k != "scale_by_worker"}
+            lr = LearningRateSchedule(lr_params)
             return lr, scale_lr_coef

         # learning rate
@@ -242,8 +242,13 @@ def build(self, data=None, stop_batch=0, origin_type_map=None, suffix="") -> None:
     def _build_lr(self) -> None:
         self._extra_train_ops = []
         self.global_step = tf.train.get_or_create_global_step()
-        self.learning_rate = self.lr.build(self.global_step, self.stop_batch)
-        log.info("built lr")
+        if self.stop_batch == 0:
+            # Use constant start_lr when stop_batch is zero (no training)
+            self.learning_rate = tf.cast(self.lr.start_lr(), GLOBAL_TF_FLOAT_PRECISION)
+            log.info("built lr (constant start_lr for stop_batch=0)")
+        else:
+            self.learning_rate = self.lr.build(self.global_step, self.stop_batch)
+            log.info("built lr")

     def _build_loss(self):
         if self.stop_batch == 0:
@@ -426,14 +431,21 @@ def train(self, train_data=None, valid_data=None) -> None:
         elapsed_batch = stop_batch - start_batch
         is_first_step = True
         self.cur_batch = cur_batch
-        log.info(
-            "start training at lr %.2e (== %.2e), decay_step %d, decay_rate %f, final lr will be %.2e",
-            run_sess(self.sess, self.learning_rate),
-            self.lr.value(cur_batch),
-            self.lr.decay_steps_,
-            self.lr.decay_rate_,
-            self.lr.value(stop_batch),
-        )
+        if stop_batch == 0:
+            lr0 = self.lr.start_lr()
+            log.info(
+                "start training at lr %.2e (== %.2e), final lr will be %.2e",
+                run_sess(self.sess, self.learning_rate),
+                lr0,
+                lr0,
+            )
+        else:
+            log.info(
+                "start training at lr %.2e (== %.2e), final lr will be %.2e",
+                run_sess(self.sess, self.learning_rate),
+                self.lr.value(cur_batch),
+                self.lr.value(stop_batch),
+            )

         prf_options = None
         prf_run_metadata = None
@@ -797,7 +809,7 @@ def _get_place_holders(self, data_dict) -> None:
                 prec = GLOBAL_ENER_FLOAT_PRECISION
             self.place_holders[kk] = tf.placeholder(prec, [None], name="t_" + kk)
             self.place_holders["find_" + kk] = tf.placeholder(
-                tf.float32, name="t_find_" + kk
+                GLOBAL_TF_FLOAT_PRECISION, name="t_find_" + kk
             )

     def _init_from_frz_model(self) -> None:
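With the `exp`-only branch gone, `get_lr_and_coef` forwards the whole learning-rate dict (minus `scale_by_worker`) to `LearningRateSchedule`, so new schedule types no longer require trainer changes. The worker-scaling coefficient itself is untouched; in isolation it behaves like this (a standalone sketch, not the PR's code):

```python
import numpy as np


def scale_lr_coef(world_size: int, scale_by_worker: str = "linear") -> float:
    """Standalone mirror of the coefficient logic in get_lr_and_coef."""
    if scale_by_worker == "linear":
        return float(world_size)  # lr multiplied by the number of workers
    elif scale_by_worker == "sqrt":
        return float(np.sqrt(world_size))  # softer scaling for many workers
    return 1.0  # any other value: no scaling


print(scale_lr_coef(4))          # 4.0
print(scale_lr_coef(4, "sqrt"))  # 2.0
```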
4 changes: 2 additions & 2 deletions deepmd/tf/utils/__init__.py
@@ -7,7 +7,7 @@
     DeepmdDataSystem,
 )
 from .learning_rate import (
-    LearningRateExp,
+    LearningRateSchedule,
 )
 from .pair_tab import (
     PairTab,
@@ -20,7 +20,7 @@
 __all__ = [
     "DeepmdData",
     "DeepmdDataSystem",
-    "LearningRateExp",
+    "LearningRateSchedule",
     "PairTab",
     "Plugin",
     "PluginVariant",