Commit b4d202c
Authored by Patrick Miles (PatrickRMiles) and michaelmckinsey1
Warmup changes: only warm a few batches; extract to separate method in trainer class (#43)

* apply optimizer every batch, not every epoch; unscale gradients before clipping
* trainer tweaks
* apply optimizer every batch, not every epoch; unscale gradients before clipping
* extract warmup to separate method; switch to warming up set number of batches (user configurable)
* whitespace; num_workers revert
* ruff
* make parallelstrategy, spatial_mesh, ddp_placements attrs of trainer; other small tweaks
* remove deprecated config attrs
* ruff
* get device mesh from ps class attr
* ruff
* missing self. on some ps accesses
* Fix imports and missing self.ps
* rm legacy warmup_epochs
* Move attributes to base class for clarity
* remove warmup_epochs -- not useful to keep support for this
* call cleanup_or_resume trainer method directly
* rm unused vars

Co-authored-by: Patrick Miles <miles30@tioga.llnl.gov>
Co-authored-by: Michael McKinsey <michaelmckinsey1@gmail.com>
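The core change in the bullets above is warming up a fixed, user-configurable number of batches instead of whole epochs, with the logic pulled out into its own trainer method. A minimal sketch of that shape (class, method, and attribute names here are illustrative assumptions, not the actual ScaFFold trainer code):

```python
import itertools


class Trainer:
    """Hypothetical trainer sketch with batch-count-based warmup."""

    def __init__(self, warmup_batches=5):
        # Mirrors the new `warmup_batches` config knob (per-rank batch count).
        self.warmup_batches = warmup_batches

    def _run_batch(self, batch):
        # Stand-in for a real forward/backward/optimizer step.
        return sum(batch)

    def warmup(self, loader):
        """Run a fixed number of batches before timed training; return the count."""
        if not self.warmup_batches:
            return 0
        count = 0
        # islice stops after warmup_batches even if the loader has more data.
        for batch in itertools.islice(loader, self.warmup_batches):
            self._run_batch(batch)
            count += 1
        return count


loader = iter([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
trainer = Trainer(warmup_batches=5)
print(trainer.warmup(loader))  # 5 -- warms exactly 5 batches, not the full loader
```

Counting batches rather than epochs keeps warmup cost fixed as dataset size grows, which matters for a benchmark harness.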
Parent: 875f2fd

6 files changed: 174 additions, 172 deletions

File tree

ScaFFold/cli.py (5 additions, 0 deletions)

@@ -140,6 +140,11 @@ def main():
     benchmark_parser.add_argument(
         "--batch-size", type=int, nargs="+", help="Batch sizes for each volume size."
     )
+    benchmark_parser.add_argument(
+        "--warmup-batches",
+        type=int,
+        help="Number of warmup batches to run per rank before training.",
+    )
     benchmark_parser.add_argument(
         "--optimizer",
         type=str,
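The new flag is a plain optional integer argument, so it defaults to `None` when omitted. A self-contained sketch of that parsing behavior (the parser below is a stand-in for illustration, not the real ScaFFold CLI):

```python
import argparse

# Illustrative parser; ScaFFold builds its own subcommand parsers.
parser = argparse.ArgumentParser(prog="scaffold-benchmark")
parser.add_argument(
    "--warmup-batches",
    type=int,
    help="Number of warmup batches to run per rank before training.",
)

args = parser.parse_args(["--warmup-batches", "5"])
print(args.warmup_batches)  # 5

omitted = parser.parse_args([])
print(omitted.warmup_batches)  # None when the flag is not given
```

The `None` default lets downstream code distinguish "not requested" from an explicit count of zero.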

ScaFFold/configs/benchmark_default.yml (2 additions, 2 deletions)

@@ -29,6 +29,6 @@ framework: "torch" # The DL framework to train with. Only valid
 checkpoint_dir: "checkpoints" # Subfolder in which to save training checkpoints.
 loss_freq: 1 # Number of epochs between logging the overall loss.
 normalize: 1 # Cateogry search normalization parameter
-warmup_epochs: 1 # How many warmup epochs before training
+warmup_batches: 5 # How many warmup batches per rank to run before training.
 dataset_reuse_enforce_commit_id: 0 # Enforce matching commit IDs for dataset reuse.
-target_dice: 0.95
+target_dice: 0.95

ScaFFold/configs/benchmark_testing.yml (1 addition, 1 deletion)

@@ -29,6 +29,6 @@ framework: "torch" # The DL framework to train with. Only valid
 checkpoint_dir: "checkpoints" # Subfolder in which to save training checkpoints.
 loss_freq: 1 # Number of epochs between logging the overall loss.
 normalize: 1 # Cateogry search normalization parameter
-warmup_epochs: 1 # How many warmup epochs before training
+warmup_batches: 5 # How many warmup batches per rank to run before training.
 dataset_reuse_enforce_commit_id: 0 # Enforce matching commit IDs for dataset reuse.
 target_dice: 0.95

ScaFFold/utils/config_utils.py (1 addition, 1 deletion)

@@ -66,7 +66,7 @@ def __init__(self, config_dict):
         self.loss_freq = config_dict["loss_freq"]
         self.checkpoint_dir = config_dict["checkpoint_dir"]
         self.normalize = config_dict["normalize"]
-        self.warmup_epochs = config_dict["warmup_epochs"]
+        self.warmup_batches = config_dict.get("warmup_batches")
         self.dataset_reuse_enforce_commit_id = config_dict[
             "dataset_reuse_enforce_commit_id"
         ]
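Note the switch from bracket access to `config_dict.get(...)`: older config files that still lack a `warmup_batches` key now load with `None` instead of raising `KeyError`. A quick standalone illustration of the difference:

```python
config_dict = {"loss_freq": 1}  # an old-style config without the new key

# Bracket access raises for a missing key:
try:
    config_dict["warmup_batches"]
    raised = False
except KeyError:
    raised = True
print(raised)  # True

# .get() returns None instead, so older configs keep loading:
print(config_dict.get("warmup_batches"))  # None

# .get() also accepts an explicit fallback:
print(config_dict.get("warmup_batches", 5))  # 5
```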
