examples/speculative_decoding/eagle_utils.py (1 addition, 1 deletion)
@@ -154,7 +154,7 @@ def make_eagle_supervised_data_module(
     assert not data_args.vlm_processor, "Offline data is not supported for VLM."

     offline_data_path = Path(data_args.offline_data_path)
-    dumped_files = [str(p) for p in offline_data_path.glob("*.pt")]
+    dumped_files = [str(p) for p in offline_data_path.rglob("*.pt")]
     if not dumped_files:
         raise ValueError(f"No .pt files found in {data_args.offline_data_path}")

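The change above swaps Path.glob for Path.rglob, so offline .pt dumps in nested subdirectories are now picked up in addition to top-level ones. A minimal illustration of the difference, using a hypothetical directory layout:

from pathlib import Path

root = Path("offline_data")
(root / "shard_0").mkdir(parents=True, exist_ok=True)
(root / "shard_0" / "batch_0.pt").touch()

print([str(p) for p in root.glob("*.pt")])   # [] -- glob stays at the top level
print([str(p) for p in root.rglob("*.pt")])  # ['offline_data/shard_0/batch_0.pt']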
examples/speculative_decoding/launch_train.sh (22 additions, 4 deletions)
@@ -114,6 +114,14 @@ while [ $# -gt 0 ]; do
if [[ "$1" != *=* ]]; then shift; fi
MIX_HIDDEN_STATES="${1#*=}"
;;
--use_fake_base_for_offline*)
if [[ "$1" != *=* ]]; then shift; fi
USE_FAKE_BASE_FOR_OFFLINE="${1#*=}"
;;
--trust_remote_code*)
if [[ "$1" != *=* ]]; then shift; fi
TRUST_REMOTE_CODE="${1#*=}"
;;
*)
>&2 printf "Error: Invalid argument ${1#*=}\n"
exit 1
@@ -126,9 +134,16 @@ set -x

 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
 NUM_NODES=${NUM_NODES:-1}
-GPU_PER_NODE=${GPU_PER_NODE:-$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)}
-TOTAL_GPU=$((NUM_NODES * GPU_PER_NODE))
-echo "Total GPUs: $TOTAL_GPU (NUM_NODES: $NUM_NODES, GPU_PER_NODE: $GPU_PER_NODE)"
+if [[ "$NUM_NODES" != 1 ]]; then
+    # Multi-node training
+    GPU_PER_NODE=${GPU_PER_NODE:-$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)}
+    TOTAL_GPU=$((NUM_NODES * GPU_PER_NODE))
+    echo "Total GPUs: $TOTAL_GPU (NUM_NODES: $NUM_NODES, GPU_PER_NODE: $GPU_PER_NODE)"
+else
+    # Single-node training; GPUs can be selected via $CUDA_VISIBLE_DEVICES
+    TOTAL_GPU=$(python -c "import torch; print(torch.cuda.device_count())")
+    echo "Total GPUs: $TOTAL_GPU (Single Node Training)"
+fi
 # Calculate save_steps
 DEFAULT_SAVE_STEPS=$((8192 / TOTAL_GPU))

@@ -154,7 +169,8 @@ DP_SHARD_SIZE=${DP_SHARD_SIZE:-$((TOTAL_GPU/CP_SIZE))}
 LOG_STEPS=${LOG_STEPS:-100}
 DRAFT_VOCAB_CACHE=${DRAFT_VOCAB_CACHE:-""}
 MIX_HIDDEN_STATES=${MIX_HIDDEN_STATES:-"False"}
-
+USE_FAKE_BASE_FOR_OFFLINE=${USE_FAKE_BASE_FOR_OFFLINE:-"False"}
+TRUST_REMOTE_CODE=${TRUST_REMOTE_CODE:-"False"}

if [[ "$MODE" == "eagle3" ]]; then
if [[ -n "$EAGLE_CONFIG" ]]; then
@@ -240,6 +256,8 @@ CMD="accelerate launch $MULTI_NODE_ARGS --mixed_precision bf16 ${SCRIPT_DIR}/mai
     --estimate_ar $ESTIMATE_AR \
     --ar_validate_steps $AR_VALIDATE_STEPS \
     --mix_hidden_states $MIX_HIDDEN_STATES \
+    --use_fake_base_for_offline $USE_FAKE_BASE_FOR_OFFLINE \
+    --trust_remote_code $TRUST_REMOTE_CODE \
     $DRAFT_VOCAB_CACHE_ARGS \
     $VLM_ARGS \
     $OFFLINE_TRAINING_ARGS \
examples/speculative_decoding/main.py (17 additions, 16 deletions)
@@ -47,10 +47,7 @@

 import modelopt.torch.opt as mto
 import modelopt.torch.speculative as mtsp
-from modelopt.torch.speculative.utils import (
-    load_vlm_or_llm_with_kwargs,
-    patch_transformers5_params_loading,
-)
+from modelopt.torch.speculative.utils import load_vlm_or_llm, patch_transformers5_params_loading
 from modelopt.torch.utils import print_rank_0

 torch.manual_seed(0)
@@ -60,6 +57,12 @@
 @dataclass
 class ModelArguments:
     model_name_or_path: str | None = field(default="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+    use_fake_base_for_offline: bool = field(
+        default=False, metadata={"help": "Whether to use a fake base model for offline training."}
+    )
+    trust_remote_code: bool = field(
+        default=False, metadata={"help": "Whether to trust remote code."}
+    )


 @dataclass
@@ -169,29 +172,27 @@ def train():

     if checkpoint:
         with patch_transformers5_params_loading():
-            _, model = load_vlm_or_llm_with_kwargs(
-                checkpoint, torch_dtype="auto", trust_remote_code=True
+            model = load_vlm_or_llm(
+                checkpoint, torch_dtype="auto", trust_remote_code=model_args.trust_remote_code
             )
-        tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
+        tokenizer = transformers.AutoTokenizer.from_pretrained(
+            checkpoint, trust_remote_code=model_args.trust_remote_code
+        )
     else:
         # To avoid OOM for large models, we load and convert model on CPU first.
         # Model will be moved to GPU during HF trainer.init().
-        offline_kwargs = {"num_hidden_layers": 0} if use_offline_training else {}
-        model_config, model = load_vlm_or_llm_with_kwargs(
+        model = load_vlm_or_llm(
             model_args.model_name_or_path,
+            use_fake_base=model_args.use_fake_base_for_offline,
+            use_offline_training=use_offline_training,
             torch_dtype="auto",
             device_map="cpu",
-            trust_remote_code=True,
-            **offline_kwargs,
+            trust_remote_code=model_args.trust_remote_code,
         )
-        if use_offline_training:
-            # When doing offline training, we need to set num_hidden_layers
-            # since we override it when loading the model for space savings
-            model.config.num_orig_hidden_layers = model_config.num_hidden_layers
         tokenizer = transformers.AutoTokenizer.from_pretrained(
             model_args.model_name_or_path,
             model_max_length=training_args.training_seq_len,
-            trust_remote_code=True,
+            trust_remote_code=model_args.trust_remote_code,
         )
     if training_args.mode == "medusa":
         config = {
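Taken together, the call sites above reduce to the following loading pattern. This is a sketch built only from the arguments visible in this diff (the model name is the ModelArguments default); it is not an authoritative description of load_vlm_or_llm:

from modelopt.torch.speculative.utils import load_vlm_or_llm

# Online training: load the full base model on CPU first to avoid OOM;
# the HF trainer moves it to GPU later.
model = load_vlm_or_llm(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    torch_dtype="auto",
    device_map="cpu",
    trust_remote_code=False,
)

# Offline training with --use_fake_base_for_offline True: only embed_tokens
# and lm_head are materialized (see modeling_fakebase.py below).
model = load_vlm_or_llm(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    use_fake_base=True,
    use_offline_training=True,
    torch_dtype="auto",
    device_map="cpu",
    trust_remote_code=False,
)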
examples/speculative_decoding/scripts/ar_validate.py (2 additions, 2 deletions)
@@ -22,7 +22,7 @@

 import modelopt.torch.opt as mto
 from modelopt.torch.speculative.plugins.transformers import HFARValidation
-from modelopt.torch.speculative.utils import load_vlm_or_llm_with_kwargs
+from modelopt.torch.speculative.utils import load_vlm_or_llm

 mto.enable_huggingface_checkpointing()

@@ -72,7 +72,7 @@ def main():

     accelerator = Accelerator()
     # Load model and tokenizer
-    _, model = load_vlm_or_llm_with_kwargs(args.model_path, device_map="auto")
+    model = load_vlm_or_llm(args.model_path, device_map="auto")
     tokenizer = AutoTokenizer.from_pretrained(args.model_path)
     model.eval()
     model = accelerator.prepare(model)
(file path not shown in source) (2 additions, 2 deletions)
@@ -21,7 +21,7 @@

 import modelopt.torch.opt as mto
 from modelopt.torch.export import export_speculative_decoding
-from modelopt.torch.speculative.utils import load_vlm_or_llm_with_kwargs
+from modelopt.torch.speculative.utils import load_vlm_or_llm


 def parse_args():
@@ -38,7 +38,7 @@ def parse_args():
 mto.enable_huggingface_checkpointing()

 args = parse_args()
-_, model = load_vlm_or_llm_with_kwargs(args.model_path, torch_dtype="auto")
+model = load_vlm_or_llm(args.model_path, torch_dtype="auto")
 model.eval()
 with torch.inference_mode():
     export_speculative_decoding(
modelopt/torch/speculative/plugins/modeling_fakebase.py (new file, 187 additions)
@@ -0,0 +1,187 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Lightweight fake base model for offline speculative decoding training."""

import json
import os

import torch
import torch.nn as nn
import transformers
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file as safetensors_load_file
from transformers import PretrainedConfig, PreTrainedModel

# Candidate module paths searched in order; shared with HFEagleModel._find_base_model_parts
_EMBED_TOKENS_PATHS = [
    "embed_tokens",
    "language_model.model.embed_tokens",
    "model.embed_tokens",
    "backbone.embeddings",
    "language_model.backbone.embeddings",
    "model.language_model.embed_tokens",
]
_LM_HEAD_PATHS = ["lm_head", "language_model.lm_head"]
_BASE_MODEL_PATHS = [
    "language_model.model",
    "model.language_model",
    "model",
    "backbone",
    "language_model.backbone",
]
_VLM_CONFIG_ATTRS = ["text_config", "llm_config"]
_SAFETENSORS_INDEX_FILENAME = "model.safetensors.index.json"


class FakeBaseConfig(PretrainedConfig):
    """Minimal config for FakeBaseModel that supports offline speculative decoding training."""

    model_type = "fake_base_model"

    def __init__(
        self,
        num_hidden_layers=None,
        hidden_size=None,
        vocab_size=None,
        max_position_embeddings=None,
        dtype=torch.bfloat16,
        tie_word_embeddings=False,
        **kwargs,
    ):
        """Initialize FakeBaseConfig with minimal model configuration parameters."""
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
        self.num_hidden_layers = num_hidden_layers
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.dtype = dtype


class FakeBaseModel(PreTrainedModel):
    """Minimal base model for offline speculative decoding.

    Contains only ``lm_head``, ``embed_tokens``, and the minimal config needed by the EAGLE
    training loop. The full model weights are never loaded, keeping memory usage low.

    Weights are loaded from a local HuggingFace checkpoint directory or a HuggingFace Hub
    repo. Weight key names and VLM config nesting are auto-detected from the shared path
    constants.
    """

    config_class = FakeBaseConfig

    def __init__(self, source: str, trust_remote_code: bool = False):
        """Load lm_head and embed_tokens from a local directory or HuggingFace Hub repo.

        Args:
            source: Path to a local HuggingFace checkpoint directory, or a HuggingFace Hub
                repo ID (e.g. ``"meta-llama/Llama-3.1-8B"``). The source type is detected
                automatically: if ``source`` is an existing local directory it is treated as
                a local checkpoint; otherwise it is treated as a Hub repo ID and the required
                files are downloaded via ``huggingface_hub``.
        """
        orig_config = transformers.AutoConfig.from_pretrained(
            source, trust_remote_code=trust_remote_code
        )
        # For VLMs, detect the language model config based on _VLM_CONFIG_ATTRS
        base_cfg = next(
            (
                getattr(orig_config, attr)
                for attr in _VLM_CONFIG_ATTRS
                if getattr(orig_config, attr, None) is not None
            ),
            orig_config,
        )
        # Extract the info needed for speculative training from the base config
        config = FakeBaseConfig(
            num_hidden_layers=getattr(base_cfg, "num_hidden_layers", None),
            hidden_size=getattr(base_cfg, "hidden_size", None),
            vocab_size=getattr(base_cfg, "vocab_size", None),
            max_position_embeddings=getattr(base_cfg, "max_position_embeddings", None),
            dtype=getattr(base_cfg, "dtype", torch.bfloat16),
            tie_word_embeddings=getattr(base_cfg, "tie_word_embeddings", False),
        )
        super().__init__(config)
        # Initialize dummy modules and attributes for compatibility with HFEagleModel
        self.model = nn.Module()
        self.model.layers = nn.ModuleList()
        self.model.dtype = config.dtype
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Load only lm_head and embed_tokens from the checkpoint
        lm_head_w, embed_tokens_w = self._load_weights(source)
        assert lm_head_w.shape == (config.vocab_size, config.hidden_size)
        assert embed_tokens_w.shape == (config.vocab_size, config.hidden_size)
        self.lm_head.weight.data.copy_(lm_head_w)
        self.embed_tokens.weight.data.copy_(embed_tokens_w)

[Review comment, Collaborator] The local vs remote loading paths duplicate ~20 lines of nearly identical code (index loading, key lookup, shard loading). Consider extracting a helper that resolves file paths first, then has a single loading path.
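One possible shape for that refactor, sketched with the names already in this file (_resolve_file is a hypothetical helper, not part of the PR):

def _resolve_file(self, source: str, filename: str) -> str:
    """Return a local path for filename, downloading it when source is a Hub repo ID."""
    if os.path.isdir(source):
        path = os.path.join(source, filename)
        if not os.path.isfile(path):
            raise FileNotFoundError(f"No {filename} found in {source!r}.")
        return path
    return hf_hub_download(repo_id=source, filename=filename)  # cached locally

def _load_weights(self, source: str):
    with open(self._resolve_file(source, _SAFETENSORS_INDEX_FILENAME)) as f:
        weight_map = json.load(f).get("weight_map", {})
    lm_head_key = self._find_weight_key(weight_map, _LM_HEAD_PATHS, "lm_head")
    embed_tokens_key = self._find_weight_key(weight_map, _EMBED_TOKENS_PATHS, "embed_tokens")
    lm_head_state = safetensors_load_file(
        self._resolve_file(source, weight_map[lm_head_key]), device="cpu"
    )
    embed_tokens_state = safetensors_load_file(
        self._resolve_file(source, weight_map[embed_tokens_key]), device="cpu"
    )
    return lm_head_state[lm_head_key], embed_tokens_state[embed_tokens_key]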

    @staticmethod
    def _find_weight_key(weight_map: dict, paths: list[str], label: str) -> str:
        """Return the first ``path + '.weight'`` found in ``weight_map``."""
        for path in paths:
            key = path + ".weight"
            if key in weight_map:
                return key
        tried = [p + ".weight" for p in paths]
        raise RuntimeError(f"Cannot find {label} in checkpoint; tried: {tried}")

[Review comment, Collaborator] Only safetensors checkpoints are supported (model.safetensors.index.json). Checkpoints using pytorch_model.bin.index.json or single-file formats will fail with a confusing FileNotFoundError. Either support them or raise a clear error stating safetensors is required.
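A sketch of the clearer error the comment asks for, in the local-directory branch of _load_weights (pytorch_model.bin.index.json is the name given in the comment; model.safetensors is the standard single-file name):

if not os.path.isfile(index_path):
    # Distinguish "unsupported format" from "nothing here" so the error is actionable.
    has_other_format = os.path.isfile(
        os.path.join(source, "pytorch_model.bin.index.json")
    ) or os.path.isfile(os.path.join(source, "model.safetensors"))
    if has_other_format:
        raise ValueError(
            f"FakeBaseModel requires a sharded safetensors checkpoint with "
            f"{_SAFETENSORS_INDEX_FILENAME}; the checkpoint in {source!r} is in an "
            f"unsupported format."
        )
    raise FileNotFoundError(f"No {_SAFETENSORS_INDEX_FILENAME} found in {source!r}.")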

    def _load_weights(self, source: str):
        """Load lm_head and embed_tokens weights from a local directory or HuggingFace Hub repo.

        For remote repos the index file and the two required weight shards are downloaded via
        ``huggingface_hub`` and cached locally; subsequent calls reuse the cache.
        """
        if os.path.isdir(source):
            index_path = os.path.join(source, _SAFETENSORS_INDEX_FILENAME)
            if not os.path.isfile(index_path):
                raise FileNotFoundError(f"No {_SAFETENSORS_INDEX_FILENAME} found in {source!r}.")
            with open(index_path) as f:
                weight_map = json.load(f).get("weight_map", {})

            lm_head_key = self._find_weight_key(weight_map, _LM_HEAD_PATHS, "lm_head")
            embed_tokens_key = self._find_weight_key(
                weight_map, _EMBED_TOKENS_PATHS, "embed_tokens"
            )

            lm_head_state = safetensors_load_file(
                os.path.join(source, weight_map[lm_head_key]), device="cpu"
            )
            embed_tokens_state = safetensors_load_file(
                os.path.join(source, weight_map[embed_tokens_key]), device="cpu"
            )
        else:
            # Treat source as a HuggingFace Hub repo ID
            index_path = hf_hub_download(repo_id=source, filename=_SAFETENSORS_INDEX_FILENAME)
            with open(index_path) as f:
                weight_map = json.load(f).get("weight_map", {})

            lm_head_key = self._find_weight_key(weight_map, _LM_HEAD_PATHS, "lm_head")
            embed_tokens_key = self._find_weight_key(
                weight_map, _EMBED_TOKENS_PATHS, "embed_tokens"
            )

            lm_head_shard = hf_hub_download(repo_id=source, filename=weight_map[lm_head_key])
            embed_tokens_shard = hf_hub_download(
                repo_id=source, filename=weight_map[embed_tokens_key]
            )
            lm_head_state = safetensors_load_file(lm_head_shard, device="cpu")
            embed_tokens_state = safetensors_load_file(embed_tokens_shard, device="cpu")

        return lm_head_state[lm_head_key], embed_tokens_state[embed_tokens_key]

    def forward(self, *args, **kwargs):
        """Not implemented: FakeBaseModel omits full model weights and cannot run inference."""
        raise NotImplementedError("FakeBaseModel forward is not implemented.")
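A hypothetical usage sketch based on the __init__ docstring above (the repo ID is the docstring's own example and requires access to that gated repo):

import torch

fake_base = FakeBaseModel("meta-llama/Llama-3.1-8B")
# Only embed_tokens and lm_head hold real weights; model.layers is an empty ModuleList.
hidden = fake_base.embed_tokens(torch.tensor([[1, 2, 3]]))
logits = fake_base.lm_head(hidden)
print(logits.shape)  # torch.Size([1, 3, vocab_size])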