Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions tools/launcher/examples/Qwen/qwen3-v0339-demo/step1_synth.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
---
# EAGLE3 speculative decoding pipeline for Qwen3-v0339-demo — step 1 of 4.
#
# Launches a TRT-LLM server hosting the target model, then queries it to
# synthesize assistant responses. The resulting corpus is the raw training
# data for the EAGLE3 draft head built in the later steps.
#
# Pipeline order:
#   1. step1_synth.yaml      — synthetic data generation  <-- this file
#   2. step2_hidden.yaml     — hidden-state dump
#   3. step3_train.yaml      — offline training
#   4. step4_speed_eval.yaml — SPEED Eval
#
# Run:
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/qwen3-v0339-demo/step1_synth.yaml --yes
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/qwen3-v0339-demo/step1_synth.yaml --dry-run

job_name: qwen3-v0339-demo_EAGLE3_step1_synth

pipeline:
  allow_to_fail: false
  skip: false
  note: "Step 1: Data synthesis via TRT-LLM server"

global_vars:
  # Local HF checkpoint of the target model; interpolated via <<...>> below.
  hf_model: /hf-local/Qwen/qwen3-v0339-demo

task_0:
  script: common/tensorrt_llm/query.sh
  args:
    # Server-side flags (before "--"): model placement and serving limits.
    - --model <<global_vars.hf_model>>
    - --tp_size 8
    - --ep_size 8
    - --max_num_tokens 32000
    - --port 8000
    - --host 0.0.0.0
    - --trust_remote_code
    - --
    # Client-side flags (after "--"): prompt source and output destination.
    - --data /hf-local/modelopt/Speculative-Decoding-Prompt-Samples
    - --save /scratchspace/data
  environment:
    - HF_LOCAL: /hf-local
  slurm_config:
    _factory_: "slurm_factory"
    nodes: 1
    ntasks_per_node: 8
    gpus_per_node: 8
    # Quoted: the image tag contains a colon.
    container: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0"
45 changes: 45 additions & 0 deletions tools/launcher/examples/Qwen/qwen3-v0339-demo/step2_hidden.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
---
# EAGLE3 speculative decoding pipeline for Qwen3-v0339-demo — step 2 of 4.
#
# Runs the target model over the synthetic conversations produced by step 1
# and dumps its hidden states; those tensors are the training inputs for the
# EAGLE3 draft head (step 3).
#
# Pipeline order:
#   1. step1_synth.yaml      — synthetic data generation
#   2. step2_hidden.yaml     — hidden-state dump  <-- this file
#   3. step3_train.yaml      — offline training
#   4. step4_speed_eval.yaml — SPEED Eval
#
# Prerequisite: step1_synth.yaml artifacts under /scratchspace/data.
#
# Run:
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/qwen3-v0339-demo/step2_hidden.yaml --yes
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/qwen3-v0339-demo/step2_hidden.yaml --dry-run

job_name: qwen3-v0339-demo_EAGLE3_step2_hidden

pipeline:
  allow_to_fail: false
  skip: false
  note: "Step 2: Dump hidden states from target model"

global_vars:
  # Local HF checkpoint of the target model; interpolated via <<...>> below.
  hf_model: /hf-local/Qwen/qwen3-v0339-demo

task_0:
  script: common/eagle3/dump_offline_data.sh
  args:
    - --input-data /scratchspace/data
    - --output-dir /scratchspace/offline_hidden_states
    - --max-seq-len 8192
    - --tp 8
    - --moe-ep 8
  environment:
    # The dump script reads the target checkpoint from this variable.
    - HF_MODEL_CKPT: <<global_vars.hf_model>>
  slurm_config:
    _factory_: "slurm_factory"
    nodes: 1
    ntasks_per_node: 8
    gpus_per_node: 8
    # Quoted: the image tag contains a colon.
    container: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0"
44 changes: 44 additions & 0 deletions tools/launcher/examples/Qwen/qwen3-v0339-demo/step3_train.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
---
# EAGLE3 speculative decoding pipeline for Qwen3-v0339-demo — step 3 of 4.
#
# Trains the EAGLE3 draft head on the hidden states captured in step 2,
# producing the lightweight draft model used for speculative decoding.
#
# Pipeline order:
#   1. step1_synth.yaml      — synthetic data generation
#   2. step2_hidden.yaml     — hidden-state dump
#   3. step3_train.yaml      — offline training  <-- this file
#   4. step4_speed_eval.yaml — SPEED Eval
#
# Prerequisite: step2_hidden.yaml artifacts under
# /scratchspace/offline_hidden_states.
#
# Run:
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/qwen3-v0339-demo/step3_train.yaml --yes
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/qwen3-v0339-demo/step3_train.yaml --dry-run

job_name: qwen3-v0339-demo_EAGLE3_step3_train

pipeline:
  allow_to_fail: false
  skip: false
  note: "Step 3: Train EAGLE3 draft head (offline)"

global_vars:
  # Local HF checkpoint of the target model; interpolated via <<...>> below.
  hf_model: /hf-local/Qwen/qwen3-v0339-demo

task_0:
  script: common/eagle3/train_eagle.sh
  args:
    # Base recipe config, followed by dotted key=value overrides.
    - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
    - model.model_name_or_path=<<global_vars.hf_model>>
    - data.offline_data_path=/scratchspace/offline_hidden_states
    - training.output_dir=/scratchspace/eagle3
    - training.training_seq_len=4096
    - training.disable_tqdm=true
    - training.ar_validate_steps=500000
  slurm_config:
    _factory_: "slurm_factory"
    nodes: 1
    # Single launcher task; the training script manages all 8 GPUs itself.
    ntasks_per_node: 1
    gpus_per_node: 8
    # Quoted: the image tag contains a colon.
    container: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0"
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
---
# EAGLE3 speculative decoding pipeline for Qwen3-v0339-demo — step 4 of 4.
#
# Benchmarks the speculative-decoding speedup of the draft head trained in
# step 3, running the EAGLE3 algorithm over MT-Bench prompts on the vLLM
# backend.
#
# Pipeline order:
#   1. step1_synth.yaml      — synthetic data generation
#   2. step2_hidden.yaml     — hidden-state dump
#   3. step3_train.yaml      — offline training
#   4. step4_speed_eval.yaml — SPEED Eval  <-- this file
#
# Prerequisite: step3_train.yaml artifacts under /scratchspace/eagle3.
#
# Run:
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/qwen3-v0339-demo/step4_speed_eval.yaml --yes
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/qwen3-v0339-demo/step4_speed_eval.yaml --dry-run

job_name: qwen3-v0339-demo_EAGLE3_step4_speed_eval

pipeline:
  allow_to_fail: false
  skip: false
  note: "Step 4: Benchmark speculative decoding (VLLM backend)"

global_vars:
  # Local HF checkpoint of the target model; interpolated via <<...>> below.
  hf_model: /hf-local/Qwen/qwen3-v0339-demo

task_0:
  script: common/specdec_bench/quick_check.sh
  args:
    # NOTE(review): step3 writes its output to /scratchspace/eagle3, but this
    # points at /scratchspace/export — confirm whether the training/quick-check
    # flow produces an export directory, or whether this path should be
    # /scratchspace/eagle3.
    - --draft_model_dir /scratchspace/export
    - --draft_length 3
    - --output_length 4096
    - --engine VLLM
    - --tp_size 8
    - --ep_size 1
    - --speculative_algorithm EAGLE3
    - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
    - --concurrency 1
  environment:
    # The benchmark script reads the target checkpoint from this variable.
    - HF_MODEL_CKPT: <<global_vars.hf_model>>
  slurm_config:
    _factory_: "slurm_factory"
    nodes: 1
    # Single launcher task; the eval harness manages all 8 GPUs itself.
    ntasks_per_node: 1
    gpus_per_node: 8
    # Quoted: the image tag contains a colon.
    container: "vllm/vllm-openai:latest"
Loading