Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions tools/launcher/examples/Qwen/qwen3-v0339-demo/step1_synth.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
---
# EAGLE3 speculative decoding pipeline for Qwen3-v0339-demo — step 1 of 4.
#
# Launches a TRT-LLM server hosting the target model, then queries it to
# synthesize assistant responses. The resulting corpus is the raw training
# data for the EAGLE3 draft head built in the later steps.
#
# Pipeline order:
#   1. step1_synth.yaml      — synthetic data generation  <-- this file
#   2. step2_hidden.yaml     — hidden-state dump
#   3. step3_train.yaml      — offline training
#   4. step4_speed_eval.yaml — SPEED Eval
#
# Run:
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/qwen3-v0339-demo/step1_synth.yaml --yes
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/qwen3-v0339-demo/step1_synth.yaml --dry-run

job_name: qwen3-v0339-demo_EAGLE3_step1_synth

pipeline:
  allow_to_fail: false
  skip: false
  note: "Step 1: Data synthesis via TRT-LLM server"

global_vars:
  # Local HF checkpoint of the target model; interpolated via <<...>> below.
  hf_model: /hf-local/Qwen/qwen3-v0339-demo

task_0:
  script: common/tensorrt_llm/query.sh
  args:
    # Server-side flags (before "--"): model placement and serving limits.
    - --model <<global_vars.hf_model>>
    - --tp_size 8
    - --ep_size 8
    - --max_num_tokens 32000
    - --port 8000
    - --host 0.0.0.0
    - --trust_remote_code
    - --
    # Client-side flags (after "--"): prompt source and output destination.
    - --data /hf-local/modelopt/Speculative-Decoding-Prompt-Samples
    - --save /scratchspace/data
  environment:
    - HF_LOCAL: /hf-local
  slurm_config:
    _factory_: "slurm_factory"
    nodes: 1
    ntasks_per_node: 8
    gpus_per_node: 8
    # Quoted: the image tag contains a colon.
    container: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0"
45 changes: 45 additions & 0 deletions tools/launcher/examples/Qwen/qwen3-v0339-demo/step2_hidden.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
---
# EAGLE3 speculative decoding pipeline for Qwen3-v0339-demo — step 2 of 4.
#
# Runs the target model over the synthetic conversations produced by step 1
# and dumps its hidden states; those tensors are the training inputs for the
# EAGLE3 draft head (step 3).
#
# Pipeline order:
#   1. step1_synth.yaml      — synthetic data generation
#   2. step2_hidden.yaml     — hidden-state dump  <-- this file
#   3. step3_train.yaml      — offline training
#   4. step4_speed_eval.yaml — SPEED Eval
#
# Prerequisite: step1_synth.yaml artifacts under /scratchspace/data.
#
# Run:
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/qwen3-v0339-demo/step2_hidden.yaml --yes
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/qwen3-v0339-demo/step2_hidden.yaml --dry-run

job_name: qwen3-v0339-demo_EAGLE3_step2_hidden

pipeline:
  allow_to_fail: false
  skip: false
  note: "Step 2: Dump hidden states from target model"

global_vars:
  # Local HF checkpoint of the target model; interpolated via <<...>> below.
  hf_model: /hf-local/Qwen/qwen3-v0339-demo

task_0:
  script: common/eagle3/dump_offline_data.sh
  args:
    - --input-data /scratchspace/data
    - --output-dir /scratchspace/offline_hidden_states
    - --max-seq-len 8192
    - --tp 8
    - --moe-ep 8
  environment:
    # The dump script reads the target checkpoint from this variable.
    - HF_MODEL_CKPT: <<global_vars.hf_model>>
  slurm_config:
    _factory_: "slurm_factory"
    nodes: 1
    ntasks_per_node: 8
    gpus_per_node: 8
    # Quoted: the image tag contains a colon.
    container: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0"
44 changes: 44 additions & 0 deletions tools/launcher/examples/Qwen/qwen3-v0339-demo/step3_train.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
---
# EAGLE3 speculative decoding pipeline for Qwen3-v0339-demo — step 3 of 4.
#
# Trains the EAGLE3 draft head on the hidden states captured in step 2,
# producing the lightweight draft model used for speculative decoding.
#
# Pipeline order:
#   1. step1_synth.yaml      — synthetic data generation
#   2. step2_hidden.yaml     — hidden-state dump
#   3. step3_train.yaml      — offline training  <-- this file
#   4. step4_speed_eval.yaml — SPEED Eval
#
# Prerequisite: step2_hidden.yaml artifacts under
# /scratchspace/offline_hidden_states.
#
# Run:
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/qwen3-v0339-demo/step3_train.yaml --yes
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/qwen3-v0339-demo/step3_train.yaml --dry-run

job_name: qwen3-v0339-demo_EAGLE3_step3_train

pipeline:
  allow_to_fail: false
  skip: false
  note: "Step 3: Train EAGLE3 draft head (offline)"

global_vars:
  # Local HF checkpoint of the target model; interpolated via <<...>> below.
  hf_model: /hf-local/Qwen/qwen3-v0339-demo

task_0:
  script: common/eagle3/train_eagle.sh
  args:
    # Base recipe config, followed by dotted key=value overrides.
    - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
    - model.model_name_or_path=<<global_vars.hf_model>>
    - data.offline_data_path=/scratchspace/offline_hidden_states
    - training.output_dir=/scratchspace/eagle3
    - training.training_seq_len=4096
    - training.disable_tqdm=true
    - training.ar_validate_steps=500000
  slurm_config:
    _factory_: "slurm_factory"
    nodes: 1
    # Single launcher task; the training script manages all 8 GPUs itself.
    ntasks_per_node: 1
    gpus_per_node: 8
    # Quoted: the image tag contains a colon.
    container: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0"
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
---
# EAGLE3 speculative decoding pipeline for Qwen3-v0339-demo — step 4 of 4.
#
# Benchmarks the speculative-decoding speedup of the draft head trained in
# step 3, running the EAGLE3 algorithm over MT-Bench prompts on the vLLM
# backend.
#
# Pipeline order:
#   1. step1_synth.yaml      — synthetic data generation
#   2. step2_hidden.yaml     — hidden-state dump
#   3. step3_train.yaml      — offline training
#   4. step4_speed_eval.yaml — SPEED Eval  <-- this file
#
# Prerequisite: step3_train.yaml artifacts under /scratchspace/eagle3.
#
# Run:
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/qwen3-v0339-demo/step4_speed_eval.yaml --yes
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/qwen3-v0339-demo/step4_speed_eval.yaml --dry-run

job_name: qwen3-v0339-demo_EAGLE3_step4_speed_eval

pipeline:
  allow_to_fail: false
  skip: false
  note: "Step 4: Benchmark speculative decoding (VLLM backend)"

global_vars:
  # Local HF checkpoint of the target model; interpolated via <<...>> below.
  hf_model: /hf-local/Qwen/qwen3-v0339-demo

task_0:
  script: common/specdec_bench/quick_check.sh
  args:
    # NOTE(review): step3 writes its output to /scratchspace/eagle3, but this
    # points at /scratchspace/export — confirm whether the training/quick-check
    # flow produces an export directory, or whether this path should be
    # /scratchspace/eagle3.
    - --draft_model_dir /scratchspace/export
    - --draft_length 3
    - --output_length 4096
    - --engine VLLM
    - --tp_size 8
    - --ep_size 1
    - --speculative_algorithm EAGLE3
    - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
    - --concurrency 1
  environment:
    # The benchmark script reads the target checkpoint from this variable.
    - HF_MODEL_CKPT: <<global_vars.hf_model>>
  slurm_config:
    _factory_: "slurm_factory"
    nodes: 1
    # Single launcher task; the eval harness manages all 8 GPUs itself.
    ntasks_per_node: 1
    gpus_per_node: 8
    # Quoted: the image tag contains a colon.
    container: "vllm/vllm-openai:latest"
Loading