Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions tools/launcher/examples/Qwen/qwen3-v0339a-demo/step1_synth.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# EAGLE3 speculative decoding pipeline — Step 1: Data synthesis
#
# Queries TRT-LLM server to generate synthetic prompt samples for EAGLE3 training.
#
# Usage:
#   uv run launch.py --yaml examples/Qwen/qwen3-v0339a-demo/step1_synth.yaml --yes
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/qwen3-v0339a-demo/step1_synth.yaml --yes
---
job_name: qwen3-v0339a-demo_EAGLE3_step1_synth
pipeline:
  allow_to_fail: false
  skip: false
  note: Synthetic data generation via TRT-LLM server

global_vars:
  # Target (base) model checkpoint; shared across all pipeline steps.
  hf_model: /hf-local/Qwen/qwen3-v0339a-demo

# Step 1: Data synthesis via TRT-LLM server
# Args before "--" go to trtllm-serve; args after "--" go to tools/query.py.
task_0:
  script: common/tensorrt_llm/query.sh
  args:
    - --model <<global_vars.hf_model>>
    - --tp_size 8
    - --ep_size 8
    - --max_num_tokens 32000
    - --port 8000
    - --host 0.0.0.0
    - --trust_remote_code
    - --
    - --data /hf-local/modelopt/Speculative-Decoding-Prompt-Samples
    # Synthesized samples land here; step2_hidden.yaml reads this path
    # via its --input-data argument.
    - --save /scratchspace/data
  environment:
    - HF_LOCAL: /hf-local
  slurm_config:
    _factory_: "slurm_factory"
    nodes: 1
    ntasks_per_node: 8
    gpus_per_node: 8
    container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0
34 changes: 34 additions & 0 deletions tools/launcher/examples/Qwen/qwen3-v0339a-demo/step2_hidden.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# EAGLE3 speculative decoding pipeline — Step 2: Hidden state dump
#
# Runs target model to capture hidden states for offline EAGLE3 training.
#
# Usage:
#   uv run launch.py --yaml examples/Qwen/qwen3-v0339a-demo/step2_hidden.yaml --yes
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/qwen3-v0339a-demo/step2_hidden.yaml --yes
---
job_name: qwen3-v0339a-demo_EAGLE3_step2_hidden
pipeline:
  allow_to_fail: false
  skip: false
  note: Hidden state dump from target model

global_vars:
  # Target (base) model checkpoint; shared across all pipeline steps.
  hf_model: /hf-local/Qwen/qwen3-v0339a-demo

# Step 2: Dump hidden states from target model
task_0:
  script: common/eagle3/dump_offline_data.sh
  args:
    # Prompt samples written by step1_synth.yaml (its --save path).
    - --input-data /scratchspace/data
    # Hidden-state dump; step3_train.yaml reads this path via
    # data.offline_data_path.
    - --output-dir /scratchspace/offline_hidden_states
    - --max-seq-len 8192
    - --tp 8
    - --moe-ep 8
  environment:
    - HF_MODEL_CKPT: <<global_vars.hf_model>>
  slurm_config:
    _factory_: "slurm_factory"
    nodes: 1
    ntasks_per_node: 8
    gpus_per_node: 8
    container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0
34 changes: 34 additions & 0 deletions tools/launcher/examples/Qwen/qwen3-v0339a-demo/step3_train.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# EAGLE3 speculative decoding pipeline — Step 3: Offline training
#
# Trains EAGLE3 draft head using collected hidden states from target model.
#
# Usage:
#   uv run launch.py --yaml examples/Qwen/qwen3-v0339a-demo/step3_train.yaml --yes
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/qwen3-v0339a-demo/step3_train.yaml --yes
---
job_name: qwen3-v0339a-demo_EAGLE3_step3_train
pipeline:
  allow_to_fail: false
  skip: false
  note: Offline EAGLE3 training

global_vars:
  # Target (base) model checkpoint; shared across all pipeline steps.
  hf_model: /hf-local/Qwen/qwen3-v0339a-demo

# Step 3: Train EAGLE3 draft head (offline, single task)
task_0:
  script: common/eagle3/train_eagle.sh
  args:
    - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
    - model.model_name_or_path=<<global_vars.hf_model>>
    # Hidden states produced by step2_hidden.yaml (its --output-dir).
    - data.offline_data_path=/scratchspace/offline_hidden_states
    # NOTE(review): step4_speed_eval.yaml reads the draft model from
    # /scratchspace/export, not this directory — presumably train_eagle.sh
    # exports a serving checkpoint there after training; confirm.
    - training.output_dir=/scratchspace/eagle3
    - training.training_seq_len=4096
    - training.disable_tqdm=true
    - training.ar_validate_steps=500000
  slurm_config:
    _factory_: "slurm_factory"
    nodes: 1
    ntasks_per_node: 1
    gpus_per_node: 8
    container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# EAGLE3 speculative decoding pipeline — Step 4: Speed evaluation
#
# Benchmarks speculative decoding speedup via vLLM with trained EAGLE3 draft head.
#
# Usage:
#   uv run launch.py --yaml examples/Qwen/qwen3-v0339a-demo/step4_speed_eval.yaml --yes
#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/qwen3-v0339a-demo/step4_speed_eval.yaml --yes
---
job_name: qwen3-v0339a-demo_EAGLE3_step4_speed_eval
pipeline:
  allow_to_fail: false
  skip: false
  note: Speculative decoding benchmark evaluation

global_vars:
  # Target (base) model checkpoint; shared across all pipeline steps.
  hf_model: /hf-local/Qwen/qwen3-v0339a-demo

# Step 4: Benchmark speculative decoding (VLLM backend)
task_0:
  script: common/specdec_bench/quick_check.sh
  args:
    # NOTE(review): step3_train.yaml sets training.output_dir to
    # /scratchspace/eagle3, not /scratchspace/export — this path only works
    # if the training script also exports a checkpoint there; confirm.
    - --draft_model_dir /scratchspace/export
    - --draft_length 3
    - --output_length 4096
    - --engine VLLM
    - --tp_size 8
    - --ep_size 1
    - --speculative_algorithm EAGLE3
    - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
    - --concurrency 1
  environment:
    - HF_MODEL_CKPT: <<global_vars.hf_model>>
  slurm_config:
    _factory_: "slurm_factory"
    nodes: 1
    ntasks_per_node: 1
    gpus_per_node: 8
    container: vllm/vllm-openai:latest
Loading