Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions examples/speculative_decoding/eagle_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,13 @@ def make_speculative_data_module(
if mode == "streaming":
# ``train_len`` right-truncates during tokenization and is also the collator's
# pad target; caller must ensure ``train_len <= vllm.max_model_len``.
# The streaming dataset tokenizes via ``tokenizer.apply_chat_template`` (no
# chat_template arg), so a custom template (e.g. one carrying {% generation %}
# tags for answer_only_loss) must be installed on the tokenizer here — unlike
# the online path, which threads ``chat_template`` straight into the collator.
if chat_template is not None:
tokenizer.chat_template = chat_template
print_rank_0("Installed custom chat template on tokenizer for streaming.")
print_rank_0(f"Streaming hidden states from {data_args.streaming_server_url}")
from modelopt.torch.speculative.plugins.hf_streaming_dataset import (
EagleVllmStreamingConfig,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# DFlash streaming speculative decoding pipeline for Qwen3-30B-A3B — MULTI-NODE.
#
# Same streaming transport / dispatch as hf_streaming_eagle3_multi_node.yaml: task_1
# splits N nodes into K serve replicas + (N-K) DDP trainers via SERVE_NODES; hidden
# states move serve -> trainer over NIXL RDMA. DFlash just consumes a different set of
# captured layers and trains a block-diffusion draft instead of an autoregressive one.
# See common/eagle3/train_eagle_streaming.sh for dispatch, rendezvous, and sharding.
#
# Qwen3-30B-A3B notes (vs the dense Qwen3-8B example):
# - MoE: 128 experts, 8 active/token, shipped in BF16 (NOT quantized, unlike
# gpt-oss MXFP4). The serve still passes --no-enable-flashinfer-autotune
# unconditionally; it is a no-op cost here since there is no FP4 re-tuning.
# - 48 hidden layers, 5 draft layers -> build_target_layer_ids(48,5)=[1,12,23,34,45]
# (the draft's fc input) plus the final layer for self-logit distillation. vLLM's
# capture ids are those +1 -> [2,13,24,35,46], plus final layer 48.
# - dflash_mask_token_id: Qwen3 has no tokenizer mask token (would fall back to
# None), so set it explicitly to a reserved/unused vocab slot (151669), same as
# the Qwen3-8B DFlash example (shared Qwen3 tokenizer).
# - We use a fake base (lm_head + embed_tokens only): streaming trains only the
# DFlash draft on hidden states from the live serve, so the trainer never needs
# the 30B base transformer layers. Matches the gpt-oss streaming examples.
#
# 3-step pipeline:
# task_0: Build input conversations (jsonl)
# task_1: Streaming train — 2 serve nodes (2 GPU, TP=2) + 2 trainer nodes (2 GPU)
# task_2: vLLM smoke test with DFlash speculative decoding
#
# Usage:
# uv run launch.py --yaml examples/Qwen/Qwen3-30B-A3B/hf_streaming_dflash_multi_node.yaml --yes

job_name: Qwen3-30B-A3B_DFlash_streaming_multi_node
pipeline:
allow_to_fail: false
skip: false
note:

global_vars:
hf_model: /hf-local/Qwen/Qwen3-30B-A3B

# Step 1: Build input conversations
task_0:
script: common/eagle3/make_dataset.sh
args:
- -f modules/Model-Optimizer/examples/dataset/example_data_config.yaml
- --full-conversations
slurm_config:
_factory_: "slurm_factory"
nodes: 1
ntasks_per_node: 1
gpus_per_node: 1
container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10

# Step 2: Streaming DFlash training — 2 serve replicas (TP=2) + 2 trainer nodes (2 GPU each).
# DFlash extracts 5 target layers (build_target_layer_ids(48,5)=[1,12,23,34,45], the
# draft's fc input) plus the final layer for self-logit distillation. vLLM's capture
# ids are those +1 -> [2,13,24,35,46], plus final layer 48.
task_1:
script: common/eagle3/train_eagle_streaming.sh
args:
- --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/dflash.yaml
- model.model_name_or_path=<<global_vars.hf_model>>
# Streaming trains only the DFlash draft on hidden states from the live serve; the
# trainer never runs the base transformer layers, so load a fake base (lm_head +
# embed_tokens only). Consistent with the gpt-oss streaming examples.
- model.use_fake_base_for_offline=true
- data.mode=streaming
- data.data_path=/scratchspace/data/train.jsonl
- training.output_dir=/scratchspace/dflash
- training.training_seq_len=4096
- training.disable_tqdm=true
# Streaming corpus is prompt-only (the serve generates the response and we
# capture its hidden states), so there is no assistant span to mask -> train
# over the full sequence, same as the EAGLE3 streaming example.
- training.answer_only_loss=false
- training.num_train_epochs=1
- training.max_steps=2000
# dflash.yaml sets report_to=tensorboard, which hard-fails if tensorboard
# isn't in the serve container; the streaming trainer doesn't need it.
- training.report_to=none
- dflash.dflash_block_size=16
- dflash.dflash_num_anchors=512
- dflash.dflash_loss_decay_factor=7
# Qwen3 has no tokenizer mask token; use a reserved vocab slot (151669).
- dflash.dflash_mask_token_id=151669
- dflash.dflash_architecture_config.num_hidden_layers=5
environment:
- HF_MODEL_CKPT: <<global_vars.hf_model>>
# No spaces: nemo_run emits `export FOO=value` unquoted.
- EAGLE_CAPTURE_IDS: "[2,13,24,35,46,48]"
- SERVE_TP: "2"
# K serve replica nodes (Slurm nodes 0..K-1); the rest are trainers.
- SERVE_NODES: "2"
# Per-rank in-flight fetches; keep low so the cold MoE serve isn't flooded past its execute-model timeout.
- STREAMING_NUM_WORKERS: "4"
# Qwen3-30B-A3B default ctx is 40960 -> full-len KV cache is wasteful for the
# 30B serve; we train at seq_len 4096, so cap it.
- SERVE_MAX_MODEL_LEN: "8192"
# DFlash uses a custom modeling file; export must trust remote code.
- EXPORT_EXTRA_ARGS: "--trust_remote_code"
slurm_config:
_factory_: "slurm_factory"
nodes: 4
ntasks_per_node: 1
gpus_per_node: 2
container: vllm/vllm-openai:latest

# Step 3: vLLM smoke test (DFlash, uses exported checkpoint from training)
task_2:
script: common/specdec/vllm_smoke_test.sh
environment:
- HF_MODEL_CKPT: <<global_vars.hf_model>>
- DRAFT_MODEL: /scratchspace/export
- SPEC_METHOD: "dflash"
- NUM_SPEC_TOKENS: "7"
- MIN_ACCEPTANCE_LENGTH: "1.2"
# Qwen3-30B-A3B is ~60 GB in BF16 -> does not fit one 80 GB GPU; serve the
# smoke-test target with TP=2 (gpus_per_node: 2 below).
- TP_SIZE: "2"
slurm_config:
_factory_: "slurm_factory"
container: "vllm/vllm-openai:nightly"
nodes: 1
ntasks_per_node: 1
gpus_per_node: 2
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# EAGLE3 streaming speculative decoding pipeline for Qwen3-30B-A3B — MULTI-NODE.
#
# task_1 splits N nodes into K serve replicas + (N-K) DDP trainers via SERVE_NODES;
# see common/eagle3/train_eagle_streaming.sh for dispatch, rendezvous, and sharding.
#
# Qwen3-30B-A3B notes (vs the dense Qwen3-8B example):
# - MoE: 128 experts, 8 active/token, shipped in BF16 (NOT quantized, unlike
# gpt-oss MXFP4). The serve still passes --no-enable-flashinfer-autotune
# unconditionally; it is a no-op cost here since there is no FP4 re-tuning.
# - 48 hidden layers -> default_eagle_aux_layer_ids(48)=[1,23,44]; vLLM capture ids
# are each +1 plus the final layer 48 -> EAGLE_CAPTURE_IDS="[2,24,45,48]".
# - sliding_window is null (use_sliding_window=false), so the draft has no
# sliding-window leak. We still use a fake base (lm_head + embed_tokens only):
# streaming trains only the EAGLE3 draft on hidden states from the live serve,
# so the trainer never needs the 30B base transformer layers. This matches the
# gpt-oss streaming examples and keeps the draft config off Qwen3MoeConfig.
# - qwen3_moe is native in recent transformers/vLLM (no --trust-remote-code needed).
#
# 3-step pipeline:
# task_0: Build input conversations (jsonl)
# task_1: Streaming train — 2 serve nodes (2 GPU, TP=2) + 2 trainer nodes (2 GPU)
# task_2: Benchmark — evaluate speculative decoding speedup via VLLM
#
# Usage:
# uv run launch.py --yaml examples/Qwen/Qwen3-30B-A3B/hf_streaming_eagle3_multi_node.yaml --yes

job_name: Qwen3-30B-A3B_EAGLE3_streaming_multi_node
pipeline:
allow_to_fail: false
skip: false
note:

global_vars:
hf_model: /hf-local/Qwen/Qwen3-30B-A3B

# Step 1: Build input conversations
task_0:
script: common/eagle3/make_dataset.sh
args:
- -f modules/Model-Optimizer/examples/dataset/example_data_config.yaml
- --full-conversations
slurm_config:
_factory_: "slurm_factory"
nodes: 1
ntasks_per_node: 1
gpus_per_node: 1
container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10

# Step 2: Streaming EAGLE3 training — 2 serve replicas (TP=2) + 2 trainer nodes (2 GPU each).
# Capture ids: default_eagle_aux_layer_ids(48)=[1,23,44] +1, plus final layer 48.
task_1:
script: common/eagle3/train_eagle_streaming.sh
args:
- --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
- model.model_name_or_path=<<global_vars.hf_model>>
# Streaming trains only the EAGLE3 draft on hidden states from the live serve; the
# trainer never runs the base transformer layers, so load a fake base (lm_head +
# embed_tokens only). Consistent with the gpt-oss streaming examples.
- model.use_fake_base_for_offline=true
- data.mode=streaming
- data.data_path=/scratchspace/data/train.jsonl
- training.output_dir=/scratchspace/eagle3
- training.training_seq_len=4096
- training.disable_tqdm=true
- training.ar_validate_steps=500000
- training.num_train_epochs=1
- training.max_steps=2000
- eagle.eagle_use_torch_compile=false
environment:
- HF_MODEL_CKPT: <<global_vars.hf_model>>
# No spaces: nemo_run emits `export FOO=value` unquoted.
- EAGLE_CAPTURE_IDS: "[2,24,45,48]"
- SERVE_TP: "2"
# K serve replica nodes (Slurm nodes 0..K-1); the rest are trainers.
- SERVE_NODES: "2"
# Per-rank in-flight fetches; keep low so the cold MoE serve isn't flooded past its execute-model timeout (kills EngineCore).
- STREAMING_NUM_WORKERS: "4"
# Qwen3-30B-A3B default ctx is 40960 -> full-len KV cache is wasteful for the
# 30B serve; we train at seq_len 4096, so cap it.
- SERVE_MAX_MODEL_LEN: "8192"
slurm_config:
_factory_: "slurm_factory"
nodes: 4
ntasks_per_node: 1
gpus_per_node: 2
container: vllm/vllm-openai:latest

# Step 3: Benchmark speculative decoding (VLLM backend)
task_2:
script: common/specdec_bench/quick_check.sh
args:
- --draft_model_dir /scratchspace/export
- --draft_length 3
- --output_length 4096
- --engine VLLM
# Qwen3-30B-A3B is ~60 GB in BF16 -> does not fit one 80 GB GPU; serve the
# benchmark target with TP=2 (gpus_per_node: 2 below).
- --tp_size 2
- --ep_size 1
- --speculative_algorithm EAGLE3
- --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
- --concurrency 32
environment:
- HF_MODEL_CKPT: <<global_vars.hf_model>>
slurm_config:
_factory_: "slurm_factory"
nodes: 1
ntasks_per_node: 1
gpus_per_node: 2
container: vllm/vllm-openai:latest
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ pipeline:
- --ep_size 1
- --speculative_algorithm EAGLE3
- --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
- --concurrency 1
- --concurrency 32
environment:
- HF_MODEL_CKPT: <<global_vars.hf_model>>
slurm_config:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ pipeline:
- --ep_size 1
- --speculative_algorithm EAGLE3
- --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
- --concurrency 1
- --concurrency 32
environment:
- HF_MODEL_CKPT: <<global_vars.hf_model>>
slurm_config:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ pipeline:
- --ep_size 1
- --speculative_algorithm EAGLE3
- --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
- --concurrency 1
- --concurrency 32
environment:
- HF_MODEL_CKPT: <<global_vars.hf_model>>
slurm_config:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ pipeline:
- --ep_size 1
- --speculative_algorithm EAGLE3
- --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
- --concurrency 1
- --concurrency 32
environment:
- HF_MODEL_CKPT: <<global_vars.hf_model>>
slurm_config:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ pipeline:
- --ep_size 1
- --speculative_algorithm EAGLE3
- --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
- --concurrency 1
- --concurrency 32
environment:
- HF_MODEL_CKPT: <<global_vars.hf_model>>
slurm_config:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ pipeline:
- --ep_size 1
- --speculative_algorithm EAGLE3
- --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
- --concurrency 1
- --concurrency 32
environment:
- HF_MODEL_CKPT: <<global_vars.hf_model>>
slurm_config:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ pipeline:
- training.disable_tqdm=true
- training.ar_validate_steps=500000
- training.num_train_epochs=1
- training.max_steps=500
- training.max_steps=2000
# Kimi's slow tokenizer can't emit assistant masks the standard way; the mask
# is recovered from token ids (modelopt.torch.utils.loss_mask).
- training.answer_only_loss=true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ pipeline:
- training.disable_tqdm=true
- training.ar_validate_steps=500000
- training.num_train_epochs=1
- training.max_steps=500
- training.max_steps=2000
- eagle.eagle_use_torch_compile=false
environment:
- HF_MODEL_CKPT: <<global_vars.hf_model>>
Expand Down
Loading
Loading