NVIDIA · h-guo18 · Jun 11, 2026 · Jun 11, 2026 · Jun 12, 2026 · Jun 12, 2026
@@ -78,6 +78,13 @@ def make_speculative_data_module(
     if mode == "streaming":
         # ``train_len`` right-truncates during tokenization and is also the collator's
         # pad target; caller must ensure ``train_len <= vllm.max_model_len``.
+        # The streaming dataset tokenizes via ``tokenizer.apply_chat_template`` (no
+        # chat_template arg), so a custom template (e.g. one carrying {% generation %}
+        # tags for answer_only_loss) must be installed on the tokenizer here — unlike
+        # the online path, which threads ``chat_template`` straight into the collator.
+        if chat_template is not None:
+            tokenizer.chat_template = chat_template
+            print_rank_0("Installed custom chat template on tokenizer for streaming.")
         print_rank_0(f"Streaming hidden states from {data_args.streaming_server_url}")
         from modelopt.torch.speculative.plugins.hf_streaming_dataset import (
             EagleVllmStreamingConfig,

diff --git a/tools/launcher/examples/Qwen/Qwen3-30B-A3B/hf_streaming_dflash_multi_node.yaml b/tools/launcher/examples/Qwen/Qwen3-30B-A3B/hf_streaming_dflash_multi_node.yaml
@@ -0,0 +1,124 @@
+# DFlash streaming speculative decoding pipeline for Qwen3-30B-A3B — MULTI-NODE.
+#
+# Same streaming transport / dispatch as hf_streaming_eagle3_multi_node.yaml: task_1
+# splits N nodes into K serve replicas + (N-K) DDP trainers via SERVE_NODES; hidden
+# states move serve -> trainer over NIXL RDMA. DFlash just consumes a different set of
+# captured layers and trains a block-diffusion draft instead of an autoregressive one.
+# See common/eagle3/train_eagle_streaming.sh for dispatch, rendezvous, and sharding.
+#
+# Qwen3-30B-A3B notes (vs the dense Qwen3-8B example):
+#   - MoE: 128 experts, 8 active/token, shipped in BF16 (NOT quantized, unlike
+#     gpt-oss MXFP4). The serve still passes --no-enable-flashinfer-autotune
+#     unconditionally; it is a no-op cost here since there is no FP4 re-tuning.
+#   - 48 hidden layers, 5 draft layers -> build_target_layer_ids(48,5)=[1,12,23,34,45]
+#     (the draft's fc input) plus the final layer for self-logit distillation. vLLM's
+#     capture ids are those +1 -> [2,13,24,35,46], plus final layer 48.
+#   - dflash_mask_token_id: Qwen3 has no tokenizer mask token (would fall back to
+#     None), so set it explicitly to a reserved/unused vocab slot (151669), same as
+#     the Qwen3-8B DFlash example (shared Qwen3 tokenizer).
+#   - We use a fake base (lm_head + embed_tokens only): streaming trains only the
+#     DFlash draft on hidden states from the live serve, so the trainer never needs
+#     the 30B base transformer layers. Matches the gpt-oss streaming examples.
+#
+# 3-step pipeline:
+#   task_0: Build input conversations (jsonl)
+#   task_1: Streaming train — 2 serve nodes (2 GPU, TP=2) + 2 trainer nodes (2 GPU)
+#   task_2: vLLM smoke test with DFlash speculative decoding
+#
+# Usage:
+#   uv run launch.py --yaml examples/Qwen/Qwen3-30B-A3B/hf_streaming_dflash_multi_node.yaml --yes
+
+job_name: Qwen3-30B-A3B_DFlash_streaming_multi_node
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/Qwen/Qwen3-30B-A3B
+
+  # Step 1: Build input conversations
+  task_0:
+    script: common/eagle3/make_dataset.sh
+    args:
+      - -f modules/Model-Optimizer/examples/dataset/example_data_config.yaml
+      - --full-conversations
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 1
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # Step 2: Streaming DFlash training — 2 serve replicas (TP=2) + 2 trainer nodes (2 GPU each).
+  # DFlash extracts 5 target layers (build_target_layer_ids(48,5)=[1,12,23,34,45], the
+  # draft's fc input) plus the final layer for self-logit distillation. vLLM's capture
+  # ids are those +1 -> [2,13,24,35,46], plus final layer 48.
+  task_1:
+    script: common/eagle3/train_eagle_streaming.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/dflash.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      # Streaming trains only the DFlash draft on hidden states from the live serve; the
+      # trainer never runs the base transformer layers, so load a fake base (lm_head +
+      # embed_tokens only). Consistent with the gpt-oss streaming examples.
+      - model.use_fake_base_for_offline=true
+      - data.mode=streaming
+      - data.data_path=/scratchspace/data/train.jsonl
+      - training.output_dir=/scratchspace/dflash
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      # Streaming corpus is prompt-only (the serve generates the response and we
+      # capture its hidden states), so there is no assistant span to mask -> train
+      # over the full sequence, same as the EAGLE3 streaming example.
+      - training.answer_only_loss=false
+      - training.num_train_epochs=1
+      - training.max_steps=2000
+      # dflash.yaml sets report_to=tensorboard, which hard-fails if tensorboard
+      # isn't in the serve container; the streaming trainer doesn't need it.
+      - training.report_to=none
+      - dflash.dflash_block_size=16
+      - dflash.dflash_num_anchors=512
+      - dflash.dflash_loss_decay_factor=7
+      # Qwen3 has no tokenizer mask token; use a reserved vocab slot (151669).
+      - dflash.dflash_mask_token_id=151669
+      - dflash.dflash_architecture_config.num_hidden_layers=5
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      # No spaces: nemo_run emits `export FOO=value` unquoted.
+      - EAGLE_CAPTURE_IDS: "[2,13,24,35,46,48]"
+      - SERVE_TP: "2"
+      # K serve replica nodes (Slurm nodes 0..K-1); the rest are trainers.
+      - SERVE_NODES: "2"
+      # Per-rank in-flight fetches; keep low so the cold MoE serve isn't flooded past its execute-model timeout.
+      - STREAMING_NUM_WORKERS: "4"
+      # Qwen3-30B-A3B default ctx is 40960 -> full-len KV cache is wasteful for the
+      # 30B serve; we train at seq_len 4096, so cap it.
+      - SERVE_MAX_MODEL_LEN: "8192"
+      # DFlash uses a custom modeling file; export must trust remote code.
+      - EXPORT_EXTRA_ARGS: "--trust_remote_code"
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 4
+      ntasks_per_node: 1
+      gpus_per_node: 2
+      container: vllm/vllm-openai:latest
+
+  # Step 3: vLLM smoke test (DFlash, uses exported checkpoint from training)
+  task_2:
+    script: common/specdec/vllm_smoke_test.sh
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - DRAFT_MODEL: /scratchspace/export
+      - SPEC_METHOD: "dflash"
+      - NUM_SPEC_TOKENS: "7"
+      - MIN_ACCEPTANCE_LENGTH: "1.2"
+      # Qwen3-30B-A3B is ~60 GB in BF16 -> does not fit one 80 GB GPU; serve the
+      # smoke-test target with TP=2 (gpus_per_node: 2 below).
+      - TP_SIZE: "2"
+    slurm_config:
+      _factory_: "slurm_factory"
+      container: "vllm/vllm-openai:nightly"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 2
diff --git a/tools/launcher/examples/Qwen/Qwen3-30B-A3B/hf_streaming_eagle3_multi_node.yaml b/tools/launcher/examples/Qwen/Qwen3-30B-A3B/hf_streaming_eagle3_multi_node.yaml
@@ -0,0 +1,110 @@
+# EAGLE3 streaming speculative decoding pipeline for Qwen3-30B-A3B — MULTI-NODE.
+#
+# task_1 splits N nodes into K serve replicas + (N-K) DDP trainers via SERVE_NODES;
+# see common/eagle3/train_eagle_streaming.sh for dispatch, rendezvous, and sharding.
+#
+# Qwen3-30B-A3B notes (vs the dense Qwen3-8B example):
+#   - MoE: 128 experts, 8 active/token, shipped in BF16 (NOT quantized, unlike
+#     gpt-oss MXFP4). The serve still passes --no-enable-flashinfer-autotune
+#     unconditionally; it is a no-op cost here since there is no FP4 re-tuning.
+#   - 48 hidden layers -> default_eagle_aux_layer_ids(48)=[1,23,44]; vLLM capture ids
+#     are each +1 plus the final layer 48 -> EAGLE_CAPTURE_IDS="[2,24,45,48]".
+#   - sliding_window is null (use_sliding_window=false), so the draft has no
+#     sliding-window leak. We still use a fake base (lm_head + embed_tokens only):
+#     streaming trains only the EAGLE3 draft on hidden states from the live serve,
+#     so the trainer never needs the 30B base transformer layers. This matches the
+#     gpt-oss streaming examples and keeps the draft config off Qwen3MoeConfig.
+#   - qwen3_moe is native in recent transformers/vLLM (no --trust-remote-code needed).
+#
+# 3-step pipeline:
+#   task_0: Build input conversations (jsonl)
+#   task_1: Streaming train — 2 serve nodes (2 GPU, TP=2) + 2 trainer nodes (2 GPU)
+#   task_2: Benchmark — evaluate speculative decoding speedup via VLLM
+#
+# Usage:
+#   uv run launch.py --yaml examples/Qwen/Qwen3-30B-A3B/hf_streaming_eagle3_multi_node.yaml --yes
+
+job_name: Qwen3-30B-A3B_EAGLE3_streaming_multi_node
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/Qwen/Qwen3-30B-A3B
+
+  # Step 1: Build input conversations
+  task_0:
+    script: common/eagle3/make_dataset.sh
+    args:
+      - -f modules/Model-Optimizer/examples/dataset/example_data_config.yaml
+      - --full-conversations
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 1
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # Step 2: Streaming EAGLE3 training — 2 serve replicas (TP=2) + 2 trainer nodes (2 GPU each).
+  # Capture ids: default_eagle_aux_layer_ids(48)=[1,23,44] +1, plus final layer 48.
+  task_1:
+    script: common/eagle3/train_eagle_streaming.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      # Streaming trains only the EAGLE3 draft on hidden states from the live serve; the
+      # trainer never runs the base transformer layers, so load a fake base (lm_head +
+      # embed_tokens only). Consistent with the gpt-oss streaming examples.
+      - model.use_fake_base_for_offline=true
+      - data.mode=streaming
+      - data.data_path=/scratchspace/data/train.jsonl
+      - training.output_dir=/scratchspace/eagle3
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000
+      - training.num_train_epochs=1
+      - training.max_steps=2000
+      - eagle.eagle_use_torch_compile=false
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      # No spaces: nemo_run emits `export FOO=value` unquoted.
+      - EAGLE_CAPTURE_IDS: "[2,24,45,48]"
+      - SERVE_TP: "2"
+      # K serve replica nodes (Slurm nodes 0..K-1); the rest are trainers.
+      - SERVE_NODES: "2"
+      # Per-rank in-flight fetches; keep low so the cold MoE serve isn't flooded past its execute-model timeout (kills EngineCore).
+      - STREAMING_NUM_WORKERS: "4"
+      # Qwen3-30B-A3B default ctx is 40960 -> full-len KV cache is wasteful for the
+      # 30B serve; we train at seq_len 4096, so cap it.
+      - SERVE_MAX_MODEL_LEN: "8192"
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 4
+      ntasks_per_node: 1
+      gpus_per_node: 2
+      container: vllm/vllm-openai:latest
+
+  # Step 3: Benchmark speculative decoding (VLLM backend)
+  task_2:
+    script: common/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      # Qwen3-30B-A3B is ~60 GB in BF16 -> does not fit one 80 GB GPU; serve the
+      # benchmark target with TP=2 (gpus_per_node: 2 below).
+      - --tp_size 2
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 32
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 2
+      container: vllm/vllm-openai:latest
diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/eagle3_quick_check.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/eagle3_quick_check.yaml
@@ -111,7 +111,7 @@ pipeline:
       - --ep_size 1
       - --speculative_algorithm EAGLE3
       - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
-      - --concurrency 1
+      - --concurrency 32
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:

diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/hf_offline_eagle3.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/hf_offline_eagle3.yaml
@@ -93,7 +93,7 @@ pipeline:
       - --ep_size 1
       - --speculative_algorithm EAGLE3
       - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
-      - --concurrency 1
+      - --concurrency 32
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:

diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/hf_offline_eagle3_ptq.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/hf_offline_eagle3_ptq.yaml
@@ -117,7 +117,7 @@ pipeline:
       - --ep_size 1
       - --speculative_algorithm EAGLE3
       - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
-      - --concurrency 1
+      - --concurrency 32
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:

diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/hf_online_eagle3.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/hf_online_eagle3.yaml
@@ -62,7 +62,7 @@ pipeline:
       - --ep_size 1
       - --speculative_algorithm EAGLE3
       - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
-      - --concurrency 1
+      - --concurrency 32
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:

diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3.yaml
@@ -70,7 +70,7 @@ pipeline:
       - --ep_size 1
       - --speculative_algorithm EAGLE3
       - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
-      - --concurrency 1
+      - --concurrency 32
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:

diff --git a/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/hf_streaming_eagle3_multi_node.yaml
@@ -77,7 +77,7 @@ pipeline:
       - --ep_size 1
       - --speculative_algorithm EAGLE3
       - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
-      - --concurrency 1
+      - --concurrency 32
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:

diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_dflash_multi_node.yaml
@@ -61,7 +61,7 @@ pipeline:
       - training.disable_tqdm=true
       - training.ar_validate_steps=500000
       - training.num_train_epochs=1
-      - training.max_steps=500
+      - training.max_steps=2000
       # Kimi's slow tokenizer can't emit assistant masks the standard way; the mask
       # is recovered from token ids (modelopt.torch.utils.loss_mask).
       - training.answer_only_loss=true

diff --git a/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml b/tools/launcher/examples/moonshotai/Kimi-K2.5/hf_streaming_eagle3_multi_node.yaml
@@ -52,7 +52,7 @@ pipeline:
       - training.disable_tqdm=true
       - training.ar_validate_steps=500000
       - training.num_train_epochs=1
-      - training.max_steps=500
+      - training.max_steps=2000
       - eagle.eagle_use_torch_compile=false
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>