Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
314 changes: 314 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2390,6 +2390,320 @@ glm5-fp4-b300-sglang-mtp:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp }
- { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }

glm5-fp4-gb200-dynamo-trt:
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-dev.1-cuda13
model: nvidia/GLM-5-NVFP4
model-prefix: glm5
runner: gb200
precision: fp4
framework: dynamo-trt
multinode: true
disagg: true
scenarios:
fixed-seq-len:
- isl: 1024
osl: 1024
search-space:
# STP configurations
- conc-list: [ 4 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_eplb0_mtp0.yaml"
decode:
num-worker: 4
tp: 8
ep: 8
dp-attn: false
- conc-list: [ 5 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml"
decode:
num-worker: 5
tp: 4
ep: 4
dp-attn: false
- conc-list: [ 84 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch16_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch16_eplb0_mtp0.yaml"
decode:
num-worker: 4
tp: 8
ep: 8
dp-attn: false
- conc-list: [ 168 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch32_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch32_eplb0_mtp0.yaml"
decode:
num-worker: 4
tp: 8
ep: 8
dp-attn: false
- conc-list: [ 284 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch64_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch64_eplb0_mtp0.yaml"
decode:
num-worker: 4
tp: 8
ep: 8
dp-attn: false
- conc-list: [ 666 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch16_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch16_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 32
ep: 32
dp-attn: true
- conc-list: [ 1229 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 32
ep: 32
dp-attn: true
- conc-list: [ 2151 ]
prefill:
num-worker: 2
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 32
ep: 32
dp-attn: true
- conc-list: [ 2151 ]
prefill:
num-worker: 2
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
- conc-list: [ 4301 ]
prefill:
num-worker: 2
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
- conc-list: [ 4301 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch512_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch512_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
- isl: 8192
osl: 1024
search-space:
# STP configurations
- conc-list: [ 5 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml"
decode:
num-worker: 5
tp: 4
ep: 4
dp-attn: false
- conc-list: [ 10 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch2_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch2_eplb0_mtp0.yaml"
decode:
num-worker: 5
tp: 4
ep: 4
dp-attn: false
- conc-list: [ 25 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_eplb0_mtp0.yaml"
decode:
num-worker: 5
tp: 4
ep: 4
dp-attn: false
- conc-list: [ 50 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_eplb0_mtp0.yaml"
decode:
num-worker: 5
tp: 4
ep: 4
dp-attn: false
- conc-list: [ 105 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_eplb0_mtp0.yaml"
decode:
num-worker: 5
tp: 4
ep: 4
dp-attn: false
- conc-list: [ 308 ]
prefill:
num-worker: 2
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep32_batch8_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep32_batch8_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 32
ep: 32
dp-attn: true
- conc-list: [ 615 ]
prefill:
num-worker: 4
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx4dep4_gen1dep32_batch16_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx4dep4_gen1dep32_batch16_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 32
ep: 32
dp-attn: true
- conc-list: [ 1127 ]
prefill:
num-worker: 5
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep16_batch64_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep16_batch64_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
- conc-list: [ 1229 ]
prefill:
num-worker: 6
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx6dep4_gen1dep32_batch32_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx6dep4_gen1dep32_batch32_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 32
ep: 32
dp-attn: true
- conc-list: [ 2151 ]
prefill:
num-worker: 9
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx9dep4_gen1dep16_batch128_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx9dep4_gen1dep16_batch128_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true

qwen3.5-fp8-b200-sglang-mtp:
image: lmsysorg/sglang:v0.5.12-cu130
model: Qwen/Qwen3.5-397B-A17B-FP8
Expand Down
10 changes: 10 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3909,3 +3909,13 @@
description:
- "Use the Marlin MoE backend for MiniMax-M3 B200/B300 TP-only vLLM configurations by adding --moe-backend marlin when expert parallelism is disabled."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1809

- config-keys:
- glm5-fp4-gb200-dynamo-trt
description:
- "Add GLM-5 NVFP4 GB200 disaggregated TRT-LLM (STP, non-MTP) benchmarks via Dynamo"
- "New multinode model: glm5 with dynamo-trt framework on GB200"
- "Container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-dev.1-cuda13"
- "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026 (gb200_nvfp4 STP recipes); prefill tp=4/ep=4 (dep4)"
- "launch_gb200-nv.sh: added glm5-fp4 case to dynamo-trt branch with SRT_SLURM_MODEL_PREFIX=nvidia/GLM-5-NVFP4"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1803
6 changes: 5 additions & 1 deletion runners/launch_gb200-nv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,12 @@ elif [[ $FRAMEWORK == "dynamo-trt" ]]; then
export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4"
export SERVED_MODEL_NAME="kimi-k2.5-nvfp4"
export SRT_SLURM_MODEL_PREFIX="nvidia/Kimi-K2.5-NVFP4"
elif [[ $MODEL_PREFIX == "glm5" && $PRECISION == "fp4" ]]; then
export MODEL_PATH="/mnt/lustre01/models/GLM-5-NVFP4"
export SERVED_MODEL_NAME="glm-5-nvfp4"
export SRT_SLURM_MODEL_PREFIX="nvidia/GLM-5-NVFP4"
else
echo "Unsupported model prefix: $MODEL_PREFIX. Supported prefixes are: gptoss, dsr1, or kimik2.5"
echo "Unsupported model prefix: $MODEL_PREFIX. Supported prefixes are: gptoss, dsr1, kimik2.5, or glm5"

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GLM5 missing srt-slurm checkout

High Severity

The new glm5-fp4-gb200-dynamo-trt sweep uses CONFIG_FILE paths under recipes/GLM5/... on NVIDIA srt-slurm sa-submission-q2-2026, but launch_gb200-nv.sh only selects that repo for kimik2.5 dynamo-trt. glm5 falls through to the cquil11/srt-slurm-nv clone, so recipe files are missing and the launcher exits when validating CONFIG_FILE.

Additional Locations (1)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit e99f46e. Configure here.

exit 1
fi
elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
Expand Down
Loading