From bbdd923d53275ffd09b195baa7d2113da8fb521c Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 12 Jun 2026 17:59:43 -0700 Subject: [PATCH 01/33] feat: MiniMax-M3 MXFP8 full sweep config for GB200 Add minimaxm3-fp8-gb200-dynamo-vllm to nvidia-master.yaml with 6 topologies covering the full concurrency range: - TP4/TP8 (low latency, conc 4-64) - TP4+EP4 agg + 1P+1D disagg (mid curve, conc 64-512) - DEP4/DEP8 (high throughput, conc 256-2048) All recipe YAMLs included under minimax-m3-gb200-fp8/{1k1k,8k1k}/. --- .github/configs/nvidia-master.yaml | 111 ++++++++++++++++++ .../workflows/benchmark-multinode-tmpl.yml | 5 + .../1k1k/agg-gb200-dep4-1n.yaml | 74 ++++++++++++ .../1k1k/agg-gb200-dep8-2n.yaml | 74 ++++++++++++ .../1k1k/agg-gb200-tp4-1n.yaml | 71 +++++++++++ .../1k1k/agg-gb200-tp4ep4-1n.yaml | 72 ++++++++++++ .../1k1k/agg-gb200-tp8-2n.yaml | 71 +++++++++++ .../1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml | 89 ++++++++++++++ .../8k1k/agg-gb200-dep8-2n.yaml | 74 ++++++++++++ .../8k1k/agg-gb200-tp4-1n.yaml | 71 +++++++++++ .../8k1k/agg-gb200-tp4ep4-1n.yaml | 72 ++++++++++++ .../8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml | 89 ++++++++++++++ perf-changelog.yaml | 12 ++ runners/launch_gb200-nv.sh | 50 ++++++-- 14 files changed, 927 insertions(+), 8 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml create mode 100644 
benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 187824347..e68adb5f4 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11679,6 +11679,117 @@ minimaxm2.5-fp8-gb300-dynamo-vllm: ep: 4 dp-attn: true +# MiniMax-M3 (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). +# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint +# (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX +# tensor cores on Blackwell. M3 has not shipped in a stable vLLM release; +# vllm/vllm-openai:minimax-m3 is the dedicated multi-arch (arm64+amd64) +# image built from the m3_release branch (vllm-project/vllm#45381). +# GB200 full sweep: TP4/TP8 (low conc), TEP (mid), disagg (mid), DEP (high). +minimaxm3-fp8-gb200-dynamo-vllm: + image: vllm/vllm-openai:minimax-m3 + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: gb200 + precision: fp8 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # Low latency: TP=4 aggregated, 1 node (4 GPU). 
+ - conc-list: [4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml" + decode: + num-worker: 0 + tp: 4 + ep: 1 + dp-attn: false + + # Low latency: TP=8 aggregated, 2 nodes (8 GPU). + - conc-list: [4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml" + decode: + num-worker: 0 + tp: 8 + ep: 1 + dp-attn: false + + # Mid curve: TP4+EP4 aggregated, 1 node (4 GPU). + - conc-list: [128, 256, 512] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml" + decode: + num-worker: 0 + tp: 4 + ep: 4 + dp-attn: false + + # Mid curve: 1P+1D disagg TP4+EP4, 2 nodes (4 GPU each). + - conc-list: [64, 128, 256, 512] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + + # High throughput: DEP=4 aggregated, 1 node (4 GPU). + - conc-list: [256, 512, 1024] + prefill: + num-worker: 1 + tp: 1 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml" + decode: + num-worker: 0 + tp: 1 + ep: 4 + dp-attn: true + + # Max throughput: DEP=8 aggregated, 2 nodes (8 GPU). + - conc-list: [512, 1024, 2048] + prefill: + num-worker: 1 + tp: 1 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml" + decode: + num-worker: 0 + tp: 1 + ep: 8 + dp-attn: true + # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). 
# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 81727ef39..85b399e6c 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -123,6 +123,11 @@ on: env: RANDOM_RANGE_RATIO: 0.8 + # Day-zero models resolved via hf: ids download from the Hub inside the + # slurm job (srtctl pre-download + dynamo hub fetch). Anonymous requests + # get 429-rate-limited when several workers pull a 444 GB snapshot at + # once; sbatch/srun inherit this env so the token reaches the workers. + HF_TOKEN: ${{ secrets.INFERENCEX_OFFICIAL_RO_HF_TOKEN }} EXP_NAME: ${{ inputs.exp-name }} IMAGE: ${{ inputs.image }} MODEL_PREFIX: ${{ inputs.model-prefix }} diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml new file mode 100644 index 000000000..a95d2df41 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml @@ -0,0 +1,74 @@ +name: "minimax-m3-vllm-agg-gb200-dep4-1n-1k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on +# day zero: NixlConnector KV transfer has never been exercised against +# M3's MSA index cache, so disagg shapes are deferred until agg +# baselines exist. model.path uses a staged-model alias — srtctl resolves it via +# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. +# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. 
max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + gpus_per_agg: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + aggregated: + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 2304 + max-num-batched-tokens: 2048 + max-cudagraph-capture-size: 512 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "256x512x1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml new file mode 100644 index 000000000..ab231e733 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml @@ -0,0 +1,74 @@ +name: "minimax-m3-vllm-agg-gb200-dep8-2n-1k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). 
Aggregated-only on +# day zero: NixlConnector KV transfer has never been exercised against +# M3's MSA index cache, so disagg shapes are deferred until agg +# baselines exist. model.path uses a staged-model alias — srtctl resolves it via +# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. +# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 2 + agg_workers: 1 + gpus_per_agg: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + aggregated: + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 2304 + max-num-batched-tokens: 2048 + max-cudagraph-capture-size: 512 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x1024x2048" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml new file mode 100644 index 000000000..ce431c3c0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml @@ -0,0 +1,71 @@ +name: "minimax-m3-vllm-agg-gb200-tp4-1n-1k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on +# day zero: NixlConnector KV transfer has never been exercised against +# M3's MSA index cache, so disagg shapes are deferred until agg +# baselines exist. model.path uses a staged-model alias — srtctl resolves it via +# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. +# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. 
+health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + gpus_per_agg: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + aggregated: + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 2304 + max-num-batched-tokens: 2048 + max-cudagraph-capture-size: 64 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml new file mode 100644 index 000000000..29efa7ecc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml @@ -0,0 +1,72 @@ +name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-1k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on +# day zero: NixlConnector KV transfer has never been exercised against +# M3's MSA index cache, so disagg shapes are deferred until agg +# baselines exist. model.path uses a staged-model alias — srtctl resolves it via +# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. +# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + gpus_per_agg: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + aggregated: + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 2304 + max-num-batched-tokens: 2048 + max-cudagraph-capture-size: 512 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml new file mode 100644 index 000000000..29a5934bd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml @@ -0,0 +1,71 @@ +name: "minimax-m3-vllm-agg-gb200-tp8-2n-1k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on +# day zero: NixlConnector KV transfer has never been exercised against +# M3's MSA index cache, so disagg shapes are deferred until agg +# baselines exist. 
model.path uses a staged-model alias — srtctl resolves it via +# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. +# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 2 + agg_workers: 1 + gpus_per_agg: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + aggregated: + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + max-model-len: 2304 + max-num-batched-tokens: 2048 + max-cudagraph-capture-size: 64 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml new file mode 100644 index 000000000..17769abf3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml @@ -0,0 +1,89 @@ +name: 
"minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-1k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200. +# Prefill (TP4+EP4, 1 node) → NixlConnector → Decode (TP4+EP4, 1 node). +# --block-size 128 is mandatory (MSA sparse/index cache alignment). + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 2304 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 512 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 
+ osl: 1024 + concurrencies: "64x128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml new file mode 100644 index 000000000..db729764a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml @@ -0,0 +1,74 @@ +name: "minimax-m3-vllm-agg-gb200-dep8-2n-8k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on +# day zero: NixlConnector KV transfer has never been exercised against +# M3's MSA index cache, so disagg shapes are deferred until agg +# baselines exist. model.path uses a staged-model alias — srtctl resolves it via +# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. +# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. 
+health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 2 + agg_workers: 1 + gpus_per_agg: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + aggregated: + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 9472 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 128 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml new file mode 100644 index 000000000..8c7ecbe17 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml @@ -0,0 +1,71 @@ +name: "minimax-m3-vllm-agg-gb200-tp4-1n-8k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on +# day zero: NixlConnector KV transfer has never been exercised against +# M3's MSA index cache, so disagg shapes are deferred until agg +# baselines exist. model.path uses a staged-model alias — srtctl resolves it via +# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. +# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. 
max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + gpus_per_agg: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + aggregated: + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 9472 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 64 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml new file mode 100644 index 000000000..3e146af8b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml @@ -0,0 +1,72 @@ +name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-8k1k" + +# MiniMax-M3 day-zero aggregated recipe +# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on +# day zero: NixlConnector KV transfer has never been exercised against +# M3's MSA index cache, so disagg shapes are deferred until agg +# baselines exist. 
model.path uses a staged-model alias — srtctl resolves it via +# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. +# --block-size 128 is mandatory (MSA sparse/index cache alignment); the +# benchmark is text-only, so language-model-only skips the vision +# encoder. max-model-len / cudagraph capture / batched tokens are +# trimmed to the fixed-seq-len scenario instead of the 1M default. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +# 444 GB of weights off shared FS (cold HF cache on the first run): +# allow up to 2 h for engine readiness. +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + gpus_per_agg: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + aggregated: + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 9472 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 256 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "128x256" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml new file mode 100644 index 000000000..54980f7d3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml @@ -0,0 +1,89 @@ +name: 
"minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-8k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200, 8k1k. +# Prefill (TP4+EP4, 1 node) → NixlConnector → Decode (TP4+EP4, 1 node). +# --block-size 128 is mandatory (MSA sparse/index cache alignment). + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 9472 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 512 + block-size: 128 + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + 
isl: 8192 + osl: 1024 + concurrencies: "64x128x256" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d29c9a5d3..647121c12 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3646,3 +3646,15 @@ - "Layouts: TP8 and TP4 (latency), TP4+EP4 / TP8+EP8 (TEP throughput), tp2-ep2, TP8+EP8 dp-attn (DEP) across 1k1k and 8k1k" - "Serves from the launch_b300-nv.sh MODEL/MODEL_PATH split (model not in the SRE-staged /scratch/models list -> writable /data/models)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1724 + +- config-keys: + - minimaxm3-fp8-gb200-dynamo-vllm + description: + - "Initial submission: MiniMax-M3 MXFP8 day-zero vLLM sweep for GB200 via Dynamo" + - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, MSA sparse attention, ~444 GB MXFP8 checkpoint)" + - "Image: vllm/vllm-openai:minimax-m3 (multi-arch arm64+amd64 from m3_release branch, vllm-project/vllm#45381)" + - "Dynamo orchestration with NixlConnector for disaggregated prefill/decode" + - "6 topologies: TP4 (1n), TP8 (2n), TP4+EP4 (1n), 1P+1D disagg TP4+EP4 (2n), DEP4 (1n), DEP8 (2n)" + - "Concurrency sweep: TP 4-64, TEP 128-512, disagg 64-512, DEP4 256-1024, DEP8 512-2048" + - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks" + pr-link: TBD diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 36c8af203..9c3430289 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -60,8 +60,11 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" ]]; then export MODEL_PATH="/mnt/lustre01/models/MiniMax-M2.5" export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-fp8" + elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then + export MODEL_PATH="/mnt/lustre01/models/MiniMax-M3-MXFP8" + export SRT_SLURM_MODEL_PREFIX="minimax-m3-mxfp8" else - echo "Unsupported model prefix/precision combination: 
$MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4, minimaxm2.5/fp4, minimaxm2.5/fp8" + echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4, minimaxm2.5/fp4, minimaxm2.5/fp8, minimaxm3/fp8" exit 1 fi else @@ -81,7 +84,7 @@ NGINX_IMAGE="nginx:1.27.4" # squash dir on a path that's also visible to compute nodes. Falls # back to the legacy sa-shared path so other configs are untouched. SQUASH_DIR="/mnt/lustre01/users-public/sa-shared" -if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then +if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then echo "=== cluster diagnostic (minimax sweep) ===" echo "USER=$(id -un) UID=$(id -u) GID=$(id -g) GROUPS=$(id -Gn)" echo "HOME=$HOME" @@ -128,8 +131,32 @@ fi SQUASH_FILE="${SQUASH_DIR}/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" NGINX_SQUASH_FILE="${SQUASH_DIR}/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -enroot import -o $SQUASH_FILE docker://$IMAGE -enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE +# Concurrent matrix jobs (three gb200-nv runners) all import to the same +# shared-FS squash path. An unsynchronized `enroot import -o` onto an +# existing file APPENDS to it (mksquashfs default), corrupting the image +# while other jobs' pyxis extractions are reading it — observed on the +# minimaxm3 day-zero sweep (R1: an eval job appended to the live squash +# mid-run). Serialize with a lock, skip when the existing file is valid, +# and build to a temp path + atomic mv so readers never see a half-written +# file. Mirrors the import_squash pattern in launch_gb300-nv.sh. 
+import_squash() { + local squash="$1" image="$2" + local lock="${squash}.lock" + ( + exec 9>"$lock" + flock -w 1800 9 || { echo "Failed to acquire lock for $squash" >&2; exit 1; } + if unsquashfs -l "$squash" > /dev/null 2>&1; then + echo "Squash file already exists and is valid, skipping import: $squash" + else + rm -f "$squash" "$squash".tmp.* + enroot import -o "${squash}.tmp.$$" "docker://$image" + mv -f "${squash}.tmp.$$" "$squash" + fi + ) || exit 1 +} + +import_squash "$SQUASH_FILE" "$IMAGE" +import_squash "$NGINX_SQUASH_FILE" "$NGINX_IMAGE" export EVAL_ONLY="${EVAL_ONLY:-false}" @@ -202,7 +229,7 @@ SRT_REPO_DIR="srt-slurm" # cross-mounted to compute nodes. Put the srt-slurm workspace and staged # InferenceX checkout on a writable shared-FS path that compute can see. # Per-run-unique paths avoid races between parallel sweep jobs. -if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then +if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then SHARED_BASE="" for cand in \ /mnt/lustre01/users-public/sa-shared/gha-runs \ @@ -269,6 +296,12 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" ]]; then echo "Unsupported minimaxm2.5 precision for GB200 dynamo-vllm: $PRECISION" >&2 exit 1 fi +elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || exit 1 + cd "$SRT_REPO_DIR" || exit 1 + git checkout main || exit 1 + mkdir -p recipes/vllm/minimax-m3-gb200-fp8 || exit 1 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8" recipes/vllm/minimax-m3-gb200-fp8 || exit 1 elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" @@ -292,7 +325,7 @@ source $HOME/.local/bin/env # under a head-node-only path, .venv/bin/python3 becomes a broken # symlink on compute. 
Pin the venv to /usr/bin/python3 — a system # path that exists at the same location on both head and compute. -if [[ $MODEL_PREFIX == "minimaxm2.5" && -x /usr/bin/python3 ]]; then +if [[ ( $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ) && -x /usr/bin/python3 ]]; then uv venv --seed --python /usr/bin/python3 else uv venv --seed @@ -312,7 +345,7 @@ SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" # Minimax on watchtower: SRT_REPO_DIR was moved to a shared-FS path # above so srtctl's outputs/ directory (which lives under # SRTCTL_ROOT) is visible to compute nodes. -if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then +if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then SRTCTL_ROOT="$SRT_REPO_DIR" fi echo "Creating srtslurm.yaml configuration..." @@ -354,7 +387,7 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" # can't see. Stage the relevant subset to shared FS and repoint # INFMAX_WORKSPACE there. rsync excludes the srt-slurm clone (already # on shared FS) and .git (not needed in container) for speed. -if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then +if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then SHARED_INFMAX_WORKSPACE="${SHARED_BASE}/infmax-workspace-${RUN_KEY}" mkdir -p "$SHARED_INFMAX_WORKSPACE" || exit 1 rsync -a --delete \ @@ -379,6 +412,7 @@ if [[ ! 
-f "$CONFIG_PATH" ]]; then fi sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_PATH" + if [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_PATH" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1) else From dbf5135c0299f26b19ff814519651f17efdc68e8 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 12 Jun 2026 18:06:32 -0700 Subject: [PATCH 02/33] chore: update perf-changelog pr-link to #1734 --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 647121c12..e1d38dd9f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3657,4 +3657,4 @@ - "6 topologies: TP4 (1n), TP8 (2n), TP4+EP4 (1n), 1P+1D disagg TP4+EP4 (2n), DEP4 (1n), DEP8 (2n)" - "Concurrency sweep: TP 4-64, TEP 128-512, disagg 64-512, DEP4 256-1024, DEP8 512-2048" - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks" - pr-link: TBD + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 From ed63c1e042078379d6f555d573528c82e7559623 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 12 Jun 2026 20:52:36 -0700 Subject: [PATCH 03/33] feat: switch GB200 M3 to ai-dynamo vllm-runtime 1.3.0 image MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adopt the NVIDIA Dynamo vLLM runtime image (nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1), the canonical M3 runtime from ai-dynamo/dynamo release/1.3.0-minimax-m3-dev.1. 
Changes mirrored from that release's recipes/minimax-m3/vllm/disagg/MXFP8/deploy.yaml: - dynamo.install: false — the runtime image bundles dynamo 1.3.0, so the prior 1.2.0 wheel install is dropped (srtctl defaults install=true) - attention-backend: FLASH_ATTN on every prefill/decode/agg engine Benchmark-specific knobs kept over the reference's serving defaults: language-model-only (text-only), no-enable-prefix-caching (random data), scenario-trimmed max-model-len. --- .github/configs/nvidia-master.yaml | 10 ++++++---- .../minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml | 6 +++--- .../minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml | 6 +++--- .../minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml | 6 +++--- .../minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml | 6 +++--- .../minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml | 6 +++--- .../1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml | 7 ++++--- .../minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml | 6 +++--- .../minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml | 6 +++--- .../minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml | 6 +++--- .../8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml | 7 ++++--- 11 files changed, 38 insertions(+), 34 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c11f6505b..d1926f30f 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11682,12 +11682,14 @@ minimaxm2.5-fp8-gb300-dynamo-vllm: # MiniMax-M3 (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX -# tensor cores on Blackwell. M3 has not shipped in a stable vLLM release; -# vllm/vllm-openai:minimax-m3 is the dedicated multi-arch (arm64+amd64) -# image built from the m3_release branch (vllm-project/vllm#45381). +# tensor cores on Blackwell. 
Image is the NVIDIA Dynamo vLLM runtime +# (nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1) with +# dynamo 1.3.0 pre-installed, so recipes set dynamo.install=false. +# Engine args mirror the canonical recipe (ai-dynamo/dynamo +# recipes/minimax-m3/vllm/disagg/MXFP8): block-size 128 + FLASH_ATTN. # GB200 full sweep: TP/TP8 (low conc), TEP (mid), disagg (mid), DEP (high). minimaxm3-fp8-gb200-dynamo-vllm: - image: vllm/vllm-openai:minimax-m3 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: gb200 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml index a95d2df41..3b328ea28 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml @@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-dep4-1n-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: - install: true - wheel: "1.2.0.dev20260526" + install: false slurm: time_limit: "8:00:00" @@ -60,6 +59,7 @@ backend: max-num-batched-tokens: 2048 max-cudagraph-capture-size: 512 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml index ab231e733..81b000039 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml +++ 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml @@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-dep8-2n-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: - install: true - wheel: "1.2.0.dev20260526" + install: false slurm: time_limit: "8:00:00" @@ -60,6 +59,7 @@ backend: max-num-batched-tokens: 2048 max-cudagraph-capture-size: 512 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml index ce431c3c0..f7684fe8d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml @@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-tp4-1n-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: - install: true - wheel: "1.2.0.dev20260526" + install: false slurm: time_limit: "8:00:00" @@ -57,6 +56,7 @@ backend: max-num-batched-tokens: 2048 max-cudagraph-capture-size: 64 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml index 29efa7ecc..1fc4a3d98 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml +++ 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml @@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: - install: true - wheel: "1.2.0.dev20260526" + install: false slurm: time_limit: "8:00:00" @@ -58,6 +57,7 @@ backend: max-num-batched-tokens: 2048 max-cudagraph-capture-size: 512 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml index 29a5934bd..65e85f441 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml @@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-tp8-2n-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: - install: true - wheel: "1.2.0.dev20260526" + install: false slurm: time_limit: "8:00:00" @@ -57,6 +56,7 @@ backend: max-num-batched-tokens: 2048 max-cudagraph-capture-size: 64 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml index 17769abf3..90ec1d007 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml @@ -6,12 +6,11 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: - install: true - wheel: "1.2.0.dev20260526" + install: false slurm: time_limit: "8:00:00" @@ -58,6 +57,7 @@ backend: max-num-seqs: 16 max-num-batched-tokens: 16384 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" @@ -75,6 +75,7 @@ backend: max-num-batched-tokens: 256 max-cudagraph-capture-size: 512 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml index db729764a..c3f50da69 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml @@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-dep8-2n-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: - install: true - wheel: "1.2.0.dev20260526" + install: false slurm: time_limit: "8:00:00" @@ -60,6 +59,7 @@ backend: max-num-batched-tokens: 16384 max-cudagraph-capture-size: 128 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml index 8c7ecbe17..444f1e1df 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml @@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-tp4-1n-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: - install: true - wheel: "1.2.0.dev20260526" + install: false slurm: time_limit: "8:00:00" @@ -57,6 +56,7 @@ backend: max-num-batched-tokens: 16384 max-cudagraph-capture-size: 64 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml index 3e146af8b..ca8ea7e48 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml @@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: - install: true - wheel: "1.2.0.dev20260526" + install: false slurm: time_limit: "8:00:00" @@ -58,6 +57,7 @@ backend: max-num-batched-tokens: 16384 max-cudagraph-capture-size: 256 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: 
"prefetch" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml index 54980f7d3..6a13b50d1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml @@ -6,12 +6,11 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: - install: true - wheel: "1.2.0.dev20260526" + install: false slurm: time_limit: "8:00:00" @@ -58,6 +57,7 @@ backend: max-num-seqs: 16 max-num-batched-tokens: 16384 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" @@ -75,6 +75,7 @@ backend: max-num-batched-tokens: 256 max-cudagraph-capture-size: 512 block-size: 128 + attention-backend: FLASH_ATTN language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" From 8738f42aabdbffb4235c5b5bc89c359c59ff26c4 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 12 Jun 2026 21:11:07 -0700 Subject: [PATCH 04/33] fix: use enroot registry syntax (nvcr.io#) for GB200 M3 image enroot's docker:// URI needs `#` to separate the registry host from the image path; `nvcr.io/...` was parsed as a Docker Hub repo and 401'd against registry-1.docker.io. Matches the existing nvcr.io# convention in nvidia-master.yaml. Recipe container fields kept byte-identical to the master image: field (srtslurm.yaml maps "${IMAGE}" -> squashfile). 
--- .github/configs/nvidia-master.yaml | 4 ++-- .../vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml | 2 +- .../vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml | 2 +- .../vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml | 2 +- .../vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml | 2 +- .../vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml | 2 +- .../1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml | 2 +- .../vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml | 2 +- .../vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml | 2 +- .../vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml | 2 +- .../8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml | 2 +- 11 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 1d24e2857..9e3977232 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11777,13 +11777,13 @@ minimaxm2.5-fp8-gb300-dynamo-vllm: # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX # tensor cores on Blackwell. Image is the NVIDIA Dynamo vLLM runtime -# (nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1) with +# (nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1) with # dynamo 1.3.0 pre-installed, so recipes set dynamo.install=false. # Engine args mirror the canonical recipe (ai-dynamo/dynamo # recipes/minimax-m3/vllm/disagg/MXFP8): block-size 128 + FLASH_ATTN. # GB200 full sweep: TP/TP8 (low conc), TEP (mid), disagg (mid), DEP (high). 
minimaxm3-fp8-gb200-dynamo-vllm: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1 + image: nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: gb200 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml index 3b328ea28..921f99b8e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml @@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-dep4-1n-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml index 81b000039..50eb3ff64 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml @@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-dep8-2n-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml index f7684fe8d..6115d210c 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml @@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-tp4-1n-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml index 1fc4a3d98..94df4c8ec 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml @@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml index 65e85f441..1ac2612bd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml @@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-tp8-2n-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml index 90ec1d007..4f9c01c6b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml @@ -6,7 +6,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml index c3f50da69..adb36f646 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml @@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-dep8-2n-8k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml index 444f1e1df..8cfbcb616 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml @@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-tp4-1n-8k1k" model: path: "minimax-m3-mxfp8" - container: 
"nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml index ca8ea7e48..1567ca57c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml @@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-8k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml index 6a13b50d1..86d48468a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml @@ -6,7 +6,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-8k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" precision: "fp8" dynamo: From 3415fb4e6a815393fd6c8ba12210bc9cd2f5074d Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 13 Jun 2026 13:44:48 -0700 Subject: [PATCH 05/33] feat: convert MiniMax-M3 GB200 sweep to fully disaggregated inference Replace the mostly-aggregated GB200 sweep (5 agg + 1 disagg) with a fully disaggregated 
sweep that splits prefill/decode over NixlConnector, mirroring the minimaxm2.5-fp8-gb200 reference. Every worker = one 4-GPU node since the 444 GB MXFP8 checkpoint can't fit in fewer. Topologies (1k1k): 1P1D TP4 (low-lat), 1P1D TP4+EP4 (mid), 1P2D TP4+EP4 (decode-scaled), 2P1D TP4+EP4 (prefill-scaled), 1P1D DEP4 (max-tput), spanning conc 4-2048. - add 4 disagg recipes; remove 8 orphaned agg recipes (1k1k + 8k1k) - rewire nvidia-master.yaml search-space to the 5 disagg entries - perf-changelog: describe disagg sweep; fix stale Image line (vllm/vllm-openai:minimax-m3 -> nvcr.io#.../vllm-runtime:1.3.0-minimax-m3-dev.1) Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/configs/nvidia-master.yaml | 66 ++++++------- .../1k1k/agg-gb200-dep4-1n.yaml | 74 -------------- .../1k1k/agg-gb200-dep8-2n.yaml | 74 -------------- .../1k1k/agg-gb200-tp4-1n.yaml | 71 -------------- .../1k1k/agg-gb200-tp4ep4-1n.yaml | 72 -------------- .../1k1k/agg-gb200-tp8-2n.yaml | 71 -------------- .../1k1k/disagg-gb200-1p1d-dep4-2n.yaml | 96 +++++++++++++++++++ .../1k1k/disagg-gb200-1p1d-tp4-2n.yaml | 89 +++++++++++++++++ .../1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml | 92 ++++++++++++++++++ .../1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml | 92 ++++++++++++++++++ .../8k1k/agg-gb200-dep8-2n.yaml | 74 -------------- .../8k1k/agg-gb200-tp4-1n.yaml | 71 -------------- .../8k1k/agg-gb200-tp4ep4-1n.yaml | 72 -------------- perf-changelog.yaml | 10 +- 14 files changed, 401 insertions(+), 623 deletions(-) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml delete mode 100644 
benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 9e3977232..15aee30c5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11781,7 +11781,10 @@ minimaxm2.5-fp8-gb300-dynamo-vllm: # dynamo 1.3.0 pre-installed, so recipes set dynamo.install=false. # Engine args mirror the canonical recipe (ai-dynamo/dynamo # recipes/minimax-m3/vllm/disagg/MXFP8): block-size 128 + FLASH_ATTN. -# GB200 full sweep: TP/TP8 (low conc), TEP (mid), disagg (mid), DEP (high). +# Fully disaggregated GB200 sweep (NixlConnector prefill/decode split, +# every worker = one 4-GPU node since the 444 GB checkpoint can't fit in +# fewer): 1P1D TP4 (low conc), 1P1D TP4+EP4 (mid), 1P2D / 2P1D TP4+EP4 +# (decode- and prefill-scaled), 1P1D DEP4 (max throughput). 
minimaxm3-fp8-gb200-dynamo-vllm: image: nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1 model: MiniMaxAI/MiniMax-M3-MXFP8 @@ -11796,7 +11799,7 @@ minimaxm3-fp8-gb200-dynamo-vllm: - isl: 1024 osl: 1024 search-space: - # Low latency: TP=4 aggregated, 1 node (4 GPU). + # Low latency: 1P+1D disagg TP4 (pure TP, no EP), 2 nodes (4 GPU each). - conc-list: [4, 8, 16, 32, 64] prefill: num-worker: 1 @@ -11804,86 +11807,71 @@ minimaxm3-fp8-gb200-dynamo-vllm: ep: 1 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml" decode: - num-worker: 0 + num-worker: 1 tp: 4 ep: 1 dp-attn: false - # Low latency: TP=8 aggregated, 2 nodes (8 GPU). - - conc-list: [4, 8, 16, 32, 64] + # Mid curve: 1P+1D disagg TP4+EP4, 2 nodes (4 GPU each). + - conc-list: [64, 128, 256, 512] prefill: num-worker: 1 - tp: 8 - ep: 1 + tp: 4 + ep: 4 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml" decode: - num-worker: 0 - tp: 8 - ep: 1 + num-worker: 1 + tp: 4 + ep: 4 dp-attn: false - # Mid curve: TP4+EP4 aggregated, 1 node (4 GPU). - - conc-list: [128, 256, 512] + # Decode-scaled: 1P+2D disagg TP4+EP4, 3 nodes (4 GPU each). + - conc-list: [256, 512, 1024] prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml" decode: - num-worker: 0 + num-worker: 2 tp: 4 ep: 4 dp-attn: false - # Mid curve: 1P+1D disagg TP4+EP4, 2 nodes (4 GPU each). - - conc-list: [64, 128, 256, 512] + # Prefill-scaled: 2P+1D disagg TP4+EP4, 3 nodes (4 GPU each). 
+ - conc-list: [256, 512, 1024] prefill: - num-worker: 1 + num-worker: 2 tp: 4 ep: 4 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml" decode: num-worker: 1 tp: 4 ep: 4 dp-attn: false - # High throughput: DEP=4 aggregated, 1 node (4 GPU). - - conc-list: [256, 512, 1024] + # Max throughput: 1P+1D disagg DEP4 (DP4+EP), 2 nodes (4 GPU each). + - conc-list: [512, 1024, 2048] prefill: num-worker: 1 tp: 1 ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml" decode: - num-worker: 0 - tp: 1 - ep: 4 - dp-attn: true - - # Max throughput: DEP=8 aggregated, 2 nodes (8 GPU). - - conc-list: [512, 1024, 2048] - prefill: num-worker: 1 tp: 1 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml" - decode: - num-worker: 0 - tp: 1 - ep: 8 + ep: 4 dp-attn: true # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml deleted file mode 100644 index 921f99b8e..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml +++ /dev/null @@ -1,74 +0,0 @@ -name: "minimax-m3-vllm-agg-gb200-dep4-1n-1k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. 
model.path uses a staged-model alias — srtctl resolves it via -# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. - -model: - path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" - precision: "fp8" - -dynamo: - install: false - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - agg_nodes: 1 - agg_workers: 1 - gpus_per_agg: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - aggregated: - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 2304 - max-num-batched-tokens: 2048 - max-cudagraph-capture-size: 512 - block-size: 128 - attention-backend: FLASH_ATTN - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "256x512x1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml deleted file mode 100644 index 50eb3ff64..000000000 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml +++ /dev/null @@ -1,74 +0,0 @@ -name: "minimax-m3-vllm-agg-gb200-dep8-2n-1k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. model.path uses a staged-model alias — srtctl resolves it via -# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. - -model: - path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" - precision: "fp8" - -dynamo: - install: false - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. 
-health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - agg_nodes: 2 - agg_workers: 1 - gpus_per_agg: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - aggregated: - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 2304 - max-num-batched-tokens: 2048 - max-cudagraph-capture-size: 512 - block-size: 128 - attention-backend: FLASH_ATTN - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512x1024x2048" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml deleted file mode 100644 index 6115d210c..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml +++ /dev/null @@ -1,71 +0,0 @@ -name: "minimax-m3-vllm-agg-gb200-tp4-1n-1k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. model.path uses a staged-model alias — srtctl resolves it via -# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. 
max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. - -model: - path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" - precision: "fp8" - -dynamo: - install: false - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - agg_nodes: 1 - agg_workers: 1 - gpus_per_agg: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - aggregated: - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - max-model-len: 2304 - max-num-batched-tokens: 2048 - max-cudagraph-capture-size: 64 - block-size: 128 - attention-backend: FLASH_ATTN - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml deleted file mode 100644 index 94df4c8ec..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml +++ /dev/null @@ -1,72 +0,0 @@ -name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-1k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). 
Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. model.path uses a staged-model alias — srtctl resolves it via -# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. - -model: - path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" - precision: "fp8" - -dynamo: - install: false - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - agg_nodes: 1 - agg_workers: 1 - gpus_per_agg: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - aggregated: - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - max-model-len: 2304 - max-num-batched-tokens: 2048 - max-cudagraph-capture-size: 512 - block-size: 128 - attention-backend: FLASH_ATTN - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml 
deleted file mode 100644 index 1ac2612bd..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml +++ /dev/null @@ -1,71 +0,0 @@ -name: "minimax-m3-vllm-agg-gb200-tp8-2n-1k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. model.path uses a staged-model alias — srtctl resolves it via -# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. - -model: - path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" - precision: "fp8" - -dynamo: - install: false - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. 
-health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - agg_nodes: 2 - agg_workers: 1 - gpus_per_agg: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - aggregated: - tensor-parallel-size: 8 - pipeline-parallel-size: 1 - max-model-len: 2304 - max-num-batched-tokens: 2048 - max-cudagraph-capture-size: 64 - block-size: 128 - attention-backend: FLASH_ATTN - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml new file mode 100644 index 000000000..0749dbc86 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml @@ -0,0 +1,96 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-1k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (max-throughput curve). +# Prefill (DEP4 = DP4 + expert-parallel, 1 node) → NixlConnector → Decode +# (DEP4, 1 node) = 2 nodes. Data-parallel attention + EP maximizes decode +# token throughput at high concurrency; engine shape mirrors the proven +# agg-gb200-dep4-1n recipe. --block-size 128 is mandatory (MSA sparse/index +# cache alignment). 
+ +model: + path: "minimax-m3-mxfp8" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + precision: "fp8" + +dynamo: + install: false + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASH_ATTN + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 2304 + max-num-batched-tokens: 2048 + max-cudagraph-capture-size: 512 + block-size: 128 + attention-backend: FLASH_ATTN + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x1024x2048" diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml new file mode 100644 index 000000000..927066e42 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml @@ -0,0 +1,89 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-1k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency curve). +# Prefill (TP4, 1 node) → NixlConnector → Decode (TP4, 1 node). Pure TP, +# no expert parallel: lowest TTFT/ITL for small concurrencies. +# --block-size 128 is mandatory (MSA sparse/index cache alignment). + +model: + path: "minimax-m3-mxfp8" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + precision: "fp8" + +dynamo: + install: false + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASH_ATTN + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: 
'{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 2304 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 512 + block-size: 128 + attention-backend: FLASH_ATTN + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml new file mode 100644 index 000000000..fbb99a3dd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml @@ -0,0 +1,92 @@ +name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4ep4-1k1k" + +# MiniMax-M3 disaggregated 1P+2D recipe for GB200 (decode-scaled). +# Prefill (TP4+EP4, 1 node) → NixlConnector → 2× Decode (TP4+EP4, 1 node +# each) = 3 nodes. Two decode workers absorb more in-flight sequences for +# mid/high concurrencies while a single prefill keeps TTFT low. +# --block-size 128 is mandatory (MSA sparse/index cache alignment). 
+ +model: + path: "minimax-m3-mxfp8" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + precision: "fp8" + +dynamo: + install: false + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASH_ATTN + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 2304 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 512 + block-size: 128 + attention-backend: FLASH_ATTN + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "256x512x1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml new file mode 100644 index 000000000..fb27934cb --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml @@ -0,0 +1,92 @@ +name: "minimax-m3-vllm-disagg-gb200-2p1d-tp4ep4-1k1k" + +# MiniMax-M3 disaggregated 2P+1D recipe for GB200 (prefill-scaled). +# 2× Prefill (TP4+EP4, 1 node each) → NixlConnector → Decode (TP4+EP4, +# 1 node) = 3 nodes. Two prefill workers sustain the prompt-ingest rate at +# mid/high concurrencies without starving a single decode worker. +# --block-size 128 is mandatory (MSA sparse/index cache alignment). + +model: + path: "minimax-m3-mxfp8" + container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + precision: "fp8" + +dynamo: + install: false + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASH_ATTN + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + 
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 2304 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 512 + block-size: 128 + attention-backend: FLASH_ATTN + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "256x512x1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml deleted file mode 100644 index adb36f646..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml +++ /dev/null @@ -1,74 +0,0 @@ -name: "minimax-m3-vllm-agg-gb200-dep8-2n-8k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. model.path uses a staged-model alias — srtctl resolves it via -# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. 
- -model: - path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" - precision: "fp8" - -dynamo: - install: false - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - agg_nodes: 2 - agg_workers: 1 - gpus_per_agg: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - aggregated: - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 9472 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 128 - block-size: 128 - attention-backend: FLASH_ATTN - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml deleted file mode 100644 index 8cfbcb616..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml +++ /dev/null @@ -1,71 +0,0 @@ -name: "minimax-m3-vllm-agg-gb200-tp4-1n-8k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. 
model.path uses a staged-model alias — srtctl resolves it via -# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. - -model: - path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" - precision: "fp8" - -dynamo: - install: false - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - agg_nodes: 1 - agg_workers: 1 - gpus_per_agg: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - aggregated: - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - max-model-len: 9472 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 64 - block-size: 128 - attention-backend: FLASH_ATTN - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml deleted file mode 100644 index 1567ca57c..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml +++ /dev/null @@ -1,72 +0,0 @@ -name: 
"minimax-m3-vllm-agg-gb200-tp4ep4-1n-8k1k" - -# MiniMax-M3 day-zero aggregated recipe -# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on -# day zero: NixlConnector KV transfer has never been exercised against -# M3's MSA index cache, so disagg shapes are deferred until agg -# baselines exist. model.path uses a staged-model alias — srtctl resolves it via -# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8. -# --block-size 128 is mandatory (MSA sparse/index cache alignment); the -# benchmark is text-only, so language-model-only skips the vision -# encoder. max-model-len / cudagraph capture / batched tokens are -# trimmed to the fixed-seq-len scenario instead of the 1M default. - -model: - path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" - precision: "fp8" - -dynamo: - install: false - -slurm: - time_limit: "8:00:00" - -# 444 GB of weights off shared FS (cold HF cache on the first run): -# allow up to 2 h for engine readiness. 
-health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - agg_nodes: 1 - agg_workers: 1 - gpus_per_agg: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - aggregated_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - - vllm_config: - aggregated: - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - max-model-len: 9472 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 256 - block-size: 128 - attention-backend: FLASH_ATTN - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "128x256" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 8ab05189e..5327dbd02 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3652,12 +3652,12 @@ - config-keys: - minimaxm3-fp8-gb200-dynamo-vllm description: - - "Initial submission: MiniMax-M3 MXFP8 day-zero vLLM sweep for GB200 via Dynamo" + - "Initial submission: MiniMax-M3 MXFP8 fully-disaggregated vLLM sweep for GB200 via Dynamo" - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, MSA sparse attention, ~444 GB MXFP8 checkpoint)" - - "Image: vllm/vllm-openai:minimax-m3 (multi-arch arm64+amd64 from m3_release branch, vllm-project/vllm#45381)" - - "Dynamo orchestration with NixlConnector for disaggregated prefill/decode" - - "6 topologies: TP4 (1n), TP8 (2n), TP4+EP4 (1n), 1P+1D disagg TP4+EP4 (2n), DEP4 (1n), DEP8 (2n)" - - "Concurrency sweep: TP 4-64, TEP 128-512, disagg 64-512, DEP4 256-1024, DEP8 512-2048" + - "Image: nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1 (Dynamo 1.3.0 pre-installed; dynamo.install=false)" + - "Disaggregated prefill/decode over NixlConnector; every worker = 
one 4-GPU node (444 GB checkpoint can't fit in fewer)" + - "5 disagg topologies: 1P1D TP4 (2n), 1P1D TP4+EP4 (2n), 1P2D TP4+EP4 (3n), 2P1D TP4+EP4 (3n), 1P1D DEP4 (2n)" + - "Concurrency sweep: TP4 4-64, TP4EP4 64-512, 1P2D/2P1D 256-1024, DEP4 512-2048" - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 From 803cd20f243bb841b2013364af932e6aa9690850 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 13 Jun 2026 14:19:36 -0700 Subject: [PATCH 06/33] fix: restore NIXL-bearing image for M3 GB200 disagg + enable MNNVL KV transfer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run 27478698552 failed: every disagg worker crashed at NixlConnector init with "NIXL is not available" (RuntimeError, vllm .../nixl/worker.py:248). The ai-dynamo vllm-runtime:1.3.0-minimax-m3-dev.1 image ships dynamo but NOT the nixl bindings (cupy missing too), so kv_connector=NixlConnector cannot initialize and the engine core never becomes healthy. 
Revert to the pre-ed63c1e0 runtime path that pulls NIXL in via the dynamo wheel (same as the working minimaxm2.5-gb200 disagg recipes): - image/container: vllm/vllm-openai:minimax-m3 (the m3_release build all other m3 entries already use) - dynamo.install=true + wheel 1.2.0.dev20260526 (nixl is a dynamo dep) - keep attention-backend FLASH_ATTN (added in the image-switch commit) Also enable NVLink (MNNVL) KV transfer so NIXL doesn't fall back to TCP, mirroring the deepseek-v4 gb200 disagg recipes — on every prefill/decode env block: UCX_TLS=cuda_copy,cuda_ipc,tcp UCX_CUDA_IPC_ENABLE_MNNVL=y UCX_MEMTYPE_CACHE=n / UCX_MEMTYPE_REG_WHOLE=n NCCL_CUMEM_ENABLE=1 (cuMem-allocate buffers so they are IPC-exportable) Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/configs/nvidia-master.yaml | 18 +++++++++------- .../1k1k/disagg-gb200-1p1d-dep4-2n.yaml | 21 +++++++++++++++++-- .../1k1k/disagg-gb200-1p1d-tp4-2n.yaml | 21 +++++++++++++++++-- .../1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml | 21 +++++++++++++++++-- .../1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml | 21 +++++++++++++++++-- .../1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml | 21 +++++++++++++++++-- .../8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml | 21 +++++++++++++++++-- perf-changelog.yaml | 4 ++-- 8 files changed, 126 insertions(+), 22 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index b0b99d53f..f246f518a 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11776,17 +11776,19 @@ minimaxm2.5-fp8-gb300-dynamo-vllm: # MiniMax-M3 (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX -# tensor cores on Blackwell. Image is the NVIDIA Dynamo vLLM runtime -# (nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1) with -# dynamo 1.3.0 pre-installed, so recipes set dynamo.install=false. 
+# tensor cores on Blackwell. Image is the multi-arch m3_release vLLM build +# (vllm/vllm-openai:minimax-m3, vllm-project/vllm#45381); recipes set +# dynamo.install=true + wheel 1.2.0.dev20260526 so the dynamo runtime AND +# NIXL are layered in at job start (the ai-dynamo vllm-runtime dev image +# shipped without NIXL, so disagg workers crashed at NixlConnector init). # Engine args mirror the canonical recipe (ai-dynamo/dynamo # recipes/minimax-m3/vllm/disagg/MXFP8): block-size 128 + FLASH_ATTN. -# Fully disaggregated GB200 sweep (NixlConnector prefill/decode split, -# every worker = one 4-GPU node since the 444 GB checkpoint can't fit in -# fewer): 1P1D TP4 (low conc), 1P1D TP4+EP4 (mid), 1P2D / 2P1D TP4+EP4 -# (decode- and prefill-scaled), 1P1D DEP4 (max throughput). +# Fully disaggregated GB200 sweep (NixlConnector prefill/decode split over +# the NVL72 NVLink fabric; every worker = one 4-GPU node since the 444 GB +# checkpoint can't fit in fewer): 1P1D TP4 (low conc), 1P1D TP4+EP4 (mid), +# 1P2D / 2P1D TP4+EP4 (decode- and prefill-scaled), 1P1D DEP4 (max tput). 
minimaxm3-fp8-gb200-dynamo-vllm: - image: nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1 + image: vllm/vllm-openai:minimax-m3 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: gb200 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml index 0749dbc86..4b56e9e6f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml @@ -9,11 +9,12 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "vllm/vllm-openai:minimax-m3" precision: "fp8" dynamo: - install: false + install: true + wheel: "1.2.0.dev20260526" slurm: time_limit: "8:00:00" @@ -44,10 +45,26 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
+ UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml index 927066e42..558c5d894 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml @@ -7,11 +7,12 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "vllm/vllm-openai:minimax-m3" precision: "fp8" dynamo: - install: false + install: true + wheel: "1.2.0.dev20260526" slurm: time_limit: "8:00:00" @@ -42,10 +43,26 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
+ UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml index 4f9c01c6b..eeefc68c1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml @@ -6,11 +6,12 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "vllm/vllm-openai:minimax-m3" precision: "fp8" dynamo: - install: false + install: true + wheel: "1.2.0.dev20260526" slurm: time_limit: "8:00:00" @@ -41,10 +42,26 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
+ UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml index fbb99a3dd..02d9bd98e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml @@ -8,11 +8,12 @@ name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4ep4-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "vllm/vllm-openai:minimax-m3" precision: "fp8" dynamo: - install: false + install: true + wheel: "1.2.0.dev20260526" slurm: time_limit: "8:00:00" @@ -43,10 +44,26 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
+ UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml index fb27934cb..4a440766a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml @@ -8,11 +8,12 @@ name: "minimax-m3-vllm-disagg-gb200-2p1d-tp4ep4-1k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "vllm/vllm-openai:minimax-m3" precision: "fp8" dynamo: - install: false + install: true + wheel: "1.2.0.dev20260526" slurm: time_limit: "8:00:00" @@ -43,10 +44,26 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
+ UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml index 86d48468a..c14b9fb3b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml @@ -6,11 +6,12 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-8k1k" model: path: "minimax-m3-mxfp8" - container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1" + container: "vllm/vllm-openai:minimax-m3" precision: "fp8" dynamo: - install: false + install: true + wheel: "1.2.0.dev20260526" slurm: time_limit: "8:00:00" @@ -41,10 +42,26 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead + # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and + # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
+ UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" vllm_config: prefill: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index be638f5f1..627ed5bb1 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3654,8 +3654,8 @@ description: - "Initial submission: MiniMax-M3 MXFP8 fully-disaggregated vLLM sweep for GB200 via Dynamo" - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, MSA sparse attention, ~444 GB MXFP8 checkpoint)" - - "Image: nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1 (Dynamo 1.3.0 pre-installed; dynamo.install=false)" - - "Disaggregated prefill/decode over NixlConnector; every worker = one 4-GPU node (444 GB checkpoint can't fit in fewer)" + - "Image: vllm/vllm-openai:minimax-m3 (multi-arch m3_release build, vllm-project/vllm#45381); dynamo.install=true + wheel 1.2.0.dev20260526 layers in the dynamo runtime + NIXL" + - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y + NCCL_CUMEM_ENABLE=1, cuda_ipc UCX_TLS); every worker = one 4-GPU node (444 GB checkpoint can't fit in fewer)" - "5 disagg topologies: 1P1D TP4 (2n), 1P1D TP4+EP4 (2n), 1P2D TP4+EP4 (3n), 2P1D TP4+EP4 (3n), 1P1D DEP4 (2n)" - "Concurrency sweep: TP4 4-64, TP4EP4 64-512, 1P2D/2P1D 256-1024, DEP4 512-2048" - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks" From 1320056380a6f095211fbbb016a9fcc57fdbfbb6 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 13 Jun 2026 20:04:13 -0700 Subject: [PATCH 07/33] feat: rack-scale wide-EP (DeepSeek megamoe) M3 GB200 disagg + FLASHINFER The narrow DEP8-max sweep showed no GB200 advantage over B200 because both cap at an 8-GPU NVLink island. 
Exploit NVL72's rack-scale NVLink with wide expert parallelism spanning multiple nodes, mirroring the deepseek-v4 "megamoe" ladder (DEP = data-parallel attention + expert-parallel): - 1P1D TP4 (2n) low-latency, conc 4-64 - 1P1D DEP8 (4n) mid, EP8/16-experts-per-rank, conc 128-512 - 1P1D DEP8->DEP16 (6n) wide decode (EP16), conc 512-2048 - 2P1D DEP8->DEP16 (8n) prefill-scaled, conc 2048-4096 - 4P1D DEP8->DEP16 (12n) max throughput, conc 4096-8192 M3 has 128 routed experts (top-4), so EP8/EP16 shard cleanly. EP16 across 16 GPU / 4 nodes is the regime B200 physically can't reach. Attention: FLASH_ATTN -> FLASHINFER (trtllm-gen) on all GB200 recipes to exploit Blackwell. Requires the :minimax-m3 image rebuilt from m3_release HEAD 022448dd (vllm-project/vllm#45381), which gates trtllm-gen page>=128. Also add GB200 perf/NVLink-KV knobs from the deepseek-v4 reference: numa-bind (Grace) and enable-sleep-mode (cuMem allocator so the KV cache is IPC-exportable over the MNNVL fabric), alongside the existing UCX MNNVL env. Replaces the four narrow EP4 recipes; keeps 1P1D TP4 for low latency. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/configs/nvidia-master.yaml | 95 +++++++++------- ...3n.yaml => disagg-gb200-1p1d-dep8-4n.yaml} | 45 +++++--- ...l => disagg-gb200-1p1d-dep8-dep16-6n.yaml} | 36 +++--- .../1k1k/disagg-gb200-1p1d-tp4-2n.yaml | 17 ++- ...l => disagg-gb200-2p1d-dep8-dep16-8n.yaml} | 42 ++++--- ... 
=> disagg-gb200-4p1d-dep8-dep16-12n.yaml} | 44 ++++--- .../8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml | 107 ------------------ perf-changelog.yaml | 15 +-- 8 files changed, 170 insertions(+), 231 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p2d-tp4ep4-3n.yaml => disagg-gb200-1p1d-dep8-4n.yaml} (69%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p1d-dep4-2n.yaml => disagg-gb200-1p1d-dep8-dep16-6n.yaml} (77%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-2p1d-tp4ep4-3n.yaml => disagg-gb200-2p1d-dep8-dep16-8n.yaml} (74%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p1d-tp4ep4-2n.yaml => disagg-gb200-4p1d-dep8-dep16-12n.yaml} (71%) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f246f518a..70ec293af 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11781,12 +11781,16 @@ minimaxm2.5-fp8-gb300-dynamo-vllm: # dynamo.install=true + wheel 1.2.0.dev20260526 so the dynamo runtime AND # NIXL are layered in at job start (the ai-dynamo vllm-runtime dev image # shipped without NIXL, so disagg workers crashed at NixlConnector init). -# Engine args mirror the canonical recipe (ai-dynamo/dynamo -# recipes/minimax-m3/vllm/disagg/MXFP8): block-size 128 + FLASH_ATTN. -# Fully disaggregated GB200 sweep (NixlConnector prefill/decode split over -# the NVL72 NVLink fabric; every worker = one 4-GPU node since the 444 GB -# checkpoint can't fit in fewer): 1P1D TP4 (low conc), 1P1D TP4+EP4 (mid), -# 1P2D / 2P1D TP4+EP4 (decode- and prefill-scaled), 1P1D DEP4 (max tput). 
+# block-size 128 mandatory (MSA index-cache alignment); FLASHINFER +# (trtllm-gen) attention to exploit Blackwell — needs vllm#45381 @ 022448dd +# (m3_release HEAD: gates page>=128 on trtllm-gen GQA), so rebuild the image +# from m3_release before running. Fully disaggregated, rack-scale wide-EP +# GB200 sweep (NixlConnector P/D split over the NVL72 NVLink fabric). Mirrors +# the deepseek-v4 "megamoe" ladder: DEP unit = DP-attn + expert-parallel +# (DEP8 = 8 GPU / 2 nodes, DEP16 = 16 GPU / 4 nodes), with prefill workers +# scaled 1P->4P. EP8/EP16 vs B200's 8-GPU NVLink island is the GB200 edge. +# 1P1D TP4 (low conc), 1P1D DEP8 (mid), 1P1D DEP8->DEP16 (wide decode), +# 2P1D / 4P1D DEP8->DEP16 (prefill-scaled max throughput). M3 = 128 experts. minimaxm3-fp8-gb200-dynamo-vllm: image: vllm/vllm-openai:minimax-m3 model: MiniMaxAI/MiniMax-M3-MXFP8 @@ -11801,7 +11805,8 @@ minimaxm3-fp8-gb200-dynamo-vllm: - isl: 1024 osl: 1024 search-space: - # Low latency: 1P+1D disagg TP4 (pure TP, no EP), 2 nodes (4 GPU each). + # Low latency: 1P+1D TP4 (pure TP, no EP), 2 nodes (4 GPU each). Wide EP + # would idle DP ranks at small concurrencies, so stay narrow here. - conc-list: [4, 8, 16, 32, 64] prefill: num-worker: 1 @@ -11816,64 +11821,68 @@ minimaxm3-fp8-gb200-dynamo-vllm: ep: 1 dp-attn: false - # Mid curve: 1P+1D disagg TP4+EP4, 2 nodes (4 GPU each). - - conc-list: [64, 128, 256, 512] + # Mid curve: 1P+1D DEP8 (DP-attn + EP8), 2 nodes prefill + 2 nodes + # decode = 4 nodes. First rung of rack-scale EP (16 experts/rank). 
+ - conc-list: [128, 256, 512] prefill: num-worker: 1 - tp: 4 - ep: 4 - dp-attn: false + tp: 8 + ep: 8 + dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml" decode: num-worker: 1 - tp: 4 - ep: 4 - dp-attn: false + tp: 8 + ep: 8 + dp-attn: true - # Decode-scaled: 1P+2D disagg TP4+EP4, 3 nodes (4 GPU each). - - conc-list: [256, 512, 1024] + # Wide decode: 1P+1D DEP8 prefill -> DEP16 decode (EP16 across 16 GPU / + # 4 nodes) = 6 nodes. Rack-scale decode throughput over NVL72 NVLink. + - conc-list: [512, 1024, 2048] prefill: num-worker: 1 - tp: 4 - ep: 4 - dp-attn: false + tp: 8 + ep: 8 + dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" decode: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: false + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true - # Prefill-scaled: 2P+1D disagg TP4+EP4, 3 nodes (4 GPU each). - - conc-list: [256, 512, 1024] + # Prefill-scaled: 2P+1D, 2x DEP8 prefill (4 nodes) -> DEP16 decode + # (4 nodes) = 8 nodes. + - conc-list: [2048, 4096] prefill: num-worker: 2 - tp: 4 - ep: 4 - dp-attn: false + tp: 8 + ep: 8 + dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" decode: num-worker: 1 - tp: 4 - ep: 4 - dp-attn: false + tp: 16 + ep: 16 + dp-attn: true - # Max throughput: 1P+1D disagg DEP4 (DP4+EP), 2 nodes (4 GPU each). - - conc-list: [512, 1024, 2048] + # Max throughput: 4P+1D, 4x DEP8 prefill (8 nodes) -> DEP16 decode + # (4 nodes) = 12 nodes within one NVL72 rack. 
+ - conc-list: [4096, 8192] prefill: - num-worker: 1 - tp: 1 - ep: 4 + num-worker: 4 + tp: 8 + ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" decode: num-worker: 1 - tp: 1 - ep: 4 + tp: 16 + ep: 16 dp-attn: true # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml similarity index 69% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml index 02d9bd98e..efc5d5740 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml @@ -1,10 +1,11 @@ -name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4ep4-1k1k" +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-1k1k" -# MiniMax-M3 disaggregated 1P+2D recipe for GB200 (decode-scaled). -# Prefill (TP4+EP4, 1 node) → NixlConnector → 2× Decode (TP4+EP4, 1 node -# each) = 3 nodes. Two decode workers absorb more in-flight sequences for -# mid/high concurrencies while a single prefill keeps TTFT low. -# --block-size 128 is mandatory (MSA sparse/index cache alignment). +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (mid curve, wide EP). +# Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) -> NixlConnector -> +# Decode DEP8 (8 GPU / 2 nodes) = 4 nodes. Rack-scale expert parallel +# over the NVL72 NVLink fabric -- the regime where GB200 pulls ahead of +# B200 (capped at an 8-GPU NVLink island). 
M3 has 128 routed experts so +# EP8 shards 16 experts/rank. FLASHINFER attention, block-size 128. model: path: "minimax-m3-mxfp8" @@ -26,12 +27,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 1 + prefill_nodes: 2 decode_nodes: 2 prefill_workers: 1 - decode_workers: 2 - gpus_per_prefill: 4 - gpus_per_decode: 4 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 frontend: type: dynamo @@ -68,42 +69,50 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 + tensor-parallel-size: 1 pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 enable-expert-parallel: true enforce-eager: true max-model-len: 2304 max-num-seqs: 16 max-num-batched-tokens: 16384 block-size: 128 - attention-backend: FLASH_ATTN + attention-backend: FLASHINFER language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true stream-interval: 32 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 + tensor-parallel-size: 1 pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 enable-expert-parallel: true max-model-len: 2304 - max-num-seqs: 256 - max-num-batched-tokens: 256 + max-num-seqs: 512 + max-num-batched-tokens: 512 max-cudagraph-capture-size: 512 block-size: 128 - attention-backend: FLASH_ATTN + attention-backend: FLASHINFER language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - stream-interval: 32 + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "256x512x1024" + concurrencies: "128x256x512" diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml similarity index 77% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml index 4b56e9e6f..5ca08a06d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml @@ -1,11 +1,10 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-1k1k" +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-1k1k" -# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (max-throughput curve). -# Prefill (DEP4 = DP4 + expert-parallel, 1 node) → NixlConnector → Decode -# (DEP4, 1 node) = 2 nodes. Data-parallel attention + EP maximizes decode -# token throughput at high concurrency; engine shape mirrors the proven -# agg-gb200-dep4-1n recipe. --block-size 128 is mandatory (MSA sparse/index -# cache alignment). +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (wide-decode curve). +# Prefill DEP8 (8 GPU / 2 nodes) -> NixlConnector -> Decode DEP16 (DP-attn +# + EP across 16 GPU / 4 nodes) = 6 nodes. EP16 (8 experts/rank of 128) +# spans the NVL72 fabric to maximize decode token throughput. FLASHINFER +# attention, block-size 128. 
model: path: "minimax-m3-mxfp8" @@ -27,12 +26,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 + prefill_nodes: 2 + decode_nodes: 4 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 + gpus_per_prefill: 8 + gpus_per_decode: 16 frontend: type: dynamo @@ -71,7 +70,7 @@ backend: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 4 + data-parallel-size: 8 data-parallel-rpc-port: 13346 enable-expert-parallel: true enforce-eager: true @@ -79,31 +78,36 @@ backend: max-num-seqs: 16 max-num-batched-tokens: 16384 block-size: 128 - attention-backend: FLASH_ATTN + attention-backend: FLASHINFER language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true stream-interval: 32 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 4 + data-parallel-size: 16 data-parallel-rpc-port: 13345 enable-expert-parallel: true max-model-len: 2304 - max-num-batched-tokens: 2048 + max-num-seqs: 512 + max-num-batched-tokens: 512 max-cudagraph-capture-size: 512 block-size: 128 - attention-backend: FLASH_ATTN + attention-backend: FLASHINFER language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true stream-interval: 128 benchmark: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml index 558c5d894..b60b17515 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml @@ -1,9 +1,10 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-1k1k" # MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency curve). -# Prefill (TP4, 1 node) → NixlConnector → Decode (TP4, 1 node). Pure TP, -# no expert parallel: lowest TTFT/ITL for small concurrencies. -# --block-size 128 is mandatory (MSA sparse/index cache alignment). +# Prefill (TP4, 1 node) -> NixlConnector -> Decode (TP4, 1 node). Pure +# TP, no expert parallel: lowest TTFT/ITL at small concurrencies where +# wide EP would leave DP ranks idle. FLASHINFER (trtllm-gen) attention, +# block-size 128 (MSA index-cache alignment, vllm#45381 @ 022448dd). model: path: "minimax-m3-mxfp8" @@ -74,12 +75,14 @@ backend: max-num-seqs: 16 max-num-batched-tokens: 16384 block-size: 128 - attention-backend: FLASH_ATTN + attention-backend: FLASHINFER language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true stream-interval: 32 decode: @@ -91,13 +94,15 @@ backend: max-num-batched-tokens: 256 max-cudagraph-capture-size: 512 block-size: 128 - attention-backend: FLASH_ATTN + attention-backend: FLASHINFER language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - stream-interval: 32 + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 benchmark: type: "sa-bench" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml similarity index 74% rename from 
benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml index 4a440766a..853095727 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml @@ -1,10 +1,10 @@ -name: "minimax-m3-vllm-disagg-gb200-2p1d-tp4ep4-1k1k" +name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-1k1k" # MiniMax-M3 disaggregated 2P+1D recipe for GB200 (prefill-scaled). -# 2× Prefill (TP4+EP4, 1 node each) → NixlConnector → Decode (TP4+EP4, -# 1 node) = 3 nodes. Two prefill workers sustain the prompt-ingest rate at -# mid/high concurrencies without starving a single decode worker. -# --block-size 128 is mandatory (MSA sparse/index cache alignment). +# 2x Prefill DEP8 (8 GPU / 2 nodes each) -> NixlConnector -> Decode DEP16 +# (16 GPU / 4 nodes) = 8 nodes. Two wide prefill workers sustain prompt +# ingest into a single wide decode at high concurrency. FLASHINFER +# attention, block-size 128. 
model: path: "minimax-m3-mxfp8" @@ -26,12 +26,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 1 + prefill_nodes: 4 + decode_nodes: 4 prefill_workers: 2 decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 + gpus_per_prefill: 8 + gpus_per_decode: 16 frontend: type: dynamo @@ -68,42 +68,50 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 + tensor-parallel-size: 1 pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 enable-expert-parallel: true enforce-eager: true max-model-len: 2304 max-num-seqs: 16 max-num-batched-tokens: 16384 block-size: 128 - attention-backend: FLASH_ATTN + attention-backend: FLASHINFER language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true stream-interval: 32 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 + tensor-parallel-size: 1 pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 enable-expert-parallel: true max-model-len: 2304 - max-num-seqs: 256 - max-num-batched-tokens: 256 + max-num-seqs: 512 + max-num-batched-tokens: 512 max-cudagraph-capture-size: 512 block-size: 128 - attention-backend: FLASH_ATTN + attention-backend: FLASHINFER language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - stream-interval: 32 + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "256x512x1024" + concurrencies: "2048x4096" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml similarity index 71% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml index eeefc68c1..4a6aa5d0f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml @@ -1,8 +1,10 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-1k1k" +name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-1k1k" -# MiniMax-M3 disaggregated 1P+1D recipe for GB200. -# Prefill (TP4+EP4, 1 node) → NixlConnector → Decode (TP4+EP4, 1 node). -# --block-size 128 is mandatory (MSA sparse/index cache alignment). +# MiniMax-M3 disaggregated 4P+1D recipe for GB200 (max throughput). +# 4x Prefill DEP8 (8 GPU / 2 nodes each = 8 nodes) -> NixlConnector -> +# Decode DEP16 (16 GPU / 4 nodes) = 12 nodes within one NVL72 rack. Max +# prefill fan-in for the highest-concurrency points. FLASHINFER attention, +# block-size 128. 
model: path: "minimax-m3-mxfp8" @@ -24,12 +26,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 + prefill_nodes: 8 + decode_nodes: 4 + prefill_workers: 4 decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 + gpus_per_prefill: 8 + gpus_per_decode: 16 frontend: type: dynamo @@ -66,42 +68,50 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 + tensor-parallel-size: 1 pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 enable-expert-parallel: true enforce-eager: true max-model-len: 2304 max-num-seqs: 16 max-num-batched-tokens: 16384 block-size: 128 - attention-backend: FLASH_ATTN + attention-backend: FLASHINFER language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true stream-interval: 32 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 + tensor-parallel-size: 1 pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 enable-expert-parallel: true max-model-len: 2304 - max-num-seqs: 256 - max-num-batched-tokens: 256 + max-num-seqs: 512 + max-num-batched-tokens: 512 max-cudagraph-capture-size: 512 block-size: 128 - attention-backend: FLASH_ATTN + attention-backend: FLASHINFER language-model-only: true gpu-memory-utilization: 0.9 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - stream-interval: 32 + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "64x128x256x512" + concurrencies: "4096x8192" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml deleted file mode 100644 index c14b9fb3b..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml +++ /dev/null @@ -1,107 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-8k1k" - -# MiniMax-M3 disaggregated 1P+1D recipe for GB200, 8k1k. -# Prefill (TP4+EP4, 1 node) → NixlConnector → Decode (TP4+EP4, 1 node). -# --block-size 128 is mandatory (MSA sparse/index cache alignment). - -model: - path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead - # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and - # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead - # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and - # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
- UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 9472 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - block-size: 128 - attention-backend: FLASH_ATTN - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - max-model-len: 9472 - max-num-seqs: 256 - max-num-batched-tokens: 256 - max-cudagraph-capture-size: 512 - block-size: 128 - attention-backend: FLASH_ATTN - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - stream-interval: 32 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "64x128x256" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 627ed5bb1..295a8e694 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3652,13 +3652,14 @@ - config-keys: - minimaxm3-fp8-gb200-dynamo-vllm description: - - "Initial submission: MiniMax-M3 MXFP8 fully-disaggregated vLLM sweep for GB200 via Dynamo" - - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, MSA sparse attention, ~444 GB MXFP8 checkpoint)" - - "Image: vllm/vllm-openai:minimax-m3 (multi-arch m3_release build, vllm-project/vllm#45381); dynamo.install=true + wheel 1.2.0.dev20260526 layers in the dynamo runtime + NIXL" - - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y + 
NCCL_CUMEM_ENABLE=1, cuda_ipc UCX_TLS); every worker = one 4-GPU node (444 GB checkpoint can't fit in fewer)" - - "5 disagg topologies: 1P1D TP4 (2n), 1P1D TP4+EP4 (2n), 1P2D TP4+EP4 (3n), 2P1D TP4+EP4 (3n), 1P1D DEP4 (2n)" - - "Concurrency sweep: TP4 4-64, TP4EP4 64-512, 1P2D/2P1D 256-1024, DEP4 512-2048" - - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks" + - "Initial submission: MiniMax-M3 MXFP8 disaggregated rack-scale wide-EP vLLM sweep for GB200 via Dynamo" + - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, 128 routed experts top-4, MSA sparse attention, ~444 GB MXFP8 checkpoint)" + - "Image: vllm/vllm-openai:minimax-m3, rebuilt from m3_release HEAD 022448dd (vllm-project/vllm#45381, gates trtllm-gen page>=128); dynamo.install=true + wheel 1.2.0.dev20260526 layers in the dynamo runtime + NIXL" + - "FLASHINFER (trtllm-gen) attention on Blackwell + block-size 128 (MSA index-cache alignment); --language-model-only for text-only benchmarks" + - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)" + - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island" + - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)" + - "Concurrency sweep: TP4 4-64, DEP8 128-512, DEP8->DEP16 512-2048, 2P1D 2048-4096, 4P1D 4096-8192" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 - config-keys: From c8cd5670cc1878c9d9109c8b212c2e02adb7eb98 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 13 Jun 2026 23:00:11 -0700 Subject: [PATCH 08/33] feat: tune 1k1k low-conc latency + add 8k1k sweep for M3 GB200 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1k1k TP4 low-conc tuning: stream-interval 1 (was 128 decode / 32 prefill), cudagraph cap 128 (was 512), conc range extended to 1-64 (was 4-64) to match B200 coverage. 8k1k sweep: 5 disagg recipes mirroring the 1k1k megamoe ladder (TP4, DEP8, DEP8→DEP16, 2P1D, 4P1D) with max-model-len 9472 (74×128 blocks = ISL+OSL+256 headroom). Concurrencies shifted ~4x lower for 8x heavier prefill: TP4 1-16, DEP8 32-128, DEP8→DEP16 128-512, 2P1D 512-1024, 4P1D 1024-2048. Co-Authored-By: Claude Opus 4.6 --- .github/configs/nvidia-master.yaml | 81 ++++++++++++- .../1k1k/disagg-gb200-1p1d-tp4-2n.yaml | 8 +- .../8k1k/disagg-gb200-1p1d-dep8-4n.yaml | 111 ++++++++++++++++++ .../8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml | 111 ++++++++++++++++++ .../8k1k/disagg-gb200-1p1d-tp4-2n.yaml | 106 +++++++++++++++++ .../8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml | 110 +++++++++++++++++ .../disagg-gb200-4p1d-dep8-dep16-12n.yaml | 110 +++++++++++++++++ perf-changelog.yaml | 3 +- 8 files changed, 634 insertions(+), 6 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 70ec293af..32957e282 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11807,7 +11807,7 @@ minimaxm3-fp8-gb200-dynamo-vllm: search-space: # Low 
latency: 1P+1D TP4 (pure TP, no EP), 2 nodes (4 GPU each). Wide EP # would idle DP ranks at small concurrencies, so stay narrow here. - - conc-list: [4, 8, 16, 32, 64] + - conc-list: [1, 2, 4, 8, 16, 32, 64] prefill: num-worker: 1 tp: 4 @@ -11885,6 +11885,85 @@ minimaxm3-fp8-gb200-dynamo-vllm: ep: 16 dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + # Low latency 8k1k: 1P+1D TP4, 2 nodes. stream-interval 1 + cudagraph + # cap 128 for best interactivity at small concurrencies. + - conc-list: [1, 2, 4, 8, 16] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + + # Mid curve 8k1k: 1P+1D DEP8, 4 nodes. + - conc-list: [32, 64, 128] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + + # Wide decode 8k1k: DEP8 prefill -> DEP16 decode, 6 nodes. + - conc-list: [128, 256, 512] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + # Prefill-scaled 8k1k: 2P+1D, 8 nodes. Double prefill absorbs 8k ISL. + - conc-list: [512, 1024] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + # Max throughput 8k1k: 4P+1D, 12 nodes. 
+ - conc-list: [1024, 2048] + prefill: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml index b60b17515..f3e79340a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml @@ -83,7 +83,7 @@ backend: no-enable-prefix-caching: true numa-bind: true enable-sleep-mode: true - stream-interval: 32 + stream-interval: 1 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -92,7 +92,7 @@ backend: max-model-len: 2304 max-num-seqs: 256 max-num-batched-tokens: 256 - max-cudagraph-capture-size: 512 + max-cudagraph-capture-size: 128 block-size: 128 attention-backend: FLASHINFER language-model-only: true @@ -102,10 +102,10 @@ backend: no-enable-prefix-caching: true numa-bind: true enable-sleep-mode: true - stream-interval: 128 + stream-interval: 1 benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "4x8x16x32x64" + concurrencies: "1x2x4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml new file mode 100644 index 000000000..f6f2c7874 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml @@ -0,0 +1,111 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-8k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (mid curve, 8k1k). +# Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) -> NixlConnector -> +# Decode DEP8 (8 GPU / 2 nodes) = 4 nodes. Rack-scale expert parallel +# over the NVL72 NVLink fabric. M3 has 128 routed experts so EP8 shards +# 16 experts/rank. FLASHINFER attention, block-size 128. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + 
gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 9472 + max-num-seqs: 512 + max-num-batched-tokens: 512 + max-cudagraph-capture-size: 512 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32x64x128" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml new file mode 100644 index 000000000..0d7d44843 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml @@ -0,0 +1,111 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-8k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (wide decode, 8k1k). +# Prefill DEP8 (8 GPU / 2 nodes) -> NixlConnector -> Decode DEP16 +# (16 GPU / 4 nodes) = 6 nodes. Rack-scale decode throughput over NVL72 +# NVLink -- EP16 across 4 nodes is the regime B200 can't reach. M3 has +# 128 routed experts: EP16 = 8 experts/rank. FLASHINFER, block-size 128. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 9472 + max-num-seqs: 512 + max-num-batched-tokens: 512 + max-cudagraph-capture-size: 512 
+ block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml new file mode 100644 index 000000000..b0602354c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml @@ -0,0 +1,106 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-8k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, 8k1k). +# Prefill (TP4, 1 node) -> NixlConnector -> Decode (TP4, 1 node). Pure +# TP, no expert parallel: lowest TTFT/ITL at small concurrencies where +# wide EP would leave DP ranks idle. FLASHINFER (trtllm-gen) attention, +# block-size 128 (MSA index-cache alignment, vllm#45381 @ 022448dd). +# Low-conc tuned: stream-interval 1, cudagraph cap 128. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 9472 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 128 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + 
no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml new file mode 100644 index 000000000..6a0765c60 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml @@ -0,0 +1,110 @@ +name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-8k1k" + +# MiniMax-M3 disaggregated 2P+1D recipe for GB200 (prefill-scaled, 8k1k). +# 2x Prefill DEP8 (4 nodes) -> NixlConnector -> 1x Decode DEP16 +# (4 nodes) = 8 nodes. Double prefill workers absorb 8k ISL compute; +# rack-scale DEP16 decode across NVL72. FLASHINFER, block-size 128. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 4 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + 
vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 9472 + max-num-seqs: 512 + max-num-batched-tokens: 512 + max-cudagraph-capture-size: 512 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml new file mode 100644 index 000000000..9e4ff3c2b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml @@ -0,0 +1,110 @@ +name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-8k1k" + +# MiniMax-M3 disaggregated 4P+1D recipe for GB200 (max throughput, 8k1k). +# 4x Prefill DEP8 (8 nodes) -> NixlConnector -> 1x Decode DEP16 +# (4 nodes) = 12 nodes within one NVL72 rack. 
Maximises prefill +# bandwidth for 8k ISL; rack-scale DEP16 decode. FLASHINFER, block-size 128. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 8 + decode_nodes: 4 + prefill_workers: 4 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + 
max-model-len: 9472 + max-num-seqs: 512 + max-num-batched-tokens: 512 + max-cudagraph-capture-size: 512 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1024x2048" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 8329ac1da..46ac06a08 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3659,7 +3659,8 @@ - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)" - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island" - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)" - - "Concurrency sweep: TP4 4-64, DEP8 128-512, DEP8->DEP16 512-2048, 2P1D 2048-4096, 4P1D 4096-8192" + - "1k1k concurrency sweep: TP4 1-64 (low-conc latency tuned: stream-interval 1, cudagraph cap 128), DEP8 128-512, DEP8->DEP16 512-2048, 2P1D 2048-4096, 4P1D 4096-8192" + - "8k1k concurrency sweep (same 5 topologies, shifted ~4x lower for 8x heavier prefill): TP4 1-16, DEP8 32-128, DEP8->DEP16 128-512, 2P1D 512-1024, 4P1D 1024-2048; max-model-len 9472 (74*128)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 - config-keys: From b819a7a7dca04ea74b28b81bcd008a9059784144 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sun, 14 Jun 2026 15:33:46 -0700 Subject: [PATCH 09/33] =?UTF-8?q?feat:=20low-conc=20focus=20=E2=80=94=20wi?= =?UTF-8?q?der=20decode=20+=20more=20decode=20workers=20for=20M3=20GB200?= 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Comment out all conc > 64 entries (1k1k DEP8/DEP16/2P1D/4P1D and all 8k1k high-conc) to focus sweep budget on low-concurrency tuning. Add two new 1k1k experiments at conc 1-64 alongside the existing 1P1D TP4 baseline: - 1P2D TP4 (3 nodes): 2 decode workers halve per-worker batch - 1P1D TP4→TP8 (3 nodes): wider decode TP spreads forward pass across 8 GPU over NVL72 All three share the low-conc tuning (stream-interval 1, cudagraph cap 128, FLASHINFER, block-size 128). Co-Authored-By: Claude Opus 4.6 --- .github/configs/nvidia-master.yaml | 264 ++++++++++-------- .../1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml | 105 +++++++ .../1k1k/disagg-gb200-1p2d-tp4-3n.yaml | 104 +++++++ perf-changelog.yaml | 4 +- 4 files changed, 361 insertions(+), 116 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 32957e282..483cf4dcd 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11821,69 +11821,103 @@ minimaxm3-fp8-gb200-dynamo-vllm: ep: 1 dp-attn: false - # Mid curve: 1P+1D DEP8 (DP-attn + EP8), 2 nodes prefill + 2 nodes - # decode = 4 nodes. First rung of rack-scale EP (16 experts/rank). - - conc-list: [128, 256, 512] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - # Wide decode: 1P+1D DEP8 prefill -> DEP16 decode (EP16 across 16 GPU / - # 4 nodes) = 6 nodes. Rack-scale decode throughput over NVL72 NVLink. - - conc-list: [512, 1024, 2048] + # Low latency: 1P+2D TP4, 3 nodes. 
Two decode workers halve + # the per-worker batch, reducing ITL at low concurrency. + - conc-list: [1, 2, 4, 8, 16, 32, 64] prefill: num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true + tp: 4 + ep: 1 + dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml" decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - # Prefill-scaled: 2P+1D, 2x DEP8 prefill (4 nodes) -> DEP16 decode - # (4 nodes) = 8 nodes. - - conc-list: [2048, 4096] - prefill: num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true + tp: 4 + ep: 1 + dp-attn: false - # Max throughput: 4P+1D, 4x DEP8 prefill (8 nodes) -> DEP16 decode - # (4 nodes) = 12 nodes within one NVL72 rack. - - conc-list: [4096, 8192] + # Low latency: 1P+1D TP4 prefill -> TP8 decode (wider decode), + # 3 nodes. Wider decode TP spreads forward pass across 8 GPU. + - conc-list: [1, 2, 4, 8, 16, 32, 64] prefill: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" - decode: num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + + ## --- High-conc entries commented out to focus on low-conc tuning --- + # + ## Mid curve: 1P+1D DEP8 (DP-attn + EP8), 2 nodes prefill + 2 nodes + ## decode = 4 nodes. First rung of rack-scale EP (16 experts/rank). 
+ #- conc-list: [128, 256, 512] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # + ## Wide decode: 1P+1D DEP8 prefill -> DEP16 decode (EP16 across 16 GPU / + ## 4 nodes) = 6 nodes. Rack-scale decode throughput over NVL72 NVLink. + #- conc-list: [512, 1024, 2048] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # + ## Prefill-scaled: 2P+1D, 2x DEP8 prefill (4 nodes) -> DEP16 decode + ## (4 nodes) = 8 nodes. + #- conc-list: [2048, 4096] + # prefill: + # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # + ## Max throughput: 4P+1D, 4x DEP8 prefill (8 nodes) -> DEP16 decode + ## (4 nodes) = 12 nodes within one NVL72 rack. + #- conc-list: [4096, 8192] + # prefill: + # num-worker: 4 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true - isl: 8192 osl: 1024 @@ -11904,65 +11938,67 @@ minimaxm3-fp8-gb200-dynamo-vllm: ep: 1 dp-attn: false - # Mid curve 8k1k: 1P+1D DEP8, 4 nodes. - - conc-list: [32, 64, 128] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - # Wide decode 8k1k: DEP8 prefill -> DEP16 decode, 6 nodes. 
- - conc-list: [128, 256, 512] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - # Prefill-scaled 8k1k: 2P+1D, 8 nodes. Double prefill absorbs 8k ISL. - - conc-list: [512, 1024] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - # Max throughput 8k1k: 4P+1D, 12 nodes. - - conc-list: [1024, 2048] - prefill: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true + ## --- 8k1k high-conc entries commented out --- + # + ## Mid curve 8k1k: 1P+1D DEP8, 4 nodes. + #- conc-list: [32, 64, 128] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # + ## Wide decode 8k1k: DEP8 prefill -> DEP16 decode, 6 nodes. + #- conc-list: [128, 256, 512] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # + ## Prefill-scaled 8k1k: 2P+1D, 8 nodes. Double prefill absorbs 8k ISL. 
+ #- conc-list: [512, 1024] + # prefill: + # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # + ## Max throughput 8k1k: 4P+1D, 12 nodes. + #- conc-list: [1024, 2048] + # prefill: + # num-worker: 4 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml new file mode 100644 index 000000000..6923c645e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml @@ -0,0 +1,105 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-1k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, wider +# decode). Prefill TP4 (1 node) -> NixlConnector -> Decode TP8 +# (2 nodes) = 3 nodes. Wider decode TP reduces per-step latency by +# spreading the forward pass across 8 GPU over NVL72 NVLink. +# FLASHINFER, block-size 128. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + max-model-len: 2304 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 128 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + 
no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml new file mode 100644 index 000000000..1d1591198 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml @@ -0,0 +1,104 @@ +name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4-1k1k" + +# MiniMax-M3 disaggregated 1P+2D recipe for GB200 (low-latency, more +# decode workers). Prefill TP4 (1 node) -> NixlConnector -> 2x Decode +# TP4 (2 nodes) = 3 nodes. Two decode workers halve the per-worker +# batch, reducing ITL at low concurrency. FLASHINFER, block-size 128. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + 
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 2304 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 128 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 688493e05..d884e4a2d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3659,8 +3659,8 @@ - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)" - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island" - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)" - - "1k1k concurrency sweep: TP4 1-64 (low-conc latency tuned: stream-interval 1, cudagraph cap 128), DEP8 128-512, DEP8->DEP16 512-2048, 2P1D 2048-4096, 4P1D 4096-8192" - - "8k1k concurrency sweep (same 5 topologies, 
shifted ~4x lower for 8x heavier prefill): TP4 1-16, DEP8 32-128, DEP8->DEP16 128-512, 2P1D 512-1024, 4P1D 1024-2048; max-model-len 9472 (74*128)" + - "Low-conc focus (conc 1-64): 1P1D TP4 (2n baseline), 1P2D TP4 (3n, 2 decode workers), 1P1D TP4->TP8 (3n, wider decode). High-conc entries (DEP8/DEP16/2P1D/4P1D) temporarily commented out." + - "8k1k: 1P1D TP4 (2n) at conc 1-16; high-conc 8k1k entries temporarily commented out" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 - config-keys: From 29eaaeb821e2993a6cf463da462f204408318fc0 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sun, 14 Jun 2026 18:56:30 -0700 Subject: [PATCH 10/33] feat: enable expert-parallel on GB200 TEP8 decode to close B200 low-conc gap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit B200 TEP8 (TP8+EP8) achieves 11.68ms TPOT at conc 1 vs GB200 TP8's 15.29ms — the gap is entirely from expert parallelism splitting 128 MoE experts across 8 ranks. Add enable-expert-parallel: true to the TP8 decode recipe and update nvidia-master.yaml decode ep: 1→8 so result JSON reflects TEP8. Co-Authored-By: Claude Opus 4.6 --- .github/configs/nvidia-master.yaml | 6 +++--- .../1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml | 9 +++++---- perf-changelog.yaml | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 483cf4dcd..ab8060e5d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11837,8 +11837,8 @@ minimaxm3-fp8-gb200-dynamo-vllm: ep: 1 dp-attn: false - # Low latency: 1P+1D TP4 prefill -> TP8 decode (wider decode), - # 3 nodes. Wider decode TP spreads forward pass across 8 GPU. + # Low latency: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8, wider + # decode + expert parallel), 3 nodes. Matches B200 TEP8 topology. 
- conc-list: [1, 2, 4, 8, 16, 32, 64] prefill: num-worker: 1 @@ -11850,7 +11850,7 @@ minimaxm3-fp8-gb200-dynamo-vllm: decode: num-worker: 1 tp: 8 - ep: 1 + ep: 8 dp-attn: false ## --- High-conc entries commented out to focus on low-conc tuning --- diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml index 6923c645e..199699212 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml @@ -1,10 +1,10 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-1k1k" # MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, wider -# decode). Prefill TP4 (1 node) -> NixlConnector -> Decode TP8 -# (2 nodes) = 3 nodes. Wider decode TP reduces per-step latency by -# spreading the forward pass across 8 GPU over NVL72 NVLink. -# FLASHINFER, block-size 128. +# decode). Prefill TP4 (1 node) -> NixlConnector -> Decode TEP8 +# (TP8+EP8, 2 nodes) = 3 nodes. Wider decode TP + expert parallelism +# reduces per-step latency by spreading both attention and MoE across +# 8 GPU over NVL72 NVLink. FLASHINFER, block-size 128. 
model: path: "minimax-m3-mxfp8" @@ -83,6 +83,7 @@ backend: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 8 pipeline-parallel-size: 1 + enable-expert-parallel: true max-model-len: 2304 max-num-seqs: 256 max-num-batched-tokens: 256 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5b03f66eb..2ab37b008 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3659,7 +3659,7 @@ - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)" - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island" - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)" - - "Low-conc focus (conc 1-64): 1P1D TP4 (2n baseline), 1P2D TP4 (3n, 2 decode workers), 1P1D TP4->TP8 (3n, wider decode). High-conc entries (DEP8/DEP16/2P1D/4P1D) temporarily commented out." + - "Low-conc focus (conc 1-64): 1P1D TP4 (2n baseline), 1P2D TP4 (3n, 2 decode workers), 1P1D TP4->TEP8 (3n, decode TP8+EP8 to match B200 TEP8 topology). High-conc entries (DEP8/DEP16/2P1D/4P1D) temporarily commented out." - "8k1k: 1P1D TP4 (2n) at conc 1-16; high-conc 8k1k entries temporarily commented out" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 From 56e61cfd6f54339a11eae9aa315bd757fc5b2b5b Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sun, 14 Jun 2026 23:38:44 -0700 Subject: [PATCH 11/33] feat: add 8k1k TEP8 decode recipe for GB200 to close B200 gap at long ISL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GB200 8k1k only had TP4 (2n) giving 18.50ms TPOT at conc 1 vs B200 TEP8's 11.57ms. 
Add 1P1D TP4→TEP8 (3n) 8k1k recipe mirroring the 1k1k TEP8 config that already closed the gap there (12.34ms vs 11.68ms). Co-Authored-By: Claude Opus 4.6 --- .github/configs/nvidia-master.yaml | 16 +++ .../8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml | 106 ++++++++++++++++++ perf-changelog.yaml | 2 +- 3 files changed, 123 insertions(+), 1 deletion(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c5f5ee67b..6707785ab 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12758,6 +12758,22 @@ minimaxm3-fp8-gb200-dynamo-vllm: ep: 1 dp-attn: false + # Low latency 8k1k: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), + # 3 nodes. Matches B200 TEP8 topology for 8k ISL. + - conc-list: [1, 2, 4, 8, 16] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + ## --- 8k1k high-conc entries commented out --- # ## Mid curve 8k1k: 1P+1D DEP8, 4 nodes. diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml new file mode 100644 index 000000000..165e9d338 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml @@ -0,0 +1,106 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-8k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, 8k1k, +# wider decode). Prefill TP4 (1 node) -> NixlConnector -> Decode TEP8 +# (TP8+EP8, 2 nodes) = 3 nodes. 
Wider decode TP + expert parallelism +# reduces per-step latency by spreading both attention and MoE across +# 8 GPU over NVL72 NVLink. FLASHINFER, block-size 128. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 9472 + max-num-seqs: 256 + max-num-batched-tokens: 256 + 
max-cudagraph-capture-size: 128 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 4876032f7..2bd69870f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3676,7 +3676,7 @@ - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island" - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)" - "Low-conc focus (conc 1-64): 1P1D TP4 (2n baseline), 1P2D TP4 (3n, 2 decode workers), 1P1D TP4->TEP8 (3n, decode TP8+EP8 to match B200 TEP8 topology). High-conc entries (DEP8/DEP16/2P1D/4P1D) temporarily commented out." - - "8k1k: 1P1D TP4 (2n) at conc 1-16; high-conc 8k1k entries temporarily commented out" + - "8k1k: 1P1D TP4 (2n) baseline + 1P1D TP4->TEP8 (3n, decode TP8+EP8) at conc 1-16; high-conc entries temporarily commented out" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 - config-keys: From 4a83c75eb89e244061cd1f0c3ef5dc50587a7c1b Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sun, 14 Jun 2026 23:41:14 -0700 Subject: [PATCH 12/33] refactor: remove unoptimized TP4/1P2D baselines, keep TEP8-only sweep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop 1P1D-TP4 (2n) and 1P2D-TP4 (3n) entries from both 1k1k and 8k1k. TEP8 dominates at every concurrency — TP4 baseline is 50% slower at conc 1 and 1P2D gave <2% TPOT improvement for 50% more GPUs. Active sweep is now TEP8-only: 1k1k conc 1-64, 8k1k conc 1-16. 
Co-Authored-By: Claude Opus 4.6 --- .github/configs/nvidia-master.yaml | 53 ++---------------------------- perf-changelog.yaml | 3 +- 2 files changed, 4 insertions(+), 52 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 6707785ab..0e47ece77 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12625,40 +12625,9 @@ minimaxm3-fp8-gb200-dynamo-vllm: - isl: 1024 osl: 1024 search-space: - # Low latency: 1P+1D TP4 (pure TP, no EP), 2 nodes (4 GPU each). Wide EP - # would idle DP ranks at small concurrencies, so stay narrow here. - - conc-list: [1, 2, 4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml" - decode: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - - # Low latency: 1P+2D TP4, 3 nodes. Two decode workers halve - # the per-worker batch, reducing ITL at low concurrency. - - conc-list: [1, 2, 4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml" - decode: - num-worker: 2 - tp: 4 - ep: 1 - dp-attn: false - - # Low latency: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8, wider - # decode + expert parallel), 3 nodes. Matches B200 TEP8 topology. + # Low latency: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes. + # EP splits 128 MoE experts across 8 decode ranks (16 each), cutting + # per-step latency ~19% vs pure TP8. Matches B200 TEP8 topology. - conc-list: [1, 2, 4, 8, 16, 32, 64] prefill: num-worker: 1 @@ -12742,22 +12711,6 @@ minimaxm3-fp8-gb200-dynamo-vllm: - isl: 8192 osl: 1024 search-space: - # Low latency 8k1k: 1P+1D TP4, 2 nodes. stream-interval 1 + cudagraph - # cap 128 for best interactivity at small concurrencies. 
- - conc-list: [1, 2, 4, 8, 16] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml" - decode: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - # Low latency 8k1k: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), # 3 nodes. Matches B200 TEP8 topology for 8k ISL. - conc-list: [1, 2, 4, 8, 16] diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2bd69870f..abd45e97b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3675,8 +3675,7 @@ - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)" - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island" - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)" - - "Low-conc focus (conc 1-64): 1P1D TP4 (2n baseline), 1P2D TP4 (3n, 2 decode workers), 1P1D TP4->TEP8 (3n, decode TP8+EP8 to match B200 TEP8 topology). High-conc entries (DEP8/DEP16/2P1D/4P1D) temporarily commented out." - - "8k1k: 1P1D TP4 (2n) baseline + 1P1D TP4->TEP8 (3n, decode TP8+EP8) at conc 1-16; high-conc entries temporarily commented out" + - "Optimized low-conc: TEP8-only (1P1D TP4 prefill -> TEP8 decode, 3n). Unoptimized TP4 and 1P2D baselines removed. 1k1k conc 1-64, 8k1k conc 1-16. High-conc entries (DEP8/DEP16/2P1D/4P1D) temporarily commented out." 
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 - config-keys: From 2a237f7d639a58d11a2025f8273f35848ea8a0d6 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sun, 14 Jun 2026 23:45:01 -0700 Subject: [PATCH 13/33] feat: uncomment all high-conc entries for full M3 GB200 sweep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable DEP8 (4n), DEP8→DEP16 (6n), 2P1D (8n), 4P1D (12n) for both 1k1k and 8k1k alongside the optimized TEP8 low-conc configs. Co-Authored-By: Claude Opus 4.6 --- .github/configs/nvidia-master.yaml | 248 ++++++++++++++--------------- perf-changelog.yaml | 2 +- 2 files changed, 123 insertions(+), 127 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 0e47ece77..731d9cfe7 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12642,71 +12642,69 @@ minimaxm3-fp8-gb200-dynamo-vllm: ep: 8 dp-attn: false - ## --- High-conc entries commented out to focus on low-conc tuning --- - # - ## Mid curve: 1P+1D DEP8 (DP-attn + EP8), 2 nodes prefill + 2 nodes - ## decode = 4 nodes. First rung of rack-scale EP (16 experts/rank). - #- conc-list: [128, 256, 512] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # - ## Wide decode: 1P+1D DEP8 prefill -> DEP16 decode (EP16 across 16 GPU / - ## 4 nodes) = 6 nodes. Rack-scale decode throughput over NVL72 NVLink. 
- #- conc-list: [512, 1024, 2048] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # - ## Prefill-scaled: 2P+1D, 2x DEP8 prefill (4 nodes) -> DEP16 decode - ## (4 nodes) = 8 nodes. - #- conc-list: [2048, 4096] - # prefill: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # - ## Max throughput: 4P+1D, 4x DEP8 prefill (8 nodes) -> DEP16 decode - ## (4 nodes) = 12 nodes within one NVL72 rack. - #- conc-list: [4096, 8192] - # prefill: - # num-worker: 4 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true + # Mid curve: 1P+1D DEP8 (DP-attn + EP8), 2 nodes prefill + 2 nodes + # decode = 4 nodes. First rung of rack-scale EP (16 experts/rank). + - conc-list: [128, 256, 512] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + + # Wide decode: 1P+1D DEP8 prefill -> DEP16 decode (EP16 across 16 GPU / + # 4 nodes) = 6 nodes. Rack-scale decode throughput over NVL72 NVLink. 
+ - conc-list: [512, 1024, 2048] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + # Prefill-scaled: 2P+1D, 2x DEP8 prefill (4 nodes) -> DEP16 decode + # (4 nodes) = 8 nodes. + - conc-list: [2048, 4096] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + # Max throughput: 4P+1D, 4x DEP8 prefill (8 nodes) -> DEP16 decode + # (4 nodes) = 12 nodes within one NVL72 rack. + - conc-list: [4096, 8192] + prefill: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true - isl: 8192 osl: 1024 @@ -12727,67 +12725,65 @@ minimaxm3-fp8-gb200-dynamo-vllm: ep: 8 dp-attn: false - ## --- 8k1k high-conc entries commented out --- - # - ## Mid curve 8k1k: 1P+1D DEP8, 4 nodes. - #- conc-list: [32, 64, 128] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # - ## Wide decode 8k1k: DEP8 prefill -> DEP16 decode, 6 nodes. - #- conc-list: [128, 256, 512] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # - ## Prefill-scaled 8k1k: 2P+1D, 8 nodes. Double prefill absorbs 8k ISL. 
- #- conc-list: [512, 1024] - # prefill: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # - ## Max throughput 8k1k: 4P+1D, 12 nodes. - #- conc-list: [1024, 2048] - # prefill: - # num-worker: 4 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true + # Mid curve 8k1k: 1P+1D DEP8, 4 nodes. + - conc-list: [32, 64, 128] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + + # Wide decode 8k1k: DEP8 prefill -> DEP16 decode, 6 nodes. + - conc-list: [128, 256, 512] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + # Prefill-scaled 8k1k: 2P+1D, 8 nodes. Double prefill absorbs 8k ISL. + - conc-list: [512, 1024] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + # Max throughput 8k1k: 4P+1D, 12 nodes. + - conc-list: [1024, 2048] + prefill: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). 
# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint diff --git a/perf-changelog.yaml b/perf-changelog.yaml index abd45e97b..853fbdb7d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3675,7 +3675,7 @@ - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)" - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island" - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)" - - "Optimized low-conc: TEP8-only (1P1D TP4 prefill -> TEP8 decode, 3n). Unoptimized TP4 and 1P2D baselines removed. 1k1k conc 1-64, 8k1k conc 1-16. High-conc entries (DEP8/DEP16/2P1D/4P1D) temporarily commented out." + - "Full sweep enabled: TEP8 (3n) for low-conc + DEP8 (4n), DEP8->DEP16 (6n), 2P1D (8n), 4P1D (12n) for high-conc. Both 1k1k and 8k1k." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 - config-keys: From 5e2c2f948d44426298e8526e5f3d696a0c36bee6 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Mon, 15 Jun 2026 08:59:51 -0700 Subject: [PATCH 14/33] =?UTF-8?q?feat:=20test=201P1D=20TEP4=20decode=20(TP?= =?UTF-8?q?4+EP4,=202n)=20=E2=80=94=20conc=201-32=20only?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test whether EP4 on 4 decode GPUs (2 nodes total) improves TPOT over pure TP4 on GB200's NVL72 NVLink. B200 showed TEP4 slightly worse than TP4 intra-node; NVL72 all-to-all may differ. All other entries commented out for this isolated test. 
Co-Authored-By: Claude Opus 4.6 --- .github/configs/nvidia-master.yaml | 327 +++++++++--------- .../1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml | 106 ++++++ perf-changelog.yaml | 2 +- 3 files changed, 275 insertions(+), 160 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 731d9cfe7..0a8ae64fe 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12625,165 +12625,174 @@ minimaxm3-fp8-gb200-dynamo-vllm: - isl: 1024 osl: 1024 search-space: - # Low latency: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes. - # EP splits 128 MoE experts across 8 decode ranks (16 each), cutting - # per-step latency ~19% vs pure TP8. Matches B200 TEP8 topology. - - conc-list: [1, 2, 4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - - # Mid curve: 1P+1D DEP8 (DP-attn + EP8), 2 nodes prefill + 2 nodes - # decode = 4 nodes. First rung of rack-scale EP (16 experts/rank). - - conc-list: [128, 256, 512] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - # Wide decode: 1P+1D DEP8 prefill -> DEP16 decode (EP16 across 16 GPU / - # 4 nodes) = 6 nodes. Rack-scale decode throughput over NVL72 NVLink. 
- - conc-list: [512, 1024, 2048] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - # Prefill-scaled: 2P+1D, 2x DEP8 prefill (4 nodes) -> DEP16 decode - # (4 nodes) = 8 nodes. - - conc-list: [2048, 4096] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - # Max throughput: 4P+1D, 4x DEP8 prefill (8 nodes) -> DEP16 decode - # (4 nodes) = 12 nodes within one NVL72 rack. - - conc-list: [4096, 8192] - prefill: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - - isl: 8192 - osl: 1024 - search-space: - # Low latency 8k1k: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), - # 3 nodes. Matches B200 TEP8 topology for 8k ISL. - - conc-list: [1, 2, 4, 8, 16] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - - # Mid curve 8k1k: 1P+1D DEP8, 4 nodes. - - conc-list: [32, 64, 128] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - # Wide decode 8k1k: DEP8 prefill -> DEP16 decode, 6 nodes. 
- - conc-list: [128, 256, 512] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - # Prefill-scaled 8k1k: 2P+1D, 8 nodes. Double prefill absorbs 8k ISL. - - conc-list: [512, 1024] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - # Max throughput 8k1k: 4P+1D, 12 nodes. - - conc-list: [1024, 2048] - prefill: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true + # Test: 1P+1D TP4 prefill -> TEP4 decode (TP4+EP4), 2 nodes. + # EP on 4 GPU splits 128 experts into 32/rank — cheaper than TEP8 + # (3 nodes) if the all-to-all overhead is small on NVL72. + - conc-list: [1, 2, 4, 8, 16, 32] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + + ## --- All other entries commented out for TEP4 test run --- + # + ## Low latency: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes. + #- conc-list: [1, 2, 4, 8, 16, 32, 64] + # prefill: + # num-worker: 1 + # tp: 4 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: false + # + ## Mid curve: 1P+1D DEP8 (DP-attn + EP8), 4 nodes. 
+ #- conc-list: [128, 256, 512] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # + ## Wide decode: 1P+1D DEP8 prefill -> DEP16 decode, 6 nodes. + #- conc-list: [512, 1024, 2048] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # + ## Prefill-scaled: 2P+1D, 8 nodes. + #- conc-list: [2048, 4096] + # prefill: + # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # + ## Max throughput: 4P+1D, 12 nodes. 
+ #- conc-list: [4096, 8192] + # prefill: + # num-worker: 4 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # + ## --- 8k1k entries commented out for TEP4 test run --- + # + #- isl: 8192 + # osl: 1024 + # search-space: + # - conc-list: [1, 2, 4, 8, 16] + # prefill: + # num-worker: 1 + # tp: 4 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: false + # + # - conc-list: [32, 64, 128] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # + # - conc-list: [128, 256, 512] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # + # - conc-list: [512, 1024] + # prefill: + # num-worker: 2 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # + # - conc-list: [1024, 2048] + # prefill: + # num-worker: 4 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). 
# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml new file mode 100644 index 000000000..147803c78 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml @@ -0,0 +1,106 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tep4-1k1k" + +# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, decode +# TEP4). Prefill TP4 (1 node) -> NixlConnector -> Decode TEP4 +# (TP4+EP4, 1 node) = 2 nodes. Expert parallelism on decode splits 128 +# MoE experts across 4 ranks (32 each), reducing per-step MoE compute. +# FLASHINFER, block-size 128. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": 
"kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 2304 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 128 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x2x4x8x16x32" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 853fbdb7d..173cba607 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3675,7 +3675,7 @@ - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)" - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island" - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)" - - "Full sweep enabled: TEP8 (3n) for low-conc + DEP8 (4n), DEP8->DEP16 (6n), 2P1D (8n), 4P1D (12n) for high-conc. Both 1k1k and 8k1k." + - "TEP4 decode test: 1P1D TP4 prefill -> TEP4 decode (TP4+EP4, 2n) at conc 1-32. 
All other entries temporarily commented out." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 - config-keys: From 055aa2c6c491f18d5190fa4ad536053c5c9ff417 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Mon, 15 Jun 2026 10:03:37 -0700 Subject: [PATCH 15/33] feat: restore full TEP8 + wide-EP sweep for M3 GB200 disagg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Revert from TEP4 decode test back to the full sweep config: - 1k1k: TEP8 3n (conc 1-64), DEP8 4n, DEP8→DEP16 6n, 2P1D 8n, 4P1D 12n - 8k1k: TEP8 3n (conc 1-16), DEP8 4n, DEP8→DEP16 6n, 2P1D 8n, 4P1D 12n Co-Authored-By: Claude Opus 4.6 --- .github/configs/nvidia-master.yaml | 316 ++++++++++++++--------------- perf-changelog.yaml | 4 +- 2 files changed, 153 insertions(+), 167 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 0a8ae64fe..6ae32670d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12625,171 +12625,157 @@ minimaxm3-fp8-gb200-dynamo-vllm: - isl: 1024 osl: 1024 search-space: - # Test: 1P+1D TP4 prefill -> TEP4 decode (TP4+EP4), 2 nodes. - # EP on 4 GPU splits 128 experts into 32/rank — cheaper than TEP8 - # (3 nodes) if the all-to-all overhead is small on NVL72. - - conc-list: [1, 2, 4, 8, 16, 32] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml" - decode: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: false - - ## --- All other entries commented out for TEP4 test run --- - # - ## Low latency: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes. 
- #- conc-list: [1, 2, 4, 8, 16, 32, 64] - # prefill: - # num-worker: 1 - # tp: 4 - # ep: 1 - # dp-attn: false - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # - ## Mid curve: 1P+1D DEP8 (DP-attn + EP8), 4 nodes. - #- conc-list: [128, 256, 512] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # - ## Wide decode: 1P+1D DEP8 prefill -> DEP16 decode, 6 nodes. - #- conc-list: [512, 1024, 2048] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # - ## Prefill-scaled: 2P+1D, 8 nodes. - #- conc-list: [2048, 4096] - # prefill: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # - ## Max throughput: 4P+1D, 12 nodes. 
- #- conc-list: [4096, 8192] - # prefill: - # num-worker: 4 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # - ## --- 8k1k entries commented out for TEP4 test run --- - # - #- isl: 8192 - # osl: 1024 - # search-space: - # - conc-list: [1, 2, 4, 8, 16] - # prefill: - # num-worker: 1 - # tp: 4 - # ep: 1 - # dp-attn: false - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # - # - conc-list: [32, 64, 128] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # - # - conc-list: [128, 256, 512] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # - # - conc-list: [512, 1024] - # prefill: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # - # - conc-list: [1024, 2048] - # prefill: - # num-worker: 4 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" - # decode: - # num-worker: 1 + # Low latency: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes. 
+ # EP splits 128 MoE experts across 8 decode ranks (16 each), cutting + # per-step latency ~19% vs pure TP8. Matches B200 TEP8 topology. + - conc-list: [1, 2, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + + # Mid curve: 1P+1D DEP8 (DP-attn + EP8), 4 nodes. + - conc-list: [128, 256, 512] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + + # Wide decode: 1P+1D DEP8 prefill -> DEP16 decode, 6 nodes. + - conc-list: [512, 1024, 2048] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + # Prefill-scaled: 2P+1D, 8 nodes. + - conc-list: [2048, 4096] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + # Max throughput: 4P+1D, 12 nodes. + - conc-list: [4096, 8192] + prefill: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + - isl: 8192 + osl: 1024 + search-space: + # Low latency 8k1k: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes. 
+ - conc-list: [1, 2, 4, 8, 16] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + + # Mid curve 8k1k: 1P+1D DEP8, 4 nodes. + - conc-list: [32, 64, 128] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + + # Wide decode 8k1k: DEP8 prefill -> DEP16 decode, 6 nodes. + - conc-list: [128, 256, 512] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + # Prefill-scaled 8k1k: 2P+1D, 8 nodes. + - conc-list: [512, 1024] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + # Max throughput 8k1k: 4P+1D, 12 nodes. 
+ - conc-list: [1024, 2048] + prefill: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" + decode: + num-worker: 1 # tp: 16 # ep: 16 # dp-attn: true diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 173cba607..0f6df9b66 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3674,8 +3674,8 @@ - "FLASHINFER (trtllm-gen) attention on Blackwell + block-size 128 (MSA index-cache alignment); --language-model-only for text-only benchmarks" - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)" - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island" - - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)" - - "TEP4 decode test: 1P1D TP4 prefill -> TEP4 decode (TP4+EP4, 2n) at conc 1-32. All other entries temporarily commented out." 
+ - "5 topologies, 1k1k + 8k1k: 1P1D TEP8 decode (3n, low-lat conc 1-64), 1P1D DEP8 (4n, conc 128-512), 1P1D DEP8->DEP16 (6n, conc 512-2048), 2P1D (8n, conc 2048-4096), 4P1D (12n, conc 4096-8192)" + - "TEP8 decode (enable-expert-parallel on TP8): 128 experts / 8 ranks = 16 experts/rank, ~19% lower ITL than pure TP8 at low conc; stream-interval 1 + max-cudagraph-capture-size 128 for interactivity" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 - config-keys: From e0ae36c3840c994336a408e3f6fb75165391a809 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Mon, 15 Jun 2026 10:10:32 -0700 Subject: [PATCH 16/33] fix: uncomment trailing 4P1D 8k1k decode lines in M3 GB200 sweep Three lines (tp/ep/dp-attn) for the 4P1D 8k1k decode were still commented out from the TEP4 test. Co-Authored-By: Claude Opus 4.6 --- .github/configs/nvidia-master.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 6ae32670d..876bf7fb6 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12776,9 +12776,9 @@ minimaxm3-fp8-gb200-dynamo-vllm: - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" decode: num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true + tp: 16 + ep: 16 + dp-attn: true # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint From b6926e36314105d8dd0c72c7ac0bc0f133a522c7 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Mon, 15 Jun 2026 12:08:30 -0700 Subject: [PATCH 17/33] fix: retrigger M3 GB200 sweep to validate MNNVL fused allreduce fix The TEP8 multi-node eval (TP8+EP8 decode, 2 NVL72 nodes) was producing gsm8k=0.0000 due to incorrect buffer aliasing in the eager fused_allreduce_gemma_rms_norm path. 
Fixed in vLLM M3 branch commit 66a43ba (cleanup/m3-mi300x-mxfp8): pass norm_out=None to match the compile-time AllReduceFusedAddGemmaRMSNormPattern aliasing. Co-Authored-By: Claude Opus 4.6 --- perf-changelog.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 0f6df9b66..cc5112d0d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3855,3 +3855,11 @@ - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026" - "Runner script updated to support dsv4 model prefix with dynamo-trt framework on GB300" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1689 + +- config-keys: + - minimaxm3-fp8-gb200-dynamo-vllm + description: + - "Fix MNNVL fused allreduce correctness bug for TEP8 multi-node decode (gsm8k=0.0000 → expect passing)" + - "Root cause: fused_allreduce_gemma_rms_norm passed norm_out=separate buffer, triggering the residual_out←allreduce_in aliasing path which produces wrong results on the MNNVL backend across NVL72 node boundaries" + - "Fix: pass norm_out=None to match the compile-time AllReduceFusedAddGemmaRMSNormPattern aliasing (norm_out←allreduce_in, residual_out←residual) — commit 66a43ba on cleanup/m3-mi300x-mxfp8" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 From 0e5ba19ee027728578676107ef06efce09278be5 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Mon, 15 Jun 2026 18:53:17 -0700 Subject: [PATCH 18/33] fix: point TEP8 recipe at ghcr image with NixlConnector head_ratio fix Update the 8k1k TEP8 recipe container to ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73 which includes both the NixlConnector block_len validation fix (78ef73b) and the norm_out=None MNNVL aliasing fix (66a43ba). 
Co-Authored-By: Claude Opus 4.6 --- .../8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml | 2 +- perf-changelog.yaml | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml index 165e9d338..453df782b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml @@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73" precision: "fp8" dynamo: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index f2e21f015..a4c6b97b1 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3859,9 +3859,11 @@ - config-keys: - minimaxm3-fp8-gb200-dynamo-vllm description: - - "Fix MNNVL fused allreduce correctness bug for TEP8 multi-node decode (gsm8k=0.0000 → expect passing)" - - "Root cause: fused_allreduce_gemma_rms_norm passed norm_out=separate buffer, triggering the residual_out←allreduce_in aliasing path which produces wrong results on the MNNVL backend across NVL72 node boundaries" - - "Fix: pass norm_out=None to match the compile-time AllReduceFusedAddGemmaRMSNormPattern aliasing (norm_out←allreduce_in, residual_out←residual) — commit 66a43ba on cleanup/m3-mi300x-mxfp8" + - "Fix NixlConnector handshake failure for hetero-TP disagg when num_kv_heads < decode TP (M3 TEP8: TP4 prefill → TP8 decode)" + - "Root cause: _validate_remote_agent_handshake used raw tp_ratio (8/4=2) for block_len validation, but M3 has only 4 KV heads — both sides have max(1,4//tp)=1 head/rank → same block_len. 
The assertion expected remote_len=131072 but got 65536, failing every handshake (0 KV transfers, gsm8k=0.0000)." + - "Fix: replace tp_ratio with head_ratio (remote_heads_per_rank // local_heads_per_rank) which correctly accounts for GQA replication — commit 78ef73b on cleanup/m3-mi300x-mxfp8" + - "Also includes norm_out=None MNNVL aliasing fix (commit 66a43ba)" + - "Image: ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73 (built from cleanup/m3-mi300x-mxfp8 HEAD 78ef73bc4)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 - config-keys: From 0531a4262ee8b1e0ceb6af2243f8c6ee1130680b Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 16 Jun 2026 11:02:45 -0700 Subject: [PATCH 19/33] fix: retrigger M3 GB200 sweep after making ghcr image public MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous run failed with 401 Unauthorized pulling ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73 — the package is now public. Co-Authored-By: Claude Opus 4.6 --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 43f4e074e..d56081d5d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3870,7 +3870,7 @@ - "Root cause: _validate_remote_agent_handshake used raw tp_ratio (8/4=2) for block_len validation, but M3 has only 4 KV heads — both sides have max(1,4//tp)=1 head/rank → same block_len. The assertion expected remote_len=131072 but got 65536, failing every handshake (0 KV transfers, gsm8k=0.0000)." 
- "Fix: replace tp_ratio with head_ratio (remote_heads_per_rank // local_heads_per_rank) which correctly accounts for GQA replication — commit 78ef73b on cleanup/m3-mi300x-mxfp8" - "Also includes norm_out=None MNNVL aliasing fix (commit 66a43ba)" - - "Image: ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73 (built from cleanup/m3-mi300x-mxfp8 HEAD 78ef73bc4)" + - "Image: ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73 (built from cleanup/m3-mi300x-mxfp8 HEAD 78ef73bc4, public)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 - config-keys: From 6db15b052b460537407c0fcaa5370f4ab30cb1de Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 16 Jun 2026 12:18:38 -0700 Subject: [PATCH 20/33] =?UTF-8?q?fix:=20retrigger=20M3=20GB200=20sweep=20?= =?UTF-8?q?=E2=80=94=20previous=20run=20hit=20NATS=20infra=20failure?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous run (19006) failed with "NATS failed to start" on watchtower-navy-cn01, a transient cluster infra issue unrelated to the NixlConnector fix. Co-Authored-By: Claude Opus 4.6 --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 400b314ff..5b355060d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3871,7 +3871,7 @@ - "Root cause: _validate_remote_agent_handshake used raw tp_ratio (8/4=2) for block_len validation, but M3 has only 4 KV heads — both sides have max(1,4//tp)=1 head/rank → same block_len. The assertion expected remote_len=131072 but got 65536, failing every handshake (0 KV transfers, gsm8k=0.0000)." 
- "Fix: replace tp_ratio with head_ratio (remote_heads_per_rank // local_heads_per_rank) which correctly accounts for GQA replication — commit 78ef73b on cleanup/m3-mi300x-mxfp8" - "Also includes norm_out=None MNNVL aliasing fix (commit 66a43ba)" - - "Image: ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73 (built from cleanup/m3-mi300x-mxfp8 HEAD 78ef73bc4, public)" + - "Image: ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73 (public, from cleanup/m3-mi300x-mxfp8@78ef73bc4)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 - config-keys: From 67f452159dbab67237f17a910b1fa041c3b98d57 Mon Sep 17 00:00:00 2001 From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 16 Jun 2026 12:50:26 -0700 Subject: [PATCH 21/33] Remove minimaxm3-fp8-gb200-vllm details from changelog Removed initial submission details for minimaxm3-fp8-gb200-dynamo-vllm from the changelog. --- perf-changelog.yaml | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index ba635f2da..5b1f07c9f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3665,19 +3665,6 @@ - "Serves from the launch_b300-nv.sh MODEL/MODEL_PATH split (model not in the SRE-staged /scratch/models list -> writable /data/models)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1724 -- config-keys: - - minimaxm3-fp8-gb200-dynamo-vllm - description: - - "Initial submission: MiniMax-M3 MXFP8 disaggregated rack-scale wide-EP vLLM sweep for GB200 via Dynamo" - - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, 128 routed experts top-4, MSA sparse attention, ~444 GB MXFP8 checkpoint)" - - "Image: vllm/vllm-openai:minimax-m3, rebuilt from m3_release HEAD 022448dd (vllm-project/vllm#45381, gates trtllm-gen page>=128); dynamo.install=true + wheel 1.2.0.dev20260526 layers in the dynamo runtime + NIXL" - - "FLASHINFER (trtllm-gen) attention on Blackwell + block-size 128 (MSA index-cache 
alignment); --language-model-only for text-only benchmarks" - - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)" - - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island" - - "5 topologies, 1k1k + 8k1k: 1P1D TEP8 decode (3n, low-lat conc 1-64), 1P1D DEP8 (4n, conc 128-512), 1P1D DEP8->DEP16 (6n, conc 512-2048), 2P1D (8n, conc 2048-4096), 4P1D (12n, conc 4096-8192)" - - "TEP8 decode (enable-expert-parallel on TP8): 128 experts / 8 ranks = 16 experts/rank, ~19% lower ITL than pure TP8 at low conc; stream-interval 1 + max-cudagraph-capture-size 128 for interactivity" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 - - config-keys: - minimaxm3-fp8-b200-vllm description: @@ -3873,6 +3860,7 @@ - "Also includes norm_out=None MNNVL aliasing fix (commit 66a43ba)" - "Image: ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73 (public, from cleanup/m3-mi300x-mxfp8@78ef73bc4)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 + evals-only: true - config-keys: - minimaxm3-fp8-b200-vllm From 211488bed966e39c3c369ef1373a8436278e6363 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 19 Jun 2026 05:00:20 +0800 Subject: [PATCH 22/33] fix: switch M3 GB200 disagg to nightly, drop extra_mount workaround Head_ratio fix merged upstream (vllm-project/vllm#45879). nightly-aarch64 contains both M3 model support (#45381) and the NixlConnector handshake fix (#45330). 
--- .github/configs/nvidia-master.yaml | 2 +- .../8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml | 6 +----- perf-changelog.yaml | 7 ++----- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index fa5b6fa04..6caf1bc11 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12623,7 +12623,7 @@ qwen3.5-fp4-b200-trt: # 1P1D TP4 (low conc), 1P1D DEP8 (mid), 1P1D DEP8->DEP16 (wide decode), # 2P1D / 4P1D DEP8->DEP16 (prefill-scaled max throughput). M3 = 128 experts. minimaxm3-fp8-gb200-dynamo-vllm: - image: vllm/vllm-openai:minimax-m3 + image: vllm/vllm-openai:nightly-aarch64 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: gb200 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml index a0db43086..43838147a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml @@ -8,13 +8,9 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:nightly-aarch64" precision: "fp8" -extra_mount: - - "/mnt/lustre01/users-public/sa-shared/patches/m3/worker.py:/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/v1/nixl/worker.py" - - "/mnt/lustre01/users-public/sa-shared/patches/m3/fused_allreduce_gemma_rms_norm.py:/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_allreduce_gemma_rms_norm.py" - dynamo: install: true wheel: "1.2.0.dev20260526" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 01af177f5..2954594b3 100644 --- a/perf-changelog.yaml +++ 
b/perf-changelog.yaml @@ -3854,11 +3854,8 @@ - config-keys: - minimaxm3-fp8-gb200-dynamo-vllm description: - - "Fix NixlConnector handshake failure for hetero-TP disagg when num_kv_heads < decode TP (M3 TEP8: TP4 prefill → TP8 decode)" - - "Root cause: _validate_remote_agent_handshake used raw tp_ratio (8/4=2) for block_len validation, but M3 has only 4 KV heads — both sides have max(1,4//tp)=1 head/rank → same block_len. The assertion expected remote_len=131072 but got 65536, failing every handshake (0 KV transfers, gsm8k=0.0000)." - - "Fix: replace tp_ratio with head_ratio (remote_heads_per_rank // local_heads_per_rank) which correctly accounts for GQA replication — commit 78ef73b on cleanup/m3-mi300x-mxfp8" - - "Also includes norm_out=None MNNVL aliasing fix (commit 66a43ba)" - - "Runtime-patched via extra_mount: ARM64 base image vllm/vllm-openai:minimax-m3 + bind-mounted worker.py (head_ratio) and fused_allreduce_gemma_rms_norm.py (norm_out=None) from shared lustre" + - "Switch to vllm/vllm-openai:nightly-aarch64 — contains upstream head_ratio fix (vllm-project/vllm#45879), drop extra_mount workaround" + - "NixlConnector handshake block_len validation now uses per-rank KV head ratio instead of tp_ratio, fixing GQA-replicated disagg (M3 TEP8: TP4→TP8, 4 KV heads)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 evals-only: true From 492db6131560394fa619cb14242302e84c8de230 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 19 Jun 2026 05:13:19 +0800 Subject: [PATCH 23/33] fix: append changelog entry at end to pass immutability gate The validator requires new entries appended at the file end (byte prefix must match origin/main exactly). The previous commit inserted mid-file, shifting entry indices and triggering the immutability check. 
Co-Authored-By: Claude Opus 4.6 --- perf-changelog.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 75cb04b4d..093d2ee79 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3851,14 +3851,6 @@ - "Runner script updated to support dsv4 model prefix with dynamo-trt framework on GB300" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1689 -- config-keys: - - minimaxm3-fp8-gb200-dynamo-vllm - description: - - "Switch to vllm/vllm-openai:nightly-aarch64 — contains upstream head_ratio fix (vllm-project/vllm#45879), drop extra_mount workaround" - - "NixlConnector handshake block_len validation now uses per-rank KV head ratio instead of tp_ratio, fixing GQA-replicated disagg (M3 TEP8: TP4→TP8, 4 KV heads)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 - evals-only: true - - config-keys: - minimaxm3-fp8-b200-vllm description: @@ -3958,3 +3950,11 @@ - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)" - "Update Applied TBO on high concurrencies" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717 + +- config-keys: + - minimaxm3-fp8-gb200-dynamo-vllm + description: + - "Switch to vllm/vllm-openai:nightly-aarch64 — contains upstream head_ratio fix (vllm-project/vllm#45879), drop extra_mount workaround" + - "NixlConnector handshake block_len validation now uses per-rank KV head ratio instead of tp_ratio, fixing GQA-replicated disagg (M3 TEP8: TP4→TP8, 4 KV heads)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 + evals-only: true From 37ab79b658fdf33efb977a093b9dec37643a65c3 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 19 Jun 2026 06:17:17 +0800 Subject: [PATCH 24/33] feat: switch GB200 M3 to vllm/vllm-openai:minimax-m3-0618, run full sweep Multi-arch image (arm64+amd64) with upstream head_ratio 
fix baked in. Update all 14 GB200 disagg recipes (1k1k + 8k1k), nvidia-master.yaml, and changelog entry (no longer evals-only). Co-Authored-By: Claude Opus 4.6 --- .github/configs/nvidia-master.yaml | 2 +- .../minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml | 2 +- .../1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml | 2 +- .../minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml | 2 +- .../1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml | 2 +- .../1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml | 2 +- .../minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml | 2 +- .../1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml | 2 +- .../1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml | 2 +- .../minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml | 2 +- .../8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml | 2 +- .../minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml | 2 +- .../8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml | 2 +- .../8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml | 2 +- .../8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml | 2 +- perf-changelog.yaml | 3 +-- 16 files changed, 16 insertions(+), 17 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 9b3477ec5..0d8ef5c87 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12923,7 +12923,7 @@ qwen3.5-fp4-b200-trt: # 1P1D TP4 (low conc), 1P1D DEP8 (mid), 1P1D DEP8->DEP16 (wide decode), # 2P1D / 4P1D DEP8->DEP16 (prefill-scaled max throughput). M3 = 128 experts. 
minimaxm3-fp8-gb200-dynamo-vllm: - image: vllm/vllm-openai:nightly-aarch64 + image: vllm/vllm-openai:minimax-m3-0618 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: gb200 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml index efc5d5740..79a2004bc 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml @@ -9,7 +9,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml index 5ca08a06d..3d1dde14f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml @@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml index f3e79340a..0e8d58332 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml +++ 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml @@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml index 147803c78..ffa7ec15a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml @@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tep4-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml index 199699212..b7cbc4ded 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml @@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml index 1d1591198..0000a9648 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml @@ -7,7 +7,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml index 853095727..877b2e235 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml @@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml index 4a6aa5d0f..c2aaaae8a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml @@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml index f6f2c7874..85b8ecc9a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml @@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml index 0d7d44843..94049a5b8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml @@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml index b0602354c..3476bf62f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml @@ -9,7 +9,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618" precision: "fp8" dynamo: diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml index 43838147a..138a8ff71 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml @@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-0618" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml index 6a0765c60..dd8ea9dc8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml @@ -7,7 +7,7 @@ name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml index 9e4ff3c2b..dead061e8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml @@ -7,7 +7,7 @@ name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-8k1k" model: path: 
"minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618" precision: "fp8" dynamo: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 093d2ee79..5c4362e04 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3954,7 +3954,6 @@ - config-keys: - minimaxm3-fp8-gb200-dynamo-vllm description: - - "Switch to vllm/vllm-openai:nightly-aarch64 — contains upstream head_ratio fix (vllm-project/vllm#45879), drop extra_mount workaround" + - "Switch to vllm/vllm-openai:minimax-m3-0618 — contains upstream head_ratio fix (vllm-project/vllm#45879), drop extra_mount workaround" - "NixlConnector handshake block_len validation now uses per-rank KV head ratio instead of tp_ratio, fixing GQA-replicated disagg (M3 TEP8: TP4→TP8, 4 KV heads)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 - evals-only: true From 4723ca2cc7def4a81db0367682b9305f9b3a60c9 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 19 Jun 2026 06:52:30 +0800 Subject: [PATCH 25/33] revert: switch GB200 M3 back to nightly-aarch64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit minimax-m3-0618 likely cherry-picks vLLM PR #45723 (gemm1_alpha for FP8 TRT-LLM MoE) but ships flashinfer ≤0.6.13 which lacks that kwarg (flashinfer PR #3504), causing TypeError at runtime. 
Co-Authored-By: Claude Opus 4.6 --- .github/configs/nvidia-master.yaml | 2 +- .../minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml | 2 +- .../1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml | 2 +- .../minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml | 2 +- .../1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml | 2 +- .../minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml | 2 +- .../minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml | 2 +- .../1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml | 2 +- .../1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml | 2 +- .../minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml | 2 +- .../8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml | 2 +- .../minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml | 2 +- .../minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml | 2 +- .../8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml | 2 +- .../8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml | 2 +- perf-changelog.yaml | 2 +- 16 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 0d8ef5c87..9b3477ec5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12923,7 +12923,7 @@ qwen3.5-fp4-b200-trt: # 1P1D TP4 (low conc), 1P1D DEP8 (mid), 1P1D DEP8->DEP16 (wide decode), # 2P1D / 4P1D DEP8->DEP16 (prefill-scaled max throughput). M3 = 128 experts. 
minimaxm3-fp8-gb200-dynamo-vllm: - image: vllm/vllm-openai:minimax-m3-0618 + image: vllm/vllm-openai:nightly-aarch64 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: gb200 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml index 79a2004bc..c930ca92b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml @@ -9,7 +9,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3-0618" + container: "vllm/vllm-openai:nightly-aarch64" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml index 3d1dde14f..d0f92214b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml @@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3-0618" + container: "vllm/vllm-openai:nightly-aarch64" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml index 0e8d58332..1c1a8f5ca 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml +++ 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml @@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3-0618" + container: "vllm/vllm-openai:nightly-aarch64" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml index ffa7ec15a..0662dd338 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml @@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tep4-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3-0618" + container: "vllm/vllm-openai:nightly-aarch64" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml index b7cbc4ded..256f1f723 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml @@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3-0618" + container: "vllm/vllm-openai:nightly-aarch64" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml index 0000a9648..4552dfb01 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml @@ -7,7 +7,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3-0618" + container: "vllm/vllm-openai:nightly-aarch64" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml index 877b2e235..de4f3ce22 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml @@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3-0618" + container: "vllm/vllm-openai:nightly-aarch64" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml index c2aaaae8a..cd978be55 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml @@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3-0618" + container: "vllm/vllm-openai:nightly-aarch64" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml index 85b8ecc9a..b0edec3a1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml @@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3-0618" + container: "vllm/vllm-openai:nightly-aarch64" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml index 94049a5b8..d326ed74b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml @@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3-0618" + container: "vllm/vllm-openai:nightly-aarch64" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml index 3476bf62f..63bea1a22 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml @@ -9,7 +9,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3-0618" + container: "vllm/vllm-openai:nightly-aarch64" precision: "fp8" dynamo: diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml index 138a8ff71..43838147a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml @@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3-0618" + container: "vllm/vllm-openai:nightly-aarch64" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml index dd8ea9dc8..b8d2944a2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml @@ -7,7 +7,7 @@ name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3-0618" + container: "vllm/vllm-openai:nightly-aarch64" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml index dead061e8..3cc56e088 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml @@ -7,7 +7,7 @@ name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-8k1k" model: path: 
"minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3-0618" + container: "vllm/vllm-openai:nightly-aarch64" precision: "fp8" dynamo: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5c4362e04..b48027110 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3954,6 +3954,6 @@ - config-keys: - minimaxm3-fp8-gb200-dynamo-vllm description: - - "Switch to vllm/vllm-openai:minimax-m3-0618 — contains upstream head_ratio fix (vllm-project/vllm#45879), drop extra_mount workaround" + - "Switch to vllm/vllm-openai:nightly-aarch64 — contains upstream head_ratio fix (vllm-project/vllm#45879), drop extra_mount workaround" - "NixlConnector handshake block_len validation now uses per-rank KV head ratio instead of tp_ratio, fixing GQA-replicated disagg (M3 TEP8: TP4→TP8, 4 KV heads)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 From c3030aaee9b6406d62fe6dba53a307b05ea172d7 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 19 Jun 2026 13:27:02 +0800 Subject: [PATCH 26/33] feat: add --moe-backend marlin for TP-only GB200 M3 disagg workers Per PR #1809 pattern: Marlin MoE backend for TP-only configs (no EP, no DP-attention). Applied to 6 recipes affecting 9 worker sections (prefill and/or decode). EP/DP-attention workers stay on default. 
Co-Authored-By: Claude Opus 4.6 --- .../minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml | 2 ++ .../1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml | 1 + .../minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml | 1 + .../minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml | 2 ++ .../minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml | 2 ++ .../minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml | 1 + perf-changelog.yaml | 1 + 7 files changed, 10 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml index 1c1a8f5ca..049af1fa7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml @@ -69,6 +69,7 @@ backend: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 4 + moe-backend: marlin pipeline-parallel-size: 1 enforce-eager: true max-model-len: 2304 @@ -88,6 +89,7 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 4 + moe-backend: marlin pipeline-parallel-size: 1 max-model-len: 2304 max-num-seqs: 256 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml index 0662dd338..890b5a590 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml @@ -63,6 +63,7 @@ backend: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' 
tensor-parallel-size: 4 + moe-backend: marlin pipeline-parallel-size: 1 enforce-eager: true max-model-len: 2304 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml index 256f1f723..8d63df4ab 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml @@ -63,6 +63,7 @@ backend: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 4 + moe-backend: marlin pipeline-parallel-size: 1 enforce-eager: true max-model-len: 2304 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml index 4552dfb01..de1488514 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml @@ -62,6 +62,7 @@ backend: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 4 + moe-backend: marlin pipeline-parallel-size: 1 enforce-eager: true max-model-len: 2304 @@ -81,6 +82,7 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 4 + moe-backend: marlin pipeline-parallel-size: 1 max-model-len: 2304 max-num-seqs: 256 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml index 63bea1a22..3b1ba4032 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml @@ -64,6 +64,7 @@ backend: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 4 + moe-backend: marlin pipeline-parallel-size: 1 enforce-eager: true max-model-len: 9472 @@ -83,6 +84,7 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 4 + moe-backend: marlin pipeline-parallel-size: 1 max-model-len: 9472 max-num-seqs: 256 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml index 43838147a..7074eaf13 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml @@ -63,6 +63,7 @@ backend: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 4 + moe-backend: marlin pipeline-parallel-size: 1 enforce-eager: true max-model-len: 9472 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index b48027110..97dec6ca7 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3956,4 +3956,5 @@ description: - "Switch to vllm/vllm-openai:nightly-aarch64 — contains upstream head_ratio fix (vllm-project/vllm#45879), drop extra_mount workaround" - "NixlConnector handshake block_len validation now uses per-rank KV head ratio instead of tp_ratio, fixing GQA-replicated disagg (M3 TEP8: TP4→TP8, 4 KV heads)" + - "Add --moe-backend marlin for TP-only prefill/decode workers (no EP, no DP-attention) per PR #1809 pattern" pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/1734 From 2ea30b385c363163e735e4b11ef1042c836c6828 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 19 Jun 2026 15:40:36 +0800 Subject: [PATCH 27/33] fix: scale MiniMax-M3 prefill to DEP8 --- .github/configs/nvidia-master.yaml | 10 +++++----- ...aml => disagg-gb200-1p1d-dep8-tp8-4n.yaml} | 20 ++++++++++--------- 2 files changed, 16 insertions(+), 14 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-1p1d-tp4-tp8-3n.yaml => disagg-gb200-1p1d-dep8-tp8-4n.yaml} (82%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index d57819de0..b2091b3d8 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -13016,15 +13016,15 @@ minimaxm3-fp8-gb200-dynamo-vllm: - isl: 8192 osl: 1024 search-space: - # Low latency 8k1k: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes. + # Low latency 8k1k: DEP8 prefill -> TEP8 decode (TP8+EP8), 4 nodes. 
- conc-list: [1, 2, 4, 8, 16] prefill: num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false + tp: 8 + ep: 8 + dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml" decode: num-worker: 1 tp: 8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml similarity index 82% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml index 7074eaf13..d217c0ed9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml @@ -1,10 +1,10 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-8k1k" +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-tp8-8k1k" # MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, 8k1k, -# wider decode). Prefill TP4 (1 node) -> NixlConnector -> Decode TEP8 -# (TP8+EP8, 2 nodes) = 3 nodes. Wider decode TP + expert parallelism -# reduces per-step latency by spreading both attention and MoE across -# 8 GPU over NVL72 NVLink. FLASHINFER, block-size 128. +# wider decode). Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) -> +# NixlConnector -> Decode TEP8 (TP8+EP8, 2 nodes) = 4 nodes. DEP8 +# shards the prefill MoE weights across 8 GPUs so the model fits without +# Marlin repacking. FLASHINFER attention, block-size 128. 
model: path: "minimax-m3-mxfp8" @@ -26,11 +26,11 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 1 + prefill_nodes: 2 decode_nodes: 2 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 4 + gpus_per_prefill: 8 gpus_per_decode: 8 frontend: @@ -62,9 +62,11 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - moe-backend: marlin + tensor-parallel-size: 1 pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true enforce-eager: true max-model-len: 9472 max-num-seqs: 16 From fc4af8b5717a0721e498cbf0e1f9c293fb253c0d Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 19 Jun 2026 16:00:06 +0800 Subject: [PATCH 28/33] feat: redesign MiniMax-M3 GB200 decode tiers --- .github/configs/nvidia-master.yaml | 73 ++++-------- .../8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml | 111 ------------------ ...ml => disagg-gb200-1p1d-dep8-tep8-4n.yaml} | 26 ++-- .../8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml | 14 +-- .../8k1k/disagg-gb200-1p1d-tp4-2n.yaml | 108 ----------------- ...n.yaml => disagg-gb200-2p7d-dep8-18n.yaml} | 19 ++- .../disagg-gb200-4p1d-dep8-dep16-12n.yaml | 110 ----------------- 7 files changed, 46 insertions(+), 415 deletions(-) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-1p1d-dep8-4n.yaml => disagg-gb200-1p1d-dep8-tep8-4n.yaml} (77%) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-2p1d-dep8-dep16-8n.yaml => disagg-gb200-2p7d-dep8-18n.yaml} (84%) delete mode 100644 
benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index b2091b3d8..21af1ea8d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12915,13 +12915,10 @@ qwen3.5-fp4-b200-trt: # block-size 128 mandatory (MSA index-cache alignment); FLASHINFER # (trtllm-gen) attention to exploit Blackwell — needs vllm#45381 @ 022448dd # (m3_release HEAD: gates page>=128 on trtllm-gen GQA), so rebuild the image -# from m3_release before running. Fully disaggregated, rack-scale wide-EP -# GB200 sweep (NixlConnector P/D split over the NVL72 NVLink fabric). Mirrors -# the deepseek-v4 "megamoe" ladder: DEP unit = DP-attn + expert-parallel -# (DEP8 = 8 GPU / 2 nodes, DEP16 = 16 GPU / 4 nodes), with prefill workers -# scaled 1P->4P. EP8/EP16 vs B200's 8-GPU NVLink island is the GB200 edge. -# 1P1D TP4 (low conc), 1P1D DEP8 (mid), 1P1D DEP8->DEP16 (wide decode), -# 2P1D / 4P1D DEP8->DEP16 (prefill-scaled max throughput). M3 = 128 experts. +# from m3_release before running. The active 8k1k matrix compares DEP8 +# prefill with TEP8 and TP8+Marlin decode at low concurrency, then fills the +# NVL72 rack with 2P+7D DEP8 for throughput. DEP8 is TP1 + DP-attn + EP8 +# across 8 GPU / 2 nodes. M3 has 128 routed experts. minimaxm3-fp8-gb200-dynamo-vllm: image: vllm/vllm-openai:nightly-aarch64 model: MiniMaxAI/MiniMax-M3-MXFP8 @@ -13016,80 +13013,50 @@ minimaxm3-fp8-gb200-dynamo-vllm: - isl: 8192 osl: 1024 search-space: - # Low latency 8k1k: DEP8 prefill -> TEP8 decode (TP8+EP8), 4 nodes. - - conc-list: [1, 2, 4, 8, 16] + # Low conc: 1P DEP8 + 1D TEP8 (TP8+EP8), 4 nodes. 
+ - conc-list: [8, 16, 32, 64] prefill: num-worker: 1 - tp: 8 + tp: 1 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tep8-4n.yaml" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: false - # Mid curve 8k1k: 1P+1D DEP8, 4 nodes. - - conc-list: [32, 64, 128] + # Low conc: 1P DEP8 + 1D TP8 Marlin, 4 nodes. + - conc-list: [8, 16, 32, 64] prefill: num-worker: 1 - tp: 8 + tp: 1 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml" decode: num-worker: 1 tp: 8 - ep: 8 - dp-attn: true - - # Wide decode 8k1k: DEP8 prefill -> DEP16 decode, 6 nodes. - - conc-list: [128, 256, 512] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true + ep: 1 + dp-attn: false - # Prefill-scaled 8k1k: 2P+1D, 8 nodes. - - conc-list: [512, 1024] + # High conc: 2P DEP8 + 7D DEP8, 18 nodes / 72 GPU. + - conc-list: [128, 256, 512, 1024] prefill: num-worker: 2 - tp: 8 + tp: 1 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p7d-dep8-18n.yaml" decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - # Max throughput 8k1k: 4P+1D, 12 nodes. 
- - conc-list: [1024, 2048] - prefill: - num-worker: 4 - tp: 8 + num-worker: 7 + tp: 1 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml deleted file mode 100644 index d326ed74b..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml +++ /dev/null @@ -1,111 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-8k1k" - -# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (wide decode, 8k1k). -# Prefill DEP8 (8 GPU / 2 nodes) -> NixlConnector -> Decode DEP16 -# (16 GPU / 4 nodes) = 6 nodes. Rack-scale decode throughput over NVL72 -# NVLink -- EP16 across 4 nodes is the regime B200 can't reach. M3 has -# 128 routed experts: EP16 = 8 experts/rank. FLASHINFER, block-size 128. 
- -model: - path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 4 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 9472 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 9472 - max-num-seqs: 512 - max-num-batched-tokens: 512 - 
max-cudagraph-capture-size: 512 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tep8-4n.yaml similarity index 77% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tep8-4n.yaml index b0edec3a1..3859ae520 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tep8-4n.yaml @@ -1,10 +1,8 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-8k1k" +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-tep8-4n-8k1k" -# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (mid curve, 8k1k). -# Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) -> NixlConnector -> -# Decode DEP8 (8 GPU / 2 nodes) = 4 nodes. Rack-scale expert parallel -# over the NVL72 NVLink fabric. M3 has 128 routed experts so EP8 shards -# 16 experts/rank. FLASHINFER attention, block-size 128. +# 1P DEP8 prefill (TP1 DP8 EP, 2 nodes) + 1D TEP8 decode +# (TP8+EP8, 2 nodes) = 4 nodes. Low-concurrency latency config. +# Marlin is intentionally not used on the DEP8 prefill. 
model: path: "minimax-m3-mxfp8" @@ -80,19 +78,17 @@ backend: no-enable-prefix-caching: true numa-bind: true enable-sleep-mode: true - stream-interval: 32 + stream-interval: 1 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 + tensor-parallel-size: 8 pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 enable-expert-parallel: true max-model-len: 9472 - max-num-seqs: 512 - max-num-batched-tokens: 512 - max-cudagraph-capture-size: 512 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 128 block-size: 128 attention-backend: FLASHINFER language-model-only: true @@ -102,10 +98,10 @@ backend: no-enable-prefix-caching: true numa-bind: true enable-sleep-mode: true - stream-interval: 128 + stream-interval: 1 benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "32x64x128" + concurrencies: "8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml index d217c0ed9..528b90da9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml @@ -1,10 +1,8 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-tp8-8k1k" +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-tp8-4n-8k1k" -# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, 8k1k, -# wider decode). Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) -> -# NixlConnector -> Decode TEP8 (TP8+EP8, 2 nodes) = 4 nodes. DEP8 -# shards the prefill MoE weights across 8 GPUs so the model fits without -# Marlin repacking. FLASHINFER attention, block-size 128. +# 1P DEP8 prefill (TP1 DP8 EP, 2 nodes) + 1D TP8 decode +# (Marlin MoE, 2 nodes) = 4 nodes. 
Low-concurrency latency config. +# Marlin is restricted to the pure TP8 decode worker. model: path: "minimax-m3-mxfp8" @@ -85,8 +83,8 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 8 + moe-backend: marlin pipeline-parallel-size: 1 - enable-expert-parallel: true max-model-len: 9472 max-num-seqs: 256 max-num-batched-tokens: 256 @@ -106,4 +104,4 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "1x2x4x8x16" + concurrencies: "8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml deleted file mode 100644 index 3b1ba4032..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml +++ /dev/null @@ -1,108 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-8k1k" - -# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, 8k1k). -# Prefill (TP4, 1 node) -> NixlConnector -> Decode (TP4, 1 node). Pure -# TP, no expert parallel: lowest TTFT/ITL at small concurrencies where -# wide EP would leave DP ranks idle. FLASHINFER (trtllm-gen) attention, -# block-size 128 (MSA index-cache alignment, vllm#45381 @ 022448dd). -# Low-conc tuned: stream-interval 1, cudagraph cap 128. 
- -model: - path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - moe-backend: marlin - pipeline-parallel-size: 1 - enforce-eager: true - max-model-len: 9472 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 1 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - moe-backend: marlin - pipeline-parallel-size: 1 - max-model-len: 9472 - max-num-seqs: 256 - max-num-batched-tokens: 256 - max-cudagraph-capture-size: 128 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - 
safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 1 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1x2x4x8x16" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p7d-dep8-18n.yaml similarity index 84% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p7d-dep8-18n.yaml index b8d2944a2..23900d1c0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p7d-dep8-18n.yaml @@ -1,9 +1,8 @@ -name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-8k1k" +name: "minimax-m3-vllm-disagg-gb200-2p7d-dep8-18n-8k1k" -# MiniMax-M3 disaggregated 2P+1D recipe for GB200 (prefill-scaled, 8k1k). -# 2x Prefill DEP8 (4 nodes) -> NixlConnector -> 1x Decode DEP16 -# (4 nodes) = 8 nodes. Double prefill workers absorb 8k ISL compute; -# rack-scale DEP16 decode across NVL72. FLASHINFER, block-size 128. +# 2P DEP8 prefill (4 nodes) + 7D DEP8 decode (14 nodes) = +# 18 nodes / 72 GPU. High-concurrency throughput config. +# DEP8 is TP1 DP8 EP on both sides; Marlin is not used. 
model: path: "minimax-m3-mxfp8" @@ -26,11 +25,11 @@ resources: gpu_type: "gb200" gpus_per_node: 4 prefill_nodes: 4 - decode_nodes: 4 + decode_nodes: 14 prefill_workers: 2 - decode_workers: 1 + decode_workers: 7 gpus_per_prefill: 8 - gpus_per_decode: 16 + gpus_per_decode: 8 frontend: type: dynamo @@ -85,7 +84,7 @@ backend: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 16 + data-parallel-size: 8 data-parallel-rpc-port: 13345 enable-expert-parallel: true max-model-len: 9472 @@ -107,4 +106,4 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "512x1024" + concurrencies: "128x256x512x1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml deleted file mode 100644 index 3cc56e088..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml +++ /dev/null @@ -1,110 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-8k1k" - -# MiniMax-M3 disaggregated 4P+1D recipe for GB200 (max throughput, 8k1k). -# 4x Prefill DEP8 (8 nodes) -> NixlConnector -> 1x Decode DEP16 -# (4 nodes) = 12 nodes within one NVL72 rack. Maximises prefill -# bandwidth for 8k ISL; rack-scale DEP16 decode. FLASHINFER, block-size 128. 
- -model: - path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 8 - decode_nodes: 4 - prefill_workers: 4 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 9472 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 9472 - max-num-seqs: 512 - max-num-batched-tokens: 512 - 
max-cudagraph-capture-size: 512 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1024x2048" From 4431440fdecb6b6519371832ca44742c79db7f51 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 19 Jun 2026 19:00:18 +0800 Subject: [PATCH 29/33] feat: TEP4 prefill + B300-optimal decode for GB200 M3 disagg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch all prefill from DEP8 (TP1 DP8 EP, 2 nodes) to TEP4 (TP4+EP4, 1 node), halving per-worker node footprint. Decode configs follow B300 run 27630519240 optimal points (spec=none): - conc 8-32: TP4+Marlin (no EP) - conc 64-256: TEP4 (TP4+EP4) - conc 512/1024: TEP8 (8k1k) or DEP8 (1k1k), 8 workers × 18n Co-Authored-By: Claude Opus 4.6 --- .github/configs/nvidia-master.yaml | 147 +++++++----------- .../1k1k/disagg-gb200-1p1d-tep4-2n.yaml | 105 +++++++++++++ .../1k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml | 105 +++++++++++++ .../disagg-gb200-2p8d-tep4-dep8-18n.yaml} | 29 ++-- ...4n.yaml => disagg-gb200-1p1d-tep4-2n.yaml} | 24 ++- ...aml => disagg-gb200-1p1d-tep4-tp4-2n.yaml} | 24 ++- .../8k1k/disagg-gb200-2p8d-tep4-tep8-18n.yaml | 106 +++++++++++++ 7 files changed, 407 insertions(+), 133 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-2n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/{8k1k/disagg-gb200-2p7d-dep8-18n.yaml => 1k1k/disagg-gb200-2p8d-tep4-dep8-18n.yaml} (81%) rename 
benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-1p1d-dep8-tep8-4n.yaml => disagg-gb200-1p1d-tep4-2n.yaml} (82%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-1p1d-dep8-tp8-4n.yaml => disagg-gb200-1p1d-tep4-tp4-2n.yaml} (82%) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p8d-tep4-tep8-18n.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 21af1ea8d..f85aeefa9 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12907,18 +12907,13 @@ qwen3.5-fp4-b200-trt: # MiniMax-M3 (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX -# tensor cores on Blackwell. Image is the multi-arch m3_release vLLM build -# (vllm/vllm-openai:minimax-m3, vllm-project/vllm#45381); recipes set -# dynamo.install=true + wheel 1.2.0.dev20260526 so the dynamo runtime AND -# NIXL are layered in at job start (the ai-dynamo vllm-runtime dev image -# shipped without NIXL, so disagg workers crashed at NixlConnector init). -# block-size 128 mandatory (MSA index-cache alignment); FLASHINFER -# (trtllm-gen) attention to exploit Blackwell — needs vllm#45381 @ 022448dd -# (m3_release HEAD: gates page>=128 on trtllm-gen GQA), so rebuild the image -# from m3_release before running. The active 8k1k matrix compares DEP8 -# prefill with TEP8 and TP8+Marlin decode at low concurrency, then fills the -# NVL72 rack with 2P+7D DEP8 for throughput. DEP8 is TP1 + DP-attn + EP8 -# across 8 GPU / 2 nodes. M3 has 128 routed experts. +# tensor cores on Blackwell. Image is nightly-aarch64 (vLLM main); recipes +# set dynamo.install=true + wheel 1.2.0.dev20260526 so the dynamo runtime +# AND NIXL are layered in at job start. 
block-size 128 mandatory (MSA +# index-cache alignment); FLASHINFER attention. All prefill is TEP4 +# (TP4+EP4, 1 GB200 node). Decode configs mirror B300 single-node optimal +# points (run 27630519240): TP4+Marlin at low conc, TEP4 at mid conc, +# TEP8 (8k1k) / DEP8 (1k1k) at high conc with 8 decode workers. minimaxm3-fp8-gb200-dynamo-vllm: image: vllm/vllm-openai:nightly-aarch64 model: MiniMaxAI/MiniMax-M3-MXFP8 @@ -12933,130 +12928,98 @@ minimaxm3-fp8-gb200-dynamo-vllm: # - isl: 1024 # osl: 1024 # search-space: - # # Low latency: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes. - # # EP splits 128 MoE experts across 8 decode ranks (16 each), cutting - # # per-step latency ~19% vs pure TP8. Matches B200 TEP8 topology. - # - conc-list: [1, 2, 4, 8, 16, 32, 64] + # # Low conc: 1P TEP4 + 1D TP4 Marlin, 2 nodes. + # - conc-list: [16, 32] # prefill: # num-worker: 1 # tp: 4 - # ep: 1 + # ep: 4 # dp-attn: false # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml" + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml" # decode: # num-worker: 1 - # tp: 8 - # ep: 8 + # tp: 4 + # ep: 1 # dp-attn: false # - # # Mid curve: 1P+1D DEP8 (DP-attn + EP8), 4 nodes. - # - conc-list: [128, 256, 512] + # # Mid conc: 1P TEP4 + 1D TEP4, 2 nodes. + # - conc-list: [64, 128, 256, 512] # prefill: # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # - # # Wide decode: 1P+1D DEP8 prefill -> DEP16 decode, 6 nodes. 
- # - conc-list: [512, 1024, 2048] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true + # tp: 4 + # ep: 4 + # dp-attn: false # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml" + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-2n.yaml" # decode: # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true + # tp: 4 + # ep: 4 + # dp-attn: false # - # # Prefill-scaled: 2P+1D, 8 nodes. - # - conc-list: [2048, 4096] + # # High conc: 2P TEP4 + 8D DEP8, 18 nodes / 72 GPU. + # - conc-list: [1024] # prefill: # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: true + # tp: 4 + # ep: 4 + # dp-attn: false # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml" + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p8d-tep4-dep8-18n.yaml" # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # - # # Max throughput: 4P+1D, 12 nodes. - # - conc-list: [4096, 8192] - # prefill: - # num-worker: 4 - # tp: 8 + # num-worker: 8 + # tp: 1 # ep: 8 # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - isl: 8192 osl: 1024 search-space: - # Low conc: 1P DEP8 + 1D TEP8 (TP8+EP8), 4 nodes. - - conc-list: [8, 16, 32, 64] + # Low conc: 1P TEP4 + 1D TP4 Marlin, 2 nodes. + - conc-list: [8, 16, 32] prefill: num-worker: 1 - tp: 1 - ep: 8 - dp-attn: true + tp: 4 + ep: 4 + dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tep8-4n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml" decode: num-worker: 1 - tp: 8 - ep: 8 + tp: 4 + ep: 1 dp-attn: false - # Low conc: 1P DEP8 + 1D TP8 Marlin, 4 nodes. 
- - conc-list: [8, 16, 32, 64] + # Mid conc: 1P TEP4 + 1D TEP4, 2 nodes. + - conc-list: [64, 128, 256] prefill: num-worker: 1 - tp: 1 - ep: 8 - dp-attn: true + tp: 4 + ep: 4 + dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-2n.yaml" decode: num-worker: 1 - tp: 8 - ep: 1 + tp: 4 + ep: 4 dp-attn: false - # High conc: 2P DEP8 + 7D DEP8, 18 nodes / 72 GPU. - - conc-list: [128, 256, 512, 1024] + # High conc: 2P TEP4 + 8D TEP8, 18 nodes / 72 GPU. + - conc-list: [512] prefill: num-worker: 2 - tp: 1 - ep: 8 - dp-attn: true + tp: 4 + ep: 4 + dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p7d-dep8-18n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p8d-tep4-tep8-18n.yaml" decode: - num-worker: 7 - tp: 1 + num-worker: 8 + tp: 8 ep: 8 - dp-attn: true + dp-attn: false # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-2n.yaml new file mode 100644 index 000000000..938bfe8cb --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-2n.yaml @@ -0,0 +1,105 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-tep4-2n-1k1k" + +# 1P TEP4 prefill (TP4+EP4, 1 node) + 1D TEP4 decode (TP4+EP4, +# 1 node) = 2 nodes. Mid-concurrency config. B300 optimal at +# conc 64-512. EP splits 128 MoE experts across 4 decode ranks. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:nightly-aarch64" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 2304 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 128 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + 
safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "64x128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml new file mode 100644 index 000000000..35a358f2b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml @@ -0,0 +1,105 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-tep4-tp4-2n-1k1k" + +# 1P TEP4 prefill (TP4+EP4, 1 node) + 1D TP4 decode (Marlin MoE, +# 1 node) = 2 nodes. Low-concurrency latency config. B300 optimal +# at conc 1-32. Marlin on TP-only decode (no EP). + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:nightly-aarch64" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + 
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 2304 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + moe-backend: marlin + pipeline-parallel-size: 1 + max-model-len: 2304 + max-num-seqs: 256 + max-num-batched-tokens: 256 + max-cudagraph-capture-size: 128 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "16x32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p7d-dep8-18n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p8d-tep4-dep8-18n.yaml similarity index 81% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p7d-dep8-18n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p8d-tep4-dep8-18n.yaml index 23900d1c0..265b26092 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p7d-dep8-18n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p8d-tep4-dep8-18n.yaml @@ -1,8 +1,9 @@ -name: "minimax-m3-vllm-disagg-gb200-2p7d-dep8-18n-8k1k" +name: 
"minimax-m3-vllm-disagg-gb200-2p8d-tep4-dep8-18n-1k1k" -# 2P DEP8 prefill (4 nodes) + 7D DEP8 decode (14 nodes) = -# 18 nodes / 72 GPU. High-concurrency throughput config. -# DEP8 is TP1 DP8 EP on both sides; Marlin is not used. +# 2P TEP4 prefill (2 nodes) + 8D DEP8 decode (16 nodes) = 18 nodes +# / 72 GPU. High-concurrency throughput config. B300 optimal at +# conc 1024 is DEP8 (TP1 DP8 EP, dp-attn). Each decode worker +# spans 2 GB200 nodes (8 GPU) over NVL72 NVLink. model: path: "minimax-m3-mxfp8" @@ -24,11 +25,11 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 4 - decode_nodes: 14 + prefill_nodes: 2 + decode_nodes: 16 prefill_workers: 2 - decode_workers: 7 - gpus_per_prefill: 8 + decode_workers: 8 + gpus_per_prefill: 4 gpus_per_decode: 8 frontend: @@ -60,13 +61,11 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 + tensor-parallel-size: 4 pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13346 enable-expert-parallel: true enforce-eager: true - max-model-len: 9472 + max-model-len: 2304 max-num-seqs: 16 max-num-batched-tokens: 16384 block-size: 128 @@ -87,7 +86,7 @@ backend: data-parallel-size: 8 data-parallel-rpc-port: 13345 enable-expert-parallel: true - max-model-len: 9472 + max-model-len: 2304 max-num-seqs: 512 max-num-batched-tokens: 512 max-cudagraph-capture-size: 512 @@ -104,6 +103,6 @@ backend: benchmark: type: "sa-bench" - isl: 8192 + isl: 1024 osl: 1024 - concurrencies: "128x256x512x1024" + concurrencies: "1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-2n.yaml similarity index 82% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tep8-4n.yaml rename to 
benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-2n.yaml index 3859ae520..405751955 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tep8-4n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-2n.yaml @@ -1,8 +1,8 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-tep8-4n-8k1k" +name: "minimax-m3-vllm-disagg-gb200-1p1d-tep4-2n-8k1k" -# 1P DEP8 prefill (TP1 DP8 EP, 2 nodes) + 1D TEP8 decode -# (TP8+EP8, 2 nodes) = 4 nodes. Low-concurrency latency config. -# Marlin is intentionally not used on the DEP8 prefill. +# 1P TEP4 prefill (TP4+EP4, 1 node) + 1D TEP4 decode (TP4+EP4, +# 1 node) = 2 nodes. Mid-concurrency config. B300 optimal at +# conc 64-256. EP splits 128 MoE experts across 4 decode ranks. model: path: "minimax-m3-mxfp8" @@ -24,12 +24,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 2 + prefill_nodes: 1 + decode_nodes: 1 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 + gpus_per_prefill: 4 + gpus_per_decode: 4 frontend: type: dynamo @@ -60,10 +60,8 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 + tensor-parallel-size: 4 pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13346 enable-expert-parallel: true enforce-eager: true max-model-len: 9472 @@ -82,7 +80,7 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 8 + tensor-parallel-size: 4 pipeline-parallel-size: 1 enable-expert-parallel: true max-model-len: 9472 @@ -104,4 +102,4 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "8x16x32x64" + concurrencies: "64x128x256" diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml similarity index 82% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml index 528b90da9..de35075fb 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml @@ -1,8 +1,8 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-tp8-4n-8k1k" +name: "minimax-m3-vllm-disagg-gb200-1p1d-tep4-tp4-2n-8k1k" -# 1P DEP8 prefill (TP1 DP8 EP, 2 nodes) + 1D TP8 decode -# (Marlin MoE, 2 nodes) = 4 nodes. Low-concurrency latency config. -# Marlin is restricted to the pure TP8 decode worker. +# 1P TEP4 prefill (TP4+EP4, 1 node) + 1D TP4 decode (Marlin MoE, +# 1 node) = 2 nodes. Low-concurrency latency config. B300 optimal +# at conc 1-32. Marlin on TP-only decode (no EP). 
model: path: "minimax-m3-mxfp8" @@ -24,12 +24,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 2 + prefill_nodes: 1 + decode_nodes: 1 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 + gpus_per_prefill: 4 + gpus_per_decode: 4 frontend: type: dynamo @@ -60,10 +60,8 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 + tensor-parallel-size: 4 pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13346 enable-expert-parallel: true enforce-eager: true max-model-len: 9472 @@ -82,7 +80,7 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 8 + tensor-parallel-size: 4 moe-backend: marlin pipeline-parallel-size: 1 max-model-len: 9472 @@ -104,4 +102,4 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "8x16x32x64" + concurrencies: "8x16x32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p8d-tep4-tep8-18n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p8d-tep4-tep8-18n.yaml new file mode 100644 index 000000000..1e3cea453 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p8d-tep4-tep8-18n.yaml @@ -0,0 +1,106 @@ +name: "minimax-m3-vllm-disagg-gb200-2p8d-tep4-tep8-18n-8k1k" + +# 2P TEP4 prefill (2 nodes) + 8D TEP8 decode (16 nodes) = 18 nodes +# / 72 GPU. High-concurrency throughput config. B300 optimal at +# conc 512 is TEP8 (TP8+EP8). Each decode worker spans 2 GB200 +# nodes (8 GPU) over NVL72 NVLink. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:nightly-aarch64" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 16 + prefill_workers: 2 + decode_workers: 8 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9472 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 9472 + max-num-seqs: 512 + max-num-batched-tokens: 512 + max-cudagraph-capture-size: 512 + block-size: 128 + attention-backend: FLASHINFER + language-model-only: true + gpu-memory-utilization: 0.9 
+ safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + numa-bind: true + enable-sleep-mode: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512" From 2ac0def1c4c38c5cf83652462ca0917054a8d948 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 19 Jun 2026 19:02:00 +0800 Subject: [PATCH 30/33] fix: cap decode workers at 2 for high-conc GB200 M3 recipes Rename 2p8d/18n recipes to 2p2d/6n: 2 prefill (2 nodes) + 2 decode (4 nodes) = 6 nodes total. Co-Authored-By: Claude Opus 4.6 --- .github/configs/nvidia-master.yaml | 14 +++++++------- ...8n.yaml => disagg-gb200-2p2d-tep4-dep8-6n.yaml} | 10 +++++----- ...8n.yaml => disagg-gb200-2p2d-tep4-tep8-6n.yaml} | 10 +++++----- 3 files changed, 17 insertions(+), 17 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-2p8d-tep4-dep8-18n.yaml => disagg-gb200-2p2d-tep4-dep8-6n.yaml} (91%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-2p8d-tep4-tep8-18n.yaml => disagg-gb200-2p2d-tep4-tep8-6n.yaml} (91%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f85aeefa9..c073feb21 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12913,7 +12913,7 @@ qwen3.5-fp4-b200-trt: # index-cache alignment); FLASHINFER attention. All prefill is TEP4 # (TP4+EP4, 1 GB200 node). Decode configs mirror B300 single-node optimal # points (run 27630519240): TP4+Marlin at low conc, TEP4 at mid conc, -# TEP8 (8k1k) / DEP8 (1k1k) at high conc with 8 decode workers. +# TEP8 (8k1k) / DEP8 (1k1k) at high conc with 2 decode workers. 
minimaxm3-fp8-gb200-dynamo-vllm: image: vllm/vllm-openai:nightly-aarch64 model: MiniMaxAI/MiniMax-M3-MXFP8 @@ -12958,7 +12958,7 @@ minimaxm3-fp8-gb200-dynamo-vllm: # ep: 4 # dp-attn: false # - # # High conc: 2P TEP4 + 8D DEP8, 18 nodes / 72 GPU. + # # High conc: 2P TEP4 + 2D DEP8, 6 nodes / 24 GPU. # - conc-list: [1024] # prefill: # num-worker: 2 @@ -12966,9 +12966,9 @@ minimaxm3-fp8-gb200-dynamo-vllm: # ep: 4 # dp-attn: false # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p8d-tep4-dep8-18n.yaml" + # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-tep4-dep8-6n.yaml" # decode: - # num-worker: 8 + # num-worker: 2 # tp: 1 # ep: 8 # dp-attn: true @@ -13006,7 +13006,7 @@ minimaxm3-fp8-gb200-dynamo-vllm: ep: 4 dp-attn: false - # High conc: 2P TEP4 + 8D TEP8, 18 nodes / 72 GPU. + # High conc: 2P TEP4 + 2D TEP8, 6 nodes / 24 GPU. - conc-list: [512] prefill: num-worker: 2 @@ -13014,9 +13014,9 @@ minimaxm3-fp8-gb200-dynamo-vllm: ep: 4 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p8d-tep4-tep8-18n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-tep4-tep8-6n.yaml" decode: - num-worker: 8 + num-worker: 2 tp: 8 ep: 8 dp-attn: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p8d-tep4-dep8-18n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-tep4-dep8-6n.yaml similarity index 91% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p8d-tep4-dep8-18n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-tep4-dep8-6n.yaml index 265b26092..f3cd98459 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p8d-tep4-dep8-18n.yaml +++ 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-tep4-dep8-6n.yaml @@ -1,7 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-2p8d-tep4-dep8-18n-1k1k" +name: "minimax-m3-vllm-disagg-gb200-2p2d-tep4-dep8-6n-1k1k" -# 2P TEP4 prefill (2 nodes) + 8D DEP8 decode (16 nodes) = 18 nodes -# / 72 GPU. High-concurrency throughput config. B300 optimal at +# 2P TEP4 prefill (2 nodes) + 2D DEP8 decode (4 nodes) = 6 nodes +# / 24 GPU. High-concurrency throughput config. B300 optimal at # conc 1024 is DEP8 (TP1 DP8 EP, dp-attn). Each decode worker # spans 2 GB200 nodes (8 GPU) over NVL72 NVLink. @@ -26,9 +26,9 @@ resources: gpu_type: "gb200" gpus_per_node: 4 prefill_nodes: 2 - decode_nodes: 16 + decode_nodes: 4 prefill_workers: 2 - decode_workers: 8 + decode_workers: 2 gpus_per_prefill: 4 gpus_per_decode: 8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p8d-tep4-tep8-18n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-tep4-tep8-6n.yaml similarity index 91% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p8d-tep4-tep8-18n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-tep4-tep8-6n.yaml index 1e3cea453..999ffa26d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p8d-tep4-tep8-18n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-tep4-tep8-6n.yaml @@ -1,7 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-2p8d-tep4-tep8-18n-8k1k" +name: "minimax-m3-vllm-disagg-gb200-2p2d-tep4-tep8-6n-8k1k" -# 2P TEP4 prefill (2 nodes) + 8D TEP8 decode (16 nodes) = 18 nodes -# / 72 GPU. High-concurrency throughput config. B300 optimal at +# 2P TEP4 prefill (2 nodes) + 2D TEP8 decode (4 nodes) = 6 nodes +# / 24 GPU. High-concurrency throughput config. 
B300 optimal at # conc 512 is TEP8 (TP8+EP8). Each decode worker spans 2 GB200 # nodes (8 GPU) over NVL72 NVLink. @@ -26,9 +26,9 @@ resources: gpu_type: "gb200" gpus_per_node: 4 prefill_nodes: 2 - decode_nodes: 16 + decode_nodes: 4 prefill_workers: 2 - decode_workers: 8 + decode_workers: 2 gpus_per_prefill: 4 gpus_per_decode: 8 From 748469a061d2c1b96da85c08a3daf4a0801defa2 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 20 Jun 2026 12:15:59 +0800 Subject: [PATCH 31/33] feat: adapt NV B300 PR #1863 disagg configs for GB200 M3 sweep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace TEP4 prefill + B300-optimal decode recipes with NV's PR #1863 B300 dynamo-vllm disagg search matrix, adapted for GB200 NVL72 (4 GPU/node): - All prefill switched to DEP2 (TP1 DP2 EP, 2 GPU/worker) — lighter per-worker footprint allows more prefill workers - Decode types: TP4+Marlin, TEP8, DEP8, DEP4 - 4p3d (3 decode workers) skipped - 15 recipe files: 8 for 8k1k, 7 for 1k1k (both ISLs active) - PR 1863 vllm_config values (max-num-seqs up to 4096, max-cudagraph-capture-size up to 8192, max-num-batched-tokens 16384) - Prefill uses cudagraph (max-cudagraph-capture-size: 2048) instead of enforce-eager - req_rate: inf for all benchmarks - FLASHINFER attention, GB200 UCX env vars preserved Co-Authored-By: Claude Opus 4.6 --- .github/configs/nvidia-master.yaml | 273 +++++++++++++----- ...ml => disagg-gb200-1p1d-dep2-tep8-3n.yaml} | 68 ++--- ...disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml} | 63 ++-- .../1k1k/disagg-gb200-1p1d-dep8-4n.yaml | 118 -------- .../1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml | 117 -------- .../1k1k/disagg-gb200-1p1d-tp4-2n.yaml | 113 -------- .../1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml | 107 ------- .../1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml | 107 ------- .../1k1k/disagg-gb200-1p2d-dep2-dep4-3n.yaml | 101 +++++++ .../1k1k/disagg-gb200-1p2d-tp4-3n.yaml | 106 ------- 
.../1k1k/disagg-gb200-2p1d-dep2-dep8-3n.yaml | 101 +++++++ .../1k1k/disagg-gb200-2p1d-dep2-tep8-3n.yaml | 99 +++++++ .../1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml | 117 -------- ...ml => disagg-gb200-2p2d-dep2-tep8-5n.yaml} | 67 ++--- .../1k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml | 99 +++++++ .../disagg-gb200-4p1d-dep8-dep16-12n.yaml | 117 -------- ...disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml} | 63 ++-- ...ml => disagg-gb200-1p2d-dep2-dep8-5n.yaml} | 72 +++-- .../8k1k/disagg-gb200-2p2d-dep2-dep8-5n.yaml | 101 +++++++ ...ml => disagg-gb200-2p2d-dep2-tep8-5n.yaml} | 63 ++-- .../8k1k/disagg-gb200-3p2d-dep2-dep8-6n.yaml | 101 +++++++ .../8k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml | 99 +++++++ .../8k1k/disagg-gb200-4p2d-dep2-dep8-6n.yaml | 101 +++++++ .../8k1k/disagg-gb200-5p2d-dep2-tep8-7n.yaml | 99 +++++++ 24 files changed, 1282 insertions(+), 1190 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p1d-tep4-2n.yaml => disagg-gb200-1p1d-dep2-tep8-3n.yaml} (65%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p1d-tep4-tp4-2n.yaml => disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml} (68%) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep2-dep4-3n.yaml delete mode 100644 
benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-dep8-3n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-tep8-3n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-2p2d-tep4-dep8-6n.yaml => disagg-gb200-2p2d-dep2-tep8-5n.yaml} (66%) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-1p1d-tep4-tp4-2n.yaml => disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml} (68%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-1p1d-tep4-2n.yaml => disagg-gb200-1p2d-dep2-dep8-5n.yaml} (64%) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-dep8-5n.yaml rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-2p2d-tep4-tep8-6n.yaml => disagg-gb200-2p2d-dep2-tep8-5n.yaml} (67%) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-dep8-6n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep2-dep8-6n.yaml create mode 100644 
benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep2-tep8-7n.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c073feb21..6d631c8cb 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12904,16 +12904,10 @@ qwen3.5-fp4-b200-trt: - { tp: 4, ep: 4, dp-attn: true, conc-list: [1024] } - { tp: 8, ep: 8, dp-attn: true, conc-list: [256, 512, 1024] } -# MiniMax-M3 (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). -# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint -# (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX -# tensor cores on Blackwell. Image is nightly-aarch64 (vLLM main); recipes -# set dynamo.install=true + wheel 1.2.0.dev20260526 so the dynamo runtime -# AND NIXL are layered in at job start. block-size 128 mandatory (MSA -# index-cache alignment); FLASHINFER attention. All prefill is TEP4 -# (TP4+EP4, 1 GB200 node). Decode configs mirror B300 single-node optimal -# points (run 27630519240): TP4+Marlin at low conc, TEP4 at mid conc, -# TEP8 (8k1k) / DEP8 (1k1k) at high conc with 2 decode workers. +# MiniMax-M3 GB200 disagg sweep — adapted from NV B300 PR #1863. +# All prefill DEP2 (TP1 DP2 EP, 2 GPU/worker). Decode: TP4+Marlin, TEP8, +# DEP8, DEP4. 4 GPU/node (GB200 NVL72). 4p3d (3 decode workers) skipped. +# FLASHINFER attention. No kv-cache-dtype (GB200 default). minimaxm3-fp8-gb200-dynamo-vllm: image: vllm/vllm-openai:nightly-aarch64 model: MiniMaxAI/MiniMax-M3-MXFP8 @@ -12925,96 +12919,231 @@ minimaxm3-fp8-gb200-dynamo-vllm: disagg: true scenarios: fixed-seq-len: - # - isl: 1024 - # osl: 1024 - # search-space: - # # Low conc: 1P TEP4 + 1D TP4 Marlin, 2 nodes. 
- # - conc-list: [16, 32] - # prefill: - # num-worker: 1 - # tp: 4 - # ep: 4 - # dp-attn: false - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml" - # decode: - # num-worker: 1 - # tp: 4 - # ep: 1 - # dp-attn: false - # - # # Mid conc: 1P TEP4 + 1D TEP4, 2 nodes. - # - conc-list: [64, 128, 256, 512] - # prefill: - # num-worker: 1 - # tp: 4 - # ep: 4 - # dp-attn: false - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-2n.yaml" - # decode: - # num-worker: 1 - # tp: 4 - # ep: 4 - # dp-attn: false - # - # # High conc: 2P TEP4 + 2D DEP8, 6 nodes / 24 GPU. - # - conc-list: [1024] - # prefill: - # num-worker: 2 - # tp: 4 - # ep: 4 - # dp-attn: false - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-tep4-dep8-6n.yaml" - # decode: - # num-worker: 2 - # tp: 1 - # ep: 8 - # dp-attn: true - - - isl: 8192 + - isl: 1024 osl: 1024 search-space: - # Low conc: 1P TEP4 + 1D TP4 Marlin, 2 nodes. - - conc-list: [8, 16, 32] + # 1p1d DEP2+TEP8, 3n: conc 4,16,64,128,4096 + - conc-list: [4, 16, 64, 128, 4096] prefill: num-worker: 1 - tp: 4 - ep: 4 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tep8-3n.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 dp-attn: false + + # 1p1d DEP2+TP4 Marlin, 2n: conc 1,4,8,16 + - conc-list: [1, 4, 8, 16] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml" decode: num-worker: 1 tp: 4 ep: 1 dp-attn: false - # Mid conc: 1P TEP4 + 1D TEP4, 2 nodes. 
- - conc-list: [64, 128, 256] + # 1p2d DEP2+DEP4, 3n: conc 2048 + - conc-list: [2048] prefill: num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep2-dep4-3n.yaml" + decode: + num-worker: 2 tp: 4 ep: 4 + dp-attn: true + + # 2p1d DEP2+DEP8, 3n: conc 512,4096 + - conc-list: [512, 4096] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-dep8-3n.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + + # 2p1d DEP2+TEP8, 3n: conc 32 + - conc-list: [32] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-tep8-3n.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 dp-attn: false + + # 2p2d DEP2+TEP8, 5n: conc 16 + - conc-list: [16] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-2n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + + # 3p2d DEP2+TEP8, 6n: conc 4 + - conc-list: [4] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + + - isl: 8192 + osl: 1024 + search-space: + # 1p1d DEP2+TP4 Marlin, 2n: conc 1,4,8,16 + - conc-list: [1, 4, 8, 16] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml" decode: num-worker: 1 tp: 4 - ep: 4 + ep: 1 + dp-attn: false + + # 1p2d DEP2+DEP8, 5n: conc 128 + - conc-list: [128] + prefill: + num-worker: 1 
+ tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep2-dep8-5n.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + + # 2p2d DEP2+DEP8, 5n: conc 256,512 + - conc-list: [256, 512] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-dep8-5n.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + + # 2p2d DEP2+TEP8, 5n: conc 16 + - conc-list: [16] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 dp-attn: false - # High conc: 2P TEP4 + 2D TEP8, 6 nodes / 24 GPU. + # 3p2d DEP2+DEP8, 6n: conc 512 - conc-list: [512] prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-dep8-6n.yaml" + decode: num-worker: 2 - tp: 4 - ep: 4 + tp: 8 + ep: 8 + dp-attn: true + + # 3p2d DEP2+TEP8, 6n: conc 32 + - conc-list: [32] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 dp-attn: false + + # 4p2d DEP2+DEP8, 6n: conc 4096 + - conc-list: [4096] + prefill: + num-worker: 4 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep2-dep8-6n.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + + # 5p2d DEP2+TEP8, 7n: conc 4,64 + - conc-list: [4, 64] + prefill: + num-worker: 5 + tp: 2 + ep: 2 + dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-tep4-tep8-6n.yaml" + - 
"CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep2-tep8-7n.yaml" decode: num-worker: 2 tp: 8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tep8-3n.yaml similarity index 65% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-2n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tep8-3n.yaml index 938bfe8cb..b4484c443 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tep8-3n.yaml @@ -1,8 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-tep4-2n-1k1k" +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep2-tep8-fp8-1k1k" -# 1P TEP4 prefill (TP4+EP4, 1 node) + 1D TEP4 decode (TP4+EP4, -# 1 node) = 2 nodes. Mid-concurrency config. B300 optimal at -# conc 64-512. EP splits 128 MoE experts across 4 decode ranks. +# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TEP8 (TP8+EP8) +# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863. 
model: path: "minimax-m3-mxfp8" @@ -20,16 +19,15 @@ health_check: max_attempts: 720 interval_seconds: 10 - resources: gpu_type: "gb200" gpus_per_node: 4 prefill_nodes: 1 - decode_nodes: 1 + decode_nodes: 2 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 + gpus_per_prefill: 2 + gpus_per_decode: 8 frontend: type: dynamo @@ -41,6 +39,7 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" @@ -50,6 +49,7 @@ backend: decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" @@ -60,46 +60,40 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - pipeline-parallel-size: 1 + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 enable-expert-parallel: true - enforce-eager: true - max-model-len: 2304 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 1 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - pipeline-parallel-size: 1 + tensor-parallel-size: 8 enable-expert-parallel: true - max-model-len: 2304 - max-num-seqs: 256 - max-num-batched-tokens: 256 - max-cudagraph-capture-size: 128 - block-size: 128 - 
attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 1 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "64x128x256x512" + concurrencies: "4x16x64x128x4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml similarity index 68% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml index 35a358f2b..a0f1bda01 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml @@ -1,8 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-tep4-tp4-2n-1k1k" +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep2-tp4-marlin-fp8-1k1k" -# 1P TEP4 prefill (TP4+EP4, 1 node) + 1D TP4 decode (Marlin MoE, -# 1 node) = 2 nodes. Low-concurrency latency config. B300 optimal -# at conc 1-32. Marlin on TP-only decode (no EP). +# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TP4 Marlin +# decode = 2 nodes (1P + 1D). Adapted from NV B300 PR #1863. 
model: path: "minimax-m3-mxfp8" @@ -20,7 +19,6 @@ health_check: max_attempts: 720 interval_seconds: 10 - resources: gpu_type: "gb200" gpus_per_node: 4 @@ -28,7 +26,7 @@ resources: decode_nodes: 1 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 4 + gpus_per_prefill: 2 gpus_per_decode: 4 frontend: @@ -41,6 +39,7 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" @@ -50,6 +49,7 @@ backend: decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" @@ -60,46 +60,41 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - pipeline-parallel-size: 1 + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 enable-expert-parallel: true - enforce-eager: true - max-model-len: 2304 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 1 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 4 + enable-expert-parallel: false moe-backend: marlin - pipeline-parallel-size: 1 - max-model-len: 2304 - max-num-seqs: 256 - max-num-batched-tokens: 256 - max-cudagraph-capture-size: 128 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - 
gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 1 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 2048 benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "16x32" + concurrencies: "1x4x8x16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml deleted file mode 100644 index c930ca92b..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml +++ /dev/null @@ -1,118 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-1k1k" - -# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (mid curve, wide EP). -# Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) -> NixlConnector -> -# Decode DEP8 (8 GPU / 2 nodes) = 4 nodes. Rack-scale expert parallel -# over the NVL72 NVLink fabric -- the regime where GB200 pulls ahead of -# B200 (capped at an 8-GPU NVLink island). M3 has 128 routed experts so -# EP8 shards 16 experts/rank. FLASHINFER attention, block-size 128. 
- -model: - path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead - # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and - # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead - # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and - # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
- UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 2304 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 2304 - max-num-seqs: 512 - max-num-batched-tokens: 512 - max-cudagraph-capture-size: 512 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml deleted file mode 100644 index d0f92214b..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml +++ /dev/null @@ -1,117 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-1k1k" - -# MiniMax-M3 disaggregated 1P+1D recipe 
for GB200 (wide-decode curve). -# Prefill DEP8 (8 GPU / 2 nodes) -> NixlConnector -> Decode DEP16 (DP-attn -# + EP across 16 GPU / 4 nodes) = 6 nodes. EP16 (8 experts/rank of 128) -# spans the NVL72 fabric to maximize decode token throughput. FLASHINFER -# attention, block-size 128. - -model: - path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 4 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead - # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and - # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead - # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and - # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
- UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 2304 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 2304 - max-num-seqs: 512 - max-num-batched-tokens: 512 - max-cudagraph-capture-size: 512 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512x1024x2048" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml deleted file mode 100644 index 049af1fa7..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml +++ /dev/null @@ -1,113 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-1k1k" - -# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency 
curve). -# Prefill (TP4, 1 node) -> NixlConnector -> Decode (TP4, 1 node). Pure -# TP, no expert parallel: lowest TTFT/ITL at small concurrencies where -# wide EP would leave DP ranks idle. FLASHINFER (trtllm-gen) attention, -# block-size 128 (MSA index-cache alignment, vllm#45381 @ 022448dd). - -model: - path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead - # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and - # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead - # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and - # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
- UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - moe-backend: marlin - pipeline-parallel-size: 1 - enforce-eager: true - max-model-len: 2304 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 1 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - moe-backend: marlin - pipeline-parallel-size: 1 - max-model-len: 2304 - max-num-seqs: 256 - max-num-batched-tokens: 256 - max-cudagraph-capture-size: 128 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 1 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x2x4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml deleted file mode 100644 index 890b5a590..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml +++ /dev/null @@ -1,107 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tep4-1k1k" - -# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, decode -# TEP4). Prefill TP4 (1 node) -> NixlConnector -> Decode TEP4 -# (TP4+EP4, 1 node) = 2 nodes. 
Expert parallelism on decode splits 128 -# MoE experts across 4 ranks (32 each), reducing per-step MoE compute. -# FLASHINFER, block-size 128. - -model: - path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - moe-backend: marlin - pipeline-parallel-size: 1 - enforce-eager: true - max-model-len: 2304 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 1 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - pipeline-parallel-size: 1 - enable-expert-parallel: true - max-model-len: 2304 - max-num-seqs: 256 - max-num-batched-tokens: 256 - 
max-cudagraph-capture-size: 128 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 1 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x2x4x8x16x32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml deleted file mode 100644 index 8d63df4ab..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml +++ /dev/null @@ -1,107 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-1k1k" - -# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, wider -# decode). Prefill TP4 (1 node) -> NixlConnector -> Decode TEP8 -# (TP8+EP8, 2 nodes) = 3 nodes. Wider decode TP + expert parallelism -# reduces per-step latency by spreading both attention and MoE across -# 8 GPU over NVL72 NVLink. FLASHINFER, block-size 128. 
- -model: - path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - moe-backend: marlin - pipeline-parallel-size: 1 - enforce-eager: true - max-model-len: 2304 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 1 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 8 - pipeline-parallel-size: 1 - enable-expert-parallel: true - max-model-len: 2304 - max-num-seqs: 256 - max-num-batched-tokens: 256 - max-cudagraph-capture-size: 128 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - 
safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 1 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x2x4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep2-dep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep2-dep4-3n.yaml new file mode 100644 index 000000000..70c71b647 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep2-dep4-3n.yaml @@ -0,0 +1,101 @@ +name: "minimax-m3-vllm-disagg-gb200-1p2d-dep2-dep4-fp8-1k1k" + +# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP4 (TP1 DP4 EP) +# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:nightly-aarch64" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + 
+ vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2048" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml deleted file mode 100644 index de1488514..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml +++ /dev/null @@ -1,106 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4-1k1k" - -# MiniMax-M3 disaggregated 1P+2D recipe for GB200 (low-latency, more -# decode workers). Prefill TP4 (1 node) -> NixlConnector -> 2x Decode -# TP4 (2 nodes) = 3 nodes. Two decode workers halve the per-worker -# batch, reducing ITL at low concurrency. FLASHINFER, block-size 128. 
- -model: - path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 2 - gpus_per_prefill: 4 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - moe-backend: marlin - pipeline-parallel-size: 1 - enforce-eager: true - max-model-len: 2304 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 1 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - moe-backend: marlin - pipeline-parallel-size: 1 - max-model-len: 2304 - max-num-seqs: 256 - max-num-batched-tokens: 256 - max-cudagraph-capture-size: 128 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - 
safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 1 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x2x4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-dep8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-dep8-3n.yaml new file mode 100644 index 000000000..1785bbe22 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-dep8-3n.yaml @@ -0,0 +1,101 @@ +name: "minimax-m3-vllm-disagg-gb200-2p1d-dep2-dep8-fp8-1k1k" + +# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D DEP8 (TP1 DP8 EP) +# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:nightly-aarch64" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + 
+ vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-tep8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-tep8-3n.yaml new file mode 100644 index 000000000..79c4f08f8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-tep8-3n.yaml @@ -0,0 +1,99 @@ +name: "minimax-m3-vllm-disagg-gb200-2p1d-dep2-tep8-fp8-1k1k" + +# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TEP8 (TP8+EP8) +# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:nightly-aarch64" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + 
max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "32" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml deleted file mode 100644 index de4f3ce22..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml +++ /dev/null @@ -1,117 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-1k1k" - -# MiniMax-M3 disaggregated 2P+1D recipe for GB200 (prefill-scaled). -# 2x Prefill DEP8 (8 GPU / 2 nodes each) -> NixlConnector -> Decode DEP16 -# (16 GPU / 4 nodes) = 8 nodes. Two wide prefill workers sustain prompt -# ingest into a single wide decode at high concurrency. FLASHINFER -# attention, block-size 128. - -model: - path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 4 - decode_nodes: 4 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead - # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and - # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
- UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead - # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and - # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 2304 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 2304 - max-num-seqs: 512 - max-num-batched-tokens: 512 - max-cudagraph-capture-size: 512 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "2048x4096" diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-tep4-dep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml similarity index 66% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-tep4-dep8-6n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml index f3cd98459..b47576e2b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-tep4-dep8-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml @@ -1,9 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-2p2d-tep4-dep8-6n-1k1k" +name: "minimax-m3-vllm-disagg-gb200-2p2d-dep2-tep8-fp8-1k1k" -# 2P TEP4 prefill (2 nodes) + 2D DEP8 decode (4 nodes) = 6 nodes -# / 24 GPU. High-concurrency throughput config. B300 optimal at -# conc 1024 is DEP8 (TP1 DP8 EP, dp-attn). Each decode worker -# spans 2 GB200 nodes (8 GPU) over NVL72 NVLink. +# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8) +# decode = 5 nodes (1P + 4D). Adapted from NV B300 PR #1863. 
model: path: "minimax-m3-mxfp8" @@ -21,15 +19,14 @@ health_check: max_attempts: 720 interval_seconds: 10 - resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 2 + prefill_nodes: 1 decode_nodes: 4 prefill_workers: 2 decode_workers: 2 - gpus_per_prefill: 4 + gpus_per_prefill: 2 gpus_per_decode: 8 frontend: @@ -42,6 +39,7 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" @@ -51,6 +49,7 @@ backend: decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" @@ -61,48 +60,40 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - pipeline-parallel-size: 1 + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 enable-expert-parallel: true - enforce-eager: true - max-model-len: 2304 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + attention-backend: FLASHINFER stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 + tensor-parallel-size: 8 enable-expert-parallel: true - max-model-len: 2304 - max-num-seqs: 512 - max-num-batched-tokens: 512 - max-cudagraph-capture-size: 512 - block-size: 128 - attention-backend: 
FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 128 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "1024" + concurrencies: "16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml new file mode 100644 index 000000000..91aff7587 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml @@ -0,0 +1,99 @@ +name: "minimax-m3-vllm-disagg-gb200-3p2d-dep2-tep8-fp8-1k1k" + +# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8) +# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:nightly-aarch64" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + 
max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml deleted file mode 100644 index cd978be55..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml +++ /dev/null @@ -1,117 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-1k1k" - -# MiniMax-M3 disaggregated 4P+1D recipe for GB200 (max throughput). -# 4x Prefill DEP8 (8 GPU / 2 nodes each = 8 nodes) -> NixlConnector -> -# Decode DEP16 (16 GPU / 4 nodes) = 12 nodes within one NVL72 rack. Max -# prefill fan-in for the highest-concurrency points. FLASHINFER attention, -# block-size 128. - -model: - path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 8 - decode_nodes: 4 - prefill_workers: 4 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead - # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and - # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. 
- UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead - # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and - # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable. - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13346 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 2304 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 32 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 2304 - max-num-seqs: 512 - max-num-batched-tokens: 512 - max-cudagraph-capture-size: 512 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" - trust-remote-code: true - no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 128 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4096x8192" diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml similarity index 68% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml index de35075fb..22614c41e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml @@ -1,8 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-tep4-tp4-2n-8k1k" +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep2-tp4-marlin-fp8-8k1k" -# 1P TEP4 prefill (TP4+EP4, 1 node) + 1D TP4 decode (Marlin MoE, -# 1 node) = 2 nodes. Low-concurrency latency config. B300 optimal -# at conc 1-32. Marlin on TP-only decode (no EP). +# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TP4 Marlin +# decode = 2 nodes (1P + 1D). Adapted from NV B300 PR #1863. 
model: path: "minimax-m3-mxfp8" @@ -20,7 +19,6 @@ health_check: max_attempts: 720 interval_seconds: 10 - resources: gpu_type: "gb200" gpus_per_node: 4 @@ -28,7 +26,7 @@ resources: decode_nodes: 1 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 4 + gpus_per_prefill: 2 gpus_per_decode: 4 frontend: @@ -41,6 +39,7 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" @@ -50,6 +49,7 @@ backend: decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" @@ -60,46 +60,41 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - pipeline-parallel-size: 1 + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 enable-expert-parallel: true - enforce-eager: true - max-model-len: 9472 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 1 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 4 + enable-expert-parallel: false moe-backend: marlin - pipeline-parallel-size: 1 - max-model-len: 9472 - max-num-seqs: 256 - max-num-batched-tokens: 256 - max-cudagraph-capture-size: 128 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - 
gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 1 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 2048 benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "8x16x32" + concurrencies: "1x4x8x16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep2-dep8-5n.yaml similarity index 64% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-2n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep2-dep8-5n.yaml index 405751955..0d3339356 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep2-dep8-5n.yaml @@ -1,8 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-tep4-2n-8k1k" +name: "minimax-m3-vllm-disagg-gb200-1p2d-dep2-dep8-fp8-8k1k" -# 1P TEP4 prefill (TP4+EP4, 1 node) + 1D TEP4 decode (TP4+EP4, -# 1 node) = 2 nodes. Mid-concurrency config. B300 optimal at -# conc 64-256. EP splits 128 MoE experts across 4 decode ranks. +# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP) +# decode = 5 nodes (1P + 4D). Adapted from NV B300 PR #1863. 
model: path: "minimax-m3-mxfp8" @@ -20,16 +19,15 @@ health_check: max_attempts: 720 interval_seconds: 10 - resources: gpu_type: "gb200" gpus_per_node: 4 prefill_nodes: 1 - decode_nodes: 1 + decode_nodes: 4 prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 frontend: type: dynamo @@ -41,6 +39,7 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" @@ -50,6 +49,7 @@ backend: decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" @@ -60,46 +60,42 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - pipeline-parallel-size: 1 + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 enable-expert-parallel: true - enforce-eager: true - max-model-len: 9472 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 1 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - pipeline-parallel-size: 1 + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 enable-expert-parallel: true - max-model-len: 9472 - max-num-seqs: 256 - 
max-num-batched-tokens: 256 - max-cudagraph-capture-size: 128 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 1 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "64x128x256" + concurrencies: "128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-dep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-dep8-5n.yaml new file mode 100644 index 000000000..c2983a2e5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-dep8-5n.yaml @@ -0,0 +1,101 @@ +name: "minimax-m3-vllm-disagg-gb200-2p2d-dep2-dep8-fp8-8k1k" + +# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP) +# decode = 5 nodes (1P + 4D). Adapted from NV B300 PR #1863. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:nightly-aarch64" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 2 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + 
attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-tep4-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml similarity index 67% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-tep4-tep8-6n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml index 999ffa26d..0b605388f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-tep4-tep8-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml @@ -1,9 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-2p2d-tep4-tep8-6n-8k1k" +name: "minimax-m3-vllm-disagg-gb200-2p2d-dep2-tep8-fp8-8k1k" -# 2P TEP4 prefill (2 nodes) + 2D TEP8 decode (4 nodes) = 6 nodes -# / 24 GPU. High-concurrency throughput config. B300 optimal at -# conc 512 is TEP8 (TP8+EP8). Each decode worker spans 2 GB200 -# nodes (8 GPU) over NVL72 NVLink. +# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8) +# decode = 5 nodes (1P + 4D). Adapted from NV B300 PR #1863. 
model: path: "minimax-m3-mxfp8" @@ -21,15 +19,14 @@ health_check: max_attempts: 720 interval_seconds: 10 - resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 2 + prefill_nodes: 1 decode_nodes: 4 prefill_workers: 2 decode_workers: 2 - gpus_per_prefill: 4 + gpus_per_prefill: 2 gpus_per_decode: 8 frontend: @@ -42,6 +39,7 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" @@ -51,6 +49,7 @@ backend: decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" @@ -61,46 +60,40 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - pipeline-parallel-size: 1 + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 enable-expert-parallel: true - enforce-eager: true - max-model-len: 9472 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + attention-backend: FLASHINFER stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 8 - pipeline-parallel-size: 1 enable-expert-parallel: true - max-model-len: 9472 - max-num-seqs: 512 - max-num-batched-tokens: 512 - max-cudagraph-capture-size: 512 - block-size: 128 - attention-backend: FLASHINFER - language-model-only: true - gpu-memory-utilization: 0.9 - 
safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - numa-bind: true - enable-sleep-mode: true - stream-interval: 128 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "512" + concurrencies: "16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-dep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-dep8-6n.yaml new file mode 100644 index 000000000..2010dbf62 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-dep8-6n.yaml @@ -0,0 +1,101 @@ +name: "minimax-m3-vllm-disagg-gb200-3p2d-dep2-dep8-fp8-8k1k" + +# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP) +# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:nightly-aarch64" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + 
attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml new file mode 100644 index 000000000..c0dc7c26f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml @@ -0,0 +1,99 @@ +name: "minimax-m3-vllm-disagg-gb200-3p2d-dep2-tep8-fp8-8k1k" + +# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8) +# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:nightly-aarch64" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + 
prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep2-dep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep2-dep8-6n.yaml new file mode 100644 index 000000000..669cad1d0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep2-dep8-6n.yaml @@ -0,0 +1,101 @@ +name: "minimax-m3-vllm-disagg-gb200-4p2d-dep2-dep8-fp8-8k1k" + +# 4P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP) +# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:nightly-aarch64" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 4 + prefill_workers: 4 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + 
attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep2-tep8-7n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep2-tep8-7n.yaml new file mode 100644 index 000000000..4aec44f74 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep2-tep8-7n.yaml @@ -0,0 +1,99 @@ +name: "minimax-m3-vllm-disagg-gb200-5p2d-dep2-tep8-fp8-8k1k" + +# 5P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8) +# decode = 7 nodes (3P + 4D). Adapted from NV B300 PR #1863. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:nightly-aarch64" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 3 + decode_nodes: 4 + prefill_workers: 5 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + 
prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x64" + req_rate: "inf" From d8e17d461325a7f233b99534cfb0714ba5a59cfa Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sun, 21 Jun 2026 08:47:56 +0800 Subject: [PATCH 32/33] fix: fit MiniMax-M3 sweep on GB200 --- .github/configs/nvidia-master.yaml | 124 +++++++++--------- ...ml => disagg-gb200-1p1d-dep4-tep8-3n.yaml} | 12 +- ...disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml} | 12 +- ...ml => disagg-gb200-1p2d-dep4-dep4-3n.yaml} | 16 ++- ...ml => disagg-gb200-2p1d-dep4-dep8-4n.yaml} | 18 +-- ...ml => disagg-gb200-2p1d-dep4-tep8-4n.yaml} | 14 +- ...ml => disagg-gb200-2p2d-dep4-tep8-6n.yaml} | 14 +- ...ml => disagg-gb200-3p2d-dep4-tep8-7n.yaml} | 14 +- ...disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml} | 12 +- ...ml => disagg-gb200-1p2d-dep4-dep8-5n.yaml} | 16 ++- ...ml => disagg-gb200-2p2d-dep4-dep8-6n.yaml} | 18 +-- ...ml => disagg-gb200-2p2d-dep4-tep8-6n.yaml} | 14 +- ...ml => disagg-gb200-3p2d-dep4-dep8-7n.yaml} | 18 +-- ...ml => 
disagg-gb200-3p2d-dep4-tep8-7n.yaml} | 14 +- ...ml => disagg-gb200-4p2d-dep4-dep8-8n.yaml} | 18 +-- ...ml => disagg-gb200-5p2d-dep4-tep8-9n.yaml} | 14 +- 16 files changed, 189 insertions(+), 159 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p1d-dep2-tep8-3n.yaml => disagg-gb200-1p1d-dep4-tep8-3n.yaml} (89%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml => disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml} (89%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p2d-dep2-dep4-3n.yaml => disagg-gb200-1p2d-dep4-dep4-3n.yaml} (86%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-2p1d-dep2-dep8-3n.yaml => disagg-gb200-2p1d-dep4-dep8-4n.yaml} (86%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-2p1d-dep2-tep8-3n.yaml => disagg-gb200-2p1d-dep4-tep8-4n.yaml} (88%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-2p2d-dep2-tep8-5n.yaml => disagg-gb200-2p2d-dep4-tep8-6n.yaml} (88%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-3p2d-dep2-tep8-6n.yaml => disagg-gb200-3p2d-dep4-tep8-7n.yaml} (88%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml => disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml} (89%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-1p2d-dep2-dep8-5n.yaml => disagg-gb200-1p2d-dep4-dep8-5n.yaml} (86%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-2p2d-dep2-dep8-5n.yaml => disagg-gb200-2p2d-dep4-dep8-6n.yaml} (86%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-2p2d-dep2-tep8-5n.yaml => disagg-gb200-2p2d-dep4-tep8-6n.yaml} 
(88%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-3p2d-dep2-dep8-6n.yaml => disagg-gb200-3p2d-dep4-dep8-7n.yaml} (86%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-3p2d-dep2-tep8-6n.yaml => disagg-gb200-3p2d-dep4-tep8-7n.yaml} (88%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-4p2d-dep2-dep8-6n.yaml => disagg-gb200-4p2d-dep4-dep8-8n.yaml} (86%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-5p2d-dep2-tep8-7n.yaml => disagg-gb200-5p2d-dep4-tep8-9n.yaml} (88%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 19215da03..cc22e03c5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -13305,9 +13305,9 @@ qwen3.5-fp4-b200-trt: - { tp: 8, ep: 8, dp-attn: true, conc-list: [256, 512, 1024] } # MiniMax-M3 GB200 disagg sweep — adapted from NV B300 PR #1863. -# All prefill DEP2 (TP1 DP2 EP, 2 GPU/worker). Decode: TP4+Marlin, TEP8, +# All prefill DEP4 (TP1 DP4 EP, 4 GPU/worker). Decode: TP4+Marlin, TEP8, # DEP8, DEP4. 4 GPU/node (GB200 NVL72). 4p3d (3 decode workers) skipped. -# FLASHINFER attention. No kv-cache-dtype (GB200 default). +# FLASHINFER attention with FP8 KV cache, matching the validated GB300 sweep. 
minimaxm3-fp8-gb200-dynamo-vllm: image: vllm/vllm-openai:nightly-aarch64 model: MiniMaxAI/MiniMax-M3-MXFP8 @@ -13322,105 +13322,105 @@ minimaxm3-fp8-gb200-dynamo-vllm: - isl: 1024 osl: 1024 search-space: - # 1p1d DEP2+TEP8, 3n: conc 4,16,64,128,4096 + # 1p1d DEP4+TEP8, 3n: conc 4,16,64,128,4096 - conc-list: [4, 16, 64, 128, 4096] prefill: num-worker: 1 - tp: 2 - ep: 2 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tep8-3n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: false - # 1p1d DEP2+TP4 Marlin, 2n: conc 1,4,8,16 + # 1p1d DEP4+TP4 Marlin, 2n: conc 1,4,8,16 - conc-list: [1, 4, 8, 16] prefill: num-worker: 1 - tp: 2 - ep: 2 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml" decode: num-worker: 1 tp: 4 ep: 1 dp-attn: false - # 1p2d DEP2+DEP4, 3n: conc 2048 + # 1p2d DEP4+DEP4, 3n: conc 2048 - conc-list: [2048] prefill: num-worker: 1 - tp: 2 - ep: 2 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep2-dep4-3n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml" decode: num-worker: 2 tp: 4 ep: 4 dp-attn: true - # 2p1d DEP2+DEP8, 3n: conc 512,4096 + # 2p1d DEP4+DEP8, 4n: conc 512,4096 - conc-list: [512, 4096] prefill: num-worker: 2 - tp: 2 - ep: 2 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-dep8-3n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true - # 2p1d DEP2+TEP8, 3n: conc 32 + # 2p1d DEP4+TEP8, 
4n: conc 32 - conc-list: [32] prefill: num-worker: 2 - tp: 2 - ep: 2 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-tep8-3n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: false - # 2p2d DEP2+TEP8, 5n: conc 16 + # 2p2d DEP4+TEP8, 6n: conc 16 - conc-list: [16] prefill: num-worker: 2 - tp: 2 - ep: 2 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml" decode: num-worker: 2 tp: 8 ep: 8 dp-attn: false - # 3p2d DEP2+TEP8, 6n: conc 4 + # 3p2d DEP4+TEP8, 7n: conc 4 - conc-list: [4] prefill: num-worker: 3 - tp: 2 - ep: 2 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml" decode: num-worker: 2 tp: 8 @@ -13430,120 +13430,120 @@ minimaxm3-fp8-gb200-dynamo-vllm: - isl: 8192 osl: 1024 search-space: - # 1p1d DEP2+TP4 Marlin, 2n: conc 1,4,8,16 + # 1p1d DEP4+TP4 Marlin, 2n: conc 1,4,8,16 - conc-list: [1, 4, 8, 16] prefill: num-worker: 1 - tp: 2 - ep: 2 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml" decode: num-worker: 1 tp: 4 ep: 1 dp-attn: false - # 1p2d DEP2+DEP8, 5n: conc 128 + # 1p2d DEP4+DEP8, 5n: conc 128 - conc-list: [128] prefill: num-worker: 1 - tp: 2 - ep: 2 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep2-dep8-5n.yaml" + - 
"CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml" decode: num-worker: 2 tp: 8 ep: 8 dp-attn: true - # 2p2d DEP2+DEP8, 5n: conc 256,512 + # 2p2d DEP4+DEP8, 6n: conc 256,512 - conc-list: [256, 512] prefill: num-worker: 2 - tp: 2 - ep: 2 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-dep8-5n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml" decode: num-worker: 2 tp: 8 ep: 8 dp-attn: true - # 2p2d DEP2+TEP8, 5n: conc 16 + # 2p2d DEP4+TEP8, 6n: conc 16 - conc-list: [16] prefill: num-worker: 2 - tp: 2 - ep: 2 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml" decode: num-worker: 2 tp: 8 ep: 8 dp-attn: false - # 3p2d DEP2+DEP8, 6n: conc 512 + # 3p2d DEP4+DEP8, 7n: conc 512 - conc-list: [512] prefill: num-worker: 3 - tp: 2 - ep: 2 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-dep8-6n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml" decode: num-worker: 2 tp: 8 ep: 8 dp-attn: true - # 3p2d DEP2+TEP8, 6n: conc 32 + # 3p2d DEP4+TEP8, 7n: conc 32 - conc-list: [32] prefill: num-worker: 3 - tp: 2 - ep: 2 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml" decode: num-worker: 2 tp: 8 ep: 8 dp-attn: false - # 4p2d DEP2+DEP8, 6n: conc 4096 + # 4p2d DEP4+DEP8, 8n: conc 4096 - conc-list: [4096] prefill: num-worker: 4 - tp: 2 - ep: 2 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - 
"CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep2-dep8-6n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml" decode: num-worker: 2 tp: 8 ep: 8 dp-attn: true - # 5p2d DEP2+TEP8, 7n: conc 4,64 + # 5p2d DEP4+TEP8, 9n: conc 4,64 - conc-list: [4, 64] prefill: num-worker: 5 - tp: 2 - ep: 2 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep2-tep8-7n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml" decode: num-worker: 2 tp: 8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tep8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml similarity index 89% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tep8-3n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml index b4484c443..38b463e79 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tep8-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml @@ -1,7 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-dep2-tep8-fp8-1k1k" +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-tep8-fp8-1k1k" -# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TEP8 (TP8+EP8) -# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863. +# 1P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 1D TEP8 (TP8+EP8) +# 3 nodes (1P + 2D). Adapted from NV B300 PR #1863. 
model: path: "minimax-m3-mxfp8" @@ -26,7 +26,7 @@ resources: decode_nodes: 2 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 2 + gpus_per_prefill: 4 gpus_per_decode: 8 frontend: @@ -61,7 +61,7 @@ backend: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 1 - data-parallel-size: 2 + data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true trust-remote-code: true @@ -70,6 +70,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 2304 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-cudagraph-capture-size: 2048 @@ -85,6 +86,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 2304 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-num-seqs: 4096 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml similarity index 89% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml index a0f1bda01..653683bc4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml @@ -1,7 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-dep2-tp4-marlin-fp8-1k1k" +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-tp4-marlin-fp8-1k1k" -# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TP4 Marlin -# decode = 2 nodes (1P + 1D). Adapted from NV B300 PR #1863. +# 1P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 1D TP4 Marlin +# 2 nodes (1P + 1D). 
Adapted from NV B300 PR #1863. model: path: "minimax-m3-mxfp8" @@ -26,7 +26,7 @@ resources: decode_nodes: 1 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 2 + gpus_per_prefill: 4 gpus_per_decode: 4 frontend: @@ -61,7 +61,7 @@ backend: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 1 - data-parallel-size: 2 + data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true trust-remote-code: true @@ -70,6 +70,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 2304 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-cudagraph-capture-size: 2048 @@ -86,6 +87,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 2304 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-num-seqs: 4096 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep2-dep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml similarity index 86% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep2-dep4-3n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml index 70c71b647..ca884ade7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep2-dep4-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml @@ -1,7 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-1p2d-dep2-dep4-fp8-1k1k" +name: "minimax-m3-vllm-disagg-gb200-1p2d-dep4-dep4-fp8-1k1k" -# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP4 (TP1 DP4 EP) -# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863. +# 1P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D DEP4 (TP1 DP4 EP) +# 3 nodes (1P + 2D). 
Adapted from NV B300 PR #1863. model: path: "minimax-m3-mxfp8" @@ -26,7 +26,7 @@ resources: decode_nodes: 2 prefill_workers: 1 decode_workers: 2 - gpus_per_prefill: 2 + gpus_per_prefill: 4 gpus_per_decode: 4 frontend: @@ -61,7 +61,7 @@ backend: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 1 - data-parallel-size: 2 + data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true trust-remote-code: true @@ -70,6 +70,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 2304 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-cudagraph-capture-size: 2048 @@ -87,11 +88,12 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 2304 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 - max-num-seqs: 1024 + max-num-seqs: 512 max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 8192 + max-cudagraph-capture-size: 2048 benchmark: type: "sa-bench" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-dep8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml similarity index 86% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-dep8-3n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml index 1785bbe22..10712e807 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-dep8-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml @@ -1,7 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-2p1d-dep2-dep8-fp8-1k1k" +name: "minimax-m3-vllm-disagg-gb200-2p1d-dep4-dep8-fp8-1k1k" -# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D DEP8 (TP1 DP8 EP) -# 
decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863. +# 2P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 1D DEP8 (TP1 DP8 EP) +# 4 nodes (2P + 2D). Adapted from NV B300 PR #1863. model: path: "minimax-m3-mxfp8" @@ -22,11 +22,11 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 1 + prefill_nodes: 2 decode_nodes: 2 prefill_workers: 2 decode_workers: 1 - gpus_per_prefill: 2 + gpus_per_prefill: 4 gpus_per_decode: 8 frontend: @@ -61,7 +61,7 @@ backend: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 1 - data-parallel-size: 2 + data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true trust-remote-code: true @@ -70,6 +70,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 2304 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-cudagraph-capture-size: 2048 @@ -87,11 +88,12 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 2304 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 - max-num-seqs: 4096 + max-num-seqs: 512 max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 8192 + max-cudagraph-capture-size: 2048 benchmark: type: "sa-bench" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-tep8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml similarity index 88% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-tep8-3n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml index 79c4f08f8..930ec860b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-tep8-3n.yaml +++ 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml @@ -1,7 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-2p1d-dep2-tep8-fp8-1k1k" +name: "minimax-m3-vllm-disagg-gb200-2p1d-dep4-tep8-fp8-1k1k" -# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TEP8 (TP8+EP8) -# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863. +# 2P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 1D TEP8 (TP8+EP8) +# 4 nodes (2P + 2D). Adapted from NV B300 PR #1863. model: path: "minimax-m3-mxfp8" @@ -22,11 +22,11 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 1 + prefill_nodes: 2 decode_nodes: 2 prefill_workers: 2 decode_workers: 1 - gpus_per_prefill: 2 + gpus_per_prefill: 4 gpus_per_decode: 8 frontend: @@ -61,7 +61,7 @@ backend: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 1 - data-parallel-size: 2 + data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true trust-remote-code: true @@ -70,6 +70,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 2304 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-cudagraph-capture-size: 2048 @@ -85,6 +86,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 2304 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-num-seqs: 4096 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml similarity index 88% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml index b47576e2b..c422781b4 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml @@ -1,7 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-2p2d-dep2-tep8-fp8-1k1k" +name: "minimax-m3-vllm-disagg-gb200-2p2d-dep4-tep8-fp8-1k1k" -# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8) -# decode = 5 nodes (1P + 4D). Adapted from NV B300 PR #1863. +# 2P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D TEP8 (TP8+EP8) +# 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. model: path: "minimax-m3-mxfp8" @@ -22,11 +22,11 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 1 + prefill_nodes: 2 decode_nodes: 4 prefill_workers: 2 decode_workers: 2 - gpus_per_prefill: 2 + gpus_per_prefill: 4 gpus_per_decode: 8 frontend: @@ -61,7 +61,7 @@ backend: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 1 - data-parallel-size: 2 + data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true trust-remote-code: true @@ -70,6 +70,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 2304 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-cudagraph-capture-size: 2048 @@ -85,6 +86,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 2304 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-num-seqs: 4096 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml similarity index 88% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml rename to 
benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml index 91aff7587..58fb1952f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml @@ -1,7 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-3p2d-dep2-tep8-fp8-1k1k" +name: "minimax-m3-vllm-disagg-gb200-3p2d-dep4-tep8-fp8-1k1k" -# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8) -# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. +# 3P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D TEP8 (TP8+EP8) +# 7 nodes (3P + 4D). Adapted from NV B300 PR #1863. model: path: "minimax-m3-mxfp8" @@ -22,11 +22,11 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 2 + prefill_nodes: 3 decode_nodes: 4 prefill_workers: 3 decode_workers: 2 - gpus_per_prefill: 2 + gpus_per_prefill: 4 gpus_per_decode: 8 frontend: @@ -61,7 +61,7 @@ backend: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 1 - data-parallel-size: 2 + data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true trust-remote-code: true @@ -70,6 +70,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 2304 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-cudagraph-capture-size: 2048 @@ -85,6 +86,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 2304 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-num-seqs: 4096 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml similarity index 89% rename from 
benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml index 22614c41e..2c94b75ed 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml @@ -1,7 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-1p1d-dep2-tp4-marlin-fp8-8k1k" +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-tp4-marlin-fp8-8k1k" -# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TP4 Marlin -# decode = 2 nodes (1P + 1D). Adapted from NV B300 PR #1863. +# 1P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 1D TP4 Marlin +# 2 nodes (1P + 1D). Adapted from NV B300 PR #1863. model: path: "minimax-m3-mxfp8" @@ -26,7 +26,7 @@ resources: decode_nodes: 1 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 2 + gpus_per_prefill: 4 gpus_per_decode: 4 frontend: @@ -61,7 +61,7 @@ backend: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 1 - data-parallel-size: 2 + data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true trust-remote-code: true @@ -70,6 +70,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 9472 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-cudagraph-capture-size: 2048 @@ -86,6 +87,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 9472 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-num-seqs: 1024 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep2-dep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml similarity 
index 86% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep2-dep8-5n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml index 0d3339356..236bf112b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep2-dep8-5n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml @@ -1,7 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-1p2d-dep2-dep8-fp8-8k1k" +name: "minimax-m3-vllm-disagg-gb200-1p2d-dep4-dep8-fp8-8k1k" -# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP) -# decode = 5 nodes (1P + 4D). Adapted from NV B300 PR #1863. +# 1P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D DEP8 (TP1 DP8 EP) +# 5 nodes (1P + 4D). Adapted from NV B300 PR #1863. model: path: "minimax-m3-mxfp8" @@ -26,7 +26,7 @@ resources: decode_nodes: 4 prefill_workers: 1 decode_workers: 2 - gpus_per_prefill: 2 + gpus_per_prefill: 4 gpus_per_decode: 8 frontend: @@ -61,7 +61,7 @@ backend: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 1 - data-parallel-size: 2 + data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true trust-remote-code: true @@ -70,6 +70,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 9472 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-cudagraph-capture-size: 2048 @@ -87,11 +88,12 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 9472 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 - max-num-seqs: 1024 + max-num-seqs: 512 max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 4096 + max-cudagraph-capture-size: 2048 benchmark: type: "sa-bench" diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-dep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml similarity index 86% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-dep8-5n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml index c2983a2e5..5c2056418 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-dep8-5n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml @@ -1,7 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-2p2d-dep2-dep8-fp8-8k1k" +name: "minimax-m3-vllm-disagg-gb200-2p2d-dep4-dep8-fp8-8k1k" -# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP) -# decode = 5 nodes (1P + 4D). Adapted from NV B300 PR #1863. +# 2P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D DEP8 (TP1 DP8 EP) +# 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. 
model: path: "minimax-m3-mxfp8" @@ -22,11 +22,11 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 1 + prefill_nodes: 2 decode_nodes: 4 prefill_workers: 2 decode_workers: 2 - gpus_per_prefill: 2 + gpus_per_prefill: 4 gpus_per_decode: 8 frontend: @@ -61,7 +61,7 @@ backend: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 1 - data-parallel-size: 2 + data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true trust-remote-code: true @@ -70,6 +70,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 9472 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-cudagraph-capture-size: 2048 @@ -87,11 +88,12 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 9472 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 - max-num-seqs: 1024 + max-num-seqs: 512 max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 4096 + max-cudagraph-capture-size: 2048 benchmark: type: "sa-bench" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml similarity index 88% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml index 0b605388f..9d6fea2e9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml @@ -1,7 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-2p2d-dep2-tep8-fp8-8k1k" +name: "minimax-m3-vllm-disagg-gb200-2p2d-dep4-tep8-fp8-8k1k" -# 2P DEP2 
prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8) -# decode = 5 nodes (1P + 4D). Adapted from NV B300 PR #1863. +# 2P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D TEP8 (TP8+EP8) +# 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. model: path: "minimax-m3-mxfp8" @@ -22,11 +22,11 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 1 + prefill_nodes: 2 decode_nodes: 4 prefill_workers: 2 decode_workers: 2 - gpus_per_prefill: 2 + gpus_per_prefill: 4 gpus_per_decode: 8 frontend: @@ -61,7 +61,7 @@ backend: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 1 - data-parallel-size: 2 + data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true trust-remote-code: true @@ -70,6 +70,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 9472 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-cudagraph-capture-size: 2048 @@ -85,6 +86,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 9472 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-num-seqs: 1024 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-dep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml similarity index 86% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-dep8-6n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml index 2010dbf62..515c0e48b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-dep8-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml @@ -1,7 +1,7 @@ -name: 
"minimax-m3-vllm-disagg-gb200-3p2d-dep2-dep8-fp8-8k1k" +name: "minimax-m3-vllm-disagg-gb200-3p2d-dep4-dep8-fp8-8k1k" -# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP) -# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. +# 3P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D DEP8 (TP1 DP8 EP) +# 7 nodes (3P + 4D). Adapted from NV B300 PR #1863. model: path: "minimax-m3-mxfp8" @@ -22,11 +22,11 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 2 + prefill_nodes: 3 decode_nodes: 4 prefill_workers: 3 decode_workers: 2 - gpus_per_prefill: 2 + gpus_per_prefill: 4 gpus_per_decode: 8 frontend: @@ -61,7 +61,7 @@ backend: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 1 - data-parallel-size: 2 + data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true trust-remote-code: true @@ -70,6 +70,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 9472 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-cudagraph-capture-size: 2048 @@ -87,11 +88,12 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 9472 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 - max-num-seqs: 1024 + max-num-seqs: 512 max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 4096 + max-cudagraph-capture-size: 2048 benchmark: type: "sa-bench" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml similarity index 88% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml index c0dc7c26f..ace2e4477 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml @@ -1,7 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-3p2d-dep2-tep8-fp8-8k1k" +name: "minimax-m3-vllm-disagg-gb200-3p2d-dep4-tep8-fp8-8k1k" -# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8) -# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. +# 3P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D TEP8 (TP8+EP8) +# 7 nodes (3P + 4D). Adapted from NV B300 PR #1863. model: path: "minimax-m3-mxfp8" @@ -22,11 +22,11 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 2 + prefill_nodes: 3 decode_nodes: 4 prefill_workers: 3 decode_workers: 2 - gpus_per_prefill: 2 + gpus_per_prefill: 4 gpus_per_decode: 8 frontend: @@ -61,7 +61,7 @@ backend: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 1 - data-parallel-size: 2 + data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true trust-remote-code: true @@ -70,6 +70,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 9472 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-cudagraph-capture-size: 2048 @@ -85,6 +86,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 9472 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-num-seqs: 1024 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep2-dep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml similarity index 86% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep2-dep8-6n.yaml rename to 
benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml index 669cad1d0..2453fe560 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep2-dep8-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml @@ -1,7 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-4p2d-dep2-dep8-fp8-8k1k" +name: "minimax-m3-vllm-disagg-gb200-4p2d-dep4-dep8-fp8-8k1k" -# 4P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP) -# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. +# 4P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D DEP8 (TP1 DP8 EP) +# 8 nodes (4P + 4D). Adapted from NV B300 PR #1863. model: path: "minimax-m3-mxfp8" @@ -22,11 +22,11 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 2 + prefill_nodes: 4 decode_nodes: 4 prefill_workers: 4 decode_workers: 2 - gpus_per_prefill: 2 + gpus_per_prefill: 4 gpus_per_decode: 8 frontend: @@ -61,7 +61,7 @@ backend: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 1 - data-parallel-size: 2 + data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true trust-remote-code: true @@ -70,6 +70,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 9472 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-cudagraph-capture-size: 2048 @@ -87,11 +88,12 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 9472 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 - max-num-seqs: 1024 + max-num-seqs: 512 max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 4096 + max-cudagraph-capture-size: 2048 benchmark: type: "sa-bench" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep2-tep8-7n.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml similarity index 88% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep2-tep8-7n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml index 4aec44f74..418c65ba2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep2-tep8-7n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml @@ -1,7 +1,7 @@ -name: "minimax-m3-vllm-disagg-gb200-5p2d-dep2-tep8-fp8-8k1k" +name: "minimax-m3-vllm-disagg-gb200-5p2d-dep4-tep8-fp8-8k1k" -# 5P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8) -# decode = 7 nodes (3P + 4D). Adapted from NV B300 PR #1863. +# 5P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D TEP8 (TP8+EP8) +# 9 nodes (5P + 4D). Adapted from NV B300 PR #1863. 
model: path: "minimax-m3-mxfp8" @@ -22,11 +22,11 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 3 + prefill_nodes: 5 decode_nodes: 4 prefill_workers: 5 decode_workers: 2 - gpus_per_prefill: 2 + gpus_per_prefill: 4 gpus_per_decode: 8 frontend: @@ -61,7 +61,7 @@ backend: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' tensor-parallel-size: 1 - data-parallel-size: 2 + data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true trust-remote-code: true @@ -70,6 +70,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 9472 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-cudagraph-capture-size: 2048 @@ -85,6 +86,7 @@ backend: gpu-memory-utilization: 0.90 max-model-len: 9472 language-model-only: true + kv-cache-dtype: fp8 attention-backend: FLASHINFER stream-interval: 32 max-num-seqs: 1024 From 0c61503b4f5ed45d6f0ad493c27e7827ac84ec9b Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 23 Jun 2026 16:25:29 +0800 Subject: [PATCH 33/33] perf(gb200): refresh MiniMax-M3 vLLM image --- .github/configs/nvidia-master.yaml | 2 +- .../configs/minimax-m3-gb200-vllm-fixes.sh | 38 ++++++++++ .../1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml | 2 +- .../disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml | 2 +- .../1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml | 2 +- .../1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml | 2 +- .../1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml | 2 +- .../1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml | 2 +- .../1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml | 2 +- .../disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml | 2 +- .../8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml | 2 +- .../8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml | 2 +- .../8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml | 2 +- .../8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml | 2 +- .../8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml | 2 +- 
.../8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml | 2 +- .../8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml | 2 +- perf-changelog.yaml | 8 ++ runners/launch_gb200-nv.sh | 73 +++++++++++++++---- 19 files changed, 121 insertions(+), 30 deletions(-) create mode 100755 benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-gb200-vllm-fixes.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e0ef11dc5..93a377183 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12109,7 +12109,7 @@ qwen3.5-fp4-b200-trt: # DEP8, DEP4. 4 GPU/node (GB200 NVL72). 4p3d (3 decode workers) skipped. # FLASHINFER attention with FP8 KV cache, matching the validated GB300 sweep. minimaxm3-fp8-gb200-dynamo-vllm: - image: vllm/vllm-openai:nightly-aarch64 + image: vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: gb200 diff --git a/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-gb200-vllm-fixes.sh b/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-gb200-vllm-fixes.sh new file mode 100755 index 000000000..c0eed0a51 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-gb200-vllm-fixes.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +set -euo pipefail + +python3 - <<'PYEOF' +from importlib.util import find_spec +from pathlib import Path + +spec = find_spec("vllm") +if not spec or not spec.origin: + raise RuntimeError("vllm is not installed") +root = Path(spec.origin).parent +patches = { + root / "distributed/device_communicators/flashinfer_all_reduce.py": [ + ( + " comm_backend=comm_backend,\n" + " group=group,\n", + " comm_backend=comm_backend,\n" + ' force_oneshot_support=backend == "mnnvl",\n' + " group=group,\n", + ), + ], + root / "models/minimax_m3/nvidia/sparse_attention_msa.py": [ + ( + " prefill_topk = topk[:, nd:num_tokens, :]\n", + " prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n", + ), + ], +} +for 
path, edits in patches.items(): + source = path.read_text() + for old, new in edits: + if new in source: + continue + if source.count(old) != 1: + raise RuntimeError(f"missing or ambiguous patch anchor in {path}") + source = source.replace(old, new, 1) + path.write_text(source) +PYEOF diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml index 38b463e79..74c2e2668 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-tep8-fp8-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml index 653683bc4..324170080 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-tp4-marlin-fp8-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml index ca884ade7..43ca4f723 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p2d-dep4-dep4-fp8-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml index 10712e807..a8e05c640 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-2p1d-dep4-dep8-fp8-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml index 930ec860b..9aea9db19 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-2p1d-dep4-tep8-fp8-1k1k" model: path: "minimax-m3-mxfp8" - container: 
"vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml index c422781b4..9786b2306 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-2p2d-dep4-tep8-fp8-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml index 58fb1952f..2d22a2437 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-3p2d-dep4-tep8-fp8-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml index 2c94b75ed..d10b7866d 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-tp4-marlin-fp8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml index 236bf112b..1e386a693 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p2d-dep4-dep8-fp8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml index 5c2056418..5e77b9e8f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-2p2d-dep4-dep8-fp8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml index 9d6fea2e9..cda685755 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-2p2d-dep4-tep8-fp8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml index 515c0e48b..55a0cfc58 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-3p2d-dep4-dep8-fp8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml index ace2e4477..ad5e1da1b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml @@ -5,7 +5,7 @@ name: 
"minimax-m3-vllm-disagg-gb200-3p2d-dep4-tep8-fp8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml index 2453fe560..8b9857c14 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-4p2d-dep4-dep8-fp8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml index 418c65ba2..7a39d40dc 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-5p2d-dep4-tep8-fp8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 4265d320b..3b3ad71c0 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4099,3 +4099,11 @@ - "Backport NVIDIA/srt-slurm#38 to sanitize Slurm node-IP discovery output on the pinned submission branch." 
- "Backport vllm-project/vllm#45879 so NIXL validates heterogeneous-TP KV block lengths using the GQA KV-head ratio." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1893 + +- config-keys: + - minimaxm3-fp8-gb200-dynamo-vllm + description: + - "Update the GB200 MiniMax-M3 Dynamo-vLLM image to vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + - "Allocate FlashInfer MNNVL workspace for one-shot all-reduce and materialize the MSA prefill top-k slice before CSR construction" + - "Preserve current Qwen3.5 and Kimi-K2.5 GB200 launcher paths while adding MiniMax-M3 shared-FS staging and atomic image import" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 4017b1fd2..8ab7de40a 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -63,8 +63,11 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" ]]; then export MODEL_PATH="/mnt/lustre01/models/MiniMax-M2.5" export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-fp8" + elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then + export MODEL_PATH="/mnt/lustre01/models/MiniMax-M3-MXFP8" + export SRT_SLURM_MODEL_PREFIX="minimax-m3-mxfp8" else - echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4, minimaxm2.5/fp4, minimaxm2.5/fp8" + echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. 
Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4, minimaxm2.5/fp4, minimaxm2.5/fp8, minimaxm3/fp8" exit 1 fi else @@ -77,15 +80,22 @@ export SLURM_ACCOUNT="benchmark" NGINX_IMAGE="nginx:1.27.4" -# === Cluster diagnostic probe (minimax only) === +uses_watchtower_shared_fs() { + case "$MODEL_PREFIX" in + minimaxm2.5|minimaxm3|kimik2.5) return 0 ;; + *) return 1 ;; + esac +} + +# === Cluster diagnostic probe for watchtower-hosted sweeps === # The gb200-nv_* runners may be hosted on different physical clusters # (e.g., the legacy NVIDIA Lustre cluster vs Oracle Cloud "watchtower"). # Print enough info to identify the layout, then pick a writable # squash dir on a path that's also visible to compute nodes. Falls # back to the legacy sa-shared path so other configs are untouched. SQUASH_DIR="/mnt/lustre01/users-public/sa-shared" -if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "kimik2.5" ]]; then - echo "=== cluster diagnostic (minimax sweep) ===" +if uses_watchtower_shared_fs; then + echo "=== cluster diagnostic (watchtower sweep) ===" echo "USER=$(id -un) UID=$(id -u) GID=$(id -g) GROUPS=$(id -Gn)" echo "HOME=$HOME" echo "HOSTNAME=$(hostname -f 2>/dev/null || hostname)" @@ -131,8 +141,27 @@ fi SQUASH_FILE="${SQUASH_DIR}/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" NGINX_SQUASH_FILE="${SQUASH_DIR}/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -enroot import -o $SQUASH_FILE docker://$IMAGE -enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE +# Concurrent matrix jobs import to the same shared-FS squash path. +# Serialize imports and atomically replace invalid images so readers never +# observe a partially written squash file. 
+import_squash() { # usage: import_squash SQUASH_PATH IMAGE_REF; holds SQUASH_PATH.lock while validating or importing, and exits the script on lock or import failure + local squash="$1" image="$2" + local lock="${squash}.lock" + ( + exec 9>"$lock" + flock -w 1800 9 || { echo "Failed to acquire lock for $squash" >&2; exit 1; } + if unsquashfs -l "$squash" > /dev/null 2>&1; then + echo "Squash file already exists and is valid, skipping import: $squash" + else + rm -f "$squash" "$squash".tmp.* + enroot import -o "${squash}.tmp.$$" "docker://$image" || { echo "enroot import failed for $image" >&2; rm -f "${squash}.tmp.$$"; exit 1; } # never publish a failed or partial import + mv -f "${squash}.tmp.$$" "$squash" + fi + ) || exit 1 +} + +import_squash "$SQUASH_FILE" "$IMAGE" +import_squash "$NGINX_SQUASH_FILE" "$NGINX_IMAGE" export EVAL_ONLY="${EVAL_ONLY:-false}" @@ -201,11 +230,12 @@ fi echo "Cloning srt-slurm repository..." SRT_REPO_DIR="srt-slurm" +SRTCTL_SETUP_SCRIPT="" # On the watchtower (Oracle) gb200 cluster, /home/slurm-shared is not # cross-mounted to compute nodes. Put the srt-slurm workspace and staged # InferenceX checkout on a writable shared-FS path that compute can see. # Per-run-unique paths avoid races between parallel sweep jobs. -if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "kimik2.5" ]]; then +if uses_watchtower_shared_fs; then SHARED_BASE="" for cand in \ /mnt/lustre01/users-public/sa-shared/gha-runs \ @@ -277,6 +307,16 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" ]]; then echo "Unsupported minimaxm2.5 precision for GB200 dynamo-vllm: $PRECISION" >&2 exit 1 fi +elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || exit 1 + cd "$SRT_REPO_DIR" || exit 1 + git checkout main || exit 1 + mkdir -p recipes/vllm/minimax-m3-gb200-fp8 || exit 1 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8" recipes/vllm/minimax-m3-gb200-fp8 || exit 1 + SRTCTL_SETUP_SCRIPT="minimax-m3-gb200-vllm-fixes.sh" + cp \ + "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/configs/$SRTCTL_SETUP_SCRIPT" \ + "configs/$SRTCTL_SETUP_SCRIPT" || exit 1
elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "kimik2.5" && $PRECISION == "fp4" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || exit 1 cd "$SRT_REPO_DIR" || exit 1 @@ -306,7 +346,7 @@ source $HOME/.local/bin/env # under a head-node-only path, .venv/bin/python3 becomes a broken # symlink on compute. Pin the venv to /usr/bin/python3 — a system # path that exists at the same location on both head and compute. -if [[ ($MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "kimik2.5") && -x /usr/bin/python3 ]]; then +if uses_watchtower_shared_fs && [[ -x /usr/bin/python3 ]]; then uv venv --seed --python /usr/bin/python3 else uv venv --seed @@ -323,10 +363,10 @@ echo "Configs available at: $SRT_REPO_DIR/" # Create srtslurm.yaml for srtctl (used by both frameworks) SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" -# Minimax on watchtower: SRT_REPO_DIR was moved to a shared-FS path +# Watchtower-hosted sweeps: SRT_REPO_DIR was moved to a shared-FS path # above so srtctl's outputs/ directory (which lives under # SRTCTL_ROOT) is visible to compute nodes. -if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "kimik2.5" ]]; then +if uses_watchtower_shared_fs; then SRTCTL_ROOT="$SRT_REPO_DIR" fi echo "Creating srtslurm.yaml configuration..." @@ -368,7 +408,7 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" # can't see. Stage the relevant subset to shared FS and repoint # INFMAX_WORKSPACE there. rsync excludes the srt-slurm clone (already # on shared FS) and .git (not needed in container) for speed. -if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "kimik2.5" ]]; then +if uses_watchtower_shared_fs; then SHARED_INFMAX_WORKSPACE="${SHARED_BASE}/infmax-workspace-${RUN_KEY}" mkdir -p "$SHARED_INFMAX_WORKSPACE" || exit 1 rsync -a --delete \ @@ -393,11 +433,16 @@ if [[ ! 
-f "$CONFIG_PATH" ]]; then fi sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_PATH" +SRTCTL_APPLY_ARGS=( + -f "$CONFIG_PATH" + --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" +) if [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then - SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_PATH" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1) -else - SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_PATH" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) + SRTCTL_APPLY_ARGS+=(--setup-script install-torchao.sh) +elif [[ -n "$SRTCTL_SETUP_SCRIPT" ]]; then + SRTCTL_APPLY_ARGS+=(--setup-script "$SRTCTL_SETUP_SCRIPT") fi +SRTCTL_OUTPUT=$(srtctl apply "${SRTCTL_APPLY_ARGS[@]}" 2>&1) echo "$SRTCTL_OUTPUT" JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+')