Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
bbdd923
feat: MiniMax-M3 MXFP8 full sweep config for GB200
Oseltamivir Jun 13, 2026
dbf5135
chore: update perf-changelog pr-link to #1734
Oseltamivir Jun 13, 2026
12a1da5
Merge branch 'main' into feat/minimax-m3-gb200-sweep
Oseltamivir Jun 13, 2026
ed63c1e
feat: switch GB200 M3 to ai-dynamo vllm-runtime 1.3.0 image
Oseltamivir Jun 13, 2026
9b1fbfa
Merge branch 'main' into feat/minimax-m3-gb200-sweep
Oseltamivir Jun 13, 2026
8738f42
fix: use enroot registry syntax (nvcr.io#) for GB200 M3 image
Oseltamivir Jun 13, 2026
3415fb4
feat: convert MiniMax-M3 GB200 sweep to fully disaggregated inference
Oseltamivir Jun 13, 2026
2e7938f
Merge branch 'main' into feat/minimax-m3-gb200-sweep
Oseltamivir Jun 13, 2026
803cd20
fix: restore NIXL-bearing image for M3 GB200 disagg + enable MNNVL KV…
Oseltamivir Jun 13, 2026
1320056
feat: rack-scale wide-EP (DeepSeek megamoe) M3 GB200 disagg + FLASHINFER
Oseltamivir Jun 14, 2026
94decf2
Merge branch 'main' into feat/minimax-m3-gb200-sweep
Oseltamivir Jun 14, 2026
c8cd567
feat: tune 1k1k low-conc latency + add 8k1k sweep for M3 GB200
Oseltamivir Jun 14, 2026
c806aa2
Merge branch 'main' into feat/minimax-m3-gb200-sweep
Oseltamivir Jun 14, 2026
b819a7a
feat: low-conc focus — wider decode + more decode workers for M3 GB200
Oseltamivir Jun 14, 2026
d745d00
Merge branch 'main' into feat/minimax-m3-gb200-sweep
Oseltamivir Jun 14, 2026
29eaaeb
feat: enable expert-parallel on GB200 TEP8 decode to close B200 low-c…
Oseltamivir Jun 15, 2026
124a74f
Merge branch 'main' into feat/minimax-m3-gb200-sweep
Oseltamivir Jun 15, 2026
56e61cf
feat: add 8k1k TEP8 decode recipe for GB200 to close B200 gap at long…
Oseltamivir Jun 15, 2026
4a83c75
refactor: remove unoptimized TP4/1P2D baselines, keep TEP8-only sweep
Oseltamivir Jun 15, 2026
2a237f7
feat: uncomment all high-conc entries for full M3 GB200 sweep
Oseltamivir Jun 15, 2026
5e2c2f9
feat: test 1P1D TEP4 decode (TP4+EP4, 2n) — conc 1-32 only
Oseltamivir Jun 15, 2026
055aa2c
feat: restore full TEP8 + wide-EP sweep for M3 GB200 disagg
Oseltamivir Jun 15, 2026
e0ae36c
fix: uncomment trailing 4P1D 8k1k decode lines in M3 GB200 sweep
Oseltamivir Jun 15, 2026
b6926e3
fix: retrigger M3 GB200 sweep to validate MNNVL fused allreduce fix
Oseltamivir Jun 15, 2026
0de6356
Merge branch 'main' into feat/minimax-m3-gb200-sweep
Oseltamivir Jun 15, 2026
0e5ba19
fix: point TEP8 recipe at ghcr image with NixlConnector head_ratio fix
Oseltamivir Jun 16, 2026
941b4e1
Merge branch 'main' into feat/minimax-m3-gb200-sweep
Oseltamivir Jun 16, 2026
0531a42
fix: retrigger M3 GB200 sweep after making ghcr image public
Oseltamivir Jun 16, 2026
2b11508
Merge branch 'main' into feat/minimax-m3-gb200-sweep
Oseltamivir Jun 16, 2026
6db15b0
fix: retrigger M3 GB200 sweep — previous run hit NATS infra failure
Oseltamivir Jun 16, 2026
38728a7
Merge remote-tracking branch 'origin/main' into feat/minimax-m3-gb200…
Oseltamivir Jun 16, 2026
67f4521
Remove minimaxm3-fp8-gb200-vllm details from changelog
Oseltamivir Jun 16, 2026
7062524
fix: runtime-patch ARM64 image for M3 NixlConnector head_ratio + MNNV…
Oseltamivir Jun 16, 2026
211488b
fix: switch M3 GB200 disagg to nightly, drop extra_mount workaround
Oseltamivir Jun 18, 2026
831fdb6
Merge remote-tracking branch 'origin/main' into feat/minimax-m3-gb200…
Oseltamivir Jun 18, 2026
b3a670b
Merge branch 'main' into feat/minimax-m3-gb200-sweep
Oseltamivir Jun 18, 2026
492db61
fix: append changelog entry at end to pass immutability gate
Oseltamivir Jun 18, 2026
98c4ddd
Merge remote-tracking branch 'origin/feat/minimax-m3-gb200-sweep' int…
Oseltamivir Jun 18, 2026
37ab79b
feat: switch GB200 M3 to vllm/vllm-openai:minimax-m3-0618, run full s…
Oseltamivir Jun 18, 2026
4723ca2
revert: switch GB200 M3 back to nightly-aarch64
Oseltamivir Jun 18, 2026
c3030aa
feat: add --moe-backend marlin for TP-only GB200 M3 disagg workers
Oseltamivir Jun 19, 2026
4574f11
Merge remote-tracking branch 'origin/main' into feat/minimax-m3-gb200…
Oseltamivir Jun 19, 2026
2ea30b3
fix: scale MiniMax-M3 prefill to DEP8
Oseltamivir Jun 19, 2026
fc4af8b
feat: redesign MiniMax-M3 GB200 decode tiers
Oseltamivir Jun 19, 2026
4431440
feat: TEP4 prefill + B300-optimal decode for GB200 M3 disagg
Oseltamivir Jun 19, 2026
2ac0def
fix: cap decode workers at 2 for high-conc GB200 M3 recipes
Oseltamivir Jun 19, 2026
7fca8b1
Merge branch 'main' into feat/minimax-m3-gb200-sweep
Oseltamivir Jun 19, 2026
748469a
feat: adapt NV B300 PR #1863 disagg configs for GB200 M3 sweep
Oseltamivir Jun 20, 2026
77afbda
Merge branch 'main' into feat/minimax-m3-gb200-sweep
Oseltamivir Jun 20, 2026
d8e17d4
fix: fit MiniMax-M3 sweep on GB200
Oseltamivir Jun 21, 2026
702c780
Merge remote-tracking branch 'origin/main' into feat/minimax-m3-gb200…
Oseltamivir Jun 23, 2026
0c61503
perf(gb200): refresh MiniMax-M3 vLLM image
Oseltamivir Jun 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
246 changes: 246 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12104,6 +12104,252 @@ qwen3.5-fp4-b200-trt:
- { tp: 4, ep: 4, dp-attn: true, conc-list: [1024] }
- { tp: 8, ep: 8, dp-attn: true, conc-list: [256, 512, 1024] }

# MiniMax-M3 GB200 disagg sweep — adapted from NV B300 PR #1863.
# All prefill DEP4 (TP1 DP4 EP, 4 GPU/worker). Decode: TP4+Marlin, TEP8,
# DEP8, DEP4. 4 GPU/node (GB200 NVL72). 4p3d (3 decode workers) skipped.
# FLASHINFER attention with FP8 KV cache, matching the validated GB300 sweep.
#
# Legend for the per-entry comments below, as expressed by the fields in this
# entry:
#   DEP4        -> tp: 4, ep: 4, dp-attn: true   (every prefill block)
#   DEP8        -> tp: 8, ep: 8, dp-attn: true
#   TEP8        -> tp: 8, ep: 8, dp-attn: false
#   TP4+Marlin  -> tp: 4, ep: 1, dp-attn: false
#   XpYd        -> X prefill workers, Y decode workers (the num-worker values)
#   Nn          -> node count; for every entry here it equals
#                  (prefill num-worker x 4 + decode num-worker x decode tp) / 4
# Each search-space entry names its recipe file through a CONFIG_FILE=...
# string under additional-settings.
minimaxm3-fp8-gb200-dynamo-vllm:
# NOTE(review): the 1k1k 1p1d DEP4+TEP8 recipe pins this same tag as its
# `container`; presumably the two must be changed together — confirm.
image: vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: gb200
precision: fp8
framework: dynamo-vllm
# Every search-space entry below has separate prefill and decode blocks.
multinode: true
disagg: true
scenarios:
fixed-seq-len:
# 1k1k scenario: 1024 input / 1024 output tokens, seven topologies.
# Every prefill block is tp: 4, ep: 4, dp-attn: true; only the prefill worker
# count and the decode layout change between entries.
# Some concurrency values are listed under more than one topology
# (4 -> three entries, 16 -> three entries, 4096 -> two entries).
# NOTE(review): presumably intentional so topologies can be compared at the
# same load — confirm.
- isl: 1024
osl: 1024
search-space:
# 1p1d DEP4+TEP8, 3n: conc 4,16,64,128,4096
- conc-list: [4, 16, 64, 128, 4096]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: false

# 1p1d DEP4+TP4 Marlin, 2n: conc 1,4,8,16
# Only decode layout in this scenario with ep: 1 (no expert parallelism on
# the decode side).
- conc-list: [1, 4, 8, 16]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml"
decode:
num-worker: 1
tp: 4
ep: 1
dp-attn: false

# 1p2d DEP4+DEP4, 3n: conc 2048
# Two 4-GPU decode workers; the only entry whose decode uses the DEP4 layout.
- conc-list: [2048]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml"
decode:
num-worker: 2
tp: 4
ep: 4
dp-attn: true

# 2p1d DEP4+DEP8, 4n: conc 512,4096
- conc-list: [512, 4096]
prefill:
num-worker: 2
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true

# 2p1d DEP4+TEP8, 4n: conc 32
- conc-list: [32]
prefill:
num-worker: 2
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: false

# 2p2d DEP4+TEP8, 6n: conc 16
- conc-list: [16]
prefill:
num-worker: 2
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false

# 3p2d DEP4+TEP8, 7n: conc 4
# Largest 1k1k topology (7 nodes), run at a single low concurrency.
- conc-list: [4]
prefill:
num-worker: 3
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false

# 8k1k scenario: 8192 input / 1024 output tokens, eight topologies.
# Every prefill block is tp: 4, ep: 4, dp-attn: true; prefill worker count
# ranges from 1 to 5 and decode is TP4+Marlin, TEP8 or DEP8 (no DEP4 decode
# in this scenario).
# Some concurrency values are listed under more than one topology
# (4 -> two entries, 16 -> two entries, 512 -> two entries).
# NOTE(review): presumably intentional so topologies can be compared at the
# same load — confirm.
- isl: 8192
osl: 1024
search-space:
# 1p1d DEP4+TP4 Marlin, 2n: conc 1,4,8,16
# Only decode layout in this scenario with ep: 1.
- conc-list: [1, 4, 8, 16]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml"
decode:
num-worker: 1
tp: 4
ep: 1
dp-attn: false

# 1p2d DEP4+DEP8, 5n: conc 128
- conc-list: [128]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true

# 2p2d DEP4+DEP8, 6n: conc 256,512
- conc-list: [256, 512]
prefill:
num-worker: 2
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true

# 2p2d DEP4+TEP8, 6n: conc 16
- conc-list: [16]
prefill:
num-worker: 2
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false

# 3p2d DEP4+DEP8, 7n: conc 512
- conc-list: [512]
prefill:
num-worker: 3
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true

# 3p2d DEP4+TEP8, 7n: conc 32
- conc-list: [32]
prefill:
num-worker: 3
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false

# 4p2d DEP4+DEP8, 8n: conc 4096
# Highest concurrency in the 8k1k scenario.
- conc-list: [4096]
prefill:
num-worker: 4
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true

# 5p2d DEP4+TEP8, 9n: conc 4,64
# Largest topology in this entry (9 nodes: 5 prefill workers + 2 TEP8 decode
# workers).
- conc-list: [4, 64]
prefill:
num-worker: 5
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false

# MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
# (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env bash
# Applies two text patches, in place, to files inside the installed `vllm`
# Python package:
#   1. distributed/device_communicators/flashinfer_all_reduce.py — inserts a
#      `force_oneshot_support=backend == "mnnvl"` argument line between the
#      `comm_backend=` and `group=` lines.
#   2. models/minimax_m3/nvidia/sparse_attention_msa.py — appends
#      `.contiguous()` to the `prefill_topk` slice.
# An edit whose replacement text is already present is skipped; an anchor that
# is missing or occurs more than once aborts the script with a RuntimeError.
#
# -e: exit on the first failing command; -u: unset variables are errors;
# -o pipefail: a pipeline fails if any stage fails.
set -euo pipefail

# The quoted delimiter ('PYEOF') stops the shell from expanding anything in
# the Python text; `python3 -` reads the program from stdin.
python3 - <<'PYEOF'
from importlib.util import find_spec
from pathlib import Path

# Look up the installed vllm package; fail if it cannot be located.
spec = find_spec("vllm")
if not spec or not spec.origin:
raise RuntimeError("vllm is not installed")
# Directory holding the file named by spec.origin; the patch targets below are
# resolved relative to it.
root = Path(spec.origin).parent
# Maps each target file to a list of (old, new) pairs: `old` is the exact text
# to find, `new` is the text that replaces it.
# NOTE(review): matching is exact (str.count / str.replace), so the leading
# whitespace inside these literals must equal the target file's indentation.
# It looks collapsed to a single space in this view — confirm against the
# committed script and the installed vllm sources.
patches = {
root / "distributed/device_communicators/flashinfer_all_reduce.py": [
(
# old: the two adjacent keyword-argument lines used as the anchor.
" comm_backend=comm_backend,\n"
" group=group,\n",
# new: same two lines with force_oneshot_support inserted between them.
" comm_backend=comm_backend,\n"
' force_oneshot_support=backend == "mnnvl",\n'
" group=group,\n",
),
],
root / "models/minimax_m3/nvidia/sparse_attention_msa.py": [
(
# old / new: the same slice assignment, with .contiguous() appended in new.
" prefill_topk = topk[:, nd:num_tokens, :]\n",
" prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n",
),
],
}
for path, edits in patches.items():
source = path.read_text()
for old, new in edits:
# Already patched: the replacement text is present, so skip this edit.
if new in source:
continue
# Require exactly one occurrence so the replace below cannot hit the wrong
# place or silently do nothing.
if source.count(old) != 1:
raise RuntimeError(f"missing or ambiguous patch anchor in {path}")
source = source.replace(old, new, 1)
# NOTE(review): indentation is not visible in this view; presumably this
# write happens once per file after all of its edits. It is not guarded by a
# "changed" check, so the file is rewritten even when every edit was skipped —
# confirm.
path.write_text(source)
PYEOF
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-tep8-fp8-1k1k"

# 1P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 1D TEP8 (TP8+EP8)
# 3 nodes total: 1 prefill node + 2 decode nodes (the single 8-GPU decode
# worker spans both decode nodes at 4 GPUs per node).
# Adapted from NV B300 PR #1863.

model:
path: "minimax-m3-mxfp8"
# NOTE(review): same tag as the `image` of the minimaxm3-fp8-gb200-dynamo-vllm
# sweep entry; presumably the two must be changed together — confirm.
container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
precision: "fp8"

dynamo:
install: true
# Quoted so the dev-build version stays a string.
wheel: "1.2.0.dev20260526"
Comment thread
cursor[bot] marked this conversation as resolved.

slurm:
# Quoted so the colon-separated value is kept as a string rather than being
# parsed as a number.
time_limit: "8:00:00"

health_check:
# 720 attempts at 10-second intervals = up to 7200 s (2 h) of polling.
max_attempts: 720
interval_seconds: 10

resources:
gpu_type: "gb200"
gpus_per_node: 4
# 1 prefill worker x 4 GPUs = 1 node; 1 decode worker x 8 GPUs = 2 nodes.
prefill_nodes: 1
decode_nodes: 2
prefill_workers: 1
decode_workers: 1
gpus_per_prefill: 4
gpus_per_decode: 8

frontend:
type: dynamo
enable_multiple_frontends: false

backend:
type: vllm
connector: null

# prefill_environment and decode_environment below carry identical
# key/value pairs. All values are quoted strings.
prefill_environment:
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
VLLM_FLOAT32_MATMUL_PRECISION: "high"
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
UCX_TLS: "cuda_copy,cuda_ipc,tcp"
UCX_CUDA_IPC_ENABLE_MNNVL: "y"
NCCL_CUMEM_ENABLE: "1"

decode_environment:
VLLM_ENGINE_READY_TIMEOUT_S: "3600"
VLLM_FLOAT32_MATMUL_PRECISION: "high"
VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
UCX_TLS: "cuda_copy,cuda_ipc,tcp"
UCX_CUDA_IPC_ENABLE_MNNVL: "y"
NCCL_CUMEM_ENABLE: "1"

vllm_config:
# Prefill side: tensor-parallel-size 1 x data-parallel-size 4 = 4 GPUs,
# matching gpus_per_prefill, with expert parallelism enabled ("DEP4").
prefill:
# Same kv-transfer-config string on both sides (NixlConnector, kv_both).
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
tensor-parallel-size: 1
data-parallel-size: 4
data-parallel-rpc-port: 13345
enable-expert-parallel: true
trust-remote-code: true
no-enable-prefix-caching: true
block-size: 128
gpu-memory-utilization: 0.90
# 2304 = benchmark isl (1024) + osl (1024) + 256 tokens of headroom.
max-model-len: 2304
language-model-only: true
kv-cache-dtype: fp8
attention-backend: FLASHINFER
stream-interval: 32
max-cudagraph-capture-size: 2048
max-num-batched-tokens: 2048

# Decode side: tensor-parallel-size 8 = gpus_per_decode, with expert
# parallelism enabled and no data-parallel-size set ("TEP8").
decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
tensor-parallel-size: 8
enable-expert-parallel: true
trust-remote-code: true
no-enable-prefix-caching: true
block-size: 128
gpu-memory-utilization: 0.90
max-model-len: 2304
language-model-only: true
kv-cache-dtype: fp8
attention-backend: FLASHINFER
stream-interval: 32
# Equals the largest benchmark concurrency below (4096).
max-num-seqs: 4096
max-num-batched-tokens: 16384
# NOTE(review): capture size (8192) is larger than max-num-seqs (4096) —
# confirm this is intended rather than a leftover from another recipe.
max-cudagraph-capture-size: 8192

benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
# Same values as the sweep entry's conc-list for this recipe:
# [4, 16, 64, 128, 4096]. NOTE(review): presumably an 'x'-separated list —
# confirm against the benchmark runner.
concurrencies: "4x16x64x128x4096"
req_rate: "inf"
Loading