Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
b506cd4
[NV] Add MiniMax M3 B300 Dynamo vLLM recipes
Oseltamivir Jun 19, 2026
84a023a
chore: update MiniMax M3 B300 container
Oseltamivir Jun 19, 2026
b09bc78
chore: update changelog PR link
Oseltamivir Jun 19, 2026
86da150
Update perf-changelog.yaml
Oseltamivir Jun 19, 2026
f5727c2
Update perf-changelog.yaml
Oseltamivir Jun 19, 2026
3b6dad4
fix(vllm): patch MiniMax M3 MSA contiguity
Oseltamivir Jun 19, 2026
71ba2ea
fix(recipes): align MiniMax M3 parallel settings
Oseltamivir Jun 19, 2026
b859a0b
fix(vllm): backport MiniMax M3 eval fixes
Oseltamivir Jun 19, 2026
2d408e4
ci(sweep): enable full MiniMax M3 validation
Oseltamivir Jun 19, 2026
3956aee
perf(vllm): right-size MiniMax M3 low concurrency
Oseltamivir Jun 20, 2026
33fe6a9
Merge remote-tracking branch 'origin/main' into pr-1787-latest
Oseltamivir Jun 20, 2026
77c6391
Merge branch 'main' into pr-1787-latest
Oseltamivir Jun 20, 2026
b99d3c9
perf(vllm): colocate MiniMax M3 TP4 workers
Oseltamivir Jun 20, 2026
d2347aa
fix(runner): exclude faulty B300 RDMA node
Oseltamivir Jun 20, 2026
8ace2e9
fix(runner): verify B300 node exclusion
Oseltamivir Jun 20, 2026
884ff12
fix(runner): check generated B300 sbatch script
Oseltamivir Jun 20, 2026
3ae240b
ci(sweep): validate B300 node exclusion
Oseltamivir Jun 20, 2026
9751d93
Merge remote-tracking branch 'origin/main' into pr-1787-latest
Oseltamivir Jun 20, 2026
03d27e7
refactor(vllm): trim MiniMax M3 runtime patches
Oseltamivir Jun 21, 2026
826a64e
Merge branch 'main' into pr-1787-latest
Oseltamivir Jun 22, 2026
aec850f
Merge branch 'main' into pr-1787-latest
Oseltamivir Jun 22, 2026
37d5e2c
Update MiniMax M3 B300 Dynamo vLLM recipes
biswapanda Jun 22, 2026
adbe614
fix
biswapanda Jun 22, 2026
fe0eda5
update to flashinfer
biswapanda Jun 23, 2026
0a751a7
prune non-pareto
biswapanda Jun 23, 2026
d08cc43
Merge branch 'main' into pr-1787-latest--update
Ankur-singh Jun 23, 2026
f100024
fix(vllm): remove pruned MiniMax M3 B300 recipes
jasonlizhengjian Jun 23, 2026
376f261
Merge remote-tracking branch 'refs/remotes/inferencex/main' into code…
jasonlizhengjian Jun 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
213 changes: 213 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11604,6 +11604,219 @@ qwen3.5-fp8-h100-sglang-agentic:
- { tp: 8, ep: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 14, 16] }
- { tp: 8, ep: 8, offloading: hicache, conc-list: [12, 14, 16, 20, 24, 28, 32, 42] }

# Sweep definition for MiniMax-M3 (MXFP8 checkpoint) on the b300 runner with
# the dynamo-vllm framework, multinode and with disagg enabled: every
# search-space entry configures a prefill pool and a decode pool separately.
#
# Each entry pairs a conc-list with one recipe file passed as CONFIG_FILE.
# Recipe names read <P>p<D>d-<prefill>-<decode>-<scenario>; P and D equal the
# entry's prefill/decode num-worker values. Within this block:
#   dep<N>      -> tp: N, ep: N, dp-attn: true
#   tep<N>      -> tp: N, ep: N, dp-attn: false
#   tp4-marlin  -> tp: 4, ep: 1, dp-attn: false
# All prefill pools here are dep2.
minimaxm3-fp8-b300-dynamo-vllm:
image: vllm/vllm-openai:minimax-m3-0618-x86_64-cu130
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: b300
precision: fp8
framework: dynamo-vllm
multinode: true
disagg: true
scenarios:
fixed-seq-len:
# isl/osl 1024/1024 (presumably input/output sequence length); recipes live
# under b300-fp8/1k1k/.
- isl: 1024
osl: 1024
search-space:
# 1P1D, decode tep8.
- conc-list: [4, 16, 64, 128, 4096]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: false
# 1P1D, decode tp4-marlin (no expert parallelism: ep: 1); low concurrencies only.
- conc-list: [1, 4, 8, 16]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp4-marlin-1k1k.yaml"
decode:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
# 1P2D, decode dep4.
- conc-list: [2048]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml"
decode:
num-worker: 2
tp: 4
ep: 4
dp-attn: true
# 2P1D, decode dep8.
- conc-list: [512, 4096]
prefill:
num-worker: 2
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
# 2P1D, decode tep8.
- conc-list: [32]
prefill:
num-worker: 2
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: false
# 2P2D, decode tep8.
- conc-list: [16]
prefill:
num-worker: 2
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false
# 3P2D, decode tep8.
- conc-list: [4]
prefill:
num-worker: 3
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false
# isl/osl 8192/1024; recipes live under b300-fp8/8k1k/.
- isl: 8192
osl: 1024
search-space:
# 2P2D, decode dep8.
- conc-list: [256, 512]
prefill:
num-worker: 2
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
# 2P2D, decode tep8.
- conc-list: [16]
prefill:
num-worker: 2
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false
# 4P2D, decode dep8.
- conc-list: [4096]
prefill:
num-worker: 4
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
# 1P1D, decode tp4-marlin (ep: 1); low concurrencies only.
- conc-list: [1, 4, 8, 16]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml"
decode:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
# 4P2D, decode tep4. Same concurrency and prefill pool as the 4P2D dep8 entry
# in this scenario; only the decode layout differs.
- conc-list: [4096]
prefill:
num-worker: 4
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-tep4-8k1k.yaml"
decode:
num-worker: 2
tp: 4
ep: 4
dp-attn: false
# 1P4D, decode tep4.
- conc-list: [16, 32, 64, 128]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep4-8k1k.yaml"
decode:
num-worker: 4
tp: 4
ep: 4
dp-attn: false
# 1P2D, decode tep4.
- conc-list: [16]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-tep4-8k1k.yaml"
decode:
num-worker: 2
tp: 4
ep: 4
dp-attn: false
# 1P4D, decode tep8.
- conc-list: [4]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep8-8k1k.yaml"
decode:
num-worker: 4
tp: 8
ep: 8
dp-attn: false

# MiniMax-M3 GB300 disagg sweep — adapted from NV B300 PR #1863.
# All prefill DEP2 (TP1 DP2 EP, 2 GPU/worker). Decode: TP4+Marlin, TEP8,
# DEP8, DEP4. 4 GPU/node (GB300 NVL72). 4p3d (3 decode workers) skipped.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env bash
# Patches the vLLM package found on python3's import path, editing two of its
# source files in place. Re-running is safe: an edit whose replacement text is
# already present in the file is skipped.
set -euo pipefail

python3 - <<'PYEOF'
from importlib.util import find_spec
from pathlib import Path


def vllm_package_dir():
    """Return the directory that holds the importable ``vllm`` package.

    Raises RuntimeError when the package cannot be located or has no origin.
    """
    found = find_spec("vllm")
    if not (found and found.origin):
        raise RuntimeError("vllm is not installed")
    return Path(found.origin).parent


def apply_edits(path, replacements):
    """Apply each (anchor, replacement) pair to the file at ``path``.

    A pair is skipped when its replacement text is already in the file.
    Otherwise the anchor must occur exactly once; if it does not, RuntimeError
    is raised and the file is left unwritten. The file is written once, after
    all pairs have been processed.
    """
    text = path.read_text()
    for anchor, replacement in replacements:
        if replacement not in text:
            if text.count(anchor) != 1:
                raise RuntimeError(f"missing or ambiguous patch anchor in {path}")
            text = text.replace(anchor, replacement, 1)
    path.write_text(text)


# NOTE(review): anchors are matched as exact substrings, so their whitespace
# must match the installed vLLM source byte-for-byte — confirm against the
# pinned container image.

# Make the prefill top-k slice contiguous.
msa_edits = [
    (
        " prefill_topk = topk[:, nd:num_tokens, :]\n",
        " prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n",
    ),
]

# Introduce head-count locals ahead of the per-layer loop, then express the two
# remote_len comparisons through remote_heads / local_heads instead of tp_ratio.
nixl_edits = [
    (
        " for i, local_len in enumerate(self.block_len_per_layer):\n",
        " total_kv_heads = self.transfer_topo.total_num_kv_heads\n"
        " local_heads = self.transfer_topo.local_physical_heads\n"
        " remote_heads = max(1, total_kv_heads // remote_tp_size)\n"
        " for i, local_len in enumerate(self.block_len_per_layer):\n",
    ),
    (
        "remote_len == (local_len * tp_ratio) // block_size_ratio,",
        "remote_len == (local_len * remote_heads // local_heads) "
        "// block_size_ratio,",
    ),
    (
        "remote_len == local_len // (-tp_ratio),",
        "remote_len == local_len * remote_heads // local_heads,",
    ),
]

package_dir = vllm_package_dir()
for relative, edits in (
    ("models/minimax_m3/nvidia/sparse_attention_msa.py", msa_edits),
    ("distributed/kv_transfer/kv_connector/v1/nixl/base_worker.py", nixl_edits),
):
    apply_edits(package_dir / relative, edits)
PYEOF
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Recipe for the 1P1D MiniMax-M3 FP8 disagg run on B300 at isl/osl 1024/1024:
# one dep2 prefill worker (2 GPUs) and one tep8 decode worker (8 GPUs).
name: "minimax-m3-vllm-disagg-b300-1p1d-fp8-dep2-tep8-1k1k"

model:
path: "MiniMaxAI/MiniMax-M3-MXFP8"
container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130"
precision: "fp8"

# One node per role on 8-GPU nodes: the single prefill worker uses 2 GPUs, the
# single decode worker uses 8.
resources:
gpu_type: "b300"
gpus_per_node: 8
prefill_nodes: 1
decode_nodes: 1
prefill_workers: 1
decode_workers: 1
gpus_per_prefill: 2
gpus_per_decode: 8

# Pinned Dynamo dev build; install: true requests installation (the mechanism
# is not shown in this file).
dynamo:
install: true
version: 1.3.0.dev20260614

frontend:
type: dynamo
enable_multiple_frontends: false

backend:
type: vllm
connector: null

# Prefill and decode use the same environment. UCX_TLS limits UCX to the
# cuda_copy and rc transports.
prefill_environment:
UCX_TLS: "cuda_copy,rc"
VLLM_FLOAT32_MATMUL_PRECISION: "high"

decode_environment:
UCX_TLS: "cuda_copy,rc"
VLLM_FLOAT32_MATMUL_PRECISION: "high"

vllm_config:
# dep2: tensor-parallel 1 x data-parallel 2 with expert parallelism enabled.
prefill:
tensor-parallel-size: 1
data-parallel-size: 2
data-parallel-rpc-port: 13345
enable-expert-parallel: true
trust-remote-code: true
no-enable-prefix-caching: true
# JSON string; both roles use NixlConnector with kv_role kv_both.
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
block-size: 128
gpu-memory-utilization: 0.90
# 2304 = isl 1024 + osl 1024 + 256 spare tokens.
max-model-len: 2304
language-model-only: true
stream-interval: 32
max-cudagraph-capture-size: 2048
max-num-batched-tokens: 2048

# tep8: tensor-parallel 8 with expert parallelism; no data-parallel settings.
decode:
tensor-parallel-size: 8
enable-expert-parallel: true
trust-remote-code: true
no-enable-prefix-caching: true
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
block-size: 128
gpu-memory-utilization: 0.90
max-model-len: 2304
language-model-only: true
stream-interval: 32
# Equals the largest benchmark concurrency below (4096).
max-num-seqs: 4096
max-num-batched-tokens: 16384
max-cudagraph-capture-size: 8192

# 360 attempts x 10 s = up to 3600 s; presumably the startup wait budget —
# confirm against the launcher.
health_check:
max_attempts: 360
interval_seconds: 10

# concurrencies is an "x"-separated list: 4, 16, 64, 128, 4096.
benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
concurrencies: "4x16x64x128x4096"
req_rate: "inf"
Loading
Loading