Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
226 changes: 226 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11604,6 +11604,232 @@ qwen3.5-fp8-h100-sglang-agentic:
- { tp: 8, ep: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 14, 16] }
- { tp: 8, ep: 8, offloading: hicache, conc-list: [12, 14, 16, 20, 24, 28, 32, 42] }

# Disaggregated, multinode dynamo-vllm sweep of MiniMaxAI/MiniMax-M3-MXFP8 on
# the b300 runner. Each search-space entry pairs a list of concurrencies with
# one prefill/decode worker topology and names a recipe file through a
# CONFIG_FILE additional setting.
#
# Recipe file names follow "<P>p<D>d-dep2-<decode>-<isl/osl tag>.yaml":
#   - P and D equal the entry's prefill and decode num-worker values.
#   - Every prefill side here is "dep2": tp 2, ep 2, dp-attn true.
#   - The decode tag's number equals the decode tp value; "dep" tags go with
#     dp-attn: true, while "tep" and "tp4-marlin" tags go with dp-attn: false.
minimaxm3-fp8-b300-dynamo-vllm:
image: vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: b300
precision: fp8
framework: dynamo-vllm
multinode: true
disagg: true
scenarios:
fixed-seq-len:
# isl/osl 1024/1024: every recipe in this group lives under .../1k1k/.
# Concurrencies 4, 16 and 4096 appear in more than one entry, so the same
# load is measured under different topologies.
- isl: 1024
osl: 1024
search-space:
- conc-list: [4, 16, 64, 128, 4096]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: false
# Only decode shape in this group with ep: 1 (tp4-marlin recipe).
- conc-list: [1, 4, 8, 16]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp4-marlin-1k1k.yaml"
decode:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
- conc-list: [2048]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml"
decode:
num-worker: 2
tp: 4
ep: 4
dp-attn: true
- conc-list: [512, 4096]
prefill:
num-worker: 2
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
- conc-list: [32]
prefill:
num-worker: 2
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: false
- conc-list: [16]
prefill:
num-worker: 2
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false
- conc-list: [4]
prefill:
num-worker: 3
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false
# isl/osl 8192/1024: every recipe in this group lives under .../8k1k/.
# Prefill worker counts range from 1 to 5 here (1 to 3 in the 1024/1024 group).
- isl: 8192
osl: 1024
search-space:
- conc-list: [128]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
- conc-list: [256, 512]
prefill:
num-worker: 2
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
- conc-list: [16]
prefill:
num-worker: 2
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false
# Concurrency 512 is also covered by the 2p2d dep8 entry above.
- conc-list: [512]
prefill:
num-worker: 3
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
- conc-list: [32]
prefill:
num-worker: 3
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false
# The next two entries both run concurrency 4096 with 4 prefill workers and
# differ only on the decode side: 2 workers at tp/ep 8 versus 3 at tp/ep 4.
- conc-list: [4096]
prefill:
num-worker: 4
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
- conc-list: [4096]
prefill:
num-worker: 4
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml"
decode:
num-worker: 3
tp: 4
ep: 4
dp-attn: true
- conc-list: [4, 64]
prefill:
num-worker: 5
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false
# Same topology and concurrency list as the tp4-marlin entry of the
# 1024/1024 group, pointing at the 8k1k recipe.
- conc-list: [1, 4, 8, 16]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml"
decode:
num-worker: 1
tp: 4
ep: 1
dp-attn: false

# MiniMax-M3 GB300 disagg sweep — adapted from NV B300 PR #1863.
# All prefill DEP2 (TP1 DP2 EP, 2 GPU/worker). Decode: TP4+Marlin, TEP8,
# DEP8, DEP4. 4 GPU/node (GB300 NVL72). 4p3d (3 decode workers) skipped.
Expand Down
21 changes: 21 additions & 0 deletions KLAUD_DEBUG.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,27 @@ requests per DP rank. If capture still OOMs, lower decode

Seen on: #1735 (MiniMax-M3 MXFP8 GB300 dynamo-vLLM).

### 2.2 Stale MiniMax-M3 NIXL runtime patch on newer images

**Symptom:** every B300 Dynamo-vLLM worker exits from
`minimax-m3-vllm-fixes.sh` before vLLM starts:
```
RuntimeError: missing or ambiguous patch anchor in
.../vllm/distributed/kv_transfer/kv_connector/v1/nixl/base_worker.py
```

**Root cause:** newer vLLM images already include vLLM #45879, and subsequent
refactors changed the formatting around the heterogeneous-TP validation. The
exact-string patcher no longer recognizes its replacement text and treats the
already-upstream fix as a missing anchor.

**Fix:** remove the obsolete NIXL edits from the runtime setup script. Retain
only patches still absent from the image, such as the MiniMax-M3 MSA
`prefill_topk.contiguous()` fix. Verify the image's source commit before
dropping each patch.

Seen on: #1890 (MiniMax-M3 MXFP8 B300 image refresh to vLLM `7a67223`).

---

## 3. Custom DSV4 image → generic v0.5.12 OOMs
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/usr/bin/env bash
# Apply exact-string source patches to the installed vllm package.
#
# The embedded Python locates the vllm package directory and, for each target
# file, performs the listed (old, new) replacements. The run is idempotent:
# an edit whose replacement text is already present is skipped, and a file is
# only rewritten when at least one edit actually changed it.
#
# Exits non-zero (uncaught RuntimeError) when vllm cannot be located or when an
# anchor string is not found exactly once in its target file.
set -euo pipefail

python3 - <<'PYEOF'
from importlib.util import find_spec
from pathlib import Path

# Resolve the package directory without importing vllm itself.
spec = find_spec("vllm")
if not spec or not spec.origin:
    raise RuntimeError("vllm is not installed")
root = Path(spec.origin).parent

# target file -> ordered list of (anchor text, replacement text) pairs.
patches = {
    root / "models/minimax_m3/nvidia/sparse_attention_msa.py": [
        (
            " prefill_topk = topk[:, nd:num_tokens, :]\n",
            " prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n",
        ),
    ],
}

for path, edits in patches.items():
    original = path.read_text()
    source = original
    for old, new in edits:
        # Replacement already present: nothing to do for this edit.
        if new in source:
            continue
        # Require exactly one anchor so the wrong site is never patched.
        if source.count(old) != 1:
            raise RuntimeError(f"missing or ambiguous patch anchor in {path}")
        source = source.replace(old, new, 1)
    # Skip the write when nothing changed, so an already-patched image is not
    # rewritten and does not fail on a read-only package directory.
    if source != original:
        path.write_text(source)
PYEOF
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Recipe: MiniMax-M3 MXFP8 on b300, vllm backend behind a dynamo frontend,
# one prefill worker (2 GPUs) and one decode worker (8 GPUs), benchmarked at
# isl/osl 1024/1024.
name: "minimax-m3-vllm-disagg-b300-1p1d-fp8-dep2-tep8-1k1k"

model:
path: "MiniMaxAI/MiniMax-M3-MXFP8"
container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-7a67223"
precision: "fp8"

# One node per role; the prefill worker uses 2 of the node's 8 GPUs, the
# decode worker uses all 8.
resources:
gpu_type: "b300"
gpus_per_node: 8
prefill_nodes: 1
decode_nodes: 1
prefill_workers: 1
decode_workers: 1
gpus_per_prefill: 2
gpus_per_decode: 8

dynamo:
install: true
version: 1.3.0.dev20260614

frontend:
type: dynamo
enable_multiple_frontends: false

backend:
type: vllm
connector: null

# Prefill and decode workers get identical environment overrides.
prefill_environment:
UCX_TLS: "cuda_copy,rc"
VLLM_FLOAT32_MATMUL_PRECISION: "high"

decode_environment:
UCX_TLS: "cuda_copy,rc"
VLLM_FLOAT32_MATMUL_PRECISION: "high"

vllm_config:
# Prefill: tensor-parallel 1 x data-parallel 2 with expert parallelism,
# matching gpus_per_prefill: 2.
prefill:
tensor-parallel-size: 1
data-parallel-size: 2
data-parallel-rpc-port: 13345
enable-expert-parallel: true
trust-remote-code: true
no-enable-prefix-caching: true
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
block-size: 128
gpu-memory-utilization: 0.90
# 2304 leaves headroom above isl + osl (1024 + 1024 = 2048).
max-model-len: 2304
language-model-only: true
stream-interval: 32
max-cudagraph-capture-size: 2048
max-num-batched-tokens: 2048

# Decode: tensor-parallel 8 with expert parallelism and no data-parallel-size
# key, matching gpus_per_decode: 8. Shares the kv-transfer-config, block-size,
# memory utilization and max-model-len used by prefill.
decode:
tensor-parallel-size: 8
enable-expert-parallel: true
trust-remote-code: true
no-enable-prefix-caching: true
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
block-size: 128
gpu-memory-utilization: 0.90
max-model-len: 2304
language-model-only: true
stream-interval: 32
# Equals the highest benchmark concurrency listed below (4096).
max-num-seqs: 4096
max-num-batched-tokens: 16384
max-cudagraph-capture-size: 8192

# NOTE(review): presumably up to 360 polls spaced 10 s apart (about one hour)
# before the workers are considered unhealthy; confirm against the launcher.
health_check:
max_attempts: 360
interval_seconds: 10

benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
# Five concurrency levels encoded as one "x"-separated string.
concurrencies: "4x16x64x128x4096"
req_rate: "inf"
Loading