diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 1b54016a8..93a377183 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12104,6 +12104,252 @@ qwen3.5-fp4-b200-trt: - { tp: 4, ep: 4, dp-attn: true, conc-list: [1024] } - { tp: 8, ep: 8, dp-attn: true, conc-list: [256, 512, 1024] } +# MiniMax-M3 GB200 disagg sweep — adapted from NV B300 PR #1863. +# All prefill DEP4 (TP1 DP4 EP, 4 GPU/worker). Decode: TP4+Marlin, TEP8, +# DEP8, DEP4. 4 GPU/node (GB200 NVL72). 4p3d (3 decode workers) skipped. +# FLASHINFER attention with FP8 KV cache, matching the validated GB300 sweep. +minimaxm3-fp8-gb200-dynamo-vllm: + image: vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223 + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: gb200 + precision: fp8 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # 1p1d DEP4+TEP8, 3n: conc 4,16,64,128,4096 + - conc-list: [4, 16, 64, 128, 4096] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + + # 1p1d DEP4+TP4 Marlin, 2n: conc 1,4,8,16 + - conc-list: [1, 4, 8, 16] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + + # 1p2d DEP4+DEP4, 3n: conc 2048 + - conc-list: [2048] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + + # 2p1d DEP4+DEP8, 4n: conc 512,4096 + - conc-list: [512, 4096] + prefill: + num-worker: 2 + 
tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + + # 2p1d DEP4+TEP8, 4n: conc 32 + - conc-list: [32] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + + # 2p2d DEP4+TEP8, 6n: conc 16 + - conc-list: [16] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + + # 3p2d DEP4+TEP8, 7n: conc 4 + - conc-list: [4] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + + - isl: 8192 + osl: 1024 + search-space: + # 1p1d DEP4+TP4 Marlin, 2n: conc 1,4,8,16 + - conc-list: [1, 4, 8, 16] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + + # 1p2d DEP4+DEP8, 5n: conc 128 + - conc-list: [128] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + + # 2p2d DEP4+DEP8, 6n: conc 256,512 + - conc-list: [256, 512] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + 
dp-attn: true + + # 2p2d DEP4+TEP8, 6n: conc 16 + - conc-list: [16] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + + # 3p2d DEP4+DEP8, 7n: conc 512 + - conc-list: [512] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + + # 3p2d DEP4+TEP8, 7n: conc 32 + - conc-list: [32] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + + # 4p2d DEP4+DEP8, 8n: conc 4096 + - conc-list: [4096] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + + # 5p2d DEP4+TEP8, 9n: conc 4,64 + - conc-list: [4, 64] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). 
# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor
diff --git a/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-gb200-vllm-fixes.sh b/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-gb200-vllm-fixes.sh
new file mode 100755
index 000000000..c0eed0a51
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-gb200-vllm-fixes.sh
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+#
+# Hot-patches the vLLM package found in the active Python environment.
+#
+# Two source edits are applied, each an exact-text substitution:
+#   * distributed/device_communicators/flashinfer_all_reduce.py: insert
+#     force_oneshot_support=backend == "mnnvl" between the comm_backend=...
+#     and group=... lines.
+#   * models/minimax_m3/nvidia/sparse_attention_msa.py: append .contiguous()
+#     to the prefill_topk slice.
+#
+# Re-running is safe: an edit whose replacement text is already present is
+# skipped. Exits non-zero if vllm cannot be located, or if an anchor is
+# missing or occurs more than once.
+set -euo pipefail
+
+# Quoted delimiter: the Python below is passed to python3 verbatim, with no
+# shell expansion.
+python3 - <<'PYEOF'
+from importlib.util import find_spec
+from pathlib import Path
+
+spec = find_spec("vllm")
+if not spec or not spec.origin:
+    raise RuntimeError("vllm is not installed")
+# Paths below are resolved relative to the directory holding spec.origin.
+root = Path(spec.origin).parent
+
+# Maps target file -> list of (old, new) snippets.
+# NOTE(review): anchors are matched as exact substrings, so their leading
+# whitespace must match the installed sources byte-for-byte -- confirm against
+# the vLLM build shipped in the container image.
+patches = {
+    root / "distributed/device_communicators/flashinfer_all_reduce.py": [
+        (
+            " comm_backend=comm_backend,\n"
+            " group=group,\n",
+            " comm_backend=comm_backend,\n"
+            ' force_oneshot_support=backend == "mnnvl",\n'
+            " group=group,\n",
+        ),
+    ],
+    root / "models/minimax_m3/nvidia/sparse_attention_msa.py": [
+        (
+            " prefill_topk = topk[:, nd:num_tokens, :]\n",
+            " prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n",
+        ),
+    ],
+}
+
+# Phase 1: build the patched text of every file in memory. Nothing is written
+# yet, so a missing file or bad anchor aborts before any file is modified.
+pending = {}
+for path, edits in patches.items():
+    # Explicit UTF-8: do not let the locale decide how Python sources are read.
+    original = path.read_text(encoding="utf-8")
+    source = original
+    for old, new in edits:
+        if new in source:
+            # Already applied, e.g. by an earlier run of this script.
+            continue
+        if source.count(old) != 1:
+            raise RuntimeError(f"missing or ambiguous patch anchor in {path}")
+        source = source.replace(old, new, 1)
+    if source != original:
+        pending[path] = source
+
+# Phase 2: write back only the files whose content actually changed.
+for path, source in pending.items():
+    path.write_text(source, encoding="utf-8")
+PYEOF
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml new file mode 100644 index 000000000..74c2e2668 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml @@ -0,0 +1,101 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-tep8-fp8-1k1k" + +# 1P DEP4
prefill (TP1 DP4 EP, 4 GPU/worker) + 1D TEP8 (TP8+EP8) +# 3 nodes (1P + 2D). Adapted from NV B300 PR #1863. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 
128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x16x64x128x4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml new file mode 100644 index 000000000..324170080 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml @@ -0,0 +1,102 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-tp4-marlin-fp8-1k1k" + +# 1P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 1D TP4 Marlin +# 2 nodes (1P + 1D). Adapted from NV B300 PR #1863. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + 
UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + enable-expert-parallel: false + moe-backend: marlin + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 2048 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x4x8x16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml new file mode 100644 index 000000000..43ca4f723 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml @@ -0,0 +1,103 @@ +name: "minimax-m3-vllm-disagg-gb200-1p2d-dep4-dep4-fp8-1k1k" + +# 1P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D DEP4 (TP1 DP4 EP) +# 3 nodes (1P + 2D). Adapted from NV B300 PR #1863. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 
2304 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 2048 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2048" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml new file mode 100644 index 000000000..a8e05c640 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml @@ -0,0 +1,103 @@ +name: "minimax-m3-vllm-disagg-gb200-2p1d-dep4-dep8-fp8-1k1k" + +# 2P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 1D DEP8 (TP1 DP8 EP) +# 4 nodes (2P + 2D). Adapted from NV B300 PR #1863. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + 
UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 2048 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml new file mode 100644 index 000000000..9aea9db19 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml @@ -0,0 +1,101 @@ +name: "minimax-m3-vllm-disagg-gb200-2p1d-dep4-tep8-fp8-1k1k" + +# 2P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 1D TEP8 (TP8+EP8) +# 4 nodes (2P + 2D). Adapted from NV B300 PR #1863. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + kv-cache-dtype: fp8 + 
attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "32" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml new file mode 100644 index 000000000..9786b2306 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml @@ -0,0 +1,101 @@ +name: "minimax-m3-vllm-disagg-gb200-2p2d-dep4-tep8-fp8-1k1k" + +# 2P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D TEP8 (TP8+EP8) +# 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 4 + prefill_workers: 2 + decode_workers: 2 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + 
vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml new file mode 100644 index 000000000..2d22a2437 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml @@ -0,0 +1,101 @@ +name: "minimax-m3-vllm-disagg-gb200-3p2d-dep4-tep8-fp8-1k1k" + +# 3P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D TEP8 (TP8+EP8) +# 7 nodes (3P + 4D). Adapted from NV B300 PR #1863. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 3 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 2 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + kv-cache-dtype: fp8 + 
attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml new file mode 100644 index 000000000..d10b7866d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml @@ -0,0 +1,102 @@ +name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-tp4-marlin-fp8-8k1k" + +# 1P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 1D TP4 Marlin +# 2 nodes (1P + 1D). Adapted from NV B300 PR #1863. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + 
NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 4 + enable-expert-parallel: false + moe-backend: marlin + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 2048 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x8x16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml new file mode 100644 index 000000000..1e386a693 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml @@ -0,0 +1,103 @@ +name: "minimax-m3-vllm-disagg-gb200-1p2d-dep4-dep8-fp8-8k1k" + +# 1P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D DEP8 (TP1 DP8 EP) +# 5 nodes (1P + 4D). Adapted from NV B300 PR #1863. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 
9472 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 2048 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml new file mode 100644 index 000000000..5e77b9e8f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml @@ -0,0 +1,103 @@ +name: "minimax-m3-vllm-disagg-gb200-2p2d-dep4-dep8-fp8-8k1k" + +# 2P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D DEP8 (TP1 DP8 EP) +# 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 4 + prefill_workers: 2 + decode_workers: 2 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + 
UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 2048 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml new file mode 100644 index 000000000..cda685755 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml @@ -0,0 +1,101 @@ +name: "minimax-m3-vllm-disagg-gb200-2p2d-dep4-tep8-fp8-8k1k" + +# 2P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D TEP8 (TP8+EP8) +# 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 4 + prefill_workers: 2 + decode_workers: 2 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + 
attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml new file mode 100644 index 000000000..55a0cfc58 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml @@ -0,0 +1,103 @@ +name: "minimax-m3-vllm-disagg-gb200-3p2d-dep4-dep8-fp8-8k1k" + +# 3P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D DEP8 (TP1 DP8 EP) +# 7 nodes (3P + 4D). Adapted from NV B300 PR #1863. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 3 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 2 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + 
vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 2048 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml new file mode 100644 index 000000000..ad5e1da1b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml @@ -0,0 +1,101 @@ +name: "minimax-m3-vllm-disagg-gb200-3p2d-dep4-tep8-fp8-8k1k" + +# 3P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D TEP8 (TP8+EP8) +# 7 nodes (3P + 4D). Adapted from NV B300 PR #1863. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 3 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 2 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + 
attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml new file mode 100644 index 000000000..8b9857c14 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml @@ -0,0 +1,103 @@ +name: "minimax-m3-vllm-disagg-gb200-4p2d-dep4-dep8-fp8-8k1k" + +# 4P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D DEP8 (TP1 DP8 EP) +# 8 nodes (4P + 4D). Adapted from NV B300 PR #1863. + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 4 + prefill_workers: 4 + decode_workers: 2 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + 
vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 2048 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml new file mode 100644 index 000000000..7a39d40dc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml @@ -0,0 +1,101 @@ +name: "minimax-m3-vllm-disagg-gb200-5p2d-dep4-tep8-fp8-8k1k" + +# 5P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D TEP8 (TP8+EP8) +# 9 nodes (5P + 4D). Adapted from NV B300 PR #1863. 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 5 + decode_nodes: 4 + prefill_workers: 5 + decode_workers: 2 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + attention-backend: FLASHINFER + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + 
attention-backend: FLASHINFER + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x64" + req_rate: "inf" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 4265d320b..3b3ad71c0 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4099,3 +4099,11 @@ - "Backport NVIDIA/srt-slurm#38 to sanitize Slurm node-IP discovery output on the pinned submission branch." - "Backport vllm-project/vllm#45879 so NIXL validates heterogeneous-TP KV block lengths using the GQA KV-head ratio." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1893 + +- config-keys: + - minimaxm3-fp8-gb200-dynamo-vllm + description: + - "Update the GB200 MiniMax-M3 Dynamo-vLLM image to vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + - "Allocate FlashInfer MNNVL workspace for one-shot all-reduce and materialize the MSA prefill top-k slice before CSR construction" + - "Preserve current Qwen3.5 and Kimi-K2.5 GB200 launcher paths while adding MiniMax-M3 shared-FS staging and atomic image import" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 4017b1fd2..8ab7de40a 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -63,8 +63,11 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" ]]; then export MODEL_PATH="/mnt/lustre01/models/MiniMax-M2.5" export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-fp8" + elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then + export MODEL_PATH="/mnt/lustre01/models/MiniMax-M3-MXFP8" + export SRT_SLURM_MODEL_PREFIX="minimax-m3-mxfp8" else - echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. 
#######################################
# Report whether the current model prefix is one of those this launcher
# routes through the shared-filesystem staging path.
# Globals:   MODEL_PREFIX (read)
# Arguments: none
# Returns:   0 if MODEL_PREFIX is minimaxm2.5, minimaxm3 or kimik2.5;
#            1 for any other value.
#######################################
uses_watchtower_shared_fs() {
  local candidate
  for candidate in minimaxm2.5 minimaxm3 kimik2.5; do
    # Quoted right-hand side: literal comparison, no glob matching.
    if [[ "$MODEL_PREFIX" == "$candidate" ]]; then
      return 0
    fi
  done
  return 1
}
#######################################
# Import a container image into a squashfs file, serialized across
# concurrent jobs by a lock file placed next to the target.
#
# The image is first written to a temporary sibling file and only renamed
# onto the final path after `enroot import` reports success, so a failed
# or interrupted import is never published under the final name.
#
# Arguments: $1 - destination squash file path
#            $2 - image reference (passed to enroot as docker://$2)
# Outputs:   progress message on stdout, errors on stderr
# Returns:   0 on success; exits the calling script with status 1 if the
#            lock cannot be taken, the import fails, or the rename fails.
#######################################
import_squash() {
  local squash="$1" image="$2"
  local lock="${squash}.lock"
  # $$ stays the PID of the top-level shell even inside the subshell below.
  local tmp="${squash}.tmp.$$"
  (
    # fd 9 holds the lock; it is released when this subshell exits.
    exec 9>"$lock" || { echo "Failed to open lock file $lock" >&2; exit 1; }
    flock -w 1800 9 || { echo "Failed to acquire lock for $squash" >&2; exit 1; }
    if unsquashfs -l "$squash" > /dev/null 2>&1; then
      echo "Squash file already exists and is valid, skipping import: $squash"
    else
      # Drop the unreadable image and any temp files left by earlier runs.
      rm -f "$squash" "$squash".tmp.*
      if ! enroot import -o "$tmp" "docker://$image"; then
        echo "enroot import failed for $image" >&2
        # Never let a partially written image reach the final path.
        rm -f "$tmp"
        exit 1
      fi
      mv -f "$tmp" "$squash" || { rm -f "$tmp"; exit 1; }
    fi
  ) || exit 1
}
elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "kimik2.5" && $PRECISION == "fp4" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || exit 1 cd "$SRT_REPO_DIR" || exit 1 @@ -306,7 +346,7 @@ source $HOME/.local/bin/env # under a head-node-only path, .venv/bin/python3 becomes a broken # symlink on compute. Pin the venv to /usr/bin/python3 — a system # path that exists at the same location on both head and compute. -if [[ ($MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "kimik2.5") && -x /usr/bin/python3 ]]; then +if uses_watchtower_shared_fs && [[ -x /usr/bin/python3 ]]; then uv venv --seed --python /usr/bin/python3 else uv venv --seed @@ -323,10 +363,10 @@ echo "Configs available at: $SRT_REPO_DIR/" # Create srtslurm.yaml for srtctl (used by both frameworks) SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" -# Minimax on watchtower: SRT_REPO_DIR was moved to a shared-FS path +# Watchtower-hosted sweeps: SRT_REPO_DIR was moved to a shared-FS path # above so srtctl's outputs/ directory (which lives under # SRTCTL_ROOT) is visible to compute nodes. -if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "kimik2.5" ]]; then +if uses_watchtower_shared_fs; then SRTCTL_ROOT="$SRT_REPO_DIR" fi echo "Creating srtslurm.yaml configuration..." @@ -368,7 +408,7 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" # can't see. Stage the relevant subset to shared FS and repoint # INFMAX_WORKSPACE there. rsync excludes the srt-slurm clone (already # on shared FS) and .git (not needed in container) for speed. -if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "kimik2.5" ]]; then +if uses_watchtower_shared_fs; then SHARED_INFMAX_WORKSPACE="${SHARED_BASE}/infmax-workspace-${RUN_KEY}" mkdir -p "$SHARED_INFMAX_WORKSPACE" || exit 1 rsync -a --delete \ @@ -393,11 +433,16 @@ if [[ ! 
-f "$CONFIG_PATH" ]]; then fi sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_PATH" +SRTCTL_APPLY_ARGS=( + -f "$CONFIG_PATH" + --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" +) if [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then - SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_PATH" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1) -else - SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_PATH" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) + SRTCTL_APPLY_ARGS+=(--setup-script install-torchao.sh) +elif [[ -n "$SRTCTL_SETUP_SCRIPT" ]]; then + SRTCTL_APPLY_ARGS+=(--setup-script "$SRTCTL_SETUP_SCRIPT") fi +SRTCTL_OUTPUT=$(srtctl apply "${SRTCTL_APPLY_ARGS[@]}" 2>&1) echo "$SRTCTL_OUTPUT" JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+')