From b506cd47f6c2756ee56c4a93aaebc9a24cb8b6c9 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 20 Jun 2026 05:25:19 +0800 Subject: [PATCH 01/21] [NV] Add MiniMax M3 B300 Dynamo vLLM recipes --- .github/configs/nvidia-master.yaml | 226 ++++++++++++++++++ .../b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml | 79 ++++++ .../1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml | 80 +++++++ .../b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml | 81 +++++++ .../b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml | 81 +++++++ .../b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml | 79 ++++++ .../b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml | 79 ++++++ .../b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml | 79 ++++++ .../b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml | 79 ++++++ .../b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml | 79 ++++++ .../b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml | 78 ++++++ .../b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml | 79 ++++++ .../b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml | 78 ++++++ .../b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml | 79 ++++++ .../b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml | 79 ++++++ .../b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml | 78 ++++++ .../8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml | 79 ++++++ perf-changelog.yaml | 8 + runners/launch_b300-nv.sh | 11 +- 19 files changed, 1510 insertions(+), 1 deletion(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e64100de7..50c94555c 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12817,6 +12817,232 @@ minimaxm2.5-fp8-b300-dynamo-vllm: ep: 4 dp-attn: true +minimaxm3-fp8-b300-dynamo-vllm: + image: vllm/vllm-openai:minimax-m3 + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: b300 + precision: fp8 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [4, 16, 64, 128, 4096] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [1, 4, 8, 16] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: [2048] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + - conc-list: [512, 4096] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [32] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [16] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [4] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [128] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [256, 512] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [16] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [512] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [32] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [4096] + prefill: + num-worker: 4 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [4096] + prefill: + num-worker: 4 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml" + decode: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + - conc-list: [4, 64] + prefill: + num-worker: 5 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [1, 4, 8, 16] + prefill: + num-worker: 5 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + minimaxm2.5-fp8-gb300-dynamo-vllm: image: vllm/vllm-openai:v0.20.1 model: MiniMaxAI/MiniMax-M2.5 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml new file mode 100644 index 000000000..e76827af3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml @@ -0,0 +1,79 @@ +name: "minimax-m3-vllm-disagg-b300-1p1d-fp8-dep2-tep8-1k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8196 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x16x64x128x4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml new file mode 100644 index 000000000..c8362cb32 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml @@ -0,0 +1,80 @@ +name: "minimax-m3-vllm-disagg-b300-1p1d-fp8-dep2-tp8-marlin-1k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: false + moe-backend: marlin + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8196 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x4x8x16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml new file mode 100644 index 000000000..349a125bb --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-1p2d-fp8-dep2-dep4-1k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8196 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2048" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml new file mode 100644 index 000000000..0f790c79b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-2p1d-fp8-dep2-dep8-1k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8196 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml new file mode 100644 index 000000000..1372ff29a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml @@ -0,0 +1,79 @@ +name: "minimax-m3-vllm-disagg-b300-2p1d-fp8-dep2-tep8-1k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8196 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "32" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml new file mode 100644 index 000000000..4447d971b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml @@ -0,0 +1,79 @@ +name: "minimax-m3-vllm-disagg-b300-2p2d-fp8-dep2-tep8-1k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8196 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml new file mode 100644 index 000000000..b03d644d2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml @@ -0,0 +1,79 @@ +name: "minimax-m3-vllm-disagg-b300-3p2d-fp8-dep2-tep8-1k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8196 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml new file mode 100644 index 000000000..890014563 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml @@ -0,0 +1,79 @@ +name: "minimax-m3-vllm-disagg-b300-1p2d-fp8-dep2-dep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml new file mode 100644 index 000000000..6d9ecc425 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml @@ -0,0 +1,79 @@ +name: "minimax-m3-vllm-disagg-b300-2p2d-fp8-dep2-dep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml new file mode 100644 index 000000000..90d816592 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml @@ -0,0 +1,78 @@ +name: "minimax-m3-vllm-disagg-b300-2p2d-fp8-dep2-tep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml new file mode 100644 index 000000000..84d580452 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml @@ -0,0 +1,79 @@ +name: "minimax-m3-vllm-disagg-b300-3p2d-fp8-dep2-dep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml new file mode 100644 index 000000000..f272b21bc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml @@ -0,0 +1,78 @@ +name: "minimax-m3-vllm-disagg-b300-3p2d-fp8-dep2-tep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml new file mode 100644 index 000000000..b087b0926 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml @@ -0,0 +1,79 @@ +name: "minimax-m3-vllm-disagg-b300-4p2d-fp8-dep2-dep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 4 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml new file mode 100644 index 000000000..94c36243e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml @@ -0,0 +1,79 @@ +name: "minimax-m3-vllm-disagg-b300-4p3d-fp8-dep2-dep4-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 4 + decode_workers: 3 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml new file mode 100644 index 000000000..94f546ec2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml @@ -0,0 +1,78 @@ +name: "minimax-m3-vllm-disagg-b300-5p2d-fp8-dep2-tep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 5 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x64" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml new file mode 100644 index 000000000..e77e77600 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml @@ -0,0 +1,79 @@ +name: "minimax-m3-vllm-disagg-b300-5p2d-fp8-dep2-tp8-marlin-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 5 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: false + moe-backend: marlin + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x8x16" + req_rate: "inf" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 46bee4d44..9135f8109 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3974,3 +3974,11 @@ - "ISL1K/OSL1K: conc 8–8192 across ctx1dep4 and ctx2dep4 topologies" - "ISL8K/OSL1K: conc 5–4301 across ctx1dep4 to ctx9dep4 topologies" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1797 + +- config-keys: + - minimaxm3-fp8-b300-dynamo-vllm + description: + - "Add MiniMax-M3 MXFP8 B300 disaggregated vLLM benchmarks via Dynamo for 1k1k and 8k1k STP." + - "Add local srt-slurm recipes under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8 and wire the B300 launcher to overlay them into srt-slurm." + - "Add TP8 decode variants with expert parallelism disabled and the Marlin MoE backend for selected low-concurrency 1k1k and 8k1k shapes." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1787 diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index a941860c0..f2a83e4b3 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -45,8 +45,11 @@ elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp4" && $FRAMEWORK == " elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" && $FRAMEWORK == "dynamo-vllm" ]]; then export MODEL_PATH="/data/models/MiniMax-M2.5" export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-fp8" +elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" && $FRAMEWORK == "dynamo-vllm" ]]; then + export MODEL_PATH="/data/models/MiniMax-M3-MXFP8" + export SRT_SLURM_MODEL_PREFIX="MiniMaxAI/MiniMax-M3-MXFP8" else - echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4 with dynamo-vllm, minimaxm2.5-fp4 with dynamo-vllm, minimaxm2.5-fp8 with dynamo-vllm" + echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4 with dynamo-vllm, minimaxm2.5-fp4 with dynamo-vllm, minimaxm2.5-fp8 with dynamo-vllm, minimaxm3-fp8 with dynamo-vllm" exit 1 fi @@ -79,6 +82,12 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECIS git checkout main mkdir -p recipes/vllm/minimax-m2.5-fp8 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8" recipes/vllm/minimax-m2.5-fp8 +elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" || exit 1 + git checkout sa-submission-q2-2026 + mkdir -p recipes/vllm/minimax-m3 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3" recipes/vllm/minimax-m3 else git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 From 84a023a5dbe329cc95db3dacf592201153ee798e Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 20 Jun 2026 05:27:47 +0800 Subject: [PATCH 02/21] chore: update MiniMax M3 B300 container --- .github/configs/nvidia-master.yaml | 2 +- .../vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml | 2 +- .../minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml | 2 +- .../vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml | 2 +- .../vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml | 2 +- .../vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml | 2 +- .../vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml | 2 +- .../vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml | 2 +- .../vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml | 2 +- .../vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml | 2 +- .../vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml | 2 +- .../vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml | 2 +- .../vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml | 2 +- .../vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml | 2 +- .../vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml | 2 +- .../vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml | 2 +- .../minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml | 2 +- perf-changelog.yaml | 6 ++++++ 18 files changed, 23 insertions(+), 17 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 50c94555c..9363133c5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12818,7 +12818,7 @@ minimaxm2.5-fp8-b300-dynamo-vllm: dp-attn: true minimaxm3-fp8-b300-dynamo-vllm: - image: vllm/vllm-openai:minimax-m3 + image: vllm/vllm-openai:minimax-m3-0618-x86_64-cu130 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: b300 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml index e76827af3..750b96848 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-1p1d-fp8-dep2-tep8-1k1k" model: path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" precision: "fp8" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml index c8362cb32..d5fc17ccd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-1p1d-fp8-dep2-tp8-marlin-1k1k" model: path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" precision: "fp8" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml index 349a125bb..1167212e4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-1p2d-fp8-dep2-dep4-1k1k" model: path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" precision: "fp8" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml index 0f790c79b..d3a93aeda 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-2p1d-fp8-dep2-dep8-1k1k" model: path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" precision: "fp8" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml index 1372ff29a..c0b9b5ba5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-2p1d-fp8-dep2-tep8-1k1k" model: path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" precision: "fp8" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml index 4447d971b..632d9a8d3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-2p2d-fp8-dep2-tep8-1k1k" model: path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" precision: "fp8" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml index b03d644d2..768b6981e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-3p2d-fp8-dep2-tep8-1k1k" model: path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" precision: "fp8" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml index 890014563..a48367069 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-1p2d-fp8-dep2-dep8-8k1k" model: path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" precision: "fp8" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml index 6d9ecc425..5fe5abf7b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-2p2d-fp8-dep2-dep8-8k1k" model: path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" precision: "fp8" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml index 90d816592..2e612acf1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-2p2d-fp8-dep2-tep8-8k1k" model: path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" precision: "fp8" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml index 84d580452..cfd380482 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-3p2d-fp8-dep2-dep8-8k1k" model: path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" precision: "fp8" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml index f272b21bc..66b74d1fc 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-3p2d-fp8-dep2-tep8-8k1k" model: path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" precision: "fp8" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml index b087b0926..6b6592f05 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-4p2d-fp8-dep2-dep8-8k1k" model: path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" precision: "fp8" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml index 94c36243e..a47c6d375 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-4p3d-fp8-dep2-dep4-8k1k" model: path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" precision: "fp8" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml index 94f546ec2..859530f16 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-5p2d-fp8-dep2-tep8-8k1k" model: path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" precision: "fp8" resources: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml index e77e77600..7a65f0136 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml @@ -2,7 +2,7 @@ name: "minimax-m3-vllm-disagg-b300-5p2d-fp8-dep2-tp8-marlin-8k1k" model: path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" precision: "fp8" resources: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 9135f8109..b128cc2e0 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3982,3 +3982,9 @@ - "Add local srt-slurm recipes under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8 and wire the B300 launcher to overlay them into srt-slurm." - "Add TP8 decode variants with expert parallelism disabled and the Marlin MoE backend for selected low-concurrency 1k1k and 8k1k shapes." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1787 + +- config-keys: + - minimaxm3-fp8-b300-dynamo-vllm + description: + - "Update the MiniMax-M3 B300 Dynamo vLLM container from vllm/vllm-openai:minimax-m3 to vllm/vllm-openai:minimax-m3-0618-x86_64-cu130." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1787 From b09bc785669e2e04f44cf1c3f896cd295c5356ab Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 20 Jun 2026 05:29:07 +0800 Subject: [PATCH 03/21] chore: update changelog PR link --- perf-changelog.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index b128cc2e0..b816b95ba 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3981,10 +3981,10 @@ - "Add MiniMax-M3 MXFP8 B300 disaggregated vLLM benchmarks via Dynamo for 1k1k and 8k1k STP." - "Add local srt-slurm recipes under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8 and wire the B300 launcher to overlay them into srt-slurm." - "Add TP8 decode variants with expert parallelism disabled and the Marlin MoE backend for selected low-concurrency 1k1k and 8k1k shapes." - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1787 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1863 - config-keys: - minimaxm3-fp8-b300-dynamo-vllm description: - "Update the MiniMax-M3 B300 Dynamo vLLM container from vllm/vllm-openai:minimax-m3 to vllm/vllm-openai:minimax-m3-0618-x86_64-cu130." - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1787 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1863 From 86da150b0cf574c65ffa42e4c654b1aa9aa147b7 Mon Sep 17 00:00:00 2001 From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 19 Jun 2026 14:31:03 -0700 Subject: [PATCH 04/21] Update perf-changelog.yaml --- perf-changelog.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index b816b95ba..121bcbb49 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3988,3 +3988,4 @@ description: - "Update the MiniMax-M3 B300 Dynamo vLLM container from vllm/vllm-openai:minimax-m3 to vllm/vllm-openai:minimax-m3-0618-x86_64-cu130." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1863 + evals-only: true From f5727c277c22e13f8ffb10016426d28d8abe416d Mon Sep 17 00:00:00 2001 From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 19 Jun 2026 14:31:27 -0700 Subject: [PATCH 05/21] Update perf-changelog.yaml --- perf-changelog.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 121bcbb49..d67e05856 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3982,10 +3982,4 @@ - "Add local srt-slurm recipes under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8 and wire the B300 launcher to overlay them into srt-slurm." - "Add TP8 decode variants with expert parallelism disabled and the Marlin MoE backend for selected low-concurrency 1k1k and 8k1k shapes." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1863 - -- config-keys: - - minimaxm3-fp8-b300-dynamo-vllm - description: - - "Update the MiniMax-M3 B300 Dynamo vLLM container from vllm/vllm-openai:minimax-m3 to vllm/vllm-openai:minimax-m3-0618-x86_64-cu130." - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1863 evals-only: true From 3b6dad4368ae6d09393b7b649f49aa250f63b516 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 20 Jun 2026 06:02:01 +0800 Subject: [PATCH 06/21] fix(vllm): patch MiniMax M3 MSA contiguity --- .../configs/minimax-m3-msa-contiguity.sh | 34 +++++++++++++++++++ perf-changelog.yaml | 1 + runners/launch_b300-nv.sh | 14 +++++++- 3 files changed, 48 insertions(+), 1 deletion(-) create mode 100755 benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-msa-contiguity.sh diff --git a/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-msa-contiguity.sh b/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-msa-contiguity.sh new file mode 100755 index 000000000..fa0598011 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-msa-contiguity.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# The 0618 image slices a persistent CUDA-graph top-k buffer without +# materializing it. TP1 data-parallel-attention workers retain a non-contiguous +# head stride, which the MiniMax M3 MSA CSR builder rejects. +python3 - <<'PYEOF' +import importlib.util +import pathlib + +spec = importlib.util.find_spec("vllm") +if spec is None or not spec.submodule_search_locations: + raise RuntimeError("Could not locate the installed vllm package") + +target = ( + pathlib.Path(next(iter(spec.submodule_search_locations))) + / "models" + / "minimax_m3" + / "nvidia" + / "sparse_attention_msa.py" +) +src = target.read_text() +old = " prefill_topk = topk[:, nd:num_tokens, :]\n" +new = " prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n" + +if new in src: + print(f"[minimax-m3-msa-patch] already applied: {target}") +elif src.count(old) == 1: + target.write_text(src.replace(old, new, 1)) + print(f"[minimax-m3-msa-patch] patched: {target}") +else: + raise RuntimeError(f"Expected exactly one patch anchor in {target}") +PYEOF diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d67e05856..dc7196f75 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3981,5 +3981,6 @@ - "Add MiniMax-M3 MXFP8 B300 disaggregated vLLM benchmarks via Dynamo for 1k1k and 8k1k STP." - "Add local srt-slurm recipes under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8 and wire the B300 launcher to overlay them into srt-slurm." - "Add TP8 decode variants with expert parallelism disabled and the Marlin MoE backend for selected low-concurrency 1k1k and 8k1k shapes." + - "Patch the 0618 image's MiniMax M3 MSA prefill top-k slice to be contiguous before CSR construction." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1863 evals-only: true diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index f2a83e4b3..306012451 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -55,6 +55,7 @@ fi echo "Cloning srt-slurm repository..." SRT_REPO_DIR="srt-slurm" +SRTCTL_SETUP_SCRIPT="" if [ -d "$SRT_REPO_DIR" ]; then echo "Removing existing $SRT_REPO_DIR..." rm -rf "$SRT_REPO_DIR" @@ -88,6 +89,10 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" && $PRECISIO git checkout sa-submission-q2-2026 mkdir -p recipes/vllm/minimax-m3 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3" recipes/vllm/minimax-m3 + SRTCTL_SETUP_SCRIPT="minimax-m3-msa-contiguity.sh" + cp \ + "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/configs/$SRTCTL_SETUP_SCRIPT" \ + "configs/$SRTCTL_SETUP_SCRIPT" else git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 @@ -170,7 +175,14 @@ fi # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" -SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "b300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) +SRTCTL_APPLY_ARGS=( + -f "$CONFIG_FILE" + --tags "b300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" +) +if [[ -n "$SRTCTL_SETUP_SCRIPT" ]]; then + SRTCTL_APPLY_ARGS+=(--setup-script "$SRTCTL_SETUP_SCRIPT") +fi +SRTCTL_OUTPUT=$(srtctl apply "${SRTCTL_APPLY_ARGS[@]}" 2>&1) echo "$SRTCTL_OUTPUT" # Extract JOB_ID from srtctl output From 71ba2eaecb095dcbeb7a5d518da8392bbaf68bc0 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 20 Jun 2026 06:09:17 +0800 Subject: [PATCH 07/21] fix(recipes): align MiniMax M3 parallel settings --- .../vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml | 2 +- .../minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml | 2 +- .../vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml | 2 +- .../vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml | 2 +- .../vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml | 2 +- .../vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml | 2 +- .../vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml | 2 +- .../vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml | 2 ++ .../vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml | 2 ++ .../vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml | 1 + .../vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml | 2 ++ .../vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml | 1 + .../vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml | 2 ++ .../vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml | 2 ++ .../vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml | 1 + .../minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml | 1 + perf-changelog.yaml | 1 + 17 files changed, 22 insertions(+), 7 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml index 750b96848..5bbb13362 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml @@ -65,7 +65,7 @@ backend: stream-interval: 32 max-num-seqs: 4096 max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 8196 + max-cudagraph-capture-size: 8192 health_check: max_attempts: 360 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml index d5fc17ccd..f7ebaef0a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml @@ -66,7 +66,7 @@ backend: stream-interval: 32 max-num-seqs: 4096 max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 8196 + max-cudagraph-capture-size: 8192 health_check: max_attempts: 360 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml index 1167212e4..ef7e66d76 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml @@ -67,7 +67,7 @@ backend: stream-interval: 32 max-num-seqs: 1024 max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 8196 + max-cudagraph-capture-size: 8192 health_check: max_attempts: 360 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml index d3a93aeda..9f5aa341c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml @@ -67,7 +67,7 @@ backend: stream-interval: 32 max-num-seqs: 4096 max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 8196 + max-cudagraph-capture-size: 8192 health_check: max_attempts: 360 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml index c0b9b5ba5..42c6e7bbc 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml @@ -65,7 +65,7 @@ backend: stream-interval: 32 max-num-seqs: 4096 max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 8196 + max-cudagraph-capture-size: 8192 health_check: max_attempts: 360 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml index 632d9a8d3..3e701df05 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml @@ -65,7 +65,7 @@ backend: stream-interval: 32 max-num-seqs: 4096 max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 8196 + max-cudagraph-capture-size: 8192 health_check: max_attempts: 360 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml index 768b6981e..b9a1d1058 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml @@ -65,7 +65,7 @@ backend: stream-interval: 32 max-num-seqs: 4096 max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 8196 + max-cudagraph-capture-size: 8192 health_check: max_attempts: 360 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml index a48367069..bc4c449b2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml @@ -40,6 +40,7 @@ backend: tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 + enable-expert-parallel: true trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -55,6 +56,7 @@ backend: tensor-parallel-size: 1 data-parallel-size: 8 data-parallel-rpc-port: 13345 + enable-expert-parallel: true trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml index 5fe5abf7b..7a8ddd1a1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml @@ -40,6 +40,7 @@ backend: tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 + enable-expert-parallel: true trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -55,6 +56,7 @@ backend: tensor-parallel-size: 1 data-parallel-size: 8 data-parallel-rpc-port: 13345 + enable-expert-parallel: true trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml index 2e612acf1..d00fb046e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml @@ -40,6 +40,7 @@ backend: tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 + enable-expert-parallel: true trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml index cfd380482..cf8736e14 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml @@ -40,6 +40,7 @@ backend: tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 + enable-expert-parallel: true trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -55,6 +56,7 @@ backend: tensor-parallel-size: 1 data-parallel-size: 8 data-parallel-rpc-port: 13345 + enable-expert-parallel: true trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml index 66b74d1fc..9572688b2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml @@ -40,6 +40,7 @@ backend: tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 + enable-expert-parallel: true trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml index 6b6592f05..8eb02bb25 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml @@ -40,6 +40,7 @@ backend: tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 + enable-expert-parallel: true trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -55,6 +56,7 @@ backend: tensor-parallel-size: 1 data-parallel-size: 8 data-parallel-rpc-port: 13345 + enable-expert-parallel: true trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml index a47c6d375..9d095b7b6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml @@ -40,6 +40,7 @@ backend: tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 + enable-expert-parallel: true trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -55,6 +56,7 @@ backend: tensor-parallel-size: 1 data-parallel-size: 4 data-parallel-rpc-port: 13345 + enable-expert-parallel: true trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml index 859530f16..d40a33582 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml @@ -40,6 +40,7 @@ backend: tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 + enable-expert-parallel: true trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml index 7a65f0136..67c3eb242 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml @@ -40,6 +40,7 @@ backend: tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 + enable-expert-parallel: true trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' diff --git a/perf-changelog.yaml b/perf-changelog.yaml index dc7196f75..ddb23efe9 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3982,5 +3982,6 @@ - "Add local srt-slurm recipes under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8 and wire the B300 launcher to overlay them into srt-slurm." - "Add TP8 decode variants with expert parallelism disabled and the Marlin MoE backend for selected low-concurrency 1k1k and 8k1k shapes." - "Patch the 0618 image's MiniMax M3 MSA prefill top-k slice to be contiguous before CSR construction." + - "Align 8k1k expert-parallel settings with the 1k1k recipes and correct the decode CUDA graph capture limit." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1863 evals-only: true From b859a0bedf93f10b9fb680a24e1381ad7998bb11 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 20 Jun 2026 06:54:05 +0800 Subject: [PATCH 08/21] fix(vllm): backport MiniMax M3 eval fixes --- .../configs/minimax-m3-msa-contiguity.sh | 34 ----- .../configs/minimax-m3-vllm-fixes.sh | 138 ++++++++++++++++++ .../configs/srt-slurm-sanitize-node-ip.patch | 17 +++ .../b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml | 2 +- .../b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml | 2 +- perf-changelog.yaml | 2 + runners/launch_b300-nv.sh | 17 ++- 7 files changed, 175 insertions(+), 37 deletions(-) delete mode 100755 benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-msa-contiguity.sh create mode 100755 benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-vllm-fixes.sh create mode 100644 benchmarks/multi_node/srt-slurm-recipes/configs/srt-slurm-sanitize-node-ip.patch diff --git a/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-msa-contiguity.sh b/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-msa-contiguity.sh deleted file mode 100755 index fa0598011..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-msa-contiguity.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -# The 0618 image slices a persistent CUDA-graph top-k buffer without -# materializing it. TP1 data-parallel-attention workers retain a non-contiguous -# head stride, which the MiniMax M3 MSA CSR builder rejects. -python3 - <<'PYEOF' -import importlib.util -import pathlib - -spec = importlib.util.find_spec("vllm") -if spec is None or not spec.submodule_search_locations: - raise RuntimeError("Could not locate the installed vllm package") - -target = ( - pathlib.Path(next(iter(spec.submodule_search_locations))) - / "models" - / "minimax_m3" - / "nvidia" - / "sparse_attention_msa.py" -) -src = target.read_text() -old = " prefill_topk = topk[:, nd:num_tokens, :]\n" -new = " prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n" - -if new in src: - print(f"[minimax-m3-msa-patch] already applied: {target}") -elif src.count(old) == 1: - target.write_text(src.replace(old, new, 1)) - print(f"[minimax-m3-msa-patch] patched: {target}") -else: - raise RuntimeError(f"Expected exactly one patch anchor in {target}") -PYEOF diff --git a/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-vllm-fixes.sh b/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-vllm-fixes.sh new file mode 100755 index 000000000..7f612b922 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-vllm-fixes.sh @@ -0,0 +1,138 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# Backport fixes merged after the 0618 MiniMax M3 image was built. +python3 - <<'PYEOF' +import importlib.util +import pathlib + + +def apply_exact_patches( + target: pathlib.Path, + patches: list[tuple[str, str]], + label: str, +) -> None: + src = target.read_text() + changed = False + + for old, new in patches: + if new in src: + continue + if src.count(old) != 1: + raise RuntimeError(f"Expected exactly one {label} patch anchor in {target}") + src = src.replace(old, new, 1) + changed = True + + if changed: + target.write_text(src) + print(f"[{label}] patched: {target}") + else: + print(f"[{label}] already applied: {target}") + + +spec = importlib.util.find_spec("vllm") +if spec is None or not spec.submodule_search_locations: + raise RuntimeError("Could not locate the installed vllm package") + +vllm_root = pathlib.Path(next(iter(spec.submodule_search_locations))) + +# TP1 data-parallel-attention workers retain a non-contiguous head stride in +# the persistent CUDA-graph top-k buffer. Materialize the slice before the +# MiniMax M3 MSA CSR builder consumes it. +msa_target = ( + vllm_root + / "models" + / "minimax_m3" + / "nvidia" + / "sparse_attention_msa.py" +) +apply_exact_patches( + msa_target, + [ + ( + " prefill_topk = topk[:, nd:num_tokens, :]\n", + " prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n", + ) + ], + "minimax-m3-msa-contiguity", +) + +# vllm-project/vllm#45879: heterogeneous TP must validate SPLIT regions using +# per-rank KV heads. MiniMax M3 has four KV heads, so TP8 replicates one head +# per rank instead of scaling block length by the raw TP ratio of eight. +nixl_target = ( + vllm_root + / "distributed" + / "kv_transfer" + / "kv_connector" + / "v1" + / "nixl" + / "base_worker.py" +) +apply_exact_patches( + nixl_target, + [ + ( + """ # only allow the number of blocks to differ; SPLIT regions scale with + # tp_ratio. Mamba uses the ssm_sizes counterpart, so skip block_len here. +""", + """ # only allow the number of blocks to differ; SPLIT regions scale with + # the per-rank KV head ratio rather than the raw tp_ratio, because GQA + # replication caps per-rank heads at 1 when tp > total_kv_heads + # (issue #45330). Mamba uses the ssm_sizes counterpart, so skip here. +""", + ), + ( + """ model_replicated = self.use_mla or self.transfer_topo.is_kv_replicated( + remote_engine_id + ) + for i, local_len in enumerate(self.block_len_per_layer): +""", + """ model_replicated = self.use_mla or self.transfer_topo.is_kv_replicated( + remote_engine_id + ) + total_kv_heads = self.transfer_topo.total_num_kv_heads + local_heads = self.transfer_topo.local_physical_heads + remote_heads = max(1, total_kv_heads // remote_tp_size) + for i, local_len in enumerate(self.block_len_per_layer): +""", + ), + ( + """ elif tp_ratio > 0: + assert remote_len == (local_len * tp_ratio) // block_size_ratio, ( + f"SPLIT region {i}: remote P KV block_len {remote_len} " + f"must equal local {local_len} * tp_ratio {tp_ratio} " + f"// block_size_ratio {block_size_ratio}." + ) +""", + """ elif tp_ratio > 0: + assert ( + remote_len + == (local_len * remote_heads // local_heads) // block_size_ratio + ), ( + f"SPLIT region {i}: remote P KV block_len {remote_len} " + f"must equal local {local_len} * remote_heads " + f"{remote_heads} // local_heads {local_heads} " + f"// block_size_ratio {block_size_ratio}." + ) +""", + ), + ( + """ assert remote_len == local_len // (-tp_ratio), ( + f"SPLIT region {i}: remote P KV block_len " + f"{remote_len} must equal local {local_len} " + f"// |tp_ratio| {-tp_ratio}." + ) +""", + """ assert remote_len == local_len * remote_heads // local_heads, ( + f"SPLIT region {i}: remote P KV block_len {remote_len} " + f"must equal local {local_len} * remote_heads " + f"{remote_heads} // local_heads {local_heads}." + ) +""", + ), + ], + "minimax-m3-nixl-gqa", +) +PYEOF diff --git a/benchmarks/multi_node/srt-slurm-recipes/configs/srt-slurm-sanitize-node-ip.patch b/benchmarks/multi_node/srt-slurm-recipes/configs/srt-slurm-sanitize-node-ip.patch new file mode 100644 index 000000000..4c4ad7216 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/configs/srt-slurm-sanitize-node-ip.patch @@ -0,0 +1,17 @@ +diff --git a/src/srtctl/core/ip_utils/get_node_ip.sh b/src/srtctl/core/ip_utils/get_node_ip.sh +index 85d5e15..6f51321 100644 +--- a/src/srtctl/core/ip_utils/get_node_ip.sh ++++ b/src/srtctl/core/ip_utils/get_node_ip.sh +@@ -180,3 +180,5 @@ get_node_ip() { + local result +- result=$(srun --jobid $slurm_job_id --nodes=1 --ntasks=1 --nodelist=$node bash -c "$ip_script" 2>&1) ++ result=$(srun --quiet --jobid "$slurm_job_id" --nodes=1 --ntasks=1 --nodelist="$node" bash -c "$ip_script" 2>&1) + local rc=$? ++ local ip ++ ip=$(printf '%s\n' "$result" | grep -Eo '([0-9]{1,3}\.){3}[0-9]{1,3}' | tail -n 1) +@@ -184,3 +186,3 @@ get_node_ip() { +- if [ $rc -eq 0 ] && [ -n "$result" ]; then +- echo "$result" ++ if [ $rc -eq 0 ] && [ -n "$ip" ]; then ++ echo "$ip" + return 0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml index 8eb02bb25..a0e050f1d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml @@ -65,7 +65,7 @@ backend: max-model-len: 9472 language-model-only: true stream-interval: 32 - max-num-seqs: 1024 + max-num-seqs: 1024 # Per DP rank: 2 workers x DP8 = 16 ranks. max-num-batched-tokens: 16384 max-cudagraph-capture-size: 4096 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml index 9d095b7b6..6f765ab74 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml @@ -65,7 +65,7 @@ backend: max-model-len: 9472 language-model-only: true stream-interval: 32 - max-num-seqs: 512 + max-num-seqs: 512 # Per DP rank: 3 workers x DP4 = 12 ranks. max-num-batched-tokens: 16384 max-cudagraph-capture-size: 4096 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index ddb23efe9..a7c4be006 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3983,5 +3983,7 @@ - "Add TP8 decode variants with expert parallelism disabled and the Marlin MoE backend for selected low-concurrency 1k1k and 8k1k shapes." - "Patch the 0618 image's MiniMax M3 MSA prefill top-k slice to be contiguous before CSR construction." - "Align 8k1k expert-parallel settings with the 1k1k recipes and correct the decode CUDA graph capture limit." + - "Backport NVIDIA/srt-slurm#38 to sanitize Slurm node-IP discovery output on the pinned submission branch." + - "Backport vllm-project/vllm#45879 so NIXL validates heterogeneous-TP KV block lengths using the GQA KV-head ratio." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1863 evals-only: true diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 306012451..6fbf05783 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -56,6 +56,7 @@ fi echo "Cloning srt-slurm repository..." SRT_REPO_DIR="srt-slurm" SRTCTL_SETUP_SCRIPT="" +SRTCTL_HOST_PATCH="" if [ -d "$SRT_REPO_DIR" ]; then echo "Removing existing $SRT_REPO_DIR..." rm -rf "$SRT_REPO_DIR" @@ -89,7 +90,8 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" && $PRECISIO git checkout sa-submission-q2-2026 mkdir -p recipes/vllm/minimax-m3 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3" recipes/vllm/minimax-m3 - SRTCTL_SETUP_SCRIPT="minimax-m3-msa-contiguity.sh" + SRTCTL_SETUP_SCRIPT="minimax-m3-vllm-fixes.sh" + SRTCTL_HOST_PATCH="srt-slurm-sanitize-node-ip.patch" cp \ "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/configs/$SRTCTL_SETUP_SCRIPT" \ "configs/$SRTCTL_SETUP_SCRIPT" @@ -99,6 +101,19 @@ else git checkout sa-submission-q2-2026 fi +if [[ -n "$SRTCTL_HOST_PATCH" ]]; then + SRTCTL_HOST_PATCH_PATH="$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/configs/$SRTCTL_HOST_PATCH" + if git apply --unidiff-zero --check "$SRTCTL_HOST_PATCH_PATH" 2>/dev/null; then + git apply --unidiff-zero "$SRTCTL_HOST_PATCH_PATH" || exit 1 + echo "Applied host patch: $SRTCTL_HOST_PATCH" + elif git apply --unidiff-zero --reverse --check "$SRTCTL_HOST_PATCH_PATH" 2>/dev/null; then + echo "Host patch already applied: $SRTCTL_HOST_PATCH" + else + echo "Error: host patch does not apply cleanly: $SRTCTL_HOST_PATCH" >&2 + exit 1 + fi +fi + echo "Installing srtctl..." export UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" curl -LsSf https://astral.sh/uv/install.sh | sh From 2d408e4bf38c2b4a9bf398564cb393c3f2423480 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 20 Jun 2026 07:12:11 +0800 Subject: [PATCH 09/21] ci(sweep): enable full MiniMax M3 validation --- perf-changelog.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a7c4be006..f47081f85 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3986,4 +3986,3 @@ - "Backport NVIDIA/srt-slurm#38 to sanitize Slurm node-IP discovery output on the pinned submission branch." - "Backport vllm-project/vllm#45879 so NIXL validates heterogeneous-TP KV block lengths using the GQA KV-head ratio." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1863 - evals-only: true From 3956aee20d529ec10c72f40b6cced79ad3eaf2e3 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 20 Jun 2026 10:09:54 +0800 Subject: [PATCH 10/21] perf(vllm): right-size MiniMax M3 low concurrency --- .github/configs/nvidia-master.yaml | 12 ++++++------ ...-1k1k.yaml => 1p1d-dep2-tp4-marlin-1k1k.yaml} | 8 ++++---- ...-8k1k.yaml => 1p1d-dep2-tp4-marlin-8k1k.yaml} | 16 ++++++++-------- perf-changelog.yaml | 2 +- 4 files changed, 19 insertions(+), 19 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/{1p1d-dep2-tp8-marlin-1k1k.yaml => 1p1d-dep2-tp4-marlin-1k1k.yaml} (91%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/{5p2d-dep2-tp8-marlin-8k1k.yaml => 1p1d-dep2-tp4-marlin-8k1k.yaml} (87%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 9363133c5..78d147ba1 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12851,10 +12851,10 @@ minimaxm3-fp8-b300-dynamo-vllm: ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp4-marlin-1k1k.yaml" decode: num-worker: 1 - tp: 8 + tp: 4 ep: 1 dp-attn: false - conc-list: [2048] @@ -13031,15 +13031,15 @@ minimaxm3-fp8-b300-dynamo-vllm: dp-attn: false - conc-list: [1, 4, 8, 16] prefill: - num-worker: 5 + num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml" decode: - num-worker: 2 - tp: 8 + num-worker: 1 + tp: 4 ep: 1 dp-attn: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp4-marlin-1k1k.yaml similarity index 91% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp4-marlin-1k1k.yaml index f7ebaef0a..651eb5743 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp8-marlin-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp4-marlin-1k1k.yaml @@ -1,4 +1,4 @@ -name: "minimax-m3-vllm-disagg-b300-1p1d-fp8-dep2-tp8-marlin-1k1k" +name: "minimax-m3-vllm-disagg-b300-1p1d-fp8-dep2-tp4-marlin-1k1k" model: path: "MiniMaxAI/MiniMax-M3-MXFP8" @@ -13,7 +13,7 @@ resources: prefill_workers: 1 decode_workers: 1 gpus_per_prefill: 2 - gpus_per_decode: 8 + gpus_per_decode: 4 dynamo: install: true @@ -53,7 +53,7 @@ backend: max-num-batched-tokens: 2048 decode: - tensor-parallel-size: 8 + tensor-parallel-size: 4 enable-expert-parallel: false moe-backend: marlin trust-remote-code: true @@ -66,7 +66,7 @@ backend: stream-interval: 32 max-num-seqs: 4096 max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 8192 + max-cudagraph-capture-size: 2048 health_check: max_attempts: 360 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml similarity index 87% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml index 67c3eb242..f611292d2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tp8-marlin-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml @@ -1,4 +1,4 @@ -name: "minimax-m3-vllm-disagg-b300-5p2d-fp8-dep2-tp8-marlin-8k1k" +name: "minimax-m3-vllm-disagg-b300-1p1d-fp8-dep2-tp4-marlin-8k1k" model: path: "MiniMaxAI/MiniMax-M3-MXFP8" @@ -8,12 +8,12 @@ model: resources: gpu_type: "b300" gpus_per_node: 8 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 5 - decode_workers: 2 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 gpus_per_prefill: 2 - gpus_per_decode: 8 + gpus_per_decode: 4 dynamo: install: true @@ -53,7 +53,7 @@ backend: max-num-batched-tokens: 16384 decode: - tensor-parallel-size: 8 + tensor-parallel-size: 4 enable-expert-parallel: false moe-backend: marlin trust-remote-code: true @@ -66,7 +66,7 @@ backend: stream-interval: 32 max-num-seqs: 1024 max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 4096 + max-cudagraph-capture-size: 2048 health_check: max_attempts: 360 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index f47081f85..5989d9069 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3980,7 +3980,7 @@ description: - "Add MiniMax-M3 MXFP8 B300 disaggregated vLLM benchmarks via Dynamo for 1k1k and 8k1k STP." - "Add local srt-slurm recipes under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8 and wire the B300 launcher to overlay them into srt-slurm." - - "Add TP8 decode variants with expert parallelism disabled and the Marlin MoE backend for selected low-concurrency 1k1k and 8k1k shapes." + - "Add right-sized TP4 decode variants with expert parallelism disabled and the Marlin MoE backend for selected low-concurrency 1k1k and 8k1k shapes." - "Patch the 0618 image's MiniMax M3 MSA prefill top-k slice to be contiguous before CSR construction." - "Align 8k1k expert-parallel settings with the 1k1k recipes and correct the decode CUDA graph capture limit." - "Backport NVIDIA/srt-slurm#38 to sanitize Slurm node-IP discovery output on the pinned submission branch." From b99d3c9c8859a04521e384e1761b2691a4ced815 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 20 Jun 2026 16:28:01 +0800 Subject: [PATCH 11/21] perf(vllm): colocate MiniMax M3 TP4 workers --- .../minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp4-marlin-1k1k.yaml | 5 +++-- .../minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml | 5 +++-- perf-changelog.yaml | 1 + 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp4-marlin-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp4-marlin-1k1k.yaml index 651eb5743..49a60981e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp4-marlin-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp4-marlin-1k1k.yaml @@ -26,13 +26,14 @@ frontend: backend: type: vllm connector: null + allow_prefill_decode_colocation: true prefill_environment: - UCX_TLS: "cuda_copy,rc" + UCX_TLS: "cuda_ipc,cuda_copy,rc" VLLM_FLOAT32_MATMUL_PRECISION: "high" decode_environment: - UCX_TLS: "cuda_copy,rc" + UCX_TLS: "cuda_ipc,cuda_copy,rc" VLLM_FLOAT32_MATMUL_PRECISION: "high" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml index f611292d2..c98ad0b44 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml @@ -26,14 +26,15 @@ frontend: backend: type: vllm connector: null + allow_prefill_decode_colocation: true prefill_environment: VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" + UCX_TLS: "cuda_ipc,cuda_copy,rc" decode_environment: VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" + UCX_TLS: "cuda_ipc,cuda_copy,rc" vllm_config: prefill: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index cc4be8d26..87e85fe44 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3996,6 +3996,7 @@ - "Add MiniMax-M3 MXFP8 B300 disaggregated vLLM benchmarks via Dynamo for 1k1k and 8k1k STP." - "Add local srt-slurm recipes under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8 and wire the B300 launcher to overlay them into srt-slurm." - "Add right-sized TP4 decode variants with expert parallelism disabled and the Marlin MoE backend for selected low-concurrency 1k1k and 8k1k shapes." + - "Colocate the six-GPU TP4 prefill/decode pairs on one B300 node and enable CUDA IPC for NIXL KV transfer." - "Patch the 0618 image's MiniMax M3 MSA prefill top-k slice to be contiguous before CSR construction." - "Align 8k1k expert-parallel settings with the 1k1k recipes and correct the decode CUDA graph capture limit." - "Backport NVIDIA/srt-slurm#38 to sanitize Slurm node-IP discovery output on the pinned submission branch." From d2347aaba74074778f252f538e01d29fe68a33cb Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 20 Jun 2026 21:17:56 +0800 Subject: [PATCH 12/21] fix(runner): exclude faulty B300 RDMA node --- runners/launch_b300-nv.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 6fbf05783..fe75e2e34 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -3,6 +3,8 @@ # System-specific configuration for B300 NV Slurm cluster (sa-shared) SLURM_PARTITION="batch_1" SLURM_ACCOUNT="benchmark" +# b300-018 repeatedly times out UCX/NIXL transfers; allow an empty override to disable this. +MINIMAX_M3_SLURM_EXCLUDED_NODELIST="${MINIMAX_M3_SLURM_EXCLUDED_NODELIST-b300-018}" set -x @@ -190,6 +192,9 @@ fi # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" +if [[ "$MODEL_PREFIX" == "minimaxm3" && -n "$MINIMAX_M3_SLURM_EXCLUDED_NODELIST" ]]; then + sed -i "/^name:.*/a sbatch_directives:\n exclude: \"${MINIMAX_M3_SLURM_EXCLUDED_NODELIST}\"" "$CONFIG_FILE" +fi SRTCTL_APPLY_ARGS=( -f "$CONFIG_FILE" --tags "b300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" From 8ace2e9bbe6a2efe94e5824522e4b9b174765643 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 20 Jun 2026 21:43:03 +0800 Subject: [PATCH 13/21] fix(runner): verify B300 node exclusion --- runners/launch_b300-nv.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index fe75e2e34..dced04e30 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -215,6 +215,15 @@ if [ -z "$JOB_ID" ]; then exit 1 fi +if [[ "$MODEL_PREFIX" == "minimaxm3" && -n "$MINIMAX_M3_SLURM_EXCLUDED_NODELIST" ]]; then + SBATCH_SCRIPT="outputs/$JOB_ID/logs/sbatch_script.sh" + if ! grep -Fq "#SBATCH --exclude=${MINIMAX_M3_SLURM_EXCLUDED_NODELIST}" "$SBATCH_SCRIPT"; then + echo "Error: Slurm node exclusion was not rendered in $SBATCH_SCRIPT" >&2 + scancel "$JOB_ID" || true + exit 1 + fi +fi + echo "Extracted JOB_ID: $JOB_ID" # Use the JOB_ID to find the logs directory From 884ff12ec70a16a6718e818870bd0e8852da6468 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 20 Jun 2026 21:49:38 +0800 Subject: [PATCH 14/21] fix(runner): check generated B300 sbatch script --- runners/launch_b300-nv.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index dced04e30..77381e7ab 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -216,7 +216,7 @@ if [ -z "$JOB_ID" ]; then fi if [[ "$MODEL_PREFIX" == "minimaxm3" && -n "$MINIMAX_M3_SLURM_EXCLUDED_NODELIST" ]]; then - SBATCH_SCRIPT="outputs/$JOB_ID/logs/sbatch_script.sh" + SBATCH_SCRIPT="outputs/$JOB_ID/sbatch_script.sh" if ! grep -Fq "#SBATCH --exclude=${MINIMAX_M3_SLURM_EXCLUDED_NODELIST}" "$SBATCH_SCRIPT"; then echo "Error: Slurm node exclusion was not rendered in $SBATCH_SCRIPT" >&2 scancel "$JOB_ID" || true From 3ae240bb7503e594fbd4d97eaab0c88c53d2cc22 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sat, 20 Jun 2026 21:54:19 +0800 Subject: [PATCH 15/21] ci(sweep): validate B300 node exclusion --- perf-changelog.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 87e85fe44..25c886a0d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4002,3 +4002,9 @@ - "Backport NVIDIA/srt-slurm#38 to sanitize Slurm node-IP discovery output on the pinned submission branch." - "Backport vllm-project/vllm#45879 so NIXL validates heterogeneous-TP KV block lengths using the GQA KV-head ratio." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1863 + +- config-keys: + - minimaxm3-fp8-b300-dynamo-vllm + description: + - "Exclude b300-018 from MiniMax-M3 Slurm jobs after repeated UCX/NIXL RDMA timeouts." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1863 From 03d27e75ead3fb5e061b3742f69ebd0031c9daf3 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sun, 21 Jun 2026 08:30:38 +0800 Subject: [PATCH 16/21] refactor(vllm): trim MiniMax M3 runtime patches --- .../configs/minimax-m3-vllm-fixes.sh | 153 ++++-------------- .../configs/srt-slurm-sanitize-node-ip.patch | 17 -- runners/launch_b300-nv.sh | 17 +- 3 files changed, 33 insertions(+), 154 deletions(-) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/configs/srt-slurm-sanitize-node-ip.patch diff --git a/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-vllm-fixes.sh b/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-vllm-fixes.sh index 7f612b922..02862bba3 100755 --- a/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-vllm-fixes.sh +++ b/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-vllm-fixes.sh @@ -1,138 +1,47 @@ #!/usr/bin/env bash - set -euo pipefail -# Backport fixes merged after the 0618 MiniMax M3 image was built. python3 - <<'PYEOF' -import importlib.util -import pathlib - - -def apply_exact_patches( - target: pathlib.Path, - patches: list[tuple[str, str]], - label: str, -) -> None: - src = target.read_text() - changed = False - - for old, new in patches: - if new in src: - continue - if src.count(old) != 1: - raise RuntimeError(f"Expected exactly one {label} patch anchor in {target}") - src = src.replace(old, new, 1) - changed = True - - if changed: - target.write_text(src) - print(f"[{label}] patched: {target}") - else: - print(f"[{label}] already applied: {target}") - - -spec = importlib.util.find_spec("vllm") -if spec is None or not spec.submodule_search_locations: - raise RuntimeError("Could not locate the installed vllm package") - -vllm_root = pathlib.Path(next(iter(spec.submodule_search_locations))) - -# TP1 data-parallel-attention workers retain a non-contiguous head stride in -# the persistent CUDA-graph top-k buffer. Materialize the slice before the -# MiniMax M3 MSA CSR builder consumes it. -msa_target = ( - vllm_root - / "models" - / "minimax_m3" - / "nvidia" - / "sparse_attention_msa.py" -) -apply_exact_patches( - msa_target, - [ +from importlib.util import find_spec +from pathlib import Path + +spec = find_spec("vllm") +if not spec or not spec.origin: + raise RuntimeError("vllm is not installed") +root = Path(spec.origin).parent +patches = { + root / "models/minimax_m3/nvidia/sparse_attention_msa.py": [ ( " prefill_topk = topk[:, nd:num_tokens, :]\n", " prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n", - ) - ], - "minimax-m3-msa-contiguity", -) - -# vllm-project/vllm#45879: heterogeneous TP must validate SPLIT regions using -# per-rank KV heads. MiniMax M3 has four KV heads, so TP8 replicates one head -# per rank instead of scaling block length by the raw TP ratio of eight. -nixl_target = ( - vllm_root - / "distributed" - / "kv_transfer" - / "kv_connector" - / "v1" - / "nixl" - / "base_worker.py" -) -apply_exact_patches( - nixl_target, - [ - ( - """ # only allow the number of blocks to differ; SPLIT regions scale with - # tp_ratio. Mamba uses the ssm_sizes counterpart, so skip block_len here. -""", - """ # only allow the number of blocks to differ; SPLIT regions scale with - # the per-rank KV head ratio rather than the raw tp_ratio, because GQA - # replication caps per-rank heads at 1 when tp > total_kv_heads - # (issue #45330). Mamba uses the ssm_sizes counterpart, so skip here. -""", ), + ], + root / "distributed/kv_transfer/kv_connector/v1/nixl/base_worker.py": [ ( - """ model_replicated = self.use_mla or self.transfer_topo.is_kv_replicated( - remote_engine_id - ) - for i, local_len in enumerate(self.block_len_per_layer): -""", - """ model_replicated = self.use_mla or self.transfer_topo.is_kv_replicated( - remote_engine_id - ) - total_kv_heads = self.transfer_topo.total_num_kv_heads - local_heads = self.transfer_topo.local_physical_heads - remote_heads = max(1, total_kv_heads // remote_tp_size) - for i, local_len in enumerate(self.block_len_per_layer): -""", + " for i, local_len in enumerate(self.block_len_per_layer):\n", + " total_kv_heads = self.transfer_topo.total_num_kv_heads\n" + " local_heads = self.transfer_topo.local_physical_heads\n" + " remote_heads = max(1, total_kv_heads // remote_tp_size)\n" + " for i, local_len in enumerate(self.block_len_per_layer):\n", ), ( - """ elif tp_ratio > 0: - assert remote_len == (local_len * tp_ratio) // block_size_ratio, ( - f"SPLIT region {i}: remote P KV block_len {remote_len} " - f"must equal local {local_len} * tp_ratio {tp_ratio} " - f"// block_size_ratio {block_size_ratio}." - ) -""", - """ elif tp_ratio > 0: - assert ( - remote_len - == (local_len * remote_heads // local_heads) // block_size_ratio - ), ( - f"SPLIT region {i}: remote P KV block_len {remote_len} " - f"must equal local {local_len} * remote_heads " - f"{remote_heads} // local_heads {local_heads} " - f"// block_size_ratio {block_size_ratio}." - ) -""", + "remote_len == (local_len * tp_ratio) // block_size_ratio,", + "remote_len == (local_len * remote_heads // local_heads) " + "// block_size_ratio,", ), ( - """ assert remote_len == local_len // (-tp_ratio), ( - f"SPLIT region {i}: remote P KV block_len " - f"{remote_len} must equal local {local_len} " - f"// |tp_ratio| {-tp_ratio}." - ) -""", - """ assert remote_len == local_len * remote_heads // local_heads, ( - f"SPLIT region {i}: remote P KV block_len {remote_len} " - f"must equal local {local_len} * remote_heads " - f"{remote_heads} // local_heads {local_heads}." - ) -""", + "remote_len == local_len // (-tp_ratio),", + "remote_len == local_len * remote_heads // local_heads,", ), ], - "minimax-m3-nixl-gqa", -) +} +for path, edits in patches.items(): + source = path.read_text() + for old, new in edits: + if new in source: + continue + if source.count(old) != 1: + raise RuntimeError(f"missing or ambiguous patch anchor in {path}") + source = source.replace(old, new, 1) + path.write_text(source) PYEOF diff --git a/benchmarks/multi_node/srt-slurm-recipes/configs/srt-slurm-sanitize-node-ip.patch b/benchmarks/multi_node/srt-slurm-recipes/configs/srt-slurm-sanitize-node-ip.patch deleted file mode 100644 index 4c4ad7216..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/configs/srt-slurm-sanitize-node-ip.patch +++ /dev/null @@ -1,17 +0,0 @@ -diff --git a/src/srtctl/core/ip_utils/get_node_ip.sh b/src/srtctl/core/ip_utils/get_node_ip.sh -index 85d5e15..6f51321 100644 ---- a/src/srtctl/core/ip_utils/get_node_ip.sh -+++ b/src/srtctl/core/ip_utils/get_node_ip.sh -@@ -180,3 +180,5 @@ get_node_ip() { - local result -- result=$(srun --jobid $slurm_job_id --nodes=1 --ntasks=1 --nodelist=$node bash -c "$ip_script" 2>&1) -+ result=$(srun --quiet --jobid "$slurm_job_id" --nodes=1 --ntasks=1 --nodelist="$node" bash -c "$ip_script" 2>&1) - local rc=$? -+ local ip -+ ip=$(printf '%s\n' "$result" | grep -Eo '([0-9]{1,3}\.){3}[0-9]{1,3}' | tail -n 1) -@@ -184,3 +186,3 @@ get_node_ip() { -- if [ $rc -eq 0 ] && [ -n "$result" ]; then -- echo "$result" -+ if [ $rc -eq 0 ] && [ -n "$ip" ]; then -+ echo "$ip" - return 0 diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 77381e7ab..5b92d6d9c 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -58,7 +58,6 @@ fi echo "Cloning srt-slurm repository..." SRT_REPO_DIR="srt-slurm" SRTCTL_SETUP_SCRIPT="" -SRTCTL_HOST_PATCH="" if [ -d "$SRT_REPO_DIR" ]; then echo "Removing existing $SRT_REPO_DIR..." rm -rf "$SRT_REPO_DIR" @@ -93,7 +92,8 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" && $PRECISIO mkdir -p recipes/vllm/minimax-m3 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3" recipes/vllm/minimax-m3 SRTCTL_SETUP_SCRIPT="minimax-m3-vllm-fixes.sh" - SRTCTL_HOST_PATCH="srt-slurm-sanitize-node-ip.patch" + # NVIDIA/srt-slurm#38 + git show 22d46ba9971615016d2339c9ffbc7b4597accfad --format= -- src/srtctl/core/ip_utils/get_node_ip.sh | git apply - || exit 1 cp \ "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/configs/$SRTCTL_SETUP_SCRIPT" \ "configs/$SRTCTL_SETUP_SCRIPT" @@ -103,19 +103,6 @@ else git checkout sa-submission-q2-2026 fi -if [[ -n "$SRTCTL_HOST_PATCH" ]]; then - SRTCTL_HOST_PATCH_PATH="$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/configs/$SRTCTL_HOST_PATCH" - if git apply --unidiff-zero --check "$SRTCTL_HOST_PATCH_PATH" 2>/dev/null; then - git apply --unidiff-zero "$SRTCTL_HOST_PATCH_PATH" || exit 1 - echo "Applied host patch: $SRTCTL_HOST_PATCH" - elif git apply --unidiff-zero --reverse --check "$SRTCTL_HOST_PATCH_PATH" 2>/dev/null; then - echo "Host patch already applied: $SRTCTL_HOST_PATCH" - else - echo "Error: host patch does not apply cleanly: $SRTCTL_HOST_PATCH" >&2 - exit 1 - fi -fi - echo "Installing srtctl..." export UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" curl -LsSf https://astral.sh/uv/install.sh | sh From 37d5e2c04123aaded58f9dd19fee715e4a21fdf6 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Mon, 22 Jun 2026 13:45:45 -0700 Subject: [PATCH 17/21] Update MiniMax M3 B300 Dynamo vLLM recipes --- .github/configs/nvidia-master.yaml | 78 +++++++++++++++++ .../b300-fp8/8k1k/1p2d-dep2-tep4-8k1k.yaml | 83 +++++++++++++++++++ .../b300-fp8/8k1k/1p4d-dep2-tep4-8k1k.yaml | 83 +++++++++++++++++++ .../b300-fp8/8k1k/1p4d-dep2-tep8-8k1k.yaml | 83 +++++++++++++++++++ .../b300-fp8/8k1k/2p1d-dep2-tep4-8k1k.yaml | 83 +++++++++++++++++++ .../b300-fp8/8k1k/3p7d-dep2-tep4-8k1k.yaml | 83 +++++++++++++++++++ .../b300-fp8/8k1k/4p2d-dep2-tep4-8k1k.yaml | 83 +++++++++++++++++++ 7 files changed, 576 insertions(+) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-tep4-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep4-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep8-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p1d-dep2-tep4-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p7d-dep2-tep4-8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-tep4-8k1k.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 877d3c795..c1897bec7 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11721,6 +11721,84 @@ minimaxm3-fp8-b300-dynamo-vllm: tp: 4 ep: 1 dp-attn: false + - conc-list: [1024] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/2p1d-dep2-tep4-8k1k.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [4096] + prefill: + num-worker: 4 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-tep4-8k1k.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [256] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p7d-dep2-tep4-8k1k.yaml" + decode: + num-worker: 7 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [16, 32, 64, 128] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep4-8k1k.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [16] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-tep4-8k1k.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [4] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep8-8k1k.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false # MiniMax-M3 GB300 disagg sweep — adapted from NV B300 PR #1863. # All prefill DEP2 (TP1 DP2 EP, 2 GPU/worker). Decode: TP4+Marlin, TEP8, diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-tep4-8k1k.yaml new file mode 100644 index 000000000..e48310898 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-tep4-8k1k.yaml @@ -0,0 +1,83 @@ +name: "minimax-m3-vllm-disagg-b300-1p2d-fp8-dep2-tep4-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 4 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep4-8k1k.yaml new file mode 100644 index 000000000..30ac635a9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep4-8k1k.yaml @@ -0,0 +1,83 @@ +name: "minimax-m3-vllm-disagg-b300-1p4d-fp8-dep2-tep4-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 4 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "16x32x64x128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep8-8k1k.yaml new file mode 100644 index 000000000..46af72e46 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep8-8k1k.yaml @@ -0,0 +1,83 @@ +name: "minimax-m3-vllm-disagg-b300-1p4d-fp8-dep2-tep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p1d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p1d-dep2-tep4-8k1k.yaml new file mode 100644 index 000000000..a43b474da --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p1d-dep2-tep4-8k1k.yaml @@ -0,0 +1,83 @@ +name: "minimax-m3-vllm-disagg-b300-2p1d-fp8-dep2-tep4-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 4 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1024" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p7d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p7d-dep2-tep4-8k1k.yaml new file mode 100644 index 000000000..6a42998e3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p7d-dep2-tep4-8k1k.yaml @@ -0,0 +1,83 @@ +name: "minimax-m3-vllm-disagg-b300-3p7d-fp8-dep2-tep4-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 7 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 4 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-tep4-8k1k.yaml new file mode 100644 index 000000000..8c5deec60 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-tep4-8k1k.yaml @@ -0,0 +1,83 @@ +name: "minimax-m3-vllm-disagg-b300-4p2d-fp8-dep2-tep4-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 4 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 4 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" From adbe614489ea9af185c2b798977ffdad45aaef35 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Mon, 22 Jun 2026 16:45:41 -0700 Subject: [PATCH 18/21] fix --- .../vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-tep4-8k1k.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-tep4-8k1k.yaml index 8c5deec60..c9f29f785 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-tep4-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-tep4-8k1k.yaml @@ -67,7 +67,7 @@ backend: max-model-len: 9472 language-model-only: true stream-interval: 32 - max-num-seqs: 512 + max-num-seqs: 1024 max-num-batched-tokens: 16384 max-cudagraph-capture-size: 4096 From fe0eda575e6ce22c1b96bd227540a9b0af59f3a9 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Mon, 22 Jun 2026 17:11:11 -0700 Subject: [PATCH 19/21] update to flashinfer --- .../minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml | 4 ++++ .../vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml | 4 ++++ .../vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml | 4 ++++ .../vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml | 4 ++++ .../vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml | 4 ++++ .../vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml | 4 ++++ .../vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml | 4 ++++ .../vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml | 4 ++++ .../vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml | 4 ++++ 9 files changed, 36 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml index c98ad0b44..04aca6586 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml @@ -45,6 +45,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 @@ -60,6 +62,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml index bc4c449b2..16ac29090 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml @@ -44,6 +44,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 @@ -60,6 +62,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml index 7a8ddd1a1..b1558ae34 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml @@ -44,6 +44,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 @@ -60,6 +62,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml index d00fb046e..46aaa045d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml @@ -44,6 +44,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 @@ -58,6 +60,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml index cf8736e14..3fb21d7c8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml @@ -44,6 +44,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 @@ -60,6 +62,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml index 9572688b2..093e1450b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml @@ -44,6 +44,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 @@ -58,6 +60,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml index a0e050f1d..3756103ee 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml @@ -44,6 +44,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 @@ -60,6 +62,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml index 6f765ab74..d53df1c7c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml @@ -44,6 +44,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 @@ -60,6 +62,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml index d40a33582..1e4c245b7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml @@ -44,6 +44,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 @@ -58,6 +60,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 From 0a751a7465ea32023d9a836818d4a0dd380926e2 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Mon, 22 Jun 2026 18:00:18 -0700 Subject: [PATCH 20/21] prune non-pareto --- .../b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml | 85 ------------------- .../b300-fp8/8k1k/2p1d-dep2-tep4-8k1k.yaml | 83 ------------------ .../b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml | 85 ------------------- .../b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml | 83 ------------------ .../b300-fp8/8k1k/3p7d-dep2-tep4-8k1k.yaml | 83 ------------------ .../b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml | 85 ------------------- .../b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml | 83 ------------------ 7 files changed, 587 deletions(-) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p1d-dep2-tep4-8k1k.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p7d-dep2-tep4-8k1k.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml deleted file mode 100644 index 16ac29090..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml +++ /dev/null @@ -1,85 +0,0 @@ -name: "minimax-m3-vllm-disagg-b300-1p2d-fp8-dep2-dep8-8k1k" - -model: - path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" - precision: "fp8" - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 2 - gpus_per_prefill: 2 - gpus_per_decode: 8 - -dynamo: - install: true - version: 1.3.0.dev20260614 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - tensor-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' - kv-cache-dtype: fp8 - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - - decode: - tensor-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' - kv-cache-dtype: fp8 - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-num-seqs: 1024 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 4096 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "128" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p1d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p1d-dep2-tep4-8k1k.yaml deleted file mode 100644 index a43b474da..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p1d-dep2-tep4-8k1k.yaml +++ /dev/null @@ -1,83 +0,0 @@ -name: "minimax-m3-vllm-disagg-b300-2p1d-fp8-dep2-tep4-8k1k" - -model: - path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" - precision: "fp8" - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 2 - gpus_per_decode: 4 - -dynamo: - install: true - version: 1.3.0.dev20260614 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - tensor-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' - kv-cache-dtype: fp8 - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - - decode: - tensor-parallel-size: 4 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' - kv-cache-dtype: fp8 - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-num-seqs: 512 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 4096 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1024" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml deleted file mode 100644 index 3fb21d7c8..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml +++ /dev/null @@ -1,85 +0,0 @@ -name: "minimax-m3-vllm-disagg-b300-3p2d-fp8-dep2-dep8-8k1k" - -model: - path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" - precision: "fp8" - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 3 - decode_workers: 2 - gpus_per_prefill: 2 - gpus_per_decode: 8 - -dynamo: - install: true - version: 1.3.0.dev20260614 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - tensor-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' - kv-cache-dtype: fp8 - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - - decode: - tensor-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' - kv-cache-dtype: fp8 - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-num-seqs: 1024 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 4096 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "512" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml deleted file mode 100644 index 093e1450b..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml +++ /dev/null @@ -1,83 +0,0 @@ -name: "minimax-m3-vllm-disagg-b300-3p2d-fp8-dep2-tep8-8k1k" - -model: - path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" - precision: "fp8" - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 3 - decode_workers: 2 - gpus_per_prefill: 2 - gpus_per_decode: 8 - -dynamo: - install: true - version: 1.3.0.dev20260614 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - tensor-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' - kv-cache-dtype: fp8 - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - - decode: - tensor-parallel-size: 8 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' - kv-cache-dtype: fp8 - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-num-seqs: 1024 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 4096 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "32" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p7d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p7d-dep2-tep4-8k1k.yaml deleted file mode 100644 index 6a42998e3..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p7d-dep2-tep4-8k1k.yaml +++ /dev/null @@ -1,83 +0,0 @@ -name: "minimax-m3-vllm-disagg-b300-3p7d-fp8-dep2-tep4-8k1k" - -model: - path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" - precision: "fp8" - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 4 - prefill_workers: 3 - decode_workers: 7 - gpus_per_prefill: 2 - gpus_per_decode: 4 - -dynamo: - install: true - version: 1.3.0.dev20260614 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - tensor-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' - kv-cache-dtype: fp8 - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - - decode: - tensor-parallel-size: 4 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' - kv-cache-dtype: fp8 - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-num-seqs: 512 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 4096 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml deleted file mode 100644 index d53df1c7c..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml +++ /dev/null @@ -1,85 +0,0 @@ -name: "minimax-m3-vllm-disagg-b300-4p3d-fp8-dep2-dep4-8k1k" - -model: - path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" - precision: "fp8" - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 4 - decode_workers: 3 - gpus_per_prefill: 2 - gpus_per_decode: 4 - -dynamo: - install: true - version: 1.3.0.dev20260614 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - tensor-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' - kv-cache-dtype: fp8 - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - - decode: - tensor-parallel-size: 1 - data-parallel-size: 4 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' - kv-cache-dtype: fp8 - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-num-seqs: 512 # Per DP rank: 3 workers x DP4 = 12 ranks. - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 4096 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4096" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml deleted file mode 100644 index 1e4c245b7..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml +++ /dev/null @@ -1,83 +0,0 @@ -name: "minimax-m3-vllm-disagg-b300-5p2d-fp8-dep2-tep8-8k1k" - -model: - path: "MiniMaxAI/MiniMax-M3-MXFP8" - container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" - precision: "fp8" - -resources: - gpu_type: "b300" - gpus_per_node: 8 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 5 - decode_workers: 2 - gpus_per_prefill: 2 - gpus_per_decode: 8 - -dynamo: - install: true - version: 1.3.0.dev20260614 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - decode_environment: - VLLM_FLOAT32_MATMUL_PRECISION: high - UCX_TLS: "cuda_copy,rc" - - vllm_config: - prefill: - tensor-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' - kv-cache-dtype: fp8 - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - - decode: - tensor-parallel-size: 8 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' - kv-cache-dtype: fp8 - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - stream-interval: 32 - max-num-seqs: 1024 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 4096 - -health_check: - max_attempts: 360 - interval_seconds: 10 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x64" - req_rate: "inf" From b2e71c887f6746003e3dd2444c3d1818853108cd Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Mon, 22 Jun 2026 20:09:43 -0700 Subject: [PATCH 21/21] clean up nvidia-master --- .github/configs/nvidia-master.yaml | 91 ------------------------------ 1 file changed, 91 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 313ee520a..adadd5f17 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11712,19 +11712,6 @@ minimaxm3-fp8-b300-dynamo-vllm: - isl: 8192 osl: 1024 search-space: - - conc-list: [128] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - conc-list: [256, 512] prefill: num-worker: 2 @@ -11751,32 +11738,6 @@ minimaxm3-fp8-b300-dynamo-vllm: tp: 8 ep: 8 dp-attn: false - - conc-list: [512] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [32] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - conc-list: [4096] prefill: num-worker: 4 @@ -11790,32 +11751,6 @@ minimaxm3-fp8-b300-dynamo-vllm: tp: 8 ep: 8 dp-attn: true - - conc-list: [4096] - prefill: - num-worker: 4 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml" - decode: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: true - - conc-list: [4, 64] - prefill: - num-worker: 5 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - conc-list: [1, 4, 8, 16] prefill: num-worker: 1 @@ -11829,19 +11764,6 @@ minimaxm3-fp8-b300-dynamo-vllm: tp: 4 ep: 1 dp-attn: false - - conc-list: [1024] - prefill: - num-worker: 2 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/2p1d-dep2-tep4-8k1k.yaml" - decode: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: false - conc-list: [4096] prefill: num-worker: 4 @@ -11855,19 +11777,6 @@ minimaxm3-fp8-b300-dynamo-vllm: tp: 4 ep: 4 dp-attn: false - - conc-list: [256] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p7d-dep2-tep4-8k1k.yaml" - decode: - num-worker: 7 - tp: 4 - ep: 4 - dp-attn: false - conc-list: [16, 32, 64, 128] prefill: num-worker: 1