From 8aa42437cc33dd438576d752965d40d14faff801 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 18 Jun 2026 16:41:34 -0500 Subject: [PATCH 1/5] chore: update MiniMax M3 Blackwell image --- .github/configs/nvidia-master.yaml | 12 ++++++------ .../single_node/fixed_seq_len/minimaxm3_fp8_b200.sh | 5 ++++- .../fixed_seq_len/minimaxm3_fp8_b200_mtp.sh | 5 ++++- .../single_node/fixed_seq_len/minimaxm3_fp8_b300.sh | 5 ++++- .../fixed_seq_len/minimaxm3_fp8_b300_mtp.sh | 5 ++++- 5 files changed, 22 insertions(+), 10 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 521ba6636..ed0df708e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12908,12 +12908,12 @@ qwen3.5-fp4-b200-trt: # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor # cores on Blackwell. M3 support has not shipped in a stable vLLM release; -# the dedicated vllm/vllm-openai:minimax-m3 image is built from the m3_release +# the dedicated vllm/vllm-openai:minimax-m3-0618 image is built from the m3_release # branch (vllm-project/vllm#45381). --block-size 128 is mandatory (MSA # sparse/index cache alignment). Weights are NOT SRE-staged; b300 falls back # to writable /data/models (see launch_b300-nv.sh MODEL_PATH split). minimaxm3-fp8-b300-vllm: - image: vllm/vllm-openai:minimax-m3 + image: vllm/vllm-openai:minimax-m3-0618 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: b300 @@ -12948,13 +12948,13 @@ minimaxm3-fp8-b300-vllm: # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor # cores on Blackwell. M3 support has not shipped in a stable vLLM release; -# the dedicated vllm/vllm-openai:minimax-m3 image is built from the m3_release +# the dedicated vllm/vllm-openai:minimax-m3-0618 image is built from the m3_release # branch (vllm-project/vllm#45381). --block-size 128 is mandatory (MSA # sparse/index cache alignment). Weights are NOT SRE-staged: b200-dgxc reads # /lustre/fsw/gharunners/models/MiniMax-M3-MXFP8 (pre-downloaded, see # launch_b200-dgxc.sh). minimaxm3-fp8-b200-vllm: - image: vllm/vllm-openai:minimax-m3 + image: vllm/vllm-openai:minimax-m3-0618 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: b200-dgxc @@ -12991,7 +12991,7 @@ minimaxm3-fp8-b200-vllm: # precedent: spec decode pays off at low/mid concurrency while acceptance # dilutes in big batches, and the draft weights + draft KV shave headroom. minimaxm3-fp8-b200-vllm-mtp: - image: vllm/vllm-openai:minimax-m3 + image: vllm/vllm-openai:minimax-m3-0618 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: b200-dgxc @@ -13028,7 +13028,7 @@ minimaxm3-fp8-b200-vllm-mtp: # big batches, and the draft weights + draft KV shave headroom — tp2-ep2 is # dropped entirely since its KV headroom was already thin without a draft. minimaxm3-fp8-b300-vllm-mtp: - image: vllm/vllm-openai:minimax-m3 + image: vllm/vllm-openai:minimax-m3-0618 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: b300 diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200.sh index 16041a2ea..9c743db95 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200.sh @@ -45,7 +45,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then elif [ "$EP_SIZE" -gt 1 ]; then PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel" else - PARALLEL_ARGS="--tensor-parallel-size=$TP --moe-backend marlin" + PARALLEL_ARGS="--tensor-parallel-size=$TP" fi if [ "${EVAL_ONLY}" = "true" ]; then @@ -61,6 +61,9 @@ $PARALLEL_ARGS \ --gpu-memory-utilization 0.90 \ --max-model-len $MAX_MODEL_LEN \ --block-size 128 \ +--attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \ +--attention-config.indexer_kv_dtype fp8 \ +--kv-cache-dtype fp8 \ --language-model-only \ --max-cudagraph-capture-size 2048 \ --max-num-batched-tokens "$((ISL * 2 ))" \ diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh index 54aabd342..c45b3c1ff 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh @@ -68,7 +68,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then elif [ "$EP_SIZE" -gt 1 ]; then PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel" else - PARALLEL_ARGS="--tensor-parallel-size=$TP --moe-backend marlin" + PARALLEL_ARGS="--tensor-parallel-size=$TP" fi # use 3 speculative tokens for all configs for now @@ -87,6 +87,9 @@ $PARALLEL_ARGS \ --gpu-memory-utilization 0.90 \ --max-model-len $MAX_MODEL_LEN \ --block-size 128 \ +--attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \ +--attention-config.indexer_kv_dtype fp8 \ +--kv-cache-dtype fp8 \ --language-model-only \ --max-cudagraph-capture-size 2048 \ --max-num-batched-tokens "$((ISL * 2 ))" \ diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh index 1f4187c04..4dd6a8ae8 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh @@ -53,7 +53,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then elif [ "$EP_SIZE" -gt 1 ]; then PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel" else - PARALLEL_ARGS="--tensor-parallel-size=$TP --moe-backend marlin" + PARALLEL_ARGS="--tensor-parallel-size=$TP" fi if [ "${EVAL_ONLY}" = "true" ]; then @@ -69,6 +69,9 @@ $PARALLEL_ARGS \ --gpu-memory-utilization 0.90 \ --max-model-len $MAX_MODEL_LEN \ --block-size 128 \ +--attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \ +--attention-config.indexer_kv_dtype fp8 \ +--kv-cache-dtype fp8 \ --language-model-only \ --max-cudagraph-capture-size 2048 \ --max-num-batched-tokens "$((ISL * 2 ))" \ diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh index 742933deb..3f5d8a6e7 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh @@ -69,7 +69,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then elif [ "$EP_SIZE" -gt 1 ]; then PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel" else - PARALLEL_ARGS="--tensor-parallel-size=$TP --moe-backend marlin" + PARALLEL_ARGS="--tensor-parallel-size=$TP" fi # use 3 speculative tokens for all configs for now @@ -88,6 +88,9 @@ $PARALLEL_ARGS \ --gpu-memory-utilization 0.90 \ --max-model-len $MAX_MODEL_LEN \ --block-size 128 \ +--attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \ +--attention-config.indexer_kv_dtype fp8 \ +--kv-cache-dtype fp8 \ --language-model-only \ --max-cudagraph-capture-size 2048 \ --max-num-batched-tokens "$((ISL * 2 ))" \ From 815e523d33e66ac1337b846b34407b739e3d9549 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 18 Jun 2026 16:42:01 -0500 Subject: [PATCH 2/5] chore: register MiniMax M3 image bump --- perf-changelog.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 06a81eaf1..8cddf8d41 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3950,3 +3950,14 @@ - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)" - "Update Applied TBO on high concurrencies" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717 + +- config-keys: + - minimaxm3-fp8-b200-vllm + - minimaxm3-fp8-b300-vllm + - minimaxm3-fp8-b200-vllm-mtp + - minimaxm3-fp8-b300-vllm-mtp + description: + - "Update the MiniMax-M3 B200/B300 single-node image to vllm/vllm-openai:minimax-m3-0618." + - "Use FlashInfer with TRT-LLM attention, FP8 indexer KV, and an FP8 KV cache." + - "Remove the explicit Marlin MoE backend override; the new image requires no additional MoE backend argument." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1831 From bb19111b4d5ae6d18c1da81a60e8979a3ec5468c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 18 Jun 2026 16:45:03 -0500 Subject: [PATCH 3/5] fix: use Triton attention for MiniMax M3 EAGLE3 --- .github/configs/nvidia-master.yaml | 3 +-- .../single_node/fixed_seq_len/minimaxm3_fp8_b200.sh | 2 +- .../fixed_seq_len/minimaxm3_fp8_b200_mtp.sh | 12 ++++-------- .../single_node/fixed_seq_len/minimaxm3_fp8_b300.sh | 2 +- .../fixed_seq_len/minimaxm3_fp8_b300_mtp.sh | 11 ++++------- perf-changelog.yaml | 1 + 6 files changed, 12 insertions(+), 19 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index ed0df708e..18fda26d2 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12985,8 +12985,7 @@ minimaxm3-fp8-b200-vllm: # EAGLE3 speculative-decoding (spec-decoding: mtp) variant of # minimaxm3-fp8-b200-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the # Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens, drafter pinned -# to FLASH_ATTN since the head is MHA and FlashInfer needs GQA/MQA at page -# size 128). Search space mirrors the non-MTP entry trimmed at the +# to TRITON_ATTN). Search space mirrors the non-MTP entry trimmed at the # extreme-concurrency end, identical to the minimaxm3-fp8-b300-vllm-mtp # precedent: spec decode pays off at low/mid concurrency while acceptance # dilutes in big batches, and the draft weights + draft KV shave headroom. diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200.sh index 9c743db95..9c901a9bd 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200.sh @@ -62,7 +62,7 @@ $PARALLEL_ARGS \ --max-model-len $MAX_MODEL_LEN \ --block-size 128 \ --attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \ ---attention-config.indexer_kv_dtype fp8 \ +--attention-config.indexer_kv_dtype "fp8" \ --kv-cache-dtype fp8 \ --language-model-only \ --max-cudagraph-capture-size 2048 \ diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh index c45b3c1ff..e104cf756 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh @@ -10,12 +10,8 @@ # encoder for the text-only benchmark. dp-attn=true maps to DP×EP (DEP); # ep>1 maps to TP+EP (TEP). # -# The drafter is pinned to FLASH_ATTN: the EAGLE3 head is MHA, and FlashInfer -# only supports page size 128 through its trtllm-gen kernel, which requires -# GQA/MQA — engine init dies in FlashInferMetadataBuilder otherwise (the -# failure hit on the B300 MTP canary). The target keeps its default -# (FlashInfer) backend; FLASH_ATTN takes any multiple-of-16 block size, so -# the mandatory 128 is fine for the draft. +# The target uses the FlashInfer TRT-LLM attention path. The EAGLE3 drafter is +# pinned separately to TRITON_ATTN. source "$(dirname "$0")/../../benchmark_lib.sh" @@ -88,12 +84,12 @@ $PARALLEL_ARGS \ --max-model-len $MAX_MODEL_LEN \ --block-size 128 \ --attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \ ---attention-config.indexer_kv_dtype fp8 \ +--attention-config.indexer_kv_dtype "fp8" \ --kv-cache-dtype fp8 \ --language-model-only \ --max-cudagraph-capture-size 2048 \ --max-num-batched-tokens "$((ISL * 2 ))" \ ---speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"FLASH_ATTN\"}" \ +--speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"TRITON_ATTN\"}" \ --stream-interval 20 --no-enable-prefix-caching \ --trust-remote-code > $SERVER_LOG 2>&1 & diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh index 4dd6a8ae8..d3eaf6076 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh @@ -70,7 +70,7 @@ $PARALLEL_ARGS \ --max-model-len $MAX_MODEL_LEN \ --block-size 128 \ --attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \ ---attention-config.indexer_kv_dtype fp8 \ +--attention-config.indexer_kv_dtype "fp8" \ --kv-cache-dtype fp8 \ --language-model-only \ --max-cudagraph-capture-size 2048 \ diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh index 3f5d8a6e7..4d2dcda54 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh @@ -8,11 +8,8 @@ # --block-size 128 is mandatory (MSA sparse/index cache); the benchmark is # text-only, so --language-model-only frees the vision encoder's VRAM. # -# The drafter is pinned to FLASH_ATTN: the EAGLE3 head is MHA, and FlashInfer -# only supports page size 128 through its trtllm-gen kernel, which requires -# GQA/MQA — engine init dies in FlashInferMetadataBuilder otherwise. The -# target keeps its default (FlashInfer) backend; FLASH_ATTN takes any -# multiple-of-16 block size, so the mandatory 128 is fine for the draft. +# The target uses the FlashInfer TRT-LLM attention path. The EAGLE3 drafter is +# pinned separately to TRITON_ATTN. source "$(dirname "$0")/../../benchmark_lib.sh" @@ -89,12 +86,12 @@ $PARALLEL_ARGS \ --max-model-len $MAX_MODEL_LEN \ --block-size 128 \ --attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \ ---attention-config.indexer_kv_dtype fp8 \ +--attention-config.indexer_kv_dtype "fp8" \ --kv-cache-dtype fp8 \ --language-model-only \ --max-cudagraph-capture-size 2048 \ --max-num-batched-tokens "$((ISL * 2 ))" \ ---speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"FLASH_ATTN\"}" \ +--speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"TRITON_ATTN\"}" \ --stream-interval 20 --no-enable-prefix-caching \ --trust-remote-code > $SERVER_LOG 2>&1 & diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 8cddf8d41..d433a84f0 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3959,5 +3959,6 @@ description: - "Update the MiniMax-M3 B200/B300 single-node image to vllm/vllm-openai:minimax-m3-0618." - "Use FlashInfer with TRT-LLM attention, FP8 indexer KV, and an FP8 KV cache." + - "Use TRITON_ATTN for the EAGLE3 drafter attention backend." - "Remove the explicit Marlin MoE backend override; the new image requires no additional MoE backend argument." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1831 From 3a7f68187859264509e203cde2debff0be44982d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 18 Jun 2026 16:45:22 -0500 Subject: [PATCH 4/5] docs: refine MiniMax M3 changelog --- perf-changelog.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d433a84f0..4c126edb3 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3958,7 +3958,6 @@ - minimaxm3-fp8-b300-vllm-mtp description: - "Update the MiniMax-M3 B200/B300 single-node image to vllm/vllm-openai:minimax-m3-0618." - - "Use FlashInfer with TRT-LLM attention, FP8 indexer KV, and an FP8 KV cache." - - "Use TRITON_ATTN for the EAGLE3 drafter attention backend." - - "Remove the explicit Marlin MoE backend override; the new image requires no additional MoE backend argument." + - "Enable FlashInfer TRT-LLM attention with FP8 indexer KV and KV cache; use TRITON_ATTN for the EAGLE3 drafter." + - "Remove the explicit Marlin MoE override because the new image requires no additional MoE backend argument." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1831 From 9e9fe94aec656359cd56df12cb038dd0e183b04f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 18 Jun 2026 16:47:14 -0500 Subject: [PATCH 5/5] docs: clarify MiniMax M3 MoE backend --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 4c126edb3..6937e62dc 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3959,5 +3959,5 @@ description: - "Update the MiniMax-M3 B200/B300 single-node image to vllm/vllm-openai:minimax-m3-0618." - "Enable FlashInfer TRT-LLM attention with FP8 indexer KV and KV cache; use TRITON_ATTN for the EAGLE3 drafter." - - "Remove the explicit Marlin MoE override because the new image requires no additional MoE backend argument." + - "Switch TP-only configurations from explicit Marlin MoE to the new image's default FlashInfer TRT-LLM MoE backend." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1831