From 8aa42437cc33dd438576d752965d40d14faff801 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 18 Jun 2026 16:41:34 -0500
Subject: [PATCH 1/5] chore: update MiniMax M3 Blackwell image and serving flags

---
 .github/configs/nvidia-master.yaml                   | 12 ++++++------
 .../single_node/fixed_seq_len/minimaxm3_fp8_b200.sh  |  5 ++++-
 .../fixed_seq_len/minimaxm3_fp8_b200_mtp.sh          |  5 ++++-
 .../single_node/fixed_seq_len/minimaxm3_fp8_b300.sh  |  5 ++++-
 .../fixed_seq_len/minimaxm3_fp8_b300_mtp.sh          |  5 ++++-
 5 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 521ba6636..ed0df708e 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -12908,12 +12908,12 @@ qwen3.5-fp4-b200-trt:
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
 # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor
 # cores on Blackwell. M3 support has not shipped in a stable vLLM release;
-# the dedicated vllm/vllm-openai:minimax-m3 image is built from the m3_release
+# the dedicated vllm/vllm-openai:minimax-m3-0618 image is built from the m3_release
 # branch (vllm-project/vllm#45381). --block-size 128 is mandatory (MSA
 # sparse/index cache alignment). Weights are NOT SRE-staged; b300 falls back
 # to writable /data/models (see launch_b300-nv.sh MODEL_PATH split).
 minimaxm3-fp8-b300-vllm:
-  image: vllm/vllm-openai:minimax-m3
+  image: vllm/vllm-openai:minimax-m3-0618
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: b300
@@ -12948,13 +12948,13 @@ minimaxm3-fp8-b300-vllm:
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
 # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor
 # cores on Blackwell. M3 support has not shipped in a stable vLLM release;
-# the dedicated vllm/vllm-openai:minimax-m3 image is built from the m3_release
+# the dedicated vllm/vllm-openai:minimax-m3-0618 image is built from the m3_release
 # branch (vllm-project/vllm#45381). --block-size 128 is mandatory (MSA
 # sparse/index cache alignment). Weights are NOT SRE-staged: b200-dgxc reads
 # /lustre/fsw/gharunners/models/MiniMax-M3-MXFP8 (pre-downloaded, see
 # launch_b200-dgxc.sh).
 minimaxm3-fp8-b200-vllm:
-  image: vllm/vllm-openai:minimax-m3
+  image: vllm/vllm-openai:minimax-m3-0618
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: b200-dgxc
@@ -12991,7 +12991,7 @@ minimaxm3-fp8-b200-vllm:
 # precedent: spec decode pays off at low/mid concurrency while acceptance
 # dilutes in big batches, and the draft weights + draft KV shave headroom.
 minimaxm3-fp8-b200-vllm-mtp:
-  image: vllm/vllm-openai:minimax-m3
+  image: vllm/vllm-openai:minimax-m3-0618
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: b200-dgxc
@@ -13028,7 +13028,7 @@ minimaxm3-fp8-b200-vllm-mtp:
 # big batches, and the draft weights + draft KV shave headroom — tp2-ep2 is
 # dropped entirely since its KV headroom was already thin without a draft.
 minimaxm3-fp8-b300-vllm-mtp:
-  image: vllm/vllm-openai:minimax-m3
+  image: vllm/vllm-openai:minimax-m3-0618
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: b300
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200.sh
index 16041a2ea..9c743db95 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200.sh
@@ -45,7 +45,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
 elif [ "$EP_SIZE" -gt 1 ]; then
   PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel"
 else
-  PARALLEL_ARGS="--tensor-parallel-size=$TP --moe-backend marlin"
+  PARALLEL_ARGS="--tensor-parallel-size=$TP"
 fi
 
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -61,6 +61,9 @@ $PARALLEL_ARGS \
 --gpu-memory-utilization 0.90 \
 --max-model-len $MAX_MODEL_LEN \
 --block-size 128 \
+--attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \
+--attention-config.indexer_kv_dtype fp8 \
+--kv-cache-dtype fp8 \
 --language-model-only \
 --max-cudagraph-capture-size 2048 \
 --max-num-batched-tokens "$((ISL * 2 ))" \
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh
index 54aabd342..c45b3c1ff 100644
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh
@@ -68,7 +68,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
 elif [ "$EP_SIZE" -gt 1 ]; then
   PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel"
 else
-  PARALLEL_ARGS="--tensor-parallel-size=$TP --moe-backend marlin"
+  PARALLEL_ARGS="--tensor-parallel-size=$TP"
 fi
 
 # use 3 speculative tokens for all configs for now
@@ -87,6 +87,9 @@ $PARALLEL_ARGS \
 --gpu-memory-utilization 0.90 \
 --max-model-len $MAX_MODEL_LEN \
 --block-size 128 \
+--attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \
+--attention-config.indexer_kv_dtype fp8 \
+--kv-cache-dtype fp8 \
 --language-model-only \
 --max-cudagraph-capture-size 2048 \
 --max-num-batched-tokens "$((ISL * 2 ))" \
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh
index 1f4187c04..4dd6a8ae8 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh
@@ -53,7 +53,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
 elif [ "$EP_SIZE" -gt 1 ]; then
   PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel"
 else
-  PARALLEL_ARGS="--tensor-parallel-size=$TP --moe-backend marlin"
+  PARALLEL_ARGS="--tensor-parallel-size=$TP"
 fi
 
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -69,6 +69,9 @@ $PARALLEL_ARGS \
 --gpu-memory-utilization 0.90 \
 --max-model-len $MAX_MODEL_LEN \
 --block-size 128 \
+--attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \
+--attention-config.indexer_kv_dtype fp8 \
+--kv-cache-dtype fp8 \
 --language-model-only \
 --max-cudagraph-capture-size 2048 \
 --max-num-batched-tokens "$((ISL * 2 ))" \
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh
index 742933deb..3f5d8a6e7 100644
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh
@@ -69,7 +69,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
 elif [ "$EP_SIZE" -gt 1 ]; then
   PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel"
 else
-  PARALLEL_ARGS="--tensor-parallel-size=$TP --moe-backend marlin"
+  PARALLEL_ARGS="--tensor-parallel-size=$TP"
 fi
 
 # use 3 speculative tokens for all configs for now
@@ -88,6 +88,9 @@ $PARALLEL_ARGS \
 --gpu-memory-utilization 0.90 \
 --max-model-len $MAX_MODEL_LEN \
 --block-size 128 \
+--attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \
+--attention-config.indexer_kv_dtype fp8 \
+--kv-cache-dtype fp8 \
 --language-model-only \
 --max-cudagraph-capture-size 2048 \
 --max-num-batched-tokens "$((ISL * 2 ))" \

From 815e523d33e66ac1337b846b34407b739e3d9549 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 18 Jun 2026 16:42:01 -0500
Subject: [PATCH 2/5] chore: register MiniMax M3 image bump

---
 perf-changelog.yaml | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 06a81eaf1..8cddf8d41 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3950,3 +3950,14 @@
     - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)"
     - "Update Applied TBO on high concurrencies"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717
+
+- config-keys:
+    - minimaxm3-fp8-b200-vllm
+    - minimaxm3-fp8-b300-vllm
+    - minimaxm3-fp8-b200-vllm-mtp
+    - minimaxm3-fp8-b300-vllm-mtp
+  description:
+    - "Update the MiniMax-M3 B200/B300 single-node image to vllm/vllm-openai:minimax-m3-0618."
+    - "Use FlashInfer with TRT-LLM attention, FP8 indexer KV, and an FP8 KV cache."
+    - "Remove the explicit Marlin MoE backend override; the new image requires no additional MoE backend argument."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1831

From bb19111b4d5ae6d18c1da81a60e8979a3ec5468c Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 18 Jun 2026 16:45:03 -0500
Subject: [PATCH 3/5] fix: use Triton attention for MiniMax M3 EAGLE3

---
 .github/configs/nvidia-master.yaml                   |  3 +--
 .../single_node/fixed_seq_len/minimaxm3_fp8_b200.sh  |  2 +-
 .../fixed_seq_len/minimaxm3_fp8_b200_mtp.sh          | 12 ++++--------
 .../single_node/fixed_seq_len/minimaxm3_fp8_b300.sh  |  2 +-
 .../fixed_seq_len/minimaxm3_fp8_b300_mtp.sh          | 11 ++++-------
 perf-changelog.yaml                                  |  1 +
 6 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index ed0df708e..18fda26d2 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -12985,8 +12985,7 @@ minimaxm3-fp8-b200-vllm:
 # EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
 # minimaxm3-fp8-b200-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
 # Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens, drafter pinned
-# to FLASH_ATTN since the head is MHA and FlashInfer needs GQA/MQA at page
-# size 128). Search space mirrors the non-MTP entry trimmed at the
+# to TRITON_ATTN). Search space mirrors the non-MTP entry trimmed at the
 # extreme-concurrency end, identical to the minimaxm3-fp8-b300-vllm-mtp
 # precedent: spec decode pays off at low/mid concurrency while acceptance
 # dilutes in big batches, and the draft weights + draft KV shave headroom.
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200.sh
index 9c743db95..9c901a9bd 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200.sh
@@ -62,7 +62,7 @@ $PARALLEL_ARGS \
 --max-model-len $MAX_MODEL_LEN \
 --block-size 128 \
 --attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \
---attention-config.indexer_kv_dtype fp8 \
+--attention-config.indexer_kv_dtype "fp8" \
 --kv-cache-dtype fp8 \
 --language-model-only \
 --max-cudagraph-capture-size 2048 \
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh
index c45b3c1ff..e104cf756 100644
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh
@@ -10,12 +10,8 @@
 # encoder for the text-only benchmark. dp-attn=true maps to DP×EP (DEP);
 # ep>1 maps to TP+EP (TEP).
 #
-# The drafter is pinned to FLASH_ATTN: the EAGLE3 head is MHA, and FlashInfer
-# only supports page size 128 through its trtllm-gen kernel, which requires
-# GQA/MQA — engine init dies in FlashInferMetadataBuilder otherwise (the
-# failure hit on the B300 MTP canary). The target keeps its default
-# (FlashInfer) backend; FLASH_ATTN takes any multiple-of-16 block size, so
-# the mandatory 128 is fine for the draft.
+# The target uses the FlashInfer TRT-LLM attention path, which at page size 128
+# needs GQA/MQA; the EAGLE3 drafter head is MHA, so it is pinned to TRITON_ATTN.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
@@ -88,12 +84,12 @@ $PARALLEL_ARGS \
 --max-model-len $MAX_MODEL_LEN \
 --block-size 128 \
 --attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \
---attention-config.indexer_kv_dtype fp8 \
+--attention-config.indexer_kv_dtype "fp8" \
 --kv-cache-dtype fp8 \
 --language-model-only \
 --max-cudagraph-capture-size 2048 \
 --max-num-batched-tokens "$((ISL * 2 ))" \
---speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"FLASH_ATTN\"}" \
+--speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"TRITON_ATTN\"}" \
 --stream-interval 20 --no-enable-prefix-caching \
 --trust-remote-code > $SERVER_LOG 2>&1 &
 
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh
index 4dd6a8ae8..d3eaf6076 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh
@@ -70,7 +70,7 @@ $PARALLEL_ARGS \
 --max-model-len $MAX_MODEL_LEN \
 --block-size 128 \
 --attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \
---attention-config.indexer_kv_dtype fp8 \
+--attention-config.indexer_kv_dtype "fp8" \
 --kv-cache-dtype fp8 \
 --language-model-only \
 --max-cudagraph-capture-size 2048 \
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh
index 3f5d8a6e7..4d2dcda54 100644
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh
@@ -8,11 +8,8 @@
 # --block-size 128 is mandatory (MSA sparse/index cache); the benchmark is
 # text-only, so --language-model-only frees the vision encoder's VRAM.
 #
-# The drafter is pinned to FLASH_ATTN: the EAGLE3 head is MHA, and FlashInfer
-# only supports page size 128 through its trtllm-gen kernel, which requires
-# GQA/MQA — engine init dies in FlashInferMetadataBuilder otherwise. The
-# target keeps its default (FlashInfer) backend; FLASH_ATTN takes any
-# multiple-of-16 block size, so the mandatory 128 is fine for the draft.
+# The target uses the FlashInfer TRT-LLM attention path, which at page size 128
+# needs GQA/MQA; the EAGLE3 drafter head is MHA, so it is pinned to TRITON_ATTN.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
@@ -89,12 +86,12 @@ $PARALLEL_ARGS \
 --max-model-len $MAX_MODEL_LEN \
 --block-size 128 \
 --attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \
---attention-config.indexer_kv_dtype fp8 \
+--attention-config.indexer_kv_dtype "fp8" \
 --kv-cache-dtype fp8 \
 --language-model-only \
 --max-cudagraph-capture-size 2048 \
 --max-num-batched-tokens "$((ISL * 2 ))" \
---speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"FLASH_ATTN\"}" \
+--speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"TRITON_ATTN\"}" \
 --stream-interval 20 --no-enable-prefix-caching \
 --trust-remote-code > $SERVER_LOG 2>&1 &
 
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 8cddf8d41..d433a84f0 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3959,5 +3959,6 @@
   description:
     - "Update the MiniMax-M3 B200/B300 single-node image to vllm/vllm-openai:minimax-m3-0618."
     - "Use FlashInfer with TRT-LLM attention, FP8 indexer KV, and an FP8 KV cache."
+    - "Use TRITON_ATTN for the EAGLE3 drafter attention backend."
     - "Remove the explicit Marlin MoE backend override; the new image requires no additional MoE backend argument."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1831

From 3a7f68187859264509e203cde2debff0be44982d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 18 Jun 2026 16:45:22 -0500
Subject: [PATCH 4/5] docs: refine MiniMax M3 changelog

---
 perf-changelog.yaml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index d433a84f0..4c126edb3 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3958,7 +3958,6 @@
     - minimaxm3-fp8-b300-vllm-mtp
   description:
     - "Update the MiniMax-M3 B200/B300 single-node image to vllm/vllm-openai:minimax-m3-0618."
-    - "Use FlashInfer with TRT-LLM attention, FP8 indexer KV, and an FP8 KV cache."
-    - "Use TRITON_ATTN for the EAGLE3 drafter attention backend."
-    - "Remove the explicit Marlin MoE backend override; the new image requires no additional MoE backend argument."
+    - "Enable FlashInfer TRT-LLM attention with FP8 indexer KV and KV cache; use TRITON_ATTN for the EAGLE3 drafter."
+    - "Remove the explicit Marlin MoE override because the new image requires no additional MoE backend argument."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1831

From 9e9fe94aec656359cd56df12cb038dd0e183b04f Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 18 Jun 2026 16:47:14 -0500
Subject: [PATCH 5/5] docs: clarify MiniMax M3 MoE backend

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 4c126edb3..6937e62dc 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3959,5 +3959,5 @@
   description:
     - "Update the MiniMax-M3 B200/B300 single-node image to vllm/vllm-openai:minimax-m3-0618."
     - "Enable FlashInfer TRT-LLM attention with FP8 indexer KV and KV cache; use TRITON_ATTN for the EAGLE3 drafter."
-    - "Remove the explicit Marlin MoE override because the new image requires no additional MoE backend argument."
+    - "Switch TP-only configurations from explicit Marlin MoE to the new image's default FlashInfer TRT-LLM MoE backend."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1831