From 5023e6035f3e08ead7db13762016dd5872f234da Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 18 Jun 2026 17:21:31 -0500
Subject: [PATCH 1/5] chore: update MiniMax M3 B300 EAGLE3 image

---
 .github/configs/nvidia-master.yaml                 |  2 +-
 .../fixed_seq_len/minimaxm3_fp8_b300_mtp.sh        | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 521ba6636..f72e678de 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -13028,7 +13028,7 @@ minimaxm3-fp8-b200-vllm-mtp:
 # big batches, and the draft weights + draft KV shave headroom — tp2-ep2 is
 # dropped entirely since its KV headroom was already thin without a draft.
 minimaxm3-fp8-b300-vllm-mtp:
-  image: vllm/vllm-openai:minimax-m3
+  image: vllm/vllm-openai:minimax-m3-0618
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: b300
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh
index 742933deb..4d2dcda54 100644
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh
@@ -8,11 +8,8 @@
 # --block-size 128 is mandatory (MSA sparse/index cache); the benchmark is
 # text-only, so --language-model-only frees the vision encoder's VRAM.
 #
-# The drafter is pinned to FLASH_ATTN: the EAGLE3 head is MHA, and FlashInfer
-# only supports page size 128 through its trtllm-gen kernel, which requires
-# GQA/MQA — engine init dies in FlashInferMetadataBuilder otherwise. The
-# target keeps its default (FlashInfer) backend; FLASH_ATTN takes any
-# multiple-of-16 block size, so the mandatory 128 is fine for the draft.
+# The target uses the FlashInfer TRT-LLM attention path (--attention-config);
+# the EAGLE3 drafter is pinned separately to TRITON_ATTN in --speculative-config.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
@@ -69,7 +66,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
 elif [ "$EP_SIZE" -gt 1 ]; then
   PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel"
 else
-  PARALLEL_ARGS="--tensor-parallel-size=$TP --moe-backend marlin"
+  PARALLEL_ARGS="--tensor-parallel-size=$TP"
 fi
 
 # use 3 speculative tokens for all configs for now
@@ -88,10 +85,13 @@ $PARALLEL_ARGS \
 --gpu-memory-utilization 0.90 \
 --max-model-len $MAX_MODEL_LEN \
 --block-size 128 \
+--attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \
+--attention-config.indexer_kv_dtype "fp8" \
+--kv-cache-dtype fp8 \
 --language-model-only \
 --max-cudagraph-capture-size 2048 \
 --max-num-batched-tokens "$((ISL * 2 ))" \
---speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"FLASH_ATTN\"}" \
+--speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"TRITON_ATTN\"}" \
 --stream-interval 20 --no-enable-prefix-caching \
 --trust-remote-code > $SERVER_LOG 2>&1 &
 

From 564772ee3fcec268e84e5769f144f451635c131e Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 18 Jun 2026 17:22:29 -0500
Subject: [PATCH 2/5] chore: register MiniMax M3 image bump

---
 perf-changelog.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 06a81eaf1..2b32743c9 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3950,3 +3950,11 @@
     - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)"
     - "Update Applied TBO on high concurrencies"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717
+
+- config-keys:
+    - minimaxm3-fp8-b300-vllm-mtp
+  description:
+    - "Update the MiniMax-M3 B300 single-node image to vllm/vllm-openai:minimax-m3-0618."
+    - "Enable FlashInfer TRT-LLM attention with FP8 indexer KV and KV cache; use TRITON_ATTN for the EAGLE3 drafter."
+    - "Switch TP-only configurations from explicit Marlin MoE to the new image's default FlashInfer TRT-LLM MoE backend."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1835

From 645f7aa4335db20c2c7a38fcf16e8709c5d76f35 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 19 Jun 2026 07:38:37 +0800
Subject: [PATCH 3/5] chore: use cu130 MiniMax M3 image

---
 .github/configs/nvidia-master.yaml | 2 +-
 perf-changelog.yaml                | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index f72e678de..456d4012b 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -13028,7 +13028,7 @@ minimaxm3-fp8-b200-vllm-mtp:
 # big batches, and the draft weights + draft KV shave headroom — tp2-ep2 is
 # dropped entirely since its KV headroom was already thin without a draft.
 minimaxm3-fp8-b300-vllm-mtp:
-  image: vllm/vllm-openai:minimax-m3-0618
+  image: vllm/vllm-openai:minimax-m3-0618-x86_64-cu130
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: b300
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 2b32743c9..d5f4ffe72 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3954,7 +3954,7 @@
 - config-keys:
     - minimaxm3-fp8-b300-vllm-mtp
   description:
-    - "Update the MiniMax-M3 B300 single-node image to vllm/vllm-openai:minimax-m3-0618."
+    - "Update the MiniMax-M3 B300 single-node image to vllm/vllm-openai:minimax-m3-0618-x86_64-cu130."
     - "Enable FlashInfer TRT-LLM attention with FP8 indexer KV and KV cache; use TRITON_ATTN for the EAGLE3 drafter."
     - "Switch TP-only configurations from explicit Marlin MoE to the new image's default FlashInfer TRT-LLM MoE backend."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1835

From 9105f7213da6fe842e9e036ca498d1917347f9a7 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 19 Jun 2026 07:42:14 +0800
Subject: [PATCH 4/5] chore: retrigger benchmark sweeps


From 266f953a923599e3c182dfc50b3bbd4afc1f352b Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 19 Jun 2026 11:33:15 +0800
Subject: [PATCH 5/5] fix(vllm): materialize MiniMax M3 MSA top-k slice

---
 .../fixed_seq_len/minimaxm3_fp8_b300_mtp.sh   | 33 +++++++++++++++++++
 perf-changelog.yaml                           |  1 +
 2 files changed, 34 insertions(+)

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh
index 4d2dcda54..01bf23eb6 100644
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh
@@ -25,6 +25,39 @@ check_env_vars \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME
 
+# The 0618 image keeps MiniMax M3 top-k indices in a persistent
+# [head_kv, max_tokens, topK] buffer for CUDA graphs. Slicing that buffer to
+# the actual prefill length is non-contiguous when TP leaves multiple local KV
+# heads, and the MSA CSR builder rejects it. Materialize the slice with
+# .contiguous() until the image includes this fix; then remove this block.
+python3 - <<'PYEOF' || { echo "MiniMax M3 MSA contiguity patch failed" >&2; exit 1; }
+import importlib.util
+import pathlib
+
+# Locate the installed vllm package on disk via its import spec.
+spec = importlib.util.find_spec("vllm")
+if spec is None or not spec.submodule_search_locations:
+    raise RuntimeError("Could not locate the installed vllm package")
+
+pkg_root = pathlib.Path(next(iter(spec.submodule_search_locations)))
+target = pkg_root / "models" / "minimax_m3" / "nvidia" / "sparse_attention_msa.py"
+
+# Python source is UTF-8 by default (PEP 3120); read and write it explicitly
+# as UTF-8 instead of depending on the container's locale encoding.
+src = target.read_text(encoding="utf-8")
+old = "            prefill_topk = topk[:, nd:num_tokens, :]\n"
+new = "            prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n"
+
+# Idempotent: skip when already patched; fail loudly if the anchor is not unique.
+if new in src:
+    print(f"[minimax-m3-msa-patch] already applied: {target}")
+elif src.count(old) == 1:
+    target.write_text(src.replace(old, new, 1), encoding="utf-8")
+    print(f"[minimax-m3-msa-patch] patched: {target}")
+else:
+    raise RuntimeError(f"Expected exactly one patch anchor in {target}")
+PYEOF
+
 DRAFT_MODEL="Inferact/MiniMax-M3-EAGLE3"
 
 # `hf download` creates the target dir if missing and is itself idempotent.
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index d5f4ffe72..73ed83899 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3957,4 +3957,5 @@
     - "Update the MiniMax-M3 B300 single-node image to vllm/vllm-openai:minimax-m3-0618-x86_64-cu130."
     - "Enable FlashInfer TRT-LLM attention with FP8 indexer KV and KV cache; use TRITON_ATTN for the EAGLE3 drafter."
     - "Switch TP-only configurations from explicit Marlin MoE to the new image's default FlashInfer TRT-LLM MoE backend."
+    - "Patch the image's MiniMax M3 MSA prefill path to materialize sliced top-k indices before CSR construction."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1835