SemiAnalysisAI · Oseltamivir · Jun 23, 2026 · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
@@ -11922,13 +11922,13 @@ minimaxm3-fp8-b300-vllm:
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
 # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor
 # cores on Blackwell. M3 support has not shipped in a stable vLLM release;
-# the dedicated vllm/vllm-openai:minimax-m3 image is built from the m3_release
-# branch (vllm-project/vllm#45381). --block-size 128 is mandatory (MSA
+# the dedicated vllm/vllm-openai:minimax-m3-0618-x86_64-cu130 image is built
+# from the m3_release branch (vllm-project/vllm#45381). --block-size 128 is mandatory (MSA
 # sparse/index cache alignment). Weights are NOT SRE-staged: b200-dgxc reads
 # /lustre/fsw/gharunners/models/MiniMax-M3-MXFP8 (pre-downloaded, see
 # launch_b200-dgxc.sh).
 minimaxm3-fp8-b200-vllm:
-  image: vllm/vllm-openai:minimax-m3
+  image: vllm/vllm-openai:minimax-m3-0618-x86_64-cu130
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: b200-dgxc

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200.sh
@@ -22,6 +22,39 @@ check_env_vars \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME
 
+# The 0618 image keeps MiniMax M3 top-k indices in a persistent
+# [head_kv, max_tokens, topK] buffer for CUDA graphs. The slice of that buffer
+# at the actual prefill length is non-contiguous when TP leaves multiple local
+# KV heads, and the MSA CSR builder rejects it. Patch the installed vLLM source
+# to call .contiguous() on the slice until the image ships with this fix.
+python3 - <<'PYEOF' || { echo "MiniMax M3 MSA contiguity patch failed" >&2; exit 1; }
+import importlib.util
+import pathlib
+
+spec = importlib.util.find_spec("vllm")  # look up where the vllm package is installed
+if spec is None or not spec.submodule_search_locations:
+    raise RuntimeError("Could not locate the installed vllm package")
+
+target = (  # file to patch, resolved under the first vllm package directory
+    pathlib.Path(next(iter(spec.submodule_search_locations)))
+    / "models"
+    / "minimax_m3"
+    / "nvidia"
+    / "sparse_attention_msa.py"
+)
+src = target.read_text()
+old = "            prefill_topk = topk[:, nd:num_tokens, :]\n"  # exact anchor line, indentation included
+new = "            prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n"  # anchor with .contiguous() appended
+
+if new in src:  # replacement line already present: leave the file untouched
+    print(f"[minimax-m3-msa-patch] already applied: {target}")
+elif src.count(old) == 1:  # rewrite only when the anchor occurs exactly once
+    target.write_text(src.replace(old, new, 1))
+    print(f"[minimax-m3-msa-patch] patched: {target}")
+else:  # anchor missing or duplicated: raise so the shell `||` guard exits 1
+    raise RuntimeError(f"Expected exactly one patch anchor in {target}")
+PYEOF
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -45,7 +78,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
 elif [ "$EP_SIZE" -gt 1 ]; then
   PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel"
 else
-  PARALLEL_ARGS="--tensor-parallel-size=$TP --moe-backend marlin"
+  PARALLEL_ARGS="--tensor-parallel-size=$TP"
 fi
 
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -61,6 +94,9 @@ $PARALLEL_ARGS \
 --gpu-memory-utilization 0.90 \
 --max-model-len $MAX_MODEL_LEN \
 --block-size 128 \
+--attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \
+--attention-config.indexer_kv_dtype "fp8" \
+--kv-cache-dtype fp8 \
 --language-model-only \
 --max-cudagraph-capture-size 2048 \
 --max-num-batched-tokens "$((ISL * 2 ))" \

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3951,6 +3951,15 @@
     - "Update Applied TBO on high concurrencies"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717
 
+- config-keys:
+    - minimaxm3-fp8-b200-vllm
+  description:
+    - "Update the MiniMax-M3 B200 single-node image to vllm/vllm-openai:minimax-m3-0618-x86_64-cu130."
+    - "Enable FlashInfer TRT-LLM attention with FP8 indexer KV and KV cache."
+    - "Switch TP-only configurations from explicit Marlin MoE to the new image's default FlashInfer TRT-LLM MoE backend."
+    - "Patch the image's MiniMax M3 MSA prefill path to materialize sliced top-k indices before CSR construction."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1833
+
 - config-keys:
     - minimaxm3-fp8-b300-vllm
   description: