From 35f4511b83c35b60fb03886c01dace65dc1aefc5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 18 Jun 2026 17:21:31 -0500 Subject: [PATCH 1/5] chore: update MiniMax M3 B200 EAGLE3 image --- .github/configs/nvidia-master.yaml | 5 ++--- .../fixed_seq_len/minimaxm3_fp8_b200_mtp.sh | 15 +++++++-------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 521ba6636..51ab71993 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12985,13 +12985,12 @@ minimaxm3-fp8-b200-vllm: # EAGLE3 speculative-decoding (spec-decoding: mtp) variant of # minimaxm3-fp8-b200-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the # Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens, drafter pinned -# to FLASH_ATTN since the head is MHA and FlashInfer needs GQA/MQA at page -# size 128). Search space mirrors the non-MTP entry trimmed at the +# to TRITON_ATTN). Search space mirrors the non-MTP entry trimmed at the # extreme-concurrency end, identical to the minimaxm3-fp8-b300-vllm-mtp # precedent: spec decode pays off at low/mid concurrency while acceptance # dilutes in big batches, and the draft weights + draft KV shave headroom. minimaxm3-fp8-b200-vllm-mtp: - image: vllm/vllm-openai:minimax-m3 + image: vllm/vllm-openai:minimax-m3-0618 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: b200-dgxc diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh index 54aabd342..e104cf756 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh @@ -10,12 +10,8 @@ # encoder for the text-only benchmark. dp-attn=true maps to DP×EP (DEP); # ep>1 maps to TP+EP (TEP). # -# The drafter is pinned to FLASH_ATTN: the EAGLE3 head is MHA, and FlashInfer -# only supports page size 128 through its trtllm-gen kernel, which requires -# GQA/MQA — engine init dies in FlashInferMetadataBuilder otherwise (the -# failure hit on the B300 MTP canary). The target keeps its default -# (FlashInfer) backend; FLASH_ATTN takes any multiple-of-16 block size, so -# the mandatory 128 is fine for the draft. +# The target uses the FlashInfer TRT-LLM attention path. The EAGLE3 drafter is +# pinned separately to TRITON_ATTN. source "$(dirname "$0")/../../benchmark_lib.sh" @@ -68,7 +64,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then elif [ "$EP_SIZE" -gt 1 ]; then PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel" else - PARALLEL_ARGS="--tensor-parallel-size=$TP --moe-backend marlin" + PARALLEL_ARGS="--tensor-parallel-size=$TP" fi # use 3 speculative tokens for all configs for now @@ -87,10 +83,13 @@ $PARALLEL_ARGS \ --gpu-memory-utilization 0.90 \ --max-model-len $MAX_MODEL_LEN \ --block-size 128 \ +--attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \ +--attention-config.indexer_kv_dtype "fp8" \ +--kv-cache-dtype fp8 \ --language-model-only \ --max-cudagraph-capture-size 2048 \ --max-num-batched-tokens "$((ISL * 2 ))" \ ---speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"FLASH_ATTN\"}" \ +--speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"TRITON_ATTN\"}" \ --stream-interval 20 --no-enable-prefix-caching \ --trust-remote-code > $SERVER_LOG 2>&1 & From 4c8770796b8f11b77add5a3fc9fbd8c1c553f101 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 18 Jun 2026 17:22:29 -0500 Subject: [PATCH 2/5] chore: register MiniMax M3 image bump --- perf-changelog.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 06a81eaf1..e976ac3f7 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3950,3 +3950,11 @@ - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)" - "Update Applied TBO on high concurrencies" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717 + +- config-keys: + - minimaxm3-fp8-b200-vllm-mtp + description: + - "Update the MiniMax-M3 B200 single-node image to vllm/vllm-openai:minimax-m3-0618." + - "Enable FlashInfer TRT-LLM attention with FP8 indexer KV and KV cache; use TRITON_ATTN for the EAGLE3 drafter." + - "Switch TP-only configurations from explicit Marlin MoE to the new image's default FlashInfer TRT-LLM MoE backend." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1832 From 448289f0ae11a313a0700005f51d10f862f81c54 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 19 Jun 2026 07:38:24 +0800 Subject: [PATCH 3/5] chore: use cu130 MiniMax M3 image --- .github/configs/nvidia-master.yaml | 2 +- perf-changelog.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 51ab71993..df37583a4 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12990,7 +12990,7 @@ minimaxm3-fp8-b200-vllm: # precedent: spec decode pays off at low/mid concurrency while acceptance # dilutes in big batches, and the draft weights + draft KV shave headroom. minimaxm3-fp8-b200-vllm-mtp: - image: vllm/vllm-openai:minimax-m3-0618 + image: vllm/vllm-openai:minimax-m3-0618-x86_64-cu130 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: b200-dgxc diff --git a/perf-changelog.yaml b/perf-changelog.yaml index e976ac3f7..2b636607f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3954,7 +3954,7 @@ - config-keys: - minimaxm3-fp8-b200-vllm-mtp description: - - "Update the MiniMax-M3 B200 single-node image to vllm/vllm-openai:minimax-m3-0618." + - "Update the MiniMax-M3 B200 single-node image to vllm/vllm-openai:minimax-m3-0618-x86_64-cu130." - "Enable FlashInfer TRT-LLM attention with FP8 indexer KV and KV cache; use TRITON_ATTN for the EAGLE3 drafter." - "Switch TP-only configurations from explicit Marlin MoE to the new image's default FlashInfer TRT-LLM MoE backend." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1832 From e80baecaba9ad0aeb0d33cf6814e6896cbdd655b Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 19 Jun 2026 07:42:10 +0800 Subject: [PATCH 4/5] chore: retrigger benchmark sweeps From 260b450fbd02f38b07853bc0675381c3391102d3 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Fri, 19 Jun 2026 11:32:49 +0800 Subject: [PATCH 5/5] fix(vllm): materialize MiniMax M3 MSA top-k slice --- .../fixed_seq_len/minimaxm3_fp8_b200_mtp.sh | 33 +++++++++++++++++++ perf-changelog.yaml | 1 + 2 files changed, 34 insertions(+) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh index e104cf756..51147129d 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh @@ -27,6 +27,39 @@ check_env_vars \ RANDOM_RANGE_RATIO \ RESULT_FILENAME +# The 0618 image keeps MiniMax M3 top-k indices in a persistent +# [head_kv, max_tokens, topK] buffer for CUDA graphs. Slicing that buffer to +# the actual prefill length is non-contiguous when TP leaves multiple local KV +# heads, and the MSA CSR builder rejects it. Materialize the slice until the +# image includes this fix. +python3 - <<'PYEOF' || { echo "MiniMax M3 MSA contiguity patch failed" >&2; exit 1; } +import importlib.util +import pathlib + +spec = importlib.util.find_spec("vllm") +if spec is None or not spec.submodule_search_locations: + raise RuntimeError("Could not locate the installed vllm package") + +target = ( + pathlib.Path(next(iter(spec.submodule_search_locations))) + / "models" + / "minimax_m3" + / "nvidia" + / "sparse_attention_msa.py" +) +src = target.read_text() +old = " prefill_topk = topk[:, nd:num_tokens, :]\n" +new = " prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n" + +if new in src: + print(f"[minimax-m3-msa-patch] already applied: {target}") +elif src.count(old) == 1: + target.write_text(src.replace(old, new, 1)) + print(f"[minimax-m3-msa-patch] patched: {target}") +else: + raise RuntimeError(f"Expected exactly one patch anchor in {target}") +PYEOF + DRAFT_MODEL="Inferact/MiniMax-M3-EAGLE3" if [[ -n "$SLURM_JOB_ID" ]]; then diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2b636607f..baf433ac8 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3957,4 +3957,5 @@ - "Update the MiniMax-M3 B200 single-node image to vllm/vllm-openai:minimax-m3-0618-x86_64-cu130." - "Enable FlashInfer TRT-LLM attention with FP8 indexer KV and KV cache; use TRITON_ATTN for the EAGLE3 drafter." - "Switch TP-only configurations from explicit Marlin MoE to the new image's default FlashInfer TRT-LLM MoE backend." + - "Patch the image's MiniMax M3 MSA prefill path to materialize sliced top-k indices before CSR construction." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1832