Skip to content
5 changes: 2 additions & 3 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11959,13 +11959,12 @@ minimaxm3-fp8-b200-vllm:
# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
# minimaxm3-fp8-b200-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens, drafter pinned
# to FLASH_ATTN since the head is MHA and FlashInfer needs GQA/MQA at page
# size 128). Search space mirrors the non-MTP entry trimmed at the
# to TRITON_ATTN). Search space mirrors the non-MTP entry trimmed at the
# extreme-concurrency end, identical to the minimaxm3-fp8-b300-vllm-mtp
# precedent: spec decode pays off at low/mid concurrency while acceptance
# dilutes in big batches, and the draft weights + draft KV shave headroom.
minimaxm3-fp8-b200-vllm-mtp:
image: vllm/vllm-openai:minimax-m3
image: vllm/vllm-openai:minimax-m3-0618-x86_64-cu130
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: b200-dgxc
Expand Down
48 changes: 40 additions & 8 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,8 @@
# encoder for the text-only benchmark. dp-attn=true maps to DP×EP (DEP);
# ep>1 maps to TP+EP (TEP).
#
# The drafter is pinned to FLASH_ATTN: the EAGLE3 head is MHA, and FlashInfer
# only supports page size 128 through its trtllm-gen kernel, which requires
# GQA/MQA — engine init dies in FlashInferMetadataBuilder otherwise (the
# failure hit on the B300 MTP canary). The target keeps its default
# (FlashInfer) backend; FLASH_ATTN takes any multiple-of-16 block size, so
# the mandatory 128 is fine for the draft.
# The target uses the FlashInfer TRT-LLM attention path. The EAGLE3 drafter is
# pinned separately to TRITON_ATTN.

source "$(dirname "$0")/../../benchmark_lib.sh"

Expand All @@ -31,6 +27,39 @@ check_env_vars \
RANDOM_RANGE_RATIO \
RESULT_FILENAME

# The 0618 image keeps MiniMax M3 top-k indices in a persistent
# [head_kv, max_tokens, topK] buffer for CUDA graphs. Slicing that buffer to
# the actual prefill length is non-contiguous when TP leaves multiple local KV
# heads, and the MSA CSR builder rejects it. Materialize the slice until the
# image includes this fix.
#
# The embedded script locates the installed vllm package, rewrites the single
# prefill_topk slice in sparse_attention_msa.py so that it calls .contiguous(),
# and is safe to re-run: when the patched line is already present the file is
# left untouched. Any failure (package or file missing, anchor not found
# exactly once) aborts the benchmark with exit status 1.
python3 - <<'PYEOF' || { echo "MiniMax M3 MSA contiguity patch failed" >&2; exit 1; }
import importlib.util
import pathlib

spec = importlib.util.find_spec("vllm")
if spec is None or not spec.submodule_search_locations:
    raise RuntimeError("Could not locate the installed vllm package")

# First search location of the vllm package, then the module to patch.
target = (
    pathlib.Path(next(iter(spec.submodule_search_locations)))
    / "models"
    / "minimax_m3"
    / "nvidia"
    / "sparse_attention_msa.py"
)
# Fail with a message that names the path instead of a bare FileNotFoundError.
if not target.is_file():
    raise RuntimeError(f"Patch target not found: {target}")

# Explicit encoding keeps the read/write round-trip independent of the
# container's locale settings.
src = target.read_text(encoding="utf-8")
# NOTE(review): the anchor must match the installed file byte-for-byte,
# including its leading indentation - confirm the whitespace in these literals.
old = " prefill_topk = topk[:, nd:num_tokens, :]\n"
new = " prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n"

if new in src:
    # Already patched (e.g. the script ran earlier in the same container).
    print(f"[minimax-m3-msa-patch] already applied: {target}")
elif src.count(old) == 1:
    target.write_text(src.replace(old, new, 1), encoding="utf-8")
    print(f"[minimax-m3-msa-patch] patched: {target}")
else:
    # Zero or multiple matches: refuse to guess which line to rewrite.
    raise RuntimeError(f"Expected exactly one patch anchor in {target}")
PYEOF

DRAFT_MODEL="Inferact/MiniMax-M3-EAGLE3"

if [[ -n "$SLURM_JOB_ID" ]]; then
Expand Down Expand Up @@ -68,7 +97,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
elif [ "$EP_SIZE" -gt 1 ]; then
PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel"
else
PARALLEL_ARGS="--tensor-parallel-size=$TP --moe-backend marlin"
PARALLEL_ARGS="--tensor-parallel-size=$TP"
fi

# Use 3 speculative tokens for all configs for now.
Expand All @@ -87,10 +116,13 @@ $PARALLEL_ARGS \
--gpu-memory-utilization 0.90 \
--max-model-len $MAX_MODEL_LEN \
--block-size 128 \
--attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \
--attention-config.indexer_kv_dtype "fp8" \
--kv-cache-dtype fp8 \
--language-model-only \
--max-cudagraph-capture-size 2048 \
--max-num-batched-tokens "$((ISL * 2 ))" \
--speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"FLASH_ATTN\"}" \
--speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"TRITON_ATTN\"}" \
--stream-interval 20 --no-enable-prefix-caching \
--trust-remote-code > $SERVER_LOG 2>&1 &

Expand Down
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3951,6 +3951,15 @@
- "Update Applied TBO on high concurrencies"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717

# Changelog entry for the minimaxm3-fp8-b200-vllm-mtp benchmark config.
- config-keys:
    - minimaxm3-fp8-b200-vllm-mtp
  description:
    - "Update the MiniMax-M3 B200 single-node image to vllm/vllm-openai:minimax-m3-0618-x86_64-cu130."
    - "Enable FlashInfer TRT-LLM attention with FP8 indexer KV and KV cache; use TRITON_ATTN for the EAGLE3 drafter."
    - "Switch TP-only configurations from explicit Marlin MoE to the new image's default FlashInfer TRT-LLM MoE backend."
    - "Patch the image's MiniMax M3 MSA prefill path to materialize sliced top-k indices before CSR construction."
  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1832

- config-keys:
- minimaxm3-fp8-b200-vllm
description:
Expand Down