Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 7 additions & 8 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12908,12 +12908,12 @@ qwen3.5-fp4-b200-trt:
# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
# (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor
# cores on Blackwell. M3 support has not shipped in a stable vLLM release;
# the dedicated vllm/vllm-openai:minimax-m3 image is built from the m3_release
# the dedicated vllm/vllm-openai:minimax-m3-0618 image is built from the m3_release
# branch (vllm-project/vllm#45381). --block-size 128 is mandatory (MSA
# sparse/index cache alignment). Weights are NOT SRE-staged; b300 falls back
# to writable /data/models (see launch_b300-nv.sh MODEL_PATH split).
minimaxm3-fp8-b300-vllm:
image: vllm/vllm-openai:minimax-m3
image: vllm/vllm-openai:minimax-m3-0618
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: b300
Expand Down Expand Up @@ -12948,13 +12948,13 @@ minimaxm3-fp8-b300-vllm:
# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
# (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor
# cores on Blackwell. M3 support has not shipped in a stable vLLM release;
# the dedicated vllm/vllm-openai:minimax-m3 image is built from the m3_release
# the dedicated vllm/vllm-openai:minimax-m3-0618 image is built from the m3_release
# branch (vllm-project/vllm#45381). --block-size 128 is mandatory (MSA
# sparse/index cache alignment). Weights are NOT SRE-staged: b200-dgxc reads
# /lustre/fsw/gharunners/models/MiniMax-M3-MXFP8 (pre-downloaded, see
# launch_b200-dgxc.sh).
minimaxm3-fp8-b200-vllm:
image: vllm/vllm-openai:minimax-m3
image: vllm/vllm-openai:minimax-m3-0618
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: b200-dgxc
Expand Down Expand Up @@ -12985,13 +12985,12 @@ minimaxm3-fp8-b200-vllm:
# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
# minimaxm3-fp8-b200-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens, drafter pinned
# to FLASH_ATTN since the head is MHA and FlashInfer needs GQA/MQA at page
# size 128). Search space mirrors the non-MTP entry trimmed at the
# to TRITON_ATTN). Search space mirrors the non-MTP entry trimmed at the
# extreme-concurrency end, identical to the minimaxm3-fp8-b300-vllm-mtp
# precedent: spec decode pays off at low/mid concurrency while acceptance
# dilutes in big batches, and the draft weights + draft KV shave headroom.
minimaxm3-fp8-b200-vllm-mtp:
image: vllm/vllm-openai:minimax-m3
image: vllm/vllm-openai:minimax-m3-0618
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: b200-dgxc
Expand Down Expand Up @@ -13028,7 +13027,7 @@ minimaxm3-fp8-b200-vllm-mtp:
# big batches, and the draft weights + draft KV shave headroom — tp2-ep2 is
# dropped entirely since its KV headroom was already thin without a draft.
minimaxm3-fp8-b300-vllm-mtp:
image: vllm/vllm-openai:minimax-m3
image: vllm/vllm-openai:minimax-m3-0618
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: b300
Expand Down
5 changes: 4 additions & 1 deletion benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
elif [ "$EP_SIZE" -gt 1 ]; then
PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel"
else
PARALLEL_ARGS="--tensor-parallel-size=$TP --moe-backend marlin"
PARALLEL_ARGS="--tensor-parallel-size=$TP"
fi

if [ "${EVAL_ONLY}" = "true" ]; then
Expand All @@ -61,6 +61,9 @@ $PARALLEL_ARGS \
--gpu-memory-utilization 0.90 \
--max-model-len $MAX_MODEL_LEN \
--block-size 128 \
--attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \
--attention-config.indexer_kv_dtype "fp8" \
--kv-cache-dtype fp8 \
--language-model-only \
--max-cudagraph-capture-size 2048 \
--max-num-batched-tokens "$((ISL * 2 ))" \
Expand Down
15 changes: 7 additions & 8 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b200_mtp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,8 @@
# encoder for the text-only benchmark. dp-attn=true maps to DP×EP (DEP);
# ep>1 maps to TP+EP (TEP).
#
# The drafter is pinned to FLASH_ATTN: the EAGLE3 head is MHA, and FlashInfer
# only supports page size 128 through its trtllm-gen kernel, which requires
# GQA/MQA — engine init dies in FlashInferMetadataBuilder otherwise (the
# failure hit on the B300 MTP canary). The target keeps its default
# (FlashInfer) backend; FLASH_ATTN takes any multiple-of-16 block size, so
# the mandatory 128 is fine for the draft.
# The target uses the FlashInfer TRT-LLM attention path. The EAGLE3 drafter is
# pinned separately to TRITON_ATTN.

source "$(dirname "$0")/../../benchmark_lib.sh"

Expand Down Expand Up @@ -68,7 +64,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
elif [ "$EP_SIZE" -gt 1 ]; then
PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel"
else
PARALLEL_ARGS="--tensor-parallel-size=$TP --moe-backend marlin"
PARALLEL_ARGS="--tensor-parallel-size=$TP"
fi

# use 3 speculative tokens for all configs for now
Expand All @@ -87,10 +83,13 @@ $PARALLEL_ARGS \
--gpu-memory-utilization 0.90 \
--max-model-len $MAX_MODEL_LEN \
--block-size 128 \
--attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \
--attention-config.indexer_kv_dtype "fp8" \
--kv-cache-dtype fp8 \
--language-model-only \
--max-cudagraph-capture-size 2048 \
--max-num-batched-tokens "$((ISL * 2 ))" \
--speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"FLASH_ATTN\"}" \
--speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"TRITON_ATTN\"}" \
--stream-interval 20 --no-enable-prefix-caching \
--trust-remote-code > $SERVER_LOG 2>&1 &

Expand Down
5 changes: 4 additions & 1 deletion benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
elif [ "$EP_SIZE" -gt 1 ]; then
PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel"
else
PARALLEL_ARGS="--tensor-parallel-size=$TP --moe-backend marlin"
PARALLEL_ARGS="--tensor-parallel-size=$TP"
fi

if [ "${EVAL_ONLY}" = "true" ]; then
Expand All @@ -69,6 +69,9 @@ $PARALLEL_ARGS \
--gpu-memory-utilization 0.90 \
--max-model-len $MAX_MODEL_LEN \
--block-size 128 \
--attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \
--attention-config.indexer_kv_dtype "fp8" \
--kv-cache-dtype fp8 \
--language-model-only \
--max-cudagraph-capture-size 2048 \
--max-num-batched-tokens "$((ISL * 2 ))" \
Expand Down
14 changes: 7 additions & 7 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300_mtp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,8 @@
# --block-size 128 is mandatory (MSA sparse/index cache); the benchmark is
# text-only, so --language-model-only frees the vision encoder's VRAM.
#
# The drafter is pinned to FLASH_ATTN: the EAGLE3 head is MHA, and FlashInfer
# only supports page size 128 through its trtllm-gen kernel, which requires
# GQA/MQA — engine init dies in FlashInferMetadataBuilder otherwise. The
# target keeps its default (FlashInfer) backend; FLASH_ATTN takes any
# multiple-of-16 block size, so the mandatory 128 is fine for the draft.
# The target uses the FlashInfer TRT-LLM attention path. The EAGLE3 drafter is
# pinned separately to TRITON_ATTN.

source "$(dirname "$0")/../../benchmark_lib.sh"

Expand Down Expand Up @@ -69,7 +66,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
elif [ "$EP_SIZE" -gt 1 ]; then
PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel"
else
PARALLEL_ARGS="--tensor-parallel-size=$TP --moe-backend marlin"
PARALLEL_ARGS="--tensor-parallel-size=$TP"
fi

# use 3 speculative tokens for all configs for now
Expand All @@ -88,10 +85,13 @@ $PARALLEL_ARGS \
--gpu-memory-utilization 0.90 \
--max-model-len $MAX_MODEL_LEN \
--block-size 128 \
--attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \
--attention-config.indexer_kv_dtype "fp8" \
--kv-cache-dtype fp8 \
--language-model-only \
--max-cudagraph-capture-size 2048 \
--max-num-batched-tokens "$((ISL * 2 ))" \
--speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"FLASH_ATTN\"}" \
--speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"TRITON_ATTN\"}" \
--stream-interval 20 --no-enable-prefix-caching \
--trust-remote-code > $SERVER_LOG 2>&1 &

Expand Down
11 changes: 11 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3950,3 +3950,14 @@
- "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)"
- "Update Applied TBO on high concurrencies"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717

- config-keys:
- minimaxm3-fp8-b200-vllm
- minimaxm3-fp8-b300-vllm
- minimaxm3-fp8-b200-vllm-mtp
- minimaxm3-fp8-b300-vllm-mtp
description:
- "Update the MiniMax-M3 B200/B300 single-node image to vllm/vllm-openai:minimax-m3-0618."
- "Enable FlashInfer TRT-LLM attention with FP8 indexer KV and KV cache; use TRITON_ATTN for the EAGLE3 drafter."
- "Switch TP-only configurations from explicit Marlin MoE to the new image's default FlashInfer TRT-LLM MoE backend."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1831
Loading