From 12dbb7abd2f837707f176aca85f1a2a97fdbe159 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 18 Jun 2026 17:43:05 -0500 Subject: [PATCH 1/3] perf: update MI325X MiniMax-M3 MTP image and FP8 KV cache --- .github/configs/amd-master.yaml | 9 +++------ .../fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh | 17 +++++------------ perf-changelog.yaml | 8 ++++++++ runners/launch_mi325x-amds.sh | 3 ++- 4 files changed, 18 insertions(+), 19 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 3d50247d7..a3d001cc9 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2962,13 +2962,10 @@ minimaxm3-fp8-mi325x-vllm: # Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). Same H200-style # search space as the non-MTP MI325X entry, trimmed at the extreme-concurrency # end with TP-only latency rows started at conc 1 (matching the H200/MI355X MTP -# recipes). Runs with CUDA graphs (no --enforce-eager, VLLM_USE_BREAKABLE_CUDAGRAPH=0, -# BF16 KV on gfx942). The shipped ROCm image lacks SupportsEagle3 on the AMD -# MiniMax-M3 model, so the recipe applies that fix in-place at runtime -# (functionstackx/vllm#1, upstream vllm-project/vllm#45546; validated green on -# MI355X/MI300X) before serving. +# recipes). Runs with CUDA graphs (no --enforce-eager, +# VLLM_USE_BREAKABLE_CUDAGRAPH=0). minimaxm3-fp8-mi325x-vllm-mtp: - image: vllm/vllm-openai-rocm:minimax-m3 + image: vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi325x diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh index 4ba15e761..2dd51653c 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh @@ -8,9 +8,7 @@ # the text-only benchmark, --attention-backend TRITON_ATTN, and # --no-enable-prefix-caching. Runs with CUDA graphs (no --enforce-eager); # VLLM_USE_BREAKABLE_CUDAGRAPH=0 avoids the M3-decode breakable-cudagraph path. -# The default BF16 KV cache is retained (unlike the MI355X recipe's FP8 KV -# cache): gfx942 has no calibrated q/prob scales for ROCm FP8 attention and -# vLLM's fallback scale of 1.0 corrupts accuracy. +# FP8 KV cache reduces memory pressure and increases concurrency headroom. # # Unlike the CUDA recipes, the drafter needs no attention_backend override: # the FlashInfer "page size 128 requires GQA/MQA" limitation that forced @@ -18,15 +16,9 @@ # Here the whole server runs on TRITON_ATTN (set globally below), which serves # the MHA draft fine. # -# [AI generated draft test] The shipped vllm/vllm-openai-rocm:minimax-m3 image -# does NOT implement SupportsEagle3 on the AMD MiniMax-M3 model, so EAGLE3 -# engine init fails with "Model does not support EAGLE3 interface but -# aux_hidden_state_outputs was requested". This recipe applies that fix -# (functionstackx/vllm#1 — ported from nvidia/model.py, upstreamed as -# vllm-project/vllm#45546) in-place to the installed vllm before serving, so we -# can validate EAGLE3 on real MI325X hardware ahead of an image rebuild. The -# same patch is validated green on MI355X. It is idempotent and fails the job -# loudly if the installed amd/model.py has drifted from the expected base. +# Keep the SupportsEagle3 compatibility guard for older images. It exits +# immediately when the installed AMD MiniMax-M3 model already has the upstream +# interface and otherwise applies the validated compatibility patch. source "$(dirname "$0")/../../benchmark_lib.sh" @@ -175,6 +167,7 @@ set -x vllm serve "$MODEL" --port "$PORT" \ "${PARALLEL_ARGS[@]}" \ --block-size 128 \ + --kv-cache-dtype fp8 \ --no-enable-prefix-caching \ --language-model-only \ --max-model-len "$MAX_MODEL_LEN" \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 06a81eaf1..31d799b5a 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3950,3 +3950,11 @@ - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)" - "Update Applied TBO on high concurrencies" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717 + +- config-keys: + - minimaxm3-fp8-mi325x-vllm-mtp + description: + - "Update the MI325X MiniMax-M3 EAGLE3 vLLM image to vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a" + - "Use FP8 KV cache" + - "Exclude chi-mi325x-pod2-120, which lacks the required populated /raid/hf-hub-cache" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1838 diff --git a/runners/launch_mi325x-amds.sh b/runners/launch_mi325x-amds.sh index e1f852715..17ea12613 100644 --- a/runners/launch_mi325x-amds.sh +++ b/runners/launch_mi325x-amds.sh @@ -13,7 +13,8 @@ SPEC_SUFFIX=$([[ "${SPEC_DECODING:-}" == "mtp" ]] && printf '_mtp' || printf '') set -x -JOB_ID=$(set +o pipefail; salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') +# pod2-120 lacks the populated /raid/hf-hub-cache required by the launcher. +JOB_ID=$(set +o pipefail; salloc --partition=$PARTITION --exclude=chi-mi325x-pod2-120.ord.vultr.cpe.ice.amd.com --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') if [ -z "$JOB_ID" ]; then echo "ERROR: salloc failed to allocate a job" >&2 From 540e3c1b1cb1655add9340c999136854b8794042 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 18 Jun 2026 21:30:26 -0500 Subject: [PATCH 2/3] fix: preserve perf changelog history --- perf-changelog.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 30c58e4ed..4c0d4bbcb 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3910,7 +3910,6 @@ - "Use the Marlin MoE backend for MiniMax-M3 B200/B300 TP-only vLLM configurations by adding --moe-backend marlin when expert parallelism is disabled." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1809 - - config-keys: - dsr1-fp8-gb300-dynamo-trt description: @@ -3927,7 +3926,6 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1813 - - config-keys: - glm5-fp4-gb300-dynamo-trt description: @@ -3937,7 +3935,6 @@ - "Runner script launch_gb300-nv.sh: added dynamo-trt-specific glm5-fp4 case with SERVED_MODEL_NAME and SRT_SLURM_MODEL_PREFIX=nvidia/GLM-5-NVFP4" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1798 - - config-keys: - dsv4-fp4-mi355x-atom description: @@ -3946,7 +3943,6 @@ - "Update Applied TBO on high concurrencies" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717 - - config-keys: - dsv4-fp4-mi355x-atom description: From 1770e9c8c49803d87f8e87da740bcb695e0b3649 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 18 Jun 2026 21:32:43 -0500 Subject: [PATCH 3/3] fix: use upstream MI325X EAGLE support --- .../fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh | 92 ------------------- perf-changelog.yaml | 1 + 2 files changed, 1 insertion(+), 92 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh index 2dd51653c..79c52a3d1 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh @@ -16,10 +16,6 @@ # Here the whole server runs on TRITON_ATTN (set globally below), which serves # the MHA draft fine. # -# Keep the SupportsEagle3 compatibility guard for older images. It exits -# immediately when the installed AMD MiniMax-M3 model already has the upstream -# interface and otherwise applies the validated compatibility patch. - source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ @@ -73,94 +69,6 @@ fi # use 3 speculative tokens for all configs for now NUM_SPEC_TOKENS=3 -# [AI generated draft test] Patch the installed AMD MiniMax-M3 model to add the -# SupportsEagle3 interface (functionstackx/vllm#1, upstream vllm-project/vllm#45546). -# Mirrors nvidia/model.py: adds EagleModelMixin to the inner model + -# aux-hidden-state emission, and SupportsEagle3 to the two outer classes. -# Idempotent; hard-fails if the installed file has drifted from the expected -# base (so we never silently run unpatched and mislabel the result). -python3 - <<'PYEOF' || { echo "EAGLE3 in-place patch failed" >&2; exit 1; } -import ast, importlib.util, pathlib, sys - -spec = importlib.util.find_spec("vllm") -root = pathlib.Path(spec.submodule_search_locations[0]) -target = root / "models" / "minimax_m3" / "amd" / "model.py" -src = target.read_text() - -if "EagleModelMixin" in src and "class MiniMaxM3Model(nn.Module, EagleModelMixin):" in src: - print(f"[eagle3-patch] already applied: {target}") - sys.exit(0) - -edits = [ - ( - "from vllm.model_executor.models.interfaces import (\n" - " MultiModalEmbeddings,\n" - " SupportsMultiModal,\n" - ")", - "from vllm.model_executor.models.interfaces import (\n" - " EagleModelMixin,\n" - " MultiModalEmbeddings,\n" - " SupportsEagle3,\n" - " SupportsMultiModal,\n" - ")", - ), - ( - "class MiniMaxM3Model(nn.Module):", - "class MiniMaxM3Model(nn.Module, EagleModelMixin):", - ), - ( - " inputs_embeds: torch.Tensor | None = None,\n" - " ) -> torch.Tensor:\n" - " if inputs_embeds is not None:", - " inputs_embeds: torch.Tensor | None = None,\n" - " ) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]:\n" - " if inputs_embeds is not None:", - ), - ( - " residual = None\n\n" - " for layer in self.layers[self.start_layer : self.end_layer]:\n" - " hidden_states, residual = layer(positions, hidden_states, residual)\n\n" - " hidden_states, _ = self.norm(hidden_states, residual)\n" - " return hidden_states", - " residual = None\n\n" - " # EAGLE3 is not yet compatible with pipeline parallel\n" - " aux_hidden_states = self._maybe_add_hidden_state([], 0, hidden_states, residual)\n" - " for idx, layer in enumerate(self.layers[self.start_layer : self.end_layer]):\n" - " hidden_states, residual = layer(positions, hidden_states, residual)\n" - " self._maybe_add_hidden_state(\n" - " aux_hidden_states, idx + 1, hidden_states, residual\n" - " )\n\n" - " hidden_states, _ = self.norm(hidden_states, residual)\n\n" - " if len(aux_hidden_states) > 0:\n" - " return hidden_states, aux_hidden_states\n" - " return hidden_states", - ), - ( - "class MiniMaxM3SparseForCausalLM(nn.Module):", - "class MiniMaxM3SparseForCausalLM(nn.Module, SupportsEagle3):", - ), - ( - "class MiniMaxM3SparseForConditionalGeneration(nn.Module, SupportsMultiModal):", - "class MiniMaxM3SparseForConditionalGeneration(\n" - " nn.Module, SupportsMultiModal, SupportsEagle3\n" - "):", - ), -] - -for old, new in edits: - count = src.count(old) - if count != 1: - sys.exit( - f"[eagle3-patch] anchor matched {count} times (expected 1); " - f"installed {target} has drifted from the expected base — aborting" - ) - src = src.replace(old, new) - -ast.parse(src) -target.write_text(src) -print(f"[eagle3-patch] applied EAGLE3 support to {target}") -PYEOF - start_gpu_monitor set -x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 4c0d4bbcb..2640e01d2 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3964,5 +3964,6 @@ description: - "Update the MI325X MiniMax-M3 EAGLE3 vLLM image to vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a" - "Use FP8 KV cache" + - "Remove the legacy in-place EAGLE3 patch now included upstream in vLLM" - "Exclude chi-mi325x-pod2-120, which lacks the required populated /raid/hf-hub-cache" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1838