From 12dbb7abd2f837707f176aca85f1a2a97fdbe159 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 18 Jun 2026 17:43:05 -0500
Subject: [PATCH 1/3] perf: update MI325X MiniMax-M3 MTP image and FP8 KV cache

---
 .github/configs/amd-master.yaml                 |  9 +++------
 .../fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh   | 17 +++++------------
 perf-changelog.yaml                             |  8 ++++++++
 runners/launch_mi325x-amds.sh                   |  3 ++-
 4 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 3d50247d7..a3d001cc9 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2962,13 +2962,10 @@ minimaxm3-fp8-mi325x-vllm:
 # Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). Same H200-style
 # search space as the non-MTP MI325X entry, trimmed at the extreme-concurrency
 # end with TP-only latency rows started at conc 1 (matching the H200/MI355X MTP
-# recipes). Runs with CUDA graphs (no --enforce-eager, VLLM_USE_BREAKABLE_CUDAGRAPH=0,
-# BF16 KV on gfx942). The shipped ROCm image lacks SupportsEagle3 on the AMD
-# MiniMax-M3 model, so the recipe applies that fix in-place at runtime
-# (functionstackx/vllm#1, upstream vllm-project/vllm#45546; validated green on
-# MI355X/MI300X) before serving.
+# recipes). Runs with CUDA graphs (no --enforce-eager,
+# VLLM_USE_BREAKABLE_CUDAGRAPH=0) and an FP8 KV cache (--kv-cache-dtype fp8).
 minimaxm3-fp8-mi325x-vllm-mtp:
-  image: vllm/vllm-openai-rocm:minimax-m3
+  image: vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: mi325x
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh
index 4ba15e761..2dd51653c 100644
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh
@@ -8,9 +8,7 @@
 # the text-only benchmark, --attention-backend TRITON_ATTN, and
 # --no-enable-prefix-caching. Runs with CUDA graphs (no --enforce-eager);
 # VLLM_USE_BREAKABLE_CUDAGRAPH=0 avoids the M3-decode breakable-cudagraph path.
-# The default BF16 KV cache is retained (unlike the MI355X recipe's FP8 KV
-# cache): gfx942 has no calibrated q/prob scales for ROCm FP8 attention and
-# vLLM's fallback scale of 1.0 corrupts accuracy.
+# FP8 KV cache reduces memory pressure and increases concurrency headroom.
 #
 # Unlike the CUDA recipes, the drafter needs no attention_backend override:
 # the FlashInfer "page size 128 requires GQA/MQA" limitation that forced
@@ -18,15 +16,9 @@
 # Here the whole server runs on TRITON_ATTN (set globally below), which serves
 # the MHA draft fine.
 #
-# [AI generated draft test] The shipped vllm/vllm-openai-rocm:minimax-m3 image
-# does NOT implement SupportsEagle3 on the AMD MiniMax-M3 model, so EAGLE3
-# engine init fails with "Model does not support EAGLE3 interface but
-# aux_hidden_state_outputs was requested". This recipe applies that fix
-# (functionstackx/vllm#1 — ported from nvidia/model.py, upstreamed as
-# vllm-project/vllm#45546) in-place to the installed vllm before serving, so we
-# can validate EAGLE3 on real MI325X hardware ahead of an image rebuild. The
-# same patch is validated green on MI355X. It is idempotent and fails the job
-# loudly if the installed amd/model.py has drifted from the expected base.
+# Keep the SupportsEagle3 compatibility guard for older images. It exits
+# immediately when the installed AMD MiniMax-M3 model already has the upstream
+# interface and otherwise applies the validated compatibility patch.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
@@ -175,6 +167,7 @@ set -x
 vllm serve "$MODEL" --port "$PORT" \
     "${PARALLEL_ARGS[@]}" \
     --block-size 128 \
+    --kv-cache-dtype fp8 \
     --no-enable-prefix-caching \
     --language-model-only \
     --max-model-len "$MAX_MODEL_LEN" \
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 06a81eaf1..31d799b5a 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3950,3 +3950,11 @@
     - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)"
     - "Update Applied TBO on high concurrencies"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717
+
+- config-keys:
+    - minimaxm3-fp8-mi325x-vllm-mtp
+  description:
+    - "Update the MI325X MiniMax-M3 EAGLE3 vLLM image to vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a"
+    - "Use FP8 KV cache"
+    - "Exclude chi-mi325x-pod2-120, which lacks the required populated /raid/hf-hub-cache"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1838
diff --git a/runners/launch_mi325x-amds.sh b/runners/launch_mi325x-amds.sh
index e1f852715..17ea12613 100644
--- a/runners/launch_mi325x-amds.sh
+++ b/runners/launch_mi325x-amds.sh
@@ -13,7 +13,8 @@ SPEC_SUFFIX=$([[ "${SPEC_DECODING:-}" == "mtp" ]] && printf '_mtp' || printf '')
 
 set -x
 
-JOB_ID=$(set +o pipefail; salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
+# Exclude chi-mi325x-pod2-120: it lacks the populated /raid/hf-hub-cache required by the launcher.
+JOB_ID=$(set +o pipefail; salloc --partition=$PARTITION --exclude=chi-mi325x-pod2-120.ord.vultr.cpe.ice.amd.com --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
 
 if [ -z "$JOB_ID" ]; then
     echo "ERROR: salloc failed to allocate a job" >&2

From 540e3c1b1cb1655add9340c999136854b8794042 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 18 Jun 2026 21:30:26 -0500
Subject: [PATCH 2/3] fix: preserve perf changelog history

---
 perf-changelog.yaml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 30c58e4ed..4c0d4bbcb 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3910,7 +3910,6 @@
     - "Use the Marlin MoE backend for MiniMax-M3 B200/B300 TP-only vLLM configurations by adding --moe-backend marlin when expert parallelism is disabled."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1809
   
-
 - config-keys:
     - dsr1-fp8-gb300-dynamo-trt
   description:
@@ -3927,7 +3926,6 @@
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1813
 
   
-
 - config-keys:
     - glm5-fp4-gb300-dynamo-trt
   description:
@@ -3937,7 +3935,6 @@
     - "Runner script launch_gb300-nv.sh: added dynamo-trt-specific glm5-fp4 case with SERVED_MODEL_NAME and SRT_SLURM_MODEL_PREFIX=nvidia/GLM-5-NVFP4"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1798
   
-
 - config-keys:
     - dsv4-fp4-mi355x-atom
   description:
@@ -3946,7 +3943,6 @@
     - "Update Applied TBO on high concurrencies"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717
   
-
 - config-keys:
     - dsv4-fp4-mi355x-atom
   description:

From 1770e9c8c49803d87f8e87da740bcb695e0b3649 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 18 Jun 2026 21:32:43 -0500
Subject: [PATCH 3/3] fix: use upstream MI325X EAGLE support

---
 .../fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh | 92 -------------------
 perf-changelog.yaml                           |  1 +
 2 files changed, 1 insertion(+), 92 deletions(-)

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh
index 2dd51653c..79c52a3d1 100644
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh
@@ -16,10 +16,6 @@
 # Here the whole server runs on TRITON_ATTN (set globally below), which serves
 # the MHA draft fine.
 #
-# Keep the SupportsEagle3 compatibility guard for older images. It exits
-# immediately when the installed AMD MiniMax-M3 model already has the upstream
-# interface and otherwise applies the validated compatibility patch.
-
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
@@ -73,94 +69,6 @@ fi
 # use 3 speculative tokens for all configs for now
 NUM_SPEC_TOKENS=3
 
-# [AI generated draft test] Patch the installed AMD MiniMax-M3 model to add the
-# SupportsEagle3 interface (functionstackx/vllm#1, upstream vllm-project/vllm#45546).
-# Mirrors nvidia/model.py: adds EagleModelMixin to the inner model +
-# aux-hidden-state emission, and SupportsEagle3 to the two outer classes.
-# Idempotent; hard-fails if the installed file has drifted from the expected
-# base (so we never silently run unpatched and mislabel the result).
-python3 - <<'PYEOF' || { echo "EAGLE3 in-place patch failed" >&2; exit 1; }
-import ast, importlib.util, pathlib, sys
-
-spec = importlib.util.find_spec("vllm")
-root = pathlib.Path(spec.submodule_search_locations[0])
-target = root / "models" / "minimax_m3" / "amd" / "model.py"
-src = target.read_text()
-
-if "EagleModelMixin" in src and "class MiniMaxM3Model(nn.Module, EagleModelMixin):" in src:
-    print(f"[eagle3-patch] already applied: {target}")
-    sys.exit(0)
-
-edits = [
-    (
-        "from vllm.model_executor.models.interfaces import (\n"
-        "    MultiModalEmbeddings,\n"
-        "    SupportsMultiModal,\n"
-        ")",
-        "from vllm.model_executor.models.interfaces import (\n"
-        "    EagleModelMixin,\n"
-        "    MultiModalEmbeddings,\n"
-        "    SupportsEagle3,\n"
-        "    SupportsMultiModal,\n"
-        ")",
-    ),
-    (
-        "class MiniMaxM3Model(nn.Module):",
-        "class MiniMaxM3Model(nn.Module, EagleModelMixin):",
-    ),
-    (
-        "        inputs_embeds: torch.Tensor | None = None,\n"
-        "    ) -> torch.Tensor:\n"
-        "        if inputs_embeds is not None:",
-        "        inputs_embeds: torch.Tensor | None = None,\n"
-        "    ) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]:\n"
-        "        if inputs_embeds is not None:",
-    ),
-    (
-        "        residual = None\n\n"
-        "        for layer in self.layers[self.start_layer : self.end_layer]:\n"
-        "            hidden_states, residual = layer(positions, hidden_states, residual)\n\n"
-        "        hidden_states, _ = self.norm(hidden_states, residual)\n"
-        "        return hidden_states",
-        "        residual = None\n\n"
-        "        # EAGLE3 is not yet compatible with pipeline parallel\n"
-        "        aux_hidden_states = self._maybe_add_hidden_state([], 0, hidden_states, residual)\n"
-        "        for idx, layer in enumerate(self.layers[self.start_layer : self.end_layer]):\n"
-        "            hidden_states, residual = layer(positions, hidden_states, residual)\n"
-        "            self._maybe_add_hidden_state(\n"
-        "                aux_hidden_states, idx + 1, hidden_states, residual\n"
-        "            )\n\n"
-        "        hidden_states, _ = self.norm(hidden_states, residual)\n\n"
-        "        if len(aux_hidden_states) > 0:\n"
-        "            return hidden_states, aux_hidden_states\n"
-        "        return hidden_states",
-    ),
-    (
-        "class MiniMaxM3SparseForCausalLM(nn.Module):",
-        "class MiniMaxM3SparseForCausalLM(nn.Module, SupportsEagle3):",
-    ),
-    (
-        "class MiniMaxM3SparseForConditionalGeneration(nn.Module, SupportsMultiModal):",
-        "class MiniMaxM3SparseForConditionalGeneration(\n"
-        "    nn.Module, SupportsMultiModal, SupportsEagle3\n"
-        "):",
-    ),
-]
-
-for old, new in edits:
-    count = src.count(old)
-    if count != 1:
-        sys.exit(
-            f"[eagle3-patch] anchor matched {count} times (expected 1); "
-            f"installed {target} has drifted from the expected base — aborting"
-        )
-    src = src.replace(old, new)
-
-ast.parse(src)
-target.write_text(src)
-print(f"[eagle3-patch] applied EAGLE3 support to {target}")
-PYEOF
-
 start_gpu_monitor
 
 set -x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 4c0d4bbcb..2640e01d2 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3964,5 +3964,6 @@
   description:
     - "Update the MI325X MiniMax-M3 EAGLE3 vLLM image to vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a"
     - "Use FP8 KV cache"
+    - "Remove the legacy in-place EAGLE3 patch; SupportsEagle3 for the AMD MiniMax-M3 model is now included upstream in vLLM"
     - "Exclude chi-mi325x-pod2-120, which lacks the required populated /raid/hf-hub-cache"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1838