SemiAnalysisAI · cquil11 · Jun 18, 2026 · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026
@@ -2963,13 +2963,10 @@ minimaxm3-fp8-mi325x-vllm:
 # Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). Same H200-style
 # search space as the non-MTP MI325X entry, trimmed at the extreme-concurrency
 # end with TP-only latency rows started at conc 1 (matching the H200/MI355X MTP
-# recipes). Runs with CUDA graphs (no --enforce-eager, VLLM_USE_BREAKABLE_CUDAGRAPH=0,
-# BF16 KV on gfx942). The shipped ROCm image lacks SupportsEagle3 on the AMD
-# MiniMax-M3 model, so the recipe applies that fix in-place at runtime
-# (functionstackx/vllm#1, upstream vllm-project/vllm#45546; validated green on
-# MI355X/MI300X) before serving.
+# recipes). Runs with CUDA graphs (no --enforce-eager,
+# VLLM_USE_BREAKABLE_CUDAGRAPH=0) and an FP8 KV cache (--kv-cache-dtype fp8).
 minimaxm3-fp8-mi325x-vllm-mtp:
-  image: vllm/vllm-openai-rocm:minimax-m3
+  image: vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: mi325x

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh
@@ -8,26 +8,14 @@
 # the text-only benchmark, --attention-backend TRITON_ATTN, and
 # --no-enable-prefix-caching. Runs with CUDA graphs (no --enforce-eager);
 # VLLM_USE_BREAKABLE_CUDAGRAPH=0 avoids the M3-decode breakable-cudagraph path.
-# The default BF16 KV cache is retained (unlike the MI355X recipe's FP8 KV
-# cache): gfx942 has no calibrated q/prob scales for ROCm FP8 attention and
-# vLLM's fallback scale of 1.0 corrupts accuracy.
+# FP8 KV cache reduces memory pressure and increases concurrency headroom.
 #
 # Unlike the CUDA recipes, the drafter needs no attention_backend override:
 # the FlashInfer "page size 128 requires GQA/MQA" limitation that forced
 # FLASH_ATTN for the EAGLE3 MHA head on Blackwell is FlashInfer/CUDA-specific.
 # Here the whole server runs on TRITON_ATTN (set globally below), which serves
 # the MHA draft fine.
 #
-# [AI generated draft test] The shipped vllm/vllm-openai-rocm:minimax-m3 image
-# does NOT implement SupportsEagle3 on the AMD MiniMax-M3 model, so EAGLE3
-# engine init fails with "Model does not support EAGLE3 interface but
-# aux_hidden_state_outputs was requested". This recipe applies that fix
-# (functionstackx/vllm#1 — ported from nvidia/model.py, upstreamed as
-# vllm-project/vllm#45546) in-place to the installed vllm before serving, so we
-# can validate EAGLE3 on real MI325X hardware ahead of an image rebuild. The
-# same patch is validated green on MI355X. It is idempotent and fails the job
-# loudly if the installed amd/model.py has drifted from the expected base.
-
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars \
@@ -81,100 +69,13 @@ fi
 # use 3 speculative tokens for all configs for now
 NUM_SPEC_TOKENS=3
 
-# [AI generated draft test] Patch the installed AMD MiniMax-M3 model to add the
-# SupportsEagle3 interface (functionstackx/vllm#1, upstream vllm-project/vllm#45546).
-# Mirrors nvidia/model.py: adds EagleModelMixin to the inner model +
-# aux-hidden-state emission, and SupportsEagle3 to the two outer classes.
-# Idempotent; hard-fails if the installed file has drifted from the expected
-# base (so we never silently run unpatched and mislabel the result).
-python3 - <<'PYEOF' || { echo "EAGLE3 in-place patch failed" >&2; exit 1; }
-import ast, importlib.util, pathlib, sys
-
-spec = importlib.util.find_spec("vllm")
-root = pathlib.Path(spec.submodule_search_locations[0])
-target = root / "models" / "minimax_m3" / "amd" / "model.py"
-src = target.read_text()
-
-if "EagleModelMixin" in src and "class MiniMaxM3Model(nn.Module, EagleModelMixin):" in src:
-    print(f"[eagle3-patch] already applied: {target}")
-    sys.exit(0)
-
-edits = [
-    (
-        "from vllm.model_executor.models.interfaces import (\n"
-        "    MultiModalEmbeddings,\n"
-        "    SupportsMultiModal,\n"
-        ")",
-        "from vllm.model_executor.models.interfaces import (\n"
-        "    EagleModelMixin,\n"
-        "    MultiModalEmbeddings,\n"
-        "    SupportsEagle3,\n"
-        "    SupportsMultiModal,\n"
-        ")",
-    ),
-    (
-        "class MiniMaxM3Model(nn.Module):",
-        "class MiniMaxM3Model(nn.Module, EagleModelMixin):",
-    ),
-    (
-        "        inputs_embeds: torch.Tensor | None = None,\n"
-        "    ) -> torch.Tensor:\n"
-        "        if inputs_embeds is not None:",
-        "        inputs_embeds: torch.Tensor | None = None,\n"
-        "    ) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]:\n"
-        "        if inputs_embeds is not None:",
-    ),
-    (
-        "        residual = None\n\n"
-        "        for layer in self.layers[self.start_layer : self.end_layer]:\n"
-        "            hidden_states, residual = layer(positions, hidden_states, residual)\n\n"
-        "        hidden_states, _ = self.norm(hidden_states, residual)\n"
-        "        return hidden_states",
-        "        residual = None\n\n"
-        "        # EAGLE3 is not yet compatible with pipeline parallel\n"
-        "        aux_hidden_states = self._maybe_add_hidden_state([], 0, hidden_states, residual)\n"
-        "        for idx, layer in enumerate(self.layers[self.start_layer : self.end_layer]):\n"
-        "            hidden_states, residual = layer(positions, hidden_states, residual)\n"
-        "            self._maybe_add_hidden_state(\n"
-        "                aux_hidden_states, idx + 1, hidden_states, residual\n"
-        "            )\n\n"
-        "        hidden_states, _ = self.norm(hidden_states, residual)\n\n"
-        "        if len(aux_hidden_states) > 0:\n"
-        "            return hidden_states, aux_hidden_states\n"
-        "        return hidden_states",
-    ),
-    (
-        "class MiniMaxM3SparseForCausalLM(nn.Module):",
-        "class MiniMaxM3SparseForCausalLM(nn.Module, SupportsEagle3):",
-    ),
-    (
-        "class MiniMaxM3SparseForConditionalGeneration(nn.Module, SupportsMultiModal):",
-        "class MiniMaxM3SparseForConditionalGeneration(\n"
-        "    nn.Module, SupportsMultiModal, SupportsEagle3\n"
-        "):",
-    ),
-]
-
-for old, new in edits:
-    count = src.count(old)
-    if count != 1:
-        sys.exit(
-            f"[eagle3-patch] anchor matched {count} times (expected 1); "
-            f"installed {target} has drifted from the expected base — aborting"
-        )
-    src = src.replace(old, new)
-
-ast.parse(src)
-target.write_text(src)
-print(f"[eagle3-patch] applied EAGLE3 support to {target}")
-PYEOF
-
 start_gpu_monitor
 
 set -x
 vllm serve "$MODEL" --port "$PORT" \
     "${PARALLEL_ARGS[@]}" \
     --block-size 128 \
+    --kv-cache-dtype fp8 \
     --no-enable-prefix-caching \
     --language-model-only \
     --max-model-len "$MAX_MODEL_LEN" \

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3963,3 +3963,12 @@
     - "Use FP8 KV cache"
     - "Remove the runtime SupportsEagle3 source patch now included in the pinned nightly"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1843
+
+- config-keys:
+    - minimaxm3-fp8-mi325x-vllm-mtp
+  description:
+    - "Update the MI325X MiniMax-M3 EAGLE3 vLLM image to vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a"
+    - "Use FP8 KV cache"
+    - "Remove the legacy in-place EAGLE3 patch now included upstream in vLLM"
+    - "Exclude chi-mi325x-pod2-120, which lacks the required populated /raid/hf-hub-cache"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1838
diff --git a/runners/launch_mi325x-amds.sh b/runners/launch_mi325x-amds.sh
@@ -13,7 +13,8 @@ SPEC_SUFFIX=$([[ "${SPEC_DECODING:-}" == "mtp" ]] && printf '_mtp' || printf '')
 
 set -x
 
-JOB_ID=$(set +o pipefail; salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
+# Exclude chi-mi325x-pod2-120: it lacks the populated /raid/hf-hub-cache required by the launcher.
+JOB_ID=$(set +o pipefail; salloc --partition=$PARTITION --exclude=chi-mi325x-pod2-120.ord.vultr.cpe.ice.amd.com --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
 
 if [ -z "$JOB_ID" ]; then
     echo "ERROR: salloc failed to allocate a job" >&2