diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 92f8a5609..a825b05de 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2963,13 +2963,10 @@ minimaxm3-fp8-mi325x-vllm: # Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). Same H200-style # search space as the non-MTP MI325X entry, trimmed at the extreme-concurrency # end with TP-only latency rows started at conc 1 (matching the H200/MI355X MTP -# recipes). Runs with CUDA graphs (no --enforce-eager, VLLM_USE_BREAKABLE_CUDAGRAPH=0, -# BF16 KV on gfx942). The shipped ROCm image lacks SupportsEagle3 on the AMD -# MiniMax-M3 model, so the recipe applies that fix in-place at runtime -# (functionstackx/vllm#1, upstream vllm-project/vllm#45546; validated green on -# MI355X/MI300X) before serving. +# recipes). Runs with CUDA graphs (no --enforce-eager, +# VLLM_USE_BREAKABLE_CUDAGRAPH=0). minimaxm3-fp8-mi325x-vllm-mtp: - image: vllm/vllm-openai-rocm:minimax-m3 + image: vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi325x diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh index 4ba15e761..79c52a3d1 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi325x_mtp.sh @@ -8,9 +8,7 @@ # the text-only benchmark, --attention-backend TRITON_ATTN, and # --no-enable-prefix-caching. Runs with CUDA graphs (no --enforce-eager); # VLLM_USE_BREAKABLE_CUDAGRAPH=0 avoids the M3-decode breakable-cudagraph path. -# The default BF16 KV cache is retained (unlike the MI355X recipe's FP8 KV -# cache): gfx942 has no calibrated q/prob scales for ROCm FP8 attention and -# vLLM's fallback scale of 1.0 corrupts accuracy. +# FP8 KV cache reduces memory pressure and increases concurrency headroom. # # Unlike the CUDA recipes, the drafter needs no attention_backend override: # the FlashInfer "page size 128 requires GQA/MQA" limitation that forced @@ -18,16 +16,6 @@ # Here the whole server runs on TRITON_ATTN (set globally below), which serves # the MHA draft fine. # -# [AI generated draft test] The shipped vllm/vllm-openai-rocm:minimax-m3 image -# does NOT implement SupportsEagle3 on the AMD MiniMax-M3 model, so EAGLE3 -# engine init fails with "Model does not support EAGLE3 interface but -# aux_hidden_state_outputs was requested". This recipe applies that fix -# (functionstackx/vllm#1 — ported from nvidia/model.py, upstreamed as -# vllm-project/vllm#45546) in-place to the installed vllm before serving, so we -# can validate EAGLE3 on real MI325X hardware ahead of an image rebuild. The -# same patch is validated green on MI355X. It is idempotent and fails the job -# loudly if the installed amd/model.py has drifted from the expected base. - source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ @@ -81,100 +69,13 @@ fi # use 3 speculative tokens for all configs for now NUM_SPEC_TOKENS=3 -# [AI generated draft test] Patch the installed AMD MiniMax-M3 model to add the -# SupportsEagle3 interface (functionstackx/vllm#1, upstream vllm-project/vllm#45546). -# Mirrors nvidia/model.py: adds EagleModelMixin to the inner model + -# aux-hidden-state emission, and SupportsEagle3 to the two outer classes. -# Idempotent; hard-fails if the installed file has drifted from the expected -# base (so we never silently run unpatched and mislabel the result). -python3 - <<'PYEOF' || { echo "EAGLE3 in-place patch failed" >&2; exit 1; } -import ast, importlib.util, pathlib, sys - -spec = importlib.util.find_spec("vllm") -root = pathlib.Path(spec.submodule_search_locations[0]) -target = root / "models" / "minimax_m3" / "amd" / "model.py" -src = target.read_text() - -if "EagleModelMixin" in src and "class MiniMaxM3Model(nn.Module, EagleModelMixin):" in src: - print(f"[eagle3-patch] already applied: {target}") - sys.exit(0) - -edits = [ - ( - "from vllm.model_executor.models.interfaces import (\n" - " MultiModalEmbeddings,\n" - " SupportsMultiModal,\n" - ")", - "from vllm.model_executor.models.interfaces import (\n" - " EagleModelMixin,\n" - " MultiModalEmbeddings,\n" - " SupportsEagle3,\n" - " SupportsMultiModal,\n" - ")", - ), - ( - "class MiniMaxM3Model(nn.Module):", - "class MiniMaxM3Model(nn.Module, EagleModelMixin):", - ), - ( - " inputs_embeds: torch.Tensor | None = None,\n" - " ) -> torch.Tensor:\n" - " if inputs_embeds is not None:", - " inputs_embeds: torch.Tensor | None = None,\n" - " ) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]:\n" - " if inputs_embeds is not None:", - ), - ( - " residual = None\n\n" - " for layer in self.layers[self.start_layer : self.end_layer]:\n" - " hidden_states, residual = layer(positions, hidden_states, residual)\n\n" - " hidden_states, _ = self.norm(hidden_states, residual)\n" - " return hidden_states", - " residual = None\n\n" - " # EAGLE3 is not yet compatible with pipeline parallel\n" - " aux_hidden_states = self._maybe_add_hidden_state([], 0, hidden_states, residual)\n" - " for idx, layer in enumerate(self.layers[self.start_layer : self.end_layer]):\n" - " hidden_states, residual = layer(positions, hidden_states, residual)\n" - " self._maybe_add_hidden_state(\n" - " aux_hidden_states, idx + 1, hidden_states, residual\n" - " )\n\n" - " hidden_states, _ = self.norm(hidden_states, residual)\n\n" - " if len(aux_hidden_states) > 0:\n" - " return hidden_states, aux_hidden_states\n" - " return hidden_states", - ), - ( - "class MiniMaxM3SparseForCausalLM(nn.Module):", - "class MiniMaxM3SparseForCausalLM(nn.Module, SupportsEagle3):", - ), - ( - "class MiniMaxM3SparseForConditionalGeneration(nn.Module, SupportsMultiModal):", - "class MiniMaxM3SparseForConditionalGeneration(\n" - " nn.Module, SupportsMultiModal, SupportsEagle3\n" - "):", - ), -] - -for old, new in edits: - count = src.count(old) - if count != 1: - sys.exit( - f"[eagle3-patch] anchor matched {count} times (expected 1); " - f"installed {target} has drifted from the expected base — aborting" - ) - src = src.replace(old, new) - -ast.parse(src) -target.write_text(src) -print(f"[eagle3-patch] applied EAGLE3 support to {target}") -PYEOF - start_gpu_monitor set -x vllm serve "$MODEL" --port "$PORT" \ "${PARALLEL_ARGS[@]}" \ --block-size 128 \ + --kv-cache-dtype fp8 \ --no-enable-prefix-caching \ --language-model-only \ --max-model-len "$MAX_MODEL_LEN" \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 46ce8a77c..85d30ec69 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3963,3 +3963,12 @@ - "Use FP8 KV cache" - "Remove the runtime SupportsEagle3 source patch now included in the pinned nightly" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1843 + +- config-keys: + - minimaxm3-fp8-mi325x-vllm-mtp + description: + - "Update the MI325X MiniMax-M3 EAGLE3 vLLM image to vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a" + - "Use FP8 KV cache" + - "Remove the legacy in-place EAGLE3 patch now included upstream in vLLM" + - "Exclude chi-mi325x-pod2-120, which lacks the required populated /raid/hf-hub-cache" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1838 diff --git a/runners/launch_mi325x-amds.sh b/runners/launch_mi325x-amds.sh index e1f852715..17ea12613 100644 --- a/runners/launch_mi325x-amds.sh +++ b/runners/launch_mi325x-amds.sh @@ -13,7 +13,8 @@ SPEC_SUFFIX=$([[ "${SPEC_DECODING:-}" == "mtp" ]] && printf '_mtp' || printf '') set -x -JOB_ID=$(set +o pipefail; salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') +# pod2-120 lacks the populated /raid/hf-hub-cache required by the launcher. +JOB_ID=$(set +o pipefail; salloc --partition=$PARTITION --exclude=chi-mi325x-pod2-120.ord.vultr.cpe.ice.amd.com --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') if [ -z "$JOB_ID" ]; then echo "ERROR: salloc failed to allocate a job" >&2