diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 92f8a5609..caa2de85b 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2876,12 +2876,10 @@ minimaxm3-fp4-mi355x-atom: - { tp: 4, conc-start: 1, conc-end: 256 } - { tp: 8, conc-start: 1, conc-end: 2 } -# MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and -# MI355X serving shape, but retain the default BF16 KV cache because this -# checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100 -# search space: TP8 for latency and TP8+EP8 (TEP) at high concurrency. +# MiniMax-M3 MXFP8 MI300X recipe. Use the TP8-only H100 search space: TP8 for +# latency and TP8+EP8 (TEP) at high concurrency. minimaxm3-fp8-mi300x-vllm: - image: vllm/vllm-openai-rocm:minimax-m3 + image: vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi300x diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh index f2cdaf284..8566c5185 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh @@ -1,10 +1,8 @@ #!/usr/bin/env bash # MiniMax-M3 MXFP8 MI300X (gfx942) single-node vLLM recipe. -# Reuses the dedicated ROCm image and the MI355X serving shape. Block size 128 -# is mandatory for MSA sparse attention. Keep the default BF16 KV cache on -# gfx942: the checkpoint has no calibrated q/prob scales for ROCm FP8 -# attention, and vLLM's fallback scale of 1.0 corrupts model accuracy. +# Block size 128 is mandatory for MSA sparse attention. Use FP8 KV cache to +# reduce memory pressure and increase the available concurrency headroom. source "$(dirname "$0")/../../benchmark_lib.sh" @@ -55,6 +53,7 @@ set -x vllm serve "$MODEL" --port "$PORT" \ "${PARALLEL_ARGS[@]}" \ --block-size 128 \ + --kv-cache-dtype fp8 \ --no-enable-prefix-caching \ --language-model-only \ --max-model-len "$MAX_MODEL_LEN" \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 46ce8a77c..464588329 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3963,3 +3963,12 @@ - "Use FP8 KV cache" - "Remove the runtime SupportsEagle3 source patch now included in the pinned nightly" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1843 + +- config-keys: + - minimaxm3-fp8-mi300x-vllm + description: + - "Reopen #1837 to run and ingest the MI300X MiniMax-M3 STP sweep from main" + - "Update the MI300X MiniMax-M3 vLLM image to vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a" + - "Use FP8 KV cache" + - "Exclude unprovisioned chi-mi300x-121 from Slurm allocation" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1858 diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index b0c1e22c8..fe1843bf5 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -15,7 +15,8 @@ set -x # Exclude known-bad nodes; let Slurm pick from anything else: # chi-mi300x-049: persistent /nvme_home disk-full -JOB_ID=$(salloc --partition=$PARTITION --exclude=chi-mi300x-049 --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') +# chi-mi300x-121: missing required Enroot and RAID storage provisioning +JOB_ID=$(salloc --partition=$PARTITION --exclude=chi-mi300x-049,chi-mi300x-121 --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') if [ -z "$JOB_ID" ]; then echo "ERROR: salloc failed to allocate a job"