diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 1b54016a8..d59ef5841 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11835,7 +11835,7 @@ minimaxm3-fp8-b300-dynamo-vllm: # DEP8, DEP4. 4 GPU/node (GB300 NVL72). 4p3d (3 decode workers) skipped. # kv-cache-dtype=fp8 added. srun_options mem=0 required. minimaxm3-fp8-gb300-dynamo-vllm: - image: vllm/vllm-openai:nightly-aarch64 + image: vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: gb300-nv diff --git a/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-gb300-vllm-fixes.sh b/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-gb300-vllm-fixes.sh new file mode 100755 index 000000000..c0eed0a51 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-gb300-vllm-fixes.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +set -euo pipefail + +python3 - <<'PYEOF' +from importlib.util import find_spec +from pathlib import Path + +spec = find_spec("vllm") +if not spec or not spec.origin: + raise RuntimeError("vllm is not installed") +root = Path(spec.origin).parent +patches = { + root / "distributed/device_communicators/flashinfer_all_reduce.py": [ + ( + " comm_backend=comm_backend,\n" + " group=group,\n", + " comm_backend=comm_backend,\n" + ' force_oneshot_support=backend == "mnnvl",\n' + " group=group,\n", + ), + ], + root / "models/minimax_m3/nvidia/sparse_attention_msa.py": [ + ( + " prefill_topk = topk[:, nd:num_tokens, :]\n", + " prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n", + ), + ], +} +for path, edits in patches.items(): + source = path.read_text() + for old, new in edits: + if new in source: + continue + if source.count(old) != 1: + raise RuntimeError(f"missing or ambiguous patch anchor in {path}") + source = source.replace(old, new, 1) + path.write_text(source) +PYEOF diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tep8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tep8-3n.yaml index fd79fcee1..4b00b5660 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tep8-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tep8-3n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-1p1d-dep2-tep8-fp8-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml index 59612a695..26fa89b94 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-1p1d-dep2-tp4-marlin-fp8-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p2d-dep2-dep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p2d-dep2-dep4-3n.yaml index d5dc421ad..af5315c76 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p2d-dep2-dep4-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p2d-dep2-dep4-3n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-1p2d-dep2-dep4-fp8-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-dep8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-dep8-3n.yaml index a1c7cdb6c..7cc5f50c4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-dep8-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-dep8-3n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-2p1d-dep2-dep8-fp8-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-tep8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-tep8-3n.yaml index 94709574a..0c4f3498c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-tep8-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-tep8-3n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-2p1d-dep2-tep8-fp8-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml index 049893137..5babf0835 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-2p2d-dep2-tep8-fp8-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml index 95ef6e17d..d4176055a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-3p2d-dep2-tep8-fp8-1k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml index 4bb218ae8..4ee41241e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-1p1d-dep2-tp4-marlin-fp8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p2d-dep2-dep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p2d-dep2-dep8-5n.yaml index 88b923633..b56b65b26 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p2d-dep2-dep8-5n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p2d-dep2-dep8-5n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-1p2d-dep2-dep8-fp8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-dep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-dep8-5n.yaml index 61bbddf4e..7beba3420 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-dep8-5n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-dep8-5n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-2p2d-dep2-dep8-fp8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml index 428943e5f..1ea678ace 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-2p2d-dep2-tep8-fp8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-dep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-dep8-6n.yaml index 7feaa1d18..f4e000a5f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-dep8-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-dep8-6n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-3p2d-dep2-dep8-fp8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml index b9276d154..35950dc32 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-3p2d-dep2-tep8-fp8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-4p2d-dep2-dep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-4p2d-dep2-dep8-6n.yaml index 9d025f69d..1526cd7ad 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-4p2d-dep2-dep8-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-4p2d-dep2-dep8-6n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-4p2d-dep2-dep8-fp8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p2d-dep2-tep8-7n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p2d-dep2-tep8-7n.yaml index 2663a6178..dbc9c5c9a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p2d-dep2-tep8-7n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p2d-dep2-tep8-7n.yaml @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-5p2d-dep2-tep8-fp8-8k1k" model: path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:nightly-aarch64" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" dynamo: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 4265d320b..af01516c1 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4099,3 +4099,11 @@ - "Backport NVIDIA/srt-slurm#38 to sanitize Slurm node-IP discovery output on the pinned submission branch." - "Backport vllm-project/vllm#45879 so NIXL validates heterogeneous-TP KV block lengths using the GQA KV-head ratio." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1893 + +- config-keys: + - minimaxm3-fp8-gb300-dynamo-vllm + description: + - "Update the GB300 MiniMax-M3 Dynamo-vLLM image to vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + - "Use the dedicated ARM64 MiniMax-M3 performance image; benchmark settings unchanged" + - "Allocate FlashInfer MNNVL workspace for one-shot TP8 all-reduce during CUDA graph capture" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1888 diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 66d1fbfe2..93d9eb252 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -106,6 +106,7 @@ export OSL="$OSL" echo "Cloning srt-slurm repository..." RUN_KEY=$(printf "%s" "${RESULT_FILENAME:-${RUNNER_NAME:-gb300-nv}}" | sha1sum | cut -c1-12) SRT_REPO_DIR="${GITHUB_WORKSPACE}/srt-slurm-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-0}-${RUN_KEY}" +SRTCTL_SETUP_SCRIPT="" rm -rf "$SRT_REPO_DIR" if [[ "$IS_AGENTIC" == "1" ]]; then @@ -171,6 +172,10 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" ]]; then git checkout main mkdir -p recipes/vllm/minimax-m3-gb300-fp8 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8" recipes/vllm/minimax-m3-gb300-fp8 + SRTCTL_SETUP_SCRIPT="minimax-m3-gb300-vllm-fixes.sh" + cp \ + "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/configs/$SRTCTL_SETUP_SCRIPT" \ + "configs/$SRTCTL_SETUP_SCRIPT" elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "kimik2.5" && $PRECISION == "fp4" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" @@ -274,12 +279,18 @@ sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" # seq-len recipes still resolve model.path to an NFS-visible location # where the precheck is a useful sanity guard, so keep enforcement on # for them. -PREFLIGHT_FLAG="" +SRTCTL_APPLY_ARGS=( + -f "$CONFIG_FILE" + --tags "gb300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" +) if [[ "$IS_AGENTIC" == "1" ]]; then - PREFLIGHT_FLAG="--no-preflight" + SRTCTL_APPLY_ARGS+=(--no-preflight) +fi +if [[ -n "$SRTCTL_SETUP_SCRIPT" ]]; then + SRTCTL_APPLY_ARGS+=(--setup-script "$SRTCTL_SETUP_SCRIPT") fi -SRTCTL_OUTPUT=$(srtctl apply $PREFLIGHT_FLAG -f "$CONFIG_FILE" --tags "gb300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) +SRTCTL_OUTPUT=$(srtctl apply "${SRTCTL_APPLY_ARGS[@]}" 2>&1) echo "$SRTCTL_OUTPUT" JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+')