Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11835,7 +11835,7 @@ minimaxm3-fp8-b300-dynamo-vllm:
# DEP8, DEP4. 4 GPU/node (GB300 NVL72). 4p3d (3 decode workers) skipped.
# kv-cache-dtype=fp8 added. srun_options mem=0 required.
minimaxm3-fp8-gb300-dynamo-vllm:
image: vllm/vllm-openai:nightly-aarch64
image: vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: gb300-nv
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env bash
# Patch the installed vllm package in place with exact-string replacements.
#
# The embedded Python program:
#   1. locates the directory of the importable `vllm` package,
#   2. computes the patched text of every target file in memory, and
#   3. writes back only the files whose text actually changed.
#
# Re-running the script is a no-op once every replacement is present. The
# script exits non-zero (via `set -e` and an uncaught Python exception) when
# vllm cannot be found, a target file cannot be read, or an anchor does not
# occur exactly once.
set -euo pipefail

python3 - <<'PYEOF'
from importlib.util import find_spec
from pathlib import Path

spec = find_spec("vllm")
if not spec or not spec.origin:
    raise RuntimeError("vllm is not installed")
# spec.origin is a file inside the package; its parent is the package root.
root = Path(spec.origin).parent

# Maps each file to patch onto an ordered list of (anchor, replacement) pairs.
# NOTE(review): anchors are matched as exact substrings, whitespace included;
# confirm the leading indentation of these literals against the vllm sources
# shipped in the target image.
patches = {
    # Inserts a `force_oneshot_support=backend == "mnnvl"` argument line
    # between the existing `comm_backend` and `group` argument lines.
    root / "distributed/device_communicators/flashinfer_all_reduce.py": [
        (
            " comm_backend=comm_backend,\n"
            " group=group,\n",
            " comm_backend=comm_backend,\n"
            ' force_oneshot_support=backend == "mnnvl",\n'
            " group=group,\n",
        ),
    ],
    # Appends `.contiguous()` to the `prefill_topk` slice expression.
    root / "models/minimax_m3/nvidia/sparse_attention_msa.py": [
        (
            " prefill_topk = topk[:, nd:num_tokens, :]\n",
            " prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n",
        ),
    ],
}

# Phase 1: validate and apply every edit in memory. Nothing on disk is
# touched yet, so a missing or ambiguous anchor in any file aborts the run
# without leaving the package half-patched.
pending = {}
for path, edits in patches.items():
    # Pin UTF-8 rather than relying on the locale's default encoding.
    original = path.read_text(encoding="utf-8")
    source = original
    for old, new in edits:
        if new in source:
            # Replacement text already present: this edit was applied before.
            continue
        if source.count(old) != 1:
            raise RuntimeError(f"missing or ambiguous patch anchor in {path}")
        source = source.replace(old, new, 1)
    if source != original:
        pending[path] = source

# Phase 2: write back only the files that actually changed.
for path, source in pending.items():
    path.write_text(source, encoding="utf-8")
PYEOF
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-1p1d-dep2-tep8-fp8-1k1k"

model:
path: "minimax-m3-mxfp8"
container: "vllm/vllm-openai:nightly-aarch64"
container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
precision: "fp8"

dynamo:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-1p1d-dep2-tp4-marlin-fp8-1k1k"

model:
path: "minimax-m3-mxfp8"
container: "vllm/vllm-openai:nightly-aarch64"
container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
precision: "fp8"

dynamo:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-1p2d-dep2-dep4-fp8-1k1k"

model:
path: "minimax-m3-mxfp8"
container: "vllm/vllm-openai:nightly-aarch64"
container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
precision: "fp8"

dynamo:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-2p1d-dep2-dep8-fp8-1k1k"

model:
path: "minimax-m3-mxfp8"
container: "vllm/vllm-openai:nightly-aarch64"
container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
precision: "fp8"

dynamo:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-2p1d-dep2-tep8-fp8-1k1k"

model:
path: "minimax-m3-mxfp8"
container: "vllm/vllm-openai:nightly-aarch64"
container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
precision: "fp8"

dynamo:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-2p2d-dep2-tep8-fp8-1k1k"

model:
path: "minimax-m3-mxfp8"
container: "vllm/vllm-openai:nightly-aarch64"
container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
precision: "fp8"

dynamo:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-3p2d-dep2-tep8-fp8-1k1k"

model:
path: "minimax-m3-mxfp8"
container: "vllm/vllm-openai:nightly-aarch64"
container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
precision: "fp8"

dynamo:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-1p1d-dep2-tp4-marlin-fp8-8k1k"

model:
path: "minimax-m3-mxfp8"
container: "vllm/vllm-openai:nightly-aarch64"
container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
precision: "fp8"

dynamo:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-1p2d-dep2-dep8-fp8-8k1k"

model:
path: "minimax-m3-mxfp8"
container: "vllm/vllm-openai:nightly-aarch64"
container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
precision: "fp8"

dynamo:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-2p2d-dep2-dep8-fp8-8k1k"

model:
path: "minimax-m3-mxfp8"
container: "vllm/vllm-openai:nightly-aarch64"
container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
precision: "fp8"

dynamo:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-2p2d-dep2-tep8-fp8-8k1k"

model:
path: "minimax-m3-mxfp8"
container: "vllm/vllm-openai:nightly-aarch64"
container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
precision: "fp8"

dynamo:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-3p2d-dep2-dep8-fp8-8k1k"

model:
path: "minimax-m3-mxfp8"
container: "vllm/vllm-openai:nightly-aarch64"
container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
precision: "fp8"

dynamo:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-3p2d-dep2-tep8-fp8-8k1k"

model:
path: "minimax-m3-mxfp8"
container: "vllm/vllm-openai:nightly-aarch64"
container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
precision: "fp8"

dynamo:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-4p2d-dep2-dep8-fp8-8k1k"

model:
path: "minimax-m3-mxfp8"
container: "vllm/vllm-openai:nightly-aarch64"
container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
precision: "fp8"

dynamo:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb300-5p2d-dep2-tep8-fp8-8k1k"

model:
path: "minimax-m3-mxfp8"
container: "vllm/vllm-openai:nightly-aarch64"
container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
precision: "fp8"

dynamo:
Expand Down
8 changes: 8 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4099,3 +4099,11 @@
- "Backport NVIDIA/srt-slurm#38 to sanitize Slurm node-IP discovery output on the pinned submission branch."
- "Backport vllm-project/vllm#45879 so NIXL validates heterogeneous-TP KV block lengths using the GQA KV-head ratio."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1893

- config-keys:
- minimaxm3-fp8-gb300-dynamo-vllm
description:
- "Update the GB300 MiniMax-M3 Dynamo-vLLM image to vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
- "Use the dedicated ARM64 MiniMax-M3 performance image; benchmark settings unchanged"
- "Allocate FlashInfer MNNVL workspace for one-shot TP8 all-reduce during CUDA graph capture"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1888
17 changes: 14 additions & 3 deletions runners/launch_gb300-nv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ export OSL="$OSL"
echo "Cloning srt-slurm repository..."
RUN_KEY=$(printf "%s" "${RESULT_FILENAME:-${RUNNER_NAME:-gb300-nv}}" | sha1sum | cut -c1-12)
SRT_REPO_DIR="${GITHUB_WORKSPACE}/srt-slurm-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-0}-${RUN_KEY}"
SRTCTL_SETUP_SCRIPT=""
rm -rf "$SRT_REPO_DIR"

if [[ "$IS_AGENTIC" == "1" ]]; then
Expand Down Expand Up @@ -171,6 +172,10 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" ]]; then
git checkout main
mkdir -p recipes/vllm/minimax-m3-gb300-fp8
cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8" recipes/vllm/minimax-m3-gb300-fp8
SRTCTL_SETUP_SCRIPT="minimax-m3-gb300-vllm-fixes.sh"
cp \
"$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/configs/$SRTCTL_SETUP_SCRIPT" \
"configs/$SRTCTL_SETUP_SCRIPT"
elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "kimik2.5" && $PRECISION == "fp4" ]]; then
git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
cd "$SRT_REPO_DIR"
Expand Down Expand Up @@ -274,12 +279,18 @@ sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE"
# seq-len recipes still resolve model.path to an NFS-visible location
# where the precheck is a useful sanity guard, so keep enforcement on
# for them.
PREFLIGHT_FLAG=""
SRTCTL_APPLY_ARGS=(
-f "$CONFIG_FILE"
--tags "gb300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)"
)
if [[ "$IS_AGENTIC" == "1" ]]; then
PREFLIGHT_FLAG="--no-preflight"
SRTCTL_APPLY_ARGS+=(--no-preflight)
fi
if [[ -n "$SRTCTL_SETUP_SCRIPT" ]]; then
SRTCTL_APPLY_ARGS+=(--setup-script "$SRTCTL_SETUP_SCRIPT")
fi

SRTCTL_OUTPUT=$(srtctl apply $PREFLIGHT_FLAG -f "$CONFIG_FILE" --tags "gb300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1)
SRTCTL_OUTPUT=$(srtctl apply "${SRTCTL_APPLY_ARGS[@]}" 2>&1)
echo "$SRTCTL_OUTPUT"

JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+')
Expand Down