diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 0293a5746..86d73ba55 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11882,12 +11882,12 @@ qwen3.5-fp4-b200-trt: # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor # cores on Blackwell. M3 support has not shipped in a stable vLLM release; -# the dedicated vllm/vllm-openai:minimax-m3 image is built from the m3_release -# branch (vllm-project/vllm#45381). --block-size 128 is mandatory (MSA +# the dedicated vllm/vllm-openai:minimax-m3-0618-x86_64-cu130 image is built +# from the m3_release branch (vllm-project/vllm#45381). --block-size 128 is mandatory (MSA # sparse/index cache alignment). Weights are NOT SRE-staged; b300 falls back # to writable /data/models (see launch_b300-nv.sh MODEL_PATH split). minimaxm3-fp8-b300-vllm: - image: vllm/vllm-openai:minimax-m3 + image: vllm/vllm-openai:minimax-m3-0618-x86_64-cu130 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: b300 diff --git a/KLAUD_DEBUG.md b/KLAUD_DEBUG.md index c1a8f9efa..a3b0cb659 100644 --- a/KLAUD_DEBUG.md +++ b/KLAUD_DEBUG.md @@ -249,3 +249,35 @@ raw/aggregate disagreement still fail reuse. - `/merge-prs [...]` — sequential merge via `utils/merge_with_reuse.sh`. Each command file is self-contained; read them to understand the exact jq filters they use. + +--- + +## 11. MiniMax M3 B300 MSA top-k slice is non-contiguous + +**Symptom:** MiniMax M3 fails during MSA kernel warmup with: +``` +ValueError: q2k_indices must be contiguous with layout [head_kv, total_q, topK] +``` +The stack ends in `sparse_attention_msa.py -> build_k2q_csr()`. TP4/TP8 +canaries may pass while TP1 data-parallel-attention jobs fail. + +**Root cause:** `vllm/vllm-openai:minimax-m3-0618-x86_64-cu130` stores top-k +indices in a persistent `[head_kv, max_num_batched_tokens, topK]` buffer for +CUDA graphs. The MSA prefill path slices the token dimension before calling +`build_k2q_csr()`. That view retains the full-buffer head stride and is not +contiguous when a worker has multiple local KV/index heads. Data-parallel +attention forces TP1, exposing all four MiniMax M3 KV/index heads per worker. + +**Workaround:** Before server startup, patch the installed +`vllm/models/minimax_m3/nvidia/sparse_attention_msa.py` assignment from: +```python +prefill_topk = topk[:, nd:num_tokens, :] +``` +to: +```python +prefill_topk = topk[:, nd:num_tokens, :].contiguous() +``` +Use an exact-source guard and remove the workaround once the image includes +the fix. + +Seen on: #1834. diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh index 1f4187c04..c0ee15f1f 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_b300.sh @@ -23,6 +23,39 @@ check_env_vars \ RANDOM_RANGE_RATIO \ RESULT_FILENAME +# The 0618 image keeps MiniMax M3 top-k indices in a persistent +# [head_kv, max_tokens, topK] buffer for CUDA graphs. Slicing that buffer to +# the actual prefill length is non-contiguous when TP leaves multiple local KV +# heads, and the MSA CSR builder rejects it. Materialize the slice until the +# image includes this fix. +python3 - <<'PYEOF' || { echo "MiniMax M3 MSA contiguity patch failed" >&2; exit 1; } +import importlib.util +import pathlib + +spec = importlib.util.find_spec("vllm") +if spec is None or not spec.submodule_search_locations: + raise RuntimeError("Could not locate the installed vllm package") + +target = ( + pathlib.Path(next(iter(spec.submodule_search_locations))) + / "models" + / "minimax_m3" + / "nvidia" + / "sparse_attention_msa.py" +) +src = target.read_text() +old = " prefill_topk = topk[:, nd:num_tokens, :]\n" +new = " prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n" + +if new in src: + print(f"[minimax-m3-msa-patch] already applied: {target}") +elif src.count(old) == 1: + target.write_text(src.replace(old, new, 1)) + print(f"[minimax-m3-msa-patch] patched: {target}") +else: + raise RuntimeError(f"Expected exactly one patch anchor in {target}") +PYEOF + # `hf download` creates the target dir if missing and is itself idempotent. # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE. # Either way, MODEL_PATH is what the server is launched with. @@ -53,7 +86,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then elif [ "$EP_SIZE" -gt 1 ]; then PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel" else - PARALLEL_ARGS="--tensor-parallel-size=$TP --moe-backend marlin" + PARALLEL_ARGS="--tensor-parallel-size=$TP" fi if [ "${EVAL_ONLY}" = "true" ]; then @@ -69,6 +102,9 @@ $PARALLEL_ARGS \ --gpu-memory-utilization 0.90 \ --max-model-len $MAX_MODEL_LEN \ --block-size 128 \ +--attention-config '{"backend": "FLASHINFER", "use_trtllm_attention": true}' \ +--attention-config.indexer_kv_dtype "fp8" \ +--kv-cache-dtype fp8 \ --language-model-only \ --max-cudagraph-capture-size 2048 \ --max-num-batched-tokens "$((ISL * 2 ))" \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 4d99c3d9f..683becb8e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3951,6 +3951,15 @@ - "Update Applied TBO on high concurrencies" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717 +- config-keys: + - minimaxm3-fp8-b300-vllm + description: + - "Update the MiniMax-M3 B300 single-node image to vllm/vllm-openai:minimax-m3-0618-x86_64-cu130." + - "Enable FlashInfer TRT-LLM attention with FP8 indexer KV and KV cache." + - "Switch TP-only configurations from explicit Marlin MoE to the new image's default FlashInfer TRT-LLM MoE backend." + - "Patch the image's MiniMax M3 MSA prefill path to materialize sliced top-k indices before CSR construction." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1834 + - config-keys: - minimaxm3-fp4-mi355x-atom description: