Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
4d484fb
feat: add minimaxm3-fp8-mi355x-atom and minimaxm3-fp8-mi355x-atom-mtp…
seungrokj Jun 20, 2026
45b360c
chore: add perf-changelog entries for minimaxm3-fp8-mi355x-atom and a…
seungrokj Jun 20, 2026
4700bd4
fix: remove stray GitHub review comment artifacts from minimaxm3_fp8 …
seungrokj Jun 20, 2026
e17cbaa
fix: use MiniMaxAI/MiniMax-M3-MXFP8 model id for minimaxm3-fp8-mi355x…
seungrokj Jun 20, 2026
f48400e
Merge branch 'main' into amd/m3_atom_fp8
seungrokj Jun 20, 2026
a068fde
fix: correct model id in amd-master.yaml for minimaxm3-fp8-mi355x-ato…
seungrokj Jun 20, 2026
196961a
fix: trim minimaxm3-fp8-mi355x-atom and atom-mtp search spaces to TP4…
seungrokj Jun 20, 2026
e016169
fix: correct minimaxm3-fp8-mi355x-atom-mtp ISL=1024 search space to tp4
seungrokj Jun 20, 2026
3913e9a
Merge branch 'main' into amd/m3_atom_fp8
seungrokj Jun 20, 2026
f3def1c
Merge branch 'main' into amd/m3_atom_fp8
seungrokj Jun 21, 2026
9f9c9a7
Merge branch 'main' into amd/m3_atom_fp8
seungrokj Jun 23, 2026
5c44abf
fix: bump minimaxm3-fp8-mi355x-atom and atom-mtp images to MiniMax-M3…
seungrokj Jun 23, 2026
d2cd74e
fix: disable prefix caching for minimaxm3-fp8-mi355x-atom and atom-mtp
seungrokj Jun 23, 2026
5a0d2cf
fix: update minimaxm3-fp8-mi355x-atom scripts to align with fp4 recipe
seungrokj Jun 23, 2026
ecd0bef
Merge branch 'main' into amd/m3_atom_fp8
seungrokj Jun 23, 2026
45dfb13
fix: bump MAX_NUM_SEQS to 256 for minimaxm3-fp8-mi355x-atom scripts
seungrokj Jun 23, 2026
60deebe
Merge branch 'main' into amd/m3_atom_fp8
seungrokj Jun 23, 2026
7557212
fix: cap minimaxm3-fp8-mi355x-atom search-space conc-end to 128
seungrokj Jun 23, 2026
0d1ed51
fix: remove minimaxm3-fp8-mi355x-atom recipe, script, and perf-change…
seungrokj Jun 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2612,6 +2612,25 @@ minimaxm3-fp4-mi355x-atom:
- { tp: 4, conc-start: 1, conc-end: 256 }
- { tp: 8, conc-start: 1, conc-end: 2 }

minimaxm3-fp8-mi355x-atom-mtp:
image: rocm/atom-dev:MiniMax-M3-20260622
model: MiniMaxAI/MiniMax-M3-MXFP8
Comment thread
cursor[bot] marked this conversation as resolved.
model-prefix: minimaxm3
runner: mi355x
precision: fp8
framework: atom
multinode: false
scenarios:
fixed-seq-len:
- isl: 1024
osl: 1024
search-space:
- { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp }
Comment thread
seungrokj marked this conversation as resolved.

# MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and
# MI355X serving shape, but retain the default BF16 KV cache because this
# checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#!/usr/bin/env bash
#
# Single-node benchmark recipe: starts the ATOM server
# (atom.entrypoints.openai_server) with EAGLE3 speculative decoding, runs the
# serving benchmark against it and, when RUN_EVAL is "true", an lm-eval pass.
#
# Required environment variables:
#   MODEL, TP, CONC, ISL, OSL, RANDOM_RANGE_RATIO, RESULT_FILENAME,
#   EP_SIZE, DP_ATTENTION, PORT
# Optional environment variables:
#   RUN_EVAL      - set to "true" to run the evaluation step
#   SLURM_JOB_ID  - when set, the job id and node name are logged

source "$(dirname "$0")/../../benchmark_lib.sh"

# PORT is validated together with the other inputs because the server launch,
# the readiness wait, the benchmark client and the eval step all expand it; an
# empty value would otherwise produce a malformed server command line.
# NOTE(review): check_env_vars is presumably provided by benchmark_lib.sh and
# aborts on a missing variable - confirm.
check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME \
  EP_SIZE \
  DP_ATTENTION \
  PORT

# Log the scheduler context when running under Slurm.
if [[ -n "$SLURM_JOB_ID" ]]; then
  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"

# File that receives the server's combined stdout/stderr.
SERVER_LOG=/workspace/server.log

# Parallelism flags for the server. Tensor parallelism is always requested;
# DP attention is added when DP_ATTENTION is "true", and expert parallelism is
# added on top of that only when EP_SIZE is greater than 1.
PARALLEL_ARGS=(-tp "$TP")
if [ "$DP_ATTENTION" = "true" ]; then
  if [ "$EP_SIZE" -gt 1 ]; then
    PARALLEL_ARGS+=(--enable-expert-parallel)
  fi
  PARALLEL_ARGS+=(--enable-dp-attention)
fi

# Speculative-decoding flags: EAGLE3 with a fixed draft model and 3 draft tokens.
SPEC_ARGS=(
  --method eagle3
  --draft-model Inferact/MiniMax-M3-EAGLE3
  --num-speculative-tokens 3
)

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor
# Value handed to the server's --gpu-memory-utilization flag.
MEM_FRAC_STATIC=0.8

# Trace every command from here on; tracing is turned off again with
# `set +x` at the end of the script.
set -x
# NOTE(review): presumably selects INT4 quantization for AITER's quick
# all-reduce path - confirm against the AITER documentation.
export AITER_QUICK_REDUCE_QUANTIZATION=INT4
# Fixed server limits, forwarded to --max-model-len, --max-num-batched-tokens
# and --max-num-seqs respectively. They are assigned unconditionally, so any
# values already present in the environment are overwritten.
export MAX_MODEL_LEN=32768
export MAX_NUM_BATCHED_TOKENS=32768
export MAX_NUM_SEQS=256
# TODO(srok): FP8 KV cache is not enabled yet. When it is, add
#   --kv_cache_dtype fp8
# to the server arguments below.
#
# NOTE(review): --max-model-len uses the MAX_MODEL_LEN value pinned earlier in
# this script (32768) rather than a length derived from the scenario's
# ISL/OSL. A reviewer flagged that sibling recipes use the scenario length and
# that the extra KV reservation may hurt start-up or high-concurrency runs.
# The value is left unchanged here pending confirmation.
#
# Launch the ATOM server in the background with stdout and stderr captured in
# SERVER_LOG. Every expansion is quoted so that an empty or space-containing
# value cannot shift the argument list.
python3 -m atom.entrypoints.openai_server \
  --model "$MODEL" \
  --server-port "$PORT" \
  "${PARALLEL_ARGS[@]}" \
  "${SPEC_ARGS[@]}" \
  --block-size 128 \
  --gpu-memory-utilization "$MEM_FRAC_STATIC" \
  --max-model-len "$MAX_MODEL_LEN" \
  --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
  --max-num-seqs "$MAX_NUM_SEQS" \
  --trust-remote-code \
  --no-enable_prefix_caching \
  > "$SERVER_LOG" 2>&1 &

# PID of the server process that was just placed in the background.
SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready \
  --port "$PORT" \
  --server-log "$SERVER_LOG" \
  --server-pid "$SERVER_PID"

export PYTHONDONTWRITEBYTECODE=1

# The benchmark client gets --use-chat-template only when speculative-decoding
# arguments are configured (SPEC_ARGS is non-empty).
chat_template_flag=()
if [[ ${#SPEC_ARGS[@]} -gt 0 ]]; then
  chat_template_flag=(--use-chat-template)
fi

# Ten prompts per unit of requested concurrency.
num_prompts=$((CONC * 10))

run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$num_prompts" \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir /workspace/ \
  --trust-remote-code "${chat_template_flag[@]}"

# Optional accuracy evaluation: runs after the throughput benchmark and only
# when RUN_EVAL is exactly "true".
case "${RUN_EVAL}" in
  true)
    run_eval --framework lm-eval --port "$PORT"
    append_lm_eval_summary
    ;;
esac

# Stop GPU monitoring, then switch command tracing back off.
stop_gpu_monitor
set +x
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4001,6 +4001,13 @@
- "Remove the runtime SupportsEagle3 source patch now included in the pinned nightly"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1843

- config-keys:
- minimaxm3-fp8-mi355x-atom-mtp
description:
- "Add minimaxm3-fp8-mi355x-atom-mtp: MiniMax-M3 MXFP8 on MI355X with ATOM and EAGLE3 speculative decoding (3 draft tokens)"
- "Uses rocm/atom-dev:MiniMax-M3-20260622; search space is ISL=1024,8192 OSL=1024 at TP4 with concurrency 1-256"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1867

- config-keys:
- minimaxm3-fp8-gb300-dynamo-vllm
description:
Expand Down
Loading