From 4d484fb7c87700253737a06f9c55ef7d2baa49fb Mon Sep 17 00:00:00 2001 From: seungrokj Date: Sat, 20 Jun 2026 13:49:55 +0900 Subject: [PATCH 01/13] feat: add minimaxm3-fp8-mi355x-atom and minimaxm3-fp8-mi355x-atom-mtp benchmark scripts and CI configs Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 44 ++++++++++ .../minimaxm3_fp8_mi355x_atom.sh | 79 ++++++++++++++++++ .../minimaxm3_fp8_mi355x_atom_mtp.sh | 80 +++++++++++++++++++ 3 files changed, 203 insertions(+) create mode 100644 benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh create mode 100644 benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 92f8a5609..daeca1443 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2876,6 +2876,50 @@ minimaxm3-fp4-mi355x-atom: - { tp: 4, conc-start: 1, conc-end: 256 } - { tp: 8, conc-start: 1, conc-end: 2 } +minimaxm3-fp8-mi355x-atom: + image: rocm/atom-dev:MiniMax-M3-20260619 + model: amd/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: mi355x + precision: fp8 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 128, conc-end: 256 } + - { tp: 4, conc-start: 1, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 128, conc-end: 256 } + - { tp: 4, conc-start: 1, conc-end: 256 } + - { tp: 8, conc-start: 1, conc-end: 2 } + +minimaxm3-fp8-mi355x-atom-mtp: + image: rocm/atom-dev:MiniMax-M3-20260619 + model: amd/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: mi355x + precision: fp8 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 1, conc-end: 2, spec-decoding: mtp } + # MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and # MI355X serving shape, but retain the default BF16 KV cache because this # checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100 diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh new file mode 100644 index 000000000..95e3f669c --- /dev/null +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME \ + EP_SIZE \ + DP_ATTENTION Expand commentComment on line R14Resolved + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" + +SERVER_LOG=/workspace/server.log + +PARALLEL_ARGS=(-tp "$TP") #TP +if [ "$DP_ATTENTION" = "true" ]; then + if [ "$EP_SIZE" -gt 1 ]; then #DP+EP + PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention ) + else #DP+TP + PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention ) + fi +fi + +SPEC_ARGS=() + +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor +MEM_FRAC_STATIC=0.8 + +set -x +# (srok), not yet +# --kv_cache_dtype fp8 \ +python3 -m atom.entrypoints.openai_server \ + --model $MODEL \ + --server-port $PORT \ + "${PARALLEL_ARGS[@]}" \Expand commentComment on line R45Resolved + "${SPEC_ARGS[@]}" \ + --block-size 128 \ + --gpu-memory-utilization $MEM_FRAC_STATIC \ + --trust-remote-code \ + > $SERVER_LOG 2>&1 &Expand commentComment on line R50ResolvedExpand commentComment on line R50Resolved + +SERVER_PID=$! + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +export PYTHONDONTWRITEBYTECODE=1 +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --trust-remote-code + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x \ No newline at end of file diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh new file mode 100644 index 000000000..55618a4bd --- /dev/null +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME \ + EP_SIZE \ + DP_ATTENTION Expand commentComment on line R14Resolved + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" + +SERVER_LOG=/workspace/server.log + +PARALLEL_ARGS=(-tp "$TP") #TP +if [ "$DP_ATTENTION" = "true" ]; then + if [ "$EP_SIZE" -gt 1 ]; then #DP+EP + PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention ) + else #DP+TP + PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention ) + fi +fi + +SPEC_ARGS=(--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens 3 ) + +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor +MEM_FRAC_STATIC=0.8 + +set -x +# (srok), not yet +# --kv_cache_dtype fp8 \ +python3 -m atom.entrypoints.openai_server \ + --model $MODEL \ + --server-port $PORT \ + "${PARALLEL_ARGS[@]}" \Expand commentComment on line R45Resolved + "${SPEC_ARGS[@]}" \ + --block-size 128 \ + --gpu-memory-utilization $MEM_FRAC_STATIC \ + --trust-remote-code \ + > $SERVER_LOG 2>&1 &Expand commentComment on line R50ResolvedExpand commentComment on line R50Resolved + +SERVER_PID=$! + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +export PYTHONDONTWRITEBYTECODE=1 +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --trust-remote-code \ + --use-chat-template + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x \ No newline at end of file From 45b360cf0eaffd4affee1d91a37b8edb4a983734 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Sat, 20 Jun 2026 13:52:02 +0900 Subject: [PATCH 02/13] chore: add perf-changelog entries for minimaxm3-fp8-mi355x-atom and atom-mtp Co-Authored-By: Claude Sonnet 4.6 --- perf-changelog.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 46bee4d44..ae211ceb0 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3964,6 +3964,15 @@ - "Remove the runtime SupportsEagle3 source patch now included in the pinned nightly" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1843 +- config-keys: + - minimaxm3-fp8-mi355x-atom + - minimaxm3-fp8-mi355x-atom-mtp + description: + - "Add minimaxm3-fp8-mi355x-atom CI recipe: single-node ATOM benchmark for MiniMax-M3-MXFP8 on MI355X" + - "Add minimaxm3-fp8-mi355x-atom-mtp: same with EAGLE3 speculative decoding (3 draft tokens)" + - "Both use rocm/atom-dev:MiniMax-M3-20260619; search space mirrors FP4 atom variants (ISL=1024,8192 OSL=1024 TP2/TP4/TP8)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1867 + - config-keys: - kimik2.5-fp4-gb200-dynamo-trt description: From 4700bd4ebeabaa49b4be8208fb14cb9c630a2e97 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Sat, 20 Jun 2026 13:57:07 +0900 Subject: [PATCH 03/13] fix: remove stray GitHub review comment artifacts from minimaxm3_fp8 atom scripts Co-Authored-By: Claude Sonnet 4.6 --- .../single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh | 6 +++--- .../fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh index 95e3f669c..edf59c7bf 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh @@ -11,7 +11,7 @@ check_env_vars \ RANDOM_RANGE_RATIO \ RESULT_FILENAME \ EP_SIZE \ - DP_ATTENTION Expand commentComment on line R14Resolved + DP_ATTENTION if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -42,12 +42,12 @@ set -x python3 -m atom.entrypoints.openai_server \ --model $MODEL \ --server-port $PORT \ - "${PARALLEL_ARGS[@]}" \Expand commentComment on line R45Resolved + "${PARALLEL_ARGS[@]}" \ "${SPEC_ARGS[@]}" \ --block-size 128 \ --gpu-memory-utilization $MEM_FRAC_STATIC \ --trust-remote-code \ - > $SERVER_LOG 2>&1 &Expand commentComment on line R50ResolvedExpand commentComment on line R50Resolved + > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh index 55618a4bd..05eea3517 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh @@ -11,7 +11,7 @@ check_env_vars \ RANDOM_RANGE_RATIO \ RESULT_FILENAME \ EP_SIZE \ - DP_ATTENTION Expand commentComment on line R14Resolved + DP_ATTENTION if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -42,12 +42,12 @@ set -x python3 -m atom.entrypoints.openai_server \ --model $MODEL \ --server-port $PORT \ - "${PARALLEL_ARGS[@]}" \Expand commentComment on line R45Resolved + "${PARALLEL_ARGS[@]}" \ "${SPEC_ARGS[@]}" \ --block-size 128 \ --gpu-memory-utilization $MEM_FRAC_STATIC \ --trust-remote-code \ - > $SERVER_LOG 2>&1 &Expand commentComment on line R50ResolvedExpand commentComment on line R50Resolved + > $SERVER_LOG 2>&1 & SERVER_PID=$! From e17cbaa167d0478b3312236d10928776ef355a0c Mon Sep 17 00:00:00 2001 From: seungrokj Date: Sat, 20 Jun 2026 14:50:57 +0900 Subject: [PATCH 04/13] fix: use MiniMaxAI/MiniMax-M3-MXFP8 model id for minimaxm3-fp8-mi355x-atom-mtp Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index daeca1443..e4676e0ff 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2900,7 +2900,7 @@ minimaxm3-fp8-mi355x-atom: minimaxm3-fp8-mi355x-atom-mtp: image: rocm/atom-dev:MiniMax-M3-20260619 - model: amd/MiniMax-M3-MXFP8 + model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi355x precision: fp8 From a068fde02f8ac76df3e7d65477e7304101f5acd0 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Sat, 20 Jun 2026 14:53:16 +0900 Subject: [PATCH 05/13] fix: correct model id in amd-master.yaml for minimaxm3-fp8-mi355x-atom-mtp Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index e4676e0ff..c202bd8b5 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2878,7 +2878,7 @@ minimaxm3-fp4-mi355x-atom: minimaxm3-fp8-mi355x-atom: image: rocm/atom-dev:MiniMax-M3-20260619 - model: amd/MiniMax-M3-MXFP8 + model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi355x precision: fp8 From 196961a1127cd3770c64e64a195ae8561ed67bf0 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Sat, 20 Jun 2026 15:21:47 +0900 Subject: [PATCH 06/13] fix: trim minimaxm3-fp8-mi355x-atom and atom-mtp search spaces to TP4-only configs Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index c202bd8b5..efe594525 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2889,14 +2889,11 @@ minimaxm3-fp8-mi355x-atom: - isl: 1024 osl: 1024 search-space: - - { tp: 2, conc-start: 128, conc-end: 256 } - { tp: 4, conc-start: 1, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 2, conc-start: 128, conc-end: 256 } - { tp: 4, conc-start: 1, conc-end: 256 } - - { tp: 8, conc-start: 1, conc-end: 2 } minimaxm3-fp8-mi355x-atom-mtp: image: rocm/atom-dev:MiniMax-M3-20260619 @@ -2911,14 +2908,11 @@ minimaxm3-fp8-mi355x-atom-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp } - - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } + - { tp: 2, conc-start: 1, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp } - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 1, conc-end: 2, spec-decoding: mtp } # MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and # MI355X serving shape, but retain the default BF16 KV cache because this From e016169137c18146456ff876e3875d19cb66c181 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Sat, 20 Jun 2026 15:25:52 +0900 Subject: [PATCH 07/13] fix: correct minimaxm3-fp8-mi355x-atom-mtp ISL=1024 search space to tp4 Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index efe594525..25fb1558e 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2908,7 +2908,7 @@ minimaxm3-fp8-mi355x-atom-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 2, conc-start: 1, conc-end: 256, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: From 5c44abf815f3e045cb6a99b8938a37bc3e8d8918 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 23 Jun 2026 10:52:38 +0900 Subject: [PATCH 08/13] fix: bump minimaxm3-fp8-mi355x-atom and atom-mtp images to MiniMax-M3-20260622 Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 6086870cf..38223f547 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2613,7 +2613,7 @@ minimaxm3-fp4-mi355x-atom: - { tp: 8, conc-start: 1, conc-end: 2 } minimaxm3-fp8-mi355x-atom: - image: rocm/atom-dev:MiniMax-M3-20260619 + image: rocm/atom-dev:MiniMax-M3-20260622 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi355x @@ -2632,7 +2632,7 @@ minimaxm3-fp8-mi355x-atom: - { tp: 4, conc-start: 1, conc-end: 256 } minimaxm3-fp8-mi355x-atom-mtp: - image: rocm/atom-dev:MiniMax-M3-20260619 + image: rocm/atom-dev:MiniMax-M3-20260622 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi355x From d2cd74ebcc749d2e2c412c9425b1cf4590272d97 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 23 Jun 2026 12:03:55 +0900 Subject: [PATCH 09/13] fix: disable prefix caching for minimaxm3-fp8-mi355x-atom and atom-mtp Co-Authored-By: Claude Sonnet 4.6 --- .../single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh | 1 + .../single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh index edf59c7bf..e7a2a0757 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh @@ -47,6 +47,7 @@ python3 -m atom.entrypoints.openai_server \ --block-size 128 \ --gpu-memory-utilization $MEM_FRAC_STATIC \ --trust-remote-code \ + --no-enable_prefix_caching \ > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh index 05eea3517..5b9a97a19 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh @@ -47,6 +47,7 @@ python3 -m atom.entrypoints.openai_server \ --block-size 128 \ --gpu-memory-utilization $MEM_FRAC_STATIC \ --trust-remote-code \ + --no-enable_prefix_caching \ > $SERVER_LOG 2>&1 & SERVER_PID=$! From 5a0d2cfe61ecb8b797616bb2cd60f2eabf910d21 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 23 Jun 2026 12:36:11 +0900 Subject: [PATCH 10/13] fix: update minimaxm3-fp8-mi355x-atom scripts to align with fp4 recipe - Add AITER_QUICK_REDUCE_QUANTIZATION=INT4, MAX_MODEL_LEN=32768, MAX_NUM_BATCHED_TOKENS=32768, MAX_NUM_SEQS=128 - Pass --max-model-len, --max-num-batched-tokens, --max-num-seqs to server - Conditional --use-chat-template based on SPEC_ARGS - Add trailing newline Co-Authored-By: Claude Sonnet 4.6 --- .../fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh | 11 +++++++++-- .../fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh | 12 +++++++++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh index e7a2a0757..9bf74b6e6 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh @@ -37,6 +37,10 @@ start_gpu_monitor MEM_FRAC_STATIC=0.8 set -x +export AITER_QUICK_REDUCE_QUANTIZATION=INT4 +export MAX_MODEL_LEN=32768 +export MAX_NUM_BATCHED_TOKENS=32768 +export MAX_NUM_SEQS=128 # (srok), not yet # --kv_cache_dtype fp8 \ python3 -m atom.entrypoints.openai_server \ @@ -46,6 +50,9 @@ python3 -m atom.entrypoints.openai_server \ "${SPEC_ARGS[@]}" \ --block-size 128 \ --gpu-memory-utilization $MEM_FRAC_STATIC \ + --max-model-len $MAX_MODEL_LEN \ + --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ + --max-num-seqs $MAX_NUM_SEQS \ --trust-remote-code \ --no-enable_prefix_caching \ > $SERVER_LOG 2>&1 & @@ -67,7 +74,7 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ \ - --trust-remote-code + --trust-remote-code $( [[ ${#SPEC_ARGS[@]} -gt 0 ]] && echo "--use-chat-template" ) # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -77,4 +84,4 @@ fi # Stop GPU monitoring stop_gpu_monitor -set +x \ No newline at end of file +set +x diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh index 5b9a97a19..0ee78f504 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh @@ -37,6 +37,10 @@ start_gpu_monitor MEM_FRAC_STATIC=0.8 set -x +export AITER_QUICK_REDUCE_QUANTIZATION=INT4 +export MAX_MODEL_LEN=32768 +export MAX_NUM_BATCHED_TOKENS=32768 +export MAX_NUM_SEQS=128 # (srok), not yet # --kv_cache_dtype fp8 \ python3 -m atom.entrypoints.openai_server \ @@ -46,6 +50,9 @@ python3 -m atom.entrypoints.openai_server \ "${SPEC_ARGS[@]}" \ --block-size 128 \ --gpu-memory-utilization $MEM_FRAC_STATIC \ + --max-model-len $MAX_MODEL_LEN \ + --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ + --max-num-seqs $MAX_NUM_SEQS \ --trust-remote-code \ --no-enable_prefix_caching \ > $SERVER_LOG 2>&1 & @@ -67,8 +74,7 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ \ - --trust-remote-code \ - --use-chat-template + --trust-remote-code $( [[ ${#SPEC_ARGS[@]} -gt 0 ]] && echo "--use-chat-template" ) # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -78,4 +84,4 @@ fi # Stop GPU monitoring stop_gpu_monitor -set +x \ No newline at end of file +set +x From 45dfb13d435a549fe2dac8f0328dc4054ca88050 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 23 Jun 2026 14:36:28 +0900 Subject: [PATCH 11/13] fix: bump MAX_NUM_SEQS to 256 for minimaxm3-fp8-mi355x-atom scripts Co-Authored-By: Claude Sonnet 4.6 --- .../single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh | 2 +- .../single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh index 9bf74b6e6..6469949bc 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh @@ -40,7 +40,7 @@ set -x export AITER_QUICK_REDUCE_QUANTIZATION=INT4 export MAX_MODEL_LEN=32768 export MAX_NUM_BATCHED_TOKENS=32768 -export MAX_NUM_SEQS=128 +export MAX_NUM_SEQS=256 # (srok), not yet # --kv_cache_dtype fp8 \ python3 -m atom.entrypoints.openai_server \ diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh index 0ee78f504..fc0619b7b 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom_mtp.sh @@ -40,7 +40,7 @@ set -x export AITER_QUICK_REDUCE_QUANTIZATION=INT4 export MAX_MODEL_LEN=32768 export MAX_NUM_BATCHED_TOKENS=32768 -export MAX_NUM_SEQS=128 +export MAX_NUM_SEQS=256 # (srok), not yet # --kv_cache_dtype fp8 \ python3 -m atom.entrypoints.openai_server \ From 75572128f3dcb03365999798d87b66bd0a0da4fa Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 23 Jun 2026 17:10:52 +0900 Subject: [PATCH 12/13] fix: cap minimaxm3-fp8-mi355x-atom search-space conc-end to 128 Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 38223f547..76dc8894b 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2625,11 +2625,11 @@ minimaxm3-fp8-mi355x-atom: - isl: 1024 osl: 1024 search-space: - - { tp: 4, conc-start: 1, conc-end: 256 } + - { tp: 4, conc-start: 1, conc-end: 128 } - isl: 8192 osl: 1024 search-space: - - { tp: 4, conc-start: 1, conc-end: 256 } + - { tp: 4, conc-start: 1, conc-end: 128 } minimaxm3-fp8-mi355x-atom-mtp: image: rocm/atom-dev:MiniMax-M3-20260622 From 0d1ed51dfbe19c74ec6f4fa6d8d0590141ab856e Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 23 Jun 2026 17:14:31 +0900 Subject: [PATCH 13/13] fix: remove minimaxm3-fp8-mi355x-atom recipe, script, and perf-changelog entry Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 19 ---- .../minimaxm3_fp8_mi355x_atom.sh | 87 ------------------- perf-changelog.yaml | 2 - 3 files changed, 108 deletions(-) delete mode 100644 benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 76dc8894b..3caa5faae 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2612,25 +2612,6 @@ minimaxm3-fp4-mi355x-atom: - { tp: 4, conc-start: 1, conc-end: 256 } - { tp: 8, conc-start: 1, conc-end: 2 } -minimaxm3-fp8-mi355x-atom: - image: rocm/atom-dev:MiniMax-M3-20260622 - model: MiniMaxAI/MiniMax-M3-MXFP8 - model-prefix: minimaxm3 - runner: mi355x - precision: fp8 - framework: atom - multinode: false - scenarios: - fixed-seq-len: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 128 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 128 } - minimaxm3-fp8-mi355x-atom-mtp: image: rocm/atom-dev:MiniMax-M3-20260622 model: MiniMaxAI/MiniMax-M3-MXFP8 diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh deleted file mode 100644 index 6469949bc..000000000 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME \ - EP_SIZE \ - DP_ATTENTION - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" - -SERVER_LOG=/workspace/server.log - -PARALLEL_ARGS=(-tp "$TP") #TP -if [ "$DP_ATTENTION" = "true" ]; then - if [ "$EP_SIZE" -gt 1 ]; then #DP+EP - PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention ) - else #DP+TP - PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention ) - fi -fi - -SPEC_ARGS=() - -# Start GPU monitoring (power, temperature, clocks every second) -start_gpu_monitor -MEM_FRAC_STATIC=0.8 - -set -x -export AITER_QUICK_REDUCE_QUANTIZATION=INT4 -export MAX_MODEL_LEN=32768 -export MAX_NUM_BATCHED_TOKENS=32768 -export MAX_NUM_SEQS=256 -# (srok), not yet -# --kv_cache_dtype fp8 \ -python3 -m atom.entrypoints.openai_server \ - --model $MODEL \ - --server-port $PORT \ - "${PARALLEL_ARGS[@]}" \ - "${SPEC_ARGS[@]}" \ - --block-size 128 \ - --gpu-memory-utilization $MEM_FRAC_STATIC \ - --max-model-len $MAX_MODEL_LEN \ - --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ - --max-num-seqs $MAX_NUM_SEQS \ - --trust-remote-code \ - --no-enable_prefix_caching \ - > $SERVER_LOG 2>&1 & - -SERVER_PID=$! - -# Wait for server to be ready -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -export PYTHONDONTWRITEBYTECODE=1 -run_benchmark_serving \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --trust-remote-code $( [[ ${#SPEC_ARGS[@]} -gt 0 ]] && echo "--use-chat-template" ) - -# After throughput, run evaluation only if RUN_EVAL is true -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -# Stop GPU monitoring -stop_gpu_monitor -set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index bd687c26a..5e2b17560 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4002,10 +4002,8 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1843 - config-keys: - - minimaxm3-fp8-mi355x-atom - minimaxm3-fp8-mi355x-atom-mtp description: - - "Add minimaxm3-fp8-mi355x-atom CI recipe: single-node ATOM benchmark for MiniMax-M3-MXFP8 on MI355X" - "Add minimaxm3-fp8-mi355x-atom-mtp: same with EAGLE3 speculative decoding (3 draft tokens)" - "Both use rocm/atom-dev:MiniMax-M3-20260619; search space mirrors FP4 atom variants (ISL=1024,8192 OSL=1024 TP2/TP4/TP8)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1867