SemiAnalysisAI · seungrokj · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026
@@ -2612,6 +2612,58 @@ minimaxm3-fp4-mi355x-atom:
       - { tp: 4, conc-start: 1, conc-end: 256 }
       - { tp: 8, conc-start: 1, conc-end: 2 }
 
+# MiniMax-M3 MXFP8 MI355X ATOM disaggregated recipe: one TP4 prefill worker and
+# one TP4 decode worker (1P1D), swept at ISL/OSL 8192/1024 and 1024/1024.
+minimaxm3-fp8-mi355x-atom-disagg:
+  image: rocm/atom-dev:MiniMax-M3-20260622
+  model: amd/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: mi355x-disagg
+  precision: fp8
+  framework: atom-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 1P1D TP4
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1P1D TP4
+      - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+
 # MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and
 # MI355X serving shape, but retain the default BF16 KV cache because this
 # checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100

diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh
@@ -80,7 +80,11 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do
         extra_flags="--trust-remote-code --tokenizer $MODEL_PATH"
     else
         if [ "$IS_MTP" = "true" ]; then
-            extra_flags="--use-chat-template"
+            if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then
+                extra_flags="--dsv4"
+            else
+                extra_flags="--use-chat-template"
+            fi
         fi
     fi
 

diff --git a/benchmarks/multi_node/amd_utils/env_atom.sh b/benchmarks/multi_node/amd_utils/env_atom.sh
@@ -46,16 +46,18 @@ export LOGLEVEL=WARNING
 # mooncake RDMA KV transfer library path
 export LD_LIBRARY_PATH=/opt/venv/lib/python3.10/site-packages/mooncake:/opt/rocm/lib:${LD_LIBRARY_PATH:-}
 
-# ATOM MoE gather/scatter interleave optimization
-export ATOM_MOE_GU_ITLV=1
 
 # ATOM_HOST_IP is set per-node in server_atom.sh (= host_ip, used as handshake IP)
 
 # aiter logging (WARNING to reduce noise; use DEBUG for troubleshooting)
 export AITER_LOG_LEVEL=WARNING
 
-# Disable bf16->fp8 MoE bound (matches reference script)
-export AITER_BF16_FP8_MOE_BOUND=0
+if [[ "${MODEL_NAME:-}" == "DeepSeek-V4-Pro" ]]; then
+    # ATOM MoE gather/scatter interleave optimization (only for DeepSeek-V4-Pro)
+    export ATOM_MOE_GU_ITLV=1
+    # Disable bf16->fp8 MoE bound (only for DeepSeek-V4-Pro)
+    export AITER_BF16_FP8_MOE_BOUND=0
+fi
 
 # Clear stale ATOM cache on startup (server_atom.sh handles this via rm -rf)
 # No env var needed; documented here for reference.

diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
@@ -363,6 +363,7 @@ DOCKER_ENV_COMMON=(
     -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO
     -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER
     -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY
+    -e BENCH_REQUEST_RATE=\$BENCH_REQUEST_RATE
     -e TQDM_MININTERVAL=\$TQDM_MININTERVAL
     -e DRY_RUN=\$DRY_RUN
     -e BENCHMARK_LOGS_DIR=/benchmark_logs
@@ -411,10 +412,12 @@ elif [[ "$ENGINE" == "atom-disagg" ]]; then
         -e DECODE_PORT=${DECODE_PORT:-8020}
         -e ROUTER_PORT=${ROUTER_PORT:-30000}
         -e HANDSHAKE_PORT=${HANDSHAKE_PORT:-6301}
-        -e MEM_FRACTION=${MEM_FRACTION:-0.85}
+        -e MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.85}
         -e KV_CACHE_DTYPE=${KV_CACHE_DTYPE:-fp8}
         -e BLOCK_SIZE=${BLOCK_SIZE:-16}
         -e MAX_NUM_SEQS=${MAX_NUM_SEQS:-256}
+        -e MAX_MODEL_LEN=${MAX_MODEL_LEN:-}
+        -e MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-}
         -e EXTRA_SERVER_ARGS=\${EXTRA_SERVER_ARGS:-}
         -e IBDEVICES=${IBDEVICES:-}
     )

diff --git a/benchmarks/multi_node/amd_utils/models_atom.yaml b/benchmarks/multi_node/amd_utils/models_atom.yaml
@@ -44,3 +44,13 @@ DeepSeek-V4-Pro:
   base_flags: ""
   mtp_flags: ""
   dp_flags: ""
+
+MiniMax-M3-MXFP4:
+  base_flags: ""
+  mtp_flags: ""
+  dp_flags: ""
+
+MiniMax-M3-MXFP8:
+  base_flags: ""
+  mtp_flags: ""
+  dp_flags: ""
diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -35,17 +35,23 @@ DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}"
 DECODE_ENABLE_EP="${DECODE_ENABLE_EP}"
 DECODE_ENABLE_DP="${DECODE_ENABLE_DP}"
 
+# MTP
+SPEC_DECODING="${SPEC_DECODING:-}"
+DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-1}"
+
 # ATOM server ports (different from SGLang which uses 8000 for all)
 PREFILL_PORT="${PREFILL_PORT:-8010}"
 DECODE_PORT="${DECODE_PORT:-8020}"
 ROUTER_PORT="${ROUTER_PORT:-8000}"
 HANDSHAKE_PORT="${HANDSHAKE_PORT:-6301}"
 
 # ATOM server tuning (from reference script defaults)
-MEM_FRACTION="${MEM_FRACTION:-0.85}"
+MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.85}"
 KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-fp8}"
 BLOCK_SIZE="${BLOCK_SIZE:-16}"
 MAX_NUM_SEQS="${MAX_NUM_SEQS:-256}"
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-}"
+MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-}"
 EXTRA_SERVER_ARGS="${EXTRA_SERVER_ARGS:-}"
 
 # Benchmark Configuration
@@ -100,34 +106,91 @@ for i in $(seq 0 $((yD - 1))); do
     DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$idx]}:${DECODE_PORT}"
 done
 
-echo "Prefill IPs : ${PREFILL_IPS[*]}"
-echo "Decode  IPs : ${DECODE_IPS[*]}"
-
 PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP}"
 PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP}"
 DECODE_ENABLE_EP="${DECODE_ENABLE_EP}"
 DECODE_ENABLE_DP="${DECODE_ENABLE_DP}"
 
+# Prefill parallel args: TP only by default; with DP attention, add EP when EP > 1, else TBO for DeepSeek-V4-Pro
 PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP
 if [ "$PREFILL_ENABLE_DP" = "true" ]; then
     if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #DPA+EP
         PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-expert-parallel --enable-dp-attention )
-    else #DPA+TP
-        PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention )
+    else #TP+DPA without EP; TBO is added for DeepSeek-V4-Pro only
+        if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then #TP+DPA+TBO
+            PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention --enable-tbo )
+            export GPU_MAX_HW_QUEUES=5
+            export ATOM_CPU_AFFINITY=1
+        else #TP+DPA
+            PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention )
+        fi
     fi
 fi 
 
+# (srok), split DPA & TBO cases. The TP-only default uses the decode TP size, not the prefill one.
-DECODE_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP
+DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE") #TP
 if [ "$DECODE_ENABLE_DP" = "true" ]; then
     if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #DPA+EP
         DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-expert-parallel --enable-dp-attention )
-    else #DPA+TP
-        DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention )
+    else #TP+DPA without EP; TBO is added for DeepSeek-V4-Pro only
+        if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then #TP+DPA+TBO
+            DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention --enable-tbo )
+            export GPU_MAX_HW_QUEUES=5
+            export ATOM_CPU_AFFINITY=1
+        else #TP+DPA
+            DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention )
+        fi
     fi
 fi 
 
-echo "Prefill Parallel args : ${PREFILL_PARALLEL_ARGS[*]}"
-echo "Decode  Parallel args : ${DECODE_PARALLEL_ARGS[*]}"
+# MTP (speculative decoding) args; stays empty unless SPEC_DECODING is "mtp"
+SPEC_ARGS=() #no speculative decoding by default
+if [ "$SPEC_DECODING" = "mtp" ]; then
+    SPEC_ARGS=(--method mtp --num-speculative-tokens "$DECODE_MTP_SIZE")
+fi
+
+# HF overrides (single-quoted JSON preserved through eval)
+HF_OVERRIDES_ARG=""
+if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then
+    HF_OVERRIDES_ARG="--hf-overrides '{\"use_index_cache\":true,\"index_topk_freq\":4}'"
+fi
+
+# KV cache dtype (skip if unset or 'auto')
+KV_CACHE_ARG=""
+if [[ -n "$KV_CACHE_DTYPE" && "$KV_CACHE_DTYPE" != "auto" ]]; then
+    KV_CACHE_ARG="--kv_cache_dtype ${KV_CACHE_DTYPE}"
+fi
+
+# Optional model length / batched-token cap
+MODEL_LEN_ARGS=""
+if [[ -n "$MAX_MODEL_LEN" ]]; then
+    MODEL_LEN_ARGS="${MODEL_LEN_ARGS} --max-model-len ${MAX_MODEL_LEN}"
+fi
+if [[ -n "$MAX_NUM_BATCHED_TOKENS" ]]; then
+    MODEL_LEN_ARGS="${MODEL_LEN_ARGS} --max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS}"
+fi
+
+if [[ "$MODEL_NAME" != "DeepSeek-V4-Pro" ]]; then
+      export AITER_QUICK_REDUCE_QUANTIZATION=INT4
+fi
+
+cat <<INFO
+=== Configuration ===
+PREFILL  : ${PREFILL_IPS[*]} (TP=${PREFILL_TP_SIZE}, EP=${PREFILL_ENABLE_EP:-false}, DP=${PREFILL_ENABLE_DP:-false}, port=${PREFILL_PORT})
+DECODE   : ${DECODE_IPS[*]}  (TP=${DECODE_TP_SIZE},  EP=${DECODE_ENABLE_EP:-false},  DP=${DECODE_ENABLE_DP:-false},  port=${DECODE_PORT})
+ROUTER   : port=${ROUTER_PORT}
+MODEL    : ${MODEL_NAME}
+BACKEND  : atom (PD mooncake KV transfer)
+MTP      : spec_decoding=${SPEC_DECODING:-off} num_speculative_tokens=${DECODE_MTP_SIZE}
+xP/yD    : ${xP} / ${yD}
+KV cache : dtype=${KV_CACHE_DTYPE:-auto} block_size=${BLOCK_SIZE} mem_frac=${MEM_FRAC_STATIC}
+Model len: max_model_len=${MAX_MODEL_LEN:-unset} max_num_batched_tokens=${MAX_NUM_BATCHED_TOKENS:-unset}
+Prefill args : ${PREFILL_PARALLEL_ARGS[*]}
+Decode  args : ${DECODE_PARALLEL_ARGS[*]}
+Spec    args : ${SPEC_ARGS[*]}
+Opt     args : ${HF_OVERRIDES_ARG}
+=====================
+INFO
 
 # =============================================================================
 # Node Role Assignment
@@ -153,12 +216,15 @@ if [ "$NODE_RANK" -eq 0 ]; then
         --model ${MODEL_DIR}/${MODEL_NAME} \
         --host 0.0.0.0 --server-port ${PREFILL_PORT} \
         --trust-remote-code \
-        "${PREFILL_PARALLEL_ARGS[@]}" \
-        --kv_cache_dtype ${KV_CACHE_DTYPE} \
+        ${PREFILL_PARALLEL_ARGS[*]} \
+        ${SPEC_ARGS[*]} \
+        ${KV_CACHE_ARG} \
         --block-size ${BLOCK_SIZE} \
-        --gpu-memory-utilization ${MEM_FRACTION} \
+        --gpu-memory-utilization ${MEM_FRAC_STATIC} \
         --max-num-seqs ${MAX_NUM_SEQS} \
+        ${MODEL_LEN_ARGS} \
         --no-enable_prefix_caching \
+        ${HF_OVERRIDES_ARG} \
         --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
         ${EXTRA_SERVER_ARGS}"
 
@@ -248,6 +314,11 @@ if [ "$NODE_RANK" -eq 0 ]; then
 
     cd $ATOM_WS_PATH
 
+    export IS_MTP="false"
+    if [ "$SPEC_DECODING" = "mtp" ]; then
+        export IS_MTP="true"
+    fi
+
     BENCH_CMD="bash $ATOM_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \
         $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \
         ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \
@@ -367,12 +438,15 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
         --model ${MODEL_DIR}/${MODEL_NAME} \
         --host 0.0.0.0 --server-port ${PREFILL_PORT} \
         --trust-remote-code \
-        "${PREFILL_PARALLEL_ARGS[@]}" \
-        --kv_cache_dtype ${KV_CACHE_DTYPE} \
+        ${PREFILL_PARALLEL_ARGS[*]} \
+        ${SPEC_ARGS[*]} \
+        ${KV_CACHE_ARG} \
         --block-size ${BLOCK_SIZE} \
-        --gpu-memory-utilization ${MEM_FRACTION} \
+        --gpu-memory-utilization ${MEM_FRAC_STATIC} \
         --max-num-seqs ${MAX_NUM_SEQS} \
+        ${MODEL_LEN_ARGS} \
         --no-enable_prefix_caching \
+        ${HF_OVERRIDES_ARG} \
         --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
         ${EXTRA_SERVER_ARGS}"
 
@@ -449,12 +523,15 @@ else
         --model ${MODEL_DIR}/${MODEL_NAME} \
         --host 0.0.0.0 --server-port ${DECODE_PORT} \
         --trust-remote-code \
-        "${DECODE_PARALLEL_ARGS[@]}" \
-        --kv_cache_dtype ${KV_CACHE_DTYPE} \
+        ${DECODE_PARALLEL_ARGS[*]} \
+        ${SPEC_ARGS[*]} \
+        ${KV_CACHE_ARG} \
         --block-size ${BLOCK_SIZE} \
-        --gpu-memory-utilization ${MEM_FRACTION} \
+        --gpu-memory-utilization ${MEM_FRAC_STATIC} \
         --max-num-seqs ${DECODE_MAX_NUM_SEQS} \
+        ${MODEL_LEN_ARGS} \
         --no-enable_prefix_caching \
+        ${HF_OVERRIDES_ARG} \
         --kv-transfer-config '{\"kv_role\":\"kv_consumer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
         --cudagraph-capture-sizes "${CUDAGRAPH_SIZES}" \
         ${EXTRA_SERVER_ARGS}"