From ea33910cd0cf4b00350da2fc499537062aa113b5 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 19 Jun 2026 17:02:53 +0900
Subject: [PATCH 01/18] [AMD] server_atom: improve config print and cleanup

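- Replace individual echo lines with cat <<INFO heredoc showing
  EP/DP flags, KV cache settings alongside TP/port info
- Minor cleanup in parallel args setup; the DP-attention path without
  expert parallel now also passes the enable-tbo flag and exports
  GPU_MAX_HW_QUEUES=5 and ATOM_CPU_AFFINITY=1
- Add the dsv4-fp4-mi355x-atom-disagg-mtp config and, when
  SPEC_DECODING=mtp, pass SPEC_ARGS (method mtp,
  num-speculative-tokens DECODE_MTP_SIZE) to the additional-prefill
  and decode server commands
- Rename MEM_FRACTION to MEM_FRAC_STATIC in server_atom.sh
- Add OPT_ARGS (hf-overrides) to all three server commands
- Drop the concurrency-derived cudagraph-capture-sizes and
  max-num-seqs arguments from the decode server command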

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               | 90 +++++++++++++++++++
 .../multi_node/amd_utils/server_atom.sh       | 79 +++++++++-------
 .../multi_node/dsv4_fp4_mi355x_atom-disagg.sh |  1 +
 3 files changed, 138 insertions(+), 32 deletions(-)
diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 4e91d8116..e08635ce8 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2785,6 +2785,96 @@ dsv4-fp4-mi355x-atom-disagg:
           additional-settings:
           - "DECODE_NODES=1"
 
+dsv4-fp4-mi355x-atom-disagg-mtp:
+  image: rocm/atom-dev:nightly_202606181332
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: atom-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 2P1D TP8+DPA+TBO+MTP1
+      - spec-decoding: "mtp"
+        conc-list: [ 256, 512, 768, 1024, 2048 ]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=1"
+      # 1P1D TP8+MTP3 
+      - spec-decoding: "mtp"
+        conc-list: [ 4, 8, 16, 32, 64, 128 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
+      # 1P1D TP8+DPA+MTP1 
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - spec-decoding: "mtp"
+        conc-list: [ 64, 128, 256, 512, 1024 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=1"
+      # 1P1D TP8+MTP3 
+      - spec-decoding: "mtp"
+        conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
+
 # MiniMax-M3 MXFP8 MI355X recipe:
 # https://github.com/vllm-project/recipes/commit/2a3728ed9892debfd767a72a58ebc90b33f186e5
 # MXFP8 runs from TP=4 on gfx950; block size 128 is mandatory for MSA.
diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
index 957c84d60..823eb99c1 100644
--- a/benchmarks/multi_node/amd_utils/server_atom.sh
+++ b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -35,6 +35,10 @@ DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}"
 DECODE_ENABLE_EP="${DECODE_ENABLE_EP}"
 DECODE_ENABLE_DP="${DECODE_ENABLE_DP}"
 
+# MTP
+SPEC_DECODING="${SPEC_DECODING:-}"
+DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-1}"
+
 # ATOM server ports (different from SGLang which uses 8000 for all)
 PREFILL_PORT="${PREFILL_PORT:-8010}"
 DECODE_PORT="${DECODE_PORT:-8020}"
@@ -42,7 +46,7 @@ ROUTER_PORT="${ROUTER_PORT:-8000}"
 HANDSHAKE_PORT="${HANDSHAKE_PORT:-6301}"
 
 # ATOM server tuning (from reference script defaults)
-MEM_FRACTION="${MEM_FRACTION:-0.85}"
+MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.85}"
 KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-fp8}"
 BLOCK_SIZE="${BLOCK_SIZE:-16}"
 MAX_NUM_SEQS="${MAX_NUM_SEQS:-256}"
@@ -100,20 +104,20 @@ for i in $(seq 0 $((yD - 1))); do
     DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$idx]}:${DECODE_PORT}"
 done
 
-echo "Prefill IPs : ${PREFILL_IPS[*]}"
-echo "Decode  IPs : ${DECODE_IPS[*]}"
-
 PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP}"
 PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP}"
 DECODE_ENABLE_EP="${DECODE_ENABLE_EP}"
 DECODE_ENABLE_DP="${DECODE_ENABLE_DP}"
 
+# Parallel args
 PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP
 if [ "$PREFILL_ENABLE_DP" = "true" ]; then
     if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #DPA+EP
         PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-expert-parallel --enable-dp-attention )
-    else #DPA+TP
-        PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention )
+    else #TP+DPA+TBO
+        PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention --enable-tbo)
+        export GPU_MAX_HW_QUEUES=5
+        export ATOM_CPU_AFFINITY=1
     fi
 fi 
 
@@ -121,13 +125,38 @@ DECODE_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP
 if [ "$DECODE_ENABLE_DP" = "true" ]; then
     if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #DPA+EP
         DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-expert-parallel --enable-dp-attention )
-    else #DPA+TP
-        DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention )
+    else #TP+DPA+TBO
+        DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention --enable-tbo)
+        export GPU_MAX_HW_QUEUES=5
+        export ATOM_CPU_AFFINITY=1
     fi
 fi 
 
-echo "Prefill Parallel args : ${PREFILL_PARALLEL_ARGS[*]}"
-echo "Decode  Parallel args : ${DECODE_PARALLEL_ARGS[*]}"
+# MTP args
+SPEC_ARGS=() #TP
+if [ "$SPEC_DECODING" = "mtp" ]; then
+    SPEC_ARGS=(--method mtp --num-speculative-tokens "$DECODE_MTP_SIZE")
+fi
+
+# OPT args
+OPT_ARGS=(--hf-overrides '{"use_index_cache": true, "index_topk_freq": 4}')
+
+cat <<INFO
+=== Configuration ===
+PREFILL  : ${PREFILL_IPS[*]} (TP=${PREFILL_TP_SIZE}, EP=${PREFILL_ENABLE_EP:-false}, DP=${PREFILL_ENABLE_DP:-false}, port=${PREFILL_PORT})
+DECODE   : ${DECODE_IPS[*]}  (TP=${DECODE_TP_SIZE},  EP=${DECODE_ENABLE_EP:-false},  DP=${DECODE_ENABLE_DP:-false},  port=${DECODE_PORT})
+ROUTER   : port=${ROUTER_PORT}
+MODEL    : ${MODEL_NAME}
+BACKEND  : atom (PD mooncake KV transfer)
+MTP      : method=mtp num_speculative_tokens=${DECODE_MTP_SIZE}
+xP/yD    : ${xP} / ${yD}
+KV cache : dtype=${KV_CACHE_DTYPE} block_size=${BLOCK_SIZE} mem_frac=${MEM_FRAC_STATIC}
+Prefill args : ${PREFILL_PARALLEL_ARGS[*]}
+Decode  args : ${DECODE_PARALLEL_ARGS[*]}
+Spec    args : ${SPEC_ARGS[*]}
+Opt     args : ${OPT_ARGS[*]}
+=====================
+INFO
 
 # =============================================================================
 # Node Role Assignment
@@ -156,9 +185,10 @@ if [ "$NODE_RANK" -eq 0 ]; then
         "${PREFILL_PARALLEL_ARGS[@]}" \
         --kv_cache_dtype ${KV_CACHE_DTYPE} \
         --block-size ${BLOCK_SIZE} \
-        --gpu-memory-utilization ${MEM_FRACTION} \
+        --gpu-memory-utilization ${MEM_FRAC_STATIC} \
         --max-num-seqs ${MAX_NUM_SEQS} \
         --no-enable_prefix_caching \
+        "${OPT_ARGS[@]}" \
         --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
         ${EXTRA_SERVER_ARGS}"
 
@@ -368,11 +398,13 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
         --host 0.0.0.0 --server-port ${PREFILL_PORT} \
         --trust-remote-code \
         "${PREFILL_PARALLEL_ARGS[@]}" \
+        "${SPEC_ARGS[@]}" \
         --kv_cache_dtype ${KV_CACHE_DTYPE} \
         --block-size ${BLOCK_SIZE} \
-        --gpu-memory-utilization ${MEM_FRACTION} \
+        --gpu-memory-utilization ${MEM_FRAC_STATIC} \
         --max-num-seqs ${MAX_NUM_SEQS} \
         --no-enable_prefix_caching \
+        "${OPT_ARGS[@]}" \
         --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
         ${EXTRA_SERVER_ARGS}"
 
@@ -428,35 +460,18 @@ else
     RANK=$((NODE_RANK - NODE_OFFSET))
     echo "${host_name}:${host_ip} is Decode Node (rank ${RANK})"
 
-    _MAX_CONC=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1)
-    if [[ "$_MAX_CONC" -gt 2048 ]]; then
-        CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512,1024,2048,4096]'
-    elif [[ "$_MAX_CONC" -gt 1024 ]]; then
-        CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512,1024,2048]'
-    elif [[ "$_MAX_CONC" -gt 512 ]]; then
-        CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512,768,1024]'
-    else
-        CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512]'
-    fi
-
-    if [[ "$BENCH_INPUT_LEN" == "1024" && "$BENCH_OUTPUT_LEN" == "1024" ]]; then
-        DECODE_MAX_NUM_SEQS="${_MAX_CONC}"
-    else
-        DECODE_MAX_NUM_SEQS="${MAX_NUM_SEQS}"
-    fi
-
     DECODE_CMD="python3 -m atom.entrypoints.openai_server \
         --model ${MODEL_DIR}/${MODEL_NAME} \
         --host 0.0.0.0 --server-port ${DECODE_PORT} \
         --trust-remote-code \
         "${DECODE_PARALLEL_ARGS[@]}" \
+        "${SPEC_ARGS[@]}" \
         --kv_cache_dtype ${KV_CACHE_DTYPE} \
         --block-size ${BLOCK_SIZE} \
-        --gpu-memory-utilization ${MEM_FRACTION} \
-        --max-num-seqs ${DECODE_MAX_NUM_SEQS} \
+        --gpu-memory-utilization ${MEM_FRAC_STATIC} \
         --no-enable_prefix_caching \
+        "${OPT_ARGS[@]}" \
         --kv-transfer-config '{\"kv_role\":\"kv_consumer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
-        --cudagraph-capture-sizes "${CUDAGRAPH_SIZES}" \
         ${EXTRA_SERVER_ARGS}"
 
     if [[ "$DRY_RUN" -eq 1 ]]; then
diff --git a/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh b/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh
index d17d1a323..4d527dae9 100644
--- a/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh
+++ b/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh
@@ -66,6 +66,7 @@ fi
 # by a list of numbers delimited by 'x'. This is because of how the underlying launch script
 # expects the concurrencies.
 JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
+    $SPEC_DECODING \
     $PREFILL_NUM_WORKERS \
     $DECODE_NODES \
     $DECODE_NUM_WORKERS \

From 027f3f18fe435a5daca49fced16e0fa10a52549a Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 19 Jun 2026 17:04:42 +0900
Subject: [PATCH 02/18] update perf-changelog for
 dsv4-fp4-mi355x-atom-disagg-mtp

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 perf-changelog.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index a232ddb16..83540513d 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1,3 +1,13 @@
+- config-keys:
+    - dsv4-fp4-mi355x-atom-disagg-mtp
+  description:
+    - "Add dsv4-fp4-mi355x-atom-disagg-mtp recipe: multi-node disaggregated PD on MI355X via ATOM with MTP speculative decoding"
+    - "2P1D DPA+TBO+MTP1 sweep at ISL8192 (conc 256-2048)"
+    - "1P1D TP8+MTP3 sweep at ISL8192 (conc 4-128)"
+    - "1P1D TP8+DPA+MTP1 sweep at ISL1024 (conc 64-1024)"
+    - "Image: rocm/atom-dev:nightly_202606181332"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1855
+
 - config-keys:
     - 70b-fp8-*-vllm
   description:

From 2216d11c5fb1e740e0d14584cde050126d414564 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 19 Jun 2026 17:22:32 +0900
Subject: [PATCH 03/18] [AMD] fix DECODE_MTP_SIZE and BENCH_REQUEST_RATE
 propagation in atom-disagg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

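- Export DECODE_MTP_SIZE and SPEC_DECODING in dsv4_fp4_mi355x_atom-disagg.sh
  so they reach server_atom.sh via submit.sh → job.slurm
- Add DECODE_MTP_SIZE to check_env_vars in dsv4_fp4_mi355x_atom-disagg.sh
- Pass BENCH_REQUEST_RATE into Docker container in job.slurm DOCKER_ENV_COMMON
- Rename MEM_FRACTION to MEM_FRAC_STATIC in the atom-disagg Docker env in
  job.slurm, matching the variable name server_atom.sh now reads
- Remove the SPEC_DECODING positional argument from the submit.sh call in
  dsv4_fp4_mi355x_atom-disagg.sh; the value is exported instead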

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/job.slurm            | 3 ++-
 benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh | 5 ++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index c542280c4..f797913eb 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -363,6 +363,7 @@ DOCKER_ENV_COMMON=(
     -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO
     -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER
     -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY
+    -e BENCH_REQUEST_RATE=\$BENCH_REQUEST_RATE
     -e TQDM_MININTERVAL=\$TQDM_MININTERVAL
     -e DRY_RUN=\$DRY_RUN
     -e BENCHMARK_LOGS_DIR=/benchmark_logs
@@ -411,7 +412,7 @@ elif [[ "$ENGINE" == "atom-disagg" ]]; then
         -e DECODE_PORT=${DECODE_PORT:-8020}
         -e ROUTER_PORT=${ROUTER_PORT:-30000}
         -e HANDSHAKE_PORT=${HANDSHAKE_PORT:-6301}
-        -e MEM_FRACTION=${MEM_FRACTION:-0.85}
+        -e MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.85}
         -e KV_CACHE_DTYPE=${KV_CACHE_DTYPE:-fp8}
         -e BLOCK_SIZE=${BLOCK_SIZE:-16}
         -e MAX_NUM_SEQS=${MAX_NUM_SEQS:-256}
diff --git a/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh b/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh
index 4d527dae9..4745eaa92 100644
--- a/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh
+++ b/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh
@@ -8,6 +8,7 @@ check_env_vars \
     OSL \
     IMAGE \
     SPEC_DECODING \
+    DECODE_MTP_SIZE \
     MODEL_PATH \
     PREFILL_NUM_WORKERS \
     PREFILL_TP \
@@ -61,12 +62,14 @@ else
 export DECODE_ENABLE_DP=false
 fi
 
+export SPEC_DECODING="${SPEC_DECODING}"
+export DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}"
+
 # Launch jobs based on ISL/OSL
 # Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented
 # by a list of numbers delimited by 'x'. This is because of how the underlying launch script
 # expects the concurrencies.
 JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
-    $SPEC_DECODING \
     $PREFILL_NUM_WORKERS \
     $DECODE_NODES \
     $DECODE_NUM_WORKERS \

From cd745fa5bd9d3fb98044227161ec635ed8ae5e39 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 19 Jun 2026 17:23:37 +0900
Subject: [PATCH 04/18] [AMD] server_atom: pass SPEC_ARGS to prefill server

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/server_atom.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
index 823eb99c1..d458a19f7 100644
--- a/benchmarks/multi_node/amd_utils/server_atom.sh
+++ b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -183,6 +183,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
         --host 0.0.0.0 --server-port ${PREFILL_PORT} \
         --trust-remote-code \
         "${PREFILL_PARALLEL_ARGS[@]}" \
+        "${SPEC_ARGS[@]}" \
         --kv_cache_dtype ${KV_CACHE_DTYPE} \
         --block-size ${BLOCK_SIZE} \
         --gpu-memory-utilization ${MEM_FRAC_STATIC} \

From baf0e063be01114482524e3df4e1941e20946668 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 19 Jun 2026 17:27:31 +0900
Subject: [PATCH 05/18] [AMD] amd-master: fix comment for 1P1D TP8+DPA+TBO+MTP1
 config

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index e08635ce8..715312391 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2835,7 +2835,7 @@ dsv4-fp4-mi355x-atom-disagg-mtp:
           additional-settings:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=3"
-      # 1P1D TP8+DPA+MTP1 
+      # 1P1D TP8+DPA+TBO+MTP1
     - isl: 1024
       osl: 1024
       search-space:

From 1485744cebc4bf5096fc0e521208f8ef5db2c822 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 19 Jun 2026 17:29:40 +0900
Subject: [PATCH 06/18] [AMD] dsv4_atom-disagg: remove DECODE_MTP_SIZE from
 check_env_vars

DECODE_MTP_SIZE comes from additional-settings and has a default of 0,
so it should not be required.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh b/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh
index 4745eaa92..60f85d553 100644
--- a/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh
+++ b/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh
@@ -8,7 +8,6 @@ check_env_vars \
     OSL \
     IMAGE \
     SPEC_DECODING \
-    DECODE_MTP_SIZE \
     MODEL_PATH \
     PREFILL_NUM_WORKERS \
     PREFILL_TP \

From 4e039bc87927c24911d7b3e125a512cdd95f8362 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 19 Jun 2026 17:37:26 +0900
Subject: [PATCH 07/18] [AMD] bench: use --dsv4 flag for DeepSeek-V4-Pro MTP
 benchmarks

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/bench.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh
index 05384f435..c0dbb60ad 100755
--- a/benchmarks/multi_node/amd_utils/bench.sh
+++ b/benchmarks/multi_node/amd_utils/bench.sh
@@ -80,7 +80,11 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do
         extra_flags="--trust-remote-code --tokenizer $MODEL_PATH"
     else
         if [ "$IS_MTP" = "true" ]; then
-            extra_flags="--use-chat-template"
+            if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then
+                extra_flags="--dsv4"
+            else
+                extra_flags="--use-chat-template"
+            fi
         fi
     fi
 

From 0868467009594f8902a82ef3cd4db2d079d473f2 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 19 Jun 2026 17:42:15 +0900
Subject: [PATCH 08/18] [AMD] server_atom: export IS_MTP=true when
 SPEC_DECODING=mtp for bench.sh

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/server_atom.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
index d458a19f7..c461ed34b 100644
--- a/benchmarks/multi_node/amd_utils/server_atom.sh
+++ b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -279,6 +279,11 @@ if [ "$NODE_RANK" -eq 0 ]; then
 
     cd $ATOM_WS_PATH
 
+    export IS_MTP="false"
+    if [ "$SPEC_DECODING" = "mtp" ]; then
+        export IS_MTP="true"
+    fi
+
     BENCH_CMD="bash $ATOM_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \
         $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \
         ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \

From c7d48b0ada0a71d352ed2605af83933d3c3824e2 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 19 Jun 2026 18:44:10 +0900
Subject: [PATCH 09/18] [AMD] server_atom: fix hf-overrides JSON quoting

Remove spaces from JSON value so it doesn't get word-split when
expanded inside the eval'd command string.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/server_atom.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
index c461ed34b..6c7484e2c 100644
--- a/benchmarks/multi_node/amd_utils/server_atom.sh
+++ b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -139,7 +139,7 @@ if [ "$SPEC_DECODING" = "mtp" ]; then
 fi
 
 # OPT args
-OPT_ARGS=(--hf-overrides '{"use_index_cache": true, "index_topk_freq": 4}')
+OPT_ARGS=(--hf-overrides '{"use_index_cache":true,"index_topk_freq":4}')
 
 cat <<INFO
 === Configuration ===

From 39e62ebfdf6c883d4cb2aa1546f31f38250290bd Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 19 Jun 2026 18:49:07 +0900
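Subject: [PATCH 10/18] update perf-changelog for minimaxm3-fp4-mi355x-atom

Add the minimaxm3-fp4-mi355x-atom entry and remove the
dsv4-fp4-mi355x-atom-disagg-mtp entry from the top of
perf-changelog.yaml. Note that this change leaves the file without a
trailing newline.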

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 perf-changelog.yaml | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 83540513d..4ceea4f78 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1,13 +1,3 @@
-- config-keys:
-    - dsv4-fp4-mi355x-atom-disagg-mtp
-  description:
-    - "Add dsv4-fp4-mi355x-atom-disagg-mtp recipe: multi-node disaggregated PD on MI355X via ATOM with MTP speculative decoding"
-    - "2P1D DPA+TBO+MTP1 sweep at ISL8192 (conc 256-2048)"
-    - "1P1D TP8+MTP3 sweep at ISL8192 (conc 4-128)"
-    - "1P1D TP8+DPA+MTP1 sweep at ISL1024 (conc 64-1024)"
-    - "Image: rocm/atom-dev:nightly_202606181332"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1855
-
 - config-keys:
     - 70b-fp8-*-vllm
   description:
@@ -3961,6 +3951,12 @@
     - "Update Applied TBO on high concurrencies"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717
 
+- config-keys:
+    - minimaxm3-fp4-mi355x-atom
+  description:
+    - "Expand search space for minimaxm3-fp4-mi355x-atom: add TP2 and TP8 configurations, extend concurrency range to 256 for ISL1024 and ISL8192, and add TP8 conc=1-2 for ISL8192."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1825
+
 - config-keys:
     - minimaxm3-fp8-mi300x-vllm
   description:
@@ -3975,4 +3971,4 @@
     - "Update the MI300X MiniMax-M3 EAGLE3 vLLM image to vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a"
     - "Use FP8 KV cache"
     - "Remove the runtime SupportsEagle3 source patch now included in the pinned nightly"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1843
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1843
\ No newline at end of file

From ba37d043089033fb6e22d584b1a30824eb800256 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 19 Jun 2026 18:50:36 +0900
Subject: [PATCH 11/18] update perf-changelog for
 dsv4-fp4-mi355x-atom-disagg-mtp

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 perf-changelog.yaml | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 4ceea4f78..6fe43c6f8 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3971,4 +3971,14 @@
     - "Update the MI300X MiniMax-M3 EAGLE3 vLLM image to vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a"
     - "Use FP8 KV cache"
     - "Remove the runtime SupportsEagle3 source patch now included in the pinned nightly"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1843
\ No newline at end of file
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1843
+
+- config-keys:
+    - dsv4-fp4-mi355x-atom-disagg-mtp
+  description:
+    - "Add dsv4-fp4-mi355x-atom-disagg-mtp recipe: multi-node disaggregated PD on MI355X via ATOM with MTP speculative decoding"
+    - "2P1D DPA+TBO+MTP1 sweep at ISL8192 (conc 256-2048)"
+    - "1P1D TP8+MTP3 sweep at ISL8192 (conc 4-128)"
+    - "1P1D TP8+DPA+MTP1 sweep at ISL1024 (conc 64-1024)"
+    - "Image: rocm/atom-dev:nightly_202606181332"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1855

From 6893a06d1c170c9cc6f586b997d082fdef39c75c Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 19 Jun 2026 19:02:52 +0900
Subject: [PATCH 12/18] fix: inline --hf-overrides to avoid eval
 word-splitting, remove OPT_ARGS

OPT_ARGS array expansion inside eval'd string caused bash word-splitting,
breaking the --hf-overrides JSON argument. Inline the flag directly in all
three server commands and remove the now-unused OPT_ARGS definition.
Also pass --max-num-seqs ${MAX_NUM_SEQS} to the decode server command
again.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/server_atom.sh | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
index 6c7484e2c..c79d35ead 100644
--- a/benchmarks/multi_node/amd_utils/server_atom.sh
+++ b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -138,9 +138,6 @@ if [ "$SPEC_DECODING" = "mtp" ]; then
     SPEC_ARGS=(--method mtp --num-speculative-tokens "$DECODE_MTP_SIZE")
 fi
 
-# OPT args
-OPT_ARGS=(--hf-overrides '{"use_index_cache":true,"index_topk_freq":4}')
-
 cat <<INFO
 === Configuration ===
 PREFILL  : ${PREFILL_IPS[*]} (TP=${PREFILL_TP_SIZE}, EP=${PREFILL_ENABLE_EP:-false}, DP=${PREFILL_ENABLE_DP:-false}, port=${PREFILL_PORT})
@@ -154,7 +151,6 @@ KV cache : dtype=${KV_CACHE_DTYPE} block_size=${BLOCK_SIZE} mem_frac=${MEM_FRAC_
 Prefill args : ${PREFILL_PARALLEL_ARGS[*]}
 Decode  args : ${DECODE_PARALLEL_ARGS[*]}
 Spec    args : ${SPEC_ARGS[*]}
-Opt     args : ${OPT_ARGS[*]}
 =====================
 INFO
 
@@ -189,7 +185,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
         --gpu-memory-utilization ${MEM_FRAC_STATIC} \
         --max-num-seqs ${MAX_NUM_SEQS} \
         --no-enable_prefix_caching \
-        "${OPT_ARGS[@]}" \
+        --hf-overrides '{"use_index_cache":true,"index_topk_freq":4}' \
         --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
         ${EXTRA_SERVER_ARGS}"
 
@@ -410,7 +406,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
         --gpu-memory-utilization ${MEM_FRAC_STATIC} \
         --max-num-seqs ${MAX_NUM_SEQS} \
         --no-enable_prefix_caching \
-        "${OPT_ARGS[@]}" \
+        --hf-overrides '{"use_index_cache":true,"index_topk_freq":4}' \
         --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
         ${EXTRA_SERVER_ARGS}"
 
@@ -475,8 +471,9 @@ else
         --kv_cache_dtype ${KV_CACHE_DTYPE} \
         --block-size ${BLOCK_SIZE} \
         --gpu-memory-utilization ${MEM_FRAC_STATIC} \
+        --max-num-seqs ${MAX_NUM_SEQS} \
         --no-enable_prefix_caching \
-        "${OPT_ARGS[@]}" \
+        --hf-overrides '{"use_index_cache":true,"index_topk_freq":4}' \
         --kv-transfer-config '{\"kv_role\":\"kv_consumer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
         ${EXTRA_SERVER_ARGS}"
 

From 510600281ae2c388e2af2db58a53cc44ce0458eb Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 19 Jun 2026 19:05:31 +0900
Subject: [PATCH 13/18] refactor: extract --hf-overrides into HF_OVERRIDES_ARG
 variable

Define once near SPEC_ARGS and reference in all three server commands
(prefill node 0, additional prefill nodes, decode nodes).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/server_atom.sh | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
index c79d35ead..44550eef6 100644
--- a/benchmarks/multi_node/amd_utils/server_atom.sh
+++ b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -138,6 +138,9 @@ if [ "$SPEC_DECODING" = "mtp" ]; then
     SPEC_ARGS=(--method mtp --num-speculative-tokens "$DECODE_MTP_SIZE")
 fi
 
+# HF overrides (single-quoted JSON preserved through eval)
+HF_OVERRIDES_ARG="--hf-overrides '{\"use_index_cache\":true,\"index_topk_freq\":4}'"
+
 cat <<INFO
 === Configuration ===
 PREFILL  : ${PREFILL_IPS[*]} (TP=${PREFILL_TP_SIZE}, EP=${PREFILL_ENABLE_EP:-false}, DP=${PREFILL_ENABLE_DP:-false}, port=${PREFILL_PORT})
@@ -185,7 +188,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
         --gpu-memory-utilization ${MEM_FRAC_STATIC} \
         --max-num-seqs ${MAX_NUM_SEQS} \
         --no-enable_prefix_caching \
-        --hf-overrides '{"use_index_cache":true,"index_topk_freq":4}' \
+        ${HF_OVERRIDES_ARG} \
         --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
         ${EXTRA_SERVER_ARGS}"
 
@@ -406,7 +409,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
         --gpu-memory-utilization ${MEM_FRAC_STATIC} \
         --max-num-seqs ${MAX_NUM_SEQS} \
         --no-enable_prefix_caching \
-        --hf-overrides '{"use_index_cache":true,"index_topk_freq":4}' \
+        ${HF_OVERRIDES_ARG} \
         --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
         ${EXTRA_SERVER_ARGS}"
 
@@ -473,7 +476,7 @@ else
         --gpu-memory-utilization ${MEM_FRAC_STATIC} \
         --max-num-seqs ${MAX_NUM_SEQS} \
         --no-enable_prefix_caching \
-        --hf-overrides '{"use_index_cache":true,"index_topk_freq":4}' \
+        ${HF_OVERRIDES_ARG} \
         --kv-transfer-config '{\"kv_role\":\"kv_consumer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
         ${EXTRA_SERVER_ARGS}"
 

From 55c810dbe24fdef7f2b65833dfd0d28843e6b2ad Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 19 Jun 2026 19:06:31 +0900
Subject: [PATCH 14/18] fix: enable --hf-overrides only for DeepSeek-V4-Pro

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/server_atom.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
index 44550eef6..4798961da 100644
--- a/benchmarks/multi_node/amd_utils/server_atom.sh
+++ b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -139,7 +139,10 @@ if [ "$SPEC_DECODING" = "mtp" ]; then
 fi
 
 # HF overrides (single-quoted JSON preserved through eval)
-HF_OVERRIDES_ARG="--hf-overrides '{\"use_index_cache\":true,\"index_topk_freq\":4}'"
+HF_OVERRIDES_ARG=""
+if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then
+    HF_OVERRIDES_ARG="--hf-overrides '{\"use_index_cache\":true,\"index_topk_freq\":4}'"
+fi
 
 cat <<INFO
 === Configuration ===

From 6386657b7d79bc0ac249ecf4eaff425d3e18f958 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 19 Jun 2026 19:07:20 +0900
Subject: [PATCH 15/18] fix: add HF_OVERRIDES_ARG to INFO config print block

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/server_atom.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
index 4798961da..e75cbfe5e 100644
--- a/benchmarks/multi_node/amd_utils/server_atom.sh
+++ b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -157,6 +157,7 @@ KV cache : dtype=${KV_CACHE_DTYPE} block_size=${BLOCK_SIZE} mem_frac=${MEM_FRAC_
 Prefill args : ${PREFILL_PARALLEL_ARGS[*]}
 Decode  args : ${DECODE_PARALLEL_ARGS[*]}
 Spec    args : ${SPEC_ARGS[*]}
+Opt     args : ${HF_OVERRIDES_ARG}
 =====================
 INFO
 

From 92746e9b789be9f0e4b423199c516297705dc3f0 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 19 Jun 2026 20:16:01 +0900
Subject: [PATCH 16/18] fix: replace broken-quote array splice with ${ARRAY[*]}
 in CMD strings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

"${ARRAY[@]}" inside a double-quoted assignment breaks bash -n's quote
parser. Since all three CMD strings are passed to eval, ${ARRAY[*]}
is equivalent as long as no array element contains whitespace or shell
metacharacters — eval re-splits the joined string.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               |  2 +-
 .../multi_node/amd_utils/server_atom.sh       | 23 +++++++++++++------
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index d84c039cc..27928fca9 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2819,7 +2819,7 @@ dsv4-fp4-mi355x-atom-disagg-mtp:
           - "DECODE_MTP_SIZE=1"
       # 1P1D TP8+MTP3 
       - spec-decoding: "mtp"
-        conc-list: [ 4, 8, 16, 32, 64, 128 ]
+        conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
         prefill:
           num-worker: 1
           tp: 8
diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
index e75cbfe5e..4ffaabd95 100644
--- a/benchmarks/multi_node/amd_utils/server_atom.sh
+++ b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -185,8 +185,8 @@ if [ "$NODE_RANK" -eq 0 ]; then
         --model ${MODEL_DIR}/${MODEL_NAME} \
         --host 0.0.0.0 --server-port ${PREFILL_PORT} \
         --trust-remote-code \
-        "${PREFILL_PARALLEL_ARGS[@]}" \
-        "${SPEC_ARGS[@]}" \
+        ${PREFILL_PARALLEL_ARGS[*]} \
+        ${SPEC_ARGS[*]} \
         --kv_cache_dtype ${KV_CACHE_DTYPE} \
         --block-size ${BLOCK_SIZE} \
         --gpu-memory-utilization ${MEM_FRAC_STATIC} \
@@ -406,8 +406,8 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
         --model ${MODEL_DIR}/${MODEL_NAME} \
         --host 0.0.0.0 --server-port ${PREFILL_PORT} \
         --trust-remote-code \
-        "${PREFILL_PARALLEL_ARGS[@]}" \
-        "${SPEC_ARGS[@]}" \
+        ${PREFILL_PARALLEL_ARGS[*]} \
+        ${SPEC_ARGS[*]} \
         --kv_cache_dtype ${KV_CACHE_DTYPE} \
         --block-size ${BLOCK_SIZE} \
         --gpu-memory-utilization ${MEM_FRAC_STATIC} \
@@ -469,16 +469,25 @@ else
     RANK=$((NODE_RANK - NODE_OFFSET))
     echo "${host_name}:${host_ip} is Decode Node (rank ${RANK})"
 
+    _MAX_CONC=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1)
+
+    if [[ "$BENCH_INPUT_LEN" == "1024" && "$BENCH_OUTPUT_LEN" == "1024" ]]; then
+        DECODE_MAX_NUM_SEQS="${_MAX_CONC}"
+    else
+        DECODE_MAX_NUM_SEQS="${MAX_NUM_SEQS}"
+    fi
+
     DECODE_CMD="python3 -m atom.entrypoints.openai_server \
         --model ${MODEL_DIR}/${MODEL_NAME} \
         --host 0.0.0.0 --server-port ${DECODE_PORT} \
         --trust-remote-code \
-        "${DECODE_PARALLEL_ARGS[@]}" \
-        "${SPEC_ARGS[@]}" \
+        ${DECODE_PARALLEL_ARGS[*]} \
+        ${SPEC_ARGS[*]} \
         --kv_cache_dtype ${KV_CACHE_DTYPE} \
         --block-size ${BLOCK_SIZE} \
         --gpu-memory-utilization ${MEM_FRAC_STATIC} \
-        --max-num-seqs ${MAX_NUM_SEQS} \
+        --max-num-seqs ${DECODE_MAX_NUM_SEQS} \
+        ${CUDAGRAPH_OPT} \
         --no-enable_prefix_caching \
         ${HF_OVERRIDES_ARG} \
         --kv-transfer-config '{\"kv_role\":\"kv_consumer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \

From 97f0cab71cbf54f2bc05807cb1e983588ff15a1a Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 19 Jun 2026 20:18:47 +0900
Subject: [PATCH 17/18] fix: remove ${CUDAGRAPH_OPT} from decode CMD

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/server_atom.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
index 4ffaabd95..69c049ecb 100644
--- a/benchmarks/multi_node/amd_utils/server_atom.sh
+++ b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -487,7 +487,6 @@ else
         --block-size ${BLOCK_SIZE} \
         --gpu-memory-utilization ${MEM_FRAC_STATIC} \
         --max-num-seqs ${DECODE_MAX_NUM_SEQS} \
-        ${CUDAGRAPH_OPT} \
         --no-enable_prefix_caching \
         ${HF_OVERRIDES_ARG} \
         --kv-transfer-config '{\"kv_role\":\"kv_consumer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \

From f9a93c4b315cc516952e557affcdcfcaa0f99d53 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Sat, 20 Jun 2026 00:03:57 +0900
Subject: [PATCH 18/18] feat: add 2P1D DPA+MTP3 search space to
 dsv4-fp4-mi355x-atom-disagg-mtp; add printenv dump and
 cudagraph-capture-sizes to server_atom.sh

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               | 18 ++++++++++++++++++
 .../multi_node/amd_utils/server_atom.sh       | 19 +++++++++++++++++--
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 27928fca9..1a9e36358 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2817,6 +2817,24 @@ dsv4-fp4-mi355x-atom-disagg-mtp:
           additional-settings:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=1"
+      # 2P1D TP8+DPA+TBO+MTP3
+      - spec-decoding: "mtp"
+        conc-list: [ 256, 512, 768, 1024, 2048 ]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=3"
       # 1P1D TP8+MTP3 
       - spec-decoding: "mtp"
         conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh
index 69c049ecb..a3a48136e 100644
--- a/benchmarks/multi_node/amd_utils/server_atom.sh
+++ b/benchmarks/multi_node/amd_utils/server_atom.sh
@@ -115,7 +115,8 @@ if [ "$PREFILL_ENABLE_DP" = "true" ]; then
     if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #DPA+EP
         PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-expert-parallel --enable-dp-attention )
     else #TP+DPA+TBO
-        PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention --enable-tbo)
+        # (srok) TBO is enabled only on the prefill server; the decode args omit --enable-tbo
+        PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention --enable-tbo )
         export GPU_MAX_HW_QUEUES=5
         export ATOM_CPU_AFFINITY=1
     fi
@@ -126,7 +127,7 @@ if [ "$DECODE_ENABLE_DP" = "true" ]; then
     if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #DPA+EP
         DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-expert-parallel --enable-dp-attention )
     else #TP+DPA+TBO
-        DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention --enable-tbo)
+        DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention )
         export GPU_MAX_HW_QUEUES=5
         export ATOM_CPU_AFFINITY=1
     fi
@@ -161,6 +162,10 @@ Opt     args : ${HF_OVERRIDES_ARG}
 =====================
 INFO
 
+echo "=== Environment Variables ==="
+printenv | awk -F= 'toupper($1) ~ /TOKEN|SECRET|PASSWORD|PASSWD|CREDENTIAL|API_KEY/ { print $1 "=<redacted>"; next } { print }' | sort  # mask secret-looking values before logging
+echo "============================="
+
 # =============================================================================
 # Node Role Assignment
 #
@@ -470,6 +475,15 @@ else
     echo "${host_name}:${host_ip} is Decode Node (rank ${RANK})"
 
     _MAX_CONC=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1)
+    if [[ "$_MAX_CONC" -gt 2048 ]]; then
+        CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512,1024,2048,4096]'
+    elif [[ "$_MAX_CONC" -gt 1024 ]]; then
+        CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512,1024,2048]'
+    elif [[ "$_MAX_CONC" -gt 512 ]]; then
+        CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512,768,1024]'
+    else
+        CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512]'
+    fi
 
     if [[ "$BENCH_INPUT_LEN" == "1024" && "$BENCH_OUTPUT_LEN" == "1024" ]]; then
         DECODE_MAX_NUM_SEQS="${_MAX_CONC}"
@@ -490,6 +504,7 @@ else
         --no-enable_prefix_caching \
         ${HF_OVERRIDES_ARG} \
         --kv-transfer-config '{\"kv_role\":\"kv_consumer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \
+        --cudagraph-capture-sizes '${CUDAGRAPH_SIZES}' \
         ${EXTRA_SERVER_ARGS}"
 
     if [[ "$DRY_RUN" -eq 1 ]]; then