From ea33910cd0cf4b00350da2fc499537062aa113b5 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 19 Jun 2026 17:02:53 +0900 Subject: [PATCH 01/18] [AMD] server_atom: improve config print and cleanup - Replace individual echo lines with cat < --- .github/configs/amd-master.yaml | 90 +++++++++++++++++++ .../multi_node/amd_utils/server_atom.sh | 79 +++++++++------- .../multi_node/dsv4_fp4_mi355x_atom-disagg.sh | 1 + 3 files changed, 138 insertions(+), 32 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 4e91d8116..e08635ce8 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2785,6 +2785,96 @@ dsv4-fp4-mi355x-atom-disagg: additional-settings: - "DECODE_NODES=1" +dsv4-fp4-mi355x-atom-disagg-mtp: + image: rocm/atom-dev:nightly_202606181332 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: atom-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + # 2P1D TP8+DPA+TBO+MTP1 + - spec-decoding: "mtp" + conc-list: [ 256, 512, 768, 1024, 2048 ] + prefill: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + # 1P1D TP8+MTP3 + - spec-decoding: "mtp" + conc-list: [ 4, 8, 16, 32, 64, 128 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + # 1P1D TP8+DPA+MTP1 + - isl: 1024 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256, 512, 1024 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + # 1P1D TP8+MTP3 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + # MiniMax-M3 MXFP8 MI355X recipe: # https://github.com/vllm-project/recipes/commit/2a3728ed9892debfd767a72a58ebc90b33f186e5 # MXFP8 runs from TP=4 on gfx950; block size 128 is mandatory for MSA. diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index 957c84d60..823eb99c1 100644 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -35,6 +35,10 @@ DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" DECODE_ENABLE_EP="${DECODE_ENABLE_EP}" DECODE_ENABLE_DP="${DECODE_ENABLE_DP}" +# MTP +SPEC_DECODING="${SPEC_DECODING:-}" +DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-1}" + # ATOM server ports (different from SGLang which uses 8000 for all) PREFILL_PORT="${PREFILL_PORT:-8010}" DECODE_PORT="${DECODE_PORT:-8020}" @@ -42,7 +46,7 @@ ROUTER_PORT="${ROUTER_PORT:-8000}" HANDSHAKE_PORT="${HANDSHAKE_PORT:-6301}" # ATOM server tuning (from reference script defaults) -MEM_FRACTION="${MEM_FRACTION:-0.85}" +MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.85}" KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-fp8}" BLOCK_SIZE="${BLOCK_SIZE:-16}" MAX_NUM_SEQS="${MAX_NUM_SEQS:-256}" @@ -100,20 +104,20 @@ for i in $(seq 0 $((yD - 1))); do DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$idx]}:${DECODE_PORT}" done -echo "Prefill IPs : ${PREFILL_IPS[*]}" -echo "Decode IPs : ${DECODE_IPS[*]}" - PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP}" PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP}" DECODE_ENABLE_EP="${DECODE_ENABLE_EP}" DECODE_ENABLE_DP="${DECODE_ENABLE_DP}" +# Parallel args PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP if [ "$PREFILL_ENABLE_DP" = "true" ]; then if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #DPA+EP PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-expert-parallel --enable-dp-attention ) - else #DPA+TP - PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention ) + else #TP+DPA+TBO + PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention --enable-tbo) + export GPU_MAX_HW_QUEUES=5 + export ATOM_CPU_AFFINITY=1 fi fi @@ -121,13 +125,38 @@ DECODE_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP if [ "$DECODE_ENABLE_DP" = "true" ]; then if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #DPA+EP DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-expert-parallel --enable-dp-attention ) - else #DPA+TP - DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention ) + else #TP+DPA+TBO + DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention --enable-tbo) + export GPU_MAX_HW_QUEUES=5 + export ATOM_CPU_AFFINITY=1 fi fi -echo "Prefill Parallel args : ${PREFILL_PARALLEL_ARGS[*]}" -echo "Decode Parallel args : ${DECODE_PARALLEL_ARGS[*]}" +# MTP args +SPEC_ARGS=() #TP +if [ "$SPEC_DECODING" = "mtp" ]; then + SPEC_ARGS=(--method mtp --num-speculative-tokens "$DECODE_MTP_SIZE") +fi + +# OPT args +OPT_ARGS=(--hf-overrides '{"use_index_cache": true, "index_topk_freq": 4}') + +cat < Date: Fri, 19 Jun 2026 17:04:42 +0900 Subject: [PATCH 02/18] update perf-changelog for dsv4-fp4-mi355x-atom-disagg-mtp Co-Authored-By: Claude Sonnet 4.6 --- perf-changelog.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a232ddb16..83540513d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,13 @@ +- config-keys: + - dsv4-fp4-mi355x-atom-disagg-mtp + description: + - "Add dsv4-fp4-mi355x-atom-disagg-mtp recipe: multi-node disaggregated PD on MI355X via ATOM with MTP speculative decoding" + - "2P1D DPA+TBO+MTP1 sweep at ISL8192 (conc 256-2048)" + - "1P1D TP8+MTP3 sweep at ISL8192 (conc 4-128)" + - "1P1D TP8+DPA+MTP1 sweep at ISL1024 (conc 64-1024)" + - "Image: rocm/atom-dev:nightly_202606181332" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1855 + - config-keys: - 70b-fp8-*-vllm description: From 2216d11c5fb1e740e0d14584cde050126d414564 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 19 Jun 2026 17:22:32 +0900 Subject: [PATCH 03/18] [AMD] fix DECODE_MTP_SIZE and BENCH_REQUEST_RATE propagation in atom-disagg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Export DECODE_MTP_SIZE and SPEC_DECODING in dsv4_fp4_mi355x_atom-disagg.sh so they reach server_atom.sh via submit.sh → job.slurm - Add DECODE_MTP_SIZE to check_env_vars in dsv4_fp4_mi355x_atom-disagg.sh - Pass BENCH_REQUEST_RATE into Docker container in job.slurm DOCKER_ENV_COMMON Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/multi_node/amd_utils/job.slurm | 3 ++- benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index c542280c4..f797913eb 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -363,6 +363,7 @@ DOCKER_ENV_COMMON=( -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY + -e BENCH_REQUEST_RATE=\$BENCH_REQUEST_RATE -e TQDM_MININTERVAL=\$TQDM_MININTERVAL -e DRY_RUN=\$DRY_RUN -e BENCHMARK_LOGS_DIR=/benchmark_logs @@ -411,7 +412,7 @@ elif [[ "$ENGINE" == "atom-disagg" ]]; then -e DECODE_PORT=${DECODE_PORT:-8020} -e ROUTER_PORT=${ROUTER_PORT:-30000} -e HANDSHAKE_PORT=${HANDSHAKE_PORT:-6301} - -e MEM_FRACTION=${MEM_FRACTION:-0.85} + -e MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.85} -e KV_CACHE_DTYPE=${KV_CACHE_DTYPE:-fp8} -e BLOCK_SIZE=${BLOCK_SIZE:-16} -e MAX_NUM_SEQS=${MAX_NUM_SEQS:-256} diff --git a/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh b/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh index 4d527dae9..4745eaa92 100644 --- a/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh +++ b/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh @@ -8,6 +8,7 @@ check_env_vars \ OSL \ IMAGE \ SPEC_DECODING \ + DECODE_MTP_SIZE \ MODEL_PATH \ PREFILL_NUM_WORKERS \ PREFILL_TP \ @@ -61,12 +62,14 @@ else export DECODE_ENABLE_DP=false fi +export SPEC_DECODING="${SPEC_DECODING}" +export DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}" + # Launch jobs based on ISL/OSL # Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented # by a list of numbers delimited by 'x'. This is because of how the underlying launch script # expects the concurrencies. JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ - $SPEC_DECODING \ $PREFILL_NUM_WORKERS \ $DECODE_NODES \ $DECODE_NUM_WORKERS \ From cd745fa5bd9d3fb98044227161ec635ed8ae5e39 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 19 Jun 2026 17:23:37 +0900 Subject: [PATCH 04/18] [AMD] server_atom: pass SPEC_ARGS to prefill server Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/multi_node/amd_utils/server_atom.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index 823eb99c1..d458a19f7 100644 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -183,6 +183,7 @@ if [ "$NODE_RANK" -eq 0 ]; then --host 0.0.0.0 --server-port ${PREFILL_PORT} \ --trust-remote-code \ "${PREFILL_PARALLEL_ARGS[@]}" \ + "${SPEC_ARGS[@]}" \ --kv_cache_dtype ${KV_CACHE_DTYPE} \ --block-size ${BLOCK_SIZE} \ --gpu-memory-utilization ${MEM_FRAC_STATIC} \ From baf0e063be01114482524e3df4e1941e20946668 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 19 Jun 2026 17:27:31 +0900 Subject: [PATCH 05/18] [AMD] amd-master: fix comment for 1P1D TP8+DPA+TBO+MTP1 config Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index e08635ce8..715312391 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2835,7 +2835,7 @@ dsv4-fp4-mi355x-atom-disagg-mtp: additional-settings: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=3" - # 1P1D TP8+DPA+MTP1 + # 1P1D TP8+DPA+TBO+MTP1 - isl: 1024 osl: 1024 search-space: From 1485744cebc4bf5096fc0e521208f8ef5db2c822 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 19 Jun 2026 17:29:40 +0900 Subject: [PATCH 06/18] [AMD] dsv4_atom-disagg: remove DECODE_MTP_SIZE from check_env_vars DECODE_MTP_SIZE comes from additional-settings and has a default of 0, so it should not be required. Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh b/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh index 4745eaa92..60f85d553 100644 --- a/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh +++ b/benchmarks/multi_node/dsv4_fp4_mi355x_atom-disagg.sh @@ -8,7 +8,6 @@ check_env_vars \ OSL \ IMAGE \ SPEC_DECODING \ - DECODE_MTP_SIZE \ MODEL_PATH \ PREFILL_NUM_WORKERS \ PREFILL_TP \ From 4e039bc87927c24911d7b3e125a512cdd95f8362 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 19 Jun 2026 17:37:26 +0900 Subject: [PATCH 07/18] [AMD] bench: use --dsv4 flag for DeepSeek-V4-Pro MTP benchmarks Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/multi_node/amd_utils/bench.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index 05384f435..c0dbb60ad 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -80,7 +80,11 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do extra_flags="--trust-remote-code --tokenizer $MODEL_PATH" else if [ "$IS_MTP" = "true" ]; then - extra_flags="--use-chat-template" + if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then + extra_flags="--dsv4" + else + extra_flags="--use-chat-template" + fi fi fi From 0868467009594f8902a82ef3cd4db2d079d473f2 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 19 Jun 2026 17:42:15 +0900 Subject: [PATCH 08/18] [AMD] server_atom: export IS_MTP=true when SPEC_DECODING=mtp for bench.sh Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/multi_node/amd_utils/server_atom.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index d458a19f7..c461ed34b 100644 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -279,6 +279,11 @@ if [ "$NODE_RANK" -eq 0 ]; then cd $ATOM_WS_PATH + export IS_MTP="false" + if [ "$SPEC_DECODING" = "mtp" ]; then + export IS_MTP="true" + fi + BENCH_CMD="bash $ATOM_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \ From c7d48b0ada0a71d352ed2605af83933d3c3824e2 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 19 Jun 2026 18:44:10 +0900 Subject: [PATCH 09/18] [AMD] server_atom: fix hf-overrides JSON quoting Remove spaces from JSON value so it doesn't get word-split when expanded inside the eval'd command string. Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/multi_node/amd_utils/server_atom.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index c461ed34b..6c7484e2c 100644 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -139,7 +139,7 @@ if [ "$SPEC_DECODING" = "mtp" ]; then fi # OPT args -OPT_ARGS=(--hf-overrides '{"use_index_cache": true, "index_topk_freq": 4}') +OPT_ARGS=(--hf-overrides '{"use_index_cache":true,"index_topk_freq":4}') cat < Date: Fri, 19 Jun 2026 18:49:07 +0900 Subject: [PATCH 10/18] update perf-changelog for minimaxm3-fp4-mi355x-atom Co-Authored-By: Claude Sonnet 4.6 --- perf-changelog.yaml | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 83540513d..4ceea4f78 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,13 +1,3 @@ -- config-keys: - - dsv4-fp4-mi355x-atom-disagg-mtp - description: - - "Add dsv4-fp4-mi355x-atom-disagg-mtp recipe: multi-node disaggregated PD on MI355X via ATOM with MTP speculative decoding" - - "2P1D DPA+TBO+MTP1 sweep at ISL8192 (conc 256-2048)" - - "1P1D TP8+MTP3 sweep at ISL8192 (conc 4-128)" - - "1P1D TP8+DPA+MTP1 sweep at ISL1024 (conc 64-1024)" - - "Image: rocm/atom-dev:nightly_202606181332" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1855 - - config-keys: - 70b-fp8-*-vllm description: @@ -3961,6 +3951,12 @@ - "Update Applied TBO on high concurrencies" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717 +- config-keys: + - minimaxm3-fp4-mi355x-atom + description: + - "Expand search space for minimaxm3-fp4-mi355x-atom: add TP2 and TP8 configurations, extend concurrency range to 256 for ISL1024 and ISL8192, and add TP8 conc=1-2 for ISL8192." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1825 + - config-keys: - minimaxm3-fp8-mi300x-vllm description: @@ -3975,4 +3971,4 @@ - "Update the MI300X MiniMax-M3 EAGLE3 vLLM image to vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a" - "Use FP8 KV cache" - "Remove the runtime SupportsEagle3 source patch now included in the pinned nightly" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1843 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1843 \ No newline at end of file From ba37d043089033fb6e22d584b1a30824eb800256 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 19 Jun 2026 18:50:36 +0900 Subject: [PATCH 11/18] update perf-changelog for dsv4-fp4-mi355x-atom-disagg-mtp Co-Authored-By: Claude Sonnet 4.6 --- perf-changelog.yaml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 4ceea4f78..6fe43c6f8 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3971,4 +3971,14 @@ - "Update the MI300X MiniMax-M3 EAGLE3 vLLM image to vllm/vllm-openai-rocm:nightly-b53b1c7ffe7aebdafd0876350f30e51d1226c92a" - "Use FP8 KV cache" - "Remove the runtime SupportsEagle3 source patch now included in the pinned nightly" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1843 \ No newline at end of file + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1843 + +- config-keys: + - dsv4-fp4-mi355x-atom-disagg-mtp + description: + - "Add dsv4-fp4-mi355x-atom-disagg-mtp recipe: multi-node disaggregated PD on MI355X via ATOM with MTP speculative decoding" + - "2P1D DPA+TBO+MTP1 sweep at ISL8192 (conc 256-2048)" + - "1P1D TP8+MTP3 sweep at ISL8192 (conc 4-128)" + - "1P1D TP8+DPA+MTP1 sweep at ISL1024 (conc 64-1024)" + - "Image: rocm/atom-dev:nightly_202606181332" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1855 From 6893a06d1c170c9cc6f586b997d082fdef39c75c Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 19 Jun 2026 19:02:52 +0900 Subject: [PATCH 12/18] fix: inline --hf-overrides to avoid eval word-splitting, remove OPT_ARGS OPT_ARGS array expansion inside eval'd string caused bash word-splitting, breaking the --hf-overrides JSON argument. Inline the flag directly in all three server commands and remove the now-unused OPT_ARGS definition. Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/multi_node/amd_utils/server_atom.sh | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index 6c7484e2c..c79d35ead 100644 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -138,9 +138,6 @@ if [ "$SPEC_DECODING" = "mtp" ]; then SPEC_ARGS=(--method mtp --num-speculative-tokens "$DECODE_MTP_SIZE") fi -# OPT args -OPT_ARGS=(--hf-overrides '{"use_index_cache":true,"index_topk_freq":4}') - cat < Date: Fri, 19 Jun 2026 19:05:31 +0900 Subject: [PATCH 13/18] refactor: extract --hf-overrides into HF_OVERRIDES_ARG variable Define once near SPEC_ARGS and reference in all three server commands (prefill node 0, additional prefill nodes, decode nodes). Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/multi_node/amd_utils/server_atom.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index c79d35ead..44550eef6 100644 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -138,6 +138,9 @@ if [ "$SPEC_DECODING" = "mtp" ]; then SPEC_ARGS=(--method mtp --num-speculative-tokens "$DECODE_MTP_SIZE") fi +# HF overrides (single-quoted JSON preserved through eval) +HF_OVERRIDES_ARG="--hf-overrides '{\"use_index_cache\":true,\"index_topk_freq\":4}'" + cat < Date: Fri, 19 Jun 2026 19:06:31 +0900 Subject: [PATCH 14/18] fix: enable --hf-overrides only for DeepSeek-V4-Pro Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/multi_node/amd_utils/server_atom.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index 44550eef6..4798961da 100644 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -139,7 +139,10 @@ if [ "$SPEC_DECODING" = "mtp" ]; then fi # HF overrides (single-quoted JSON preserved through eval) -HF_OVERRIDES_ARG="--hf-overrides '{\"use_index_cache\":true,\"index_topk_freq\":4}'" +HF_OVERRIDES_ARG="" +if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then + HF_OVERRIDES_ARG="--hf-overrides '{\"use_index_cache\":true,\"index_topk_freq\":4}'" +fi cat < Date: Fri, 19 Jun 2026 19:07:20 +0900 Subject: [PATCH 15/18] fix: add HF_OVERRIDES_ARG to INFO config print block Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/multi_node/amd_utils/server_atom.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index 4798961da..e75cbfe5e 100644 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -157,6 +157,7 @@ KV cache : dtype=${KV_CACHE_DTYPE} block_size=${BLOCK_SIZE} mem_frac=${MEM_FRAC_ Prefill args : ${PREFILL_PARALLEL_ARGS[*]} Decode args : ${DECODE_PARALLEL_ARGS[*]} Spec args : ${SPEC_ARGS[*]} +Opt args : ${HF_OVERRIDES_ARG} ===================== INFO From 92746e9b789be9f0e4b423199c516297705dc3f0 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 19 Jun 2026 20:16:01 +0900 Subject: [PATCH 16/18] fix: replace broken-quote array splice with ${ARRAY[*]} in CMD strings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "${ARRAY[@]}" inside a double-quoted assignment breaks bash -n's quote parser. Since all three CMD strings are passed to eval, ${ARRAY[*]} is equivalent — eval handles word splitting. Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 2 +- .../multi_node/amd_utils/server_atom.sh | 23 +++++++++++++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index d84c039cc..27928fca9 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2819,7 +2819,7 @@ dsv4-fp4-mi355x-atom-disagg-mtp: - "DECODE_MTP_SIZE=1" # 1P1D TP8+MTP3 - spec-decoding: "mtp" - conc-list: [ 4, 8, 16, 32, 64, 128 ] + conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] prefill: num-worker: 1 tp: 8 diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index e75cbfe5e..4ffaabd95 100644 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -185,8 +185,8 @@ if [ "$NODE_RANK" -eq 0 ]; then --model ${MODEL_DIR}/${MODEL_NAME} \ --host 0.0.0.0 --server-port ${PREFILL_PORT} \ --trust-remote-code \ - "${PREFILL_PARALLEL_ARGS[@]}" \ - "${SPEC_ARGS[@]}" \ + ${PREFILL_PARALLEL_ARGS[*]} \ + ${SPEC_ARGS[*]} \ --kv_cache_dtype ${KV_CACHE_DTYPE} \ --block-size ${BLOCK_SIZE} \ --gpu-memory-utilization ${MEM_FRAC_STATIC} \ @@ -406,8 +406,8 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then --model ${MODEL_DIR}/${MODEL_NAME} \ --host 0.0.0.0 --server-port ${PREFILL_PORT} \ --trust-remote-code \ - "${PREFILL_PARALLEL_ARGS[@]}" \ - "${SPEC_ARGS[@]}" \ + ${PREFILL_PARALLEL_ARGS[*]} \ + ${SPEC_ARGS[*]} \ --kv_cache_dtype ${KV_CACHE_DTYPE} \ --block-size ${BLOCK_SIZE} \ --gpu-memory-utilization ${MEM_FRAC_STATIC} \ @@ -469,16 +469,25 @@ else RANK=$((NODE_RANK - NODE_OFFSET)) echo "${host_name}:${host_ip} is Decode Node (rank ${RANK})" + _MAX_CONC=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) + + if [[ "$BENCH_INPUT_LEN" == "1024" && "$BENCH_OUTPUT_LEN" == "1024" ]]; then + DECODE_MAX_NUM_SEQS="${_MAX_CONC}" + else + DECODE_MAX_NUM_SEQS="${MAX_NUM_SEQS}" + fi + DECODE_CMD="python3 -m atom.entrypoints.openai_server \ --model ${MODEL_DIR}/${MODEL_NAME} \ --host 0.0.0.0 --server-port ${DECODE_PORT} \ --trust-remote-code \ - "${DECODE_PARALLEL_ARGS[@]}" \ - "${SPEC_ARGS[@]}" \ + ${DECODE_PARALLEL_ARGS[*]} \ + ${SPEC_ARGS[*]} \ --kv_cache_dtype ${KV_CACHE_DTYPE} \ --block-size ${BLOCK_SIZE} \ --gpu-memory-utilization ${MEM_FRAC_STATIC} \ - --max-num-seqs ${MAX_NUM_SEQS} \ + --max-num-seqs ${DECODE_MAX_NUM_SEQS} \ + ${CUDAGRAPH_OPT} \ --no-enable_prefix_caching \ ${HF_OVERRIDES_ARG} \ --kv-transfer-config '{\"kv_role\":\"kv_consumer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \ From 97f0cab71cbf54f2bc05807cb1e983588ff15a1a Mon Sep 17 00:00:00 2001 From: seungrokj Date: Fri, 19 Jun 2026 20:18:47 +0900 Subject: [PATCH 17/18] fix: remove ${CUDAGRAPH_OPT} from decode CMD Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/multi_node/amd_utils/server_atom.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index 4ffaabd95..69c049ecb 100644 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -487,7 +487,6 @@ else --block-size ${BLOCK_SIZE} \ --gpu-memory-utilization ${MEM_FRAC_STATIC} \ --max-num-seqs ${DECODE_MAX_NUM_SEQS} \ - ${CUDAGRAPH_OPT} \ --no-enable_prefix_caching \ ${HF_OVERRIDES_ARG} \ --kv-transfer-config '{\"kv_role\":\"kv_consumer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \ From f9a93c4b315cc516952e557affcdcfcaa0f99d53 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Sat, 20 Jun 2026 00:03:57 +0900 Subject: [PATCH 18/18] feat: add 2P1D DPA+MTP3 search space to dsv4-fp4-mi355x-atom-disagg-mtp; add printenv dump and cudagraph-capture-sizes to server_atom.sh Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 18 ++++++++++++++++++ .../multi_node/amd_utils/server_atom.sh | 19 +++++++++++++++++-- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 27928fca9..1a9e36358 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2817,6 +2817,24 @@ dsv4-fp4-mi355x-atom-disagg-mtp: additional-settings: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" + # 2P1D TP8+DPA+TBO+MTP3 + - spec-decoding: "mtp" + conc-list: [ 256, 512, 768, 1024, 2048 ] + prefill: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" # 1P1D TP8+MTP3 - spec-decoding: "mtp" conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index 69c049ecb..a3a48136e 100644 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -115,7 +115,8 @@ if [ "$PREFILL_ENABLE_DP" = "true" ]; then if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #DPA+EP PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-expert-parallel --enable-dp-attention ) else #TP+DPA+TBO - PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention --enable-tbo) + # (srok), TBO only on Prefill server + PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention --enable-tbo ) export GPU_MAX_HW_QUEUES=5 export ATOM_CPU_AFFINITY=1 fi @@ -126,7 +127,7 @@ if [ "$DECODE_ENABLE_DP" = "true" ]; then if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #DPA+EP DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-expert-parallel --enable-dp-attention ) else #TP+DPA+TBO - DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention --enable-tbo) + DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention ) export GPU_MAX_HW_QUEUES=5 export ATOM_CPU_AFFINITY=1 fi @@ -161,6 +162,10 @@ Opt args : ${HF_OVERRIDES_ARG} ===================== INFO +echo "=== Environment Variables ===" +printenv | sort +echo "=============================" + # ============================================================================= # Node Role Assignment # @@ -470,6 +475,15 @@ else echo "${host_name}:${host_ip} is Decode Node (rank ${RANK})" _MAX_CONC=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) + if [[ "$_MAX_CONC" -gt 2048 ]]; then + CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512,1024,2048,4096]' + elif [[ "$_MAX_CONC" -gt 1024 ]]; then + CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512,1024,2048]' + elif [[ "$_MAX_CONC" -gt 512 ]]; then + CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512,768,1024]' + else + CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512]' + fi if [[ "$BENCH_INPUT_LEN" == "1024" && "$BENCH_OUTPUT_LEN" == "1024" ]]; then DECODE_MAX_NUM_SEQS="${_MAX_CONC}" @@ -490,6 +504,7 @@ else --no-enable_prefix_caching \ ${HF_OVERRIDES_ARG} \ --kv-transfer-config '{\"kv_role\":\"kv_consumer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \ + --cudagraph-capture-sizes "${CUDAGRAPH_SIZES}" \ ${EXTRA_SERVER_ARGS}" if [[ "$DRY_RUN" -eq 1 ]]; then