diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 04df9e3a5..639bd1f7f 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2521,6 +2521,114 @@ dsv4-fp4-mi355x-atom-disagg: additional-settings: - "DECODE_NODES=1" +dsv4-fp4-mi355x-atom-disagg-mtp: + image: rocm/atom-dev:nightly_202606181332 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: atom-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + # 2P1D TP8+DPA+TBO+MTP1 + - spec-decoding: "mtp" + conc-list: [ 256, 512, 768, 1024, 2048 ] + prefill: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + # 2P1D TP8+DPA+TBO+MTP3 + - spec-decoding: "mtp" + conc-list: [ 256, 512, 768, 1024, 2048 ] + prefill: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + # 1P1D TP8+MTP3 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + - isl: 1024 + osl: 1024 + search-space: + # 1P1D TP8+DPA+TBO+MTP1 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256, 512, 1024 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + # 1P1D TP8+MTP3 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8, 16, 32, 
64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + # MiniMax-M3 MXFP8 MI355X recipe: # https://github.com/vllm-project/recipes/commit/2a3728ed9892debfd767a72a58ebc90b33f186e5 # MXFP8 runs from TP=4 on gfx950; block size 128 is mandatory for MSA. diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index 05384f435..c0dbb60ad 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -80,7 +80,11 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do extra_flags="--trust-remote-code --tokenizer $MODEL_PATH" else if [ "$IS_MTP" = "true" ]; then - extra_flags="--use-chat-template" + if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then + extra_flags="--dsv4" + else + extra_flags="--use-chat-template" + fi fi fi diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 01a5bd386..004768a89 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -363,6 +363,7 @@ DOCKER_ENV_COMMON=( -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY + -e BENCH_REQUEST_RATE=\$BENCH_REQUEST_RATE -e TQDM_MININTERVAL=\$TQDM_MININTERVAL -e DRY_RUN=\$DRY_RUN -e BENCHMARK_LOGS_DIR=/benchmark_logs @@ -411,7 +412,7 @@ elif [[ "$ENGINE" == "atom-disagg" ]]; then -e DECODE_PORT=${DECODE_PORT:-8020} -e ROUTER_PORT=${ROUTER_PORT:-30000} -e HANDSHAKE_PORT=${HANDSHAKE_PORT:-6301} - -e MEM_FRACTION=${MEM_FRACTION:-0.85} + -e MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.85} -e KV_CACHE_DTYPE=${KV_CACHE_DTYPE:-fp8} -e BLOCK_SIZE=${BLOCK_SIZE:-16} -e MAX_NUM_SEQS=${MAX_NUM_SEQS:-256} diff --git 
a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index 957c84d60..a3a48136e 100644 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -35,6 +35,10 @@ DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" DECODE_ENABLE_EP="${DECODE_ENABLE_EP}" DECODE_ENABLE_DP="${DECODE_ENABLE_DP}" +# MTP +SPEC_DECODING="${SPEC_DECODING:-}" +DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-1}" + # ATOM server ports (different from SGLang which uses 8000 for all) PREFILL_PORT="${PREFILL_PORT:-8010}" DECODE_PORT="${DECODE_PORT:-8020}" @@ -42,7 +46,7 @@ ROUTER_PORT="${ROUTER_PORT:-8000}" HANDSHAKE_PORT="${HANDSHAKE_PORT:-6301}" # ATOM server tuning (from reference script defaults) -MEM_FRACTION="${MEM_FRACTION:-0.85}" +MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.85}" KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-fp8}" BLOCK_SIZE="${BLOCK_SIZE:-16}" MAX_NUM_SEQS="${MAX_NUM_SEQS:-256}" @@ -100,20 +104,21 @@ for i in $(seq 0 $((yD - 1))); do DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$idx]}:${DECODE_PORT}" done -echo "Prefill IPs : ${PREFILL_IPS[*]}" -echo "Decode IPs : ${DECODE_IPS[*]}" - PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP}" PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP}" DECODE_ENABLE_EP="${DECODE_ENABLE_EP}" DECODE_ENABLE_DP="${DECODE_ENABLE_DP}" +# Parallel args PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP if [ "$PREFILL_ENABLE_DP" = "true" ]; then if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #DPA+EP PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-expert-parallel --enable-dp-attention ) - else #DPA+TP - PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention ) + else #TP+DPA+TBO + # (srok), TBO only on Prefill server + PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention --enable-tbo ) + export GPU_MAX_HW_QUEUES=5 + export ATOM_CPU_AFFINITY=1 fi fi @@ -121,13 +126,45 @@ DECODE_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP if [ "$DECODE_ENABLE_DP" = "true" ]; then if [ "$DECODE_ENABLE_EP" 
-gt 1 ]; then #DPA+EP DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-expert-parallel --enable-dp-attention ) - else #DPA+TP + else #TP+DPA (no --enable-tbo on decode) DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention ) + export GPU_MAX_HW_QUEUES=5 + export ATOM_CPU_AFFINITY=1 fi fi -echo "Prefill Parallel args : ${PREFILL_PARALLEL_ARGS[*]}" -echo "Decode Parallel args : ${DECODE_PARALLEL_ARGS[*]}" +# MTP args +SPEC_ARGS=() #empty unless SPEC_DECODING=mtp +if [ "$SPEC_DECODING" = "mtp" ]; then + SPEC_ARGS=(--method mtp --num-speculative-tokens "$DECODE_MTP_SIZE") +fi + +# HF overrides (single-quoted JSON preserved through eval) +HF_OVERRIDES_ARG="" +if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then + HF_OVERRIDES_ARG="--hf-overrides '{\"use_index_cache\":true,\"index_topk_freq\":4}'" +fi + +cat <