From 1e5f3a1f2bcf577843e99be87edb7948982e9e77 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 12 Jun 2026 03:56:25 +0000 Subject: [PATCH 1/9] [AMD] add dsv4 sglang disagg --- .github/configs/amd-master.yaml | 63 ++++++++++++++ benchmarks/multi_node/amd_utils/bench.sh | 7 +- benchmarks/multi_node/amd_utils/env.sh | 55 ++++++++++++ benchmarks/multi_node/amd_utils/models.yaml | 35 ++++++++ .../multi_node/amd_utils/server_sglang.sh | 20 +++-- benchmarks/multi_node/amd_utils/submit.sh | 10 +++ .../dsv4_fp4_mi355x_sglang-disagg.sh | 83 +++++++++++++++++++ 7 files changed, 266 insertions(+), 7 deletions(-) create mode 100755 benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 8909c0e28..80c14f58b 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2136,6 +2136,69 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: - "DECODE_MTP_SIZE=1" +# DSv4 PD-disaggregation on MI355X via SGLang + MoRI. Structure mirrors +# dsr1-fp4-mi355x-sglang-disagg but only the isl 8192 / osl 1024 scenario, with two +# topology families captured from the validated manual recipe (see +# dsv4_mi355x_sglang_disagg_plan.md): +# - pure-TP 1P1D (TP8, mori KV transfer) +# - DEP 1P1D (TP8/EP8/DP8, mori KV transfer + mori MoE a2a, dp-attention) +# DSv4-specific serving knobs (attention-backend dsv4, page-size 256, unified_kv_triton, +# AITER indexer, deepseekv4 parsers) live in amd_utils/{models.yaml,env.sh}; the bench +# client uses --dsv4 framing (amd_utils/bench.sh). STP only for now (reference recipe has +# no spec decoding); MTP is a follow-up. 
+dsv4-fp4-mi355x-sglang-disagg: + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + # non-MTP configurations + # 1P1D pure TP8 (mori KV transfer) + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 1P1D DEP8 (mori KV transfer + mori MoE a2a, dp-attention) + - spec-decoding: "none" + conc-list: [ 512, 768, 1024 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + dsv4-fp4-mi355x-sglang: image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610 model: deepseek-ai/DeepSeek-V4-Pro diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index 05384f435..d198a4ddd 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -79,7 +79,12 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do if [[ "$ENGINE" == "vllm-disagg" ]]; then extra_flags="--trust-remote-code --tokenizer $MODEL_PATH" else - if [ "$IS_MTP" = "true" ]; then + # DeepSeek-V4-Pro ships no jinja chat_template, so --use-chat-template crashes; + # --dsv4 applies the DSv4 ... framing instead + # (chat-formatted inputs are required for correct EAGLE/MTP acceptance too). 
+ if [[ "$model_name" == "DeepSeek-V4-Pro" ]]; then + extra_flags="--dsv4" + elif [ "$IS_MTP" = "true" ]; then extra_flags="--use-chat-template" fi fi diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 71d2653bd..6b0e4206a 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -228,6 +228,61 @@ $1 == "DSCP" && $2 == ":" && $NF == p { fi fi + # ========================================================================= + # DeepSeek-V4-Pro PD recipe overrides + # Placed at the end of the SGLang env block so it wins over the global + # MoRI/SGLang defaults set above. Mirrors the validated DSv4 manual PD + # commands (see dsv4_mi355x_sglang_disagg_plan.md §2). Only the SGLang/MoRI + # env knobs are pinned here; CLI flags live in models.yaml and the cluster + # NIC/socket vars (NCCL_IB_HCA, *_SOCKET_IFNAME, IBDEVICES) stay runner-derived. + # ========================================================================= + if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then + # MoRI dispatch/combine dtypes: auto for both roles (not the fp8 split default) + export SGLANG_MORI_DISPATCH_DTYPE=auto + export MORI_COMBINE_DTYPE_PREFILL=auto + export MORI_COMBINE_DTYPE_DECODE=auto + + # Per-role MoRI dispatch sizing (used by the harness chunked/MoE math) + export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192 + export MORI_MAX_DISPATCH_TOKENS_DECODE=64 + export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=2048 + export MORI_MOE_MAX_INPUT_TOKENS_DECODE=332 + + # PER_RANK dispatch tokens are pinned independently of the sizing above + # (16384 prefill / 128 decode in the reference recipe). server_sglang.sh + # prefers these over the MORI_MAX_DISPATCH_TOKENS_* coupling when set. + export MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_PREFILL=16384 + export MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_DECODE=128 + + # Fixed inter-kernel switch threshold (not derived). 
NOTE: the DP+EP path in + # server_sglang.sh recomputes this dynamically for the DEP topology. + export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=4096 + + # Overlap plan stream pinned to 0 (off) for DSv4, same as the global default of 0. NOTE(review): earlier wording said "on" - confirm 0 is the intended value. + export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=0 + + # DSv4 model kernel routing (mirrors the single-node / manual PD recipe) + export SGLANG_DEFAULT_THINKING=1 + export SGLANG_DSV4_REASONING_EFFORT=max + export SGLANG_USE_ROCM700A=0 + export SGLANG_HACK_FLASHMLA_BACKEND=unified_kv_triton + export SGLANG_OPT_DEEPGEMM_HC_PRENORM=false + export SGLANG_OPT_USE_FUSED_COMPRESS=true + export SGLANG_OPT_USE_FUSED_COMPRESS_TRITON=true + export SGLANG_OPT_FP8_WO_A_GEMM=false + export SGLANG_OPT_USE_JIT_INDEXER_METADATA=false + export SGLANG_OPT_USE_TOPK_V2=false + export SGLANG_OPT_USE_AITER_INDEXER=true + export SGLANG_OPT_USE_TILELANG_INDEXER=false + export SGLANG_OPT_USE_TILELANG_MHC_PRE=false + export SGLANG_OPT_USE_TILELANG_MHC_POST=false + export SGLANG_FP8_PAGED_MQA_LOGITS_TORCH=1 + export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=false + export SGLANG_ROCM_USE_MULTI_STREAM=false + export AITER_BF16_FP8_MOE_BOUND=0 + export SGLANG_EAGER_INPUT_NO_COPY=true + fi + # FIXME: WA for latest upstream 0305 image export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 605a377be..e68c448ce 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -349,3 +349,38 @@ DeepSeek-R1-0528-MXFP4-v2: max_running_requests: 128 chunked_prefill_size: 262144 cuda_graph_bs_range: "1-128" + +# DeepSeek-V4-Pro PD-disaggregation recipe (MI355X, SGLang + MoRI). +# KV transfer = mori for both topologies (pure-TP and DEP); the DP path additionally +# routes the MoE all-to-all through mori (--moe-a2a-backend mori) with dp-attention.
+# DSv4-specific kernel routing (unified_kv_triton, AITER indexer, fp8 wo_a fallback, +# thinking/reasoning-effort, dispatch dtypes, per-role PER_RANK dispatch tokens) is set +# in env.sh's DeepSeek-V4-Pro block. The bench client uses --dsv4 framing (bench.sh). +# prefill.disable_cuda_graph routes prefill to --disable-cuda-graph; decode keeps +# --cuda-graph-bs. See dsv4_mi355x_sglang_disagg_plan.md. +DeepSeek-V4-Pro: + base_flags: "--decode-log-interval 100 --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend dsv4 --page-size 256 --swa-full-tokens-ratio 0.1 --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4 --disaggregation-transfer-backend mori" + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + prefill: + mem_fraction_static: 0.8 + disable_radix_cache: true + disable_cuda_graph: true + dp: + max_running_requests: 1024 + chunked_prefill_size: 131072 + context_length: 9217 + max_total_tokens: 262144 + no_dp: + max_running_requests: 128 + chunked_prefill_size: 131072 + context_length: 9217 + max_total_tokens: 262144 + decode: + mem_fraction_static: 0.85 + prefill_round_robin_balance: true + dp: + max_running_requests: 1024 + cuda_graph_bs_range: "1-128" + no_dp: + max_running_requests: 128 + cuda_graph_bs_range: "1-128" diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh index c28ccab41..38fbdfc8e 100755 --- a/benchmarks/multi_node/amd_utils/server_sglang.sh +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -125,6 +125,7 @@ decode = m.get('decode', {}) print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 0.8)}\"') print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"') +print(f'PREFILL_DISABLE_CUDA_GRAPH=\"{prefill.get(\"disable_cuda_graph\", False)}\"') dp = prefill.get('dp', {}) 
no_dp = prefill.get('no_dp', {}) @@ -136,6 +137,8 @@ print(f'PREFILL_MAX_TOTAL_TOKENS_DP=\"{dp.get(\"max_total_tokens\", \"\")}\"') print(f'PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP=\"{dp.get(\"enable_two_batch_overlap\", False)}\"') print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') +print(f'PREFILL_CONTEXT_LENGTH_NO_DP=\"{no_dp.get(\"context_length\", \"\")}\"') +print(f'PREFILL_MAX_TOTAL_TOKENS_NO_DP=\"{no_dp.get(\"max_total_tokens\", \"\")}\"') s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') @@ -183,8 +186,8 @@ else prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END)) prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP - prefill_context_length="" - prefill_max_total_tokens="" + prefill_context_length=$PREFILL_CONTEXT_LENGTH_NO_DP + prefill_max_total_tokens=$PREFILL_MAX_TOTAL_TOKENS_NO_DP prefill_enable_two_batch_overlap="false" fi @@ -222,7 +225,12 @@ if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$DECODE_ENABLE_EP" == "true" ]]; t fi # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) -PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " +# disable_cuda_graph (model-level) routes prefill to --disable-cuda-graph instead of --cuda-graph-bs. 
+if [[ "$PREFILL_DISABLE_CUDA_GRAPH" == "True" ]] || [[ "$PREFILL_DISABLE_CUDA_GRAPH" == "true" ]]; then + PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --disable-cuda-graph " +else + PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " +fi if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" fi @@ -418,7 +426,7 @@ if [ "$NODE_RANK" -eq 0 ]; then PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" fi set +x - PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_PREFILL:-${MORI_MAX_DISPATCH_TOKENS_PREFILL}} python3 -m sglang.launch_server \ --model-path $MODEL_DIR/$MODEL_NAME \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ @@ -650,7 +658,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" fi set +x - PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} 
${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_PREFILL:-${MORI_MAX_DISPATCH_TOKENS_PREFILL}} python3 -m sglang.launch_server \ --model-path $MODEL_DIR/${MODEL_NAME} \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ @@ -718,7 +726,7 @@ else DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}" fi set +x - DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ + DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_DECODE:-${MORI_MAX_DISPATCH_TOKENS_DECODE}} python3 -m sglang.launch_server \ --model-path ${MODEL_DIR}/${MODEL_NAME} \ --disaggregation-mode decode \ --disaggregation-ib-device ${IBDEVICES} \ diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index fa3d65418..c264293a7 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -47,6 +47,10 @@ Required environment variables: MODEL_NAME Model name directory CONTAINER_IMAGE Docker image name (e.g., vllm_disagg_pd:latest) RUNNER_NAME Runner identifier (for job name) + +Optional environment variables: + DRY_RUN 1 = echo composed server/router launch commands instead of + running them (preview a recipe against a real allocation). USAGE } @@ -125,6 +129,12 @@ export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} export BENCH_REQUEST_RATE=${REQUEST_RATE} export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8} +# DRY_RUN=1 makes server_sglang.sh echo the composed prefill/decode/router launch +# commands instead of executing them (useful for previewing a recipe against a real +# allocation). 
Threaded here → job.slurm → Docker (-e DRY_RUN) → server_sglang.sh. +# sbatch defaults to --export=ALL, so exporting it is what carries it into the job. +export DRY_RUN="${DRY_RUN:-0}" + # Eval-related env vars (threaded from workflow → runner → here → job.slurm → Docker) export RUN_EVAL="${RUN_EVAL:-false}" export EVAL_ONLY="${EVAL_ONLY:-false}" diff --git a/benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh new file mode 100755 index 000000000..d17d1a323 --- /dev/null +++ b/benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +# Use upstreamed multi_node scripts (no external clone needed) +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +# Set up SGL launch script-specific environment variables +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then +export PREFILL_ENABLE_EP=false +else +export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then +export PREFILL_ENABLE_DP=true +else +export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then +export DECODE_ENABLE_EP=false +else +export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then +export DECODE_ENABLE_DP=true +else +export DECODE_ENABLE_DP=false +fi + +# Launch jobs based on ISL/OSL +# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented +# 
by a list of numbers delimited by 'x'. This is because of how the underlying launch script +# expects the concurrencies. +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO}) + +if [[ $? -ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" From 89961e80d10262b6ecffb4c20a8113aa19b4a1be Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 12 Jun 2026 05:35:20 +0000 Subject: [PATCH 2/9] fix image --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 80c14f58b..b0a322465 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2141,13 +2141,13 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: # topology families captured from the validated manual recipe (see # dsv4_mi355x_sglang_disagg_plan.md): # - pure-TP 1P1D (TP8, mori KV transfer) -# - DEP 1P1D (TP8/EP8/DP8, mori KV transfer + mori MoE a2a, dp-attention) +# - DEP 2P1D (TP8/EP8/DP8, mori KV transfer + mori MoE a2a, dp-attention) # DSv4-specific serving knobs (attention-backend dsv4, page-size 256, unified_kv_triton, # AITER indexer, deepseekv4 parsers) live in amd_utils/{models.yaml,env.sh}; the bench # client uses --dsv4 framing (amd_utils/bench.sh). STP only for now (reference recipe has # no spec decoding); MTP is a follow-up. 
dsv4-fp4-mi355x-sglang-disagg: - image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610 + image: rocm/sgl-dev:sglang-0.5.12.post1-rocm720-mi35x-mori-0610-dsv4 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x-disagg From 03ffadf3f0cc11f6d124fd085d84e80577c214bb Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 12 Jun 2026 06:08:12 +0000 Subject: [PATCH 3/9] fix the image --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index b0a322465..76e2925b6 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2147,7 +2147,7 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: # client uses --dsv4 framing (amd_utils/bench.sh). STP only for now (reference recipe has # no spec decoding); MTP is a follow-up. dsv4-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.12.post1-rocm720-mi35x-mori-0610-dsv4 + image: rocm/sgl-dev:sglang-0.5.12.post1-rocm720-mi35x-mori-0610-dsv4-ep model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x-disagg From e8759336306b601a0f4ab42d24e49699370d5eb7 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 12 Jun 2026 06:41:59 +0000 Subject: [PATCH 4/9] fix --- benchmarks/multi_node/amd_utils/env.sh | 4 ++-- benchmarks/multi_node/amd_utils/server_sglang.sh | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 6b0e4206a..fe2348301 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -245,8 +245,8 @@ $1 == "DSCP" && $2 == ":" && $NF == p { # Per-role MoRI dispatch sizing (used by the harness chunked/MoE math) export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192 export MORI_MAX_DISPATCH_TOKENS_DECODE=64 - export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=2048 - export MORI_MOE_MAX_INPUT_TOKENS_DECODE=332 + # export 
MORI_MOE_MAX_INPUT_TOKENS_PREFILL=2048 + # export MORI_MOE_MAX_INPUT_TOKENS_DECODE=332 # PER_RANK dispatch tokens are pinned independently of the sizing above # (16384 prefill / 128 decode in the reference recipe). server_sglang.sh diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh index 38fbdfc8e..ed4fbca30 100755 --- a/benchmarks/multi_node/amd_utils/server_sglang.sh +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -196,7 +196,7 @@ if [[ "$PREFILL_ENABLE_DP" == "true" ]] && [[ "$PREFILL_ENABLE_EP" == "true" ]]; prefill_max_running_requests=$BENCH_MAX_CONC_VALUE prefill_dp_ranks=$PREFILL_TP_SIZE # MORI_MAX_DISPATCH_TOKENS_PREFILL stays at 8192 (no change) - MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$((MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2)) + # MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$((MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2)) echo "[DP+EP override] Prefill: max-running-requests=$prefill_max_running_requests, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" fi @@ -217,7 +217,7 @@ if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$DECODE_ENABLE_EP" == "true" ]]; t decode_max_running_requests=$BENCH_MAX_CONC_VALUE decode_dp_ranks=$DECODE_TP_SIZE MORI_MAX_DISPATCH_TOKENS_DECODE=$((BENCH_MAX_CONC_VALUE / decode_dp_ranks)) - MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10)) + # MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10)) # Update derived variable SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD @@ -253,7 +253,7 @@ fi if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) - MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) + # 
MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) fi # ============================================================================= From dc23512493efc7f5dee0e7832c7e2506bb2b9e74 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 12 Jun 2026 07:53:01 +0000 Subject: [PATCH 5/9] fix --- benchmarks/multi_node/amd_utils/env.sh | 4 ++-- benchmarks/multi_node/amd_utils/models.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index fe2348301..4302d0f7e 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -245,8 +245,8 @@ $1 == "DSCP" && $2 == ":" && $NF == p { # Per-role MoRI dispatch sizing (used by the harness chunked/MoE math) export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192 export MORI_MAX_DISPATCH_TOKENS_DECODE=64 - # export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=2048 - # export MORI_MOE_MAX_INPUT_TOKENS_DECODE=332 + unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL + unset MORI_MOE_MAX_INPUT_TOKENS_DECODE # PER_RANK dispatch tokens are pinned independently of the sizing above # (16384 prefill / 128 decode in the reference recipe). server_sglang.sh diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index e68c448ce..7adab68de 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -359,7 +359,7 @@ DeepSeek-R1-0528-MXFP4-v2: # prefill.disable_cuda_graph routes prefill to --disable-cuda-graph; decode keeps # --cuda-graph-bs. See dsv4_mi355x_sglang_disagg_plan.md. 
DeepSeek-V4-Pro: - base_flags: "--decode-log-interval 100 --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend dsv4 --page-size 256 --swa-full-tokens-ratio 0.1 --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4 --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 100 --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend dsv4 --page-size 256 --swa-full-tokens-ratio 0.1 --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4 --disaggregation-transfer-backend mori" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 From b687779d9d0e7343c16652364ef0bb3046019a22 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 15 Jun 2026 01:02:57 +0000 Subject: [PATCH 6/9] bump image --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 76e2925b6..e813b39b9 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2147,7 +2147,7 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: # client uses --dsv4 framing (amd_utils/bench.sh). STP only for now (reference recipe has # no spec decoding); MTP is a follow-up. 
dsv4-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.12.post1-rocm720-mi35x-mori-0610-dsv4-ep + image: lmsysorg/sglang-rocm:v0.5.13-rocm720-mi35x-20260614 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x-disagg From 4e01d43103010f1fa6c596eece993fb35f8711ab Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 15 Jun 2026 07:23:19 +0000 Subject: [PATCH 7/9] add more sweeps --- .github/configs/amd-master.yaml | 61 ++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index e813b39b9..38444609f 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2182,7 +2182,46 @@ dsv4-fp4-mi355x-sglang-disagg: # 1P1D DEP8 (mori KV transfer + mori MoE a2a, dp-attention) - spec-decoding: "none" - conc-list: [ 512, 768, 1024 ] + conc-list: [ 500, 512 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + + # 1P1D DEP8 (mori KV transfer + mori MoE a2a, dp-attention) + - spec-decoding: "none" + conc-list: [ 640, 768 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 1P1D DEP8 (mori KV transfer + mori MoE a2a, dp-attention) + - spec-decoding: "none" + conc-list: [ 1000, 1024 ] prefill: num-worker: 2 tp: 8 @@ -2199,6 +2238,26 @@ dsv4-fp4-mi355x-sglang-disagg: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" + + # 1P1D DEP8 (mori KV transfer + mori MoE a2a, dp-attention) + - spec-decoding: "none" + conc-list: [ 512, 768, 1024 ] + prefill: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + 
ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + dsv4-fp4-mi355x-sglang: image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610 model: deepseek-ai/DeepSeek-V4-Pro From c22652bc8da82873e4e9c008d2372097c556b240 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 15 Jun 2026 16:38:11 +0000 Subject: [PATCH 8/9] fix --- .github/configs/amd-master.yaml | 5 +- benchmarks/multi_node/amd_utils/models.yaml | 54 ++++++++++++------- .../multi_node/amd_utils/server_sglang.sh | 12 +++++ 3 files changed, 49 insertions(+), 22 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 38444609f..53191807e 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2147,7 +2147,7 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: # client uses --dsv4 framing (amd_utils/bench.sh). STP only for now (reference recipe has # no spec decoding); MTP is a follow-up. dsv4-fp4-mi355x-sglang-disagg: - image: lmsysorg/sglang-rocm:v0.5.13-rocm720-mi35x-20260614 + image: lmsysorg/sglang-rocm:v0.5.13.post1-rocm720-mi35x-20260615 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x-disagg @@ -2239,7 +2239,8 @@ dsv4-fp4-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" - # 1P1D DEP8 (mori KV transfer + mori MoE a2a, dp-attention) + # 1P1D dp-attention + TP-MoE (mori KV transfer; ep=1 -> MoE tensor-parallel, + # no MoE a2a / deepep / ep-dispatch — see amd_utils ep_flags decoupling) - spec-decoding: "none" conc-list: [ 512, 768, 1024 ] prefill: diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 7adab68de..c1df922d7 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -9,7 +9,10 @@ # : # base_flags: str # Common flags for both prefill and decode # mtp_flags: str # Appended to decode when DECODE_MTP_SIZE > 0 -# dp_flags: str # Appended when DP is enabled (prefill or 
decode) +# dp_flags: str # Appended when DP attention is enabled (prefill or decode) +# ep_flags: str # Appended when EP is enabled. EP-specific MoE knobs only +# # (a2a backend, deepep mode, ep-dispatch algorithm). With +# # ep=1 these are dropped so the MoE runs tensor-parallel (TP). # prefill: # mem_fraction_static: float # disable_radix_cache: bool @@ -38,9 +41,10 @@ # cuda_graph_bs_range: str DeepSeek-V3: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + ep_flags: "--ep-dispatch-algorithm fake --moe-a2a-backend mori --deepep-mode normal" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -69,9 +73,10 @@ DeepSeek-V3: cuda_graph_bs_range: "1-128" DeepSeek-V3-0324: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 
--enable-dp-lm-head" + dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + ep_flags: "--ep-dispatch-algorithm fake --moe-a2a-backend mori --deepep-mode normal" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -100,9 +105,10 @@ DeepSeek-V3-0324: cuda_graph_bs_range: "1-128" DeepSeek-R1: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + ep_flags: "--ep-dispatch-algorithm fake --moe-a2a-backend mori --deepep-mode normal" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -131,9 +137,10 @@ DeepSeek-R1: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 
--enable-dp-lm-head" + ep_flags: "--ep-dispatch-algorithm fake --moe-a2a-backend mori --deepep-mode normal" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -164,7 +171,8 @@ DeepSeek-R1-0528: Qwen3.5-397B-A17B-MXFP4: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori --moe-dense-tp-size 1" mtp_flags: "" - dp_flags: "--moe-a2a-backend mori --enable-dp-attention --enable-dp-lm-head" + dp_flags: "--enable-dp-attention --enable-dp-lm-head" + ep_flags: "--moe-a2a-backend mori" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -195,7 +203,8 @@ Qwen3.5-397B-A17B-MXFP4: Qwen3.5-397B-A17B-FP8: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori --moe-dense-tp-size 1" mtp_flags: "" - dp_flags: "--moe-a2a-backend mori --enable-dp-attention --enable-dp-lm-head" + dp_flags: "--enable-dp-attention --enable-dp-lm-head" + ep_flags: "--moe-a2a-backend mori" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -226,7 +235,8 @@ Qwen3.5-397B-A17B-FP8: GLM-5-FP8: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --disaggregation-transfer-backend mori --tool-call-parser glm47 --reasoning-parser glm45 --model-loader-extra-config '{\\\"enable_multithread_load\\\": true, \\\"num_threads\\\": 8}'" mtp_flags: "" - dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + ep_flags: "--moe-a2a-backend mori" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -255,9 +265,10 @@ GLM-5-FP8: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4-Preview: - 
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + ep_flags: "--ep-dispatch-algorithm fake --moe-a2a-backend mori --deepep-mode normal" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -286,9 +297,10 @@ DeepSeek-R1-0528-MXFP4-Preview: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + ep_flags: "--ep-dispatch-algorithm fake --moe-a2a-backend mori --deepep-mode normal" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -317,9 +329,10 @@ DeepSeek-R1-0528-MXFP4: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4-v2: - base_flags: "--decode-log-interval 1000 --log-level 
warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode " - dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 " + dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 " + ep_flags: "--ep-dispatch-algorithm fake --moe-a2a-backend mori --deepep-mode normal" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -359,8 +372,9 @@ DeepSeek-R1-0528-MXFP4-v2: # prefill.disable_cuda_graph routes prefill to --disable-cuda-graph; decode keeps # --cuda-graph-bs. See dsv4_mi355x_sglang_disagg_plan.md. 
DeepSeek-V4-Pro: - base_flags: "--decode-log-interval 100 --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend dsv4 --page-size 256 --swa-full-tokens-ratio 0.1 --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4 --disaggregation-transfer-backend mori" - dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + base_flags: "--decode-log-interval 100 --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend dsv4 --page-size 256 --swa-full-tokens-ratio 0.1 --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4 --disaggregation-transfer-backend mori" + dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + ep_flags: "--ep-dispatch-algorithm fake --moe-a2a-backend mori --deepep-mode normal" prefill: mem_fraction_static: 0.8 disable_radix_cache: true diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh index ed4fbca30..6d4d6b71d 100755 --- a/benchmarks/multi_node/amd_utils/server_sglang.sh +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -119,6 +119,7 @@ def parse_range(cuda_range, default_start, default_end): print(f'MODEL_BASE_FLAGS=\"{m.get(\"base_flags\", \"\")}\"') print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"') print(f'MODEL_DP_FLAGS=\"{m.get(\"dp_flags\", \"\")}\"') +print(f'MODEL_EP_FLAGS=\"{m.get(\"ep_flags\", \"\")}\"') prefill = m.get('prefill', {}) decode = m.get('decode', {}) @@ -326,6 +327,7 @@ build_server_config() { local base_config="$MODEL_BASE_FLAGS" local mtp_config="" local dp_config="" + local ep_config="" local specific_config="" # MTP config (only if MTP is enabled and mode is decode) @@ -338,6 +340,13 @@ build_server_config() { dp_config="$MODEL_DP_FLAGS" fi + # EP config (only if 
EP is enabled): a2a backend, deepep mode, ep-dispatch algo. + # With ep=1 (EP disabled) these are dropped, so the MoE runs tensor-parallel (TP) + # instead of expert-parallel — even when dp-attention is on. + if [[ "$enable_ep" == "true" ]]; then + ep_config="$MODEL_EP_FLAGS" + fi + # Mode-specific config if [[ "$mode" == "prefill" ]]; then specific_config="$PREFILL_MODE_FLAGS" @@ -350,6 +359,9 @@ build_server_config() { if [[ -n "$base_config" ]]; then full_config="$full_config $base_config" fi + if [[ -n "$ep_config" ]]; then + full_config="$full_config $ep_config" + fi if [[ -n "$mtp_config" ]] && [[ "$mode" == "decode" ]]; then full_config="$full_config $mtp_config" fi From fb605ef38dc154454e6d6c6fe70783e529968821 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Thu, 18 Jun 2026 01:58:14 +0000 Subject: [PATCH 9/9] add perf log --- perf-changelog.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index ed995b364..014a1e741 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3943,3 +3943,9 @@ - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)" - "Update Applied TBO on high concurrencies" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717 + +- config-keys: + - dsv4-fp4-mi355x-sglang-disagg + description: + - "init submission of dsv4 sglang disagg " + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1818