diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 04df9e3a5..38ace8243 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1873,6 +1873,117 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" + +dsv4-fp4-mi355x-sglang-disagg: + image: lmsysorg/sglang-rocm:v0.5.13.post1-rocm720-mi35x-20260615 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + # non-MTP configurations + # 1P1D pure TP8 (mori KV transfer) + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 1P1D DEP8 (mori KV transfer + mori MoE a2a, dp-attention) + - spec-decoding: "none" + conc-list: [ 500, 512 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 1P1D DEP8 (mori KV transfer + mori MoE a2a, dp-attention) + - spec-decoding: "none" + conc-list: [ 640, 768 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 1P1D DEP8 (mori KV transfer + mori MoE a2a, dp-attention) + - spec-decoding: "none" + conc-list: [ 1000, 1024 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 1P1D dp-attention + TP-MoE + - spec-decoding: "none" + conc-list: [ 512, 768, 1024 ] + prefill: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + dsv4-fp4-mi355x-sglang: image: lmsysorg/sglang-rocm:v0.5.13.post1-rocm720-mi35x-20260618 model: deepseek-ai/DeepSeek-V4-Pro diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index 05384f435..d198a4ddd 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -79,7 +79,12 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do if [[ "$ENGINE" == "vllm-disagg" ]]; then extra_flags="--trust-remote-code --tokenizer $MODEL_PATH" else - if [ "$IS_MTP" = "true" ]; then + # DeepSeek-V4-Pro ships no jinja chat_template, so --use-chat-template crashes; + # --dsv4 applies the DSv4 ... framing instead + # (chat-formatted inputs are required for correct EAGLE/MTP acceptance too). + if [[ "$model_name" == "DeepSeek-V4-Pro" ]]; then + extra_flags="--dsv4" + elif [ "$IS_MTP" = "true" ]; then extra_flags="--use-chat-template" fi fi diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index a24347114..a0462570f 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -230,6 +230,61 @@ $1 == "DSCP" && $2 == ":" && $NF == p { fi fi + # ========================================================================= + # DeepSeek-V4-Pro PD recipe overrides + # Placed at the end of the SGLang env block so it wins over the global + # MoRI/SGLang defaults set above. Mirrors the validated DSv4 manual PD + # commands (see dsv4_mi355x_sglang_disagg_plan.md §2). Only the SGLang/MoRI + # env knobs are pinned here; CLI flags live in models.yaml and the cluster + # NIC/socket vars (NCCL_IB_HCA, *_SOCKET_IFNAME, IBDEVICES) stay runner-derived. + # ========================================================================= + if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then + # MoRI dispatch/combine dtypes: auto for both roles (not the fp8 split default) + export SGLANG_MORI_DISPATCH_DTYPE=auto + export MORI_COMBINE_DTYPE_PREFILL=auto + export MORI_COMBINE_DTYPE_DECODE=auto + + # Per-role MoRI dispatch sizing (used by the harness chunked/MoE math) + export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192 + export MORI_MAX_DISPATCH_TOKENS_DECODE=64 + unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL + unset MORI_MOE_MAX_INPUT_TOKENS_DECODE + + # PER_RANK dispatch tokens are pinned independently of the sizing above + # (16384 prefill / 128 decode in the reference recipe). server_sglang.sh + # prefers these over the MORI_MAX_DISPATCH_TOKENS_* coupling when set. + export MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_PREFILL=16384 + export MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_DECODE=128 + + # Fixed inter-kernel switch threshold (not derived). NOTE: the DP+EP path in + # server_sglang.sh recomputes this dynamically for the DEP topology. + export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=4096 + + # Overlap plan stream on for DSv4 (global default is 0) + export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=0 + + # DSv4 model kernel routing (mirrors the single-node / manual PD recipe) + export SGLANG_DEFAULT_THINKING=1 + export SGLANG_DSV4_REASONING_EFFORT=max + export SGLANG_USE_ROCM700A=0 + export SGLANG_HACK_FLASHMLA_BACKEND=unified_kv_triton + export SGLANG_OPT_DEEPGEMM_HC_PRENORM=false + export SGLANG_OPT_USE_FUSED_COMPRESS=true + export SGLANG_OPT_USE_FUSED_COMPRESS_TRITON=true + export SGLANG_OPT_FP8_WO_A_GEMM=false + export SGLANG_OPT_USE_JIT_INDEXER_METADATA=false + export SGLANG_OPT_USE_TOPK_V2=false + export SGLANG_OPT_USE_AITER_INDEXER=true + export SGLANG_OPT_USE_TILELANG_INDEXER=false + export SGLANG_OPT_USE_TILELANG_MHC_PRE=false + export SGLANG_OPT_USE_TILELANG_MHC_POST=false + export SGLANG_FP8_PAGED_MQA_LOGITS_TORCH=1 + export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=false + export SGLANG_ROCM_USE_MULTI_STREAM=false + export AITER_BF16_FP8_MOE_BOUND=0 + export SGLANG_EAGER_INPUT_NO_COPY=true + fi + # FIXME: WA for latest upstream 0305 image export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} diff --git a/benchmarks/multi_node/amd_utils/env_atom.sh b/benchmarks/multi_node/amd_utils/env_atom.sh index 52f81b7d6..bac1b2759 100644 --- a/benchmarks/multi_node/amd_utils/env_atom.sh +++ b/benchmarks/multi_node/amd_utils/env_atom.sh @@ -33,8 +33,17 @@ fi export IBDEVICES export SAFETENSORS_FAST_GPU=1 +# The atom engine (atom.entrypoints.openai_server) is vLLM-based; vLLM reads +# VLLM_LOGGING_LEVEL (not VLLM_LOG_LEVEL), so the latter alone was a no-op and +# the engine logged its per-request KV-connector flood at INFO. Set the real +# lever to WARNING (overridable for debugging). +export VLLM_LOGGING_LEVEL="${VLLM_LOGGING_LEVEL:-WARNING}" export VLLM_LOG_LEVEL=WARNING -export ATOM_LOG_LEVEL=WARNING +export ATOM_LOG_LEVEL="${ATOM_LOG_LEVEL:-WARNING}" +# Quiet uvicorn (per-request access logs) and atomesh router. +export ATOM_UVICORN_LOG_LEVEL="${ATOM_UVICORN_LOG_LEVEL:-warning}" +export ATOM_UVICORN_ACCESS_LOG="${ATOM_UVICORN_ACCESS_LOG:-0}" +export ATOMESH_LOG_LEVEL="${ATOMESH_LOG_LEVEL:-warn}" export AITER_LOG_LEVEL=WARNING export LOG_LEVEL=WARNING export LOGLEVEL=WARNING diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 605a377be..c1df922d7 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -9,7 +9,10 @@ # : # base_flags: str # Common flags for both prefill and decode # mtp_flags: str # Appended to decode when DECODE_MTP_SIZE > 0 -# dp_flags: str # Appended when DP is enabled (prefill or decode) +# dp_flags: str # Appended when DP attention is enabled (prefill or decode) +# ep_flags: str # Appended when EP is enabled. EP-specific MoE knobs only +# # (a2a backend, deepep mode, ep-dispatch algorithm). With +# # ep=1 these are dropped so the MoE runs tensor-parallel (TP). # prefill: # mem_fraction_static: float # disable_radix_cache: bool @@ -38,9 +41,10 @@ # cuda_graph_bs_range: str DeepSeek-V3: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + ep_flags: "--ep-dispatch-algorithm fake --moe-a2a-backend mori --deepep-mode normal" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -69,9 +73,10 @@ DeepSeek-V3: cuda_graph_bs_range: "1-128" DeepSeek-V3-0324: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + ep_flags: "--ep-dispatch-algorithm fake --moe-a2a-backend mori --deepep-mode normal" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -100,9 +105,10 @@ DeepSeek-V3-0324: cuda_graph_bs_range: "1-128" DeepSeek-R1: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + ep_flags: "--ep-dispatch-algorithm fake --moe-a2a-backend mori --deepep-mode normal" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -131,9 +137,10 @@ DeepSeek-R1: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + ep_flags: "--ep-dispatch-algorithm fake --moe-a2a-backend mori --deepep-mode normal" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -164,7 +171,8 @@ DeepSeek-R1-0528: Qwen3.5-397B-A17B-MXFP4: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori --moe-dense-tp-size 1" mtp_flags: "" - dp_flags: "--moe-a2a-backend mori --enable-dp-attention --enable-dp-lm-head" + dp_flags: "--enable-dp-attention --enable-dp-lm-head" + ep_flags: "--moe-a2a-backend mori" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -195,7 +203,8 @@ Qwen3.5-397B-A17B-MXFP4: Qwen3.5-397B-A17B-FP8: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori --moe-dense-tp-size 1" mtp_flags: "" - dp_flags: "--moe-a2a-backend mori --enable-dp-attention --enable-dp-lm-head" + dp_flags: "--enable-dp-attention --enable-dp-lm-head" + ep_flags: "--moe-a2a-backend mori" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -226,7 +235,8 @@ Qwen3.5-397B-A17B-FP8: GLM-5-FP8: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --disaggregation-transfer-backend mori --tool-call-parser glm47 --reasoning-parser glm45 --model-loader-extra-config '{\\\"enable_multithread_load\\\": true, \\\"num_threads\\\": 8}'" mtp_flags: "" - dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + ep_flags: "--moe-a2a-backend mori" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -255,9 +265,10 @@ GLM-5-FP8: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4-Preview: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + ep_flags: "--ep-dispatch-algorithm fake --moe-a2a-backend mori --deepep-mode normal" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -286,9 +297,10 @@ DeepSeek-R1-0528-MXFP4-Preview: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + ep_flags: "--ep-dispatch-algorithm fake --moe-a2a-backend mori --deepep-mode normal" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -317,9 +329,10 @@ DeepSeek-R1-0528-MXFP4: cuda_graph_bs_range: "1-128" DeepSeek-R1-0528-MXFP4-v2: - base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode " - dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 " + dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 " + ep_flags: "--ep-dispatch-algorithm fake --moe-a2a-backend mori --deepep-mode normal" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -349,3 +362,39 @@ DeepSeek-R1-0528-MXFP4-v2: max_running_requests: 128 chunked_prefill_size: 262144 cuda_graph_bs_range: "1-128" + +# DeepSeek-V4-Pro PD-disaggregation recipe (MI355X, SGLang + MoRI). +# KV transfer = mori for both topologies (pure-TP and DEP); the DP path additionally +# routes the MoE all-to-all through mori (--moe-a2a-backend mori) with dp-attention. +# DSv4-specific kernel routing (unified_kv_triton, AITER indexer, fp8 wo_a fallback, +# thinking/reasoning-effort, dispatch dtypes, per-role PER_RANK dispatch tokens) is set +# in env.sh's DeepSeek-V4-Pro block. The bench client uses --dsv4 framing (bench.sh). +# prefill.disable_cuda_graph routes prefill to --disable-cuda-graph; decode keeps +# --cuda-graph-bs. See dsv4_mi355x_sglang_disagg_plan.md. +DeepSeek-V4-Pro: + base_flags: "--decode-log-interval 100 --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend dsv4 --page-size 256 --swa-full-tokens-ratio 0.1 --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4 --disaggregation-transfer-backend mori" + dp_flags: "--enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + ep_flags: "--ep-dispatch-algorithm fake --moe-a2a-backend mori --deepep-mode normal" + prefill: + mem_fraction_static: 0.8 + disable_radix_cache: true + disable_cuda_graph: true + dp: + max_running_requests: 1024 + chunked_prefill_size: 131072 + context_length: 9217 + max_total_tokens: 262144 + no_dp: + max_running_requests: 128 + chunked_prefill_size: 131072 + context_length: 9217 + max_total_tokens: 262144 + decode: + mem_fraction_static: 0.85 + prefill_round_robin_balance: true + dp: + max_running_requests: 1024 + cuda_graph_bs_range: "1-128" + no_dp: + max_running_requests: 128 + cuda_graph_bs_range: "1-128" diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh index 957c84d60..ccba4814e 100644 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -213,7 +213,7 @@ if [ "$NODE_RANK" -eq 0 ]; then ${DECODE_ARGS} \ --policy random \ --backend atom \ - --log-level info \ + --log-level ${ATOMESH_LOG_LEVEL:-warn} \ --disable-health-check \ --disable-circuit-breaker \ --prometheus-port 29100" diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh index 34351b1e4..a7964778d 100755 --- a/benchmarks/multi_node/amd_utils/server_sglang.sh +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -119,12 +119,14 @@ def parse_range(cuda_range, default_start, default_end): print(f'MODEL_BASE_FLAGS=\"{m.get(\"base_flags\", \"\")}\"') print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"') print(f'MODEL_DP_FLAGS=\"{m.get(\"dp_flags\", \"\")}\"') +print(f'MODEL_EP_FLAGS=\"{m.get(\"ep_flags\", \"\")}\"') prefill = m.get('prefill', {}) decode = m.get('decode', {}) print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 0.8)}\"') print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"') +print(f'PREFILL_DISABLE_CUDA_GRAPH=\"{prefill.get(\"disable_cuda_graph\", False)}\"') dp = prefill.get('dp', {}) no_dp = prefill.get('no_dp', {}) @@ -136,6 +138,8 @@ print(f'PREFILL_MAX_TOTAL_TOKENS_DP=\"{dp.get(\"max_total_tokens\", \"\")}\"') print(f'PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP=\"{dp.get(\"enable_two_batch_overlap\", False)}\"') print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') +print(f'PREFILL_CONTEXT_LENGTH_NO_DP=\"{no_dp.get(\"context_length\", \"\")}\"') +print(f'PREFILL_MAX_TOTAL_TOKENS_NO_DP=\"{no_dp.get(\"max_total_tokens\", \"\")}\"') s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') @@ -183,8 +187,8 @@ else prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END)) prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP - prefill_context_length="" - prefill_max_total_tokens="" + prefill_context_length=$PREFILL_CONTEXT_LENGTH_NO_DP + prefill_max_total_tokens=$PREFILL_MAX_TOTAL_TOKENS_NO_DP prefill_enable_two_batch_overlap="false" fi @@ -193,7 +197,7 @@ if [[ "$PREFILL_ENABLE_DP" == "true" ]] && [[ "$PREFILL_ENABLE_EP" == "true" ]]; prefill_max_running_requests=$BENCH_MAX_CONC_VALUE prefill_dp_ranks=$PREFILL_TP_SIZE # MORI_MAX_DISPATCH_TOKENS_PREFILL stays at 8192 (no change) - MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$((MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2)) + # MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$((MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2)) echo "[DP+EP override] Prefill: max-running-requests=$prefill_max_running_requests, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_PREFILL" fi @@ -214,7 +218,7 @@ if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$DECODE_ENABLE_EP" == "true" ]]; t decode_max_running_requests=$BENCH_MAX_CONC_VALUE decode_dp_ranks=$DECODE_TP_SIZE MORI_MAX_DISPATCH_TOKENS_DECODE=$((BENCH_MAX_CONC_VALUE / decode_dp_ranks)) - MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10)) + # MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10)) # Update derived variable SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD @@ -222,7 +226,12 @@ if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$DECODE_ENABLE_EP" == "true" ]]; t fi # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) -PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " +# disable_cuda_graph (model-level) routes prefill to --disable-cuda-graph instead of --cuda-graph-bs. +if [[ "$PREFILL_DISABLE_CUDA_GRAPH" == "True" ]] || [[ "$PREFILL_DISABLE_CUDA_GRAPH" == "true" ]]; then + PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --disable-cuda-graph " +else + PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} " +fi if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" fi @@ -245,7 +254,7 @@ fi if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) - MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) + # MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) fi # ============================================================================= @@ -318,6 +327,7 @@ build_server_config() { local base_config="$MODEL_BASE_FLAGS" local mtp_config="" local dp_config="" + local ep_config="" local specific_config="" # MTP config (only if MTP is enabled and mode is decode) @@ -330,6 +340,13 @@ build_server_config() { dp_config="$MODEL_DP_FLAGS" fi + # EP config (only if EP is enabled): a2a backend, deepep mode, ep-dispatch algo. + # With ep=1 (EP disabled) these are dropped, so the MoE runs tensor-parallel (TP) + # instead of expert-parallel — even when dp-attention is on. + if [[ "$enable_ep" == "true" ]]; then + ep_config="$MODEL_EP_FLAGS" + fi + # Mode-specific config if [[ "$mode" == "prefill" ]]; then specific_config="$PREFILL_MODE_FLAGS" @@ -342,6 +359,9 @@ build_server_config() { if [[ -n "$base_config" ]]; then full_config="$full_config $base_config" fi + if [[ -n "$ep_config" ]]; then + full_config="$full_config $ep_config" + fi if [[ -n "$mtp_config" ]] && [[ "$mode" == "decode" ]]; then full_config="$full_config $mtp_config" fi @@ -418,7 +438,7 @@ if [ "$NODE_RANK" -eq 0 ]; then PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" fi set +x - PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_PREFILL:-${MORI_MAX_DISPATCH_TOKENS_PREFILL}} python3 -m sglang.launch_server \ --model-path $MODEL_DIR/$MODEL_NAME \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ @@ -651,7 +671,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}" fi set +x - PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ + PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_PREFILL:-${MORI_MAX_DISPATCH_TOKENS_PREFILL}} python3 -m sglang.launch_server \ --model-path $MODEL_DIR/${MODEL_NAME} \ --disaggregation-mode prefill \ --disaggregation-ib-device ${IBDEVICES} \ @@ -720,7 +740,7 @@ else DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}" fi set +x - DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ + DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK_DECODE:-${MORI_MAX_DISPATCH_TOKENS_DECODE}} python3 -m sglang.launch_server \ --model-path ${MODEL_DIR}/${MODEL_NAME} \ --disaggregation-mode decode \ --disaggregation-ib-device ${IBDEVICES} \ diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index fa3d65418..c264293a7 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -47,6 +47,10 @@ Required environment variables: MODEL_NAME Model name directory CONTAINER_IMAGE Docker image name (e.g., vllm_disagg_pd:latest) RUNNER_NAME Runner identifier (for job name) + +Optional environment variables: + DRY_RUN 1 = echo composed server/router launch commands instead of + running them (preview a recipe against a real allocation). USAGE } @@ -125,6 +129,12 @@ export BENCH_MAX_CONCURRENCY=${CONCURRENCIES} export BENCH_REQUEST_RATE=${REQUEST_RATE} export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8} +# DRY_RUN=1 makes server_sglang.sh echo the composed prefill/decode/router launch +# commands instead of executing them (useful for previewing a recipe against a real +# allocation). Threaded here → job.slurm → Docker (-e DRY_RUN) → server_sglang.sh. +# sbatch defaults to --export=ALL, so exporting it is what carries it into the job. +export DRY_RUN="${DRY_RUN:-0}" + # Eval-related env vars (threaded from workflow → runner → here → job.slurm → Docker) export RUN_EVAL="${RUN_EVAL:-false}" export EVAL_ONLY="${EVAL_ONLY:-false}" diff --git a/benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh new file mode 100755 index 000000000..d17d1a323 --- /dev/null +++ b/benchmarks/multi_node/dsv4_fp4_mi355x_sglang-disagg.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +# Use upstreamed multi_node scripts (no external clone needed) +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +# Set up SGL launch script-specific environment variables +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then +export PREFILL_ENABLE_EP=false +else +export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then +export PREFILL_ENABLE_DP=true +else +export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then +export DECODE_ENABLE_EP=false +else +export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then +export DECODE_ENABLE_DP=true +else +export DECODE_ENABLE_DP=false +fi + +# Launch jobs based on ISL/OSL +# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented +# by a list of numbers delimited by 'x'. This is because of how the underlying launch script +# expects the concurrencies. +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO}) + +if [[ $? -ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 404224648..80961ea97 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4021,3 +4021,9 @@ - "Bump image to lmsysorg/sglang-rocm:v0.5.13.post1-rocm720-mi35x-20260618." - "Enable SGLANG_DP_USE_GATHERV=1. Change to use all_gatherv + reduce_scatterv for tp + dp config." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1824 + +- config-keys: + - dsv4-fp4-mi355x-atom-disagg + description: + - "Eval-only all-evals run to validate reduced ATOM multinode logging (VLLM_LOGGING_LEVEL + uvicorn/atomesh)." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1882