diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 606e3c2af..d5e334b8b 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1,5 +1,5 @@
 dsr1-fp4-mi355x-sglang:
-  image: lmsysorg/sglang:v0.5.12-rocm700-mi35x
+  image: lmsysorg/sglang:v0.5.13-rocm700-mi35x
   model: amd/DeepSeek-R1-0528-MXFP4-Preview
   model-prefix: dsr1
   runner: mi355x
@@ -11,13 +11,18 @@ dsr1-fp4-mi355x-sglang:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 4, conc-start: 4, conc-end: 64 }
-      - { tp: 8, conc-start: 4, conc-end: 64 }
+      - { tp: 4, conc-start: 4, conc-end: 256 }
+      - { tp: 8, conc-start: 4, conc-end: 256 }
+      - { tp: 4, ep: 4, conc-start: 4, conc-end: 256 }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 4, conc-start: 4, conc-end: 64 }
-      - { tp: 8, conc-start: 4, conc-end: 64 }
+      - { tp: 4, conc-start: 4, conc-end: 256 }
+      - { tp: 8, conc-start: 4, conc-end: 256 }
+      - { tp: 4, ep: 4, conc-start: 4, conc-end: 256 }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 }
+
     # Agentic-coding sweep commented out for this image-bump PR — the
     # 10-conc agentic matrix amplifies sweep cost and the bump validation
     # only needs the fixed-seq-len throughput shape. Re-enable once the
diff --git a/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x.sh
index bb6ce75cb..b7d758d29 100644
--- a/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x.sh
@@ -5,6 +5,7 @@ source "$(dirname "$0")/../../benchmark_lib.sh"
 check_env_vars \
     MODEL \
     TP \
+    EP_SIZE \
     CONC \
     ISL \
     OSL \
@@ -37,16 +38,24 @@ fi
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
+# Optional parallelism flags. Start from an empty array so the expansion in the
+# launch command is well-defined when EP is off and no inherited value leaks in.
+PARALLEL_ARGS=()
+if [ "${EP_SIZE:-1}" -gt 1 ]; then
+    PARALLEL_ARGS+=( --ep-size "$EP_SIZE" )
+fi
+
 set -x
 python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \
 --host=0.0.0.0 --port=$PORT \
+"${PARALLEL_ARGS[@]}" \
 --tensor-parallel-size=$TP \
 --chunked-prefill-size=$PREFILL_SIZE \
 --mem-fraction-static=0.8 \
 --disable-radix-cache \
 --num-continuous-decode-steps=4 \
 --max-prefill-tokens=$PREFILL_SIZE \
---cuda-graph-max-bs=128 \
+--cuda-graph-max-bs=512 \
 --attention-backend aiter \
 --kv-cache-dtype fp8_e4m3 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 1f88b47aa..8cae1e723 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3919,3 +3919,8 @@
     - "Also update all configs for DSR1 TRTLLM FP8 to reflect latest released image usage"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1767
 
+- config-keys:
+    - dsr1-fp4-mi355x-sglang
+  description:
+    - "Introduce EP configurations"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1811