SemiAnalysisAI · ppalanga · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
@@ -1,5 +1,5 @@
 dsr1-fp4-mi355x-sglang:
-  image: lmsysorg/sglang:v0.5.12-rocm700-mi35x
+  image: lmsysorg/sglang:v0.5.13-rocm700-mi35x
   model: amd/DeepSeek-R1-0528-MXFP4-Preview
   model-prefix: dsr1
   runner: mi355x
@@ -11,13 +11,18 @@ dsr1-fp4-mi355x-sglang:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 4, conc-start: 4, conc-end: 64 }
-      - { tp: 8, conc-start: 4, conc-end: 64 }
+      - { tp: 4, conc-start: 4, conc-end: 256 }
+      - { tp: 8, conc-start: 4, conc-end: 256 }
+      - { tp: 4, ep: 4, conc-start: 4, conc-end: 256 }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 4, conc-start: 4, conc-end: 64 }
-      - { tp: 8, conc-start: 4, conc-end: 64 }
+      - { tp: 4, conc-start: 4, conc-end: 256 }
+      - { tp: 8, conc-start: 4, conc-end: 256 }
+      - { tp: 4, ep: 4, conc-start: 4, conc-end: 256 }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 }
+
     # Agentic-coding sweep commented out for this image-bump PR — the
     # 10-conc agentic matrix amplifies sweep cost and the bump validation
     # only needs the fixed-seq-len throughput shape. Re-enable once the

diff --git a/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp4_mi355x.sh
@@ -5,6 +5,7 @@ source "$(dirname "$0")/../../benchmark_lib.sh"
 check_env_vars \
     MODEL \
     TP \
+    EP_SIZE \
     CONC \
     ISL \
     OSL \
@@ -37,16 +38,24 @@ fi
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
+# Optional parallelism flags for the server launch. Reset to an empty array so
+# a PARALLEL_ARGS value inherited from the environment cannot leak into the command.
+PARALLEL_ARGS=()
+if [ "${EP_SIZE:-1}" -gt 1 ]; then
+    PARALLEL_ARGS+=( --ep-size "$EP_SIZE" )
+fi
+
 set -x
 python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \
 --host=0.0.0.0 --port=$PORT \
+"${PARALLEL_ARGS[@]}" \
 --tensor-parallel-size=$TP \
 --chunked-prefill-size=$PREFILL_SIZE \
 --mem-fraction-static=0.8 \
 --disable-radix-cache \
 --num-continuous-decode-steps=4 \
 --max-prefill-tokens=$PREFILL_SIZE \
---cuda-graph-max-bs=128 \
+--cuda-graph-max-bs=512 \
 --attention-backend aiter \
 --kv-cache-dtype fp8_e4m3 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3919,3 +3919,10 @@
     - "Also update all configs for DSR1 TRTLLM FP8 to reflect latest released image usage"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1767
 
+- config-keys:
+    - dsr1-fp4-mi355x-sglang
+  description:
+    - "Introduce EP configurations"
+    - "Bump image to lmsysorg/sglang:v0.5.13-rocm700-mi35x"
+    - "Raise conc-end from 64 to 256 and --cuda-graph-max-bs from 128 to 512"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1811