diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 04df9e3a5..7771926b6 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2612,6 +2612,59 @@ minimaxm3-fp4-mi355x-atom: - { tp: 4, conc-start: 1, conc-end: 256 } - { tp: 8, conc-start: 1, conc-end: 2 } +minimaxm3-fp8-mi355x-atom-disagg: + image: rocm/atom-dev:MiniMax-M3-20260622 + model: amd/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: mi355x-disagg + precision: fp8 + framework: atom-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + # 1P1D TP4 + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + # 1P1D TP4 + - isl: 1024 + osl: 1024 + search-space: + # 1P1D TP4 + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + +# MiniMax-M3 MXFP8 MI300X recipe. Use the TP8-only H100 search space: TP8 for +# latency and TP8+EP8 (TEP) at high concurrency. # MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and # MI355X serving shape, but retain the default BF16 KV cache because this # checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100 diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index 05384f435..c0dbb60ad 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -80,7 +80,11 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do extra_flags="--trust-remote-code --tokenizer $MODEL_PATH" else if [ "$IS_MTP" = "true" ]; then - extra_flags="--use-chat-template" + if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then + extra_flags="--dsv4" + else + extra_flags="--use-chat-template" + fi fi fi diff --git a/benchmarks/multi_node/amd_utils/env_atom.sh b/benchmarks/multi_node/amd_utils/env_atom.sh index 52f81b7d6..f2b906312 100644 --- a/benchmarks/multi_node/amd_utils/env_atom.sh +++ b/benchmarks/multi_node/amd_utils/env_atom.sh @@ -46,16 +46,18 @@ export LOGLEVEL=WARNING # mooncake RDMA KV transfer library path export LD_LIBRARY_PATH=/opt/venv/lib/python3.10/site-packages/mooncake:/opt/rocm/lib:${LD_LIBRARY_PATH:-} -# ATOM MoE gather/scatter interleave optimization -export ATOM_MOE_GU_ITLV=1 # ATOM_HOST_IP is set per-node in server_atom.sh (= host_ip, used as handshake IP) # aiter logging (WARNING to reduce noise; use DEBUG for troubleshooting) export AITER_LOG_LEVEL=WARNING -# Disable bf16->fp8 MoE bound (matches reference script) -export AITER_BF16_FP8_MOE_BOUND=0 +if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then + # ATOM MoE gather/scatter interleave optimization + export ATOM_MOE_GU_ITLV=1 + # Disable bf16->fp8 MoE bound (only for DeepSeek-V4-Pro) + export AITER_BF16_FP8_MOE_BOUND=0 +fi # Clear stale ATOM cache on startup (server_atom.sh handles this via rm -rf) # No env var needed; documented here for reference. diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 01a5bd386..92d790f76 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -363,6 +363,7 @@ DOCKER_ENV_COMMON=( -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY + -e BENCH_REQUEST_RATE=\$BENCH_REQUEST_RATE -e TQDM_MININTERVAL=\$TQDM_MININTERVAL -e DRY_RUN=\$DRY_RUN -e BENCHMARK_LOGS_DIR=/benchmark_logs @@ -411,10 +412,12 @@ elif [[ "$ENGINE" == "atom-disagg" ]]; then -e DECODE_PORT=${DECODE_PORT:-8020} -e ROUTER_PORT=${ROUTER_PORT:-30000} -e HANDSHAKE_PORT=${HANDSHAKE_PORT:-6301} - -e MEM_FRACTION=${MEM_FRACTION:-0.85} + -e MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.85} -e KV_CACHE_DTYPE=${KV_CACHE_DTYPE:-fp8} -e BLOCK_SIZE=${BLOCK_SIZE:-16} -e MAX_NUM_SEQS=${MAX_NUM_SEQS:-256} + -e MAX_MODEL_LEN=${MAX_MODEL_LEN:-} + -e MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-} -e EXTRA_SERVER_ARGS=\${EXTRA_SERVER_ARGS:-} -e IBDEVICES=${IBDEVICES:-} ) diff --git a/benchmarks/multi_node/amd_utils/models_atom.yaml b/benchmarks/multi_node/amd_utils/models_atom.yaml index 0a3321fc2..85771eeaa 100644 --- a/benchmarks/multi_node/amd_utils/models_atom.yaml +++ b/benchmarks/multi_node/amd_utils/models_atom.yaml @@ -44,3 +44,13 @@ DeepSeek-V4-Pro: base_flags: "" mtp_flags: "" dp_flags: "" + +MiniMax-M3-MXFP4: + base_flags: "" + mtp_flags: "" + dp_flags: "" + +MiniMax-M3-MXFP8: + base_flags: "" + mtp_flags: "" + dp_flags: "" \ No newline at end of file diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh old mode 100644 new mode 100755 index 957c84d60..5ecb85ec2 --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -35,6 +35,10 @@ DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" DECODE_ENABLE_EP="${DECODE_ENABLE_EP}" DECODE_ENABLE_DP="${DECODE_ENABLE_DP}" +# MTP +SPEC_DECODING="${SPEC_DECODING:-}" +DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-1}" + # ATOM server ports (different from SGLang which uses 8000 for all) PREFILL_PORT="${PREFILL_PORT:-8010}" DECODE_PORT="${DECODE_PORT:-8020}" @@ -42,10 +46,12 @@ ROUTER_PORT="${ROUTER_PORT:-8000}" HANDSHAKE_PORT="${HANDSHAKE_PORT:-6301}" # ATOM server tuning (from reference script defaults) -MEM_FRACTION="${MEM_FRACTION:-0.85}" +MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.85}" KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-fp8}" BLOCK_SIZE="${BLOCK_SIZE:-16}" MAX_NUM_SEQS="${MAX_NUM_SEQS:-256}" +MAX_MODEL_LEN="${MAX_MODEL_LEN:-}" +MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-}" EXTRA_SERVER_ARGS="${EXTRA_SERVER_ARGS:-}" # Benchmark Configuration @@ -100,34 +106,91 @@ for i in $(seq 0 $((yD - 1))); do DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$idx]}:${DECODE_PORT}" done -echo "Prefill IPs : ${PREFILL_IPS[*]}" -echo "Decode IPs : ${DECODE_IPS[*]}" - PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP}" PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP}" DECODE_ENABLE_EP="${DECODE_ENABLE_EP}" DECODE_ENABLE_DP="${DECODE_ENABLE_DP}" +# Parallel args PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP if [ "$PREFILL_ENABLE_DP" = "true" ]; then if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #DPA+EP PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-expert-parallel --enable-dp-attention ) - else #DPA+TP - PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention ) + else #TP+DPA+TBO + if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then + PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention --enable-tbo ) + export GPU_MAX_HW_QUEUES=5 + export ATOM_CPU_AFFINITY=1 + else #TP+DPA + PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention ) + fi fi fi +# (srok), split DPA & TBO cases DECODE_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP if [ "$DECODE_ENABLE_DP" = "true" ]; then if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #DPA+EP DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-expert-parallel --enable-dp-attention ) - else #DPA+TP - DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention ) + else #TP+DPA+TBO + if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then + DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention --enable-tbo ) + export GPU_MAX_HW_QUEUES=5 + export ATOM_CPU_AFFINITY=1 + else #TP+DPA + DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention ) + fi fi fi -echo "Prefill Parallel args : ${PREFILL_PARALLEL_ARGS[*]}" -echo "Decode Parallel args : ${DECODE_PARALLEL_ARGS[*]}" +# MTP args +SPEC_ARGS=() #TP +if [ "$SPEC_DECODING" = "mtp" ]; then + SPEC_ARGS=(--method mtp --num-speculative-tokens "$DECODE_MTP_SIZE") +fi + +# HF overrides (single-quoted JSON preserved through eval) +HF_OVERRIDES_ARG="" +if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then + HF_OVERRIDES_ARG="--hf-overrides '{\"use_index_cache\":true,\"index_topk_freq\":4}'" +fi + +# KV cache dtype (skip if unset or 'auto') +KV_CACHE_ARG="" +if [[ -n "$KV_CACHE_DTYPE" && "$KV_CACHE_DTYPE" != "auto" ]]; then + KV_CACHE_ARG="--kv_cache_dtype ${KV_CACHE_DTYPE}" +fi + +# Optional model length / batched-token cap +MODEL_LEN_ARGS="" +if [[ -n "$MAX_MODEL_LEN" ]]; then + MODEL_LEN_ARGS="${MODEL_LEN_ARGS} --max-model-len ${MAX_MODEL_LEN}" +fi +if [[ -n "$MAX_NUM_BATCHED_TOKENS" ]]; then + MODEL_LEN_ARGS="${MODEL_LEN_ARGS} --max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS}" +fi + +if [[ "$MODEL_NAME" != "DeepSeek-V4-Pro" ]]; then + export AITER_QUICK_REDUCE_QUANTIZATION=INT4 +fi + +cat <&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 4265d320b..d384196a9 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4087,6 +4087,15 @@ - "6 topologies across 1k/1k and 8k/1k: 1P1D TP4 STP + wide-EP (DEP4 prefill / DEP16 decode) from 1P1D up to 8P1D, recipes under benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1810 +- config-keys: + - minimaxm3-fp8-mi355x-atom-disagg + description: + - "Add minimaxm3-fp8-mi355x-atom-disagg CI recipe: multi-node disaggregated PD on MI355X via ATOM for MiniMax-M3-MXFP8" + - "Settings aligned with slurm reference: MEM_FRAC_STATIC=0.8, MAX_NUM_SEQS=128, BLOCK_SIZE=128, MAX_MODEL_LEN=32768, KV_CACHE_DTYPE=auto" + - "server_atom.sh: fix _MAX_CONC assignment before cudagraph size check; gate ATOM_MOE_GU_ITLV/AITER_BF16_FP8_MOE_BOUND on DeepSeek-V4-Pro only" + - "Search space: ISL=8192 and ISL=1024, 1P1D TP4, conc 1-512" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1865 + - config-keys: - minimaxm3-fp8-b300-dynamo-vllm description: