diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 04df9e3a5..cdac4b328 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2612,6 +2612,57 @@ minimaxm3-fp4-mi355x-atom: - { tp: 4, conc-start: 1, conc-end: 256 } - { tp: 8, conc-start: 1, conc-end: 2 } +minimaxm3-fp4-mi355x-atom-disagg: + image: rocm/atom-dev:MiniMax-M3-20260622 + model: amd/MiniMax-M3-MXFP4 + model-prefix: minimaxm3 + runner: mi355x-disagg + precision: fp4 + framework: atom-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + # 1P1D TP4 + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + # 1k/1k scenario: same 1P1D TP4 topology as the 8k/1k scenario above + - isl: 1024 + osl: 1024 + search-space: + # 1P1D TP4 + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + # MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and # MI355X serving shape, but retain the default BF16 KV cache because this # checkpoint lacks calibrated ROCm FP8 attention scales. 
Use the TP8-only H100 diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh index 05384f435..c0dbb60ad 100755 --- a/benchmarks/multi_node/amd_utils/bench.sh +++ b/benchmarks/multi_node/amd_utils/bench.sh @@ -80,7 +80,11 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do extra_flags="--trust-remote-code --tokenizer $MODEL_PATH" else if [ "$IS_MTP" = "true" ]; then - extra_flags="--use-chat-template" + if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then + extra_flags="--dsv4" + else + extra_flags="--use-chat-template" + fi fi fi diff --git a/benchmarks/multi_node/amd_utils/env_atom.sh b/benchmarks/multi_node/amd_utils/env_atom.sh index 52f81b7d6..f2b906312 100644 --- a/benchmarks/multi_node/amd_utils/env_atom.sh +++ b/benchmarks/multi_node/amd_utils/env_atom.sh @@ -46,16 +46,18 @@ export LOGLEVEL=WARNING # mooncake RDMA KV transfer library path export LD_LIBRARY_PATH=/opt/venv/lib/python3.10/site-packages/mooncake:/opt/rocm/lib:${LD_LIBRARY_PATH:-} -# ATOM MoE gather/scatter interleave optimization -export ATOM_MOE_GU_ITLV=1 # ATOM_HOST_IP is set per-node in server_atom.sh (= host_ip, used as handshake IP) # aiter logging (WARNING to reduce noise; use DEBUG for troubleshooting) export AITER_LOG_LEVEL=WARNING -# Disable bf16->fp8 MoE bound (matches reference script) -export AITER_BF16_FP8_MOE_BOUND=0 +if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then + # ATOM MoE gather/scatter interleave optimization + export ATOM_MOE_GU_ITLV=1 + # Disable bf16->fp8 MoE bound (only for DeepSeek-V4-Pro) + export AITER_BF16_FP8_MOE_BOUND=0 +fi # Clear stale ATOM cache on startup (server_atom.sh handles this via rm -rf) # No env var needed; documented here for reference. 
diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 01a5bd386..92d790f76 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -363,6 +363,7 @@ DOCKER_ENV_COMMON=( -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY + -e BENCH_REQUEST_RATE=\$BENCH_REQUEST_RATE -e TQDM_MININTERVAL=\$TQDM_MININTERVAL -e DRY_RUN=\$DRY_RUN -e BENCHMARK_LOGS_DIR=/benchmark_logs @@ -411,10 +412,12 @@ elif [[ "$ENGINE" == "atom-disagg" ]]; then -e DECODE_PORT=${DECODE_PORT:-8020} -e ROUTER_PORT=${ROUTER_PORT:-30000} -e HANDSHAKE_PORT=${HANDSHAKE_PORT:-6301} - -e MEM_FRACTION=${MEM_FRACTION:-0.85} + -e MEM_FRAC_STATIC=${MEM_FRAC_STATIC:-0.85} -e KV_CACHE_DTYPE=${KV_CACHE_DTYPE:-fp8} -e BLOCK_SIZE=${BLOCK_SIZE:-16} -e MAX_NUM_SEQS=${MAX_NUM_SEQS:-256} + -e MAX_MODEL_LEN=${MAX_MODEL_LEN:-} + -e MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-} -e EXTRA_SERVER_ARGS=\${EXTRA_SERVER_ARGS:-} -e IBDEVICES=${IBDEVICES:-} ) diff --git a/benchmarks/multi_node/amd_utils/models_atom.yaml b/benchmarks/multi_node/amd_utils/models_atom.yaml index 0a3321fc2..85771eeaa 100644 --- a/benchmarks/multi_node/amd_utils/models_atom.yaml +++ b/benchmarks/multi_node/amd_utils/models_atom.yaml @@ -44,3 +44,13 @@ DeepSeek-V4-Pro: base_flags: "" mtp_flags: "" dp_flags: "" + +MiniMax-M3-MXFP4: + base_flags: "" + mtp_flags: "" + dp_flags: "" + +MiniMax-M3-MXFP8: + base_flags: "" + mtp_flags: "" + dp_flags: "" \ No newline at end of file diff --git a/benchmarks/multi_node/amd_utils/server_atom.sh b/benchmarks/multi_node/amd_utils/server_atom.sh old mode 100644 new mode 100755 index 957c84d60..e00a17a7d --- a/benchmarks/multi_node/amd_utils/server_atom.sh +++ b/benchmarks/multi_node/amd_utils/server_atom.sh @@ -35,6 +35,10 @@ DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" DECODE_ENABLE_EP="${DECODE_ENABLE_EP}" 
DECODE_ENABLE_DP="${DECODE_ENABLE_DP}" +# MTP speculative decoding: SPEC_DECODING=mtp enables it and DECODE_MTP_SIZE is passed as --num-speculative-tokens +SPEC_DECODING="${SPEC_DECODING:-}" +DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-1}" + # ATOM server ports (different from SGLang which uses 8000 for all) PREFILL_PORT="${PREFILL_PORT:-8010}" DECODE_PORT="${DECODE_PORT:-8020}" @@ -42,10 +46,12 @@ ROUTER_PORT="${ROUTER_PORT:-8000}" HANDSHAKE_PORT="${HANDSHAKE_PORT:-6301}" # ATOM server tuning (from reference script defaults) -MEM_FRACTION="${MEM_FRACTION:-0.85}" +MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.85}" KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-fp8}" BLOCK_SIZE="${BLOCK_SIZE:-16}" MAX_NUM_SEQS="${MAX_NUM_SEQS:-256}" +MAX_MODEL_LEN="${MAX_MODEL_LEN:-}" +MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-}" EXTRA_SERVER_ARGS="${EXTRA_SERVER_ARGS:-}" # Benchmark Configuration @@ -100,20 +106,24 @@ for i in $(seq 0 $((yD - 1))); do DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$idx]}:${DECODE_PORT}" done -echo "Prefill IPs : ${PREFILL_IPS[*]}" -echo "Decode IPs : ${DECODE_IPS[*]}" - PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP}" PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP}" DECODE_ENABLE_EP="${DECODE_ENABLE_EP}" DECODE_ENABLE_DP="${DECODE_ENABLE_DP}" +# Parallel args. NOTE(review): the "-gt 1" tests need a numeric *_ENABLE_EP, but minimaxm3_fp4_mi355x_atom-disagg.sh exports true/false; confirm which form callers pass PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP if [ "$PREFILL_ENABLE_DP" = "true" ]; then if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #DPA+EP PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-expert-parallel --enable-dp-attention ) - else #DPA+TP - PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention ) + else #DPA without EP (TBO only for DeepSeek-V4-Pro) + if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then + PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention --enable-tbo ) + export GPU_MAX_HW_QUEUES=5 + export ATOM_CPU_AFFINITY=1 + else #TP+DPA + PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention ) + fi fi fi @@ -121,13 +131,65 @@ DECODE_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP if [ "$DECODE_ENABLE_DP" = "true" ]; then if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #DPA+EP DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" 
--enable-expert-parallel --enable-dp-attention ) - else #DPA+TP - DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention ) + else #DPA without EP (TBO only for DeepSeek-V4-Pro) + if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then + DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention --enable-tbo ) + export GPU_MAX_HW_QUEUES=5 + export ATOM_CPU_AFFINITY=1 + else #TP+DPA + DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention ) + fi fi fi -echo "Prefill Parallel args : ${PREFILL_PARALLEL_ARGS[*]}" -echo "Decode Parallel args : ${DECODE_PARALLEL_ARGS[*]}" +# MTP args +SPEC_ARGS=() #empty unless SPEC_DECODING=mtp +if [ "$SPEC_DECODING" = "mtp" ]; then + SPEC_ARGS=(--method mtp --num-speculative-tokens "$DECODE_MTP_SIZE") +fi + +# HF overrides (single-quoted JSON preserved through eval) +HF_OVERRIDES_ARG="" +if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then + HF_OVERRIDES_ARG="--hf-overrides '{\"use_index_cache\":true,\"index_topk_freq\":4}'" +fi + +# KV cache dtype (skip if unset or 'auto') +KV_CACHE_ARG="" +if [[ -n "$KV_CACHE_DTYPE" && "$KV_CACHE_DTYPE" != "auto" ]]; then + KV_CACHE_ARG="--kv_cache_dtype ${KV_CACHE_DTYPE}" +fi + +# Optional model length / batched-token cap +MODEL_LEN_ARGS="" +if [[ -n "$MAX_MODEL_LEN" ]]; then + MODEL_LEN_ARGS="${MODEL_LEN_ARGS} --max-model-len ${MAX_MODEL_LEN}" +fi +if [[ -n "$MAX_NUM_BATCHED_TOKENS" ]]; then + MODEL_LEN_ARGS="${MODEL_LEN_ARGS} --max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS}" +fi + +if [[ "$MODEL_NAME" != "DeepSeek-V4-Pro" ]]; then + export AITER_QUICK_REDUCE_QUANTIZATION=INT4 +fi + +cat < remaining prefill nodes # rank NODE_OFFSET .. 
-> decode nodes # ============================================================================= - if [ "$NODE_RANK" -eq 0 ]; then # ────────────────────────────────────────────────────────────────────────── # Node 0: prefill server (producer) + atomesh router @@ -153,12 +214,15 @@ if [ "$NODE_RANK" -eq 0 ]; then --model ${MODEL_DIR}/${MODEL_NAME} \ --host 0.0.0.0 --server-port ${PREFILL_PORT} \ --trust-remote-code \ - "${PREFILL_PARALLEL_ARGS[@]}" \ - --kv_cache_dtype ${KV_CACHE_DTYPE} \ + ${PREFILL_PARALLEL_ARGS[*]} \ + ${SPEC_ARGS[*]} \ + ${KV_CACHE_ARG} \ --block-size ${BLOCK_SIZE} \ - --gpu-memory-utilization ${MEM_FRACTION} \ + --gpu-memory-utilization ${MEM_FRAC_STATIC} \ --max-num-seqs ${MAX_NUM_SEQS} \ + ${MODEL_LEN_ARGS} \ --no-enable_prefix_caching \ + ${HF_OVERRIDES_ARG} \ --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \ ${EXTRA_SERVER_ARGS}" @@ -248,6 +312,11 @@ if [ "$NODE_RANK" -eq 0 ]; then cd $ATOM_WS_PATH + export IS_MTP="false" + if [ "$SPEC_DECODING" = "mtp" ]; then + export IS_MTP="true" + fi + BENCH_CMD="bash $ATOM_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \ @@ -367,12 +436,15 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then --model ${MODEL_DIR}/${MODEL_NAME} \ --host 0.0.0.0 --server-port ${PREFILL_PORT} \ --trust-remote-code \ - "${PREFILL_PARALLEL_ARGS[@]}" \ - --kv_cache_dtype ${KV_CACHE_DTYPE} \ + ${PREFILL_PARALLEL_ARGS[*]} \ + ${SPEC_ARGS[*]} \ + ${KV_CACHE_ARG} \ --block-size ${BLOCK_SIZE} \ - --gpu-memory-utilization ${MEM_FRACTION} \ + --gpu-memory-utilization ${MEM_FRAC_STATIC} \ --max-num-seqs ${MAX_NUM_SEQS} \ + ${MODEL_LEN_ARGS} \ --no-enable_prefix_caching \ + ${HF_OVERRIDES_ARG} \ --kv-transfer-config 
'{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \ ${EXTRA_SERVER_ARGS}" @@ -449,12 +521,15 @@ else --model ${MODEL_DIR}/${MODEL_NAME} \ --host 0.0.0.0 --server-port ${DECODE_PORT} \ --trust-remote-code \ - "${DECODE_PARALLEL_ARGS[@]}" \ - --kv_cache_dtype ${KV_CACHE_DTYPE} \ + ${DECODE_PARALLEL_ARGS[*]} \ + ${SPEC_ARGS[*]} \ + ${KV_CACHE_ARG} \ --block-size ${BLOCK_SIZE} \ - --gpu-memory-utilization ${MEM_FRACTION} \ + --gpu-memory-utilization ${MEM_FRAC_STATIC} \ --max-num-seqs ${DECODE_MAX_NUM_SEQS} \ + ${MODEL_LEN_ARGS} \ --no-enable_prefix_caching \ + ${HF_OVERRIDES_ARG} \ --kv-transfer-config '{\"kv_role\":\"kv_consumer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \ --cudagraph-capture-sizes "${CUDAGRAPH_SIZES}" \ ${EXTRA_SERVER_ARGS}" diff --git a/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh b/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh new file mode 100644 index 000000000..505f74319 --- /dev/null +++ b/benchmarks/multi_node/minimaxm3_fp4_mi355x_atom-disagg.sh @@ -0,0 +1,94 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +# Use upstreamed multi_node scripts (no external clone needed) +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +# Set up environment variables for the ATOM launch scripts in amd_utils. NOTE(review): MODEL_NAME is exported below but is not listed in check_env_vars above; confirm the caller always sets it +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then +export 
PREFILL_ENABLE_EP=false +else +export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then +export PREFILL_ENABLE_DP=true +else +export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then +export DECODE_ENABLE_EP=false +else +export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then +export DECODE_ENABLE_DP=true +else +export DECODE_ENABLE_DP=false +fi + +# No MTP for MiniMax-M3: server_atom.sh only adds speculative-decoding flags when SPEC_DECODING is exactly "mtp" +export SPEC_DECODING="none" +export DECODE_MTP_SIZE=0 + +# ATOM server tuning: block size 128, KV cache dtype auto; each value can be overridden from the environment except MAX_MODEL_LEN, which is pinned +export KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-auto}" +export BLOCK_SIZE="${BLOCK_SIZE:-128}" +export MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.8}" +export MAX_MODEL_LEN=32768 +export MAX_NUM_SEQS="${MAX_NUM_SEQS:-128}" +export MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-32768}" + +# Launch jobs based on ISL/OSL +# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented +# by a list of numbers delimited by 'x'. This is because of how the underlying launch script +# expects the concurrencies. +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO}) + +if [[ $? 
-ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 4265d320b..cdcd78556 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4065,6 +4065,14 @@ - "8k/1k: 1p4d-dep4-tep4 (conc 128), 1p4d-dep4-tp8 (conc 4-256), 3p1d-dep4-dep16 (conc 1024), 6p1d-dep4-dep16 (conc 3072), 8p1d-dep4-dep16 (conc 6144)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1862 +- config-keys: + - minimaxm3-fp4-mi355x-atom-disagg + description: + - "Add minimaxm3-fp4-mi355x-atom-disagg CI script: multi-node disaggregated PD on MI355X via ATOM for MiniMax-M3-MXFP4" + - "No MTP, KV_CACHE_DTYPE=auto (MXFP4 native, no fp8 override), MAX_MODEL_LEN=32768, MAX_NUM_BATCHED_TOKENS=32768" + - "server_atom.sh: conditional --kv_cache_dtype, MAX_MODEL_LEN/MAX_NUM_BATCHED_TOKENS/CUDAGRAPH_OPT support, syntax fixes" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1856 + - config-keys: - dsv4-fp4-mi355x-sglang description: