-
Notifications
You must be signed in to change notification settings - Fork 206
[AMD] Add MiniMax-M3-FP4 MI355X ATOMMESH #1856
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
512504d
7ffb3e3
50634be
2ccacee
dd1e8ac
1f854e4
1cf914d
1e7f3da
638b837
3c89eae
23808cf
78806c3
72734b0
688eb03
f804274
931727a
501a8cc
b430d91
7b80ea7
4ea680d
74aa3e0
26ba108
b76105f
de6ddc6
ac99718
aa67d5e
2de59c3
d19ea61
198e2c5
bf0538d
9ff735c
d0de9f3
190b055
b60d4ca
90d526d
392a286
cea10d2
f550a11
2f73986
132f240
f9545e5
3a7ab53
5f0310c
70a1a6f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -80,7 +80,11 @@ for max_concurrency in "${chosen_concurrencies[@]}"; do | |
| extra_flags="--trust-remote-code --tokenizer $MODEL_PATH" | ||
| else | ||
| if [ "$IS_MTP" = "true" ]; then | ||
| extra_flags="--use-chat-template" | ||
| if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Shall we clean up this part?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, I agree; it looks ugly. It would be cleaner if that part moved to benchmarks/multi_node/amd_utils/models_atom.yaml. |
||
| extra_flags="--dsv4" | ||
| else | ||
| extra_flags="--use-chat-template" | ||
| fi | ||
| fi | ||
| fi | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -46,16 +46,18 @@ export LOGLEVEL=WARNING | |
| # mooncake RDMA KV transfer library path | ||
| export LD_LIBRARY_PATH=/opt/venv/lib/python3.10/site-packages/mooncake:/opt/rocm/lib:${LD_LIBRARY_PATH:-} | ||
|
|
||
| # ATOM MoE gather/scatter interleave optimization | ||
| export ATOM_MOE_GU_ITLV=1 | ||
|
|
||
| # ATOM_HOST_IP is set per-node in server_atom.sh (= host_ip, used as handshake IP) | ||
|
|
||
| # aiter logging (WARNING to reduce noise; use DEBUG for troubleshooting) | ||
| export AITER_LOG_LEVEL=WARNING | ||
|
|
||
| # Disable bf16->fp8 MoE bound (matches reference script) | ||
| export AITER_BF16_FP8_MOE_BOUND=0 | ||
| if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ditto.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, I agree; it looks ugly. It would be cleaner if that part moved to benchmarks/multi_node/amd_utils/models_atom.yaml. |
||
| # ATOM MoE gather/scatter interleave optimization | ||
| export ATOM_MOE_GU_ITLV=1 | ||
| # Disable bf16->fp8 MoE bound (only for DeepSeek-V4-Pro) | ||
| export AITER_BF16_FP8_MOE_BOUND=0 | ||
| fi | ||
|
|
||
| # Clear stale ATOM cache on startup (server_atom.sh handles this via rm -rf) | ||
| # No env var needed; documented here for reference. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,17 +35,23 @@ DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" | |
| DECODE_ENABLE_EP="${DECODE_ENABLE_EP}" | ||
| DECODE_ENABLE_DP="${DECODE_ENABLE_DP}" | ||
|
|
||
| # MTP | ||
| SPEC_DECODING="${SPEC_DECODING:-}" | ||
| DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-1}" | ||
|
|
||
| # ATOM server ports (different from SGLang which uses 8000 for all) | ||
| PREFILL_PORT="${PREFILL_PORT:-8010}" | ||
| DECODE_PORT="${DECODE_PORT:-8020}" | ||
| ROUTER_PORT="${ROUTER_PORT:-8000}" | ||
| HANDSHAKE_PORT="${HANDSHAKE_PORT:-6301}" | ||
|
|
||
| # ATOM server tuning (from reference script defaults) | ||
| MEM_FRACTION="${MEM_FRACTION:-0.85}" | ||
| MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.85}" | ||
| KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-fp8}" | ||
| BLOCK_SIZE="${BLOCK_SIZE:-16}" | ||
| MAX_NUM_SEQS="${MAX_NUM_SEQS:-256}" | ||
| MAX_MODEL_LEN="${MAX_MODEL_LEN:-}" | ||
| MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-}" | ||
| EXTRA_SERVER_ARGS="${EXTRA_SERVER_ARGS:-}" | ||
|
|
||
| # Benchmark Configuration | ||
|
|
@@ -100,34 +106,90 @@ for i in $(seq 0 $((yD - 1))); do | |
| DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$idx]}:${DECODE_PORT}" | ||
| done | ||
|
|
||
| echo "Prefill IPs : ${PREFILL_IPS[*]}" | ||
| echo "Decode IPs : ${DECODE_IPS[*]}" | ||
|
|
||
| PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP}" | ||
| PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP}" | ||
| DECODE_ENABLE_EP="${DECODE_ENABLE_EP}" | ||
| DECODE_ENABLE_DP="${DECODE_ENABLE_DP}" | ||
|
|
||
| # Parallel args | ||
| PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP | ||
| if [ "$PREFILL_ENABLE_DP" = "true" ]; then | ||
| if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #DPA+EP | ||
| PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-expert-parallel --enable-dp-attention ) | ||
| else #DPA+TP | ||
| PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention ) | ||
| else #TP+DPA+TBO | ||
| if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ditto.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, I agree; it looks ugly. It would be cleaner if that part moved to benchmarks/multi_node/amd_utils/models_atom.yaml. |
||
| PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention --enable-tbo ) | ||
| export GPU_MAX_HW_QUEUES=5 | ||
| export ATOM_CPU_AFFINITY=1 | ||
| else #TP+DPA | ||
| PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention ) | ||
| fi | ||
| fi | ||
| fi | ||
|
|
||
| DECODE_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP | ||
| if [ "$DECODE_ENABLE_DP" = "true" ]; then | ||
| if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #DPA+EP | ||
| DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-expert-parallel --enable-dp-attention ) | ||
| else #DPA+TP | ||
|
seungrokj marked this conversation as resolved.
|
||
| DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention ) | ||
| else #TP+DPA+TBO | ||
| if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ditto.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, I agree; it looks ugly. It would be cleaner if that part moved to benchmarks/multi_node/amd_utils/models_atom.yaml. |
||
| DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention --enable-tbo ) | ||
| export GPU_MAX_HW_QUEUES=5 | ||
| export ATOM_CPU_AFFINITY=1 | ||
| else #TP+DPA | ||
| DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention ) | ||
| fi | ||
| fi | ||
| fi | ||
|
|
||
| echo "Prefill Parallel args : ${PREFILL_PARALLEL_ARGS[*]}" | ||
| echo "Decode Parallel args : ${DECODE_PARALLEL_ARGS[*]}" | ||
| # MTP args | ||
| SPEC_ARGS=() #TP | ||
| if [ "$SPEC_DECODING" = "mtp" ]; then | ||
| SPEC_ARGS=(--method mtp --num-speculative-tokens "$DECODE_MTP_SIZE") | ||
| fi | ||
|
|
||
| # HF overrides (single-quoted JSON preserved through eval) | ||
| HF_OVERRIDES_ARG="" | ||
| if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then | ||
| HF_OVERRIDES_ARG="--hf-overrides '{\"use_index_cache\":true,\"index_topk_freq\":4}'" | ||
| fi | ||
|
|
||
| # KV cache dtype (skip if unset or 'auto') | ||
| KV_CACHE_ARG="" | ||
| if [[ -n "$KV_CACHE_DTYPE" && "$KV_CACHE_DTYPE" != "auto" ]]; then | ||
| KV_CACHE_ARG="--kv_cache_dtype ${KV_CACHE_DTYPE}" | ||
| fi | ||
|
|
||
| # Optional model length / batched-token cap | ||
| MODEL_LEN_ARGS="" | ||
| if [[ -n "$MAX_MODEL_LEN" ]]; then | ||
| MODEL_LEN_ARGS="${MODEL_LEN_ARGS} --max-model-len ${MAX_MODEL_LEN}" | ||
| fi | ||
| if [[ -n "$MAX_NUM_BATCHED_TOKENS" ]]; then | ||
| MODEL_LEN_ARGS="${MODEL_LEN_ARGS} --max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS}" | ||
| fi | ||
|
|
||
| if [[ "$MODEL_NAME" != "DeepSeek-V4-Pro" ]]; then | ||
| export AITER_QUICK_REDUCE_QUANTIZATION=INT4 | ||
| fi | ||
|
|
||
| cat <<INFO | ||
| === Configuration === | ||
| PREFILL : ${PREFILL_IPS[*]} (TP=${PREFILL_TP_SIZE}, EP=${PREFILL_ENABLE_EP:-false}, DP=${PREFILL_ENABLE_DP:-false}, port=${PREFILL_PORT}) | ||
| DECODE : ${DECODE_IPS[*]} (TP=${DECODE_TP_SIZE}, EP=${DECODE_ENABLE_EP:-false}, DP=${DECODE_ENABLE_DP:-false}, port=${DECODE_PORT}) | ||
| ROUTER : port=${ROUTER_PORT} | ||
| MODEL : ${MODEL_NAME} | ||
| BACKEND : atom (PD mooncake KV transfer) | ||
| MTP : method=mtp num_speculative_tokens=${DECODE_MTP_SIZE} | ||
| xP/yD : ${xP} / ${yD} | ||
| KV cache : dtype=${KV_CACHE_DTYPE:-auto} block_size=${BLOCK_SIZE} mem_frac=${MEM_FRAC_STATIC} | ||
| Model len: max_model_len=${MAX_MODEL_LEN:-unset} max_num_batched_tokens=${MAX_NUM_BATCHED_TOKENS:-unset} | ||
| Prefill args : ${PREFILL_PARALLEL_ARGS[*]} | ||
| Decode args : ${DECODE_PARALLEL_ARGS[*]} | ||
| Spec args : ${SPEC_ARGS[*]} | ||
| Opt args : ${HF_OVERRIDES_ARG} | ||
| ===================== | ||
| INFO | ||
|
|
||
| # ============================================================================= | ||
| # Node Role Assignment | ||
|
|
@@ -137,7 +199,6 @@ echo "Decode Parallel args : ${DECODE_PARALLEL_ARGS[*]}" | |
| # rank 1 .. (NODE_OFFSET-1) -> remaining prefill nodes | ||
| # rank NODE_OFFSET .. -> decode nodes | ||
| # ============================================================================= | ||
|
|
||
| if [ "$NODE_RANK" -eq 0 ]; then | ||
| # ────────────────────────────────────────────────────────────────────────── | ||
| # Node 0: prefill server (producer) + atomesh router | ||
|
|
@@ -153,12 +214,15 @@ if [ "$NODE_RANK" -eq 0 ]; then | |
| --model ${MODEL_DIR}/${MODEL_NAME} \ | ||
| --host 0.0.0.0 --server-port ${PREFILL_PORT} \ | ||
| --trust-remote-code \ | ||
| "${PREFILL_PARALLEL_ARGS[@]}" \ | ||
| --kv_cache_dtype ${KV_CACHE_DTYPE} \ | ||
| ${PREFILL_PARALLEL_ARGS[*]} \ | ||
| ${SPEC_ARGS[*]} \ | ||
|
seungrokj marked this conversation as resolved.
|
||
| ${KV_CACHE_ARG} \ | ||
| --block-size ${BLOCK_SIZE} \ | ||
| --gpu-memory-utilization ${MEM_FRACTION} \ | ||
| --gpu-memory-utilization ${MEM_FRAC_STATIC} \ | ||
| --max-num-seqs ${MAX_NUM_SEQS} \ | ||
| ${MODEL_LEN_ARGS} \ | ||
| --no-enable_prefix_caching \ | ||
| ${HF_OVERRIDES_ARG} \ | ||
| --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \ | ||
| ${EXTRA_SERVER_ARGS}" | ||
|
|
||
|
|
@@ -248,6 +312,11 @@ if [ "$NODE_RANK" -eq 0 ]; then | |
|
|
||
| cd $ATOM_WS_PATH | ||
|
|
||
| export IS_MTP="false" | ||
| if [ "$SPEC_DECODING" = "mtp" ]; then | ||
| export IS_MTP="true" | ||
| fi | ||
|
|
||
| BENCH_CMD="bash $ATOM_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ | ||
| $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ | ||
| ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \ | ||
|
|
@@ -367,12 +436,15 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then | |
| --model ${MODEL_DIR}/${MODEL_NAME} \ | ||
| --host 0.0.0.0 --server-port ${PREFILL_PORT} \ | ||
| --trust-remote-code \ | ||
| "${PREFILL_PARALLEL_ARGS[@]}" \ | ||
| --kv_cache_dtype ${KV_CACHE_DTYPE} \ | ||
| ${PREFILL_PARALLEL_ARGS[*]} \ | ||
| ${SPEC_ARGS[*]} \ | ||
| ${KV_CACHE_ARG} \ | ||
| --block-size ${BLOCK_SIZE} \ | ||
| --gpu-memory-utilization ${MEM_FRACTION} \ | ||
| --gpu-memory-utilization ${MEM_FRAC_STATIC} \ | ||
| --max-num-seqs ${MAX_NUM_SEQS} \ | ||
| ${MODEL_LEN_ARGS} \ | ||
| --no-enable_prefix_caching \ | ||
| ${HF_OVERRIDES_ARG} \ | ||
| --kv-transfer-config '{\"kv_role\":\"kv_producer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \ | ||
| ${EXTRA_SERVER_ARGS}" | ||
|
|
||
|
|
@@ -449,12 +521,15 @@ else | |
| --model ${MODEL_DIR}/${MODEL_NAME} \ | ||
| --host 0.0.0.0 --server-port ${DECODE_PORT} \ | ||
| --trust-remote-code \ | ||
| "${DECODE_PARALLEL_ARGS[@]}" \ | ||
| --kv_cache_dtype ${KV_CACHE_DTYPE} \ | ||
| ${DECODE_PARALLEL_ARGS[*]} \ | ||
|
seungrokj marked this conversation as resolved.
|
||
| ${SPEC_ARGS[*]} \ | ||
| ${KV_CACHE_ARG} \ | ||
| --block-size ${BLOCK_SIZE} \ | ||
| --gpu-memory-utilization ${MEM_FRACTION} \ | ||
| --gpu-memory-utilization ${MEM_FRAC_STATIC} \ | ||
| --max-num-seqs ${DECODE_MAX_NUM_SEQS} \ | ||
| ${MODEL_LEN_ARGS} \ | ||
| --no-enable_prefix_caching \ | ||
| ${HF_OVERRIDES_ARG} \ | ||
|
seungrokj marked this conversation as resolved.
seungrokj marked this conversation as resolved.
|
||
| --kv-transfer-config '{\"kv_role\":\"kv_consumer\",\"kv_connector\":\"mooncake\",\"proxy_ip\":\"${host_ip}\",\"handshake_port\":${HANDSHAKE_PORT}}' \ | ||
| --cudagraph-capture-sizes "${CUDAGRAPH_SIZES}" \ | ||
| ${EXTRA_SERVER_ARGS}" | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.