diff --git a/benchmarks/multi_node/amd_utils/detect_ibdevices_bnxt.sh b/benchmarks/multi_node/amd_utils/detect_ibdevices_bnxt.sh new file mode 100755 index 000000000..7aedc0537 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/detect_ibdevices_bnxt.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +# Print comma-separated RDMA HCA names for Broadcom NetXtreme-E (bnxt_re*) ports in state ACTIVE. +# Uses `rdma link` (iproute2 / rdma-core). Prints nothing if no match; exit 0. +# Example line: link bnxt_re0/1 state ACTIVE physical_state LINK_UP netdev ens26np0 + +if ! command -v rdma >/dev/null 2>&1; then + exit 0 +fi + +rdma link 2>/dev/null | awk ' +$1 == "link" && $2 ~ /^bnxt_re/ && $0 ~ /state ACTIVE/ { + split($2, a, "/") + d = a[1] + if (!seen[d]++) printf "%s%s", (n++ ? "," : ""), d +} +END { if (n) print "" }' diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 71d2653bd..aca5da7f9 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -45,6 +45,20 @@ set +x export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES} +# Inter-node NCCL over this bnxt_re RoCE fabric. Only matters for multi-node TP/EP +# workers (e.g. decode TP16 over 2 nodes); inert for the single-node-worker tests +# (1-4), which do no inter-node NCCL. +# NCCL_IB_GID_INDEX=3: use the routable RoCEv2 GID (fd93:16d3:59b6:012*). Without +# it NCCL falls back to RoCEv1 (GID idx 0, link-local) and cross-node comm init +# hangs. +# NCCL_IB_TC=96 / NCCL_IB_SL=3: put NCCL's own RDMA on the PFC-protected lossless +# class (DSCP 24 / priority 3). Without it, NCCL's connection handshake can be +# dropped on the lossy default queue for some node pairs, hanging ncclCommInitRank. +# Override for a cluster with a different GID layout / PFC class. 
+export NCCL_IB_GID_INDEX="${NCCL_IB_GID_INDEX:-3}" +export NCCL_IB_TC="${NCCL_IB_TC:-96}" +export NCCL_IB_SL="${NCCL_IB_SL:-3}" + # ============================================================================= # Engine-specific environment # ============================================================================= @@ -160,7 +174,9 @@ else export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32 export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192 - export MORI_MAX_DISPATCH_TOKENS_DECODE=512 + # 512 undersizes the decode MoRI MoE dispatch buffer for conc-32/EP16, where the + # cross-node all-to-all stalls under load. + export MORI_MAX_DISPATCH_TOKENS_DECODE=4096 export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=32768 export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2703 @@ -180,7 +196,16 @@ else # QoS/DSCP configuration # Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname if [[ -n "$MORI_RDMA_TC" ]]; then - echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)" + # Derive matching SL (VLAN 802.1p / priority) from TC when only TC is set. + # TC is the IP ToS byte (DSCP<<2); priority = DSCP>>3, so SL = TC>>5. + # bnxt_re REJECTS inconsistent DSCP/SL pairs ("Given DSCP N and/or SL M not + # mapping to lossless queue") and SILENTLY downgrades to the lossy best-effort + # queue, which surfaces under load as RETRY_EXC_ERR / stalled KV transfers. + # TC=104 (DSCP 26/AF31) -> SL=3, matching prio-pfc 3:on. Mirror to MoRI IO. 
+ [[ -z "${MORI_RDMA_SL:-}" ]] && export MORI_RDMA_SL=$(( MORI_RDMA_TC >> 5 )) + export MORI_IO_TC="${MORI_IO_TC:-$MORI_RDMA_TC}" + export MORI_IO_SL="${MORI_IO_SL:-$MORI_RDMA_SL}" + echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC MORI_RDMA_SL=$MORI_RDMA_SL MORI_IO_TC=$MORI_IO_TC MORI_IO_SL=$MORI_IO_SL (set by runner or environment)" elif command -v nicctl &> /dev/null; then ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') ND_DSCP=$(nicctl show qos 2>/dev/null| awk -v p="$ND_PRIO" ' diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 605a377be..df8ad68b0 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -135,7 +135,7 @@ DeepSeek-R1-0528: mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: - mem_fraction_static: 0.8 + mem_fraction_static: 0.9 disable_radix_cache: true dp: max_running_requests: 24 diff --git a/benchmarks/multi_node/amd_utils/rebuild_bnxt.sh b/benchmarks/multi_node/amd_utils/rebuild_bnxt.sh new file mode 100644 index 000000000..65e0adbfa --- /dev/null +++ b/benchmarks/multi_node/amd_utils/rebuild_bnxt.sh @@ -0,0 +1,109 @@ +#!/bin/bash +############################################################################### +# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +# +# Optionally rebuild **libbnxt** (userspace libibverbs provider) from Broadcom’s +# source tarball — typical fix for libibverbs / bnxt_re user-kernel ABI mismatch +# inside a container when the kernel module is already supplied by the host. 
+# +# Trigger: +# export REBUILD_BNXT=1 +# +# Required: +# export PATH_TO_BNXT_TAR_PACKAGE=/path/to/libbnxt_re-*.tar.gz +# Path must exist in the environment where this script runs (e.g. /workspace/... in Docker). +# +# Optional: +# export REBUILD_BNXT_RESTORE_DIR=/some/dir # cwd after build (default: directory of this script) +# +# Disagg Docker: set REBUILD_LIBBNXT_IN_CONTAINER=1 and pass PATH_TO_BNXT_TAR_PACKAGE +# (container path; tarballs are often under /workspace/driver/ when kept in InferenceX/driver/). +# Invoked from scripts/_disagg_container_entry.sh (in-container path below). +# +# If `ibv_devinfo` still warns about kernel ABI after rebuild, try the other tarball version +# (e.g. 230.2.52 vs 231.0.162) to match your host bnxt_re kernel module. +# +# Implementation: inline steps (legacy runner/helpers/rebuild_bnxt.sh). +############################################################################### + +set -euo pipefail + +if ! declare -F LOG_INFO_RANK0 >/dev/null 2>&1; then + LOG_INFO_RANK0() { echo "$*"; } +fi + +REBUILD_BNXT="${REBUILD_BNXT:-0}" +PATH_TO_BNXT_TAR_PACKAGE="${PATH_TO_BNXT_TAR_PACKAGE:-}" + +if [[ "${REBUILD_BNXT}" != "1" ]]; then + exit 0 +fi + +if [[ -z "${REBUILD_BNXT_RESTORE_DIR:-}" ]]; then + # Directory to return to after the build (defaults to this script’s directory: amd_utils/). + REBUILD_BNXT_RESTORE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + export REBUILD_BNXT_RESTORE_DIR +fi + +if [[ -z "${PATH_TO_BNXT_TAR_PACKAGE}" ]]; then + LOG_INFO_RANK0 "[hook system] Skip bnxt rebuild: PATH_TO_BNXT_TAR_PACKAGE is unset (REBUILD_BNXT=${REBUILD_BNXT})." + exit 0 +fi +if [[ ! -f "${PATH_TO_BNXT_TAR_PACKAGE}" ]]; then + LOG_INFO_RANK0 "[hook system] Skip bnxt rebuild: tarball not found inside container at ${PATH_TO_BNXT_TAR_PACKAGE}" + LOG_INFO_RANK0 "[hook system] (With -v HOST_REPO:/workspace, the host must have that file under HOST_REPO, e.g. 
InferenceX/driver/libbnxt_re-*.tar.gz on every node that runs Docker.)" + exit 0 +fi + +LOG_INFO_RANK0 "[hook system] REBUILD_BNXT=1 → rebuilding libbnxt from ${PATH_TO_BNXT_TAR_PACKAGE}" + +# Inline implementation (previously runner/helpers/rebuild_bnxt.sh) +tar xzf "${PATH_TO_BNXT_TAR_PACKAGE}" -C /tmp/ +mv /tmp/libbnxt_re-* /tmp/libbnxt + +_inbox="/usr/lib/x86_64-linux-gnu/libibverbs/libbnxt_re-rdmav34.so" +if [[ -f "${_inbox}" ]]; then + mv "${_inbox}" "${_inbox}.inbox" +fi + +cd /tmp/libbnxt/ +if command -v apt-get >/dev/null 2>&1; then + DEBIAN_FRONTEND=noninteractive apt-get update -qq || true + if ! DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \ + autoconf automake libtool pkg-config make gcc; then + LOG_INFO_RANK0 "[hook system] WARN: apt-get install build deps failed (offline image?); autogen may fail." + fi +fi +if ! command -v autoconf >/dev/null 2>&1; then + LOG_INFO_RANK0 "[hook system] ERROR: autoconf not found; install autoconf automake libtool pkg-config make gcc in the image or fix apt." >&2 + exit 1 +fi + +sh ./autogen.sh +./configure +make clean all install + +echo '/usr/local/lib' > /etc/ld.so.conf.d/libbnxt_re.conf +ldconfig + +# Register provider with libibverbs (paths vary by image). 
+mkdir -p /etc/libibverbs.d +cp -f /tmp/libbnxt/bnxt_re.driver /etc/libibverbs.d/ || LOG_INFO_RANK0 "[hook system] WARN: could not copy bnxt_re.driver to /etc/libibverbs.d" +for _verbs_d in /usr/local/etc/libibverbs.d /usr/lib/libibverbs.d; do + if [[ -d "${_verbs_d}" ]]; then + cp -f /tmp/libbnxt/bnxt_re.driver "${_verbs_d}/" || LOG_INFO_RANK0 "[hook system] WARN: could not copy bnxt_re.driver to ${_verbs_d}" + fi +done + +LOG_INFO_RANK0 "[hook system] Installed libbnxt_re libraries:" +ls -la /usr/local/lib/libbnxt_re*.so* 2>/dev/null || LOG_INFO_RANK0 "[hook system] WARN: no libbnxt_re*.so under /usr/local/lib" +if command -v ibv_devinfo >/dev/null 2>&1; then + LOG_INFO_RANK0 "[hook system] ibv_devinfo (first bnxt device, if any):" + ibv_devinfo -d bnxt_re0 2>&1 | head -40 || true +fi + +cd "${REBUILD_BNXT_RESTORE_DIR}" +LOG_INFO_RANK0 "[hook system] Rebuilding libbnxt done." diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh index c28ccab41..13455dc9d 100755 --- a/benchmarks/multi_node/amd_utils/server_sglang.sh +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -211,14 +211,24 @@ fi # When both DP and EP are enabled, override max-running-requests and dispatch tokens if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$DECODE_ENABLE_EP" == "true" ]]; then - decode_max_running_requests=$BENCH_MAX_CONC_VALUE decode_dp_ranks=$DECODE_TP_SIZE - MORI_MAX_DISPATCH_TOKENS_DECODE=$((BENCH_MAX_CONC_VALUE / decode_dp_ranks)) - MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10)) - # Update derived variable - SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) - export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD - echo "[DP+EP override] Decode: max-running-requests=$decode_max_running_requests, DISPATCH_TOKENS=$MORI_MAX_DISPATCH_TOKENS_DECODE, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_DECODE, 
INTER_KERNEL_SWITCH=$SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD" + # max-running-requests is split across DP ranks (sglang's req_to_token_pool is + # per-rank = max_running_requests // dp_ranks). It must be >= dp_ranks, else the + # per-rank pool floors to 0 and get_batch_sizes_to_capture collapses capture_bs + # to [0] (AssertionError). This happens when bench concurrency < dp_ranks. + # Old heuristic set max_running=BENCH_MAX_CONC (=conc) and then shrank the MoRI + # dispatch tokens to max_running/dp_ranks (=2 at conc32/DP16). That gives each DP + # rank only ~2 request slots and a 2-token MoE all-to-all dispatch buffer, which + # starves the cross-node EP path under load (TTFT ~11s, mass timeouts). The patch + # reference instead keeps a generous per-rank pool (conc*TP) and the env.sh MoRI + # dispatch / MOE token sizes (4096 / 2703). Floor at dp_ranks to keep capture_bs>0. + decode_max_running_requests=$((BENCH_MAX_CONC_VALUE * decode_dp_ranks)) + if (( decode_max_running_requests < decode_dp_ranks )); then + decode_max_running_requests=$decode_dp_ranks + fi + # Keep MORI_MAX_DISPATCH_TOKENS_DECODE / MOE_MAX_INPUT / INTER_KERNEL_SWITCH from + # env.sh (do NOT shrink them to max_running/dp_ranks). 
+ echo "[DP+EP override] Decode: max-running-requests=$decode_max_running_requests, DISPATCH_TOKENS=$MORI_MAX_DISPATCH_TOKENS_DECODE (env), MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_DECODE (env)" fi # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) diff --git a/driver/libbnxt_re-237.1.137.0.tar.gz b/driver/libbnxt_re-237.1.137.0.tar.gz new file mode 100644 index 000000000..b5d8bce6e Binary files /dev/null and b/driver/libbnxt_re-237.1.137.0.tar.gz differ diff --git a/scripts/_disagg_container_entry.sh b/scripts/_disagg_container_entry.sh new file mode 100755 index 000000000..fe493ffde --- /dev/null +++ b/scripts/_disagg_container_entry.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# First-stage command inside the disagg Docker image (see _disagg_ssh_remote_inner.sh). +# Optional: rebuild **libbnxt** (userspace libibverbs) from a tarball before server.sh — not DKMS. + +set -euo pipefail + +: "${SLURM_JOB_ID:?}" + +if [[ "${REBUILD_LIBBNXT_IN_CONTAINER:-0}" == "1" ]]; then + export REBUILD_BNXT=1 + export PATH_TO_BNXT_TAR_PACKAGE="${PATH_TO_BNXT_TAR_PACKAGE:?Set PATH_TO_BNXT_TAR_PACKAGE to a path visible in-container (e.g. /workspace/driver/libbnxt_re-*.tar.gz)}" + bash /workspace/benchmarks/multi_node/amd_utils/rebuild_bnxt.sh + # Prefer freshly built libbnxt_re in /usr/local/lib over inbox/shipped providers in /usr/lib. 
+ export LD_LIBRARY_PATH="/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" +fi + +if [[ "${INSTALL_MORI_IN_CONTAINER:-0}" == "1" ]]; then + bash /workspace/scripts/install_mori_in_container.sh + if [[ -f /tmp/mori_pythonpath_prefix ]]; then + _mori_pyp="$(cat /tmp/mori_pythonpath_prefix)" + export PYTHONPATH="${_mori_pyp}${PYTHONPATH:+:${PYTHONPATH}}" + unset _mori_pyp + fi +fi + +mkdir -p "/run_logs/slurm_job-${SLURM_JOB_ID}" +exec bash /workspace/benchmarks/multi_node/amd_utils/server.sh 2>&1 | tee "/run_logs/slurm_job-${SLURM_JOB_ID}/server_$(hostname).log" diff --git a/scripts/_disagg_ssh_remote_inner.sh b/scripts/_disagg_ssh_remote_inner.sh new file mode 100755 index 000000000..e461b35d4 --- /dev/null +++ b/scripts/_disagg_ssh_remote_inner.sh @@ -0,0 +1,147 @@ +#!/usr/bin/env bash +# Run on each benchmark node (invoked via SSH). Starts one Docker container that runs amd_utils/server.sh. +# Environment is supplied by off-repo launchers or scripts/run_dsr1_fp8_mi325x_sglang_disagg_ssh.sh. + +set -euo pipefail + +: "${NODE_RANK:?}" +: "${JOB_ID:?}" +: "${NODE0_ADDR:?}" +: "${IPADDRS:?}" +: "${HOST_MODEL_DIR:?}" +: "${HOST_REPO:?}" +: "${IMAGE:?}" +: "${IBDEVICES:?}" + +# Per-node IBDEVICES sort by RoCE-v2 GID subnet (always on). +# +# Multi-node TP workers (e.g. decode TP16 over 2 nodes) and Mori KV transfer bind +# rank N to IBDEVICES[N]. The kernel does NOT enumerate bnxt_re* in the same PCIe +# order on every node, so the same device NAME can map to different fabric subnets +# across nodes (/sys/class/infiniband/<dev>/ports/1/gids/3, the RoCEv2 GID 4th +# hextet). Sorting devices by that subnet makes "rank N" land on the same fabric +# subnet on every node, which is required for the cross-node decode collective to +# come up (otherwise it hangs at NCCL/QP init).
+# +# Safe for the single-node-worker configs (tests 1-4): they do no multi-node NCCL, +# and their MoRI KV transfer already routes cross-subnet, so reordering the same +# device set is correctness-neutral. On a non-bnxt fabric (no bnxt_re* devices) the +# sort yields nothing and we keep the original launcher-provided IBDEVICES. +_ibdev_orig="${IBDEVICES}" +_ibdev_sorted=$( + for _d in /sys/class/infiniband/bnxt_re*; do + [[ -d "${_d}" ]] || continue + _dev_name=$(basename "${_d}") + _gid=$(cat "${_d}/ports/1/gids/3" 2>/dev/null || true) + _subnet=$(printf '%s' "${_gid}" | cut -d: -f4) + case "${_subnet}" in + 0000|"") continue ;; + esac + printf '%s %s\n' "${_subnet}" "${_dev_name}" + done | LC_ALL=C sort | awk '{print $2}' | paste -sd, - +) +if [[ -n "${_ibdev_sorted}" ]]; then + echo "[ibdev-auto] $(hostname -s): IBDEVICES (orig) = ${_ibdev_orig}" + echo "[ibdev-auto] $(hostname -s): IBDEVICES (sorted) = ${_ibdev_sorted}" + IBDEVICES="${_ibdev_sorted}" +else + echo "[ibdev-auto] $(hostname -s): no bnxt_re devices with subnet found, keeping IBDEVICES=${_ibdev_orig}" >&2 +fi +unset _ibdev_orig _ibdev_sorted + +# Use single-token DOCKER_BIN (default: docker) + USE_SUDO_FOR_DOCKER (default: 1). +# Do not rely on a multi-word "DOCKER=sudo docker" env value — it breaks ssh/env on some setups. 
+: "${DOCKER_BIN:=docker}" +: "${USE_SUDO_FOR_DOCKER:=1}" +HOST_LOG_ROOT="${HOST_LOG_ROOT:-/tmp/inferencex_disagg_logs_${JOB_ID}}" +RUN_LOG_HOST="/tmp/run_logs_${JOB_ID}" + +mkdir -p "${RUN_LOG_HOST}" "${HOST_LOG_ROOT}" + +DOCKER_DEVICES=(--device /dev/kfd) +shopt -s nullglob +for _d in /dev/dri/renderD* /dev/dri/card*; do + DOCKER_DEVICES+=(--device "${_d}") +done +if [[ -d /dev/infiniband ]]; then + for _d in /dev/infiniband/*; do + [[ -e "${_d}" ]] && DOCKER_DEVICES+=(--device "${_d}") + done +fi +shopt -u nullglob + +EXTRA_ARR=() +if [[ -n "${EXTRA_DOCKER_ARGS:-}" ]]; then + # shellcheck disable=SC2206 + EXTRA_ARR=(${EXTRA_DOCKER_ARGS}) +fi + +if [[ "${USE_SUDO_FOR_DOCKER}" == "1" ]]; then + _dcmd=(sudo "${DOCKER_BIN}") +else + _dcmd=("${DOCKER_BIN}") +fi +exec "${_dcmd[@]}" run --rm --init \ + --stop-timeout 10 \ + "${DOCKER_DEVICES[@]}" \ + "${EXTRA_ARR[@]}" \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --network host \ + --ipc host \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v "${HOST_MODEL_DIR}:/models:ro" \ + -v "${HOST_REPO}:/workspace" \ + -v "${RUN_LOG_HOST}:/run_logs" \ + -v "${HOST_LOG_ROOT}:/benchmark_logs" \ + --shm-size "${DOCKER_SHM_SIZE:-128g}" \ + -e SLURM_JOB_ID="${JOB_ID}" \ + -e SLURM_JOB_NODELIST="manual" \ + -e NODE_RANK="${NODE_RANK}" \ + -e NODE0_ADDR="${NODE0_ADDR}" \ + -e NNODES="${NNODES:-2}" \ + -e IPADDRS="${IPADDRS}" \ + -e MODEL_DIR=/models \ + -e MODEL_NAME="${MODEL_NAME}" \ + -e SGLANG_WS_PATH=/workspace/benchmarks/multi_node/amd_utils \ + -e "xP=${xP:-1}" \ + -e "yD=${yD:-1}" \ + -e "GPUS_PER_NODE=${GPUS_PER_NODE:-8}" \ + -e "PREFILL_TP_SIZE=${PREFILL_TP_SIZE}" \ + -e "PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP}" \ + -e "PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP}" \ + -e "DECODE_TP_SIZE=${DECODE_TP_SIZE}" \ + -e "DECODE_ENABLE_EP=${DECODE_ENABLE_EP}" \ + -e "DECODE_ENABLE_DP=${DECODE_ENABLE_DP}" \ + -e "DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0}" \ + -e 
"BENCH_INPUT_LEN=${BENCH_INPUT_LEN}" \ + -e "BENCH_OUTPUT_LEN=${BENCH_OUTPUT_LEN}" \ + -e "BENCH_RANDOM_RANGE_RATIO=${BENCH_RANDOM_RANGE_RATIO}" \ + -e "BENCH_REQUEST_RATE=${BENCH_REQUEST_RATE:-inf}" \ + -e "BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" \ + -e "BENCH_MAX_CONCURRENCY=${BENCH_MAX_CONCURRENCY}" \ + -e "DRY_RUN=${DRY_RUN:-0}" \ + -e "IBDEVICES=${IBDEVICES}" \ + -e "MORI_RDMA_TC=${MORI_RDMA_TC:-}" \ + -e "BENCHMARK_LOGS_DIR=/benchmark_logs" \ + -e "PYTHONDONTWRITEBYTECODE=1" \ + -e "HOST_IP=${HOST_IP:-}" \ + -e "BARRIER_LOCAL_IP=${BARRIER_LOCAL_IP:-}" \ + -e "BARRIER_SYNC_PORT=${BARRIER_SYNC_PORT:-}" \ + -e "SGLANG_PD_PORT=${SGLANG_PD_PORT:-}" \ + -e "ROUTER_PORT=${ROUTER_PORT:-}" \ + -e "REBUILD_LIBBNXT_IN_CONTAINER=${REBUILD_LIBBNXT_IN_CONTAINER:-0}" \ + -e "PATH_TO_BNXT_TAR_PACKAGE=${PATH_TO_BNXT_TAR_PACKAGE:-}" \ + -e "INSTALL_MORI_IN_CONTAINER=${INSTALL_MORI_IN_CONTAINER:-0}" \ + -e "INSTALL_MORI_MODE=${INSTALL_MORI_MODE:-git}" \ + -e "MORI_GIT_URL=${MORI_GIT_URL:-}" \ + -e "MORI_GIT_REF=${MORI_GIT_REF:-}" \ + -e "MORI_GIT_CLONE_DIR=${MORI_GIT_CLONE_DIR:-}" \ + -e "MORI_SOURCE_PATH=${MORI_SOURCE_PATH:-}" \ + -e "INSTALL_MORI_PYTHON_BIN=${INSTALL_MORI_PYTHON_BIN:-}" \ + -e "INSTALL_MORI_NO_BUILD_ISOLATION=${INSTALL_MORI_NO_BUILD_ISOLATION:-0}" \ + "${IMAGE}" \ + bash /workspace/scripts/_disagg_container_entry.sh diff --git a/scripts/install_mori_in_container.sh b/scripts/install_mori_in_container.sh new file mode 100755 index 000000000..4b4ea2f67 --- /dev/null +++ b/scripts/install_mori_in_container.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash +# Optional: build and wire MoRI into the active Python before server.sh (disagg images). +# SGLang often ships MoRI under /sgl-workspace/mori; prepending this tree to PYTHONPATH +# makes the freshly built C++ extension + Python sources win over the vendored copy. 
+# +# Modes (INSTALL_MORI_MODE): +# git — shallow clone from MORI_GIT_URL at ref MORI_GIT_REF into MORI_GIT_CLONE_DIR (default /tmp/mori_git_src) +# path — use existing tree MORI_SOURCE_PATH (default /workspace/mori, e.g. bind-mount ROCm/mori next to InferenceX) +# +# Prereqs: network for git mode; libpci-dev / libibverbs-dev / cmake / ninja / pybind11 (image or apt). +# Example (host, before launcher): +# export INSTALL_MORI_IN_CONTAINER=1 +# export INSTALL_MORI_MODE=git +# export MORI_GIT_REF=main + +set -euo pipefail + +INSTALL_MORI_MODE="${INSTALL_MORI_MODE:-git}" +MORI_GIT_URL="${MORI_GIT_URL:-https://github.com/ROCm/mori.git}" +MORI_GIT_REF="${MORI_GIT_REF:-main}" +MORI_GIT_CLONE_DIR="${MORI_GIT_CLONE_DIR:-/tmp/mori_git_src}" +MORI_SOURCE_PATH="${MORI_SOURCE_PATH:-/workspace/mori}" + +if [[ -x /opt/venv/bin/python3 ]]; then + PY="${INSTALL_MORI_PYTHON_BIN:-/opt/venv/bin/python3}" +else + PY="${INSTALL_MORI_PYTHON_BIN:-$(command -v python3)}" +fi + +echo "[install_mori_in_container] using python: ${PY}" + +SRC="" +if [[ "${INSTALL_MORI_MODE}" == "path" ]]; then + if [[ ! -d "${MORI_SOURCE_PATH}" ]]; then + echo "INSTALL_MORI_MODE=path but MORI_SOURCE_PATH=${MORI_SOURCE_PATH} is missing" >&2 + exit 1 + fi + SRC="${MORI_SOURCE_PATH}" +elif [[ "${INSTALL_MORI_MODE}" == "git" ]]; then + rm -rf "${MORI_GIT_CLONE_DIR}" + git clone --depth 1 --branch "${MORI_GIT_REF}" "${MORI_GIT_URL}" "${MORI_GIT_CLONE_DIR}" || { + echo "[install_mori_in_container] shallow clone failed (wrong ref or need full clone). Try:" >&2 + echo " MORI_GIT_REF= or INSTALL_MORI_MODE=path + mount mori at ${MORI_SOURCE_PATH}" >&2 + exit 1 + } + SRC="${MORI_GIT_CLONE_DIR}" +else + echo "Unknown INSTALL_MORI_MODE=${INSTALL_MORI_MODE} (use git or path)" >&2 + exit 1 +fi + +# Best-effort headers for CMake (Ubuntu/Debian); ignore failures (offline / read-only apt). 
+if command -v apt-get >/dev/null 2>&1; then + apt-get update -qq >/dev/null 2>&1 || true + DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \ + libpci-dev libibverbs-dev cmake ninja-build pybind11-dev git >/dev/null 2>&1 || true +fi + +echo "[install_mori_in_container] pip install -e ${SRC}" +PIP_ARGS=(--no-cache-dir) +if [[ "${INSTALL_MORI_NO_BUILD_ISOLATION:-0}" == "1" ]]; then + PIP_ARGS+=(--no-build-isolation) +fi +"${PY}" -m pip install -e "${SRC}" "${PIP_ARGS[@]}" + +# Parent shell (_disagg_container_entry.sh) prepends this so imports beat /sgl-workspace/mori. +echo -n "${SRC}/python" >/tmp/mori_pythonpath_prefix + +echo "[install_mori_in_container] done; wrote PYTHONPATH prefix to /tmp/mori_pythonpath_prefix"