From 1578f06dca5c0ce2c525f458096e25c1e38e6d6c Mon Sep 17 00:00:00 2001
From: andyluo7
Date: Fri, 19 Jun 2026 16:15:49 -0700
Subject: [PATCH] job.slurm: opt-in model auto-download fallback for SGLang path

The SGLang/disagg model check hard-fails when $MODEL_DIR/$MODEL_NAME is
missing on the allocated nodes, requiring every model to be manually
pre-staged to the shared store. Add an opt-in fallback
(MODEL_AUTODOWNLOAD=1) that fetches the HuggingFace repo ($MODEL) into
the store once and re-checks before failing.

Default behavior is unchanged: without the flag the launcher still
hard-fails fast, so mis-staged models are caught immediately. The
download is serialized across concurrent matrix jobs with flock and
published via atomic rename, so a partial/crashed fetch never satisfies
the existence check.

Co-Authored-By: Claude Opus 4 (1M context)
---
 benchmarks/multi_node/amd_utils/job.slurm | 101 +++++++++++++++++++++--
 1 file changed, 94 insertions(+), 7 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index c542280c4..a072c280d 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -203,14 +203,101 @@ else
         fi
     }
 
-    if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then
-        MODEL_PATH="$MODEL_DIR/$MODEL_NAME"
-        echo "Selected MODEL_PATH: $MODEL_PATH (available on all nodes)"
-    else
-        echo "FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in:"
-        echo " - $MODEL_DIR/$MODEL_NAME"
-        exit 1
+    # Opt-in auto-download fallback. Fetches the HuggingFace repo into the
+    # shared model store once, atomically, when the model is missing. The store
+    # ($MODEL_DIR) is a shared mount visible on every allocated node, so a single
+    # download on the batch host populates all of them. flock serializes
+    # concurrent matrix jobs targeting the same path; the download lands in a
+    # temp dir and is published with an atomic rename so a crashed/partial fetch
+    # never satisfies the existence check.
+    #
+    # Globals:   MODEL (read) - HuggingFace repo id; $2 is used when unset/empty
+    # Arguments: $1 - model store directory
+    #            $2 - model directory name inside the store
+    # Returns:   0 when $1/$2 was published here or already staged by another
+    #            job; non-zero on any failure
+    autodownload_model() {
+        local model_dir=$1
+        local model_name=$2
+        local repo_id="${MODEL:-$model_name}"
+        local dst="$model_dir/$model_name"
+        local lock="$model_dir/.$model_name.download.lock"
+        local tmp="$model_dir/.$model_name.tmp.$$"
+        local hf_cli=""
+
+        if [[ "$repo_id" != */* ]]; then
+            echo "[autodownload] MODEL='$repo_id' is not an 'org/name' repo id; cannot download. Set MODEL=org/name." >&2
+            return 1
+        fi
+
+        echo "[autodownload] Model missing; fetching '$repo_id' -> $dst"
+        mkdir -p "$model_dir" || { echo "[autodownload] cannot create $model_dir" >&2; return 1; }
+
+        # The critical section runs in a subshell that owns fd 200, so the lock
+        # is released on every exit path and the descriptor does not stay open
+        # for the rest of the job. `exit` below leaves only this subshell; its
+        # status becomes the function's return status.
+        (
+            flock 200 || { echo "[autodownload] cannot lock $lock" >&2; exit 1; }
+            if [[ -d "$dst" ]]; then
+                echo "[autodownload] Another job already staged $dst; skipping."
+                exit 0
+            fi
+
+            # Some lean runtime images ship without the HF CLI. Prefer `hf`, accept
+            # the older `huggingface-cli` entry point, and install only when neither
+            # is on PATH. Doing this after the staged check means a job that finds
+            # the model already published never touches pip.
+            hf_cli=$(command -v hf || command -v huggingface-cli) || hf_cli=""
+            if [[ -z "$hf_cli" ]]; then
+                local pip_install=(python3 -m pip install -q)
+                python3 -m pip install --help 2>/dev/null | grep -q -- "--break-system-packages" \
+                    && pip_install+=(--break-system-packages)
+                "${pip_install[@]}" "huggingface_hub[cli]>=0.25.0" || {
+                    echo "[autodownload] failed to install huggingface_hub CLI" >&2; exit 1; }
+                # pip may place the entry point outside PATH (e.g. a user site).
+                hf_cli=$(command -v hf || command -v huggingface-cli) || {
+                    echo "[autodownload] HF CLI not on PATH after install" >&2; exit 1; }
+            fi
+
+            rm -rf -- "$tmp"
+            if ! "$hf_cli" download "$repo_id" --local-dir "$tmp"; then
+                echo "[autodownload] hf download failed for '$repo_id'" >&2
+                rm -rf -- "$tmp"
+                exit 1
+            fi
+            # Publish only after a complete fetch; a failed rename is reported
+            # instead of being announced as success.
+            if ! mv -- "$tmp" "$dst"; then
+                echo "[autodownload] cannot publish $tmp as $dst" >&2
+                rm -rf -- "$tmp"
+                exit 1
+            fi
+            echo "[autodownload] Published $dst"
+        ) 200>"$lock"
+    }
+
+    if ! check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then
+        # Default (MODEL_AUTODOWNLOAD unset/false): hard-fail so a mis-staged
+        # model is caught immediately. When enabled, fetch from HuggingFace and
+        # re-check before giving up.
+        if [[ "${MODEL_AUTODOWNLOAD:-false}" == "true" || "${MODEL_AUTODOWNLOAD:-0}" == "1" ]]; then
+            # The re-check below decides success; `|| true` keeps an errexit
+            # shell (if enabled) alive long enough to print the FATAL message.
+            autodownload_model "$MODEL_DIR" "$MODEL_NAME" || true
+            if ! check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then
+                echo "FATAL ERROR: Model '$MODEL_NAME' still not found on all nodes after auto-download"
+                exit 1
+            fi
+        else
+            echo "FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in:"
+            echo " - $MODEL_DIR/$MODEL_NAME"
+            echo " (set MODEL_AUTODOWNLOAD=1 to fetch '${MODEL:-}' from HuggingFace automatically)"
+            exit 1
+        fi
     fi
+    MODEL_PATH="$MODEL_DIR/$MODEL_NAME"
+    echo "Selected MODEL_PATH: $MODEL_PATH (available on all nodes)"
 
     echo "Final MODEL_PATH: $MODEL_PATH"
 fi