diff --git a/.github/workflows/build-gfx11-rocm.yml b/.github/workflows/build-gfx11-rocm.yml
index 905f40bc7bfc..6dabeabb5cd0 100644
--- a/.github/workflows/build-gfx11-rocm.yml
+++ b/.github/workflows/build-gfx11-rocm.yml
@@ -32,25 +32,12 @@ env:
 jobs:
   build-ubuntu:
     runs-on: ubuntu-24.04
-    strategy:
-      matrix:
-        include:
-          - gfx_target: gfx1151
-            s3_target: gfx1151
-            gpu_targets: gfx1151
-          - gfx_target: gfx1150
-            s3_target: gfx1150
-            gpu_targets: gfx1150
-          - gfx_target: gfx1153
-            s3_target: gfx1153
-            gpu_targets: gfx1153
-          # Hawk Point / Phoenix family (Radeon 760M/780M = gfx1103) ships only
-          # in the gfx110X-all bundle, which also covers desktop RDNA3
-          # (RX 7900/7800/7600). Build+release only — no on-hardware test runner.
-          - gfx_target: gfx110X
-            s3_target: gfx110X-all
-            gpu_targets: gfx1100;gfx1101;gfx1102;gfx1103
-      fail-fast: false
+    # Single multiarch build: one fat binary covering all current CI arches,
+    # sourced from TheRock's multiarch tarball (arch-neutral host + per-arch
+    # Tensile DBs). gfx1100-1103 (RDNA3 desktop + Hawk Point/Phoenix 760M/780M),
+    # gfx1150/1151/1153 (RDNA3.5 Strix APUs).
+    env:
+      GPU_TARGETS: gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151;gfx1153
     outputs:
       rocm_version: ${{ steps.set-outputs.outputs.rocm_version }}
       llamacpp_commit_hash: ${{ steps.set-outputs.outputs.llamacpp_commit_hash }}
@@ -84,16 +71,18 @@ jobs:
           ninja --version
           echo "Build dependencies installation completed"
 
-      - name: Download and extract ROCm directly to /opt/rocm
+      - name: Download and extract multiarch ROCm directly to /opt/rocm
         run: |
           rocm_version="${{ env.ROCM_VERSION }}"
-          s3_target="${{ matrix.s3_target }}"
+          base_url="https://rocm.nightlies.amd.com/tarball-multi-arch"
 
           if [ "$rocm_version" = "latest" ]; then
-            echo "Auto-detecting latest ROCm version for target: $s3_target"
-            s3_response=$(curl -s "https://therock-nightly-tarball.s3.amazonaws.com/?prefix=therock-dist-linux-${s3_target}-7")
-
-            files=$(echo "$s3_response" | grep -oP '(?<=<Key>)[^<]*' | grep "therock-dist-linux-${s3_target}-")
+            echo "Auto-detecting latest multiarch ROCm version"
+            # The multiarch host serves an HTML index (not S3 XML); scrape the
+            # multiarch tarball names from it.
+            files=$(curl -fsSL "$base_url/" \
+              | grep -oE 'therock-dist-linux-multiarch-[0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+\.tar\.gz' \
+              | sort -u)
 
             latest_file=""
             latest_major=0
@@ -103,7 +92,7 @@ jobs:
             latest_is_alpha=false
 
             while IFS= read -r file; do
-              if [[ "$file" =~ therock-dist-linux-${s3_target}-.*?([0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+)\.tar\.gz ]]; then
+              if [[ "$file" =~ therock-dist-linux-multiarch-([0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+)\.tar\.gz ]]; then
                 version="${BASH_REMATCH[1]}"
                 major=$(echo "$version" | cut -d. -f1)
                 minor=$(echo "$version" | cut -d. -f2)
@@ -142,25 +131,47 @@ jobs:
 
             echo "Found latest file: $latest_file"
 
-            if [[ "$latest_file" =~ therock-dist-linux-${s3_target}-.*?([0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+)\.tar\.gz ]]; then
+            if [[ "$latest_file" =~ therock-dist-linux-multiarch-([0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+)\.tar\.gz ]]; then
               rocm_version="${BASH_REMATCH[1]}"
               echo "Detected latest ROCm version: $rocm_version"
             else
               echo "Failed to extract ROCm version from latest file: $latest_file"
-              echo "Expected pattern: therock-dist-linux-${s3_target}-*.tar.gz"
+              echo "Expected pattern: therock-dist-linux-multiarch-<version>.tar.gz"
               exit 1
             fi
-
-            rocm_url="https://therock-nightly-tarball.s3.amazonaws.com/$latest_file"
-          else
-            rocm_url="https://therock-nightly-tarball.s3.amazonaws.com/therock-dist-linux-${s3_target}-${rocm_version}.tar.gz"
           fi
+          rocm_url="$base_url/therock-dist-linux-multiarch-${rocm_version}.tar.gz"
 
           echo "DETECTED_ROCM_VERSION=$rocm_version" >> $GITHUB_ENV
 
-          echo "Streaming ROCm from: $rocm_url directly to extraction"
+          # The multiarch tarball (~11.5 GB) ships device code for ALL 26 GPU
+          # arches: every arch's .kpack plus per-arch rocBLAS/hipBLASLt Tensile DBs.
+          # This consumer build only needs the 7 RDNA3/3.5 arches in GPU_TARGETS and
+          # uses the GEMM (Tensile) path, which works without .kpack files. So we
+          # stream-extract and prune at the tar level: drop ALL .kpack, and drop the
+          # Tensile DBs of every arch not in our target set. This keeps the runner
+          # disk footprint small (the 11.5 GB is streamed, never stored) and yields
+          # a lean multiarch package. tar matches --exclude on pre-strip member
+          # names, hence the leading "./".
+          drop_arches="gfx900 gfx906 gfx908 gfx90a gfx942 gfx950 \
+            gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036 \
+            gfx1152 gfx1200 gfx1201"
+          excludes=(--exclude='./.kpack' --exclude='./.kpack/*')
+          for a in $drop_arches; do
+            excludes+=("--exclude=./lib/*/library/${a}")
+            excludes+=("--exclude=./lib/*/library/${a}/*")
+            excludes+=("--exclude=./lib/*/library/*${a}*")
+          done
+
+          echo "Streaming multiarch ROCm from: $rocm_url (pruning .kpack + non-target arches)"
           sudo mkdir -p /opt/rocm
-          curl -sL "$rocm_url" | sudo tar --use-compress-program=gzip -xf - -C /opt/rocm --strip-components=1
+          curl -fsSL "$rocm_url" | sudo tar --use-compress-program=gzip -xf - \
+            -C /opt/rocm --strip-components=1 "${excludes[@]}"
+
+          echo "Retained rocBLAS Tensile arch dirs:"
+          ls /opt/rocm/lib/rocblas/library/ 2>/dev/null || echo "(none)"
+          echo "Retained hipBLASLt Tensile arch dirs:"
+          ls /opt/rocm/lib/hipblaslt/library/ 2>/dev/null || echo "(none)"
 
       - name: Set ROCm environment variables
         run: |
@@ -189,9 +200,8 @@ jobs:
 
       - name: Build Llama.cpp + ROCm
         run: |
-          current_target="${{ matrix.gfx_target }}"
-          gpu_targets="${{ matrix.gpu_targets }}"
-          echo "Building for target: $current_target (GPU_TARGETS=$gpu_targets)"
+          gpu_targets="${{ env.GPU_TARGETS }}"
+          echo "Building multiarch binary (GPU_TARGETS=$gpu_targets)"
 
           mkdir build
           cd build
@@ -300,7 +310,7 @@ jobs:
       - name: Upload build artifacts
         uses: actions/upload-artifact@v4
         with:
-          name: llama-ubuntu-rocm-${{ matrix.gfx_target }}-x64
+          name: llama-ubuntu-rocm-multiarch-x64
           path: build/bin/
           retention-days: 30
 
@@ -316,19 +326,10 @@ jobs:
   test-gfx:
     needs: build-ubuntu
     if: needs.build-ubuntu.result == 'success'
-    runs-on: ${{ matrix.runner }}
-    strategy:
-      matrix:
-        include:
-          - gfx_target: gfx1151
-            runner: linux-gfx1151-gpu-rocm
-          # gfx1150 test temporarily disabled: the linux-gfx1150-gpu-rocm
-          # runner (Bangalore box) hangs in llama-cli GPU inference while the
-          # identical artifact/command passes on gfx1151 in seconds. Re-enable
-          # once the runner's GPU/driver issue is resolved.
-          # - gfx_target: gfx1150
-          #   runner: linux-gfx1150-gpu-rocm
-      fail-fast: false
+    # Single hardware test of the multiarch artifact on gfx1151. This is the
+    # end-to-end safety net for the Tensile-only multiarch package: a real
+    # llama-cli inference exercising the rocBLAS/hipBLASLt GEMM path on-device.
+    runs-on: linux-gfx1151-gpu-rocm
 
     steps:
       - name: Checkout repository
@@ -337,7 +338,7 @@ jobs:
       - name: Download build artifacts
         uses: actions/download-artifact@v4
         with:
-          name: llama-ubuntu-rocm-${{ matrix.gfx_target }}-x64
+          name: llama-ubuntu-rocm-multiarch-x64
           path: llama-binaries
 
       - name: Download test model
@@ -380,7 +381,7 @@ jobs:
           # Use a prompt with a single correct answer and greedy decoding
           # (--temp 0) so the result is deterministic and verifiable.
           prompt="What is 2 + 2? Reply with only the number."
-          echo "Running llama-cli test for ${{ matrix.gfx_target }}..."
+          echo "Running llama-cli test for gfx1151 (multiarch artifact)..."
           echo "Command: $llama_cli_path -m \"$model_path\" -ngl 99 --temp 0 -p \"$prompt\" -st -v"
 
           # Bound the run: a healthy 0.6B inference finishes in seconds. If the
@@ -469,12 +470,11 @@ jobs:
       contents: write
     # Publish only on the nightly dispatch (external cron passes
     # -f create_release=true). Push/PR and manual runs never release.
-    # Require the build to succeed and tests to pass-or-skip (gfx1150 test is
-    # currently skipped; its build artifact is still published).
+    # Require the build to succeed and the gfx1151 hardware test to pass.
     if: |
       always() &&
       needs.build-ubuntu.result == 'success' &&
-      (needs.test-gfx.result == 'success' || needs.test-gfx.result == 'skipped') &&
+      needs.test-gfx.result == 'success' &&
       github.event_name == 'workflow_dispatch' &&
       github.event.inputs.create_release == 'true'
     steps:
@@ -505,21 +505,20 @@ jobs:
           fi
           echo "Release tag: $TAG"
 
-      - name: Create per-target archives
+      - name: Create multiarch archive
         if: steps.generate-tag.outputs.tag_exists == 'false'
         run: |
           TAG="${{ steps.generate-tag.outputs.tag }}"
           root="$PWD"
-          for target in gfx1151 gfx1150 gfx1153 gfx110X; do
-            artifact_dir="./all-artifacts/llama-ubuntu-rocm-${target}-x64"
-            archive="llama-${TAG}-ubuntu-rocm-${target}-x64"
-            if [ -d "$artifact_dir" ]; then
-              echo "Creating ${archive}.tar.gz"
-              tar -czf "$root/${archive}.tar.gz" -C "$artifact_dir" .
-            else
-              echo "Warning: artifact dir not found: $artifact_dir"
-            fi
-          done
+          artifact_dir="./all-artifacts/llama-ubuntu-rocm-multiarch-x64"
+          archive="llama-${TAG}-ubuntu-rocm-multiarch-x64"
+          if [ -d "$artifact_dir" ]; then
+            echo "Creating ${archive}.tar.gz"
+            tar -czf "$root/${archive}.tar.gz" -C "$artifact_dir" .
+          else
+            echo "ERROR: artifact dir not found: $artifact_dir"
+            exit 1
+          fi
           ls -la *.tar.gz
 
       - name: Create GitHub Release
@@ -535,10 +534,10 @@ jobs:
             --title "$TAG" \
             --notes "**Build**: $TAG
           **OS**: ubuntu
-          **GPU Target(s)**: gfx1151, gfx1150, gfx1153, gfx110X (gfx1100/1101/1102/1103)
-          **ROCm Version**: $ROCM_VERSION
+          **GPU Target(s)**: gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1153 (single multiarch binary)
+          **ROCm Version**: $ROCM_VERSION (multiarch)
           **Llama.cpp Commit**: $LLAMACPP_COMMIT_HASH
           **Build Date**: $(date -u '+%Y-%m-%d %H:%M:%S UTC')
 
-          Prebuilt llama.cpp ROCm binaries for the RDNA3.5 gfx115x APUs (gfx1151/gfx1150/gfx1153) and the RDNA3 gfx110X family incl. Hawk Point/Phoenix (Radeon 760M/780M), with ROCm runtime libraries bundled." \
+          Prebuilt llama.cpp ROCm binaries — one multiarch package covering the RDNA3 gfx110X family incl. Hawk Point/Phoenix (Radeon 760M/780M) and the RDNA3.5 gfx115x APUs (gfx1150/gfx1151/gfx1153). Built from TheRock's multiarch ROCm runtime, pruned to the CI target arches (per-arch Tensile databases bundled)." \
             *.tar.gz