From 46d7a95179135a2afa4060a11956d82958db1279 Mon Sep 17 00:00:00 2001 From: Jim Wu Date: Mon, 15 Jun 2026 18:05:12 -0600 Subject: [PATCH 1/3] ci(gfx11): build one universal multi-arch ROCm package Collapse the 4-leg per-family matrix (gfx1151/gfx1150/gfx1153/gfx110X) into a single build sourced from TheRock's multi-arch tarball. One fat binary covers all current CI arches (gfx1100-1103, gfx1150/1151/1153) and ships as one universal release archive instead of four mostly-duplicate per-family archives. The multi-arch tarball is streamed and pruned at the tar level: drop all .kpack and the Tensile DBs of every non-target arch. The GEMM path llama.cpp uses works from the per-arch Tensile DB alone (validated on gfx1151 hardware: rocBLAS sgemm succeeds with ROCM_KPACK_DISABLE=1), so no .kpack files are bundled. The gfx1151 hardware test job is the end-to-end safety net. Co-Authored-By: Claude Opus 4 --- .github/workflows/build-gfx11-rocm.yml | 139 ++++++++++++------------- 1 file changed, 69 insertions(+), 70 deletions(-) diff --git a/.github/workflows/build-gfx11-rocm.yml b/.github/workflows/build-gfx11-rocm.yml index 905f40bc7bfc..cdc509286b66 100644 --- a/.github/workflows/build-gfx11-rocm.yml +++ b/.github/workflows/build-gfx11-rocm.yml @@ -32,25 +32,12 @@ env: jobs: build-ubuntu: runs-on: ubuntu-24.04 - strategy: - matrix: - include: - - gfx_target: gfx1151 - s3_target: gfx1151 - gpu_targets: gfx1151 - - gfx_target: gfx1150 - s3_target: gfx1150 - gpu_targets: gfx1150 - - gfx_target: gfx1153 - s3_target: gfx1153 - gpu_targets: gfx1153 - # Hawk Point / Phoenix family (Radeon 760M/780M = gfx1103) ships only - # in the gfx110X-all bundle, which also covers desktop RDNA3 - # (RX 7900/7800/7600). Build+release only — no on-hardware test runner. 
- - gfx_target: gfx110X - s3_target: gfx110X-all - gpu_targets: gfx1100;gfx1101;gfx1102;gfx1103 - fail-fast: false + # Single universal build: one fat binary covering all current CI arches, + # sourced from TheRock's multi-arch tarball (arch-neutral host + per-arch + # Tensile DBs). gfx1100-1103 (RDNA3 desktop + Hawk Point/Phoenix 760M/780M), + # gfx1150/1151/1153 (RDNA3.5 Strix APUs). + env: + GPU_TARGETS: gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151;gfx1153 outputs: rocm_version: ${{ steps.set-outputs.outputs.rocm_version }} llamacpp_commit_hash: ${{ steps.set-outputs.outputs.llamacpp_commit_hash }} @@ -84,16 +71,18 @@ jobs: ninja --version echo "Build dependencies installation completed" - - name: Download and extract ROCm directly to /opt/rocm + - name: Download and extract multi-arch ROCm directly to /opt/rocm run: | rocm_version="${{ env.ROCM_VERSION }}" - s3_target="${{ matrix.s3_target }}" + base_url="https://rocm.nightlies.amd.com/tarball-multi-arch" if [ "$rocm_version" = "latest" ]; then - echo "Auto-detecting latest ROCm version for target: $s3_target" - s3_response=$(curl -s "https://therock-nightly-tarball.s3.amazonaws.com/?prefix=therock-dist-linux-${s3_target}-7") - - files=$(echo "$s3_response" | grep -oP '(?<=)[^<]*' | grep "therock-dist-linux-${s3_target}-") + echo "Auto-detecting latest multi-arch ROCm version" + # The multi-arch host serves an HTML index (not S3 XML); scrape the + # multiarch tarball names from it. + files=$(curl -s "$base_url/" \ + | grep -oE 'therock-dist-linux-multiarch-[0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+\.tar\.gz' \ + | sort -u) latest_file="" latest_major=0 @@ -103,7 +92,7 @@ jobs: latest_is_alpha=false while IFS= read -r file; do - if [[ "$file" =~ therock-dist-linux-${s3_target}-.*?([0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+)\.tar\.gz ]]; then + if [[ "$file" =~ therock-dist-linux-multiarch-([0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+)\.tar\.gz ]]; then version="${BASH_REMATCH[1]}" major=$(echo "$version" | cut -d. 
-f1) minor=$(echo "$version" | cut -d. -f2) @@ -142,25 +131,47 @@ jobs: echo "Found latest file: $latest_file" - if [[ "$latest_file" =~ therock-dist-linux-${s3_target}-.*?([0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+)\.tar\.gz ]]; then + if [[ "$latest_file" =~ therock-dist-linux-multiarch-([0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+)\.tar\.gz ]]; then rocm_version="${BASH_REMATCH[1]}" echo "Detected latest ROCm version: $rocm_version" else echo "Failed to extract ROCm version from latest file: $latest_file" - echo "Expected pattern: therock-dist-linux-${s3_target}-*.tar.gz" + echo "Expected pattern: therock-dist-linux-multiarch-.tar.gz" exit 1 fi - - rocm_url="https://therock-nightly-tarball.s3.amazonaws.com/$latest_file" - else - rocm_url="https://therock-nightly-tarball.s3.amazonaws.com/therock-dist-linux-${s3_target}-${rocm_version}.tar.gz" fi + rocm_url="$base_url/therock-dist-linux-multiarch-${rocm_version}.tar.gz" echo "DETECTED_ROCM_VERSION=$rocm_version" >> $GITHUB_ENV - echo "Streaming ROCm from: $rocm_url directly to extraction" + # The multi-arch tarball (~11.5 GB) ships device code for ALL 26 GPU + # arches: every arch's .kpack plus per-arch rocBLAS/hipBLASLt Tensile DBs. + # This consumer build only needs the 7 RDNA3/3.5 arches in GPU_TARGETS and + # uses the GEMM (Tensile) path, which works without .kpack files. So we + # stream-extract and prune at the tar level: drop ALL .kpack, and drop the + # Tensile DBs of every arch not in our target set. This keeps the runner + # disk footprint small (the 11.5 GB is streamed, never stored) and yields + # a lean universal package. tar matches --exclude on pre-strip member + # names, hence the leading "./". 
+ drop_arches="gfx900 gfx906 gfx908 gfx90a gfx942 gfx950 \ + gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036 \ + gfx1152 gfx1200 gfx1201" + excludes=(--exclude='./.kpack' --exclude='./.kpack/*') + for a in $drop_arches; do + excludes+=("--exclude=./lib/*/library/${a}") + excludes+=("--exclude=./lib/*/library/${a}/*") + excludes+=("--exclude=./lib/*/library/*${a}*") + done + + echo "Streaming multi-arch ROCm from: $rocm_url (pruning .kpack + non-target arches)" sudo mkdir -p /opt/rocm - curl -sL "$rocm_url" | sudo tar --use-compress-program=gzip -xf - -C /opt/rocm --strip-components=1 + curl -sL "$rocm_url" | sudo tar --use-compress-program=gzip -xf - \ + -C /opt/rocm --strip-components=1 "${excludes[@]}" + + echo "Retained rocBLAS Tensile arch dirs:" + ls /opt/rocm/lib/rocblas/library/ 2>/dev/null || echo "(none)" + echo "Retained hipBLASLt Tensile arch dirs:" + ls /opt/rocm/lib/hipblaslt/library/ 2>/dev/null || echo "(none)" - name: Set ROCm environment variables run: | @@ -189,9 +200,8 @@ jobs: - name: Build Llama.cpp + ROCm run: | - current_target="${{ matrix.gfx_target }}" - gpu_targets="${{ matrix.gpu_targets }}" - echo "Building for target: $current_target (GPU_TARGETS=$gpu_targets)" + gpu_targets="${{ env.GPU_TARGETS }}" + echo "Building universal binary (GPU_TARGETS=$gpu_targets)" mkdir build cd build @@ -300,7 +310,7 @@ jobs: - name: Upload build artifacts uses: actions/upload-artifact@v4 with: - name: llama-ubuntu-rocm-${{ matrix.gfx_target }}-x64 + name: llama-ubuntu-rocm-universal-x64 path: build/bin/ retention-days: 30 @@ -316,19 +326,10 @@ jobs: test-gfx: needs: build-ubuntu if: needs.build-ubuntu.result == 'success' - runs-on: ${{ matrix.runner }} - strategy: - matrix: - include: - - gfx_target: gfx1151 - runner: linux-gfx1151-gpu-rocm - # gfx1150 test temporarily disabled: the linux-gfx1150-gpu-rocm - # runner (Bangalore box) hangs in llama-cli GPU inference while the - # identical artifact/command passes on gfx1151 
in seconds. Re-enable - # once the runner's GPU/driver issue is resolved. - # - gfx_target: gfx1150 - # runner: linux-gfx1150-gpu-rocm - fail-fast: false + # Single hardware test of the universal artifact on gfx1151. This is the + # end-to-end safety net for the Tensile-only multi-arch package: a real + # llama-cli inference exercising the rocBLAS/hipBLASLt GEMM path on-device. + runs-on: linux-gfx1151-gpu-rocm steps: - name: Checkout repository @@ -337,7 +338,7 @@ jobs: - name: Download build artifacts uses: actions/download-artifact@v4 with: - name: llama-ubuntu-rocm-${{ matrix.gfx_target }}-x64 + name: llama-ubuntu-rocm-universal-x64 path: llama-binaries - name: Download test model @@ -380,7 +381,7 @@ jobs: # Use a prompt with a single correct answer and greedy decoding # (--temp 0) so the result is deterministic and verifiable. prompt="What is 2 + 2? Reply with only the number." - echo "Running llama-cli test for ${{ matrix.gfx_target }}..." + echo "Running llama-cli test for gfx1151 (universal artifact)..." echo "Command: $llama_cli_path -m \"$model_path\" -ngl 99 --temp 0 -p \"$prompt\" -st -v" # Bound the run: a healthy 0.6B inference finishes in seconds. If the @@ -469,12 +470,11 @@ jobs: contents: write # Publish only on the nightly dispatch (external cron passes # -f create_release=true). Push/PR and manual runs never release. - # Require the build to succeed and tests to pass-or-skip (gfx1150 test is - # currently skipped; its build artifact is still published). + # Require the build to succeed and the gfx1151 hardware test to pass. 
if: | always() && needs.build-ubuntu.result == 'success' && - (needs.test-gfx.result == 'success' || needs.test-gfx.result == 'skipped') && + needs.test-gfx.result == 'success' && github.event_name == 'workflow_dispatch' && github.event.inputs.create_release == 'true' steps: @@ -505,21 +505,20 @@ jobs: fi echo "Release tag: $TAG" - - name: Create per-target archives + - name: Create universal archive if: steps.generate-tag.outputs.tag_exists == 'false' run: | TAG="${{ steps.generate-tag.outputs.tag }}" root="$PWD" - for target in gfx1151 gfx1150 gfx1153 gfx110X; do - artifact_dir="./all-artifacts/llama-ubuntu-rocm-${target}-x64" - archive="llama-${TAG}-ubuntu-rocm-${target}-x64" - if [ -d "$artifact_dir" ]; then - echo "Creating ${archive}.tar.gz" - tar -czf "$root/${archive}.tar.gz" -C "$artifact_dir" . - else - echo "Warning: artifact dir not found: $artifact_dir" - fi - done + artifact_dir="./all-artifacts/llama-ubuntu-rocm-universal-x64" + archive="llama-${TAG}-ubuntu-rocm-universal-x64" + if [ -d "$artifact_dir" ]; then + echo "Creating ${archive}.tar.gz" + tar -czf "$root/${archive}.tar.gz" -C "$artifact_dir" . + else + echo "ERROR: artifact dir not found: $artifact_dir" + exit 1 + fi ls -la *.tar.gz - name: Create GitHub Release @@ -535,10 +534,10 @@ jobs: --title "$TAG" \ --notes "**Build**: $TAG **OS**: ubuntu - **GPU Target(s)**: gfx1151, gfx1150, gfx1153, gfx110X (gfx1100/1101/1102/1103) - **ROCm Version**: $ROCM_VERSION + **GPU Target(s)**: gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1153 (single universal binary) + **ROCm Version**: $ROCM_VERSION (multi-arch) **Llama.cpp Commit**: $LLAMACPP_COMMIT_HASH **Build Date**: $(date -u '+%Y-%m-%d %H:%M:%S UTC') - Prebuilt llama.cpp ROCm binaries for the RDNA3.5 gfx115x APUs (gfx1151/gfx1150/gfx1153) and the RDNA3 gfx110X family incl. Hawk Point/Phoenix (Radeon 760M/780M), with ROCm runtime libraries bundled." 
\ + Prebuilt llama.cpp ROCm binaries — one universal package covering the RDNA3 gfx110X family incl. Hawk Point/Phoenix (Radeon 760M/780M) and the RDNA3.5 gfx115x APUs (gfx1150/gfx1151/gfx1153). Built from TheRock's multi-arch ROCm runtime (per-arch Tensile databases bundled)." \ *.tar.gz From 4483e1b34c1e74df577d6cd1c2cd6bd6e15d8a86 Mon Sep 17 00:00:00 2001 From: Jim Wu Date: Wed, 17 Jun 2026 08:32:22 -0600 Subject: [PATCH 2/3] ci(gfx11): rename "universal" package to "multiarch" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "universal" vs "multi-arch" wording was redundant — both describe one package covering many arches. Standardize on "multiarch" to match TheRock's upstream vocabulary. Renames the artifact/archive to llama-${TAG}-ubuntu-rocm-multiarch-x64 and updates comments + release body. Co-Authored-By: Claude Opus 4 --- .github/workflows/build-gfx11-rocm.yml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/build-gfx11-rocm.yml b/.github/workflows/build-gfx11-rocm.yml index cdc509286b66..5d9ce17fa39a 100644 --- a/.github/workflows/build-gfx11-rocm.yml +++ b/.github/workflows/build-gfx11-rocm.yml @@ -32,7 +32,7 @@ env: jobs: build-ubuntu: runs-on: ubuntu-24.04 - # Single universal build: one fat binary covering all current CI arches, + # Single multiarch build: one fat binary covering all current CI arches, # sourced from TheRock's multi-arch tarball (arch-neutral host + per-arch # Tensile DBs). gfx1100-1103 (RDNA3 desktop + Hawk Point/Phoenix 760M/780M), # gfx1150/1151/1153 (RDNA3.5 Strix APUs). @@ -151,7 +151,7 @@ jobs: # stream-extract and prune at the tar level: drop ALL .kpack, and drop the # Tensile DBs of every arch not in our target set. This keeps the runner # disk footprint small (the 11.5 GB is streamed, never stored) and yields - # a lean universal package. tar matches --exclude on pre-strip member + # a lean multiarch package. 
tar matches --exclude on pre-strip member # names, hence the leading "./". drop_arches="gfx900 gfx906 gfx908 gfx90a gfx942 gfx950 \ gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036 \ @@ -201,7 +201,7 @@ jobs: - name: Build Llama.cpp + ROCm run: | gpu_targets="${{ env.GPU_TARGETS }}" - echo "Building universal binary (GPU_TARGETS=$gpu_targets)" + echo "Building multiarch binary (GPU_TARGETS=$gpu_targets)" mkdir build cd build @@ -310,7 +310,7 @@ jobs: - name: Upload build artifacts uses: actions/upload-artifact@v4 with: - name: llama-ubuntu-rocm-universal-x64 + name: llama-ubuntu-rocm-multiarch-x64 path: build/bin/ retention-days: 30 @@ -326,7 +326,7 @@ jobs: test-gfx: needs: build-ubuntu if: needs.build-ubuntu.result == 'success' - # Single hardware test of the universal artifact on gfx1151. This is the + # Single hardware test of the multiarch artifact on gfx1151. This is the # end-to-end safety net for the Tensile-only multi-arch package: a real # llama-cli inference exercising the rocBLAS/hipBLASLt GEMM path on-device. runs-on: linux-gfx1151-gpu-rocm @@ -338,7 +338,7 @@ jobs: - name: Download build artifacts uses: actions/download-artifact@v4 with: - name: llama-ubuntu-rocm-universal-x64 + name: llama-ubuntu-rocm-multiarch-x64 path: llama-binaries - name: Download test model @@ -381,7 +381,7 @@ jobs: # Use a prompt with a single correct answer and greedy decoding # (--temp 0) so the result is deterministic and verifiable. prompt="What is 2 + 2? Reply with only the number." - echo "Running llama-cli test for gfx1151 (universal artifact)..." + echo "Running llama-cli test for gfx1151 (multiarch artifact)..." echo "Command: $llama_cli_path -m \"$model_path\" -ngl 99 --temp 0 -p \"$prompt\" -st -v" # Bound the run: a healthy 0.6B inference finishes in seconds. 
If the @@ -505,13 +505,13 @@ jobs: fi echo "Release tag: $TAG" - - name: Create universal archive + - name: Create multiarch archive if: steps.generate-tag.outputs.tag_exists == 'false' run: | TAG="${{ steps.generate-tag.outputs.tag }}" root="$PWD" - artifact_dir="./all-artifacts/llama-ubuntu-rocm-universal-x64" - archive="llama-${TAG}-ubuntu-rocm-universal-x64" + artifact_dir="./all-artifacts/llama-ubuntu-rocm-multiarch-x64" + archive="llama-${TAG}-ubuntu-rocm-multiarch-x64" if [ -d "$artifact_dir" ]; then echo "Creating ${archive}.tar.gz" tar -czf "$root/${archive}.tar.gz" -C "$artifact_dir" . @@ -534,10 +534,10 @@ jobs: --title "$TAG" \ --notes "**Build**: $TAG **OS**: ubuntu - **GPU Target(s)**: gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1153 (single universal binary) + **GPU Target(s)**: gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1153 (single multiarch binary) **ROCm Version**: $ROCM_VERSION (multi-arch) **Llama.cpp Commit**: $LLAMACPP_COMMIT_HASH **Build Date**: $(date -u '+%Y-%m-%d %H:%M:%S UTC') - Prebuilt llama.cpp ROCm binaries — one universal package covering the RDNA3 gfx110X family incl. Hawk Point/Phoenix (Radeon 760M/780M) and the RDNA3.5 gfx115x APUs (gfx1150/gfx1151/gfx1153). Built from TheRock's multi-arch ROCm runtime (per-arch Tensile databases bundled)." \ + Prebuilt llama.cpp ROCm binaries — one multiarch package covering the RDNA3 gfx110X family incl. Hawk Point/Phoenix (Radeon 760M/780M) and the RDNA3.5 gfx115x APUs (gfx1150/gfx1151/gfx1153). Built from TheRock's multi-arch ROCm runtime, pruned to the CI target arches (per-arch Tensile databases bundled)." \ *.tar.gz From 171c7e0af8f4f498b05f7bc0ab441f4d67f015d4 Mon Sep 17 00:00:00 2001 From: Jim Wu Date: Wed, 17 Jun 2026 08:35:54 -0600 Subject: [PATCH 3/3] ci(gfx11): use "multiarch" spelling in comments and scripts Follow-up to the package rename: drop the hyphenated "multi-arch" prose in comments, echoes, and the release body for one consistent spelling. 
The upstream nightlies endpoint (tarball-multi-arch) and the therock-dist-linux-multiarch-* filenames are external names and left untouched. Co-Authored-By: Claude Opus 4 --- .github/workflows/build-gfx11-rocm.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build-gfx11-rocm.yml b/.github/workflows/build-gfx11-rocm.yml index 5d9ce17fa39a..6dabeabb5cd0 100644 --- a/.github/workflows/build-gfx11-rocm.yml +++ b/.github/workflows/build-gfx11-rocm.yml @@ -33,7 +33,7 @@ jobs: build-ubuntu: runs-on: ubuntu-24.04 # Single multiarch build: one fat binary covering all current CI arches, - # sourced from TheRock's multi-arch tarball (arch-neutral host + per-arch + # sourced from TheRock's multiarch tarball (arch-neutral host + per-arch # Tensile DBs). gfx1100-1103 (RDNA3 desktop + Hawk Point/Phoenix 760M/780M), # gfx1150/1151/1153 (RDNA3.5 Strix APUs). env: @@ -71,14 +71,14 @@ jobs: ninja --version echo "Build dependencies installation completed" - - name: Download and extract multi-arch ROCm directly to /opt/rocm + - name: Download and extract multiarch ROCm directly to /opt/rocm run: | rocm_version="${{ env.ROCM_VERSION }}" base_url="https://rocm.nightlies.amd.com/tarball-multi-arch" if [ "$rocm_version" = "latest" ]; then - echo "Auto-detecting latest multi-arch ROCm version" - # The multi-arch host serves an HTML index (not S3 XML); scrape the + echo "Auto-detecting latest multiarch ROCm version" + # The multiarch host serves an HTML index (not S3 XML); scrape the # multiarch tarball names from it. 
files=$(curl -s "$base_url/" \ | grep -oE 'therock-dist-linux-multiarch-[0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+\.tar\.gz' \ @@ -144,7 +144,7 @@ jobs: rocm_url="$base_url/therock-dist-linux-multiarch-${rocm_version}.tar.gz" echo "DETECTED_ROCM_VERSION=$rocm_version" >> $GITHUB_ENV - # The multi-arch tarball (~11.5 GB) ships device code for ALL 26 GPU + # The multiarch tarball (~11.5 GB) ships device code for ALL 26 GPU # arches: every arch's .kpack plus per-arch rocBLAS/hipBLASLt Tensile DBs. # This consumer build only needs the 7 RDNA3/3.5 arches in GPU_TARGETS and # uses the GEMM (Tensile) path, which works without .kpack files. So we @@ -163,7 +163,7 @@ jobs: excludes+=("--exclude=./lib/*/library/*${a}*") done - echo "Streaming multi-arch ROCm from: $rocm_url (pruning .kpack + non-target arches)" + echo "Streaming multiarch ROCm from: $rocm_url (pruning .kpack + non-target arches)" sudo mkdir -p /opt/rocm curl -sL "$rocm_url" | sudo tar --use-compress-program=gzip -xf - \ -C /opt/rocm --strip-components=1 "${excludes[@]}" @@ -327,7 +327,7 @@ jobs: needs: build-ubuntu if: needs.build-ubuntu.result == 'success' # Single hardware test of the multiarch artifact on gfx1151. This is the - # end-to-end safety net for the Tensile-only multi-arch package: a real + # end-to-end safety net for the Tensile-only multiarch package: a real # llama-cli inference exercising the rocBLAS/hipBLASLt GEMM path on-device. runs-on: linux-gfx1151-gpu-rocm @@ -535,9 +535,9 @@ jobs: --notes "**Build**: $TAG **OS**: ubuntu **GPU Target(s)**: gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1153 (single multiarch binary) - **ROCm Version**: $ROCM_VERSION (multi-arch) + **ROCm Version**: $ROCM_VERSION (multiarch) **Llama.cpp Commit**: $LLAMACPP_COMMIT_HASH **Build Date**: $(date -u '+%Y-%m-%d %H:%M:%S UTC') - Prebuilt llama.cpp ROCm binaries — one multiarch package covering the RDNA3 gfx110X family incl. 
Hawk Point/Phoenix (Radeon 760M/780M) and the RDNA3.5 gfx115x APUs (gfx1150/gfx1151/gfx1153). Built from TheRock's multi-arch ROCm runtime, pruned to the CI target arches (per-arch Tensile databases bundled)." \ + Prebuilt llama.cpp ROCm binaries — one multiarch package covering the RDNA3 gfx110X family incl. Hawk Point/Phoenix (Radeon 760M/780M) and the RDNA3.5 gfx115x APUs (gfx1150/gfx1151/gfx1153). Built from TheRock's multiarch ROCm runtime, pruned to the CI target arches (per-arch Tensile databases bundled)." \ *.tar.gz