From 7cd19421c13d2e5c02b27598b1f82d9335727432 Mon Sep 17 00:00:00 2001
From: Pierre Warnier
Date: Wed, 10 Jun 2026 12:40:28 +0200
Subject: [PATCH] ci: make the Docker test matrix resilient to registry flakes (#172)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Test (debian/alpine/fedora) matrix failed four times in a row on
2026-06-10, each on a transient network error (Docker Hub base-image pull
i/o timeout; crates.io download broken pipe) unrelated to the code.

- fail-fast: false so one distro's flake no longer cancels the others.
- Retry 'docker compose build <target>' (the Docker Hub pull) with backoff.
- Pass CARGO_NET_RETRY into the container and set it for runner-native
  jobs so cargo retries crate downloads; real test failures still fail
  fast (no step-level retry around the test run).

No new third-party actions — a shell retry loop keeps the supply-chain
surface unchanged. Closes #172.
---
 .github/workflows/ci.yml | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index da3ea01..a9a644d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -7,6 +7,8 @@ on:
 
 env:
   CARGO_TERM_COLOR: always
+  # Ride out transient crates.io/registry blips instead of failing the job.
+  CARGO_NET_RETRY: "10"
 
 jobs:
   fmt:
@@ -57,8 +59,23 @@ jobs:
     if: github.event_name == 'push'
     runs-on: ubuntu-latest
     strategy:
+      # One distro's transient registry timeout must not cancel the others.
+      fail-fast: false
       matrix:
         target: [debian, alpine, fedora]
     steps:
       - uses: actions/checkout@v6
-      - run: docker compose run --rm ${{ matrix.target }} cargo test --workspace
+      # Base images are pulled from Docker Hub at build time; retry to ride
+      # out transient registry timeouts that would otherwise fail the job.
+      - name: Build ${{ matrix.target }} image
+        run: |
+          for attempt in 1 2 3; do
+            docker compose build ${{ matrix.target }} && exit 0
+            [ "$attempt" -lt 3 ] || exit 1  # out of attempts: fail now, skip the pointless sleep
+            echo "::warning::'${{ matrix.target }}' image build attempt $attempt failed (likely a registry timeout); retrying in $((attempt * 30))s"
+            sleep "$((attempt * 30))"  # linear backoff: 30s, then 60s
+          done
+      # Pass CARGO_NET_RETRY into the container (same value as the workflow env)
+      # so cargo retries crate downloads; real test failures still fail fast.
+      - name: Test on ${{ matrix.target }}
+        run: docker compose run --rm -e CARGO_NET_RETRY=10 ${{ matrix.target }} cargo test --workspace