htekdev · htekdev · Jun 13, 2026 · Jun 13, 2026 · Jun 13, 2026 · Jun 13, 2026
diff --git a/errors/caching-artifacts/alpine-busybox-tar-p-flag-unsupported.yml b/errors/caching-artifacts/alpine-busybox-tar-p-flag-unsupported.yml
@@ -0,0 +1,84 @@
+id: caching-artifacts-150
+title: 'actions/cache Fails in Alpine Containers — BusyBox tar Does Not Support -P Flag'
+category: caching-artifacts
+severity: error
+tags:
+  - alpine
+  - busybox
+  - tar
+  - container
+  - cache
+  - linux
+patterns:
+  - regex: 'tar: unrecognized option: P'
+    flags: 'i'
+  - regex: 'BusyBox.*?tar.*?unrecognized.*?option'
+    flags: 'i'
+  - regex: 'Tar failed with error: The process .*/bin/tar. failed with exit code 1'
+    flags: 'i'
+error_messages:
+  - "/bin/tar: unrecognized option: P"
+  - "BusyBox v1.31.1 () multi-call binary."
+  - "[warning]Tar failed with error: The process '/bin/tar' failed with exit code 1"
+root_cause: |
+  The `actions/cache` action uses GNU tar with the `-P` flag (preserve absolute path names)
+  when creating and extracting cache archives. Alpine Linux containers ship with BusyBox,
+  which provides a minimal tar implementation that does not recognise the `-P` flag.
+
+  When the cache step runs inside an Alpine container, `/bin/tar` is BusyBox tar and the
+  command fails immediately with "unrecognized option: P". tar exits with code 1, which the
+  action logs as a warning: the cache is silently skipped, or the job fails if `fail-on-cache-miss` is set.
+
+  This is a long-standing issue first reported in actions/cache#352 and re-surfaced in
+  actions/cache#1765 (June 2026). No workaround has been added to the action itself.
+fix: |
+  Install GNU tar in the Alpine container before the cache step using apk:
+
+    steps:
+      - name: Install GNU tar (required for actions/cache)
+        run: apk add --no-cache tar
+      - name: Cache dependencies
+        uses: actions/cache@v4
+        with:
+          path: ~/.cargo/registry
+          key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
+
+  Alternatively, switch the job to a Debian/Ubuntu-based container image where
+  GNU tar is already the default (/usr/bin/tar).
+fix_code:
+  - language: yaml
+    label: Install GNU tar in Alpine before using actions/cache
+    code: |
+      - name: Install GNU tar
+        run: apk add --no-cache tar
+
+      - name: Cache dependencies
+        uses: actions/cache@v4
+        with:
+          path: ~/.cargo/registry
+          key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
+  - language: yaml
+    label: Switch to Debian/Ubuntu container to avoid BusyBox tar
+    code: |
+      jobs:
+        build:
+          runs-on: ubuntu-latest
+          container:
+            image: debian:bookworm-slim   # GNU tar available by default
+          steps:
+            - uses: actions/cache@v4
+              with:
+                path: ~/.cargo/registry
+                key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
+prevention:
+  - "Run tar --version inside your Alpine container; if it shows BusyBox, add apk add --no-cache tar as the first step."
+  - "Set container: debian:bookworm-slim or ubuntu:latest instead of alpine when the job uses actions/cache."
+  - "Add apk add --no-cache tar as the very first step in any job that uses actions/cache inside an Alpine container."
+  - "Check the actions/cache documentation for container compatibility notes before choosing a base image."
+docs:
+  - url: "https://github.com/actions/cache/issues/1765"
+    label: "actions/cache issue #1765 — Post cache not working on alpine runners (2026)"
+  - url: "https://github.com/actions/cache/issues/352"
+    label: "actions/cache issue #352 — Original Alpine BusyBox tar report"
+  - url: "https://docs.github.com/en/actions/using-jobs/running-jobs-in-a-container"
+    label: "GitHub Docs — Running jobs in a container"
diff --git a/...rrency-timing/environment-deployment-branch-scoped-concurrency-cross-branch-collision.yml b/...rrency-timing/environment-deployment-branch-scoped-concurrency-cross-branch-collision.yml
@@ -0,0 +1,130 @@
+id: ct-104
+title: 'Branch-Scoped Concurrency Group Allows Simultaneous Deployments to the Same Environment from Different Branches'
+category: concurrency-timing
+severity: silent-failure
+tags:
+  - concurrency
+  - environment
+  - deployment
+  - branch
+  - github-ref
+  - cross-branch
+  - concurrent-deploy
+  - job-environment
+patterns:
+  - regex: 'group:\s*[''"]?[^''"\n]*\$\{\{\s*github\.ref[^}]*\}\}[^''"\n]*'
+    flags: 'i'
+  - regex: 'group:\s*[''"]?[^''"\n]*\$\{\{\s*github\.ref_name[^}]*\}\}[^''"\n]*'
+    flags: 'i'
+  - regex: 'group:\s*[''"]?[^''"\n]*\$\{\{\s*github\.head_ref[^}]*\}\}[^''"\n]*'
+    flags: 'i'
+error_messages:
+  - "# No error — two branches deploy to the same environment simultaneously; may cause partial state or deployment races"
+root_cause: |
+  Workflows that include `github.ref`, `github.ref_name`, or `github.head_ref` in
+  their concurrency group key create **separate concurrency groups per branch**. When
+  two branches push to a workflow that deploys to the same environment, they get
+  different concurrency groups and do NOT queue or cancel each other.
+
+  Example:
+  - Branch `feature/alpha` pushes → group: `deploy-staging-refs/heads/feature/alpha`
+  - Branch `main` pushes       → group: `deploy-staging-refs/heads/main`
+  - Groups are different → both jobs run simultaneously → both deploy to `staging`
+
+  This pattern is correct for CI workflows (each branch's tests should run
+  independently), but incorrect for shared deployment environments where only one
+  deployment should be active at a time.
+
+  The confusion is common because teams copy a per-branch concurrency pattern from
+  CI into deploy workflows without adjusting the key. Environment protection rules
+  (required reviewers, wait timers) gate each individual job but do NOT prevent
+  multiple simultaneous deployments from different concurrency groups.
+
+  GitHub Actions provides the `job.environment` context — the environment name string
+  for the current job — which creates a stable, per-environment concurrency key that
+  applies across all branches deploying to that environment.
+fix: |
+  Key the concurrency group on the environment name, not the branch ref, so all
+  branches deploying to the same environment share one concurrency slot:
+
+    concurrency:
+      group: deploy-${{ job.environment }}
+      cancel-in-progress: false   # queue; do not discard deploys
+
+  Important: `job.environment` is only populated inside a job that declares
+  `environment:`, so set the concurrency group at the **job level**, never at the
+  workflow level. If the context is rejected there, hard-code it: `group: deploy-staging`.
+
+  Use `cancel-in-progress: false` so an in-progress deploy is never interrupted.
+  Note: GitHub keeps only the newest pending run per group; older pending runs are canceled.
+fix_code:
+  - language: yaml
+    label: 'WRONG — branch-scoped group; feature/alpha and main can deploy to staging simultaneously'
+    code: |
+      on: [push]
+
+      jobs:
+        deploy:
+          runs-on: ubuntu-latest
+          environment: staging
+          concurrency:
+            # BAD: different branches get different concurrency slots
+            # feature/alpha and main can both deploy to staging at the same time
+            group: deploy-staging-${{ github.ref }}
+            cancel-in-progress: true
+          steps:
+            - run: ./deploy.sh staging
+  - language: yaml
+    label: 'CORRECT — environment-scoped group; only one deploy to staging at a time'
+    code: |
+      on: [push]
+
+      jobs:
+        deploy:
+          runs-on: ubuntu-latest
+          environment: staging
+          concurrency:
+            # GOOD: all branches deploying to staging share one concurrency slot
+            # job.environment is the environment name ("staging")
+            group: deploy-${{ job.environment }}
+            cancel-in-progress: false   # queue — do not skip any deploys
+          steps:
+            - run: ./deploy.sh staging
+  - language: yaml
+    label: 'MULTI-ENV — staging and production each get their own independent slot'
+    code: |
+      on:
+        push:
+          branches: [main]
+
+      jobs:
+        deploy-staging:
+          runs-on: ubuntu-latest
+          environment: staging
+          concurrency:
+            group: deploy-${{ job.environment }}   # "staging" slot
+            cancel-in-progress: false
+          steps:
+            - run: ./deploy.sh staging
+
+        deploy-production:
+          needs: deploy-staging
+          runs-on: ubuntu-latest
+          environment: production
+          concurrency:
+            group: deploy-${{ job.environment }}   # "production" slot (separate)
+            cancel-in-progress: false
+          steps:
+            - run: ./deploy.sh production
+prevention:
+  - 'For deployment workflows, key concurrency groups on the environment name (`job.environment`), not on the branch ref.'
+  - 'Use `cancel-in-progress: false` for deployment jobs — silently dropping a deploy means a commit never reaches the environment.'
+  - 'Apply per-branch concurrency groups to CI jobs only; apply per-environment concurrency groups to deploy jobs.'
+  - 'Set job-level `concurrency:` (not workflow-level) when using `job.environment`, since that context is only available within a job.'
+docs:
+  - url: 'https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/using-concurrency'
+    label: 'GitHub Docs — Using concurrency'
+  - url: 'https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/accessing-contextual-information-about-workflow-runs#job-context'
+    label: 'GitHub Docs — job context (job.environment)'
+  - url: 'https://docs.github.com/en/actions/managing-workflow-runs-and-deployments/managing-deployments/managing-environments-for-deployment'
+    label: 'GitHub Docs — Managing environments for deployment'
diff --git a/errors/known-unsolved/ku-143.yml b/errors/known-unsolved/ku-143.yml
@@ -0,0 +1,126 @@
+id: known-unsolved-143
+title: 'JIT Token Expires During Sequential max-parallel:1 Matrix Workflows'
+category: known-unsolved
+severity: error
+tags:
+  - jit-runner
+  - self-hosted
+  - sequential
+  - max-parallel
+  - matrix
+  - token-expiry
+  - ephemeral
+patterns:
+  - regex: 'The operation was canceled\.'
+    flags: 'i'
+  - regex: 'jit.*token.*expir|token.*expir.*jit'
+    flags: 'i'
+  - regex: 'Failed to connect to the GitHub Actions service'
+    flags: 'i'
+error_messages:
+  - 'The operation was canceled.'
+  - 'Error: The operation was canceled.'
+  - 'Jobs 11+ fail with "The operation was canceled"'
+root_cause: |
+  JIT (Just-In-Time) runner tokens are scoped to a single job and expire after
+  approximately 60 minutes. When a matrix workflow uses `max-parallel: 1` to
+  enforce sequential execution, later jobs queue behind earlier ones. If the total
+  cumulative runtime of all preceding jobs exceeds ~60 minutes, the JIT token for
+  the waiting runner expires before GitHub dispatches the job to it.
+
+  The failure mode:
+  1. A serverless runner (Modal, AWS Lambda, Fargate, etc.) fetches a JIT config
+     on webhook receipt for job N.
+  2. Jobs 1–(N-1) run sequentially (max-parallel: 1), each taking several minutes.
+  3. By the time jobs 11+ become eligible to run, the 60-minute JIT tokens held by
+     their runners have expired.
+  4. The runner attempts to connect to the GitHub Actions service, but the token
+     is no longer valid — the connection is cancelled.
+  5. The job fails with "The operation was canceled." even though the runner process
+     started and the worker code is intact.
+
+  This is a fundamental GitHub Actions JIT architecture limitation: JIT tokens are
+  not renewable and have no configurable TTL. There is no server-side mechanism to
+  refresh a JIT token while it is waiting in the queue.
+
+  This differs from the false-positive "lost communication" error (known-unsolved-058),
+  which affects already-running ephemeral jobs that complete successfully but appear
+  to disconnect. The JIT sequential expiry causes actual job failure before the job
+  begins executing user steps.
+fix: |
+  There is no direct fix — the 60-minute JIT token TTL is enforced by GitHub and is
+  not configurable. The following architectural workarounds are available:
+
+  1. Reduce total sequential runtime below 60 minutes:
+     - Combine short jobs into fewer, longer jobs to reduce the number of sequential
+       steps that must queue.
+     - Profile which matrix slices are slow and optimize or parallelize them.
+
+  2. Increase max-parallel (remove the strict sequential constraint):
+     - If ordering is required only between specific jobs, use `needs:` chains instead
+       of `max-parallel: 1` on a single matrix.
+     - This allows later jobs to obtain fresh JIT tokens earlier without waiting.
+
+  3. Use persistent (non-ephemeral) self-hosted runners:
+     - Persistent runners hold a long-lived registration token, not a JIT token.
+     - They do not expire while waiting in the queue.
+     - Trade-off: persistent runners have higher operational overhead.
+
+  4. Delay JIT token fetch until job dispatch (not webhook receipt):
+     - If the runner platform supports it, fetch the JIT config lazily at dispatch
+       time rather than pre-fetching on webhook receipt.
+     - This avoids holding a token that expires before the job starts.
+fix_code:
+  - language: yaml
+    label: 'Replace max-parallel:1 matrix with needs: chain to avoid JIT expiry'
+    code: |
+      # ❌ Problematic: max-parallel:1 matrix — job 11+ JIT token expires after 60 min
+      jobs:
+        sequential-work:
+          runs-on: [self-hosted, ephemeral]
+          strategy:
+            max-parallel: 1
+            matrix:
+              job_id: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+
+      # ✅ Workaround A: Remove the max-parallel constraint so every job starts
+      # right away, before its pre-fetched JIT token can expire
+      jobs:
+        parallel-work:
+          runs-on: [self-hosted, ephemeral]
+          strategy:
+            matrix:
+              job_id: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+
+      # ✅ Workaround B: Use needs: chains for true ordering without JIT expiry
+      jobs:
+        job-1:
+          runs-on: [self-hosted, ephemeral]
+          steps: [...]
+        job-2:
+          needs: job-1
+          runs-on: [self-hosted, ephemeral]
+          steps: [...]
+  - language: yaml
+    label: 'Use persistent runners when sequential execution with long total runtime is required'
+    code: |
+      # Persistent runners are not affected by the 60-min JIT token TTL
+      jobs:
+        sequential-build:
+          runs-on: [self-hosted, linux, persistent]  # NOT ephemeral
+          strategy:
+            max-parallel: 1
+            matrix:
+              job_id: [1, 2, 3, ..., 37]  # 37 x 6 min = 222 min total — safe on persistent
+prevention:
+  - 'Estimate total sequential runtime before using max-parallel:1 with JIT runners: N_jobs × avg_job_minutes must stay under 60 minutes.'
+  - 'Prefer needs: dependency chains over max-parallel:1 for ordered execution with ephemeral JIT runners.'
+  - 'Use persistent self-hosted runners for long-running sequential workflows that cannot be parallelized.'
+  - 'Monitor for "The operation was canceled." errors on jobs with high matrix indices — they are the signature of JIT token expiry, not infra failures.'
+docs:
+  - url: 'https://github.com/actions/runner/issues/4248'
+    label: 'actions/runner#4248 — JIT Token Expiration with Long-Running Sequential Workflows (2 comments, June 2026)'
+  - url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/autoscaling-with-self-hosted-runners#using-just-in-time-runners'
+    label: 'GitHub Docs — Just-in-time (JIT) runners'
+  - url: 'https://docs.github.com/en/actions/writing-workflows/workflow-syntax-for-github-actions#jobsjob_idstrategymax-parallel'
+    label: 'Workflow syntax — jobs.<job_id>.strategy.max-parallel'