clockworklabs · bradleyshep · Mar 23, 2026 · Mar 23, 2026 · Mar 23, 2026 · Mar 23, 2026
@@ -590,39 +590,6 @@ jobs:
         run: |
           cargo ci cli-docs
 
-  llm_ci_check:
-    name: Verify LLM benchmark is up to date
-    permissions:
-      contents: read
-    runs-on: ubuntu-latest
-    # Disable the tests because they are causing us headaches with merge conflicts and re-runs etc.
-    if: false
-    steps:
-      # Build the tool from master to ensure consistent hash computation
-      # with the llm-benchmark-update workflow (which also uses master's tool).
-      - name: Checkout master (build tool from trusted code)
-        uses: actions/checkout@v4
-        with:
-          ref: master
-          fetch-depth: 1
-
-      - uses: dtolnay/rust-toolchain@stable
-      - uses: Swatinem/rust-cache@v2
-
-      - name: Install llm-benchmark tool from master
-        run: |
-          cargo install --path tools/xtask-llm-benchmark --locked
-          command -v llm_benchmark
-
-      # Now checkout the PR branch to verify its benchmark files
-      - name: Checkout PR branch
-        uses: actions/checkout@v4
-        with:
-          clean: false
-
-      - name: Run hash check (both langs)
-        run: llm_benchmark ci-check
-
   unity-testsuite:
     needs: [lints]
     # Skip if this is an external contribution.

@@ -0,0 +1,80 @@
+# Builds the docs site and runs the llms generator whenever docs or skills
+# change on docs/release, then commits the generated file back to that branch.
+name: Docs / Update llms files
+
+# The final step pushes a commit, so the token needs write access to contents.
+permissions:
+  contents: write
+
+on:
+  push:
+    branches:
+      - docs/release
+    paths:
+      - 'docs/docs/**'
+      - 'skills/**'
+  workflow_dispatch: # Allow manual trigger
+
+jobs:
+  update-llms:
+    runs-on: spacetimedb-new-runner-2
+    steps:
+      # Always build from docs/release, even when dispatched from another ref.
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          ref: docs/release
+
+      - name: Set up Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: '22'
+
+      # NOTE(review): run_install makes this step install before the cache
+      # below is restored — confirm that ordering is intended.
+      - uses: pnpm/action-setup@v4
+        with:
+          run_install: true
+
+      # Export the pnpm store location so the cache step can reference it.
+      - name: Get pnpm store directory
+        working-directory: sdks/typescript
+        shell: bash
+        run: |
+          echo "STORE_PATH=$(pnpm store path --silent)" >> "$GITHUB_ENV"
+
+      - uses: actions/cache@v4
+        name: Setup pnpm cache
+        with:
+          path: ${{ env.STORE_PATH }}
+          key: ${{ runner.os }}-pnpm-store-${{ hashFiles('**/pnpm-lock.yaml') }}
+          restore-keys: |
+            ${{ runner.os }}-pnpm-store-
+
+      - name: Install dependencies
+        working-directory: docs
+        run: pnpm install
+
+      - name: Docusaurus build
+        working-directory: docs
+        run: pnpm build
+
+      - name: Generate llms files
+        working-directory: docs
+        run: node scripts/generate-llms.mjs
+
+      # Commit and push only when the staged file actually changed.
+      # NOTE(review): only static/llms.md is staged; any other file written by
+      # the generator would not be committed — confirm that is intended.
+      - name: Commit updated llms files
+        working-directory: docs
+        run: |
+          git config user.name "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+          git add static/llms.md
+          if git diff --staged --quiet; then
+            echo "No changes"
+            exit 0
+          fi
+          git commit -m "Update llms files from docs build"
+          git push
@@ -0,0 +1,130 @@
+# Runs the llm_benchmark tool against master, on a schedule or on demand.
+name: Periodic LLM benchmarks
+
+on:
+  schedule:
+    # Daily at midnight UTC. Change to '0 */6 * * *' for every 6h,
+    # or '0 */4 * * *' for every 4h.
+    - cron: '0 0 * * *'
+  workflow_dispatch:
+    inputs:
+      models:
+        description: 'Models to run (provider:model format, comma-separated, or "all")'
+        required: false
+        default: 'all'
+      languages:
+        description: 'Languages to benchmark (comma-separated: rust,csharp,typescript)'
+        required: false
+        default: 'rust,csharp,typescript'
+      modes:
+        description: 'Modes to run (comma-separated: guidelines,no_context,docs,...)'
+        required: false
+        default: 'guidelines,no_context'
+
+permissions:
+  contents: read
+
+# Only one benchmark run at a time; a newly triggered run cancels the one
+# that is already in progress.
+concurrency:
+  group: llm-benchmark-periodic
+  cancel-in-progress: true
+
+jobs:
+  run-benchmarks:
+    runs-on: spacetimedb-new-runner
+    container:
+      image: localhost:5000/spacetimedb-ci:latest
+      options: >-
+        --privileged
+    timeout-minutes: 180
+
+    steps:
+      # Download the installer to a file before running it: in a plain
+      # `curl | sh` pipeline without pipefail, a failed download is masked by
+      # the exit status of `sh` and the step passes without installing anything.
+      - name: Install spacetime CLI
+        run: |
+          curl -sSf https://install.spacetimedb.com -o "$RUNNER_TEMP/spacetime-install.sh"
+          sh "$RUNNER_TEMP/spacetime-install.sh" -y
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+
+      - name: Checkout master
+        uses: actions/checkout@v4
+        with:
+          ref: master
+          fetch-depth: 1
+
+      - uses: dtolnay/rust-toolchain@stable
+      - uses: Swatinem/rust-cache@v2
+
+      - name: Setup .NET SDK
+        uses: actions/setup-dotnet@v4
+        with:
+          dotnet-version: "8.0.x"
+
+      - name: Install WASI workload
+        env:
+          DOTNET_MULTILEVEL_LOOKUP: "0"
+          DOTNET_CLI_HOME: ${{ runner.temp }}/dotnet-home
+          DOTNET_SKIP_FIRST_TIME_EXPERIENCE: "1"
+        run: |
+          dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel
+
+      - name: Set up Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: 22
+
+      - name: Install pnpm
+        uses: pnpm/action-setup@v4
+
+      - name: Build llm-benchmark tool
+        run: cargo install --path tools/xtask-llm-benchmark --locked
+
+      # Runs the benchmark once per requested language. Individual failures
+      # are reported as warnings; the step fails only when every run failed.
+      - name: Run benchmarks
+        env:
+          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          LLM_BENCHMARK_API_KEY: ${{ secrets.LLM_BENCHMARK_API_KEY }}
+          LLM_BENCHMARK_UPLOAD_URL: ${{ secrets.LLM_BENCHMARK_UPLOAD_URL }}
+          MSBUILDDISABLENODEREUSE: "1"
+          DOTNET_CLI_USE_MSBUILD_SERVER: "0"
+          # Scheduled runs carry no inputs, so fall back to the dispatch defaults.
+          INPUT_LANGUAGES: ${{ inputs.languages || 'rust,csharp,typescript' }}
+          INPUT_MODELS: ${{ inputs.models || 'all' }}
+          INPUT_MODES: ${{ inputs.modes || 'guidelines,no_context' }}
+        run: |
+          LANGS="$INPUT_LANGUAGES"
+          MODELS="$INPUT_MODELS"
+          MODES="$INPUT_MODES"
+
+          SUCCEEDED=0
+          FAILED=0
+          # The loop variable is deliberately not named LANG: that is the locale
+          # environment variable, and when it is exported, assigning to it would
+          # change the locale seen by llm_benchmark and every process it spawns.
+          for bench_lang in $(echo "$LANGS" | tr ',' ' '); do
+            # Build the argument list once; --models is passed only when a
+            # specific selection (anything other than "all") was requested.
+            set -- run --lang "$bench_lang" --modes "$MODES"
+            run_label="lang=$bench_lang"
+            if [ "$MODELS" != "all" ]; then
+              set -- "$@" --models "$MODELS"
+              run_label="$run_label models=$MODELS"
+            fi
+            if llm_benchmark "$@"; then
+              SUCCEEDED=$((SUCCEEDED + 1))
+            else
+              echo "::warning::Benchmark run failed for $run_label"
+              FAILED=$((FAILED + 1))
+            fi
+          done
+          echo "Benchmark runs: $SUCCEEDED succeeded, $FAILED failed"
+          if [ "$SUCCEEDED" -eq 0 ] && [ "$FAILED" -gt 0 ]; then
+            echo "::error::All benchmark runs failed"
+            exit 1
+          fi