From 8568aa7851aa7a677f220d2bf00cd1c65e635365 Mon Sep 17 00:00:00 2001 From: Jorge Calvar Date: Tue, 9 Jun 2026 11:26:40 +0200 Subject: [PATCH 1/4] ci: trigger dogfood eval pipeline on run-evals PR label Add a GitHub Actions workflow that launches the dogfood eval pipeline (job 398185277057549) when the `run-evals` label is added to a PR, and re-launches it on each new commit while the label stays on. The real PR head commit is passed as `appkit_ref` so the pipeline can pull the code; `prompt_preset=custom-pr` and `tags=appkit_pr:<PR number>` are also set. Authenticates as the apps-mcp-evals-runner service principal via OAuth M2M, and posts a sticky "Eval running" comment linking the evals-monitor PR page and the triggered job run. Comment logic lives in .github/scripts/upsert-eval-comment.cjs. Co-authored-by: Isaac Signed-off-by: Jorge Calvar --- .github/scripts/upsert-eval-comment.cjs | 72 ++++++++++++++++++++ .github/workflows/eval-trigger.yml | 88 +++++++++++++++++++++++++ 2 files changed, 160 insertions(+) create mode 100644 .github/scripts/upsert-eval-comment.cjs create mode 100644 .github/workflows/eval-trigger.yml diff --git a/.github/scripts/upsert-eval-comment.cjs b/.github/scripts/upsert-eval-comment.cjs new file mode 100644 index 000000000..8bbf970c8 --- /dev/null +++ b/.github/scripts/upsert-eval-comment.cjs @@ -0,0 +1,72 @@ +/** + * Upserts a sticky "Eval running" comment on a PR after the dogfood eval + * pipeline has been launched. + * + * Invoked via `actions/github-script`. 
Inputs come from environment vars: + * PR_NUMBER - the pull request number + * HEAD_SHA - the commit the eval was launched for + * RUN_JSON - raw JSON from `databricks jobs run-now` (used to link the run) + */ + +const MARKER = "<!-- appkit-eval-trigger -->"; // Hidden HTML comment that tags this workflow's sticky comment. Must stay non-empty: includes("") is true for every comment body, so an empty marker would make the lookup below pick (and overwrite) an unrelated comment. +const EVALS_MONITOR_URL = + "https://evals-monitor-6051921418418893.staging.aws.databricksapps.com"; +const DATABRICKS_HOST = "https://dogfood.staging.databricks.com"; +const JOB_ID = "398185277057549"; +const WORKSPACE_ID = "6051921418418893"; + +module.exports = async ({ github, context }) => { + const { owner, repo } = context.repo; + const issue_number = Number(process.env.PR_NUMBER); + const shortSha = (process.env.HEAD_SHA || "").substring(0, 7); + + // run_id comes back in the run-now response, so the run link costs no extra call. + let runId; + try { + runId = JSON.parse(process.env.RUN_JSON || "{}").run_id; + } catch { + runId = undefined; + } + + const links = [ + `[View results in evals-monitor →](${EVALS_MONITOR_URL}/prs/appkit/${issue_number})`, + ]; + if (runId) { + links.push( + `[job run ↗](${DATABRICKS_HOST}/jobs/${JOB_ID}/runs/${runId}?o=${WORKSPACE_ID})`, + ); + } + + const body = [ + MARKER, + "### ⏳ Eval running", + "", + `Eval pipeline launched for commit \`${shortSha}\`.`, + "", + links.join(" · "), + ].join("\n"); + + const comments = await github.paginate(github.rest.issues.listComments, { + owner, + repo, + issue_number, + per_page: 100, + }); + const existing = comments.find((c) => c.body?.includes(MARKER)); + + if (existing) { + await github.rest.issues.updateComment({ + owner, + repo, + comment_id: existing.id, + body, + }); + } else { + await github.rest.issues.createComment({ + owner, + repo, + issue_number, + body, + }); + } +}; diff --git a/.github/workflows/eval-trigger.yml b/.github/workflows/eval-trigger.yml new file mode 100644 index 000000000..08485d938 --- /dev/null +++ b/.github/workflows/eval-trigger.yml @@ -0,0 +1,88 @@ +name: Eval Trigger + +# Launches the dogfood eval pipeline for a PR when 
the `run-evals` label is +# present, and re-launches it on every new commit while the label stays on. +# +# Uses `pull_request` (not `pull_request_target`): the workflow file that runs +# is the PR branch's version (so it's testable on the feature branch), and repo +# secrets are withheld from fork PRs, so an external contributor cannot exfil +# the Databricks credentials even by editing this file. Auth is OAuth M2M as the +# `apps-mcp-evals-runner` service principal (DATABRICKS_CLIENT_ID/SECRET), and +# those credentials are exposed only to the trigger step, never to the comment +# step that runs PR-authored script code. +on: + pull_request: + types: [labeled, synchronize] + +permissions: + contents: read + pull-requests: write + +# Latest push wins: a newer commit cancels the in-flight run for an older one, +# so the sticky comment always reflects the most recently triggered commit. +concurrency: + group: eval-trigger-${{ github.event.pull_request.number }} + cancel-in-progress: true + +jobs: + trigger-evals: + name: Trigger eval pipeline + # Run when the `run-evals` label is added, or on a new commit while the PR + # already carries the label. + if: >- + (github.event.action == 'labeled' && github.event.label.name == 'run-evals') || + (github.event.action == 'synchronize' && contains(github.event.pull_request.labels.*.name, 'run-evals')) + runs-on: + group: databricks-protected-runner-group + labels: linux-ubuntu-latest + steps: + - name: Check out repository + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + + - name: Install Databricks CLI + uses: databricks/setup-cli@772863b94473abd8b0cacbec8b6f80fa0cbe1136 # v1.2.1 + + - name: Trigger eval pipeline + id: trigger + env: + DATABRICKS_HOST: https://dogfood.staging.databricks.com + # OAuth M2M as the apps-mcp-evals-runner service principal. The CLI + # auto-selects client-credentials auth when these are present. 
+ DATABRICKS_CLIENT_ID: ${{ secrets.EVALS_DATABRICKS_CLIENT_ID_DOGFOOD }} + DATABRICKS_CLIENT_SECRET: ${{ secrets.EVALS_DATABRICKS_CLIENT_SECRET_DOGFOOD }} + # The real PR head commit — never the synthetic merge commit — so the + # pipeline can pull the code. + HEAD_SHA: ${{ github.event.pull_request.head.sha }} + PR_NUMBER: ${{ github.event.pull_request.number }} + # --no-wait: fire-and-forget; run-now otherwise blocks until the eval + # finishes. The JSON response carries run_id, which we forward to the + # comment step to link the run (no extra API call). + run: | + run_json=$(databricks jobs run-now --no-wait --output json --json "$(cat <> "$GITHUB_OUTPUT" + + - name: Post / update "Eval running" comment + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + env: + HEAD_SHA: ${{ github.event.pull_request.head.sha }} + PR_NUMBER: ${{ github.event.pull_request.number }} + RUN_JSON: ${{ steps.trigger.outputs.run_json }} + with: + script: | + const upsert = require('./.github/scripts/upsert-eval-comment.cjs'); + await upsert({ github, context }); From 12c3f8f460909531e2a622c8c1a73864761b1d30 Mon Sep 17 00:00:00 2001 From: Jorge Calvar Date: Tue, 9 Jun 2026 11:35:04 +0200 Subject: [PATCH 2/4] ci: add temporary auth/connectivity diagnostics to eval-trigger Probe dogfood reachability + workspace OIDC discovery and run a forced oauth-m2m authenticated call with debug logging, to pin down the "cannot configure default credentials" failure. To be reverted once auth works. 
Co-authored-by: Isaac Signed-off-by: Jorge Calvar --- .github/workflows/eval-trigger.yml | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/.github/workflows/eval-trigger.yml b/.github/workflows/eval-trigger.yml index 08485d938..273ecdc4e 100644 --- a/.github/workflows/eval-trigger.yml +++ b/.github/workflows/eval-trigger.yml @@ -42,14 +42,37 @@ jobs: - name: Install Databricks CLI uses: databricks/setup-cli@772863b94473abd8b0cacbec8b6f80fa0cbe1136 # v1.2.1 + # TEMPORARY diagnostic — remove once auth works. Disambiguates "cannot + # configure default credentials": is dogfood reachable + does OIDC + # discovery resolve, and does forced M2M auth produce a clearer error? + - name: Diagnose Databricks connectivity + auth + env: + DATABRICKS_HOST: https://dogfood.staging.databricks.com + DATABRICKS_CLIENT_ID: ${{ secrets.EVALS_DATABRICKS_CLIENT_ID_DOGFOOD }} + DATABRICKS_CLIENT_SECRET: ${{ secrets.EVALS_DATABRICKS_CLIENT_SECRET_DOGFOOD }} + DATABRICKS_AUTH_TYPE: oauth-m2m + run: | + echo "::group::CLI version" + databricks --version + echo "::endgroup::" + echo "::group::Reachability" + curl -sS -m 20 -o /dev/null -w "host root: HTTP %{http_code}\n" "$DATABRICKS_HOST/" || echo "host root: UNREACHABLE" + curl -sS -m 20 -w "\nworkspace OIDC: HTTP %{http_code}\n" "$DATABRICKS_HOST/oidc/.well-known/oauth-authorization-server" || echo "workspace OIDC: UNREACHABLE" + echo "::endgroup::" + echo "::group::Forced M2M auth (debug)" + # current-user me is a cheap authenticated call; --log-level debug + # surfaces the real auth-resolution error. Never fails the job. + databricks current-user me --log-level debug || true + echo "::endgroup::" + - name: Trigger eval pipeline id: trigger env: DATABRICKS_HOST: https://dogfood.staging.databricks.com - # OAuth M2M as the apps-mcp-evals-runner service principal. The CLI - # auto-selects client-credentials auth when these are present. + # OAuth M2M as the apps-mcp-evals-runner service principal. 
DATABRICKS_CLIENT_ID: ${{ secrets.EVALS_DATABRICKS_CLIENT_ID_DOGFOOD }} DATABRICKS_CLIENT_SECRET: ${{ secrets.EVALS_DATABRICKS_CLIENT_SECRET_DOGFOOD }} + DATABRICKS_AUTH_TYPE: oauth-m2m # The real PR head commit — never the synthetic merge commit — so the # pipeline can pull the code. HEAD_SHA: ${{ github.event.pull_request.head.sha }} From 2a240dd909d07ecc6deca571ce0b79094e761a0f Mon Sep 17 00:00:00 2001 From: Jorge Calvar Date: Tue, 9 Jun 2026 11:41:12 +0200 Subject: [PATCH 3/4] ci: add workflow_dispatch + configurable runner to eval-trigger Allow manual runs to probe staging connectivity from an arbitrary runner group (runner_group/runner_labels inputs), since dogfood.staging blocks the default databricks-protected-runner-group at the network edge. A bare dispatch runs only the diagnostic; pass pr_number to also trigger the job and post the comment. Co-authored-by: Isaac Signed-off-by: Jorge Calvar --- .github/workflows/eval-trigger.yml | 51 +++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/.github/workflows/eval-trigger.yml b/.github/workflows/eval-trigger.yml index 273ecdc4e..774a854ce 100644 --- a/.github/workflows/eval-trigger.yml +++ b/.github/workflows/eval-trigger.yml @@ -10,9 +10,32 @@ name: Eval Trigger # `apps-mcp-evals-runner` service principal (DATABRICKS_CLIENT_ID/SECRET), and # those credentials are exposed only to the trigger step, never to the comment # step that runs PR-authored script code. +# +# workflow_dispatch is provided for manual testing — notably to probe whether a +# given runner group can reach dogfood.staging (which is network-restricted). +# A bare manual run executes only the connectivity diagnostic; pass `pr_number` +# to also trigger the job and post the comment. 
on: pull_request: types: [labeled, synchronize] + workflow_dispatch: + inputs: + runner_group: + description: "Runner group to run on (default: databricks-protected-runner-group)" + required: false + type: string + runner_labels: + description: "Runner labels (default: linux-ubuntu-latest)" + required: false + type: string + pr_number: + description: "PR number — if set, actually triggers the eval job and posts the comment" + required: false + type: string + appkit_ref: + description: "Commit SHA to eval (default: the ref this run is on)" + required: false + type: string permissions: contents: read @@ -21,20 +44,21 @@ permissions: # Latest push wins: a newer commit cancels the in-flight run for an older one, # so the sticky comment always reflects the most recently triggered commit. concurrency: - group: eval-trigger-${{ github.event.pull_request.number }} + group: eval-trigger-${{ github.event.pull_request.number || inputs.pr_number || github.run_id }} cancel-in-progress: true jobs: trigger-evals: name: Trigger eval pipeline - # Run when the `run-evals` label is added, or on a new commit while the PR - # already carries the label. + # Run when manually dispatched, when the `run-evals` label is added, or on a + # new commit while the PR already carries the label. 
if: >- + github.event_name == 'workflow_dispatch' || (github.event.action == 'labeled' && github.event.label.name == 'run-evals') || (github.event.action == 'synchronize' && contains(github.event.pull_request.labels.*.name, 'run-evals')) runs-on: - group: databricks-protected-runner-group - labels: linux-ubuntu-latest + group: ${{ inputs.runner_group || 'databricks-protected-runner-group' }} + labels: ${{ inputs.runner_labels || 'linux-ubuntu-latest' }} steps: - name: Check out repository uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 @@ -42,9 +66,9 @@ jobs: - name: Install Databricks CLI uses: databricks/setup-cli@772863b94473abd8b0cacbec8b6f80fa0cbe1136 # v1.2.1 - # TEMPORARY diagnostic — remove once auth works. Disambiguates "cannot - # configure default credentials": is dogfood reachable + does OIDC - # discovery resolve, and does forced M2M auth produce a clearer error? + # TEMPORARY diagnostic — remove once auth/connectivity works. Probes + # whether this runner can reach dogfood.staging and resolve OIDC, then + # tries a forced M2M authenticated call with debug logging. - name: Diagnose Databricks connectivity + auth env: DATABRICKS_HOST: https://dogfood.staging.databricks.com @@ -67,6 +91,8 @@ jobs: - name: Trigger eval pipeline id: trigger + # On a bare manual dispatch (no pr_number) we only run diagnostics above. + if: github.event_name != 'workflow_dispatch' || inputs.pr_number != '' env: DATABRICKS_HOST: https://dogfood.staging.databricks.com # OAuth M2M as the apps-mcp-evals-runner service principal. @@ -75,8 +101,8 @@ jobs: DATABRICKS_AUTH_TYPE: oauth-m2m # The real PR head commit — never the synthetic merge commit — so the # pipeline can pull the code. 
- HEAD_SHA: ${{ github.event.pull_request.head.sha }} - PR_NUMBER: ${{ github.event.pull_request.number }} + HEAD_SHA: ${{ github.event.pull_request.head.sha || inputs.appkit_ref || github.sha }} + PR_NUMBER: ${{ github.event.pull_request.number || inputs.pr_number }} # --no-wait: fire-and-forget; run-now otherwise blocks until the eval # finishes. The JSON response carries run_id, which we forward to the # comment step to link the run (no extra API call). @@ -100,10 +126,11 @@ jobs: } >> "$GITHUB_OUTPUT" - name: Post / update "Eval running" comment + if: github.event_name != 'workflow_dispatch' || inputs.pr_number != '' uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 env: - HEAD_SHA: ${{ github.event.pull_request.head.sha }} - PR_NUMBER: ${{ github.event.pull_request.number }} + HEAD_SHA: ${{ github.event.pull_request.head.sha || inputs.appkit_ref || github.sha }} + PR_NUMBER: ${{ github.event.pull_request.number || inputs.pr_number }} RUN_JSON: ${{ steps.trigger.outputs.run_json }} with: script: | From eb61d576b7ba357610e1dee220dac11679c79430 Mon Sep 17 00:00:00 2001 From: Jorge Calvar Date: Tue, 9 Jun 2026 11:45:19 +0200 Subject: [PATCH 4/4] ci: add id-token: write so runner can reach dogfood.staging The databricks-protected-runner-group's egress to internal Databricks hosts is gated by the GitHub OIDC identity. Without `id-token: write` the egress proxy returns 403 "RBAC: access denied" for every request (incl. anonymous curl to dogfood.staging), which is what broke OAuth M2M. All other Databricks workflows in this repo set this permission. Also revert the temporary manual-dispatch/configurable-runner testing scaffolding; back to label/synchronize on databricks-protected-runner-group. 
Co-authored-by: Isaac Signed-off-by: Jorge Calvar --- .github/workflows/eval-trigger.yml | 62 ++++++++---------------------- 1 file changed, 17 insertions(+), 45 deletions(-) diff --git a/.github/workflows/eval-trigger.yml b/.github/workflows/eval-trigger.yml index 774a854ce..660c8dcbb 100644 --- a/.github/workflows/eval-trigger.yml +++ b/.github/workflows/eval-trigger.yml @@ -10,55 +10,36 @@ name: Eval Trigger # `apps-mcp-evals-runner` service principal (DATABRICKS_CLIENT_ID/SECRET), and # those credentials are exposed only to the trigger step, never to the comment # step that runs PR-authored script code. -# -# workflow_dispatch is provided for manual testing — notably to probe whether a -# given runner group can reach dogfood.staging (which is network-restricted). -# A bare manual run executes only the connectivity diagnostic; pass `pr_number` -# to also trigger the job and post the comment. on: pull_request: types: [labeled, synchronize] - workflow_dispatch: - inputs: - runner_group: - description: "Runner group to run on (default: databricks-protected-runner-group)" - required: false - type: string - runner_labels: - description: "Runner labels (default: linux-ubuntu-latest)" - required: false - type: string - pr_number: - description: "PR number — if set, actually triggers the eval job and posts the comment" - required: false - type: string - appkit_ref: - description: "Commit SHA to eval (default: the ref this run is on)" - required: false - type: string +# `id-token: write` is required for the databricks-protected-runner-group's +# egress to internal Databricks hosts (incl. dogfood.staging) — without it the +# egress proxy returns 403 "RBAC: access denied" for every request. Matches the +# other workflows in this repo (ci.yml, prepare-release.yml, docs-deploy.yml). 
permissions: contents: read pull-requests: write + id-token: write # Latest push wins: a newer commit cancels the in-flight run for an older one, # so the sticky comment always reflects the most recently triggered commit. concurrency: - group: eval-trigger-${{ github.event.pull_request.number || inputs.pr_number || github.run_id }} + group: eval-trigger-${{ github.event.pull_request.number }} cancel-in-progress: true jobs: trigger-evals: name: Trigger eval pipeline - # Run when manually dispatched, when the `run-evals` label is added, or on a - # new commit while the PR already carries the label. + # Run when the `run-evals` label is added, or on a new commit while the PR + # already carries the label. if: >- - github.event_name == 'workflow_dispatch' || (github.event.action == 'labeled' && github.event.label.name == 'run-evals') || (github.event.action == 'synchronize' && contains(github.event.pull_request.labels.*.name, 'run-evals')) runs-on: - group: ${{ inputs.runner_group || 'databricks-protected-runner-group' }} - labels: ${{ inputs.runner_labels || 'linux-ubuntu-latest' }} + group: databricks-protected-runner-group + labels: linux-ubuntu-latest steps: - name: Check out repository uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 @@ -66,9 +47,8 @@ jobs: - name: Install Databricks CLI uses: databricks/setup-cli@772863b94473abd8b0cacbec8b6f80fa0cbe1136 # v1.2.1 - # TEMPORARY diagnostic — remove once auth/connectivity works. Probes - # whether this runner can reach dogfood.staging and resolve OIDC, then - # tries a forced M2M authenticated call with debug logging. + # TEMPORARY diagnostic — remove once confirmed green. Probes dogfood + # reachability + OIDC discovery and a forced M2M authenticated call. 
- name: Diagnose Databricks connectivity + auth env: DATABRICKS_HOST: https://dogfood.staging.databricks.com @@ -76,23 +56,16 @@ jobs: DATABRICKS_CLIENT_SECRET: ${{ secrets.EVALS_DATABRICKS_CLIENT_SECRET_DOGFOOD }} DATABRICKS_AUTH_TYPE: oauth-m2m run: | - echo "::group::CLI version" - databricks --version - echo "::endgroup::" echo "::group::Reachability" curl -sS -m 20 -o /dev/null -w "host root: HTTP %{http_code}\n" "$DATABRICKS_HOST/" || echo "host root: UNREACHABLE" curl -sS -m 20 -w "\nworkspace OIDC: HTTP %{http_code}\n" "$DATABRICKS_HOST/oidc/.well-known/oauth-authorization-server" || echo "workspace OIDC: UNREACHABLE" echo "::endgroup::" - echo "::group::Forced M2M auth (debug)" - # current-user me is a cheap authenticated call; --log-level debug - # surfaces the real auth-resolution error. Never fails the job. + echo "::group::M2M auth check" databricks current-user me --log-level debug || true echo "::endgroup::" - name: Trigger eval pipeline id: trigger - # On a bare manual dispatch (no pr_number) we only run diagnostics above. - if: github.event_name != 'workflow_dispatch' || inputs.pr_number != '' env: DATABRICKS_HOST: https://dogfood.staging.databricks.com # OAuth M2M as the apps-mcp-evals-runner service principal. @@ -101,8 +74,8 @@ jobs: DATABRICKS_AUTH_TYPE: oauth-m2m # The real PR head commit — never the synthetic merge commit — so the # pipeline can pull the code. - HEAD_SHA: ${{ github.event.pull_request.head.sha || inputs.appkit_ref || github.sha }} - PR_NUMBER: ${{ github.event.pull_request.number || inputs.pr_number }} + HEAD_SHA: ${{ github.event.pull_request.head.sha }} + PR_NUMBER: ${{ github.event.pull_request.number }} # --no-wait: fire-and-forget; run-now otherwise blocks until the eval # finishes. The JSON response carries run_id, which we forward to the # comment step to link the run (no extra API call). 
@@ -126,11 +99,10 @@ jobs: } >> "$GITHUB_OUTPUT" - name: Post / update "Eval running" comment - if: github.event_name != 'workflow_dispatch' || inputs.pr_number != '' uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 env: - HEAD_SHA: ${{ github.event.pull_request.head.sha || inputs.appkit_ref || github.sha }} - PR_NUMBER: ${{ github.event.pull_request.number || inputs.pr_number }} + HEAD_SHA: ${{ github.event.pull_request.head.sha }} + PR_NUMBER: ${{ github.event.pull_request.number }} RUN_JSON: ${{ steps.trigger.outputs.run_json }} with: script: |