diff --git a/.github/scripts/upsert-eval-comment.cjs b/.github/scripts/upsert-eval-comment.cjs
new file mode 100644
index 000000000..8bbf970c8
--- /dev/null
+++ b/.github/scripts/upsert-eval-comment.cjs
@@ -0,0 +1,91 @@
+/**
+ * Upserts a sticky "Eval running" comment on a PR after the dogfood eval
+ * pipeline has been launched.
+ *
+ * Invoked via `actions/github-script`. Inputs come from environment vars:
+ *   PR_NUMBER - the pull request number
+ *   HEAD_SHA  - the commit the eval was launched for
+ *   RUN_JSON  - raw JSON from `databricks jobs run-now` (used to link the run)
+ */
+
+// Hidden HTML comment that identifies the sticky comment. It must be non-empty:
+// `"anything".includes("")` is always true, so an empty marker would match (and
+// overwrite) the first comment on the PR. `\u003C` is an escaped "less-than".
+const MARKER = "\u003C!-- eval-trigger:sticky-comment -->";
+const EVALS_MONITOR_URL =
+  "https://evals-monitor-6051921418418893.staging.aws.databricksapps.com";
+const DATABRICKS_HOST = "https://dogfood.staging.databricks.com";
+const JOB_ID = "398185277057549";
+const WORKSPACE_ID = "6051921418418893";
+
+/**
+ * Creates the sticky comment, or updates it in place when one already exists.
+ *
+ * @param {object} args
+ * @param {object} args.github - Octokit client provided by `actions/github-script`.
+ * @param {object} args.context - Workflow context; `context.repo` supplies owner/repo.
+ * @returns {Promise<void>}
+ * @throws {Error} If PR_NUMBER is missing or not a positive integer.
+ */
+module.exports = async ({ github, context }) => {
+  const { owner, repo } = context.repo;
+  const issue_number = Number(process.env.PR_NUMBER);
+  if (!Number.isInteger(issue_number) || issue_number <= 0) {
+    throw new Error(
+      `PR_NUMBER must be a positive integer, got "${process.env.PR_NUMBER}"`,
+    );
+  }
+  const shortSha = (process.env.HEAD_SHA || "").substring(0, 7);
+
+  // run_id comes back in the run-now response, so the run link costs no extra call.
+  // The link is best-effort: unparsable RUN_JSON only drops the link.
+  let runId;
+  try {
+    runId = JSON.parse(process.env.RUN_JSON || "{}").run_id;
+  } catch {
+    runId = undefined;
+  }
+
+  const links = [
+    `[View results in evals-monitor →](${EVALS_MONITOR_URL}/prs/appkit/${issue_number})`,
+  ];
+  if (runId) {
+    links.push(
+      `[job run ↗](${DATABRICKS_HOST}/jobs/${JOB_ID}/runs/${runId}?o=${WORKSPACE_ID})`,
+    );
+  }
+
+  const body = [
+    MARKER,
+    "### ⏳ Eval running",
+    "",
+    `Eval pipeline launched for commit \`${shortSha}\`.`,
+    "",
+    links.join(" · "),
+  ].join("\n");
+
+  // Find a previous sticky comment by its marker so it is edited, not duplicated.
+  const comments = await github.paginate(github.rest.issues.listComments, {
+    owner,
+    repo,
+    issue_number,
+    per_page: 100,
+  });
+  const existing = comments.find((c) => c.body?.includes(MARKER));
+
+  if (existing) {
+    await github.rest.issues.updateComment({
+      owner,
+      repo,
+      comment_id: existing.id,
+      body,
+    });
+  } else {
+    await github.rest.issues.createComment({
+      owner,
+      repo,
+      issue_number,
+      body,
+    });
+  }
+};
diff --git a/.github/workflows/eval-trigger.yml b/.github/workflows/eval-trigger.yml
new file mode 100644
index 000000000..660c8dcbb
--- /dev/null
+++ b/.github/workflows/eval-trigger.yml
@@ -0,0 +1,110 @@
+name: Eval Trigger
+
+# Launches the dogfood eval pipeline for a PR when the `run-evals` label is
+# present, and re-launches it on every new commit while the label stays on.
+#
+# Uses `pull_request` (not `pull_request_target`): the workflow file that runs
+# is the PR branch's version (so it's testable on the feature branch), and repo
+# secrets are withheld from fork PRs, so an external contributor cannot exfil
+# the Databricks credentials even by editing this file. Auth is OAuth M2M as the
+# `apps-mcp-evals-runner` service principal (DATABRICKS_CLIENT_ID/SECRET), and
+# those credentials are exposed only to the trigger step, never to the comment
+# step that runs PR-authored script code.
+on:
+  pull_request:
+    types: [labeled, synchronize]
+
+# `id-token: write` is required for the databricks-protected-runner-group's
+# egress to internal Databricks hosts (incl. dogfood.staging) — without it the
+# egress proxy returns 403 "RBAC: access denied" for every request. Matches the
+# other workflows in this repo (ci.yml, prepare-release.yml, docs-deploy.yml).
+permissions:
+  contents: read
+  pull-requests: write
+  id-token: write
+
+# Latest push wins: a newer trigger cancels the in-flight run for an older one.
+# Other `labeled` events get their own group so they never cancel a real trigger.
+concurrency:
+  group: eval-trigger-${{ github.event.pull_request.number }}-${{ github.event.action == 'labeled' && github.event.label.name != 'run-evals' && github.run_id || 'evals' }}
+  cancel-in-progress: true
+
+jobs:
+  trigger-evals:
+    name: Trigger eval pipeline
+    # Run when the `run-evals` label is added, or on a new commit while the PR
+    # already carries the label.
+    if: >-
+      (github.event.action == 'labeled' && github.event.label.name == 'run-evals') ||
+      (github.event.action == 'synchronize' && contains(github.event.pull_request.labels.*.name, 'run-evals'))
+    runs-on:
+      group: databricks-protected-runner-group
+      labels: linux-ubuntu-latest
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+
+      - name: Install Databricks CLI
+        uses: databricks/setup-cli@772863b94473abd8b0cacbec8b6f80fa0cbe1136 # v1.2.1
+
+      # TEMPORARY diagnostic — remove once confirmed green. Probes dogfood
+      # reachability + OIDC discovery and a forced M2M authenticated call.
+      - name: Diagnose Databricks connectivity + auth
+        env:
+          DATABRICKS_HOST: https://dogfood.staging.databricks.com
+          DATABRICKS_CLIENT_ID: ${{ secrets.EVALS_DATABRICKS_CLIENT_ID_DOGFOOD }}
+          DATABRICKS_CLIENT_SECRET: ${{ secrets.EVALS_DATABRICKS_CLIENT_SECRET_DOGFOOD }}
+          DATABRICKS_AUTH_TYPE: oauth-m2m
+        run: |
+          echo "::group::Reachability"
+          curl -sS -m 20 -o /dev/null -w "host root: HTTP %{http_code}\n" "$DATABRICKS_HOST/" || echo "host root: UNREACHABLE"
+          curl -sS -m 20 -w "\nworkspace OIDC: HTTP %{http_code}\n" "$DATABRICKS_HOST/oidc/.well-known/oauth-authorization-server" || echo "workspace OIDC: UNREACHABLE"
+          echo "::endgroup::"
+          echo "::group::M2M auth check"
+          databricks current-user me --log-level debug || true
+          echo "::endgroup::"
+
+      - name: Trigger eval pipeline
+        id: trigger
+        env:
+          DATABRICKS_HOST: https://dogfood.staging.databricks.com
+          # Authenticate via OAuth M2M as the apps-mcp-evals-runner service principal.
+          DATABRICKS_CLIENT_ID: ${{ secrets.EVALS_DATABRICKS_CLIENT_ID_DOGFOOD }}
+          DATABRICKS_CLIENT_SECRET: ${{ secrets.EVALS_DATABRICKS_CLIENT_SECRET_DOGFOOD }}
+          DATABRICKS_AUTH_TYPE: oauth-m2m
+          # Use the real PR head commit — never the synthetic merge commit — so
+          # the pipeline can pull the code.
+          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+        # --no-wait: fire-and-forget; run-now otherwise blocks until the eval finishes.
+        # The JSON response carries run_id; the comment step reads it as RUN_JSON.
+        # NOTE(review): the `--json "$(cat <...` payload below looks truncated — confirm.
+        run: |
+          run_json=$(databricks jobs run-now --no-wait --output json --json "$(cat <> "$GITHUB_OUTPUT"
+
+      - name: Post / update "Eval running" comment
+        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+        env:
+          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          RUN_JSON: ${{ steps.trigger.outputs.run_json }}
+        with:
+          script: |
+            const upsert = require('./.github/scripts/upsert-eval-comment.cjs');
+            await upsert({ github, context });