diff --git a/.github/scripts/upsert-eval-comment.cjs b/.github/scripts/upsert-eval-comment.cjs
new file mode 100644
index 000000000..8bbf970c8
--- /dev/null
+++ b/.github/scripts/upsert-eval-comment.cjs
@@ -0,0 +1,72 @@
+/**
+ * Upserts a sticky "Eval running" comment on a PR after the dogfood eval
+ * pipeline has been launched.
+ *
+ * Invoked via `actions/github-script`. Inputs come from environment vars:
+ * PR_NUMBER - the pull request number
+ * HEAD_SHA - the commit the eval was launched for
+ * RUN_JSON - raw JSON from `databricks jobs run-now` (used to link the run)
+ */
+
+const MARKER = "[//]: # (dogfood-eval-status)"; // invisible Markdown comment; must be non-empty, since "".includes-style matching hits every comment
+const EVALS_MONITOR_URL =
+  "https://evals-monitor-6051921418418893.staging.aws.databricksapps.com"; // base URL of the results link
+const DATABRICKS_HOST = "https://dogfood.staging.databricks.com"; // base URL of the job run link
+const JOB_ID = "398185277057549"; // job segment of the run link
+const WORKSPACE_ID = "6051921418418893"; // sent as the `o=` query parameter of the run link
+
+/**
+ * Creates or updates the sticky "Eval running" comment on the pull request.
+ *
+ * @param {{github: object, context: object}} args - as supplied by `actions/github-script`.
+ * @returns {Promise<void>} Resolves once the comment has been created or updated.
+ * @throws {Error} If PR_NUMBER is not a positive integer; GitHub API errors propagate.
+ */
+module.exports = async ({ github, context }) => {
+  const { owner, repo } = context.repo;
+  const issue_number = Number(process.env.PR_NUMBER);
+  if (!Number.isInteger(issue_number) || issue_number <= 0) {
+    // Fail here with a clear message instead of sending NaN to the API.
+    throw new Error(`PR_NUMBER must be a positive integer, got "${process.env.PR_NUMBER}"`);
+  }
+  const shortSha = (process.env.HEAD_SHA || "").substring(0, 7);
+
+  // The run link is optional: a missing or malformed RUN_JSON only drops the
+  // link (with a warning in the step log); the comment is still posted.
+  let runId;
+  try {
+    runId = JSON.parse(process.env.RUN_JSON || "{}")?.run_id;
+  } catch (err) {
+    console.warn(`RUN_JSON is not valid JSON; omitting job run link: ${err.message}`);
+  }
+
+  const links = [
+    `[View results in evals-monitor →](${EVALS_MONITOR_URL}/prs/appkit/${issue_number})`,
+  ];
+  if (runId) {
+    links.push(
+      `[job run ↗](${DATABRICKS_HOST}/jobs/${JOB_ID}/runs/${runId}?o=${WORKSPACE_ID})`,
+    );
+  }
+
+  const body = [
+    MARKER,
+    "### ⏳ Eval running",
+    "",
+    `Eval pipeline launched for commit \`${shortSha}\`.`,
+    "",
+    links.join(" · "),
+  ].join("\n");
+
+  // Scan every page of PR comments for one that already carries the marker.
+  const comments = await github.paginate(github.rest.issues.listComments, {
+    owner, repo, issue_number, per_page: 100,
+  });
+  const existing = comments.find((c) => c.body?.includes(MARKER));
+
+  if (existing) {
+    await github.rest.issues.updateComment({ owner, repo, comment_id: existing.id, body });
+  } else {
+    await github.rest.issues.createComment({ owner, repo, issue_number, body });
+  }
+};
diff --git a/.github/workflows/eval-trigger.yml b/.github/workflows/eval-trigger.yml
new file mode 100644
index 000000000..660c8dcbb
--- /dev/null
+++ b/.github/workflows/eval-trigger.yml
@@ -0,0 +1,110 @@
+name: Eval Trigger
+
+# Launches the dogfood eval pipeline for a PR when the `run-evals` label is
+# present, and re-launches it on every new commit while the label stays on.
+#
+# Uses `pull_request` (not `pull_request_target`): the workflow file that runs
+# is the PR branch's version (so it's testable on the feature branch), and repo
+# secrets are withheld from fork PRs, so an external contributor cannot exfiltrate
+# the Databricks credentials even by editing this file. Auth is OAuth M2M as the
+# `apps-mcp-evals-runner` service principal (DATABRICKS_CLIENT_ID/SECRET). Those
+# credentials are set only on the trigger step and the temporary diagnostic step,
+# never on the comment step that runs PR-authored script code.
+on:
+ pull_request:
+ types: [labeled, synchronize]
+
+# `id-token: write` is required for the databricks-protected-runner-group's
+# egress to internal Databricks hosts (incl. dogfood.staging) — without it the
+# egress proxy returns 403 "RBAC: access denied" for every request. Matches the
+# other workflows in this repo (ci.yml, prepare-release.yml, docs-deploy.yml).
+permissions:
+ contents: read
+ pull-requests: write
+ id-token: write
+
+# Latest push wins: a newer commit cancels the in-flight run for an older one,
+# so the sticky comment always reflects the most recently triggered commit.
+concurrency:
+ group: eval-trigger-${{ github.event.pull_request.number }}
+ cancel-in-progress: true
+
+jobs:
+ trigger-evals:
+ name: Trigger eval pipeline
+ # Run when the `run-evals` label is added, or on a new commit while the PR
+ # already carries the label.
+ if: >-
+ (github.event.action == 'labeled' && github.event.label.name == 'run-evals') ||
+ (github.event.action == 'synchronize' && contains(github.event.pull_request.labels.*.name, 'run-evals'))
+ runs-on:
+ group: databricks-protected-runner-group
+ labels: linux-ubuntu-latest
+ steps:
+ - name: Check out repository
+ uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+
+ - name: Install Databricks CLI
+ uses: databricks/setup-cli@772863b94473abd8b0cacbec8b6f80fa0cbe1136 # v1.2.1
+
+ # TEMPORARY diagnostic — remove once confirmed green. Probes dogfood
+ # reachability + OIDC discovery and a forced M2M authenticated call.
+ - name: Diagnose Databricks connectivity + auth
+ env:
+ DATABRICKS_HOST: https://dogfood.staging.databricks.com
+ DATABRICKS_CLIENT_ID: ${{ secrets.EVALS_DATABRICKS_CLIENT_ID_DOGFOOD }}
+ DATABRICKS_CLIENT_SECRET: ${{ secrets.EVALS_DATABRICKS_CLIENT_SECRET_DOGFOOD }}
+ DATABRICKS_AUTH_TYPE: oauth-m2m
+ run: |
+ echo "::group::Reachability"
+ curl -sS -m 20 -o /dev/null -w "host root: HTTP %{http_code}\n" "$DATABRICKS_HOST/" || echo "host root: UNREACHABLE"
+ curl -sS -m 20 -w "\nworkspace OIDC: HTTP %{http_code}\n" "$DATABRICKS_HOST/oidc/.well-known/oauth-authorization-server" || echo "workspace OIDC: UNREACHABLE"
+ echo "::endgroup::"
+ echo "::group::M2M auth check"
+ databricks current-user me --log-level debug || true
+ echo "::endgroup::"
+
+ - name: Trigger eval pipeline
+ id: trigger
+ env:
+ DATABRICKS_HOST: https://dogfood.staging.databricks.com
+ # OAuth M2M as the apps-mcp-evals-runner service principal.
+ DATABRICKS_CLIENT_ID: ${{ secrets.EVALS_DATABRICKS_CLIENT_ID_DOGFOOD }}
+ DATABRICKS_CLIENT_SECRET: ${{ secrets.EVALS_DATABRICKS_CLIENT_SECRET_DOGFOOD }}
+ DATABRICKS_AUTH_TYPE: oauth-m2m
+ # The real PR head commit — never the synthetic merge commit — so the
+ # pipeline can pull the code.
+ HEAD_SHA: ${{ github.event.pull_request.head.sha }}
+ PR_NUMBER: ${{ github.event.pull_request.number }}
+ # --no-wait: fire-and-forget; run-now otherwise blocks until the eval
+ # finishes. The JSON response carries run_id, which we forward to the
+ # comment step to link the run (no extra API call).
+ run: |
+ run_json=$(databricks jobs run-now --no-wait --output json --json "$(cat <> "$GITHUB_OUTPUT"
+
+ - name: Post / update "Eval running" comment
+ uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ env:
+ HEAD_SHA: ${{ github.event.pull_request.head.sha }}
+ PR_NUMBER: ${{ github.event.pull_request.number }}
+ RUN_JSON: ${{ steps.trigger.outputs.run_json }}
+ with:
+ script: |
+ const upsert = require('./.github/scripts/upsert-eval-comment.cjs');
+ await upsert({ github, context });