Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions .github/scripts/upsert-eval-comment.cjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/**
* Upserts a sticky "Eval running" comment on a PR after the dogfood eval
* pipeline has been launched.
*
* Invoked via `actions/github-script`. Inputs come from environment vars:
* PR_NUMBER - the pull request number
* HEAD_SHA - the commit the eval was launched for
* RUN_JSON - raw JSON from `databricks jobs run-now` (used to link the run)
*/

const MARKER = "<!-- pr-eval-run -->";
const EVALS_MONITOR_URL =
"https://evals-monitor-6051921418418893.staging.aws.databricksapps.com";
const DATABRICKS_HOST = "https://dogfood.staging.databricks.com";
const JOB_ID = "398185277057549";
const WORKSPACE_ID = "6051921418418893";

module.exports = async ({ github, context }) => {
const { owner, repo } = context.repo;
const issue_number = Number(process.env.PR_NUMBER);
const shortSha = (process.env.HEAD_SHA || "").substring(0, 7);

// run_id comes back in the run-now response, so the run link costs no extra call.
let runId;
try {
runId = JSON.parse(process.env.RUN_JSON || "{}").run_id;
} catch {
runId = undefined;
}

const links = [
`[View results in evals-monitor →](${EVALS_MONITOR_URL}/prs/appkit/${issue_number})`,
];
if (runId) {
links.push(
`<sub>[job run ↗](${DATABRICKS_HOST}/jobs/${JOB_ID}/runs/${runId}?o=${WORKSPACE_ID})</sub>`,
);
}

const body = [
MARKER,
"### ⏳ Eval running",
"",
`Eval pipeline launched for commit \`${shortSha}\`.`,
"",
links.join(" · "),
].join("\n");

const comments = await github.paginate(github.rest.issues.listComments, {
owner,
repo,
issue_number,
per_page: 100,
});
const existing = comments.find((c) => c.body?.includes(MARKER));

if (existing) {
await github.rest.issues.updateComment({
owner,
repo,
comment_id: existing.id,
body,
});
} else {
await github.rest.issues.createComment({
owner,
repo,
issue_number,
body,
});
}
};
110 changes: 110 additions & 0 deletions .github/workflows/eval-trigger.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
name: Eval Trigger

# Launches the dogfood eval pipeline for a PR when the `run-evals` label is
# present, and re-launches it on every new commit while the label stays on.
#
# Uses `pull_request` (not `pull_request_target`): the workflow file that runs
# is the PR branch's version (so it's testable on the feature branch), and repo
# secrets are withheld from fork PRs, so an external contributor cannot exfil
# the Databricks credentials even by editing this file. Auth is OAuth M2M as the
# `apps-mcp-evals-runner` service principal (DATABRICKS_CLIENT_ID/SECRET), and
# those credentials are exposed only to the trigger step, never to the comment
# step that runs PR-authored script code.
on:
  pull_request:
    types: [labeled, synchronize]

# `id-token: write` is required for the databricks-protected-runner-group's
# egress to internal Databricks hosts (incl. dogfood.staging) — without it the
# egress proxy returns 403 "RBAC: access denied" for every request. Matches the
# other workflows in this repo (ci.yml, prepare-release.yml, docs-deploy.yml).
permissions:
  contents: read
  pull-requests: write
  id-token: write

# Latest push wins: a newer commit cancels the in-flight run for an older one,
# so the sticky comment always reflects the most recently triggered commit.
#
# Workflow-level concurrency applies to every run of this workflow, including
# runs started by a `labeled` event for some other label (whose job is then
# skipped by the `if` below). Those no-op runs must not cancel an in-flight
# trigger, so cancellation is enabled only for events that can launch an eval.
concurrency:
  group: eval-trigger-${{ github.event.pull_request.number }}
  cancel-in-progress: ${{ github.event.action != 'labeled' || github.event.label.name == 'run-evals' }}

jobs:
  trigger-evals:
    name: Trigger eval pipeline
    # Run when the `run-evals` label is added, or on a new commit while the PR
    # already carries the label.
    if: >-
      (github.event.action == 'labeled' && github.event.label.name == 'run-evals') ||
      (github.event.action == 'synchronize' && contains(github.event.pull_request.labels.*.name, 'run-evals'))
    runs-on:
      group: databricks-protected-runner-group
      labels: linux-ubuntu-latest
    steps:
      # The comment step below `require`s a script from the checked-out tree.
      - name: Check out repository
        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1

      - name: Install Databricks CLI
        uses: databricks/setup-cli@772863b94473abd8b0cacbec8b6f80fa0cbe1136 # v1.2.1

      # TEMPORARY diagnostic — remove once confirmed green. Probes dogfood
      # reachability + OIDC discovery and a forced M2M authenticated call.
      - name: Diagnose Databricks connectivity + auth
        env:
          DATABRICKS_HOST: https://dogfood.staging.databricks.com
          DATABRICKS_CLIENT_ID: ${{ secrets.EVALS_DATABRICKS_CLIENT_ID_DOGFOOD }}
          DATABRICKS_CLIENT_SECRET: ${{ secrets.EVALS_DATABRICKS_CLIENT_SECRET_DOGFOOD }}
          DATABRICKS_AUTH_TYPE: oauth-m2m
        run: |
          echo "::group::Reachability"
          curl -sS -m 20 -o /dev/null -w "host root: HTTP %{http_code}\n" "$DATABRICKS_HOST/" || echo "host root: UNREACHABLE"
          curl -sS -m 20 -w "\nworkspace OIDC: HTTP %{http_code}\n" "$DATABRICKS_HOST/oidc/.well-known/oauth-authorization-server" || echo "workspace OIDC: UNREACHABLE"
          echo "::endgroup::"
          echo "::group::M2M auth check"
          databricks current-user me --log-level debug || true
          echo "::endgroup::"

      - name: Trigger eval pipeline
        id: trigger
        env:
          DATABRICKS_HOST: https://dogfood.staging.databricks.com
          # OAuth M2M as the apps-mcp-evals-runner service principal.
          DATABRICKS_CLIENT_ID: ${{ secrets.EVALS_DATABRICKS_CLIENT_ID_DOGFOOD }}
          DATABRICKS_CLIENT_SECRET: ${{ secrets.EVALS_DATABRICKS_CLIENT_SECRET_DOGFOOD }}
          DATABRICKS_AUTH_TYPE: oauth-m2m
          # The real PR head commit — never the synthetic merge commit — so the
          # pipeline can pull the code.
          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
        # --no-wait: fire-and-forget; run-now otherwise blocks until the eval
        # finishes. The JSON response carries run_id, which we forward to the
        # comment step to link the run (no extra API call).
        run: |
          run_json=$(databricks jobs run-now --no-wait --output json --json "$(cat <<EOF
          {
            "job_id": 398185277057549,
            "job_parameters": {
              "appkit_ref": "${HEAD_SHA}",
              "prompt_preset": "custom-pr",
              "tags": "appkit_pr:${PR_NUMBER}"
            }
          }
          EOF
          )")
          echo "$run_json"
          {
            echo "run_json<<RUN_JSON_EOF"
            echo "$run_json"
            echo "RUN_JSON_EOF"
          } >> "$GITHUB_OUTPUT"

      - name: Post / update "Eval running" comment
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        env:
          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          RUN_JSON: ${{ steps.trigger.outputs.run_json }}
        with:
          script: |
            const upsert = require('./.github/scripts/upsert-eval-comment.cjs');
            await upsert({ github, context });
Loading