Skip to content

Commit da81626

Browse files
infra(evals): add remote eval Docker setup (PG16, backend on 127.0.0.1:4242), WS-readiness healthcheck, Drizzle seeding, SDK WS override/skip-binary flag, and CI workflow to run single eval.
This enables isolated, reproducible remote eval runs in CI with seeded auth and explicit WS target. 🤖 Generated with Codebuff Co-Authored-By: Codebuff <noreply@codebuff.com>
1 parent 257cb37 commit da81626

13 files changed

Lines changed: 721 additions & 15 deletions

File tree

.github/workflows/remote-evals.yml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
# Manually-triggered workflow that runs a single remote eval through
# evals/scripts/run-remote.sh.
name: remote-evals

on:
  workflow_dispatch:

# The job only reads the checked-out repository.
permissions:
  contents: read

jobs:
  remote-evals:
    runs-on: ubuntu-latest
    # Bound the run so a hung container or eval cannot hold the runner
    # until the platform default timeout.
    timeout-minutes: 30
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Bun
        uses: oven-sh/setup-bun@v1
        with:
          bun-version: 1.1.34

      - name: Install dependencies
        run: bun install --frozen-lockfile

      - name: Make scripts executable
        run: chmod +x evals/scripts/*.sh

      - name: Run remote eval
        run: evals/scripts/run-remote.sh --prompt "Say hi from CI and print the working directory" --max-steps 10

backend/src/index.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import {
1919
sendRequestReconnect,
2020
waitForAllClientsDisconnected,
2121
listen as webSocketListen,
22+
isWebsocketReady,
2223
} from './websockets/server'
2324

2425
const app = express()
@@ -31,7 +32,10 @@ app.get('/', (req, res) => {
3132
})
3233

3334
app.get('/healthz', (req, res) => {
34-
res.send('ok')
35+
if (!isWebsocketReady()) {
36+
return res.status(503).send('starting')
37+
}
38+
return res.send('ok')
3539
})
3640

3741
app.post('/api/usage', usageHandler)

backend/src/websockets/auth.ts

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,18 @@ export async function getUserIdFromAuthToken(
2525
export async function getUserInfoFromAuthToken(
2626
authToken: string,
2727
): Promise<UserInfo | undefined> {
28+
// Test-only bypass for remote evals
29+
if (process.env.NODE_ENV === 'test') {
30+
const bypass = process.env.CODEBUFF_TEST_AUTH_TOKEN
31+
if (bypass && authToken === bypass) {
32+
return {
33+
id: 'test-user',
34+
email: 'evals@test.local',
35+
discord_id: null,
36+
}
37+
}
38+
}
39+
2840
const user = await db
2941
.select({
3042
id: schema.user.id,
@@ -37,5 +49,5 @@ export async function getUserInfoFromAuthToken(
3749
.limit(1)
3850
.then((rows) => rows[0])
3951

40-
return user
52+
return user ?? undefined
4153
}

backend/src/websockets/server.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import type { Server as HttpServer } from 'node:http'
1414
import type { RawData, WebSocket } from 'ws'
1515

1616
export const SWITCHBOARD = new Switchboard()
17+
export let WS_READY = false
1718

1819
// if a connection doesn't ping for this long, we assume the other side is toast
1920
const CONNECTION_TIMEOUT_MS = 60 * 1000
@@ -87,6 +88,7 @@ export function listen(server: HttpServer, path: string) {
8788
let deadConnectionCleaner: NodeJS.Timeout | undefined
8889
wss.on('listening', () => {
8990
logger.info(`Web socket server listening on ${path}.`)
91+
WS_READY = true
9092
deadConnectionCleaner = setInterval(function ping() {
9193
const now = Date.now()
9294
try {
@@ -175,3 +177,7 @@ export function sendRequestReconnect() {
175177
export function waitForAllClientsDisconnected() {
176178
return SWITCHBOARD.waitForAllClientsDisconnected()
177179
}
180+
/**
 * Reports whether the WebSocket server has emitted its `listening` event.
 *
 * @returns the current value of the module-level `WS_READY` flag.
 */
export function isWebsocketReady(): boolean {
  return WS_READY
}

evals/backend.Dockerfile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
# Backend image for remote eval runs. The build context is the repository
# root (see evals/docker-compose.evals.yml), so the whole repo is copied in.
FROM oven/bun:1.1.34 AS base
WORKDIR /app
COPY . .
# Install exactly what the lockfile pins.
RUN bun install --frozen-lockfile
EXPOSE 4242
# Run the backend package's "dev" script.
CMD ["bun", "--cwd", "backend", "dev"]

evals/docker-compose.evals.yml

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
version: '3.9'
services:
  # Postgres used only for eval runs. Durability settings are turned off
  # for speed.
  db:
    image: postgres:16-alpine
    environment:
      POSTGRES_USER: codebuff
      POSTGRES_PASSWORD: codebuff
      POSTGRES_DB: codebuff
    command: [
      "postgres",
      "-c", "fsync=off",
      "-c", "synchronous_commit=off",
      "-c", "full_page_writes=off"
    ]
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U codebuff -d codebuff"]
      interval: 5s
      timeout: 3s
      retries: 20

  # Backend built from the repository root; published on loopback only.
  backend:
    build:
      context: ..
      dockerfile: ./evals/backend.Dockerfile
    environment:
      DATABASE_URL: postgresql://codebuff:codebuff@db:5432/codebuff
      PORT: "4242"
      NODE_ENV: production
    depends_on:
      db:
        condition: service_healthy
    ports:
      - "127.0.0.1:4242:4242"
    healthcheck:
      # Probe with the bun binary that the image is built around rather than
      # assuming curl is installed (the backend Dockerfile never installs it).
      test:
        - CMD
        - bun
        - -e
        - "fetch('http://localhost:4242/healthz').then((r) => process.exit(r.ok ? 0 : 1)).catch(() => process.exit(1))"
      interval: 5s
      timeout: 3s
      retries: 30

  # One-shot job that runs the seed script against the db service.
  # The repository is mounted read-only at /app.
  seeder:
    image: oven/bun:1.1.34
    working_dir: /app
    volumes:
      - ..:/app:ro
    environment:
      DATABASE_URL: postgresql://codebuff:codebuff@db:5432/codebuff
    entrypoint: ["bun", "run", "evals/seeds/seed-evals.ts"]
    depends_on:
      db:
        condition: service_healthy

evals/scripts/run-remote.sh

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env bash
# Starts the eval Docker stack (db + backend), seeds an API key, runs a
# single eval against the local backend, and tears the stack down.
#
# Usage: run-remote.sh [args passed through to run-single-eval.ts]
#   With no arguments a default prompt and --max-steps 10 are used.
#
# NOTE: the eval script path below is relative to the caller's working
# directory; the CI workflow invokes this script from the repository root.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
COMPOSE_FILE="$SCRIPT_DIR/../docker-compose.evals.yml"

export CODEBUFF_WEBSOCKET_URL="ws://127.0.0.1:4242/ws"
export CODEBUFF_SKIP_BINARY_CHECK=1

# Tear the stack down on every exit path (success, failed health check,
# failed seeding, failed eval). The script's own exit status is preserved.
cleanup() {
  docker compose -f "$COMPOSE_FILE" down -v || true
}
trap cleanup EXIT

# Start services
docker compose -f "$COMPOSE_FILE" up -d --build db backend
"$SCRIPT_DIR/wait-for-healthz.sh" "http://127.0.0.1:4242/healthz" 90 || {
  echo 'Healthz failed; dumping backend logs...'
  docker compose -f "$COMPOSE_FILE" logs backend --tail=200 || true
  exit 1
}

# Drizzle seed (prints CODEBUFF_API_KEY=... as its last stdout line)
KEY_LINE="$(docker compose -f "$COMPOSE_FILE" run --rm -T seeder | tail -n1)"
# Drop a trailing carriage return in case the output passed through a TTY.
KEY_LINE="${KEY_LINE%$'\r'}"
if [[ "$KEY_LINE" != CODEBUFF_API_KEY=?* ]]; then
  echo "Seeder did not print a CODEBUFF_API_KEY line (got: '$KEY_LINE')" >&2
  exit 1
fi
export CODEBUFF_API_KEY="${KEY_LINE#CODEBUFF_API_KEY=}"

# Run the eval (args are passed through; a default prompt is used when none are given)
if [[ $# -eq 0 ]]; then
  bun scripts/git-evals/run-single-eval.ts --prompt "Say hi and print the working directory" --max-steps 10
else
  bun scripts/git-evals/run-single-eval.ts "$@"
fi

evals/scripts/wait-for-healthz.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env bash
# Polls a URL until it returns a successful HTTP status.
#
# Usage: wait-for-healthz.sh URL [ATTEMPTS]
#   URL       endpoint to probe (required)
#   ATTEMPTS  number of attempts, one second apart (default 60)
# Exits 0 as soon as a probe succeeds, 1 once all attempts have failed.
set -euo pipefail
URL="${1:?Missing URL}"
TIMEOUT="${2:-60}"
for i in $(seq 1 "$TIMEOUT"); do
  # --max-time bounds each probe so a stalled connection cannot block the loop.
  if curl -fsS --max-time 2 "$URL" >/dev/null 2>&1; then
    exit 0
  fi
  echo "waiting for backend... ($i s)"
  sleep 1
done
echo "backend healthz did not become ready in $TIMEOUT seconds" >&2
exit 1

evals/seeds/seed-evals.ts

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
import 'dotenv/config'
import crypto from 'node:crypto'
import { Client } from 'pg'
import { drizzle } from 'drizzle-orm/node-postgres'
import * as schema from '../../common/db/schema'

/**
 * Seeds the eval database with a user and a session token, then prints
 * `CODEBUFF_API_KEY=<token>` as the final stdout line so the calling script
 * can capture it.
 *
 * Reads the connection string from `DATABASE_URL` and exits with status 1
 * when it is missing. Both inserts are best-effort: a failure is reported on
 * stderr (leaving stdout clean for the caller) and does not stop the script.
 * The database client is always closed, including when an error is thrown.
 */
async function main() {
  const DATABASE_URL = process.env.DATABASE_URL
  if (!DATABASE_URL) {
    console.error('Missing DATABASE_URL')
    process.exit(1)
  }
  const client = new Client({ connectionString: DATABASE_URL })
  await client.connect()

  try {
    const db = drizzle(client)

    const userId = 'evals-user'
    const email = 'evals@test.local'
    const token = crypto.randomUUID()

    // Upsert user (adjust to match schema fields)
    try {
      // @ts-ignore - schema types may vary; keep robust
      await db
        .insert(schema.user)
        .values({
          id: userId,
          email,
          // Optional common columns; ignore if not present
          created_at: new Date(),
          updated_at: new Date(),
        })
        .onConflictDoNothing()
    } catch (err: unknown) {
      // Best-effort: surface the failure instead of hiding it.
      console.error('Failed to seed eval user:', err)
    }

    // Upsert session/api token (sessionToken + userId)
    try {
      // @ts-ignore - schema types may vary; keep robust
      await db
        .insert(schema.session)
        .values({
          sessionToken: token,
          userId,
          // Optional: expire in 24h if column exists
          expires: new Date(Date.now() + 24 * 60 * 60 * 1000),
          created_at: new Date(),
        })
        .onConflictDoNothing()
    } catch (err: unknown) {
      // Best-effort: the printed key will not authenticate if this failed.
      console.error('Failed to seed eval session token:', err)
    }

    console.log(`CODEBUFF_API_KEY=${token}`)
  } finally {
    // Release the connection on every path so the process can exit cleanly.
    await client.end()
  }
}

main().catch((err) => {
  console.error(err)
  process.exit(1)
})

0 commit comments

Comments
 (0)