Skip to content

Commit 2c21b43

Browse files
committed
feat: implement remote evaluation infrastructure with Docker Compose
Add comprehensive containerized infrastructure for running Codebuff evaluations in CI/CD environments with full isolation and reliability.
1 parent 77948bb commit 2c21b43

14 files changed

Lines changed: 913 additions & 4 deletions

File tree

.github/workflows/remote-evals.yml

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
# Remote Evaluations
#
# Runs Codebuff evaluations against the Docker Compose stack defined in
# evals/docker-compose.evals.yml.
#
# Triggers:
#   * push to any branch whose head commit message contains "[remote-eval]"
#     (case-insensitive)            -> single evaluation job
#   * push whose head commit message contains "[remote-eval-all]"
#                                   -> matrix job
#   * manual dispatch with mode bypass/seed -> single evaluation job
#   * manual dispatch with mode matrix      -> matrix job
name: Remote Evaluations

on:
  push:
    branches: ['**']
  workflow_dispatch:
    inputs:
      eval_file:
        description: 'Eval file to run (e.g., eval-codebuff.json)'
        required: false
        default: 'eval-codebuff.json'
        type: string
      commit_index:
        description: 'Commit index to evaluate (0-based)'
        required: false
        default: '0'
        type: string
      mode:
        description: 'Auth mode (seed or bypass), or matrix to run the parallel matrix job'
        required: false
        default: 'bypass'
        type: choice
        options:
          - 'bypass'
          - 'seed'
          # Selecting this runs remote-evals-matrix instead of remote-evals.
          - 'matrix'

jobs:
  remote-evals:
    runs-on: ubuntu-latest
    timeout-minutes: 60
    # A manual dispatch in matrix mode is handled by remote-evals-matrix only;
    # "matrix" is not an auth mode the runner script is invoked with here.
    if: github.event_name != 'workflow_dispatch' || inputs.mode != 'matrix'

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Decides whether the remaining steps run. Context values are passed via
      # env rather than interpolated into the script text, so their contents
      # are never parsed as shell code.
      - name: Check commit message
        id: check_commit
        env:
          COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
          EVENT_NAME: ${{ github.event_name }}
        run: |
          shopt -s nocasematch
          if [[ "$COMMIT_MESSAGE" == *"[remote-eval]"* ]] || [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
            echo "should_run_evals=true" >> "$GITHUB_OUTPUT"
            echo "Will run remote evaluations"
          else
            echo "should_run_evals=false" >> "$GITHUB_OUTPUT"
            echo "Skipping remote evaluations (add [remote-eval] to commit message to trigger)"
          fi

      - name: Set up Bun
        if: steps.check_commit.outputs.should_run_evals == 'true'
        uses: oven-sh/setup-bun@v2
        with:
          bun-version: '1.2.12'

      - name: Install dependencies
        if: steps.check_commit.outputs.should_run_evals == 'true'
        run: bun install --frozen-lockfile

      # On push events the inputs context is empty, so the fallbacks apply.
      - name: Run remote evaluation
        if: steps.check_commit.outputs.should_run_evals == 'true'
        env:
          EVAL_FILE: ${{ inputs.eval_file || 'eval-codebuff.json' }}
          COMMIT_INDEX: ${{ inputs.commit_index || '0' }}
          MODE: ${{ inputs.mode || 'bypass' }}
        run: |
          echo "🚀 Starting remote evaluation..."
          bash evals/scripts/run-remote-parameterized.sh "$MODE" "$EVAL_FILE" "$COMMIT_INDEX"

      # always() keeps logs and cleanup running even when the evaluation fails.
      - name: Upload evaluation logs
        if: always() && steps.check_commit.outputs.should_run_evals == 'true'
        uses: actions/upload-artifact@v4
        with:
          name: remote-eval-logs-${{ github.sha }}
          path: |
            evals/test-repos/
            debug/
          retention-days: 7

      - name: Cleanup containers
        if: always() && steps.check_commit.outputs.should_run_evals == 'true'
        run: |
          echo "🧹 Cleaning up Docker containers..."
          docker compose -f evals/docker-compose.evals.yml down -v || true
          docker system prune -f || true

  # Optional: Matrix job to run multiple evaluations in parallel
  remote-evals-matrix:
    runs-on: ubuntu-latest
    timeout-minutes: 90
    if: contains(github.event.head_commit.message, '[remote-eval-all]') || (github.event_name == 'workflow_dispatch' && inputs.mode == 'matrix')

    strategy:
      # Let the remaining matrix entries finish when one evaluation fails.
      fail-fast: false
      matrix:
        eval:
          - file: 'eval-codebuff.json'
            index: '0'
          - file: 'eval-codebuff.json'
            index: '1'
          - file: 'eval-manifold.json'
            index: '0'

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Bun
        uses: oven-sh/setup-bun@v2
        with:
          bun-version: '1.2.12'

      - name: Install dependencies
        run: bun install --frozen-lockfile

      # The matrix always uses the bypass auth mode.
      - name: Run evaluation matrix
        env:
          EVAL_FILE: ${{ matrix.eval.file }}
          COMMIT_INDEX: ${{ matrix.eval.index }}
        run: |
          echo "🚀 Running matrix evaluation..."
          bash evals/scripts/run-remote-parameterized.sh "bypass" "$EVAL_FILE" "$COMMIT_INDEX"

      - name: Upload matrix evaluation results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: remote-eval-matrix-${{ matrix.eval.file }}-${{ matrix.eval.index }}-${{ github.sha }}
          path: |
            evals/test-repos/
            debug/
          retention-days: 7

      - name: Cleanup containers
        if: always()
        run: |
          docker compose -f evals/docker-compose.evals.yml down -v || true
          docker system prune -f || true

backend/src/index.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import {
1919
sendRequestReconnect,
2020
waitForAllClientsDisconnected,
2121
listen as webSocketListen,
22+
isWebSocketReady,
2223
} from './websockets/server'
2324

2425
const app = express()
@@ -31,7 +32,11 @@ app.get('/', (req, res) => {
3132
})
3233

// Readiness probe. Responds 503 'starting' until the WebSocket server reports
// ready, and 'ok' (HTTP 200) afterwards.
app.get('/healthz', (req, res) => {
  if (!isWebSocketReady()) {
    res.status(503).send('starting')
    return
  }
  res.send('ok')
})
3641

3742
app.post('/api/usage', usageHandler)

backend/src/websockets/auth.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@ export interface UserInfo {
1111
export async function getUserIdFromAuthToken(
1212
authToken: string,
1313
): Promise<string | undefined> {
14+
// Test-only auth bypass
15+
const bypass = process.env.CODEBUFF_TEST_AUTH_TOKEN
16+
if (process.env.NODE_ENV === 'test' && bypass && authToken === bypass) {
17+
return 'test-user'
18+
}
19+
1420
const user = await db
1521
.select({ id: schema.user.id })
1622
.from(schema.user)
@@ -25,6 +31,12 @@ export async function getUserIdFromAuthToken(
2531
export async function getUserInfoFromAuthToken(
2632
authToken: string,
2733
): Promise<UserInfo | undefined> {
34+
// Test-only auth bypass
35+
const bypass = process.env.CODEBUFF_TEST_AUTH_TOKEN
36+
if (process.env.NODE_ENV === 'test' && bypass && authToken === bypass) {
37+
return { id: 'test-user', email: 'evals@test.local', discord_id: null }
38+
}
39+
2840
const user = await db
2941
.select({
3042
id: schema.user.id,

backend/src/websockets/server.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ export const SWITCHBOARD = new Switchboard()
1818
// if a connection doesn't ping for this long, we assume the other side is toast
1919
const CONNECTION_TIMEOUT_MS = 60 * 1000
2020

21+
let wsReady = false
22+
2123
export class MessageParseError extends Error {
2224
details?: unknown
2325
constructor(message: string, details?: unknown) {
@@ -87,6 +89,7 @@ export function listen(server: HttpServer, path: string) {
8789
let deadConnectionCleaner: NodeJS.Timeout | undefined
8890
wss.on('listening', () => {
8991
logger.info(`Web socket server listening on ${path}.`)
92+
wsReady = true
9093
deadConnectionCleaner = setInterval(function ping() {
9194
const now = Date.now()
9295
try {
@@ -175,3 +178,7 @@ export function sendRequestReconnect() {
175178
export function waitForAllClientsDisconnected() {
176179
return SWITCHBOARD.waitForAllClientsDisconnected()
177180
}
181+
/**
 * Reports whether the WebSocket server has started listening.
 *
 * @returns the current value of the module-level `wsReady` flag, which starts
 *   out `false` and is set to `true` in the server's 'listening' handler.
 */
export function isWebSocketReady(): boolean {
  return wsReady
}

evals/README.md

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
# Remote Evaluation Infrastructure
2+
3+
This directory contains the infrastructure for running Codebuff evaluations in containerized environments (Docker Compose) for CI/CD and local testing.
4+
5+
## Quick Start
6+
7+
### Option 1: Using Drizzle Seed (Recommended)
8+
```bash
9+
bash evals/scripts/run-remote.sh seed
10+
```
11+
12+
### Option 2: Using Test Auth Bypass (Faster)
13+
```bash
14+
bash evals/scripts/run-remote.sh bypass
15+
```
16+
17+
## Prerequisites
18+
19+
- Docker and Docker Compose
20+
- Bun runtime
21+
- Optional: `npm install -g codebuff` (or set `CODEBUFF_SKIP_BINARY_CHECK=1`)
22+
23+
## Architecture
24+
25+
- **evals/docker-compose.evals.yml**: Orchestrates PostgreSQL database and backend services
26+
- **evals/backend.Dockerfile**: Backend container definition
27+
- **evals/seeds/seed-evals.ts**: Drizzle-based database seeding for test users/sessions
28+
- **evals/scripts/run-remote.sh**: Main runner script with teardown
29+
- **evals/scripts/wait-for-healthz.sh**: Health check waiting utility
30+
31+
## Key Features
32+
33+
### SDK Enhancements
34+
- **Binary Check Skip**: Set `CODEBUFF_SKIP_BINARY_CHECK=1` to skip codebuff CLI requirement
35+
- **WebSocket URL Override**: Set `CODEBUFF_WEBSOCKET_URL=ws://127.0.0.1:4242/ws` to target ephemeral backend
36+
37+
### Backend Enhancements
38+
- **Test Auth Bypass**: Set `CODEBUFF_TEST_AUTH_TOKEN` together with `NODE_ENV=test`; a client that presents exactly that token is authenticated as `test-user` without a database lookup
39+
- **WebSocket-Ready Health Check**: `/healthz` returns 503 until WebSocket server is accepting connections
40+
41+
### Container Strategy
42+
- **Loopback Binding**: Backend bound to `127.0.0.1:4242` only (no public exposure)
43+
- **Optimized PostgreSQL**: Fast settings for CI (fsync=off, etc.)
44+
- **Build Context**: Uses repo root with Dockerfile in evals/ for clean separation
45+
46+
## Environment Variables
47+
48+
- `CODEBUFF_WEBSOCKET_URL`: Override WebSocket URL (e.g., `ws://127.0.0.1:4242/ws`)
49+
- `CODEBUFF_SKIP_BINARY_CHECK=1`: Skip SDK binary presence check
50+
- `CODEBUFF_TEST_AUTH_TOKEN`: Enable test-only auth bypass (when NODE_ENV=test)
51+
- `CODEBUFF_API_KEY`: API key for SDK authentication (set by scripts)
52+
53+
## GitHub Actions Integration
54+
55+
### Automatic Trigger
56+
Include `[remote-eval]` (matched case-insensitively) in the head commit message of a push to any branch to trigger a remote evaluation:
57+
```bash
58+
git commit -m "fix: terminal CWD handling [remote-eval]"
59+
```
60+
61+
### Manual Trigger
62+
Go to Actions → Remote Evaluations → Run workflow:
63+
- **Eval file**: `eval-codebuff.json` (default)
64+
- **Commit index**: `0` (default)
65+
- **Mode**: `bypass` or `seed`
66+
67+
### Matrix Evaluations
68+
Add `[remote-eval-all]` to your commit message to run the predefined evaluation matrix in parallel:
69+
```bash
70+
git commit -m "major: refactor terminal logic [remote-eval-all]"
71+
```
72+
73+
### Workflow Files
74+
- `.github/workflows/remote-evals.yml` - Main remote evaluation workflow
75+
- Uses our containerized infrastructure with Docker Compose
76+
- Uploads artifacts and logs automatically
77+
- Handles cleanup and error reporting
78+
79+
### Usage in CI
80+
81+
```yaml
82+
# Single evaluation
83+
- name: Run remote eval (bypass mode)
84+
run: bash evals/scripts/run-remote-parameterized.sh bypass eval-codebuff.json 0
85+
86+
# With database seeding
87+
- name: Run remote eval (seed mode)
88+
run: bash evals/scripts/run-remote-parameterized.sh seed eval-manifold.json 1
89+
```
90+
91+
## Manual Usage
92+
93+
1. Start services:
94+
```bash
95+
docker compose -f evals/docker-compose.evals.yml up -d --build db backend
96+
```
97+
98+
2. Wait for readiness:
99+
```bash
100+
evals/scripts/wait-for-healthz.sh http://127.0.0.1:4242/healthz 90
101+
```
102+
103+
3. Seed database and capture API key:
104+
```bash
105+
KEY_LINE=$(docker compose -f evals/docker-compose.evals.yml run --rm seeder | tail -n1)
106+
export CODEBUFF_API_KEY="${KEY_LINE#CODEBUFF_API_KEY=}"
107+
```
108+
109+
4. Run evaluation:
110+
```bash
111+
export CODEBUFF_WEBSOCKET_URL=ws://127.0.0.1:4242/ws
112+
export CODEBUFF_SKIP_BINARY_CHECK=1
113+
bun scripts/git-evals/run-single-eval.ts --prompt "Your test prompt"
114+
```
115+
116+
5. Cleanup:
117+
```bash
118+
docker compose -f evals/docker-compose.evals.yml down -v
119+
```
120+
121+
## Troubleshooting
122+
123+
- **Connection Issues**: Check that `CODEBUFF_WEBSOCKET_URL=ws://127.0.0.1:4242/ws` is set
124+
- **Auth Failures**: Verify `CODEBUFF_API_KEY` is properly captured from seeder output
125+
- **Backend Not Ready**: Ensure `/healthz` returns 200 before proceeding; it responds `503 starting` until the WebSocket server is listening
126+
- **Port Conflicts**: Backend binds to `127.0.0.1:4242` - ensure port is available
127+
128+
## Implementation Details
129+
130+
Based on the remote-eval-infra-plan.md specification:
131+
- Monorepo + Bun compatible
132+
- Docker-agnostic backend (Dockerfile lives in evals/)
133+
- Idempotent Drizzle seeding with deterministic IDs
134+
- WS readiness validation in health checks
135+
- Test-only auth bypass for fast smoke tests
136+
- Comprehensive error logging and cleanup

evals/backend.Dockerfile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
# Backend image for the containerized evaluation stack.
# Built with the repository root as the build context, so the whole monorepo is
# copied in and dependencies are installed from the committed lockfile.
#
# The Bun version is kept in step with the `bun-version` pinned in
# .github/workflows/remote-evals.yml so that `bun install --frozen-lockfile`
# runs under the same Bun release in CI and inside the container.
FROM oven/bun:1.2.12 AS base
WORKDIR /app
COPY . .
RUN bun install --frozen-lockfile
# Port the backend listens on.
EXPOSE 4242
# Runs the backend package's `dev` script.
CMD ["bun", "--cwd", "backend", "dev"]

0 commit comments

Comments
 (0)