Skip to content

Commit 21d92d2

Browse files
committed
feat: convert evaluation infrastructure to SDK-only [remote-eval]
1 parent 94be8b0 commit 21d92d2

11 files changed

Lines changed: 1356 additions & 140 deletions

.github/workflows/remote-evals.yml

Lines changed: 52 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
1-
name: Remote Evaluations
1+
name: Remote Evaluations (SDK)
2+
3+
# This workflow runs Codebuff evaluations using the public SDK exclusively.
4+
# It creates a containerized backend environment and runs evaluations via CodebuffClient.
5+
# Trigger: Add [remote-eval] to commit message or use workflow_dispatch
6+
# Matrix mode: Add [remote-eval-all] to commit message for parallel evaluations
27

38
on:
49
push:
@@ -57,14 +62,28 @@ jobs:
5762
if: steps.check_commit.outputs.should_run_evals == 'true'
5863
run: bun install --frozen-lockfile
5964

65+
- name: Validate environment for SDK evaluation
66+
if: steps.check_commit.outputs.should_run_evals == 'true'
67+
run: |
68+
echo "🔍 Validating SDK evaluation environment..."
69+
echo " Checking for required files..."
70+
test -f evals/scripts/run-remote-parameterized.sh || { echo "❌ Missing run-remote-parameterized.sh"; exit 1; }
71+
test -f evals/git-evals/run-single-eval.ts || { echo "❌ Missing run-single-eval.ts"; exit 1; }
72+
test -f evals/docker-compose.evals.yml || { echo "❌ Missing docker-compose.evals.yml"; exit 1; }
73+
echo " Checking SDK package..."
74+
bun --version
75+
echo "✅ Environment validation passed"
76+
6077
- name: Run remote evaluation
6178
if: steps.check_commit.outputs.should_run_evals == 'true'
6279
env:
6380
EVAL_FILE: ${{ inputs.eval_file || 'eval-codebuff.json' }}
6481
COMMIT_INDEX: ${{ inputs.commit_index || '0' }}
6582
MODE: ${{ inputs.mode || 'bypass' }}
83+
CODEBUFF_WEBSOCKET_URL: "ws://127.0.0.1:4242/ws"
84+
CODEBUFF_SKIP_BINARY_CHECK: "1"
6685
run: |
67-
echo "🚀 Remote Evaluation Starting"
86+
echo "🚀 Remote Evaluation Starting (SDK Mode)"
6887
echo "📋 GitHub Actions Environment:"
6988
echo " Runner: ${{ runner.os }}"
7089
echo " SHA: ${{ github.sha }}"
@@ -78,13 +97,17 @@ jobs:
7897
docker compose version
7998
echo "💾 Disk Space:"
8099
df -h
81-
echo "🔧 Starting evaluation..."
100+
echo "🔧 Starting SDK-based evaluation..."
82101
bash evals/scripts/run-remote-parameterized.sh "$MODE" "$EVAL_FILE" "$COMMIT_INDEX"
83102
84103
- name: Dump logs on failure
85104
if: failure() && steps.check_commit.outputs.should_run_evals == 'true'
86105
run: |
87-
echo "❌ Evaluation failed - dumping diagnostic information"
106+
echo "❌ SDK Evaluation failed - dumping diagnostic information"
107+
echo "🔧 SDK Environment:"
108+
echo " CODEBUFF_WEBSOCKET_URL: ${CODEBUFF_WEBSOCKET_URL:-not set}"
109+
echo " CODEBUFF_SKIP_BINARY_CHECK: ${CODEBUFF_SKIP_BINARY_CHECK:-not set}"
110+
echo " CODEBUFF_API_KEY: ${CODEBUFF_API_KEY:+[SET]}${CODEBUFF_API_KEY:-[NOT SET]}"
88111
echo "🐳 Docker containers status:"
89112
docker ps -a || true
90113
echo "📋 Backend container logs:"
@@ -95,6 +118,9 @@ jobs:
95118
df -h || true
96119
echo "🧠 Memory usage:"
97120
free -h || true
121+
echo "📁 Evaluation files:"
122+
ls -la evals/git-evals/ || true
123+
ls -la evals/scripts/ || true
98124
99125
- name: Upload evaluation logs
100126
if: always() && steps.check_commit.outputs.should_run_evals == 'true'
@@ -119,7 +145,7 @@ jobs:
119145
remote-evals-matrix:
120146
runs-on: ubuntu-latest
121147
timeout-minutes: 90
122-
if: contains(github.event.head_commit.message, '[remote-eval-all]') || (github.event_name == 'workflow_dispatch' && inputs.mode == 'matrix')
148+
if: contains(github.event.head_commit.message, '[remote-eval-all]')
123149

124150
strategy:
125151
fail-fast: false
@@ -141,14 +167,34 @@ jobs:
141167
- name: Install dependencies
142168
run: bun install --frozen-lockfile
143169

170+
- name: Validate environment for SDK evaluation
171+
run: |
172+
echo "🔍 Validating SDK evaluation environment for matrix job..."
173+
test -f evals/scripts/run-remote-parameterized.sh || { echo "❌ Missing run-remote-parameterized.sh"; exit 1; }
174+
test -f evals/git-evals/run-single-eval.ts || { echo "❌ Missing run-single-eval.ts"; exit 1; }
175+
test -f evals/docker-compose.evals.yml || { echo "❌ Missing docker-compose.evals.yml"; exit 1; }
176+
echo "✅ Matrix environment validation passed"
177+
144178
- name: Run evaluation matrix
145179
env:
146180
EVAL_FILE: ${{ matrix.eval.file }}
147181
COMMIT_INDEX: ${{ matrix.eval.index }}
182+
CODEBUFF_WEBSOCKET_URL: "ws://127.0.0.1:4242/ws"
183+
CODEBUFF_SKIP_BINARY_CHECK: "1"
148184
run: |
149-
echo "🚀 Running matrix evaluation..."
185+
echo "🚀 Running matrix evaluation (SDK Mode)..."
150186
bash evals/scripts/run-remote-parameterized.sh "bypass" "$EVAL_FILE" "$COMMIT_INDEX"
151187
188+
- name: Dump matrix logs on failure
189+
if: failure()
190+
run: |
191+
echo "❌ Matrix SDK Evaluation failed - dumping diagnostic information"
192+
echo "🔧 Matrix job details: File=$EVAL_FILE, Index=$COMMIT_INDEX"
193+
echo "🐳 Docker containers status:"
194+
docker ps -a || true
195+
echo "📋 Container logs:"
196+
docker compose -f evals/docker-compose.evals.yml logs --tail=100 || true
197+
152198
- name: Upload matrix evaluation results
153199
if: always()
154200
uses: actions/upload-artifact@v4

0 commit comments

Comments
(0)