Skip to content

fix: correct class name reference in SDK evaluation script [remote-eval] #9

fix: correct class name reference in SDK evaluation script [remote-eval]

fix: correct class name reference in SDK evaluation script [remote-eval] #9

Workflow file for this run

name: Remote Evaluations (SDK)

# Codebuff evaluations driven exclusively through the public SDK.
# A containerized backend environment is created and the evaluations are
# executed via CodebuffClient.
#
# How to trigger:
#   - single run:  put [remote-eval] in the commit message, or dispatch manually
#   - matrix run:  put [remote-eval-all] in the commit message (parallel evals)
on:
  push:
    # Every branch is eligible; the jobs inspect the commit message to decide
    # whether an evaluation actually runs.
    branches:
      - '**'
  workflow_dispatch:
    inputs:
      eval_file:
        type: string
        required: false
        default: 'eval-codebuff.json'
        description: 'Eval file to run (e.g., eval-codebuff.json)'
      commit_index:
        # Kept as a string (not a number) and quoted so it is passed through
        # to the evaluation script verbatim.
        type: string
        required: false
        default: '0'
        description: 'Commit index to evaluate (0-based)'
      mode:
        type: choice
        required: false
        default: 'bypass'
        description: 'Auth mode (seed or bypass)'
        options:
          - 'bypass'
          - 'seed'
jobs:
  # Single evaluation run. Triggered by [remote-eval] in the commit message
  # (case-insensitive) or by a manual workflow_dispatch. Every step after the
  # commit-message check is gated on steps.check_commit.outputs.should_run_evals.
  remote-evals:
    runs-on: ubuntu-latest
    timeout-minutes: 60
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Publishes should_run_evals=true|false for the later steps to gate on.
      - name: Check commit message
        id: check_commit
        env:
          # Handed over through env so the commit text is never parsed as shell.
          COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
        run: |
          shopt -s nocasematch
          # GITHUB_EVENT_NAME is the runner-provided equivalent of
          # github.event_name; reading it avoids expression interpolation
          # inside the script body.
          if [[ "$COMMIT_MESSAGE" == *"[remote-eval]"* ]] || [[ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]]; then
            echo "should_run_evals=true" >> "$GITHUB_OUTPUT"
            echo "Will run remote evaluations"
          else
            echo "should_run_evals=false" >> "$GITHUB_OUTPUT"
            echo "Skipping remote evaluations (add [remote-eval] to commit message to trigger)"
          fi

      - name: Set up Bun
        if: steps.check_commit.outputs.should_run_evals == 'true'
        uses: oven-sh/setup-bun@v2
        with:
          bun-version: '1.2.12'

      - name: Install dependencies
        if: steps.check_commit.outputs.should_run_evals == 'true'
        run: bun install --frozen-lockfile

      # Fails fast when any file the evaluation relies on is missing.
      - name: Validate environment for SDK evaluation
        if: steps.check_commit.outputs.should_run_evals == 'true'
        run: |
          echo "🔍 Validating SDK evaluation environment..."
          echo " Checking for required files..."
          test -f evals/scripts/run-remote-parameterized.sh || { echo "❌ Missing run-remote-parameterized.sh"; exit 1; }
          test -f evals/git-evals/run-single-eval.ts || { echo "❌ Missing run-single-eval.ts"; exit 1; }
          test -f evals/docker-compose.evals.yml || { echo "❌ Missing docker-compose.evals.yml"; exit 1; }
          echo " Checking SDK package..."
          bun --version
          echo "✅ Environment validation passed"

      - name: Run remote evaluation
        if: steps.check_commit.outputs.should_run_evals == 'true'
        env:
          # `inputs` is empty on push events, hence the fallbacks.
          EVAL_FILE: ${{ inputs.eval_file || 'eval-codebuff.json' }}
          COMMIT_INDEX: ${{ inputs.commit_index || '0' }}
          MODE: ${{ inputs.mode || 'bypass' }}
          CODEBUFF_WEBSOCKET_URL: 'ws://127.0.0.1:4242/ws'
          CODEBUFF_SKIP_BINARY_CHECK: '1'
        run: |
          echo "🚀 Remote Evaluation Starting (SDK Mode)"
          echo "📋 GitHub Actions Environment:"
          # Runner-provided variables are used instead of inline expressions:
          # the ref is a branch name chosen by whoever pushed, and must not be
          # spliced into the script text.
          echo " Runner: $RUNNER_OS"
          echo " SHA: $GITHUB_SHA"
          echo " Ref: $GITHUB_REF"
          echo " Event: $GITHUB_EVENT_NAME"
          echo " Eval File: $EVAL_FILE"
          echo " Commit Index: $COMMIT_INDEX"
          echo " Mode: $MODE"
          echo "🐳 Docker Info:"
          docker --version
          docker compose version
          echo "💾 Disk Space:"
          df -h
          echo "🔧 Starting SDK-based evaluation..."
          bash evals/scripts/run-remote-parameterized.sh "$MODE" "$EVAL_FILE" "$COMMIT_INDEX"

      # Best-effort diagnostics; every command tolerates failure.
      - name: Dump logs on failure
        if: failure() && steps.check_commit.outputs.should_run_evals == 'true'
        env:
          # Step-level env is not shared between steps, so the SDK settings are
          # repeated here; otherwise the report would always say "not set".
          CODEBUFF_WEBSOCKET_URL: 'ws://127.0.0.1:4242/ws'
          CODEBUFF_SKIP_BINARY_CHECK: '1'
        run: |
          echo "❌ SDK Evaluation failed - dumping diagnostic information"
          echo "🔧 SDK Environment:"
          echo " CODEBUFF_WEBSOCKET_URL: ${CODEBUFF_WEBSOCKET_URL:-not set}"
          echo " CODEBUFF_SKIP_BINARY_CHECK: ${CODEBUFF_SKIP_BINARY_CHECK:-not set}"
          # Report only presence. Combining ${VAR:+[SET]} with ${VAR:-[NOT SET]}
          # would print the key itself whenever it is set.
          if [[ -n "${CODEBUFF_API_KEY:-}" ]]; then
            echo " CODEBUFF_API_KEY: [SET]"
          else
            echo " CODEBUFF_API_KEY: [NOT SET]"
          fi
          echo "🐳 Docker containers status:"
          docker ps -a || true
          echo "📋 Backend container logs:"
          docker compose -f evals/docker-compose.evals.yml logs backend --tail=200 || true
          echo "📋 Database container logs:"
          docker compose -f evals/docker-compose.evals.yml logs db --tail=100 || true
          echo "💾 Disk usage:"
          df -h || true
          echo "🧠 Memory usage:"
          free -h || true
          echo "📁 Evaluation files:"
          ls -la evals/git-evals/ || true
          ls -la evals/scripts/ || true

      - name: Upload evaluation logs
        if: always() && steps.check_commit.outputs.should_run_evals == 'true'
        uses: actions/upload-artifact@v4
        with:
          name: remote-eval-logs-${{ github.sha }}
          path: |
            evals/test-repos/
            debug/
            ~/.cache/bun/
          retention-days: 7

      # Runs regardless of outcome so containers and volumes never linger.
      - name: Cleanup containers
        if: always() && steps.check_commit.outputs.should_run_evals == 'true'
        run: |
          echo "🧹 Final cleanup - removing all containers and volumes..."
          docker compose -f evals/docker-compose.evals.yml down -v || true
          docker system prune -f || true
          echo "✅ Cleanup completed"
# Optional: Matrix job to run multiple evaluations in parallel.
# Runs only when the head commit message contains [remote-eval-all]; each
# matrix entry pairs an eval file with a commit index and always uses the
# "bypass" auth mode.
  remote-evals-matrix:
    runs-on: ubuntu-latest
    timeout-minutes: 90
    if: contains(github.event.head_commit.message, '[remote-eval-all]')
    strategy:
      # Let the remaining evaluations finish even if one entry fails.
      fail-fast: false
      matrix:
        eval:
          - file: 'eval-codebuff.json'
            index: '0'
          - file: 'eval-codebuff.json'
            index: '1'
          - file: 'eval-manifold.json'
            index: '0'
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Bun
        uses: oven-sh/setup-bun@v2
        with:
          bun-version: '1.2.12'

      - name: Install dependencies
        run: bun install --frozen-lockfile

      # Fails fast when any file the evaluation relies on is missing.
      - name: Validate environment for SDK evaluation
        run: |
          echo "🔍 Validating SDK evaluation environment for matrix job..."
          test -f evals/scripts/run-remote-parameterized.sh || { echo "❌ Missing run-remote-parameterized.sh"; exit 1; }
          test -f evals/git-evals/run-single-eval.ts || { echo "❌ Missing run-single-eval.ts"; exit 1; }
          test -f evals/docker-compose.evals.yml || { echo "❌ Missing docker-compose.evals.yml"; exit 1; }
          echo "✅ Matrix environment validation passed"

      - name: Run evaluation matrix
        env:
          EVAL_FILE: ${{ matrix.eval.file }}
          COMMIT_INDEX: ${{ matrix.eval.index }}
          CODEBUFF_WEBSOCKET_URL: 'ws://127.0.0.1:4242/ws'
          CODEBUFF_SKIP_BINARY_CHECK: '1'
        run: |
          echo "🚀 Running matrix evaluation (SDK Mode)..."
          bash evals/scripts/run-remote-parameterized.sh "bypass" "$EVAL_FILE" "$COMMIT_INDEX"

      # Best-effort diagnostics; every command tolerates failure.
      - name: Dump matrix logs on failure
        if: failure()
        env:
          # Step-level env is not shared between steps; without these the
          # "Matrix job details" line would print empty values.
          EVAL_FILE: ${{ matrix.eval.file }}
          COMMIT_INDEX: ${{ matrix.eval.index }}
        run: |
          echo "❌ Matrix SDK Evaluation failed - dumping diagnostic information"
          echo "🔧 Matrix job details: File=$EVAL_FILE, Index=$COMMIT_INDEX"
          echo "🐳 Docker containers status:"
          docker ps -a || true
          echo "📋 Container logs:"
          docker compose -f evals/docker-compose.evals.yml logs --tail=100 || true

      - name: Upload matrix evaluation results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          # File and index are part of the name so each matrix entry uploads
          # a distinctly named artifact.
          name: remote-eval-matrix-${{ matrix.eval.file }}-${{ matrix.eval.index }}-${{ github.sha }}
          path: |
            evals/test-repos/
            debug/
          retention-days: 7

      # Runs regardless of outcome so containers and volumes never linger.
      - name: Cleanup containers
        if: always()
        run: |
          docker compose -f evals/docker-compose.evals.yml down -v || true
          docker system prune -f || true