fix: correct class name reference in SDK evaluation script [remote-eval] #9

Workflow file for this run

.github/workflows/remote-evals.yml at c7340c6

	name: 'Remote Evaluations (SDK)'

	# Runs Codebuff evaluations exclusively through the public SDK (CodebuffClient)
	# against a containerized backend environment.
	#
	# How to trigger:
	#   - single evaluation: put [remote-eval] in the commit message, or start the
	#     workflow manually (workflow_dispatch)
	#   - parallel matrix:   put [remote-eval-all] in the commit message

	on:
	  # Fires for pushes to every branch; the jobs themselves decide whether any
	  # evaluation work actually runs.
	  push:
	    branches:
	      - '**'

	  # Manual runs: all inputs are optional and fall back to the defaults below.
	  workflow_dispatch:
	    inputs:
	      eval_file:
	        type: string
	        required: false
	        default: 'eval-codebuff.json'
	        description: 'Eval file to run (e.g., eval-codebuff.json)'
	      commit_index:
	        # Kept as a string: it is forwarded verbatim as a script argument.
	        type: string
	        required: false
	        default: '0'
	        description: 'Commit index to evaluate (0-based)'
	      mode:
	        type: choice
	        required: false
	        default: 'bypass'
	        description: 'Auth mode (seed or bypass)'
	        options:
	          - 'bypass'
	          - 'seed'

	jobs:
	  # Single evaluation. The job starts on every push, but every step after the
	  # commit-message check is skipped unless an evaluation was requested.
	  remote-evals:
	    runs-on: ubuntu-latest
	    timeout-minutes: 60

	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v4

	      # Gate for the rest of the job: sets should_run_evals to 'true' when the
	      # commit message contains "[remote-eval]" (case-insensitive) or the run
	      # was started manually.
	      - name: Check commit message
	        id: check_commit
	        env:
	          # Handed over as an environment variable so the message text is
	          # never interpreted as part of the script.
	          COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
	        run: |
	          shopt -s nocasematch
	          # The unquoted * on both sides makes this a substring match; the
	          # quoted middle keeps the square brackets literal.
	          if [[ "$COMMIT_MESSAGE" == *"[remote-eval]"* ]] || [[ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]]; then
	            echo "should_run_evals=true" >> "$GITHUB_OUTPUT"
	            echo "Will run remote evaluations"
	          else
	            echo "should_run_evals=false" >> "$GITHUB_OUTPUT"
	            echo "Skipping remote evaluations (add [remote-eval] to commit message to trigger)"
	          fi

	      - name: Set up Bun
	        if: steps.check_commit.outputs.should_run_evals == 'true'
	        uses: oven-sh/setup-bun@v2
	        with:
	          bun-version: '1.2.12'

	      - name: Install dependencies
	        if: steps.check_commit.outputs.should_run_evals == 'true'
	        run: bun install --frozen-lockfile

	      # Stop early with a readable message if a file the evaluation needs is
	      # missing from the checkout.
	      - name: Validate environment for SDK evaluation
	        if: steps.check_commit.outputs.should_run_evals == 'true'
	        run: |
	          echo "🔍 Validating SDK evaluation environment..."
	          echo " Checking for required files..."
	          test -f evals/scripts/run-remote-parameterized.sh || { echo "❌ Missing run-remote-parameterized.sh"; exit 1; }
	          test -f evals/git-evals/run-single-eval.ts || { echo "❌ Missing run-single-eval.ts"; exit 1; }
	          test -f evals/docker-compose.evals.yml || { echo "❌ Missing docker-compose.evals.yml"; exit 1; }
	          echo " Checking SDK package..."
	          bun --version
	          echo "✅ Environment validation passed"

	      # Context values are read from the runner's default environment
	      # variables rather than being substituted into the script text, so a
	      # branch name can never end up being executed.
	      - name: Run remote evaluation
	        if: steps.check_commit.outputs.should_run_evals == 'true'
	        env:
	          # The "||" fallbacks supply the defaults whenever an input is empty.
	          EVAL_FILE: ${{ inputs.eval_file || 'eval-codebuff.json' }}
	          COMMIT_INDEX: ${{ inputs.commit_index || '0' }}
	          MODE: ${{ inputs.mode || 'bypass' }}
	          CODEBUFF_WEBSOCKET_URL: "ws://127.0.0.1:4242/ws"
	          CODEBUFF_SKIP_BINARY_CHECK: "1"
	        run: |
	          echo "🚀 Remote Evaluation Starting (SDK Mode)"
	          echo "📋 GitHub Actions Environment:"
	          echo " Runner: $RUNNER_OS"
	          echo " SHA: $GITHUB_SHA"
	          echo " Ref: $GITHUB_REF"
	          echo " Event: $GITHUB_EVENT_NAME"
	          echo " Eval File: $EVAL_FILE"
	          echo " Commit Index: $COMMIT_INDEX"
	          echo " Mode: $MODE"
	          echo "🐳 Docker Info:"
	          docker --version
	          docker compose version
	          echo "💾 Disk Space:"
	          df -h
	          echo "🔧 Starting SDK-based evaluation..."
	          bash evals/scripts/run-remote-parameterized.sh "$MODE" "$EVAL_FILE" "$COMMIT_INDEX"

	      # Best-effort diagnostics: every command is allowed to fail so the dump
	      # always runs to the end.
	      - name: Dump logs on failure
	        if: failure() && steps.check_commit.outputs.should_run_evals == 'true'
	        env:
	          # Step-level env is not shared between steps, so the SDK settings
	          # are repeated here for the report below.
	          CODEBUFF_WEBSOCKET_URL: "ws://127.0.0.1:4242/ws"
	          CODEBUFF_SKIP_BINARY_CHECK: "1"
	        run: |
	          echo "❌ SDK Evaluation failed - dumping diagnostic information"
	          echo "🔧 SDK Environment:"
	          echo " CODEBUFF_WEBSOCKET_URL: ${CODEBUFF_WEBSOCKET_URL:-not set}"
	          echo " CODEBUFF_SKIP_BINARY_CHECK: ${CODEBUFF_SKIP_BINARY_CHECK:-not set}"
	          # Report only whether the key exists; its value must never be printed.
	          api_key_state="${CODEBUFF_API_KEY:+[SET]}"
	          echo " CODEBUFF_API_KEY: ${api_key_state:-[NOT SET]}"
	          echo "🐳 Docker containers status:"
	          docker ps -a || true
	          echo "📋 Backend container logs:"
	          docker compose -f evals/docker-compose.evals.yml logs backend --tail=200 || true
	          echo "📋 Database container logs:"
	          docker compose -f evals/docker-compose.evals.yml logs db --tail=100 || true
	          echo "💾 Disk usage:"
	          df -h || true
	          echo "🧠 Memory usage:"
	          free -h || true
	          echo "📁 Evaluation files:"
	          ls -la evals/git-evals/ || true
	          ls -la evals/scripts/ || true

	      # always() keeps the upload running after a failed evaluation.
	      - name: Upload evaluation logs
	        if: always() && steps.check_commit.outputs.should_run_evals == 'true'
	        uses: actions/upload-artifact@v4
	        with:
	          name: remote-eval-logs-${{ github.sha }}
	          path: |
	            evals/test-repos/
	            debug/
	            ~/.cache/bun/
	          retention-days: 7

	      - name: Cleanup containers
	        if: always() && steps.check_commit.outputs.should_run_evals == 'true'
	        run: |
	          echo "🧹 Final cleanup - removing all containers and volumes..."
	          docker compose -f evals/docker-compose.evals.yml down -v || true
	          docker system prune -f || true
	          echo "✅ Cleanup completed"

	  # Optional: Matrix job to run multiple evaluations in parallel.
	  # Only runs when the commit message contains [remote-eval-all].
	  remote-evals-matrix:
	    runs-on: ubuntu-latest
	    timeout-minutes: 90
	    if: contains(github.event.head_commit.message, '[remote-eval-all]')

	    strategy:
	      # Let the remaining combinations finish even if one of them fails.
	      fail-fast: false
	      matrix:
	        eval:
	          - { file: 'eval-codebuff.json', index: '0' }
	          - { file: 'eval-codebuff.json', index: '1' }
	          - { file: 'eval-manifold.json', index: '0' }

	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v4

	      - name: Set up Bun
	        uses: oven-sh/setup-bun@v2
	        with:
	          bun-version: '1.2.12'

	      - name: Install dependencies
	        run: bun install --frozen-lockfile

	      - name: Validate environment for SDK evaluation
	        run: |
	          echo "🔍 Validating SDK evaluation environment for matrix job..."
	          test -f evals/scripts/run-remote-parameterized.sh || { echo "❌ Missing run-remote-parameterized.sh"; exit 1; }
	          test -f evals/git-evals/run-single-eval.ts || { echo "❌ Missing run-single-eval.ts"; exit 1; }
	          test -f evals/docker-compose.evals.yml || { echo "❌ Missing docker-compose.evals.yml"; exit 1; }
	          echo "✅ Matrix environment validation passed"

	      # The matrix job always passes "bypass" as the auth mode.
	      - name: Run evaluation matrix
	        env:
	          EVAL_FILE: ${{ matrix.eval.file }}
	          COMMIT_INDEX: ${{ matrix.eval.index }}
	          CODEBUFF_WEBSOCKET_URL: "ws://127.0.0.1:4242/ws"
	          CODEBUFF_SKIP_BINARY_CHECK: "1"
	        run: |
	          echo "🚀 Running matrix evaluation (SDK Mode)..."
	          bash evals/scripts/run-remote-parameterized.sh "bypass" "$EVAL_FILE" "$COMMIT_INDEX"

	      - name: Dump matrix logs on failure
	        if: failure()
	        env:
	          # Step-level env is not shared between steps; without these the
	          # "Matrix job details" line below would print empty values.
	          EVAL_FILE: ${{ matrix.eval.file }}
	          COMMIT_INDEX: ${{ matrix.eval.index }}
	        run: |
	          echo "❌ Matrix SDK Evaluation failed - dumping diagnostic information"
	          echo "🔧 Matrix job details: File=$EVAL_FILE, Index=$COMMIT_INDEX"
	          echo "🐳 Docker containers status:"
	          docker ps -a || true
	          echo "📋 Container logs:"
	          docker compose -f evals/docker-compose.evals.yml logs --tail=100 || true

	      # The artifact name includes file, index and SHA so each matrix
	      # combination uploads under its own name.
	      - name: Upload matrix evaluation results
	        if: always()
	        uses: actions/upload-artifact@v4
	        with:
	          name: remote-eval-matrix-${{ matrix.eval.file }}-${{ matrix.eval.index }}-${{ github.sha }}
	          path: |
	            evals/test-repos/
	            debug/
	          retention-days: 7

	      - name: Cleanup containers
	        if: always()
	        run: |
	          docker compose -f evals/docker-compose.evals.yml down -v || true
	          docker system prune -f || true

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

fix: correct class name reference in SDK evaluation script [remote-eval] #9

Workflow file

fix: correct class name reference in SDK evaluation script [remote-eval] #9

Uh oh!

Workflow file for this run