1- name : Remote Evaluations
1+ name : Remote Evaluations (SDK)
2+
3+ # This workflow runs Codebuff evaluations using the public SDK exclusively.
4+ # It creates a containerized backend environment and runs evaluations via CodebuffClient.
5+ # Trigger: Add [remote-eval] to commit message or use workflow_dispatch
6+ # Matrix mode: Add [remote-eval-all] to commit message for parallel evaluations
27
38on :
49 push :
@@ -57,14 +62,28 @@ jobs:
5762 if : steps.check_commit.outputs.should_run_evals == 'true'
5863 run : bun install --frozen-lockfile
5964
# Pre-flight check for the single-evaluation job: stop before the evaluation
# step runs if a file it depends on is missing from the checkout.
- name: Validate environment for SDK evaluation
  if: steps.check_commit.outputs.should_run_evals == 'true'
  run: |
    echo "π Validating SDK evaluation environment..."
    echo " Checking for required files..."
    # One loop instead of three copy-pasted checks; the message prints only
    # the file's base name, matching the original wording.
    for required in \
      evals/scripts/run-remote-parameterized.sh \
      evals/git-evals/run-single-eval.ts \
      evals/docker-compose.evals.yml
    do
      test -f "$required" || { echo "❌ Missing ${required##*/}"; exit 1; }
    done
    echo " Checking SDK package..."
    # Prints the Bun version as a sanity check that the runtime is on PATH.
    bun --version
    echo "✅ Environment validation passed"
76+
# Runs one evaluation through evals/scripts/run-remote-parameterized.sh.
# This span is the step header, its environment, and the opening banner of the
# script; the remainder of the script follows below.
- name: Run remote evaluation
  if: steps.check_commit.outputs.should_run_evals == 'true'
  env:
    # Each value falls back to a default when the corresponding
    # workflow_dispatch input is empty.
    EVAL_FILE: ${{ inputs.eval_file || 'eval-codebuff.json' }}
    COMMIT_INDEX: ${{ inputs.commit_index || '0' }}
    MODE: ${{ inputs.mode || 'bypass' }}
    # No padding inside the quotes: a leading space would become part of the
    # URL / flag value that the evaluation process reads.
    CODEBUFF_WEBSOCKET_URL: 'ws://127.0.0.1:4242/ws'
    CODEBUFF_SKIP_BINARY_CHECK: '1'
  run: |
    echo "π Remote Evaluation Starting (SDK Mode) "
    echo "π GitHub Actions Environment:"
    echo " Runner: ${{ runner.os }}"
    echo " SHA: ${{ github.sha }}"
@@ -78,13 +97,17 @@ jobs:
7897 docker compose version
7998 echo "πΎ Disk Space:"
8099 df -h
81- echo "π§ Starting evaluation..."
100+ echo "π§ Starting SDK-based evaluation..."
82101 bash evals/scripts/run-remote-parameterized.sh "$MODE" "$EVAL_FILE" "$COMMIT_INDEX"
83102
# Diagnostic dump for the single-evaluation job; runs only when an earlier
# step failed. This span is the step header and the first part of its script;
# the remaining diagnostics follow below.
- name: Dump logs on failure
  if: failure() && steps.check_commit.outputs.should_run_evals == 'true'
  env:
    # Step-level `env` is not inherited from "Run remote evaluation", so the
    # SDK settings are redeclared here; without this the echoes below would
    # always report "not set". Keep in sync with that step.
    CODEBUFF_WEBSOCKET_URL: 'ws://127.0.0.1:4242/ws'
    CODEBUFF_SKIP_BINARY_CHECK: '1'
  run: |
    echo "❌ SDK Evaluation failed - dumping diagnostic information"
    echo "π§ SDK Environment:"
    echo " CODEBUFF_WEBSOCKET_URL: ${CODEBUFF_WEBSOCKET_URL:-not set}"
    echo " CODEBUFF_SKIP_BINARY_CHECK: ${CODEBUFF_SKIP_BINARY_CHECK:-not set}"
    # Report only whether the key is present. The previous expansion
    # (`${VAR:+[SET]}${VAR:-[NOT SET]}`) appended the key itself whenever it
    # was set.
    # NOTE(review): CODEBUFF_API_KEY is not declared in this step; confirm it
    # is provided at job or workflow level, otherwise this reports [NOT SET].
    if [ -n "${CODEBUFF_API_KEY:-}" ]; then
      echo " CODEBUFF_API_KEY: [SET]"
    else
      echo " CODEBUFF_API_KEY: [NOT SET]"
    fi
    echo "π³ Docker containers status:"
    # `|| true` keeps the dump going even if a diagnostic command fails.
    docker ps -a || true
    echo "π Backend container logs:"
95118 df -h || true
96119 echo "π§ Memory usage:"
97120 free -h || true
121+ echo "π Evaluation files:"
122+ ls -la evals/git-evals/ || true
123+ ls -la evals/scripts/ || true
98124
99125 - name : Upload evaluation logs
100126 if : always() && steps.check_commit.outputs.should_run_evals == 'true'
@@ -119,7 +145,7 @@ jobs:
119145 remote-evals-matrix :
120146 runs-on : ubuntu-latest
121147 timeout-minutes : 90
122- if : contains(github.event.head_commit.message, '[remote-eval-all]') || (github.event_name == 'workflow_dispatch' && inputs.mode == 'matrix')
148+ if : contains(github.event.head_commit.message, '[remote-eval-all]')
123149
124150 strategy :
125151 fail-fast : false
@@ -141,14 +167,34 @@ jobs:
141167 - name : Install dependencies
142168 run : bun install --frozen-lockfile
143169
# Pre-flight check for each matrix job: stop before the evaluation step runs
# if a file it depends on is missing from the checkout.
- name: Validate environment for SDK evaluation
  run: |
    echo "π Validating SDK evaluation environment for matrix job..."
    # One loop instead of three copy-pasted checks; the message prints only
    # the file's base name, matching the original wording.
    for required in \
      evals/scripts/run-remote-parameterized.sh \
      evals/git-evals/run-single-eval.ts \
      evals/docker-compose.evals.yml
    do
      test -f "$required" || { echo "❌ Missing ${required##*/}"; exit 1; }
    done
    echo "✅ Matrix environment validation passed"
177+
# Runs one matrix entry through evals/scripts/run-remote-parameterized.sh,
# always in "bypass" mode.
- name: Run evaluation matrix
  env:
    # Eval file and commit index come from the current matrix entry.
    EVAL_FILE: ${{ matrix.eval.file }}
    COMMIT_INDEX: ${{ matrix.eval.index }}
    # No padding inside the quotes: a leading space would become part of the
    # URL / flag value that the evaluation process reads.
    CODEBUFF_WEBSOCKET_URL: 'ws://127.0.0.1:4242/ws'
    CODEBUFF_SKIP_BINARY_CHECK: '1'
  run: |
    echo "π Running matrix evaluation (SDK Mode) ..."
    bash evals/scripts/run-remote-parameterized.sh "bypass" "$EVAL_FILE" "$COMMIT_INDEX"
151187
# Diagnostic dump for a matrix job; runs only when an earlier step failed.
- name: Dump matrix logs on failure
  if: failure()
  env:
    # Step-level `env` is not inherited from "Run evaluation matrix", so the
    # matrix coordinates are re-exported here; without this the "Matrix job
    # details" line would print empty values.
    EVAL_FILE: ${{ matrix.eval.file }}
    COMMIT_INDEX: ${{ matrix.eval.index }}
  run: |
    echo "❌ Matrix SDK Evaluation failed - dumping diagnostic information"
    echo "π§ Matrix job details: File=$EVAL_FILE, Index=$COMMIT_INDEX"
    echo "π³ Docker containers status:"
    # `|| true` keeps the dump going even if a diagnostic command fails.
    docker ps -a || true
    echo "π Container logs:"
    docker compose -f evals/docker-compose.evals.yml logs --tail=100 || true
197+
152198 - name : Upload matrix evaluation results
153199 if : always()
154200 uses : actions/upload-artifact@v4
0 commit comments