Skip to content

Commit b09d939

Browse files
m-messer and claude committed
Always include output in io_test and unit_test feedback
In io_test, hidden tests now show the actual output (plus the expected output on failure, or the error details on a runtime error) — only the input block remains suppressed. In unit_test, stdout from the code run is shown as an "output" feedback item when it is non-empty.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 9a69ade commit b09d939

2 files changed

Lines changed: 23 additions & 29 deletions

File tree

evaluation_function/evaluation.py

Lines changed: 21 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -218,37 +218,28 @@ def _evaluate_io(response: str, tests: list, result: Result, answer: str = "") -
218218
result.add_feedback(tag, f"{label}: timed out after {_TIMEOUT}s.")
219219
elif stderr and not stdout:
220220
tag = "hidden_fail" if hidden else "fail"
221-
if hidden:
222-
result.add_feedback(tag, f"{label}: runtime error.")
223-
else:
224-
parts = [f"{label}: runtime error."]
225-
if input_block:
226-
parts.append(input_block)
227-
parts.append(_code_block("Error", stderr.strip()))
228-
result.add_feedback(tag, "\n\n".join(parts))
221+
parts = [f"{label}: runtime error."]
222+
if not hidden and input_block:
223+
parts.append(input_block)
224+
parts.append(_code_block("Error", stderr.strip()))
225+
result.add_feedback(tag, "\n\n".join(parts))
229226
elif actual == expected:
230227
passed += 1
231-
if hidden:
232-
result.add_feedback("pass", f"{label}: passed.")
233-
else:
234-
parts = [f"{label}: passed."]
235-
if input_block:
236-
parts.append(input_block)
237-
parts.append(_code_block("Output", actual or "(no output)"))
238-
parts.extend(_upload_plots(images))
239-
result.add_feedback("pass", "\n\n".join(parts))
228+
parts = [f"{label}: passed."]
229+
if not hidden and input_block:
230+
parts.append(input_block)
231+
parts.append(_code_block("Output", actual or "(no output)"))
232+
parts.extend(_upload_plots(images))
233+
result.add_feedback("pass", "\n\n".join(parts))
240234
else:
241235
tag = "hidden_fail" if hidden else "fail"
242-
if hidden:
243-
result.add_feedback(tag, f"{label}: failed.")
244-
else:
245-
parts = [f"{label}: failed."]
246-
if input_block:
247-
parts.append(input_block)
248-
parts.append(_code_block("Your output", actual or "(no output)"))
249-
parts.append(_code_block("Expected", expected))
250-
parts.extend(_upload_plots(images))
251-
result.add_feedback(tag, "\n\n".join(parts))
236+
parts = [f"{label}: failed."]
237+
if not hidden and input_block:
238+
parts.append(input_block)
239+
parts.append(_code_block("Your output", actual or "(no output)"))
240+
parts.append(_code_block("Expected", expected))
241+
parts.extend(_upload_plots(images))
242+
result.add_feedback(tag, "\n\n".join(parts))
252243

253244
result.is_correct = passed == len(tests)
254245
result.add_feedback("summary", f"{passed}/{len(tests)} tests passed.")
@@ -282,6 +273,9 @@ def _evaluate_unit(response: str, test_code: str, result: Result) -> Result:
282273
if test_results is None:
283274
return result
284275

276+
if stdout.strip():
277+
result.add_feedback("output", _code_block("Output", stdout.rstrip()))
278+
285279
passed = 0
286280
for r in test_results:
287281
name, status = r["name"], r["status"]

evaluation_function/evaluation_test.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@ def test_hidden_test_fail(self):
4646

4747
self.assertFalse(result["is_correct"])
4848
self.assertIn("Hidden test 1", result["feedback"])
49-
self.assertNotIn("999", result["feedback"])
50-
self.assertNotIn("5", result["feedback"])
49+
self.assertIn("999", result["feedback"])
50+
self.assertNotIn("Input", result["feedback"])
5151

5252
def test_runtime_error(self):
5353
params = _params(_test("5\n", "25\n"))

0 commit comments

Comments
 (0)