From 0a285418d78dc13ea946c41fe52110844ccf75b2 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Sun, 21 Jun 2026 19:40:53 +0800 Subject: [PATCH 1/3] fix: verify all requested eval concurrencies Keep the workflow's requested concurrency list independent from eval-produced metadata, fail when any requested result is missing, and preserve space-separated EVAL_CONC through the AMD Docker launch boundary. (cherry picked from commit 5e0f3101bc6856b7e7218fcb184216c0693e26c2) --- .../workflows/benchmark-multinode-tmpl.yml | 7 +- benchmarks/multi_node/amd_utils/job.slurm | 2 +- utils/evals/test_batched_eval.py | 77 +++++++++++++++++-- utils/evals/validate_scores.py | 53 +++++++++++-- 4 files changed, 122 insertions(+), 17 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index e58cff478..3beb246cc 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -313,7 +313,12 @@ jobs: - name: Verify eval scores if: ${{ (success() || failure()) && inputs.eval-only }} - run: python3 utils/evals/validate_scores.py + run: | + expected_concs="${EVAL_CONC}" + if [[ -z "${expected_concs}" ]]; then + expected_concs="$(printf '%s\n' "${CONC_LIST}" | tr ' ' '\n' | sort -n | tail -1)" + fi + python3 utils/evals/validate_scores.py --expected-concs "${expected_concs}" - name: Cleanup eval outputs (post-upload) if: ${{ always() && (inputs.run-eval || inputs.eval-only) }} diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 17f5b4f54..01a5bd386 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -370,7 +370,7 @@ DOCKER_ENV_COMMON=( -e WS_PATH=${WS_PATH} -e RUN_EVAL=\$RUN_EVAL -e EVAL_ONLY=\$EVAL_ONLY - -e EVAL_CONC + -e \"EVAL_CONC=\$EVAL_CONC\" -e FRAMEWORK=\$FRAMEWORK -e PRECISION=\$PRECISION -e MODEL_PREFIX=\$MODEL_PREFIX diff --git a/utils/evals/test_batched_eval.py b/utils/evals/test_batched_eval.py index f1ebb6b64..33edf1c2a 100644 --- a/utils/evals/test_batched_eval.py +++ b/utils/evals/test_batched_eval.py @@ -131,7 +131,7 @@ def test_batched_eval_requires_a_valid_manifest(tmp_path: Path) -> None: assert any("unavailable or invalid" in error for error in errors) -def test_validate_scores_warns_when_batch_status_metadata_is_unreadable( +def test_validate_scores_fails_when_expected_batch_metadata_is_unreadable( tmp_path: Path, monkeypatch, capsys, @@ -157,18 +157,70 @@ def test_validate_scores_warns_when_batch_status_metadata_is_unreadable( str(meta_path), "--results-glob", str(result_path), + "--expected-concs", + "1 4 8", ], ) - assert validate_scores_main() == 0 + assert validate_scores_main() == 1 captured = capsys.readouterr() - assert ( - "WARN: could not inspect eval metadata for batched concurrency status" - in captured.err + assert "unavailable or invalid" in captured.err + + +def test_workflow_concurrencies_are_independent_of_eval_metadata( + tmp_path: Path, +) -> None: + meta_path = tmp_path / "meta_env.json" + meta_path.write_text(json.dumps({ + "eval_concs": [8], + "completed_eval_concs": [8], + "failed_eval_concs": [], + })) + result_path = tmp_path / "results_test_conc8.json" + result_path.write_text('{"results": {}}') + + errors = validate_batch_manifest( + str(meta_path), + [str(result_path)], + expected_concs=[1, 4, 8], ) + assert "batched eval metadata does not match workflow concurrencies" in errors + assert any("missing completed concurrency: 1, 4" in error for error in errors) + assert any("missing result files for concurrency: 1, 4" in error for error in errors) + + +def test_validate_scores_checks_threshold_for_every_concurrency( + tmp_path: Path, + monkeypatch, +) -> None: + (tmp_path / "meta_env.json").write_text(json.dumps({ + "eval_concs": [1, 4], + "completed_eval_concs": [1, 4], + "failed_eval_concs": [], + })) + for conc, score in ((1, 0.9), (4, 0.8)): + (tmp_path / f"results_test_conc{conc}.json").write_text(json.dumps({ + "results": { + "gsm8k": { + "exact_match,strict-match": score, + }, + }, + })) + monkeypatch.setattr(sys, "argv", [ + "validate_scores.py", + "--meta-env", + str(tmp_path / "meta_env.json"), + "--results-glob", + str(tmp_path / "results*.json"), + "--expected-concs", + "1 4", + ]) + + assert validate_scores_main() == 1 + -def test_amd_multinode_container_inherits_eval_concurrency_list() -> None: +def test_amd_multinode_container_forwards_eval_concurrency_list() -> None: job_slurm = ( Path(__file__).resolve().parents[2] / "benchmarks" @@ -178,5 +230,14 @@ def test_amd_multinode_container_inherits_eval_concurrency_list() -> None: ) contents = job_slurm.read_text() - assert "-e EVAL_CONC\n" in contents - assert r"-e EVAL_CONC=\$EVAL_CONC" not in contents + assert r'-e \"EVAL_CONC=\$EVAL_CONC\"' in contents + assert "-e EVAL_CONC\n" not in contents + + workflow = ( + Path(__file__).resolve().parents[2] + / ".github" + / "workflows" + / "benchmark-multinode-tmpl.yml" + ).read_text() + assert 'expected_concs="${EVAL_CONC}"' in workflow + assert 'validate_scores.py --expected-concs "${expected_concs}"' in workflow diff --git a/utils/evals/validate_scores.py b/utils/evals/validate_scores.py index f74ed267f..f78067940 100644 --- a/utils/evals/validate_scores.py +++ b/utils/evals/validate_scores.py @@ -91,13 +91,14 @@ def resolve_threshold(config: dict, prefix: str | None, task: str, fallback: flo def validate_batch_manifest( meta_env_path: str, result_files: list[str], + expected_concs: list[int] | None = None, ) -> list[str]: """Validate that a batched eval produced every requested concurrency.""" try: with open(meta_env_path) as f: meta = json.load(f) except (json.JSONDecodeError, OSError) as exc: - if any( + if expected_concs is not None or any( CONC_SUFFIX_RE.search(Path(result_file).name) for result_file in result_files ): @@ -107,29 +108,44 @@ def validate_batch_manifest( ] return [] + if expected_concs is not None and "eval_concs" not in meta: + if len(expected_concs) > 1: + return ["workflow requested multiple concurrencies but batched eval metadata is missing"] + errors = [] + if meta.get("conc") != expected_concs[0]: + errors.append("eval metadata concurrency does not match workflow request") + if len(result_files) != 1: + errors.append("eval must produce exactly one result file") + return errors if "eval_concs" not in meta: return [] - expected = meta.get("eval_concs") + metadata_expected = meta.get("eval_concs") completed = meta.get("completed_eval_concs") failed = meta.get("failed_eval_concs") - if not all(isinstance(values, list) for values in (expected, completed, failed)): + if not all( + isinstance(values, list) + for values in (metadata_expected, completed, failed) + ): return ["batched eval metadata must contain list-valued concurrency fields"] if not all( isinstance(value, int) and value > 0 - for values in (expected, completed, failed) + for values in (metadata_expected, completed, failed) for value in values ): return ["batched eval metadata contains an invalid concurrency"] errors = [] - expected_set = set(expected) + metadata_expected_set = set(metadata_expected) + expected_set = set(expected_concs or metadata_expected) completed_set = set(completed) failed_set = set(failed) - if len(expected_set) != len(expected): + if len(metadata_expected_set) != len(metadata_expected): errors.append("batched eval metadata contains duplicate expected concurrencies") if len(completed_set) != len(completed): errors.append("batched eval metadata contains duplicate completed concurrencies") + if expected_concs is not None and metadata_expected_set != expected_set: + errors.append("batched eval metadata does not match workflow concurrencies") if failed_set: errors.append( "batched eval failed for concurrency: " @@ -200,8 +216,27 @@ def main() -> int: "--results-glob", default="results*.json", help="Glob pattern for result files (default: 'results*.json')", ) + parser.add_argument( + "--expected-concs", + default=None, + help="Space-separated concurrencies requested by the workflow", + ) args = parser.parse_args() + expected_concs = None + if args.expected_concs is not None: + try: + expected_concs = [int(value) for value in args.expected_concs.split()] + except ValueError: + expected_concs = [] + if ( + not expected_concs + or any(value <= 0 for value in expected_concs) + or len(set(expected_concs)) != len(expected_concs) + ): + print("FAIL: expected concurrencies must be unique positive integers", file=sys.stderr) + return 1 + # Load thresholds config config = {"default": {}, "models": {}} thresholds_path = args.thresholds @@ -229,7 +264,11 @@ def main() -> int: checked = 0 result_files = sorted(glob.glob(args.results_glob)) - manifest_errors = validate_batch_manifest(args.meta_env, result_files) + manifest_errors = validate_batch_manifest( + args.meta_env, + result_files, + expected_concs, + ) for error in manifest_errors: print(f"FAIL: {error}", file=sys.stderr) failed = True From afdd9a4d7d257f6900e1aee5fb82e9df2f6dc7b4 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Mon, 22 Jun 2026 09:18:05 +0800 Subject: [PATCH 2/3] fix: label eval scores by concurrency, restrict all-evals to 8k1k, quiet sglang logs - validate_scores.py: prefix each PASS/FAIL with [conc=N] taken from the result filename so a failing concurrency is identifiable, and line-buffer stdout/stderr so the "Loaded thresholds" header prints in emission order instead of after the stderr FAIL lines when CI merges the streams. - generate_sweep_configs.py: mark_all_eval_entries now only expands evals for 8k1k entries (matching mark_eval_entries); 1k1k entries pass through unmarked and are dropped by the evals-only filter, so --all-evals no longer schedules 1k1k evals. - server_sglang.sh / env.sh: launch sglang prefill+decode with --log-level ${SGLANG_SERVER_LOG_LEVEL:-warning} and default MORI_APP_LOG_LEVEL to WARNING to cut multinode log spam (both overridable; server readiness is detected via health endpoints, not log scraping). --- benchmarks/multi_node/amd_utils/env.sh | 4 +- .../multi_node/amd_utils/server_sglang.sh | 3 + utils/evals/test_batched_eval.py | 7 ++ utils/evals/validate_scores.py | 19 ++++- utils/matrix_logic/generate_sweep_configs.py | 16 ++++- .../test_generate_sweep_configs.py | 71 +++++++++++++++---- 6 files changed, 103 insertions(+), 17 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh index 71d2653bd..a24347114 100755 --- a/benchmarks/multi_node/amd_utils/env.sh +++ b/benchmarks/multi_node/amd_utils/env.sh @@ -170,7 +170,9 @@ else export MORI_EP_LAUNCH_CONFIG_MODE=AUTO - export MORI_APP_LOG_LEVEL=INFO + # Default to WARNING to cut per-op MoRI log spam on long multinode/eval + # runs; override with MORI_APP_LOG_LEVEL=INFO when debugging. + export MORI_APP_LOG_LEVEL="${MORI_APP_LOG_LEVEL:-WARNING}" # Router logging control: # 0 (default) keeps noisy per-request access logs out of stdout while still logging to file. diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh index c28ccab41..34351b1e4 100755 --- a/benchmarks/multi_node/amd_utils/server_sglang.sh +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -425,6 +425,7 @@ if [ "$NODE_RANK" -eq 0 ]; then --host 0.0.0.0 \ --port 8000 \ --trust-remote-code \ + --log-level ${SGLANG_SERVER_LOG_LEVEL:-warning} \ ${PREFILL_SERVER_CONFIG} " if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then @@ -657,6 +658,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then --host 0.0.0.0 \ --port 8000 \ --trust-remote-code \ + --log-level ${SGLANG_SERVER_LOG_LEVEL:-warning} \ ${PREFILL_SERVER_CONFIG} " if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then @@ -725,6 +727,7 @@ else --host 0.0.0.0 \ --port 8000 \ --trust-remote-code \ + --log-level ${SGLANG_SERVER_LOG_LEVEL:-warning} \ ${DECODE_SERVER_CONFIG} " if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then diff --git a/utils/evals/test_batched_eval.py b/utils/evals/test_batched_eval.py index 33edf1c2a..a5d6df008 100644 --- a/utils/evals/test_batched_eval.py +++ b/utils/evals/test_batched_eval.py @@ -193,6 +193,7 @@ def test_workflow_concurrencies_are_independent_of_eval_metadata( def test_validate_scores_checks_threshold_for_every_concurrency( tmp_path: Path, monkeypatch, + capsys, ) -> None: (tmp_path / "meta_env.json").write_text(json.dumps({ "eval_concs": [1, 4], @@ -219,6 +220,12 @@ def test_validate_scores_checks_threshold_for_every_concurrency( assert validate_scores_main() == 1 + # Each score line is attributed to the concurrency that produced it, so a + # failing concurrency is identifiable from the log (conc 4 here). + captured = capsys.readouterr() + assert "PASS: [conc=1] gsm8k exact_match,strict-match" in captured.out + assert "FAIL: [conc=4] gsm8k exact_match,strict-match" in captured.err + def test_amd_multinode_container_forwards_eval_concurrency_list() -> None: job_slurm = ( diff --git a/utils/evals/validate_scores.py b/utils/evals/validate_scores.py index f78067940..0678d5670 100644 --- a/utils/evals/validate_scores.py +++ b/utils/evals/validate_scores.py @@ -191,6 +191,17 @@ def validate_batch_manifest( def main() -> int: + # CI merges this script's stdout and stderr into a single log. When stdout + # is a pipe it is block-buffered by default and only flushes at exit, which + # pushes the informational header (e.g. "Loaded thresholds...") below the + # unbuffered stderr FAIL lines. Force line buffering on both streams so + # every line reaches the log in emission order. + for _stream in (sys.stdout, sys.stderr): + try: + _stream.reconfigure(line_buffering=True) + except (AttributeError, ValueError): + pass + parser = argparse.ArgumentParser(description="Validate eval scores") parser.add_argument( "--min-score", type=float, default=0.85, @@ -285,6 +296,8 @@ def main() -> int: ) for f in result_files: + match = CONC_SUFFIX_RE.search(Path(f).name) + conc_label = f"[conc={match.group(1)}] " if match else "" with open(f) as fh: data = json.load(fh) for task, metrics in data.get("results", {}).items(): @@ -297,12 +310,14 @@ def main() -> int: checked += 1 if val < min_score: print( - f"FAIL: {task} {name} = {val:.4f} (< {min_score} from {source})", + f"FAIL: {conc_label}{task} {name} = {val:.4f} (< {min_score} from {source})", file=sys.stderr, ) failed = True else: - print(f"PASS: {task} {name} = {val:.4f} (>= {min_score} from {source})") + print( + f"PASS: {conc_label}{task} {name} = {val:.4f} (>= {min_score} from {source})" + ) if checked == 0: print("WARN: no metrics matched prefix '{}'".format(args.metric_prefix), file=sys.stderr) diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index 51d69aed9..db3a268d4 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -149,8 +149,11 @@ def _eligible_eval_concs(entry): def mark_all_eval_entries(matrix_values: list[dict]) -> list[dict]: - """Expand eval selection to every fixed-sequence entry. + """Expand eval selection to every 8k1k fixed-sequence entry. + Evals only run at 8k1k (matching mark_eval_entries), so entries at other + sequence lengths (e.g. 1k1k) are passed through untouched rather than + expanded into eval rows. Agentic entries are left untouched because they do not support lm-eval. Multi-node rows with the same engine topology are merged into one eval row whose full concurrency list is run sequentially against the same engine. @@ -158,11 +161,22 @@ def mark_all_eval_entries(matrix_values: list[dict]) -> list[dict]: expanded_entries: list[dict] = [] multinode_indices: dict[tuple, int] = {} + target_isl, target_osl = seq_len_stoi["8k1k"] + for entry in matrix_values: if entry.get(Fields.SCENARIO_TYPE.value) == 'agentic-coding': expanded_entries.append(entry) continue + # Only 8k1k is eligible for evals; leave other sequence lengths as-is + # (their RUN_EVAL stays False, so the evals-only filter drops them). + if ( + entry.get(Fields.ISL.value) != target_isl + or entry.get(Fields.OSL.value) != target_osl + ): + expanded_entries.append(entry) + continue + if Fields.PREFILL.value in entry: conc = entry[Fields.CONC.value] conc_values = conc if isinstance(conc, list) else [conc] diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py index 6179cef45..f082eada7 100644 --- a/utils/matrix_logic/test_generate_sweep_configs.py +++ b/utils/matrix_logic/test_generate_sweep_configs.py @@ -473,14 +473,14 @@ def test_never_marks_all_entries(self): class TestMarkAllEvalEntries: """Tests for the all-evals selection policy.""" - def test_marks_all_fixed_sequence_entries_without_policy_filters(self): + def test_marks_only_8k1k_entries_and_passes_other_seq_lens_through(self): entries = [ - { + { # 1k1k is not eligible for evals -> left unmarked 'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8', 'isl': 1024, 'osl': 1024, 'tp': 2, 'conc': 1, 'spec-decoding': 'none', 'dp-attn': False, 'run-eval': False, }, - { + { # 8k1k is eligible -> marked for eval 'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8', 'isl': 8192, 'osl': 1024, 'tp': 2, 'conc': 8, 'spec-decoding': 'none', 'dp-attn': False, 'run-eval': False, @@ -489,13 +489,15 @@ def test_marks_all_fixed_sequence_entries_without_policy_filters(self): result = mark_all_eval_entries(entries) - assert all(entry['run-eval'] for entry in result) + by_isl = {entry['isl']: entry for entry in result} + assert by_isl[1024]['run-eval'] is False + assert by_isl[8192]['run-eval'] is True def test_batches_every_multinode_concurrency_per_engine_topology(self): entries = [ { 'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8', - 'isl': 1024, 'osl': 1024, 'spec-decoding': 'none', + 'isl': 8192, 'osl': 1024, 'spec-decoding': 'none', 'prefill': {'dp-attn': False}, 'decode': {'dp-attn': False}, 'conc': [1, 4, 8, 16], @@ -545,7 +547,7 @@ def test_deduplicates_overlapping_concurrency_rows_for_same_parallelism(self): entries = [ { 'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8', - 'isl': 1024, 'osl': 1024, 'spec-decoding': 'none', + 'isl': 8192, 'osl': 1024, 'spec-decoding': 'none', 'prefill': {'dp-attn': False}, 'decode': {'dp-attn': False}, 'conc': [4, 8, 16], @@ -554,7 +556,7 @@ def test_deduplicates_overlapping_concurrency_rows_for_same_parallelism(self): }, { 'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8', - 'isl': 1024, 'osl': 1024, 'spec-decoding': 'none', + 'isl': 8192, 'osl': 1024, 'spec-decoding': 'none', 'prefill': {'dp-attn': False}, 'decode': {'dp-attn': False}, 'conc': [16, 32], @@ -570,6 +572,40 @@ def test_deduplicates_overlapping_concurrency_rows_for_same_parallelism(self): assert result[0]['eval-all-concs'] is True assert 'eval-conc' not in result[0] + def test_excludes_1k1k_multinode_entries_from_expansion(self): + entries = [ + { # 1k1k multinode: left untouched, never batched or eval-marked + 'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8', + 'isl': 1024, 'osl': 1024, 'spec-decoding': 'none', + 'prefill': {'dp-attn': False}, + 'decode': {'dp-attn': False}, + 'conc': [4, 8, 16], + 'run-eval': False, + }, + { # 8k1k multinode: expanded into a batched eval row + 'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8', + 'isl': 8192, 'osl': 1024, 'spec-decoding': 'none', + 'prefill': {'dp-attn': False}, + 'decode': {'dp-attn': False}, + 'conc': [8, 32], + 'run-eval': False, + }, + ] + + result = mark_all_eval_entries(entries) + + assert len(result) == 2 + one_k = next(e for e in result if e['isl'] == 1024) + eight_k = next(e for e in result if e['isl'] == 8192) + # 1k1k untouched: not eval-marked, not batched, concurrency unchanged + assert one_k['run-eval'] is False + assert 'eval-all-concs' not in one_k + assert one_k['conc'] == [4, 8, 16] + # 8k1k expanded into a batched eval row + assert eight_k['run-eval'] is True + assert eight_k['eval-all-concs'] is True + assert eight_k['conc'] == [8, 32] + def test_skips_agentic_entries(self): entries = [ { @@ -1811,7 +1847,8 @@ def test_all_evals_cli_marks_every_fixed_sequence_entry( sample_single_node_config, sample_runner_config, ): - """--all-evals should bypass the default 8k1k/min-conc policy.""" + """--all-evals bypasses the default min-conc/highest-median policy but + still only evaluates 8k1k (1k1k entries are excluded).""" import sys import generate_sweep_configs @@ -1835,9 +1872,10 @@ def test_all_evals_cli_marks_every_fixed_sequence_entry( result = generate_sweep_configs.main() - assert len(result) == 10 + # Every 8k1k concurrency is marked (5 conc values), and the 1k1k + # entries are dropped rather than evaluated. + assert len(result) == 5 assert {(entry['isl'], entry['osl']) for entry in result} == { - (1024, 1024), (8192, 1024), } assert min(entry['conc'] for entry in result) == 4 @@ -1874,7 +1912,10 @@ def test_all_evals_composes_with_evals_only( result = generate_sweep_configs.main() - assert len(result) == 10 + assert len(result) == 5 + assert {(entry['isl'], entry['osl']) for entry in result} == { + (8192, 1024), + } assert all(entry['run-eval'] is True for entry in result) assert all(entry['eval-only'] is True for entry in result) @@ -1888,10 +1929,14 @@ def test_all_evals_batches_each_multinode_concurrency( import generate_sweep_configs config = sample_multinode_config - search_space = ( + seq_entry = ( config['dsr1-fp4-gb200-dynamo-trt']['scenarios'] - ['fixed-seq-len'][0]['search-space'] + ['fixed-seq-len'][0] ) + # all-evals only evaluates 8k1k, so target that sequence length. + seq_entry['isl'] = 8192 + seq_entry['osl'] = 1024 + search_space = seq_entry['search-space'] search_space[0]['conc-list'] = [4, 16, 64] monkeypatch.setattr( From 35dc57c85f30eee93936245b813595803153ae6f Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Mon, 22 Jun 2026 09:32:41 +0800 Subject: [PATCH 3/3] docs: explain best-effort stream reconfigure in validate_scores Address github-code-quality review: the except (AttributeError, ValueError) guard around stream reconfigure was an undocumented empty swallow. Add a comment noting it is best-effort (wrapped streams such as pytest's capture object do not support reconfigure). No functional change. --- utils/evals/validate_scores.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/evals/validate_scores.py b/utils/evals/validate_scores.py index 0678d5670..1fff4178e 100644 --- a/utils/evals/validate_scores.py +++ b/utils/evals/validate_scores.py @@ -200,6 +200,8 @@ def main() -> int: try: _stream.reconfigure(line_buffering=True) except (AttributeError, ValueError): + # Best-effort only: some wrapped streams (e.g. pytest's capture + # object) don't support reconfigure; leave their buffering as-is. pass parser = argparse.ArgumentParser(description="Validate eval scores")