From 0a285418d78dc13ea946c41fe52110844ccf75b2 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sun, 21 Jun 2026 19:40:53 +0800
Subject: [PATCH 1/3] fix: verify all requested eval concurrencies

Keep the workflow's requested concurrency list independent from eval-produced metadata, fail when any requested result is missing, and preserve space-separated EVAL_CONC through the AMD Docker launch boundary.

(cherry picked from commit 5e0f3101bc6856b7e7218fcb184216c0693e26c2)
---
 .../workflows/benchmark-multinode-tmpl.yml    |  7 +-
 benchmarks/multi_node/amd_utils/job.slurm     |  2 +-
 utils/evals/test_batched_eval.py              | 77 +++++++++++++++++--
 utils/evals/validate_scores.py                | 53 +++++++++++--
 4 files changed, 122 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
index e58cff478..3beb246cc 100644
--- a/.github/workflows/benchmark-multinode-tmpl.yml
+++ b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -313,7 +313,12 @@ jobs:
 
       - name: Verify eval scores
         if: ${{ (success() || failure()) && inputs.eval-only }}
-        run: python3 utils/evals/validate_scores.py
+        run: |
+          expected_concs="${EVAL_CONC}"
+          if [[ -z "${expected_concs}" ]]; then
+            expected_concs="$(printf '%s\n' "${CONC_LIST}" | tr ' ' '\n' | sort -n | tail -1)"
+          fi
+          python3 utils/evals/validate_scores.py --expected-concs "${expected_concs}"
 
       - name: Cleanup eval outputs (post-upload)
         if: ${{ always() && (inputs.run-eval || inputs.eval-only) }}
diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index 17f5b4f54..01a5bd386 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -370,7 +370,7 @@ DOCKER_ENV_COMMON=(
     -e WS_PATH=${WS_PATH}
     -e RUN_EVAL=\$RUN_EVAL
     -e EVAL_ONLY=\$EVAL_ONLY
-    -e EVAL_CONC
+    -e \"EVAL_CONC=\$EVAL_CONC\"
     -e FRAMEWORK=\$FRAMEWORK
     -e PRECISION=\$PRECISION
     -e MODEL_PREFIX=\$MODEL_PREFIX
diff --git a/utils/evals/test_batched_eval.py b/utils/evals/test_batched_eval.py
index f1ebb6b64..33edf1c2a 100644
--- a/utils/evals/test_batched_eval.py
+++ b/utils/evals/test_batched_eval.py
@@ -131,7 +131,7 @@ def test_batched_eval_requires_a_valid_manifest(tmp_path: Path) -> None:
     assert any("unavailable or invalid" in error for error in errors)
 
 
-def test_validate_scores_warns_when_batch_status_metadata_is_unreadable(
+def test_validate_scores_fails_when_expected_batch_metadata_is_unreadable(
     tmp_path: Path,
     monkeypatch,
     capsys,
@@ -157,18 +157,70 @@ def test_validate_scores_warns_when_batch_status_metadata_is_unreadable(
             str(meta_path),
             "--results-glob",
             str(result_path),
+            "--expected-concs",
+            "1 4 8",
         ],
     )
 
-    assert validate_scores_main() == 0
+    assert validate_scores_main() == 1
     captured = capsys.readouterr()
-    assert (
-        "WARN: could not inspect eval metadata for batched concurrency status"
-        in captured.err
+    assert "unavailable or invalid" in captured.err
+
+
+def test_workflow_concurrencies_are_independent_of_eval_metadata(
+    tmp_path: Path,
+) -> None:
+    meta_path = tmp_path / "meta_env.json"
+    meta_path.write_text(json.dumps({
+        "eval_concs": [8],
+        "completed_eval_concs": [8],
+        "failed_eval_concs": [],
+    }))
+    result_path = tmp_path / "results_test_conc8.json"
+    result_path.write_text('{"results": {}}')
+
+    errors = validate_batch_manifest(
+        str(meta_path),
+        [str(result_path)],
+        expected_concs=[1, 4, 8],
     )
 
+    assert "batched eval metadata does not match workflow concurrencies" in errors
+    assert any("missing completed concurrency: 1, 4" in error for error in errors)
+    assert any("missing result files for concurrency: 1, 4" in error for error in errors)
+
+
+def test_validate_scores_checks_threshold_for_every_concurrency(
+    tmp_path: Path,
+    monkeypatch,
+) -> None:
+    (tmp_path / "meta_env.json").write_text(json.dumps({
+        "eval_concs": [1, 4],
+        "completed_eval_concs": [1, 4],
+        "failed_eval_concs": [],
+    }))
+    for conc, score in ((1, 0.9), (4, 0.8)):
+        (tmp_path / f"results_test_conc{conc}.json").write_text(json.dumps({
+            "results": {
+                "gsm8k": {
+                    "exact_match,strict-match": score,
+                },
+            },
+        }))
+    monkeypatch.setattr(sys, "argv", [
+        "validate_scores.py",
+        "--meta-env",
+        str(tmp_path / "meta_env.json"),
+        "--results-glob",
+        str(tmp_path / "results*.json"),
+        "--expected-concs",
+        "1 4",
+    ])
+
+    assert validate_scores_main() == 1
+
 
-def test_amd_multinode_container_inherits_eval_concurrency_list() -> None:
+def test_amd_multinode_container_forwards_eval_concurrency_list() -> None:
     job_slurm = (
         Path(__file__).resolve().parents[2]
         / "benchmarks"
@@ -178,5 +230,14 @@ def test_amd_multinode_container_inherits_eval_concurrency_list() -> None:
     )
     contents = job_slurm.read_text()
 
-    assert "-e EVAL_CONC\n" in contents
-    assert r"-e EVAL_CONC=\$EVAL_CONC" not in contents
+    assert r'-e \"EVAL_CONC=\$EVAL_CONC\"' in contents
+    assert "-e EVAL_CONC\n" not in contents
+
+    workflow = (
+        Path(__file__).resolve().parents[2]
+        / ".github"
+        / "workflows"
+        / "benchmark-multinode-tmpl.yml"
+    ).read_text()
+    assert 'expected_concs="${EVAL_CONC}"' in workflow
+    assert 'validate_scores.py --expected-concs "${expected_concs}"' in workflow
diff --git a/utils/evals/validate_scores.py b/utils/evals/validate_scores.py
index f74ed267f..f78067940 100644
--- a/utils/evals/validate_scores.py
+++ b/utils/evals/validate_scores.py
@@ -91,13 +91,14 @@ def resolve_threshold(config: dict, prefix: str | None, task: str, fallback: flo
 def validate_batch_manifest(
     meta_env_path: str,
     result_files: list[str],
+    expected_concs: list[int] | None = None,
 ) -> list[str]:
     """Validate that a batched eval produced every requested concurrency."""
     try:
         with open(meta_env_path) as f:
             meta = json.load(f)
     except (json.JSONDecodeError, OSError) as exc:
-        if any(
+        if expected_concs is not None or any(
             CONC_SUFFIX_RE.search(Path(result_file).name)
             for result_file in result_files
         ):
@@ -107,29 +108,44 @@ def validate_batch_manifest(
             ]
         return []
 
+    if expected_concs is not None and "eval_concs" not in meta:
+        if len(expected_concs) > 1:
+            return ["workflow requested multiple concurrencies but batched eval metadata is missing"]
+        errors = []
+        if meta.get("conc") != expected_concs[0]:
+            errors.append("eval metadata concurrency does not match workflow request")
+        if len(result_files) != 1:
+            errors.append("eval must produce exactly one result file")
+        return errors
     if "eval_concs" not in meta:
         return []
 
-    expected = meta.get("eval_concs")
+    metadata_expected = meta.get("eval_concs")
     completed = meta.get("completed_eval_concs")
     failed = meta.get("failed_eval_concs")
-    if not all(isinstance(values, list) for values in (expected, completed, failed)):
+    if not all(
+        isinstance(values, list)
+        for values in (metadata_expected, completed, failed)
+    ):
         return ["batched eval metadata must contain list-valued concurrency fields"]
     if not all(
         isinstance(value, int) and value > 0
-        for values in (expected, completed, failed)
+        for values in (metadata_expected, completed, failed)
         for value in values
     ):
         return ["batched eval metadata contains an invalid concurrency"]
 
     errors = []
-    expected_set = set(expected)
+    metadata_expected_set = set(metadata_expected)
+    expected_set = set(expected_concs or metadata_expected)
     completed_set = set(completed)
     failed_set = set(failed)
-    if len(expected_set) != len(expected):
+    if len(metadata_expected_set) != len(metadata_expected):
         errors.append("batched eval metadata contains duplicate expected concurrencies")
     if len(completed_set) != len(completed):
         errors.append("batched eval metadata contains duplicate completed concurrencies")
+    if expected_concs is not None and metadata_expected_set != expected_set:
+        errors.append("batched eval metadata does not match workflow concurrencies")
     if failed_set:
         errors.append(
             "batched eval failed for concurrency: "
@@ -200,8 +216,27 @@ def main() -> int:
         "--results-glob", default="results*.json",
         help="Glob pattern for result files (default: 'results*.json')",
     )
+    parser.add_argument(
+        "--expected-concs",
+        default=None,
+        help="Space-separated concurrencies requested by the workflow",
+    )
     args = parser.parse_args()
 
+    expected_concs = None
+    if args.expected_concs is not None:
+        try:
+            expected_concs = [int(value) for value in args.expected_concs.split()]
+        except ValueError:
+            expected_concs = []
+        if (
+            not expected_concs
+            or any(value <= 0 for value in expected_concs)
+            or len(set(expected_concs)) != len(expected_concs)
+        ):
+            print("FAIL: expected concurrencies must be unique positive integers", file=sys.stderr)
+            return 1
+
     # Load thresholds config
     config = {"default": {}, "models": {}}
     thresholds_path = args.thresholds
@@ -229,7 +264,11 @@ def main() -> int:
     checked = 0
     result_files = sorted(glob.glob(args.results_glob))
 
-    manifest_errors = validate_batch_manifest(args.meta_env, result_files)
+    manifest_errors = validate_batch_manifest(
+        args.meta_env,
+        result_files,
+        expected_concs,
+    )
     for error in manifest_errors:
         print(f"FAIL: {error}", file=sys.stderr)
         failed = True

From afdd9a4d7d257f6900e1aee5fb82e9df2f6dc7b4 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Mon, 22 Jun 2026 09:18:05 +0800
Subject: [PATCH 2/3] fix: label eval scores by concurrency, restrict all-evals
 to 8k1k, quiet sglang logs

- validate_scores.py: prefix each PASS/FAIL with [conc=N] taken from the
  result filename so a failing concurrency is identifiable, and line-buffer
  stdout/stderr so the "Loaded thresholds" header prints in emission order
  instead of after the stderr FAIL lines when CI merges the streams.
- generate_sweep_configs.py: mark_all_eval_entries now only expands evals for
  8k1k entries (matching mark_eval_entries); 1k1k entries pass through
  unmarked and are dropped by the evals-only filter, so --all-evals no longer
  schedules 1k1k evals.
- server_sglang.sh / env.sh: launch sglang prefill+decode with
  --log-level ${SGLANG_SERVER_LOG_LEVEL:-warning} and default
  MORI_APP_LOG_LEVEL to WARNING to cut multinode log spam (both overridable;
  server readiness is detected via health endpoints, not log scraping).
---
 benchmarks/multi_node/amd_utils/env.sh        |  4 +-
 .../multi_node/amd_utils/server_sglang.sh     |  3 +
 utils/evals/test_batched_eval.py              |  7 ++
 utils/evals/validate_scores.py                | 19 ++++-
 utils/matrix_logic/generate_sweep_configs.py  | 16 ++++-
 .../test_generate_sweep_configs.py            | 71 +++++++++++++++----
 6 files changed, 103 insertions(+), 17 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 71d2653bd..a24347114 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -170,7 +170,9 @@ else
 
     export MORI_EP_LAUNCH_CONFIG_MODE=AUTO
 
-    export MORI_APP_LOG_LEVEL=INFO
+    # Default to WARNING to cut per-op MoRI log spam on long multinode/eval
+    # runs; override with MORI_APP_LOG_LEVEL=INFO when debugging.
+    export MORI_APP_LOG_LEVEL="${MORI_APP_LOG_LEVEL:-WARNING}"
 
     # Router logging control:
     # 0 (default) keeps noisy per-request access logs out of stdout while still logging to file.
diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh
index c28ccab41..34351b1e4 100755
--- a/benchmarks/multi_node/amd_utils/server_sglang.sh
+++ b/benchmarks/multi_node/amd_utils/server_sglang.sh
@@ -425,6 +425,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
         --host 0.0.0.0 \
         --port 8000 \
         --trust-remote-code \
+        --log-level ${SGLANG_SERVER_LOG_LEVEL:-warning} \
         ${PREFILL_SERVER_CONFIG} "
 
     if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
@@ -657,6 +658,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
         --host 0.0.0.0 \
         --port 8000 \
         --trust-remote-code \
+        --log-level ${SGLANG_SERVER_LOG_LEVEL:-warning} \
         ${PREFILL_SERVER_CONFIG} "
 
     if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
@@ -725,6 +727,7 @@ else
         --host 0.0.0.0 \
         --port 8000 \
         --trust-remote-code \
+        --log-level ${SGLANG_SERVER_LOG_LEVEL:-warning} \
         ${DECODE_SERVER_CONFIG} "
 
     if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then
diff --git a/utils/evals/test_batched_eval.py b/utils/evals/test_batched_eval.py
index 33edf1c2a..a5d6df008 100644
--- a/utils/evals/test_batched_eval.py
+++ b/utils/evals/test_batched_eval.py
@@ -193,6 +193,7 @@ def test_workflow_concurrencies_are_independent_of_eval_metadata(
 def test_validate_scores_checks_threshold_for_every_concurrency(
     tmp_path: Path,
     monkeypatch,
+    capsys,
 ) -> None:
     (tmp_path / "meta_env.json").write_text(json.dumps({
         "eval_concs": [1, 4],
@@ -219,6 +220,12 @@ def test_validate_scores_checks_threshold_for_every_concurrency(
 
     assert validate_scores_main() == 1
 
+    # Each score line is attributed to the concurrency that produced it, so a
+    # failing concurrency is identifiable from the log (conc 4 here).
+    captured = capsys.readouterr()
+    assert "PASS: [conc=1] gsm8k exact_match,strict-match" in captured.out
+    assert "FAIL: [conc=4] gsm8k exact_match,strict-match" in captured.err
+
 
 def test_amd_multinode_container_forwards_eval_concurrency_list() -> None:
     job_slurm = (
diff --git a/utils/evals/validate_scores.py b/utils/evals/validate_scores.py
index f78067940..0678d5670 100644
--- a/utils/evals/validate_scores.py
+++ b/utils/evals/validate_scores.py
@@ -191,6 +191,17 @@ def validate_batch_manifest(
 
 
 def main() -> int:
+    # CI merges this script's stdout and stderr into a single log.  When stdout
+    # is a pipe it is block-buffered by default and flushes only when its buffer
+    # fills or at exit, which pushes the informational header (e.g. "Loaded
+    # thresholds...") below the line-buffered stderr FAIL lines.  Force line
+    # buffering on both streams so every line reaches the log in emission order.
+    for _stream in (sys.stdout, sys.stderr):
+        try:
+            _stream.reconfigure(line_buffering=True)
+        except (AttributeError, ValueError):
+            pass
+
     parser = argparse.ArgumentParser(description="Validate eval scores")
     parser.add_argument(
         "--min-score", type=float, default=0.85,
@@ -285,6 +296,8 @@ def main() -> int:
             )
 
     for f in result_files:
+        match = CONC_SUFFIX_RE.search(Path(f).name)
+        conc_label = f"[conc={match.group(1)}] " if match else ""
         with open(f) as fh:
             data = json.load(fh)
         for task, metrics in data.get("results", {}).items():
@@ -297,12 +310,14 @@ def main() -> int:
                 checked += 1
                 if val < min_score:
                     print(
-                        f"FAIL: {task} {name} = {val:.4f} (< {min_score} from {source})",
+                        f"FAIL: {conc_label}{task} {name} = {val:.4f} (< {min_score} from {source})",
                         file=sys.stderr,
                     )
                     failed = True
                 else:
-                    print(f"PASS: {task} {name} = {val:.4f} (>= {min_score} from {source})")
+                    print(
+                        f"PASS: {conc_label}{task} {name} = {val:.4f} (>= {min_score} from {source})"
+                    )
 
     if checked == 0:
         print("WARN: no metrics matched prefix '{}'".format(args.metric_prefix), file=sys.stderr)
diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py
index 51d69aed9..db3a268d4 100644
--- a/utils/matrix_logic/generate_sweep_configs.py
+++ b/utils/matrix_logic/generate_sweep_configs.py
@@ -149,8 +149,11 @@ def _eligible_eval_concs(entry):
 
 
 def mark_all_eval_entries(matrix_values: list[dict]) -> list[dict]:
-    """Expand eval selection to every fixed-sequence entry.
+    """Expand eval selection to every 8k1k fixed-sequence entry.
 
+    Evals only run at 8k1k (matching mark_eval_entries), so entries at other
+    sequence lengths (e.g. 1k1k) are passed through untouched rather than
+    expanded into eval rows.
     Agentic entries are left untouched because they do not support lm-eval.
     Multi-node rows with the same engine topology are merged into one eval row
     whose full concurrency list is run sequentially against the same engine.
@@ -158,11 +161,22 @@ def mark_all_eval_entries(matrix_values: list[dict]) -> list[dict]:
     expanded_entries: list[dict] = []
     multinode_indices: dict[tuple, int] = {}
 
+    target_isl, target_osl = seq_len_stoi["8k1k"]
+
     for entry in matrix_values:
         if entry.get(Fields.SCENARIO_TYPE.value) == 'agentic-coding':
             expanded_entries.append(entry)
             continue
 
+        # Only 8k1k is eligible for evals; leave other sequence lengths as-is
+        # (their RUN_EVAL stays False, so the evals-only filter drops them).
+        if (
+            entry.get(Fields.ISL.value) != target_isl
+            or entry.get(Fields.OSL.value) != target_osl
+        ):
+            expanded_entries.append(entry)
+            continue
+
         if Fields.PREFILL.value in entry:
             conc = entry[Fields.CONC.value]
             conc_values = conc if isinstance(conc, list) else [conc]
diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py
index 6179cef45..f082eada7 100644
--- a/utils/matrix_logic/test_generate_sweep_configs.py
+++ b/utils/matrix_logic/test_generate_sweep_configs.py
@@ -473,14 +473,14 @@ def test_never_marks_all_entries(self):
 class TestMarkAllEvalEntries:
     """Tests for the all-evals selection policy."""
 
-    def test_marks_all_fixed_sequence_entries_without_policy_filters(self):
+    def test_marks_only_8k1k_entries_and_passes_other_seq_lens_through(self):
         entries = [
-            {
+            {  # 1k1k is not eligible for evals -> left unmarked
                 'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
                 'isl': 1024, 'osl': 1024, 'tp': 2, 'conc': 1,
                 'spec-decoding': 'none', 'dp-attn': False, 'run-eval': False,
             },
-            {
+            {  # 8k1k is eligible -> marked for eval
                 'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
                 'isl': 8192, 'osl': 1024, 'tp': 2, 'conc': 8,
                 'spec-decoding': 'none', 'dp-attn': False, 'run-eval': False,
@@ -489,13 +489,15 @@ def test_marks_all_fixed_sequence_entries_without_policy_filters(self):
 
         result = mark_all_eval_entries(entries)
 
-        assert all(entry['run-eval'] for entry in result)
+        by_isl = {entry['isl']: entry for entry in result}
+        assert by_isl[1024]['run-eval'] is False
+        assert by_isl[8192]['run-eval'] is True
 
     def test_batches_every_multinode_concurrency_per_engine_topology(self):
         entries = [
             {
                 'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
-                'isl': 1024, 'osl': 1024, 'spec-decoding': 'none',
+                'isl': 8192, 'osl': 1024, 'spec-decoding': 'none',
                 'prefill': {'dp-attn': False},
                 'decode': {'dp-attn': False},
                 'conc': [1, 4, 8, 16],
@@ -545,7 +547,7 @@ def test_deduplicates_overlapping_concurrency_rows_for_same_parallelism(self):
         entries = [
             {
                 'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
-                'isl': 1024, 'osl': 1024, 'spec-decoding': 'none',
+                'isl': 8192, 'osl': 1024, 'spec-decoding': 'none',
                 'prefill': {'dp-attn': False},
                 'decode': {'dp-attn': False},
                 'conc': [4, 8, 16],
@@ -554,7 +556,7 @@ def test_deduplicates_overlapping_concurrency_rows_for_same_parallelism(self):
             },
             {
                 'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
-                'isl': 1024, 'osl': 1024, 'spec-decoding': 'none',
+                'isl': 8192, 'osl': 1024, 'spec-decoding': 'none',
                 'prefill': {'dp-attn': False},
                 'decode': {'dp-attn': False},
                 'conc': [16, 32],
@@ -570,6 +572,40 @@ def test_deduplicates_overlapping_concurrency_rows_for_same_parallelism(self):
         assert result[0]['eval-all-concs'] is True
         assert 'eval-conc' not in result[0]
 
+    def test_excludes_1k1k_multinode_entries_from_expansion(self):
+        entries = [
+            {  # 1k1k multinode: left untouched, never batched or eval-marked
+                'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
+                'isl': 1024, 'osl': 1024, 'spec-decoding': 'none',
+                'prefill': {'dp-attn': False},
+                'decode': {'dp-attn': False},
+                'conc': [4, 8, 16],
+                'run-eval': False,
+            },
+            {  # 8k1k multinode: expanded into a batched eval row
+                'model': 'm', 'runner': 'r', 'framework': 'f', 'precision': 'fp8',
+                'isl': 8192, 'osl': 1024, 'spec-decoding': 'none',
+                'prefill': {'dp-attn': False},
+                'decode': {'dp-attn': False},
+                'conc': [8, 32],
+                'run-eval': False,
+            },
+        ]
+
+        result = mark_all_eval_entries(entries)
+
+        assert len(result) == 2
+        one_k = next(e for e in result if e['isl'] == 1024)
+        eight_k = next(e for e in result if e['isl'] == 8192)
+        # 1k1k untouched: not eval-marked, not batched, concurrency unchanged
+        assert one_k['run-eval'] is False
+        assert 'eval-all-concs' not in one_k
+        assert one_k['conc'] == [4, 8, 16]
+        # 8k1k expanded into a batched eval row
+        assert eight_k['run-eval'] is True
+        assert eight_k['eval-all-concs'] is True
+        assert eight_k['conc'] == [8, 32]
+
     def test_skips_agentic_entries(self):
         entries = [
             {
@@ -1811,7 +1847,8 @@ def test_all_evals_cli_marks_every_fixed_sequence_entry(
         sample_single_node_config,
         sample_runner_config,
     ):
-        """--all-evals should bypass the default 8k1k/min-conc policy."""
+        """--all-evals bypasses the default min-conc/highest-median policy but
+        still only evaluates 8k1k (1k1k entries are excluded)."""
         import sys
         import generate_sweep_configs
 
@@ -1835,9 +1872,10 @@ def test_all_evals_cli_marks_every_fixed_sequence_entry(
 
         result = generate_sweep_configs.main()
 
-        assert len(result) == 10
+        # Every 8k1k concurrency is marked (5 conc values), and the 1k1k
+        # entries are dropped rather than evaluated.
+        assert len(result) == 5
         assert {(entry['isl'], entry['osl']) for entry in result} == {
-            (1024, 1024),
             (8192, 1024),
         }
         assert min(entry['conc'] for entry in result) == 4
@@ -1874,7 +1912,10 @@ def test_all_evals_composes_with_evals_only(
 
         result = generate_sweep_configs.main()
 
-        assert len(result) == 10
+        assert len(result) == 5
+        assert {(entry['isl'], entry['osl']) for entry in result} == {
+            (8192, 1024),
+        }
         assert all(entry['run-eval'] is True for entry in result)
         assert all(entry['eval-only'] is True for entry in result)
 
@@ -1888,10 +1929,14 @@ def test_all_evals_batches_each_multinode_concurrency(
         import generate_sweep_configs
 
         config = sample_multinode_config
-        search_space = (
+        seq_entry = (
             config['dsr1-fp4-gb200-dynamo-trt']['scenarios']
-            ['fixed-seq-len'][0]['search-space']
+            ['fixed-seq-len'][0]
         )
+        # all-evals only evaluates 8k1k, so target that sequence length.
+        seq_entry['isl'] = 8192
+        seq_entry['osl'] = 1024
+        search_space = seq_entry['search-space']
         search_space[0]['conc-list'] = [4, 16, 64]
 
         monkeypatch.setattr(

From 35dc57c85f30eee93936245b813595803153ae6f Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Mon, 22 Jun 2026 09:32:41 +0800
Subject: [PATCH 3/3] docs: explain best-effort stream reconfigure in
 validate_scores

Address github-code-quality review: the except (AttributeError, ValueError)
guard around stream reconfigure silently swallowed the exception with no
explanation. Add a comment noting it is best-effort (wrapped streams such as
pytest's capture object do not support reconfigure). No functional change.
---
 utils/evals/validate_scores.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/utils/evals/validate_scores.py b/utils/evals/validate_scores.py
index 0678d5670..1fff4178e 100644
--- a/utils/evals/validate_scores.py
+++ b/utils/evals/validate_scores.py
@@ -200,6 +200,8 @@ def main() -> int:
         try:
             _stream.reconfigure(line_buffering=True)
         except (AttributeError, ValueError):
+            # Best-effort only: some wrapped streams (e.g. pytest's capture
+            # object) don't support reconfigure; leave their buffering as-is.
             pass
 
     parser = argparse.ArgumentParser(description="Validate eval scores")