From a821264abcef3a81955bb73382edaea035b4ea87 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 23 Jun 2026 12:04:48 +0800 Subject: [PATCH] fix: allow all-evals sweep reuse --- .github/workflows/README.md | 5 +++-- .github/workflows/run-sweep.yml | 1 - AGENTS.md | 2 +- .../test_run_sweep_gating.py | 7 ++++--- .../test_validate_perf_changelog.py | 6 +++++- utils/find_reusable_sweep_run.py | 2 +- utils/merge_with_reuse.sh | 2 -- utils/test_find_reusable_sweep_run.py | 18 +++++++++++------- 8 files changed, 25 insertions(+), 18 deletions(-) diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 01a0f79d9..e90333d0f 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -196,8 +196,9 @@ Use `all-evals` and/or `evals-only` with one primary sweep label. `all-evals` covers every fixed-sequence config; each multi-node topology runs all `conc-list` values on one engine. `evals-only` suppresses throughput; together they run all evals only. The primary label still controls canary/fail-fast. -Modifier runs are not reusable; default full sweeps, including default evals, -are. +`all-evals` full sweeps are reusable. Runs with `evals-only`, including runs +with both modifiers, are not. Default full sweeps, including default evals, +are also reusable. ## Reusing an Approved PR Full Sweep diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index 68d0d87d4..797c693de 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -125,7 +125,6 @@ jobs: github.event_name == 'pull_request' && github.event.action == 'synchronize' && !github.event.pull_request.draft && - !contains(github.event.pull_request.labels.*.name, 'all-evals') && !contains(github.event.pull_request.labels.*.name, 'evals-only') && ( contains(github.event.pull_request.labels.*.name, 'full-sweep-enabled') || diff --git a/AGENTS.md b/AGENTS.md index 42b2cc5a5..e53e95e6e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -69,7 +69,7 @@ PRs do not run the sweep automatically - `run-sweep.yml` is gated on a primary s - `full-sweep-fail-fast` - runs the full intermediate concurrency sweep behind the same sequential single-node canary gate as `full-sweep-enabled` (so a globally broken change burns one job, not the whole fan-out), and with `strategy.fail-fast` enabled on every matrix: the first failure in a matrix cancels that matrix's remaining jobs. Fail-fast is matrix-scoped, so the other matrices (1k1k vs 8k1k vs agentic vs evals) keep running and self-terminate on their own first failure; their completed results remain valid. The failing job keeps its red *failure* conclusion and the run concludes failed. Use when a failure means the rest of that matrix is wasted GPU time (e.g. new image bring-up). Note one flaky job kills its matrix's in-flight results. - `full-sweep-fail-fast-no-canary` - same as `full-sweep-fail-fast` but without the canary gate: all matrices fan out immediately. Use when the canary is flaky or not representative of the affected configuration but you still want per-matrix fail-fast. -`all-evals` and `evals-only` are optional modifier labels. Combine either or both with one primary sweep label. `all-evals` expands eval selection to every generated fixed-sequence configuration without changing throughput. `evals-only` suppresses throughput while keeping the default eval subset; combining both runs every eval and no throughput. Runs with either modifier are not eligible for artifact reuse. +`all-evals` and `evals-only` are optional modifier labels. Combine either or both with one primary sweep label. `all-evals` expands eval selection to every generated fixed-sequence configuration without changing throughput. `evals-only` suppresses throughput while keeping the default eval subset; combining both runs every eval and no throughput. `all-evals` remains eligible for artifact reuse when paired with an eligible full-sweep label. Runs with `evals-only`, including runs with both modifiers, are not eligible. **The sweep does not trigger while the PR has merge conflicts.** Even with a sweep label applied, the `run-sweep.yml` workflow will not start until the PR cleanly merges into main — a stale claude/* or update-* branch with a `perf-changelog.yaml` conflict (the common case) will sit in NO_SWEEP / NO_SUCCESS until rebased. Resolution recipe is documented in `KLAUD_DEBUG.md §1.1`: `git merge origin/main`, then `git checkout origin/main -- perf-changelog.yaml`, then re-append the PR's own changelog entry at the tail. Don't 3-way merge `perf-changelog.yaml`; whitespace edits silently re-trigger the deletion check. diff --git a/utils/changelog_gate_tests/test_run_sweep_gating.py b/utils/changelog_gate_tests/test_run_sweep_gating.py index 399a36948..a36f1eb72 100644 --- a/utils/changelog_gate_tests/test_run_sweep_gating.py +++ b/utils/changelog_gate_tests/test_run_sweep_gating.py @@ -41,6 +41,7 @@ MODIFIER_LABELS = {"all-evals", "evals-only"} RELEVANT_LABELS = SWEEP_LABELS | MODIFIER_LABELS REUSE_ELIGIBLE_LABELS = SWEEP_LABELS - {"sweep-enabled"} +REUSE_INCOMPATIBLE_LABELS = {"evals-only"} # -------------------------------------------------------------------------- @@ -252,10 +253,10 @@ def run_dag(sc: dict) -> tuple[str, str, str]: ("PR-sync-evals-only-without-sweep-label", {**_PR, "action": "synchronize", "labels": ["evals-only"]}, ("success", "skipped", "SKIP")), - ("PR-sync-full-with-all-evals-ignores-reuse", + ("PR-sync-full-with-all-evals-uses-reuse", {**_PR, "action": "synchronize", "labels": ["full-sweep-enabled", "all-evals"], - "reuse_auth": True}, ("success", "skipped", "RUN")), + "reuse_auth": True}, ("success", "success", "SKIP")), ("PR-sync-full-with-evals-only-ignores-reuse", {**_PR, "action": "synchronize", "labels": ["full-sweep-enabled", "evals-only"], @@ -379,7 +380,7 @@ def reference_gate(sc: dict) -> tuple[str, str, str]: and sc.get("action") == "synchronize" and not draft and bool(labels & REUSE_ELIGIBLE_LABELS) - and labels.isdisjoint(MODIFIER_LABELS) + and labels.isdisjoint(REUSE_INCOMPATIBLE_LABELS) ) reuse = "success" if gate_runs else "skipped" authorized = gate_runs and sc.get("reuse_auth", False) diff --git a/utils/changelog_gate_tests/test_validate_perf_changelog.py b/utils/changelog_gate_tests/test_validate_perf_changelog.py index a09428381..d6b6a3565 100644 --- a/utils/changelog_gate_tests/test_validate_perf_changelog.py +++ b/utils/changelog_gate_tests/test_validate_perf_changelog.py @@ -351,6 +351,10 @@ def test_run_sweep_checks_changelog_before_reuse_and_setup() -> None: "!contains(github.event.pull_request.labels.*.name, 'evals-only')" in jobs["reuse-sweep-gate"]["if"] ) + assert ( + "!contains(github.event.pull_request.labels.*.name, 'all-evals')" + not in jobs["reuse-sweep-gate"]["if"] + ) setup_if = jobs["setup"]["if"] assert "needs.check-changelog.outputs.skip-pr-sweep != 'true'" in setup_if assert "github.event_name == 'push'" in setup_if @@ -376,7 +380,7 @@ def test_merge_helper_waits_for_pr_checks_before_merge() -> None: assert "CHECK_TIMEOUT_SECONDS" in script assert "prepare_perf_changelog_merge.py" in script assert "git commit --allow-empty" in script - assert "uses all-evals, which is not eligible for artifact reuse" in script + assert "uses all-evals, which is not eligible for artifact reuse" not in script assert "uses evals-only, which is not eligible for artifact reuse" in script assert script.count('CURRENT_HEAD="$(gh pr view') == 2 assert "must have exactly one sweep label" in script diff --git a/utils/find_reusable_sweep_run.py b/utils/find_reusable_sweep_run.py index 91652d65a..c88410d69 100644 --- a/utils/find_reusable_sweep_run.py +++ b/utils/find_reusable_sweep_run.py @@ -315,7 +315,7 @@ def main() -> int: ) parser.add_argument( "--reuse-incompatible-label", - default="all-evals,evals-only", + default="evals-only", help="Comma-separated PR labels that make sweep artifacts ineligible for reuse.", ) parser.add_argument("--pinned-run-command", default="/reuse-sweep-run") diff --git a/utils/merge_with_reuse.sh b/utils/merge_with_reuse.sh index cec185224..d1ff78c45 100755 --- a/utils/merge_with_reuse.sh +++ b/utils/merge_with_reuse.sh @@ -110,8 +110,6 @@ case "$SELECTED_SWEEP_LABEL" in die "PR #${PR} must use a full-sweep label for artifact reuse" ;; esac -[ "$(jq '[.labels[].name | select(. == "all-evals")] | length' <<<"$PR_INFO")" -eq 0 ] \ - || die "PR #${PR} uses all-evals, which is not eligible for artifact reuse" [ "$(jq '[.labels[].name | select(. == "evals-only")] | length' <<<"$PR_INFO")" -eq 0 ] \ || die "PR #${PR} uses evals-only, which is not eligible for artifact reuse" diff --git a/utils/test_find_reusable_sweep_run.py b/utils/test_find_reusable_sweep_run.py index eddaf1280..fd3c7669e 100644 --- a/utils/test_find_reusable_sweep_run.py +++ b/utils/test_find_reusable_sweep_run.py @@ -819,7 +819,10 @@ def fake_paginated_github_api(repo, path, token, item_key, params=None): assert outputs["reuse-reason"] == "PR #1321 has no /reuse-sweep-run authorization" -def test_main_accepts_non_canary_full_sweep_label(monkeypatch, tmp_path) -> None: +def test_main_accepts_all_evals_with_non_canary_full_sweep_label( + monkeypatch, + tmp_path, +) -> None: comments = [ { "created_at": "2026-05-13T00:00:00Z", @@ -845,7 +848,10 @@ def fake_github_api(repo, path, token, params=None): if path == "/pulls/1321": return { "merged_at": "2026-05-13T00:01:00Z", - "labels": [{"name": "non-canary-full-sweep-enabled"}], + "labels": [ + {"name": "non-canary-full-sweep-enabled"}, + {"name": "all-evals"}, + ], "head": {"sha": "abc123"}, } if path == "/actions/runs/25763404168": @@ -889,11 +895,9 @@ def fake_paginated_github_api(repo, path, token, item_key, params=None): assert outputs["reuse-enabled"] == "true" -@pytest.mark.parametrize("modifier", ["all-evals", "evals-only"]) -def test_main_rejects_eval_modifier_for_reuse( +def test_main_rejects_evals_only_for_reuse( monkeypatch, tmp_path, - modifier, ) -> None: comments = [ { @@ -911,7 +915,7 @@ def fake_github_api(repo, path, token, params=None): "merged_at": "2026-05-13T00:01:00Z", "labels": [ {"name": "full-sweep-enabled"}, - {"name": modifier}, + {"name": "evals-only"}, ], "head": {"sha": "abc123"}, } @@ -944,7 +948,7 @@ def fake_paginated_github_api(repo, path, token, item_key, params=None): ], ) - with pytest.raises(RuntimeError, match=rf"reuse-incompatible.*{modifier}"): + with pytest.raises(RuntimeError, match=r"reuse-incompatible.*evals-only"): reuse.main()