Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 19 additions & 25 deletions tests/perf/auto_perf_sheriffing/conftest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from copy import copy, deepcopy
from datetime import datetime, timedelta
from unittest.mock import MagicMock

Expand All @@ -12,12 +11,13 @@
create_perf_alert,
create_perf_signature,
)
from treeherder.model.models import Job, JobGroup, JobType, MachinePlatform
from treeherder.model.models import Job, JobGroup, JobType, MachinePlatform, Push
from treeherder.perf.auto_perf_sheriffing.secretary import Secretary
from treeherder.perf.models import (
BackfillRecord,
BackfillReport,
PerformanceAlert,
PerformanceDatum,
PerformanceSettings,
PerformanceSignature,
)
Expand All @@ -35,26 +35,6 @@ def record_context_sample():
return load_json_fixture("recordContext.json")


@pytest.fixture(params=["totally_broken_json", "missing_job_fields", "null_job_fields"])
def broken_context_str(record_context_sample: list, request) -> str:
    """Return a corrupted backfill-record context string, one flavor per param.

    Parametrized failure modes:
      * "totally_broken_json"  -- syntactically unparseable JSON (all double
        quotes replaced with '<').
      * "missing_job_fields"   -- valid JSON, but every data point lacks the
        "job_id" key entirely.
      * "null_job_fields"      -- valid JSON, but every data point carries an
        explicit null "job_id".

    Note: record_context_sample is a list of data-point dicts (loaded from
    recordContext.json), and the return value is always a str -- the original
    annotations (dict param, list return) were inaccurate.
    """
    specific = request.param

    if specific == "totally_broken_json":
        # str.replace already returns a new string; copying an immutable
        # str first (as the old code did) was a no-op.
        return json.dumps(record_context_sample).replace(r'"', "<")

    record_copy = deepcopy(record_context_sample)
    if specific == "missing_job_fields":
        for data_point in record_copy:
            del data_point["job_id"]
    elif specific == "null_job_fields":
        for data_point in record_copy:
            data_point["job_id"] = None
    return json.dumps(record_copy)


@pytest.fixture(params=["preliminary", "from_non_linux"])
def record_unsuited_for_backfill(test_perf_alert, request):
report = BackfillReport.objects.create(summary=test_perf_alert.summary)
Expand Down Expand Up @@ -131,12 +111,26 @@ def platform_specific_perf_alert(


@pytest.fixture
def record_ready_for_processing(platform_specific_perf_alert, record_context_sample):
report = BackfillReport.objects.create(summary=platform_specific_perf_alert.summary)
def record_ready_for_processing(platform_specific_perf_alert, record_context_sample, test_job_2):
alert = platform_specific_perf_alert
push_id = alert.summary.push_id
push = Push.objects.get(id=push_id)

PerformanceDatum.objects.create(
repository=alert.series_signature.repository,
signature=alert.series_signature,
push=push,
push_timestamp=push.time,
job=test_job_2,
value=10.0,
)

report = BackfillReport.objects.create(summary=alert.summary)
record = BackfillRecord.objects.create(
alert=platform_specific_perf_alert,
alert=alert,
report=report,
status=BackfillRecord.READY_FOR_PROCESSING,
last_detected_push_id=push_id,
)
record.set_context(record_context_sample)
record.save()
Expand Down
19 changes: 13 additions & 6 deletions tests/perf/auto_perf_sheriffing/test_report_backfill_outcome.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from tests import settings as test_settings
from treeherder.perf.auto_perf_sheriffing.sherlock import Sherlock
from treeherder.perf.exceptions import MaxRuntimeExceededError
from treeherder.perf.models import BackfillNotificationRecord
from treeherder.perf.models import BackfillNotificationRecord, BackfillRecord

EPOCH = datetime.utcfromtimestamp(0)

Expand Down Expand Up @@ -36,18 +36,23 @@ def test_email_is_sent_after_successful_backfills(
assert BackfillNotificationRecord.objects.count() == 0


def test_email_is_still_sent_if_context_is_too_corrupt_to_be_actionable(
def test_email_is_still_sent_if_backfill_job_is_missing(
report_maintainer_mock,
backfill_tool_mock,
secretary,
record_ready_for_processing,
sherlock_settings,
broken_context_str,
tc_notify_mock,
# Note: parametrizes the test
):
record_ready_for_processing.context = broken_context_str
record_ready_for_processing.save()
# Simulate a job that has expired/been deleted: datum exists at last_detected_push_id but job=None.
# The backfill is skipped (no valid job_id), record ends up FAILED,
# but the notification must still be sent.
from treeherder.perf.models import PerformanceDatum

PerformanceDatum.objects.filter(
signature=record_ready_for_processing.alert.series_signature,
push_id=record_ready_for_processing.last_detected_push_id,
).update(job=None)

sherlock = Sherlock(
report_maintainer_mock,
Expand All @@ -60,6 +65,8 @@ def test_email_is_still_sent_if_context_is_too_corrupt_to_be_actionable(
repositories=[test_settings.TREEHERDER_TEST_REPOSITORY_NAME],
)

record_ready_for_processing.refresh_from_db()
assert record_ready_for_processing.status == BackfillRecord.FAILED
assert BackfillNotificationRecord.objects.count() == 1
call_command("report_backfill_outcome")
assert BackfillNotificationRecord.objects.count() == 0
Expand Down
144 changes: 144 additions & 0 deletions tests/perf/auto_perf_sheriffing/test_secretary.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import uuid
from datetime import datetime, timedelta
from unittest.mock import patch

import pytest
import simplejson as json
Expand Down Expand Up @@ -299,3 +300,146 @@ def test_pending_and_failed_jobs_means_in_progress_outcome(
assert record_backfilled.total_backfills_in_progress == 1
assert record_backfilled.total_backfills_failed == 1
assert response == OutcomeStatus.IN_PROGRESS


class TestVerifyAndIterate:
    """Tests for Secretary.verify_and_iterate.

    verify_and_iterate is the post-backfill loop that re-runs change
    detection around a successfully backfilled record and decides whether
    the culprit push is confirmed (stop), has drifted left/right (queue
    another backfill iteration), or cannot make progress (stop).

    NOTE(review): every test patches re_run_detect_changes (and sometimes
    _calculate_gap_size), so this class validates only the decision /
    state-transition logic, not the detection algorithm itself.
    """

    @pytest.fixture
    def secretary(self):
        # Fresh, un-mocked Secretary per test; shadows any outer fixture.
        return Secretary()

    @pytest.fixture
    def anchor_push(self, create_push, test_repository):
        # The push the record is anchored to via last_detected_push_id.
        return create_push(test_repository, revision=uuid.uuid4(), time=datetime.utcnow())

    @pytest.fixture
    def record_successful(self, test_perf_alert, record_context_sample, anchor_push):
        # A backfill record whose backfills already completed (SUCCESSFUL),
        # i.e. the state verify_and_iterate is meant to pick up from.
        report = BackfillReport.objects.create(summary=test_perf_alert.summary)
        record = BackfillRecord.objects.create(
            alert=test_perf_alert,
            report=report,
            status=BackfillRecord.SUCCESSFUL,
            last_detected_push_id=anchor_push.id,
        )
        record.set_context(record_context_sample)
        record.save()
        return record

    def test_stops_at_max_iterations(self, secretary, record_successful):
        # At the iteration cap, detection must not even be attempted and the
        # record keeps its SUCCESSFUL status.
        record_successful.iteration_count = 5
        record_successful.save()

        with patch.object(secretary, "re_run_detect_changes") as mock_detect:
            secretary.verify_and_iterate(record_successful, max_iterations=5)
            mock_detect.assert_not_called()

        record_successful.refresh_from_db()
        assert record_successful.status == BackfillRecord.SUCCESSFUL

    def test_stops_when_no_change_detected(self, secretary, record_successful):
        # (None, None, []) from detection means no regression remains: done.
        with patch.object(secretary, "re_run_detect_changes", return_value=(None, None, [])):
            secretary.verify_and_iterate(record_successful)

        record_successful.refresh_from_db()
        assert record_successful.status == BackfillRecord.SUCCESSFUL

    def test_stops_when_culprit_stabilized(self, secretary, record_successful):
        # Same push re-detected AND no gap left around it -> culprit is
        # confirmed; a single "stabilized" log entry is recorded.
        push_id = record_successful.last_detected_push_id

        with (
            patch.object(secretary, "re_run_detect_changes", return_value=(push_id, 2.5, [])),
            patch.object(secretary, "_calculate_gap_size", return_value=0),
        ):
            secretary.verify_and_iterate(record_successful)

        record_successful.refresh_from_db()
        assert record_successful.status == BackfillRecord.SUCCESSFUL
        logs = record_successful.get_backfill_logs()
        assert len(logs) == 1
        assert logs[0]["direction"] == "stabilized"

    def test_continues_when_culprit_stabilized_but_gap_remains(self, secretary, record_successful):
        # Same push re-detected but neighboring pushes still lack data
        # (gap_size > 0) -> requeue for another backfill round.
        push_id = record_successful.last_detected_push_id

        with (
            patch.object(secretary, "re_run_detect_changes", return_value=(push_id, 2.5, [])),
            patch.object(secretary, "_calculate_gap_size", return_value=3),
        ):
            secretary.verify_and_iterate(record_successful)

        record_successful.refresh_from_db()
        assert record_successful.status == BackfillRecord.READY_FOR_PROCESSING
        assert record_successful.last_detected_push_id == push_id
        logs = record_successful.get_backfill_logs()
        assert len(logs) == 1
        assert logs[0]["direction"] == "stabilized_with_gap"

    def test_queues_next_iteration_when_culprit_moves_left(
        self, secretary, record_successful, create_push, test_repository, anchor_push
    ):
        # Detection now blames an older push -> re-anchor the record there
        # and queue another iteration, logging direction "left".
        earlier_push = create_push(
            test_repository,
            revision=uuid.uuid4(),
            time=anchor_push.time - timedelta(days=5),
        )

        with patch.object(
            secretary, "re_run_detect_changes", return_value=(earlier_push.id, 3.0, [])
        ):
            secretary.verify_and_iterate(record_successful)

        record_successful.refresh_from_db()
        assert record_successful.status == BackfillRecord.READY_FOR_PROCESSING
        assert record_successful.last_detected_push_id == earlier_push.id
        assert record_successful.get_backfill_logs()[0]["direction"] == "left"

    def test_queues_next_iteration_when_culprit_drifts_right(
        self, secretary, record_successful, create_push, test_repository, anchor_push
    ):
        # Mirror of the "left" case: detection blames a newer push.
        later_push = create_push(
            test_repository,
            revision=uuid.uuid4(),
            time=anchor_push.time + timedelta(days=1),
        )

        with patch.object(
            secretary, "re_run_detect_changes", return_value=(later_push.id, 3.0, [])
        ):
            secretary.verify_and_iterate(record_successful)

        record_successful.refresh_from_db()
        assert record_successful.status == BackfillRecord.READY_FOR_PROCESSING
        assert record_successful.last_detected_push_id == later_push.id
        logs = record_successful.get_backfill_logs()
        assert len(logs) == 1
        assert logs[0]["direction"] == "right"

    def test_stops_when_gap_not_shrinking(self, secretary, record_successful):
        """If gap_size doesn't decrease between iterations, pushes lack the target job — stop."""
        push_id = record_successful.last_detected_push_id

        # Simulate a previous log entry with gap_size=3; the mocked
        # _calculate_gap_size also returns 3, i.e. no progress was made.
        record_successful.append_to_backfill_logs(
            {"iteration": 0, "detected_push_gap_size": 3, "direction": "stabilized_with_gap"}
        )
        record_successful.save()

        with (
            patch.object(secretary, "re_run_detect_changes", return_value=(push_id, 2.5, [])),
            patch.object(secretary, "_calculate_gap_size", return_value=3),
        ):
            secretary.verify_and_iterate(record_successful)

        record_successful.refresh_from_db()
        # Status should not change to READY_FOR_PROCESSING — stuck gap means done
        assert record_successful.status == BackfillRecord.SUCCESSFUL
        logs = record_successful.get_backfill_logs()
        assert logs[-1]["direction"] == "stabilized_gap_stuck"

    def test_sets_verification_failed_on_exception(self, secretary, record_successful):
        # Any unexpected error during detection must be absorbed and leave
        # the record in the terminal VERIFICATION_FAILED state.
        with patch.object(
            secretary, "re_run_detect_changes", side_effect=Exception("unexpected error")
        ):
            secretary.verify_and_iterate(record_successful)

        record_successful.refresh_from_db()
        assert record_successful.status == BackfillRecord.VERIFICATION_FAILED
24 changes: 8 additions & 16 deletions tests/perf/auto_perf_sheriffing/test_sherlock.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
from datetime import datetime, timedelta
from json import JSONDecodeError

import pytest
import simplejson as json
from django.db import models

from tests import settings as test_settings
from tests.perf.auto_perf_sheriffing.conftest import prepare_record_with_search_str
from treeherder.model.models import Job, Push
from treeherder.perf.auto_perf_sheriffing.sherlock import Sherlock
from treeherder.perf.exceptions import MaxRuntimeExceededError
from treeherder.perf.models import BackfillRecord, BackfillReport
Expand Down Expand Up @@ -180,31 +178,25 @@ def test_db_limits_update_if_backfills_left(

record_ready_for_processing.refresh_from_db()
assert record_ready_for_processing.status == BackfillRecord.BACKFILLED
assert (initial_backfills - 4) == secretary.backfills_left(on_platform=targeted_platform)
assert (initial_backfills - 1) == secretary.backfills_left(on_platform=targeted_platform)


def test_backfilling_gracefully_handles_invalid_json_contexts_without_blowing_up(
def test_record_fails_when_no_datum_at_last_detected_push_id(
report_maintainer_mock,
backfill_tool_mock,
secretary,
record_ready_for_processing,
sherlock_settings,
broken_context_str, # Note: parametrizes the test
):
record_ready_for_processing.context = broken_context_str
record_ready_for_processing.last_detected_push_id = 999999
record_ready_for_processing.save()

sherlock = Sherlock(report_maintainer_mock, backfill_tool_mock, secretary)
try:
sherlock.sheriff(
since=EPOCH,
frameworks=["test_talos"],
repositories=[test_settings.TREEHERDER_TEST_REPOSITORY_NAME],
)
except (JSONDecodeError, KeyError, Job.DoesNotExist, Push.DoesNotExist):
pytest.fail()
sherlock.sheriff(
since=EPOCH,
frameworks=["test_talos"],
repositories=[test_settings.TREEHERDER_TEST_REPOSITORY_NAME],
)

record_ready_for_processing.refresh_from_db()

assert record_ready_for_processing.status == BackfillRecord.FAILED
assert not has_changed(sherlock_settings)
1 change: 1 addition & 0 deletions treeherder/perf/auto_perf_sheriffing/backfill_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,7 @@ def _provide_records(self, backfill_report: BackfillReport, alert_context_map: l
alert=alert,
report=backfill_report,
context=json.dumps(retrigger_context, default=default_serializer),
last_detected_push_id=alert.summary.push_id,
)

def __fetch_summaries_to_retrigger(
Expand Down
1 change: 1 addition & 0 deletions treeherder/perf/auto_perf_sheriffing/backfill_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def backfill_job(self, job: Job | str) -> str:
decision_task_id=decision_task_id,
input={
"retrigger": False,
"slices": 3,
},
root_url=job.repository.tc_root_url,
)
Expand Down
11 changes: 10 additions & 1 deletion treeherder/perf/auto_perf_sheriffing/outcome_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,17 @@ def check(self, record: BackfillRecord) -> OutcomeStatus:
total_backfills_failed = 0
total_backfills_successful = 0

pushes_in_range = record.get_pushes_in_context_range()
if record.last_detected_push_id is None:
# legacy records: every push in the context range has been backfilled
pushes_in_range = record.get_pushes_in_context_range()
else:
pushes_in_range = record.get_pushes_from_anchor()

for push in pushes_in_range:
has_any_job = push.jobs.filter(job_type=of_type).exists()
if not has_any_job:
continue

# make sure it has at least one successful job of job type
if push.total_jobs(of_type, with_successful_results) == 0:
# either (at least) one job is in progress or it failed
Expand Down
Loading