Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 19 additions & 25 deletions tests/perf/auto_perf_sheriffing/conftest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from copy import copy, deepcopy
from datetime import datetime, timedelta
from unittest.mock import MagicMock

Expand All @@ -12,12 +11,13 @@
create_perf_alert,
create_perf_signature,
)
from treeherder.model.models import Job, JobGroup, JobType, MachinePlatform
from treeherder.model.models import Job, JobGroup, JobType, MachinePlatform, Push
from treeherder.perf.auto_perf_sheriffing.secretary import Secretary
from treeherder.perf.models import (
BackfillRecord,
BackfillReport,
PerformanceAlert,
PerformanceDatum,
PerformanceSettings,
PerformanceSignature,
)
Expand All @@ -35,26 +35,6 @@ def record_context_sample():
return load_json_fixture("recordContext.json")


@pytest.fixture(params=["totally_broken_json", "missing_job_fields", "null_job_fields"])
def broken_context_str(record_context_sample: list, request) -> str:
    """Return a corrupted backfill-record context string, one flavor per param.

    Parametrized failure modes:
      * "totally_broken_json"  -- syntactically unparseable JSON (all double
        quotes replaced with '<').
      * "missing_job_fields"   -- valid JSON, but every data point lacks the
        "job_id" key entirely.
      * "null_job_fields"      -- valid JSON, but every data point carries an
        explicit null "job_id".

    Note: record_context_sample is a list of data-point dicts (loaded from
    recordContext.json), and the return value is always a str -- the original
    annotations (dict param, list return) were inaccurate.
    """
    specific = request.param

    if specific == "totally_broken_json":
        # str.replace already returns a new string; copying an immutable
        # str first (as the old code did) was a no-op.
        return json.dumps(record_context_sample).replace(r'"', "<")

    record_copy = deepcopy(record_context_sample)
    if specific == "missing_job_fields":
        for data_point in record_copy:
            del data_point["job_id"]
    elif specific == "null_job_fields":
        for data_point in record_copy:
            data_point["job_id"] = None
    return json.dumps(record_copy)


@pytest.fixture(params=["preliminary", "from_non_linux"])
def record_unsuited_for_backfill(test_perf_alert, request):
report = BackfillReport.objects.create(summary=test_perf_alert.summary)
Expand Down Expand Up @@ -131,12 +111,26 @@ def platform_specific_perf_alert(


@pytest.fixture
def record_ready_for_processing(platform_specific_perf_alert, record_context_sample):
report = BackfillReport.objects.create(summary=platform_specific_perf_alert.summary)
def record_ready_for_processing(platform_specific_perf_alert, record_context_sample, test_job_2):
alert = platform_specific_perf_alert
push_id = alert.summary.push_id
push = Push.objects.get(id=push_id)

PerformanceDatum.objects.create(
repository=alert.series_signature.repository,
signature=alert.series_signature,
push=push,
push_timestamp=push.time,
job=test_job_2,
value=10.0,
)

report = BackfillReport.objects.create(summary=alert.summary)
record = BackfillRecord.objects.create(
alert=platform_specific_perf_alert,
alert=alert,
report=report,
status=BackfillRecord.READY_FOR_PROCESSING,
last_detected_push_id=push_id,
)
record.set_context(record_context_sample)
record.save()
Expand Down
19 changes: 13 additions & 6 deletions tests/perf/auto_perf_sheriffing/test_report_backfill_outcome.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from tests import settings as test_settings
from treeherder.perf.auto_perf_sheriffing.sherlock import Sherlock
from treeherder.perf.exceptions import MaxRuntimeExceededError
from treeherder.perf.models import BackfillNotificationRecord
from treeherder.perf.models import BackfillNotificationRecord, BackfillRecord

EPOCH = datetime.utcfromtimestamp(0)

Expand Down Expand Up @@ -36,18 +36,23 @@ def test_email_is_sent_after_successful_backfills(
assert BackfillNotificationRecord.objects.count() == 0


def test_email_is_still_sent_if_context_is_too_corrupt_to_be_actionable(
def test_email_is_still_sent_if_backfill_job_is_missing(
report_maintainer_mock,
backfill_tool_mock,
secretary,
record_ready_for_processing,
sherlock_settings,
broken_context_str,
tc_notify_mock,
# Note: parametrizes the test
):
record_ready_for_processing.context = broken_context_str
record_ready_for_processing.save()
# Simulate a job that has expired/been deleted: datum exists at last_detected_push_id but job=None.
# The backfill is skipped (no valid job_id), record ends up FAILED,
# but the notification must still be sent.
from treeherder.perf.models import PerformanceDatum

PerformanceDatum.objects.filter(
signature=record_ready_for_processing.alert.series_signature,
push_id=record_ready_for_processing.last_detected_push_id,
).update(job=None)

sherlock = Sherlock(
report_maintainer_mock,
Expand All @@ -60,6 +65,8 @@ def test_email_is_still_sent_if_context_is_too_corrupt_to_be_actionable(
repositories=[test_settings.TREEHERDER_TEST_REPOSITORY_NAME],
)

record_ready_for_processing.refresh_from_db()
assert record_ready_for_processing.status == BackfillRecord.FAILED
assert BackfillNotificationRecord.objects.count() == 1
call_command("report_backfill_outcome")
assert BackfillNotificationRecord.objects.count() == 0
Expand Down
144 changes: 144 additions & 0 deletions tests/perf/auto_perf_sheriffing/test_secretary.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import uuid
from datetime import datetime, timedelta
from unittest.mock import patch

import pytest
import simplejson as json
Expand Down Expand Up @@ -299,3 +300,146 @@ def test_pending_and_failed_jobs_means_in_progress_outcome(
assert record_backfilled.total_backfills_in_progress == 1
assert record_backfilled.total_backfills_failed == 1
assert response == OutcomeStatus.IN_PROGRESS


class TestVerifyAndIterate:
    """Tests for Secretary.verify_and_iterate.

    verify_and_iterate is the post-backfill loop that re-runs change
    detection around a successfully backfilled record and decides whether
    the culprit push is confirmed (stop), has drifted left/right (queue
    another backfill iteration), or cannot make progress (stop).

    NOTE(review): every test patches re_run_detect_changes (and sometimes
    _calculate_gap_size), so this class validates only the decision /
    state-transition logic, not the detection algorithm itself.
    """

    @pytest.fixture
    def secretary(self):
        # Fresh, un-mocked Secretary per test; shadows any outer fixture.
        return Secretary()

    @pytest.fixture
    def anchor_push(self, create_push, test_repository):
        # The push the record is anchored to via last_detected_push_id.
        return create_push(test_repository, revision=uuid.uuid4(), time=datetime.utcnow())

    @pytest.fixture
    def record_successful(self, test_perf_alert, record_context_sample, anchor_push):
        # A backfill record whose backfills already completed (SUCCESSFUL),
        # i.e. the state verify_and_iterate is meant to pick up from.
        report = BackfillReport.objects.create(summary=test_perf_alert.summary)
        record = BackfillRecord.objects.create(
            alert=test_perf_alert,
            report=report,
            status=BackfillRecord.SUCCESSFUL,
            last_detected_push_id=anchor_push.id,
        )
        record.set_context(record_context_sample)
        record.save()
        return record

    def test_stops_at_max_iterations(self, secretary, record_successful):
        # At the iteration cap, detection must not even be attempted and the
        # record keeps its SUCCESSFUL status.
        record_successful.iteration_count = 5
        record_successful.save()

        with patch.object(secretary, "re_run_detect_changes") as mock_detect:
            secretary.verify_and_iterate(record_successful, max_iterations=5)
            mock_detect.assert_not_called()

        record_successful.refresh_from_db()
        assert record_successful.status == BackfillRecord.SUCCESSFUL

    def test_stops_when_no_change_detected(self, secretary, record_successful):
        # (None, None, []) from detection means no regression remains: done.
        with patch.object(secretary, "re_run_detect_changes", return_value=(None, None, [])):
            secretary.verify_and_iterate(record_successful)

        record_successful.refresh_from_db()
        assert record_successful.status == BackfillRecord.SUCCESSFUL

    def test_stops_when_culprit_stabilized(self, secretary, record_successful):
        # Same push re-detected AND no gap left around it -> culprit is
        # confirmed; a single "stabilized" log entry is recorded.
        push_id = record_successful.last_detected_push_id

        with (
            patch.object(secretary, "re_run_detect_changes", return_value=(push_id, 2.5, [])),
            patch.object(secretary, "_calculate_gap_size", return_value=0),
        ):
            secretary.verify_and_iterate(record_successful)

        record_successful.refresh_from_db()
        assert record_successful.status == BackfillRecord.SUCCESSFUL
        logs = record_successful.get_backfill_logs()
        assert len(logs) == 1
        assert logs[0]["direction"] == "stabilized"

    def test_continues_when_culprit_stabilized_but_gap_remains(self, secretary, record_successful):
        # Same push re-detected but neighboring pushes still lack data
        # (gap_size > 0) -> requeue for another backfill round.
        push_id = record_successful.last_detected_push_id

        with (
            patch.object(secretary, "re_run_detect_changes", return_value=(push_id, 2.5, [])),
            patch.object(secretary, "_calculate_gap_size", return_value=3),
        ):
            secretary.verify_and_iterate(record_successful)

        record_successful.refresh_from_db()
        assert record_successful.status == BackfillRecord.READY_FOR_PROCESSING
        assert record_successful.last_detected_push_id == push_id
        logs = record_successful.get_backfill_logs()
        assert len(logs) == 1
        assert logs[0]["direction"] == "stabilized_with_gap"

    def test_queues_next_iteration_when_culprit_moves_left(
        self, secretary, record_successful, create_push, test_repository, anchor_push
    ):
        # Detection now blames an older push -> re-anchor the record there
        # and queue another iteration, logging direction "left".
        earlier_push = create_push(
            test_repository,
            revision=uuid.uuid4(),
            time=anchor_push.time - timedelta(days=5),
        )

        with patch.object(
            secretary, "re_run_detect_changes", return_value=(earlier_push.id, 3.0, [])
        ):
            secretary.verify_and_iterate(record_successful)

        record_successful.refresh_from_db()
        assert record_successful.status == BackfillRecord.READY_FOR_PROCESSING
        assert record_successful.last_detected_push_id == earlier_push.id
        assert record_successful.get_backfill_logs()[0]["direction"] == "left"

    def test_queues_next_iteration_when_culprit_drifts_right(
        self, secretary, record_successful, create_push, test_repository, anchor_push
    ):
        # Mirror of the "left" case: detection blames a newer push.
        later_push = create_push(
            test_repository,
            revision=uuid.uuid4(),
            time=anchor_push.time + timedelta(days=1),
        )

        with patch.object(
            secretary, "re_run_detect_changes", return_value=(later_push.id, 3.0, [])
        ):
            secretary.verify_and_iterate(record_successful)

        record_successful.refresh_from_db()
        assert record_successful.status == BackfillRecord.READY_FOR_PROCESSING
        assert record_successful.last_detected_push_id == later_push.id
        logs = record_successful.get_backfill_logs()
        assert len(logs) == 1
        assert logs[0]["direction"] == "right"

    def test_stops_when_gap_not_shrinking(self, secretary, record_successful):
        """If gap_size doesn't decrease between iterations, pushes lack the target job — stop."""
        push_id = record_successful.last_detected_push_id

        # Simulate a previous log entry with gap_size=3; the mocked
        # _calculate_gap_size also returns 3, i.e. no progress was made.
        record_successful.append_to_backfill_logs(
            {"iteration": 0, "detected_push_gap_size": 3, "direction": "stabilized_with_gap"}
        )
        record_successful.save()

        with (
            patch.object(secretary, "re_run_detect_changes", return_value=(push_id, 2.5, [])),
            patch.object(secretary, "_calculate_gap_size", return_value=3),
        ):
            secretary.verify_and_iterate(record_successful)

        record_successful.refresh_from_db()
        # Status should not change to READY_FOR_PROCESSING — stuck gap means done
        assert record_successful.status == BackfillRecord.SUCCESSFUL
        logs = record_successful.get_backfill_logs()
        assert logs[-1]["direction"] == "stabilized_gap_stuck"

    def test_sets_verification_failed_on_exception(self, secretary, record_successful):
        # Any unexpected error during detection must be absorbed and leave
        # the record in the terminal VERIFICATION_FAILED state.
        with patch.object(
            secretary, "re_run_detect_changes", side_effect=Exception("unexpected error")
        ):
            secretary.verify_and_iterate(record_successful)

        record_successful.refresh_from_db()
        assert record_successful.status == BackfillRecord.VERIFICATION_FAILED
24 changes: 8 additions & 16 deletions tests/perf/auto_perf_sheriffing/test_sherlock.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
from datetime import datetime, timedelta
from json import JSONDecodeError

import pytest
import simplejson as json
from django.db import models

from tests import settings as test_settings
from tests.perf.auto_perf_sheriffing.conftest import prepare_record_with_search_str
from treeherder.model.models import Job, Push
from treeherder.perf.auto_perf_sheriffing.sherlock import Sherlock
from treeherder.perf.exceptions import MaxRuntimeExceededError
from treeherder.perf.models import BackfillRecord, BackfillReport
Expand Down Expand Up @@ -180,31 +178,25 @@ def test_db_limits_update_if_backfills_left(

record_ready_for_processing.refresh_from_db()
assert record_ready_for_processing.status == BackfillRecord.BACKFILLED
assert (initial_backfills - 4) == secretary.backfills_left(on_platform=targeted_platform)
assert (initial_backfills - 1) == secretary.backfills_left(on_platform=targeted_platform)


def test_backfilling_gracefully_handles_invalid_json_contexts_without_blowing_up(
def test_record_fails_when_no_datum_at_last_detected_push_id(
report_maintainer_mock,
backfill_tool_mock,
secretary,
record_ready_for_processing,
sherlock_settings,
broken_context_str, # Note: parametrizes the test
):
record_ready_for_processing.context = broken_context_str
record_ready_for_processing.last_detected_push_id = 999999
record_ready_for_processing.save()

sherlock = Sherlock(report_maintainer_mock, backfill_tool_mock, secretary)
try:
sherlock.sheriff(
since=EPOCH,
frameworks=["test_talos"],
repositories=[test_settings.TREEHERDER_TEST_REPOSITORY_NAME],
)
except (JSONDecodeError, KeyError, Job.DoesNotExist, Push.DoesNotExist):
pytest.fail()
sherlock.sheriff(
since=EPOCH,
frameworks=["test_talos"],
repositories=[test_settings.TREEHERDER_TEST_REPOSITORY_NAME],
)

record_ready_for_processing.refresh_from_db()

assert record_ready_for_processing.status == BackfillRecord.FAILED
assert not has_changed(sherlock_settings)
1 change: 1 addition & 0 deletions treeherder/perf/auto_perf_sheriffing/backfill_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,7 @@ def _provide_records(self, backfill_report: BackfillReport, alert_context_map: l
alert=alert,
report=backfill_report,
context=json.dumps(retrigger_context, default=default_serializer),
last_detected_push_id=alert.summary.push_id,
)

def __fetch_summaries_to_retrigger(
Expand Down
1 change: 1 addition & 0 deletions treeherder/perf/auto_perf_sheriffing/backfill_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def backfill_job(self, job: Job | str) -> str:
decision_task_id=decision_task_id,
input={
"retrigger": False,
"slices": 3,
},
root_url=job.repository.tc_root_url,
)
Expand Down
11 changes: 10 additions & 1 deletion treeherder/perf/auto_perf_sheriffing/outcome_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,17 @@ def check(self, record: BackfillRecord) -> OutcomeStatus:
total_backfills_failed = 0
total_backfills_successful = 0

pushes_in_range = record.get_pushes_in_context_range()
if record.last_detected_push_id is None:
# legacy records: every push in the context range has been backfilled
pushes_in_range = record.get_pushes_in_context_range()
else:
pushes_in_range = record.get_pushes_from_anchor()

for push in pushes_in_range:
has_any_job = push.jobs.filter(job_type=of_type).exists()
if not has_any_job:
continue

# make sure it has at least one successful job of job type
if push.total_jobs(of_type, with_successful_results) == 0:
# either (at least) one job is in progress or it failed
Expand Down
Loading