From 343330297bb9e9f899d0a15b675e2ebe9bc9b295 Mon Sep 17 00:00:00 2001 From: Spencer Murray Date: Wed, 25 Mar 2026 09:46:53 -0400 Subject: [PATCH 01/19] Add PagerDuty config with escalation policies --- config.ci.toml | 5 +++++ config.example.toml | 10 ++++++++++ src/firetower/config.py | 14 ++++++++++++++ src/firetower/settings.py | 16 ++++++++++++++++ 4 files changed, 45 insertions(+) diff --git a/config.ci.toml b/config.ci.toml index 6975ea64..10c0254b 100644 --- a/config.ci.toml +++ b/config.ci.toml @@ -28,3 +28,8 @@ incident_guide_message = "" [auth] iap_enabled = false iap_audience = "" + +[pagerduty] +api_token = "" + +[pagerduty.escalation_policies] diff --git a/config.example.toml b/config.example.toml index 2496a4f8..16557413 100644 --- a/config.example.toml +++ b/config.example.toml @@ -28,6 +28,16 @@ incident_guide_message = "This is the message posted whenever a new incident sla iap_enabled = false iap_audience = "" +[pagerduty] +api_token = "" + +[pagerduty.escalation_policies.IMOC] +id = "" +integration_key = "" + +[pagerduty.escalation_policies.ExampleTeam] +id = "" + [datadog] api_key = "" app_key = "" diff --git a/src/firetower/config.py b/src/firetower/config.py index 052024d6..f6804c6f 100644 --- a/src/firetower/config.py +++ b/src/firetower/config.py @@ -34,6 +34,18 @@ class JIRAConfig: severity_field: str +@deserialize +class EscalationPolicy: + id: str + integration_key: str | None = None + + +@deserialize +class PagerDutyConfig: + api_token: str + escalation_policies: dict[str, EscalationPolicy] + + @deserialize class SlackConfig: bot_token: str @@ -62,6 +74,7 @@ class ConfigFile: jira: JIRAConfig slack: SlackConfig auth: AuthConfig + pagerduty: PagerDutyConfig | None project_key: str django_secret_key: str @@ -133,6 +146,7 @@ def __init__(self) -> None: iap_audience="", ) self.datadog = None + self.pagerduty = None self.project_key = "" self.django_secret_key = "" self.sentry_dsn = "" diff --git a/src/firetower/settings.py b/src/firetower/settings.py index e69af035..37472bd0 100644 --- a/src/firetower/settings.py +++ b/src/firetower/settings.py @@ -231,6 +231,22 @@ class SlackSettings(TypedDict): FIRETOWER_BASE_URL = config.firetower_base_url HOOKS_ENABLED = config.hooks_enabled +# PagerDuty Integration Configuration +PAGERDUTY = ( + { + "API_TOKEN": config.pagerduty.api_token, + "ESCALATION_POLICIES": { + name: { + "id": policy.id, + "integration_key": policy.integration_key, + } + for name, policy in config.pagerduty.escalation_policies.items() + }, + } + if config.pagerduty + else None +) + # Django REST Framework Configuration REST_FRAMEWORK = { # Pagination From d192e12b4acad14ccf23bd8a3d4f7667ac70d4e7 Mon Sep 17 00:00:00 2001 From: Spencer Murray Date: Wed, 25 Mar 2026 09:47:26 -0400 Subject: [PATCH 02/19] Add PagerDuty service client --- .../integrations/services/__init__.py | 3 +- .../integrations/services/pagerduty.py | 78 +++++++++++++++++++ 2 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 src/firetower/integrations/services/pagerduty.py diff --git a/src/firetower/integrations/services/__init__.py b/src/firetower/integrations/services/__init__.py index 568cfcb7..55f0e121 100644 --- a/src/firetower/integrations/services/__init__.py +++ b/src/firetower/integrations/services/__init__.py @@ -1,6 +1,7 @@ """Services package for external integrations.""" from .jira import JiraService +from .pagerduty import PagerDutyService from .slack import SlackService -__all__ = ["JiraService", "SlackService"] +__all__ = ["JiraService", "PagerDutyService", "SlackService"] diff --git a/src/firetower/integrations/services/pagerduty.py b/src/firetower/integrations/services/pagerduty.py new file mode 100644 index 00000000..8b917343 --- /dev/null +++ b/src/firetower/integrations/services/pagerduty.py @@ -0,0 +1,78 @@ +import logging + +import requests +from django.conf import settings + +logger = logging.getLogger(__name__) + +EVENTS_API_URL = "https://events.pagerduty.com/v2/enqueue" +REST_API_URL = "https://api.pagerduty.com" + + +class PagerDutyService: + def __init__(self) -> None: + pd_config = settings.PAGERDUTY + if not pd_config: + raise ValueError("PagerDuty is not configured") + + self.api_token = pd_config["API_TOKEN"] + self.escalation_policies = pd_config["ESCALATION_POLICIES"] + + def trigger_incident( + self, summary: str, dedup_key: str, integration_key: str + ) -> bool: + payload = { + "routing_key": integration_key, + "event_action": "trigger", + "dedup_key": dedup_key, + "payload": { + "summary": summary, + "severity": "critical", + "source": "firetower", + }, + } + + try: + resp = requests.post(EVENTS_API_URL, json=payload, timeout=10) + resp.raise_for_status() + logger.info( + "Triggered PagerDuty incident", + extra={"dedup_key": dedup_key}, + ) + return True + except requests.RequestException: + logger.exception( + "Failed to trigger PagerDuty incident", + extra={"dedup_key": dedup_key}, + ) + return False + + def get_oncall_users(self, escalation_policy_id: str) -> list[dict]: + headers = { + "Authorization": f"Token token={self.api_token}", + "Content-Type": "application/json", + } + + url = f"{REST_API_URL}/oncalls" + params = {"escalation_policy_ids[]": escalation_policy_id} + try: + resp = requests.get(url, headers=headers, params=params, timeout=10) + resp.raise_for_status() + results = [] + for oncall in resp.json().get("oncalls", []): + user = oncall.get("user", {}) + email = user.get("email") + if email: + results.append( + { + "email": email, + "escalation_level": oncall.get("escalation_level"), + } + ) + return results + except requests.RequestException: + logger.exception( + "Failed to fetch oncall users from PagerDuty", + extra={"escalation_policy_id": escalation_policy_id}, + ) + return [] From 8efa528a12815ac5f49d6e1351bf3d2973d20888 Mon Sep 17 00:00:00 2001 From: Spencer Murray Date: Wed, 25 Mar 2026 09:47:45 -0400 Subject: [PATCH 03/19] Move integration tests into tests/ subdirectory and add PagerDuty tests --- src/firetower/integrations/tests/__init__.py | 0 .../integrations/{ => tests}/test_jira.py | 2 +- .../integrations/tests/test_pagerduty.py | 131 ++++++++++++++++++ .../integrations/{ => tests}/test_slack.py | 2 +- 4 files changed, 133 insertions(+), 2 deletions(-) create mode 100644 src/firetower/integrations/tests/__init__.py rename src/firetower/integrations/{ => tests}/test_jira.py (99%) create mode 100644 src/firetower/integrations/tests/test_pagerduty.py rename src/firetower/integrations/{ => tests}/test_slack.py (99%) diff --git a/src/firetower/integrations/tests/__init__.py b/src/firetower/integrations/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/firetower/integrations/test_jira.py b/src/firetower/integrations/tests/test_jira.py similarity index 99% rename from src/firetower/integrations/test_jira.py rename to src/firetower/integrations/tests/test_jira.py index 855a053d..9dde0a69 100644 --- a/src/firetower/integrations/test_jira.py +++ b/src/firetower/integrations/tests/test_jira.py @@ -7,7 +7,7 @@ import pytest -from .services.jira import JiraService +from firetower.integrations.services.jira import JiraService # Set up Django settings os.environ.setdefault("DJANGO_SETTINGS_MODULE", "firetower.settings") diff --git a/src/firetower/integrations/tests/test_pagerduty.py b/src/firetower/integrations/tests/test_pagerduty.py new file mode 100644 index 00000000..b0da3dab --- /dev/null +++ b/src/firetower/integrations/tests/test_pagerduty.py @@ -0,0 +1,131 @@ +from unittest.mock import MagicMock, patch + +import pytest +import requests + +from firetower.integrations.services.pagerduty import PagerDutyService + +MOCK_PD_CONFIG = { + "API_TOKEN": "test-token", + "ESCALATION_POLICIES": { + "IMOC": { + "id": "P17I207", + "integration_key": "test-integration-key", + }, + "ProdEng": { + "id": "PABC123", + "integration_key": None, + }, + }, +} + + +@pytest.fixture +def pd_service(): + with patch("firetower.integrations.services.pagerduty.settings") as mock_settings: + mock_settings.PAGERDUTY = MOCK_PD_CONFIG + return PagerDutyService() + + +class TestPagerDutyServiceInit: + def test_init_raises_when_unconfigured(self): + with patch( + "firetower.integrations.services.pagerduty.settings" + ) as mock_settings: + mock_settings.PAGERDUTY = None + with pytest.raises(ValueError, match="not configured"): + PagerDutyService() + + def test_init_stores_config(self, pd_service): + assert pd_service.api_token == "test-token" + assert pd_service.escalation_policies["IMOC"]["id"] == "P17I207" + assert ( + pd_service.escalation_policies["IMOC"]["integration_key"] + == "test-integration-key" + ) + assert pd_service.escalation_policies["ProdEng"]["id"] == "PABC123" + assert pd_service.escalation_policies["ProdEng"]["integration_key"] is None + + +class TestTriggerIncident: + @patch("firetower.integrations.services.pagerduty.requests.post") + def test_trigger_success(self, mock_post, pd_service): + mock_post.return_value = MagicMock(status_code=202) + + result = pd_service.trigger_incident("Server down", "dedup-123", "int-key") + + assert result is True + mock_post.assert_called_once() + call_kwargs = mock_post.call_args + assert call_kwargs.kwargs["json"]["routing_key"] == "int-key" + assert call_kwargs.kwargs["json"]["dedup_key"] == "dedup-123" + assert call_kwargs.kwargs["json"]["payload"]["summary"] == "Server down" + + @patch("firetower.integrations.services.pagerduty.requests.post") + def test_trigger_failure(self, mock_post, pd_service): + mock_post.side_effect = requests.RequestException("connection error") + + result = pd_service.trigger_incident("Server down", "dedup-123", "int-key") + + assert result is False + + +class TestGetOncallUsers: + @patch("firetower.integrations.services.pagerduty.requests.get") + def test_returns_users_with_escalation_level(self, mock_get, pd_service): + mock_get.return_value = MagicMock( + json=lambda: { + "oncalls": [ + { + "user": {"email": "alice@example.com"}, + "escalation_level": 1, + }, + { + "user": {"email": "bob@example.com"}, + "escalation_level": 2, + }, + ] + } + ) + + users = pd_service.get_oncall_users("P17I207") + + assert users == [ + {"email": "alice@example.com", "escalation_level": 1}, + {"email": "bob@example.com", "escalation_level": 2}, + ] + mock_get.assert_called_once() + assert mock_get.call_args.kwargs["params"] == { + "escalation_policy_ids[]": "P17I207" + } + + @patch("firetower.integrations.services.pagerduty.requests.get") + def test_returns_empty_list_when_no_oncalls(self, mock_get, pd_service): + mock_get.return_value = MagicMock(json=lambda: {"oncalls": []}) + + users = pd_service.get_oncall_users("P17I207") + + assert users == [] + + @patch("firetower.integrations.services.pagerduty.requests.get") + def test_returns_empty_list_on_api_error(self, mock_get, pd_service): + mock_get.side_effect = requests.RequestException("timeout") + + users = pd_service.get_oncall_users("P17I207") + + assert users == [] + + @patch("firetower.integrations.services.pagerduty.requests.get") + def test_skips_users_without_email(self, mock_get, pd_service): + mock_get.return_value = MagicMock( + json=lambda: { + "oncalls": [ + {"user": {}, "escalation_level": 1}, + {"user": {"email": "alice@example.com"}, "escalation_level": 2}, + ] + } + ) + + users = pd_service.get_oncall_users("P17I207") + + assert users == [{"email": "alice@example.com", "escalation_level": 2}] diff --git a/src/firetower/integrations/test_slack.py b/src/firetower/integrations/tests/test_slack.py similarity index 99% rename from src/firetower/integrations/test_slack.py rename to src/firetower/integrations/tests/test_slack.py index fba3f107..088e3376 100644 --- a/src/firetower/integrations/test_slack.py +++ b/src/firetower/integrations/tests/test_slack.py @@ -7,7 +7,7 @@ from slack_sdk.errors import SlackApiError -from .services.slack import SlackService +from firetower.integrations.services.slack import SlackService # Set up Django settings os.environ.setdefault("DJANGO_SETTINGS_MODULE", "firetower.settings") From ba0683a2cdeb900dcc2d69b129c876b25d36b276 Mon Sep 17 00:00:00 2001 From: Spencer Murray Date: Tue, 7 Apr 2026 12:38:22 -0400 Subject: [PATCH 04/19] Wire PagerDuty IMOC paging into incident hooks for P0/P1 incidents --- src/firetower/incidents/hooks.py | 56 ++++- src/firetower/incidents/tests/test_hooks.py | 229 ++++++++++++++++++++ 2 files changed, 283 insertions(+), 2 deletions(-) diff --git a/src/firetower/incidents/hooks.py b/src/firetower/incidents/hooks.py index b6872abf..98511225 100644 --- a/src/firetower/incidents/hooks.py +++ b/src/firetower/incidents/hooks.py @@ -4,13 +4,57 @@ from django.contrib.auth.models import User from firetower.auth.models import ExternalProfileType -from firetower.incidents.models import ExternalLink, ExternalLinkType, Incident -from firetower.integrations.services import SlackService +from firetower.incidents.models import ( + ExternalLink, + ExternalLinkType, + Incident, + IncidentSeverity, +) +from firetower.integrations.services import PagerDutyService, SlackService from firetower.integrations.services.slack import escape_slack_text logger = logging.getLogger(__name__) _slack_service = SlackService() +PAGEABLE_SEVERITIES = {IncidentSeverity.P0, IncidentSeverity.P1} + + +def _get_pagerduty_service() -> PagerDutyService | None: + if not settings.PAGERDUTY: + return None + try: + return PagerDutyService() + except Exception: + logger.exception("Failed to initialize PagerDutyService") + return None + + +def _page_imoc_if_needed(incident: Incident) -> None: + if incident.severity not in PAGEABLE_SEVERITIES: + return + + pd_service = _get_pagerduty_service() + if not pd_service: + return + + imoc_policy = pd_service.escalation_policies.get("IMOC") + if not imoc_policy: + logger.info("No IMOC escalation policy configured, skipping page") + return + + integration_key = imoc_policy.get("integration_key") + if not integration_key: + logger.info("No integration_key for IMOC escalation policy, skipping page") + return + + dedup_key = f"firetower-{incident.incident_number}" + summary = f"[{incident.severity}] {incident.incident_number}: {incident.title}" + + try: + pd_service.trigger_incident(summary, dedup_key, integration_key) + except Exception: + logger.exception(f"Failed to page IMOC for incident {incident.id}") + def _build_channel_name(incident: Incident) -> str: return incident.incident_number.lower() @@ -201,6 +245,8 @@ def on_incident_created(incident: Incident) -> None: f"Failed to post feed channel message for incident {incident.id}" ) + _page_imoc_if_needed(incident) + # TODO: Datadog notebook creation step will be added in RELENG-467 except Exception: logger.exception(f"Error in on_incident_created for incident {incident.id}") @@ -233,6 +279,12 @@ def on_severity_changed(incident: Incident, old_severity: str) -> None: channel_id, f"Incident severity updated: {old_severity} -> {incident.severity}\n<{incident_url}|View in Firetower>", ) + + if ( + old_severity not in PAGEABLE_SEVERITIES + and incident.severity in PAGEABLE_SEVERITIES + ): + _page_imoc_if_needed(incident) except Exception: logger.exception(f"Error in on_severity_changed for incident {incident.id}") diff --git a/src/firetower/incidents/tests/test_hooks.py b/src/firetower/incidents/tests/test_hooks.py index c68a43f1..65eea5ee 100644 --- a/src/firetower/incidents/tests/test_hooks.py +++ b/src/firetower/incidents/tests/test_hooks.py @@ -7,6 +7,7 @@ from firetower.incidents.hooks import ( _build_channel_name, _build_channel_topic, + _page_imoc_if_needed, on_captain_changed, on_incident_created, on_severity_changed, @@ -573,3 +574,231 @@ def test_noop_without_slack_link(self, mock_slack): on_captain_changed(incident) mock_slack.set_channel_topic.assert_not_called() + + +MOCK_PD_CONFIG = { + "API_TOKEN": "test-token", + "ESCALATION_POLICIES": { + "IMOC": { + "id": "P17I207", + "integration_key": "test-integration-key", + }, + }, +} + + +@pytest.mark.django_db +class TestPageImocIfNeeded: + @patch("firetower.incidents.hooks.PagerDutyService") + def test_pages_for_p0(self, mock_pd_cls, settings): + settings.PAGERDUTY = MOCK_PD_CONFIG + mock_pd = mock_pd_cls.return_value + mock_pd.escalation_policies = MOCK_PD_CONFIG["ESCALATION_POLICIES"] + mock_pd.trigger_incident.return_value = True + + incident = Incident.objects.create( + title="Major outage", + severity=IncidentSeverity.P0, + ) + + _page_imoc_if_needed(incident) + + mock_pd.trigger_incident.assert_called_once_with( + f"[P0] {incident.incident_number}: Major outage", + f"firetower-{incident.incident_number}", + "test-integration-key", + ) + + @patch("firetower.incidents.hooks.PagerDutyService") + def test_pages_for_p1(self, mock_pd_cls, settings): + settings.PAGERDUTY = MOCK_PD_CONFIG + mock_pd = mock_pd_cls.return_value + mock_pd.escalation_policies = MOCK_PD_CONFIG["ESCALATION_POLICIES"] + mock_pd.trigger_incident.return_value = True + + incident = Incident.objects.create( + title="Service degradation", + severity=IncidentSeverity.P1, + ) + + _page_imoc_if_needed(incident) + + mock_pd.trigger_incident.assert_called_once() + + @patch("firetower.incidents.hooks.PagerDutyService") + def test_skips_for_p2(self, mock_pd_cls, settings): + settings.PAGERDUTY = MOCK_PD_CONFIG + + incident = Incident.objects.create( + title="Minor issue", + severity=IncidentSeverity.P2, + ) + + _page_imoc_if_needed(incident) + + mock_pd_cls.assert_not_called() + + @patch("firetower.incidents.hooks.PagerDutyService") + def test_skips_when_pagerduty_not_configured(self, mock_pd_cls, settings): + settings.PAGERDUTY = None + + incident = Incident.objects.create( + title="Test", + severity=IncidentSeverity.P0, + ) + + _page_imoc_if_needed(incident) + + mock_pd_cls.assert_not_called() + + @patch("firetower.incidents.hooks.PagerDutyService") + def test_skips_when_no_imoc_policy(self, mock_pd_cls, settings): + settings.PAGERDUTY = { + "API_TOKEN": "test-token", + "ESCALATION_POLICIES": {}, + } + mock_pd = mock_pd_cls.return_value + mock_pd.escalation_policies = {} + + incident = Incident.objects.create( + title="Test", + severity=IncidentSeverity.P0, + ) + + _page_imoc_if_needed(incident) + + mock_pd.trigger_incident.assert_not_called() + + @patch("firetower.incidents.hooks.PagerDutyService") + def test_skips_when_no_integration_key(self, mock_pd_cls, settings): + settings.PAGERDUTY = { + "API_TOKEN": "test-token", + "ESCALATION_POLICIES": { + "IMOC": {"id": "P17I207", "integration_key": None}, + }, + } + mock_pd = mock_pd_cls.return_value + mock_pd.escalation_policies = { + "IMOC": {"id": "P17I207", "integration_key": None} + } + + incident = Incident.objects.create( + title="Test", + severity=IncidentSeverity.P0, + ) + + _page_imoc_if_needed(incident) + + mock_pd.trigger_incident.assert_not_called() + + +@pytest.mark.django_db +class TestOnIncidentCreatedPagerDuty: + @patch("firetower.incidents.hooks._page_imoc_if_needed") + @patch("firetower.incidents.hooks._slack_service") + def test_pages_imoc_on_p0_creation(self, mock_slack, mock_page): + mock_slack.create_channel.return_value = "C99999" + mock_slack.build_channel_url.return_value = "https://slack.com/archives/C99999" + + incident = Incident.objects.create( + title="Major outage", + severity=IncidentSeverity.P0, + ) + + on_incident_created(incident) + + mock_page.assert_called_once_with(incident) + + @patch("firetower.incidents.hooks._page_imoc_if_needed") + @patch("firetower.incidents.hooks._slack_service") + def test_pages_imoc_on_p3_creation(self, mock_slack, mock_page): + mock_slack.create_channel.return_value = "C99999" + mock_slack.build_channel_url.return_value = "https://slack.com/archives/C99999" + + incident = Incident.objects.create( + title="Minor issue", + severity=IncidentSeverity.P3, + ) + + on_incident_created(incident) + + mock_page.assert_called_once_with(incident) + + +@pytest.mark.django_db +class TestOnSeverityChangedPagerDuty: + @patch("firetower.incidents.hooks._page_imoc_if_needed") + @patch("firetower.incidents.hooks._slack_service") + def test_pages_imoc_on_upgrade_to_p0(self, mock_slack, mock_page): + mock_slack.parse_channel_id_from_url.return_value = "C12345" + + incident = Incident.objects.create( + title="Escalating issue", + severity=IncidentSeverity.P0, + ) + ExternalLink.objects.create( + incident=incident, + type=ExternalLinkType.SLACK, + url="https://slack.com/archives/C12345", + ) + + on_severity_changed(incident, IncidentSeverity.P2) + + mock_page.assert_called_once_with(incident) + + @patch("firetower.incidents.hooks._page_imoc_if_needed") + @patch("firetower.incidents.hooks._slack_service") + def test_pages_imoc_on_upgrade_to_p1(self, mock_slack, mock_page): + mock_slack.parse_channel_id_from_url.return_value = "C12345" + + incident = Incident.objects.create( + title="Escalating issue", + severity=IncidentSeverity.P1, + ) + ExternalLink.objects.create( + incident=incident, + type=ExternalLinkType.SLACK, + url="https://slack.com/archives/C12345", + ) + + on_severity_changed(incident, IncidentSeverity.P3) + + mock_page.assert_called_once_with(incident) + + @patch("firetower.incidents.hooks._page_imoc_if_needed") + @patch("firetower.incidents.hooks._slack_service") + def test_does_not_page_on_p1_to_p0(self, mock_slack, mock_page): + mock_slack.parse_channel_id_from_url.return_value = "C12345" + + incident = Incident.objects.create( + title="Already paged", + severity=IncidentSeverity.P0, + ) + ExternalLink.objects.create( + incident=incident, + type=ExternalLinkType.SLACK, + url="https://slack.com/archives/C12345", + ) + + on_severity_changed(incident, IncidentSeverity.P1) + + mock_page.assert_not_called() + + @patch("firetower.incidents.hooks._page_imoc_if_needed") + @patch("firetower.incidents.hooks._slack_service") + def test_does_not_page_on_downgrade(self, mock_slack, mock_page): + mock_slack.parse_channel_id_from_url.return_value = "C12345" + + incident = Incident.objects.create( + title="Downgraded", + severity=IncidentSeverity.P3, + ) + ExternalLink.objects.create( + incident=incident, + type=ExternalLinkType.SLACK, + url="https://slack.com/archives/C12345", + ) + + on_severity_changed(incident, IncidentSeverity.P1) + + mock_page.assert_not_called() From e53acd8e339a815968cfead8a1ba4ee5ce10ec38 Mon Sep 17 00:00:00 2001 From: Spencer Murray Date: Tue, 7 Apr 2026 13:20:15 -0400 Subject: [PATCH 05/19] Rename IMOC references to HIGH_SEV in PagerDuty integration --- config.example.toml | 2 +- src/firetower/incidents/hooks.py | 18 ++++---- src/firetower/incidents/tests/test_hooks.py | 44 +++++++++---------- .../integrations/tests/test_pagerduty.py | 6 +-- 4 files changed, 35 insertions(+), 35 deletions(-) diff --git a/config.example.toml b/config.example.toml index 16557413..7d25fabb 100644 --- a/config.example.toml +++ b/config.example.toml @@ -31,7 +31,7 @@ iap_audience = "" [pagerduty] api_token = "" -[pagerduty.escalation_policies.IMOC] +[pagerduty.escalation_policies.HIGH_SEV] id = "" integration_key = "" diff --git a/src/firetower/incidents/hooks.py b/src/firetower/incidents/hooks.py index 98511225..a720f544 100644 --- a/src/firetower/incidents/hooks.py +++ b/src/firetower/incidents/hooks.py @@ -29,7 +29,7 @@ def _get_pagerduty_service() -> PagerDutyService | None: return None -def _page_imoc_if_needed(incident: Incident) -> None: +def _page_high_sev_if_needed(incident: Incident) -> None: if incident.severity not in PAGEABLE_SEVERITIES: return @@ -37,14 +37,14 @@ def _page_imoc_if_needed(incident: Incident) -> None: if not pd_service: return - imoc_policy = pd_service.escalation_policies.get("IMOC") - if not imoc_policy: - logger.info("No IMOC escalation policy configured, skipping page") + high_sev_policy = pd_service.escalation_policies.get("HIGH_SEV") + if not high_sev_policy: + logger.info("No HIGH_SEV escalation policy configured, skipping page") return - integration_key = imoc_policy.get("integration_key") + integration_key = high_sev_policy.get("integration_key") if not integration_key: - logger.info("No integration_key for IMOC escalation policy, skipping page") + logger.info("No integration_key for HIGH_SEV escalation policy, skipping page") return dedup_key = f"firetower-{incident.incident_number}" @@ -53,7 +53,7 @@ def _page_imoc_if_needed(incident: Incident) -> None: try: pd_service.trigger_incident(summary, dedup_key, integration_key) except Exception: - logger.exception(f"Failed to page IMOC for incident {incident.id}") + logger.exception(f"Failed to page HIGH_SEV for incident {incident.id}") def _build_channel_name(incident: Incident) -> str: @@ -245,7 +245,7 @@ def on_incident_created(incident: Incident) -> None: f"Failed to post feed channel message for incident {incident.id}" ) - _page_imoc_if_needed(incident) + _page_high_sev_if_needed(incident) # TODO: Datadog notebook creation step will be added in RELENG-467 except Exception: @@ -284,7 +284,7 @@ def on_severity_changed(incident: Incident, old_severity: str) -> None: old_severity not in PAGEABLE_SEVERITIES and incident.severity in PAGEABLE_SEVERITIES ): - _page_imoc_if_needed(incident) + _page_high_sev_if_needed(incident) except Exception: logger.exception(f"Error in on_severity_changed for incident {incident.id}") diff --git a/src/firetower/incidents/tests/test_hooks.py b/src/firetower/incidents/tests/test_hooks.py index 65eea5ee..343787a6 100644 --- a/src/firetower/incidents/tests/test_hooks.py +++ b/src/firetower/incidents/tests/test_hooks.py @@ -7,7 +7,7 @@ from firetower.incidents.hooks import ( _build_channel_name, _build_channel_topic, - _page_imoc_if_needed, + _page_high_sev_if_needed, on_captain_changed, on_incident_created, on_severity_changed, @@ -579,7 +579,7 @@ def test_noop_without_slack_link(self, mock_slack): MOCK_PD_CONFIG = { "API_TOKEN": "test-token", "ESCALATION_POLICIES": { - "IMOC": { + "HIGH_SEV": { "id": "P17I207", "integration_key": "test-integration-key", }, @@ -588,7 +588,7 @@ def test_noop_without_slack_link(self, mock_slack): @pytest.mark.django_db -class TestPageImocIfNeeded: +class TestPageHighSevIfNeeded: @patch("firetower.incidents.hooks.PagerDutyService") def test_pages_for_p0(self, mock_pd_cls, settings): settings.PAGERDUTY = MOCK_PD_CONFIG @@ -601,7 +601,7 @@ def test_pages_for_p0(self, mock_pd_cls, settings): severity=IncidentSeverity.P0, ) - _page_imoc_if_needed(incident) + _page_high_sev_if_needed(incident) mock_pd.trigger_incident.assert_called_once_with( f"[P0] {incident.incident_number}: Major outage", @@ -621,7 +621,7 @@ def test_pages_for_p1(self, mock_pd_cls, settings): severity=IncidentSeverity.P1, ) - _page_imoc_if_needed(incident) + _page_high_sev_if_needed(incident) mock_pd.trigger_incident.assert_called_once() @@ -634,7 +634,7 @@ def test_skips_for_p2(self, mock_pd_cls, settings): severity=IncidentSeverity.P2, ) - _page_imoc_if_needed(incident) + _page_high_sev_if_needed(incident) mock_pd_cls.assert_not_called() @@ -647,12 +647,12 @@ def test_skips_when_pagerduty_not_configured(self, mock_pd_cls, settings): severity=IncidentSeverity.P0, ) - _page_imoc_if_needed(incident) + _page_high_sev_if_needed(incident) mock_pd_cls.assert_not_called() @patch("firetower.incidents.hooks.PagerDutyService") - def test_skips_when_no_imoc_policy(self, mock_pd_cls, settings): + def test_skips_when_no_high_sev_policy(self, mock_pd_cls, settings): settings.PAGERDUTY = { "API_TOKEN": "test-token", "ESCALATION_POLICIES": {}, @@ -665,7 +665,7 @@ def test_skips_when_no_imoc_policy(self, mock_pd_cls, settings): severity=IncidentSeverity.P0, ) - _page_imoc_if_needed(incident) + _page_high_sev_if_needed(incident) mock_pd.trigger_incident.assert_not_called() @@ -674,12 +674,12 @@ def test_skips_when_no_integration_key(self, mock_pd_cls, settings): settings.PAGERDUTY = { "API_TOKEN": "test-token", "ESCALATION_POLICIES": { - "IMOC": {"id": "P17I207", "integration_key": None}, + "HIGH_SEV": {"id": "P17I207", "integration_key": None}, }, } mock_pd = mock_pd_cls.return_value mock_pd.escalation_policies = { - "IMOC": {"id": "P17I207", "integration_key": None} + "HIGH_SEV": {"id": "P17I207", "integration_key": None} } incident = Incident.objects.create( @@ -687,16 +687,16 @@ def test_skips_when_no_integration_key(self, mock_pd_cls, settings): severity=IncidentSeverity.P0, ) - _page_imoc_if_needed(incident) + _page_high_sev_if_needed(incident) mock_pd.trigger_incident.assert_not_called() @pytest.mark.django_db class TestOnIncidentCreatedPagerDuty: - @patch("firetower.incidents.hooks._page_imoc_if_needed") + @patch("firetower.incidents.hooks._page_high_sev_if_needed") @patch("firetower.incidents.hooks._slack_service") - def test_pages_imoc_on_p0_creation(self, mock_slack, mock_page): + def test_pages_high_sev_on_p0_creation(self, mock_slack, mock_page): mock_slack.create_channel.return_value = "C99999" mock_slack.build_channel_url.return_value = "https://slack.com/archives/C99999" @@ -709,9 +709,9 @@ def test_pages_imoc_on_p0_creation(self, mock_slack, mock_page): mock_page.assert_called_once_with(incident) - @patch("firetower.incidents.hooks._page_imoc_if_needed") + @patch("firetower.incidents.hooks._page_high_sev_if_needed") @patch("firetower.incidents.hooks._slack_service") - def test_pages_imoc_on_p3_creation(self, mock_slack, mock_page): + def test_pages_high_sev_on_p3_creation(self, mock_slack, mock_page): mock_slack.create_channel.return_value = "C99999" mock_slack.build_channel_url.return_value = "https://slack.com/archives/C99999" @@ -727,9 +727,9 @@ def test_pages_imoc_on_p3_creation(self, mock_slack, mock_page): @pytest.mark.django_db class TestOnSeverityChangedPagerDuty: - @patch("firetower.incidents.hooks._page_imoc_if_needed") + @patch("firetower.incidents.hooks._page_high_sev_if_needed") @patch("firetower.incidents.hooks._slack_service") - def test_pages_imoc_on_upgrade_to_p0(self, mock_slack, mock_page): + def test_pages_high_sev_on_upgrade_to_p0(self, mock_slack, mock_page): mock_slack.parse_channel_id_from_url.return_value = "C12345" incident = Incident.objects.create( @@ -746,9 +746,9 @@ def test_pages_imoc_on_upgrade_to_p0(self, mock_slack, mock_page): mock_page.assert_called_once_with(incident) - @patch("firetower.incidents.hooks._page_imoc_if_needed") + @patch("firetower.incidents.hooks._page_high_sev_if_needed") @patch("firetower.incidents.hooks._slack_service") - def test_pages_imoc_on_upgrade_to_p1(self, mock_slack, mock_page): + def test_pages_high_sev_on_upgrade_to_p1(self, mock_slack, mock_page): mock_slack.parse_channel_id_from_url.return_value = "C12345" incident = Incident.objects.create( @@ -765,7 +765,7 @@ def test_pages_imoc_on_upgrade_to_p1(self, mock_slack, mock_page): mock_page.assert_called_once_with(incident) - @patch("firetower.incidents.hooks._page_imoc_if_needed") + @patch("firetower.incidents.hooks._page_high_sev_if_needed") @patch("firetower.incidents.hooks._slack_service") def test_does_not_page_on_p1_to_p0(self, mock_slack, mock_page): mock_slack.parse_channel_id_from_url.return_value = "C12345" @@ -784,7 +784,7 @@ def test_does_not_page_on_p1_to_p0(self, mock_slack, mock_page): mock_page.assert_not_called() - @patch("firetower.incidents.hooks._page_imoc_if_needed") + @patch("firetower.incidents.hooks._page_high_sev_if_needed") @patch("firetower.incidents.hooks._slack_service") def test_does_not_page_on_downgrade(self, mock_slack, mock_page): mock_slack.parse_channel_id_from_url.return_value = "C12345" diff --git a/src/firetower/integrations/tests/test_pagerduty.py b/src/firetower/integrations/tests/test_pagerduty.py index b0da3dab..e4b262e3 100644 --- a/src/firetower/integrations/tests/test_pagerduty.py +++ b/src/firetower/integrations/tests/test_pagerduty.py @@ -8,7 +8,7 @@ MOCK_PD_CONFIG = { "API_TOKEN": "test-token", "ESCALATION_POLICIES": { - "IMOC": { + "HIGH_SEV": { "id": "P17I207", "integration_key": "test-integration-key", }, @@ -38,9 +38,9 @@ def test_init_raises_when_unconfigured(self): def test_init_stores_config(self, pd_service): assert pd_service.api_token == "test-token" - assert pd_service.escalation_policies["IMOC"]["id"] == "P17I207" + assert pd_service.escalation_policies["HIGH_SEV"]["id"] == "P17I207" assert ( - pd_service.escalation_policies["IMOC"]["integration_key"] + pd_service.escalation_policies["HIGH_SEV"]["integration_key"] == "test-integration-key" ) assert pd_service.escalation_policies["ProdEng"]["id"] == "PABC123" From 8f5e1caea17f5099076eddff3a8ae6b1118fe8f3 Mon Sep 17 00:00:00 2001 From: Spencer Murray Date: Tue, 7 Apr 2026 13:30:06 -0400 Subject: [PATCH 06/19] Move paging decision logic from PagerDutyService into hooks --- src/firetower/incidents/hooks.py | 11 ++++++++--- src/firetower/incidents/tests/test_hooks.py | 18 ++---------------- .../integrations/services/pagerduty.py | 1 - .../integrations/tests/test_pagerduty.py | 7 ------- 4 files changed, 10 insertions(+), 27 deletions(-) diff --git a/src/firetower/incidents/hooks.py b/src/firetower/incidents/hooks.py index a720f544..ad531ae7 100644 --- a/src/firetower/incidents/hooks.py +++ b/src/firetower/incidents/hooks.py @@ -33,11 +33,12 @@ def _page_high_sev_if_needed(incident: Incident) -> None: if incident.severity not in PAGEABLE_SEVERITIES: return - pd_service = _get_pagerduty_service() - if not pd_service: + pd_config = settings.PAGERDUTY + if not pd_config: return - high_sev_policy = pd_service.escalation_policies.get("HIGH_SEV") + escalation_policies = pd_config.get("ESCALATION_POLICIES", {}) + high_sev_policy = escalation_policies.get("HIGH_SEV") if not high_sev_policy: logger.info("No HIGH_SEV escalation policy configured, skipping page") return @@ -47,6 +48,10 @@ def _page_high_sev_if_needed(incident: Incident) -> None: logger.info("No integration_key for HIGH_SEV escalation policy, skipping page") return + pd_service = _get_pagerduty_service() + if not pd_service: + return + dedup_key = f"firetower-{incident.incident_number}" summary = f"[{incident.severity}] {incident.incident_number}: {incident.title}" diff --git a/src/firetower/incidents/tests/test_hooks.py b/src/firetower/incidents/tests/test_hooks.py index 343787a6..439ca3e5 100644 --- a/src/firetower/incidents/tests/test_hooks.py +++ b/src/firetower/incidents/tests/test_hooks.py @@ -593,7 +593,6 @@ class TestPageHighSevIfNeeded: def test_pages_for_p0(self, mock_pd_cls, settings): settings.PAGERDUTY = MOCK_PD_CONFIG mock_pd = mock_pd_cls.return_value - mock_pd.escalation_policies = MOCK_PD_CONFIG["ESCALATION_POLICIES"] mock_pd.trigger_incident.return_value = True incident = Incident.objects.create( @@ -613,7 +612,6 @@ def test_pages_for_p0(self, mock_pd_cls, settings): def test_pages_for_p1(self, mock_pd_cls, settings): settings.PAGERDUTY = MOCK_PD_CONFIG mock_pd = mock_pd_cls.return_value - mock_pd.escalation_policies = MOCK_PD_CONFIG["ESCALATION_POLICIES"] mock_pd.trigger_incident.return_value = True incident = Incident.objects.create( @@ -651,14 +649,11 @@ def test_skips_when_pagerduty_not_configured(self, mock_pd_cls, settings): mock_pd_cls.assert_not_called() - @patch("firetower.incidents.hooks.PagerDutyService") - def test_skips_when_no_high_sev_policy(self, mock_pd_cls, settings): + def test_skips_when_no_high_sev_policy(self, settings): settings.PAGERDUTY = { "API_TOKEN": "test-token", "ESCALATION_POLICIES": {}, } - mock_pd = mock_pd_cls.return_value - mock_pd.escalation_policies = {} incident = Incident.objects.create( title="Test", @@ -667,20 +662,13 @@ def test_skips_when_no_high_sev_policy(self, mock_pd_cls, settings): _page_high_sev_if_needed(incident) - mock_pd.trigger_incident.assert_not_called() - - @patch("firetower.incidents.hooks.PagerDutyService") - def test_skips_when_no_integration_key(self, mock_pd_cls, settings): + def test_skips_when_no_integration_key(self, settings): settings.PAGERDUTY = { "API_TOKEN": "test-token", "ESCALATION_POLICIES": { "HIGH_SEV": {"id": "P17I207", "integration_key": None}, }, } - mock_pd = mock_pd_cls.return_value - mock_pd.escalation_policies = { - "HIGH_SEV": {"id": "P17I207", "integration_key": None} - } incident = Incident.objects.create( title="Test", @@ -689,8 +677,6 @@ def test_skips_when_no_integration_key(self, mock_pd_cls, settings): _page_high_sev_if_needed(incident) - mock_pd.trigger_incident.assert_not_called() - @pytest.mark.django_db class TestOnIncidentCreatedPagerDuty: diff --git a/src/firetower/integrations/services/pagerduty.py b/src/firetower/integrations/services/pagerduty.py index 8b917343..f324ceb1 100644 --- a/src/firetower/integrations/services/pagerduty.py +++ b/src/firetower/integrations/services/pagerduty.py @@ -16,7 +16,6 @@ def __init__(self) -> None: raise ValueError("PagerDuty is not configured") self.api_token = pd_config["API_TOKEN"] - self.escalation_policies = pd_config["ESCALATION_POLICIES"] def trigger_incident( self, summary: str, dedup_key: str, integration_key: str diff --git a/src/firetower/integrations/tests/test_pagerduty.py b/src/firetower/integrations/tests/test_pagerduty.py index e4b262e3..80f84596 100644 --- a/src/firetower/integrations/tests/test_pagerduty.py +++ b/src/firetower/integrations/tests/test_pagerduty.py @@ -38,13 +38,6 @@ def test_init_raises_when_unconfigured(self): def test_init_stores_config(self, pd_service): assert pd_service.api_token == "test-token" - assert pd_service.escalation_policies["HIGH_SEV"]["id"] == "P17I207" - assert ( - pd_service.escalation_policies["HIGH_SEV"]["integration_key"] - == "test-integration-key" - ) - assert pd_service.escalation_policies["ProdEng"]["id"] == "PABC123" - assert pd_service.escalation_policies["ProdEng"]["integration_key"] is None class TestTriggerIncident: From 2d0256e665ba36ec63a73d5efb554bd06ec3a440 Mon Sep 17 00:00:00 2001 From: Spencer Murray Date: Tue, 7 Apr 2026 16:27:28 -0400 Subject: [PATCH 07/19] Rename test to accurately describe what it verifies --- src/firetower/incidents/tests/test_hooks.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/firetower/incidents/tests/test_hooks.py b/src/firetower/incidents/tests/test_hooks.py index 439ca3e5..970f3a54 100644 --- a/src/firetower/incidents/tests/test_hooks.py +++ b/src/firetower/incidents/tests/test_hooks.py @@ -697,7 +697,9 @@ def test_pages_high_sev_on_p0_creation(self, mock_slack, mock_page): @patch("firetower.incidents.hooks._page_high_sev_if_needed") @patch("firetower.incidents.hooks._slack_service") - def test_pages_high_sev_on_p3_creation(self, mock_slack, mock_page): + def test_on_incident_created_calls_page_high_sev_if_needed( + self, mock_slack, mock_page + ): mock_slack.create_channel.return_value = "C99999" mock_slack.build_channel_url.return_value = "https://slack.com/archives/C99999" From 16f632e6535f2a826744b3f2058acd7e00c2db56 Mon Sep 17 00:00:00 2001 From: Spencer Murray Date: Tue, 7 Apr 2026 16:38:26 -0400 Subject: [PATCH 08/19] Move PagerDuty paging before Slack operations so it runs even if Slack fails --- src/firetower/incidents/hooks.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/firetower/incidents/hooks.py b/src/firetower/incidents/hooks.py index ad531ae7..78f76792 100644 --- a/src/firetower/incidents/hooks.py +++ b/src/firetower/incidents/hooks.py @@ -135,6 +135,8 @@ def _invite_user_to_channel( def on_incident_created(incident: Incident) -> None: try: + _page_high_sev_if_needed(incident) + # Use get_or_create to atomically claim the ExternalLink row before calling # the Slack API. If two concurrent requests both reach this point, only one # will get created=True; the other bails out without creating a second channel. @@ -250,8 +252,6 @@ def on_incident_created(incident: Incident) -> None: f"Failed to post feed channel message for incident {incident.id}" ) - _page_high_sev_if_needed(incident) - # TODO: Datadog notebook creation step will be added in RELENG-467 except Exception: logger.exception(f"Error in on_incident_created for incident {incident.id}") @@ -274,6 +274,12 @@ def on_status_changed(incident: Incident, old_status: str) -> None: def on_severity_changed(incident: Incident, old_severity: str) -> None: try: + if ( + old_severity not in PAGEABLE_SEVERITIES + and incident.severity in PAGEABLE_SEVERITIES + ): + _page_high_sev_if_needed(incident) + channel_id = _get_channel_id(incident) if not channel_id: return @@ -284,12 +290,6 @@ def on_severity_changed(incident: Incident, old_severity: str) -> None: channel_id, f"Incident severity updated: {old_severity} -> {incident.severity}\n<{incident_url}|View in Firetower>", ) - - if ( - old_severity not in PAGEABLE_SEVERITIES - and incident.severity in PAGEABLE_SEVERITIES - ): - _page_high_sev_if_needed(incident) except Exception: logger.exception(f"Error in on_severity_changed for incident {incident.id}") From 622b922796f49f0c9bf96774f3f2eeec6675bd22 Mon Sep 17 00:00:00 2001 From: Spencer Murray Date: Tue, 7 Apr 2026 16:46:48 -0400 Subject: [PATCH 09/19] Don't skip non-Slack operations when Slack channel creation fails in on_incident_created --- src/firetower/incidents/hooks.py | 148 +++++++++++++++++-------------- 1 file changed, 79 insertions(+), 69 deletions(-) diff --git a/src/firetower/incidents/hooks.py b/src/firetower/incidents/hooks.py index 78f76792..6ba5139d 100644 --- a/src/firetower/incidents/hooks.py +++ b/src/firetower/incidents/hooks.py @@ -151,6 +151,7 @@ def on_incident_created(incident: Incident) -> None: ) return + channel_id = None try: channel_id = _slack_service.create_channel( _build_channel_name(incident), is_private=incident.is_private @@ -160,97 +161,106 @@ def on_incident_created(incident: Incident) -> None: logger.warning( f"Failed to create Slack channel for incident {incident.id}" ) - return - channel_url = _slack_service.build_channel_url(channel_id) - slack_link.url = channel_url - slack_link.save(update_fields=["url"]) + else: + channel_url = _slack_service.build_channel_url(channel_id) + slack_link.url = channel_url + slack_link.save(update_fields=["url"]) except Exception: slack_link.delete() - raise - - captain_slack_id = ( - _get_slack_user_id(incident.captain) if incident.captain else None - ) - - try: - _slack_service.set_channel_topic( - channel_id, _build_channel_topic(incident, captain_slack_id) + logger.exception( + f"Failed to create Slack channel for incident {incident.id}" ) - except Exception: - logger.exception(f"Failed to set channel topic for incident {incident.id}") - incident_url = _build_incident_url(incident) - - try: - _slack_service.add_bookmark(channel_id, "Firetower Incident", incident_url) - except Exception: - logger.exception(f"Failed to add bookmark for incident {incident.id}") + if channel_id: + captain_slack_id = ( + _get_slack_user_id(incident.captain) if incident.captain else None + ) - guide_message = settings.SLACK.get("INCIDENT_GUIDE_MESSAGE", "") - if guide_message: try: - _slack_service.post_message(channel_id, guide_message) - except Exception: - logger.exception( - f"Failed to post guide message for incident {incident.id}" + _slack_service.set_channel_topic( + channel_id, _build_channel_topic(incident, captain_slack_id) ) - - ic_mention = "" - if incident.captain: - if captain_slack_id: - ic_mention = f"Incident Captain: <@{captain_slack_id}>" - else: - captain_name = escape_slack_text( - incident.captain.get_full_name() or incident.captain.username - ) - ic_mention = f"Incident Captain: {captain_name}" - - if ic_mention: - try: - _slack_service.post_message(channel_id, ic_mention) except Exception: logger.exception( - f"Failed to post IC mention for incident {incident.id}" + f"Failed to set channel topic for incident {incident.id}" ) - if incident.description: + incident_url = _build_incident_url(incident) + try: - _slack_service.post_message( - channel_id, - f"*Incident Description:*\n{escape_slack_text(incident.description)}", + _slack_service.add_bookmark( + channel_id, "Firetower Incident", incident_url ) except Exception: - logger.exception( - f"Failed to post description for incident {incident.id}" - ) + logger.exception(f"Failed to add bookmark for incident {incident.id}") - if incident.captain: - _invite_user_to_channel(channel_id, incident.captain, captain_slack_id) + guide_message = settings.SLACK.get("INCIDENT_GUIDE_MESSAGE", "") + if guide_message: + try: + _slack_service.post_message(channel_id, guide_message) + except Exception: + logger.exception( + f"Failed to post guide message for incident {incident.id}" + ) - always_invited = settings.SLACK.get("ALWAYS_INVITED_IDS", []) - if always_invited: - ids_to_invite = [uid for uid in always_invited if uid != captain_slack_id] - if ids_to_invite: + ic_mention = "" + if incident.captain: + if captain_slack_id: + ic_mention = f"Incident Captain: <@{captain_slack_id}>" + else: + captain_name = escape_slack_text( + incident.captain.get_full_name() or incident.captain.username + ) + ic_mention = f"Incident Captain: {captain_name}" + + if ic_mention: try: - _slack_service.invite_to_channel(channel_id, ids_to_invite) + _slack_service.post_message(channel_id, ic_mention) except Exception: logger.exception( - f"Failed to invite always_invited users to channel {channel_id} for incident {incident.id}" + f"Failed to post IC mention for incident {incident.id}" ) - feed_channel_id = settings.SLACK.get("INCIDENT_FEED_CHANNEL_ID", "") - if feed_channel_id and not incident.is_private: - feed_message = ( - f"A {incident.severity} incident has been created.\n" - f"<{incident_url}|{incident.incident_number} {escape_slack_text(incident.title)}>" - f"\n\nFor those involved, please join <#{channel_id}>" - ) - try: - _slack_service.post_message(feed_channel_id, feed_message) - except Exception: - logger.exception( - f"Failed to post feed channel message for incident {incident.id}" + if incident.description: + try: + _slack_service.post_message( + channel_id, + f"*Incident Description:*\n{escape_slack_text(incident.description)}", + ) + except Exception: + logger.exception( + f"Failed to post description for incident {incident.id}" + ) + + if incident.captain: + _invite_user_to_channel(channel_id, incident.captain, captain_slack_id) + + always_invited = settings.SLACK.get("ALWAYS_INVITED_IDS", []) + if always_invited: + ids_to_invite = [ + uid for uid in always_invited if uid != captain_slack_id + ] + if ids_to_invite: + try: + _slack_service.invite_to_channel(channel_id, ids_to_invite) + except Exception: + logger.exception( + f"Failed to invite always_invited users to channel {channel_id} for incident {incident.id}" + ) + + feed_channel_id = settings.SLACK.get("INCIDENT_FEED_CHANNEL_ID", "") + if feed_channel_id and not incident.is_private: + feed_message = ( + f"A {incident.severity} incident has been created.\n" + f"<{incident_url}|{incident.incident_number} {escape_slack_text(incident.title)}>" + f"\n\nFor those involved, please join <#{channel_id}>" ) + try: + _slack_service.post_message(feed_channel_id, feed_message) + except Exception: + logger.exception( + f"Failed to post feed channel message for incident {incident.id}" + ) # TODO: Datadog notebook creation step will be added in RELENG-467 except Exception: From 139a3d2055d7de8e5f5a23b85c7834fb8bab4f18 Mon Sep 17 00:00:00 2001 From: Spencer Murray Date: Tue, 7 Apr 2026 17:14:32 -0400 Subject: [PATCH 10/19] Refactor hooks --- src/firetower/incidents/hooks.py | 190 +++++++++++++++---------------- 1 file changed, 93 insertions(+), 97 deletions(-) diff --git a/src/firetower/incidents/hooks.py b/src/firetower/incidents/hooks.py index 6ba5139d..c915ee0a 100644 --- a/src/firetower/incidents/hooks.py +++ b/src/firetower/incidents/hooks.py @@ -134,31 +134,27 @@ def _invite_user_to_channel( def on_incident_created(incident: Incident) -> None: - try: - _page_high_sev_if_needed(incident) - - # Use get_or_create to atomically claim the ExternalLink row before calling - # the Slack API. If two concurrent requests both reach this point, only one - # will get created=True; the other bails out without creating a second channel. - slack_link, created = ExternalLink.objects.get_or_create( - incident=incident, - type=ExternalLinkType.SLACK, - defaults={"url": ""}, + # Use get_or_create to atomically claim the ExternalLink row before calling + # the Slack API. If two concurrent requests both reach this point, only one + # will get created=True; the other bails out without creating a second channel. + slack_link, created = ExternalLink.objects.get_or_create( + incident=incident, + type=ExternalLinkType.SLACK, + defaults={"url": ""}, + ) + channel_id = None + if not created: + logger.info( + f"Incident {incident.id} already has a Slack link, skipping channel creation" ) - if not created: - logger.info( - f"Incident {incident.id} already has a Slack link, skipping channel creation" - ) - return - - channel_id = None + else: try: channel_id = _slack_service.create_channel( _build_channel_name(incident), is_private=incident.is_private ) if not channel_id: slack_link.delete() - logger.warning( + logger.error( f"Failed to create Slack channel for incident {incident.id}" ) else: @@ -171,100 +167,97 @@ def on_incident_created(incident: Incident) -> None: f"Failed to create Slack channel for incident {incident.id}" ) - if channel_id: - captain_slack_id = ( - _get_slack_user_id(incident.captain) if incident.captain else None + if channel_id: + captain_slack_id = ( + _get_slack_user_id(incident.captain) if incident.captain else None + ) + + try: + _slack_service.set_channel_topic( + channel_id, _build_channel_topic(incident, captain_slack_id) ) + except Exception: + logger.exception(f"Failed to set channel topic for incident {incident.id}") + incident_url = _build_incident_url(incident) + + try: + _slack_service.add_bookmark(channel_id, "Firetower Incident", incident_url) + except Exception: + logger.exception(f"Failed to add bookmark for incident {incident.id}") + + guide_message = settings.SLACK.get("INCIDENT_GUIDE_MESSAGE", "") + if guide_message: try: - _slack_service.set_channel_topic( - channel_id, _build_channel_topic(incident, captain_slack_id) - ) + _slack_service.post_message(channel_id, guide_message) except Exception: logger.exception( - f"Failed to set channel topic for incident {incident.id}" + f"Failed to post guide message for incident {incident.id}" ) - incident_url = _build_incident_url(incident) + ic_mention = "" + if incident.captain: + if captain_slack_id: + ic_mention = f"Incident Captain: <@{captain_slack_id}>" + else: + captain_name = escape_slack_text( + incident.captain.get_full_name() or incident.captain.username + ) + ic_mention = f"Incident Captain: {captain_name}" + if ic_mention: try: - _slack_service.add_bookmark( - channel_id, "Firetower Incident", incident_url - ) + _slack_service.post_message(channel_id, ic_mention) except Exception: - logger.exception(f"Failed to add bookmark for incident {incident.id}") - - guide_message = settings.SLACK.get("INCIDENT_GUIDE_MESSAGE", "") - if guide_message: - try: - _slack_service.post_message(channel_id, guide_message) - except Exception: - logger.exception( - f"Failed to post guide message for incident {incident.id}" - ) + logger.exception( + f"Failed to post IC mention for incident {incident.id}" + ) - ic_mention = "" - if incident.captain: - if captain_slack_id: - ic_mention = f"Incident Captain: <@{captain_slack_id}>" - else: - captain_name = escape_slack_text( - incident.captain.get_full_name() or incident.captain.username - ) - ic_mention = f"Incident Captain: {captain_name}" + if incident.description: + try: + _slack_service.post_message( + channel_id, + f"*Incident Description:*\n{escape_slack_text(incident.description)}", + ) + except Exception: + logger.exception( + f"Failed to post description for incident {incident.id}" + ) - if ic_mention: - try: - _slack_service.post_message(channel_id, ic_mention) - except Exception: - logger.exception( - f"Failed to post IC mention for incident {incident.id}" - ) + if incident.captain: + _invite_user_to_channel(channel_id, incident.captain, captain_slack_id) - if incident.description: + always_invited = settings.SLACK.get("ALWAYS_INVITED_IDS", []) + if always_invited: + ids_to_invite = [uid for uid in always_invited if uid != captain_slack_id] + if ids_to_invite: try: - _slack_service.post_message( - channel_id, - f"*Incident Description:*\n{escape_slack_text(incident.description)}", - ) + _slack_service.invite_to_channel(channel_id, ids_to_invite) except Exception: logger.exception( - f"Failed to post description for incident {incident.id}" + f"Failed to invite always_invited users to channel {channel_id} for incident {incident.id}" ) - if incident.captain: - _invite_user_to_channel(channel_id, incident.captain, captain_slack_id) - - always_invited = settings.SLACK.get("ALWAYS_INVITED_IDS", []) - if always_invited: - ids_to_invite = [ - uid for uid in always_invited if uid != captain_slack_id - ] - if ids_to_invite: - try: - _slack_service.invite_to_channel(channel_id, ids_to_invite) - except Exception: - logger.exception( - f"Failed to invite always_invited users to channel {channel_id} for incident {incident.id}" - ) - - feed_channel_id = settings.SLACK.get("INCIDENT_FEED_CHANNEL_ID", "") - if feed_channel_id and not incident.is_private: - feed_message = ( - f"A {incident.severity} incident has been created.\n" - f"<{incident_url}|{incident.incident_number} {escape_slack_text(incident.title)}>" - f"\n\nFor those involved, please join <#{channel_id}>" + feed_channel_id = settings.SLACK.get("INCIDENT_FEED_CHANNEL_ID", "") + if feed_channel_id and not incident.is_private: + feed_message = ( + f"A {incident.severity} incident has been created.\n" + f"<{incident_url}|{incident.incident_number} {escape_slack_text(incident.title)}>" + f"\n\nFor those involved, please join <#{channel_id}>" + ) + try: + _slack_service.post_message(feed_channel_id, feed_message) + except Exception: + logger.exception( + f"Failed to post feed channel message for incident {incident.id}" ) - try: - _slack_service.post_message(feed_channel_id, feed_message) - except Exception: - logger.exception( - f"Failed to post feed channel message for incident {incident.id}" - ) - # TODO: Datadog notebook creation step will be added in RELENG-467 + try: + _page_high_sev_if_needed(incident) except Exception: - logger.exception(f"Error in on_incident_created for incident {incident.id}") + logger.exception(f"Failed to page for incident {incident.id}") + + # TODO: Datadog notebook creation step will be added in RELENG-467 def on_status_changed(incident: Incident, old_status: str) -> None: @@ -284,12 +277,6 @@ def on_status_changed(incident: Incident, old_status: str) -> None: def on_severity_changed(incident: Incident, old_severity: str) -> None: try: - if ( - old_severity not in PAGEABLE_SEVERITIES - and incident.severity in PAGEABLE_SEVERITIES - ): - _page_high_sev_if_needed(incident) - channel_id = _get_channel_id(incident) if not channel_id: return @@ -303,6 +290,15 @@ def on_severity_changed(incident: Incident, old_severity: str) -> None: except Exception: logger.exception(f"Error in on_severity_changed for incident {incident.id}") + if ( + old_severity not in PAGEABLE_SEVERITIES + and incident.severity in PAGEABLE_SEVERITIES + ): + try: + _page_high_sev_if_needed(incident) + except Exception: + logger.exception(f"Failed to page for incident {incident.id}") + def on_title_changed(incident: Incident) -> None: try: From afb29e1c785f643c9e899103b2fdb9cf2f8dc7fb Mon Sep 17 00:00:00 2001 From: Spencer Murray Date: Tue, 7 Apr 2026 17:19:09 -0400 Subject: [PATCH 11/19] Add slack and firetower links to page --- src/firetower/incidents/hooks.py | 7 ++++++- src/firetower/integrations/services/pagerduty.py | 10 ++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/firetower/incidents/hooks.py b/src/firetower/incidents/hooks.py index c915ee0a..51c8d340 100644 --- a/src/firetower/incidents/hooks.py +++ b/src/firetower/incidents/hooks.py @@ -55,8 +55,13 @@ def _page_high_sev_if_needed(incident: Incident) -> None: dedup_key = f"firetower-{incident.incident_number}" summary = f"[{incident.severity}] {incident.incident_number}: {incident.title}" + links = [{"href": _build_incident_url(incident), "text": "View in Firetower"}] + slack_link = incident.external_links.filter(type=ExternalLinkType.SLACK).first() + if slack_link and slack_link.url: + links.append({"href": slack_link.url, "text": "Slack Channel"}) + try: - pd_service.trigger_incident(summary, dedup_key, integration_key) + pd_service.trigger_incident(summary, dedup_key, integration_key, links=links) except Exception: logger.exception(f"Failed to page HIGH_SEV for incident {incident.id}") diff --git a/src/firetower/integrations/services/pagerduty.py b/src/firetower/integrations/services/pagerduty.py index f324ceb1..be08dcf8 100644 --- a/src/firetower/integrations/services/pagerduty.py +++ b/src/firetower/integrations/services/pagerduty.py @@ -18,9 +18,13 @@ def __init__(self) -> None: self.api_token = pd_config["API_TOKEN"] def trigger_incident( - self, summary: str, dedup_key: str, integration_key: str + self, + summary: str, + dedup_key: str, + integration_key: str, + links: list[dict] | None = None, ) -> bool: - payload = { + payload: dict = { "routing_key": integration_key, "event_action": "trigger", "dedup_key": dedup_key, @@ -30,6 +34,8 @@ def trigger_incident( "source": "firetower", }, } + if links: + payload["links"] = links try: resp = requests.post(EVENTS_API_URL, json=payload, timeout=10) From 3f01f61cc9f0a6cd36b6b048d63c6a544a6a7e85 Mon Sep 17 00:00:00 2001 From: Spencer Murray Date: Tue, 7 Apr 2026 17:23:41 -0400 Subject: [PATCH 12/19] Fix on_severity_changed early return blocking PD paging --- src/firetower/incidents/hooks.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/firetower/incidents/hooks.py b/src/firetower/incidents/hooks.py index 51c8d340..ab4f3a9b 100644 --- a/src/firetower/incidents/hooks.py +++ b/src/firetower/incidents/hooks.py @@ -283,15 +283,13 @@ def on_status_changed(incident: Incident, old_status: str) -> None: def on_severity_changed(incident: Incident, old_severity: str) -> None: try: channel_id = _get_channel_id(incident) - if not channel_id: - return - - _slack_service.set_channel_topic(channel_id, _build_channel_topic(incident)) - incident_url = _build_incident_url(incident) - _slack_service.post_message( - channel_id, - f"Incident severity updated: {old_severity} -> {incident.severity}\n<{incident_url}|View in Firetower>", - ) + if channel_id: + _slack_service.set_channel_topic(channel_id, _build_channel_topic(incident)) + incident_url = _build_incident_url(incident) + _slack_service.post_message( + channel_id, + f"Incident severity updated: {old_severity} -> {incident.severity}\n<{incident_url}|View in Firetower>", + ) except Exception: logger.exception(f"Error in on_severity_changed for incident {incident.id}") From 5824152c246920094af5f1896f897e4d4636a7b4 Mon Sep 17 00:00:00 2001 From: Spencer Murray Date: Tue, 7 Apr 2026 17:24:17 -0400 Subject: [PATCH 13/19] Fix test_pages_for_p0 assertion to include links kwarg --- src/firetower/incidents/tests/test_hooks.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/firetower/incidents/tests/test_hooks.py b/src/firetower/incidents/tests/test_hooks.py index 970f3a54..f79a6f8b 100644 --- a/src/firetower/incidents/tests/test_hooks.py +++ b/src/firetower/incidents/tests/test_hooks.py @@ -606,6 +606,9 @@ def test_pages_for_p0(self, mock_pd_cls, settings): f"[P0] {incident.incident_number}: Major outage", f"firetower-{incident.incident_number}", "test-integration-key", + links=[ + {"href": f"/{incident.incident_number}", "text": "View in Firetower"} + ], ) @patch("firetower.incidents.hooks.PagerDutyService") From a5cc8932389f8cc976327349d903fdfa0977c56d Mon Sep 17 00:00:00 2001 From: Spencer Murray Date: Tue, 7 Apr 2026 17:35:39 -0400 Subject: [PATCH 14/19] Fix test_pages_for_p0 to use explicit FIRETOWER_BASE_URL --- src/firetower/incidents/tests/test_hooks.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/firetower/incidents/tests/test_hooks.py b/src/firetower/incidents/tests/test_hooks.py index f79a6f8b..5e3a23dc 100644 --- a/src/firetower/incidents/tests/test_hooks.py +++ b/src/firetower/incidents/tests/test_hooks.py @@ -592,6 +592,7 @@ class TestPageHighSevIfNeeded: @patch("firetower.incidents.hooks.PagerDutyService") def test_pages_for_p0(self, mock_pd_cls, settings): settings.PAGERDUTY = MOCK_PD_CONFIG + settings.FIRETOWER_BASE_URL = "https://firetower.example.com" mock_pd = mock_pd_cls.return_value mock_pd.trigger_incident.return_value = True @@ -607,7 +608,10 @@ def test_pages_for_p0(self, mock_pd_cls, settings): f"firetower-{incident.incident_number}", "test-integration-key", links=[ - {"href": f"/{incident.incident_number}", "text": "View in Firetower"} + { + "href": f"https://firetower.example.com/{incident.incident_number}", + "text": "View in Firetower", + } ], ) From ab88efe783171f93e4464ece653c9483bbe29036 Mon Sep 17 00:00:00 2001 From: Spencer Murray Date: Tue, 7 Apr 2026 18:05:56 -0400 Subject: [PATCH 15/19] Re-add outer try/except safety net around on_incident_created --- src/firetower/incidents/hooks.py | 203 ++++++++++++++++--------------- 1 file changed, 106 insertions(+), 97 deletions(-) diff --git a/src/firetower/incidents/hooks.py b/src/firetower/incidents/hooks.py index ab4f3a9b..e56d60d6 100644 --- a/src/firetower/incidents/hooks.py +++ b/src/firetower/incidents/hooks.py @@ -139,130 +139,139 @@ def _invite_user_to_channel( def on_incident_created(incident: Incident) -> None: - # Use get_or_create to atomically claim the ExternalLink row before calling - # the Slack API. If two concurrent requests both reach this point, only one - # will get created=True; the other bails out without creating a second channel. - slack_link, created = ExternalLink.objects.get_or_create( - incident=incident, - type=ExternalLinkType.SLACK, - defaults={"url": ""}, - ) - channel_id = None - if not created: - logger.info( - f"Incident {incident.id} already has a Slack link, skipping channel creation" + try: + # Use get_or_create to atomically claim the ExternalLink row before calling + # the Slack API. If two concurrent requests both reach this point, only one + # will get created=True; the other bails out without creating a second channel. + slack_link, created = ExternalLink.objects.get_or_create( + incident=incident, + type=ExternalLinkType.SLACK, + defaults={"url": ""}, ) - else: - try: - channel_id = _slack_service.create_channel( - _build_channel_name(incident), is_private=incident.is_private + channel_id = None + if not created: + logger.info( + f"Incident {incident.id} already has a Slack link, skipping channel creation" ) - if not channel_id: + else: + try: + channel_id = _slack_service.create_channel( + _build_channel_name(incident), is_private=incident.is_private + ) + if not channel_id: + slack_link.delete() + logger.error( + f"Failed to create Slack channel for incident {incident.id}" + ) + else: + channel_url = _slack_service.build_channel_url(channel_id) + slack_link.url = channel_url + slack_link.save(update_fields=["url"]) + except Exception: slack_link.delete() - logger.error( + logger.exception( f"Failed to create Slack channel for incident {incident.id}" ) - else: - channel_url = _slack_service.build_channel_url(channel_id) - slack_link.url = channel_url - slack_link.save(update_fields=["url"]) - except Exception: - slack_link.delete() - logger.exception( - f"Failed to create Slack channel for incident {incident.id}" - ) - if channel_id: - captain_slack_id = ( - _get_slack_user_id(incident.captain) if incident.captain else None - ) - - try: - _slack_service.set_channel_topic( - channel_id, _build_channel_topic(incident, captain_slack_id) + if channel_id: + captain_slack_id = ( + _get_slack_user_id(incident.captain) if incident.captain else None ) - except Exception: - logger.exception(f"Failed to set channel topic for incident {incident.id}") - - incident_url = _build_incident_url(incident) - - try: - _slack_service.add_bookmark(channel_id, "Firetower Incident", incident_url) - except Exception: - logger.exception(f"Failed to add bookmark for incident {incident.id}") - guide_message = settings.SLACK.get("INCIDENT_GUIDE_MESSAGE", "") - if guide_message: try: - _slack_service.post_message(channel_id, guide_message) - except Exception: - logger.exception( - f"Failed to post guide message for incident {incident.id}" + _slack_service.set_channel_topic( + channel_id, _build_channel_topic(incident, captain_slack_id) ) - - ic_mention = "" - if incident.captain: - if captain_slack_id: - ic_mention = f"Incident Captain: <@{captain_slack_id}>" - else: - captain_name = escape_slack_text( - incident.captain.get_full_name() or incident.captain.username - ) - ic_mention = f"Incident Captain: {captain_name}" - - if ic_mention: - try: - _slack_service.post_message(channel_id, ic_mention) except Exception: logger.exception( - f"Failed to post IC mention for incident {incident.id}" + f"Failed to set channel topic for incident {incident.id}" ) - if incident.description: + incident_url = _build_incident_url(incident) + try: - _slack_service.post_message( - channel_id, - f"*Incident Description:*\n{escape_slack_text(incident.description)}", + _slack_service.add_bookmark( + channel_id, "Firetower Incident", incident_url ) except Exception: - logger.exception( - f"Failed to post description for incident {incident.id}" - ) + logger.exception(f"Failed to add bookmark for incident {incident.id}") - if incident.captain: - _invite_user_to_channel(channel_id, incident.captain, captain_slack_id) + guide_message = settings.SLACK.get("INCIDENT_GUIDE_MESSAGE", "") + if guide_message: + try: + _slack_service.post_message(channel_id, guide_message) + except Exception: + logger.exception( + f"Failed to post guide message for incident {incident.id}" + ) - always_invited = settings.SLACK.get("ALWAYS_INVITED_IDS", []) - if always_invited: - ids_to_invite = [uid for uid in always_invited if uid != captain_slack_id] - if ids_to_invite: + ic_mention = "" + if incident.captain: + if captain_slack_id: + ic_mention = f"Incident Captain: <@{captain_slack_id}>" + else: + captain_name = escape_slack_text( + incident.captain.get_full_name() or incident.captain.username + ) + ic_mention = f"Incident Captain: {captain_name}" + + if ic_mention: try: - _slack_service.invite_to_channel(channel_id, ids_to_invite) + _slack_service.post_message(channel_id, ic_mention) except Exception: logger.exception( - f"Failed to invite always_invited users to channel {channel_id} for incident {incident.id}" + f"Failed to post IC mention for incident {incident.id}" ) - feed_channel_id = settings.SLACK.get("INCIDENT_FEED_CHANNEL_ID", "") - if feed_channel_id and not incident.is_private: - feed_message = ( - f"A {incident.severity} incident has been created.\n" - f"<{incident_url}|{incident.incident_number} {escape_slack_text(incident.title)}>" - f"\n\nFor those involved, please join <#{channel_id}>" - ) - try: - _slack_service.post_message(feed_channel_id, feed_message) - except Exception: - logger.exception( - f"Failed to post feed channel message for incident {incident.id}" + if incident.description: + try: + _slack_service.post_message( + channel_id, + f"*Incident Description:*\n{escape_slack_text(incident.description)}", + ) + except Exception: + logger.exception( + f"Failed to post description for incident {incident.id}" + ) + + if incident.captain: + _invite_user_to_channel(channel_id, incident.captain, captain_slack_id) + + always_invited = settings.SLACK.get("ALWAYS_INVITED_IDS", []) + if always_invited: + ids_to_invite = [ + uid for uid in always_invited if uid != captain_slack_id + ] + if ids_to_invite: + try: + _slack_service.invite_to_channel(channel_id, ids_to_invite) + except Exception: + logger.exception( + f"Failed to invite always_invited users to channel {channel_id} for incident {incident.id}" + ) + + feed_channel_id = settings.SLACK.get("INCIDENT_FEED_CHANNEL_ID", "") + if feed_channel_id and not incident.is_private: + feed_message = ( + f"A {incident.severity} incident has been created.\n" + f"<{incident_url}|{incident.incident_number} {escape_slack_text(incident.title)}>" + f"\n\nFor those involved, please join <#{channel_id}>" ) + try: + _slack_service.post_message(feed_channel_id, feed_message) + except Exception: + logger.exception( + f"Failed to post feed channel message for incident {incident.id}" + ) - try: - _page_high_sev_if_needed(incident) - except Exception: - logger.exception(f"Failed to page for incident {incident.id}") + try: + _page_high_sev_if_needed(incident) + except Exception: + logger.exception(f"Failed to page for incident {incident.id}") - # TODO: Datadog notebook creation step will be added in RELENG-467 + # TODO: Datadog notebook creation step will be added in RELENG-467 + except Exception: + logger.exception(f"Error in on_incident_created for incident {incident.id}") def on_status_changed(incident: Incident, old_status: str) -> None: From af91ed65a3783afcdf7268190fba646cb59d2551 Mon Sep 17 00:00:00 2001 From: Spencer Murray Date: Tue, 7 Apr 2026 18:06:06 -0400 Subject: [PATCH 16/19] Revert "Re-add outer try/except safety net around on_incident_created" This reverts commit 0cb3500196c5a205a36d7c6f86db772d09041a6f. --- src/firetower/incidents/hooks.py | 203 +++++++++++++++---------------- 1 file changed, 97 insertions(+), 106 deletions(-) diff --git a/src/firetower/incidents/hooks.py b/src/firetower/incidents/hooks.py index e56d60d6..ab4f3a9b 100644 --- a/src/firetower/incidents/hooks.py +++ b/src/firetower/incidents/hooks.py @@ -139,139 +139,130 @@ def _invite_user_to_channel( def on_incident_created(incident: Incident) -> None: - try: - # Use get_or_create to atomically claim the ExternalLink row before calling - # the Slack API. If two concurrent requests both reach this point, only one - # will get created=True; the other bails out without creating a second channel. - slack_link, created = ExternalLink.objects.get_or_create( - incident=incident, - type=ExternalLinkType.SLACK, - defaults={"url": ""}, + # Use get_or_create to atomically claim the ExternalLink row before calling + # the Slack API. If two concurrent requests both reach this point, only one + # will get created=True; the other bails out without creating a second channel. + slack_link, created = ExternalLink.objects.get_or_create( + incident=incident, + type=ExternalLinkType.SLACK, + defaults={"url": ""}, + ) + channel_id = None + if not created: + logger.info( + f"Incident {incident.id} already has a Slack link, skipping channel creation" ) - channel_id = None - if not created: - logger.info( - f"Incident {incident.id} already has a Slack link, skipping channel creation" + else: + try: + channel_id = _slack_service.create_channel( + _build_channel_name(incident), is_private=incident.is_private ) - else: - try: - channel_id = _slack_service.create_channel( - _build_channel_name(incident), is_private=incident.is_private - ) - if not channel_id: - slack_link.delete() - logger.error( - f"Failed to create Slack channel for incident {incident.id}" - ) - else: - channel_url = _slack_service.build_channel_url(channel_id) - slack_link.url = channel_url - slack_link.save(update_fields=["url"]) - except Exception: + if not channel_id: slack_link.delete() - logger.exception( + logger.error( f"Failed to create Slack channel for incident {incident.id}" ) + else: + channel_url = _slack_service.build_channel_url(channel_id) + slack_link.url = channel_url + slack_link.save(update_fields=["url"]) + except Exception: + slack_link.delete() + logger.exception( + f"Failed to create Slack channel for incident {incident.id}" + ) - if channel_id: - captain_slack_id = ( - _get_slack_user_id(incident.captain) if incident.captain else None + if channel_id: + captain_slack_id = ( + _get_slack_user_id(incident.captain) if incident.captain else None + ) + + try: + _slack_service.set_channel_topic( + channel_id, _build_channel_topic(incident, captain_slack_id) ) + except Exception: + logger.exception(f"Failed to set channel topic for incident {incident.id}") + + incident_url = _build_incident_url(incident) + + try: + _slack_service.add_bookmark(channel_id, "Firetower Incident", incident_url) + except Exception: + logger.exception(f"Failed to add bookmark for incident {incident.id}") + guide_message = settings.SLACK.get("INCIDENT_GUIDE_MESSAGE", "") + if guide_message: try: - _slack_service.set_channel_topic( - channel_id, _build_channel_topic(incident, captain_slack_id) - ) + _slack_service.post_message(channel_id, guide_message) except Exception: logger.exception( - f"Failed to set channel topic for incident {incident.id}" + f"Failed to post guide message for incident {incident.id}" ) - incident_url = _build_incident_url(incident) + ic_mention = "" + if incident.captain: + if captain_slack_id: + ic_mention = f"Incident Captain: <@{captain_slack_id}>" + else: + captain_name = escape_slack_text( + incident.captain.get_full_name() or incident.captain.username + ) + ic_mention = f"Incident Captain: {captain_name}" + if ic_mention: try: - _slack_service.add_bookmark( - channel_id, "Firetower Incident", incident_url - ) + _slack_service.post_message(channel_id, ic_mention) except Exception: - logger.exception(f"Failed to add bookmark for incident {incident.id}") - - guide_message = settings.SLACK.get("INCIDENT_GUIDE_MESSAGE", "") - if guide_message: - try: - _slack_service.post_message(channel_id, guide_message) - except Exception: - logger.exception( - f"Failed to post guide message for incident {incident.id}" - ) + logger.exception( + f"Failed to post IC mention for incident {incident.id}" + ) - ic_mention = "" - if incident.captain: - if captain_slack_id: - ic_mention = f"Incident Captain: <@{captain_slack_id}>" - else: - captain_name = escape_slack_text( - incident.captain.get_full_name() or incident.captain.username - ) - ic_mention = f"Incident Captain: {captain_name}" + if incident.description: + try: + _slack_service.post_message( + channel_id, + f"*Incident Description:*\n{escape_slack_text(incident.description)}", + ) + except Exception: + logger.exception( + f"Failed to post description for incident {incident.id}" + ) - if ic_mention: - try: - _slack_service.post_message(channel_id, ic_mention) - except Exception: - logger.exception( - f"Failed to post IC mention for incident {incident.id}" - ) + if incident.captain: + _invite_user_to_channel(channel_id, incident.captain, captain_slack_id) - if incident.description: + always_invited = settings.SLACK.get("ALWAYS_INVITED_IDS", []) + if always_invited: + ids_to_invite = [uid for uid in always_invited if uid != captain_slack_id] + if ids_to_invite: try: - _slack_service.post_message( - channel_id, - f"*Incident Description:*\n{escape_slack_text(incident.description)}", - ) + _slack_service.invite_to_channel(channel_id, ids_to_invite) except Exception: logger.exception( - f"Failed to post description for incident {incident.id}" + f"Failed to invite always_invited users to channel {channel_id} for incident {incident.id}" ) - if incident.captain: - _invite_user_to_channel(channel_id, incident.captain, captain_slack_id) - - always_invited = settings.SLACK.get("ALWAYS_INVITED_IDS", []) - if always_invited: - ids_to_invite = [ - uid for uid in always_invited if uid != captain_slack_id - ] - if ids_to_invite: - try: - _slack_service.invite_to_channel(channel_id, ids_to_invite) - except Exception: - logger.exception( - f"Failed to invite always_invited users to channel {channel_id} for incident {incident.id}" - ) - - feed_channel_id = settings.SLACK.get("INCIDENT_FEED_CHANNEL_ID", "") - if feed_channel_id and not incident.is_private: - feed_message = ( - f"A {incident.severity} incident has been created.\n" - f"<{incident_url}|{incident.incident_number} {escape_slack_text(incident.title)}>" - f"\n\nFor those involved, please join <#{channel_id}>" + feed_channel_id = settings.SLACK.get("INCIDENT_FEED_CHANNEL_ID", "") + if feed_channel_id and not incident.is_private: + feed_message = ( + f"A {incident.severity} incident has been created.\n" + f"<{incident_url}|{incident.incident_number} {escape_slack_text(incident.title)}>" + f"\n\nFor those involved, please join <#{channel_id}>" + ) + try: + _slack_service.post_message(feed_channel_id, feed_message) + except Exception: + logger.exception( + f"Failed to post feed channel message for incident {incident.id}" ) - try: - _slack_service.post_message(feed_channel_id, feed_message) - except Exception: - logger.exception( - f"Failed to post feed channel message for incident {incident.id}" - ) - try: - _page_high_sev_if_needed(incident) - except Exception: - logger.exception(f"Failed to page for incident {incident.id}") - - # TODO: Datadog notebook creation step will be added in RELENG-467 + try: + _page_high_sev_if_needed(incident) except Exception: - logger.exception(f"Error in on_incident_created for incident {incident.id}") + logger.exception(f"Failed to page for incident {incident.id}") + + # TODO: Datadog notebook creation step will be added in RELENG-467 def on_status_changed(incident: Incident, old_status: str) -> None: From c81588202651fe9712ba325d206ba16b14e599b1 Mon Sep 17 00:00:00 2001 From: Spencer Murray Date: Tue, 7 Apr 2026 18:06:36 -0400 Subject: [PATCH 17/19] Wrap get_or_create ExternalLink in try/except instead of wrapping entire function --- src/firetower/incidents/hooks.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/firetower/incidents/hooks.py b/src/firetower/incidents/hooks.py index ab4f3a9b..302cd169 100644 --- a/src/firetower/incidents/hooks.py +++ b/src/firetower/incidents/hooks.py @@ -142,11 +142,17 @@ def on_incident_created(incident: Incident) -> None: # Use get_or_create to atomically claim the ExternalLink row before calling # the Slack API. If two concurrent requests both reach this point, only one # will get created=True; the other bails out without creating a second channel. - slack_link, created = ExternalLink.objects.get_or_create( - incident=incident, - type=ExternalLinkType.SLACK, - defaults={"url": ""}, - ) + try: + slack_link, created = ExternalLink.objects.get_or_create( + incident=incident, + type=ExternalLinkType.SLACK, + defaults={"url": ""}, + ) + except Exception: + logger.exception( + f"Failed to get or create Slack ExternalLink for incident {incident.id}" + ) + return channel_id = None if not created: logger.info( From 2454c0512472da06c101aae442c861eb8ac933a3 Mon Sep 17 00:00:00 2001 From: Spencer Murray Date: Tue, 7 Apr 2026 18:20:30 -0400 Subject: [PATCH 18/19] Move PD paging before ExternalLink creation so DB failures don't skip it --- src/firetower/incidents/hooks.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/firetower/incidents/hooks.py b/src/firetower/incidents/hooks.py index 302cd169..15903246 100644 --- a/src/firetower/incidents/hooks.py +++ b/src/firetower/incidents/hooks.py @@ -139,6 +139,11 @@ def _invite_user_to_channel( def on_incident_created(incident: Incident) -> None: + try: + _page_high_sev_if_needed(incident) + except Exception: + logger.exception(f"Failed to page for incident {incident.id}") + # Use get_or_create to atomically claim the ExternalLink row before calling # the Slack API. If two concurrent requests both reach this point, only one # will get created=True; the other bails out without creating a second channel. @@ -263,11 +268,6 @@ def on_incident_created(incident: Incident) -> None: f"Failed to post feed channel message for incident {incident.id}" ) - try: - _page_high_sev_if_needed(incident) - except Exception: - logger.exception(f"Failed to page for incident {incident.id}") - # TODO: Datadog notebook creation step will be added in RELENG-467 From 3880417cc6846efa783cbe993ae01cac1a492489 Mon Sep 17 00:00:00 2001 From: Spencer Murray Date: Tue, 7 Apr 2026 18:23:15 -0400 Subject: [PATCH 19/19] Move PD paging after Slack setup and handle Slack link creation failure gracefully --- src/firetower/incidents/hooks.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/firetower/incidents/hooks.py b/src/firetower/incidents/hooks.py index 15903246..1ad6330f 100644 --- a/src/firetower/incidents/hooks.py +++ b/src/firetower/incidents/hooks.py @@ -139,14 +139,11 @@ def _invite_user_to_channel( def on_incident_created(incident: Incident) -> None: - try: - _page_high_sev_if_needed(incident) - except Exception: - logger.exception(f"Failed to page for incident {incident.id}") - # Use get_or_create to atomically claim the ExternalLink row before calling # the Slack API. If two concurrent requests both reach this point, only one # will get created=True; the other bails out without creating a second channel. + slack_link = None + created = False try: slack_link, created = ExternalLink.objects.get_or_create( incident=incident, @@ -157,13 +154,12 @@ def on_incident_created(incident: Incident) -> None: logger.exception( f"Failed to get or create Slack ExternalLink for incident {incident.id}" ) - return channel_id = None - if not created: + if not created and slack_link is not None: logger.info( f"Incident {incident.id} already has a Slack link, skipping channel creation" ) - else: + elif created and slack_link is not None: try: channel_id = _slack_service.create_channel( _build_channel_name(incident), is_private=incident.is_private @@ -268,6 +264,11 @@ def on_incident_created(incident: Incident) -> None: f"Failed to post feed channel message for incident {incident.id}" ) + try: + _page_high_sev_if_needed(incident) + except Exception: + logger.exception(f"Failed to page for incident {incident.id}") + # TODO: Datadog notebook creation step will be added in RELENG-467