diff --git a/config.ci.toml b/config.ci.toml index 6975ea64..10c0254b 100644 --- a/config.ci.toml +++ b/config.ci.toml @@ -28,3 +28,8 @@ incident_guide_message = "" [auth] iap_enabled = false iap_audience = "" + +[pagerduty] +api_token = "" + +[pagerduty.escalation_policies] diff --git a/config.example.toml b/config.example.toml index 2496a4f8..7d25fabb 100644 --- a/config.example.toml +++ b/config.example.toml @@ -28,6 +28,16 @@ incident_guide_message = "This is the message posted whenever a new incident sla iap_enabled = false iap_audience = "" +[pagerduty] +api_token = "" + +[pagerduty.escalation_policies.HIGH_SEV] +id = "" +integration_key = "" + +[pagerduty.escalation_policies.ExampleTeam] +id = "" + [datadog] api_key = "" app_key = "" diff --git a/src/firetower/config.py b/src/firetower/config.py index 052024d6..f6804c6f 100644 --- a/src/firetower/config.py +++ b/src/firetower/config.py @@ -34,6 +34,18 @@ class JIRAConfig: severity_field: str +@deserialize +class EscalationPolicy: + id: str + integration_key: str | None = None + + +@deserialize +class PagerDutyConfig: + api_token: str + escalation_policies: dict[str, EscalationPolicy] + + @deserialize class SlackConfig: bot_token: str @@ -62,6 +74,7 @@ class ConfigFile: jira: JIRAConfig slack: SlackConfig auth: AuthConfig + pagerduty: PagerDutyConfig | None project_key: str django_secret_key: str @@ -133,6 +146,7 @@ def __init__(self) -> None: iap_audience="", ) self.datadog = None + self.pagerduty = None self.project_key = "" self.django_secret_key = "" self.sentry_dsn = "" diff --git a/src/firetower/incidents/hooks.py b/src/firetower/incidents/hooks.py index b6872abf..1ad6330f 100644 --- a/src/firetower/incidents/hooks.py +++ b/src/firetower/incidents/hooks.py @@ -4,13 +4,67 @@ from django.contrib.auth.models import User from firetower.auth.models import ExternalProfileType -from firetower.incidents.models import ExternalLink, ExternalLinkType, Incident -from firetower.integrations.services import SlackService +from firetower.incidents.models import ( + ExternalLink, + ExternalLinkType, + Incident, + IncidentSeverity, +) +from firetower.integrations.services import PagerDutyService, SlackService from firetower.integrations.services.slack import escape_slack_text logger = logging.getLogger(__name__) _slack_service = SlackService() +PAGEABLE_SEVERITIES = {IncidentSeverity.P0, IncidentSeverity.P1} + + +def _get_pagerduty_service() -> PagerDutyService | None: + if not settings.PAGERDUTY: + return None + try: + return PagerDutyService() + except Exception: + logger.exception("Failed to initialize PagerDutyService") + return None + + +def _page_high_sev_if_needed(incident: Incident) -> None: + if incident.severity not in PAGEABLE_SEVERITIES: + return + + pd_config = settings.PAGERDUTY + if not pd_config: + return + + escalation_policies = pd_config.get("ESCALATION_POLICIES", {}) + high_sev_policy = escalation_policies.get("HIGH_SEV") + if not high_sev_policy: + logger.info("No HIGH_SEV escalation policy configured, skipping page") + return + + integration_key = high_sev_policy.get("integration_key") + if not integration_key: + logger.info("No integration_key for HIGH_SEV escalation policy, skipping page") + return + + pd_service = _get_pagerduty_service() + if not pd_service: + return + + dedup_key = f"firetower-{incident.incident_number}" + summary = f"[{incident.severity}] {incident.incident_number}: {incident.title}" + + links = [{"href": _build_incident_url(incident), "text": "View in Firetower"}] + slack_link = incident.external_links.filter(type=ExternalLinkType.SLACK).first() + if slack_link and slack_link.url: + links.append({"href": slack_link.url, "text": "Slack Channel"}) + + try: + pd_service.trigger_incident(summary, dedup_key, integration_key, links=links) + except Exception: + logger.exception(f"Failed to page HIGH_SEV for incident {incident.id}") + def _build_channel_name(incident: Incident) -> str: return incident.incident_number.lower() @@ -85,38 +139,47 @@ def _invite_user_to_channel( def on_incident_created(incident: Incident) -> None: + # Use get_or_create to atomically claim the ExternalLink row before calling + # the Slack API. If two concurrent requests both reach this point, only one + # will get created=True; the other bails out without creating a second channel. + slack_link = None + created = False try: - # Use get_or_create to atomically claim the ExternalLink row before calling - # the Slack API. If two concurrent requests both reach this point, only one - # will get created=True; the other bails out without creating a second channel. slack_link, created = ExternalLink.objects.get_or_create( incident=incident, type=ExternalLinkType.SLACK, defaults={"url": ""}, ) - if not created: - logger.info( - f"Incident {incident.id} already has a Slack link, skipping channel creation" - ) - return - + except Exception: + logger.exception( + f"Failed to get or create Slack ExternalLink for incident {incident.id}" + ) + channel_id = None + if not created and slack_link is not None: + logger.info( + f"Incident {incident.id} already has a Slack link, skipping channel creation" + ) + elif created and slack_link is not None: try: channel_id = _slack_service.create_channel( _build_channel_name(incident), is_private=incident.is_private ) if not channel_id: slack_link.delete() - logger.warning( + logger.error( f"Failed to create Slack channel for incident {incident.id}" ) - return - channel_url = _slack_service.build_channel_url(channel_id) - slack_link.url = channel_url - slack_link.save(update_fields=["url"]) + else: + channel_url = _slack_service.build_channel_url(channel_id) + slack_link.url = channel_url + slack_link.save(update_fields=["url"]) except Exception: slack_link.delete() - raise + logger.exception( + f"Failed to create Slack channel for incident {incident.id}" + ) + if channel_id: captain_slack_id = ( _get_slack_user_id(incident.captain) if incident.captain else None ) @@ -201,9 +264,12 @@ def on_incident_created(incident: Incident) -> None: f"Failed to post feed channel message for incident {incident.id}" ) - # TODO: Datadog notebook creation step will be added in RELENG-467 + try: + _page_high_sev_if_needed(incident) except Exception: - logger.exception(f"Error in on_incident_created for incident {incident.id}") + logger.exception(f"Failed to page for incident {incident.id}") + + # TODO: Datadog notebook creation step will be added in RELENG-467 def on_status_changed(incident: Incident, old_status: str) -> None: @@ -224,18 +290,25 @@ def on_status_changed(incident: Incident, old_status: str) -> None: def on_severity_changed(incident: Incident, old_severity: str) -> None: try: channel_id = _get_channel_id(incident) - if not channel_id: - return - - _slack_service.set_channel_topic(channel_id, _build_channel_topic(incident)) - incident_url = _build_incident_url(incident) - _slack_service.post_message( - channel_id, - f"Incident severity updated: {old_severity} -> {incident.severity}\n<{incident_url}|View in Firetower>", - ) + if channel_id: + _slack_service.set_channel_topic(channel_id, _build_channel_topic(incident)) + incident_url = _build_incident_url(incident) + _slack_service.post_message( + channel_id, + f"Incident severity updated: {old_severity} -> {incident.severity}\n<{incident_url}|View in Firetower>", + ) except Exception: logger.exception(f"Error in on_severity_changed for incident {incident.id}") + if ( + old_severity not in PAGEABLE_SEVERITIES + and incident.severity in PAGEABLE_SEVERITIES + ): + try: + _page_high_sev_if_needed(incident) + except Exception: + logger.exception(f"Failed to page for incident {incident.id}") + def on_title_changed(incident: Incident) -> None: try: diff --git a/src/firetower/incidents/tests/test_hooks.py b/src/firetower/incidents/tests/test_hooks.py index c68a43f1..5e3a23dc 100644 --- a/src/firetower/incidents/tests/test_hooks.py +++ b/src/firetower/incidents/tests/test_hooks.py @@ -7,6 +7,7 @@ from firetower.incidents.hooks import ( _build_channel_name, _build_channel_topic, + _page_high_sev_if_needed, on_captain_changed, on_incident_created, on_severity_changed, @@ -573,3 +574,226 @@ def test_noop_without_slack_link(self, mock_slack): on_captain_changed(incident) mock_slack.set_channel_topic.assert_not_called() + + +MOCK_PD_CONFIG = { + "API_TOKEN": "test-token", + "ESCALATION_POLICIES": { + "HIGH_SEV": { + "id": "P17I207", + "integration_key": "test-integration-key", + }, + }, +} + + +@pytest.mark.django_db +class TestPageHighSevIfNeeded: + @patch("firetower.incidents.hooks.PagerDutyService") + def test_pages_for_p0(self, mock_pd_cls, settings): + settings.PAGERDUTY = MOCK_PD_CONFIG + settings.FIRETOWER_BASE_URL = "https://firetower.example.com" + mock_pd = mock_pd_cls.return_value + mock_pd.trigger_incident.return_value = True + + incident = Incident.objects.create( + title="Major outage", + severity=IncidentSeverity.P0, + ) + + _page_high_sev_if_needed(incident) + + mock_pd.trigger_incident.assert_called_once_with( + f"[P0] {incident.incident_number}: Major outage", + f"firetower-{incident.incident_number}", + "test-integration-key", + links=[ + { + "href": f"https://firetower.example.com/{incident.incident_number}", + "text": "View in Firetower", + } + ], + ) + + @patch("firetower.incidents.hooks.PagerDutyService") + def test_pages_for_p1(self, mock_pd_cls, settings): + settings.PAGERDUTY = MOCK_PD_CONFIG + mock_pd = mock_pd_cls.return_value + mock_pd.trigger_incident.return_value = True + + incident = Incident.objects.create( + title="Service degradation", + severity=IncidentSeverity.P1, + ) + + _page_high_sev_if_needed(incident) + + mock_pd.trigger_incident.assert_called_once() + + @patch("firetower.incidents.hooks.PagerDutyService") + def test_skips_for_p2(self, mock_pd_cls, settings): + settings.PAGERDUTY = MOCK_PD_CONFIG + + incident = Incident.objects.create( + title="Minor issue", + severity=IncidentSeverity.P2, + ) + + _page_high_sev_if_needed(incident) + + mock_pd_cls.assert_not_called() + + @patch("firetower.incidents.hooks.PagerDutyService") + def test_skips_when_pagerduty_not_configured(self, mock_pd_cls, settings): + settings.PAGERDUTY = None + + incident = Incident.objects.create( + title="Test", + severity=IncidentSeverity.P0, + ) + + _page_high_sev_if_needed(incident) + + mock_pd_cls.assert_not_called() + + def test_skips_when_no_high_sev_policy(self, settings): + settings.PAGERDUTY = { + "API_TOKEN": "test-token", + "ESCALATION_POLICIES": {}, + } + + incident = Incident.objects.create( + title="Test", + severity=IncidentSeverity.P0, + ) + + _page_high_sev_if_needed(incident) + + def test_skips_when_no_integration_key(self, settings): + settings.PAGERDUTY = { + "API_TOKEN": "test-token", + "ESCALATION_POLICIES": { + "HIGH_SEV": {"id": "P17I207", "integration_key": None}, + }, + } + + incident = Incident.objects.create( + title="Test", + severity=IncidentSeverity.P0, + ) + + _page_high_sev_if_needed(incident) + + +@pytest.mark.django_db +class TestOnIncidentCreatedPagerDuty: + @patch("firetower.incidents.hooks._page_high_sev_if_needed") + @patch("firetower.incidents.hooks._slack_service") + def test_pages_high_sev_on_p0_creation(self, mock_slack, mock_page): + mock_slack.create_channel.return_value = "C99999" + mock_slack.build_channel_url.return_value = "https://slack.com/archives/C99999" + + incident = Incident.objects.create( + title="Major outage", + severity=IncidentSeverity.P0, + ) + + on_incident_created(incident) + + mock_page.assert_called_once_with(incident) + + @patch("firetower.incidents.hooks._page_high_sev_if_needed") + @patch("firetower.incidents.hooks._slack_service") + def test_on_incident_created_calls_page_high_sev_if_needed( + self, mock_slack, mock_page + ): + mock_slack.create_channel.return_value = "C99999" + mock_slack.build_channel_url.return_value = "https://slack.com/archives/C99999" + + incident = Incident.objects.create( + title="Minor issue", + severity=IncidentSeverity.P3, + ) + + on_incident_created(incident) + + mock_page.assert_called_once_with(incident) + + +@pytest.mark.django_db +class TestOnSeverityChangedPagerDuty: + @patch("firetower.incidents.hooks._page_high_sev_if_needed") + @patch("firetower.incidents.hooks._slack_service") + def test_pages_high_sev_on_upgrade_to_p0(self, mock_slack, mock_page): + mock_slack.parse_channel_id_from_url.return_value = "C12345" + + incident = Incident.objects.create( + title="Escalating issue", + severity=IncidentSeverity.P0, + ) + ExternalLink.objects.create( + incident=incident, + type=ExternalLinkType.SLACK, + url="https://slack.com/archives/C12345", + ) + + on_severity_changed(incident, IncidentSeverity.P2) + + mock_page.assert_called_once_with(incident) + + @patch("firetower.incidents.hooks._page_high_sev_if_needed") + @patch("firetower.incidents.hooks._slack_service") + def test_pages_high_sev_on_upgrade_to_p1(self, mock_slack, mock_page): + mock_slack.parse_channel_id_from_url.return_value = "C12345" + + incident = Incident.objects.create( + title="Escalating issue", + severity=IncidentSeverity.P1, + ) + ExternalLink.objects.create( + incident=incident, + type=ExternalLinkType.SLACK, + url="https://slack.com/archives/C12345", + ) + + on_severity_changed(incident, IncidentSeverity.P3) + + mock_page.assert_called_once_with(incident) + + @patch("firetower.incidents.hooks._page_high_sev_if_needed") + @patch("firetower.incidents.hooks._slack_service") + def test_does_not_page_on_p1_to_p0(self, mock_slack, mock_page): + mock_slack.parse_channel_id_from_url.return_value = "C12345" + + incident = Incident.objects.create( + title="Already paged", + severity=IncidentSeverity.P0, + ) + ExternalLink.objects.create( + incident=incident, + type=ExternalLinkType.SLACK, + url="https://slack.com/archives/C12345", + ) + + on_severity_changed(incident, IncidentSeverity.P1) + + mock_page.assert_not_called() + + @patch("firetower.incidents.hooks._page_high_sev_if_needed") + @patch("firetower.incidents.hooks._slack_service") + def test_does_not_page_on_downgrade(self, mock_slack, mock_page): + mock_slack.parse_channel_id_from_url.return_value = "C12345" + + incident = Incident.objects.create( + title="Downgraded", + severity=IncidentSeverity.P3, + ) + ExternalLink.objects.create( + incident=incident, + type=ExternalLinkType.SLACK, + url="https://slack.com/archives/C12345", + ) + + on_severity_changed(incident, IncidentSeverity.P1) + + mock_page.assert_not_called() diff --git a/src/firetower/integrations/services/__init__.py b/src/firetower/integrations/services/__init__.py index 568cfcb7..55f0e121 100644 --- a/src/firetower/integrations/services/__init__.py +++ b/src/firetower/integrations/services/__init__.py @@ -1,6 +1,7 @@ """Services package for external integrations.""" from .jira import JiraService +from .pagerduty import PagerDutyService from .slack import SlackService -__all__ = ["JiraService", "SlackService"] +__all__ = ["JiraService", "PagerDutyService", "SlackService"] diff --git a/src/firetower/integrations/services/pagerduty.py b/src/firetower/integrations/services/pagerduty.py new file mode 100644 index 00000000..be08dcf8 --- /dev/null +++ b/src/firetower/integrations/services/pagerduty.py @@ -0,0 +1,83 @@ +import logging + +import requests +from django.conf import settings + +logger = logging.getLogger(__name__) + +EVENTS_API_URL = "https://events.pagerduty.com/v2/enqueue" +REST_API_URL = "https://api.pagerduty.com" + + +class PagerDutyService: + def __init__(self) -> None: + pd_config = settings.PAGERDUTY + if not pd_config: + raise ValueError("PagerDuty is not configured") + + self.api_token = pd_config["API_TOKEN"] + + def trigger_incident( + self, + summary: str, + dedup_key: str, + integration_key: str, + links: list[dict] | None = None, + ) -> bool: + payload: dict = { + "routing_key": integration_key, + "event_action": "trigger", + "dedup_key": dedup_key, + "payload": { + "summary": summary, + "severity": "critical", + "source": "firetower", + }, + } + if links: + payload["links"] = links + + try: + resp = requests.post(EVENTS_API_URL, json=payload, timeout=10) + resp.raise_for_status() + logger.info( + "Triggered PagerDuty incident", + extra={"dedup_key": dedup_key}, + ) + return True + except requests.RequestException: + logger.exception( + "Failed to trigger PagerDuty incident", + extra={"dedup_key": dedup_key}, + ) + return False + + def get_oncall_users(self, escalation_policy_id: str) -> list[dict]: + headers = { + "Authorization": f"Token token={self.api_token}", + "Content-Type": "application/json", + } + + url = f"{REST_API_URL}/oncalls" + params = {"escalation_policy_ids[]": escalation_policy_id} + try: + resp = requests.get(url, headers=headers, params=params, timeout=10) + resp.raise_for_status() + results = [] + for oncall in resp.json().get("oncalls", []): + user = oncall.get("user", {}) + email = user.get("email") + if email: + results.append( + { + "email": email, + "escalation_level": oncall.get("escalation_level"), + } + ) + return results + except requests.RequestException: + logger.exception( + "Failed to fetch oncall users from PagerDuty", + extra={"escalation_policy_id": escalation_policy_id}, + ) + return [] diff --git a/src/firetower/integrations/tests/__init__.py b/src/firetower/integrations/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/firetower/integrations/test_jira.py b/src/firetower/integrations/tests/test_jira.py similarity index 99% rename from src/firetower/integrations/test_jira.py rename to src/firetower/integrations/tests/test_jira.py index 855a053d..9dde0a69 100644 --- a/src/firetower/integrations/test_jira.py +++ b/src/firetower/integrations/tests/test_jira.py @@ -7,7 +7,7 @@ import pytest -from .services.jira import JiraService +from firetower.integrations.services.jira import JiraService # Set up Django settings os.environ.setdefault("DJANGO_SETTINGS_MODULE", "firetower.settings") diff --git a/src/firetower/integrations/tests/test_pagerduty.py b/src/firetower/integrations/tests/test_pagerduty.py new file mode 100644 index 00000000..80f84596 --- /dev/null +++ b/src/firetower/integrations/tests/test_pagerduty.py @@ -0,0 +1,124 @@ +from unittest.mock import MagicMock, patch + +import pytest +import requests + +from firetower.integrations.services.pagerduty import PagerDutyService + +MOCK_PD_CONFIG = { + "API_TOKEN": "test-token", + "ESCALATION_POLICIES": { + "HIGH_SEV": { + "id": "P17I207", + "integration_key": "test-integration-key", + }, + "ProdEng": { + "id": "PABC123", + "integration_key": None, + }, + }, +} + + +@pytest.fixture +def pd_service(): + with patch("firetower.integrations.services.pagerduty.settings") as mock_settings: + mock_settings.PAGERDUTY = MOCK_PD_CONFIG + return PagerDutyService() + + +class TestPagerDutyServiceInit: + def test_init_raises_when_unconfigured(self): + with patch( + "firetower.integrations.services.pagerduty.settings" + ) as mock_settings: + mock_settings.PAGERDUTY = None + with pytest.raises(ValueError, match="not configured"): + PagerDutyService() + + def test_init_stores_config(self, pd_service): + assert pd_service.api_token == "test-token" + + +class TestTriggerIncident: + @patch("firetower.integrations.services.pagerduty.requests.post") + def test_trigger_success(self, mock_post, pd_service): + mock_post.return_value = MagicMock(status_code=202) + + result = pd_service.trigger_incident("Server down", "dedup-123", "int-key") + + assert result is True + mock_post.assert_called_once() + call_kwargs = mock_post.call_args + assert call_kwargs.kwargs["json"]["routing_key"] == "int-key" + assert call_kwargs.kwargs["json"]["dedup_key"] == "dedup-123" + assert call_kwargs.kwargs["json"]["payload"]["summary"] == "Server down" + + @patch("firetower.integrations.services.pagerduty.requests.post") + def test_trigger_failure(self, mock_post, pd_service): + mock_post.side_effect = requests.RequestException("connection error") + + result = pd_service.trigger_incident("Server down", "dedup-123", "int-key") + + assert result is False + + +class TestGetOncallUsers: + @patch("firetower.integrations.services.pagerduty.requests.get") + def test_returns_users_with_escalation_level(self, mock_get, pd_service): + mock_get.return_value = MagicMock( + json=lambda: { + "oncalls": [ + { + "user": {"email": "alice@example.com"}, + "escalation_level": 1, + }, + { + "user": {"email": "bob@example.com"}, + "escalation_level": 2, + }, + ] + } + ) + + users = pd_service.get_oncall_users("P17I207") + + assert users == [ + {"email": "alice@example.com", "escalation_level": 1}, + {"email": "bob@example.com", "escalation_level": 2}, + ] + mock_get.assert_called_once() + assert mock_get.call_args.kwargs["params"] == { + "escalation_policy_ids[]": "P17I207" + } + + @patch("firetower.integrations.services.pagerduty.requests.get") + def test_returns_empty_list_when_no_oncalls(self, mock_get, pd_service): + mock_get.return_value = MagicMock(json=lambda: {"oncalls": []}) + + users = pd_service.get_oncall_users("P17I207") + + assert users == [] + + @patch("firetower.integrations.services.pagerduty.requests.get") + def test_returns_empty_list_on_api_error(self, mock_get, pd_service): + mock_get.side_effect = requests.RequestException("timeout") + + users = pd_service.get_oncall_users("P17I207") + + assert users == [] + + @patch("firetower.integrations.services.pagerduty.requests.get") + def test_skips_users_without_email(self, mock_get, pd_service): + mock_get.return_value = MagicMock( + json=lambda: { + "oncalls": [ + {"user": {}, "escalation_level": 1}, + {"user": {"email": "alice@example.com"}, "escalation_level": 2}, + ] + } + ) + + users = pd_service.get_oncall_users("P17I207") + + assert users == [{"email": "alice@example.com", "escalation_level": 2}] diff --git a/src/firetower/integrations/test_slack.py b/src/firetower/integrations/tests/test_slack.py similarity index 99% rename from src/firetower/integrations/test_slack.py rename to src/firetower/integrations/tests/test_slack.py index fba3f107..088e3376 100644 --- a/src/firetower/integrations/test_slack.py +++ b/src/firetower/integrations/tests/test_slack.py @@ -7,7 +7,7 @@ from slack_sdk.errors import SlackApiError -from .services.slack import SlackService +from firetower.integrations.services.slack import SlackService # Set up Django settings os.environ.setdefault("DJANGO_SETTINGS_MODULE", "firetower.settings") diff --git a/src/firetower/settings.py b/src/firetower/settings.py index e69af035..37472bd0 100644 --- a/src/firetower/settings.py +++ b/src/firetower/settings.py @@ -231,6 +231,22 @@ class SlackSettings(TypedDict): FIRETOWER_BASE_URL = config.firetower_base_url HOOKS_ENABLED = config.hooks_enabled +# PagerDuty Integration Configuration +PAGERDUTY = ( + { + "API_TOKEN": config.pagerduty.api_token, + "ESCALATION_POLICIES": { + name: { + "id": policy.id, + "integration_key": policy.integration_key, + } + for name, policy in config.pagerduty.escalation_policies.items() + }, + } + if config.pagerduty + else None +) + # Django REST Framework Configuration REST_FRAMEWORK = { # Pagination