Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
3433302
Add PagerDuty config with escalation policies
spalmurray Mar 25, 2026
d192e12
Add PagerDuty service client
spalmurray Mar 25, 2026
8efa528
Move integration tests into tests/ subdirectory and add PagerDuty tests
spalmurray Mar 25, 2026
ba0683a
Wire PagerDuty IMOC paging into incident hooks for P0/P1 incidents
spalmurray Apr 7, 2026
e53acd8
Rename IMOC references to HIGH_SEV in PagerDuty integration
spalmurray Apr 7, 2026
8f5e1ca
Move paging decision logic from PagerDutyService into hooks
spalmurray Apr 7, 2026
2d0256e
Rename test to accurately describe what it verifies
spalmurray Apr 7, 2026
16f632e
Move PagerDuty paging before Slack operations so it runs even if Slac…
spalmurray Apr 7, 2026
622b922
Don't skip non-Slack operations when Slack channel creation fails in …
spalmurray Apr 7, 2026
139a3d2
Refactor hooks
spalmurray Apr 7, 2026
afb29e1
Add slack and firetower links to page
spalmurray Apr 7, 2026
3f01f61
Fix on_severity_changed early return blocking PD paging
spalmurray Apr 7, 2026
5824152
Fix test_pages_for_p0 assertion to include links kwarg
spalmurray Apr 7, 2026
a5cc893
Fix test_pages_for_p0 to use explicit FIRETOWER_BASE_URL
spalmurray Apr 7, 2026
ab88efe
Re-add outer try/except safety net around on_incident_created
spalmurray Apr 7, 2026
af91ed6
Revert "Re-add outer try/except safety net around on_incident_created"
spalmurray Apr 7, 2026
c815882
Wrap get_or_create ExternalLink in try/except instead of wrapping ent…
spalmurray Apr 7, 2026
2454c05
Move PD paging before ExternalLink creation so DB failures don't skip it
spalmurray Apr 7, 2026
3880417
Move PD paging after Slack setup and handle Slack link creation failu…
spalmurray Apr 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions config.ci.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,8 @@ incident_guide_message = ""
[auth]
iap_enabled = false
iap_audience = ""

[pagerduty]
api_token = ""

[pagerduty.escalation_policies]
10 changes: 10 additions & 0 deletions config.example.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,16 @@ incident_guide_message = "This is the message posted whenever a new incident sla
iap_enabled = false
iap_audience = ""

[pagerduty]
api_token = ""

[pagerduty.escalation_policies.HIGH_SEV]
id = ""
integration_key = ""

[pagerduty.escalation_policies.ExampleTeam]
id = ""

[datadog]
api_key = ""
app_key = ""
14 changes: 14 additions & 0 deletions src/firetower/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,18 @@ class JIRAConfig:
severity_field: str


@deserialize
class EscalationPolicy:
id: str
integration_key: str | None = None


@deserialize
class PagerDutyConfig:
api_token: str
escalation_policies: dict[str, EscalationPolicy]


@deserialize
class SlackConfig:
bot_token: str
Expand Down Expand Up @@ -62,6 +74,7 @@ class ConfigFile:
jira: JIRAConfig
slack: SlackConfig
auth: AuthConfig
pagerduty: PagerDutyConfig | None

project_key: str
django_secret_key: str
Expand Down Expand Up @@ -133,6 +146,7 @@ def __init__(self) -> None:
iap_audience="",
)
self.datadog = None
self.pagerduty = None
self.project_key = ""
self.django_secret_key = ""
self.sentry_dsn = ""
Expand Down
129 changes: 101 additions & 28 deletions src/firetower/incidents/hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,67 @@
from django.contrib.auth.models import User

from firetower.auth.models import ExternalProfileType
from firetower.incidents.models import ExternalLink, ExternalLinkType, Incident
from firetower.integrations.services import SlackService
from firetower.incidents.models import (
ExternalLink,
ExternalLinkType,
Incident,
IncidentSeverity,
)
from firetower.integrations.services import PagerDutyService, SlackService
from firetower.integrations.services.slack import escape_slack_text

logger = logging.getLogger(__name__)
_slack_service = SlackService()

PAGEABLE_SEVERITIES = {IncidentSeverity.P0, IncidentSeverity.P1}


def _get_pagerduty_service() -> PagerDutyService | None:
if not settings.PAGERDUTY:
return None
try:
return PagerDutyService()
except Exception:
logger.exception("Failed to initialize PagerDutyService")
return None


def _page_high_sev_if_needed(incident: Incident) -> None:
if incident.severity not in PAGEABLE_SEVERITIES:
return

pd_config = settings.PAGERDUTY
if not pd_config:
return

escalation_policies = pd_config.get("ESCALATION_POLICIES", {})
high_sev_policy = escalation_policies.get("HIGH_SEV")
if not high_sev_policy:
logger.info("No HIGH_SEV escalation policy configured, skipping page")
return

integration_key = high_sev_policy.get("integration_key")
if not integration_key:
logger.info("No integration_key for HIGH_SEV escalation policy, skipping page")
return

pd_service = _get_pagerduty_service()
if not pd_service:
return

dedup_key = f"firetower-{incident.incident_number}"
summary = f"[{incident.severity}] {incident.incident_number}: {incident.title}"

links = [{"href": _build_incident_url(incident), "text": "View in Firetower"}]
slack_link = incident.external_links.filter(type=ExternalLinkType.SLACK).first()
if slack_link and slack_link.url:
links.append({"href": slack_link.url, "text": "Slack Channel"})

try:
pd_service.trigger_incident(summary, dedup_key, integration_key, links=links)
except Exception:
logger.exception(f"Failed to page HIGH_SEV for incident {incident.id}")


def _build_channel_name(incident: Incident) -> str:
return incident.incident_number.lower()
Expand Down Expand Up @@ -85,38 +139,47 @@


def on_incident_created(incident: Incident) -> None:
# Use get_or_create to atomically claim the ExternalLink row before calling
# the Slack API. If two concurrent requests both reach this point, only one
# will get created=True; the other bails out without creating a second channel.
slack_link = None
created = False
try:
# Use get_or_create to atomically claim the ExternalLink row before calling
# the Slack API. If two concurrent requests both reach this point, only one
# will get created=True; the other bails out without creating a second channel.
slack_link, created = ExternalLink.objects.get_or_create(
incident=incident,
type=ExternalLinkType.SLACK,
defaults={"url": ""},
)
if not created:
logger.info(
f"Incident {incident.id} already has a Slack link, skipping channel creation"
)
return

except Exception:
logger.exception(
f"Failed to get or create Slack ExternalLink for incident {incident.id}"
)

Check warning on line 156 in src/firetower/incidents/hooks.py

View workflow job for this annotation

GitHub Actions / warden: code-review

Database exception silently swallows error and leaves function in inconsistent state

When the `ExternalLink.objects.get_or_create` call fails (lines 147-156), the exception is caught and logged, but the function continues execution with `slack_link=None` and `created=False`. This means `channel_id` stays `None`, the PagerDuty paging at line 267-270 will still run, but the incident is left without a Slack channel with no retry mechanism. The original code would have re-raised, allowing the caller to handle or retry the operation.
channel_id = None
if not created and slack_link is not None:
logger.info(
f"Incident {incident.id} already has a Slack link, skipping channel creation"
)
elif created and slack_link is not None:
try:
channel_id = _slack_service.create_channel(
_build_channel_name(incident), is_private=incident.is_private
)
if not channel_id:
slack_link.delete()
logger.warning(
logger.error(
f"Failed to create Slack channel for incident {incident.id}"
)
return
channel_url = _slack_service.build_channel_url(channel_id)
slack_link.url = channel_url
slack_link.save(update_fields=["url"])
else:
channel_url = _slack_service.build_channel_url(channel_id)
slack_link.url = channel_url
slack_link.save(update_fields=["url"])
except Exception:
slack_link.delete()
raise
logger.exception(
f"Failed to create Slack channel for incident {incident.id}"
)

Check warning on line 180 in src/firetower/incidents/hooks.py

View workflow job for this annotation

GitHub Actions / warden: code-review

[XM4-8WK] Database exception silently swallows error and leaves function in inconsistent state (additional location)

When the `ExternalLink.objects.get_or_create` call fails (lines 147-156), the exception is caught and logged, but the function continues execution with `slack_link=None` and `created=False`. This means `channel_id` stays `None`, the PagerDuty paging at line 267-270 will still run, but the incident is left without a Slack channel with no retry mechanism. The original code would have re-raised, allowing the caller to handle or retry the operation.

Check warning on line 180 in src/firetower/incidents/hooks.py

View workflow job for this annotation

GitHub Actions / warden: find-bugs

Slack channel created but ExternalLink deleted when save() fails

When `slack_link.save(update_fields=["url"])` raises an exception, the except block deletes the ExternalLink but does not reset `channel_id` to None. Since `channel_id` retains its value from the successful `create_channel()` call, the subsequent `if channel_id:` block executes, setting up channel topic, bookmarks, and posting messages to a Slack channel that has no corresponding ExternalLink in the database. This creates an orphaned channel that cannot be found by other hooks (e.g., on_status_changed, on_severity_changed) because they rely on querying the ExternalLink.

Comment on lines +175 to 181
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: If saving the SlackLink model fails, the channel_id is not cleared, leading to subsequent operations on an orphaned Slack channel not linked to the incident.
Severity: MEDIUM

Suggested Fix

Wrap the SlackLink.objects.create() call in a try...except block. In the except block, ensure the channel_id is reset to None to prevent further operations on the orphaned channel.

Prompt for AI Agent
Review the code at the location below. A potential bug has been identified by an AI
agent.
Verify if this is a real issue. If it is, propose a fix; if not, explain why it's not
valid.

Location: src/firetower/incidents/hooks.py#L175-L181

Potential issue: If the `SlackLink` model fails to save after a new Slack channel has
been successfully created, the `channel_id` variable is not reset. As a result,
subsequent code will attempt to perform operations like setting the topic, adding
bookmarks, or posting messages to this 'orphaned' channel. This channel exists in Slack
but is not linked to any incident in the application's database, leading to an
inconsistent state and breaking the incident management workflow for that specific
event.

if channel_id:
captain_slack_id = (
_get_slack_user_id(incident.captain) if incident.captain else None
)
Expand Down Expand Up @@ -201,9 +264,12 @@
f"Failed to post feed channel message for incident {incident.id}"
)

# TODO: Datadog notebook creation step will be added in RELENG-467
try:
_page_high_sev_if_needed(incident)
except Exception:
logger.exception(f"Error in on_incident_created for incident {incident.id}")
logger.exception(f"Failed to page for incident {incident.id}")

# TODO: Datadog notebook creation step will be added in RELENG-467


def on_status_changed(incident: Incident, old_status: str) -> None:
Expand All @@ -224,18 +290,25 @@
def on_severity_changed(incident: Incident, old_severity: str) -> None:
try:
channel_id = _get_channel_id(incident)
if not channel_id:
return

_slack_service.set_channel_topic(channel_id, _build_channel_topic(incident))
incident_url = _build_incident_url(incident)
_slack_service.post_message(
channel_id,
f"Incident severity updated: {old_severity} -> {incident.severity}\n<{incident_url}|View in Firetower>",
)
if channel_id:
_slack_service.set_channel_topic(channel_id, _build_channel_topic(incident))
incident_url = _build_incident_url(incident)
_slack_service.post_message(
channel_id,
f"Incident severity updated: {old_severity} -> {incident.severity}\n<{incident_url}|View in Firetower>",
)
except Exception:
logger.exception(f"Error in on_severity_changed for incident {incident.id}")

if (
old_severity not in PAGEABLE_SEVERITIES
and incident.severity in PAGEABLE_SEVERITIES
):
try:
_page_high_sev_if_needed(incident)
except Exception:
logger.exception(f"Failed to page for incident {incident.id}")


def on_title_changed(incident: Incident) -> None:
try:
Expand Down
Loading
Loading