From 4300d8920b60c403a8e67d3de4be7f1b6106a13e Mon Sep 17 00:00:00 2001 From: Nathan Tournant Date: Thu, 28 May 2026 07:22:29 +0000 Subject: [PATCH] fix(host_tags): skip 404 "Host doesn't exist" instead of failing the resource In dynamic infrastructure (autoscaled VMs, ephemeral GKE nodes, short-lived containers), it's common for hosts to exist in the source org's tag inventory but no longer exist on the destination by the time sync runs. The destination API correctly returns `404 Not Found - {"errors":["Host doesn't exist"]}` for these. Before this change, every such 404 raised CustomClientHTTPError, which resources_handler's generic `except Exception` bucketed as a hard failure, producing an ERROR line per missing host and aborting the whole host_tags resource for that run. After: 404 -> log INFO + raise SkipResource so the existing handler SkipResource branch buckets it as `skipped`. Other status codes (4xx/5xx) continue to propagate so the retry layer and failure accounting still engage. - Caught in update_resource; create_resource delegates so the same skip fires there. - Logged at INFO via the project's LOGGER_NAME logger so a future refactor cannot accidentally promote the line back to ERROR. - 9 new unit tests pin the contract: 404 skips, 400/403/500 propagate (with explicit status_code assertions), happy-path unchanged, create-vs-update delegation, multi-host loop continues, all-404 path completes cleanly. --- datadog_sync/model/host_tags.py | 22 +++++- tests/unit/test_host_tags.py | 128 ++++++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+), 1 deletion(-) create mode 100644 tests/unit/test_host_tags.py diff --git a/datadog_sync/model/host_tags.py b/datadog_sync/model/host_tags.py index dbcc54ca..240ccc75 100644 --- a/datadog_sync/model/host_tags.py +++ b/datadog_sync/model/host_tags.py @@ -4,14 +4,19 @@ # Copyright 2019 Datadog, Inc. from __future__ import annotations +import logging from collections import defaultdict from typing import TYPE_CHECKING, Optional, List, Dict, Tuple +from datadog_sync.constants import LOGGER_NAME from datadog_sync.utils.base_resource import BaseResource, ResourceConfig +from datadog_sync.utils.resource_utils import CustomClientHTTPError, SkipResource if TYPE_CHECKING: from datadog_sync.utils.custom_client import CustomClient +log = logging.getLogger(LOGGER_NAME) + class HostTags(BaseResource): resource_type = "host_tags" @@ -52,7 +57,22 @@ async def create_resource(self, _id: str, resource: Dict) -> Tuple[str, Dict]: async def update_resource(self, _id: str, resource: Dict) -> Tuple[str, Dict]: destination_client = self.config.destination_client body = {"tags": resource} - resp = await destination_client.put(self.resource_config.base_path + f"/{_id}", body) + try: + resp = await destination_client.put(self.resource_config.base_path + f"/{_id}", body) + except CustomClientHTTPError as e: + if e.status_code == 404: + # Source orgs frequently carry ephemeral hosts (GKE node pools, + # autoscaled VMs) that no longer exist on destination. 404 here + # means "host gone — nothing to tag" and is the correct skip + # signal, not a sync failure. Other status codes (4xx/5xx) still + # propagate so the retry layer and failure accounting engage. + log.info(f"[host_tags - {_id}] skipping: host no longer exists on destination") + raise SkipResource( + _id, + self.resource_type, + f"host no longer exists on destination ({_id})", + ) from None + raise return _id, resp["tags"] diff --git a/tests/unit/test_host_tags.py b/tests/unit/test_host_tags.py new file mode 100644 index 00000000..80e783e0 --- /dev/null +++ b/tests/unit/test_host_tags.py @@ -0,0 +1,128 @@ +# Unless explicitly stated otherwise all files in this repository are licensed +# under the 3-clause BSD style license (see LICENSE). +# This product includes software developed at Datadog (https://www.datadoghq.com/). +# Copyright 2019 Datadog, Inc. + +"""Unit tests for host_tags 404 skip contract (NATHAN-53).""" + +import asyncio +import logging +from types import SimpleNamespace +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from datadog_sync.constants import LOGGER_NAME +from datadog_sync.model.host_tags import HostTags +from datadog_sync.utils.resource_utils import CustomClientHTTPError, SkipResource + + +def _http_error(status, message="Host doesn't exist"): + return CustomClientHTTPError(SimpleNamespace(status=status, message="err"), message=message) + + +@pytest.fixture +def host_tags(): + mock_config = MagicMock() + mock_config.state = MagicMock() + mock_config.destination_client = AsyncMock() + return HostTags(mock_config) + + +def test_update_resource_404_raises_skip_resource(host_tags): + host_tags.config.destination_client.put = AsyncMock(side_effect=_http_error(404)) + with pytest.raises(SkipResource) as exc_info: + asyncio.run(host_tags.update_resource("missing-host.example.com", ["env:prod"])) + assert "missing-host.example.com" in str(exc_info.value) + + +def test_update_resource_404_logs_at_info_level(host_tags, caplog): + host_tags.config.destination_client.put = AsyncMock(side_effect=_http_error(404)) + with caplog.at_level(logging.INFO, logger=LOGGER_NAME): + with pytest.raises(SkipResource): + asyncio.run(host_tags.update_resource("missing-host.example.com", ["env:prod"])) + skip_records = [ + r + for r in caplog.records + if r.name == LOGGER_NAME + and "missing-host.example.com" in r.getMessage() + and "host no longer exists" in r.getMessage().lower() + ] + assert skip_records, "Expected an INFO log identifying the skipped host. caplog records: %r" % [ + (r.levelname, r.getMessage()) for r in caplog.records + ] + assert all(r.levelno == logging.INFO for r in skip_records) + + +def test_update_resource_500_propagates(host_tags): + host_tags.config.destination_client.put = AsyncMock(side_effect=_http_error(500, "Internal Server Error")) + with pytest.raises(CustomClientHTTPError) as exc_info: + asyncio.run(host_tags.update_resource("some-host", ["env:prod"])) + assert exc_info.value.status_code == 500 + + +def test_update_resource_400_propagates(host_tags): + host_tags.config.destination_client.put = AsyncMock(side_effect=_http_error(400, "Bad Request")) + with pytest.raises(CustomClientHTTPError) as exc_info: + asyncio.run(host_tags.update_resource("some-host", ["env:prod"])) + assert exc_info.value.status_code == 400 + + +def test_update_resource_403_propagates(host_tags): + host_tags.config.destination_client.put = AsyncMock(side_effect=_http_error(403, "Forbidden")) + with pytest.raises(CustomClientHTTPError) as exc_info: + asyncio.run(host_tags.update_resource("some-host", ["env:prod"])) + assert exc_info.value.status_code == 403 + + +def test_update_resource_200_unchanged(host_tags): + host_tags.config.destination_client.put = AsyncMock(return_value={"tags": ["env:prod", "team:hamr"]}) + _id, tags = asyncio.run(host_tags.update_resource("live-host", ["env:prod", "team:hamr"])) + assert _id == "live-host" + assert tags == ["env:prod", "team:hamr"] + host_tags.config.destination_client.put.assert_awaited_once_with( + "/api/v1/tags/hosts/live-host", {"tags": ["env:prod", "team:hamr"]} + ) + + +def test_create_resource_delegates_404_skip(host_tags): + host_tags.config.destination_client.put = AsyncMock(side_effect=_http_error(404)) + with pytest.raises(SkipResource): + asyncio.run(host_tags.create_resource("missing-host", ["env:prod"])) + + +def test_multi_host_loop_continues_after_404(host_tags): + host_tags.config.destination_client.put = AsyncMock( + side_effect=[ + {"tags": ["env:prod"]}, + _http_error(404), + {"tags": ["env:staging"]}, + ] + ) + synced, skipped = [], [] + for host_id, tags in [ + ("host-a", ["env:prod"]), + ("host-b", ["env:prod"]), + ("host-c", ["env:staging"]), + ]: + try: + _id, returned_tags = asyncio.run(host_tags.update_resource(host_id, tags)) + synced.append((_id, returned_tags)) + except SkipResource: + skipped.append(host_id) + assert synced == [("host-a", ["env:prod"]), ("host-c", ["env:staging"])] + assert skipped == ["host-b"] + assert host_tags.config.destination_client.put.await_count == 3 + + +def test_all_404_completes_cleanly(host_tags): + host_tags.config.destination_client.put = AsyncMock(side_effect=_http_error(404)) + skipped = 0 + for host_id in ("host-a", "host-b", "host-c"): + try: + asyncio.run(host_tags.update_resource(host_id, ["env:prod"])) + except SkipResource: + skipped += 1 + except CustomClientHTTPError: + pytest.fail("all-404 path must not surface CustomClientHTTPError") + assert skipped == 3