From d770db70d96dfda95077cc90533deecaf8dc5568 Mon Sep 17 00:00:00 2001 From: James Cardenas Date: Thu, 11 Jun 2026 18:54:55 -0700 Subject: [PATCH 1/8] fix(agentex): bootstrap OTel auto-instrumentation in uvicorn spawn workers --- agentex/src/api/app.py | 4 +- agentex/src/utils/otel_metrics.py | 71 ++++++++++++++++--- agentex/tests/unit/utils/test_otel_metrics.py | 47 ++++++++++++ 3 files changed, 111 insertions(+), 11 deletions(-) diff --git a/agentex/src/api/app.py b/agentex/src/api/app.py index 76f80015..580e45b5 100644 --- a/agentex/src/api/app.py +++ b/agentex/src/api/app.py @@ -1,3 +1,6 @@ +# First import: bootstrap auto-instrumentation before any instrumented library. +from src.utils.otel_metrics import init_otel_metrics, shutdown_otel_metrics + import os from contextlib import asynccontextmanager from pathlib import Path @@ -38,7 +41,6 @@ from src.config.environment_variables import EnvVarKeys from src.domain.exceptions import GenericException from src.utils.logging import make_logger -from src.utils.otel_metrics import init_otel_metrics, shutdown_otel_metrics logger = make_logger(__name__) diff --git a/agentex/src/utils/otel_metrics.py b/agentex/src/utils/otel_metrics.py index e676fbe9..934ea7a5 100644 --- a/agentex/src/utils/otel_metrics.py +++ b/agentex/src/utils/otel_metrics.py @@ -1,27 +1,78 @@ """ -OpenTelemetry metrics configuration for Agentex. +OpenTelemetry bootstrap and custom metrics for Agentex. -When auto-instrumentation (e.g. OTel Operator) has already installed a global -MeterProvider, custom app metrics attach to it instead of replacing it. -Otherwise this module creates its own provider with OTLP export when an endpoint -is configured. +Two responsibilities: -Environment Variables: - OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: Metrics OTLP endpoint (falls back to +1. 
**Auto-instrumentation** — ``bootstrap_auto_instrumentation()`` runs at import + (keep ``otel_metrics`` first in ``app.py``, before any auto-instrumented + library) so ``initialize()`` executes in each uvicorn spawn worker when + contrib packages are installed. + +2. **Custom app metrics** — ``init_otel_metrics()`` registers Agentex instruments + (``auth_cache_*``, ``db_*``, etc.). Attaches to an existing global + ``MeterProvider`` from bootstrap/operator when present; otherwise creates a + standalone OTLP pipeline when an endpoint is configured. + +Environment variables (custom metrics / standalone mode): + OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: Metrics endpoint (falls back to OTEL_EXPORTER_OTLP_ENDPOINT) - OTEL_EXPORTER_OTLP_METRICS_PROTOCOL: Metrics export protocol (falls back to + OTEL_EXPORTER_OTLP_METRICS_PROTOCOL: Export protocol (falls back to OTEL_EXPORTER_OTLP_PROTOCOL; default: grpc) OTEL_EXPORTER_OTLP_ENDPOINT: General OTLP endpoint URL - OTEL_EXPORTER_OTLP_HEADERS: Optional headers for authentication - OTEL_SERVICE_NAME: Service name for metrics (default: agentex) + OTEL_EXPORTER_OTLP_HEADERS: Passed through by OTLP exporters when set + OTEL_SERVICE_NAME: Service name (default: agentex) OTEL_METRICS_EXPORT_INTERVAL_MS: Export interval in ms (default: 30000) """ from __future__ import annotations +import logging import os from typing import TYPE_CHECKING +_auto_instrumentation_bootstrapped = False + +_bootstrap_log = logging.getLogger(__name__) + + +def bootstrap_auto_instrumentation() -> bool: + """Call ``initialize()`` once per process when auto-instrumentation is available. + + Import ``otel_metrics`` before any auto-instrumented library (FastAPI, httpx, + SQLAlchemy, etc.) — instrumentors patch at import time. In ``app.py`` this + must be the first import so bootstrap runs in each uvicorn spawn worker. + + Runs when: contrib packages are installed (no ``ImportError``). + Skips when: already bootstrapped in this process, or packages absent. 
+ + Export config, ``OTEL_SDK_DISABLED``, and disabled instrumentations are + handled inside ``initialize()`` — not gated here. Custom app metrics use + ``init_otel_metrics()`` separately. + + Returns: + True if ``initialize()`` ran; False if skipped. + """ + global _auto_instrumentation_bootstrapped + + if _auto_instrumentation_bootstrapped: + return False + _auto_instrumentation_bootstrapped = True + + try: + from opentelemetry.instrumentation.auto_instrumentation import initialize + except ImportError: + return False + + initialize() + _bootstrap_log.debug( + "OpenTelemetry auto-instrumentation bootstrapped (pid=%s)", + os.getpid(), + ) + return True + + +bootstrap_auto_instrumentation() + from opentelemetry import metrics from opentelemetry.metrics import NoOpMeterProvider from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import ( diff --git a/agentex/tests/unit/utils/test_otel_metrics.py b/agentex/tests/unit/utils/test_otel_metrics.py index 22b0c974..11c1dea4 100644 --- a/agentex/tests/unit/utils/test_otel_metrics.py +++ b/agentex/tests/unit/utils/test_otel_metrics.py @@ -2,6 +2,7 @@ from __future__ import annotations +import os from unittest.mock import patch import pytest @@ -37,15 +38,61 @@ def _set_global_meter_provider(provider: object | None = None) -> None: def reset_otel_metrics_state(): """Reset module and global OTel state between tests.""" saved_provider = metrics.get_meter_provider() + saved_bootstrap = otel_metrics._auto_instrumentation_bootstrapped otel_metrics.shutdown_otel_metrics() + otel_metrics._auto_instrumentation_bootstrapped = False _set_global_meter_provider() yield otel_metrics.shutdown_otel_metrics() + otel_metrics._auto_instrumentation_bootstrapped = saved_bootstrap _set_global_meter_provider(saved_provider) +@pytest.mark.unit +def test_bootstrap_skips_when_auto_instrumentation_not_installed(monkeypatch): + monkeypatch.delenv("OTEL_SDK_DISABLED", raising=False) + + import builtins + + real_import = builtins.__import__ + + 
def fake_import(name, globals=None, locals=None, fromlist=(), level=0): + if name == "opentelemetry.instrumentation.auto_instrumentation": + raise ImportError(name) + return real_import(name, globals, locals, fromlist, level) + + with patch.object(builtins, "__import__", side_effect=fake_import): + assert otel_metrics.bootstrap_auto_instrumentation() is False + + +@pytest.mark.unit +def test_bootstrap_runs_without_otlp_env(monkeypatch): + for key in list(os.environ): + if key.startswith("OTEL_EXPORTER_OTLP") and key.endswith("_ENDPOINT"): + monkeypatch.delenv(key, raising=False) + monkeypatch.delenv("OTEL_SDK_DISABLED", raising=False) + + with patch( + "opentelemetry.instrumentation.auto_instrumentation.initialize" + ) as initialize: + assert otel_metrics.bootstrap_auto_instrumentation() is True + initialize.assert_called_once() + + +@pytest.mark.unit +def test_bootstrap_calls_initialize_when_packages_available(monkeypatch): + monkeypatch.delenv("OTEL_SDK_DISABLED", raising=False) + + with patch( + "opentelemetry.instrumentation.auto_instrumentation.initialize" + ) as initialize: + assert otel_metrics.bootstrap_auto_instrumentation() is True + initialize.assert_called_once() + assert otel_metrics.bootstrap_auto_instrumentation() is False + + def _set_operator_provider() -> MeterProvider: provider = MeterProvider(resource=Resource.create({})) _set_global_meter_provider(provider) From 47658dc4782f3392dd74f02f5567f3743ed95ab3 Mon Sep 17 00:00:00 2001 From: James Cardenas Date: Thu, 11 Jun 2026 19:13:23 -0700 Subject: [PATCH 2/8] Document ddtrace-run + otel coexistence --- agentex/src/utils/otel_metrics.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/agentex/src/utils/otel_metrics.py b/agentex/src/utils/otel_metrics.py index 934ea7a5..9a805dc4 100644 --- a/agentex/src/utils/otel_metrics.py +++ b/agentex/src/utils/otel_metrics.py @@ -13,6 +13,14 @@ ``MeterProvider`` from bootstrap/operator when present; otherwise creates a standalone OTLP pipeline when an 
endpoint is configured. +**Datadog ``ddtrace-run`` coexistence:** Neither OTel nor ddtrace detects the other's +FastAPI patches. If both run in one process, ddtrace wraps the middleware stack +first; OTel skips ``OpenTelemetryMiddleware`` with "unexpected middleware stack" +and HTTP OTel metrics/traces are not emitted. Helm avoids this by using +``ddtrace-run`` only when ``datadog.env`` is set (OTel-only otherwise). If both +are required, set ``DD_TRACE_FASTAPI_ENABLED=false`` and +``DD_TRACE_STARLETTE_ENABLED=false`` so OTel owns HTTP instrumentation. + Environment variables (custom metrics / standalone mode): OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: Metrics endpoint (falls back to OTEL_EXPORTER_OTLP_ENDPOINT) From 5ccdeee7158e59938c521803e45561fad27b6e96 Mon Sep 17 00:00:00 2001 From: James Cardenas Date: Thu, 11 Jun 2026 22:05:21 -0700 Subject: [PATCH 3/8] fix(agentex): assign per-worker service.instance.id for OTel metrics --- agentex/src/utils/otel_metrics.py | 76 ++++++++++++++++--- agentex/tests/unit/utils/test_otel_metrics.py | 76 ++++++++++++++++++- 2 files changed, 139 insertions(+), 13 deletions(-) diff --git a/agentex/src/utils/otel_metrics.py b/agentex/src/utils/otel_metrics.py index 9a805dc4..bd67ab71 100644 --- a/agentex/src/utils/otel_metrics.py +++ b/agentex/src/utils/otel_metrics.py @@ -21,6 +21,17 @@ are required, set ``DD_TRACE_FASTAPI_ENABLED=false`` and ``DD_TRACE_STARLETTE_ENABLED=false`` so OTel owns HTTP instrumentation. +**Per-worker ``service.instance.id``:** Uvicorn spawn workers share pod-level +``OTEL_RESOURCE_ATTRIBUTES``, so auto-instrumentation would otherwise emit all +workers on the same metric timeseries (see `OTel #4390 +`_). +``bootstrap_auto_instrumentation()`` appends ``.<pid>`` to ``service.instance.id`` +in ``OTEL_RESOURCE_ATTRIBUTES`` before ``initialize()``; standalone +``init_otel_metrics()`` applies the same via ``Resource.merge``. 
With +``--workers 1``, operator ``sitecustomize`` may have already called +``initialize()``; bootstrap calls it again (OTel providers and instrumentors +are set-once; duplicate calls only produce log warnings). + Environment variables (custom metrics / standalone mode): OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: Metrics endpoint (falls back to OTEL_EXPORTER_OTLP_ENDPOINT) @@ -38,11 +49,53 @@ import os from typing import TYPE_CHECKING +from opentelemetry.sdk.resources import ( + OTELResourceDetector, + Resource, + get_aggregated_resources, +) + _auto_instrumentation_bootstrapped = False _bootstrap_log = logging.getLogger(__name__) +def _unique_instance_id(resource: Resource) -> str: + """Worker-unique service.instance.id (OTel #4390).""" + pid = os.getpid() + existing = resource.attributes.get("service.instance.id") + if existing: + existing = str(existing) + suffix = f".{pid}" + return existing if existing.endswith(suffix) else f"{existing}{suffix}" + service = ( + resource.attributes.get("service.name") + or os.environ.get("OTEL_SERVICE_NAME") + or "unknown" + ) + pod = resource.attributes.get("k8s.pod.name") or "unknown" + return f"{service}.{pod}.{pid}" + + +def _resource_with_unique_instance_id() -> Resource: + resource = get_aggregated_resources([OTELResourceDetector()]) + return resource.merge( + Resource.create({"service.instance.id": _unique_instance_id(resource)}) + ) + + +def _sync_instance_id_to_env(instance_id: str) -> None: + """Write service.instance.id into OTEL_RESOURCE_ATTRIBUTES for auto-instrumentation.""" + key = "service.instance.id" + parts = [ + part.strip() + for part in os.environ.get("OTEL_RESOURCE_ATTRIBUTES", "").split(",") + if part.strip() and not part.strip().startswith(f"{key}=") + ] + parts.append(f"{key}={instance_id}") + os.environ["OTEL_RESOURCE_ATTRIBUTES"] = ",".join(parts) + + def bootstrap_auto_instrumentation() -> bool: """Call ``initialize()`` once per process when auto-instrumentation is available. 
@@ -71,6 +124,9 @@ def bootstrap_auto_instrumentation() -> bool: except ImportError: return False + _sync_instance_id_to_env( + _unique_instance_id(get_aggregated_resources([OTELResourceDetector()])) + ) initialize() _bootstrap_log.debug( "OpenTelemetry auto-instrumentation bootstrapped (pid=%s)", @@ -91,7 +147,7 @@ def bootstrap_auto_instrumentation() -> bool: ) from opentelemetry.sdk.metrics import MeterProvider from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader -from opentelemetry.sdk.resources import SERVICE_NAME, SERVICE_VERSION, Resource +from opentelemetry.sdk.resources import SERVICE_NAME, SERVICE_VERSION from src.utils.logging import make_logger @@ -203,14 +259,16 @@ def init_otel_metrics( ) ) ) - resource = Resource.create( - { - SERVICE_NAME: resolved_service_name, - SERVICE_VERSION: service_version - or os.environ.get("SERVICE_VERSION", "0.1.0"), - "deployment.environment": environment - or os.environ.get("ENVIRONMENT", "development"), - } + resource = _resource_with_unique_instance_id().merge( + Resource.create( + { + SERVICE_NAME: resolved_service_name, + SERVICE_VERSION: service_version + or os.environ.get("SERVICE_VERSION", "0.1.0"), + "deployment.environment": environment + or os.environ.get("ENVIRONMENT", "development"), + } + ) ) reader = PeriodicExportingMetricReader( exporter=_create_metric_exporter(endpoint, protocol), diff --git a/agentex/tests/unit/utils/test_otel_metrics.py b/agentex/tests/unit/utils/test_otel_metrics.py index 11c1dea4..04f809c7 100644 --- a/agentex/tests/unit/utils/test_otel_metrics.py +++ b/agentex/tests/unit/utils/test_otel_metrics.py @@ -15,7 +15,7 @@ ) from opentelemetry.sdk.metrics import MeterProvider from opentelemetry.sdk.metrics.export import InMemoryMetricReader -from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.resources import OTELResourceDetector, Resource, get_aggregated_resources from src.utils import otel_metrics from src.utils import cache_metrics @@ -85,14 
+85,82 @@ def test_bootstrap_runs_without_otlp_env(monkeypatch): def test_bootstrap_calls_initialize_when_packages_available(monkeypatch): monkeypatch.delenv("OTEL_SDK_DISABLED", raising=False) - with patch( - "opentelemetry.instrumentation.auto_instrumentation.initialize" - ) as initialize: + with ( + patch.object(otel_metrics, "_sync_instance_id_to_env") as sync_env, + patch( + "opentelemetry.instrumentation.auto_instrumentation.initialize" + ) as initialize, + ): assert otel_metrics.bootstrap_auto_instrumentation() is True + sync_env.assert_called_once() initialize.assert_called_once() assert otel_metrics.bootstrap_auto_instrumentation() is False +@pytest.mark.unit +def test_unique_instance_id_extends_operator_value(monkeypatch): + monkeypatch.setenv("OTEL_SERVICE_NAME", "agentex") + monkeypatch.setenv( + "OTEL_RESOURCE_ATTRIBUTES", + "k8s.pod.name=my-pod,service.instance.id=agentex.my-pod.agentex", + ) + monkeypatch.setattr(otel_metrics.os, "getpid", lambda: 42) + base = get_aggregated_resources([OTELResourceDetector()]) + assert otel_metrics._unique_instance_id(base) == "agentex.my-pod.agentex.42" + + +@pytest.mark.unit +def test_unique_instance_id_builds_when_missing(monkeypatch): + monkeypatch.setenv("OTEL_SERVICE_NAME", "agentex") + monkeypatch.setenv("OTEL_RESOURCE_ATTRIBUTES", "k8s.pod.name=my-pod") + monkeypatch.setattr(otel_metrics.os, "getpid", lambda: 42) + base = get_aggregated_resources([OTELResourceDetector()]) + assert otel_metrics._unique_instance_id(base) == "agentex.my-pod.42" + + +@pytest.mark.unit +def test_resource_with_unique_instance_id_does_not_mutate_env(monkeypatch): + monkeypatch.setenv("OTEL_SERVICE_NAME", "agentex") + original = "k8s.pod.name=my-pod,service.instance.id=agentex.my-pod.agentex" + monkeypatch.setenv("OTEL_RESOURCE_ATTRIBUTES", original) + monkeypatch.setattr(otel_metrics.os, "getpid", lambda: 42) + otel_metrics._resource_with_unique_instance_id() + assert os.environ["OTEL_RESOURCE_ATTRIBUTES"] == original + + 
+@pytest.mark.unit +def test_sync_instance_id_to_env_updates_env(monkeypatch): + monkeypatch.setenv("OTEL_SERVICE_NAME", "agentex") + monkeypatch.setenv( + "OTEL_RESOURCE_ATTRIBUTES", + "k8s.pod.name=operator-pod,service.instance.id=agentex.operator-pod.agentex", + ) + monkeypatch.setattr(otel_metrics.os, "getpid", lambda: 6789) + + otel_metrics._sync_instance_id_to_env("agentex.operator-pod.agentex.6789") + + env = os.environ["OTEL_RESOURCE_ATTRIBUTES"] + assert "service.instance.id=agentex.operator-pod.agentex.6789" in env + assert "k8s.pod.name=operator-pod" in env + + +@pytest.mark.unit +def test_resource_with_unique_instance_id_from_otel_env(monkeypatch): + monkeypatch.setenv("OTEL_SERVICE_NAME", "agentex") + monkeypatch.setenv( + "OTEL_RESOURCE_ATTRIBUTES", + "k8s.pod.name=operator-pod,k8s.namespace.name=agentex," + "k8s.deployment.name=agentex,service.instance.id=agentex.operator-pod.agentex", + ) + monkeypatch.setattr(otel_metrics.os, "getpid", lambda: 6789) + + resource = otel_metrics._resource_with_unique_instance_id() + attrs = resource.attributes + assert attrs.get("service.name") == "agentex" + assert attrs.get("k8s.pod.name") == "operator-pod" + assert attrs.get("service.instance.id") == "agentex.operator-pod.agentex.6789" + + def _set_operator_provider() -> MeterProvider: provider = MeterProvider(resource=Resource.create({})) _set_global_meter_provider(provider) From 40d74584ff9cb8bd28dadeb299b762a2da8d90c5 Mon Sep 17 00:00:00 2001 From: James Cardenas Date: Thu, 11 Jun 2026 22:09:06 -0700 Subject: [PATCH 4/8] Removed the ineffective NoOpMeterProvider reset --- agentex/src/utils/otel_metrics.py | 6 ------ agentex/tests/unit/utils/test_otel_metrics.py | 20 ++++++++++++++++++- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/agentex/src/utils/otel_metrics.py b/agentex/src/utils/otel_metrics.py index bd67ab71..941fa82e 100644 --- a/agentex/src/utils/otel_metrics.py +++ b/agentex/src/utils/otel_metrics.py @@ -138,7 +138,6 @@ def 
bootstrap_auto_instrumentation() -> bool: bootstrap_auto_instrumentation() from opentelemetry import metrics -from opentelemetry.metrics import NoOpMeterProvider from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import ( OTLPMetricExporter as OTLPGrpcMetricExporter, ) @@ -326,11 +325,6 @@ def shutdown_otel_metrics() -> None: except Exception: logger.exception("OpenTelemetry metrics shutdown failed") finally: - if _meter_provider is not None: - try: - metrics.set_meter_provider(NoOpMeterProvider()) - except Exception: - logger.exception("Failed to reset global MeterProvider after shutdown") _meter_provider = None _initialized = False diff --git a/agentex/tests/unit/utils/test_otel_metrics.py b/agentex/tests/unit/utils/test_otel_metrics.py index 04f809c7..25e32ee3 100644 --- a/agentex/tests/unit/utils/test_otel_metrics.py +++ b/agentex/tests/unit/utils/test_otel_metrics.py @@ -353,14 +353,32 @@ def test_init_after_shutdown_in_standalone_mode(monkeypatch): first = otel_metrics.init_otel_metrics() assert first is not None + assert otel_metrics._meter_provider is first + otel_metrics.shutdown_otel_metrics() + assert otel_metrics._initialized is False + assert otel_metrics._meter_provider is None second = otel_metrics.init_otel_metrics() assert second is not None - assert second is not first + assert otel_metrics._initialized is True assert otel_metrics.get_meter("agentex.test") is not None +@pytest.mark.unit +def test_shutdown_does_not_replace_global_meter_provider(monkeypatch): + monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317") + provider = otel_metrics.init_otel_metrics() + assert provider is not None + global_before = metrics.get_meter_provider() + + with patch.object(metrics, "set_meter_provider") as set_provider: + otel_metrics.shutdown_otel_metrics() + + set_provider.assert_not_called() + assert metrics.get_meter_provider() is global_before + + @pytest.mark.unit def 
test_shutdown_resets_state_when_provider_shutdown_raises(monkeypatch): monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317") From a97a5897373b00f775bddc6432fb76a7956c29e6 Mon Sep 17 00:00:00 2001 From: James Cardenas Date: Thu, 11 Jun 2026 22:29:21 -0700 Subject: [PATCH 5/8] Fix greptile concerns --- agentex/src/utils/otel_metrics.py | 24 ++++++--- agentex/tests/unit/utils/test_otel_metrics.py | 51 +++++++++++++++---- 2 files changed, 58 insertions(+), 17 deletions(-) diff --git a/agentex/src/utils/otel_metrics.py b/agentex/src/utils/otel_metrics.py index 941fa82e..859c77fa 100644 --- a/agentex/src/utils/otel_metrics.py +++ b/agentex/src/utils/otel_metrics.py @@ -104,30 +104,40 @@ def bootstrap_auto_instrumentation() -> bool: must be the first import so bootstrap runs in each uvicorn spawn worker. Runs when: contrib packages are installed (no ``ImportError``). - Skips when: already bootstrapped in this process, or packages absent. + Skips when: bootstrap already succeeded in this process. + On ``ImportError`` or ``initialize()`` failure, returns False and leaves + the flag unset so a later call can retry. Export config, ``OTEL_SDK_DISABLED``, and disabled instrumentations are handled inside ``initialize()`` — not gated here. Custom app metrics use ``init_otel_metrics()`` separately. Returns: - True if ``initialize()`` ran; False if skipped. + True if ``initialize()`` completed; False if skipped or failed. 
""" global _auto_instrumentation_bootstrapped if _auto_instrumentation_bootstrapped: return False - _auto_instrumentation_bootstrapped = True try: from opentelemetry.instrumentation.auto_instrumentation import initialize except ImportError: return False - _sync_instance_id_to_env( - _unique_instance_id(get_aggregated_resources([OTELResourceDetector()])) - ) - initialize() + try: + _sync_instance_id_to_env( + _unique_instance_id(get_aggregated_resources([OTELResourceDetector()])) + ) + initialize() + except Exception: + _bootstrap_log.warning( + "OpenTelemetry auto-instrumentation bootstrap failed; continuing without it", + exc_info=True, + ) + return False + + _auto_instrumentation_bootstrapped = True _bootstrap_log.debug( "OpenTelemetry auto-instrumentation bootstrapped (pid=%s)", os.getpid(), diff --git a/agentex/tests/unit/utils/test_otel_metrics.py b/agentex/tests/unit/utils/test_otel_metrics.py index 25e32ee3..038c7037 100644 --- a/agentex/tests/unit/utils/test_otel_metrics.py +++ b/agentex/tests/unit/utils/test_otel_metrics.py @@ -2,7 +2,10 @@ from __future__ import annotations +import builtins import os +from types import ModuleType +from unittest.mock import MagicMock from unittest.mock import patch import pytest @@ -34,6 +37,23 @@ def _set_global_meter_provider(provider: object | None = None) -> None: pytest.skip(f"OpenTelemetry SDK internals changed: {exc}") +def _fake_auto_instrumentation_import( + initialize: MagicMock | None = None, +) -> tuple[MagicMock, patch]: + """Inject a fake auto_instrumentation module (no contrib deps required).""" + mock_initialize = initialize or MagicMock() + real_import = builtins.__import__ + + def fake_import(name, globals=None, locals=None, fromlist=(), level=0): + if name == "opentelemetry.instrumentation.auto_instrumentation": + mod = ModuleType(name) + mod.initialize = mock_initialize + return mod + return real_import(name, globals, locals, fromlist, level) + + return mock_initialize, patch.object(builtins, 
"__import__", side_effect=fake_import) + + @pytest.fixture(autouse=True) def reset_otel_metrics_state(): """Reset module and global OTel state between tests.""" @@ -54,8 +74,6 @@ def reset_otel_metrics_state(): def test_bootstrap_skips_when_auto_instrumentation_not_installed(monkeypatch): monkeypatch.delenv("OTEL_SDK_DISABLED", raising=False) - import builtins - real_import = builtins.__import__ def fake_import(name, globals=None, locals=None, fromlist=(), level=0): @@ -65,6 +83,8 @@ def fake_import(name, globals=None, locals=None, fromlist=(), level=0): with patch.object(builtins, "__import__", side_effect=fake_import): assert otel_metrics.bootstrap_auto_instrumentation() is False + assert otel_metrics._auto_instrumentation_bootstrapped is False + assert otel_metrics.bootstrap_auto_instrumentation() is False @pytest.mark.unit @@ -74,27 +94,38 @@ def test_bootstrap_runs_without_otlp_env(monkeypatch): monkeypatch.delenv(key, raising=False) monkeypatch.delenv("OTEL_SDK_DISABLED", raising=False) - with patch( - "opentelemetry.instrumentation.auto_instrumentation.initialize" - ) as initialize: + mock_initialize, import_patch = _fake_auto_instrumentation_import() + with import_patch: assert otel_metrics.bootstrap_auto_instrumentation() is True - initialize.assert_called_once() + mock_initialize.assert_called_once() @pytest.mark.unit def test_bootstrap_calls_initialize_when_packages_available(monkeypatch): monkeypatch.delenv("OTEL_SDK_DISABLED", raising=False) + mock_initialize, import_patch = _fake_auto_instrumentation_import() with ( + import_patch, patch.object(otel_metrics, "_sync_instance_id_to_env") as sync_env, - patch( - "opentelemetry.instrumentation.auto_instrumentation.initialize" - ) as initialize, ): assert otel_metrics.bootstrap_auto_instrumentation() is True sync_env.assert_called_once() - initialize.assert_called_once() + mock_initialize.assert_called_once() + assert otel_metrics.bootstrap_auto_instrumentation() is False + + +@pytest.mark.unit +def 
test_bootstrap_initialize_failure_returns_false(monkeypatch): + monkeypatch.delenv("OTEL_SDK_DISABLED", raising=False) + + mock_initialize = MagicMock(side_effect=RuntimeError("boom")) + _, import_patch = _fake_auto_instrumentation_import(mock_initialize) + with import_patch: + assert otel_metrics.bootstrap_auto_instrumentation() is False + assert otel_metrics._auto_instrumentation_bootstrapped is False assert otel_metrics.bootstrap_auto_instrumentation() is False + assert mock_initialize.call_count == 2 @pytest.mark.unit From 96cdec21ad3232f29763cdfa4cd4b16589f79352 Mon Sep 17 00:00:00 2001 From: James Cardenas Date: Thu, 11 Jun 2026 22:34:58 -0700 Subject: [PATCH 6/8] Move imports cleanup --- agentex/src/utils/otel_metrics.py | 53 +++++++++++++++---------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/agentex/src/utils/otel_metrics.py b/agentex/src/utils/otel_metrics.py index 859c77fa..d6acfd42 100644 --- a/agentex/src/utils/otel_metrics.py +++ b/agentex/src/utils/otel_metrics.py @@ -49,15 +49,39 @@ import os from typing import TYPE_CHECKING +from opentelemetry import metrics +from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import ( + OTLPMetricExporter as OTLPGrpcMetricExporter, +) +from opentelemetry.exporter.otlp.proto.http.metric_exporter import ( + OTLPMetricExporter as OTLPHttpMetricExporter, +) +from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader from opentelemetry.sdk.resources import ( OTELResourceDetector, Resource, + SERVICE_NAME, + SERVICE_VERSION, get_aggregated_resources, ) -_auto_instrumentation_bootstrapped = False +from src.utils.logging import make_logger + +if TYPE_CHECKING: + from opentelemetry.metrics import Meter + from opentelemetry.sdk.metrics.export import MetricExporter _bootstrap_log = logging.getLogger(__name__) +logger = make_logger(__name__) + +_auto_instrumentation_bootstrapped = False + +_meter_provider: MeterProvider | 
None = None # Set only when this module creates the provider +_initialized: bool = False + +DEFAULT_SERVICE_NAME = "agentex" +DEFAULT_EXPORT_INTERVAL_MS = 30000 # 30 seconds def _unique_instance_id(resource: Resource) -> str: @@ -147,33 +171,6 @@ def bootstrap_auto_instrumentation() -> bool: bootstrap_auto_instrumentation() -from opentelemetry import metrics -from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import ( - OTLPMetricExporter as OTLPGrpcMetricExporter, -) -from opentelemetry.exporter.otlp.proto.http.metric_exporter import ( - OTLPMetricExporter as OTLPHttpMetricExporter, -) -from opentelemetry.sdk.metrics import MeterProvider -from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader -from opentelemetry.sdk.resources import SERVICE_NAME, SERVICE_VERSION - -from src.utils.logging import make_logger - -if TYPE_CHECKING: - from opentelemetry.metrics import Meter - from opentelemetry.sdk.metrics.export import MetricExporter - -logger = make_logger(__name__) - -# Global state -_meter_provider: MeterProvider | None = None # Set only when this module creates the provider -_initialized: bool = False - -# Default configuration -DEFAULT_SERVICE_NAME = "agentex" -DEFAULT_EXPORT_INTERVAL_MS = 30000 # 30 seconds - def _global_meter_provider() -> MeterProvider | None: """Return the global MeterProvider if installed, else None (proxy is ignored).""" From 8dce22f274af1f5060abba262afb3fd39538598d Mon Sep 17 00:00:00 2001 From: James Cardenas Date: Thu, 11 Jun 2026 22:42:14 -0700 Subject: [PATCH 7/8] refactor(agentex): polish otel_metrics module structure and logging --- agentex/src/api/app.py | 3 +- agentex/src/utils/otel_metrics.py | 37 +++++++++++++------ agentex/tests/unit/utils/test_otel_metrics.py | 25 ++++++++----- 3 files changed, 43 insertions(+), 22 deletions(-) diff --git a/agentex/src/api/app.py b/agentex/src/api/app.py index 580e45b5..1c454fd6 100644 --- a/agentex/src/api/app.py +++ b/agentex/src/api/app.py @@ -1,4 +1,5 @@ -# First 
import: bootstrap auto-instrumentation before any instrumented library. +# Importing this module runs OTel auto-instrumentation bootstrap at load time +# (before FastAPI and other instrumented libraries are imported below). from src.utils.otel_metrics import init_otel_metrics, shutdown_otel_metrics import os diff --git a/agentex/src/utils/otel_metrics.py b/agentex/src/utils/otel_metrics.py index d6acfd42..639c4ba4 100644 --- a/agentex/src/utils/otel_metrics.py +++ b/agentex/src/utils/otel_metrics.py @@ -45,7 +45,6 @@ from __future__ import annotations -import logging import os from typing import TYPE_CHECKING @@ -72,11 +71,10 @@ from opentelemetry.metrics import Meter from opentelemetry.sdk.metrics.export import MetricExporter -_bootstrap_log = logging.getLogger(__name__) logger = make_logger(__name__) +# Module state _auto_instrumentation_bootstrapped = False - _meter_provider: MeterProvider | None = None # Set only when this module creates the provider _initialized: bool = False @@ -84,6 +82,14 @@ DEFAULT_EXPORT_INTERVAL_MS = 30000 # 30 seconds +def _detected_resource() -> Resource: + """Resource attributes from OTEL_* env (operator-injected or local).""" + return get_aggregated_resources([OTELResourceDetector()]) + + +# --- Resource identity (shared by bootstrap and standalone metrics) --- + + def _unique_instance_id(resource: Resource) -> str: """Worker-unique service.instance.id (OTel #4390).""" pid = os.getpid() @@ -102,7 +108,7 @@ def _unique_instance_id(resource: Resource) -> str: def _resource_with_unique_instance_id() -> Resource: - resource = get_aggregated_resources([OTELResourceDetector()]) + resource = _detected_resource() return resource.merge( Resource.create({"service.instance.id": _unique_instance_id(resource)}) ) @@ -120,6 +126,9 @@ def _sync_instance_id_to_env(instance_id: str) -> None: os.environ["OTEL_RESOURCE_ATTRIBUTES"] = ",".join(parts) +# --- Auto-instrumentation bootstrap --- + + def bootstrap_auto_instrumentation() -> bool: """Call 
``initialize()`` once per process when auto-instrumentation is available. @@ -150,28 +159,30 @@ def bootstrap_auto_instrumentation() -> bool: return False try: - _sync_instance_id_to_env( - _unique_instance_id(get_aggregated_resources([OTELResourceDetector()])) - ) + _sync_instance_id_to_env(_unique_instance_id(_detected_resource())) initialize() except Exception: - _bootstrap_log.warning( + logger.warning( "OpenTelemetry auto-instrumentation bootstrap failed; continuing without it", exc_info=True, ) return False _auto_instrumentation_bootstrapped = True - _bootstrap_log.debug( + logger.debug( "OpenTelemetry auto-instrumentation bootstrapped (pid=%s)", os.getpid(), ) return True +# Runs at import time so uvicorn spawn workers bootstrap before instrumented libs load. bootstrap_auto_instrumentation() +# --- Custom application metrics --- + + def _global_meter_provider() -> MeterProvider | None: """Return the global MeterProvider if installed, else None (proxy is ignored).""" provider = metrics.get_meter_provider() @@ -290,9 +301,11 @@ def init_otel_metrics( _meter_provider = provider _initialized = True logger.info( - f"OpenTelemetry metrics initialized: endpoint={endpoint}, " - f"protocol={protocol}, service={resolved_service_name}, " - f"interval={resolved_export_interval_ms}ms" + "OpenTelemetry metrics initialized: endpoint=%s, protocol=%s, service=%s, interval=%sms", + endpoint, + protocol, + resolved_service_name, + resolved_export_interval_ms, ) return _meter_provider diff --git a/agentex/tests/unit/utils/test_otel_metrics.py b/agentex/tests/unit/utils/test_otel_metrics.py index 038c7037..8efe8dbd 100644 --- a/agentex/tests/unit/utils/test_otel_metrics.py +++ b/agentex/tests/unit/utils/test_otel_metrics.py @@ -4,7 +4,9 @@ import builtins import os +from contextlib import AbstractContextManager from types import ModuleType +from typing import Any from unittest.mock import MagicMock from unittest.mock import patch @@ -39,7 +41,7 @@ def 
_set_global_meter_provider(provider: object | None = None) -> None: def _fake_auto_instrumentation_import( initialize: MagicMock | None = None, -) -> tuple[MagicMock, patch]: +) -> tuple[MagicMock, AbstractContextManager[Any]]: """Inject a fake auto_instrumentation module (no contrib deps required).""" mock_initialize = initialize or MagicMock() real_import = builtins.__import__ @@ -54,6 +56,18 @@ def fake_import(name, globals=None, locals=None, fromlist=(), level=0): return mock_initialize, patch.object(builtins, "__import__", side_effect=fake_import) +def _block_auto_instrumentation_import() -> AbstractContextManager[Any]: + """Simulate missing opentelemetry-instrumentation contrib packages.""" + real_import = builtins.__import__ + + def fake_import(name, globals=None, locals=None, fromlist=(), level=0): + if name == "opentelemetry.instrumentation.auto_instrumentation": + raise ImportError(name) + return real_import(name, globals, locals, fromlist, level) + + return patch.object(builtins, "__import__", side_effect=fake_import) + + @pytest.fixture(autouse=True) def reset_otel_metrics_state(): """Reset module and global OTel state between tests.""" @@ -74,14 +88,7 @@ def reset_otel_metrics_state(): def test_bootstrap_skips_when_auto_instrumentation_not_installed(monkeypatch): monkeypatch.delenv("OTEL_SDK_DISABLED", raising=False) - real_import = builtins.__import__ - - def fake_import(name, globals=None, locals=None, fromlist=(), level=0): - if name == "opentelemetry.instrumentation.auto_instrumentation": - raise ImportError(name) - return real_import(name, globals, locals, fromlist, level) - - with patch.object(builtins, "__import__", side_effect=fake_import): + with _block_auto_instrumentation_import(): assert otel_metrics.bootstrap_auto_instrumentation() is False assert otel_metrics._auto_instrumentation_bootstrapped is False assert otel_metrics.bootstrap_auto_instrumentation() is False From 941708ee3ea1ea921b2c532cb83f8d66ed898ce0 Mon Sep 17 00:00:00 2001 From: 
James Cardenas Date: Fri, 12 Jun 2026 01:00:36 -0700 Subject: [PATCH 8/8] Call OTel bootstrap explicitly from app.py and fix standalone instance id --- agentex/src/api/app.py | 14 ++++-- agentex/src/utils/otel_metrics.py | 50 +++++++------------ agentex/tests/unit/utils/test_otel_metrics.py | 40 +++++++-------- 3 files changed, 48 insertions(+), 56 deletions(-) diff --git a/agentex/src/api/app.py b/agentex/src/api/app.py index 1c454fd6..41a9eada 100644 --- a/agentex/src/api/app.py +++ b/agentex/src/api/app.py @@ -1,6 +1,14 @@ -# Importing this module runs OTel auto-instrumentation bootstrap at load time -# (before FastAPI and other instrumented libraries are imported below). -from src.utils.otel_metrics import init_otel_metrics, shutdown_otel_metrics +# ruff: noqa: E402 +# E402 suppressed: bootstrap_auto_instrumentation() must run before imports of +# auto-instrumented libraries (FastAPI, httpx, SQLAlchemy, etc.). + +from src.utils.otel_metrics import ( + bootstrap_auto_instrumentation, + init_otel_metrics, + shutdown_otel_metrics, +) + +bootstrap_auto_instrumentation() import os from contextlib import asynccontextmanager diff --git a/agentex/src/utils/otel_metrics.py b/agentex/src/utils/otel_metrics.py index 639c4ba4..f286e9f1 100644 --- a/agentex/src/utils/otel_metrics.py +++ b/agentex/src/utils/otel_metrics.py @@ -3,10 +3,10 @@ Two responsibilities: -1. **Auto-instrumentation** — ``bootstrap_auto_instrumentation()`` runs at import - (keep ``otel_metrics`` first in ``app.py``, before any auto-instrumented - library) so ``initialize()`` executes in each uvicorn spawn worker when - contrib packages are installed. +1. **Auto-instrumentation** — call ``bootstrap_auto_instrumentation()`` from + ``app.py`` before importing FastAPI or other auto-instrumented libraries so + ``initialize()`` runs in each uvicorn spawn worker when contrib packages + are installed. 2. 
**Custom app metrics** — ``init_otel_metrics()`` registers Agentex instruments (``auth_cache_*``, ``db_*``, etc.). Attaches to an existing global @@ -58,10 +58,10 @@ from opentelemetry.sdk.metrics import MeterProvider from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader from opentelemetry.sdk.resources import ( - OTELResourceDetector, - Resource, SERVICE_NAME, SERVICE_VERSION, + OTELResourceDetector, + Resource, get_aggregated_resources, ) @@ -87,9 +87,6 @@ def _detected_resource() -> Resource: return get_aggregated_resources([OTELResourceDetector()]) -# --- Resource identity (shared by bootstrap and standalone metrics) --- - - def _unique_instance_id(resource: Resource) -> str: """Worker-unique service.instance.id (OTel #4390).""" pid = os.getpid() @@ -107,13 +104,6 @@ def _unique_instance_id(resource: Resource) -> str: return f"{service}.{pod}.{pid}" -def _resource_with_unique_instance_id() -> Resource: - resource = _detected_resource() - return resource.merge( - Resource.create({"service.instance.id": _unique_instance_id(resource)}) - ) - - def _sync_instance_id_to_env(instance_id: str) -> None: """Write service.instance.id into OTEL_RESOURCE_ATTRIBUTES for auto-instrumentation.""" key = "service.instance.id" @@ -132,9 +122,9 @@ def _sync_instance_id_to_env(instance_id: str) -> None: def bootstrap_auto_instrumentation() -> bool: """Call ``initialize()`` once per process when auto-instrumentation is available. - Import ``otel_metrics`` before any auto-instrumented library (FastAPI, httpx, - SQLAlchemy, etc.) — instrumentors patch at import time. In ``app.py`` this - must be the first import so bootstrap runs in each uvicorn spawn worker. + Call from ``app.py`` before any auto-instrumented library (FastAPI, httpx, + SQLAlchemy, etc.) — instrumentors patch at import time. Each uvicorn spawn + worker imports ``app.py`` fresh, so one call per worker is enough. Runs when: contrib packages are installed (no ``ImportError``). 
Skips when: bootstrap already succeeded in this process. @@ -176,10 +166,6 @@ def bootstrap_auto_instrumentation() -> bool: return True -# Runs at import time so uvicorn spawn workers bootstrap before instrumented libs load. -bootstrap_auto_instrumentation() - - # --- Custom application metrics --- @@ -276,15 +262,17 @@ def init_otel_metrics( ) ) ) - resource = _resource_with_unique_instance_id().merge( + resource = Resource.create( + { + SERVICE_NAME: resolved_service_name, + SERVICE_VERSION: service_version + or os.environ.get("SERVICE_VERSION", "0.1.0"), + "deployment.environment": environment + or os.environ.get("ENVIRONMENT", "development"), + } + ).merge( Resource.create( - { - SERVICE_NAME: resolved_service_name, - SERVICE_VERSION: service_version - or os.environ.get("SERVICE_VERSION", "0.1.0"), - "deployment.environment": environment - or os.environ.get("ENVIRONMENT", "development"), - } + {"service.instance.id": _unique_instance_id(_detected_resource())} ) ) reader = PeriodicExportingMetricReader( diff --git a/agentex/tests/unit/utils/test_otel_metrics.py b/agentex/tests/unit/utils/test_otel_metrics.py index 8efe8dbd..1893b4ef 100644 --- a/agentex/tests/unit/utils/test_otel_metrics.py +++ b/agentex/tests/unit/utils/test_otel_metrics.py @@ -7,8 +7,7 @@ from contextlib import AbstractContextManager from types import ModuleType from typing import Any -from unittest.mock import MagicMock -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest from opentelemetry import metrics @@ -20,9 +19,12 @@ ) from opentelemetry.sdk.metrics import MeterProvider from opentelemetry.sdk.metrics.export import InMemoryMetricReader -from opentelemetry.sdk.resources import OTELResourceDetector, Resource, get_aggregated_resources -from src.utils import otel_metrics -from src.utils import cache_metrics +from opentelemetry.sdk.resources import ( + OTELResourceDetector, + Resource, + get_aggregated_resources, +) +from src.utils import cache_metrics, 
otel_metrics def _set_global_meter_provider(provider: object | None = None) -> None: @@ -156,16 +158,6 @@ def test_unique_instance_id_builds_when_missing(monkeypatch): assert otel_metrics._unique_instance_id(base) == "agentex.my-pod.42" -@pytest.mark.unit -def test_resource_with_unique_instance_id_does_not_mutate_env(monkeypatch): - monkeypatch.setenv("OTEL_SERVICE_NAME", "agentex") - original = "k8s.pod.name=my-pod,service.instance.id=agentex.my-pod.agentex" - monkeypatch.setenv("OTEL_RESOURCE_ATTRIBUTES", original) - monkeypatch.setattr(otel_metrics.os, "getpid", lambda: 42) - otel_metrics._resource_with_unique_instance_id() - assert os.environ["OTEL_RESOURCE_ATTRIBUTES"] == original - - @pytest.mark.unit def test_sync_instance_id_to_env_updates_env(monkeypatch): monkeypatch.setenv("OTEL_SERVICE_NAME", "agentex") @@ -183,20 +175,24 @@ def test_sync_instance_id_to_env_updates_env(monkeypatch): @pytest.mark.unit -def test_resource_with_unique_instance_id_from_otel_env(monkeypatch): +def test_init_otel_metrics_standalone_resource_has_pid_suffixed_instance_id( + monkeypatch, +): + monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317") monkeypatch.setenv("OTEL_SERVICE_NAME", "agentex") - monkeypatch.setenv( - "OTEL_RESOURCE_ATTRIBUTES", + original = ( "k8s.pod.name=operator-pod,k8s.namespace.name=agentex," - "k8s.deployment.name=agentex,service.instance.id=agentex.operator-pod.agentex", + "k8s.deployment.name=agentex,service.instance.id=agentex.operator-pod.agentex" ) + monkeypatch.setenv("OTEL_RESOURCE_ATTRIBUTES", original) monkeypatch.setattr(otel_metrics.os, "getpid", lambda: 6789) - resource = otel_metrics._resource_with_unique_instance_id() - attrs = resource.attributes + provider = otel_metrics.init_otel_metrics() + assert provider is not None + attrs = provider._sdk_config.resource.attributes assert attrs.get("service.name") == "agentex" - assert attrs.get("k8s.pod.name") == "operator-pod" assert attrs.get("service.instance.id") == 
"agentex.operator-pod.agentex.6789" + assert os.environ["OTEL_RESOURCE_ATTRIBUTES"] == original def _set_operator_provider() -> MeterProvider: