From 1a107d1fea7b93906ac3c62a4dc650c7b690f0a0 Mon Sep 17 00:00:00 2001 From: Abdulrahman Alfozan Date: Tue, 19 May 2026 23:24:23 -0700 Subject: [PATCH] Improve OpenAI Agents SDK instrumentation --- instrumentation/README.md | 2 +- .../.changelog/49.fixed | 1 + .../README.rst | 14 +- .../examples/zero-code/.env.example | 2 +- .../examples/zero-code/requirements.txt | 2 +- .../pyproject.toml | 2 +- .../genai/openai_agents/__init__.py | 2 + .../genai/openai_agents/package.py | 4 +- .../genai/openai_agents/span_processor.py | 659 ++++++++++++------ .../tests/requirements.oldest.txt | 2 +- .../tests/test_conformance.py | 13 +- .../tests/test_tracer.py | 254 ++++++- .../tests/test_z_span_processor_unit.py | 65 +- .../tests/test_zz_coverage_improvements.py | 34 +- uv.lock | 2 +- 15 files changed, 798 insertions(+), 260 deletions(-) create mode 100644 instrumentation/opentelemetry-instrumentation-genai-openai-agents/.changelog/49.fixed diff --git a/instrumentation/README.md b/instrumentation/README.md index 48ee952f..cf80042a 100644 --- a/instrumentation/README.md +++ b/instrumentation/README.md @@ -5,6 +5,6 @@ | [opentelemetry-instrumentation-genai-claude-agent-sdk](./opentelemetry-instrumentation-genai-claude-agent-sdk) | claude-agent-sdk >= 0.1.14 | No | development | [opentelemetry-instrumentation-genai-langchain](./opentelemetry-instrumentation-genai-langchain) | langchain >= 0.3.21 | No | development | [opentelemetry-instrumentation-genai-openai](./opentelemetry-instrumentation-genai-openai) | openai >= 1.26.0 | Yes | development -| [opentelemetry-instrumentation-genai-openai-agents](./opentelemetry-instrumentation-genai-openai-agents) | openai-agents >= 0.3.3 | No | development +| [opentelemetry-instrumentation-genai-openai-agents](./opentelemetry-instrumentation-genai-openai-agents) | openai-agents >= 0.17.0 | Yes | development | [opentelemetry-instrumentation-genai-weaviate-client](./opentelemetry-instrumentation-genai-weaviate-client) | weaviate-client >= 3.0.0,<5.0.0 | No | development | [opentelemetry-instrumentation-google-genai](./opentelemetry-instrumentation-google-genai) | google-genai >= 1.32.0, <3 | No | development \ No newline at end of file diff --git a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/.changelog/49.fixed b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/.changelog/49.fixed new file mode 100644 index 00000000..3c394b61 --- /dev/null +++ b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/.changelog/49.fixed @@ -0,0 +1 @@ +Improve OpenAI Agents SDK instrumentation support for current tracing payloads. diff --git a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/README.rst b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/README.rst index 24a4710d..a769abc2 100644 --- a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/README.rst +++ b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/README.rst @@ -21,8 +21,12 @@ Features * Generates spans for agents, tools, generations, guardrails, and handoffs using the OpenTelemetry GenAI semantic conventions. -* Captures prompts, responses, tool arguments, and system instructions when content - capture is enabled. +* Captures prompts, responses, tool arguments, tool results, and system + instructions when content capture is enabled. +* Handles current Agents SDK response and generation payloads, including response + IDs, models, token usage, and string response inputs. +* Preserves custom span attributes from Agents SDK custom spans, including + sandbox attributes such as ``sandbox.*`` and process exit data. * Publishes duration and token metrics for every operation. * Supports environment overrides so you can configure agent metadata or disable telemetry without code changes. @@ -76,6 +80,12 @@ Configure OpenTelemetry as usual, then call :class:`OpenAIAgentsInstrumentor`. result = Runner.run_sync(assistant, "I'm visiting Barcelona this weekend. How should I pack?") print(result.final_output) +The instrumentation maps Agents SDK spans to OpenTelemetry GenAI semantic +attributes where applicable. Custom Agents SDK spans are preserved without a +placeholder GenAI operation name, and supported ``CustomSpanData.data`` values +are copied to OpenTelemetry attributes, including sandbox attributes such as +``sandbox.*`` and process exit data. + Configuration ------------- diff --git a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/examples/zero-code/.env.example b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/examples/zero-code/.env.example index bc649aeb..d78104f0 100644 --- a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/examples/zero-code/.env.example +++ b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/examples/zero-code/.env.example @@ -2,7 +2,7 @@ # OTEL_GENAI_AGENT_NAME=Travel Concierge # Update this with your real OpenAI API key -OPENAI_API_KEY=sk-YOUR_API_KEY +OPENAI_API_KEY= # Uncomment to use Ollama instead of OpenAI # OPENAI_BASE_URL=http://localhost:11434/v1 diff --git a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/examples/zero-code/requirements.txt b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/examples/zero-code/requirements.txt index 827648b4..dfe1073c 100644 --- a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/examples/zero-code/requirements.txt +++ b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/examples/zero-code/requirements.txt @@ -3,5 +3,5 @@ python-dotenv~=1.0 opentelemetry-sdk~=1.42.0 opentelemetry-exporter-otlp-proto-grpc~=1.42.0 -opentelemetry-distro~=0.57b0 +opentelemetry-distro~=0.62b1 opentelemetry-instrumentation-genai-openai-agents diff --git a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/pyproject.toml b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/pyproject.toml index bad22251..fb69b893 100644 --- a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/pyproject.toml +++ b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/pyproject.toml @@ -33,7 +33,7 @@ dependencies = [ [project.optional-dependencies] instruments = [ - "openai-agents >= 0.3.3", + "openai-agents >= 0.17.0", ] [project.entry-points.opentelemetry_instrumentor] diff --git a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/src/opentelemetry/instrumentation/genai/openai_agents/__init__.py b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/src/opentelemetry/instrumentation/genai/openai_agents/__init__.py index 955c3f1a..a4d8eea4 100644 --- a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/src/opentelemetry/instrumentation/genai/openai_agents/__init__.py +++ b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/src/opentelemetry/instrumentation/genai/openai_agents/__init__.py @@ -142,6 +142,7 @@ def _instrument(self, **kwargs) -> None: tracer_provider, schema_url=Schemas.V1_28_0.value, ) + meter_provider = kwargs.get("meter_provider") system_override = kwargs.get("system") or os.getenv( _SYSTEM_OVERRIDE_ENV @@ -174,6 +175,7 @@ def _instrument(self, **kwargs) -> None: != ContentCaptureMode.NO_CONTENT, content_mode=content_mode, metrics_enabled=metrics_enabled, + meter_provider=meter_provider, agent_name=agent_name, agent_id=agent_id, agent_description=agent_description, diff --git a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/src/opentelemetry/instrumentation/genai/openai_agents/package.py b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/src/opentelemetry/instrumentation/genai/openai_agents/package.py index c5292e65..972ff60d 100644 --- a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/src/opentelemetry/instrumentation/genai/openai_agents/package.py +++ b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/src/opentelemetry/instrumentation/genai/openai_agents/package.py @@ -1,5 +1,5 @@ # Copyright The OpenTelemetry Authors # SPDX-License-Identifier: Apache-2.0 -_instruments = ("openai-agents >= 0.3.3",) -_supports_metrics = False +_instruments = ("openai-agents >= 0.17.0",) +_supports_metrics = True diff --git a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/src/opentelemetry/instrumentation/genai/openai_agents/span_processor.py b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/src/opentelemetry/instrumentation/genai/openai_agents/span_processor.py index f78552f1..7e950260 100644 --- a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/src/opentelemetry/instrumentation/genai/openai_agents/span_processor.py +++ b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/src/opentelemetry/instrumentation/genai/openai_agents/span_processor.py @@ -19,43 +19,31 @@ from __future__ import annotations -import importlib import logging +from collections.abc import Mapping as MappingABC +from collections.abc import Sequence as SequenceABC from dataclasses import dataclass from datetime import datetime, timezone from enum import Enum from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional, Sequence from urllib.parse import urlparse -from opentelemetry.util.genai.utils import gen_ai_json_dumps - -try: - from agents.tracing import Span, Trace, TracingProcessor - from agents.tracing.span_data import ( - AgentSpanData, - FunctionSpanData, - GenerationSpanData, - GuardrailSpanData, - HandoffSpanData, - ResponseSpanData, - SpeechSpanData, - TranscriptionSpanData, - ) -except ModuleNotFoundError: # pragma: no cover - test stubs - tracing_module = importlib.import_module("agents.tracing") - Span = getattr(tracing_module, "Span") - Trace = getattr(tracing_module, "Trace") - TracingProcessor = getattr(tracing_module, "TracingProcessor") - AgentSpanData = getattr(tracing_module, "AgentSpanData", Any) # type: ignore[assignment] - FunctionSpanData = getattr(tracing_module, "FunctionSpanData", Any) # type: ignore[assignment] - GenerationSpanData = getattr(tracing_module, "GenerationSpanData", Any) # type: ignore[assignment] - GuardrailSpanData = getattr(tracing_module, "GuardrailSpanData", Any) # type: ignore[assignment] - HandoffSpanData = getattr(tracing_module, "HandoffSpanData", Any) # type: ignore[assignment] - ResponseSpanData = getattr(tracing_module, "ResponseSpanData", Any) # type: ignore[assignment] - SpeechSpanData = getattr(tracing_module, "SpeechSpanData", Any) # type: ignore[assignment] - TranscriptionSpanData = getattr( - tracing_module, "TranscriptionSpanData", Any - ) # type: ignore[assignment] +from agents.tracing import Span, Trace, TracingProcessor +from agents.tracing.span_data import ( + AgentSpanData, + CustomSpanData, + FunctionSpanData, + GenerationSpanData, + GuardrailSpanData, + HandoffSpanData, + MCPListToolsSpanData, + ResponseSpanData, + SpeechGroupSpanData, + SpeechSpanData, + TaskSpanData, + TranscriptionSpanData, + TurnSpanData, +) from opentelemetry.context import attach, detach from opentelemetry.metrics import Histogram, get_meter @@ -73,6 +61,11 @@ Tracer, set_span_in_context, ) +from opentelemetry.util.genai.instruments import ( + create_duration_histogram, + create_token_histogram, +) +from opentelemetry.util.genai.utils import gen_ai_json_dumps from opentelemetry.util.types import AttributeValue # Import all semantic convention constants @@ -122,7 +115,6 @@ class GenAIOperationName: TRANSCRIPTION = "transcription" SPEECH = "speech_generation" GUARDRAIL = "guardrail_check" - HANDOFF = "agent_handoff" RESPONSE = "response" # internal aggregator in current processor CLASS_FALLBACK = { @@ -240,13 +232,22 @@ def _attr(name: str, fallback: str) -> str: GEN_AI_ORCHESTRATOR_AGENT_DEFINITIONS = "gen_ai.orchestrator.agent.definitions" GEN_AI_GUARDRAIL_NAME = "gen_ai.guardrail.name" GEN_AI_GUARDRAIL_TRIGGERED = "gen_ai.guardrail.triggered" -GEN_AI_HANDOFF_FROM_AGENT = "gen_ai.handoff.from_agent" -GEN_AI_HANDOFF_TO_AGENT = "gen_ai.handoff.to_agent" GEN_AI_EMBEDDINGS_DIMENSION_COUNT = "gen_ai.embeddings.dimension.count" GEN_AI_TOKEN_TYPE = _attr("GEN_AI_TOKEN_TYPE", "gen_ai.token.type") +_DEFAULT_FINISH_REASON = "unknown" + # ---- Normalization utilities (embedded from utils.py) ---- +_CUSTOM_ATTRIBUTE_RESERVED_PREFIXES = ( + "gen_ai.", + "otel.", + "server.", + "telemetry.", +) +_CUSTOM_ATTRIBUTE_RESERVED_KEYS = {"span.kind"} +_CUSTOM_ATTRIBUTE_VALUE_LENGTH_LIMIT = 8192 + def normalize_provider(provider: Optional[str]) -> Optional[str]: """Normalize provider name to spec-compliant value.""" @@ -304,8 +305,6 @@ def normalize_output_type(output_type: Optional[str]) -> str: logger = logging.getLogger(__name__) -GEN_AI_SYSTEM_KEY = getattr(GenAIAttributes, "GEN_AI_SYSTEM", "gen_ai.system") - class ContentCaptureMode(Enum): """Controls whether sensitive content is recorded on spans, events, or both.""" @@ -341,19 +340,6 @@ class ContentPayload: tool_result: Any = None -def _is_instance_of(value: Any, classes: Any) -> bool: - """Safe isinstance that tolerates typing.Any placeholders.""" - if not isinstance(classes, tuple): - classes = (classes,) - for cls in classes: - try: - if isinstance(value, cls): - return True - except TypeError: - continue - return False - - def _infer_server_attributes(base_url: Optional[str]) -> dict[str, Any]: """Return server.address / server.port attributes if base_url provided.""" out: dict[str, Any] = {} @@ -378,19 +364,140 @@ def safe_json_dumps(obj: Any) -> str: return str(obj) +def _truncate_custom_attribute_value(value: str) -> str: + """Limit custom attribute strings so arbitrary span data does not explode spans.""" + if len(value) <= _CUSTOM_ATTRIBUTE_VALUE_LENGTH_LIMIT: + return value + return value[: _CUSTOM_ATTRIBUTE_VALUE_LENGTH_LIMIT - 3] + "..." + + +def _is_reserved_custom_attribute_key(key: str) -> bool: + """Return True when custom span data should not overwrite SDK/OTel attrs.""" + return key in _CUSTOM_ATTRIBUTE_RESERVED_KEYS or key.startswith( + _CUSTOM_ATTRIBUTE_RESERVED_PREFIXES + ) + + +def _jsonable_custom_attribute_value(value: Any) -> Any: + """Normalize values before serializing arbitrary custom span data.""" + if isinstance(value, Enum): + return value.value + if isinstance(value, MappingABC): + return { + str(key): _jsonable_custom_attribute_value(item) + for key, item in value.items() + } + if isinstance(value, (list, tuple, set)): + return [_jsonable_custom_attribute_value(item) for item in value] + return value + + +def _coerce_custom_scalar_attribute_value( + value: Any, +) -> str | bool | int | float | None: + """Return primitive OTel attribute values from custom span data.""" + if value is None: + return None + if isinstance(value, Enum): + return _truncate_custom_attribute_value(str(value.value)) + if isinstance(value, str): + return _truncate_custom_attribute_value(value) + if isinstance(value, bool): + return value + if isinstance(value, int): + return value + if isinstance(value, float): + return value + return None + + +def _is_homogeneous_custom_attribute_sequence( + values: list[str | bool | int | float], +) -> bool: + """Return True when values can be emitted as an OTel array attribute.""" + if all(isinstance(item, str) for item in values): + return True + if all(isinstance(item, bool) for item in values): + return True + if all( + isinstance(item, int) and not isinstance(item, bool) for item in values + ): + return True + return all(isinstance(item, float) for item in values) + + +def _coerce_custom_attribute_value(value: Any) -> AttributeValue | None: + """Convert arbitrary custom span data into valid OTel attribute values.""" + if value is None: + return None + + scalar_value = _coerce_custom_scalar_attribute_value(value) + if scalar_value is not None: + return scalar_value + + if isinstance(value, SequenceABC) and not isinstance( + value, (str, bytes, bytearray) + ): + scalar_values: list[str | bool | int | float] = [] + for item in value: + scalar_value = _coerce_custom_scalar_attribute_value(item) + if scalar_value is None: + break + scalar_values.append(scalar_value) + else: + if scalar_values and _is_homogeneous_custom_attribute_sequence( + scalar_values + ): + return scalar_values + + serialized = safe_json_dumps(_jsonable_custom_attribute_value(value)) + return _truncate_custom_attribute_value(serialized) + + def _as_utc_nano(dt: datetime) -> int: """Convert datetime to UTC nanoseconds timestamp.""" return int(dt.astimezone(timezone.utc).timestamp() * 1_000_000_000) -def _get_span_status(span: Span[Any]) -> Status: +def _get_span_status(span: Span[Any]) -> Optional[Status]: """Get OpenTelemetry span status from agent span.""" if error := getattr(span, "error", None): return Status( status_code=StatusCode.ERROR, description=f"{error.get('message', '')}: {error.get('data', '')}", ) - return Status(StatusCode.OK) + return None + + +def _get_finish_reason(value: Any) -> Optional[str]: + """Return a finish reason from a dict/object when one is available.""" + if isinstance(value, dict): + finish_reason = value.get("finish_reason") or value.get("stop_reason") + else: + finish_reason = getattr(value, "finish_reason", None) or getattr( + value, "stop_reason", None + ) + if finish_reason: + return ( + finish_reason + if isinstance(finish_reason, str) + else str(finish_reason) + ) + return None + + +def _get_finish_reasons_from_sequence( + values: Sequence[Any] | None, +) -> list[str]: + """Collect finish reasons from a response/generation output sequence.""" + if not values: + return [] + finish_reasons: list[str] = [] + for value in values: + finish_reason = _get_finish_reason(value) + if finish_reason: + finish_reasons.append(finish_reason) + return finish_reasons def get_span_name( @@ -420,9 +527,6 @@ def get_span_name( if operation_name == GenAIOperationName.EXECUTE_TOOL: return f"{base_name} {tool_name}" if tool_name else base_name - if operation_name == GenAIOperationName.HANDOFF: - return f"{base_name} {agent_name}" if agent_name else base_name - return base_name @@ -443,6 +547,7 @@ def __init__( server_address: Optional[str] = None, server_port: Optional[int] = None, metrics_enabled: bool = True, + meter_provider: Any = None, agent_name_default: Optional[str] = None, agent_id_default: Optional[str] = None, agent_description_default: Optional[str] = None, @@ -517,6 +622,7 @@ def __init__( # Metrics configuration self._metrics_enabled = metrics_enabled + self._meter_provider = meter_provider self._meter = None self._duration_histogram: Optional[Histogram] = None self._token_usage_histogram: Optional[Histogram] = None @@ -535,22 +641,12 @@ def _get_server_attributes(self) -> dict[str, Any]: def _init_metrics(self): """Initialize metric instruments.""" self._meter = get_meter( - "opentelemetry.instrumentation.genai.openai_agents", "0.1.0" - ) - - # Operation duration histogram - self._duration_histogram = self._meter.create_histogram( - name="gen_ai.client.operation.duration", - description="GenAI operation duration", - unit="s", - ) - - # Token usage histogram - self._token_usage_histogram = self._meter.create_histogram( - name="gen_ai.client.token.usage", - description="Number of input and output tokens used", - unit="{token}", + "opentelemetry.instrumentation.genai.openai_agents", + "0.1.0", + meter_provider=self._meter_provider, ) + self._duration_histogram = create_duration_histogram(self._meter) + self._token_usage_histogram = create_token_histogram(self._meter) def _record_metrics( self, span: Span[Any], attributes: dict[str, AttributeValue] @@ -567,8 +663,12 @@ def _record_metrics( duration = None if hasattr(span, "started_at") and hasattr(span, "ended_at"): try: - start = datetime.fromisoformat(span.started_at) - end = datetime.fromisoformat(span.ended_at) + start = datetime.fromisoformat( + span.started_at.replace("Z", "+00:00") + ) + end = datetime.fromisoformat( + span.ended_at.replace("Z", "+00:00") + ) duration = (end - start).total_seconds() except Exception: pass @@ -602,7 +702,9 @@ def _record_metrics( # Record duration if duration is not None and self._duration_histogram is not None: - self._duration_histogram.record(duration, metric_attrs) + self._duration_histogram.record( + duration, attributes=metric_attrs + ) # Record token usage if self._token_usage_histogram: @@ -611,7 +713,7 @@ def _record_metrics( token_attrs = dict(metric_attrs) token_attrs[GEN_AI_TOKEN_TYPE] = "input" self._token_usage_histogram.record( - input_tokens, token_attrs + input_tokens, attributes=token_attrs ) output_tokens = attributes.get(GEN_AI_USAGE_OUTPUT_TOKENS) @@ -619,7 +721,7 @@ def _record_metrics( token_attrs = dict(metric_attrs) token_attrs[GEN_AI_TOKEN_TYPE] = "output" self._token_usage_histogram.record( - output_tokens, token_attrs + output_tokens, attributes=token_attrs ) except Exception as e: @@ -707,19 +809,33 @@ def _normalize_to_text_parts(self, content: Any) -> list[dict[str, str]]: def _redacted_text_parts(self) -> list[dict[str, str]]: """Return a single redacted text part for system instructions.""" - return [{"type": "text", "content": "readacted"}] + return [{"type": "text", "content": "redacted"}] def _normalize_messages_to_role_parts( - self, messages: Sequence[Any] | None + self, messages: Sequence[Any] | str | None ) -> list[dict[str, Any]]: """Normalize input messages to enforced role+parts schema. Each message becomes: {"role": , "parts": [ {"type": ..., ...} ]} Redaction: when include_sensitive_data is False, replace text content, - tool_call arguments, and tool_call_response result with "readacted". + tool_call arguments, and tool_call_response result with "redacted". """ if not messages: return [] + if isinstance(messages, str): + return [ + { + "role": "user", + "parts": [ + { + "type": "text", + "content": "redacted" + if not self.include_sensitive_data + else messages, + } + ], + } + ] normalized: list[dict[str, Any]] = [] for m in messages: if not isinstance(m, dict): @@ -730,7 +846,7 @@ def _normalize_messages_to_role_parts( "parts": [ { "type": "text", - "content": "readacted" + "content": "redacted" if not self.include_sensitive_data else str(m), } @@ -751,7 +867,7 @@ def _normalize_messages_to_role_parts( if ptype == "text": txt = p.get("content") or p.get("text") newp["content"] = ( - "readacted" + "redacted" if not self.include_sensitive_data else (txt if isinstance(txt, str) else str(p)) ) @@ -760,7 +876,7 @@ def _normalize_messages_to_role_parts( newp["name"] = p.get("name") args = p.get("arguments") newp["arguments"] = ( - "readacted" + "redacted" if not self.include_sensitive_data else args ) @@ -768,13 +884,13 @@ def _normalize_messages_to_role_parts( newp["id"] = p.get("id") or m.get("tool_call_id") result = p.get("result") or p.get("content") newp["result"] = ( - "readacted" + "redacted" if not self.include_sensitive_data else result ) else: newp["content"] = ( - "readacted" + "redacted" if not self.include_sensitive_data else str(p) ) @@ -783,7 +899,7 @@ def _normalize_messages_to_role_parts( parts.append( { "type": "text", - "content": "readacted" + "content": "redacted" if not self.include_sensitive_data else str(p), } @@ -795,7 +911,7 @@ def _normalize_messages_to_role_parts( parts.append( { "type": "text", - "content": "readacted" + "content": "redacted" if not self.include_sensitive_data else content, } @@ -809,7 +925,7 @@ def _normalize_messages_to_role_parts( parts.append( { "type": "text", - "content": "readacted" + "content": "redacted" if not self.include_sensitive_data else ( txt @@ -823,7 +939,7 @@ def _normalize_messages_to_role_parts( parts.append( { "type": "text", - "content": "readacted" + "content": "redacted" if not self.include_sensitive_data else str(item), } @@ -832,7 +948,7 @@ def _normalize_messages_to_role_parts( parts.append( { "type": "text", - "content": "readacted" + "content": "redacted" if not self.include_sensitive_data else str(item), } @@ -852,7 +968,7 @@ def _normalize_messages_to_role_parts( p["name"] = fn.get("name") args = fn.get("arguments") p["arguments"] = ( - "readacted" + "redacted" if not self.include_sensitive_data else args ) @@ -864,7 +980,7 @@ def _normalize_messages_to_role_parts( p["id"] = m.get("tool_call_id") or m.get("id") result = m.get("result") or m.get("content") p["result"] = ( - "readacted" if not self.include_sensitive_data else result + "redacted" if not self.include_sensitive_data else result ) parts.append(p) @@ -899,7 +1015,7 @@ def _normalize_output_messages_to_role_parts( { "type": "text", "content": ( - "readacted" + "redacted" if not self.include_sensitive_data else output_text ), @@ -916,7 +1032,7 @@ def _normalize_output_messages_to_role_parts( { "type": "text", "content": ( - "readacted" + "redacted" if not self.include_sensitive_data else txt ), @@ -928,16 +1044,14 @@ def _normalize_output_messages_to_role_parts( { "type": "text", "content": ( - "readacted" + "redacted" if not self.include_sensitive_data else str(item) ), } ) - # Capture finish_reason from parts when present - fr = getattr(item, "finish_reason", None) - if isinstance(fr, str) and not finish_reason: - finish_reason = fr + if not finish_reason: + finish_reason = _get_finish_reason(item) # Generation span: use span_data.output if not parts: @@ -952,7 +1066,7 @@ def _normalize_output_messages_to_role_parts( { "type": "text", "content": ( - "readacted" + "redacted" if not self.include_sensitive_data else txt ), @@ -965,7 +1079,7 @@ def _normalize_output_messages_to_role_parts( { "type": "text", "content": ( - "readacted" + "redacted" if not self.include_sensitive_data else item["content"] ), @@ -976,22 +1090,20 @@ def _normalize_output_messages_to_role_parts( { "type": "text", "content": ( - "readacted" + "redacted" if not self.include_sensitive_data else str(item) ), } ) - if not finish_reason and isinstance( - item.get("finish_reason"), str - ): - finish_reason = item.get("finish_reason") + if not finish_reason: + finish_reason = _get_finish_reason(item) elif isinstance(item, str): parts.append( { "type": "text", "content": ( - "readacted" + "redacted" if not self.include_sensitive_data else item ), @@ -1002,7 +1114,7 @@ def _normalize_output_messages_to_role_parts( { "type": "text", "content": ( - "readacted" + "redacted" if not self.include_sensitive_data else str(item) ), @@ -1011,10 +1123,9 @@ def _normalize_output_messages_to_role_parts( # Build assistant message msg: dict[str, Any] = {"role": "assistant", "parts": parts} - if finish_reason: - msg["finish_reason"] = finish_reason # Only include if there is content if parts: + msg["finish_reason"] = finish_reason or _DEFAULT_FINISH_REASON messages.append(msg) return messages @@ -1035,10 +1146,10 @@ def _build_content_payload(self, span: Span[Any]) -> ContentPayload: ) capture_tools = self._content_mode.capture_in_span or ( self._content_mode.capture_in_event - and _is_instance_of(span_data, FunctionSpanData) + and isinstance(span_data, FunctionSpanData) ) - if _is_instance_of(span_data, GenerationSpanData): + if isinstance(span_data, GenerationSpanData): span_input = getattr(span_data, "input", None) if capture_messages and span_input: payload.input_messages = ( @@ -1058,7 +1169,7 @@ def _build_content_payload(self, span: Span[Any]) -> ContentPayload: if normalized_out: payload.output_messages = normalized_out - elif _is_instance_of(span_data, ResponseSpanData): + elif isinstance(span_data, ResponseSpanData): span_input = getattr(span_data, "input", None) response_obj = getattr(span_data, "response", None) if capture_messages and span_input: @@ -1085,7 +1196,7 @@ def _build_content_payload(self, span: Span[Any]) -> ContentPayload: if normalized_out: payload.output_messages = normalized_out - elif _is_instance_of(span_data, FunctionSpanData) and capture_tools: + elif isinstance(span_data, FunctionSpanData) and capture_tools: def _serialize_tool_value(value: Any) -> Optional[str]: if value is None: @@ -1157,20 +1268,22 @@ def _update_agent_aggregate( def _infer_output_type(self, span_data: Any) -> str: """Infer gen_ai.output.type for multiple span kinds.""" - if _is_instance_of(span_data, FunctionSpanData): + if isinstance(span_data, FunctionSpanData): # Tool results are typically JSON return GenAIOutputType.JSON - if _is_instance_of(span_data, TranscriptionSpanData): + if isinstance(span_data, MCPListToolsSpanData): + return GenAIOutputType.JSON + if isinstance(span_data, TranscriptionSpanData): return GenAIOutputType.TEXT - if _is_instance_of(span_data, SpeechSpanData): + if isinstance(span_data, (SpeechGroupSpanData, SpeechSpanData)): return GenAIOutputType.SPEECH - if _is_instance_of(span_data, GuardrailSpanData): + if isinstance(span_data, GuardrailSpanData): return GenAIOutputType.TEXT - if _is_instance_of(span_data, HandoffSpanData): + if isinstance(span_data, HandoffSpanData): return GenAIOutputType.TEXT # Check for embeddings operation - if _is_instance_of(span_data, GenerationSpanData): + if isinstance(span_data, GenerationSpanData): if hasattr(span_data, "embedding_dimension"): return ( GenAIOutputType.TEXT @@ -1272,9 +1385,9 @@ def _sanitize_usage_payload(usage: Any) -> None: def _get_span_kind(self, span_data: Any) -> SpanKind: """Determine appropriate span kind based on span data type.""" - if _is_instance_of(span_data, FunctionSpanData): + if isinstance(span_data, FunctionSpanData): return SpanKind.INTERNAL # Tool execution is internal - if _is_instance_of( + if isinstance( span_data, ( GenerationSpanData, @@ -1284,19 +1397,72 @@ def _get_span_kind(self, span_data: Any) -> SpanKind: ), ): return SpanKind.CLIENT # API calls to model providers - if _is_instance_of(span_data, AgentSpanData): + if isinstance(span_data, AgentSpanData): return SpanKind.CLIENT - if _is_instance_of(span_data, (GuardrailSpanData, HandoffSpanData)): + if isinstance( + span_data, + ( + GuardrailSpanData, + HandoffSpanData, + MCPListToolsSpanData, + SpeechGroupSpanData, + TaskSpanData, + TurnSpanData, + ), + ): return SpanKind.INTERNAL # Agent operations are internal return SpanKind.INTERNAL + def _get_agent_name_for_span(self, span_data: Any) -> Optional[str]: + """Return the best available agent label for span naming.""" + if self.agent_name: + return self.agent_name + if isinstance(span_data, AgentSpanData): + return getattr(span_data, "name", None) + if isinstance(span_data, TaskSpanData): + return getattr(span_data, "name", None) + if isinstance(span_data, TurnSpanData): + return getattr(span_data, "agent_name", None) + return self._agent_name_default + + def _get_tool_name_for_span(self, span_data: Any) -> Optional[str]: + """Return the best available tool label for span naming.""" + if isinstance(span_data, FunctionSpanData): + return getattr(span_data, "name", None) + if isinstance(span_data, MCPListToolsSpanData): + return "list_tools" + return None + + def _get_span_display_name( + self, + span_data: Any, + operation_name: Optional[str], + model: Optional[str], + ) -> str: + """Return the OTel span name for known and custom Agents spans.""" + if operation_name: + return get_span_name( + operation_name, + model, + self._get_agent_name_for_span(span_data), + self._get_tool_name_for_span(span_data), + ) + + name = getattr(span_data, "name", None) + if isinstance(name, str) and name: + return name + + span_type = getattr(span_data, "type", None) + if isinstance(span_type, str) and span_type: + return span_type + + return "span" + def on_trace_start(self, trace: Trace) -> None: """Create root span when trace starts.""" if self._tracer: attributes = { GEN_AI_PROVIDER_NAME: self.system_name, - GEN_AI_SYSTEM_KEY: self.system_name, - GEN_AI_OPERATION_NAME: GenAIOperationName.INVOKE_AGENT, } # Legacy emission removed @@ -1319,8 +1485,6 @@ def on_trace_start(self, trace: Trace) -> None: def on_trace_end(self, trace: Trace) -> None: """End root span when trace ends.""" if root_span := self._root_spans.pop(trace.trace_id, None): - if root_span.is_recording(): - root_span.set_status(Status(StatusCode.OK)) root_span.end() self._cleanup_spans_for_trace(trace.trace_id) @@ -1331,7 +1495,7 @@ def on_span_start(self, span: Span[Any]) -> None: self._span_parents[span.span_id] = span.parent_id if ( - _is_instance_of(span.span_data, AgentSpanData) + isinstance(span.span_data, AgentSpanData) and span.span_id not in self._agent_content ): self._agent_content[span.span_id] = { @@ -1355,27 +1519,16 @@ def on_span_start(self, span: Span[Any]) -> None: response_obj = getattr(span.span_data, "response", None) model = getattr(response_obj, "model", None) - # Use configured agent name or get from span data - agent_name = self.agent_name - if not agent_name and _is_instance_of(span.span_data, AgentSpanData): - agent_name = getattr(span.span_data, "name", None) - if not agent_name: - agent_name = self._agent_name_default - - tool_name = ( - getattr(span.span_data, "name", None) - if _is_instance_of(span.span_data, FunctionSpanData) - else None - ) - # Generate spec-compliant span name - span_name = get_span_name(operation_name, model, agent_name, tool_name) + span_name = self._get_span_display_name( + span.span_data, operation_name, model + ) attributes = { GEN_AI_PROVIDER_NAME: self.system_name, - GEN_AI_SYSTEM_KEY: self.system_name, - GEN_AI_OPERATION_NAME: operation_name, } + if operation_name: + attributes[GEN_AI_OPERATION_NAME] = operation_name # Legacy emission removed # Add configured agent and server attributes @@ -1410,7 +1563,7 @@ def on_span_end(self, span: Span[Any]) -> None: self._update_agent_aggregate(span, payload) agent_content = ( self._agent_content.get(span.span_id) - if _is_instance_of(span.span_data, AgentSpanData) + if isinstance(span.span_data, AgentSpanData) else None ) @@ -1432,7 +1585,7 @@ def on_span_end(self, span: Span[Any]) -> None: span.span_id, e, ) - if _is_instance_of(span.span_data, AgentSpanData): + if isinstance(span.span_data, AgentSpanData): self._agent_content.pop(span.span_id, None) self._span_parents.pop(span.span_id, None) return @@ -1450,7 +1603,7 @@ def on_span_end(self, span: Span[Any]) -> None: otel_span.set_attribute(key, value) attributes[key] = value - if _is_instance_of( + if isinstance( span.span_data, (GenerationSpanData, ResponseSpanData) ): operation_name = attributes.get(GEN_AI_OPERATION_NAME) @@ -1474,7 +1627,9 @@ def on_span_end(self, span: Span[Any]) -> None: # Emit operation details event if configured # Set error status if applicable - otel_span.set_status(status=_get_span_status(span)) + status = _get_span_status(span) + if status is not None: + otel_span.set_status(status=status) if getattr(span, "error", None): err_obj = span.error err_type = err_obj.get("type") or err_obj.get("name") @@ -1492,7 +1647,7 @@ def on_span_end(self, span: Span[Any]) -> None: otel_span.set_status(Status(StatusCode.ERROR, str(e))) otel_span.end() finally: - if _is_instance_of(span.span_data, AgentSpanData): + if isinstance(span.span_data, AgentSpanData): self._agent_content.pop(span.span_id, None) self._span_parents.pop(span.span_id, None) @@ -1520,9 +1675,9 @@ def force_flush(self) -> None: """Force flush (no-op for this processor).""" pass - def _get_operation_name(self, span_data: Any) -> str: + def _get_operation_name(self, span_data: Any) -> Optional[str]: """Determine operation name from span data type.""" - if _is_instance_of(span_data, GenerationSpanData): + if isinstance(span_data, GenerationSpanData): # Check if it's embeddings if hasattr(span_data, "embedding_dimension"): return GenAIOperationName.EMBEDDINGS @@ -1532,23 +1687,21 @@ def _get_operation_name(self, span_data: Any) -> str: if isinstance(first_input, dict) and "role" in first_input: return GenAIOperationName.CHAT return GenAIOperationName.TEXT_COMPLETION - if _is_instance_of(span_data, AgentSpanData): + if isinstance(span_data, AgentSpanData): # The OpenAI Agents SDK AgentSpanData has no "operation" field; # agent spans always represent invoke_agent. return GenAIOperationName.INVOKE_AGENT - if _is_instance_of(span_data, FunctionSpanData): + if isinstance(span_data, FunctionSpanData): return GenAIOperationName.EXECUTE_TOOL - if _is_instance_of(span_data, ResponseSpanData): + if isinstance(span_data, ResponseSpanData): return GenAIOperationName.CHAT # Response typically from chat - if _is_instance_of(span_data, TranscriptionSpanData): + if isinstance(span_data, TranscriptionSpanData): return GenAIOperationName.TRANSCRIPTION - if _is_instance_of(span_data, SpeechSpanData): + if isinstance(span_data, SpeechSpanData): return GenAIOperationName.SPEECH - if _is_instance_of(span_data, GuardrailSpanData): + if isinstance(span_data, GuardrailSpanData): return GenAIOperationName.GUARDRAIL - if _is_instance_of(span_data, HandoffSpanData): - return GenAIOperationName.HANDOFF - return "unknown" + return None def _extract_genai_attributes( self, @@ -1561,7 +1714,6 @@ def _extract_genai_attributes( # Base attributes yield GEN_AI_PROVIDER_NAME, self.system_name - yield GEN_AI_SYSTEM_KEY, self.system_name # Legacy emission removed # Add configured agent attributes (always include when set) @@ -1582,32 +1734,72 @@ def _extract_genai_attributes( yield key, value # Process different span types - if _is_instance_of(span_data, GenerationSpanData): + if isinstance(span_data, GenerationSpanData): yield from self._get_attributes_from_generation_span_data( span_data, payload ) - elif _is_instance_of(span_data, AgentSpanData): + elif isinstance(span_data, AgentSpanData): yield from self._get_attributes_from_agent_span_data( span_data, agent_content ) - elif _is_instance_of(span_data, FunctionSpanData): + elif isinstance(span_data, TaskSpanData): + yield from self._get_attributes_from_task_span_data(span_data) + elif isinstance(span_data, TurnSpanData): + yield from self._get_attributes_from_turn_span_data(span_data) + elif isinstance(span_data, CustomSpanData): + yield from self._get_attributes_from_custom_span_data(span_data) + elif isinstance(span_data, FunctionSpanData): yield from self._get_attributes_from_function_span_data( - span_data, payload + span_data, payload, span.span_id ) - elif _is_instance_of(span_data, ResponseSpanData): + elif isinstance(span_data, MCPListToolsSpanData): + yield from self._get_attributes_from_mcp_tools_span_data(span_data) + elif isinstance(span_data, ResponseSpanData): yield from self._get_attributes_from_response_span_data( span_data, payload ) - elif _is_instance_of(span_data, TranscriptionSpanData): + elif isinstance(span_data, TranscriptionSpanData): yield from self._get_attributes_from_transcription_span_data( span_data ) - elif _is_instance_of(span_data, SpeechSpanData): + elif isinstance(span_data, SpeechSpanData): yield from self._get_attributes_from_speech_span_data(span_data) - elif _is_instance_of(span_data, GuardrailSpanData): + elif isinstance(span_data, GuardrailSpanData): yield from self._get_attributes_from_guardrail_span_data(span_data) - elif _is_instance_of(span_data, HandoffSpanData): + elif isinstance(span_data, HandoffSpanData): yield from self._get_attributes_from_handoff_span_data(span_data) + elif isinstance(span_data, SpeechGroupSpanData): + yield from self._get_attributes_from_speech_group_span_data( + span_data + ) + + def _get_usage_attributes( + self, usage: Any + ) -> Iterator[tuple[str, AttributeValue]]: + """Extract token usage attributes from dict or object payloads.""" + if not usage: + return + + self._sanitize_usage_payload(usage) + if isinstance(usage, dict): + input_tokens = usage.get("prompt_tokens") or usage.get( + "input_tokens" + ) + output_tokens = usage.get("completion_tokens") or usage.get( + "output_tokens" + ) + else: + input_tokens = getattr(usage, "input_tokens", None) + if input_tokens is None: + input_tokens = getattr(usage, "prompt_tokens", None) + output_tokens = getattr(usage, "output_tokens", None) + if output_tokens is None: + output_tokens = getattr(usage, "completion_tokens", None) + + if input_tokens is not None: + yield GEN_AI_USAGE_INPUT_TOKENS, input_tokens + if output_tokens is not None: + yield GEN_AI_USAGE_OUTPUT_TOKENS, output_tokens def _get_attributes_from_generation_span_data( self, span_data: GenerationSpanData, payload: ContentPayload @@ -1632,17 +1824,12 @@ def _get_attributes_from_generation_span_data( if hasattr(span_data, "data_source_id"): yield GEN_AI_DATA_SOURCE_ID, span_data.data_source_id - finish_reasons: list[Any] = [] - if span_data.output: - for part in span_data.output: - if isinstance(part, dict): - fr = part.get("finish_reason") or part.get("stop_reason") - else: - fr = getattr(part, "finish_reason", None) - if fr: - finish_reasons.append( - fr if isinstance(fr, str) else str(fr) - ) + finish_reasons = _get_finish_reasons_from_sequence(span_data.output) + if ( + not finish_reasons + and operation_name != GenAIOperationName.EMBEDDINGS + ): + finish_reasons = [_DEFAULT_FINISH_REASON] if finish_reasons: yield GEN_AI_RESPONSE_FINISH_REASONS, finish_reasons @@ -1815,7 +2002,7 @@ def _is_placeholder_message(self, message: Any) -> bool: if ( not isinstance(part, dict) or part.get("type") != "text" - or part.get("content") != "readacted" + or part.get("content") != "redacted" ): return False return True @@ -1924,8 +2111,58 @@ def _get_attributes_from_agent_span_data( normalize_output_type(self._infer_output_type(span_data)), ) + def _get_attributes_from_task_span_data( + self, span_data: TaskSpanData + ) -> Iterator[tuple[str, AttributeValue]]: + """Extract attributes from an Agents SDK task span.""" + yield from self._get_usage_attributes( + getattr(span_data, "usage", None) + ) + yield ( + GEN_AI_OUTPUT_TYPE, + normalize_output_type(self._infer_output_type(span_data)), + ) + + def _get_attributes_from_turn_span_data( + self, span_data: TurnSpanData + ) -> Iterator[tuple[str, AttributeValue]]: + """Extract attributes from an Agents SDK turn span.""" + agent_name = getattr(span_data, "agent_name", None) + if agent_name: + yield GEN_AI_AGENT_NAME, agent_name + yield from self._get_usage_attributes( + getattr(span_data, "usage", None) + ) + yield ( + GEN_AI_OUTPUT_TYPE, + normalize_output_type(self._infer_output_type(span_data)), + ) + + def _get_attributes_from_custom_span_data( + self, span_data: CustomSpanData + ) -> Iterator[tuple[str, AttributeValue]]: + """Extract attributes from custom Agents SDK span data.""" + custom_data = getattr(span_data, "data", None) + if isinstance(custom_data, MappingABC): + for key, value in custom_data.items(): + if not isinstance(key, str) or not key: + continue + if _is_reserved_custom_attribute_key(key): + continue + attr_value = _coerce_custom_attribute_value(value) + if attr_value is not None: + yield key, attr_value + + yield ( + GEN_AI_OUTPUT_TYPE, + normalize_output_type(self._infer_output_type(span_data)), + ) + def _get_attributes_from_function_span_data( - self, span_data: FunctionSpanData, payload: ContentPayload + self, + span_data: FunctionSpanData, + payload: ContentPayload, + span_id: Optional[str] = None, ) -> Iterator[tuple[str, AttributeValue]]: """Extract attributes from function/tool span.""" yield GEN_AI_OPERATION_NAME, GenAIOperationName.EXECUTE_TOOL @@ -1939,8 +2176,9 @@ def _get_attributes_from_function_span_data( tool_type = span_data.tool_type yield GEN_AI_TOOL_TYPE, validate_tool_type(tool_type) - if hasattr(span_data, "call_id") and span_data.call_id: - yield GEN_AI_TOOL_CALL_ID, span_data.call_id + call_id = getattr(span_data, "call_id", None) or span_id + if call_id: + yield GEN_AI_TOOL_CALL_ID, call_id if hasattr(span_data, "description") and span_data.description: yield GEN_AI_TOOL_DESCRIPTION, span_data.description @@ -1973,6 +2211,30 @@ def _get_attributes_from_function_span_data( normalize_output_type(self._infer_output_type(span_data)), ) + def _get_attributes_from_mcp_tools_span_data( + self, span_data: MCPListToolsSpanData + ) -> Iterator[tuple[str, AttributeValue]]: + """Extract attributes from MCP list-tools spans.""" + yield GEN_AI_TOOL_NAME, "list_tools" + yield GEN_AI_TOOL_TYPE, GenAIToolType.EXTENSION + + server = getattr(span_data, "server", None) + if server: + yield GEN_AI_DATA_SOURCE_ID, server + + result = getattr(span_data, "result", None) + if ( + result is not None + and self.include_sensitive_data + and self._content_mode.capture_in_span + ): + yield GEN_AI_TOOL_CALL_RESULT, safe_json_dumps(result) + + yield ( + GEN_AI_OUTPUT_TYPE, + normalize_output_type(self._infer_output_type(span_data)), + ) + def _get_attributes_from_response_span_data( self, span_data: ResponseSpanData, payload: ContentPayload ) -> Iterator[tuple[str, AttributeValue]]: @@ -1994,20 +2256,10 @@ def _get_attributes_from_response_span_data( yield GEN_AI_REQUEST_MODEL, span_data.response.model # Finish reasons - finish_reasons = [] - if ( - hasattr(span_data.response, "output") - and span_data.response.output - ): - for part in span_data.response.output: - if isinstance(part, dict): - fr = part.get("finish_reason") or part.get( - "stop_reason" - ) - else: - fr = getattr(part, "finish_reason", None) - if fr: - finish_reasons.append(fr) + response_output = getattr(span_data.response, "output", None) + finish_reasons = _get_finish_reasons_from_sequence(response_output) + if not finish_reasons: + finish_reasons = [_DEFAULT_FINISH_REASON] if finish_reasons: yield GEN_AI_RESPONSE_FINISH_REASONS, finish_reasons @@ -2160,14 +2412,15 @@ def _get_attributes_from_handoff_span_data( self, span_data: HandoffSpanData ) -> Iterator[tuple[str, AttributeValue]]: """Extract attributes from handoff span.""" - yield GEN_AI_OPERATION_NAME, GenAIOperationName.HANDOFF - - if span_data.from_agent: - yield GEN_AI_HANDOFF_FROM_AGENT, span_data.from_agent - - if span_data.to_agent: - yield GEN_AI_HANDOFF_TO_AGENT, span_data.to_agent + yield ( + GEN_AI_OUTPUT_TYPE, + normalize_output_type(self._infer_output_type(span_data)), + ) + def _get_attributes_from_speech_group_span_data( + self, span_data: SpeechGroupSpanData + ) -> Iterator[tuple[str, AttributeValue]]: + """Extract attributes from a speech group span.""" yield ( GEN_AI_OUTPUT_TYPE, normalize_output_type(self._infer_output_type(span_data)), diff --git a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/tests/requirements.oldest.txt b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/tests/requirements.oldest.txt index 6e69dd5c..21942e43 100644 --- a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/tests/requirements.oldest.txt +++ b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/tests/requirements.oldest.txt @@ -15,7 +15,7 @@ # This variant of the requirements aims to test the system using # the oldest supported version of external dependencies. -openai-agents==0.3.3 +openai-agents==0.17.0 pydantic>=2.10,<3 httpx==0.27.2 Deprecated==1.2.14 diff --git a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/tests/test_conformance.py b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/tests/test_conformance.py index 6a38496b..1a26e1e3 100644 --- a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/tests/test_conformance.py +++ b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/tests/test_conformance.py @@ -25,18 +25,7 @@ @pytest.mark.parametrize( "scenario", - [ - pytest.param( - OrchestrationScenario(), - marks=pytest.mark.skip( - reason=( - "openai-agents instrumentation has multiple semconv gaps " - "surfaced by this scenario; tracked in " - "https://github.com/open-telemetry/opentelemetry-python-genai/issues/86" - ) - ), - ), - ], + [OrchestrationScenario()], ids=lambda s: type(s).__name__, ) def test_conformance( diff --git a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/tests/test_tracer.py b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/tests/test_tracer.py index 72379388..5ad48a75 100644 --- a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/tests/test_tracer.py +++ b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/tests/test_tracer.py @@ -6,16 +6,20 @@ from __future__ import annotations import json +from enum import Enum from types import SimpleNamespace from typing import Any import agents.tracing as agents_tracing from agents.tracing import ( agent_span, + custom_span, function_span, generation_span, + mcp_tools_span, response_span, set_trace_processors, + speech_group_span, trace, ) from openai.types.responses import FunctionTool # noqa: E402 @@ -27,7 +31,12 @@ ContentPayload, GenAISemanticProcessor, ) +from opentelemetry.sdk.metrics import MeterProvider # noqa: E402 +from opentelemetry.sdk.metrics.export import InMemoryMetricReader # noqa: E402 from opentelemetry.sdk.trace import TracerProvider # noqa: E402 +from opentelemetry.semconv._incubating.metrics import ( # noqa: E402 + gen_ai_metrics, +) try: from opentelemetry.sdk.trace.export import ( # type: ignore[attr-defined] @@ -59,6 +68,10 @@ GEN_AI_TOOL_DEFINITIONS = getattr( GenAI, "GEN_AI_TOOL_DEFINITIONS", "gen_ai.tool.definitions" ) +GEN_AI_DATA_SOURCE_ID = getattr( + GenAI, "GEN_AI_DATA_SOURCE_ID", "gen_ai.data_source.id" +) +GEN_AI_TOOL_CALL_RESULT = "gen_ai.tool.call.result" def _instrument_with_provider(**instrument_kwargs): @@ -111,6 +124,181 @@ def test_generation_span_creates_client_span(): exporter.clear() +def test_response_span_string_input_records_single_user_message(): + instrumentor, exporter = _instrument_with_provider( + include_sensitive_data=True + ) + + class _Response: + def __init__(self) -> None: + self.id = "resp-456" + self.model = "gpt-4o-mini" + self.output = [] + self.usage = None + self.tools = [] + + try: + provider = agents_tracing.get_trace_provider() + with trace("workflow") as workflow: + response_span_obj = provider.create_span( + agents_tracing.ResponseSpanData( + input="single user prompt", + response=_Response(), + ), + parent=workflow, + ) + response_span_obj.start() + response_span_obj.finish() + + spans = exporter.get_finished_spans() + response = next( + span + for span in spans + if span.attributes.get(GenAI.GEN_AI_RESPONSE_ID) == "resp-456" + ) + + prompt = json.loads(response.attributes[GEN_AI_INPUT_MESSAGES]) + assert prompt == [ + { + "role": "user", + "parts": [{"type": "text", "content": "single user prompt"}], + } + ] + assert response.attributes[GenAI.GEN_AI_RESPONSE_FINISH_REASONS] == ( + "unknown", + ) + finally: + instrumentor.uninstrument() + exporter.clear() + + +def _assert_current_sdk_non_genai_spans(spans): + task = next(span for span in spans if span.name == "Agent workflow") + assert GenAI.GEN_AI_OPERATION_NAME not in task.attributes + assert task.attributes[GenAI.GEN_AI_USAGE_INPUT_TOKENS] == 20 + assert task.attributes[GenAI.GEN_AI_USAGE_OUTPUT_TOKENS] == 6 + + turn = next(span for span in spans if span.name == "turn") + assert GenAI.GEN_AI_OPERATION_NAME not in turn.attributes + assert turn.attributes[GenAI.GEN_AI_AGENT_NAME] == "Support Agent" + assert turn.attributes[GenAI.GEN_AI_USAGE_INPUT_TOKENS] == 8 + assert turn.attributes[GenAI.GEN_AI_USAGE_OUTPUT_TOKENS] == 4 + + mcp = next(span for span in spans if span.name == "mcp_tools") + assert GenAI.GEN_AI_OPERATION_NAME not in mcp.attributes + assert mcp.attributes[GenAI.GEN_AI_TOOL_NAME] == "list_tools" + assert mcp.attributes[GenAI.GEN_AI_TOOL_TYPE] == "extension" + assert mcp.attributes[GEN_AI_DATA_SOURCE_ID] == "filesystem" + assert json.loads(mcp.attributes[GEN_AI_TOOL_CALL_RESULT]) == [ + "read_file", + "write_file", + ] + + speech = next(span for span in spans if span.name == "speech_group") + assert GenAI.GEN_AI_OPERATION_NAME not in speech.attributes + assert speech.attributes[GenAI.GEN_AI_OUTPUT_TYPE] == "speech" + + custom = next(span for span in spans if span.name == "application work") + assert GenAI.GEN_AI_OPERATION_NAME not in custom.attributes + assert custom.attributes["step"] == "local" + + +def _assert_sandbox_custom_span_attributes(spans): + sandbox = next(span for span in spans if span.name == "sandbox.exec") + assert GenAI.GEN_AI_OPERATION_NAME not in sandbox.attributes + assert sandbox.attributes["sandbox.backend"] == "unix_local" + assert sandbox.attributes["sandbox.operation"] == "exec" + assert sandbox.attributes["sandbox.session.id"] == "session-123" + assert sandbox.attributes["exit_code"] == 0 + assert sandbox.attributes["process.exit.code"] == 0 + assert sandbox.attributes["error_code"] == "workspace_read_not_found" + assert sandbox.attributes["path.parts"] == ("case", "scenario.json") + assert json.loads(sandbox.attributes["nested"]) == { + "path": "missing.txt", + "error_code": "workspace_read_not_found", + } + assert "none" not in sandbox.attributes + assert GenAI.GEN_AI_OPERATION_NAME not in sandbox.attributes + assert "otel.status_code" not in sandbox.attributes + assert ( + sandbox.attributes[ServerAttributes.SERVER_ADDRESS] == "api.openai.com" + ) + + +def test_current_agents_sdk_span_types_do_not_emit_unknown_operations(): + class ErrorCode(Enum): + WORKSPACE_READ_NOT_FOUND = "workspace_read_not_found" + + instrumentor, exporter = _instrument_with_provider() + + try: + provider = agents_tracing.get_trace_provider() + with trace("workflow") as workflow: + task_span_obj = provider.create_span( + agents_tracing.TaskSpanData( + name="Agent workflow", + usage={"input_tokens": 20, "output_tokens": 6}, + ), + parent=workflow, + ) + task_span_obj.start() + task_span_obj.finish() + + turn_span_obj = provider.create_span( + agents_tracing.TurnSpanData( + turn=1, + agent_name="Support Agent", + usage={"input_tokens": 8, "output_tokens": 4}, + ), + parent=workflow, + ) + turn_span_obj.start() + turn_span_obj.finish() + + with mcp_tools_span( + server="filesystem", + result=["read_file", "write_file"], + ): + pass + with speech_group_span(input="say hello"): + pass + with custom_span(name="application work", data={"step": "local"}): + pass + with custom_span( + name="sandbox.exec", + data={ + "sandbox.backend": "unix_local", + "sandbox.operation": "exec", + "sandbox.session.id": "session-123", + "exit_code": 0, + "process.exit.code": 0, + "error_code": ErrorCode.WORKSPACE_READ_NOT_FOUND, + "path.parts": ["case", "scenario.json"], + "nested": { + "path": "missing.txt", + "error_code": ErrorCode.WORKSPACE_READ_NOT_FOUND, + }, + "none": None, + "gen_ai.operation.name": "unknown", + "otel.status_code": "ERROR", + "server.address": "malicious.example", + }, + ): + pass + + spans = exporter.get_finished_spans() + assert all( + span.attributes.get(GenAI.GEN_AI_OPERATION_NAME) != "unknown" + for span in spans + ) + + _assert_current_sdk_non_genai_spans(spans) + _assert_sandbox_custom_span_attributes(spans) + finally: + instrumentor.uninstrument() + exporter.clear() + + def test_generation_span_without_roles_uses_text_completion(): instrumentor, exporter = _instrument_with_provider() @@ -161,6 +349,7 @@ def test_function_span_records_tool_attributes(): tool_span.attributes[GenAI.GEN_AI_OPERATION_NAME] == "execute_tool" ) assert tool_span.attributes[GenAI.GEN_AI_TOOL_NAME] == "fetch_weather" + assert GenAI.GEN_AI_TOOL_CALL_ID in tool_span.attributes assert tool_span.attributes[GenAI.GEN_AI_TOOL_TYPE] == "function" assert tool_span.attributes[GEN_AI_PROVIDER_NAME] == "openai" finally: @@ -198,10 +387,38 @@ def test_agent_invoke_span_records_attributes(): exporter.clear() +def test_handoff_span_does_not_emit_undocumented_genai_operation(): + instrumentor, exporter = _instrument_with_provider() + + try: + provider = agents_tracing.get_trace_provider() + with trace("workflow") as workflow: + handoff = provider.create_span( + agents_tracing.HandoffSpanData( + from_agent="triage", to_agent="weather_specialist" + ), + parent=workflow, + ) + handoff.start() + handoff.finish() + + handoff_span = next( + span + for span in exporter.get_finished_spans() + if span.name == "handoff" + ) + assert GenAI.GEN_AI_OPERATION_NAME not in handoff_span.attributes + assert "gen_ai.handoff.from_agent" not in handoff_span.attributes + assert "gen_ai.handoff.to_agent" not in handoff_span.attributes + finally: + instrumentor.uninstrument() + exporter.clear() + + def _placeholder_message() -> dict[str, Any]: return { "role": "user", - "parts": [{"type": "text", "content": "readacted"}], + "parts": [{"type": "text", "content": "redacted"}], } @@ -237,7 +454,7 @@ def test_agent_content_aggregation_skips_duplicate_snapshots(): {"role": "user", "parts": [{"type": "text", "content": "hello"}]}, { "role": "user", - "parts": [{"type": "text", "content": "readacted"}], + "parts": [{"type": "text", "content": "redacted"}], }, ] ) @@ -397,6 +614,7 @@ def test_agent_span_collects_child_messages(): { "role": "assistant", "parts": [{"type": "text", "content": "hello"}], + "finish_reason": "unknown", } ] @@ -549,3 +767,35 @@ def __init__(self) -> None: finally: instrumentor.uninstrument() exporter.clear() + + +def test_metrics_are_emitted_with_configured_meter_provider(): + metric_reader = InMemoryMetricReader() + meter_provider = MeterProvider(metric_readers=[metric_reader]) + instrumentor, exporter = _instrument_with_provider( + meter_provider=meter_provider + ) + + try: + with trace("workflow"): + with generation_span( + input=[{"role": "user", "content": "hi"}], + output=[{"type": "text", "content": "hello"}], + model="gpt-4o-mini", + usage={"input_tokens": 12, "output_tokens": 3}, + ): + pass + + metrics_data = metric_reader.get_metrics_data() + metrics = { + metric.name: metric + for resource_metric in metrics_data.resource_metrics + for scope_metric in resource_metric.scope_metrics + for metric in scope_metric.metrics + } + assert gen_ai_metrics.GEN_AI_CLIENT_OPERATION_DURATION in metrics + assert gen_ai_metrics.GEN_AI_CLIENT_TOKEN_USAGE in metrics + finally: + instrumentor.uninstrument() + exporter.clear() + meter_provider.shutdown() diff --git a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/tests/test_z_span_processor_unit.py b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/tests/test_z_span_processor_unit.py index 0a2d2cda..58d46e1f 100644 --- a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/tests/test_z_span_processor_unit.py +++ b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/tests/test_z_span_processor_unit.py @@ -18,6 +18,7 @@ AgentSpanData, FunctionSpanData, GenerationSpanData, + HandoffSpanData, ResponseSpanData, ) @@ -184,11 +185,18 @@ def test_operation_and_span_naming(processor_setup): == sp.GenAIOperationName.CHAT ) + assert ( + processor._get_operation_name( + HandoffSpanData(from_agent=None, to_agent=None) + ) + is None + ) + class UnknownSpanData: pass unknown = UnknownSpanData() - assert processor._get_operation_name(unknown) == "unknown" + assert processor._get_operation_name(unknown) is None assert processor._get_span_kind(GenerationSpanData()) is SpanKind.CLIENT assert ( @@ -232,6 +240,7 @@ def test_attribute_builders(processor_setup): { "role": "assistant", "parts": [{"type": "text", "content": "hello"}], + "finish_reason": "stop", } ], system_instructions=[{"type": "text", "content": "be helpful"}], @@ -368,6 +377,47 @@ def __init__(self) -> None: assert function_attrs[sp.GEN_AI_TOOL_CALL_RESULT] == {"temperature": 70} assert function_attrs[sp.GEN_AI_OUTPUT_TYPE] == sp.GenAIOutputType.JSON + generation_no_finish_reason = GenerationSpanData( + input=[{"role": "user"}], + output=[{"type": "text", "content": "hello"}], + ) + generation_no_finish_attrs = _collect( + processor._get_attributes_from_generation_span_data( + generation_no_finish_reason, sp.ContentPayload() + ) + ) + assert generation_no_finish_attrs[sp.GEN_AI_RESPONSE_FINISH_REASONS] == [ + "unknown" + ] + assert processor._normalize_output_messages_to_role_parts( + generation_no_finish_reason + ) == [ + { + "role": "assistant", + "parts": [{"type": "text", "content": "hello"}], + "finish_reason": "unknown", + } + ] + + function_without_call_id = FunctionSpanData( + name="lookup_weather", input=None, output=None + ) + function_attrs_with_fallback = _collect( + processor._get_attributes_from_function_span_data( + function_without_call_id, sp.ContentPayload(), "span-42" + ) + ) + assert function_attrs_with_fallback[sp.GEN_AI_TOOL_CALL_ID] == "span-42" + + handoff_attrs = _collect( + processor._get_attributes_from_handoff_span_data( + HandoffSpanData(from_agent="triage", to_agent="weather") + ) + ) + assert sp.GEN_AI_OPERATION_NAME not in handoff_attrs + assert "gen_ai.handoff.from_agent" not in handoff_attrs + assert "gen_ai.handoff.to_agent" not in handoff_attrs + def test_extract_genai_attributes_unknown_type(processor_setup): processor, _ = processor_setup @@ -385,7 +435,6 @@ def __init__(self) -> None: ) ) assert attrs[sp.GEN_AI_PROVIDER_NAME] == "openai" - assert attrs[sp.GEN_AI_SYSTEM_KEY] == "openai" assert sp.GEN_AI_OPERATION_NAME not in attrs @@ -396,8 +445,7 @@ def test_span_status_helper(): assert status.status_code is StatusCode.ERROR assert status.description == "boom: bad" - ok_status = sp._get_span_status(SimpleNamespace(error=None)) - assert ok_status.status_code is StatusCode.OK + assert sp._get_span_status(SimpleNamespace(error=None)) is None @dataclass @@ -487,8 +535,8 @@ def test_span_lifecycle_and_shutdown(processor_setup): statuses["execute_tool lookup"].status_code is StatusCode.ERROR and statuses["execute_tool lookup"].description == "boom: bad" ) - assert statuses["invoke_agent agent"].status_code is StatusCode.OK - assert statuses["workflow"].status_code is StatusCode.OK + assert statuses["invoke_agent agent"].status_code is StatusCode.UNSET + assert statuses["workflow"].status_code is StatusCode.UNSET assert ( statuses["invoke_agent"].status_code is StatusCode.ERROR and statuses["invoke_agent"].description == "Application shutdown" @@ -498,10 +546,7 @@ def test_span_lifecycle_and_shutdown(processor_setup): and statuses["linger"].description == "Application shutdown" ) workflow_span = next(span for span in finished if span.name == "workflow") - assert ( - workflow_span.attributes[sp.GEN_AI_OPERATION_NAME] - == sp.GenAIOperationName.INVOKE_AGENT - ) + assert sp.GEN_AI_OPERATION_NAME not in workflow_span.attributes def test_chat_span_renamed_with_model(processor_setup): diff --git a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/tests/test_zz_coverage_improvements.py b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/tests/test_zz_coverage_improvements.py index d9880806..231af110 100644 --- a/instrumentation/opentelemetry-instrumentation-genai-openai-agents/tests/test_zz_coverage_improvements.py +++ b/instrumentation/opentelemetry-instrumentation-genai-openai-agents/tests/test_zz_coverage_improvements.py @@ -320,12 +320,12 @@ def test_normalize_output_type_returns_text_for_unknown(self): class TestGetSpanName: """Tests for get_span_name function.""" - def test_span_name_handoff(self): + def test_span_name_unsupported_operation(self): sp, _ = _get_modules() name = sp.get_span_name("agent_handoff", agent_name="target_agent") - assert name == "agent_handoff target_agent" + assert name == "agent_handoff" - def test_span_name_handoff_no_agent(self): + def test_span_name_unsupported_operation_no_agent(self): sp, _ = _get_modules() name = sp.get_span_name("agent_handoff") assert name == "agent_handoff" @@ -493,7 +493,7 @@ def test_normalize_with_redaction(self): processor = self._make_processor(sensitive=False) messages = [{"role": "user", "content": "Secret message"}] normalized = processor._normalize_messages_to_role_parts(messages) - assert normalized[0]["parts"][0]["content"] == "readacted" + assert normalized[0]["parts"][0]["content"] == "redacted" def test_normalize_empty_messages_returns_empty_list(self): processor = self._make_processor() @@ -546,18 +546,18 @@ def test_normalize_parts_tool_calls_and_tool_responses(self): tool_call = next( p for p in normalized[0]["parts"] if p["type"] == "tool_call" ) - assert tool_call["arguments"] == "readacted" + assert tool_call["arguments"] == "redacted" tool_response = next( p for p in normalized[0]["parts"] if p["type"] == "tool_call_response" ) - assert tool_response["result"] == "readacted" + assert tool_response["result"] == "redacted" assert normalized[1]["role"] == "tool" assert normalized[1]["parts"][0]["type"] == "tool_call_response" - assert normalized[1]["parts"][0]["result"] == "readacted" + assert normalized[1]["parts"][0]["result"] == "redacted" # ============================================================================ @@ -568,17 +568,6 @@ def test_normalize_parts_tool_calls_and_tool_responses(self): class TestHelperFunctions: """Tests for various helper functions.""" - def test_is_instance_of_single_class(self): - sp, _ = _get_modules() - assert sp._is_instance_of("hello", str) is True - assert sp._is_instance_of(123, str) is False - - def test_is_instance_of_tuple_classes(self): - sp, _ = _get_modules() - assert sp._is_instance_of("hello", (str, int)) is True - assert sp._is_instance_of(123, (str, int)) is True - assert sp._is_instance_of(3.14, (str, int)) is False - def test_span_status_helper(self): from types import SimpleNamespace @@ -592,8 +581,7 @@ def test_span_status_helper(self): assert status.status_code is StatusCode.ERROR assert status.description == "boom: bad" - ok_status = sp._get_span_status(SimpleNamespace(error=None)) - assert ok_status.status_code is StatusCode.OK + assert sp._get_span_status(SimpleNamespace(error=None)) is None # ============================================================================ @@ -695,7 +683,7 @@ class _Histogram: def __init__(self) -> None: self.records = [] - def record(self, value, attributes) -> None: + def record(self, value, attributes=None) -> None: self.records.append((value, attributes)) duration_histogram = _Histogram() @@ -709,8 +697,8 @@ def record(self, value, attributes) -> None: processor._token_usage_histogram = token_histogram span = SimpleNamespace( - started_at="2024-01-01T00:00:00+00:00", - ended_at="2024-01-01T00:00:02+00:00", + started_at="2024-01-01T00:00:00Z", + ended_at="2024-01-01T00:00:02Z", error={"type": "timeout"}, ) attributes = { diff --git a/uv.lock b/uv.lock index 871731a0..808d708d 100644 --- a/uv.lock +++ b/uv.lock @@ -1239,7 +1239,7 @@ instruments = [ [package.metadata] requires-dist = [ - { name = "openai-agents", marker = "extra == 'instruments'", specifier = ">=0.3.3" }, + { name = "openai-agents", marker = "extra == 'instruments'", specifier = ">=0.17.0" }, { name = "opentelemetry-api", specifier = ">=1.40" }, { name = "opentelemetry-instrumentation", specifier = ">=0.61b0" }, { name = "opentelemetry-semantic-conventions", specifier = ">=0.61b0" },