diff --git a/python/packages/anthropic/agent_framework_anthropic/_chat_client.py b/python/packages/anthropic/agent_framework_anthropic/_chat_client.py index c90b061b4ff..183ae6b6012 100644 --- a/python/packages/anthropic/agent_framework_anthropic/_chat_client.py +++ b/python/packages/anthropic/agent_framework_anthropic/_chat_client.py @@ -1024,8 +1024,10 @@ def _parse_usage_from_anthropic(self, usage: BetaUsage | BetaMessageDeltaUsage | usage_details["input_token_count"] = usage.input_tokens if usage.cache_creation_input_tokens is not None: usage_details["anthropic.cache_creation_input_tokens"] = usage.cache_creation_input_tokens # type: ignore[typeddict-unknown-key] + usage_details["cache_creation_input_token_count"] = usage.cache_creation_input_tokens if usage.cache_read_input_tokens is not None: usage_details["anthropic.cache_read_input_tokens"] = usage.cache_read_input_tokens # type: ignore[typeddict-unknown-key] + usage_details["cache_read_input_token_count"] = usage.cache_read_input_tokens return usage_details def _parse_contents_from_anthropic( diff --git a/python/packages/anthropic/tests/test_anthropic_client.py b/python/packages/anthropic/tests/test_anthropic_client.py index abad158b8c0..d2caad94793 100644 --- a/python/packages/anthropic/tests/test_anthropic_client.py +++ b/python/packages/anthropic/tests/test_anthropic_client.py @@ -2354,6 +2354,27 @@ def test_parse_usage_with_cache_tokens(mock_anthropic_client: MagicMock) -> None assert result["input_token_count"] == 100 assert result["anthropic.cache_creation_input_tokens"] == 20 assert result["anthropic.cache_read_input_tokens"] == 30 + assert result["cache_creation_input_token_count"] == 20 + assert result["cache_read_input_token_count"] == 30 + + +def test_parse_usage_preserves_zero_cache_tokens(mock_anthropic_client: MagicMock) -> None: + """Test parsing usage preserves zero-valued mapped cache tokens.""" + client = create_test_anthropic_client(mock_anthropic_client) + + mock_usage = MagicMock() + mock_usage.input_tokens = 100 + mock_usage.output_tokens = 50 + mock_usage.cache_creation_input_tokens = 0 + mock_usage.cache_read_input_tokens = 0 + + result = client._parse_usage_from_anthropic(mock_usage) + + assert result is not None + assert result["anthropic.cache_creation_input_tokens"] == 0 + assert result["cache_creation_input_token_count"] == 0 + assert result["anthropic.cache_read_input_tokens"] == 0 + assert result["cache_read_input_token_count"] == 0 # Code Execution Result Tests diff --git a/python/packages/core/agent_framework/_types.py b/python/packages/core/agent_framework/_types.py index f30fc04789d..1dd9deb4917 100644 --- a/python/packages/core/agent_framework/_types.py +++ b/python/packages/core/agent_framework/_types.py @@ -400,12 +400,18 @@ class UsageDetails(TypedDict, total=False, extra_items=int): # type: ignore[cal input_token_count: The number of input tokens used. output_token_count: The number of output tokens generated. total_token_count: The total number of tokens (input + output). + cache_creation_input_token_count: The number of input tokens written to a provider-managed cache. + cache_read_input_token_count: The number of input tokens served from a provider-managed cache. + reasoning_output_token_count: The number of output tokens used for reasoning. """ input_token_count: int | None output_token_count: int | None total_token_count: int | None + cache_creation_input_token_count: int | None + cache_read_input_token_count: int | None + reasoning_output_token_count: int | None def add_usage_details(usage1: UsageDetails | None, usage2: UsageDetails | None) -> UsageDetails: diff --git a/python/packages/core/agent_framework/observability.py b/python/packages/core/agent_framework/observability.py index a36b1f6aae2..d38cba768c5 100644 --- a/python/packages/core/agent_framework/observability.py +++ b/python/packages/core/agent_framework/observability.py @@ -201,6 +201,9 @@ class OtelAttr(str, Enum): # Usage attributes INPUT_TOKENS = "gen_ai.usage.input_tokens" OUTPUT_TOKENS = "gen_ai.usage.output_tokens" + CACHE_CREATION_INPUT_TOKENS = "gen_ai.usage.cache_creation.input_tokens" + CACHE_READ_INPUT_TOKENS = "gen_ai.usage.cache_read.input_tokens" + REASONING_OUTPUT_TOKENS = "gen_ai.usage.reasoning.output_tokens" # Tool attributes TOOL_CALL_ID = "gen_ai.tool.call.id" TOOL_DESCRIPTION = "gen_ai.tool.description" @@ -327,6 +330,20 @@ def __str__(self) -> str: "tool_calls": "tool_call", "length": "length", } +USAGE_DETAIL_TO_OTEL_ATTR: Final[tuple[tuple[str, OtelAttr], ...]] = ( + ("input_token_count", OtelAttr.INPUT_TOKENS), + ("output_token_count", OtelAttr.OUTPUT_TOKENS), + ("cache_creation_input_token_count", OtelAttr.CACHE_CREATION_INPUT_TOKENS), + ("cache_read_input_token_count", OtelAttr.CACHE_READ_INPUT_TOKENS), + ("reasoning_output_token_count", OtelAttr.REASONING_OUTPUT_TOKENS), + ("anthropic.cache_creation_input_tokens", OtelAttr.CACHE_CREATION_INPUT_TOKENS), + ("anthropic.cache_read_input_tokens", OtelAttr.CACHE_READ_INPUT_TOKENS), + ("openai.cached_input_tokens", OtelAttr.CACHE_READ_INPUT_TOKENS), + ("prompt/cached_tokens", OtelAttr.CACHE_READ_INPUT_TOKENS), + ("openai.reasoning_tokens", OtelAttr.REASONING_OUTPUT_TOKENS), + ("completion/reasoning_tokens", OtelAttr.REASONING_OUTPUT_TOKENS), + ("reasoning_tokens", OtelAttr.REASONING_OUTPUT_TOKENS), +) # region Telemetry utils @@ -2350,12 +2367,16 @@ def _apply_accumulated_usage(attributes: dict[str, Any], captured_fields: set[st accumulated = INNER_ACCUMULATED_USAGE.get() if not accumulated: return - input_tokens = accumulated.get("input_token_count") - if input_tokens: - attributes[OtelAttr.INPUT_TOKENS] = input_tokens - output_tokens = accumulated.get("output_token_count") - if output_tokens: - attributes[OtelAttr.OUTPUT_TOKENS] = output_tokens + _apply_usage_attributes(attributes, accumulated) + + +def _apply_usage_attributes(attributes: dict[str, Any], usage: Mapping[str, Any]) -> None: + """Apply known usage details as standard OTel GenAI attributes.""" + for usage_key, otel_attr in USAGE_DETAIL_TO_OTEL_ATTR: + value = usage.get(usage_key) + if value is None or isinstance(value, bool) or not isinstance(value, int): + continue + attributes.setdefault(otel_attr, value) def _get_response_attributes( @@ -2378,12 +2399,7 @@ def _get_response_attributes( if model := getattr(response, "model", None): attributes[OtelAttr.RESPONSE_MODEL] = model if capture_usage and (usage := response.usage_details): - input_tokens = usage.get("input_token_count") - if input_tokens: - attributes[OtelAttr.INPUT_TOKENS] = input_tokens - output_tokens = usage.get("output_token_count") - if output_tokens: - attributes[OtelAttr.OUTPUT_TOKENS] = output_tokens + _apply_usage_attributes(attributes, usage) return attributes @@ -2407,9 +2423,9 @@ def _capture_response( """Set the response for a given span.""" span.set_attributes(attributes) attrs: dict[str, Any] = {k: v for k, v in attributes.items() if k in GEN_AI_METRIC_ATTRIBUTES} - if token_usage_histogram and (input_tokens := attributes.get(OtelAttr.INPUT_TOKENS)): + if token_usage_histogram and (input_tokens := attributes.get(OtelAttr.INPUT_TOKENS)) is not None: token_usage_histogram.record(input_tokens, attributes={**attrs, OtelAttr.T_TYPE: OtelAttr.T_TYPE_INPUT}) - if token_usage_histogram and (output_tokens := attributes.get(OtelAttr.OUTPUT_TOKENS)): + if token_usage_histogram and (output_tokens := attributes.get(OtelAttr.OUTPUT_TOKENS)) is not None: token_usage_histogram.record(output_tokens, {**attrs, OtelAttr.T_TYPE: OtelAttr.T_TYPE_OUTPUT}) if operation_duration_histogram and duration is not None: if OtelAttr.ERROR_TYPE in attributes: diff --git a/python/packages/core/tests/core/test_observability.py b/python/packages/core/tests/core/test_observability.py index 46f6e2c1517..6b9465b167e 100644 --- a/python/packages/core/tests/core/test_observability.py +++ b/python/packages/core/tests/core/test_observability.py @@ -2154,6 +2154,58 @@ def test_get_response_attributes_with_usage(): assert result[OtelAttr.OUTPUT_TOKENS] == 50 +def test_get_response_attributes_with_additional_usage(): + """Test _get_response_attributes maps additional usage details to OTel attributes.""" + from unittest.mock import Mock + + from agent_framework.observability import OtelAttr, _get_response_attributes + + response = Mock() + response.response_id = None + response.finish_reason = None + response.raw_representation = None + response.usage_details = { + "input_token_count": 0, + "output_token_count": 50, + "cache_creation_input_token_count": 10, + "cache_read_input_token_count": 0, + "reasoning_output_token_count": 30, + } + + attrs = {} + result = _get_response_attributes(attrs, response) + + assert result[OtelAttr.INPUT_TOKENS] == 0 + assert result[OtelAttr.OUTPUT_TOKENS] == 50 + assert result[OtelAttr.CACHE_CREATION_INPUT_TOKENS] == 10 + assert result[OtelAttr.CACHE_READ_INPUT_TOKENS] == 0 + assert result[OtelAttr.REASONING_OUTPUT_TOKENS] == 30 + + +def test_get_response_attributes_maps_legacy_usage_keys(): + """Test _get_response_attributes maps legacy provider usage keys to standard OTel attributes.""" + from unittest.mock import Mock + + from agent_framework.observability import OtelAttr, _get_response_attributes + + response = Mock() + response.response_id = None + response.finish_reason = None + response.raw_representation = None + response.usage_details = { + "anthropic.cache_creation_input_tokens": 12, + "openai.cached_input_tokens": 0, + "completion/reasoning_tokens": 34, + } + + attrs = {} + result = _get_response_attributes(attrs, response) + + assert result[OtelAttr.CACHE_CREATION_INPUT_TOKENS] == 12 + assert result[OtelAttr.CACHE_READ_INPUT_TOKENS] == 0 + assert result[OtelAttr.REASONING_OUTPUT_TOKENS] == 34 + + def test_get_response_attributes_capture_usage_false(): """Test _get_response_attributes skips usage when capture_usage is False.""" from unittest.mock import Mock @@ -2164,13 +2216,22 @@ def test_get_response_attributes_capture_usage_false(): response.response_id = None response.finish_reason = None response.raw_representation = None - response.usage_details = {"input_token_count": 100, "output_token_count": 50} + response.usage_details = { + "input_token_count": 100, + "output_token_count": 50, + "cache_creation_input_token_count": 10, + "cache_read_input_token_count": 20, + "reasoning_output_token_count": 30, + } attrs = {} result = _get_response_attributes(attrs, response, capture_usage=False) assert OtelAttr.INPUT_TOKENS not in result assert OtelAttr.OUTPUT_TOKENS not in result + assert OtelAttr.CACHE_CREATION_INPUT_TOKENS not in result + assert OtelAttr.CACHE_READ_INPUT_TOKENS not in result + assert OtelAttr.REASONING_OUTPUT_TOKENS not in result def test_get_response_attributes_capture_response_id_false(): @@ -2933,6 +2994,23 @@ def test_capture_response(span_exporter: InMemorySpanExporter): assert spans[0].attributes.get(OtelAttr.OUTPUT_TOKENS) == 50 +def test_capture_response_records_zero_token_usage(): + """Test _capture_response records zero-valued token usage.""" + from agent_framework.observability import OtelAttr, _capture_response + + span = Mock() + token_histogram = Mock() + attrs = { + OtelAttr.INPUT_TOKENS: 0, + OtelAttr.OUTPUT_TOKENS: 0, + } + + _capture_response(span=span, attributes=attrs, token_usage_histogram=token_histogram) + + span.set_attributes.assert_called_once_with(attrs) + assert token_histogram.record.call_count == 2 + + async def test_layer_ordering_span_sequence_with_function_calling(span_exporter: InMemorySpanExporter): """Test that with correct layer ordering, spans appear in the expected sequence. @@ -3937,11 +4015,21 @@ class _InstrumentedAgent(AgentTelemetryLayer, RawAgent): Content.from_function_call(call_id="call_1", name="get_weather", arguments='{"city": "Seattle"}') ], ), - usage_details=UsageDetails(input_token_count=2239, output_token_count=192), + usage_details=UsageDetails( + input_token_count=2239, + output_token_count=192, + cache_read_input_token_count=100, + reasoning_output_token_count=25, + ), ), ChatResponse( messages=Message(role="assistant", contents=["The weather in Seattle is sunny."]), - usage_details=UsageDetails(input_token_count=2569, output_token_count=99), + usage_details=UsageDetails( + input_token_count=2569, + output_token_count=99, + cache_read_input_token_count=200, + reasoning_output_token_count=0, + ), ), ] @@ -3965,12 +4053,18 @@ class _InstrumentedAgent(AgentTelemetryLayer, RawAgent): # Individual chat spans retain their own usage assert chat_spans[0].attributes.get(OtelAttr.INPUT_TOKENS) == 2239 assert chat_spans[0].attributes.get(OtelAttr.OUTPUT_TOKENS) == 192 + assert chat_spans[0].attributes.get(OtelAttr.CACHE_READ_INPUT_TOKENS) == 100 + assert chat_spans[0].attributes.get(OtelAttr.REASONING_OUTPUT_TOKENS) == 25 assert chat_spans[1].attributes.get(OtelAttr.INPUT_TOKENS) == 2569 assert chat_spans[1].attributes.get(OtelAttr.OUTPUT_TOKENS) == 99 + assert chat_spans[1].attributes.get(OtelAttr.CACHE_READ_INPUT_TOKENS) == 200 + assert chat_spans[1].attributes.get(OtelAttr.REASONING_OUTPUT_TOKENS) == 0 # The invoke_agent span must report the aggregate across all LLM round-trips assert agent_span.attributes.get(OtelAttr.INPUT_TOKENS) == 2239 + 2569 assert agent_span.attributes.get(OtelAttr.OUTPUT_TOKENS) == 192 + 99 + assert agent_span.attributes.get(OtelAttr.CACHE_READ_INPUT_TOKENS) == 100 + 200 + assert agent_span.attributes.get(OtelAttr.REASONING_OUTPUT_TOKENS) == 25 @pytest.mark.parametrize("enable_sensitive_data", [False], indirect=True) diff --git a/python/packages/openai/agent_framework_openai/_chat_client.py b/python/packages/openai/agent_framework_openai/_chat_client.py index 2d5cda9ee5d..14237e06a02 100644 --- a/python/packages/openai/agent_framework_openai/_chat_client.py +++ b/python/packages/openai/agent_framework_openai/_chat_client.py @@ -2979,10 +2979,16 @@ def _parse_usage_from_openai(self, usage: ResponseUsage) -> UsageDetails | None: output_token_count=usage.output_tokens, total_token_count=usage.total_tokens, ) - if usage.input_tokens_details and usage.input_tokens_details.cached_tokens: - details["openai.cached_input_tokens"] = usage.input_tokens_details.cached_tokens # type: ignore[typeddict-unknown-key] - if usage.output_tokens_details and usage.output_tokens_details.reasoning_tokens: - details["openai.reasoning_tokens"] = usage.output_tokens_details.reasoning_tokens # type: ignore[typeddict-unknown-key] + if usage.input_tokens_details: + cached_tokens = cast("int | None", getattr(usage.input_tokens_details, "cached_tokens", None)) + if cached_tokens is not None: + details["openai.cached_input_tokens"] = cached_tokens # type: ignore[typeddict-unknown-key] + details["cache_read_input_token_count"] = cached_tokens + if usage.output_tokens_details: + reasoning_tokens = cast("int | None", getattr(usage.output_tokens_details, "reasoning_tokens", None)) + if reasoning_tokens is not None: + details["openai.reasoning_tokens"] = reasoning_tokens # type: ignore[typeddict-unknown-key] + details["reasoning_output_token_count"] = reasoning_tokens return details def _get_metadata_from_response(self, output: Any) -> dict[str, Any]: diff --git a/python/packages/openai/agent_framework_openai/_chat_completion_client.py b/python/packages/openai/agent_framework_openai/_chat_completion_client.py index 0fd14aa2ef5..9cca30a4179 100644 --- a/python/packages/openai/agent_framework_openai/_chat_completion_client.py +++ b/python/packages/openai/agent_framework_openai/_chat_completion_client.py @@ -765,15 +765,17 @@ def _parse_usage_from_openai(self, usage: CompletionUsage) -> UsageDetails: details["completion/accepted_prediction_tokens"] = tokens # type: ignore[typeddict-unknown-key] if tokens := usage.completion_tokens_details.audio_tokens: details["completion/audio_tokens"] = tokens # type: ignore[typeddict-unknown-key] - if tokens := usage.completion_tokens_details.reasoning_tokens: + if (tokens := usage.completion_tokens_details.reasoning_tokens) is not None: details["completion/reasoning_tokens"] = tokens # type: ignore[typeddict-unknown-key] + details["reasoning_output_token_count"] = tokens if tokens := usage.completion_tokens_details.rejected_prediction_tokens: details["completion/rejected_prediction_tokens"] = tokens # type: ignore[typeddict-unknown-key] if usage.prompt_tokens_details: if tokens := usage.prompt_tokens_details.audio_tokens: details["prompt/audio_tokens"] = tokens # type: ignore[typeddict-unknown-key] - if tokens := usage.prompt_tokens_details.cached_tokens: + if (tokens := usage.prompt_tokens_details.cached_tokens) is not None: details["prompt/cached_tokens"] = tokens # type: ignore[typeddict-unknown-key] + details["cache_read_input_token_count"] = tokens return details def _parse_text_from_openai(self, choice: Choice | ChunkChoice) -> Content | None: diff --git a/python/packages/openai/tests/openai/test_openai_chat_client.py b/python/packages/openai/tests/openai/test_openai_chat_client.py index 9bc598d3cbe..2992e0f41d5 100644 --- a/python/packages/openai/tests/openai/test_openai_chat_client.py +++ b/python/packages/openai/tests/openai/test_openai_chat_client.py @@ -3301,6 +3301,7 @@ def test_usage_details_with_cached_tokens() -> None: assert details is not None assert details["input_token_count"] == 200 assert details["openai.cached_input_tokens"] == 25 + assert details["cache_read_input_token_count"] == 25 def test_usage_details_with_reasoning_tokens() -> None: @@ -3319,6 +3320,49 @@ def test_usage_details_with_reasoning_tokens() -> None: assert details is not None assert details["output_token_count"] == 80 assert details["openai.reasoning_tokens"] == 30 + assert details["reasoning_output_token_count"] == 30 + + +def test_usage_details_with_zero_cached_and_reasoning_tokens() -> None: + """Test _parse_usage_from_openai preserves zero-valued mapped usage details.""" + client = OpenAIChatClient(model="test-model", api_key="test-key") + + mock_usage = MagicMock() + mock_usage.input_tokens = 150 + mock_usage.output_tokens = 80 + mock_usage.total_tokens = 230 + mock_usage.input_tokens_details = MagicMock() + mock_usage.input_tokens_details.cached_tokens = 0 + mock_usage.output_tokens_details = MagicMock() + mock_usage.output_tokens_details.reasoning_tokens = 0 + + details = client._parse_usage_from_openai(mock_usage) # type: ignore + assert details is not None + assert details["openai.cached_input_tokens"] == 0 + assert details["cache_read_input_token_count"] == 0 + assert details["openai.reasoning_tokens"] == 0 + assert details["reasoning_output_token_count"] == 0 + + +def test_usage_details_omits_missing_cached_and_reasoning_tokens() -> None: + """Test _parse_usage_from_openai omits missing mapped usage details.""" + client = OpenAIChatClient(model="test-model", api_key="test-key") + + mock_usage = MagicMock() + mock_usage.input_tokens = 150 + mock_usage.output_tokens = 80 + mock_usage.total_tokens = 230 + mock_usage.input_tokens_details = MagicMock() + mock_usage.input_tokens_details.cached_tokens = None + mock_usage.output_tokens_details = MagicMock() + mock_usage.output_tokens_details.reasoning_tokens = None + + details = client._parse_usage_from_openai(mock_usage) # type: ignore + assert details is not None + assert "openai.cached_input_tokens" not in details + assert "cache_read_input_token_count" not in details + assert "openai.reasoning_tokens" not in details + assert "reasoning_output_token_count" not in details def test_get_metadata_from_response() -> None: diff --git a/python/packages/openai/tests/openai/test_openai_chat_completion_client.py b/python/packages/openai/tests/openai/test_openai_chat_completion_client.py index 85e12b8626d..ba8ca08d34f 100644 --- a/python/packages/openai/tests/openai/test_openai_chat_completion_client.py +++ b/python/packages/openai/tests/openai/test_openai_chat_completion_client.py @@ -1099,6 +1099,31 @@ def test_usage_content_in_streaming_response( assert usage_content.usage_details["total_token_count"] == 150 +def test_parse_usage_includes_standard_and_legacy_mapped_token_details() -> None: + """Test _parse_usage_from_openai emits standard and legacy mapped token details.""" + client = OpenAIChatCompletionClient(model="test-model", api_key="test-key") + + mock_usage = MagicMock() + mock_usage.prompt_tokens = 100 + mock_usage.completion_tokens = 50 + mock_usage.total_tokens = 150 + mock_usage.completion_tokens_details = MagicMock() + mock_usage.completion_tokens_details.accepted_prediction_tokens = None + mock_usage.completion_tokens_details.audio_tokens = None + mock_usage.completion_tokens_details.reasoning_tokens = 0 + mock_usage.completion_tokens_details.rejected_prediction_tokens = None + mock_usage.prompt_tokens_details = MagicMock() + mock_usage.prompt_tokens_details.audio_tokens = None + mock_usage.prompt_tokens_details.cached_tokens = 0 + + details = client._parse_usage_from_openai(mock_usage) # type: ignore[arg-type] + + assert details["completion/reasoning_tokens"] == 0 + assert details["reasoning_output_token_count"] == 0 + assert details["prompt/cached_tokens"] == 0 + assert details["cache_read_input_token_count"] == 0 + + def test_streaming_chunk_with_usage_and_text( openai_unit_test_env: dict[str, str], ) -> None: