open-telemetry · Nik-Reddy · May 31, 2026 · lzchen · May 20, 2026 · Nik-Reddy
@@ -0,0 +1 @@
+Record `gen_ai.client.operation.time_to_first_chunk` and `gen_ai.client.operation.time_per_output_chunk` metrics for chat completion streams.
@@ -182,7 +182,11 @@ def __init__(
         invocation: InferenceInvocation,
         capture_content: bool,
     ) -> None:
-        super().__init__(stream)
+        super().__init__(
+            stream,
+            start_time_s=invocation.monotonic_start_s,
+            timing_target=invocation,
+        )
         self._self_invocation = invocation
         self._self_choice_buffers = []
         self._self_capture_content = capture_content
@@ -203,7 +207,11 @@ def __init__(
         invocation: InferenceInvocation,
         capture_content: bool,
     ) -> None:
-        super().__init__(stream)
+        super().__init__(
+            stream,
+            start_time_s=invocation.monotonic_start_s,
+            timing_target=invocation,
+        )
         self._self_invocation = invocation
         self._self_choice_buffers = []
         self._self_capture_content = capture_content

@@ -254,3 +254,172 @@ async def test_async_chat_completion_metrics(
     assert_all_metric_attributes(
         output_token_usage, latest_experimental_enabled
     )
+
+
+# TTFC and per-output-chunk histograms have a different attribute shape than
+# the duration/token histograms: streaming responses in the recorded cassettes
+# do not include system_fingerprint or service_tier, so we assert only the
+# core gen_ai.* attributes that should always be populated.
+def assert_streaming_metric_attributes(
+    data_point, latest_experimental_enabled, expected_request_model
+):
+    """Assert the core ``gen_ai.*`` attributes on a streaming histogram point.
+
+    Args:
+        data_point: Histogram data point whose ``attributes`` mapping is
+            checked.
+        latest_experimental_enabled: When true, the provider is looked up
+            under ``"gen_ai.provider.name"``; otherwise under
+            ``GenAIAttributes.GEN_AI_SYSTEM``.
+        expected_request_model: Model name the request was issued with.
+
+    Raises:
+        AssertionError: If an expected attribute is missing or carries an
+            unexpected value.
+    """
+    attributes = data_point.attributes
+
+    assert GenAIAttributes.GEN_AI_OPERATION_NAME in attributes
+    assert (
+        attributes[GenAIAttributes.GEN_AI_OPERATION_NAME]
+        == GenAIAttributes.GenAiOperationNameValues.CHAT.value
+    )
+
+    provider_name_attr_name = (
+        "gen_ai.provider.name"
+        if latest_experimental_enabled
+        else GenAIAttributes.GEN_AI_SYSTEM
+    )
+    assert provider_name_attr_name in attributes
+    assert (
+        attributes[provider_name_attr_name]
+        == GenAIAttributes.GenAiSystemValues.OPENAI.value
+    )
+
+    assert GenAIAttributes.GEN_AI_REQUEST_MODEL in attributes
+    assert (
+        attributes[GenAIAttributes.GEN_AI_REQUEST_MODEL]
+        == expected_request_model
+    )
+    # The response model only has to be present and non-empty.
+    assert GenAIAttributes.GEN_AI_RESPONSE_MODEL in attributes
+    assert attributes[GenAIAttributes.GEN_AI_RESPONSE_MODEL]
+
+    # Check presence first so a missing server address fails as an
+    # assertion, like every other attribute here, instead of a KeyError.
+    assert ServerAttributes.SERVER_ADDRESS in attributes
+    assert attributes[ServerAttributes.SERVER_ADDRESS] == "api.openai.com"
+
+
+def test_chat_completion_streaming_metrics(
+    metric_reader, openai_client, instrument_with_content, vcr
+):
+    """Cover the sync chat stream wrapper timing wiring end to end.
+
+    Streams a recorded chat completion through the instrumented client and
+    checks that both streaming histograms are emitted. The intent is that
+    removing timing_target=invocation from the sync ChatStreamWrapper in
+    chat_wrappers.py makes this test fail, not just the util-layer tests.
+    """
+    if not is_experimental_mode():
+        pytest.skip("new stream wrapper only")
+
+    latest_experimental_enabled = is_experimental_mode()
+    request_model = "gpt-4"
+
+    with vcr.use_cassette("test_chat_completion_streaming.yaml"):
+        stream = openai_client.chat.completions.create(
+            messages=USER_ONLY_PROMPT,
+            model=request_model,
+            stream=True,
+            stream_options={"include_usage": True},
+        )
+        # Consume the whole stream; the chunks themselves are not inspected.
+        for _chunk in stream:
+            pass
+
+    resource_metrics = metric_reader.get_metrics_data().resource_metrics
+    assert len(resource_metrics) == 1
+    recorded = resource_metrics[0].scope_metrics[0].metrics
+
+    def single_point(metric_name):
+        """Return the only data point of the metric named *metric_name*."""
+        matches = [m for m in recorded if m.name == metric_name]
+        assert matches
+        points = matches[0].data.data_points
+        assert len(points) == 1
+        return points[0]
+
+    first_chunk_point = single_point(
+        "gen_ai.client.operation.time_to_first_chunk"
+    )
+    assert first_chunk_point.count == 1
+    assert first_chunk_point.sum >= 0
+    assert_streaming_metric_attributes(
+        first_chunk_point, latest_experimental_enabled, request_model
+    )
+
+    per_chunk_point = single_point(
+        "gen_ai.client.operation.time_per_output_chunk"
+    )
+    assert per_chunk_point.count >= 1
+    assert per_chunk_point.sum >= 0
+    assert_streaming_metric_attributes(
+        per_chunk_point, latest_experimental_enabled, request_model
+    )
+
+
+@pytest.mark.asyncio()
+async def test_async_chat_completion_streaming_metrics(
+    metric_reader, async_openai_client, instrument_with_content, vcr
+):
+    """Cover the async chat stream wrapper timing wiring end to end.
+
+    The async wrapper has its own __init__ wiring in chat_wrappers.py,
+    separate from the sync one, so it gets dedicated coverage. Without
+    timing_target=invocation in AsyncChatStreamWrapper the util-layer tests
+    would keep passing while TTFC and per-output-chunk metrics silently
+    stop being recorded for async OpenAI streaming.
+    """
+    if not is_experimental_mode():
+        pytest.skip("new stream wrapper only")
+
+    latest_experimental_enabled = is_experimental_mode()
+    request_model = "gpt-4"
+
+    with vcr.use_cassette("test_async_chat_completion_streaming.yaml"):
+        stream = await async_openai_client.chat.completions.create(
+            messages=USER_ONLY_PROMPT,
+            model=request_model,
+            stream=True,
+            stream_options={"include_usage": True},
+        )
+        # Consume the whole stream; the chunks themselves are not inspected.
+        async for _chunk in stream:
+            pass
+
+    resource_metrics = metric_reader.get_metrics_data().resource_metrics
+    assert len(resource_metrics) == 1
+    recorded = resource_metrics[0].scope_metrics[0].metrics
+
+    def single_point(metric_name):
+        """Return the only data point of the metric named *metric_name*."""
+        matches = [m for m in recorded if m.name == metric_name]
+        assert matches
+        points = matches[0].data.data_points
+        assert len(points) == 1
+        return points[0]
+
+    first_chunk_point = single_point(
+        "gen_ai.client.operation.time_to_first_chunk"
+    )
+    assert first_chunk_point.count == 1
+    assert first_chunk_point.sum >= 0
+    assert_streaming_metric_attributes(
+        first_chunk_point, latest_experimental_enabled, request_model
+    )
+
+    per_chunk_point = single_point(
+        "gen_ai.client.operation.time_per_output_chunk"
+    )
+    assert per_chunk_point.count >= 1
+    assert per_chunk_point.sum >= 0
+    assert_streaming_metric_attributes(
+        per_chunk_point, latest_experimental_enabled, request_model
+    )
@@ -0,0 +1 @@
+Record streaming `gen_ai.client.operation.time_to_first_chunk` and `gen_ai.client.operation.time_per_output_chunk` histograms on inference invocations, and expose a `_TimingTarget` protocol on the shared stream wrappers so per-chunk gaps are pushed inline rather than buffered in the wrapper.
@@ -4,7 +4,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass, field
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 from opentelemetry._logs import Logger, LogRecord
 from opentelemetry.semconv._incubating.attributes import (
@@ -18,7 +18,6 @@
     get_content_attributes,
 )
 from opentelemetry.util.genai.completion_hook import CompletionHook
-from opentelemetry.util.genai.metrics import InvocationMetricsRecorder
 from opentelemetry.util.genai.types import (
     InputMessage,
     MessagePart,
@@ -30,8 +29,12 @@
     should_emit_event,
 )
 
+if TYPE_CHECKING:
+    from opentelemetry.util.genai.metrics import InvocationMetricsRecorder
+
 # TODO: Migrate to GenAI constants once available in semconv package
 _GEN_AI_REASONING_OUTPUT_TOKENS = "gen_ai.usage.reasoning.output_tokens"
+_GEN_AI_RESPONSE_TIME_TO_FIRST_CHUNK = "gen_ai.response.time_to_first_chunk"
 
 
 class InferenceInvocation(GenAIInvocation):
@@ -93,6 +96,9 @@ def __init__(
         self.cache_creation_input_tokens: int | None = None
         self.cache_read_input_tokens: int | None = None
         self.tool_definitions: list[ToolDefinition] | None = None
+        # Streaming timing fields (populated by stream wrappers)
+        self.ttfc_seconds: float | None = None
+        self.chunk_gap_seconds: list[float] = []
         self._start(self._get_base_attributes())
 
     def _get_message_attributes(self, *, for_span: bool) -> dict[str, Any]:
@@ -161,6 +167,10 @@ def _get_attributes(self) -> dict[str, Any]:
                 _GEN_AI_REASONING_OUTPUT_TOKENS,
                 self.thinking_tokens,
             ),
+            (
+                _GEN_AI_RESPONSE_TIME_TO_FIRST_CHUNK,
+                self.ttfc_seconds,
+            ),
         )
         attrs.update({k: v for k, v in optional_attrs if v is not None})
         return attrs
@@ -172,6 +182,29 @@ def _get_metric_attributes(self) -> dict[str, Any]:
         attrs.update(self.metric_attributes)
         return attrs
 
+    def _record_chunk_gap(self, gap: float) -> None:
+        """Append one time-per-output-chunk gap, in seconds, to the buffer.
+
+        The stream wrapper calls this for each chunk that arrives after the
+        first one. Gaps stay buffered on the invocation until
+        ``_consume_streaming_timing`` drains them when the invocation stops.
+        """
+        pending_gaps = self.chunk_gap_seconds
+        pending_gaps.append(gap)
+
+    def _consume_streaming_timing(
+        self,
+    ) -> tuple[float | None, list[float]]:
+        """Hand over the recorded TTFC and chunk gaps, clearing both fields.
+
+        Intended for InvocationMetricsRecorder: draining here means the
+        timing values are emitted a single time and are not kept on the
+        invocation past finalization.
+
+        Returns:
+            A ``(ttfc_seconds, chunk_gap_seconds)`` pair holding the values
+            that were stored before the reset.
+        """
+        drained = (self.ttfc_seconds, self.chunk_gap_seconds)
+        # Rebind to a fresh list so the returned gaps are not shared with
+        # anything recorded afterwards.
+        self.ttfc_seconds, self.chunk_gap_seconds = None, []
+        return drained
+
     def _get_metric_token_counts(self) -> dict[str, int]:
         counts: dict[str, int] = {}
         if self.input_tokens is not None:

@@ -87,6 +87,17 @@ def __init__(
         self._context_token: ContextToken | None = None
         self._monotonic_start_s: float | None = None
 
+    @property
+    def monotonic_start_s(self) -> float | None:
+        """Return the monotonic start time of this invocation, in seconds.
+
+        Streaming time-to-first-chunk (TTFC) is anchored on this value, so
+        instrumentations MUST NOT do meaningful work between
+        ``start_inference()`` and the wrapped SDK call: anything heavier
+        than building attribute dicts will silently inflate TTFC.
+        """
+        start_s = self._monotonic_start_s
+        return start_s
+
     def _start(self, attributes: dict[str, Any] | None = None) -> None:
         """Start the invocation span and attach it to the current context.
 

@@ -4,6 +4,14 @@
 from opentelemetry.metrics import Histogram, Meter
 from opentelemetry.semconv._incubating.metrics import gen_ai_metrics
 
+# Metric names for the streaming timing histograms, declared locally for now.
+# TODO: Migrate to GenAI constants once available in semconv package
+_GEN_AI_CLIENT_OPERATION_TIME_TO_FIRST_CHUNK = (
+    "gen_ai.client.operation.time_to_first_chunk"
+)
+_GEN_AI_CLIENT_OPERATION_TIME_PER_OUTPUT_CHUNK = (
+    "gen_ai.client.operation.time_per_output_chunk"
+)
+
 _GEN_AI_CLIENT_OPERATION_DURATION_BUCKETS = [
     0.01,
     0.02,
@@ -55,3 +63,29 @@ def create_token_histogram(meter: Meter) -> Histogram:
         unit="{token}",
         explicit_bucket_boundaries_advisory=_GEN_AI_CLIENT_TOKEN_USAGE_BUCKETS,
     )
+
+
+def create_ttfc_histogram(meter: Meter) -> Histogram:
+    """Create the time-to-first-chunk histogram on *meter*.
+
+    The instrument is measured in seconds and passes
+    ``_GEN_AI_CLIENT_OPERATION_DURATION_BUCKETS`` as its advisory explicit
+    bucket boundaries.
+    """
+    ttfc_description = (
+        "Time to receive the first chunk, measured from when the client "
+        "issues the generation request to when the first chunk is "
+        "received in the response stream."
+    )
+    histogram = meter.create_histogram(
+        name=_GEN_AI_CLIENT_OPERATION_TIME_TO_FIRST_CHUNK,
+        description=ttfc_description,
+        unit="s",
+        explicit_bucket_boundaries_advisory=_GEN_AI_CLIENT_OPERATION_DURATION_BUCKETS,
+    )
+    return histogram
+
+
+def create_time_per_chunk_histogram(meter: Meter) -> Histogram:
+    """Create the time-per-output-chunk histogram on *meter*.
+
+    The instrument is measured in seconds and passes
+    ``_GEN_AI_CLIENT_OPERATION_DURATION_BUCKETS`` as its advisory explicit
+    bucket boundaries.
+    """
+    per_chunk_description = (
+        "Time per output chunk, recorded for each chunk received after "
+        "the first one, measured as the time elapsed from the end of "
+        "the previous chunk to the end of the current chunk."
+    )
+    histogram = meter.create_histogram(
+        name=_GEN_AI_CLIENT_OPERATION_TIME_PER_OUTPUT_CHUNK,
+        description=per_chunk_description,
+        unit="s",
+        explicit_bucket_boundaries_advisory=_GEN_AI_CLIENT_OPERATION_DURATION_BUCKETS,
+    )
+    return histogram
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Record `gen_ai.client.operation.time_to_first_chunk` and `gen_ai.client.operation.time_per_output_chunk` metrics for chat completion streams.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Record streaming `gen_ai.client.operation.time_to_first_chunk` and `gen_ai.client.operation.time_per_output_chunk` histograms on inference invocations, and expose a `_TimingTarget` protocol on the shared stream wrappers so per-chunk gaps are pushed inline rather than buffered in the wrapper.