From 0e35e5f0b10c2c9db10094031a2ac92e59fff9f3 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 23 Apr 2026 13:40:27 -0500
Subject: [PATCH 01/96] feat: agentic benchmark ingest + UI with offload-mode
 halo

Adds agentic_traces scenario end-to-end:
- Schema migrations for agentic scenario, availability, and KV offload mode
- DB ingest/ETL + query updates to carry scenario, offload_mode, and
  server/theoretical cache-hit rates through to the API layer
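- Frontend types, filters (GlobalFilterContext / InferenceContext /
  ChartControls), URL state, and tooltip rows for agentic-only fields
- ChartControls: scenario selector replaces the sequence selector, plus a
  latency percentile selector (`i_pctl` URL param) shown for agentic traces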
- ScatterGraph: subtle dashed halo on Pareto-frontier points that used
  KV offload so the tradeoff is visible at a glance
---
 packages/app/cypress/support/mock-data.ts     |   2 +
 .../app/src/app/api/unofficial-run/route.ts   |   2 +
 .../src/components/GlobalFilterContext.tsx    |  12 +-
 .../components/inference/InferenceContext.tsx |  15 ++-
 .../inference/hooks/useChartData.ts           |  34 +++--
 .../app/src/components/inference/types.ts     |  26 ++++
 .../components/inference/ui/ChartControls.tsx |  27 +++-
 .../components/inference/ui/ScatterGraph.tsx  |  21 +++
 .../inference/utils/tooltipUtils.ts           |  54 +++++++-
 .../app/src/components/ui/chart-selectors.tsx | 124 ++++++++++++++++++
 .../unofficial-run-provider.test.ts           |   2 +
 .../components/unofficial-run-provider.tsx    |   4 +-
 packages/app/src/lib/api.ts                   |  14 +-
 .../app/src/lib/benchmark-transform.test.ts   |   2 +
 packages/app/src/lib/benchmark-transform.ts   |  65 ++++++++-
 packages/app/src/lib/data-mappings.ts         |  72 +++++++++-
 packages/app/src/lib/url-state.ts             |   2 +
 packages/constants/src/models.ts              |  17 +++
 .../db/migrations/002_agentic_scenario.sql    |  30 +++++
 .../migrations/003_agentic_availability.sql   |  21 +++
 packages/db/migrations/004_offload_mode.sql   |  42 ++++++
 packages/db/src/etl/benchmark-ingest.ts       |  28 ++--
 packages/db/src/etl/benchmark-mapper.ts       |  45 ++++++-
 packages/db/src/ingest-ci-run.ts              |   6 +-
 packages/db/src/ingest-gcs-backup.ts          |   6 +-
 packages/db/src/ingest-supplemental.ts        |  14 +-
 packages/db/src/json-provider.ts              |   8 +-
 packages/db/src/queries/benchmarks.ts         |  13 +-
 packages/db/src/queries/workflow-info.ts      |  15 ++-
 29 files changed, 645 insertions(+), 78 deletions(-)
 create mode 100644 packages/db/migrations/002_agentic_scenario.sql
 create mode 100644 packages/db/migrations/003_agentic_availability.sql
 create mode 100644 packages/db/migrations/004_offload_mode.sql

diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts
index e6720c0b..7a4f59a9 100644
--- a/packages/app/cypress/support/mock-data.ts
+++ b/packages/app/cypress/support/mock-data.ts
@@ -189,6 +189,8 @@ export function createMockInferenceContext(
     workflowInfo: null,
     selectedYAxisMetric: 'y_tpPerGpu',
     setSelectedYAxisMetric: namedStub('setSelectedYAxisMetric'),
+    selectedPercentile: 'median',
+    setSelectedPercentile: namedStub('setSelectedPercentile'),
     selectedXAxisMetric: null,
     setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'),
     selectedE2eXAxisMetric: null,
diff --git a/packages/app/src/app/api/unofficial-run/route.ts b/packages/app/src/app/api/unofficial-run/route.ts
index 79ac0665..dbfb9c33 100644
--- a/packages/app/src/app/api/unofficial-run/route.ts
+++ b/packages/app/src/app/api/unofficial-run/route.ts
@@ -49,6 +49,8 @@ export function normalizeArtifactRows(
       decode_num_workers: config.decodeNumWorkers,
       num_prefill_gpu: config.numPrefillGpu,
       num_decode_gpu: config.numDecodeGpu,
+      benchmark_type: params.benchmarkType,
+      offload_mode: params.offloadMode,
       isl: params.isl,
       osl: params.osl,
       conc: params.conc,
diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx
index 65f510cd..f603081a 100644
--- a/packages/app/src/components/GlobalFilterContext.tsx
+++ b/packages/app/src/components/GlobalFilterContext.tsx
@@ -11,7 +11,7 @@ import {
   useState,
 } from 'react';
 
-import { DISPLAY_MODEL_TO_DB, islOslToSequence } from '@semianalysisai/inferencex-constants';
+import { DISPLAY_MODEL_TO_DB, rowToSequence } from '@semianalysisai/inferencex-constants';
 
 import { useAvailability } from '@/hooks/api/use-availability';
 import { useWorkflowInfo } from '@/hooks/api/use-workflow-info';
@@ -172,11 +172,7 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) {
   const availableSequences = useMemo(() => {
     if (!availabilityRows) return SEQUENCE_OPTIONS;
     const seqs = [
-      ...new Set(
-        modelRows
-          .map((r) => islOslToSequence(r.isl, r.osl))
-          .filter((s): s is Sequence => s !== null),
-      ),
+      ...new Set(modelRows.map((r) => rowToSequence(r)).filter((s): s is Sequence => s !== null)),
     ];
     return seqs.length > 0 ? seqs : SEQUENCE_OPTIONS;
   }, [availabilityRows, modelRows]);
@@ -190,7 +186,7 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) {
   // Precisions available for the selected model + sequence
   const availablePrecisions = useMemo(() => {
     if (!availabilityRows) return ['fp4'];
-    const rows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence);
+    const rows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence);
     const precs = [...new Set(rows.map((r) => r.precision))].toSorted();
     return precs.length > 0 ? precs : ['fp4'];
   }, [availabilityRows, modelRows, effectiveSequence]);
@@ -205,7 +201,7 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) {
   // Dates available for selected model + sequence + precisions
   const availableDates = useMemo(() => {
     if (!availabilityRows) return [];
-    const seqRows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence);
+    const seqRows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence);
     const rows = seqRows.filter((r) => effectivePrecisions.includes(r.precision));
     if (rows.length === 0) {
       return [...new Set(seqRows.map((r) => r.date))].toSorted();
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 7fa416fd..6f45d8d7 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -11,7 +11,7 @@ import {
   useState,
 } from 'react';
 
-import { DISPLAY_MODEL_TO_DB, islOslToSequence } from '@semianalysisai/inferencex-constants';
+import { DISPLAY_MODEL_TO_DB, rowToSequence } from '@semianalysisai/inferencex-constants';
 import { track } from '@/lib/analytics';
 import { FAVORITE_PRESETS, type FavoritePreset } from '@/components/favorites/favorite-presets';
 
@@ -110,6 +110,11 @@ export function InferenceProvider({
   const [selectedE2eXAxisMetric, setSelectedE2eXAxisMetric] = useState<string | null>(
     () => getUrlParam('i_e2e_xmetric') || null,
   );
+  // Latency percentile applied to the chart x-axis for agentic scenarios.
+  // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts are meant to ignore it.
+  const [selectedPercentile, setSelectedPercentile] = useState<string>(
+    () => getUrlParam('i_pctl') || 'median',
+  );
   const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>(
     () => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto',
   );
@@ -163,6 +168,7 @@ export function InferenceProvider({
     effectiveRunDate,
     isActive,
     latestDate,
+    selectedPercentile,
   );
 
   // For GPU comparison date picker — use shared availability data from global filters
@@ -176,7 +182,7 @@ export function InferenceProvider({
     if (!availabilityRows) return availableDates;
     const rows = availabilityRows.filter((r) => {
       if (!dbModelKeys.includes(r.model)) return false;
-      if (islOslToSequence(r.isl, r.osl) !== effectiveSequence) return false;
+      if (rowToSequence(r) !== effectiveSequence) return false;
       if (!effectivePrecisions.includes(r.precision)) return false;
       if (!r.hardware) return false;
       const hwKey = buildAvailabilityHwKey(r.hardware, r.framework, r.spec_method, r.disagg);
@@ -201,7 +207,7 @@ export function InferenceProvider({
     const hwKeys = new Set<string>();
     for (const r of availabilityRows) {
       if (!dbModelKeys.includes(r.model)) continue;
-      if (islOslToSequence(r.isl, r.osl) !== effectiveSequence) continue;
+      if (rowToSequence(r) !== effectiveSequence) continue;
       if (!effectivePrecisions.includes(r.precision)) continue;
       if (!r.hardware) continue;
       const hwKey = buildAvailabilityHwKey(r.hardware, r.framework, r.spec_method, r.disagg);
@@ -589,6 +595,7 @@ export function InferenceProvider({
   useUrlStateSync(
     {
       i_metric: selectedYAxisMetric,
+      i_pctl: selectedPercentile,
       i_gpus: selectedGPUs.join(','),
       i_dates: selectedDates.join(','),
       i_dstart: selectedDateRange.startDate,
@@ -783,6 +790,8 @@ export function InferenceProvider({
       workflowInfo,
       selectedYAxisMetric,
       setSelectedYAxisMetric: setSelectedYAxisMetricAndClear,
+      selectedPercentile,
+      setSelectedPercentile,
       selectedGPUs,
       setSelectedGPUs: setSelectedGPUsAndClear,
       availableGPUs,
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 625e63ab..81ab0780 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -1,7 +1,7 @@
 import { useMemo, useRef } from 'react';
 
 import { useQueries } from '@tanstack/react-query';
-import { sequenceToIslOsl } from '@semianalysisai/inferencex-constants';
+import { rowToSequence } from '@semianalysisai/inferencex-constants';
 
 import chartDefinitions from '@/components/inference/inference-chart-config.json';
 import type {
@@ -15,7 +15,7 @@ import type {
 import { filterDataByCostLimit } from '@/components/inference/utils';
 import { useBenchmarks, benchmarkQueryOptions } from '@/hooks/api/use-benchmarks';
 import { GPU_ALIAS_TO_CANONICAL, getModelSortIndex } from '@/lib/constants';
-import { transformBenchmarkRows } from '@/lib/benchmark-transform';
+import { transformBenchmarkRows, withPercentile } from '@/lib/benchmark-transform';
 import type { Model, Sequence } from '@/lib/data-mappings';
 import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils';
 
@@ -79,6 +79,7 @@ export function useChartData(
   selectedRunDate?: string,
   enabled = true,
   latestAvailableDate?: string,
+  selectedPercentile = 'median',
 ) {
   // When the selected date is the latest available, use '' (empty string) to match
   // the initial no-date query key, reusing the eagerly-fetched benchmarks from the
@@ -119,11 +120,13 @@ export function useChartData(
   // Merge main rows with comparison date rows.
   // Stamp each row with the *requested* date (not the actual DB date) so that
   // GPUGraph's activeDates filter (keyed by user-selected date) matches the points.
-  const sequenceIslOsl = useMemo(() => sequenceToIslOsl(selectedSequence), [selectedSequence]);
+  //
+  // rowToSequence handles both fixed-seq (via isl/osl) and agentic (via
+  // benchmark_type), so one filter covers every scenario.
   const rows = useMemo(() => {
-    if (!allRows || !sequenceIslOsl) return [];
-    const seqFilter = (r: { isl: number; osl: number }) =>
-      r.isl === sequenceIslOsl.isl && r.osl === sequenceIslOsl.osl;
+    if (!allRows) return [];
+    const seqFilter = (r: { isl: number | null; osl: number | null; benchmark_type: string }) =>
+      rowToSequence(r) === selectedSequence;
     const seqFiltered = allRows.filter(seqFilter);
 
     // For each (hw, framework, spec_method, disagg, precision) group, keep only
@@ -150,14 +153,14 @@ export function useChartData(
         .map((r) => ({ ...r, date: comparisonDates[i], actualDate: r.date })),
     );
     return [...mainRows, ...extraRows];
-  }, [allRows, sequenceIslOsl, comparisonDates, comparisonDataKey, selectedRunDate]);
+  }, [allRows, selectedSequence, comparisonDates, comparisonDataKey, selectedRunDate]);
 
   // Transform filtered rows into chart data
   const { chartData, hardwareConfig: rawHardwareConfig } = useMemo(() => {
     if (rows.length === 0)
       return { chartData: [] as InferenceData[][], hardwareConfig: {} as HardwareConfig };
-    return transformBenchmarkRows(rows);
-  }, [rows]);
+    return transformBenchmarkRows(rows, selectedPercentile);
+  }, [rows, selectedPercentile]);
 
   // Sort hardware config — stabilize reference when keys haven't changed.
   // Different sequences for the same model often have the same GPU configs,
@@ -192,8 +195,11 @@ export function useChartData(
       (chartDefinitions as ChartDefinition[]).map((chartDef) => {
         const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey;
 
-        // Determine dynamic x-axis
-        let xAxisField: keyof AggDataEntry = chartDef.x;
+        // Default x-axis = chart's natural latency metric, percentile-adjusted
+        // for the agentic case (median_e2el → p99_e2el etc.). For non-agentic
+        // scenarios `withPercentile` is a no-op when percentile === 'median'.
+        const naturalX = withPercentile(chartDef.x, selectedPercentile) as keyof AggDataEntry;
+        let xAxisField: keyof AggDataEntry = naturalX;
         let xAxisLabel = chartDef.x_label;
 
         const metricTitle =
@@ -232,8 +238,10 @@ export function useChartData(
         // (e.g. interactivity → TTFT: "higher is better" → "lower is better").
         // E2EL → TTFT keeps the same direction ("lower is better" for both),
         // so no roofline flip is needed for the e2e chart.
+        // Compare against `naturalX` (percentile-adjusted) — switching the
+        // percentile of the same logical metric is NOT a flip.
         const xAxisFlipped =
-          xAxisField !== chartDef.x && !(chartDef.chartType === 'e2e' && isTtftOverride);
+          xAxisField !== naturalX && !(chartDef.chartType === 'e2e' && isTtftOverride);
 
         const yLabelKey = `${selectedYAxisMetric}_label` as keyof ChartDefinition;
         const dynamicYLabel = chartDef[yLabelKey];
@@ -261,7 +269,7 @@ export function useChartData(
           xAxisField,
         };
       }),
-    [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric],
+    [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric, selectedPercentile],
   );
 
   // Build renderable graphs (data processing + stable chart definitions)
diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index a23707ba..53c8d84c 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -88,6 +88,29 @@ export interface AggDataEntry {
   actualDate?: string;
   /** URL to the GitHub Actions workflow run that produced this data point. */
   run_url?: string;
+  /** Benchmark scenario: `single_turn` (fixed-seq isl/osl) or `agentic_traces`. */
+  benchmark_type?: string;
+  /** ISL in tokens — null for agentic_traces. */
+  isl?: number | null;
+  /** OSL in tokens — null for agentic_traces. */
+  osl?: number | null;
+  // ── Agentic-only fields (populated from metrics JSONB for `agentic_traces` rows) ──
+  /** "on" | "off" — whether KV cache offload to CPU was enabled. */
+  offload_mode?: string;
+  /** Actual server-observed GPU prefix-cache hit rate (0..1). */
+  server_gpu_cache_hit_rate?: number;
+  /** Actual server-observed CPU prefix-cache hit rate (0..1). */
+  server_cpu_cache_hit_rate?: number;
+  /** Infinite-cache theoretical hit rate (0..1) computed from trace. */
+  theoretical_cache_hit_rate?: number;
+  /** Total requests attempted during the window. */
+  num_requests_total?: number;
+  /** Requests that completed successfully. */
+  num_requests_successful?: number;
+  /** Total prompt tokens served. */
+  total_prompt_tokens?: number;
+  /** Total generated (output) tokens. */
+  total_generation_tokens?: number;
 }
 
 /**
@@ -468,6 +491,9 @@ export interface InferenceChartContextType {
   workflowInfo: any;
   selectedYAxisMetric: string;
   setSelectedYAxisMetric: (metric: string) => void;
+  /** Latency percentile for the x-axis under agentic scenarios (median/p90/p99/p99.9). */
+  selectedPercentile: string;
+  setSelectedPercentile: (p: string) => void;
   selectedXAxisMetric: string | null;
   setSelectedXAxisMetric: (metric: string | null) => void;
   selectedE2eXAxisMetric: string | null;
diff --git a/packages/app/src/components/inference/ui/ChartControls.tsx b/packages/app/src/components/inference/ui/ChartControls.tsx
index 5f8e7787..e4f55ad7 100644
--- a/packages/app/src/components/inference/ui/ChartControls.tsx
+++ b/packages/app/src/components/inference/ui/ChartControls.tsx
@@ -1,11 +1,14 @@
 'use client';
 
+import { useEffect, useState } from 'react';
+
 import { track } from '@/lib/analytics';
 
 import { useInference } from '@/components/inference/InferenceContext';
 import {
   ModelSelector,
-  SequenceSelector,
+  ScenarioSelector,
+  PercentileSelector,
   PrecisionSelector,
 } from '@/components/ui/chart-selectors';
 import { DateRangePicker } from '@/components/ui/date-range-picker';
@@ -23,7 +26,7 @@ import {
 import { TooltipProvider } from '@/components/ui/tooltip';
 import chartDefinitions from '@/components/inference/inference-chart-config.json';
 import type { ChartDefinition } from '@/components/inference/types';
-import type { Model, Sequence } from '@/lib/data-mappings';
+import { Sequence, type Model, type Percentile } from '@/lib/data-mappings';
 
 // Build Y-axis metric options from static chart config JSON — available immediately, no API wait
 const METRIC_GROUPS = [
@@ -78,6 +81,13 @@ interface ChartControlsProps {
 }
 
 export default function ChartControls({ hideGpuComparison = false }: ChartControlsProps) {
+  // The percentile selector is rendered conditionally on `selectedSequence`,
+  // which on the client is hydrated from URL params. SSR doesn't see the URL,
+  // so deferring the conditional until after mount keeps the initial DOM
+  // identical between server and client (avoids hydration warnings).
+  const [mounted, setMounted] = useState(false);
+  useEffect(() => setMounted(true), []);
+
   const {
     selectedModel,
     setSelectedModel,
@@ -87,6 +97,8 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
     setSelectedPrecisions,
     selectedYAxisMetric,
     setSelectedYAxisMetric,
+    selectedPercentile,
+    setSelectedPercentile,
     graphs,
     selectedGPUs,
     setSelectedGPUs,
@@ -203,12 +215,19 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
             availableModels={availableModels}
             data-testid="model-selector"
           />
-          <SequenceSelector
+          <ScenarioSelector
             value={selectedSequence}
             onChange={handleSequenceChange}
             availableSequences={availableSequences}
-            data-testid="sequence-selector"
+            data-testid="scenario-selector"
           />
+          {mounted && selectedSequence === Sequence.AgenticTraces && (
+            <PercentileSelector
+              value={selectedPercentile}
+              onChange={(p: Percentile) => setSelectedPercentile(p)}
+              data-testid="percentile-selector"
+            />
+          )}
           <PrecisionSelector
             value={selectedPrecisions}
             onChange={handlePrecisionChange}
diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 2e078f89..15bb60f0 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -1512,6 +1512,24 @@ const ScatterGraph = React.memo(
             .attr('pointer-events', 'none');
         });
 
+        // Offload halo: dashed ring on frontier points that used KV offload
+        zoomGroup.selectAll<SVGGElement, InferenceData>('.dot-group').each(function (d) {
+          const onFrontier = optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`);
+          const showHalo = onFrontier && d.offload_mode === 'on';
+          d3.select(this)
+            .selectAll<SVGCircleElement, boolean>('.offload-halo')
+            .data(showHalo ? [true] : [])
+            .join('circle')
+            .attr('class', 'offload-halo')
+            .attr('r', POINT_SIZE + 4)
+            .attr('fill', 'none')
+            .attr('stroke', 'var(--foreground)')
+            .attr('stroke-width', 1.5)
+            .attr('stroke-dasharray', '3 2')
+            .attr('opacity', 0.9)
+            .attr('pointer-events', 'none');
+        });
+
         // Double-click to track/untrack
         zoomGroup
           .selectAll<SVGGElement, InferenceData>('.dot-group')
@@ -1567,6 +1585,9 @@ const ScatterGraph = React.memo(
         chartDefinition.chartType,
         xScaleConfig._isLog,
         yScaleConfig.type,
+        optimalPointKeys,
+        getCssColor,
+        resolveColor,
       ],
     );
 
diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts
index e88e9930..7391225e 100644
--- a/packages/app/src/components/inference/utils/tooltipUtils.ts
+++ b/packages/app/src/components/inference/utils/tooltipUtils.ts
@@ -88,6 +88,51 @@ const runLinkHTML = (runUrl?: string) =>
 const tooltipLine = (label: string, value: string | number) =>
   `<div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;"><strong>${label}:</strong> ${value}</div>`;
 
+/** True when a metric is a usable number (not undefined, null, or NaN). */
+const hasMetricValue = (v: number | null | undefined): v is number =>
+  v !== undefined && v !== null && !Number.isNaN(v);
+
+/** Formats a 0..1 fraction as a one-decimal percentage; null when the value is absent. */
+const formatPct = (v: number | null | undefined): string | null =>
+  hasMetricValue(v) ? `${(v * 100).toFixed(1)}%` : null;
+
+/**
+ * Agentic-only tooltip rows: offload mode, KV cache hit rates, request
+ * success, token totals. Returns an empty string for non-agentic rows.
+ * Metrics that are undefined, null, or NaN are omitted rather than printed.
+ */
+const generateAgenticHTML = (d: InferenceData): string => {
+  if (d.benchmark_type !== 'agentic_traces') return '';
+
+  const parts: string[] = [];
+  if (d.offload_mode) {
+    parts.push(tooltipLine('Offload Mode', d.offload_mode.toUpperCase()));
+  }
+
+  const gpuHit = formatPct(d.server_gpu_cache_hit_rate);
+  const cpuHit = formatPct(d.server_cpu_cache_hit_rate);
+  const theoHit = formatPct(d.theoretical_cache_hit_rate);
+  if (gpuHit) parts.push(tooltipLine('GPU Cache Hit Rate', gpuHit));
+  if (cpuHit) parts.push(tooltipLine('CPU Cache Hit Rate', cpuHit));
+  if (theoHit) parts.push(tooltipLine('Theoretical Cache Hit Rate', theoHit));
+
+  const { num_requests_total: total, num_requests_successful: ok } = d;
+  if (hasMetricValue(total) && hasMetricValue(ok)) {
+    // No success percentage when nothing was attempted (would be a division by zero).
+    const successPct = total > 0 ? ` (${((ok / total) * 100).toFixed(0)}%)` : '';
+    parts.push(tooltipLine('Requests', `${ok} / ${total}${successPct}`));
+  }
+
+  if (hasMetricValue(d.total_prompt_tokens)) {
+    parts.push(tooltipLine('Prompt Tokens', formatNumber(d.total_prompt_tokens)));
+  }
+  if (hasMetricValue(d.total_generation_tokens)) {
+    parts.push(tooltipLine('Generated Tokens', formatNumber(d.total_generation_tokens)));
+  }
+
+  return parts.join('');
+};
+
 /**
  * Generates HTML for the parallelism configuration section of a tooltip.
  * Falls back to GPU count for old data without parallelism fields.
@@ -177,9 +222,10 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Concurrency:</strong> ${d.conc}
       </div>
-      <div style="color: var(--muted-foreground); font-size: 11px;">
+      <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Precision:</strong> ${d.precision.toUpperCase()}
       </div>
+      ${generateAgenticHTML(d)}
       ${runLinkHTML(runUrl)}
       ${
         isPinned
@@ -231,9 +277,10 @@ export const generateOverlayTooltipContent = (config: OverlayTooltipConfig): str
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Concurrency:</strong> ${d.conc}
       </div>
-      <div style="color: var(--muted-foreground); font-size: 11px;">
+      <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Precision:</strong> ${d.precision.toUpperCase()}
       </div>
+      ${generateAgenticHTML(d)}
     </div>
   `;
 };
@@ -292,9 +339,10 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Concurrency:</strong> ${d.conc}
       </div>
-      <div style="color: var(--muted-foreground); font-size: 11px;">
+      <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
         <strong>Precision:</strong> ${d.precision.toUpperCase()}
       </div>
+      ${generateAgenticHTML(d)}
       ${runLinkHTML(runUrl)}
     </div>
   `;
diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx
index 75e2f257..1c843e12 100644
--- a/packages/app/src/components/ui/chart-selectors.tsx
+++ b/packages/app/src/components/ui/chart-selectors.tsx
@@ -19,12 +19,16 @@ import {
   type Model,
   type Precision,
   type Sequence,
+  type Percentile,
+  PERCENTILE_OPTIONS,
   getModelCategory,
   getModelLabel,
+  getPercentileLabel,
   getPrecisionLabel,
   getSequenceCategory,
   getSequenceLabel,
   groupByCategory,
+  sequenceKind,
 } from '@/lib/data-mappings';
 
 function DeprecatedLabel({ reason }: { reason: string }) {
@@ -167,6 +171,126 @@ export function SequenceSelector({
   );
 }
 
+interface ScenarioSelectorProps {
+  id?: string;
+  value: string;
+  onChange: (value: Sequence) => void;
+  availableSequences: string[];
+  'data-testid'?: string;
+}
+
+/**
+ * Scenario selector (label "Scenario": ISL/OSL framing only fits the fixed-seq subset).
+ * Fixed-seq-len rows are grouped under "Fixed Sequence Length", deprecated ones last;
+ * agentic-trace rows render flat below. Tracks `selector_scenario_changed` on change.
+ */
+export function ScenarioSelector({
+  id = 'scenario-select',
+  value,
+  onChange,
+  availableSequences,
+  'data-testid': testId,
+}: ScenarioSelectorProps) {
+  const fixedSeq = availableSequences.filter((s) => sequenceKind(s as Sequence) === 'fixed-seq');
+  const agentic = availableSequences.filter((s) => sequenceKind(s as Sequence) === 'agentic');
+  const fixedGroups = groupByCategory(fixedSeq, (s) => getSequenceCategory(s as Sequence));
+
+  return (
+    <div className="flex flex-col space-y-1.5 lg:col-span-1">
+      <LabelWithTooltip
+        htmlFor={id}
+        label="Scenario"
+        tooltip="Benchmark scenario. Fixed Sequence Length runs use a defined input/output token count (ISL/OSL). Agentic Traces replay real agentic workloads with variable inputs/outputs."
+      />
+      <Select
+        value={value}
+        onValueChange={(v) => {
+          track('selector_scenario_changed', { scenario: v });
+          onChange(v as Sequence);
+        }}
+      >
+        <SelectTrigger id={id} data-testid={testId} className="w-full">
+          <SelectValue />
+        </SelectTrigger>
+        <SelectContent>
+          {fixedSeq.length > 0 && (
+            <SelectGroup>
+              <SelectLabel>Fixed Sequence Length</SelectLabel>
+              {fixedGroups.default.map((seq) => (
+                <SelectItem key={seq} value={seq}>
+                  {getSequenceLabel(seq as Sequence)}
+                </SelectItem>
+              ))}
+              {fixedGroups.deprecated.length > 0 && (
+                <>
+                  <DeprecatedLabel reason="CI capacity was reallocated to agentic coding and multi-turn chat scenarios." />
+                  {fixedGroups.deprecated.map((seq) => (
+                    <SelectItem key={seq} value={seq}>
+                      {getSequenceLabel(seq as Sequence)}
+                    </SelectItem>
+                  ))}
+                </>
+              )}
+            </SelectGroup>
+          )}
+          {agentic.map((seq) => (
+            <SelectItem key={seq} value={seq}>
+              {getSequenceLabel(seq as Sequence)}
+            </SelectItem>
+          ))}
+        </SelectContent>
+      </Select>
+    </div>
+  );
+}
+
+interface PercentileSelectorProps {
+  id?: string;
+  value: string;
+  onChange: (value: Percentile) => void;
+  'data-testid'?: string;
+}
+
+/**
+ * Latency percentile selector for agentic-trace charts (one item per PERCENTILE_OPTIONS).
+ * The selected value rewrites the chart x-axis metric from `median_*` to
+ * `{percentile}_*`, so p99 plots p99 e2e latency / interactivity instead of the median.
+ */
+export function PercentileSelector({
+  id = 'percentile-select',
+  value,
+  onChange,
+  'data-testid': testId,
+}: PercentileSelectorProps) {
+  return (
+    <div className="flex flex-col space-y-1.5 lg:col-span-1">
+      <LabelWithTooltip
+        htmlFor={id}
+        label="Latency Percentile"
+        tooltip="Percentile of the latency distribution used for the chart x-axis. Agentic runs carry median/p90/p99/p99.9 variants; switch percentiles to see tail-latency behavior."
+      />
+      <Select
+        value={value}
+        onValueChange={(v) => {
+          track('selector_percentile_changed', { percentile: v });
+          onChange(v as Percentile);
+        }}
+      >
+        <SelectTrigger id={id} data-testid={testId} className="w-full">
+          <SelectValue />
+        </SelectTrigger>
+        <SelectContent>
+          {PERCENTILE_OPTIONS.map((p) => (
+            <SelectItem key={p} value={p}>
+              {getPercentileLabel(p)}
+            </SelectItem>
+          ))}
+        </SelectContent>
+      </Select>
+    </div>
+  );
+}
+
 interface PrecisionSelectorProps {
   id?: string;
   value: string[];
diff --git a/packages/app/src/components/unofficial-run-provider.test.ts b/packages/app/src/components/unofficial-run-provider.test.ts
index f4263d2c..05b522c5 100644
--- a/packages/app/src/components/unofficial-run-provider.test.ts
+++ b/packages/app/src/components/unofficial-run-provider.test.ts
@@ -29,6 +29,8 @@ function stubRow(overrides: Partial<BenchmarkRow> = {}): BenchmarkRow {
     decode_num_workers: 0,
     num_prefill_gpu: 8,
     num_decode_gpu: 8,
+    benchmark_type: 'single_turn',
+    offload_mode: 'off',
     isl: 1024,
     osl: 1024,
     conc: 128,
diff --git a/packages/app/src/components/unofficial-run-provider.tsx b/packages/app/src/components/unofficial-run-provider.tsx
index 2dccdf7f..42530a51 100644
--- a/packages/app/src/components/unofficial-run-provider.tsx
+++ b/packages/app/src/components/unofficial-run-provider.tsx
@@ -12,7 +12,7 @@ import {
 
 import type { ChartDefinition, HardwareConfig, InferenceData } from '@/components/inference/types';
 import { UnofficialBanner } from '@/components/ui/unofficial-banner';
-import { DB_MODEL_TO_DISPLAY, islOslToSequence } from '@semianalysisai/inferencex-constants';
+import { DB_MODEL_TO_DISPLAY, rowToSequence } from '@semianalysisai/inferencex-constants';
 import { computeToggle } from '@/hooks/useTogglableSet';
 import type { BenchmarkRow, EvalRow } from '@/lib/api';
 import { normalizeEvalHardwareKey } from '@/lib/chart-utils';
@@ -93,7 +93,7 @@ export function buildChartData(benchmarks: BenchmarkRow[]): UnofficialChartData
   const groups = new Map<string, BenchmarkRow[]>();
   for (const row of benchmarks) {
     const displayModel = DB_MODEL_TO_DISPLAY[row.model] ?? row.model;
-    const sequence = islOslToSequence(row.isl, row.osl);
+    const sequence = rowToSequence(row);
     if (!sequence) continue;
     const key = `${displayModel}_${sequence}`;
     if (!groups.has(key)) groups.set(key, []);
diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts
index 11ba4521..240251c3 100644
--- a/packages/app/src/lib/api.ts
+++ b/packages/app/src/lib/api.ts
@@ -23,9 +23,13 @@ export interface BenchmarkRow {
   decode_num_workers: number;
   num_prefill_gpu: number;
   num_decode_gpu: number;
-  isl: number;
-  osl: number;
+  benchmark_type: string;
+  // Null for agentic_traces rows; numeric for single_turn fixed-seq rows.
+  isl: number | null;
+  osl: number | null;
   conc: number;
+  /** KV-cache offload mode: 'on' | 'off'. Defaults to 'off' for fixed-seq. */
+  offload_mode: string;
   image: string | null;
   metrics: Record<string, number>;
   date: string;
@@ -140,13 +144,15 @@ export function fetchWorkflowInfo(date: string, signal?: AbortSignal) {
 
 export interface AvailabilityRow {
   model: string;
-  isl: number;
-  osl: number;
+  // Null for agentic_traces rows; numeric for single_turn fixed-seq rows.
+  isl: number | null;
+  osl: number | null;
   precision: string;
   hardware: string;
   framework: string;
   spec_method: string;
   disagg: boolean;
+  benchmark_type: string;
   date: string;
 }
 
diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts
index be76438e..6a6c97c8 100644
--- a/packages/app/src/lib/benchmark-transform.test.ts
+++ b/packages/app/src/lib/benchmark-transform.test.ts
@@ -23,6 +23,8 @@ function makeRow(overrides: Partial<BenchmarkRow> = {}): BenchmarkRow {
     decode_num_workers: 0,
     num_prefill_gpu: 8,
     num_decode_gpu: 8,
+    benchmark_type: 'single_turn',
+    offload_mode: 'off',
     isl: 1024,
     osl: 1024,
     conc: 64,
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index 107f0b12..69745da2 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -15,9 +15,39 @@ import { createChartDataPoint, getHardwareKey } from '@/lib/chart-utils';
 import { getHardwareConfig } from '@/lib/constants';
 import type { BenchmarkRow } from '@/lib/api';
 
+/**
+ * Agentic trace-replay runs (`benchmark_type === 'agentic_traces'`) emit ttft/ttlt/itl
+ * but not the intvty/e2el/tpot keys the chart pipeline expects. Bridge them for each
+ * statistic prefix (mean/median/p90/p99/p99.9):
+ *   e2el ≡ ttlt, tpot ≡ itl, intvty ≡ 1/itl (only when itl > 0).
+ * @returns Only the aliases absent from `m` (existing keys win); `m` is not mutated.
+ */
+function agenticAliases(m: Record<string, number>): Record<string, number> {
+  const out: Record<string, number> = {};
+  // 'p99.9' is a selectable x-axis percentile, so it needs aliases like the others.
+  for (const prefix of ['mean', 'median', 'p90', 'p99', 'p99.9']) {
+    const itl = m[`${prefix}_itl`];
+    const ttlt = m[`${prefix}_ttlt`];
+    if (m[`${prefix}_e2el`] === undefined && ttlt !== undefined) out[`${prefix}_e2el`] = ttlt;
+    if (m[`${prefix}_tpot`] === undefined && itl !== undefined) out[`${prefix}_tpot`] = itl;
+    if (m[`${prefix}_intvty`] === undefined && itl !== undefined && itl > 0) {
+      out[`${prefix}_intvty`] = 1 / itl;
+    }
+  }
+  return out;
+}
+
 /** Convert a DB benchmark row to an AggDataEntry. */
 export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
-  const m = row.metrics;
+  const isAgentic = row.benchmark_type === 'agentic_traces';
+  const m = isAgentic ? { ...row.metrics, ...agenticAliases(row.metrics) } : row.metrics;
+  // Prefer the dedicated column (added in migration 004); fall back to the
+  // legacy stash inside `metrics` for any rows ingested before that column
+  // existed.
+  const rawMetrics = row.metrics as Record<string, unknown>;
+  const offloadMode =
+    row.offload_mode ??
+    (typeof rawMetrics.offload_mode === 'string' ? rawMetrics.offload_mode : undefined);
   return {
     hw: row.hardware,
     framework: row.framework,
@@ -68,6 +98,17 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
     date: row.date,
     actualDate: (row as any).actualDate ?? row.date,
     run_url: row.run_url ?? undefined,
+    benchmark_type: row.benchmark_type,
+    isl: row.isl,
+    osl: row.osl,
+    offload_mode: offloadMode,
+    server_gpu_cache_hit_rate: m.server_gpu_cache_hit_rate,
+    server_cpu_cache_hit_rate: m.server_cpu_cache_hit_rate,
+    theoretical_cache_hit_rate: m.theoretical_cache_hit_rate,
+    num_requests_total: m.num_requests_total,
+    num_requests_successful: m.num_requests_successful,
+    total_prompt_tokens: m.total_prompt_tokens,
+    total_generation_tokens: m.total_generation_tokens,
   };
 }
 
@@ -77,13 +118,30 @@ interface PreparedEntry {
   date: string;
 }
 
+/**
+ * Rewrite a chart x-axis key to use a different latency percentile prefix
+ * (`median_intvty` → `p99_intvty`). Keys without a known leading prefix are
+ * returned unchanged. `percentile` is inserted literally (no `$&`-style expansion).
+ */
+export function withPercentile(key: string, percentile: string): string {
+  return key.replace(/^(?:mean|median|p90|p99(?:\.9)?)_/, () => `${percentile}_`);
+}
+
 /**
  * Transform raw BenchmarkRow[] into chart-ready InferenceData[][] and HardwareConfig.
  * Returns one InferenceData[] per chart definition (e2e, interactivity).
  *
  * Converts rows to AggDataEntry once, then reuses for each chart definition.
+ *
+ * @param percentile Optional latency percentile for the chart x-axis
+ *   (default 'median'). Swaps `median_intvty`/`median_e2el` in the chart
+ *   definition for the chosen percentile — only agentic rows carry the
+ *   full set (median/p90/p99/p99.9) so this mainly affects that scenario.
  */
-export function transformBenchmarkRows(rows: BenchmarkRow[]): {
+export function transformBenchmarkRows(
+  rows: BenchmarkRow[],
+  percentile = 'median',
+): {
   chartData: InferenceData[][];
   hardwareConfig: HardwareConfig;
 } {
@@ -109,13 +167,14 @@ export function transformBenchmarkRows(rows: BenchmarkRow[]): {
 
   // Phase 2: Build chart data per chart definition (reusing prepared entries)
   const chartData = (chartDefinitions as ChartDefinition[]).map((chartDef) => {
+    const xKey = withPercentile(chartDef.x, percentile);
     const groupedByHw: Record<string, InferenceData[]> = {};
 
     for (const { entry, hwKey, date } of prepared) {
       const dataPoint = createChartDataPoint(
         date,
         entry,
-        chartDef.x as keyof AggDataEntry,
+        xKey as keyof AggDataEntry,
         chartDef.y as keyof AggDataEntry,
         hwKey,
       );
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index 823b6823..8900f50e 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -102,17 +102,77 @@ export enum Sequence {
   OneK_OneK = '1k/1k',
   OneK_EightK = '1k/8k',
   EightK_OneK = '8k/1k',
+  AgenticTraces = 'agentic-traces',
 }
 
-const SEQUENCE_CONFIG: Record<Sequence, { label: string; compact: string; category: CategoryTag }> =
-  {
-    [Sequence.OneK_OneK]: { label: '1K / 1K', compact: '1k1k', category: 'default' },
-    [Sequence.OneK_EightK]: { label: '1K / 8K', compact: '1k8k', category: 'deprecated' },
-    [Sequence.EightK_OneK]: { label: '8K / 1K', compact: '8k1k', category: 'default' },
-  };
+/** Top-level scenario kind: fixed-seq sequences share one selector group; agentic stands alone. */
+export type ScenarioKind = 'fixed-seq' | 'agentic';
+
+/**
+ * Classify `seq`: `Sequence.AgenticTraces` is 'agentic', anything else is 'fixed-seq'.
+ */
+export function sequenceKind(seq: Sequence): ScenarioKind {
+  return seq !== Sequence.AgenticTraces ? 'fixed-seq' : 'agentic';
+}
+
+const SEQUENCE_CONFIG: Record<
+  Sequence,
+  { label: string; compact: string; category: CategoryTag; kind: ScenarioKind }
+> = {
+  [Sequence.OneK_OneK]: {
+    label: '1K / 1K',
+    compact: '1k1k',
+    category: 'default',
+    kind: 'fixed-seq',
+  },
+  [Sequence.OneK_EightK]: {
+    label: '1K / 8K',
+    compact: '1k8k',
+    category: 'deprecated',
+    kind: 'fixed-seq',
+  },
+  [Sequence.EightK_OneK]: {
+    label: '8K / 1K',
+    compact: '8k1k',
+    category: 'default',
+    kind: 'fixed-seq',
+  },
+  [Sequence.AgenticTraces]: {
+    label: 'Agentic Traces',
+    compact: 'agentic',
+    category: 'default',
+    kind: 'agentic',
+  },
+};
 
 export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
 
+/**
+ * Percentile of the latency distribution used for the chart x-axis when
+ * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants
+ * for ttft, ttlt (=e2el), and itl (and intvty derived from itl) — pick which
+ * slice to plot.
+ */
+export enum Percentile {
+  Median = 'median',
+  P90 = 'p90',
+  P99 = 'p99',
+  P99_9 = 'p99.9',
+}
+
+const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
+  [Percentile.Median]: { label: 'p50 (median)' },
+  [Percentile.P90]: { label: 'p90' },
+  [Percentile.P99]: { label: 'p99' },
+  [Percentile.P99_9]: { label: 'p99.9' },
+};
+
+export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[];
+
+export function getPercentileLabel(p: Percentile): string {
+  return PERCENTILE_CONFIG[p]?.label ?? p;
+}
+
 export const DEPRECATED_SEQUENCES: ReadonlySet<Sequence> = new Set(
   (Object.entries(SEQUENCE_CONFIG) as [Sequence, (typeof SEQUENCE_CONFIG)[Sequence]][])
     .filter(([, c]) => c.category === 'deprecated')
diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts
index 3947488f..fb2e9d70 100644
--- a/packages/app/src/lib/url-state.ts
+++ b/packages/app/src/lib/url-state.ts
@@ -22,6 +22,7 @@ const URL_STATE_KEYS = [
   'i_seq',
   'i_prec',
   'i_metric',
+  'i_pctl',
   'i_xmetric',
   'i_e2e_xmetric',
   'i_scale',
@@ -61,6 +62,7 @@ export const PARAM_DEFAULTS: Record<UrlStateKey, string> = {
   i_seq: '8k/1k',
   i_prec: 'fp4',
   i_metric: 'y_tpPerGpu',
+  i_pctl: 'median',
   i_xmetric: 'p99_ttft',
   i_e2e_xmetric: '',
   i_scale: 'auto',
diff --git a/packages/constants/src/models.ts b/packages/constants/src/models.ts
index 6d646f08..d9a3d2d1 100644
--- a/packages/constants/src/models.ts
+++ b/packages/constants/src/models.ts
@@ -53,3 +53,20 @@ export function islOslToSequence(isl: number, osl: number): string | null {
   };
   return map[`${isl}_${osl}`] ?? null;
 }
+
+/**
+ * Resolve the sequence (scenario) string for a benchmark/availability row.
+ * - `benchmark_type === 'agentic_traces'` always gives `'agentic-traces'`;
+ *   isl/osl are not consulted for those rows.
+ * - Every other row is resolved through `islOslToSequence(isl, osl)`.
+ * Gives `null` when isl or osl is `null`, or when the pair has no mapping.
+ */
+export function rowToSequence(row: {
+  isl: number | null;
+  osl: number | null;
+  benchmark_type: string;
+}): string | null {
+  const { isl, osl, benchmark_type: benchmarkType } = row;
+  if (benchmarkType === 'agentic_traces') return 'agentic-traces';
+  return isl !== null && osl !== null ? islOslToSequence(isl, osl) : null;
+}
diff --git a/packages/db/migrations/002_agentic_scenario.sql b/packages/db/migrations/002_agentic_scenario.sql
new file mode 100644
index 00000000..c143914e
--- /dev/null
+++ b/packages/db/migrations/002_agentic_scenario.sql
@@ -0,0 +1,30 @@
+-- Support agentic scenarios in benchmark_results.
+--
+-- Scenarios are discriminated by benchmark_type:
+--   'single_turn'     — fixed-seq-len runs (1k1k, 1k8k, 8k1k, …). isl/osl set.
+--   'agentic_traces'  — trace-replay agentic runs. isl/osl NULL.
+--
+-- conc retains its meaning (concurrent users/requests) for both.
+
+-- 1) isl/osl become nullable for agentic rows
+alter table benchmark_results
+  alter column isl drop not null,
+  alter column osl drop not null;
+
+-- 2) CHECK constraints: positive-or-null
+alter table benchmark_results
+  drop constraint benchmark_results_isl_positive,
+  drop constraint benchmark_results_osl_positive;
+
+alter table benchmark_results
+  add constraint benchmark_results_isl_positive check (isl is null or isl > 0),
+  add constraint benchmark_results_osl_positive check (osl is null or osl > 0);
+
+-- 3) Uniqueness must treat (NULL, NULL) pairs as equal so agentic rows
+--    can't duplicate on (workflow_run_id, config_id, benchmark_type, conc).
+alter table benchmark_results
+  drop constraint benchmark_results_unique;
+
+alter table benchmark_results
+  add constraint benchmark_results_unique unique nulls not distinct
+    (workflow_run_id, config_id, benchmark_type, isl, osl, conc);
diff --git a/packages/db/migrations/003_agentic_availability.sql b/packages/db/migrations/003_agentic_availability.sql
new file mode 100644
index 00000000..e96cbd50
--- /dev/null
+++ b/packages/db/migrations/003_agentic_availability.sql
@@ -0,0 +1,21 @@
+-- Extend the availability table to cover agentic scenarios.
+--
+-- The 002 migration relaxed benchmark_results.isl/osl to nullable; do the same
+-- for availability and add benchmark_type so the frontend can enumerate
+-- agentic vs single_turn scenarios per model/date.
+--
+-- Postgres primary keys require every column to be NOT NULL, so we drop the PK
+-- and replace it with a UNIQUE NULLS NOT DISTINCT constraint — functionally
+-- equivalent except it allows isl/osl to be NULL for agentic rows.
+
+alter table availability
+  drop constraint availability_pkey;
+
+alter table availability
+  alter column isl drop not null,
+  alter column osl drop not null,
+  add column benchmark_type text not null default 'single_turn';
+
+alter table availability
+  add constraint availability_natural_key unique nulls not distinct
+    (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date);
diff --git a/packages/db/migrations/004_offload_mode.sql b/packages/db/migrations/004_offload_mode.sql
new file mode 100644
index 00000000..24b617f1
--- /dev/null
+++ b/packages/db/migrations/004_offload_mode.sql
@@ -0,0 +1,42 @@
+-- Add offload_mode as a first-class dimension on benchmark_results.
+--
+-- KV-cache offload (on/off) is a meaningful sweep dimension for agentic-trace
+-- runs: a single run may emit two rows for the same (config, isl, osl, conc)
+-- — one with offload disabled, one enabled. The pre-existing unique key
+-- collapsed those into one row, forcing the ingest to skip variants.
+--
+-- For fixed-seq runs `offload_mode` defaults to 'off', which matches the
+-- assumption baked into the existing 5,500+ rows.
+
+alter table benchmark_results
+  add column offload_mode text not null default 'off';
+
+-- Backfill agentic rows from the offload_mode string stashed in metrics JSONB by
+-- the earlier ingest. Skip JSON null / '' so the NOT NULL 'off' default stays.
+update benchmark_results
+   set offload_mode = metrics->>'offload_mode'
+ where benchmark_type = 'agentic_traces'
+   and nullif(metrics->>'offload_mode', '') is not null;
+
+-- Replace the unique constraint so on/off variants can coexist.
+alter table benchmark_results
+  drop constraint benchmark_results_unique;
+
+alter table benchmark_results
+  add constraint benchmark_results_unique unique nulls not distinct
+    (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode);
+
+-- Rebuild the latest-per-config materialized view to dedupe by offload_mode too.
+drop materialized view if exists latest_benchmarks cascade;
+
+create materialized view latest_benchmarks as
+select distinct on (br.config_id, br.conc, br.isl, br.osl, br.offload_mode)
+  br.*
+from benchmark_results br
+join latest_workflow_runs wr on wr.id = br.workflow_run_id
+where br.error is null
+order by br.config_id, br.conc, br.isl, br.osl, br.offload_mode, br.date desc;
+
+create unique index latest_benchmarks_pk
+  on latest_benchmarks (config_id, conc, isl, osl, offload_mode) nulls not distinct;
+create index latest_benchmarks_model_idx on latest_benchmarks (config_id);
diff --git a/packages/db/src/etl/benchmark-ingest.ts b/packages/db/src/etl/benchmark-ingest.ts
index 67173c64..ea802d3f 100644
--- a/packages/db/src/etl/benchmark-ingest.ts
+++ b/packages/db/src/etl/benchmark-ingest.ts
@@ -29,12 +29,19 @@ export async function bulkIngestBenchmarkRows(
 
   // Postgres rejects ON CONFLICT DO UPDATE if the same conflict key appears
   // more than once in a single batch. Deduplicate within the batch, keeping
-  // the last occurrence (last metrics for each unique config/isl/osl/conc).
+  // the last occurrence (last metrics for each unique config/benchmark_type/isl/osl/conc/offload_mode).
   const seen = new Map<string, BenchmarkParams & { configId: number }>();
-  for (const r of rows) seen.set(`${r.configId}-${r.isl}-${r.osl}-${r.conc}`, r);
+  for (const r of rows) {
+    seen.set(
+      `${r.configId}-${r.benchmarkType}-${r.isl ?? ''}-${r.osl ?? ''}-${r.conc}-${r.offloadMode}`,
+      r,
+    );
+  }
   const deduped = [...seen.values()];
 
   const configIds = deduped.map((r) => r.configId);
+  const benchmarkTypes = deduped.map((r) => r.benchmarkType);
+  const offloadModes = deduped.map((r) => r.offloadMode);
   const isls = deduped.map((r) => r.isl);
   const osls = deduped.map((r) => r.osl);
   const concs = deduped.map((r) => r.conc);
@@ -43,20 +50,21 @@ export async function bulkIngestBenchmarkRows(
 
   const result = await sql<{ inserted: boolean; id: number }[]>`
     insert into benchmark_results (
-      workflow_run_id, config_id, benchmark_type, date,
+      workflow_run_id, config_id, benchmark_type, offload_mode, date,
       isl, osl, conc, image, metrics
     )
     select
       ${workflowRunId},
       unnest(${sql.array(configIds)}::int[]),
-      'single_turn',
+      unnest(${sql.array(benchmarkTypes)}::text[]),
+      unnest(${sql.array(offloadModes)}::text[]),
       ${date}::date,
       unnest(${sql.array(isls)}::int[]),
       unnest(${sql.array(osls)}::int[]),
       unnest(${sql.array(concs)}::int[]),
       unnest(${sql.array(images)}),
       unnest(${sql.array(metricsJsons)}::jsonb[])
-    on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc)
+    on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode)
     do update set
       metrics = excluded.metrics,
       image = excluded.image
@@ -147,13 +155,14 @@ export async function bulkUpsertAvailability(
   sql: Sql,
   rows: {
     model: string;
-    isl: number;
-    osl: number;
+    isl: number | null;
+    osl: number | null;
     precision: string;
     hardware: string;
     framework: string;
     specMethod: string;
     disagg: boolean;
+    benchmarkType: string;
   }[],
   date: string,
 ): Promise<void> {
@@ -162,7 +171,7 @@ export async function bulkUpsertAvailability(
   const seen = new Set<string>();
   const unique: typeof rows = [];
   for (const r of rows) {
-    const key = `${r.model}|${r.isl}|${r.osl}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${date}`;
+    const key = `${r.model}|${r.isl ?? ''}|${r.osl ?? ''}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${r.benchmarkType}|${date}`;
     if (!seen.has(key)) {
       seen.add(key);
       unique.push(r);
@@ -170,7 +179,7 @@ export async function bulkUpsertAvailability(
   }
 
   await sql`
-    insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, date)
+    insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date)
     select
       unnest(${sql.array(unique.map((r) => r.model))}::text[]),
       unnest(${sql.array(unique.map((r) => r.isl))}::int[]),
@@ -180,6 +189,7 @@ export async function bulkUpsertAvailability(
       unnest(${sql.array(unique.map((r) => r.framework))}::text[]),
       unnest(${sql.array(unique.map((r) => r.specMethod))}::text[]),
       unnest(${sql.array(unique.map((r) => r.disagg))}::bool[]),
+      unnest(${sql.array(unique.map((r) => r.benchmarkType))}::text[]),
       ${date}::date
     on conflict do nothing
   `;
diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts
index 7d78e175..5b120843 100644
--- a/packages/db/src/etl/benchmark-mapper.ts
+++ b/packages/db/src/etl/benchmark-mapper.ts
@@ -57,8 +57,21 @@ const NON_METRIC_KEYS = new Set([
   'decode_num_workers',
   'num_prefill_gpu',
   'num_decode_gpu',
+  // agentic scenario
+  'scenario_type',
+  'users',
+  'offload_mode',
+  'num_requests_total',
+  'num_requests_successful',
 ]);
 
+/**
+ * `benchmark_type` values understood by the ingest.
+ * - `single_turn`    — fixed sequence-length runs (isl/osl set).
+ * - `agentic_traces` — trace-replay agentic runs (isl/osl null, `users` → conc).
+ */
+export type BenchmarkType = 'single_turn' | 'agentic_traces';
+
 /**
  * METRIC_KEYS from constants is the canonical set of known metric keys.
  * Any numeric field outside this set and `NON_METRIC_KEYS` is auto-captured
@@ -70,9 +83,13 @@ const _warnedMetricKeys = new Set<string>();
 
 export interface BenchmarkParams {
   config: ConfigParams;
-  isl: number;
-  osl: number;
+  benchmarkType: BenchmarkType;
+  // Null for agentic_traces; present for single_turn.
+  isl: number | null;
+  osl: number | null;
   conc: number;
+  /** 'on' | 'off' — KV cache offload to CPU. Defaults to 'off'. */
+  offloadMode: string;
   image: string | null;
   metrics: Record<string, number>;
 }
@@ -114,10 +131,15 @@ export function mapBenchmarkRow(
     return null;
   }
 
-  const isl = parseInt2(row.isl) ?? islOslFallback?.isl;
-  const osl = parseInt2(row.osl) ?? islOslFallback?.osl;
-  const conc = parseInt2(row.conc);
-  if (!isl || !osl || !conc) {
+  // Agentic-trace runs emit `scenario_type: 'agentic-coding'` (and variants),
+  // no isl/osl, and `users` instead of `conc`. Everything else stays as-is.
+  const isAgentic = String(row.scenario_type ?? '').startsWith('agentic');
+  const benchmarkType: BenchmarkType = isAgentic ? 'agentic_traces' : 'single_turn';
+
+  const isl = isAgentic ? null : (parseInt2(row.isl) ?? islOslFallback?.isl ?? null);
+  const osl = isAgentic ? null : (parseInt2(row.osl) ?? islOslFallback?.osl ?? null);
+  const conc = isAgentic ? parseInt2(row.users) : parseInt2(row.conc);
+  if (!conc || (!isAgentic && (!isl || !osl))) {
     tracker.skips.noIslOsl++;
     return null;
   }
@@ -182,6 +204,12 @@ export function mapBenchmarkRow(
     }
   }
 
+  // Agentic rows emit `offload_mode: "on" | "off"` as a string — preserve it
+  // as a stringified metric so the frontend can expose it in tooltips.
+  if (isAgentic && typeof row.offload_mode === 'string') {
+    (metrics as Record<string, unknown>).offload_mode = row.offload_mode;
+  }
+
   // Artifact names encode '/' as '#' to avoid path separators; restore the URI.
   const image = row.image ? String(row.image).replaceAll('#', '/') : null;
 
@@ -205,9 +233,14 @@ export function mapBenchmarkRow(
       numPrefillGpu,
       numDecodeGpu,
     },
+    benchmarkType,
     isl,
     osl,
     conc,
+    offloadMode:
+      typeof row.offload_mode === 'string' && row.offload_mode.length > 0
+        ? row.offload_mode
+        : 'off',
     image,
     metrics,
   };
diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts
index 14c7b4d0..8cce43ca 100644
--- a/packages/db/src/ingest-ci-run.ts
+++ b/packages/db/src/ingest-ci-run.ts
@@ -248,13 +248,14 @@ async function main(): Promise<void> {
 
   const availRows: {
     model: string;
-    isl: number;
-    osl: number;
+    isl: number | null;
+    osl: number | null;
     precision: string;
     hardware: string;
     framework: string;
     specMethod: string;
     disagg: boolean;
+    benchmarkType: string;
   }[] = [];
 
   let totalNewBmk = 0,
@@ -367,6 +368,7 @@ async function main(): Promise<void> {
               framework: r.config.framework,
               specMethod: r.config.specMethod,
               disagg: r.config.disagg,
+              benchmarkType: r.benchmarkType,
             });
           }
 
diff --git a/packages/db/src/ingest-gcs-backup.ts b/packages/db/src/ingest-gcs-backup.ts
index e20278d6..6dc604e9 100644
--- a/packages/db/src/ingest-gcs-backup.ts
+++ b/packages/db/src/ingest-gcs-backup.ts
@@ -596,13 +596,14 @@ async function main(): Promise<void> {
     // Upsert availability rows only for successfully resolved configs
     const availRows: {
       model: string;
-      isl: number;
-      osl: number;
+      isl: number | null;
+      osl: number | null;
       precision: string;
       hardware: string;
       framework: string;
       specMethod: string;
       disagg: boolean;
+      benchmarkType: string;
     }[] = [];
     for (const r of allInserted) {
       availRows.push({
@@ -614,6 +615,7 @@ async function main(): Promise<void> {
         framework: r.config.framework,
         specMethod: r.config.specMethod,
         disagg: r.config.disagg,
+        benchmarkType: r.benchmarkType,
       });
     }
     if (availRows.length > 0) {
diff --git a/packages/db/src/ingest-supplemental.ts b/packages/db/src/ingest-supplemental.ts
index 1e494e9f..43aae047 100644
--- a/packages/db/src/ingest-supplemental.ts
+++ b/packages/db/src/ingest-supplemental.ts
@@ -219,8 +219,10 @@ async function ingestSupplementalBmk(
 
     const rows: {
       configId: number;
-      isl: number;
-      osl: number;
+      benchmarkType: 'single_turn' | 'agentic_traces';
+      offloadMode: string;
+      isl: number | null;
+      osl: number | null;
       conc: number;
       image: string | null;
       metrics: Record<string, number>;
@@ -271,6 +273,8 @@ async function ingestSupplementalBmk(
 
       rows.push({
         configId,
+        benchmarkType: 'single_turn',
+        offloadMode: 'off',
         isl: entry.isl,
         osl: entry.osl,
         conc: entry.conc,
@@ -294,13 +298,14 @@ async function ingestSupplementalBmk(
     // to `rows` are exactly the valid ones.
     const availRows: {
       model: string;
-      isl: number;
-      osl: number;
+      isl: number | null;
+      osl: number | null;
       precision: string;
       hardware: string;
       framework: string;
       specMethod: string;
       disagg: boolean;
+      benchmarkType: string;
     }[] = [];
     for (const entry of entries) {
       const modelKey = resolveModelKey({ model: entry.model, infmax_model_prefix: undefined });
@@ -317,6 +322,7 @@ async function ingestSupplementalBmk(
         framework,
         specMethod,
         disagg,
+        benchmarkType: 'single_turn',
       });
     }
     if (availRows.length > 0) {
diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts
index 0d9373d3..f09a2686 100644
--- a/packages/db/src/json-provider.ts
+++ b/packages/db/src/json-provider.ts
@@ -290,6 +290,8 @@ function toBenchmarkRow(
     decode_num_workers: c.decode_num_workers,
     num_prefill_gpu: c.num_prefill_gpu,
     num_decode_gpu: c.num_decode_gpu,
+    benchmark_type: br.benchmark_type ?? 'single_turn',
+    offload_mode: (br as { offload_mode?: string }).offload_mode ?? 'off',
     isl: br.isl,
     osl: br.osl,
     conc: br.conc,
@@ -410,7 +412,11 @@ export function getAvailabilityData(): AvailabilityRow[] {
   for (const a of s.availability) {
     const key = `${a.model}|${a.hardware}|${a.framework}|${a.precision}|${a.isl}|${a.osl}|${toDateString(a.date)}`;
     if (validKeys.has(key)) {
-      rows.push({ ...a, date: toDateString(a.date) });
+      rows.push({
+        ...a,
+        benchmark_type: (a as { benchmark_type?: string }).benchmark_type ?? 'single_turn',
+        date: toDateString(a.date),
+      });
     }
   }
 
diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts
index 1c30b1fd..74e20380 100644
--- a/packages/db/src/queries/benchmarks.ts
+++ b/packages/db/src/queries/benchmarks.ts
@@ -18,9 +18,13 @@ export interface BenchmarkRow {
   decode_num_workers: number;
   num_prefill_gpu: number;
   num_decode_gpu: number;
-  isl: number;
-  osl: number;
+  benchmark_type: string;
+  // Null for agentic_traces; numeric for single_turn fixed-seq runs.
+  isl: number | null;
+  osl: number | null;
   conc: number;
+  /** KV-cache offload mode: 'on' | 'off'. Defaults to 'off' for fixed-seq. */
+  offload_mode: string;
   image: string | null;
   metrics: Record<string, number>;
   date: string;
@@ -68,6 +72,8 @@ export async function getLatestBenchmarks(
         c.decode_num_workers,
         c.num_prefill_gpu,
         c.num_decode_gpu,
+        br.benchmark_type,
+        br.offload_mode,
         br.isl,
         br.osl,
         br.conc,
@@ -106,6 +112,8 @@ export async function getLatestBenchmarks(
       c.decode_num_workers,
       c.num_prefill_gpu,
       c.num_decode_gpu,
+      lb.benchmark_type,
+      lb.offload_mode,
       lb.isl,
       lb.osl,
       lb.conc,
@@ -153,6 +161,7 @@ export async function getAllBenchmarksForHistory(
       c.decode_num_workers,
       c.num_prefill_gpu,
       c.num_decode_gpu,
+      br.benchmark_type,
       br.isl,
       br.osl,
       br.conc,
diff --git a/packages/db/src/queries/workflow-info.ts b/packages/db/src/queries/workflow-info.ts
index b4e4f255..d5e2d933 100644
--- a/packages/db/src/queries/workflow-info.ts
+++ b/packages/db/src/queries/workflow-info.ts
@@ -88,20 +88,22 @@ export async function getDateConfigs(sql: DbClient, date: string): Promise<DateC
 
 export interface AvailabilityRow {
   model: string;
-  isl: number;
-  osl: number;
+  // Null for agentic_traces rows; numeric for single_turn fixed-seq rows.
+  isl: number | null;
+  osl: number | null;
   precision: string;
   hardware: string;
   framework: string;
   spec_method: string;
   disagg: boolean;
+  benchmark_type: string;
   date: string;
 }
 
-/** Get available (model, ISL/OSL, precision, hardware, framework, spec_method, date) combos for the availability API. */
+/** Get available (model, ISL/OSL, precision, hardware, framework, spec_method, benchmark_type, date) combos for the availability API. */
 export async function getAvailabilityData(sql: DbClient): Promise<AvailabilityRow[]> {
   const rows = await sql`
-    SELECT a.model, a.isl, a.osl, a.precision, a.hardware, a.framework, a.spec_method, a.disagg, a.date::text
+    SELECT a.model, a.isl, a.osl, a.precision, a.hardware, a.framework, a.spec_method, a.disagg, a.benchmark_type, a.date::text
     FROM availability a
     WHERE EXISTS (
       SELECT 1
@@ -112,8 +114,9 @@ export async function getAvailabilityData(sql: DbClient): Promise<AvailabilityRo
         AND c.hardware = a.hardware
         AND c.framework = a.framework
         AND c.precision = a.precision
-        AND br.isl = a.isl
-        AND br.osl = a.osl
+        AND br.isl IS NOT DISTINCT FROM a.isl
+        AND br.osl IS NOT DISTINCT FROM a.osl
+        AND br.benchmark_type = a.benchmark_type
         AND br.date = a.date
         AND br.error IS NULL
         AND wr.conclusion IS NOT NULL

From 9c43a762cdaf9edd0091ef9d3034d4a974071c6d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 30 Apr 2026 19:01:56 -0500
Subject: [PATCH 02/96] =?UTF-8?q?fix:=20agentic=20offload=20variants=20?=
 =?UTF-8?q?=E2=80=94=20render=20both=20halos=20+=20map=20renamed=20fields?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- ScatterGraph: include `offload_mode` in `buildPointConfigId` so d3's data
  join keeps both `on` and `off` variants for the same (config, conc).
  Without it, the second variant collapsed onto the first key, so FP8
  offload-on points (and their halos) silently disappeared.
- benchmark-mapper: handle older artifacts that emit `users`/`offload_mode`
  AND newer ones that emit `conc`/`offloading` (with 'none' → 'off' mapping).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../components/inference/ui/ScatterGraph.tsx  |  4 +++
 packages/db/src/etl/benchmark-mapper.ts       | 27 ++++++++++++-------
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 15bb60f0..55a206ce 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -295,6 +295,10 @@ const ScatterGraph = React.memo(
     const buildPointConfigId = useCallback((point: InferenceData): string => {
       let key = `${point.hwKey}|${point.precision}|${point.tp}|${point.conc}|${point.decode_ep ?? 0}|${point.prefill_tp ?? 0}|${point.prefill_ep ?? 0}`;
       if (point.disagg) key += `|disagg|${point.num_prefill_gpu ?? 0}|${point.num_decode_gpu ?? 0}`;
+      // Agentic runs emit two rows per (config, conc) — one offload=on, one off.
+      // Without this suffix, d3's data join treats them as the same point and
+      // drops one variant (along with its halo).
+      if (point.offload_mode) key += `|offload-${point.offload_mode}`;
       return key;
     }, []);
 
diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts
index 5b120843..d842276e 100644
--- a/packages/db/src/etl/benchmark-mapper.ts
+++ b/packages/db/src/etl/benchmark-mapper.ts
@@ -138,12 +138,24 @@ export function mapBenchmarkRow(
 
   const isl = isAgentic ? null : (parseInt2(row.isl) ?? islOslFallback?.isl ?? null);
   const osl = isAgentic ? null : (parseInt2(row.osl) ?? islOslFallback?.osl ?? null);
-  const conc = isAgentic ? parseInt2(row.users) : parseInt2(row.conc);
+  // Agentic artifacts encode concurrency as `users` in older schemas and `conc` in newer ones.
+  const conc = isAgentic ? (parseInt2(row.users) ?? parseInt2(row.conc)) : parseInt2(row.conc);
   if (!conc || (!isAgentic && (!isl || !osl))) {
     tracker.skips.noIslOsl++;
     return null;
   }
 
+  // Agentic offload signal: prefer `offload_mode` ('on'|'off'), fall back to `offloading`
+  // ('none' → 'off'; any other non-empty value → 'on').
+  const offloadModeRaw =
+    typeof row.offload_mode === 'string' && row.offload_mode.length > 0
+      ? row.offload_mode
+      : typeof row.offloading === 'string' && row.offloading.length > 0
+        ? row.offloading === 'none'
+          ? 'off'
+          : 'on'
+        : 'off';
+
   const { framework, disagg } = normalizeFramework(String(row.framework ?? ''), row.disagg);
   const isMultinode = parseBool(row.is_multinode);
   const precision = normalizePrecision(String(row.precision ?? ''));
@@ -204,10 +216,10 @@ export function mapBenchmarkRow(
     }
   }
 
-  // Agentic rows emit `offload_mode: "on" | "off"` as a string — preserve it
-  // as a stringified metric so the frontend can expose it in tooltips.
-  if (isAgentic && typeof row.offload_mode === 'string') {
-    (metrics as Record<string, unknown>).offload_mode = row.offload_mode;
+  // Agentic rows emit `offload_mode: "on" | "off"` (or older `offloading: "none"|...`)
+  // — preserve as a stringified metric so the frontend can expose it in tooltips.
+  if (isAgentic) {
+    (metrics as Record<string, unknown>).offload_mode = offloadModeRaw;
   }
 
   // Artifact names encode '/' as '#' to avoid path separators; restore the URI.
@@ -237,10 +249,7 @@ export function mapBenchmarkRow(
     isl,
     osl,
     conc,
-    offloadMode:
-      typeof row.offload_mode === 'string' && row.offload_mode.length > 0
-        ? row.offload_mode
-        : 'off',
+    offloadMode: offloadModeRaw,
     image,
     metrics,
   };

From 07ba10636dae87b5a819afa524d7c10322fae41b Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 1 May 2026 00:29:55 -0500
Subject: [PATCH 03/96] fix: render offload halo on every offload-on point, not
 just frontier

The halo's purpose is to surface KV-offload usage; restricting it to
Pareto-frontier-only points hid the indicator on most runs.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/app/src/components/inference/ui/ScatterGraph.tsx | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 55a206ce..61ac0983 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -1516,10 +1516,9 @@ const ScatterGraph = React.memo(
             .attr('pointer-events', 'none');
         });
 
-        // Offload halo: dashed ring on frontier points that used KV offload
+        // Offload halo: dashed ring on every point that used KV offload (Pareto or not)
         zoomGroup.selectAll<SVGGElement, InferenceData>('.dot-group').each(function (d) {
-          const onFrontier = optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`);
-          const showHalo = onFrontier && d.offload_mode === 'on';
+          const showHalo = d.offload_mode === 'on';
           d3.select(this)
             .selectAll<SVGCircleElement, boolean>('.offload-halo')
             .data(showHalo ? [true] : [])

From 95e9dc77431adf5354ef0df36989816199624383 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 1 May 2026 01:13:42 -0500
Subject: [PATCH 04/96] fix: strip runner-pool suffix (-p1, -p2, ...) from hw
 identifier

Artifacts from pooled runners such as b300-p1 were being skipped at
ingest: the `-p<N>` runner-pool suffix was not in the strip list, so the
hw identifier never normalized to the canonical GPU key (e.g. b300).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/db/src/etl/normalizers.ts | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/packages/db/src/etl/normalizers.ts b/packages/db/src/etl/normalizers.ts
index ad12a454..bd497f7a 100644
--- a/packages/db/src/etl/normalizers.ts
+++ b/packages/db/src/etl/normalizers.ts
@@ -34,7 +34,8 @@ export function hwToGpuKey(hw: string): string | null {
     .replace(/-dgxc-slurm$/, '')
     .replace(/-dgxc$/, '')
     .replace(/-nb$/, '')
-    .replace(/-nv$/, '');
+    .replace(/-nv$/, '')
+    .replace(/-p\d+$/, ''); // strip runner-pool suffix (e.g. b300-p1 → b300)
   return GPU_KEYS.has(base) ? base : null;
 }
 

From 982106da5f4421983841304f0503b6467033852d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 1 May 2026 09:25:33 -0500
Subject: [PATCH 05/96] feat: bold scatter labels with concurrency tag +
 collision avoidance

- Label text now includes `C=<conc>` alongside the GPU/parallelism tag
  (default `<tp> C=<conc>`, advanced `<getPointLabel> C=<conc>`)
- Bumped point-label font-weight to 700 so the labels read clearly against
  the chart fill
- Greedy collision-avoidance pass on render and zoom: tries placing each
  label above/below the point through 4 candidate dy offsets, hiding the
  label only when no slot is free

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../components/inference/ui/ScatterGraph.tsx  | 68 ++++++++++++++++++-
 .../src/lib/d3-chart/layers/scatter-points.ts |  1 +
 2 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 61ac0983..3fbd8588 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -55,6 +55,63 @@ import {
   buildGradientColorMap,
 } from '@/components/inference/utils/paretoLabels';
 
+// Greedy label-collision avoidance: try positions above/below the point;
+// hide labels that can't fit anywhere. Re-runs cheaply on each render/zoom.
+function avoidLabelCollisions(
+  zoomGroup: d3.Selection<SVGGElement, unknown, null, undefined>,
+): void {
+  const labels: {
+    el: SVGTextElement;
+    cx: number;
+    cy: number;
+    w: number;
+    h: number;
+  }[] = [];
+  zoomGroup.selectAll<SVGGElement, unknown>('.dot-group').each(function () {
+    const labelEl = this.querySelector<SVGTextElement>('.point-label');
+    if (!labelEl) return;
+    if ((this as SVGGElement).style.opacity === '0') return;
+    const transform = (this as SVGGElement).getAttribute('transform') ?? '';
+    const m = transform.match(/translate\(([^,]+),([^)]+)\)/);
+    if (!m) return;
+    const cx = parseFloat(m[1]);
+    const cy = parseFloat(m[2]);
+    labelEl.setAttribute('dy', '-8');
+    labelEl.style.opacity = '1';
+    const bbox = labelEl.getBBox();
+    labels.push({ el: labelEl, cx, cy, w: bbox.width, h: bbox.height });
+  });
+  labels.sort((a, b) => a.cx - b.cx);
+  const placed: { left: number; right: number; top: number; bottom: number }[] = [];
+  const pad = 1;
+  const candidates = [-8, 14, -22, 28];
+  for (const lab of labels) {
+    let chosenDy: number | null = null;
+    let chosenBox: { left: number; right: number; top: number; bottom: number } | null = null;
+    for (const dy of candidates) {
+      const top = lab.cy + dy - lab.h - pad;
+      const bottom = lab.cy + dy + pad;
+      const left = lab.cx - lab.w / 2 - pad;
+      const right = lab.cx + lab.w / 2 + pad;
+      const collides = placed.some(
+        (p) => !(right < p.left || left > p.right || bottom < p.top || top > p.bottom),
+      );
+      if (!collides) {
+        chosenDy = dy;
+        chosenBox = { left, right, top, bottom };
+        break;
+      }
+    }
+    if (chosenDy !== null && chosenBox) {
+      lab.el.setAttribute('dy', String(chosenDy));
+      lab.el.style.opacity = '1';
+      placed.push(chosenBox);
+    } else {
+      lab.el.style.opacity = '0';
+    }
+  }
+}
+
 // X-shape path for overlay (unofficial) data points
 const X_SIZE = 5;
 const X_HOVER_SIZE = 7;
@@ -603,6 +660,7 @@ const ScatterGraph = React.memo(
               d3.axisLeft(newYS).ticks(10).tickFormat(logTickFormat(newYS)) as any,
             );
           }
+          avoidLabelCollisions(ctx.layout.zoomGroup);
         },
       }),
       [zoomResetEventName, eventPrefix, xScaleConfig._isLog, yScaleConfig.type],
@@ -1251,7 +1309,8 @@ const ScatterGraph = React.memo(
           getOpacity: (d) => (isPointVisible(d) ? 1 : 0),
           getPointerEvents: (d) => (isPointVisible(d) ? 'auto' : 'none'),
           hideLabels: hidePointLabels || showGradientLabels,
-          getLabelText: (d) => (useAdvancedLabels ? getPointLabel(d) : String(d.tp)),
+          getLabelText: (d) =>
+            useAdvancedLabels ? `${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`,
           foreground: 'var(--foreground)',
           dataAttrs: {
             'hw-key': (d) => String(d.hwKey),
@@ -1353,8 +1412,11 @@ const ScatterGraph = React.memo(
                   .attr('text-anchor', 'middle')
                   .style('fill', 'var(--foreground)')
                   .attr('font-size', '10px')
+                  .attr('font-weight', '700')
                   .attr('pointer-events', 'none')
-                  .text(useAdvancedLabels ? getPointLabel(d) : String(d.tp));
+                  .text(
+                    useAdvancedLabels ? `${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`,
+                  );
               });
 
               // Overlay tooltip handlers
@@ -1566,6 +1628,8 @@ const ScatterGraph = React.memo(
             });
           });
 
+        avoidLabelCollisions(zoomGroup);
+
         // Log tick formatting on initial render
         if (xScaleConfig._isLog) {
           const xScale = ctx.xScale as d3.ScaleLogarithmic<number, number>;
diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
index 507654e1..9f2d2f38 100644
--- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts
+++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
@@ -72,6 +72,7 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
       .attr('text-anchor', 'middle')
       .attr('fill', config.foreground)
       .attr('font-size', '10px')
+      .attr('font-weight', '700')
       .attr('pointer-events', 'none')
       .text(config.getLabelText);
   }

From 9572b95e86de7cece1179b5f48dd29135765002b Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 1 May 2026 09:32:44 -0500
Subject: [PATCH 06/96] fix: stack multi-line point labels upward so they don't
 overlap the point
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tspans now ride above the text's `dy` anchor — the LAST line sits at the
anchor (just above the point) and earlier lines stack above it. Previously
the second tspan landed below the anchor and crashed into the marker.

Also widened collision candidates by label height so the flipped-below
position fully clears the point on multi-line labels.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../components/inference/ui/ScatterGraph.tsx  | 28 +++++++---
 .../src/lib/d3-chart/layers/scatter-points.ts | 52 +++++++++++++------
 2 files changed, 58 insertions(+), 22 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 3fbd8588..f8ce9b8f 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -84,8 +84,11 @@ function avoidLabelCollisions(
   labels.sort((a, b) => a.cx - b.cx);
   const placed: { left: number; right: number; top: number; bottom: number }[] = [];
   const pad = 1;
-  const candidates = [-8, 14, -22, 28];
   for (const lab of labels) {
+    // Candidates scale with the label's own height so multi-line labels don't
+    // overlap the point shape when flipped below.
+    const below = lab.h + 8;
+    const candidates = [-8, below, -8 - below - 4, 2 * below];
     let chosenDy: number | null = null;
     let chosenBox: { left: number; right: number; top: number; bottom: number } | null = null;
     for (const dy of candidates) {
@@ -1310,7 +1313,7 @@ const ScatterGraph = React.memo(
           getPointerEvents: (d) => (isPointVisible(d) ? 'auto' : 'none'),
           hideLabels: hidePointLabels || showGradientLabels,
           getLabelText: (d) =>
-            useAdvancedLabels ? `${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`,
+            useAdvancedLabels ? `${getPointLabel(d)}\nC=${d.conc}` : `${d.tp}\nC=${d.conc}`,
           foreground: 'var(--foreground)',
           dataAttrs: {
             'hw-key': (d) => String(d.hwKey),
@@ -1403,7 +1406,14 @@ const ScatterGraph = React.memo(
               // Labels
               const showLabels = !hidePointLabels && !showGradientLabels;
               overlayPoints.each(function (d) {
-                d3.select(this)
+                const lines = showLabels
+                  ? (useAdvancedLabels
+                      ? `${getPointLabel(d)}\nC=${d.conc}`
+                      : `${d.tp}\nC=${d.conc}`
+                    ).split('\n')
+                  : [];
+                const text = d3
+                  .select(this)
                   .selectAll<SVGTextElement, boolean>('.overlay-label')
                   .data(showLabels ? [true] : [])
                   .join('text')
@@ -1413,10 +1423,14 @@ const ScatterGraph = React.memo(
                   .style('fill', 'var(--foreground)')
                   .attr('font-size', '10px')
                   .attr('font-weight', '700')
-                  .attr('pointer-events', 'none')
-                  .text(
-                    useAdvancedLabels ? `${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`,
-                  );
+                  .attr('pointer-events', 'none');
+                text
+                  .selectAll<SVGTSpanElement, string>('tspan')
+                  .data(lines)
+                  .join('tspan')
+                  .attr('x', 0)
+                  .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em'))
+                  .text((l) => l);
               });
 
               // Overlay tooltip handlers
diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
index 9f2d2f38..13c588d8 100644
--- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts
+++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
@@ -63,18 +63,30 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
     applyNormalState(shape, d.precision);
   });
 
-  // Label (enter only)
+  // Label (enter only). Multi-line labels are passed as `\n`-separated strings;
+  // we stack tspans UPWARD from the text's `dy` anchor so the LAST line sits
+  // at `dy` (just above the point) and earlier lines land above it. That way,
+  // the collision-avoidance pass only has to move the `<text>` element — the
+  // intra-stack offsets stay correct whether the label ends up above or below.
   if (!config.hideLabels && config.getLabelText && config.foreground) {
-    entered
-      .append('text')
-      .attr('class', 'point-label')
-      .attr('dy', -8)
-      .attr('text-anchor', 'middle')
-      .attr('fill', config.foreground)
-      .attr('font-size', '10px')
-      .attr('font-weight', '700')
-      .attr('pointer-events', 'none')
-      .text(config.getLabelText);
+    const labelGetter = config.getLabelText;
+    entered.each(function (d) {
+      const lines = labelGetter(d).split('\n');
+      const text = d3
+        .select(this)
+        .append('text')
+        .attr('class', 'point-label')
+        .attr('dy', -8)
+        .attr('text-anchor', 'middle')
+        .attr('fill', config.foreground!)
+        .attr('font-size', '10px')
+        .attr('font-weight', '700')
+        .attr('pointer-events', 'none');
+      lines.forEach((line, i) => {
+        const tspanDy = i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em';
+        text.append('tspan').attr('x', 0).attr('dy', tspanDy).text(line);
+      });
+    });
   }
 
   // Exit: remove stale points
@@ -103,9 +115,12 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
 
   // Update labels: use data join so labels are created/removed properly on toggle
   if (!config.hideLabels && config.getLabelText && config.foreground) {
+    const labelGetter = config.getLabelText;
     points.each(function (d) {
-      const g = d3.select(this);
-      g.selectAll<SVGTextElement, boolean>('.point-label')
+      const lines = labelGetter(d).split('\n');
+      const text = d3
+        .select(this)
+        .selectAll<SVGTextElement, boolean>('.point-label')
         .data([true])
         .join('text')
         .attr('class', 'point-label')
@@ -113,8 +128,15 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
         .attr('text-anchor', 'middle')
         .attr('fill', config.foreground!)
         .attr('font-size', '10px')
-        .attr('pointer-events', 'none')
-        .text(config.getLabelText!(d));
+        .attr('font-weight', '700')
+        .attr('pointer-events', 'none');
+      text
+        .selectAll<SVGTSpanElement, string>('tspan')
+        .data(lines)
+        .join('tspan')
+        .attr('x', 0)
+        .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em'))
+        .text((l) => l);
     });
   } else {
     points.selectAll('.point-label').remove();

From 37eecc6e28c10751ffc52c8a0d0588177e43d4d8 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 1 May 2026 09:38:39 -0500
Subject: [PATCH 07/96] fix: anchor multi-line labels via first tspan +
 tspan-aware collision pass
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a `<text>` contains tspans, the parent's `dy` does not shift the bbox
cleanly — its (unused) y=0 origin still factors in, so the rendered text
ended up centered on the point. Move the absolute offset into the FIRST
tspan's `dy`; later tspans cascade by 1.1em.

Collision avoidance now drives the first tspan's `dy` and tries four
candidate baselines (primary above, primary below, secondary above,
secondary below), accounting for full label height when picking a
non-overlapping slot. Labels are still hidden as a last resort when all
four candidates collide.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../components/inference/ui/ScatterGraph.tsx  | 72 +++++++++++++------
 .../src/lib/d3-chart/layers/scatter-points.ts | 25 ++++---
 2 files changed, 66 insertions(+), 31 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index f8ce9b8f..27d3680c 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -55,58 +55,88 @@ import {
   buildGradientColorMap,
 } from '@/components/inference/utils/paretoLabels';
 
-// Greedy label-collision avoidance: try positions above/below the point;
-// hide labels that can't fit anywhere. Re-runs cheaply on each render/zoom.
+// Greedy label-collision avoidance.
+// Each candidate is the y-position of the FIRST baseline (relative to point
+// center) which we apply via the first tspan's `dy` — later tspans cascade
+// down by 1.1em. We try above/below at primary and secondary offsets, and
+// hide the label if all four positions collide.
 function avoidLabelCollisions(
   zoomGroup: d3.Selection<SVGGElement, unknown, null, undefined>,
 ): void {
-  const labels: {
+  interface LabelInfo {
     el: SVGTextElement;
+    firstTspan: SVGTSpanElement;
     cx: number;
     cy: number;
     w: number;
-    h: number;
-  }[] = [];
+    nLines: number;
+    defaultFirstY: number;
+  }
+  const labels: LabelInfo[] = [];
+  const ASCENT = 9;
+  const DESCENT = 3;
+  const LINE_H = 11;
+
   zoomGroup.selectAll<SVGGElement, unknown>('.dot-group').each(function () {
     const labelEl = this.querySelector<SVGTextElement>('.point-label');
     if (!labelEl) return;
     if ((this as SVGGElement).style.opacity === '0') return;
+    const tspans = labelEl.querySelectorAll<SVGTSpanElement>('tspan');
+    if (tspans.length === 0) return;
     const transform = (this as SVGGElement).getAttribute('transform') ?? '';
     const m = transform.match(/translate\(([^,]+),([^)]+)\)/);
     if (!m) return;
     const cx = parseFloat(m[1]);
     const cy = parseFloat(m[2]);
-    labelEl.setAttribute('dy', '-8');
+    const nLines = tspans.length;
+    const defaultFirstY = -(8 + (nLines - 1) * LINE_H); // last baseline 8px above point
+    // Reset to default before measuring so prior positioning doesn't bias bbox
+    tspans[0].setAttribute('dy', `${defaultFirstY}px`);
     labelEl.style.opacity = '1';
     const bbox = labelEl.getBBox();
-    labels.push({ el: labelEl, cx, cy, w: bbox.width, h: bbox.height });
+    labels.push({
+      el: labelEl,
+      firstTspan: tspans[0],
+      cx,
+      cy,
+      w: bbox.width,
+      nLines,
+      defaultFirstY,
+    });
   });
+
   labels.sort((a, b) => a.cx - b.cx);
   const placed: { left: number; right: number; top: number; bottom: number }[] = [];
-  const pad = 1;
+  const pad = 2;
+
   for (const lab of labels) {
-    // Candidates scale with the label's own height so multi-line labels don't
-    // overlap the point shape when flipped below.
-    const below = lab.h + 8;
-    const candidates = [-8, below, -8 - below - 4, 2 * below];
-    let chosenDy: number | null = null;
+    const blockH = (lab.nLines - 1) * LINE_H + ASCENT + DESCENT;
+    const aboveFirstY = lab.defaultFirstY;
+    const belowFirstY = 14; // first baseline 14px below point center
+    const candidates = [
+      aboveFirstY,
+      belowFirstY,
+      aboveFirstY - blockH - 2,
+      belowFirstY + blockH + 2,
+    ];
+    let chosenY: number | null = null;
     let chosenBox: { left: number; right: number; top: number; bottom: number } | null = null;
-    for (const dy of candidates) {
-      const top = lab.cy + dy - lab.h - pad;
-      const bottom = lab.cy + dy + pad;
+    for (const firstY of candidates) {
+      const top = lab.cy + firstY - ASCENT - pad;
+      const bottom = lab.cy + firstY + (lab.nLines - 1) * LINE_H + DESCENT + pad;
       const left = lab.cx - lab.w / 2 - pad;
       const right = lab.cx + lab.w / 2 + pad;
       const collides = placed.some(
         (p) => !(right < p.left || left > p.right || bottom < p.top || top > p.bottom),
       );
       if (!collides) {
-        chosenDy = dy;
+        chosenY = firstY;
         chosenBox = { left, right, top, bottom };
         break;
       }
     }
-    if (chosenDy !== null && chosenBox) {
-      lab.el.setAttribute('dy', String(chosenDy));
+    if (chosenY !== null && chosenBox) {
+      lab.firstTspan.setAttribute('dy', `${chosenY}px`);
       lab.el.style.opacity = '1';
       placed.push(chosenBox);
     } else {
@@ -1418,18 +1448,18 @@ const ScatterGraph = React.memo(
                   .data(showLabels ? [true] : [])
                   .join('text')
                   .attr('class', 'overlay-label')
-                  .attr('dy', -10)
                   .attr('text-anchor', 'middle')
                   .style('fill', 'var(--foreground)')
                   .attr('font-size', '10px')
                   .attr('font-weight', '700')
                   .attr('pointer-events', 'none');
+                const firstDy = -(1 + (lines.length - 1) * 1.1);
                 text
                   .selectAll<SVGTSpanElement, string>('tspan')
                   .data(lines)
                   .join('tspan')
                   .attr('x', 0)
-                  .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em'))
+                  .attr('dy', (_l, i) => (i === 0 ? `${firstDy}em` : '1.1em'))
                   .text((l) => l);
               });
 
diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
index 13c588d8..71d1f050 100644
--- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts
+++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
@@ -64,10 +64,10 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
   });
 
   // Label (enter only). Multi-line labels are passed as `\n`-separated strings;
-  // we stack tspans UPWARD from the text's `dy` anchor so the LAST line sits
-  // at `dy` (just above the point) and earlier lines land above it. That way,
-  // the collision-avoidance pass only has to move the `<text>` element — the
-  // intra-stack offsets stay correct whether the label ends up above or below.
+  // we anchor the entire stack via the FIRST tspan's `dy` so getBBox() doesn't
+  // pick up the text element's own (unused) y=0 origin. The first tspan is
+  // raised so the LAST line baseline lands ~8px above the point; subsequent
+  // tspans cascade down by 1.1em.
   if (!config.hideLabels && config.getLabelText && config.foreground) {
     const labelGetter = config.getLabelText;
     entered.each(function (d) {
@@ -76,15 +76,18 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
         .select(this)
         .append('text')
         .attr('class', 'point-label')
-        .attr('dy', -8)
         .attr('text-anchor', 'middle')
         .attr('fill', config.foreground!)
         .attr('font-size', '10px')
         .attr('font-weight', '700')
         .attr('pointer-events', 'none');
+      const firstDy = -(0.8 + (lines.length - 1) * 1.1);
       lines.forEach((line, i) => {
-        const tspanDy = i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em';
-        text.append('tspan').attr('x', 0).attr('dy', tspanDy).text(line);
+        text
+          .append('tspan')
+          .attr('x', 0)
+          .attr('dy', i === 0 ? `${firstDy}em` : '1.1em')
+          .text(line);
       });
     });
   }
@@ -113,7 +116,9 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
   // Update colors on existing shapes (handles hw color changes)
   points.select('.visible-shape').attr('fill', config.getColor as any);
 
-  // Update labels: use data join so labels are created/removed properly on toggle
+  // Update labels: use data join so labels are created/removed properly on toggle.
+  // Anchor the stack via the first tspan (NOT the text dy — that doesn't shift the
+  // bbox cleanly when there are tspan children).
   if (!config.hideLabels && config.getLabelText && config.foreground) {
     const labelGetter = config.getLabelText;
     points.each(function (d) {
@@ -124,18 +129,18 @@ export function renderScatterPoints<T extends { precision: string; x: number; y:
         .data([true])
         .join('text')
         .attr('class', 'point-label')
-        .attr('dy', -8)
         .attr('text-anchor', 'middle')
         .attr('fill', config.foreground!)
         .attr('font-size', '10px')
         .attr('font-weight', '700')
         .attr('pointer-events', 'none');
+      const firstDy = -(0.8 + (lines.length - 1) * 1.1);
       text
         .selectAll<SVGTSpanElement, string>('tspan')
         .data(lines)
         .join('tspan')
         .attr('x', 0)
-        .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em'))
+        .attr('dy', (_l, i) => (i === 0 ? `${firstDy}em` : '1.1em'))
         .text((l) => l);
     });
   } else {

From f317377dfaea35f9cb5dc435ea177966aa17fbf8 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 1 May 2026 10:21:00 -0500
Subject: [PATCH 08/96] fix: dedupe artifacts by logical name + skip
 0-successful agg rows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two complementary fixes for runs whose `results_bmk` aggregated artifact
ends up containing both a successful row and a failed-attempt row for the
same (config, conc, offload) — the failed row's null metrics were
overwriting the good row via ON CONFLICT DO UPDATE.

1. Artifact-level: strip the trailing `_<runner-pool>_<attempt>` suffix
   from each artifact name and group by the logical name, keeping only the
   most recent per group.

2. Row-level: skip rows with `num_requests_successful === 0` AND
   `num_requests_total > 0`. The aggregated artifact merges rows from all
   runners — including failed ones — so artifact-level dedup alone can't
   reach inside it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/db/src/etl/benchmark-mapper.ts | 14 +++++++++++
 packages/db/src/etl/skip-tracker.ts     | 10 +++++++-
 packages/db/src/ingest-ci-run.ts        | 33 ++++++++++++++++++++-----
 packages/db/src/ingest-gcs-backup.ts    |  1 +
 4 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts
index d842276e..1aff5ea9 100644
--- a/packages/db/src/etl/benchmark-mapper.ts
+++ b/packages/db/src/etl/benchmark-mapper.ts
@@ -145,6 +145,20 @@ export function mapBenchmarkRow(
     return null;
   }
 
+  // Failed-run guard: aggregated artifacts (`results_bmk`) merge rows from
+  // every runner, including ones with 0 successful requests and null metrics.
+  // Without this skip, the empty row's nulls overwrite a good row via
+  // ON CONFLICT DO UPDATE when both share the same (config, conc, offload).
+  if (
+    typeof row.num_requests_successful === 'number' &&
+    row.num_requests_successful === 0 &&
+    typeof row.num_requests_total === 'number' &&
+    row.num_requests_total > 0
+  ) {
+    tracker.skips.failedRun++;
+    return null;
+  }
+
   // Agentic offload signal: prefer `offload_mode` ('on'|'off'), fall back to `offloading`
   // ('none' → 'off'; any other non-empty value → 'on').
   const offloadModeRaw =
diff --git a/packages/db/src/etl/skip-tracker.ts b/packages/db/src/etl/skip-tracker.ts
index 6166ea44..588718dd 100644
--- a/packages/db/src/etl/skip-tracker.ts
+++ b/packages/db/src/etl/skip-tracker.ts
@@ -8,6 +8,7 @@ export interface Skips {
   unmappedModel: number;
   unmappedHw: number;
   noIslOsl: number;
+  failedRun: number;
   dbError: number;
 }
 
@@ -66,7 +67,14 @@ const MAX_DB_ERRORS = 10;
  * @returns A `SkipTracker` with zeroed counters and empty unmapped-name sets.
  */
 export function createSkipTracker(): SkipTracker {
-  const skips: Skips = { badZip: 0, unmappedModel: 0, unmappedHw: 0, noIslOsl: 0, dbError: 0 };
+  const skips: Skips = {
+    badZip: 0,
+    unmappedModel: 0,
+    unmappedHw: 0,
+    noIslOsl: 0,
+    failedRun: 0,
+    dbError: 0,
+  };
   const unmappedModels = new Set<string>();
   const unmappedHws = new Set<string>();
   const unmappedPrecisions = new Set<string>();
diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts
index 8cce43ca..fb1fbbbc 100644
--- a/packages/db/src/ingest-ci-run.ts
+++ b/packages/db/src/ingest-ci-run.ts
@@ -101,15 +101,30 @@ if (isDownloadMode) {
     } catch {}
   }
 
-  const byName = new Map<string, (typeof allArtifacts)[0]>();
+  // Strip the trailing `_<runner-pool>_<attempt-digits>` token from each
+  // artifact name, then group by the resulting logical name and keep only
+  // the most recent per group. Without this, two artifacts produced on
+  // different runners for the same logical config (e.g. `…_h200-cw_00` and
+  // `…_h200-dgxc-slurm_1`) both land in the DB and the failed one's empty
+  // metrics can overwrite the good one via ON CONFLICT DO UPDATE.
+  //
+  // The runner pool name itself has no underscores (`h200-cw`,
+  // `h200-dgxc-slurm`, `b200-nb`), so `[a-zA-Z0-9.-]*` keeps the strip
+  // bounded — using `\w` here would over-match across earlier `_`
+  // separators and collapse different (conc, offload) variants into the
+  // same logical name.
+  const RUNNER_SUFFIX_RE = /_[a-zA-Z][a-zA-Z0-9.-]*_\d+$/;
+  const byLogical = new Map<string, (typeof allArtifacts)[0]>();
   for (const a of allArtifacts) {
-    const existing = byName.get(a.name);
+    const key = a.name.replace(RUNNER_SUFFIX_RE, '');
+    const existing = byLogical.get(key);
     if (!existing || a.created_at > existing.created_at) {
-      byName.set(a.name, a);
+      byLogical.set(key, a);
     }
   }
 
-  for (const [name, artifact] of byName) {
+  for (const [, artifact] of byLogical) {
+    const name = artifact.name;
     console.log(`  ${name}`);
     const zipPath = path.join(artifactsDir, 'artifact.zip');
     execSync(`gh api "${artifact.archive_download_url}" > "${zipPath}"`, {
@@ -121,7 +136,7 @@ if (isDownloadMode) {
     fs.unlinkSync(zipPath);
   }
 
-  console.log(`\n  Downloaded ${byName.size} artifact(s)`);
+  console.log(`\n  Downloaded ${byLogical.size} artifact(s)`);
 
   // Fetch run attempt from API
   const attemptStr = execSync(
@@ -510,11 +525,17 @@ async function main(): Promise<void> {
 
   const { skips, unmappedModels, unmappedHws, unmappedPrecisions } = tracker;
   const totalSkips =
-    skips.badZip + skips.unmappedModel + skips.unmappedHw + skips.noIslOsl + skips.dbError;
+    skips.badZip +
+    skips.unmappedModel +
+    skips.unmappedHw +
+    skips.noIslOsl +
+    skips.failedRun +
+    skips.dbError;
   if (totalSkips > 0) {
     console.log(`\n  Skipped: ${totalSkips} rows`);
     const skipLines: [string, number][] = [
       ['no isl/osl (old format)', skips.noIslOsl],
+      ['failed run (0 successful)', skips.failedRun],
       ['unmapped model', skips.unmappedModel],
       ['unmapped hw', skips.unmappedHw],
       ['bad/empty zip', skips.badZip],
diff --git a/packages/db/src/ingest-gcs-backup.ts b/packages/db/src/ingest-gcs-backup.ts
index 6dc604e9..d67f5164 100644
--- a/packages/db/src/ingest-gcs-backup.ts
+++ b/packages/db/src/ingest-gcs-backup.ts
@@ -434,6 +434,7 @@ async function mapWorkflowDir(
       unmappedModel: local.skips.unmappedModel,
       unmappedHw: local.skips.unmappedHw,
       noIslOsl: local.skips.noIslOsl,
+      failedRun: local.skips.failedRun,
     },
     localUnmappedModels: new Set(local.unmappedModels),
     localUnmappedHws: new Set(local.unmappedHws),

From c2f66f62f5a1dedb6a87c7c5e58ca990b3cb0956 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 7 May 2026 08:41:26 -0500
Subject: [PATCH 09/96] feat: add AIPerf to FRAMEWORK_LABELS

Tag display name for the `aiperf` spec_method suffix used by the
alternate-harness runs ingested for the agentic minimax sweep.
Without this entry the legend shows 'AIPERF' from the default
toUpperCase fallback.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/constants/src/framework-aliases.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/packages/constants/src/framework-aliases.ts b/packages/constants/src/framework-aliases.ts
index cc5eb6b4..e23a93bc 100644
--- a/packages/constants/src/framework-aliases.ts
+++ b/packages/constants/src/framework-aliases.ts
@@ -44,6 +44,7 @@ export const FRAMEWORK_LABELS: Record<string, string> = {
     ]),
   ),
   mtp: 'MTP',
+  aiperf: 'AIPerf',
 };
 
 /**

From 024797a978a2a6e2954f66a963de3205b62a149e Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 12 May 2026 15:02:07 -0500
Subject: [PATCH 10/96] fix(changelog): coerce ids to string when filtering
 changelog by run

bigint workflow_run_id sometimes deserializes as a number on the
frontend depending on the postgres adapter's behavior; strict ===
between a number and a string silently dropped every match, so the
changelog popover always reported "no changelog data available."

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/app/src/components/GlobalFilterContext.tsx | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx
index 08fc7094..11e56de7 100644
--- a/packages/app/src/components/GlobalFilterContext.tsx
+++ b/packages/app/src/components/GlobalFilterContext.tsx
@@ -87,7 +87,9 @@ function buildRunInfo(data: WorkflowInfoResponse): Record<string, RunInfo> {
   const runs: Record<string, RunInfo> = {};
   for (const run of data.runs) {
     const runId = String(run.github_run_id);
-    const runChangelogs = data.changelogs.filter((c) => c.workflow_run_id === run.github_run_id);
+    const runChangelogs = data.changelogs.filter(
+      (c) => String(c.workflow_run_id) === String(run.github_run_id),
+    );
     runs[runId] = {
       runId,
       runDate: run.created_at,

From aa154193dfbc12535f25444cdf6fccc16a3e1382 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 12 May 2026 15:36:57 -0500
Subject: [PATCH 11/96] feat: default sequence to Agentic Traces when available

If the selected model has agentic_traces data, prefer that over the
default 8K/1K fixed-seq when the user hasn't explicitly chosen via URL.
effectiveSequence already falls back to availableSequences[0] for models
without agentic, so models with only fixed-seq data still render correctly.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/app/src/components/GlobalFilterContext.tsx | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx
index 11e56de7..7813d079 100644
--- a/packages/app/src/components/GlobalFilterContext.tsx
+++ b/packages/app/src/components/GlobalFilterContext.tsx
@@ -125,7 +125,9 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) {
   const [selectedSequence, setSelectedSequence] = useState<Sequence>(() => {
     const urlSeq = getUrlParam('i_seq');
     if (urlSeq && Object.values(Sequence).includes(urlSeq as Sequence)) return urlSeq as Sequence;
-    return Sequence.EightK_OneK;
+    // Prefer Agentic Traces by default when the selected model has it; the
+    // effectiveSequence fallback below handles models without agentic data.
+    return Sequence.AgenticTraces;
   });
 
   const [selectedPrecisions, setSelectedPrecisionsRaw] = useState<string[]>(() => {

From 099a33efcb53f5130dc40d715a0f4b86d6136a93 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 15 May 2026 12:25:25 -0500
Subject: [PATCH 12/96] fix(agentic): respect percentile selector for
 input-throughput x axis
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

rowToAggDataEntry was only copying median/p99 metric variants — picking
p90/p99.9 in the percentile selector silently fell back to 0 and
collapsed every point into a vertical line at x=0. Copy the full
median/p90/p99/p99.9 set into AggDataEntry.

Hide the X-Axis Metric dropdown for agentic mode (it doubled up with the
percentile selector) and route the input-metric chart through
withPercentile so the selected percentile is what actually gets plotted
(e.g. picking median plots median_ttft) instead of always using the
hard-coded p99_ttft config default. Percentile options pared back to
median + p99.
---
 .../inference/hooks/useChartData.ts           | 46 +++++++++++++++++--
 .../app/src/components/inference/types.ts     | 10 ++++
 .../components/inference/ui/ChartControls.tsx |  3 +-
 packages/app/src/lib/benchmark-transform.ts   | 12 ++++-
 packages/app/src/lib/data-mappings.ts         |  8 +---
 packages/app/src/lib/energy-metrics.test.ts   | 10 ++++
 6 files changed, 77 insertions(+), 12 deletions(-)

diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 81ab0780..57e9a1c2 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -16,7 +16,7 @@ import { filterDataByCostLimit } from '@/components/inference/utils';
 import { useBenchmarks, benchmarkQueryOptions } from '@/hooks/api/use-benchmarks';
 import { GPU_ALIAS_TO_CANONICAL, getModelSortIndex } from '@/lib/constants';
 import { transformBenchmarkRows, withPercentile } from '@/lib/benchmark-transform';
-import type { Model, Sequence } from '@/lib/data-mappings';
+import { Sequence, type Model } from '@/lib/data-mappings';
 import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils';
 
 /** Build deduplicated comparison dates, excluding the main run date. */
@@ -216,7 +216,14 @@ export function useChartData(
             ? 'P99 Time To First Token (s)'
             : 'Median Time To First Token (s)';
 
-        if (effectiveXMetric && chartDef.chartType === 'interactivity' && isInputMetric) {
+        const isAgentic = selectedSequence === Sequence.AgenticTraces;
+
+        if (
+          effectiveXMetric &&
+          chartDef.chartType === 'interactivity' &&
+          isInputMetric &&
+          !isAgentic
+        ) {
           xAxisField = effectiveXMetric as keyof AggDataEntry;
           const labelKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition;
           if (effectiveXMetric === chartDef[`${selectedYAxisMetric}_x` as keyof ChartDefinition]) {
@@ -225,15 +232,40 @@ export function useChartData(
             xAxisLabel = isTtftOverride ? ttftLabel : chartDef.x_label;
           }
         } else if (chartDef.chartType === 'interactivity' && isInputMetric) {
+          // Agentic falls through here too — the manual X-axis dropdown is
+          // hidden in agentic mode (would double up with the percentile
+          // selector), so the config default + percentile post-processing
+          // below drives the x axis.
           const xOverrideKey = `${selectedYAxisMetric}_x` as keyof ChartDefinition;
           const xLabelOverrideKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition;
           xAxisField = (chartDef[xOverrideKey] as keyof AggDataEntry) || chartDef.x;
           xAxisLabel = (chartDef[xLabelOverrideKey] as string) || chartDef.x_label;
-        } else if (chartDef.chartType === 'e2e' && isTtftOverride) {
+        } else if (chartDef.chartType === 'e2e' && isTtftOverride && !isAgentic) {
           xAxisField = effectiveXMetric as keyof AggDataEntry;
           xAxisLabel = ttftLabel;
         }
 
+        // Agentic: rewrite the resolved x metric to the chosen percentile,
+        // and relabel accordingly. naturalX is already percentile-adjusted,
+        // so the per-metric override path is the only one that actually
+        // changes here.
+        if (isAgentic) {
+          const adjusted = withPercentile(
+            xAxisField as string,
+            selectedPercentile,
+          ) as keyof AggDataEntry;
+          if (adjusted !== xAxisField) {
+            const pctlWord =
+              selectedPercentile === 'median'
+                ? 'Median'
+                : selectedPercentile === 'p99.9'
+                  ? 'P99.9'
+                  : selectedPercentile.toUpperCase();
+            xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord);
+            xAxisField = adjusted;
+          }
+        }
+
         // The x-axis is "flipped" only when the good-direction reverses
         // (e.g. interactivity → TTFT: "higher is better" → "lower is better").
         // E2EL → TTFT keeps the same direction ("lower is better" for both),
@@ -269,7 +301,13 @@ export function useChartData(
           xAxisField,
         };
       }),
-    [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric, selectedPercentile],
+    [
+      selectedYAxisMetric,
+      selectedXAxisMetric,
+      selectedE2eXAxisMetric,
+      selectedPercentile,
+      selectedSequence,
+    ],
   );
 
   // Build renderable graphs (data processing + stable chart definitions)
diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index a2d9ef2e..cddeba54 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -50,23 +50,33 @@ export interface AggDataEntry {
   mean_ttft: number;
   median_ttft: number;
   std_ttft: number;
+  p90_ttft: number;
   p99_ttft: number;
+  'p99.9_ttft': number;
   mean_tpot: number;
   mean_intvty: number;
   median_tpot: number;
   median_intvty: number;
   std_tpot: number;
   std_intvty: number;
+  p90_tpot: number;
+  p90_intvty: number;
   p99_tpot: number;
   p99_intvty: number;
+  'p99.9_tpot': number;
+  'p99.9_intvty': number;
   mean_itl: number;
   median_itl: number;
   std_itl: number;
+  p90_itl: number;
   p99_itl: number;
+  'p99.9_itl': number;
   mean_e2el: number;
   median_e2el: number;
   std_e2el: number;
+  p90_e2el: number;
   p99_e2el: number;
+  'p99.9_e2el': number;
   disagg: boolean;
   num_prefill_gpu: number;
   num_decode_gpu: number;
diff --git a/packages/app/src/components/inference/ui/ChartControls.tsx b/packages/app/src/components/inference/ui/ChartControls.tsx
index 6707bd9e..7b4fa08f 100644
--- a/packages/app/src/components/inference/ui/ChartControls.tsx
+++ b/packages/app/src/components/inference/ui/ChartControls.tsx
@@ -269,7 +269,8 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
           </div>
 
           {graphs.some((g) => g.chartDefinition?.chartType === 'interactivity') &&
-            isInputMetric && (
+            isInputMetric &&
+            selectedSequence !== Sequence.AgenticTraces && (
               <div className="flex flex-col space-y-1.5 lg:col-span-1">
                 <LabelWithTooltip
                   htmlFor="x-axis-select"
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index 69745da2..eb62a18a 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -25,7 +25,7 @@ import type { BenchmarkRow } from '@/lib/api';
  */
 function agenticAliases(m: Record<string, number>): Record<string, number> {
   const out: Record<string, number> = {};
-  for (const suffix of ['mean', 'median', 'p90', 'p99']) {
+  for (const suffix of ['mean', 'median', 'p90', 'p99', 'p99.9']) {
     const itl = m[`${suffix}_itl`];
     const ttlt = m[`${suffix}_ttlt`];
     if (m[`${suffix}_e2el`] === undefined && ttlt !== undefined) out[`${suffix}_e2el`] = ttlt;
@@ -62,23 +62,33 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
     mean_ttft: m.mean_ttft ?? 0,
     median_ttft: m.median_ttft ?? 0,
     std_ttft: m.std_ttft ?? 0,
+    p90_ttft: m.p90_ttft ?? 0,
     p99_ttft: m.p99_ttft ?? 0,
+    'p99.9_ttft': m['p99.9_ttft'] ?? 0,
     mean_tpot: m.mean_tpot ?? 0,
     median_tpot: m.median_tpot ?? 0,
     std_tpot: m.std_tpot ?? 0,
+    p90_tpot: m.p90_tpot ?? 0,
     p99_tpot: m.p99_tpot ?? 0,
+    'p99.9_tpot': m['p99.9_tpot'] ?? 0,
     mean_intvty: m.mean_intvty ?? 0,
     median_intvty: m.median_intvty ?? 0,
     std_intvty: m.std_intvty ?? 0,
+    p90_intvty: m.p90_intvty ?? 0,
     p99_intvty: m.p99_intvty ?? 0,
+    'p99.9_intvty': m['p99.9_intvty'] ?? 0,
     mean_itl: m.mean_itl ?? 0,
     median_itl: m.median_itl ?? 0,
     std_itl: m.std_itl ?? 0,
+    p90_itl: m.p90_itl ?? 0,
     p99_itl: m.p99_itl ?? 0,
+    'p99.9_itl': m['p99.9_itl'] ?? 0,
     mean_e2el: m.mean_e2el ?? 0,
     median_e2el: m.median_e2el ?? 0,
     std_e2el: m.std_e2el ?? 0,
+    p90_e2el: m.p90_e2el ?? 0,
     p99_e2el: m.p99_e2el ?? 0,
+    'p99.9_e2el': m['p99.9_e2el'] ?? 0,
     disagg: row.disagg,
     num_prefill_gpu: row.num_prefill_gpu,
     num_decode_gpu: row.num_decode_gpu,
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index f137875c..bf48c864 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -186,21 +186,17 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
 /**
  * Percentile of the latency distribution used for the chart x-axis when
  * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants
- * for ttft, ttlt (=e2el), and itl (and intvty derived from itl) — pick which
- * slice to plot.
+ * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only the
+ * two most commonly read slices (p50, p99) are surfaced in the UI.
  */
 export enum Percentile {
   Median = 'median',
-  P90 = 'p90',
   P99 = 'p99',
-  P99_9 = 'p99.9',
 }
 
 const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
   [Percentile.Median]: { label: 'p50 (median)' },
-  [Percentile.P90]: { label: 'p90' },
   [Percentile.P99]: { label: 'p99' },
-  [Percentile.P99_9]: { label: 'p99.9' },
 };
 
 export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[];
diff --git a/packages/app/src/lib/energy-metrics.test.ts b/packages/app/src/lib/energy-metrics.test.ts
index 28cc1e36..54788585 100644
--- a/packages/app/src/lib/energy-metrics.test.ts
+++ b/packages/app/src/lib/energy-metrics.test.ts
@@ -57,23 +57,33 @@ function makeEntry(overrides: Partial<AggDataEntry> = {}): AggDataEntry {
     mean_ttft: 0.5,
     median_ttft: 0.4,
     std_ttft: 0.1,
+    p90_ttft: 0.7,
     p99_ttft: 0.8,
+    'p99.9_ttft': 0.9,
     mean_tpot: 0.02,
     mean_intvty: 45,
     median_tpot: 0.02,
     median_intvty: 44,
     std_tpot: 0.005,
     std_intvty: 5,
+    p90_tpot: 0.025,
+    p90_intvty: 55,
     p99_tpot: 0.03,
     p99_intvty: 60,
+    'p99.9_tpot': 0.035,
+    'p99.9_intvty': 65,
     mean_itl: 0.01,
     median_itl: 0.01,
     std_itl: 0.002,
+    p90_itl: 0.013,
     p99_itl: 0.015,
+    'p99.9_itl': 0.018,
     mean_e2el: 5,
     median_e2el: 4.8,
     std_e2el: 0.5,
+    p90_e2el: 5.5,
     p99_e2el: 6,
+    'p99.9_e2el': 6.5,
     disagg: false,
     num_prefill_gpu: 0,
     num_decode_gpu: 0,

From 50a06d1419c70ddd8d24b2c6545da44fe6be3a4d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 15 May 2026 12:27:19 -0500
Subject: [PATCH 13/96] fix(agentic): default percentile to p99 and drop median
 option

---
 packages/app/src/components/inference/InferenceContext.tsx | 2 +-
 packages/app/src/lib/data-mappings.ts                      | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index b4ccb9ef..af2d364e 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -122,7 +122,7 @@ export function InferenceProvider({
   // Latency percentile applied to the chart x-axis for agentic scenarios.
   // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore.
   const [selectedPercentile, setSelectedPercentile] = useState<string>(
-    () => getUrlParam('i_pctl') || 'median',
+    () => getUrlParam('i_pctl') || 'p99',
   );
   const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>(
     () => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto',
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index bf48c864..1b4f47c3 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -186,16 +186,14 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
 /**
  * Percentile of the latency distribution used for the chart x-axis when
  * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants
- * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only the
- * two most commonly read slices (p50, p99) are surfaced in the UI.
+ * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only p99
+ * is surfaced in the UI.
  */
 export enum Percentile {
-  Median = 'median',
   P99 = 'p99',
 }
 
 const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
-  [Percentile.Median]: { label: 'p50 (median)' },
   [Percentile.P99]: { label: 'p99' },
 };
 

From 3c96e9137776d1c368a0acdfeee6e769d5733464 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 15 May 2026 12:31:27 -0500
Subject: [PATCH 14/96] fix(agentic): keep only p90 as the percentile option

---
 packages/app/src/components/inference/InferenceContext.tsx | 2 +-
 packages/app/src/lib/data-mappings.ts                      | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 0ba14a21..accfdf9e 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -136,7 +136,7 @@ export function InferenceProvider({
   // Latency percentile applied to the chart x-axis for agentic scenarios.
   // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore.
   const [selectedPercentile, setSelectedPercentile] = useState<string>(
-    () => getUrlParam('i_pctl') || 'p99',
+    () => getUrlParam('i_pctl') || 'p90',
   );
   const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>(
     () => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto',
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index 0afb304a..83e6648a 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -191,12 +191,10 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
  */
 export enum Percentile {
   P90 = 'p90',
-  P99 = 'p99',
 }
 
 const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
   [Percentile.P90]: { label: 'p90' },
-  [Percentile.P99]: { label: 'p99' },
 };
 
 export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[];

From 642081af77c8165ac89a5177abbd6c0244dfb9c0 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Fri, 15 May 2026 13:31:30 -0400
Subject: [PATCH 15/96] fix(agentic): default percentile to p90, surface only
 p90/p99

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/app/cypress/support/mock-data.ts                | 2 +-
 .../app/src/components/inference/InferenceContext.tsx    | 2 +-
 .../app/src/components/inference/hooks/useChartData.ts   | 9 ++-------
 packages/app/src/components/ui/chart-selectors.tsx       | 2 +-
 packages/app/src/lib/data-mappings.ts                    | 6 ++++--
 packages/app/src/lib/url-state.ts                        | 2 +-
 6 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts
index f267dcc9..34b89aba 100644
--- a/packages/app/cypress/support/mock-data.ts
+++ b/packages/app/cypress/support/mock-data.ts
@@ -189,7 +189,7 @@ export function createMockInferenceContext(
     workflowInfo: null,
     selectedYAxisMetric: 'y_tpPerGpu',
     setSelectedYAxisMetric: namedStub('setSelectedYAxisMetric'),
-    selectedPercentile: 'median',
+    selectedPercentile: 'p90',
     setSelectedPercentile: namedStub('setSelectedPercentile'),
     selectedXAxisMetric: null,
     setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'),
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index accfdf9e..36dc672d 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -134,7 +134,7 @@ export function InferenceProvider({
     () => getUrlParam('i_e2e_xmetric') || null,
   );
   // Latency percentile applied to the chart x-axis for agentic scenarios.
-  // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore.
+  // Values: 'p90' | 'p99'. Non-agentic charts ignore.
   const [selectedPercentile, setSelectedPercentile] = useState<string>(
     () => getUrlParam('i_pctl') || 'p90',
   );
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index f2ef85ec..436fd662 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -83,7 +83,7 @@ export function useChartData(
   selectedRunDate?: string,
   enabled = true,
   latestAvailableDate?: string,
-  selectedPercentile = 'median',
+  selectedPercentile = 'p90',
   /** When set, only series for these two registry GPU keys are shown (compare pages). */
   compareGpuPair?: readonly [string, string] | null,
 ) {
@@ -261,12 +261,7 @@ export function useChartData(
             selectedPercentile,
           ) as keyof AggDataEntry;
           if (adjusted !== xAxisField) {
-            const pctlWord =
-              selectedPercentile === 'median'
-                ? 'Median'
-                : selectedPercentile === 'p99.9'
-                  ? 'P99.9'
-                  : selectedPercentile.toUpperCase();
+            const pctlWord = selectedPercentile.toUpperCase();
             xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord);
             xAxisField = adjusted;
           }
diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx
index d2940de4..e30816fa 100644
--- a/packages/app/src/components/ui/chart-selectors.tsx
+++ b/packages/app/src/components/ui/chart-selectors.tsx
@@ -315,7 +315,7 @@ export function PercentileSelector({
       <LabelWithTooltip
         htmlFor={id}
         label="Latency Percentile"
-        tooltip="Percentile of the latency distribution used for the chart x-axis. Agentic runs carry median/p90/p99/p99.9 variants; switch percentiles to see tail-latency behavior."
+        tooltip="Percentile of the latency distribution used for the chart x-axis. Switch between p90 and p99 to see tail-latency behavior on agentic runs."
       />
       <Select
         value={value}
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index 83e6648a..0970f8d7 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -186,15 +186,17 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
 /**
  * Percentile of the latency distribution used for the chart x-axis when
  * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants
- * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); p90 and
- * p99 are surfaced in the UI.
+ * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only p90
+ * and p99 are surfaced in the UI.
  */
 export enum Percentile {
   P90 = 'p90',
+  P99 = 'p99',
 }
 
 const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
   [Percentile.P90]: { label: 'p90' },
+  [Percentile.P99]: { label: 'p99' },
 };
 
 export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[];
diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts
index 35ac2359..54ce43d9 100644
--- a/packages/app/src/lib/url-state.ts
+++ b/packages/app/src/lib/url-state.ts
@@ -67,7 +67,7 @@ export const PARAM_DEFAULTS: Record<UrlStateKey, string> = {
   i_seq: '8k/1k',
   i_prec: 'fp4',
   i_metric: 'y_tpPerGpu',
-  i_pctl: 'median',
+  i_pctl: 'p90',
   i_xmetric: 'p99_ttft',
   i_e2e_xmetric: '',
   i_scale: 'auto',

From 3f45f4df92e1990070bf5a58dd7753aa9a91baff Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Fri, 15 May 2026 13:38:23 -0400
Subject: [PATCH 16/96] fix(agentic): drop p99 + median TTFT, p90 only across
 selectors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Aligns the TTFT x-axis selectors with the percentile selector — only
p90 is offered everywhere. Default x-axis metric and chart config
input-throughput x are p90_ttft.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../components/inference/InferenceContext.tsx |  2 +-
 .../inference/hooks/useChartData.ts           | 10 +---
 .../inference/inference-chart-config.json     | 10 ++--
 .../inference/replay/buildReplayTimeline.ts   |  3 +-
 .../components/inference/ui/ChartControls.tsx |  7 +--
 .../components/inference/ui/ChartDisplay.tsx  | 19 ++-----
 .../src/components/inference/utils.test.ts    | 57 +++++++------------
 .../app/src/components/inference/utils.ts     |  3 +-
 .../app/src/components/ui/chart-selectors.tsx |  2 +-
 packages/app/src/lib/data-mappings.ts         |  4 +-
 packages/app/src/lib/url-state.ts             |  2 +-
 11 files changed, 42 insertions(+), 77 deletions(-)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 36dc672d..e88f57d8 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -128,7 +128,7 @@ export function InferenceProvider({
     () => getUrlParam('i_metric') || 'y_tpPerGpu',
   );
   const [selectedXAxisMetric, setSelectedXAxisMetric] = useState<string | null>(
-    () => getUrlParam('i_xmetric') || 'p99_ttft',
+    () => getUrlParam('i_xmetric') || 'p90_ttft',
   );
   const [selectedE2eXAxisMetric, setSelectedE2eXAxisMetric] = useState<string | null>(
     () => getUrlParam('i_e2e_xmetric') || null,
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 436fd662..69222859 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -215,12 +215,8 @@ export function useChartData(
         // Resolve the effective x-axis override per chart type
         const effectiveXMetric =
           chartDef.chartType === 'e2e' ? selectedE2eXAxisMetric : selectedXAxisMetric;
-        const isTtftOverride =
-          effectiveXMetric === 'p99_ttft' || effectiveXMetric === 'median_ttft';
-        const ttftLabel =
-          effectiveXMetric === 'p99_ttft'
-            ? 'P99 Time To First Token (s)'
-            : 'Median Time To First Token (s)';
+        const isTtftOverride = effectiveXMetric === 'p90_ttft';
+        const ttftLabel = 'P90 Time To First Token (s)';
 
         const isAgentic = selectedSequence === Sequence.AgenticTraces;
 
@@ -340,7 +336,7 @@ export function useChartData(
 
         // Filter to points that have the selected metric, then remap x/y
         const hasMetric = filteredData.some((d) => metricKey in d);
-        const isTtftX = xAxisField === 'p99_ttft' || xAxisField === 'median_ttft';
+        const isTtftX = xAxisField === 'p90_ttft';
         const processedData = hasMetric
           ? filteredData
               .filter((d) => metricKey in d)
diff --git a/packages/app/src/components/inference/inference-chart-config.json b/packages/app/src/components/inference/inference-chart-config.json
index e26d237e..dcd91e60 100644
--- a/packages/app/src/components/inference/inference-chart-config.json
+++ b/packages/app/src/components/inference/inference-chart-config.json
@@ -13,9 +13,9 @@
     "y_inputTputPerGpu_label": "Input Token Throughput per GPU (tok/s/gpu)",
     "y_inputTputPerGpu_title": "Input Token Throughput per GPU",
     "y_inputTputPerGpu_roofline": "upper_left",
-    "y_inputTputPerGpu_x": "p99_ttft",
-    "y_inputTputPerGpu_x_label": "P99 Time To First Token (s)",
-    "y_inputTputPerGpu_heading": "vs. P99 Time To First Token",
+    "y_inputTputPerGpu_x": "p90_ttft",
+    "y_inputTputPerGpu_x_label": "P90 Time To First Token (s)",
+    "y_inputTputPerGpu_heading": "vs. P90 Time To First Token",
     "y_outputTputPerGpu": "outputTputPerGpu.y",
     "y_outputTputPerGpu_label": "Output Token Throughput per GPU (tok/s/gpu)",
     "y_outputTputPerGpu_title": "Output Token Throughput per GPU",
@@ -105,8 +105,8 @@
     "y_inputTputPerGpu_label": "Input Token Throughput per GPU (tok/s/gpu)",
     "y_inputTputPerGpu_title": "Input Token Throughput per GPU",
     "y_inputTputPerGpu_roofline": "upper_right",
-    "y_inputTputPerGpu_x": "p99_ttft",
-    "y_inputTputPerGpu_x_label": "P99 Time To First Token (s)",
+    "y_inputTputPerGpu_x": "p90_ttft",
+    "y_inputTputPerGpu_x_label": "P90 Time To First Token (s)",
     "y_outputTputPerGpu": "outputTputPerGpu.y",
     "y_outputTputPerGpu_label": "Output Token Throughput per GPU (tok/s/gpu)",
     "y_outputTputPerGpu_title": "Output Token Throughput per GPU",
diff --git a/packages/app/src/components/inference/replay/buildReplayTimeline.ts b/packages/app/src/components/inference/replay/buildReplayTimeline.ts
index be076418..b0eb1446 100644
--- a/packages/app/src/components/inference/replay/buildReplayTimeline.ts
+++ b/packages/app/src/components/inference/replay/buildReplayTimeline.ts
@@ -82,8 +82,7 @@ function resolveXAxisField(
   const metricTitle =
     (chartDef[`${selectedYAxisMetric}_title` as keyof ChartDefinition] as string) || '';
   const isInputMetric = metricTitle.toLowerCase().includes('input');
-  const isTtftOverride =
-    selectedXAxisMetric === 'p99_ttft' || selectedXAxisMetric === 'median_ttft';
+  const isTtftOverride = selectedXAxisMetric === 'p90_ttft';
 
   if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) {
     return selectedXAxisMetric;
diff --git a/packages/app/src/components/inference/ui/ChartControls.tsx b/packages/app/src/components/inference/ui/ChartControls.tsx
index 7b4fa08f..ad222edc 100644
--- a/packages/app/src/components/inference/ui/ChartControls.tsx
+++ b/packages/app/src/components/inference/ui/ChartControls.tsx
@@ -275,11 +275,11 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
                 <LabelWithTooltip
                   htmlFor="x-axis-select"
                   label="X-Axis Metric"
-                  tooltip="The latency metric displayed on the chart's X-axis. Options include P99 Time To First Token and Median Time To First Token."
+                  tooltip="The latency metric displayed on the chart's X-axis: P90 Time To First Token."
                 />
                 <Select
                   onValueChange={handleXAxisMetricChange}
-                  value={selectedXAxisMetric ?? 'p99_ttft'}
+                  value={selectedXAxisMetric ?? 'p90_ttft'}
                 >
                   <SelectTrigger
                     id="x-axis-select"
@@ -289,8 +289,7 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
                     <SelectValue />
                   </SelectTrigger>
                   <SelectContent portalled={false}>
-                    <SelectItem value="p99_ttft">P99 TTFT</SelectItem>
-                    <SelectItem value="median_ttft">Median TTFT</SelectItem>
+                    <SelectItem value="p90_ttft">P90 TTFT</SelectItem>
                   </SelectContent>
                 </Select>
               </div>
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index f0e1692a..78df2c37 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -408,27 +408,20 @@ export default function ChartDisplay() {
                             if (
                               graph.chartDefinition.chartType === 'interactivity' &&
                               isInputMetric &&
-                              selectedXAxisMetric
+                              selectedXAxisMetric === 'p90_ttft'
                             ) {
-                              if (selectedXAxisMetric === 'p99_ttft') {
-                                return 'vs. P99 Time To First Token';
-                              } else if (selectedXAxisMetric === 'median_ttft') {
-                                return 'vs. Median Time To First Token';
-                              }
+                              return 'vs. P90 Time To First Token';
                             }
 
                             // For e2e chart: render clickable inline dropdown for x-axis
                             if (graph.chartDefinition.chartType === 'e2e') {
                               const xAxisLabel =
-                                selectedE2eXAxisMetric === 'p99_ttft'
-                                  ? 'P99 TTFT'
-                                  : selectedE2eXAxisMetric === 'median_ttft'
-                                    ? 'Median TTFT'
-                                    : 'End-to-end Latency';
+                                selectedE2eXAxisMetric === 'p90_ttft'
+                                  ? 'P90 TTFT'
+                                  : 'End-to-end Latency';
                               const xAxisOptions = [
                                 { value: null, label: 'End-to-end Latency' },
-                                { value: 'p99_ttft', label: 'P99 TTFT' },
-                                { value: 'median_ttft', label: 'Median TTFT' },
+                                { value: 'p90_ttft', label: 'P90 TTFT' },
                               ];
                               const zoomPrefix =
                                 selectedDateRange.startDate &&
diff --git a/packages/app/src/components/inference/utils.test.ts b/packages/app/src/components/inference/utils.test.ts
index 8f8705e1..589ba580 100644
--- a/packages/app/src/components/inference/utils.test.ts
+++ b/packages/app/src/components/inference/utils.test.ts
@@ -157,12 +157,12 @@ describe('processOverlayChartData', () => {
   });
 
   it('remaps x to config override for input metrics on interactivity chart', () => {
-    // inputTputPerGpu has x override to p99_ttft on interactivity chart
+    // inputTputPerGpu has x override to p90_ttft on interactivity chart
     const data = [
       pt({
         x: 100,
         inputTputPerGpu: { y: 5, roof: false },
-        p99_ttft: 0.25,
+        p90_ttft: 0.25,
         median_intvty: 50,
       } as any),
     ];
@@ -176,16 +176,11 @@ describe('processOverlayChartData', () => {
       pt({
         x: 100,
         inputTputPerGpu: { y: 5, roof: false },
-        median_ttft: 0.1,
+        p90_ttft: 0.1,
         median_intvty: 50,
       } as any),
     ];
-    const result = processOverlayChartData(
-      data,
-      'interactivity',
-      'y_inputTputPerGpu',
-      'median_ttft',
-    );
+    const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft');
     expect(result).toHaveLength(1);
     expect(result[0].x).toBe(0.1);
   });
@@ -195,76 +190,62 @@ describe('processOverlayChartData', () => {
       pt({
         x: 100,
         inputTputPerGpu: { y: 5, roof: false },
-        p99_ttft: 0.25,
+        p90_ttft: 0.25,
         median_e2el: 2.5,
       } as any),
     ];
     const result = processOverlayChartData(data, 'e2e', 'y_inputTputPerGpu', null);
     expect(result).toHaveLength(1);
-    // e2e uses median_e2el as x (from chart config default), not p99_ttft
+    // e2e uses median_e2el as x (from chart config default), not p90_ttft
     expect(result[0].x).toBe(2.5);
   });
 
-  it('remaps x to TTFT for e2e chart when selectedXAxisMetric is p99_ttft', () => {
-    const data = [
-      pt({
-        x: 100,
-        tpPerGpu: { y: 42, roof: false },
-        p99_ttft: 0.35,
-        median_e2el: 2.5,
-      } as any),
-    ];
-    const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft');
-    expect(result).toHaveLength(1);
-    expect(result[0].x).toBe(0.35);
-  });
-
-  it('remaps x to TTFT for e2e chart when selectedXAxisMetric is median_ttft', () => {
+  it('remaps x to TTFT for e2e chart when selectedXAxisMetric is p90_ttft', () => {
     const data = [
       pt({
         x: 100,
         tpPerGpu: { y: 42, roof: false },
-        median_ttft: 0.12,
+        p90_ttft: 0.12,
         median_e2el: 2.5,
       } as any),
     ];
-    const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'median_ttft');
+    const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft');
     expect(result).toHaveLength(1);
     expect(result[0].x).toBe(0.12);
   });
 
   it('filters e2e TTFT outliers exceeding y_latency_limit', () => {
     const data = [
-      pt({ tpPerGpu: { y: 10, roof: false }, p99_ttft: 0.5, median_e2el: 1 } as any),
-      pt({ tpPerGpu: { y: 5, roof: false }, p99_ttft: 999, median_e2el: 2 } as any),
+      pt({ tpPerGpu: { y: 10, roof: false }, p90_ttft: 0.5, median_e2el: 1 } as any),
+      pt({ tpPerGpu: { y: 5, roof: false }, p90_ttft: 999, median_e2el: 2 } as any),
     ];
-    const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft');
+    const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft');
     // y_latency_limit is 60 in the e2e chart config — the 999 outlier should be filtered
     expect(result).toHaveLength(1);
     expect(result[0].x).toBe(0.5);
   });
 
   it('does not filter interactivity points by latency limit when x-axis is default', () => {
-    // Regression: selectedXAxisMetric defaults to 'p99_ttft' but the interactivity
+    // Regression: selectedXAxisMetric defaults to 'p90_ttft' but the interactivity
     // chart's x-axis stays median_intvty for non-input metrics. The latency limit
     // (60) must NOT apply to median_intvty values.
     const data = [
       pt({ tpPerGpu: { y: 42, roof: false }, median_intvty: 200 } as any),
       pt({ tpPerGpu: { y: 10, roof: false }, median_intvty: 30 } as any),
     ];
-    const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p99_ttft');
+    const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p90_ttft');
     expect(result).toHaveLength(2);
   });
 
   it('applies latency limit on interactivity only when x-axis is actually overridden', () => {
-    // When an input metric IS selected and x-axis overrides to p99_ttft,
+    // When an input metric IS selected and x-axis overrides to p90_ttft,
     // the latency limit should apply.
     const data = [
-      pt({ inputTputPerGpu: { y: 5, roof: false }, p99_ttft: 0.5, median_intvty: 10 } as any),
-      pt({ inputTputPerGpu: { y: 3, roof: false }, p99_ttft: 999, median_intvty: 20 } as any),
+      pt({ inputTputPerGpu: { y: 5, roof: false }, p90_ttft: 0.5, median_intvty: 10 } as any),
+      pt({ inputTputPerGpu: { y: 3, roof: false }, p90_ttft: 999, median_intvty: 20 } as any),
     ];
-    const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p99_ttft');
-    // x-axis is overridden to p99_ttft for input metric — latency limit SHOULD filter 999
+    const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft');
+    // x-axis is overridden to p90_ttft for input metric — latency limit SHOULD filter 999
     expect(result).toHaveLength(1);
     expect(result[0].x).toBe(0.5);
   });
diff --git a/packages/app/src/components/inference/utils.ts b/packages/app/src/components/inference/utils.ts
index 4b5335b6..735007ab 100644
--- a/packages/app/src/components/inference/utils.ts
+++ b/packages/app/src/components/inference/utils.ts
@@ -88,8 +88,7 @@ export function processOverlayChartData(
   let xAxisField: string = chartDef.x;
   // selectedXAxisMetric is already the effective metric for this chart type
   // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric)
-  const isTtftOverride =
-    selectedXAxisMetric === 'p99_ttft' || selectedXAxisMetric === 'median_ttft';
+  const isTtftOverride = selectedXAxisMetric === 'p90_ttft';
 
   if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) {
     xAxisField = selectedXAxisMetric;
diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx
index e30816fa..19b4bfb0 100644
--- a/packages/app/src/components/ui/chart-selectors.tsx
+++ b/packages/app/src/components/ui/chart-selectors.tsx
@@ -315,7 +315,7 @@ export function PercentileSelector({
       <LabelWithTooltip
         htmlFor={id}
         label="Latency Percentile"
-        tooltip="Percentile of the latency distribution used for the chart x-axis. Switch between p90 and p99 to see tail-latency behavior on agentic runs."
+        tooltip="Percentile of the latency distribution used for the chart x-axis on agentic runs."
       />
       <Select
         value={value}
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index 0970f8d7..91f65a34 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -187,16 +187,14 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
  * Percentile of the latency distribution used for the chart x-axis when
  * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants
  * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only p90
- * and p99 are surfaced in the UI.
+ * is surfaced in the UI.
  */
 export enum Percentile {
   P90 = 'p90',
-  P99 = 'p99',
 }
 
 const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
   [Percentile.P90]: { label: 'p90' },
-  [Percentile.P99]: { label: 'p99' },
 };
 
 export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[];
diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts
index 54ce43d9..b88c92b2 100644
--- a/packages/app/src/lib/url-state.ts
+++ b/packages/app/src/lib/url-state.ts
@@ -68,7 +68,7 @@ export const PARAM_DEFAULTS: Record<UrlStateKey, string> = {
   i_prec: 'fp4',
   i_metric: 'y_tpPerGpu',
   i_pctl: 'p90',
-  i_xmetric: 'p99_ttft',
+  i_xmetric: 'p90_ttft',
   i_e2e_xmetric: '',
   i_scale: 'auto',
   i_gpus: '',

From 03c775ac9710b4a95d2d2c270adfcfe202219130 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Fri, 15 May 2026 13:41:14 -0400
Subject: [PATCH 17/96] fix(agentic): honor e2e TTFT override in agentic mode
 too

The `!isAgentic` gate on the e2e TTFT override branch dropped the
user's `p90_ttft` pick in agentic mode, leaving the chart on the
default p90_e2el. The trailing withPercentile pass is idempotent
when xAxisField is already at the right percentile, so the gate is
unnecessary.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/app/src/components/inference/hooks/useChartData.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 69222859..2a344cef 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -242,7 +242,7 @@ export function useChartData(
           const xLabelOverrideKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition;
           xAxisField = (chartDef[xOverrideKey] as keyof AggDataEntry) || chartDef.x;
           xAxisLabel = (chartDef[xLabelOverrideKey] as string) || chartDef.x_label;
-        } else if (chartDef.chartType === 'e2e' && isTtftOverride && !isAgentic) {
+        } else if (chartDef.chartType === 'e2e' && isTtftOverride) {
           xAxisField = effectiveXMetric as keyof AggDataEntry;
           xAxisLabel = ttftLabel;
         }

From 49f2b2780d71cdad7b4a52ae0fdab0e2b8013d09 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Fri, 15 May 2026 13:45:19 -0400
Subject: [PATCH 18/96] fix(agentic): default e2e chart x-axis to p90 TTFT

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/app/src/components/inference/InferenceContext.tsx | 2 +-
 packages/app/src/lib/url-state.ts                          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index e88f57d8..c80afc2e 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -131,7 +131,7 @@ export function InferenceProvider({
     () => getUrlParam('i_xmetric') || 'p90_ttft',
   );
   const [selectedE2eXAxisMetric, setSelectedE2eXAxisMetric] = useState<string | null>(
-    () => getUrlParam('i_e2e_xmetric') || null,
+    () => getUrlParam('i_e2e_xmetric') || 'p90_ttft',
   );
   // Latency percentile applied to the chart x-axis for agentic scenarios.
   // Values: 'p90' | 'p99'. Non-agentic charts ignore.
diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts
index b88c92b2..4a48a776 100644
--- a/packages/app/src/lib/url-state.ts
+++ b/packages/app/src/lib/url-state.ts
@@ -69,7 +69,7 @@ export const PARAM_DEFAULTS: Record<UrlStateKey, string> = {
   i_metric: 'y_tpPerGpu',
   i_pctl: 'p90',
   i_xmetric: 'p90_ttft',
-  i_e2e_xmetric: '',
+  i_e2e_xmetric: 'p90_ttft',
   i_scale: 'auto',
   i_gpus: '',
   i_dates: '',

From 9e2c5322b0873ecd8ba8720d7e7e21961a7178dd Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 15 May 2026 12:47:22 -0500
Subject: [PATCH 19/96] fix(tooltip): cap data-point numeric values at 3
 decimal places

---
 .../inference/utils/tooltipUtils.ts           | 28 ++++++++++++-------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts
index 4359fc44..3154070a 100644
--- a/packages/app/src/components/inference/utils/tooltipUtils.ts
+++ b/packages/app/src/components/inference/utils/tooltipUtils.ts
@@ -91,6 +91,14 @@ const tooltipLine = (label: string, value: string | number) =>
 const formatPct = (v: number | undefined): string | null =>
   v === undefined || v === null || Number.isNaN(v) ? null : `${(v * 100).toFixed(1)}%`;
 
+/** Tooltip numeric values are capped at 3 decimal places (trailing zeros stripped). */
+const fmt = (v: number): string => {
+  if (!Number.isFinite(v)) return String(v);
+  const rounded = parseFloat(v.toFixed(3));
+  if (Math.abs(rounded) >= 10000) return new Intl.NumberFormat('en-US').format(rounded);
+  return String(rounded);
+};
+
 /**
  * Agentic-only tooltip rows: offload mode, KV cache hit rates, request
  * success, token totals. Returns an empty string for non-agentic rows.
@@ -201,16 +209,16 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
           : ''
       }
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${xLabel}:</strong> ${formatNumber(d.x)}
+        <strong>${xLabel}:</strong> ${fmt(d.x)}
       </div>
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${yLabel}:</strong> ${formatNumber(d.y)}
+        <strong>${yLabel}:</strong> ${fmt(d.y)}
       </div>
       ${
         selectedYAxisMetric === 'y_tpPerGpu' && d['inputTputPerGpu']
           ? `
           <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-            <strong>Input Token Throughput per GPU:</strong> ${formatNumber(d['inputTputPerGpu'].y)}
+            <strong>Input Token Throughput per GPU:</strong> ${fmt(d['inputTputPerGpu'].y)}
           </div>`
           : ''
       }
@@ -218,7 +226,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
         selectedYAxisMetric === 'y_tpPerGpu' && d['outputTputPerGpu']
           ? `
           <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-            <strong>Output Token Throughput per GPU:</strong> ${formatNumber(d['outputTputPerGpu'].y)}
+            <strong>Output Token Throughput per GPU:</strong> ${fmt(d['outputTputPerGpu'].y)}
           </div>`
           : ''
       }
@@ -274,10 +282,10 @@ export const generateOverlayTooltipContent = (config: OverlayTooltipConfig): str
         <strong>Date:</strong> ${d.actualDate ?? d.date}
       </div>
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${xLabel}:</strong> ${formatNumber(d.x)}
+        <strong>${xLabel}:</strong> ${fmt(d.x)}
       </div>
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${yLabel}:</strong> ${formatNumber(d.y)}
+        <strong>${yLabel}:</strong> ${fmt(d.y)}
       </div>
       ${tooltipLine('Total GPUs', d.tp)}
       ${generateParallelismHTML(d)}
@@ -318,16 +326,16 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
           : ''
       }
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${xLabel}:</strong> ${formatNumber(d.x)}
+        <strong>${xLabel}:</strong> ${fmt(d.x)}
       </div>
       <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-        <strong>${yLabel}:</strong> ${formatNumber(d.y)}
+        <strong>${yLabel}:</strong> ${fmt(d.y)}
       </div>
       ${
         selectedYAxisMetric === 'y_tpPerGpu' && d['inputTputPerGpu']
           ? `
           <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-            <strong>Input Token Throughput per GPU:</strong> ${formatNumber(d['inputTputPerGpu'].y)}
+            <strong>Input Token Throughput per GPU:</strong> ${fmt(d['inputTputPerGpu'].y)}
           </div>`
           : ''
       }
@@ -335,7 +343,7 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
         selectedYAxisMetric === 'y_tpPerGpu' && d['outputTputPerGpu']
           ? `
           <div style="color: var(--muted-foreground); font-size: 11px; margin-bottom: 4px;">
-            <strong>Output Token Throughput per GPU:</strong> ${formatNumber(d['outputTputPerGpu'].y)}
+            <strong>Output Token Throughput per GPU:</strong> ${fmt(d['outputTputPerGpu'].y)}
           </div>`
           : ''
       }

From 50ed25fa95e36d2ad881a1f68aa70010a19f34de Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 15 May 2026 17:05:09 -0500
Subject: [PATCH 20/96] fix(agentic): relabel x-axis title for natural-x case
 too

---
 .../components/inference/hooks/useChartData.ts    | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 2a344cef..b14775b6 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -248,19 +248,16 @@ export function useChartData(
         }
 
         // Agentic: rewrite the resolved x metric to the chosen percentile,
-        // and relabel accordingly. naturalX is already percentile-adjusted,
-        // so the per-metric override path is the only one that actually
-        // changes here.
+        // and relabel accordingly. Both have to be updated unconditionally —
+        // xAxisField may already be percentile-adjusted (via naturalX) while
+        // xAxisLabel still carries the raw chartDef.x_label prefix.
         if (isAgentic) {
-          const adjusted = withPercentile(
+          xAxisField = withPercentile(
             xAxisField as string,
             selectedPercentile,
           ) as keyof AggDataEntry;
-          if (adjusted !== xAxisField) {
-            const pctlWord = selectedPercentile.toUpperCase();
-            xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord);
-            xAxisField = adjusted;
-          }
+          const pctlWord = selectedPercentile.toUpperCase();
+          xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord);
         }
 
         // The x-axis is "flipped" only when the good-direction reverses

From e9d8e3f66143fcdce8709f4a55bd0f29889d7174 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 15 May 2026 17:08:05 -0500
Subject: [PATCH 21/96] fix(agentic): include percentile word in chart heading

---
 .../app/src/components/inference/hooks/useChartData.ts |  9 +++++++++
 .../app/src/components/inference/ui/ChartDisplay.tsx   | 10 ++++------
 .../components/inference/ui/UnofficialChartDisplay.tsx |  4 +---
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index b14775b6..0d13b8ca 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -251,6 +251,10 @@ export function useChartData(
         // and relabel accordingly. Both have to be updated unconditionally —
         // xAxisField may already be percentile-adjusted (via naturalX) while
         // xAxisLabel still carries the raw chartDef.x_label prefix.
+        // The chart heading ("vs. <latency>") is also rewritten to include
+        // the percentile so the title above the plot reflects what's drawn.
+        const headingKey = `${selectedYAxisMetric}_heading` as keyof ChartDefinition;
+        let chartHeading = (chartDef[headingKey] as string) || chartDef.heading;
         if (isAgentic) {
           xAxisField = withPercentile(
             xAxisField as string,
@@ -258,6 +262,10 @@ export function useChartData(
           ) as keyof AggDataEntry;
           const pctlWord = selectedPercentile.toUpperCase();
           xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord);
+          chartHeading = chartHeading.replace(
+            /^(vs\.\s+)(?:(Median|Mean|P90|P99(?:\.9)?)\s+)?/iu,
+            `$1${pctlWord} `,
+          );
         }
 
         // The x-axis is "flipped" only when the good-direction reverses
@@ -288,6 +296,7 @@ export function useChartData(
           chartDefinition: {
             ...chartDef,
             ...rooflineOverrides,
+            heading: chartHeading,
             x_label: xAxisLabel,
             y_label: dynamicYLabel === null ? undefined : String(dynamicYLabel),
           },
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 78df2c37..35213a14 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -449,12 +449,10 @@ export default function ChartDisplay() {
                               );
                             }
 
-                            // Fall back to configured heading
-                            return (
-                              graph.chartDefinition[
-                                `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition
-                              ] || graph.chartDefinition.heading
-                            );
+                            // Fall back to the heading baked into chartDefinition
+                            // by useChartData (already resolves per-metric overrides
+                            // and applies the agentic percentile rewrite).
+                            return graph.chartDefinition.heading;
                           })()}
                         </h2>
                         <p className="text-sm text-muted-foreground mb-2">
diff --git a/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx b/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx
index f9b1b3c8..73018483 100644
--- a/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/UnofficialChartDisplay.tsx
@@ -194,9 +194,7 @@ export function UnofficialChartDisplay() {
                           `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
                         ]
                       }{' '}
-                      {graph.chartDefinition[
-                        `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition
-                      ] || graph.chartDefinition.heading}
+                      {graph.chartDefinition.heading}
                     </h2>
                     <p className="text-sm text-muted-foreground mb-2">
                       {graph.model} • {selectedPrecisions.join(', ')} • {graph.sequence}

From 2046282eb3386bd0e7164b57a3f5dace9465e169 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 15 May 2026 17:15:24 -0500
Subject: [PATCH 22/96] fix(agentic): include percentile in e2e chart heading
 dropdown

---
 .../src/components/inference/ui/ChartDisplay.tsx    | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 35213a14..e9021aed 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -40,6 +40,7 @@ import {
   getModelLabel,
   getPrecisionLabel,
   getSequenceLabel,
+  sequenceKind,
 } from '@/lib/data-mappings';
 import { useComparisonChangelogs } from '@/hooks/api/use-comparison-changelogs';
 import { useTrendData } from '@/components/inference/hooks/useTrendData';
@@ -152,6 +153,7 @@ export default function ChartDisplay() {
     activeHwTypes,
     activeDates,
     setSelectedE2eXAxisMetric,
+    selectedPercentile,
     compareGpuPair,
   } = useInference();
 
@@ -415,12 +417,15 @@ export default function ChartDisplay() {
 
                             // For e2e chart: render clickable inline dropdown for x-axis
                             if (graph.chartDefinition.chartType === 'e2e') {
+                              const isAgentic = sequenceKind(selectedSequence) === 'agentic';
+                              const pctlWord = selectedPercentile.toUpperCase();
+                              const e2elLabel = isAgentic
+                                ? `${pctlWord} End-to-end Latency`
+                                : 'End-to-end Latency';
                               const xAxisLabel =
-                                selectedE2eXAxisMetric === 'p90_ttft'
-                                  ? 'P90 TTFT'
-                                  : 'End-to-end Latency';
+                                selectedE2eXAxisMetric === 'p90_ttft' ? 'P90 TTFT' : e2elLabel;
                               const xAxisOptions = [
-                                { value: null, label: 'End-to-end Latency' },
+                                { value: null, label: e2elLabel },
                                 { value: 'p90_ttft', label: 'P90 TTFT' },
                               ];
                               const zoomPrefix =

From 9957f19e630c14fbfadb411725ba1736d58a83e1 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 18:53:56 -0500
Subject: [PATCH 23/96] feat(agentic): per-point trace_replay storage + detail
 page POC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Persist aiperf's profile_export.jsonl and server_metrics_export.{csv,json}
per agentic benchmark point in a new agentic_trace_replay sibling table
(migration 006), then a follow-up column for the gzipped time-series JSON
(migration 007). Ingest hook walks the agentic_<suffix> sibling artifact
and captures all three files; ~6 MB gz per point.

New /inference/agentic/[id] detail page renders:
- ISL / OSL histograms with p50/p75/p90/p95 guide lines
- KV cache utilization over time (raw scatter + 50-sample rolling avg)
- Request queue depth (running / waiting / total, smoothed)
- Prefix cache hit rate per interval (raw scatter + smoothed)
- Total + decode throughput with cumulative running-avg overlay
- Cumulative prompt token source breakdown (stacked area)

SiblingNav at the top renders the SKU label (e.g. "B200 · DeepSeek V4 Pro
· FP4 · vLLM") with chips for every (TP, conc, offload) variant in the
same workflow run so users can jump between sibling points.

Tooltip changes:
- portal to document.body + position:fixed so the tooltip can escape
  parent stacking contexts (backdrop-filter on the chart Card)
- clamp positioning to keep the tooltip inside the chart area
- "View charts →" button on pinned agentic points navigates to the
  detail page

Also ignores .claude/worktrees/ from oxlint so parallel agent worktrees
don't trip the pre-commit hook.
---
 .eslintignore                                 |   3 +
 .../inference/agentic/[id]/page.tsx           |  17 +
 .../app/src/app/api/unofficial-run/route.ts   |   4 +
 .../app/api/v1/benchmark-siblings/route.ts    |  38 +++
 .../src/app/api/v1/trace-histograms/route.ts  |  60 ++++
 .../app/api/v1/trace-server-metrics/route.ts  |  40 +++
 .../agentic-point/agentic-point-detail.tsx    | 308 +++++++++++++++++
 .../inference/agentic-point/distribution.tsx  | 140 ++++++++
 .../inference/agentic-point/sibling-nav.tsx   | 118 +++++++
 .../agentic-point/time-series-chart.tsx       | 311 ++++++++++++++++++
 .../app/src/components/inference/types.ts     |   2 +
 .../components/inference/ui/ScatterGraph.tsx  | 225 +++++++++----
 .../inference/utils/tooltipUtils.ts           |  34 +-
 .../src/components/ui/d3-chart-wrapper.tsx    |  53 ++-
 .../unofficial-run-provider.test.ts           |   1 +
 .../src/hooks/api/use-benchmark-siblings.ts   |  46 +++
 .../app/src/hooks/api/use-trace-histograms.ts |  39 +++
 .../src/hooks/api/use-trace-server-metrics.ts |  70 ++++
 packages/app/src/lib/api.ts                   |   2 +
 .../app/src/lib/benchmark-transform.test.ts   |   1 +
 packages/app/src/lib/benchmark-transform.ts   |   2 +
 .../app/src/lib/compare-pair-defaults.test.ts |   1 +
 .../src/lib/d3-chart/layers/scatter-points.ts |  30 +-
 .../migrations/006_agentic_trace_replay.sql   |  34 ++
 .../007_agentic_trace_server_metrics_json.sql |  17 +
 packages/db/src/etl/skip-tracker.test.ts      |   1 +
 packages/db/src/etl/skip-tracker.ts           |   3 +
 packages/db/src/etl/trace-replay-ingest.ts    |  83 +++++
 packages/db/src/ingest-ci-run.ts              |  90 +++++
 packages/db/src/ingest-gcs-backup.ts          |   2 +
 packages/db/src/json-provider.ts              |   1 +
 packages/db/src/queries/benchmark-siblings.ts | 132 ++++++++
 packages/db/src/queries/benchmarks.ts         |   9 +
 packages/db/src/queries/trace-histograms.ts   |  82 +++++
 .../db/src/queries/trace-server-metrics.ts    | 275 ++++++++++++++++
 35 files changed, 2196 insertions(+), 78 deletions(-)
 create mode 100644 .eslintignore
 create mode 100644 packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx
 create mode 100644 packages/app/src/app/api/v1/benchmark-siblings/route.ts
 create mode 100644 packages/app/src/app/api/v1/trace-histograms/route.ts
 create mode 100644 packages/app/src/app/api/v1/trace-server-metrics/route.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/distribution.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/sibling-nav.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/time-series-chart.tsx
 create mode 100644 packages/app/src/hooks/api/use-benchmark-siblings.ts
 create mode 100644 packages/app/src/hooks/api/use-trace-histograms.ts
 create mode 100644 packages/app/src/hooks/api/use-trace-server-metrics.ts
 create mode 100644 packages/db/migrations/006_agentic_trace_replay.sql
 create mode 100644 packages/db/migrations/007_agentic_trace_server_metrics_json.sql
 create mode 100644 packages/db/src/etl/trace-replay-ingest.ts
 create mode 100644 packages/db/src/queries/benchmark-siblings.ts
 create mode 100644 packages/db/src/queries/trace-histograms.ts
 create mode 100644 packages/db/src/queries/trace-server-metrics.ts

diff --git a/.eslintignore b/.eslintignore
new file mode 100644
index 00000000..513a873e
--- /dev/null
+++ b/.eslintignore
@@ -0,0 +1,3 @@
+# Stale agent worktrees produced by parallel Claude Code sessions — they
+# hold their own branches and are linted as part of their own runs.
+.claude/worktrees/
diff --git a/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx
new file mode 100644
index 00000000..77f29805
--- /dev/null
+++ b/packages/app/src/app/(dashboard)/inference/agentic/[id]/page.tsx
@@ -0,0 +1,17 @@
+import type { Metadata } from 'next';
+
+import { AgenticPointDetail } from '@/components/inference/agentic-point/agentic-point-detail';
+
+export const metadata: Metadata = {
+  title: 'Agentic trace detail | InferenceX',
+  robots: { index: false },
+};
+
+export default async function AgenticPointDetailPage({
+  params,
+}: {
+  params: Promise<{ id: string }>;
+}) {
+  const { id } = await params;
+  return <AgenticPointDetail id={Number(id)} />;
+}
diff --git a/packages/app/src/app/api/unofficial-run/route.ts b/packages/app/src/app/api/unofficial-run/route.ts
index 7578e897..3d2d0da7 100644
--- a/packages/app/src/app/api/unofficial-run/route.ts
+++ b/packages/app/src/app/api/unofficial-run/route.ts
@@ -33,6 +33,10 @@ export function normalizeArtifactRows(
     if (!params) continue;
     const { config } = params;
     results.push({
+      // Synthetic id — overlay rows aren't persisted, so trace_replay lookups
+      // (keyed on benchmark_results.id) will always miss, which is the
+      // intended behaviour: overlays never have stored trace_replay blobs.
+      id: 0,
       hardware: config.hardware,
       framework: config.framework,
       model: config.model,
diff --git a/packages/app/src/app/api/v1/benchmark-siblings/route.ts b/packages/app/src/app/api/v1/benchmark-siblings/route.ts
new file mode 100644
index 00000000..14c1d461
--- /dev/null
+++ b/packages/app/src/app/api/v1/benchmark-siblings/route.ts
@@ -0,0 +1,38 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getBenchmarkSiblings,
+  type BenchmarkSiblings,
+} from '@semianalysisai/inferencex-db/queries/benchmark-siblings';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedSiblings = cachedQuery(
+  (id: number): Promise<BenchmarkSiblings | null> => getBenchmarkSiblings(getDb(), id),
+  'benchmark-siblings',
+);
+
+/**
+ * GET /api/v1/benchmark-siblings?id=N — `id` must be a positive integer (400 otherwise).
+ *
+ * Returns the SKU (hw/framework/model/precision/spec/benchmark_type) of the
+ * benchmark_result + all sibling rows that share that SKU within the same
+ * workflow_run. Used by the agentic detail page to render a navigator.
+ */
+export async function GET(request: NextRequest) {
+  const id = Number(request.nextUrl.searchParams.get('id'));
+  if (!Number.isInteger(id) || id <= 0) {
+    return NextResponse.json({ error: 'id is required (benchmark_result_id)' }, { status: 400 });
+  }
+  try {
+    const data = await getCachedSiblings(id);
+    if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
+    return cachedJson(data);
+  } catch (error) {
+    console.error('Error fetching benchmark siblings:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/app/api/v1/trace-histograms/route.ts b/packages/app/src/app/api/v1/trace-histograms/route.ts
new file mode 100644
index 00000000..fd7572a8
--- /dev/null
+++ b/packages/app/src/app/api/v1/trace-histograms/route.ts
@@ -0,0 +1,60 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getTraceHistograms,
+  type TraceHistogramMap,
+} from '@semianalysisai/inferencex-db/queries/trace-histograms';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedTraceHistograms = cachedQuery(
+  (ids: number[]): Promise<TraceHistogramMap> => getTraceHistograms(getDb(), ids),
+  'trace-histograms',
+);
+
+const MAX_IDS_PER_REQUEST = 200;
+
+/**
+ * GET /api/v1/trace-histograms?ids=1,2,3 — non-positive / non-integer tokens are dropped.
+ *
+ * Returns per-request ISL/OSL arrays parsed from the stored aiperf
+ * `profile_export.jsonl` blobs, keyed by `benchmark_results.id`.
+ * Ids without a trace_replay blob are omitted from the response.
+ */
+export async function GET(request: NextRequest) {
+  const raw = request.nextUrl.searchParams.get('ids');
+  if (!raw) {
+    return NextResponse.json({ error: 'ids query param is required' }, { status: 400 });
+  }
+
+  const ids = [
+    ...new Set(
+      raw
+        .split(',')
+        .map((s) => Number(s.trim()))
+        .filter((n) => Number.isInteger(n) && n > 0),
+    ),
+  ];
+  if (ids.length === 0) {
+    return NextResponse.json({ error: 'no valid ids provided' }, { status: 400 });
+  }
+  if (ids.length > MAX_IDS_PER_REQUEST) {
+    return NextResponse.json(
+      { error: `too many ids (max ${MAX_IDS_PER_REQUEST})` },
+      { status: 400 },
+    );
+  }
+
+  try {
+    // Sort the cache key so the same set of ids in any order hits the same entry.
+    const sorted = [...ids].toSorted((a, b) => a - b);
+    const histograms = await getCachedTraceHistograms(sorted);
+    return cachedJson(histograms);
+  } catch (error) {
+    console.error('Error fetching trace histograms:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/app/api/v1/trace-server-metrics/route.ts b/packages/app/src/app/api/v1/trace-server-metrics/route.ts
new file mode 100644
index 00000000..7346a3e8
--- /dev/null
+++ b/packages/app/src/app/api/v1/trace-server-metrics/route.ts
@@ -0,0 +1,40 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getTraceServerMetrics,
+  type TraceServerMetrics,
+} from '@semianalysisai/inferencex-db/queries/trace-server-metrics';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedTraceServerMetrics = cachedQuery(
+  (id: number): Promise<TraceServerMetrics | null> => getTraceServerMetrics(getDb(), id),
+  'trace-server-metrics',
+  { blobOnly: true },
+);
+
+/**
+ * GET /api/v1/trace-server-metrics?id=N — `id` must be a positive integer (400 otherwise).
+ *
+ * Returns parsed time-series for the agentic detail view: KV cache usage,
+ * prefix cache hit rate per interval, queue depth, and per-source prompt
+ * token rates. Times are in seconds from benchmark start. 404 if the point
+ * has no stored server_metrics_export.json blob.
+ */
+export async function GET(request: NextRequest) {
+  const id = Number(request.nextUrl.searchParams.get('id'));
+  if (!Number.isInteger(id) || id <= 0) {
+    return NextResponse.json({ error: 'id is required (benchmark_result_id)' }, { status: 400 });
+  }
+  try {
+    const data = await getCachedTraceServerMetrics(id);
+    if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
+    return cachedJson(data);
+  } catch (error) {
+    console.error('Error fetching trace server metrics:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
new file mode 100644
index 00000000..3cd274ba
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -0,0 +1,308 @@
+'use client';
+
+import Link from 'next/link';
+import { useRouter } from 'next/navigation';
+import { ArrowLeft } from 'lucide-react';
+
+import { useTraceHistograms } from '@/hooks/api/use-trace-histograms';
+import {
+  useTraceServerMetrics,
+  type PointMeta,
+  type QueueDepthPoint,
+  type TimeSeriesPoint,
+} from '@/hooks/api/use-trace-server-metrics';
+import { useBenchmarkSiblings } from '@/hooks/api/use-benchmark-siblings';
+
+import { Distribution } from './distribution';
+import { SiblingNav } from './sibling-nav';
+import {
+  StackedAreaChart,
+  TimeSeriesChart,
+  cumulativeAverage,
+  rollingAverage,
+  sumSeries,
+} from './time-series-chart';
+
+interface Props {
+  id: number;
+}
+
+const fmtPct = (v: number | null | undefined): string =>
+  v === null || v === undefined || Number.isNaN(v) ? '—' : `${(v * 100).toFixed(2)}%`;
+
+function MetaLine({ label, value }: { label: string; value: React.ReactNode }) {
+  return (
+    <div className="flex flex-col gap-0.5">
+      <span className="text-xs uppercase tracking-wide text-muted-foreground">{label}</span>
+      <span className="text-sm font-medium text-foreground">{value}</span>
+    </div>
+  );
+}
+
+function PointSummary({ meta }: { meta: PointMeta }) {
+  return (
+    <div className="mb-4">
+      <div className="flex items-baseline justify-between gap-3 mb-2">
+        <p className="text-sm text-muted-foreground">
+          Selected point
+          {meta.disagg ? ' · disagg' : ''}
+          {meta.spec_method && meta.spec_method !== 'none' ? ` · spec=${meta.spec_method}` : ''}
+        </p>
+        {meta.run_url && (
+          <a
+            href={meta.run_url}
+            target="_blank"
+            rel="noopener noreferrer"
+            className="text-xs text-muted-foreground hover:text-foreground underline"
+          >
+            GitHub Actions run →
+          </a>
+        )}
+      </div>
+      <div className="grid grid-cols-2 sm:grid-cols-3 lg:grid-cols-6 gap-3">
+        <MetaLine label="Offload" value={(meta.offload_mode ?? 'off').toUpperCase()} />
+        <MetaLine label="Concurrency" value={meta.conc} />
+        <MetaLine label="GPU cache hit" value={fmtPct(meta.server_gpu_cache_hit_rate)} />
+        <MetaLine label="CPU cache hit" value={fmtPct(meta.server_cpu_cache_hit_rate)} />
+        {meta.isl !== null && <MetaLine label="ISL" value={meta.isl} />}
+        {meta.osl !== null && <MetaLine label="OSL" value={meta.osl} />}
+      </div>
+    </div>
+  );
+}
+
+function ChartCard({ title, children }: { title: string; children: React.ReactNode }) {
+  return (
+    <div className="rounded-lg border border-border/40 bg-card/40 p-4">
+      <h2 className="text-sm font-semibold text-foreground mb-3">{title}</h2>
+      {children}
+    </div>
+  );
+}
+/** Detail view for one agentic point; metric cards pulse only while loading, else show "No data". */
+export function AgenticPointDetail({ id }: Props) {
+  const router = useRouter();
+  const histQuery = useTraceHistograms([id], true);
+  const metricsQuery = useTraceServerMetrics(id, true);
+  const siblingsQuery = useBenchmarkSiblings(id);
+
+  const hist = histQuery.data?.[id];
+  const metrics = metricsQuery.data;
+  const siblingsData = siblingsQuery.data;
+
+  return (
+    <div className="container mx-auto px-4 lg:px-8 flex flex-col gap-4 py-6">
+      <div className="flex items-center gap-2">
+        <button
+          type="button"
+          onClick={() => router.back()}
+          className="inline-flex items-center gap-1 text-sm text-muted-foreground hover:text-foreground"
+        >
+          <ArrowLeft className="size-4" /> Back
+        </button>
+        <span className="text-sm text-muted-foreground">·</span>
+        <Link href="/inference" className="text-sm text-muted-foreground hover:text-foreground">
+          Inference chart
+        </Link>
+      </div>
+
+      {siblingsData ? (
+        <SiblingNav sku={siblingsData.sku} siblings={siblingsData.siblings} />
+      ) : siblingsQuery.isLoading ? (
+        <div className="text-sm text-muted-foreground">Loading SKU navigator…</div>
+      ) : null}
+
+      {metrics ? (
+        <PointSummary meta={metrics.meta} />
+      ) : metricsQuery.isLoading ? (
+        <div className="text-sm text-muted-foreground">Loading point metadata…</div>
+      ) : null}
+
+      {metricsQuery.isError && (
+        <div className="rounded-lg border border-destructive/40 bg-destructive/10 p-4 text-sm text-destructive">
+          Failed to load trace data for benchmark point #{id}.
+        </div>
+      )}
+      {metricsQuery.data === null && !metricsQuery.isLoading && (
+        <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+          No stored trace_replay blob for benchmark point #{id}. This point predates the aiperf
+          time-series capture, or its source artifacts have expired on GitHub.
+        </div>
+      )}
+
+      <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
+        <ChartCard title="Input sequence length distribution">
+          {hist ? (
+            <Distribution values={hist.isl} unit="tokens" />
+          ) : histQuery.isLoading ? (
+            <Skeleton />
+          ) : (
+            <Empty />
+          )}
+        </ChartCard>
+        <ChartCard title="Output sequence length distribution">
+          {hist ? (
+            <Distribution values={hist.osl} unit="tokens" />
+          ) : histQuery.isLoading ? (
+            <Skeleton />
+          ) : (
+            <Empty />
+          )}
+        </ChartCard>
+
+        <ChartCard title="KV cache utilization over time">
+          {metrics ? (
+            <TimeSeriesChart
+              series={[
+                {
+                  name: 'GPU KV cache (avg n=50)',
+                  data: rollingAverage(metrics.kvCacheUsage, 50),
+                  rawData: metrics.kvCacheUsage,
+                  color: '#3b82f6',
+                  strokeWidth: 2,
+                },
+              ]}
+              durationS={metrics.durationS}
+              yMax={1}
+              yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+              yAxisLabel="KV cache (%)"
+            />
+          ) : (
+            metricsQuery.isLoading ? <Skeleton /> : <Empty />
+          )}
+        </ChartCard>
+
+        <ChartCard title="Request queue depth">
+          {metrics ? (
+            <TimeSeriesChart
+              series={[
+                {
+                  name: 'Running (avg n=50)',
+                  data: rollingAverage(
+                    metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                      t: p.t,
+                      value: p.running,
+                    })),
+                    50,
+                  ),
+                  color: '#22c55e',
+                  strokeWidth: 2,
+                },
+                {
+                  name: 'Waiting (avg n=50)',
+                  data: rollingAverage(
+                    metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                      t: p.t,
+                      value: p.waiting,
+                    })),
+                    50,
+                  ),
+                  color: '#ef4444',
+                  strokeWidth: 2,
+                },
+                {
+                  name: 'Total (avg n=50)',
+                  data: rollingAverage(
+                    metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                      t: p.t,
+                      value: p.total,
+                    })),
+                    50,
+                  ),
+                  color: '#3b82f6',
+                  strokeWidth: 2,
+                },
+              ]}
+              durationS={metrics.durationS}
+              yAxisLabel="Requests"
+            />
+          ) : (
+            metricsQuery.isLoading ? <Skeleton /> : <Empty />
+          )}
+        </ChartCard>
+
+        <ChartCard title="Prefix cache hit rate per interval">
+          {metrics ? (
+            <TimeSeriesChart
+              series={[
+                {
+                  name: 'GPU (HBM, avg n=50)',
+                  data: rollingAverage(metrics.prefixCacheHitRate, 50),
+                  rawData: metrics.prefixCacheHitRate,
+                  color: '#a855f7',
+                  strokeWidth: 2,
+                },
+              ]}
+              durationS={metrics.durationS}
+              yMax={1}
+              yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+              yAxisLabel="Hit rate (%)"
+            />
+          ) : (
+            metricsQuery.isLoading ? <Skeleton /> : <Empty />
+          )}
+        </ChartCard>
+
+        <ChartCard title="Throughput (total & decode)">
+          {metrics ? (
+            (() => {
+              const total = sumSeries(metrics.prefillTps, metrics.decodeTps);
+              return (
+                <TimeSeriesChart
+                  series={[
+                    {
+                      name: 'Total (avg n=50)',
+                      data: rollingAverage(total, 50),
+                      color: '#3b82f6',
+                      strokeWidth: 1.6,
+                    },
+                    {
+                      name: 'Decode (avg n=50)',
+                      data: rollingAverage(metrics.decodeTps, 50),
+                      color: '#f97316',
+                      strokeWidth: 1.6,
+                    },
+                    {
+                      name: 'Total running avg',
+                      data: cumulativeAverage(total),
+                      color: '#ef4444',
+                      strokeWidth: 3,
+                    },
+                  ]}
+                  durationS={metrics.durationS}
+                  yAxisLabel="Tokens / sec"
+                />
+              );
+            })()
+          ) : (
+            metricsQuery.isLoading ? <Skeleton /> : <Empty />
+          )}
+        </ChartCard>
+
+        <ChartCard title="Cumulative prompt token source breakdown">
+          {metrics ? (
+            <StackedAreaChart
+              sourceSeries={metrics.promptTokensBySource}
+              durationS={metrics.durationS}
+            />
+          ) : (
+            metricsQuery.isLoading ? <Skeleton /> : <Empty />
+          )}
+        </ChartCard>
+      </div>
+    </div>
+  );
+}
+
+function Skeleton() {
+  return <div className="h-[260px] rounded-md bg-muted/30 animate-pulse" />;
+}
+
+function Empty() {
+  return (
+    <div className="h-[260px] grid place-items-center text-xs text-muted-foreground">No data</div>
+  );
+}
+
+// Re-export type for use by sub-components
+export type { TimeSeriesPoint, QueueDepthPoint };
diff --git a/packages/app/src/components/inference/agentic-point/distribution.tsx b/packages/app/src/components/inference/agentic-point/distribution.tsx
new file mode 100644
index 00000000..c9a563fe
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/distribution.tsx
@@ -0,0 +1,140 @@
+'use client';
+
+import { useMemo, useRef } from 'react';
+
+/**
+ * Bar histogram with vertical p50/p75/p90/p95 guide lines. Designed for the
+ * detail-page card — fills its container width via `viewBox` + 100% width.
+ */
+export function Distribution({
+  values,
+  unit,
+  height = 260,
+}: {
+  values: readonly number[];
+  unit: string;
+  height?: number;
+}) {
+  const W = 720;
+  const H = height;
+  const PAD = { top: 12, right: 16, bottom: 56, left: 60 };
+
+  const svgParts = useMemo(() => {
+    if (values.length === 0) return { bars: '', guides: '', legend: '', axis: '', yTicks: '' };
+    const sorted = [...values].toSorted((a, b) => a - b);
+    const min = sorted[0]!;
+    const max = sorted.at(-1)!;
+    const range = Math.max(1e-9, max - min);
+    const innerW = W - PAD.left - PAD.right;
+    const innerH = H - PAD.top - PAD.bottom;
+
+    // Square-root rule (ceil(sqrt(n)) bins), clamped to [15, 50] so bars stay visible.
+    const nBins = Math.min(50, Math.max(15, Math.ceil(Math.sqrt(values.length))));
+    const counts: number[] = Array.from({ length: nBins }, () => 0);
+    for (const v of values) {
+      const i = Math.min(nBins - 1, Math.floor(((v - min) / range) * nBins));
+      counts[i]!++;
+    }
+    const maxCount = Math.max(...counts, 1);
+    const xScale = (v: number) => PAD.left + ((v - min) / range) * innerW;
+    const barW = innerW / nBins;
+
+    const fmt = (n: number) =>
+      n >= 10000 ? new Intl.NumberFormat('en-US').format(Math.round(n)) : String(Math.round(n));
+
+    const quantile = (q: number): number => {
+      const pos = (sorted.length - 1) * q;
+      const lo = Math.floor(pos);
+      const hi = Math.ceil(pos);
+      return sorted[lo]! + (sorted[hi]! - sorted[lo]!) * (pos - lo);
+    };
+
+    const bars = counts
+      .map((c, i) => {
+        const h = (c / maxCount) * innerH;
+        const x = PAD.left + i * barW;
+        const y = PAD.top + (innerH - h);
+        return `<rect x="${x.toFixed(2)}" y="${y.toFixed(2)}" width="${Math.max(0, barW - 1).toFixed(2)}" height="${h.toFixed(2)}" fill="currentColor" opacity="0.55" />`;
+      })
+      .join('');
+
+    const GUIDES = [
+      { label: 'p50', q: 0.5, color: '#3b82f6' },
+      { label: 'p75', q: 0.75, color: '#22c55e' },
+      { label: 'p90', q: 0.9, color: '#f59e0b' },
+      { label: 'p95', q: 0.95, color: '#ef4444' },
+    ] as const;
+    const guides = GUIDES.map(({ q, color }) => {
+      const v = quantile(q);
+      const x = xScale(v);
+      return `<line x1="${x.toFixed(2)}" x2="${x.toFixed(2)}" y1="${PAD.top}" y2="${(PAD.top + innerH).toFixed(2)}" stroke="${color}" stroke-width="2" stroke-dasharray="5 3" opacity="0.95" />`;
+    }).join('');
+
+    // 4-tick x-axis: min, ~33%, ~66%, max
+    const xTickVals = [min, min + range / 3, min + (2 * range) / 3, max];
+    const axisY = PAD.top + innerH + 14;
+    const axisLine = `<line x1="${PAD.left}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${(PAD.top + innerH).toFixed(2)}" y2="${(PAD.top + innerH).toFixed(2)}" stroke="currentColor" opacity="0.2" />`;
+    const xLabels = xTickVals
+      .map((v, i) => {
+        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+        return `<text x="${xScale(v).toFixed(2)}" y="${axisY}" font-size="11" fill="currentColor" opacity="0.7" text-anchor="${anchor}">${fmt(v)}</text>`;
+      })
+      .join('');
+    const axisTitle = `<text x="${(W / 2).toFixed(2)}" y="${H - 22}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle">value (${unit})</text>`;
+
+    // 5-tick y-axis
+    const yTickVals = Array.from({ length: 5 }, (_, i) => (maxCount * i) / 4);
+    const yScale = (c: number) => PAD.top + (1 - c / maxCount) * innerH;
+    const yTicks = yTickVals
+      .map((v) => {
+        const y = yScale(v);
+        return `<g><line x1="${PAD.left - 4}" x2="${PAD.left}" y1="${y.toFixed(2)}" y2="${y.toFixed(2)}" stroke="currentColor" opacity="0.4" /><text x="${PAD.left - 8}" y="${(y + 3).toFixed(2)}" font-size="10" fill="currentColor" opacity="0.55" text-anchor="end">${fmt(v)}</text></g>`;
+      })
+      .join('');
+    const yAxisLabel = `<text x="${10}" y="${(H / 2).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle" transform="rotate(-90 10 ${(H / 2).toFixed(2)})">count</text>`;
+
+    const chipY = H - 8;
+    const chipW = innerW / GUIDES.length;
+    const legend = GUIDES.map(({ label: ql, q, color }, i) => {
+      const v = quantile(q);
+      const x = PAD.left + i * chipW;
+      return `
+      <line x1="${(x + 2).toFixed(2)}" x2="${(x + 14).toFixed(2)}" y1="${chipY - 4}" y2="${chipY - 4}" stroke="${color}" stroke-width="2" stroke-dasharray="5 3" />
+      <text x="${(x + 18).toFixed(2)}" y="${chipY}" font-size="11" fill="currentColor" opacity="0.9">${ql} ${fmt(v)}</text>`;
+    }).join('');
+
+    return {
+      bars,
+      guides,
+      legend,
+      axis: axisLine + xLabels + axisTitle + yAxisLabel,
+      yTicks,
+    };
+  }, [values, unit, H]);
+
+  const ref = useRef<HTMLDivElement | null>(null);
+
+  if (values.length === 0) {
+    return (
+      <div className="h-[260px] grid place-items-center text-xs text-muted-foreground">No data</div>
+    );
+  }
+
+  return (
+    <div ref={ref} className="w-full">
+      <div className="mb-2 text-xs text-muted-foreground">
+        {values.length.toLocaleString()} requests · range {Math.round(Math.min(...values))}–
+        {Math.round(Math.max(...values))} {unit}
+      </div>
+      <svg
+        viewBox={`0 0 ${W} ${H}`}
+        preserveAspectRatio="xMidYMid meet"
+        className="w-full h-auto text-foreground"
+        dangerouslySetInnerHTML={{
+          __html:
+            svgParts.bars + svgParts.guides + svgParts.axis + svgParts.yTicks + svgParts.legend,
+        }}
+      />
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
new file mode 100644
index 00000000..776c8ba2
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
@@ -0,0 +1,118 @@
+'use client';
+
+import { useRouter } from 'next/navigation';
+import { ChevronLeft, ChevronRight } from 'lucide-react';
+
+import type { BenchmarkSibling, BenchmarkSku } from '@/hooks/api/use-benchmark-siblings';
+/** Display names for hardware keys; `hwLabel` upper-cases any key not listed here. */
+const HW_LABELS: Record<string, string> = {
+  b200: 'B200',
+  b300: 'B300',
+  gb200: 'GB200',
+  gb300: 'GB300',
+  h100: 'H100',
+  h200: 'H200',
+  mi300x: 'MI300X',
+  mi325x: 'MI325X',
+  mi355x: 'MI355X',
+};
+/** Display names for model keys; `modelLabel` returns any key not listed here unchanged. */
+const MODEL_LABELS: Record<string, string> = {
+  dsr1: 'DeepSeek R1',
+  dsv4: 'DeepSeek V4 Pro',
+  glm5: 'GLM-5',
+  'glm5.1': 'GLM-5.1',
+  gptoss120b: 'gpt-oss 120B',
+  kimik2: 'Kimi K2',
+  'kimik2.5': 'Kimi K2.5',
+  'kimik2.6': 'Kimi K2.6',
+  llama70b: 'Llama 3.3 70B',
+  'minimaxm2.5': 'MiniMax M2.5',
+  'minimaxm2.7': 'MiniMax M2.7',
+  'qwen3.5': 'Qwen 3.5',
+};
+
+// Human-readable hardware name used in the page header.
+// Keys missing from HW_LABELS are shown upper-cased.
+const hwLabel = (hw: string) => HW_LABELS[hw] ?? hw.toUpperCase();
+// Human-readable model name used in the page header.
+// Keys missing from MODEL_LABELS are shown as-is.
+const modelLabel = (m: string) => MODEL_LABELS[m] ?? m;
+/** Display name for a framework id: `dynamo-<x>` → `Dynamo <X>`; unknown ids pass through. */
+function frameworkLabel(fw: string) {
+  const fixed = { vllm: 'vLLM', sglang: 'SGLang', trt: 'TRT', 'mori-sglang': 'Mori-SGLang' };
+  const hit = Object.entries(fixed).find(([id]) => id === fw);
+  if (hit) return hit[1];
+  const dynamo = 'dynamo-';
+  return fw.startsWith(dynamo) ? `Dynamo ${fw.slice(dynamo.length).toUpperCase()}` : fw;
+}
+
+/** Compact chip text: parallelism layout, then concurrency, then an offload marker when on. */
+function chipLabel(s: BenchmarkSibling): string {
+  const ep = s.decode_ep > 1 ? `EP${s.decode_ep}` : '';
+  const layout = s.disagg ? `${s.num_prefill_gpu}P+${s.num_decode_gpu}D` : `TP${s.decode_tp}${ep}`;
+  const parts = [layout, `c=${s.conc}`];
+  if (s.offload_mode === 'on') parts.push('off=ON');
+  return parts.join(' • ');
+}
+
+/** Run header, prev/next buttons, and one chip per sibling point (current chip marked). */
+export function SiblingNav({ sku, siblings }: { sku: BenchmarkSku; siblings: BenchmarkSibling[] }) {
+  const router = useRouter();
+  const goTo = (id: BenchmarkSibling['id']) => router.push(`/inference/agentic/${id}`);
+  const currentIdx = siblings.findIndex((s) => s.is_current);
+  const prev = currentIdx > 0 ? siblings[currentIdx - 1] : null;
+  const next =
+    currentIdx !== -1 && currentIdx < siblings.length - 1 ? siblings[currentIdx + 1] : null;
+
+  const skuLabel = `${hwLabel(sku.hardware)} · ${modelLabel(sku.model)} · ${sku.precision.toUpperCase()} · ${frameworkLabel(sku.framework)}`;
+
+  return (
+    <div className="border-b border-border/40 pb-4 mb-4">
+      <div className="flex items-baseline justify-between gap-3 mb-3">
+        <h1 className="text-2xl font-semibold text-foreground">{skuLabel}</h1>
+        <span className="text-xs text-muted-foreground">
+          {siblings.length} point{siblings.length === 1 ? '' : 's'} in this run · {sku.date}
+        </span>
+      </div>
+      <div className="flex items-center gap-2 flex-wrap">
+        <button
+          type="button"
+          disabled={!prev}
+          onClick={() => prev && goTo(prev.id)}
+          className="inline-flex items-center gap-1 px-2 py-1 rounded-md text-xs border border-border/40 hover:bg-accent disabled:opacity-30 disabled:cursor-not-allowed"
+          aria-label="Previous point"
+        >
+          <ChevronLeft className="size-3.5" /> prev
+        </button>
+        <div className="flex items-center gap-1 flex-wrap">
+          {siblings.map((s) => (
+            <button
+              key={s.id}
+              type="button"
+              aria-current={s.is_current ? 'true' : undefined}
+              onClick={() => !s.is_current && goTo(s.id)}
+              className={`px-2 py-1 rounded-md text-xs border transition-colors ${
+                s.is_current
+                  ? 'border-primary bg-primary text-primary-foreground font-medium'
+                  : 'border-border/40 text-foreground hover:bg-accent'
+              } ${s.has_trace ? '' : 'opacity-60'}`}
+              title={s.has_trace ? undefined : 'No stored trace data'}
+            >
+              {chipLabel(s)}
+            </button>
+          ))}
+        </div>
+        <button
+          type="button"
+          disabled={!next}
+          onClick={() => next && goTo(next.id)}
+          className="inline-flex items-center gap-1 px-2 py-1 rounded-md text-xs border border-border/40 hover:bg-accent disabled:opacity-30 disabled:cursor-not-allowed"
+          aria-label="Next point"
+        >
+          next <ChevronRight className="size-3.5" />
+        </button>
+      </div>
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
new file mode 100644
index 00000000..bc081b4e
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -0,0 +1,311 @@
+'use client';
+
+import { useMemo } from 'react';
+
+import type { TimeSeriesPoint } from '@/hooks/api/use-trace-server-metrics';
+/** One drawable series: a line, plus optional raw samples shown as faint dots behind it. */
+interface Series {
+  name: string; // legend text
+  /** The line to draw (caller pre-smooths if desired). */
+  data: TimeSeriesPoint[];
+  /** Optional raw per-scrape values; rendered as low-opacity scatter behind the line. */
+  rawData?: TimeSeriesPoint[];
+  color: string; // used for the line stroke, the raw dots, and the legend swatch
+  /** Override default stroke width (1.8 for the line, 2 for the legend swatch). */
+  strokeWidth?: number;
+}
+/** Props for `TimeSeriesChart`. */
+interface TimeSeriesChartProps {
+  series: Series[];
+  durationS: number; // x-axis extent; tick labels treat it as seconds (floored at 1)
+  yMax?: number; // fixed y-axis top; defaults to the largest `data` value
+  yFmt?: (v: number) => string; // y tick label formatter; defaults to `fmtInt`
+  yAxisLabel?: string; // rotated y-axis title; omitted when empty
+  height?: number; // SVG viewBox height; defaults to 260
+}
+
+/**
+ * Centered moving average. Each output point keeps its own `t` and averages the
+ * values up to `floor(windowSize / 2)` samples on either side, clipped at the
+ * array ends (so an even `windowSize` covers `windowSize + 1` samples).
+ * Empty input or `windowSize <= 1` returns `data` itself.
+ */
+export function rollingAverage(data: TimeSeriesPoint[], windowSize: number): TimeSeriesPoint[] {
+  if (data.length === 0 || windowSize <= 1) return data;
+  const reach = Math.floor(windowSize / 2);
+  return data.map((point, idx) => {
+    const lo = Math.max(0, idx - reach);
+    const hi = Math.min(data.length, idx + reach + 1);
+    const span = data.slice(lo, hi);
+    const total = span.reduce((acc, p) => acc + p.value, 0);
+    // `span` is only empty for a NaN window size, where both bounds are NaN.
+    return { t: point.t, value: span.length > 0 ? total / span.length : 0 };
+  });
+}
+
+/**
+ * Expanding-window mean: output `i` is the average of inputs `0..i`, paired
+ * with input `i`'s timestamp. Useful for "running average over the entire
+ * run" lines (red overlay in the throughput chart). An empty input is
+ * returned as-is.
+ */
+export function cumulativeAverage(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
+  if (data.length === 0) return data;
+  let running = 0;
+  return data.map(({ t, value }, idx) => {
+    running += value;
+    return { t, value: running / (idx + 1) };
+  });
+}
+
+/**
+ * Pointwise sum of two series, truncated to the shorter one. Timestamps are
+ * taken from `a`; the inputs are assumed to share the same t index.
+ */
+export function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSeriesPoint[] {
+  const shared = Math.min(a.length, b.length);
+  // `shared <= b.length`, so `b[i]` exists for every index visited.
+  return a.slice(0, shared).map((p, i) => ({ t: p.t, value: p.value + b[i]!.value }));
+}
+
+/** Integer label; values of 10000 and above get en-US thousands separators. */
+const fmtInt = (n: number) =>
+  n >= 10000 ? new Intl.NumberFormat('en-US').format(Math.round(n)) : String(Math.round(n));
+
+/** Duration label. Rounds to whole seconds first so 119.6 gives "2m 0s", never "1m 60s". */
+const fmtSeconds = (s: number) => {
+  const total = Math.round(s);
+  return total < 60 ? `${total}s` : `${Math.floor(total / 60)}m ${total % 60}s`;
+};
+
+/** Escapes text for safe interpolation into the SVG markup string (text or attribute). */
+const escapeSvg = (raw: string) => raw.replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);
+
+/**
+ * Static SVG line chart with an optional faint scatter of raw samples behind the lines.
+ * x spans 0..max(durationS, 1); y spans 0..yMax (default: largest `data` value).
+ */
+export function TimeSeriesChart({
+  series,
+  durationS,
+  yMax: yMaxOpt,
+  yFmt = fmtInt,
+  yAxisLabel,
+  height = 260,
+}: TimeSeriesChartProps) {
+  const W = 720;
+  const H = height;
+  const PAD = { top: 12, right: 16, bottom: 56, left: 60 };
+
+  const inner = useMemo(() => {
+    const innerW = W - PAD.left - PAD.right;
+    const innerH = H - PAD.top - PAD.bottom;
+    const xMax = Math.max(durationS, 1);
+    // Fold rather than Math.max(...spread): spreading a very long series can throw a
+    // RangeError. A missing or non-positive yMax falls back to the data maximum so
+    // yScale never divides by zero.
+    const dataMax = series.reduce(
+      (top, s) => s.data.reduce((m, d) => Math.max(m, d.value), top),
+      1e-9,
+    );
+    const yMax = yMaxOpt !== undefined && yMaxOpt > 0 ? yMaxOpt : dataMax;
+    const xScale = (t: number) => PAD.left + (t / xMax) * innerW;
+    const yScale = (v: number) => PAD.top + (1 - v / yMax) * innerH;
+
+    // Thin dense series to roughly one sample per horizontal viewBox unit.
+    const subsample = (arr: TimeSeriesPoint[]) => {
+      const stride = Math.max(1, Math.floor(arr.length / innerW));
+      return stride > 1 ? arr.filter((_, i) => i % stride === 0) : arr;
+    };
+    const xy = (d: TimeSeriesPoint) => `${xScale(d.t).toFixed(2)},${yScale(d.value).toFixed(2)}`;
+    const dot = (d: TimeSeriesPoint, fill: string) =>
+      `<circle cx="${xScale(d.t).toFixed(2)}" cy="${yScale(d.value).toFixed(2)}" r="1.5" fill="${fill}" opacity="0.2" />`;
+
+    // Raw scatter is emitted first so every line paints over it.
+    const dotsLayer = series
+      .flatMap((s) => subsample(s.rawData ?? []).map((d) => dot(d, escapeSvg(s.color))))
+      .join('');
+    const lineLayer = series
+      .filter((s) => s.data.length > 0)
+      .map((s) => {
+        const d = subsample(s.data).map((p, i) => `${i === 0 ? 'M' : 'L'}${xy(p)}`).join(' ');
+        return `<path d="${d}" fill="none" stroke="${escapeSvg(s.color)}" stroke-width="${s.strokeWidth ?? 1.8}" />`;
+      })
+      .join('');
+
+    // X-axis: baseline plus 5 tick labels at 0..xMax
+    const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
+    const axisY = PAD.top + innerH;
+    const xAxis = `<line x1="${PAD.left}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${axisY.toFixed(2)}" y2="${axisY.toFixed(2)}" stroke="currentColor" opacity="0.2" />${xTickVals
+      .map((v, i) => {
+        const x = xScale(v);
+        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+        return `<text x="${x.toFixed(2)}" y="${(axisY + 14).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.7" text-anchor="${anchor}">${fmtSeconds(v)}</text>`;
+      })
+      .join('')}`;
+    const xAxisTitle = `<text x="${(W / 2).toFixed(2)}" y="${H - 22}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle">time</text>`;
+
+    // Y-axis: 5 gridlines + labels at 0..yMax (formatter output is escaped)
+    const yTickVals = Array.from({ length: 5 }, (_, i) => (yMax * i) / 4);
+    const yTicks = yTickVals
+      .map((v) => {
+        const y = yScale(v);
+        return `<g><line x1="${PAD.left - 4}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${y.toFixed(2)}" y2="${y.toFixed(2)}" stroke="currentColor" opacity="0.08" /><text x="${PAD.left - 8}" y="${(y + 3).toFixed(2)}" font-size="10" fill="currentColor" opacity="0.55" text-anchor="end">${escapeSvg(yFmt(v))}</text></g>`;
+      })
+      .join('');
+    const yAxisTitle = yAxisLabel
+      ? `<text x="${10}" y="${(H / 2).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle" transform="rotate(-90 10 ${(H / 2).toFixed(2)})">${escapeSvg(yAxisLabel)}</text>`
+      : '';
+
+    // Legend at the bottom of the SVG, one equal-width slot per series
+    const chipY = H - 8;
+    const chipW = innerW / Math.max(1, series.length);
+    const legend = series
+      .map((s, i) => {
+        const x = PAD.left + i * chipW;
+        return `<line x1="${(x + 2).toFixed(2)}" x2="${(x + 14).toFixed(2)}" y1="${chipY - 4}" y2="${chipY - 4}" stroke="${escapeSvg(s.color)}" stroke-width="${s.strokeWidth ?? 2}" /><text x="${(x + 18).toFixed(2)}" y="${chipY}" font-size="11" fill="currentColor" opacity="0.9">${escapeSvg(s.name)}</text>`;
+      })
+      .join('');
+
+    return dotsLayer + lineLayer + xAxis + xAxisTitle + yTicks + yAxisTitle + legend;
+  }, [series, durationS, yMaxOpt, yFmt, yAxisLabel, H]);
+
+  if (series.every((s) => s.data.length === 0)) {
+    return (
+      <div className="h-[260px] grid place-items-center text-xs text-muted-foreground">No data</div>
+    );
+  }
+
+  return (
+    <svg
+      viewBox={`0 0 ${W} ${H}`}
+      preserveAspectRatio="xMidYMid meet"
+      className="w-full h-auto text-foreground"
+      dangerouslySetInnerHTML={{ __html: inner }}
+    />
+  );
+}
+
+/**
+ * Stacked-area chart of each token source's cumulative share over time (running totals
+ * since t = 0, normalized per sample). Sample times come from the first non-empty series.
+ */
+export function StackedAreaChart({
+  sourceSeries,
+  durationS,
+  height = 260,
+}: {
+  sourceSeries: Record<string, TimeSeriesPoint[]>;
+  durationS: number;
+  height?: number;
+}) {
+  const W = 720;
+  const H = height;
+  const PAD = { top: 12, right: 16, bottom: 56, left: 60 };
+
+  const inner = useMemo(() => {
+    const entries = Object.entries(sourceSeries).filter(([, v]) => v.length > 0);
+    if (entries.length === 0) return '';
+    const tValues = entries[0]![1].map((p) => p.t);
+    const cum: Record<string, number[]> = {};
+    for (const [name, arr] of entries) {
+      let acc = 0;
+      cum[name] = arr.map((p) => (acc += p.value));
+    }
+    // Normalize by the all-source total at each sample; a zero total yields 0 everywhere.
+    const shares: Record<string, number[]> = {};
+    for (const name of Object.keys(cum)) shares[name] = [];
+    for (let i = 0; i < tValues.length; i++) {
+      const total = entries.reduce((s, [name]) => s + (cum[name]?.[i] ?? 0), 0);
+      for (const [name] of entries) {
+        shares[name]!.push(total > 0 ? (cum[name]?.[i] ?? 0) / total : 0);
+      }
+    }
+
+    // Known source keys get a fixed color/label; others fall back to gray and the raw key.
+    const colors: Record<string, string> = {
+      local_compute: '#f97316',
+      local_cache_hit: '#3b82f6',
+      external_kv_transfer: '#22c55e',
+      miss: '#f97316',
+    };
+    const labelFor: Record<string, string> = {
+      local_compute: 'Prefill',
+      local_cache_hit: 'HBM Cache Hit',
+      external_kv_transfer: 'Offload Cache Hit',
+      miss: 'Miss',
+    };
+    // Legend text may be an arbitrary key from `sourceSeries`, so escape it for the markup.
+    const esc = (raw: string) => raw.replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);
+
+    const innerW = W - PAD.left - PAD.right;
+    const innerH = H - PAD.top - PAD.bottom;
+    const xMax = Math.max(durationS, 1);
+    const xScale = (t: number) => PAD.left + (t / xMax) * innerW;
+    const yScale = (v: number) => PAD.top + (1 - v) * innerH;
+
+    // Stack bottom-up: each layer is drawn between the running `lower` edge and `lower + share`.
+    const stackOrder = Object.keys(shares);
+    const lower: number[] = Array.from({ length: tValues.length }, () => 0);
+    const layers = stackOrder.map((name) => {
+      const upper = shares[name]!.map((v, i) => lower[i]! + v);
+      const top = upper.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]);
+      const bottom = lower.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]);
+      const outline = [...top, ...bottom.toReversed()]
+        .map(([x, y], i) => `${i === 0 ? 'M' : 'L'}${x.toFixed(2)},${y.toFixed(2)}`)
+        .join(' ');
+      const color = colors[name] ?? '#6b7280';
+      const path = `<path d="${outline} Z" fill="${color}" opacity="0.75" />`;
+      for (let i = 0; i < tValues.length; i++) lower[i] = upper[i]!;
+      return { name, color, path };
+    });
+
+    // X-axis
+    const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
+    const axisY = PAD.top + innerH;
+    const xAxis = `<line x1="${PAD.left}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${axisY.toFixed(2)}" y2="${axisY.toFixed(2)}" stroke="currentColor" opacity="0.2" />${xTickVals
+      .map((v, i) => {
+        const x = xScale(v);
+        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+        return `<text x="${x.toFixed(2)}" y="${(axisY + 14).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.7" text-anchor="${anchor}">${fmtSeconds(v)}</text>`;
+      })
+      .join('')}`;
+    const xAxisTitle = `<text x="${(W / 2).toFixed(2)}" y="${H - 22}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle">time</text>`;
+
+    // Y-axis 0..100%
+    const yTickVals = [0, 0.25, 0.5, 0.75, 1];
+    const yTicks = yTickVals
+      .map((v) => {
+        const y = yScale(v);
+        return `<g><line x1="${PAD.left - 4}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${y.toFixed(2)}" y2="${y.toFixed(2)}" stroke="currentColor" opacity="0.08" /><text x="${PAD.left - 8}" y="${(y + 3).toFixed(2)}" font-size="10" fill="currentColor" opacity="0.55" text-anchor="end">${(v * 100).toFixed(0)}%</text></g>`;
+      })
+      .join('');
+    const yAxisTitle = `<text x="${10}" y="${(H / 2).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle" transform="rotate(-90 10 ${(H / 2).toFixed(2)})">% of prefill tokens</text>`;
+
+    const chipY = H - 8;
+    const chipW = innerW / Math.max(1, layers.length);
+    const legend = layers
+      .map((l, i) => {
+        const x = PAD.left + i * chipW;
+        return `<rect x="${(x + 2).toFixed(2)}" y="${chipY - 9}" width="12" height="8" fill="${l.color}" opacity="0.75" /><text x="${(x + 18).toFixed(2)}" y="${chipY}" font-size="11" fill="currentColor" opacity="0.9">${esc(labelFor[l.name] ?? l.name)}</text>`;
+      })
+      .join('');
+
+    return layers.map((l) => l.path).join('') + xAxis + xAxisTitle + yTicks + yAxisTitle + legend;
+  }, [sourceSeries, durationS, H]);
+
+  if (Object.values(sourceSeries).every((v) => v.length === 0)) {
+    return (
+      <div className="h-[260px] grid place-items-center text-xs text-muted-foreground">No data</div>
+    );
+  }
+
+  return (
+    <svg
+      viewBox={`0 0 ${W} ${H}`}
+      preserveAspectRatio="xMidYMid meet"
+      className="w-full h-auto text-foreground"
+      dangerouslySetInnerHTML={{ __html: inner }}
+    />
+  );
+}
diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index f848e0e4..7a39bbd1 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -36,6 +36,8 @@ import type { Model, Sequence } from '@/lib/data-mappings';
  * @property {number} p99_e2el - 99th percentile of End-to-End Latency.
  */
 export interface AggDataEntry {
+  /** Stable per-point id from benchmark_results — for trace_replay lookups. */
+  id?: number;
   hw: string;
   mtp?: string;
   hwKey: string;
diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 98562fb9..fdcf8952 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -6,6 +6,8 @@ import React, { useCallback, useEffect, useMemo, useRef } from 'react';
 
 import { GRADIENT_NUDGE_EVENT } from '@/lib/nudges/registry';
 import { useInference } from '@/components/inference/InferenceContext';
+import { useTraceHistograms } from '@/hooks/api/use-trace-histograms';
+import { useRouter } from 'next/navigation';
 import ChartLegend from '@/components/ui/chart-legend';
 import { useUnofficialRun } from '@/components/unofficial-run-provider';
 import { computeToggle } from '@/hooks/useTogglableSet';
@@ -348,6 +350,10 @@ const ScatterGraph = React.memo(
     );
 
     const rooflines = useMemo(() => {
+      // Frontier scope is (hw, precision, date) — points from different dates
+      // can never share a frontier (a May 15 point can't dominate a May 17 plot).
+      // The legend grouping is still by (hw, precision); we just split the
+      // pareto compute per date and re-merge into the legend bucket.
       const result: Record<string, InferenceData[]> = {};
       const rooflineKey = `${selectedYAxisMetric}_roofline` as keyof ChartDefinition;
       const dir = chartDefinition[rooflineKey] as
@@ -356,17 +362,31 @@ const ScatterGraph = React.memo(
         | 'lower_left'
         | 'lower_right'
         | undefined;
-      for (const hw of Object.keys(groupedData)) {
-        const front =
-          dir === 'upper_right'
-            ? paretoFrontUpperRight(groupedData[hw])
-            : dir === 'upper_left'
-              ? paretoFrontUpperLeft(groupedData[hw])
-              : dir === 'lower_left'
-                ? paretoFrontLowerLeft(groupedData[hw])
-                : paretoFrontLowerRight(groupedData[hw]);
-        front.sort((a, b) => a.x - b.x);
-        result[hw] = front;
+      const frontierFn =
+        dir === 'upper_right'
+          ? paretoFrontUpperRight
+          : dir === 'upper_left'
+            ? paretoFrontUpperLeft
+            : dir === 'lower_left'
+              ? paretoFrontLowerLeft
+              : paretoFrontLowerRight;
+      for (const hwKey of Object.keys(groupedData)) {
+        const byDate = new Map<string, InferenceData[]>();
+        for (const p of groupedData[hwKey]) {
+          const d = p.date;
+          let bucket = byDate.get(d);
+          if (!bucket) {
+            bucket = [];
+            byDate.set(d, bucket);
+          }
+          bucket.push(p);
+        }
+        const combined: InferenceData[] = [];
+        for (const datePoints of byDate.values()) {
+          combined.push(...frontierFn(datePoints));
+        }
+        combined.sort((a, b) => a.x - b.x);
+        result[hwKey] = combined;
       }
       return result;
     }, [groupedData, selectedYAxisMetric, chartDefinition]);
@@ -374,7 +394,7 @@ const ScatterGraph = React.memo(
     const optimalPointKeys = useMemo(() => {
       const keys = new Set<string>();
       Object.values(rooflines).forEach((pts) =>
-        pts.forEach((p) => keys.add(`${p.hwKey}_${p.precision}-${p.x}-${p.y}`)),
+        pts.forEach((p) => keys.add(`${p.hwKey}_${p.precision}_${p.date}-${p.x}-${p.y}`)),
       );
       return keys;
     }, [rooflines]);
@@ -477,6 +497,18 @@ const ScatterGraph = React.memo(
     // All official points for rendering (unfiltered — visibility via opacity)
     const pointsData = useMemo(() => Object.values(groupedData).flat(), [groupedData]);
 
+    // Trace-replay histograms (ISL / OSL distributions) for agentic points.
+    // Pre-fetch the whole visible set so tooltip render stays synchronous.
+    const agenticIds = useMemo(() => {
+      const ids: number[] = [];
+      for (const p of pointsData) {
+        if (p.benchmark_type === 'agentic_traces' && typeof p.id === 'number') ids.push(p.id);
+      }
+      return ids;
+    }, [pointsData]);
+    const { data: traceHistograms } = useTraceHistograms(agenticIds);
+    const router = useRouter();
+
     // Gradient label data
     const allPointLabelsByKey = useMemo(() => {
       const globalLabelColorMap = new Map<string, string>();
@@ -516,7 +548,9 @@ const ScatterGraph = React.memo(
     const visiblePoints = useMemo(() => {
       let pts = filteredData;
       if (hideNonOptimal) {
-        pts = pts.filter((d) => optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`));
+        pts = pts.filter((d) =>
+          optimalPointKeys.has(`${d.hwKey}_${d.precision}_${d.date}-${d.x}-${d.y}`),
+        );
       }
       return processedOverlayData.length > 0 ? [...pts, ...processedOverlayData] : pts;
     }, [filteredData, processedOverlayData, hideNonOptimal, optimalPointKeys]);
@@ -601,7 +635,8 @@ const ScatterGraph = React.memo(
       (d: InferenceData) =>
         effectiveActiveHwTypes.has(d.hwKey as string) &&
         selectedPrecisions.includes(d.precision) &&
-        (!hideNonOptimal || optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`)),
+        (!hideNonOptimal ||
+          optimalPointKeys.has(`${d.hwKey}_${d.precision}_${d.date}-${d.x}-${d.y}`)),
       [effectiveActiveHwTypes, selectedPrecisions, hideNonOptimal, optimalPointKeys],
     );
 
@@ -739,6 +774,8 @@ const ScatterGraph = React.memo(
             hardwareConfig,
             isTracked: trackedConfigIdsRef.current.has(buildPointConfigId(d)),
             runUrl: d.run_url ? updateRepoUrl(d.run_url) : undefined,
+            traceHistogram:
+              typeof d.id === 'number' ? (traceHistograms?.[d.id] ?? undefined) : undefined,
           }),
         getRulerX: (d: InferenceData, xScale: any) => (xScale as ContinuousScale)(d.x),
         getRulerY: (d: InferenceData, yScale: any) => (yScale as ContinuousScale)(d.y),
@@ -754,26 +791,43 @@ const ScatterGraph = React.memo(
           ),
         onPointClick: (d: InferenceData) => {
           track('latency_data_point_clicked', { hw: String(d.hwKey), x: d.x, y: d.y });
-          // Attach track-over-time button handler in the tooltip
           const tooltipEl = chartRef.current?.getTooltipElement();
-          if (tooltipEl) {
-            const btn = tooltipEl.querySelector('[data-action="track-over-time"]');
-            if (btn) {
-              btn.addEventListener('click', (btnEvent) => {
-                btnEvent.stopPropagation();
-                const configId = buildPointConfigId(d);
-                if (trackedConfigIdsRef.current.has(configId)) removeTrackedConfig(configId);
-                else addTrackedConfig(d, chartDefinition.chartType);
-                chartRef.current?.dismissTooltip();
-                chartRef.current?.hideTooltip();
-                track('latency_point_tracked_via_tooltip', {
-                  hwKey: String(d.hwKey),
-                  tp: d.tp,
-                  conc: d.conc,
-                  precision: d.precision,
-                });
+          if (!tooltipEl) return;
+
+          // ── Summary-page actions ──────────────────────────────────────────
+          const trackBtn = tooltipEl.querySelector('[data-action="track-over-time"]');
+          if (trackBtn) {
+            trackBtn.addEventListener('click', (btnEvent) => {
+              btnEvent.stopPropagation();
+              const configId = buildPointConfigId(d);
+              if (trackedConfigIdsRef.current.has(configId)) removeTrackedConfig(configId);
+              else addTrackedConfig(d, chartDefinition.chartType);
+              chartRef.current?.dismissTooltip();
+              chartRef.current?.hideTooltip();
+              track('latency_point_tracked_via_tooltip', {
+                hwKey: String(d.hwKey),
+                tp: d.tp,
+                conc: d.conc,
+                precision: d.precision,
               });
-            }
+            });
+          }
+
+          // ── "View charts" → navigate to dedicated detail page ────────────
+          const viewBtn = tooltipEl.querySelector('[data-action="view-charts"]');
+          if (viewBtn && typeof d.id === 'number') {
+            const pointId = d.id;
+            viewBtn.addEventListener('click', (btnEvent) => {
+              btnEvent.stopPropagation();
+              track('latency_view_charts_opened', {
+                id: pointId,
+                hwKey: String(d.hwKey),
+                conc: d.conc,
+              });
+              chartRef.current?.dismissTooltip();
+              chartRef.current?.hideTooltip();
+              router.push(`/inference/agentic/${pointId}`);
+            });
           }
         },
         attachToLayer: 1, // scatter layer is index 1 (after rooflines at 0)
@@ -788,6 +842,11 @@ const ScatterGraph = React.memo(
         removeTrackedConfig,
         chartDefinition.chartType,
         selectedPrecisions,
+        // Tooltip content closure reads traceHistograms to decide whether to
+        // show the "View charts" button — rebuild config when the histogram
+        // fetch resolves so the button appears for points that have data.
+        traceHistograms,
+        router,
       ],
     );
 
@@ -838,35 +897,64 @@ const ScatterGraph = React.memo(
             const precision = key.split('_').pop()!;
             const visible =
               effectiveActiveHwTypes.has(hw) && selectedPrecisions.includes(precision);
-            let stroke = getCssColor(resolveColor(hw));
-
-            if (showGradientLabels) {
-              const pointLabels = allPointLabelsByKey[key];
-              if (pointLabels) {
-                const stops = computeGradientStops(pointLabels, xScale);
-                if (stops) {
-                  const gid = `roofline-gradient-${chartId}-${key}`;
-                  activeGradientIds.add(gid);
-                  let gradient = defs.select<SVGLinearGradientElement>(`#${CSS.escape(gid)}`);
-                  if (gradient.empty()) gradient = defs.append('linearGradient').attr('id', gid);
-                  gradient
-                    .attr('gradientUnits', 'userSpaceOnUse')
-                    .attr('x1', xScale(pts[0].x))
-                    .attr('y1', 0)
-                    .attr('x2', xScale(pts.at(-1)!.x))
-                    .attr('y2', 0);
-                  gradient
-                    .selectAll('stop')
-                    .data(stops)
-                    .join('stop')
-                    .attr('offset', (s) => `${(s.offset * 100).toFixed(2)}%`)
-                    .attr('stop-color', (s) => s.color);
-                  stroke = `url(#${gid})`;
-                }
+            const baseStroke = getCssColor(resolveColor(hw));
+
+            // Split into per-date sub-paths so the line never crosses dates.
+            // (When only one date is present the loop runs once with the full set.)
+            const byDate = new Map<string, InferenceData[]>();
+            for (const p of pts) {
+              let bucket = byDate.get(p.date);
+              if (!bucket) {
+                bucket = [];
+                byDate.set(p.date, bucket);
               }
+              bucket.push(p);
             }
+            const singleDate = byDate.size === 1;
+
+            for (const [date, datePoints] of byDate) {
+              if (datePoints.length <= 1) continue;
+              const entryKey = singleDate ? key : `${key}__${date}`;
+              let stroke = baseStroke;
+
+              // Gradient labels only apply in the single-date case; mapping the
+              // (key-wide) ParetoPointLabel array onto per-date sub-segments is
+              // ambiguous and the comparison-date overlay is a rare combo.
+              if (singleDate && showGradientLabels) {
+                const pointLabels = allPointLabelsByKey[key];
+                if (pointLabels) {
+                  const stops = computeGradientStops(pointLabels, xScale);
+                  if (stops) {
+                    const gid = `roofline-gradient-${chartId}-${entryKey}`;
+                    activeGradientIds.add(gid);
+                    let gradient = defs.select<SVGLinearGradientElement>(`#${CSS.escape(gid)}`);
+                    if (gradient.empty()) gradient = defs.append('linearGradient').attr('id', gid);
+                    gradient
+                      .attr('gradientUnits', 'userSpaceOnUse')
+                      .attr('x1', xScale(datePoints[0].x))
+                      .attr('y1', 0)
+                      .attr('x2', xScale(datePoints.at(-1)!.x))
+                      .attr('y2', 0);
+                    gradient
+                      .selectAll('stop')
+                      .data(stops)
+                      .join('stop')
+                      .attr('offset', (s) => `${(s.offset * 100).toFixed(2)}%`)
+                      .attr('stop-color', (s) => s.color);
+                    stroke = `url(#${gid})`;
+                  }
+                }
+              }
 
-            entries.push({ key, hw, precision, points: pts, stroke, visible });
+              entries.push({
+                key: entryKey,
+                hw,
+                precision,
+                points: datePoints,
+                stroke,
+                visible,
+              });
+            }
           });
 
           // Remove stale gradients
@@ -1271,11 +1359,26 @@ const ScatterGraph = React.memo(
             .y((d) => newYScale(d.y))
             .curve(d3.curveMonotoneX);
 
-          // Update roofline paths
+          // Update roofline paths — must split per-date so the zoom redraw
+          // matches the per-date sub-paths created in the initial render.
           Object.entries(rooflines).forEach(([key, pts]) => {
             if (pts.length < 2) return;
-            const sel = zoomGroup.select<SVGPathElement>(`.roofline-${key}`);
-            if (!sel.empty()) sel.attr('d', lineGen(pts) as string);
+            const byDate = new Map<string, InferenceData[]>();
+            for (const p of pts) {
+              let bucket = byDate.get(p.date);
+              if (!bucket) {
+                bucket = [];
+                byDate.set(p.date, bucket);
+              }
+              bucket.push(p);
+            }
+            const singleDate = byDate.size === 1;
+            for (const [date, datePoints] of byDate) {
+              if (datePoints.length < 2) continue;
+              const cls = singleDate ? `roofline-${key}` : `roofline-${key}__${date}`;
+              const sel = zoomGroup.select<SVGPathElement>(`.${CSS.escape(cls)}`);
+              if (!sel.empty()) sel.attr('d', lineGen(datePoints) as string);
+            }
           });
 
           // Update gradient coordinates
diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts
index 3154070a..ccc371f9 100644
--- a/packages/app/src/components/inference/utils/tooltipUtils.ts
+++ b/packages/app/src/components/inference/utils/tooltipUtils.ts
@@ -19,6 +19,13 @@ export interface TooltipConfig {
   isTracked?: boolean;
   /** URL to the GitHub Actions workflow run */
   runUrl?: string;
+  /**
+   * Per-request ISL/OSL arrays for agentic points, sourced from the stored
+   * aiperf `profile_export.jsonl`. Used to detect whether the point has any
+   * trace data (so the "View charts" button can appear); the actual
+   * distributions are rendered on the detail page, not inline.
+   */
+  traceHistogram?: { isl: number[]; osl: number[] } | undefined;
 }
 
 export interface OverlayTooltipConfig extends TooltipConfig {
@@ -138,9 +145,24 @@ const generateAgenticHTML = (d: InferenceData): string => {
     parts.push(tooltipLine('Generated Tokens', formatNumber(d.total_generation_tokens)));
   }
 
+  // Histograms + time-series live on the dedicated detail page now; the
+  // "View charts" button (rendered by the wrapper when pinned + has trace
+  // data) takes the user there.
+
   return parts.join('');
 };
 
+/** "View charts" button markup — '' unless the tooltip is pinned and the
+ *  point has stored trace data. Wired up by the ScatterGraph click handler. */
+const viewChartsButtonHTML = (isPinned: boolean, hasTraceData: boolean): string => {
+  if (!isPinned || !hasTraceData) return '';
+  return `<button data-action="view-charts" style="
+    margin-top: 8px; width: 100%; padding: 4px 8px; font-size: 11px; font-weight: 500;
+    border: 1px solid var(--border); border-radius: 6px; cursor: pointer;
+    background: var(--accent); color: var(--accent-foreground);
+  ">View charts &rarr;</button>`;
+};
+
 const shortenSha = (image: string) => image.replaceAll(/(sha256:[a-f0-9]{7})[a-f0-9]+/giu, '$1…');
 
 const imageTooltipLine = (image: string) =>
@@ -191,7 +213,16 @@ const generateParallelismHTML = (d: InferenceData): string => {
  * @returns HTML string for the tooltip content
  */
 export const generateTooltipContent = (config: TooltipConfig): string => {
-  const { data: d, isPinned, xLabel, yLabel, selectedYAxisMetric, hardwareConfig, runUrl } = config;
+  const {
+    data: d,
+    isPinned,
+    xLabel,
+    yLabel,
+    selectedYAxisMetric,
+    hardwareConfig,
+    runUrl,
+    traceHistogram,
+  } = config;
 
   return `
     <div style="background: var(--popover); border: 1px solid var(--border); border-radius: 8px; padding: 12px; box-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1); user-select: ${isPinned ? 'text' : 'none'};">
@@ -240,6 +271,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
       </div>
       ${generateAgenticHTML(d)}
       ${runLinkHTML(runUrl)}
+      ${viewChartsButtonHTML(isPinned, Boolean(traceHistogram))}
       ${
         isPinned
           ? `<button data-action="track-over-time" style="
diff --git a/packages/app/src/components/ui/d3-chart-wrapper.tsx b/packages/app/src/components/ui/d3-chart-wrapper.tsx
index 0392ac10..44013b1b 100644
--- a/packages/app/src/components/ui/d3-chart-wrapper.tsx
+++ b/packages/app/src/components/ui/d3-chart-wrapper.tsx
@@ -1,6 +1,41 @@
 'use client';
 
-import React from 'react';
+import React, { useEffect, useState } from 'react';
+import { createPortal } from 'react-dom';
+
+/**
+ * Tooltip host div, portalled to `document.body` once mounted so it escapes
+ * parent stacking contexts (e.g. the chart Card's backdrop-filter traps
+ * z-index). Before mount / without `document` it is returned inline. The d3
+ * layer writes viewport coordinates into it (`position: fixed`).
+ */
+function PortalTooltip({
+  tooltipRef,
+  pinned,
+}: {
+  tooltipRef: React.RefObject<HTMLDivElement | null>;
+  pinned: boolean;
+}) {
+  const [mounted, setMounted] = useState(false);
+  useEffect(() => setMounted(true), []);
+  const node = (
+    <div
+      ref={tooltipRef}
+      data-chart-tooltip
+      style={{
+        position: 'fixed',
+        left: 0,
+        top: 0,
+        opacity: pinned ? 1 : 0,
+        pointerEvents: pinned ? 'auto' : 'none',
+        display: pinned ? 'block' : 'none',
+        zIndex: 9999,
+      }}
+    />
+  );
+  if (!mounted || typeof document === 'undefined') return node;
+  return createPortal(node, document.body);
+}
 
 export interface D3ChartWrapperProps {
   chartId: string;
@@ -72,17 +107,11 @@ export function D3ChartWrapper({
                 }
               }}
             />
-            <div
-              ref={tooltipRef}
-              data-chart-tooltip
-              style={{
-                position: 'absolute',
-                opacity: pinnedPoint ? 1 : 0,
-                pointerEvents: pinnedPoint ? 'auto' : 'none',
-                display: pinnedPoint ? 'block' : 'none',
-                zIndex: 50,
-              }}
-            />
+            {/* Tooltip is portalled to <body> with position:fixed so it can
+                rise above sibling chart cards' stacking contexts. The d3 layer
+                writes viewport-coords into style.left/top — see
+                computeTooltipPosition. */}
+            <PortalTooltip tooltipRef={tooltipRef} pinned={Boolean(pinnedPoint)} />
             {noDataOverlay}
           </div>
           <p className="no-export text-xs text-muted-foreground text-center mt-2">{instructions}</p>
diff --git a/packages/app/src/components/unofficial-run-provider.test.ts b/packages/app/src/components/unofficial-run-provider.test.ts
index aa0f6c43..3c24d32b 100644
--- a/packages/app/src/components/unofficial-run-provider.test.ts
+++ b/packages/app/src/components/unofficial-run-provider.test.ts
@@ -12,6 +12,7 @@ import { buildChartData, parseAvailableModelsAndSequences } from './unofficial-r
 /** Minimal BenchmarkRow stub — only fields used by buildChartData key logic. */
 function stubRow(overrides: Partial<BenchmarkRow> = {}): BenchmarkRow {
   return {
+    id: 1,
     hardware: 'h200',
     framework: 'sglang',
     model: 'dsr1',
diff --git a/packages/app/src/hooks/api/use-benchmark-siblings.ts b/packages/app/src/hooks/api/use-benchmark-siblings.ts
new file mode 100644
index 00000000..1ea90c0d
--- /dev/null
+++ b/packages/app/src/hooks/api/use-benchmark-siblings.ts
@@ -0,0 +1,46 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface BenchmarkSibling {
+  id: number;
+  conc: number;
+  offload_mode: string | null;
+  decode_tp: number;
+  decode_ep: number;
+  prefill_tp: number;
+  prefill_ep: number;
+  num_prefill_gpu: number;
+  num_decode_gpu: number;
+  disagg: boolean;
+  is_current: boolean;
+  has_trace: boolean;
+}
+
+export interface BenchmarkSku {
+  hardware: string;
+  framework: string;
+  model: string;
+  precision: string;
+  spec_method: string;
+  benchmark_type: string;
+  github_run_id: number;
+  date: string;
+}
+
+export interface BenchmarkSiblings {
+  sku: BenchmarkSku;
+  siblings: BenchmarkSibling[];
+}
+
+export function useBenchmarkSiblings(id: number | null) {
+  return useQuery({
+    queryKey: ['benchmark-siblings', id] as const,
+    queryFn: async ({ signal }) => {
+      const res = await fetch(`/api/v1/benchmark-siblings?id=${id}`, { signal });
+      if (res.status === 404) return null;
+      if (!res.ok) throw new Error(`benchmark-siblings ${res.status}`);
+      return (await res.json()) as BenchmarkSiblings;
+    },
+    enabled: id !== null && id > 0,
+    staleTime: 5 * 60 * 1000,
+  });
+}
diff --git a/packages/app/src/hooks/api/use-trace-histograms.ts b/packages/app/src/hooks/api/use-trace-histograms.ts
new file mode 100644
index 00000000..db4220d2
--- /dev/null
+++ b/packages/app/src/hooks/api/use-trace-histograms.ts
@@ -0,0 +1,39 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface TraceHistogramPoint {
+  id: number;
+  /** Input sequence length (tokens) per completed request. */
+  isl: number[];
+  /** Output sequence length (tokens) per completed request. */
+  osl: number[];
+}
+
+export type TraceHistogramMap = Record<number, TraceHistogramPoint>;
+
+async function fetchTraceHistograms(
+  ids: number[],
+  signal?: AbortSignal,
+): Promise<TraceHistogramMap> {
+  if (ids.length === 0) return {};
+  const res = await fetch(`/api/v1/trace-histograms?ids=${ids.join(',')}`, { signal });
+  if (!res.ok) throw new Error(`trace-histograms ${res.status}`);
+  return (await res.json()) as TraceHistogramMap;
+}
+
+/**
+ * Fetch per-request ISL/OSL arrays for a set of benchmark_results.id values.
+ * Ids without a stored trace_replay blob are silently omitted from the response.
+ *
+ * Caller passes the agentic id set currently on screen; React Query handles
+ * dedup + stale-while-revalidate. Cache key is the de-duplicated, sorted ids
+ * comma-joined, so any permutation of the same set hits the same cache entry.
+ */
+export function useTraceHistograms(ids: number[], enabled = true) {
+  const sortedKey = [...new Set(ids)].toSorted((a, b) => a - b);
+  return useQuery({
+    queryKey: ['trace-histograms', sortedKey.join(',')] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) => fetchTraceHistograms(sortedKey, signal),
+    enabled: enabled && sortedKey.length > 0,
+    staleTime: 5 * 60 * 1000,
+  });
+}
diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts
new file mode 100644
index 00000000..8418aa4f
--- /dev/null
+++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts
@@ -0,0 +1,70 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface TimeSeriesPoint {
+  /** Seconds from benchmark start. */
+  t: number;
+  value: number;
+}
+export interface QueueDepthPoint {
+  t: number;
+  running: number;
+  waiting: number;
+  total: number;
+}
+export interface PointMeta {
+  id: number;
+  hardware: string;
+  framework: string;
+  model: string;
+  precision: string;
+  spec_method: string;
+  disagg: boolean;
+  conc: number;
+  offload_mode: string | null;
+  isl: number | null;
+  osl: number | null;
+  benchmark_type: string;
+  date: string;
+  run_url: string | null;
+  server_gpu_cache_hit_rate: number | null;
+  server_cpu_cache_hit_rate: number | null;
+}
+
+export interface TraceServerMetrics {
+  meta: PointMeta;
+  startNs: number;
+  endNs: number;
+  durationS: number;
+  timeslicesCount: number;
+  kvCacheUsage: TimeSeriesPoint[];
+  prefixCacheHitRate: TimeSeriesPoint[];
+  queueDepth: QueueDepthPoint[];
+  promptTokensBySource: Record<string, TimeSeriesPoint[]>;
+  prefillTps: TimeSeriesPoint[];
+  decodeTps: TimeSeriesPoint[];
+}
+
+async function fetchTraceServerMetrics(
+  id: number,
+  signal?: AbortSignal,
+): Promise<TraceServerMetrics | null> {
+  const res = await fetch(`/api/v1/trace-server-metrics?id=${id}`, { signal });
+  if (res.status === 404) return null;
+  if (!res.ok) throw new Error(`trace-server-metrics ${res.status}`);
+  return (await res.json()) as TraceServerMetrics;
+}
+
+/**
+ * Lazy-fetch parsed server-metric time-series for one agentic point.
+ * The query runs only when `enabled` is true (e.g. the detail panel opens)
+ * and `id` is truthy, so we don't pay the parse cost on every hover.
+ */
+export function useTraceServerMetrics(id: number | null, enabled = false) {
+  return useQuery({
+    queryKey: ['trace-server-metrics', id] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) =>
+      id ? fetchTraceServerMetrics(id, signal) : Promise.resolve(null),
+    enabled: enabled && Boolean(id),
+    staleTime: 5 * 60 * 1000,
+  });
+}
diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts
index 435f7629..98587c2f 100644
--- a/packages/app/src/lib/api.ts
+++ b/packages/app/src/lib/api.ts
@@ -6,6 +6,8 @@
 import type { SubmissionsResponse } from './submissions-types';
 
 export interface BenchmarkRow {
+  /** Stable per-point id from benchmark_results; used to look up trace histograms. */
+  id: number;
   hardware: string;
   framework: string;
   model: string;
diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts
index 6a6c97c8..fcbca681 100644
--- a/packages/app/src/lib/benchmark-transform.test.ts
+++ b/packages/app/src/lib/benchmark-transform.test.ts
@@ -6,6 +6,7 @@ import { rowToAggDataEntry, transformBenchmarkRows } from './benchmark-transform
 
 function makeRow(overrides: Partial<BenchmarkRow> = {}): BenchmarkRow {
   return {
+    id: 1,
     hardware: 'h200',
     framework: 'trt',
     model: 'dsr1',
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index eb62a18a..c5bdd6ed 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -49,6 +49,8 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
     row.offload_mode ??
     (typeof rawMetrics.offload_mode === 'string' ? rawMetrics.offload_mode : undefined);
   return {
+    // Coerce: Postgres bigint comes through the SQL client as a string.
+    id: typeof row.id === 'number' ? row.id : Number(row.id),
     hw: row.hardware,
     framework: row.framework,
     model: DB_MODEL_TO_DISPLAY[row.model] ?? row.model,
diff --git a/packages/app/src/lib/compare-pair-defaults.test.ts b/packages/app/src/lib/compare-pair-defaults.test.ts
index 3b49dfbc..da81ca0e 100644
--- a/packages/app/src/lib/compare-pair-defaults.test.ts
+++ b/packages/app/src/lib/compare-pair-defaults.test.ts
@@ -6,6 +6,7 @@ import { pickPairDefaults } from './compare-pair-defaults';
 
 function makeRow(overrides: Partial<BenchmarkRow>): BenchmarkRow {
   return {
+    id: 1,
     hardware: 'h100',
     framework: 'sglang',
     model: 'dsr1',
diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
index 4fa19fe8..421ac69b 100644
--- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts
+++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts
@@ -289,7 +289,21 @@ export function attachScatterTooltipHandlers<
     });
 }
 
-/** Compute tooltip left/top, flipping when it would overflow the chart container. */
+/**
+ * Compute tooltip left/top **in viewport coordinates** so the tooltip can be
+ * rendered via portal with `position: fixed`. Callers still pass cursor coords
+ * relative to `container` (matching `d3.pointer(event, container)`).
+ *
+ * Why viewport coords: the chart cards use `backdrop-filter`, which creates
+ * a stacking context. A tooltip painted inside the upper card's stacking
+ * context cannot rise above the lower card's stacking context regardless of
+ * its z-index. Portalling to document.body + `position: fixed` sidesteps the
+ * whole problem; we just need the coordinates in viewport space.
+ *
+ * Strategy: pick preferred side (right/below cursor), flip if it overflows the
+ * container, then clamp to container bounds. Tall tooltips that don't fit get
+ * clamped to the container edges.
+ */
 export function computeTooltipPosition(
   mx: number,
   my: number,
@@ -308,13 +322,21 @@ export function computeTooltipPosition(
   // Force reflow so we get real dimensions
   const tw = node.getBoundingClientRect().width || node.offsetWidth;
   const th = node.getBoundingClientRect().height || node.offsetHeight;
+  const rect = container.getBoundingClientRect();
   const cw = container.clientWidth;
   const ch = container.clientHeight;
+  const EDGE_PAD = 4;
+
+  // Prefer right of cursor; flip to left if no room.
+  let left = mx + offset + tw <= cw ? mx + offset : mx - offset - tw;
+  left = Math.max(EDGE_PAD, Math.min(cw - tw - EDGE_PAD, left));
 
-  const left = mx + offset + tw > cw ? mx - offset - tw : mx + offset;
-  const top = my + offset + th > ch ? my - offset - th : my + offset;
+  // Prefer below cursor; flip above if no room.
+  let top = my + offset + th <= ch ? my + offset : my - offset - th;
+  top = Math.max(EDGE_PAD, Math.min(ch - th - EDGE_PAD, top));
 
-  return { left, top };
+  // Convert container-local coords → viewport coords for `position: fixed`.
+  return { left: left + rect.left, top: top + rect.top };
 }
 
 /** Update scatter point positions on zoom. */
diff --git a/packages/db/migrations/006_agentic_trace_replay.sql b/packages/db/migrations/006_agentic_trace_replay.sql
new file mode 100644
index 00000000..398bc725
--- /dev/null
+++ b/packages/db/migrations/006_agentic_trace_replay.sql
@@ -0,0 +1,34 @@
+-- Capture raw aiperf trace files per agentic benchmark point.
+--
+-- The aiperf harness produces two per-point export files inside each
+-- `agentic_<suffix>` artifact:
+--   - profile_export.jsonl         (~2 MB raw, per-request data)
+--   - server_metrics_export.csv    (~20 KB raw, periodic Prometheus snapshots)
+--
+-- We persist them so the dashboard can later show per-request distributions,
+-- KV cache utilization over time, and conversation traces without needing to
+-- re-download the GitHub artifacts. Storage stays in Postgres (TOASTed) — at
+-- ~500 KB per point post-gzip the total fits comfortably without a separate
+-- blob service.
+--
+-- Mirrors the existing `server_logs` pattern (id-keyed sibling table + FK
+-- column on benchmark_results). Older, non-aiperf agentic runs simply have a
+-- NULL `trace_replay_id`.
+
+create table agentic_trace_replay (
+  id                                bigserial   primary key,
+  -- gzip(profile_export.jsonl); null when only the server metrics file existed
+  profile_export_jsonl_gz           bytea,
+  profile_export_uncompressed_size  bigint,
+  -- raw csv bytes; null when only the profile file existed
+  server_metrics_csv                bytea,
+  server_metrics_csv_size           bigint,
+  created_at                        timestamptz not null default now()
+);
+
+alter table benchmark_results
+  add column trace_replay_id bigint references agentic_trace_replay(id);
+
+create index benchmark_results_trace_replay_idx
+  on benchmark_results (trace_replay_id)
+  where trace_replay_id is not null;
diff --git a/packages/db/migrations/007_agentic_trace_server_metrics_json.sql b/packages/db/migrations/007_agentic_trace_server_metrics_json.sql
new file mode 100644
index 00000000..ba7bd095
--- /dev/null
+++ b/packages/db/migrations/007_agentic_trace_server_metrics_json.sql
@@ -0,0 +1,17 @@
+-- Add the full server-metrics time-series JSON to agentic_trace_replay.
+--
+-- The existing `server_metrics_csv` column holds aiperf's summary export —
+-- one row per metric with avg/min/max/std/p1..p99 across the entire run.
+-- That's enough for the cumulative cache-hit number but not for any
+-- "metric over time" view (KV cache utilization curve, queue depth, prefix
+-- hit rate per interval, cumulative prefill token source).
+--
+-- The harness also writes `server_metrics_export.json` which contains the
+-- raw per-scrape (~1Hz) values for every Prometheus metric over the whole
+-- benchmark window. Raw size is ~250 MB per point but it compresses ~42x
+-- to ~6 MB gzipped (text with repeated metric names + numeric values).
+-- That's the file we store here for any future time-series chart.
+
+alter table agentic_trace_replay
+  add column server_metrics_json_gz bytea,
+  add column server_metrics_json_uncompressed_size bigint;
diff --git a/packages/db/src/etl/skip-tracker.test.ts b/packages/db/src/etl/skip-tracker.test.ts
index 90ad73b7..e407db3a 100644
--- a/packages/db/src/etl/skip-tracker.test.ts
+++ b/packages/db/src/etl/skip-tracker.test.ts
@@ -9,6 +9,7 @@ describe('createSkipTracker', () => {
     expect(tracker.skips.unmappedHw).toBe(0);
     expect(tracker.skips.noIslOsl).toBe(0);
     expect(tracker.skips.dbError).toBe(0);
+    expect(tracker.skips.traceReplayMissing).toBe(0);
   });
 
   it('initializes with empty unmapped sets', () => {
diff --git a/packages/db/src/etl/skip-tracker.ts b/packages/db/src/etl/skip-tracker.ts
index 588718dd..401d197c 100644
--- a/packages/db/src/etl/skip-tracker.ts
+++ b/packages/db/src/etl/skip-tracker.ts
@@ -10,6 +10,8 @@ export interface Skips {
   noIslOsl: number;
   failedRun: number;
   dbError: number;
+  /** Agentic bmk artifact with no sibling `agentic_<suffix>` artifact, or one holding no trace files. */
+  traceReplayMissing: number;
 }
 
 export interface SkipSnapshot {
@@ -74,6 +76,7 @@ export function createSkipTracker(): SkipTracker {
     noIslOsl: 0,
     failedRun: 0,
     dbError: 0,
+    traceReplayMissing: 0,
   };
   const unmappedModels = new Set<string>();
   const unmappedHws = new Set<string>();
diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
new file mode 100644
index 00000000..8c6d92b6
--- /dev/null
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -0,0 +1,83 @@
+/**
+ * Insert per-point aiperf trace files (`profile_export.jsonl`,
+ * `server_metrics_export.csv`, `server_metrics_export.json`) as one
+ * `agentic_trace_replay` row, linked from each given benchmark_results row.
+ *
+ * Mirrors the {@link insertServerLog} idempotency contract: rows that already
+ * have a non-null `trace_replay_id` are left alone so a re-ingest doesn't
+ * duplicate the sibling blob.
+ */
+
+import { gzipSync } from 'node:zlib';
+
+import type postgres from 'postgres';
+
+type Sql = ReturnType<typeof postgres>;
+
+/**
+ * Persist the per-point trace files and link them to `benchmarkResultIds`.
+ * No-op when the id list is empty, every buffer is null, or all rows are linked.
+ * @param sql                 Active `postgres` connection.
+ * @param benchmarkResultIds  DB ids of the benchmark_results rows produced by
+ *                            the same `bmk_agentic_<suffix>` artifact whose
+ *                            sibling `agentic_<suffix>` directory holds these
+ *                            trace files.
+ * @param profileExportJsonl  Raw bytes of `profile_export.jsonl`, or null.
+ *                            Gzipped before storage.
+ * @param serverMetricsCsv    Raw bytes of `server_metrics_export.csv`, or null.
+ *                            Stored as-is.
+ * @param serverMetricsJson   Raw bytes of `server_metrics_export.json` —
+ *                            per-scrape time-series of every Prometheus metric.
+ *                            Optional, gzipped before storage (~42x ratio).
+ */
+export async function insertTraceReplay(
+  sql: Sql,
+  benchmarkResultIds: number[],
+  profileExportJsonl: Buffer | null,
+  serverMetricsCsv: Buffer | null,
+  serverMetricsJson: Buffer | null = null,
+): Promise<void> {
+  if (benchmarkResultIds.length === 0) return;
+  if (!profileExportJsonl && !serverMetricsCsv && !serverMetricsJson) return;
+
+  // Skip already-linked rows so a re-ingest doesn't duplicate sibling blobs.
+  // NOTE(review): the insert + update below are not wrapped in one transaction.
+  const unlinked = await sql<{ id: number }[]>`
+    select id from benchmark_results
+    where id = any(${sql.array(benchmarkResultIds)}::bigint[])
+      and trace_replay_id is null
+  `;
+  if (unlinked.length === 0) return;
+
+  const profileGz = profileExportJsonl ? gzipSync(profileExportJsonl) : null;
+  const profileSize = profileExportJsonl ? profileExportJsonl.length : null;
+  const csvSize = serverMetricsCsv ? serverMetricsCsv.length : null;
+  const metricsJsonGz = serverMetricsJson ? gzipSync(serverMetricsJson) : null;
+  const metricsJsonSize = serverMetricsJson ? serverMetricsJson.length : null;
+
+  const [{ id: traceReplayId }] = await sql<{ id: number }[]>`
+    insert into agentic_trace_replay (
+      profile_export_jsonl_gz,
+      profile_export_uncompressed_size,
+      server_metrics_csv,
+      server_metrics_csv_size,
+      server_metrics_json_gz,
+      server_metrics_json_uncompressed_size
+    )
+    values (
+      ${profileGz},
+      ${profileSize},
+      ${serverMetricsCsv},
+      ${csvSize},
+      ${metricsJsonGz},
+      ${metricsJsonSize}
+    )
+    returning id
+  `;
+
+  await sql`
+    update benchmark_results
+    set trace_replay_id = ${traceReplayId}
+    where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
+  `;
+}
diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts
index 35183789..eeb55313 100644
--- a/packages/db/src/ingest-ci-run.ts
+++ b/packages/db/src/ingest-ci-run.ts
@@ -45,6 +45,7 @@ import {
   bulkUpsertAvailability,
   insertServerLog,
 } from './etl/benchmark-ingest';
+import { insertTraceReplay } from './etl/trace-replay-ingest';
 import { mapAggEvalRow, mapEvalRow } from './etl/eval-mapper';
 import { ingestEvalRow } from './etl/eval-ingest';
 import { mapEvalSamples } from './etl/eval-samples-mapper';
@@ -209,6 +210,14 @@ const ARTIFACT_NAMES = {
   changelog: 'changelog-metadata',
 } as const;
 
+/**
+ * Strip the `bmk_` and/or `agentic_` prefixes from an artifact directory name
+ * so the bare suffix becomes a shared key between `bmk_agentic_<suffix>` and
+ * its sibling `agentic_<suffix>` artifact.
+ */
+const stripBmkAndAgenticPrefix = (s: string): string =>
+  s.replace(/^bmk_/u, '').replace(/^agentic_/u, '');
+
 function readJson(filePath: string): unknown {
   try {
     return JSON.parse(fs.readFileSync(filePath, 'utf8'));
@@ -327,6 +336,7 @@ async function main(): Promise<void> {
   let totalSamples = 0;
   let totalSampleFiles = 0;
   let totalChangelogs = 0;
+  let totalTraceReplayLinked = 0;
 
   // ── Check for evals-only flag in changelog ────────────────────────────
   const changelogDir = path.join(artifactsDir, ARTIFACT_NAMES.changelog);
@@ -381,6 +391,56 @@ async function main(): Promise<void> {
       console.log(`  Found ${serverLogPaths.size} server log artifact(s)`);
     }
 
+    // Sibling aiperf artifacts: each `bmk_agentic_<suffix>` is paired with an
+    // `agentic_<suffix>` dir holding `profile_export.jsonl`,
+    // `server_metrics_export.csv` and `server_metrics_export.json`, in either
+    // a `trace_replay/` subdir (older layout) or `aiperf_artifacts/` (current).
+    // Older non-aiperf agentic runs don't ship this sibling. Key on the bare
+    // suffix so both names map to the same Map entry.
+    const TRACE_SUBDIRS = ['aiperf_artifacts', 'trace_replay'];
+    const traceReplayPaths = new Map<
+      string,
+      {
+        profileJsonl: string | null;
+        serverMetricsCsv: string | null;
+        serverMetricsJson: string | null;
+      }
+    >();
+    if (fs.existsSync(artifactsDir)) {
+      for (const d of fs.readdirSync(artifactsDir)) {
+        if (!d.startsWith('agentic_')) continue;
+        let profile: string | null = null;
+        let metrics: string | null = null;
+        let metricsJson: string | null = null;
+        for (const sub of TRACE_SUBDIRS) {
+          const dir = path.join(artifactsDir, d, sub);
+          if (!fs.existsSync(dir) || !fs.statSync(dir).isDirectory()) continue;
+          if (!profile) {
+            const p = path.join(dir, 'profile_export.jsonl');
+            if (fs.existsSync(p)) profile = p;
+          }
+          if (!metrics) {
+            const m = path.join(dir, 'server_metrics_export.csv');
+            if (fs.existsSync(m)) metrics = m;
+          }
+          if (!metricsJson) {
+            const j = path.join(dir, 'server_metrics_export.json');
+            if (fs.existsSync(j)) metricsJson = j;
+          }
+        }
+        if (!profile && !metrics && !metricsJson) continue;
+        const suffix = stripBmkAndAgenticPrefix(d);
+        traceReplayPaths.set(suffix, {
+          profileJsonl: profile,
+          serverMetricsCsv: metrics,
+          serverMetricsJson: metricsJson,
+        });
+      }
+    }
+    if (traceReplayPaths.size > 0) {
+      console.log(`  Found ${traceReplayPaths.size} trace_replay sibling artifact(s)`);
+    }
+
     const allBmkFiles = [...bmkFiles, ...allBmkDirs.flatMap((d) => findJsonFiles(d))];
     console.log(`  Found ${allBmkFiles.length} benchmark JSON file(s)`);
 
@@ -448,12 +508,42 @@ async function main(): Promise<void> {
               }
             }
           }
+
+          // Trace-replay sibling lookup for agentic points only. The aiperf
+          // harness emits `agentic_<suffix>/{aiperf_artifacts,trace_replay}/...`
+          // next to the `bmk_agentic_<suffix>` artifact we just ingested.
+          if (parentDir.startsWith('bmk_agentic_') && insertedIds.length > 0) {
+            const suffix = stripBmkAndAgenticPrefix(parentDir);
+            const trace = traceReplayPaths.get(suffix);
+            if (trace) {
+              try {
+                const profile = trace.profileJsonl ? fs.readFileSync(trace.profileJsonl) : null;
+                const metrics = trace.serverMetricsCsv
+                  ? fs.readFileSync(trace.serverMetricsCsv)
+                  : null;
+                const metricsJson = trace.serverMetricsJson
+                  ? fs.readFileSync(trace.serverMetricsJson)
+                  : null;
+                await insertTraceReplay(sql, insertedIds, profile, metrics, metricsJson);
+                totalTraceReplayLinked += insertedIds.length;
+              } catch (error: any) {
+                tracker.recordDbError(`trace_replay for ${suffix}`, error);
+              }
+            } else {
+              tracker.skips.traceReplayMissing++;
+            }
+          }
         } catch (error: any) {
           tracker.recordDbError(path.basename(file), error);
         }
       }
     }
     console.log(`  Benchmarks: +${totalNewBmk} new, ${totalDupBmk} dup`);
+    if (totalTraceReplayLinked > 0 || tracker.skips.traceReplayMissing > 0) {
+      console.log(
+        `  Trace replay: ${totalTraceReplayLinked} rows linked, ${tracker.skips.traceReplayMissing} agentic point(s) missing sibling artifact`,
+      );
+    }
 
     if (availRows.length > 0) {
       try {
diff --git a/packages/db/src/ingest-gcs-backup.ts b/packages/db/src/ingest-gcs-backup.ts
index 6857f817..b4a6fb95 100644
--- a/packages/db/src/ingest-gcs-backup.ts
+++ b/packages/db/src/ingest-gcs-backup.ts
@@ -458,6 +458,8 @@ async function mapWorkflowDir(
       unmappedHw: local.skips.unmappedHw,
       noIslOsl: local.skips.noIslOsl,
       failedRun: local.skips.failedRun,
+      // GCS backup doesn't ingest aiperf trace files; counter stays 0.
+      traceReplayMissing: local.skips.traceReplayMissing,
     },
     localUnmappedModels: new Set(local.unmappedModels),
     localUnmappedHws: new Set(local.unmappedHws),
diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts
index 19527f22..785d82c4 100644
--- a/packages/db/src/json-provider.ts
+++ b/packages/db/src/json-provider.ts
@@ -273,6 +273,7 @@ function toBenchmarkRow(
   metrics?: Record<string, number>,
 ): BenchmarkRow {
   return {
+    id: br.id,
     hardware: c.hardware,
     framework: c.framework,
     model: c.model,
diff --git a/packages/db/src/queries/benchmark-siblings.ts b/packages/db/src/queries/benchmark-siblings.ts
new file mode 100644
index 00000000..245a1170
--- /dev/null
+++ b/packages/db/src/queries/benchmark-siblings.ts
@@ -0,0 +1,132 @@
+/**
+ * Find all benchmark_results that share the same SKU (hardware + framework +
+ * model + precision + spec_method + benchmark_type) and workflow_run as the
+ * given point; `disagg` is reported per sibling, not matched on. Used by the
+ * detail page to render a "switch between concs / parallelisms" navigator.
+ */
+
+import type { DbClient } from '../connection.js';
+
+export interface BenchmarkSibling {
+  id: number;
+  conc: number;
+  /** "on" | "off" | null. */
+  offload_mode: string | null;
+  decode_tp: number;
+  decode_ep: number;
+  prefill_tp: number;
+  prefill_ep: number;
+  num_prefill_gpu: number;
+  num_decode_gpu: number;
+  disagg: boolean;
+  /** True if this row IS the point passed in. */
+  is_current: boolean;
+  /** Whether the row links to a trace_replay record (for navigation hint). */
+  has_trace: boolean;
+}
+
+export interface BenchmarkSku {
+  hardware: string;
+  framework: string;
+  model: string;
+  precision: string;
+  spec_method: string;
+  benchmark_type: string;
+  /** GitHub run id of the workflow_run so the page header can hint at provenance. */
+  github_run_id: number;
+  date: string;
+}
+
+export interface BenchmarkSiblings {
+  sku: BenchmarkSku;
+  siblings: BenchmarkSibling[];
+}
+
+export async function getBenchmarkSiblings(
+  sql: DbClient,
+  benchmarkResultId: number,
+): Promise<BenchmarkSiblings | null> {
+  // Step 1: resolve the SKU defining fields for the requested point.
+  const seed = (await sql`
+    select
+      c.hardware, c.framework, c.model, c.precision, c.spec_method,
+      br.benchmark_type, br.workflow_run_id, br.date::text,
+      wr.github_run_id
+    from benchmark_results br
+    join configs c on c.id = br.config_id
+    join workflow_runs wr on wr.id = br.workflow_run_id
+    where br.id = ${benchmarkResultId}
+  `) as unknown as {
+    hardware: string;
+    framework: string;
+    model: string;
+    precision: string;
+    spec_method: string;
+    benchmark_type: string;
+    workflow_run_id: number;
+    date: string;
+    github_run_id: number;
+  }[];
+  const root = seed[0];
+  if (!root) return null;
+
+  // Step 2: pull every sibling row sharing the SKU within the same workflow_run.
+  const rows = (await sql`
+    select
+      br.id, br.conc, br.offload_mode,
+      c.decode_tp, c.decode_ep, c.prefill_tp, c.prefill_ep,
+      c.num_prefill_gpu, c.num_decode_gpu, c.disagg,
+      (br.trace_replay_id is not null) as has_trace
+    from benchmark_results br
+    join configs c on c.id = br.config_id
+    where br.workflow_run_id = ${root.workflow_run_id}
+      and br.benchmark_type = ${root.benchmark_type}
+      and c.hardware = ${root.hardware}
+      and c.framework = ${root.framework}
+      and c.model = ${root.model}
+      and c.precision = ${root.precision}
+      and c.spec_method = ${root.spec_method}
+    order by c.decode_tp, c.decode_ep, br.offload_mode nulls first, br.conc
+  `) as unknown as {
+    id: number;
+    conc: number;
+    offload_mode: string | null;
+    decode_tp: number;
+    decode_ep: number;
+    prefill_tp: number;
+    prefill_ep: number;
+    num_prefill_gpu: number;
+    num_decode_gpu: number;
+    disagg: boolean;
+    has_trace: boolean;
+  }[];
+
+  const siblings: BenchmarkSibling[] = rows.map((r) => ({
+    id: Number(r.id),
+    conc: r.conc,
+    offload_mode: r.offload_mode,
+    decode_tp: r.decode_tp,
+    decode_ep: r.decode_ep,
+    prefill_tp: r.prefill_tp,
+    prefill_ep: r.prefill_ep,
+    num_prefill_gpu: r.num_prefill_gpu,
+    num_decode_gpu: r.num_decode_gpu,
+    disagg: r.disagg,
+    is_current: Number(r.id) === benchmarkResultId,
+    has_trace: r.has_trace,
+  }));
+
+  return {
+    sku: {
+      hardware: root.hardware,
+      framework: root.framework,
+      model: root.model,
+      precision: root.precision,
+      spec_method: root.spec_method,
+      benchmark_type: root.benchmark_type,
+      github_run_id: Number(root.github_run_id),
+      date: root.date,
+    },
+    siblings,
+  };
+}
diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts
index 74e20380..36bb0e65 100644
--- a/packages/db/src/queries/benchmarks.ts
+++ b/packages/db/src/queries/benchmarks.ts
@@ -1,6 +1,13 @@
 import type { DbClient } from '../connection.js';
 
 export interface BenchmarkRow {
+  /**
+   * Stable per-point id from benchmark_results. Used by the frontend to look
+   * up associated detail blobs (e.g. trace_replay histograms).
+   * Typed as `number` even though the column is a Postgres bigint — precision
+   * loss is hypothetical; real ids stay well below Number.MAX_SAFE_INTEGER.
+   */
+  id: number;
   hardware: string;
   framework: string;
   model: string;
@@ -55,6 +62,7 @@ export async function getLatestBenchmarks(
     const dateFilter = exact ? sql`br.date = ${date}::date` : sql`br.date <= ${date}::date`;
     const rows = await sql`
       SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl)
+        br.id,
         c.hardware,
         c.framework,
         c.model,
@@ -95,6 +103,7 @@ export async function getLatestBenchmarks(
   // No date filter: use materialized view for instant lookups
   const rows = await sql`
     SELECT
+      lb.id,
       c.hardware,
       c.framework,
       c.model,
diff --git a/packages/db/src/queries/trace-histograms.ts b/packages/db/src/queries/trace-histograms.ts
new file mode 100644
index 00000000..c243afd8
--- /dev/null
+++ b/packages/db/src/queries/trace-histograms.ts
@@ -0,0 +1,82 @@
+/**
+ * Fetch per-request ISL/OSL arrays from stored aiperf `profile_export.jsonl`
+ * blobs (gzipped in `agentic_trace_replay.profile_export_jsonl_gz`). Caller
+ * passes the set of `benchmark_results.id`s it wants and receives one entry
+ * per id that actually has a trace_replay blob (others are silently skipped).
+ *
+ * The JSONL has one JSON object per request with the shape:
+ *   { metrics: { input_sequence_length: { value, unit }, output_sequence_length: {...}, ... } }
+ *
+ * Returns raw arrays rather than pre-binned histograms — payload stays tiny
+ * (~256 ints * 2 fields per point, ~2 KB compressed) and the frontend can bin
+ * however it wants.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+import type { DbClient } from '../connection.js';
+
+export interface TraceHistogramPoint {
+  /** benchmark_results.id this entry belongs to. */
+  id: number;
+  /** Input sequence length (tokens) per completed request. */
+  isl: number[];
+  /** Output sequence length (tokens) per completed request. */
+  osl: number[];
+}
+
+export type TraceHistogramMap = Record<number, TraceHistogramPoint>;
+
+export async function getTraceHistograms(
+  sql: DbClient,
+  benchmarkResultIds: number[],
+): Promise<TraceHistogramMap> {
+  if (benchmarkResultIds.length === 0) return {};
+
+  const rows = (await sql`
+    select
+      br.id as benchmark_result_id,
+      atr.profile_export_jsonl_gz as blob
+    from benchmark_results br
+    join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = any(${benchmarkResultIds}::bigint[])
+      and atr.profile_export_jsonl_gz is not null
+  `) as { benchmark_result_id: number; blob: Buffer }[];
+
+  const result: TraceHistogramMap = {};
+  for (const row of rows) {
+    try {
+      const jsonl = gunzipSync(row.blob).toString('utf8');
+      const isl: number[] = [];
+      const osl: number[] = [];
+      for (const line of jsonl.split('\n')) {
+        if (!line) continue;
+        let rec: { metrics?: Record<string, { value?: number } | number> };
+        try {
+          rec = JSON.parse(line);
+        } catch {
+          continue;
+        }
+        const m = rec.metrics ?? {};
+        const islVal = readMetric(m['input_sequence_length']);
+        const oslVal = readMetric(m['output_sequence_length']);
+        if (typeof islVal === 'number' && Number.isFinite(islVal)) isl.push(islVal);
+        if (typeof oslVal === 'number' && Number.isFinite(oslVal)) osl.push(oslVal);
+      }
+      result[Number(row.benchmark_result_id)] = {
+        id: Number(row.benchmark_result_id),
+        isl,
+        osl,
+      };
+    } catch {
+      // Drop malformed blobs silently — caller treats missing ids as "no data".
+    }
+  }
+  return result;
+}
+
+function readMetric(v: { value?: number } | number | undefined): number | undefined {
+  if (v === undefined || v === null) return undefined;
+  if (typeof v === 'number') return v;
+  return v.value;
+}
diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts
new file mode 100644
index 00000000..822ae633
--- /dev/null
+++ b/packages/db/src/queries/trace-server-metrics.ts
@@ -0,0 +1,275 @@
+/**
+ * Parse aiperf's `server_metrics_export.json` blob (gzipped in
+ * `agentic_trace_replay.server_metrics_json_gz`) and return a slim, chart-ready
+ * time-series for one benchmark point.
+ *
+ * The raw JSON has shape:
+ *   metrics: {
+ *     "<metric_name>": {
+ *       series: [
+ *         {
+ *           labels: { ... },
+ *           stats: { ... summary ... },
+ *           timeslices: [
+ *             { start_ns, end_ns, avg, min, max }            // gauges
+ *             { start_ns, end_ns, total, rate }              // counters
+ *           ]
+ *         }
+ *       ]
+ *     }
+ *   }
+ *
+ * Timeslices are ~1 Hz windows. The benchmark window can be tens of minutes
+ * (1800+ windows). We return them as `[{ t, ...}]` arrays with `t` measured
+ * in seconds from the benchmark start so the frontend doesn't need to
+ * shuffle bigint nanoseconds around.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+import type { DbClient } from '../connection.js';
+
+interface GaugeSlice {
+  start_ns: number;
+  end_ns: number;
+  avg?: number;
+  min?: number;
+  max?: number;
+}
+
+interface CounterSlice {
+  start_ns: number;
+  end_ns: number;
+  total?: number;
+  rate?: number;
+}
+
+interface Series {
+  endpoint_url?: string;
+  labels?: Record<string, string>;
+  stats?: Record<string, unknown>;
+  timeslices?: (GaugeSlice & CounterSlice)[];
+}
+
+interface MetricsJson {
+  metrics?: Record<string, { type?: string; description?: string; series?: Series[] }>;
+}
+
+export interface TimeSeriesPoint {
+  /** Seconds from benchmark start. */
+  t: number;
+  value: number;
+}
+
+export interface QueueDepthPoint {
+  t: number;
+  running: number;
+  waiting: number;
+  /** Optional total — frontend can compute too. */
+  total: number;
+}
+
+export interface PointMeta {
+  id: number;
+  hardware: string;
+  framework: string;
+  model: string;
+  precision: string;
+  spec_method: string;
+  disagg: boolean;
+  conc: number;
+  offload_mode: string | null;
+  isl: number | null;
+  osl: number | null;
+  benchmark_type: string;
+  date: string;
+  /** GitHub Actions run URL for jumping to the source. */
+  run_url: string | null;
+  /** Cumulative end-of-run cache-hit number the dashboard already shows. */
+  server_gpu_cache_hit_rate: number | null;
+  /** Cumulative end-of-run CPU offload cache-hit. */
+  server_cpu_cache_hit_rate: number | null;
+}
+
+export interface TraceServerMetrics {
+  /** Point context — hardware, model, conc, etc. for the page header. */
+  meta: PointMeta;
+  /** ns wall-clock of the first window's start; for debugging only. */
+  startNs: number;
+  /** ns wall-clock of the last window's end. */
+  endNs: number;
+  /** Total benchmark window in seconds. */
+  durationS: number;
+  /** Number of 1Hz windows captured. */
+  timeslicesCount: number;
+  /** vllm:kv_cache_usage_perc avg per scrape, values in 0..1. */
+  kvCacheUsage: TimeSeriesPoint[];
+  /** Per-window prefix-cache hit rate computed as Δhits / Δqueries (0..1). */
+  prefixCacheHitRate: TimeSeriesPoint[];
+  /** Request queue depth: running, waiting, total per scrape. */
+  queueDepth: QueueDepthPoint[];
+  /**
+   * Per-source prompt-token counts over time (counter rate per scrape).
+   * Keyed by the value of the `source` label (typically `local_cache_hit`,
+   * `external_cache_hit`, `miss`, etc.). Plot as stacked area.
+   */
+  promptTokensBySource: Record<string, TimeSeriesPoint[]>;
+  /** Prefill throughput: vllm:prompt_tokens rate (tokens/sec) per scrape. */
+  prefillTps: TimeSeriesPoint[];
+  /** Decode throughput: vllm:generation_tokens rate (tokens/sec) per scrape. */
+  decodeTps: TimeSeriesPoint[];
+}
+
+export async function getTraceServerMetrics(
+  sql: DbClient,
+  benchmarkResultId: number,
+): Promise<TraceServerMetrics | null> {
+  const rows = (await sql`
+    select
+      atr.server_metrics_json_gz as blob,
+      br.id, c.hardware, c.framework, c.model, c.precision, c.spec_method, c.disagg,
+      br.conc, br.offload_mode, br.isl, br.osl, br.benchmark_type,
+      br.date::text,
+      case when wr.html_url is not null then wr.html_url || '/attempts/' || wr.run_attempt else null end as run_url,
+      (br.metrics ->> 'server_gpu_cache_hit_rate')::numeric as server_gpu_cache_hit_rate,
+      (br.metrics ->> 'server_cpu_cache_hit_rate')::numeric as server_cpu_cache_hit_rate
+    from benchmark_results br
+    join configs c on c.id = br.config_id
+    join workflow_runs wr on wr.id = br.workflow_run_id
+    left join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = ${benchmarkResultId}
+  `) as unknown as ({ blob: Buffer | null } & PointMeta)[];
+  const row = rows[0];
+  if (!row) return null;
+  const blob = row.blob;
+  if (!blob) return null;
+  const pointMeta: PointMeta = {
+    id: Number(row.id),
+    hardware: row.hardware,
+    framework: row.framework,
+    model: row.model,
+    precision: row.precision,
+    spec_method: row.spec_method,
+    disagg: row.disagg,
+    conc: row.conc,
+    offload_mode: row.offload_mode,
+    isl: row.isl,
+    osl: row.osl,
+    benchmark_type: row.benchmark_type,
+    date: row.date,
+    run_url: row.run_url,
+    server_gpu_cache_hit_rate:
+      row.server_gpu_cache_hit_rate === null ? null : Number(row.server_gpu_cache_hit_rate),
+    server_cpu_cache_hit_rate:
+      row.server_cpu_cache_hit_rate === null ? null : Number(row.server_cpu_cache_hit_rate),
+  };
+
+  const parsed = JSON.parse(gunzipSync(blob).toString('utf8')) as MetricsJson;
+  const metrics = parsed.metrics ?? {};
+
+  const firstSeries = (name: string): Series | undefined => {
+    const s = metrics[name]?.series;
+    return s && s.length > 0 ? s[0] : undefined;
+  };
+
+  // Compute the timing reference across every series: earliest start, latest end.
+  let startNs = Number.POSITIVE_INFINITY;
+  let endNs = 0;
+  let timeslicesCount = 0;
+  for (const metricMeta of Object.values(metrics)) {
+    for (const s of metricMeta?.series ?? []) {
+      const ts = s.timeslices ?? [];
+      if (ts.length === 0) continue;
+      timeslicesCount = Math.max(timeslicesCount, ts.length);
+      const first = ts[0]!;
+      const last = ts.at(-1)!;
+      if (typeof first.start_ns === 'number' && first.start_ns < startNs) startNs = first.start_ns;
+      if (typeof last.end_ns === 'number' && last.end_ns > endNs) endNs = last.end_ns;
+    }
+  }
+  if (!Number.isFinite(startNs)) startNs = 0;
+  const tOf = (ns: number) => (ns - startNs) / 1e9;
+
+  // KV cache usage (gauge, 0..1)
+  const kvCacheUsage: TimeSeriesPoint[] = [];
+  const kvSeries =
+    firstSeries('vllm:kv_cache_usage_perc') ?? firstSeries('vllm:gpu_cache_usage_perc');
+  for (const ts of kvSeries?.timeslices ?? []) {
+    if (typeof ts.avg === 'number') {
+      kvCacheUsage.push({ t: tOf(ts.start_ns), value: ts.avg });
+    }
+  }
+
+  // Prefix cache hit rate per scrape (Δhits / Δqueries from counter rate).
+  // `rate` is already per-window delta; we just divide.
+  const hitsTs = firstSeries('vllm:prefix_cache_hits')?.timeslices ?? [];
+  const qsTs = firstSeries('vllm:prefix_cache_queries')?.timeslices ?? [];
+  const prefixCacheHitRate: TimeSeriesPoint[] = [];
+  const minLen = Math.min(hitsTs.length, qsTs.length);
+  for (let i = 0; i < minLen; i++) {
+    const h = hitsTs[i]!;
+    const q = qsTs[i]!;
+    if (typeof q.rate === 'number' && q.rate > 0 && typeof h.rate === 'number') {
+      prefixCacheHitRate.push({ t: tOf(h.start_ns), value: h.rate / q.rate });
+    }
+  }
+
+  // Queue depth: pair running + waiting by index.
+  const runTs = firstSeries('vllm:num_requests_running')?.timeslices ?? [];
+  const waitTs = firstSeries('vllm:num_requests_waiting')?.timeslices ?? [];
+  const queueDepth: QueueDepthPoint[] = [];
+  const qlen = Math.min(runTs.length, waitTs.length);
+  for (let i = 0; i < qlen; i++) {
+    const r = runTs[i]!;
+    const w = waitTs[i]!;
+    const running = typeof r.avg === 'number' ? r.avg : 0;
+    const waiting = typeof w.avg === 'number' ? w.avg : 0;
+    queueDepth.push({
+      t: tOf(r.start_ns),
+      running,
+      waiting,
+      total: running + waiting,
+    });
+  }
+
+  // Throughput: extract counter `rate` (already per-second delta from aiperf).
+  const counterRateSeries = (name: string): TimeSeriesPoint[] => {
+    const s = firstSeries(name);
+    if (!s) return [];
+    const out: TimeSeriesPoint[] = [];
+    for (const ts of s.timeslices ?? []) {
+      if (typeof ts.rate === 'number') out.push({ t: tOf(ts.start_ns), value: ts.rate });
+    }
+    return out;
+  };
+  const prefillTps = counterRateSeries('vllm:prompt_tokens');
+  const decodeTps = counterRateSeries('vllm:generation_tokens');
+
+  // Per-source prompt tokens — emit one TS array per source label.
+  const promptTokensBySource: Record<string, TimeSeriesPoint[]> = {};
+  for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) {
+    const labels = series.labels ?? {};
+    const source = labels['source'] ?? labels['reason'] ?? labels['kind'] ?? JSON.stringify(labels);
+    const arr: TimeSeriesPoint[] = [];
+    for (const ts of series.timeslices ?? []) {
+      if (typeof ts.rate === 'number') {
+        arr.push({ t: tOf(ts.start_ns), value: ts.rate });
+      }
+    }
+    if (arr.length > 0) promptTokensBySource[source] = arr;
+  }
+
+  return {
+    meta: pointMeta,
+    startNs,
+    endNs,
+    durationS: endNs > startNs ? (endNs - startNs) / 1e9 : 0,
+    timeslicesCount,
+    kvCacheUsage,
+    prefixCacheHitRate,
+    queueDepth,
+    promptTokensBySource,
+    prefillTps,
+    decodeTps,
+  };
+}

From 0067bfcd72d0f57242a418e5acc1cef604135554 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 19:01:49 -0500
Subject: [PATCH 24/96] feat(agentic): hover crosshair + expand-to-dialog on
 detail charts

Refactor every chart on /inference/agentic/[id] from innerHTML string SVG
to JSX SVG so we can attach mouse handlers. New shared ChartHover overlay
renders a vertical crosshair following the cursor and a floating tooltip
listing series values at that x:
- TimeSeriesChart: linearly interpolated value per series, timestamp title
- Distribution: bin range + count + cumulative percentile under cursor
- StackedAreaChart: per-source % share at the nearest timeslice

Each chart card now has a maximize button that opens the same chart in
a Dialog at 1300x520 (vs 720x260 inline), preserving hover and all data
labels. Charts accept width/height props so they re-render appropriately
in either size.
---
 .../agentic-point/agentic-point-detail.tsx    | 334 +++++------
 .../inference/agentic-point/chart-hover.tsx   | 148 +++++
 .../inference/agentic-point/distribution.tsx  | 298 ++++++----
 .../agentic-point/expandable-chart.tsx        |  46 ++
 .../agentic-point/time-series-chart.tsx       | 525 ++++++++++++------
 5 files changed, 922 insertions(+), 429 deletions(-)
 create mode 100644 packages/app/src/components/inference/agentic-point/chart-hover.tsx
 create mode 100644 packages/app/src/components/inference/agentic-point/expandable-chart.tsx

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 3cd274ba..ee58332d 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -14,6 +14,7 @@ import {
 import { useBenchmarkSiblings } from '@/hooks/api/use-benchmark-siblings';
 
 import { Distribution } from './distribution';
+import { ExpandableChart } from './expandable-chart';
 import { SiblingNav } from './sibling-nav';
 import {
   StackedAreaChart,
@@ -71,14 +72,11 @@ function PointSummary({ meta }: { meta: PointMeta }) {
   );
 }
 
-function ChartCard({ title, children }: { title: string; children: React.ReactNode }) {
-  return (
-    <div className="rounded-lg border border-border/40 bg-card/40 p-4">
-      <h2 className="text-sm font-semibold text-foreground mb-3">{title}</h2>
-      {children}
-    </div>
-  );
-}
+/** Sizes passed to charts for the inline (small) vs expanded (dialog) render. */
+const CHART_SIZES = {
+  inline: { width: 720, height: 260 },
+  expanded: { width: 1300, height: 520 },
+};
 
 export function AgenticPointDetail({ id }: Props) {
   const router = useRouter();
@@ -131,164 +129,178 @@ export function AgenticPointDetail({ id }: Props) {
       )}
 
       <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
-        <ChartCard title="Input sequence length distribution">
-          {hist ? (
-            <Distribution values={hist.isl} unit="tokens" />
-          ) : histQuery.isLoading ? (
-            <Skeleton />
-          ) : (
-            <Empty />
-          )}
-        </ChartCard>
-        <ChartCard title="Output sequence length distribution">
-          {hist ? (
-            <Distribution values={hist.osl} unit="tokens" />
-          ) : histQuery.isLoading ? (
-            <Skeleton />
-          ) : (
-            <Empty />
-          )}
-        </ChartCard>
+        <ExpandableChart
+          title="Input sequence length distribution"
+          render={(expanded) => {
+            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+            if (hist) return <Distribution values={hist.isl} unit="tokens" {...size} />;
+            return histQuery.isLoading ? <Skeleton /> : <Empty />;
+          }}
+        />
+        <ExpandableChart
+          title="Output sequence length distribution"
+          render={(expanded) => {
+            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+            if (hist) return <Distribution values={hist.osl} unit="tokens" {...size} />;
+            return histQuery.isLoading ? <Skeleton /> : <Empty />;
+          }}
+        />
 
-        <ChartCard title="KV cache utilization over time">
-          {metrics ? (
-            <TimeSeriesChart
-              series={[
-                {
-                  name: 'GPU KV cache (avg n=50)',
-                  data: rollingAverage(metrics.kvCacheUsage, 50),
-                  rawData: metrics.kvCacheUsage,
-                  color: '#3b82f6',
-                  strokeWidth: 2,
-                },
-              ]}
-              durationS={metrics.durationS}
-              yMax={1}
-              yFmt={(v) => `${(v * 100).toFixed(0)}%`}
-              yAxisLabel="KV cache (%)"
-            />
-          ) : (
-            <Skeleton />
-          )}
-        </ChartCard>
+        <ExpandableChart
+          title="KV cache utilization over time"
+          render={(expanded) => {
+            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+            if (!metrics) return <Skeleton />;
+            return (
+              <TimeSeriesChart
+                series={[
+                  {
+                    name: 'GPU KV cache (avg n=50)',
+                    data: rollingAverage(metrics.kvCacheUsage, 50),
+                    rawData: metrics.kvCacheUsage,
+                    color: '#3b82f6',
+                    strokeWidth: 2,
+                  },
+                ]}
+                durationS={metrics.durationS}
+                yMax={1}
+                yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+                yAxisLabel="KV cache (%)"
+                {...size}
+              />
+            );
+          }}
+        />
 
-        <ChartCard title="Request queue depth">
-          {metrics ? (
-            <TimeSeriesChart
-              series={[
-                {
-                  name: 'Running (avg n=50)',
-                  data: rollingAverage(
-                    metrics.queueDepth.map((p: QueueDepthPoint) => ({
-                      t: p.t,
-                      value: p.running,
-                    })),
-                    50,
-                  ),
-                  color: '#22c55e',
-                  strokeWidth: 2,
-                },
-                {
-                  name: 'Waiting (avg n=50)',
-                  data: rollingAverage(
-                    metrics.queueDepth.map((p: QueueDepthPoint) => ({
-                      t: p.t,
-                      value: p.waiting,
-                    })),
-                    50,
-                  ),
-                  color: '#ef4444',
-                  strokeWidth: 2,
-                },
-                {
-                  name: 'Total (avg n=50)',
-                  data: rollingAverage(
-                    metrics.queueDepth.map((p: QueueDepthPoint) => ({
-                      t: p.t,
-                      value: p.total,
-                    })),
-                    50,
-                  ),
-                  color: '#3b82f6',
-                  strokeWidth: 2,
-                },
-              ]}
-              durationS={metrics.durationS}
-              yAxisLabel="Requests"
-            />
-          ) : (
-            <Skeleton />
-          )}
-        </ChartCard>
+        <ExpandableChart
+          title="Request queue depth"
+          render={(expanded) => {
+            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+            if (!metrics) return <Skeleton />;
+            return (
+              <TimeSeriesChart
+                series={[
+                  {
+                    name: 'Running (avg n=50)',
+                    data: rollingAverage(
+                      metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                        t: p.t,
+                        value: p.running,
+                      })),
+                      50,
+                    ),
+                    color: '#22c55e',
+                    strokeWidth: 2,
+                  },
+                  {
+                    name: 'Waiting (avg n=50)',
+                    data: rollingAverage(
+                      metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                        t: p.t,
+                        value: p.waiting,
+                      })),
+                      50,
+                    ),
+                    color: '#ef4444',
+                    strokeWidth: 2,
+                  },
+                  {
+                    name: 'Total (avg n=50)',
+                    data: rollingAverage(
+                      metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                        t: p.t,
+                        value: p.total,
+                      })),
+                      50,
+                    ),
+                    color: '#3b82f6',
+                    strokeWidth: 2,
+                  },
+                ]}
+                durationS={metrics.durationS}
+                yAxisLabel="Requests"
+                {...size}
+              />
+            );
+          }}
+        />
 
-        <ChartCard title="Prefix cache hit rate per interval">
-          {metrics ? (
-            <TimeSeriesChart
-              series={[
-                {
-                  name: 'GPU (HBM, avg n=50)',
-                  data: rollingAverage(metrics.prefixCacheHitRate, 50),
-                  rawData: metrics.prefixCacheHitRate,
-                  color: '#a855f7',
-                  strokeWidth: 2,
-                },
-              ]}
-              durationS={metrics.durationS}
-              yMax={1}
-              yFmt={(v) => `${(v * 100).toFixed(0)}%`}
-              yAxisLabel="Hit rate (%)"
-            />
-          ) : (
-            <Skeleton />
-          )}
-        </ChartCard>
+        <ExpandableChart
+          title="Prefix cache hit rate per interval"
+          render={(expanded) => {
+            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+            if (!metrics) return <Skeleton />;
+            return (
+              <TimeSeriesChart
+                series={[
+                  {
+                    name: 'GPU (HBM, avg n=50)',
+                    data: rollingAverage(metrics.prefixCacheHitRate, 50),
+                    rawData: metrics.prefixCacheHitRate,
+                    color: '#a855f7',
+                    strokeWidth: 2,
+                  },
+                ]}
+                durationS={metrics.durationS}
+                yMax={1}
+                yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+                yAxisLabel="Hit rate (%)"
+                {...size}
+              />
+            );
+          }}
+        />
 
-        <ChartCard title="Throughput (total & decode)">
-          {metrics ? (
-            (() => {
-              const total = sumSeries(metrics.prefillTps, metrics.decodeTps);
-              return (
-                <TimeSeriesChart
-                  series={[
-                    {
-                      name: 'Total (avg n=50)',
-                      data: rollingAverage(total, 50),
-                      color: '#3b82f6',
-                      strokeWidth: 1.6,
-                    },
-                    {
-                      name: 'Decode (avg n=50)',
-                      data: rollingAverage(metrics.decodeTps, 50),
-                      color: '#f97316',
-                      strokeWidth: 1.6,
-                    },
-                    {
-                      name: 'Total running avg',
-                      data: cumulativeAverage(total),
-                      color: '#ef4444',
-                      strokeWidth: 3,
-                    },
-                  ]}
-                  durationS={metrics.durationS}
-                  yAxisLabel="Tokens / sec"
-                />
-              );
-            })()
-          ) : (
-            <Skeleton />
-          )}
-        </ChartCard>
+        <ExpandableChart
+          title="Throughput (total & decode)"
+          render={(expanded) => {
+            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+            if (!metrics) return <Skeleton />;
+            const total = sumSeries(metrics.prefillTps, metrics.decodeTps);
+            return (
+              <TimeSeriesChart
+                series={[
+                  {
+                    name: 'Total (avg n=50)',
+                    data: rollingAverage(total, 50),
+                    color: '#3b82f6',
+                    strokeWidth: 1.6,
+                  },
+                  {
+                    name: 'Decode (avg n=50)',
+                    data: rollingAverage(metrics.decodeTps, 50),
+                    color: '#f97316',
+                    strokeWidth: 1.6,
+                  },
+                  {
+                    name: 'Total running avg',
+                    data: cumulativeAverage(total),
+                    color: '#ef4444',
+                    strokeWidth: 3,
+                  },
+                ]}
+                durationS={metrics.durationS}
+                yAxisLabel="Tokens / sec"
+                {...size}
+              />
+            );
+          }}
+        />
 
-        <ChartCard title="Cumulative prompt token source breakdown">
-          {metrics ? (
-            <StackedAreaChart
-              sourceSeries={metrics.promptTokensBySource}
-              durationS={metrics.durationS}
-            />
-          ) : (
-            <Skeleton />
-          )}
-        </ChartCard>
+        <ExpandableChart
+          title="Cumulative prompt token source breakdown"
+          render={(expanded) => {
+            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+            if (!metrics) return <Skeleton />;
+            return (
+              <StackedAreaChart
+                sourceSeries={metrics.promptTokensBySource}
+                durationS={metrics.durationS}
+                {...size}
+              />
+            );
+          }}
+        />
       </div>
     </div>
   );
diff --git a/packages/app/src/components/inference/agentic-point/chart-hover.tsx b/packages/app/src/components/inference/agentic-point/chart-hover.tsx
new file mode 100644
index 00000000..24270122
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/chart-hover.tsx
@@ -0,0 +1,148 @@
+'use client';
+
+import { useState, type ReactNode } from 'react';
+
+/** One tooltip row: a color swatch, a label, and a pre-formatted value. */
+export interface HoverItem {
+  /** CSS color used for the swatch rendered next to the label. */
+  color: string;
+  label: string;
+  value: string;
+  /** Optional faint secondary line. NOTE(review): HoverTooltip does not render this yet. */
+  hint?: string;
+}
+
+interface ChartHoverProps {
+  /** Plot-area padding in viewBox units; should match the padding the chart drew its axes with. */
+  pad: { top: number; right: number; bottom: number; left: number };
+  /** SVG viewBox dimensions used to render the chart. */
+  width: number;
+  height: number;
+  /**
+   * Called with the cursor's normalized x in [0..1] across the plot area.
+   * Return `null` to hide crosshair and tooltip; empty `items` hides only the tooltip.
+   */
+  resolve: (xFraction: number) => { items: HoverItem[]; title?: string } | null;
+  children: ReactNode;
+}
+
+/**
+ * Wrap a chart's <svg> render to add mouse-driven crosshair + tooltip.
+ *
+ * The chart owner renders its bars / lines / axes via `children`; this wrapper
+ * adds a transparent <rect> over the plot area to capture mouse events, a
+ * dashed vertical line at the cursor's x, and a tooltip pinned to the top of
+ * the chart beside that line (see HoverTooltip for the left/right placement).
+ */
+export function ChartHover({ pad, width, height, resolve, children }: ChartHoverProps) {
+  const [hover, setHover] = useState<{
+    /** Crosshair x in viewBox units. */
+    xPx: number;
+    fraction: number;
+    items: HoverItem[];
+    title?: string;
+  } | null>(null);
+
+  const innerW = width - pad.left - pad.right;
+  const innerH = height - pad.top - pad.bottom;
+
+  const onMove = (e: React.MouseEvent<SVGRectElement>) => {
+    const svg = e.currentTarget.ownerSVGElement;
+    if (!svg) return;
+    const rect = svg.getBoundingClientRect();
+    // Convert client x → SVG viewBox x. Only x is needed: the crosshair is
+    // vertical and the tooltip sits at a fixed offset from the top edge.
+    const sx = ((e.clientX - rect.left) * width) / rect.width;
+    const fraction = Math.max(0, Math.min(1, (sx - pad.left) / innerW));
+    const resolved = resolve(fraction);
+    if (!resolved) {
+      setHover(null);
+      return;
+    }
+    setHover({ xPx: sx, fraction, items: resolved.items, title: resolved.title });
+  };
+
+  const onLeave = () => setHover(null);
+
+  return (
+    <div className="relative w-full">
+      <svg
+        viewBox={`0 0 ${width} ${height}`}
+        preserveAspectRatio="xMidYMid meet"
+        className="w-full h-auto text-foreground"
+      >
+        {children}
+        {hover && (
+          <line
+            x1={hover.xPx}
+            x2={hover.xPx}
+            y1={pad.top}
+            y2={pad.top + innerH}
+            stroke="currentColor"
+            strokeWidth={1}
+            strokeDasharray="3 3"
+            opacity={0.4}
+            pointerEvents="none"
+          />
+        )}
+        <rect
+          x={pad.left}
+          y={pad.top}
+          width={innerW}
+          height={innerH}
+          fill="transparent"
+          onMouseMove={onMove}
+          onMouseLeave={onLeave}
+        />
+      </svg>
+      {hover && hover.items.length > 0 && (
+        <HoverTooltip
+          xFraction={hover.fraction}
+          containerWidth={width}
+          padLeft={pad.left}
+          innerW={innerW}
+          title={hover.title}
+          items={hover.items}
+        />
+      )}
+    </div>
+  );
+}
+
+function HoverTooltip({
+  xFraction,
+  containerWidth,
+  padLeft,
+  innerW,
+  title,
+  items,
+}: {
+  xFraction: number;
+  containerWidth: number;
+  padLeft: number;
+  innerW: number;
+  title?: string;
+  items: HoverItem[];
+}) {
+  // xPx is in viewBox units; it is emitted as a % of containerWidth for CSS positioning.
+  // Left of the 55% mark the tooltip sits right of the cursor; past it, it is anchored to the left.
+  const xPx = padLeft + xFraction * innerW;
+  const onRight = xPx < containerWidth * 0.55;
+  const left = onRight ? `${(xPx / containerWidth) * 100}%` : 'auto';
+  const right = onRight ? 'auto' : `${((containerWidth - xPx) / containerWidth) * 100}%`;
+  return (
+    <div
+      className="pointer-events-none absolute top-2 z-10 rounded-md border border-border bg-popover px-2 py-1.5 text-xs shadow-md"
+      style={{ left, right, marginLeft: onRight ? 8 : 0, marginRight: onRight ? 0 : 8 }}
+    >
+      {title && <div className="font-medium text-foreground mb-1">{title}</div>}
+      {items.map((it, i) => (
+        <div key={i} className="flex items-center gap-1.5 leading-tight">
+          <span className="inline-block w-2 h-2 rounded-sm" style={{ backgroundColor: it.color }} />
+          <span className="text-muted-foreground">{it.label}</span>
+          <span className="ml-auto font-medium text-foreground tabular-nums">{it.value}</span>
+        </div>
+      ))}
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/distribution.tsx b/packages/app/src/components/inference/agentic-point/distribution.tsx
index c9a563fe..685b73f3 100644
--- a/packages/app/src/components/inference/agentic-point/distribution.tsx
+++ b/packages/app/src/components/inference/agentic-point/distribution.tsx
@@ -1,140 +1,242 @@
 'use client';
 
-import { useMemo, useRef } from 'react';
+import { useMemo } from 'react';
+
+import { ChartHover, type HoverItem } from './chart-hover';
+
+const fmtNum = (n: number) =>
+  n >= 10000 ? new Intl.NumberFormat('en-US').format(Math.round(n)) : String(Math.round(n));
 
 /**
  * Bar histogram with vertical p50/p75/p90/p95 guide lines. Designed for the
  * detail-page card — fills its container width via `viewBox` + 100% width.
+ * Hover shows the bin range + count + cumulative percentile.
  */
 export function Distribution({
   values,
   unit,
+  width = 720,
   height = 260,
 }: {
   values: readonly number[];
   unit: string;
+  width?: number;
   height?: number;
 }) {
-  const W = 720;
+  const W = width;
   const H = height;
   const PAD = { top: 12, right: 16, bottom: 56, left: 60 };
 
-  const svgParts = useMemo(() => {
-    if (values.length === 0) return { bars: '', guides: '', legend: '', axis: '', yTicks: '' };
+  const computed = useMemo(() => {
+    if (values.length === 0) return null;
     const sorted = [...values].toSorted((a, b) => a - b);
     const min = sorted[0]!;
     const max = sorted.at(-1)!;
     const range = Math.max(1e-9, max - min);
     const innerW = W - PAD.left - PAD.right;
     const innerH = H - PAD.top - PAD.bottom;
-
-    // Sturges-ish, scaled with sample size, capped so bars stay visible.
     const nBins = Math.min(50, Math.max(15, Math.ceil(Math.sqrt(values.length))));
     const counts: number[] = Array.from({ length: nBins }, () => 0);
     for (const v of values) {
       const i = Math.min(nBins - 1, Math.floor(((v - min) / range) * nBins));
       counts[i]!++;
     }
-    const maxCount = Math.max(...counts, 1);
-    const xScale = (v: number) => PAD.left + ((v - min) / range) * innerW;
-    const barW = innerW / nBins;
-
-    const fmt = (n: number) =>
-      n >= 10000 ? new Intl.NumberFormat('en-US').format(Math.round(n)) : String(Math.round(n));
-
-    const quantile = (q: number): number => {
-      const pos = (sorted.length - 1) * q;
-      const lo = Math.floor(pos);
-      const hi = Math.ceil(pos);
-      return sorted[lo]! + (sorted[hi]! - sorted[lo]!) * (pos - lo);
-    };
-
-    const bars = counts
-      .map((c, i) => {
-        const h = (c / maxCount) * innerH;
-        const x = PAD.left + i * barW;
-        const y = PAD.top + (innerH - h);
-        return `<rect x="${x.toFixed(2)}" y="${y.toFixed(2)}" width="${Math.max(0, barW - 1).toFixed(2)}" height="${h.toFixed(2)}" fill="currentColor" opacity="0.55" />`;
-      })
-      .join('');
-
-    const GUIDES = [
-      { label: 'p50', q: 0.5, color: '#3b82f6' },
-      { label: 'p75', q: 0.75, color: '#22c55e' },
-      { label: 'p90', q: 0.9, color: '#f59e0b' },
-      { label: 'p95', q: 0.95, color: '#ef4444' },
-    ] as const;
-    const guides = GUIDES.map(({ q, color }) => {
-      const v = quantile(q);
-      const x = xScale(v);
-      return `<line x1="${x.toFixed(2)}" x2="${x.toFixed(2)}" y1="${PAD.top}" y2="${(PAD.top + innerH).toFixed(2)}" stroke="${color}" stroke-width="2" stroke-dasharray="5 3" opacity="0.95" />`;
-    }).join('');
-
-    // 4-tick x-axis: min, ~33%, ~66%, max
-    const xTickVals = [min, min + range / 3, min + (2 * range) / 3, max];
-    const axisY = PAD.top + innerH + 14;
-    const axisLine = `<line x1="${PAD.left}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${(PAD.top + innerH).toFixed(2)}" y2="${(PAD.top + innerH).toFixed(2)}" stroke="currentColor" opacity="0.2" />`;
-    const xLabels = xTickVals
-      .map((v, i) => {
-        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
-        return `<text x="${xScale(v).toFixed(2)}" y="${axisY}" font-size="11" fill="currentColor" opacity="0.7" text-anchor="${anchor}">${fmt(v)}</text>`;
-      })
-      .join('');
-    const axisTitle = `<text x="${(W / 2).toFixed(2)}" y="${H - 22}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle">value (${unit})</text>`;
-
-    // 5-tick y-axis
-    const yTickVals = Array.from({ length: 5 }, (_, i) => (maxCount * i) / 4);
-    const yScale = (c: number) => PAD.top + (1 - c / maxCount) * innerH;
-    const yTicks = yTickVals
-      .map((v) => {
-        const y = yScale(v);
-        return `<g><line x1="${PAD.left - 4}" x2="${PAD.left}" y1="${y.toFixed(2)}" y2="${y.toFixed(2)}" stroke="currentColor" opacity="0.4" /><text x="${PAD.left - 8}" y="${(y + 3).toFixed(2)}" font-size="10" fill="currentColor" opacity="0.55" text-anchor="end">${fmt(v)}</text></g>`;
-      })
-      .join('');
-    const yAxisLabel = `<text x="${10}" y="${(H / 2).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle" transform="rotate(-90 10 ${(H / 2).toFixed(2)})">count</text>`;
-
-    const chipY = H - 8;
-    const chipW = innerW / GUIDES.length;
-    const legend = GUIDES.map(({ label: ql, q, color }, i) => {
-      const v = quantile(q);
-      const x = PAD.left + i * chipW;
-      return `
-      <line x1="${(x + 2).toFixed(2)}" x2="${(x + 14).toFixed(2)}" y1="${chipY - 4}" y2="${chipY - 4}" stroke="${color}" stroke-width="2" stroke-dasharray="5 3" />
-      <text x="${(x + 18).toFixed(2)}" y="${chipY}" font-size="11" fill="currentColor" opacity="0.9">${ql} ${fmt(v)}</text>`;
-    }).join('');
-
-    return {
-      bars,
-      guides,
-      legend,
-      axis: axisLine + xLabels + axisTitle + yAxisLabel,
-      yTicks,
-    };
-  }, [values, unit, H]);
-
-  const ref = useRef<HTMLDivElement | null>(null);
-
-  if (values.length === 0) {
+    return { sorted, min, max, range, innerW, innerH, nBins, counts };
+  }, [values, W, H, PAD.bottom, PAD.left, PAD.right, PAD.top]);
+
+  if (!computed) {
     return (
       <div className="h-[260px] grid place-items-center text-xs text-muted-foreground">No data</div>
     );
   }
+  const { sorted, min, max, range, innerW, innerH, nBins, counts } = computed;
+  const maxCount = Math.max(...counts, 1);
+  const xScale = (v: number) => PAD.left + ((v - min) / range) * innerW;
+  const yScale = (c: number) => PAD.top + (1 - c / maxCount) * innerH;
+  const barW = innerW / nBins;
+
+  const fmt = fmtNum;
+
+  const quantile = (q: number): number => {
+    const pos = (sorted.length - 1) * q;
+    const lo = Math.floor(pos);
+    const hi = Math.ceil(pos);
+    return sorted[lo]! + (sorted[hi]! - sorted[lo]!) * (pos - lo);
+  };
+
+  const GUIDES = [
+    { label: 'p50', q: 0.5, color: '#3b82f6' },
+    { label: 'p75', q: 0.75, color: '#22c55e' },
+    { label: 'p90', q: 0.9, color: '#f59e0b' },
+    { label: 'p95', q: 0.95, color: '#ef4444' },
+  ] as const;
+
+  // Hover: report the bin range under the cursor, its count, and the share of
+  // all samples that fall at or below that bin's right edge.
+  const resolve = (fraction: number) => {
+    // fraction is already in [0..1], so it maps straight onto a bin index
+    // (no value round-trip); the clamp sends fraction === 1 to the last bin.
+    const binIdx = Math.min(nBins - 1, Math.floor(fraction * nBins));
+    const binLo = min + (binIdx * range) / nBins;
+    const binHi = min + ((binIdx + 1) * range) / nBins;
+    const count = counts[binIdx] ?? 0;
+    // Cumulative % at the bin's right edge.
+    let cumCount = 0;
+    for (let i = 0; i <= binIdx; i++) cumCount += counts[i] ?? 0;
+    const cumPct = (cumCount / values.length) * 100;
+    const items: HoverItem[] = [
+      { color: 'currentColor', label: 'Bin', value: `${fmt(binLo)}–${fmt(binHi)} ${unit}` },
+      { color: 'currentColor', label: 'Count', value: count.toLocaleString() },
+      { color: 'currentColor', label: 'Cumulative', value: `${cumPct.toFixed(1)}%` },
+    ];
+    return { items };
+  };
+
+  const xTickVals = [min, min + range / 3, min + (2 * range) / 3, max];
+  const yTickVals = Array.from({ length: 5 }, (_, i) => (maxCount * i) / 4);
 
   return (
-    <div ref={ref} className="w-full">
+    <div className="w-full">
       <div className="mb-2 text-xs text-muted-foreground">
-        {values.length.toLocaleString()} requests · range {Math.round(Math.min(...values))}–
-        {Math.round(Math.max(...values))} {unit}
+        {values.length.toLocaleString()} requests · range {fmt(min)}–{fmt(max)} {unit}
       </div>
-      <svg
-        viewBox={`0 0 ${W} ${H}`}
-        preserveAspectRatio="xMidYMid meet"
-        className="w-full h-auto text-foreground"
-        dangerouslySetInnerHTML={{
-          __html:
-            svgParts.bars + svgParts.guides + svgParts.axis + svgParts.yTicks + svgParts.legend,
-        }}
-      />
+      <ChartHover pad={PAD} width={W} height={H} resolve={resolve}>
+        {/* y-axis gridlines + labels */}
+        {yTickVals.map((v, i) => {
+          const y = yScale(v);
+          return (
+            <g key={`y${i}`}>
+              <line
+                x1={PAD.left - 4}
+                x2={PAD.left}
+                y1={y}
+                y2={y}
+                stroke="currentColor"
+                opacity={0.4}
+              />
+              <text
+                x={PAD.left - 8}
+                y={y + 3}
+                fontSize={10}
+                fill="currentColor"
+                opacity={0.55}
+                textAnchor="end"
+              >
+                {fmt(v)}
+              </text>
+            </g>
+          );
+        })}
+
+        {/* Bars */}
+        {counts.map((c, i) => {
+          const h = (c / maxCount) * innerH;
+          const x = PAD.left + i * barW;
+          const y = PAD.top + (innerH - h);
+          return (
+            <rect
+              key={i}
+              x={x}
+              y={y}
+              width={Math.max(0, barW - 1)}
+              height={h}
+              fill="currentColor"
+              opacity={0.55}
+            />
+          );
+        })}
+
+        {/* Percentile guide lines */}
+        {GUIDES.map(({ q, color }) => {
+          const v = quantile(q);
+          const x = xScale(v);
+          return (
+            <line
+              key={q}
+              x1={x}
+              x2={x}
+              y1={PAD.top}
+              y2={PAD.top + innerH}
+              stroke={color}
+              strokeWidth={2}
+              strokeDasharray="5 3"
+              opacity={0.95}
+            />
+          );
+        })}
+
+        {/* X axis */}
+        <line
+          x1={PAD.left}
+          x2={PAD.left + innerW}
+          y1={PAD.top + innerH}
+          y2={PAD.top + innerH}
+          stroke="currentColor"
+          opacity={0.2}
+        />
+        {xTickVals.map((v, i) => {
+          const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+          return (
+            <text
+              key={`x${i}`}
+              x={xScale(v)}
+              y={PAD.top + innerH + 14}
+              fontSize={11}
+              fill="currentColor"
+              opacity={0.7}
+              textAnchor={anchor}
+            >
+              {fmt(v)}
+            </text>
+          );
+        })}
+        <text
+          x={W / 2}
+          y={H - 22}
+          fontSize={11}
+          fill="currentColor"
+          opacity={0.55}
+          textAnchor="middle"
+        >
+          value ({unit})
+        </text>
+        <text
+          x={10}
+          y={H / 2}
+          fontSize={11}
+          fill="currentColor"
+          opacity={0.55}
+          textAnchor="middle"
+          transform={`rotate(-90 10 ${H / 2})`}
+        >
+          count
+        </text>
+
+        {/* Percentile legend chips */}
+        {(() => {
+          const chipY = H - 8;
+          const chipW = innerW / GUIDES.length;
+          return GUIDES.map(({ label: ql, q, color }, i) => {
+            const v = quantile(q);
+            const x = PAD.left + i * chipW;
+            return (
+              <g key={ql}>
+                <line
+                  x1={x + 2}
+                  x2={x + 14}
+                  y1={chipY - 4}
+                  y2={chipY - 4}
+                  stroke={color}
+                  strokeWidth={2}
+                  strokeDasharray="5 3"
+                />
+                <text x={x + 18} y={chipY} fontSize={11} fill="currentColor" opacity={0.9}>
+                  {ql} {fmt(v)}
+                </text>
+              </g>
+            );
+          });
+        })()}
+      </ChartHover>
     </div>
   );
 }
diff --git a/packages/app/src/components/inference/agentic-point/expandable-chart.tsx b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx
new file mode 100644
index 00000000..7c8e4538
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx
@@ -0,0 +1,46 @@
+'use client';
+
+import { useState, type ReactNode } from 'react';
+import { Maximize2 } from 'lucide-react';
+
+import { Dialog, DialogContent, DialogHeader, DialogTitle } from '@/components/ui/dialog';
+
+/**
+ * Card with a title + expand button that opens the same chart in a large
+ * dialog. `render(false)` draws the inline chart; `render(true)` is invoked
+ * only while the dialog is open, so the large variant costs nothing when closed.
+ */
+export function ExpandableChart({
+  title,
+  render,
+}: {
+  title: string;
+  render: (expanded: boolean) => ReactNode;
+}) {
+  const [open, setOpen] = useState(false);
+
+  return (
+    <div className="rounded-lg border border-border/40 bg-card/40 p-4">
+      <div className="flex items-start justify-between mb-3 gap-2">
+        <h2 className="text-sm font-semibold text-foreground">{title}</h2>
+        <button
+          type="button"
+          aria-label="Expand chart"
+          onClick={() => setOpen(true)}
+          className="text-muted-foreground hover:text-foreground transition-colors"
+        >
+          <Maximize2 className="size-4" />
+        </button>
+      </div>
+      {render(false)}
+      <Dialog open={open} onOpenChange={setOpen}>
+        <DialogContent className="max-w-[min(96vw,1400px)] w-[min(96vw,1400px)]">
+          <DialogHeader>
+            <DialogTitle>{title}</DialogTitle>
+          </DialogHeader>
+          <div className="w-full">{open ? render(true) : null}</div>
+        </DialogContent>
+      </Dialog>
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index bc081b4e..cd10aff7 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -4,6 +4,8 @@ import { useMemo } from 'react';
 
 import type { TimeSeriesPoint } from '@/hooks/api/use-trace-server-metrics';
 
+import { ChartHover, type HoverItem } from './chart-hover';
+
 interface Series {
   name: string;
   /** The line to draw (caller pre-smooths if desired). */
@@ -21,6 +23,7 @@ interface TimeSeriesChartProps {
   yMax?: number;
   yFmt?: (v: number) => string;
   yAxisLabel?: string;
+  width?: number;
   height?: number;
 }
 
@@ -43,10 +46,7 @@ export function rollingAverage(data: TimeSeriesPoint[], windowSize: number): Tim
   return out;
 }
 
-/**
- * Expanding-window cumulative mean from index 0..i. Useful for "running
- * average over the entire run" lines (red overlay in the throughput chart).
- */
+/** Expanding-window cumulative mean from index 0..i. */
 export function cumulativeAverage(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
   if (data.length === 0) return data;
   const out: TimeSeriesPoint[] = Array.from({ length: data.length });
@@ -68,7 +68,7 @@ export function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSerie
   return out;
 }
 
-const fmtInt = (n: number) =>
+const fmtIntDefault = (n: number) =>
   n >= 10000 ? new Intl.NumberFormat('en-US').format(Math.round(n)) : String(Math.round(n));
 
 const fmtSeconds = (s: number) => {
@@ -78,97 +78,72 @@ const fmtSeconds = (s: number) => {
   return `${m}m ${rem}s`;
 };
 
+/** Linear interpolation at time `t` over a time-sorted series; clamps to the end samples, null if empty. */
+function interpAt(data: TimeSeriesPoint[], t: number): number | null {
+  if (data.length === 0) return null;
+  if (t <= data[0]!.t) return data[0]!.value;
+  if (t >= data.at(-1)!.t) return data.at(-1)!.value;
+  // Binary search for the adjacent pair with data[lo].t <= t < data[hi].t.
+  let lo = 0;
+  let hi = data.length - 1;
+  while (hi - lo > 1) {
+    const mid = (lo + hi) >> 1;
+    if (data[mid]!.t <= t) lo = mid;
+    else hi = mid;
+  }
+  const a = data[lo]!;
+  const b = data[hi]!;
+  if (b.t === a.t) return a.value;
+  const frac = (t - a.t) / (b.t - a.t);
+  return a.value + (b.value - a.value) * frac;
+}
+
 export function TimeSeriesChart({
   series,
   durationS,
   yMax: yMaxOpt,
-  yFmt = fmtInt,
+  yFmt = fmtIntDefault,
   yAxisLabel,
+  width = 720,
   height = 260,
 }: TimeSeriesChartProps) {
-  const W = 720;
+  const W = width;
   const H = height;
   const PAD = { top: 12, right: 16, bottom: 56, left: 60 };
 
-  const inner = useMemo(() => {
+  const layout = useMemo(() => {
     const innerW = W - PAD.left - PAD.right;
     const innerH = H - PAD.top - PAD.bottom;
     const xMax = Math.max(durationS, 1);
     const yMax = yMaxOpt ?? Math.max(1e-9, ...series.flatMap((s) => s.data.map((d) => d.value)));
     const xScale = (t: number) => PAD.left + (t / xMax) * innerW;
     const yScale = (v: number) => PAD.top + (1 - v / yMax) * innerH;
-
-    const subsample = (arr: TimeSeriesPoint[]) => {
-      if (arr.length === 0) return arr;
-      const stride = Math.max(1, Math.floor(arr.length / innerW));
-      return stride > 1 ? arr.filter((_, i) => i % stride === 0) : arr;
-    };
-
-    // Layered render: raw scatter (back) → lines (front). Iterate twice so
-    // emphasis lines (high strokeWidth) draw over everything else.
-    const dotsLayer = series
-      .filter((s) => s.rawData && s.rawData.length > 0)
-      .map((s) =>
-        subsample(s.rawData!)
-          .map((d) => {
-            const x = xScale(d.t);
-            const y = yScale(d.value);
-            return `<circle cx="${x.toFixed(2)}" cy="${y.toFixed(2)}" r="1.5" fill="${s.color}" opacity="0.2" />`;
-          })
-          .join(''),
-      )
-      .join('');
-
-    const lineLayer = series
-      .map((s) => {
-        if (s.data.length === 0) return '';
-        const sampled = subsample(s.data);
-        const pts = sampled.map((d) => [xScale(d.t), yScale(d.value)] as [number, number]);
-        const path = pts
-          .map(([x, y], i) => `${i === 0 ? 'M' : 'L'}${x.toFixed(2)},${y.toFixed(2)}`)
-          .join(' ');
-        return `<path d="${path}" fill="none" stroke="${s.color}" stroke-width="${s.strokeWidth ?? 1.8}" />`;
-      })
-      .join('');
-
-    const paths = dotsLayer + lineLayer;
-
-    // X-axis: 5 ticks at 0..xMax
-    const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
-    const axisY = PAD.top + innerH;
-    const xAxis = `<line x1="${PAD.left}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${axisY.toFixed(2)}" y2="${axisY.toFixed(2)}" stroke="currentColor" opacity="0.2" />${xTickVals
-      .map((v, i) => {
-        const x = xScale(v);
-        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
-        return `<text x="${x.toFixed(2)}" y="${(axisY + 14).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.7" text-anchor="${anchor}">${fmtSeconds(v)}</text>`;
-      })
-      .join('')}`;
-    const xAxisTitle = `<text x="${(W / 2).toFixed(2)}" y="${H - 22}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle">time</text>`;
-
-    // Y-axis: 5 ticks at 0..yMax
-    const yTickVals = Array.from({ length: 5 }, (_, i) => (yMax * i) / 4);
-    const yTicks = yTickVals
-      .map((v) => {
-        const y = yScale(v);
-        return `<g><line x1="${PAD.left - 4}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${y.toFixed(2)}" y2="${y.toFixed(2)}" stroke="currentColor" opacity="0.08" /><text x="${PAD.left - 8}" y="${(y + 3).toFixed(2)}" font-size="10" fill="currentColor" opacity="0.55" text-anchor="end">${yFmt(v)}</text></g>`;
-      })
-      .join('');
-    const yAxisTitle = yAxisLabel
-      ? `<text x="${10}" y="${(H / 2).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle" transform="rotate(-90 10 ${(H / 2).toFixed(2)})">${yAxisLabel}</text>`
-      : '';
-
-    // Legend at the bottom of the SVG
-    const chipY = H - 8;
-    const chipW = innerW / Math.max(1, series.length);
-    const legend = series
-      .map((s, i) => {
-        const x = PAD.left + i * chipW;
-        return `<line x1="${(x + 2).toFixed(2)}" x2="${(x + 14).toFixed(2)}" y1="${chipY - 4}" y2="${chipY - 4}" stroke="${s.color}" stroke-width="${s.strokeWidth ?? 2}" /><text x="${(x + 18).toFixed(2)}" y="${chipY}" font-size="11" fill="currentColor" opacity="0.9">${s.name}</text>`;
-      })
-      .join('');
-
-    return paths + xAxis + xAxisTitle + yTicks + yAxisTitle + legend;
-  }, [series, durationS, yMaxOpt, yFmt, yAxisLabel, H]);
+    return { innerW, innerH, xMax, yMax, xScale, yScale };
+  }, [series, durationS, yMaxOpt, W, H, PAD.bottom, PAD.left, PAD.right, PAD.top]);
+
+  const { innerW, innerH, xMax, yMax, xScale, yScale } = layout;
+
+  const subsample = (arr: TimeSeriesPoint[]) => {
+    if (arr.length === 0) return arr;
+    const stride = Math.max(1, Math.floor(arr.length / innerW));
+    return stride > 1 ? arr.filter((_, i) => i % stride === 0) : arr;
+  };
+
+  // Pre-format axis ticks.
+  const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
+  const yTickVals = Array.from({ length: 5 }, (_, i) => (yMax * i) / 4);
+
+  // Hover: sample every series at the cursor's time; drop empty / non-finite ones.
+  const resolve = (fraction: number) => {
+    const cursorT = fraction * xMax;
+    const items = series.flatMap((s): HoverItem[] => {
+      const y = interpAt(s.data, cursorT);
+      if (y === null || !Number.isFinite(y)) return [];
+      return [{ color: s.color, label: s.name, value: yFmt(y) }];
+    });
+    if (items.length === 0) return null;
+    return { items, title: fmtSeconds(cursorT) };
+  };
 
   if (series.every((s) => s.data.length === 0)) {
     return (
@@ -177,12 +152,146 @@ export function TimeSeriesChart({
   }
 
   return (
-    <svg
-      viewBox={`0 0 ${W} ${H}`}
-      preserveAspectRatio="xMidYMid meet"
-      className="w-full h-auto text-foreground"
-      dangerouslySetInnerHTML={{ __html: inner }}
-    />
+    <ChartHover pad={PAD} width={W} height={H} resolve={resolve}>
+      {/* y-axis gridlines + labels */}
+      {yTickVals.map((v, i) => {
+        const y = yScale(v);
+        return (
+          <g key={`y${i}`}>
+            <line
+              x1={PAD.left - 4}
+              x2={PAD.left + innerW}
+              y1={y}
+              y2={y}
+              stroke="currentColor"
+              opacity={0.08}
+            />
+            <text
+              x={PAD.left - 8}
+              y={y + 3}
+              fontSize={10}
+              fill="currentColor"
+              opacity={0.55}
+              textAnchor="end"
+            >
+              {yFmt(v)}
+            </text>
+          </g>
+        );
+      })}
+
+      {/* Raw scatter underlay */}
+      {series
+        .filter((s) => s.rawData && s.rawData.length > 0)
+        .map((s, si) =>
+          subsample(s.rawData!).map((d, i) => (
+            <circle
+              key={`r${si}-${i}`}
+              cx={xScale(d.t)}
+              cy={yScale(d.value)}
+              r={1.5}
+              fill={s.color}
+              opacity={0.2}
+            />
+          )),
+        )}
+
+      {/* Lines */}
+      {series.map((s, si) => {
+        if (s.data.length === 0) return null;
+        const sampled = subsample(s.data);
+        const path = sampled
+          .map(
+            (d, i) =>
+              `${i === 0 ? 'M' : 'L'}${xScale(d.t).toFixed(2)},${yScale(d.value).toFixed(2)}`,
+          )
+          .join(' ');
+        return (
+          <path
+            key={`l${si}`}
+            d={path}
+            fill="none"
+            stroke={s.color}
+            strokeWidth={s.strokeWidth ?? 1.8}
+          />
+        );
+      })}
+
+      {/* X-axis */}
+      <line
+        x1={PAD.left}
+        x2={PAD.left + innerW}
+        y1={PAD.top + innerH}
+        y2={PAD.top + innerH}
+        stroke="currentColor"
+        opacity={0.2}
+      />
+      {xTickVals.map((v, i) => {
+        const x = xScale(v);
+        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+        return (
+          <text
+            key={`x${i}`}
+            x={x}
+            y={PAD.top + innerH + 14}
+            fontSize={11}
+            fill="currentColor"
+            opacity={0.7}
+            textAnchor={anchor}
+          >
+            {fmtSeconds(v)}
+          </text>
+        );
+      })}
+      <text
+        x={W / 2}
+        y={H - 22}
+        fontSize={11}
+        fill="currentColor"
+        opacity={0.55}
+        textAnchor="middle"
+      >
+        time
+      </text>
+
+      {yAxisLabel && (
+        <text
+          x={10}
+          y={H / 2}
+          fontSize={11}
+          fill="currentColor"
+          opacity={0.55}
+          textAnchor="middle"
+          transform={`rotate(-90 10 ${H / 2})`}
+        >
+          {yAxisLabel}
+        </text>
+      )}
+
+      {/* Legend */}
+      {(() => {
+        const chipY = H - 8;
+        const chipW = innerW / Math.max(1, series.length);
+        return series.map((s, i) => {
+          const x = PAD.left + i * chipW;
+          return (
+            <g key={`leg${i}`}>
+              <line
+                x1={x + 2}
+                x2={x + 14}
+                y1={chipY - 4}
+                y2={chipY - 4}
+                stroke={s.color}
+                strokeWidth={s.strokeWidth ?? 2}
+              />
+              <text x={x + 18} y={chipY} fontSize={11} fill="currentColor" opacity={0.9}>
+                {s.name}
+              </text>
+            </g>
+          );
+        });
+      })()}
+    </ChartHover>
   );
 }
 
@@ -190,19 +299,21 @@ export function TimeSeriesChart({
 export function StackedAreaChart({
   sourceSeries,
   durationS,
+  width = 720,
   height = 260,
 }: {
   sourceSeries: Record<string, TimeSeriesPoint[]>;
   durationS: number;
+  width?: number;
   height?: number;
 }) {
-  const W = 720;
+  const W = width;
   const H = height;
   const PAD = { top: 12, right: 16, bottom: 56, left: 60 };
 
-  const inner = useMemo(() => {
+  const computed = useMemo(() => {
     const entries = Object.entries(sourceSeries).filter(([, v]) => v.length > 0);
-    if (entries.length === 0) return '';
+    if (entries.length === 0) return null;
     const tValues = entries[0]![1].map((p) => p.t);
     const cum: Record<string, number[]> = {};
     for (const [name, arr] of entries) {
@@ -220,92 +331,166 @@ export function StackedAreaChart({
         shares[name]!.push(total > 0 ? (cum[name]?.[i] ?? 0) / total : 0);
       }
     }
-
-    const colors: Record<string, string> = {
-      local_compute: '#f97316',
-      local_cache_hit: '#3b82f6',
-      external_kv_transfer: '#22c55e',
-      miss: '#f97316',
-    };
-    const labelFor: Record<string, string> = {
-      local_compute: 'Prefill',
-      local_cache_hit: 'HBM Cache Hit',
-      external_kv_transfer: 'Offload Cache Hit',
-      miss: 'Miss',
-    };
-
-    const innerW = W - PAD.left - PAD.right;
-    const innerH = H - PAD.top - PAD.bottom;
-    const xMax = Math.max(durationS, 1);
-    const xScale = (t: number) => PAD.left + (t / xMax) * innerW;
-    const yScale = (v: number) => PAD.top + (1 - v) * innerH;
-
-    const stackOrder = Object.keys(shares);
-    const lower: number[] = Array.from({ length: tValues.length }, () => 0);
-    const layers = stackOrder.map((name) => {
-      const upper = shares[name]!.map((v, i) => lower[i]! + v);
-      const top = upper.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]);
-      const bottom = lower.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]);
-      const d = `${top
-        .map(([x, y], i) => `${i === 0 ? 'M' : 'L'}${x.toFixed(2)},${y.toFixed(2)}`)
-        .join(' ')} ${[...bottom]
-        .toReversed()
-        .map(([x, y]) => `L${x.toFixed(2)},${y.toFixed(2)}`)
-        .join(' ')} Z`;
-      const color = colors[name] ?? '#6b7280';
-      const path = `<path d="${d}" fill="${color}" opacity="0.75" />`;
-      for (let i = 0; i < tValues.length; i++) lower[i] = upper[i]!;
-      return { name, color, path };
-    });
-
-    const paths = layers.map((l) => l.path).join('');
-
-    // X-axis
-    const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
-    const axisY = PAD.top + innerH;
-    const xAxis = `<line x1="${PAD.left}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${axisY.toFixed(2)}" y2="${axisY.toFixed(2)}" stroke="currentColor" opacity="0.2" />${xTickVals
-      .map((v, i) => {
-        const x = xScale(v);
-        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
-        return `<text x="${x.toFixed(2)}" y="${(axisY + 14).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.7" text-anchor="${anchor}">${fmtSeconds(v)}</text>`;
-      })
-      .join('')}`;
-    const xAxisTitle = `<text x="${(W / 2).toFixed(2)}" y="${H - 22}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle">time</text>`;
-
-    // Y-axis 0..100%
-    const yTickVals = [0, 0.25, 0.5, 0.75, 1];
-    const yTicks = yTickVals
-      .map((v) => {
-        const y = yScale(v);
-        return `<g><line x1="${PAD.left - 4}" x2="${(PAD.left + innerW).toFixed(2)}" y1="${y.toFixed(2)}" y2="${y.toFixed(2)}" stroke="currentColor" opacity="0.08" /><text x="${PAD.left - 8}" y="${(y + 3).toFixed(2)}" font-size="10" fill="currentColor" opacity="0.55" text-anchor="end">${(v * 100).toFixed(0)}%</text></g>`;
-      })
-      .join('');
-    const yAxisTitle = `<text x="${10}" y="${(H / 2).toFixed(2)}" font-size="11" fill="currentColor" opacity="0.55" text-anchor="middle" transform="rotate(-90 10 ${(H / 2).toFixed(2)})">% of prefill tokens</text>`;
-
-    const chipY = H - 8;
-    const chipW = innerW / Math.max(1, layers.length);
-    const legend = layers
-      .map((l, i) => {
-        const x = PAD.left + i * chipW;
-        return `<rect x="${(x + 2).toFixed(2)}" y="${chipY - 9}" width="12" height="8" fill="${l.color}" opacity="0.75" /><text x="${(x + 18).toFixed(2)}" y="${chipY}" font-size="11" fill="currentColor" opacity="0.9">${labelFor[l.name] ?? l.name}</text>`;
-      })
-      .join('');
-
-    return paths + xAxis + xAxisTitle + yTicks + yAxisTitle + legend;
-  }, [sourceSeries, durationS, H]);
-
-  if (Object.values(sourceSeries).every((v) => v.length === 0)) {
+    return { tValues, shares };
+  }, [sourceSeries]);
+
+  const colors: Record<string, string> = {
+    local_compute: '#f97316',
+    local_cache_hit: '#3b82f6',
+    external_kv_transfer: '#22c55e',
+    miss: '#f97316',
+  };
+  const labelFor: Record<string, string> = {
+    local_compute: 'Prefill',
+    local_cache_hit: 'HBM Cache Hit',
+    external_kv_transfer: 'Offload Cache Hit',
+    miss: 'Miss',
+  };
+
+  if (!computed) {
     return (
       <div className="h-[260px] grid place-items-center text-xs text-muted-foreground">No data</div>
     );
   }
+  const { tValues, shares } = computed;
+
+  const innerW = W - PAD.left - PAD.right;
+  const innerH = H - PAD.top - PAD.bottom;
+  const xMax = Math.max(durationS, 1);
+  const xScale = (t: number) => PAD.left + (t / xMax) * innerW;
+  const yScale = (v: number) => PAD.top + (1 - v) * innerH;
+
+  const stackOrder = Object.keys(shares);
+  const lower: number[] = Array.from({ length: tValues.length }, () => 0);
+  const layers = stackOrder.map((name) => {
+    const upper = shares[name]!.map((v, i) => lower[i]! + v);
+    const top = upper.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]);
+    const bottom = lower.map((v, i) => [xScale(tValues[i]!), yScale(v)] as [number, number]);
+    const d = `${top
+      .map(([x, y], i) => `${i === 0 ? 'M' : 'L'}${x.toFixed(2)},${y.toFixed(2)}`)
+      .join(' ')} ${[...bottom]
+      .toReversed()
+      .map(([x, y]) => `L${x.toFixed(2)},${y.toFixed(2)}`)
+      .join(' ')} Z`;
+    const color = colors[name] ?? '#6b7280';
+    for (let i = 0; i < tValues.length; i++) lower[i] = upper[i]!;
+    return { name, color, d };
+  });
+
+  const resolve = (fraction: number) => {
+    const t = fraction * xMax;
+    // Find the closest tValue index.
+    let idx = 0;
+    let bestDist = Infinity;
+    for (let i = 0; i < tValues.length; i++) {
+      const d = Math.abs(tValues[i]! - t);
+      if (d < bestDist) {
+        bestDist = d;
+        idx = i;
+      }
+    }
+    const items: HoverItem[] = stackOrder.map((name) => ({
+      color: colors[name] ?? '#6b7280',
+      label: labelFor[name] ?? name,
+      value: `${((shares[name]?.[idx] ?? 0) * 100).toFixed(1)}%`,
+    }));
+    return { items, title: fmtSeconds(t) };
+  };
+
+  const xTickVals = Array.from({ length: 5 }, (_, i) => (xMax * i) / 4);
+  const yTickVals = [0, 0.25, 0.5, 0.75, 1];
 
   return (
-    <svg
-      viewBox={`0 0 ${W} ${H}`}
-      preserveAspectRatio="xMidYMid meet"
-      className="w-full h-auto text-foreground"
-      dangerouslySetInnerHTML={{ __html: inner }}
-    />
+    <ChartHover pad={PAD} width={W} height={H} resolve={resolve}>
+      {yTickVals.map((v, i) => {
+        const y = yScale(v);
+        return (
+          <g key={`y${i}`}>
+            <line
+              x1={PAD.left - 4}
+              x2={PAD.left + innerW}
+              y1={y}
+              y2={y}
+              stroke="currentColor"
+              opacity={0.08}
+            />
+            <text
+              x={PAD.left - 8}
+              y={y + 3}
+              fontSize={10}
+              fill="currentColor"
+              opacity={0.55}
+              textAnchor="end"
+            >
+              {(v * 100).toFixed(0)}%
+            </text>
+          </g>
+        );
+      })}
+      {layers.map((l, i) => (
+        <path key={i} d={l.d} fill={l.color} opacity={0.75} />
+      ))}
+      <line
+        x1={PAD.left}
+        x2={PAD.left + innerW}
+        y1={PAD.top + innerH}
+        y2={PAD.top + innerH}
+        stroke="currentColor"
+        opacity={0.2}
+      />
+      {xTickVals.map((v, i) => {
+        const x = xScale(v);
+        const anchor = i === 0 ? 'start' : i === xTickVals.length - 1 ? 'end' : 'middle';
+        return (
+          <text
+            key={`x${i}`}
+            x={x}
+            y={PAD.top + innerH + 14}
+            fontSize={11}
+            fill="currentColor"
+            opacity={0.7}
+            textAnchor={anchor}
+          >
+            {fmtSeconds(v)}
+          </text>
+        );
+      })}
+      <text
+        x={W / 2}
+        y={H - 22}
+        fontSize={11}
+        fill="currentColor"
+        opacity={0.55}
+        textAnchor="middle"
+      >
+        time
+      </text>
+      <text
+        x={10}
+        y={H / 2}
+        fontSize={11}
+        fill="currentColor"
+        opacity={0.55}
+        textAnchor="middle"
+        transform={`rotate(-90 10 ${H / 2})`}
+      >
+        % of prefill tokens
+      </text>
+      {(() => {
+        const chipY = H - 8;
+        const chipW = innerW / Math.max(1, layers.length);
+        return layers.map((l, i) => {
+          const x = PAD.left + i * chipW;
+          return (
+            <g key={`leg${i}`}>
+              <rect x={x + 2} y={chipY - 9} width={12} height={8} fill={l.color} opacity={0.75} />
+              <text x={x + 18} y={chipY} fontSize={11} fill="currentColor" opacity={0.9}>
+                {labelFor[l.name] ?? l.name}
+              </text>
+            </g>
+          );
+        });
+      })()}
+    </ChartHover>
   );
 }

From 1d502ac198495147ef579140121a3e49a9f4349f Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 20:09:55 -0500
Subject: [PATCH 25/96] feat(inference): one chart with TTFT / E2E /
 Interactivity x-axis picker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the always-rendered pair of charts (interactivity + e2e) with a
single chart whose x-axis is chosen by big pill-shaped buttons above the
card. Three options: TTFT (e2e chart with x = p90_ttft), E2E Latency
(e2e chart with x = median_e2el / p90_e2el), Interactivity (interactivity
chart). The inline E2E dropdown is removed — the buttons replace it.

Mode is persisted to ?i_xmode= and defaults by scenario kind:
  agentic   → TTFT
  fixed-seq → Interactivity

Initial state is SSR-stable (always reads URL only) and a post-mount
effect snaps to the kind default if no URL value was provided. The same
effect re-snaps on subsequent sequence-kind switches. The mode setter
also keeps selectedE2eXAxisMetric aligned so the existing useChartData
pipeline resolves the right x-axis for the e2e chart variant.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/app/cypress/support/mock-data.ts     |   2 +
 .../components/inference/InferenceContext.tsx |  45 ++++-
 .../app/src/components/inference/types.ts     |   9 +
 .../components/inference/ui/ChartDisplay.tsx  | 162 ++++++++----------
 packages/app/src/lib/url-state.ts             |   2 +
 5 files changed, 130 insertions(+), 90 deletions(-)

diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts
index 34b89aba..2d3c982f 100644
--- a/packages/app/cypress/support/mock-data.ts
+++ b/packages/app/cypress/support/mock-data.ts
@@ -195,6 +195,8 @@ export function createMockInferenceContext(
     setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'),
     selectedE2eXAxisMetric: null,
     setSelectedE2eXAxisMetric: namedStub('setSelectedE2eXAxisMetric'),
+    selectedXAxisMode: 'interactivity',
+    setSelectedXAxisMode: namedStub('setSelectedXAxisMode'),
     scaleType: 'auto',
     setScaleType: namedStub('setScaleType'),
     isLegendExpanded: true,
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index c80afc2e..00ea316c 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -43,7 +43,7 @@ import {
 import { useUrlState } from '@/hooks/useUrlState';
 import { buildAvailabilityHwKey } from '@/lib/chart-utils';
 import { getHardwareConfig, getModelSortIndex, isKnownGpu, TABLEAU_10 } from '@/lib/constants';
-import { hasMtpEngineExclusion, MODEL_PREFIX_MAPPING } from '@/lib/data-mappings';
+import { hasMtpEngineExclusion, MODEL_PREFIX_MAPPING, sequenceKind } from '@/lib/data-mappings';
 import {
   MtpEngineConflictToast,
   type MtpEngineConflictDetail,
@@ -133,6 +133,26 @@ export function InferenceProvider({
   const [selectedE2eXAxisMetric, setSelectedE2eXAxisMetric] = useState<string | null>(
     () => getUrlParam('i_e2e_xmetric') || 'p90_ttft',
   );
+  // Selected chart variant. Initialize from URL only — SSR cannot read URL, so
+  // computing a kind-based default here would diverge between server and client
+  // and cause a hydration mismatch. The scenario-kind default is applied in a
+  // post-mount effect below (a ref tracks whether the URL or a setter chose the mode).
+  const urlXMode = (() => {
+    const v = getUrlParam('i_xmode');
+    return v === 'ttft' || v === 'e2e' || v === 'interactivity' ? v : null;
+  })();
+  const [selectedXAxisMode, setSelectedXAxisMode] = useState<'ttft' | 'e2e' | 'interactivity'>(
+    urlXMode ?? 'ttft',
+  );
+  const xAxisModeFromUrlRef = useRef(urlXMode !== null);
+  // Wrap the setter so a button click also aligns selectedE2eXAxisMetric — the
+  // existing useChartData pipeline keys off that metric for the e2e chart's x-axis.
+  const handleSetXAxisMode = useCallback((mode: 'ttft' | 'e2e' | 'interactivity') => {
+    xAxisModeFromUrlRef.current = true;
+    setSelectedXAxisMode(mode);
+    if (mode === 'ttft') setSelectedE2eXAxisMetric('p90_ttft');
+    else if (mode === 'e2e') setSelectedE2eXAxisMetric(null);
+  }, []);
   // Latency percentile applied to the chart x-axis for agentic scenarios.
   // Values: 'p90' | 'p99'. Non-agentic charts ignore.
   const [selectedPercentile, setSelectedPercentile] = useState<string>(
@@ -325,6 +345,24 @@ export function InferenceProvider({
     setTrackedConfigs((prev) => (prev.length > 0 ? [] : prev));
   }, [selectedModel, effectiveSequence, effectivePrecisions, selectedYAxisMetric]);
 
+  // Reconcile the x-axis mode with the scenario kind:
+  //  - On mount with no `i_xmode` URL param: snap to the kind's natural default
+  //    (agentic → ttft, fixed → interactivity). The state itself was initialized
+  //    to a SSR-stable constant so server and client render the same DOM; this
+  //    effect fixes it up after hydration.
+  //  - When the user later switches sequence kinds: snap to the new kind's
+  //    natural default (the prior selection was for a different kind, so it
+  //    doesn't carry over).
+  const lastSeqKindRef = useRef<ReturnType<typeof sequenceKind> | null>(null);
+  useEffect(() => {
+    const kind = sequenceKind(effectiveSequence);
+    const isInitialMount = lastSeqKindRef.current === null;
+    if (!isInitialMount && lastSeqKindRef.current === kind) return;
+    lastSeqKindRef.current = kind;
+    if (isInitialMount && xAxisModeFromUrlRef.current) return;
+    handleSetXAxisMode(kind === 'agentic' ? 'ttft' : 'interactivity');
+  }, [effectiveSequence, handleSetXAxisMode]);
+
   // Ref guard: when true, filter changes don't clear the active preset.
   // FavoritePresetsDropdown sets this while applying a preset so its own
   // programmatic setter calls don't accidentally deactivate it.
@@ -785,6 +823,7 @@ export function InferenceProvider({
       i_log: logScale ? '1' : '',
       i_xmetric: selectedXAxisMetric || '',
       i_e2e_xmetric: selectedE2eXAxisMetric || '',
+      i_xmode: selectedXAxisMode,
       i_scale: scaleType,
       i_legend: isLegendExpanded ? '' : '0',
       i_advlabel: useAdvancedLabels ? '1' : '',
@@ -798,6 +837,7 @@ export function InferenceProvider({
       selectedYAxisMetric,
       selectedXAxisMetric,
       selectedE2eXAxisMetric,
+      selectedXAxisMode,
       scaleType,
       selectedGPUs,
       selectedDates,
@@ -968,6 +1008,8 @@ export function InferenceProvider({
       setSelectedXAxisMetric,
       selectedE2eXAxisMetric,
       setSelectedE2eXAxisMetric,
+      selectedXAxisMode,
+      setSelectedXAxisMode: handleSetXAxisMode,
       scaleType,
       setScaleType,
       loading,
@@ -1041,6 +1083,7 @@ export function InferenceProvider({
       selectedYAxisMetric,
       selectedXAxisMetric,
       selectedE2eXAxisMetric,
+      selectedXAxisMode,
       scaleType,
       selectedGPUs,
       selectedDates,
diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index 7a39bbd1..3bbee596 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -532,6 +532,15 @@ export interface InferenceChartContextType {
   setSelectedXAxisMetric: (metric: string | null) => void;
   selectedE2eXAxisMetric: string | null;
   setSelectedE2eXAxisMetric: (metric: string | null) => void;
+  /**
+   * Which chart variant the user wants to see — the inference card shows one chart
+   * at a time, picked by the big TTFT / E2E Latency / Interactivity buttons.
+   * - 'ttft'          → e2e chartType with x-axis forced to p90_ttft
+   * - 'e2e'           → e2e chartType with the chart-config default x-axis (median_e2el / p90_e2el)
+   * - 'interactivity' → interactivity chartType (x = median_intvty / p90_intvty)
+   */
+  selectedXAxisMode: 'ttft' | 'e2e' | 'interactivity';
+  setSelectedXAxisMode: (mode: 'ttft' | 'e2e' | 'interactivity') => void;
   scaleType: 'auto' | 'linear' | 'log';
   setScaleType: (type: 'auto' | 'linear' | 'log') => void;
   setIsLegendExpanded: (metric: boolean) => void;
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index e9021aed..f0611274 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -2,7 +2,7 @@
 import { track } from '@/lib/analytics';
 import dynamic from 'next/dynamic';
 import { useMemo, useRef, useState } from 'react';
-import { BarChart3, ChevronDown, Table2, X } from 'lucide-react';
+import { BarChart3, Table2, X } from 'lucide-react';
 
 import chartDefinitions from '@/components/inference/inference-chart-config.json';
 import { useInference } from '@/components/inference/InferenceContext';
@@ -30,7 +30,6 @@ import {
   DialogHeader,
   DialogTitle,
 } from '@/components/ui/dialog';
-import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/popover';
 import { Skeleton } from '@/components/ui/skeleton';
 import { useUnofficialRun } from '@/components/unofficial-run-provider';
 import {
@@ -60,54 +59,25 @@ const ModelArchitectureDiagram = dynamic(() => import('./ModelArchitectureDiagra
 });
 import WorkflowInfoDisplay from './WorkflowInfoDisplay';
 
-/** Controlled popover dropdown for the e2e chart x-axis toggle. */
-function E2eXAxisDropdown({
-  xAxisLabel,
-  xAxisOptions,
-  selectedValue,
-  onSelect,
-}: {
-  xAxisLabel: string;
-  xAxisOptions: { value: string | null; label: string }[];
-  selectedValue: string | null;
-  onSelect: (value: string | null) => void;
-}) {
-  const [open, setOpen] = useState(false);
-  return (
-    <Popover open={open} onOpenChange={setOpen}>
-      <PopoverTrigger asChild>
-        <button
-          className="inline-flex items-center gap-1 hover:opacity-70 transition-opacity cursor-pointer"
-          onClick={(e) => e.stopPropagation()}
-        >
-          vs. {xAxisLabel}
-          <ChevronDown className="no-export size-3.5 shrink-0 opacity-60" />
-        </button>
-      </PopoverTrigger>
-      <PopoverContent className="w-48 p-1" align="start">
-        {xAxisOptions.map((opt) => (
-          <button
-            key={opt.label}
-            className={`w-full text-left px-3 py-1.5 text-sm rounded hover:bg-accent transition-colors ${
-              (opt.value === null && !selectedValue) || opt.value === selectedValue
-                ? 'font-medium'
-                : ''
-            }`}
-            onClick={() => {
-              onSelect(opt.value);
-              setOpen(false);
-            }}
-          >
-            {opt.label}
-          </button>
-        ))}
-      </PopoverContent>
-    </Popover>
-  );
-}
-
 type InferenceViewMode = 'chart' | 'table';
 
+/**
+ * The three chart variants the user can choose with the big buttons above the
+ * chart card. Each maps to one entry in `inference-chart-config.json` plus a
+ * forced x-axis override for the E2E chartType.
+ */
+type XAxisMode = 'ttft' | 'e2e' | 'interactivity';
+
+interface XAxisModeButton {
+  value: XAxisMode;
+  label: string;
+}
+const X_AXIS_MODE_BUTTONS: XAxisModeButton[] = [
+  { value: 'ttft', label: 'TTFT' },
+  { value: 'e2e', label: 'E2E Latency' },
+  { value: 'interactivity', label: 'Interactivity' },
+];
+
 const VIEW_MODE_OPTIONS: SegmentedToggleOption<InferenceViewMode>[] = [
   {
     value: 'chart',
@@ -152,9 +122,10 @@ export default function ChartDisplay() {
     logScale,
     activeHwTypes,
     activeDates,
-    setSelectedE2eXAxisMetric,
     selectedPercentile,
     compareGpuPair,
+    selectedXAxisMode,
+    setSelectedXAxisMode,
   } = useInference();
 
   const {
@@ -329,17 +300,26 @@ export default function ChartDisplay() {
     }));
   }, [graphs, overlayDataByChartType, selectedModel, selectedSequence]);
 
+  // Show one chart at a time, picked by the TTFT / E2E / Interactivity buttons.
+  // Both 'ttft' and 'e2e' modes render the e2e chart (the x-axis swap is handled
+  // upstream by `selectedE2eXAxisMetric`, which `setSelectedXAxisMode` keeps in sync).
+  const visibleGraphs = useMemo(() => {
+    const wantedType = selectedXAxisMode === 'interactivity' ? 'interactivity' : 'e2e';
+    const filtered = effectiveGraphs.filter((g) => g.chartDefinition.chartType === wantedType);
+    return filtered.length > 0 ? filtered : effectiveGraphs;
+  }, [effectiveGraphs, selectedXAxisMode]);
+
   const displayGraphs = isFirstLoad
-    ? Array.from({ length: 2 }).map((_, index) => (
-        <Card key={`skeleton-${index}`}>
+    ? [
+        <Card key="skeleton-0">
           <Skeleton className="h-7 w-2/4 mb-1" />
           <Skeleton className="h-5 w-3/4 mb-2" />
           <Skeleton className="h-[600px] w-full" />
-        </Card>
-      ))
-    : effectiveGraphs.length === 0
+        </Card>,
+      ]
+    : visibleGraphs.length === 0
       ? []
-      : effectiveGraphs.map((graph, graphIndex) => {
+      : visibleGraphs.map((graph, graphIndex) => {
           const isTimelineMode = Boolean(
             selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0,
           );
@@ -415,43 +395,17 @@ export default function ChartDisplay() {
                               return 'vs. P90 Time To First Token';
                             }
 
-                            // For e2e chart: render clickable inline dropdown for x-axis
+                            // For e2e chart: heading is driven by the TTFT / E2E button
+                            // selection above the card, so the inline dropdown is gone.
                             if (graph.chartDefinition.chartType === 'e2e') {
                               const isAgentic = sequenceKind(selectedSequence) === 'agentic';
                               const pctlWord = selectedPercentile.toUpperCase();
-                              const e2elLabel = isAgentic
-                                ? `${pctlWord} End-to-end Latency`
-                                : 'End-to-end Latency';
-                              const xAxisLabel =
-                                selectedE2eXAxisMetric === 'p90_ttft' ? 'P90 TTFT' : e2elLabel;
-                              const xAxisOptions = [
-                                { value: null, label: e2elLabel },
-                                { value: 'p90_ttft', label: 'P90 TTFT' },
-                              ];
-                              const zoomPrefix =
-                                selectedDateRange.startDate &&
-                                selectedDateRange.endDate &&
-                                selectedGPUs.length > 0
-                                  ? 'gpu_timeseries'
-                                  : 'latency';
-                              return (
-                                <E2eXAxisDropdown
-                                  xAxisLabel={xAxisLabel}
-                                  xAxisOptions={xAxisOptions}
-                                  selectedValue={selectedE2eXAxisMetric}
-                                  onSelect={(value) => {
-                                    setSelectedE2eXAxisMetric(value);
-                                    track('latency_x_axis_metric_selected', {
-                                      metric: value ?? 'median_e2el',
-                                    });
-                                    window.dispatchEvent(
-                                      new CustomEvent(
-                                        `${zoomPrefix}_zoom_reset_chart-${graphIndex}`,
-                                      ),
-                                    );
-                                  }}
-                                />
-                              );
+                              if (selectedE2eXAxisMetric === 'p90_ttft') {
+                                return 'vs. P90 Time To First Token';
+                              }
+                              return isAgentic
+                                ? `vs. ${pctlWord} End-to-end Latency`
+                                : 'vs. End-to-end Latency';
                             }
 
                             // Fall back to the heading baked into chartDefinition
@@ -636,6 +590,36 @@ export default function ChartDisplay() {
           <CustomPowers loading={loading} />
         </section>
       )}
+      <section
+        className="flex flex-wrap justify-center gap-3 sm:gap-4"
+        role="tablist"
+        aria-label="Chart x-axis metric"
+        data-testid="x-axis-mode-buttons"
+      >
+        {X_AXIS_MODE_BUTTONS.map(({ value, label }) => {
+          const isActive = selectedXAxisMode === value;
+          return (
+            <button
+              key={value}
+              type="button"
+              role="tab"
+              aria-selected={isActive}
+              data-testid={`x-axis-mode-${value}`}
+              onClick={() => {
+                setSelectedXAxisMode(value);
+                track('latency_x_axis_mode_selected', { mode: value });
+              }}
+              className={`min-w-[160px] flex-1 sm:flex-initial rounded-full border-2 px-6 py-3 text-base font-semibold transition-colors ${
+                isActive
+                  ? 'border-primary bg-primary text-primary-foreground shadow-sm'
+                  : 'border-border bg-card text-foreground hover:border-primary/60 hover:bg-accent'
+              }`}
+            >
+              {label}
+            </button>
+          );
+        })}
+      </section>
       <div className="flex flex-col gap-4">{displayGraphs}</div>
 
       {/* Performance Over Time — Modal Drill-Down */}
diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts
index 4a48a776..73cbe0b7 100644
--- a/packages/app/src/lib/url-state.ts
+++ b/packages/app/src/lib/url-state.ts
@@ -25,6 +25,7 @@ const URL_STATE_KEYS = [
   'i_pctl',
   'i_xmetric',
   'i_e2e_xmetric',
+  'i_xmode',
   'i_scale',
   'i_gpus',
   'i_dates',
@@ -70,6 +71,7 @@ export const PARAM_DEFAULTS: Record<UrlStateKey, string> = {
   i_pctl: 'p90',
   i_xmetric: 'p90_ttft',
   i_e2e_xmetric: 'p90_ttft',
+  i_xmode: '',
   i_scale: 'auto',
   i_gpus: '',
   i_dates: '',

From 965c8622a36f02a6762388728c855da3ff2aa530 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 20:15:42 -0500
Subject: [PATCH 26/96] fix(inference): TTFT/E2E pick metric by sequence kind +
 add P75 option

Two related fixes for the x-axis-mode picker:

1. Fixed-seq has no p90_ttft / p90_e2el in the metrics JSONB (only
   median/p99). The TTFT button was hardcoded to p90_ttft, so the chart
   went blank on fixed-seq scenarios. Reconcile selectedE2eXAxisMetric in
   a reactive effect that picks median_ttft for fixed-seq and the user's
   selected percentile for agentic. useChartData's TTFT override now
   matches any *_ttft metric and derives its label from the actual
   percentile, instead of hardcoding "P90".

2. Add P75 to the agentic latency percentile selector. Update
   withPercentile + the label/heading regexes to handle p75 and p95.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../components/inference/InferenceContext.tsx | 23 +++++++++++++++++--
 .../inference/hooks/useChartData.ts           | 16 +++++++++----
 .../components/inference/ui/ChartDisplay.tsx  | 10 +++++---
 packages/app/src/lib/benchmark-transform.ts   |  2 +-
 packages/app/src/lib/data-mappings.ts         |  8 ++++---
 5 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 00ea316c..74bdb28b 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -150,8 +150,9 @@ export function InferenceProvider({
   const handleSetXAxisMode = useCallback((mode: 'ttft' | 'e2e' | 'interactivity') => {
     xAxisModeFromUrlRef.current = true;
     setSelectedXAxisMode(mode);
-    if (mode === 'ttft') setSelectedE2eXAxisMetric('p90_ttft');
-    else if (mode === 'e2e') setSelectedE2eXAxisMetric(null);
+    // The e2e chart's x-axis metric is reconciled in a separate effect below,
+    // because it depends on sequence kind (fixed-seq has no p90_* metrics) and
+    // the agentic percentile, both of which can change independently.
   }, []);
   // Latency percentile applied to the chart x-axis for agentic scenarios.
   // Values: 'p90' | 'p99'. Non-agentic charts ignore.
@@ -363,6 +364,24 @@ export function InferenceProvider({
     handleSetXAxisMode(kind === 'agentic' ? 'ttft' : 'interactivity');
   }, [effectiveSequence, handleSetXAxisMode]);
 
+  // Reconcile selectedE2eXAxisMetric whenever the mode, sequence kind, or
+  // agentic percentile changes. For fixed-seq the JSONB only carries
+  // median_* / p99_* (no p90_*), so the TTFT button there has to point at
+  // median_ttft — otherwise the chart goes blank. For agentic, we point at
+  // the user's chosen percentile so the percentile selector actually drives the axis.
+  useEffect(() => {
+    const isAgentic = sequenceKind(effectiveSequence) === 'agentic';
+    if (selectedXAxisMode === 'ttft') {
+      setSelectedE2eXAxisMetric(isAgentic ? `${selectedPercentile}_ttft` : 'median_ttft');
+    } else if (selectedXAxisMode === 'e2e') {
+      // null = use the chart-config natural x (median_e2el), which useChartData
+      // rewrites to <pctl>_e2el for agentic via withPercentile().
+      setSelectedE2eXAxisMetric(null);
+    }
+    // 'interactivity' mode renders the interactivity chart, which keys off
+    // selectedXAxisMetric (not the e2e one), so nothing to do here.
+  }, [selectedXAxisMode, effectiveSequence, selectedPercentile]);
+
   // Ref guard: when true, filter changes don't clear the active preset.
   // FavoritePresetsDropdown sets this while applying a preset so its own
   // programmatic setter calls don't accidentally deactivate it.
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 0d13b8ca..ffa6a8a7 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -215,8 +215,16 @@ export function useChartData(
         // Resolve the effective x-axis override per chart type
         const effectiveXMetric =
           chartDef.chartType === 'e2e' ? selectedE2eXAxisMetric : selectedXAxisMetric;
-        const isTtftOverride = effectiveXMetric === 'p90_ttft';
-        const ttftLabel = 'P90 Time To First Token (s)';
+        // The TTFT override is now any *_ttft metric (not just p90_ttft) — the
+        // x-axis-mode picker reconciles the percentile prefix based on sequence
+        // kind (fixed-seq → median, agentic → user-picked percentile).
+        const isTtftOverride =
+          typeof effectiveXMetric === 'string' && effectiveXMetric.endsWith('_ttft');
+        const ttftPctl = isTtftOverride
+          ? (effectiveXMetric as string).replace(/_ttft$/u, '')
+          : 'p90';
+        const ttftPctlWord = ttftPctl === 'median' ? 'Median' : ttftPctl.toUpperCase();
+        const ttftLabel = `${ttftPctlWord} Time To First Token (s)`;
 
         const isAgentic = selectedSequence === Sequence.AgenticTraces;
 
@@ -261,9 +269,9 @@ export function useChartData(
             selectedPercentile,
           ) as keyof AggDataEntry;
           const pctlWord = selectedPercentile.toUpperCase();
-          xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord);
+          xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P75|P90|P95|P99(?:\.9)?)\b/iu, pctlWord);
           chartHeading = chartHeading.replace(
-            /^(vs\.\s+)(?:(Median|Mean|P90|P99(?:\.9)?)\s+)?/iu,
+            /^(vs\.\s+)(?:(Median|Mean|P75|P90|P95|P99(?:\.9)?)\s+)?/iu,
             `$1${pctlWord} `,
           );
         }
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index f0611274..ca7f9cd7 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -397,12 +397,16 @@ export default function ChartDisplay() {
 
                             // For e2e chart: heading is driven by the TTFT / E2E button
                             // selection above the card, so the inline dropdown is gone.
+                            // The metric carries the percentile prefix (e.g. p90_ttft,
+                            // median_ttft for fixed-seq, p75_ttft for agentic+p75).
                             if (graph.chartDefinition.chartType === 'e2e') {
                               const isAgentic = sequenceKind(selectedSequence) === 'agentic';
-                              const pctlWord = selectedPercentile.toUpperCase();
-                              if (selectedE2eXAxisMetric === 'p90_ttft') {
-                                return 'vs. P90 Time To First Token';
+                              if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
+                                const pctl = selectedE2eXAxisMetric.replace(/_ttft$/u, '');
+                                const word = pctl === 'median' ? 'Median' : pctl.toUpperCase();
+                                return `vs. ${word} Time To First Token`;
                               }
+                              const pctlWord = selectedPercentile.toUpperCase();
                               return isAgentic
                                 ? `vs. ${pctlWord} End-to-end Latency`
                                 : 'vs. End-to-end Latency';
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index c5bdd6ed..ba26a978 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -136,7 +136,7 @@ interface PreparedEntry {
  * percentile prefix; leaves everything else alone.
  */
 export function withPercentile(key: string, percentile: string): string {
-  return key.replace(/^(mean|median|p90|p99|p99\.9)_/, `${percentile}_`);
+  return key.replace(/^(mean|median|p75|p90|p95|p99|p99\.9)_/u, `${percentile}_`);
 }
 
 /**
diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts
index 91f65a34..c18266ba 100644
--- a/packages/app/src/lib/data-mappings.ts
+++ b/packages/app/src/lib/data-mappings.ts
@@ -185,15 +185,17 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[];
 
 /**
  * Percentile of the latency distribution used for the chart x-axis when
- * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants
- * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only p90
- * is surfaced in the UI.
+ * viewing agentic traces. Agentic rows carry median/p75/p90/p95/p99/p99.9
+ * variants for ttft, ttlt (=e2el), and itl (and intvty derived from itl);
+ * p75 and p90 are surfaced in the UI.
  */
 export enum Percentile {
+  P75 = 'p75',
   P90 = 'p90',
 }
 
 const PERCENTILE_CONFIG: Record<Percentile, { label: string }> = {
+  [Percentile.P75]: { label: 'p75' },
   [Percentile.P90]: { label: 'p90' },
 };
 

From e4d97f29bb3ff3a973a7b84113dc61278f70abf8 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 23:18:17 -0500
Subject: [PATCH 27/96] feat(metrics): wire P75/P95 through frontend + register
 new aiperf keys

The P75 percentile option I just added was broken: rowToAggDataEntry
only copied median/mean/p90/p99/p99.9 from the metrics JSONB, so the
chart looked up entry.p75_ttft which didn't exist and points fell to 0.

- Add p75_*/p95_* fields for ttft/tpot/itl/e2el/intvty to AggDataEntry
  and rowToAggDataEntry so the existing percentile pipeline can resolve them.
- Update the energy-metrics test fixture for the new required fields.
- Register all new aiperf metric keys (p75/p95 latencies, qps stats,
  per-request token-count distribution, run totals, server cache hit rates,
  total/input/output tput_tps) in METRIC_KEYS so the ingest auto-capture
  warning stops firing on the next agentic run.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../app/src/components/inference/types.ts     | 10 +++
 packages/app/src/lib/benchmark-transform.ts   | 10 +++
 packages/app/src/lib/energy-metrics.test.ts   | 10 +++
 packages/constants/src/metric-keys.ts         | 66 ++++++++++++++++++-
 4 files changed, 95 insertions(+), 1 deletion(-)

diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index 3bbee596..0a9908e3 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -52,7 +52,9 @@ export interface AggDataEntry {
   mean_ttft: number;
   median_ttft: number;
   std_ttft: number;
+  p75_ttft: number;
   p90_ttft: number;
+  p95_ttft: number;
   p99_ttft: number;
   'p99.9_ttft': number;
   mean_tpot: number;
@@ -61,8 +63,12 @@ export interface AggDataEntry {
   median_intvty: number;
   std_tpot: number;
   std_intvty: number;
+  p75_tpot: number;
+  p75_intvty: number;
   p90_tpot: number;
   p90_intvty: number;
+  p95_tpot: number;
+  p95_intvty: number;
   p99_tpot: number;
   p99_intvty: number;
   'p99.9_tpot': number;
@@ -70,13 +76,17 @@ export interface AggDataEntry {
   mean_itl: number;
   median_itl: number;
   std_itl: number;
+  p75_itl: number;
   p90_itl: number;
+  p95_itl: number;
   p99_itl: number;
   'p99.9_itl': number;
   mean_e2el: number;
   median_e2el: number;
   std_e2el: number;
+  p75_e2el: number;
   p90_e2el: number;
+  p95_e2el: number;
   p99_e2el: number;
   'p99.9_e2el': number;
   disagg: boolean;
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index ba26a978..3594750c 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -64,31 +64,41 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry {
     mean_ttft: m.mean_ttft ?? 0,
     median_ttft: m.median_ttft ?? 0,
     std_ttft: m.std_ttft ?? 0,
+    p75_ttft: m.p75_ttft ?? 0,
     p90_ttft: m.p90_ttft ?? 0,
+    p95_ttft: m.p95_ttft ?? 0,
     p99_ttft: m.p99_ttft ?? 0,
     'p99.9_ttft': m['p99.9_ttft'] ?? 0,
     mean_tpot: m.mean_tpot ?? 0,
     median_tpot: m.median_tpot ?? 0,
     std_tpot: m.std_tpot ?? 0,
+    p75_tpot: m.p75_tpot ?? 0,
     p90_tpot: m.p90_tpot ?? 0,
+    p95_tpot: m.p95_tpot ?? 0,
     p99_tpot: m.p99_tpot ?? 0,
     'p99.9_tpot': m['p99.9_tpot'] ?? 0,
     mean_intvty: m.mean_intvty ?? 0,
     median_intvty: m.median_intvty ?? 0,
     std_intvty: m.std_intvty ?? 0,
+    p75_intvty: m.p75_intvty ?? 0,
     p90_intvty: m.p90_intvty ?? 0,
+    p95_intvty: m.p95_intvty ?? 0,
     p99_intvty: m.p99_intvty ?? 0,
     'p99.9_intvty': m['p99.9_intvty'] ?? 0,
     mean_itl: m.mean_itl ?? 0,
     median_itl: m.median_itl ?? 0,
     std_itl: m.std_itl ?? 0,
+    p75_itl: m.p75_itl ?? 0,
     p90_itl: m.p90_itl ?? 0,
+    p95_itl: m.p95_itl ?? 0,
     p99_itl: m.p99_itl ?? 0,
     'p99.9_itl': m['p99.9_itl'] ?? 0,
     mean_e2el: m.mean_e2el ?? 0,
     median_e2el: m.median_e2el ?? 0,
     std_e2el: m.std_e2el ?? 0,
+    p75_e2el: m.p75_e2el ?? 0,
     p90_e2el: m.p90_e2el ?? 0,
+    p95_e2el: m.p95_e2el ?? 0,
     p99_e2el: m.p99_e2el ?? 0,
     'p99.9_e2el': m['p99.9_e2el'] ?? 0,
     disagg: row.disagg,
diff --git a/packages/app/src/lib/energy-metrics.test.ts b/packages/app/src/lib/energy-metrics.test.ts
index 54788585..2f5844c1 100644
--- a/packages/app/src/lib/energy-metrics.test.ts
+++ b/packages/app/src/lib/energy-metrics.test.ts
@@ -57,7 +57,9 @@ function makeEntry(overrides: Partial<AggDataEntry> = {}): AggDataEntry {
     mean_ttft: 0.5,
     median_ttft: 0.4,
     std_ttft: 0.1,
+    p75_ttft: 0.65,
     p90_ttft: 0.7,
+    p95_ttft: 0.75,
     p99_ttft: 0.8,
     'p99.9_ttft': 0.9,
     mean_tpot: 0.02,
@@ -66,8 +68,12 @@ function makeEntry(overrides: Partial<AggDataEntry> = {}): AggDataEntry {
     median_intvty: 44,
     std_tpot: 0.005,
     std_intvty: 5,
+    p75_tpot: 0.022,
+    p75_intvty: 50,
     p90_tpot: 0.025,
     p90_intvty: 55,
+    p95_tpot: 0.028,
+    p95_intvty: 58,
     p99_tpot: 0.03,
     p99_intvty: 60,
     'p99.9_tpot': 0.035,
@@ -75,13 +81,17 @@ function makeEntry(overrides: Partial<AggDataEntry> = {}): AggDataEntry {
     mean_itl: 0.01,
     median_itl: 0.01,
     std_itl: 0.002,
+    p75_itl: 0.012,
     p90_itl: 0.013,
+    p95_itl: 0.014,
     p99_itl: 0.015,
     'p99.9_itl': 0.018,
     mean_e2el: 5,
     median_e2el: 4.8,
     std_e2el: 0.5,
+    p75_e2el: 5.2,
     p90_e2el: 5.5,
+    p95_e2el: 5.8,
     p99_e2el: 6,
     'p99.9_e2el': 6.5,
     disagg: false,
diff --git a/packages/constants/src/metric-keys.ts b/packages/constants/src/metric-keys.ts
index cf2c4d0b..70e50f96 100644
--- a/packages/constants/src/metric-keys.ts
+++ b/packages/constants/src/metric-keys.ts
@@ -1,46 +1,110 @@
 /**
  * Canonical set of metric keys stored in the benchmark_results.metrics JSONB column.
  *
- * All values are in seconds unless noted otherwise. Throughput values are tokens/sec/GPU.
+ * Latency values (ttft/tpot/itl/e2el) are in seconds; intvty is a rate derived from itl.
+ * Throughput values are tokens/sec — `_per_gpu` is per-GPU, `_tps` is the deployment total.
+ *
+ * Distribution stats (mean/median/std/p75/p90/p95/p99/p99.9) are present for latency,
+ * QPS, and per-request token counts; agentic runs carry the full set, fixed-seq runs
+ * carry median/mean/p99/std for latency only.
  */
 export const METRIC_KEYS = new Set([
   // throughput (tokens/sec/GPU)
   'tput_per_gpu',
   'output_tput_per_gpu',
   'input_tput_per_gpu',
+  // throughput (tokens/sec, deployment total) — agentic aiperf reports both
+  'total_tput_tps',
+  'output_tput_tps',
+  'input_tput_tps',
   // TTFT — time to first token
   'median_ttft',
   'mean_ttft',
+  'p75_ttft',
   'p90_ttft',
+  'p95_ttft',
   'p99_ttft',
   'p99.9_ttft',
   'std_ttft',
   // TPOT — time per output token
   'median_tpot',
   'mean_tpot',
+  'p75_tpot',
   'p90_tpot',
+  'p95_tpot',
   'p99_tpot',
   'p99.9_tpot',
   'std_tpot',
   // ITL — inter-token latency
   'median_itl',
   'mean_itl',
+  'p75_itl',
   'p90_itl',
+  'p95_itl',
   'p99_itl',
   'p99.9_itl',
   'std_itl',
   // E2EL — end-to-end latency
   'median_e2el',
   'mean_e2el',
+  'p75_e2el',
   'p90_e2el',
+  'p95_e2el',
   'p99_e2el',
   'p99.9_e2el',
   'std_e2el',
   // interactivity
   'median_intvty',
   'mean_intvty',
+  'p75_intvty',
   'p90_intvty',
+  'p95_intvty',
   'p99_intvty',
   'p99.9_intvty',
   'std_intvty',
+  // QPS — queries per second (agentic aiperf)
+  'median_qps',
+  'mean_qps',
+  'p75_qps',
+  'p90_qps',
+  'p95_qps',
+  'p99_qps',
+  'p99.9_qps',
+  'std_qps',
+  // per-request input token count distribution
+  'median_input_tokens',
+  'mean_input_tokens',
+  'p75_input_tokens',
+  'p90_input_tokens',
+  'p95_input_tokens',
+  'p99_input_tokens',
+  'p99.9_input_tokens',
+  'std_input_tokens',
+  // per-request output token count distribution — actual served
+  'median_output_tokens_actual',
+  'mean_output_tokens_actual',
+  'p75_output_tokens_actual',
+  'p90_output_tokens_actual',
+  'p95_output_tokens_actual',
+  'p99_output_tokens_actual',
+  'p99.9_output_tokens_actual',
+  'std_output_tokens_actual',
+  // per-request output token count distribution — expected from trace
+  'median_output_tokens_expected',
+  'mean_output_tokens_expected',
+  'p75_output_tokens_expected',
+  'p90_output_tokens_expected',
+  'p95_output_tokens_expected',
+  'p99_output_tokens_expected',
+  'p99.9_output_tokens_expected',
+  'std_output_tokens_expected',
+  // run totals (agentic aiperf)
+  'duration_seconds',
+  'total_requests_completed',
+  'total_prompt_tokens',
+  'total_generation_tokens',
+  // server prefix-cache observability (agentic aiperf)
+  'server_gpu_cache_hit_rate',
+  'server_cpu_cache_hit_rate',
+  'theoretical_cache_hit_rate',
 ]);

From a7a135401f18ad2c24f6c87b25a1a255826309db Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 23:20:54 -0500
Subject: [PATCH 28/96] fix(inference): don't drop agentic TTFT points over 60s
 as outliers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

useChartData hardcoded a 60s latency-limit filter when xAxisField was
'p90_ttft' — meant to suppress fixed-seq overload outliers (conc=2048
rows that compress the rest of the chart to the left). For agentic
runs, TTFTs > 60s are normal (long prompts, multi-turn) so the filter
hid legitimate data points (e.g. only 7/12 visible for the latest B200
DSV4 ingest).

- Skip the latency-limit filter for agentic scenarios in both
  useChartData and processOverlayChartData.
- Broaden the TTFT-override detection from `=== 'p90_ttft'` to any
  `*_ttft` so the new median/p75/p99 percentile picks behave the same.
- Pass isAgentic into processOverlayChartData from ChartDisplay so the
  unofficial-run overlay path matches the official one.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../components/inference/hooks/useChartData.ts   | 10 +++++++---
 .../src/components/inference/ui/ChartDisplay.tsx |  1 +
 packages/app/src/components/inference/utils.ts   | 16 +++++++++++++---
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index ffa6a8a7..2557b0d8 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -350,7 +350,8 @@ export function useChartData(
 
         // Filter to points that have the selected metric, then remap x/y
         const hasMetric = filteredData.some((d) => metricKey in d);
-        const isTtftX = xAxisField === 'p90_ttft';
+        const isTtftX = typeof xAxisField === 'string' && xAxisField.endsWith('_ttft');
+        const isAgentic = selectedSequence === Sequence.AgenticTraces;
         const processedData = hasMetric
           ? filteredData
               .filter((d) => metricKey in d)
@@ -365,11 +366,14 @@ export function useChartData(
                   roof,
                 };
               })
-              // When TTFT is on the x-axis, apply the latency limit to filter overload outliers
-              // (e.g. conc=2048 rows with TTFT > 60s that compress all real data to the far left)
+              // When TTFT is on the x-axis, apply the latency limit to filter
+              // overload outliers (fixed-seq conc=2048 rows with TTFT > 60s that
+              // compress all real data to the far left). Skip for agentic — long
+              // TTFTs there reflect real workloads (multi-turn, big prompts).
               .filter(
                 (d) =>
                   !isTtftX ||
+                  isAgentic ||
                   !chartDefinition.y_latency_limit ||
                   d.x <= chartDefinition.y_latency_limit,
               )
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index ca7f9cd7..12f9f5de 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -183,6 +183,7 @@ export default function ChartDisplay() {
         chartType,
         selectedYAxisMetric,
         effectiveXMetric,
+        { isAgentic: sequenceKind(selectedSequence) === 'agentic' },
       );
 
       let overlayPoints = processed;
diff --git a/packages/app/src/components/inference/utils.ts b/packages/app/src/components/inference/utils.ts
index 735007ab..4876c614 100644
--- a/packages/app/src/components/inference/utils.ts
+++ b/packages/app/src/components/inference/utils.ts
@@ -75,11 +75,13 @@ export function processOverlayChartData(
   chartType: 'e2e' | 'interactivity',
   selectedYAxisMetric: string,
   selectedXAxisMetric: string | null,
+  options?: { isAgentic?: boolean },
 ): InferenceData[] {
   const chartDef = (chartDefinitions as ChartDefinition[]).find((d) => d.chartType === chartType);
   if (!chartDef) return [];
 
   const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey;
+  const isAgentic = options?.isAgentic === true;
 
   // Resolve x-axis field (must match useChartData logic)
   const metricTitle =
@@ -87,8 +89,11 @@ export function processOverlayChartData(
   const isInputMetric = metricTitle.toLowerCase().includes('input');
   let xAxisField: string = chartDef.x;
   // selectedXAxisMetric is already the effective metric for this chart type
-  // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric)
-  const isTtftOverride = selectedXAxisMetric === 'p90_ttft';
+  // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric).
+  // Match any *_ttft metric — the x-axis-mode picker can now select any
+  // percentile (median/p75/p90/p99) depending on sequence kind.
+  const isTtftOverride =
+    typeof selectedXAxisMetric === 'string' && selectedXAxisMetric.endsWith('_ttft');
 
   if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) {
     xAxisField = selectedXAxisMetric;
@@ -108,7 +113,12 @@ export function processOverlayChartData(
     })
     .filter(
       (d) =>
-        xAxisField === chartDef.x || !chartDef.y_latency_limit || d.x <= chartDef.y_latency_limit,
+        // Skip the latency limit for the natural x-axis or for agentic
+        // (long TTFTs are normal there, not overload outliers).
+        xAxisField === chartDef.x ||
+        isAgentic ||
+        !chartDef.y_latency_limit ||
+        d.x <= chartDef.y_latency_limit,
     );
 
   return filterDataByCostLimit(processedData, chartDef, selectedYAxisMetric);

From 07194de6e5df1ca75d1f35085d178a2dc2625493 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 23:42:53 -0500
Subject: [PATCH 29/96] fix(trace-histograms): chunk DB query + blob-cache to
 escape size caps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Loading trace histograms for ~30+ agentic points failed with HTTP 500
because the Neon serverless HTTP driver caps responses at 64 MB, and
each compressed profile_export.jsonl blob is ~1-2 MB — the JOIN
returned all matching blobs in one round-trip and blew the cap. With no
histogram data, the "View charts" button never appears on the tooltip,
so users couldn't open the per-point detail page after the latest run.

- Chunk getTraceHistograms to 12 IDs per query so each round-trip stays
  well under the 64 MB cap. Total payload still merged into one map.
- Switch the route's cachedQuery to blobOnly so the larger JSON
  response doesn't exceed the Next.js unstable_cache 2 MB limit either.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../src/app/api/v1/trace-histograms/route.ts  |  5 +++
 packages/db/src/queries/trace-histograms.ts   | 31 +++++++++++++------
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/packages/app/src/app/api/v1/trace-histograms/route.ts b/packages/app/src/app/api/v1/trace-histograms/route.ts
index fd7572a8..7a959a65 100644
--- a/packages/app/src/app/api/v1/trace-histograms/route.ts
+++ b/packages/app/src/app/api/v1/trace-histograms/route.ts
@@ -10,9 +10,14 @@ import { cachedJson, cachedQuery } from '@/lib/api-cache';
 
 export const dynamic = 'force-dynamic';
 
+// blobOnly: a 50-id histogram payload can easily exceed Next.js's 2MB
+// unstable_cache limit (each point carries one int per request, ~500-1000+
+// requests for agentic), which manifests as a 500 from the route. Blob
+// storage lets us cache the larger response without losing the warm-cache hit.
 const getCachedTraceHistograms = cachedQuery(
   (ids: number[]): Promise<TraceHistogramMap> => getTraceHistograms(getDb(), ids),
   'trace-histograms',
+  { blobOnly: true },
 );
 
 const MAX_IDS_PER_REQUEST = 200;
diff --git a/packages/db/src/queries/trace-histograms.ts b/packages/db/src/queries/trace-histograms.ts
index c243afd8..20ebc0d5 100644
--- a/packages/db/src/queries/trace-histograms.ts
+++ b/packages/db/src/queries/trace-histograms.ts
@@ -27,21 +27,34 @@ export interface TraceHistogramPoint {
 
 export type TraceHistogramMap = Record<number, TraceHistogramPoint>;
 
+/**
+ * Cap the number of blobs we pull in a single Neon HTTP query — the serverless
+ * driver returns 507 ("response is too large, max 64 MB") if the combined gzip
+ * payload exceeds that. Each profile_export.jsonl blob can be ~1-2 MB
+ * compressed, so we stay well below the cap at 12.
+ */
+const QUERY_CHUNK_SIZE = 12;
+
 export async function getTraceHistograms(
   sql: DbClient,
   benchmarkResultIds: number[],
 ): Promise<TraceHistogramMap> {
   if (benchmarkResultIds.length === 0) return {};
 
-  const rows = (await sql`
-    select
-      br.id as benchmark_result_id,
-      atr.profile_export_jsonl_gz as blob
-    from benchmark_results br
-    join agentic_trace_replay atr on atr.id = br.trace_replay_id
-    where br.id = any(${benchmarkResultIds}::bigint[])
-      and atr.profile_export_jsonl_gz is not null
-  `) as { benchmark_result_id: number; blob: Buffer }[];
+  const rows: { benchmark_result_id: number; blob: Buffer }[] = [];
+  for (let i = 0; i < benchmarkResultIds.length; i += QUERY_CHUNK_SIZE) {
+    const chunk = benchmarkResultIds.slice(i, i + QUERY_CHUNK_SIZE);
+    const chunkRows = (await sql`
+      select
+        br.id as benchmark_result_id,
+        atr.profile_export_jsonl_gz as blob
+      from benchmark_results br
+      join agentic_trace_replay atr on atr.id = br.trace_replay_id
+      where br.id = any(${chunk}::bigint[])
+        and atr.profile_export_jsonl_gz is not null
+    `) as { benchmark_result_id: number; blob: Buffer }[];
+    rows.push(...chunkRows);
+  }
 
   const result: TraceHistogramMap = {};
   for (const row of rows) {

From a1e594b34a8faa181af01e6c8449498eafa7e086 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 20 May 2026 23:48:54 -0500
Subject: [PATCH 30/96] feat(inference): run selector actually filters chart
 data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When two workflow runs land on the same date (e.g. re-ingesting a
config), switching the run picker between "Run 1/2" and "Run 2/2" had
no effect on the chart: the benchmarks API returned DISTINCT ON
(config, conc, isl, osl) ordered by date with no run tiebreaker, so
Postgres arbitrarily picked one row per config and both picker
selections produced identical data.

Plumb runId through the request path:
- getLatestBenchmarks gets an optional runId branch that strictly
  scopes to one workflow_run (filter wr.github_run_id = $runId).
- /api/v1/benchmarks accepts ?runId=…, forwarded into the cached query
  so each run has its own blob-cache entry.
- fetchBenchmarks → benchmarkQueryOptions → useBenchmarks pass the
  runId through; React Query keys it for separate caches per run.
- useChartData accepts selectedRunId and forwards it.
- InferenceProvider only passes runId when the current date has >1
  runs — single-run dates keep the existing latest-per-config logic
  so configs from earlier dates remain visible.

Verified in the dashboard: switching Run 1/2 ↔ Run 2/2 fires distinct
requests with the correct runId and the chart re-renders per-run.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../src/app/api/v1/benchmarks/route.test.ts   | 24 +++++++++-
 .../app/src/app/api/v1/benchmarks/route.ts    |  7 +--
 .../components/inference/InferenceContext.tsx |  9 ++++
 .../inference/hooks/useChartData.ts           | 11 ++++-
 .../app/src/hooks/api/use-benchmarks.test.ts  | 21 +++++++-
 packages/app/src/hooks/api/use-benchmarks.ts  | 10 ++--
 packages/app/src/lib/api.ts                   |  3 ++
 packages/db/src/queries/benchmarks.ts         | 48 +++++++++++++++++++
 8 files changed, 122 insertions(+), 11 deletions(-)

diff --git a/packages/app/src/app/api/v1/benchmarks/route.test.ts b/packages/app/src/app/api/v1/benchmarks/route.test.ts
index 780f775e..92d5f326 100644
--- a/packages/app/src/app/api/v1/benchmarks/route.test.ts
+++ b/packages/app/src/app/api/v1/benchmarks/route.test.ts
@@ -59,6 +59,7 @@ describe('GET /api/v1/benchmarks', () => {
       ['dsr1'],
       undefined,
       undefined,
+      undefined,
     );
   });
 
@@ -72,6 +73,7 @@ describe('GET /api/v1/benchmarks', () => {
       ['dsr1'],
       '2026-03-01',
       undefined,
+      undefined,
     );
   });
 
@@ -82,7 +84,27 @@ describe('GET /api/v1/benchmarks', () => {
       req('/api/v1/benchmarks?model=DeepSeek-R1-0528&date=2026-03-01&exact=true'),
     );
     expect(res.status).toBe(200);
-    expect(mockGetLatestBenchmarks).toHaveBeenCalledWith('mock-sql', ['dsr1'], '2026-03-01', true);
+    expect(mockGetLatestBenchmarks).toHaveBeenCalledWith(
+      'mock-sql',
+      ['dsr1'],
+      '2026-03-01',
+      true,
+      undefined,
+    );
+  });
+
+  it('passes runId param to query when provided', async () => {
+    mockGetLatestBenchmarks.mockResolvedValueOnce([]);
+
+    const res = await GET(req('/api/v1/benchmarks?model=DeepSeek-R1-0528&runId=26194160120'));
+    expect(res.status).toBe(200);
+    expect(mockGetLatestBenchmarks).toHaveBeenCalledWith(
+      'mock-sql',
+      ['dsr1'],
+      undefined,
+      undefined,
+      '26194160120',
+    );
   });
 
   it('returns 500 when query throws', async () => {
diff --git a/packages/app/src/app/api/v1/benchmarks/route.ts b/packages/app/src/app/api/v1/benchmarks/route.ts
index c79f1aa7..c4037208 100644
--- a/packages/app/src/app/api/v1/benchmarks/route.ts
+++ b/packages/app/src/app/api/v1/benchmarks/route.ts
@@ -11,10 +11,10 @@ import { loadFixture } from '@/lib/test-fixtures';
 export const dynamic = 'force-dynamic';
 
 const getCachedBenchmarks = cachedQuery(
-  (dbModelKeys: string[], date?: string, exact?: boolean) => {
+  (dbModelKeys: string[], date?: string, exact?: boolean, runId?: string) => {
     if (JSON_MODE)
       return Promise.resolve(jsonProvider.getLatestBenchmarks(dbModelKeys, date, exact));
-    return getLatestBenchmarks(getDb(), dbModelKeys, date, exact);
+    return getLatestBenchmarks(getDb(), dbModelKeys, date, exact, runId);
   },
   'benchmarks',
   { blobOnly: true },
@@ -25,6 +25,7 @@ export async function GET(request: NextRequest) {
   const model = params.get('model') ?? '';
   const date = params.get('date') ?? undefined;
   const exact = params.get('exact') === 'true';
+  const runId = params.get('runId') ?? undefined;
   const dbModelKeys = DISPLAY_MODEL_TO_DB[model];
   if (!dbModelKeys || dbModelKeys.length === 0) {
     return NextResponse.json({ error: 'Unknown model' }, { status: 400 });
@@ -32,7 +33,7 @@ export async function GET(request: NextRequest) {
   if (FIXTURES_MODE) return cachedJson(loadFixture('benchmarks'));
 
   try {
-    const rows = await getCachedBenchmarks(dbModelKeys, date, exact || undefined);
+    const rows = await getCachedBenchmarks(dbModelKeys, date, exact || undefined, runId);
     return cachedJson(rows);
   } catch (error) {
     console.error('Error fetching benchmarks:', error);
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 74bdb28b..edf0974e 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -214,6 +214,14 @@ export function InferenceProvider({
   // ── Data fetching (gated by isActive) ──────────────────────────────────────
   const latestDate = availableDates.length > 0 ? availableDates.at(-1) : undefined;
 
+  // Run-selector scoping: only constrain benchmark data to a specific run when
+  // the current date has >1 runs (ambiguous case). When there's one run per
+  // date, the picker is informational and the SQL's latest-per-config logic
+  // already returns that run's data — passing runId would needlessly narrow
+  // the cross-date config view.
+  const multipleRunsOnDate = availableRuns && Object.keys(availableRuns).length > 1;
+  const benchmarkRunId = multipleRunsOnDate && selectedRunId ? String(selectedRunId) : undefined;
+
   const {
     graphs,
     loading: chartDataLoading,
@@ -236,6 +244,7 @@ export function InferenceProvider({
     latestDate,
     selectedPercentile,
     compareGpuPair ?? null,
+    benchmarkRunId,
   );
 
   // For GPU comparison date picker — use shared availability data from global filters
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 2557b0d8..328750f0 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -86,10 +86,19 @@ export function useChartData(
   selectedPercentile = 'p90',
   /** When set, only series for these two registry GPU keys are shown (compare pages). */
   compareGpuPair?: readonly [string, string] | null,
+  /**
+   * GitHub run id (g_runid) from the run picker. When set, the benchmarks API
+   * scopes results to that workflow run instead of returning the latest per
+   * config — disambiguates when two runs land on the same date.
+   */
+  selectedRunId?: string,
 ) {
   // When the selected date is the latest available, use '' (empty string) to match
   // the initial no-date query key, reusing the eagerly-fetched benchmarks from the
   // materialized view instead of firing a redundant second fetch with identical data.
+  // When a specific run is selected, we always go through the runId branch and the
+  // date is effectively ignored — keep queryDate set so React Query still has a
+  // distinct cache key per date if the user navigates back to "latest".
   const queryDate =
     selectedRunDate && latestAvailableDate && selectedRunDate === latestAvailableDate
       ? ''
@@ -99,7 +108,7 @@ export function useChartData(
     data: allRows,
     isLoading: queryLoading,
     error: queryError,
-  } = useBenchmarks(selectedModel, queryDate, enabled);
+  } = useBenchmarks(selectedModel, queryDate, enabled, selectedRunId);
 
   // GPU comparison: fetch data for each additional comparison date
   const comparisonDates = useMemo(
diff --git a/packages/app/src/hooks/api/use-benchmarks.test.ts b/packages/app/src/hooks/api/use-benchmarks.test.ts
index 7329896d..c4f49130 100644
--- a/packages/app/src/hooks/api/use-benchmarks.test.ts
+++ b/packages/app/src/hooks/api/use-benchmarks.test.ts
@@ -5,12 +5,29 @@ import { benchmarkQueryOptions } from '@/hooks/api/use-benchmarks';
 describe('benchmarkQueryOptions', () => {
   it('builds query key from model and date', () => {
     const opts = benchmarkQueryOptions('DeepSeek-R1-0528', '2026-03-01');
-    expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'latest']);
+    expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'latest', '']);
   });
 
   it('builds exact query key when exact=true', () => {
     const opts = benchmarkQueryOptions('DeepSeek-R1-0528', '2026-03-01', true, true);
-    expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'exact']);
+    expect(opts.queryKey).toEqual(['benchmarks', 'DeepSeek-R1-0528', '2026-03-01', 'exact', '']);
+  });
+
+  it('includes runId in query key when provided', () => {
+    const opts = benchmarkQueryOptions(
+      'DeepSeek-R1-0528',
+      '2026-03-01',
+      true,
+      false,
+      '26194160120',
+    );
+    expect(opts.queryKey).toEqual([
+      'benchmarks',
+      'DeepSeek-R1-0528',
+      '2026-03-01',
+      'latest',
+      '26194160120',
+    ]);
   });
 
   it('produces distinct keys for different models', () => {
diff --git a/packages/app/src/hooks/api/use-benchmarks.ts b/packages/app/src/hooks/api/use-benchmarks.ts
index 6da1568e..8fd1f4e9 100644
--- a/packages/app/src/hooks/api/use-benchmarks.ts
+++ b/packages/app/src/hooks/api/use-benchmarks.ts
@@ -8,14 +8,16 @@ export function benchmarkQueryOptions(
   date: string,
   enabled = true,
   exact?: boolean,
+  runId?: string,
 ) {
   return {
-    queryKey: ['benchmarks', model, date, exact ? 'exact' : 'latest'] as const,
-    queryFn: ({ signal }: { signal: AbortSignal }) => fetchBenchmarks(model, date, exact, signal),
+    queryKey: ['benchmarks', model, date, exact ? 'exact' : 'latest', runId ?? ''] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) =>
+      fetchBenchmarks(model, date, exact, signal, runId),
     enabled: enabled && Boolean(model),
   };
 }
 
-export function useBenchmarks(model: string, date?: string, enabled = true) {
-  return useQuery(benchmarkQueryOptions(model, date ?? 'latest', enabled));
+export function useBenchmarks(model: string, date?: string, enabled = true, runId?: string) {
+  return useQuery(benchmarkQueryOptions(model, date ?? 'latest', enabled, undefined, runId));
 }
diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts
index 98587c2f..31cf906a 100644
--- a/packages/app/src/lib/api.ts
+++ b/packages/app/src/lib/api.ts
@@ -121,10 +121,13 @@ export function fetchBenchmarks(
   date?: string,
   exact?: boolean,
   signal?: AbortSignal,
+  /** Optional github_run_id to scope to a specific workflow run. */
+  runId?: string,
 ) {
   const params = new URLSearchParams({ model });
   if (date) params.set('date', date);
   if (exact) params.set('exact', 'true');
+  if (runId) params.set('runId', runId);
   return fetchJson<BenchmarkRow[]>(`/api/v1/benchmarks?${params}`, signal);
 }
 
diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts
index 36bb0e65..2291dc0c 100644
--- a/packages/db/src/queries/benchmarks.ts
+++ b/packages/db/src/queries/benchmarks.ts
@@ -53,8 +53,56 @@ export async function getLatestBenchmarks(
   modelKey: string | string[],
   date?: string,
   exact?: boolean,
+  /**
+   * If set, filter to a specific GitHub Actions workflow run.
+   * Bypasses the "latest per config" logic — when two runs landed on the same
+   * date and the user picked one in the run selector, this scopes the chart
+   * data to that run only. Value matches the URL param `g_runid` (a
+   * stringified github_run_id, not the DB id).
+   */
+  runId?: string,
 ): Promise<BenchmarkRow[]> {
   const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey];
+  if (runId) {
+    const rows = await sql`
+      SELECT
+        br.id,
+        c.hardware,
+        c.framework,
+        c.model,
+        c.precision,
+        c.spec_method,
+        c.disagg,
+        c.is_multinode,
+        c.prefill_tp,
+        c.prefill_ep,
+        c.prefill_dp_attention,
+        c.prefill_num_workers,
+        c.decode_tp,
+        c.decode_ep,
+        c.decode_dp_attention,
+        c.decode_num_workers,
+        c.num_prefill_gpu,
+        c.num_decode_gpu,
+        br.benchmark_type,
+        br.offload_mode,
+        br.isl,
+        br.osl,
+        br.conc,
+        br.image,
+        br.metrics,
+        br.date::text,
+        CASE WHEN wr.html_url IS NOT NULL THEN wr.html_url || '/attempts/' || wr.run_attempt ELSE NULL END AS run_url
+      FROM benchmark_results br
+      JOIN configs c ON c.id = br.config_id
+      JOIN latest_workflow_runs wr ON wr.id = br.workflow_run_id
+      WHERE c.model = ANY(${modelKeys})
+        AND br.error IS NULL
+        AND wr.github_run_id = ${runId}::bigint
+      ORDER BY br.config_id, br.conc, br.isl, br.osl
+    `;
+    return rows as unknown as BenchmarkRow[];
+  }
   if (date) {
     // Date-filtered: use base table with DISTINCT ON (the view only has the absolute latest)
     // exact=true: only return data from this exact date (for GPU comparison)

From b0d228abeb344aa2ced0e2c5ab2ac43e0128a17e Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 21 May 2026 00:11:34 -0500
Subject: [PATCH 31/96] feat(inference): Session Time + Prefill TPS x-axis
 (live from trace blobs)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds two new agentic-only chart variants per
https://gist.github.com/xinli-sw/115d370c17f6d1b977878b68530981fa, computed
live from the stored aiperf profile_export.jsonl blobs (no backfill needed):

- Session Time: mean across sessions of Σ per-turn request_latency,
  rescaled by mean_load / session_load. The summed-latency definition
  inherently strips inter-turn tool/thinking gaps (only GPU active time
  contributes).
- Prefill TPS / user: per turn ISL / TTFT, P90 across the session's turns,
  mean across sessions. Captures worst-turn prefill responsiveness.

The buttons only show on agentic scenarios (gated by a mounted flag to
keep SSR identical to the first client render). Roofline corners match the
expected Pareto direction: Session Time sweeps bottom-left → top-right;
Prefill TPS sweeps top-left → bottom-right.

Plumbing:
- New `getDerivedAgenticMetrics(sql, ids)` in packages/db chunks JSONL
  blob loads to 6 per query so we stay under Neon's 64 MB cap. Includes
  5-case unit suite for the math.
- New `/api/v1/derived-agentic-metrics` route + `useDerivedAgenticMetrics`
  hook, mirroring trace-histograms (blob-cached).
- ChartDisplay fetches derived metrics for visible agentic point IDs and
  overrides scatter data.x + chart heading + axis label + roofline corner.

Two side-effects fixed along the way:
- Hydration mismatch from URL-driven initial state: x-axis-mode now seeds
  from a fixed default and applies the URL value post-mount.
- Run-selector scoping regression: DSR1 (no model-matching changelog on
  its date) fetched with a runId from a different model's run and got
  zero rows. runId is now passed only when >1 runs have a CHANGELOG that
  explicitly mentions the current model + precision and the selected run
  is one of them.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/app/cypress/support/mock-data.ts     |   2 +-
 .../api/v1/derived-agentic-metrics/route.ts   |  68 ++++++
 .../components/inference/InferenceContext.tsx |  93 ++++++--
 .../app/src/components/inference/types.ts     |  10 +-
 .../components/inference/ui/ChartDisplay.tsx  | 114 +++++++--
 .../hooks/api/use-derived-agentic-metrics.ts  |  41 ++++
 .../queries/derived-agentic-metrics.test.ts   |  96 ++++++++
 .../db/src/queries/derived-agentic-metrics.ts | 224 ++++++++++++++++++
 8 files changed, 612 insertions(+), 36 deletions(-)
 create mode 100644 packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
 create mode 100644 packages/app/src/hooks/api/use-derived-agentic-metrics.ts
 create mode 100644 packages/db/src/queries/derived-agentic-metrics.test.ts
 create mode 100644 packages/db/src/queries/derived-agentic-metrics.ts

diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts
index 2d3c982f..152e3f98 100644
--- a/packages/app/cypress/support/mock-data.ts
+++ b/packages/app/cypress/support/mock-data.ts
@@ -195,7 +195,7 @@ export function createMockInferenceContext(
     setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'),
     selectedE2eXAxisMetric: null,
     setSelectedE2eXAxisMetric: namedStub('setSelectedE2eXAxisMetric'),
-    selectedXAxisMode: 'interactivity',
+    selectedXAxisMode: 'interactivity' as const,
     setSelectedXAxisMode: namedStub('setSelectedXAxisMode'),
     scaleType: 'auto',
     setScaleType: namedStub('setScaleType'),
diff --git a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
new file mode 100644
index 00000000..e5f6e0b2
--- /dev/null
+++ b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
@@ -0,0 +1,68 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getDerivedAgenticMetrics,
+  type DerivedAgenticMetricMap,
+} from '@semianalysisai/inferencex-db/queries/derived-agentic-metrics';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+// blobOnly: the response is one entry per id with two numbers, but the
+// derivation work parses thousands of JSONL records per blob — cache the
+// computed result so a chart-refresh hits the warm path.
+const getCachedDerivedAgenticMetrics = cachedQuery(
+  (ids: number[]): Promise<DerivedAgenticMetricMap> => getDerivedAgenticMetrics(getDb(), ids),
+  'derived-agentic-metrics',
+  { blobOnly: true },
+);
+
+const MAX_IDS_PER_REQUEST = 200;
+
+/**
+ * GET /api/v1/derived-agentic-metrics?ids=1,2,3
+ *
+ * Returns per-id derived metrics computed live from the stored aiperf
+ * profile_export.jsonl blobs:
+ *  - normalized_session_time_s: mean across sessions of session e2e time
+ *    (Σ per-turn request_latency) rescaled by mean_load / session_load.
+ *  - mean_p90_prefill_tps_per_user: mean across sessions of P90 (over the
+ *    session's turns) prefill TPS/user (ISL / TTFT).
+ *
+ * Ids without a trace_replay blob or with unparseable records are omitted.
+ */
+export async function GET(request: NextRequest) {
+  const raw = request.nextUrl.searchParams.get('ids');
+  if (!raw) {
+    return NextResponse.json({ error: 'ids query param is required' }, { status: 400 });
+  }
+
+  const ids = [
+    ...new Set(
+      raw
+        .split(',')
+        .map((s) => Number(s.trim()))
+        // Ids must be positive whole numbers: also drops fractions such as `1.5`.
+        .filter((n) => Number.isSafeInteger(n) && n > 0),
+    ),
+  ];
+  if (ids.length === 0) {
+    return NextResponse.json({ error: 'no valid ids provided' }, { status: 400 });
+  }
+  if (ids.length > MAX_IDS_PER_REQUEST) {
+    return NextResponse.json(
+      { error: `too many ids (max ${MAX_IDS_PER_REQUEST})` },
+      { status: 400 },
+    );
+  }
+
+  try {
+    const sorted = ids.toSorted((a, b) => a - b);
+    return cachedJson(await getCachedDerivedAgenticMetrics(sorted));
+  } catch (error) {
+    console.error('Error fetching derived agentic metrics:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index edf0974e..2e5a245f 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -137,17 +137,32 @@ export function InferenceProvider({
   // computing a kind-based default here would diverge between server and client
   // and cause a hydration mismatch. The scenario-kind default is applied in a
   // post-mount effect below (and a ref tracks whether the user has overridden).
-  const urlXMode = (() => {
+  type XAxisMode = 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps';
+  const VALID_X_MODES: XAxisMode[] = [
+    'ttft',
+    'e2e',
+    'interactivity',
+    'session-time',
+    'prefill-tps',
+  ];
+  // SSR has no URL access, so seed with a fixed default and apply the URL
+  // value (if any) in a post-mount effect — keeps server + client first render
+  // identical and avoids "didn't match" hydration warnings when the URL holds
+  // a non-default mode.
+  const [selectedXAxisMode, setSelectedXAxisMode] = useState<XAxisMode>('ttft');
+  const xAxisModeFromUrlRef = useRef(false);
+  useEffect(() => {
+    if (xAxisModeFromUrlRef.current) return;
     const v = getUrlParam('i_xmode');
-    return v === 'ttft' || v === 'e2e' || v === 'interactivity' ? v : null;
-  })();
-  const [selectedXAxisMode, setSelectedXAxisMode] = useState<'ttft' | 'e2e' | 'interactivity'>(
-    urlXMode ?? 'ttft',
-  );
-  const xAxisModeFromUrlRef = useRef(urlXMode !== null);
+    if (v && (VALID_X_MODES as string[]).includes(v)) {
+      xAxisModeFromUrlRef.current = true;
+      setSelectedXAxisMode(v as XAxisMode);
+    }
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, []);
   // Wrap the setter so a button click also aligns selectedE2eXAxisMetric — the
   // existing useChartData pipeline keys off that flag for the e2e chart's x-axis.
-  const handleSetXAxisMode = useCallback((mode: 'ttft' | 'e2e' | 'interactivity') => {
+  const handleSetXAxisMode = useCallback((mode: XAxisMode) => {
     xAxisModeFromUrlRef.current = true;
     setSelectedXAxisMode(mode);
     // The e2e chart's x-axis metric is reconciled in a separate effect below,
@@ -215,12 +230,37 @@ export function InferenceProvider({
   const latestDate = availableDates.length > 0 ? availableDates.at(-1) : undefined;
 
   // Run-selector scoping: only constrain benchmark data to a specific run when
-  // the current date has >1 runs (ambiguous case). When there's one run per
-  // date, the picker is informational and the SQL's latest-per-config logic
-  // already returns that run's data — passing runId would needlessly narrow
-  // the cross-date config view.
-  const multipleRunsOnDate = availableRuns && Object.keys(availableRuns).length > 1;
-  const benchmarkRunId = multipleRunsOnDate && selectedRunId ? String(selectedRunId) : undefined;
+  // there's actually a disambiguation to make for the CURRENT model. The
+  // raw `availableRuns` is across ALL models on the date, so the picker may
+  // auto-select a run that produced nothing for the current model — passing
+  // that runId would return zero rows and hide the chart entirely.
+  // Compute the set of runs whose CHANGELOG explicitly mentions this model +
+  // precision. We can't reuse `filterRunsByModel` here because it has a
+  // fallback that returns all runs when nothing matches (so the picker still
+  // renders) — which would make us pass a runId that produced no rows for
+  // the current model, hiding the chart.
+  const modelPrefixesEarly = Object.entries(MODEL_PREFIX_MAPPING)
+    .filter(([, model]) => model === selectedModel)
+    .map(([prefix]) => prefix);
+  const runIdsWithModelChangelog: string[] = [];
+  if (availableRuns) {
+    for (const [runId, runInfo] of Object.entries(availableRuns)) {
+      if (!runInfo.changelog) continue;
+      const matches = runInfo.changelog.entries.some((entry) =>
+        entry.config_keys.some((key) => {
+          const parts = key.split('-');
+          return modelPrefixesEarly.includes(parts[0]!) && effectivePrecisions.includes(parts[1]!);
+        }),
+      );
+      if (matches) runIdsWithModelChangelog.push(runId);
+    }
+  }
+  const benchmarkRunId =
+    selectedRunId &&
+    runIdsWithModelChangelog.length > 1 &&
+    runIdsWithModelChangelog.includes(selectedRunId)
+      ? String(selectedRunId)
+      : undefined;
 
   const {
     graphs,
@@ -367,11 +407,30 @@ export function InferenceProvider({
   useEffect(() => {
     const kind = sequenceKind(effectiveSequence);
     const isInitialMount = lastSeqKindRef.current === null;
-    if (!isInitialMount && lastSeqKindRef.current === kind) return;
+    const isAgenticOnlyMode =
+      selectedXAxisMode === 'session-time' || selectedXAxisMode === 'prefill-tps';
+    // On a stale render where kind hasn't changed, bail unless the current
+    // mode is agentic-only and we just landed on a fixed-seq scenario — in
+    // that case force the snap so the chart doesn't try to plot trace-derived
+    // metrics against rows that have no trace_replay.
+    if (!isInitialMount && lastSeqKindRef.current === kind) {
+      if (kind === 'fixed-seq' && isAgenticOnlyMode) {
+        handleSetXAxisMode('interactivity');
+      }
+      return;
+    }
     lastSeqKindRef.current = kind;
-    if (isInitialMount && xAxisModeFromUrlRef.current) return;
+    if (
+      isInitialMount &&
+      xAxisModeFromUrlRef.current &&
+      !(kind === 'fixed-seq' && isAgenticOnlyMode)
+    ) {
+      // URL-restored agentic-only mode on a fixed-seq sequence makes no sense
+      // — fall through to the default snap below.
+      return;
+    }
     handleSetXAxisMode(kind === 'agentic' ? 'ttft' : 'interactivity');
-  }, [effectiveSequence, handleSetXAxisMode]);
+  }, [effectiveSequence, selectedXAxisMode, handleSetXAxisMode]);
 
   // Reconcile selectedE2eXAxisMetric whenever the mode, sequence kind, or
   // agentic percentile changes. For fixed-seq the JSONB only carries
diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index 0a9908e3..bedded40 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -544,13 +544,17 @@ export interface InferenceChartContextType {
   setSelectedE2eXAxisMetric: (metric: string | null) => void;
   /**
    * Which chart variant the user wants to see — the inference card shows one chart
-   * at a time, picked by the big TTFT / E2E Latency / Interactivity buttons.
+   * at a time, picked by the big buttons above the chart.
    * - 'ttft'          → e2e chartType with x-axis forced to p90_ttft
    * - 'e2e'           → e2e chartType with the chart-config default x-axis (median_e2el / p90_e2el)
    * - 'interactivity' → interactivity chartType (x = median_intvty / p90_intvty)
+   * - 'session-time'  → agentic-only; x = mean-normalized session time (live-computed from trace blobs)
+   * - 'prefill-tps'   → agentic-only; x = mean of P90 prefill TPS/user per session
    */
-  selectedXAxisMode: 'ttft' | 'e2e' | 'interactivity';
-  setSelectedXAxisMode: (mode: 'ttft' | 'e2e' | 'interactivity') => void;
+  selectedXAxisMode: 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps';
+  setSelectedXAxisMode: (
+    mode: 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps',
+  ) => void;
   scaleType: 'auto' | 'linear' | 'log';
   setScaleType: (type: 'auto' | 'linear' | 'log') => void;
   setIsLegendExpanded: (metric: boolean) => void;
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 12f9f5de..63953b30 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -1,7 +1,7 @@
 'use client';
 import { track } from '@/lib/analytics';
 import dynamic from 'next/dynamic';
-import { useMemo, useRef, useState } from 'react';
+import { useEffect, useMemo, useRef, useState } from 'react';
 import { BarChart3, Table2, X } from 'lucide-react';
 
 import chartDefinitions from '@/components/inference/inference-chart-config.json';
@@ -42,6 +42,7 @@ import {
   sequenceKind,
 } from '@/lib/data-mappings';
 import { useComparisonChangelogs } from '@/hooks/api/use-comparison-changelogs';
+import { useDerivedAgenticMetrics } from '@/hooks/api/use-derived-agentic-metrics';
 import { useTrendData } from '@/components/inference/hooks/useTrendData';
 import { hardwareKeyMatchesAnyBase } from '@/lib/constants';
 
@@ -62,20 +63,25 @@ import WorkflowInfoDisplay from './WorkflowInfoDisplay';
 type InferenceViewMode = 'chart' | 'table';
 
 /**
- * The three chart variants the user can choose with the big buttons above the
- * chart card. Each maps to one entry in `inference-chart-config.json` plus a
- * forced x-axis override for the E2E chartType.
+ * The chart variants the user can choose with the big buttons above the chart
+ * card. The first three map to entries in `inference-chart-config.json` plus a
+ * forced x-axis override for the E2E chartType; the last two are agentic-only
+ * derived metrics computed live from the stored trace_replay blobs.
  */
-type XAxisMode = 'ttft' | 'e2e' | 'interactivity';
+type XAxisMode = 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps';
 
 interface XAxisModeButton {
   value: XAxisMode;
   label: string;
+  /** When true, the button is only shown on agentic scenarios. */
+  agenticOnly?: boolean;
 }
 const X_AXIS_MODE_BUTTONS: XAxisModeButton[] = [
   { value: 'ttft', label: 'TTFT' },
   { value: 'e2e', label: 'E2E Latency' },
   { value: 'interactivity', label: 'Interactivity' },
+  { value: 'session-time', label: 'Session Time', agenticOnly: true },
+  { value: 'prefill-tps', label: 'Prefill TPS / user', agenticOnly: true },
 ];
 
 const VIEW_MODE_OPTIONS: SegmentedToggleOption<InferenceViewMode>[] = [
@@ -134,6 +140,13 @@ export default function ChartDisplay() {
     totalDatesQueried,
   } = useComparisonChangelogs(selectedGPUs, selectedDateRange, dateRangeAvailableDates);
 
+  // SSR has no URL access and `selectedSequence` defaults to agentic on the
+  // server even when the URL says fixed-seq — so any conditional rendering
+  // that keys off `sequenceKind(selectedSequence)` would diverge between
+  // server and client first render. Defer agentic-only UI until after mount.
+  const [mounted, setMounted] = useState(false);
+  useEffect(() => setMounted(true), []);
+
   const [viewModes, setViewModes] = useState<Record<number, InferenceViewMode>>({});
   const replayHandlesRef = useRef<Record<number, ReplayLauncherHandle | null>>({});
   const getViewMode = (index: number): InferenceViewMode => viewModes[index] ?? 'chart';
@@ -301,15 +314,74 @@ export default function ChartDisplay() {
     }));
   }, [graphs, overlayDataByChartType, selectedModel, selectedSequence]);
 
-  // Show one chart at a time, picked by the TTFT / E2E / Interactivity buttons.
-  // Both 'ttft' and 'e2e' modes render the e2e chart (the x-axis swap is handled
-  // upstream by `selectedE2eXAxisMetric`, which `setSelectedXAxisMode` keeps in sync).
+  // Show one chart at a time, picked by the buttons above the chart.
+  //  - 'interactivity' renders the interactivity chartType.
+  //  - 'ttft' / 'e2e' render the e2e chartType (x swap via selectedE2eXAxisMetric).
+  //  - 'session-time' / 'prefill-tps' render the e2e chartType too; the x-axis
+  //    is overridden below from live-computed derived metrics.
   const visibleGraphs = useMemo(() => {
     const wantedType = selectedXAxisMode === 'interactivity' ? 'interactivity' : 'e2e';
     const filtered = effectiveGraphs.filter((g) => g.chartDefinition.chartType === wantedType);
     return filtered.length > 0 ? filtered : effectiveGraphs;
   }, [effectiveGraphs, selectedXAxisMode]);
 
+  // Derived-metric path: fetch live-computed values from the trace_replay blobs
+  // and override scatter data.x. Only fires for the two agentic-only modes.
+  const useDerived =
+    sequenceKind(selectedSequence) === 'agentic' &&
+    (selectedXAxisMode === 'session-time' || selectedXAxisMode === 'prefill-tps');
+  const derivedTargetIds = useMemo(() => {
+    if (!useDerived) return [] as number[];
+    const ids = new Set<number>();
+    for (const g of visibleGraphs) {
+      for (const d of g.data) {
+        if (d.benchmark_type === 'agentic_traces' && typeof d.id === 'number') {
+          ids.add(d.id);
+        }
+      }
+    }
+    return [...ids];
+  }, [useDerived, visibleGraphs]);
+  const derivedQuery = useDerivedAgenticMetrics(derivedTargetIds, useDerived);
+  const derivedMetrics = derivedQuery.data;
+
+  const renderableGraphs = useMemo(() => {
+    if (!useDerived) return visibleGraphs;
+    if (!derivedMetrics) return visibleGraphs.map((g) => ({ ...g, data: [] }));
+    const isSession = selectedXAxisMode === 'session-time';
+    const xLabel = isSession
+      ? 'Mean Normalized Session Time (s)'
+      : 'Mean P90 Prefill TPS per user (tok/s)';
+    // Roofline corner = which corner the curve sweeps from / toward, matching
+    // existing chart-config convention:
+    //  - session-time: as concurrency rises, session time AND throughput both
+    //    grow → curve goes bottom-left → top-right → upper_right.
+    //  - prefill-tps:  as concurrency rises, per-user prefill TPS falls while
+    //    total throughput rises → curve goes top-left → bottom-right →
+    //    upper_left.
+    const rooflineCorner = isSession ? 'upper_right' : 'upper_left';
+    return visibleGraphs.map((g) => {
+      const overriddenChartDef = {
+        ...g.chartDefinition,
+        x_label: xLabel,
+        // y_latency_limit was meant to suppress fixed-seq overload outliers on
+        // the TTFT axis — irrelevant for these derived axes.
+        y_latency_limit: undefined,
+        [`${selectedYAxisMetric}_roofline` as keyof typeof g.chartDefinition]: rooflineCorner,
+      };
+      const data = g.data
+        .map((d) => {
+          if (typeof d.id !== 'number') return null;
+          const m = derivedMetrics[d.id];
+          const v = isSession ? m?.normalized_session_time_s : m?.mean_p90_prefill_tps_per_user;
+          if (v === null || v === undefined || !Number.isFinite(v)) return null;
+          return { ...d, x: v };
+        })
+        .filter((d): d is NonNullable<typeof d> => d !== null);
+      return { ...g, chartDefinition: overriddenChartDef, data };
+    });
+  }, [useDerived, visibleGraphs, derivedMetrics, selectedXAxisMode, selectedYAxisMetric]);
+
   const displayGraphs = isFirstLoad
     ? [
         <Card key="skeleton-0">
@@ -318,9 +390,9 @@ export default function ChartDisplay() {
           <Skeleton className="h-[600px] w-full" />
         </Card>,
       ]
-    : visibleGraphs.length === 0
+    : renderableGraphs.length === 0
       ? []
-      : visibleGraphs.map((graph, graphIndex) => {
+      : renderableGraphs.map((graph, graphIndex) => {
           const isTimelineMode = Boolean(
             selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0,
           );
@@ -396,11 +468,16 @@ export default function ChartDisplay() {
                               return 'vs. P90 Time To First Token';
                             }
 
-                            // For e2e chart: heading is driven by the TTFT / E2E button
-                            // selection above the card, so the inline dropdown is gone.
-                            // The metric carries the percentile prefix (e.g. p90_ttft,
-                            // median_ttft for fixed-seq, p75_ttft for agentic+p75).
+                            // For e2e chart: heading is driven by the buttons above the
+                            // card. Derived-metric modes win first; otherwise the metric
+                            // carries the percentile prefix (e.g. p90_ttft, median_ttft).
                             if (graph.chartDefinition.chartType === 'e2e') {
+                              if (selectedXAxisMode === 'session-time') {
+                                return 'vs. Mean Normalized Session Time';
+                              }
+                              if (selectedXAxisMode === 'prefill-tps') {
+                                return 'vs. Mean P90 Prefill TPS / user';
+                              }
                               const isAgentic = sequenceKind(selectedSequence) === 'agentic';
                               if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
                                 const pctl = selectedE2eXAxisMetric.replace(/_ttft$/u, '');
@@ -601,7 +678,14 @@ export default function ChartDisplay() {
         aria-label="Chart x-axis metric"
         data-testid="x-axis-mode-buttons"
       >
-        {X_AXIS_MODE_BUTTONS.map(({ value, label }) => {
+        {X_AXIS_MODE_BUTTONS.filter(({ agenticOnly }) => {
+          if (!agenticOnly) return true;
+          // Before client mount, conditionalize on the server-default kind
+          // (agentic) so SSR + first client render produce identical DOM. After
+          // mount, hide the agentic-only buttons on fixed-seq sequences.
+          if (!mounted) return true;
+          return sequenceKind(selectedSequence) === 'agentic';
+        }).map(({ value, label }) => {
           const isActive = selectedXAxisMode === value;
           return (
             <button
diff --git a/packages/app/src/hooks/api/use-derived-agentic-metrics.ts b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
new file mode 100644
index 00000000..108312ee
--- /dev/null
+++ b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
@@ -0,0 +1,69 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface DerivedAgenticMetric {
+  id: number;
+  /** Mean across sessions of e2e time (Σ per-turn request_latency) rescaled
+   *  by mean_load / session_load. Null when the JSONL had no usable records. */
+  normalized_session_time_s: number | null;
+  /** Mean across sessions of (P90 over turns of ISL/TTFT). Null when no
+   *  prefill rates could be computed. */
+  mean_p90_prefill_tps_per_user: number | null;
+}
+
+export type DerivedAgenticMetricMap = Record<number, DerivedAgenticMetric>;
+
+/**
+ * Max ids per request. Mirrors MAX_IDS_PER_REQUEST in the
+ * /api/v1/derived-agentic-metrics route, which responds 400 above this.
+ */
+const MAX_IDS_PER_REQUEST = 200;
+
+/** Fetch one batch (at most MAX_IDS_PER_REQUEST ids) of derived metrics. */
+async function fetchDerivedAgenticMetricsChunk(
+  ids: number[],
+  signal?: AbortSignal,
+): Promise<DerivedAgenticMetricMap> {
+  const res = await fetch(`/api/v1/derived-agentic-metrics?ids=${ids.join(',')}`, { signal });
+  if (!res.ok) throw new Error(`derived-agentic-metrics ${res.status}`);
+  return (await res.json()) as DerivedAgenticMetricMap;
+}
+
+/**
+ * Fetch derived metrics for `ids`, split into batches so a chart with more
+ * points than the route's per-request cap is not rejected with a 400 (which
+ * leaves the chart without data). Batches run in parallel and are merged.
+ */
+async function fetchDerivedAgenticMetrics(
+  ids: number[],
+  signal?: AbortSignal,
+): Promise<DerivedAgenticMetricMap> {
+  if (ids.length === 0) return {};
+  const chunks: number[][] = [];
+  for (let i = 0; i < ids.length; i += MAX_IDS_PER_REQUEST) {
+    chunks.push(ids.slice(i, i + MAX_IDS_PER_REQUEST));
+  }
+  const parts = await Promise.all(
+    chunks.map((chunk) => fetchDerivedAgenticMetricsChunk(chunk, signal)),
+  );
+  const merged: DerivedAgenticMetricMap = {};
+  for (const part of parts) Object.assign(merged, part);
+  return merged;
+}
+
+/**
+ * Fetch per-id derived agentic metrics (session time + p90 prefill TPS/user)
+ * computed live from the stored aiperf profile_export.jsonl. Used to drive
+ * the "Session Time" and "Prefill TPS/user" chart variants.
+ *
+ * Ids without a trace_replay blob (older or non-aiperf agentic runs) are
+ * silently omitted from the response.
+ */
+export function useDerivedAgenticMetrics(ids: number[], enabled = true) {
+  const sortedKey = [...new Set(ids)].toSorted((a, b) => a - b);
+  return useQuery({
+    queryKey: ['derived-agentic-metrics', sortedKey.join(',')] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) => fetchDerivedAgenticMetrics(sortedKey, signal),
+    enabled: enabled && sortedKey.length > 0,
+    staleTime: 5 * 60 * 1000,
+  });
+}
diff --git a/packages/db/src/queries/derived-agentic-metrics.test.ts b/packages/db/src/queries/derived-agentic-metrics.test.ts
new file mode 100644
index 00000000..795be28a
--- /dev/null
+++ b/packages/db/src/queries/derived-agentic-metrics.test.ts
@@ -0,0 +1,96 @@
+import { describe, expect, it } from 'vitest';
+
+import { computeDerivedFromBlob } from './derived-agentic-metrics.js';
+
+/** Build one aiperf JSONL record for the synthetic fixture. */
+function rec(
+  conversation_id: string,
+  turn_index: number,
+  fields: { isl: number; osl: number; ttft_ms: number; latency_ms: number },
+): string {
+  return JSON.stringify({
+    metadata: { conversation_id, turn_index, benchmark_phase: 'profiling' },
+    metrics: {
+      request_latency: { value: fields.latency_ms, unit: 'ms' },
+      time_to_first_token: { value: fields.ttft_ms, unit: 'ms' },
+      input_sequence_length: { value: fields.isl, unit: 'tokens' },
+      output_sequence_length: { value: fields.osl, unit: 'tokens' },
+    },
+  });
+}
+
+describe('computeDerivedFromBlob', () => {
+  it('returns nulls when no usable records', () => {
+    const out = computeDerivedFromBlob('');
+    expect(out.normalized_session_time_s).toBeNull();
+    expect(out.mean_p90_prefill_tps_per_user).toBeNull();
+  });
+
+  it('rescales single-session time and computes P90 prefill', () => {
+    // One session, two turns. load = (100+50) + (200+50) = 400.
+    // Single session ⇒ mean_load = load_i ⇒ T̃ = T = (1000+2000) ms = 3.0 s.
+    const jsonl = [
+      rec('s1', 0, { isl: 100, osl: 50, ttft_ms: 500, latency_ms: 1000 }),
+      rec('s1', 1, { isl: 200, osl: 50, ttft_ms: 1000, latency_ms: 2000 }),
+    ].join('\n');
+    const out = computeDerivedFromBlob(jsonl);
+    expect(out.normalized_session_time_s).toBeCloseTo(3, 6);
+    // Prefill TPS per turn: 100/0.5=200, 200/1.0=200 → P90 within session = 200.
+    expect(out.mean_p90_prefill_tps_per_user).toBeCloseTo(200, 6);
+  });
+
+  it('rescales times across sessions with unequal load', () => {
+    // s1: 1 turn, load = 100, T = 1s
+    // s2: 1 turn, load = 300, T = 3s
+    // mean_load = 200; T̃_1 = 1 * 200/100 = 2; T̃_2 = 3 * 200/300 = 2
+    // Mean T̃ = 2.0
+    const jsonl = [
+      rec('s1', 0, { isl: 90, osl: 10, ttft_ms: 500, latency_ms: 1000 }),
+      rec('s2', 0, { isl: 270, osl: 30, ttft_ms: 500, latency_ms: 3000 }),
+    ].join('\n');
+    const out = computeDerivedFromBlob(jsonl);
+    expect(out.normalized_session_time_s).toBeCloseTo(2, 6);
+  });
+
+  it('drops records missing required fields and skips non-profiling phase', () => {
+    const lines = [
+      rec('s1', 0, { isl: 100, osl: 50, ttft_ms: 500, latency_ms: 1000 }),
+      // missing TTFT — should be skipped
+      JSON.stringify({
+        metadata: { conversation_id: 's1', turn_index: 1, benchmark_phase: 'profiling' },
+        metrics: {
+          request_latency: { value: 1000, unit: 'ms' },
+          input_sequence_length: { value: 100, unit: 'tokens' },
+          output_sequence_length: { value: 50, unit: 'tokens' },
+        },
+      }),
+      // warmup phase — should be skipped
+      JSON.stringify({
+        metadata: { conversation_id: 's2', turn_index: 0, benchmark_phase: 'warmup' },
+        metrics: {
+          request_latency: { value: 9999, unit: 'ms' },
+          time_to_first_token: { value: 9999, unit: 'ms' },
+          input_sequence_length: { value: 100, unit: 'tokens' },
+          output_sequence_length: { value: 50, unit: 'tokens' },
+        },
+      }),
+    ];
+    const out = computeDerivedFromBlob(lines.join('\n'));
+    expect(out.normalized_session_time_s).toBeCloseTo(1, 6);
+    expect(out.mean_p90_prefill_tps_per_user).toBeCloseTo(200, 6);
+  });
+
+  it('p90 across turns: 10-turn session picks the right rank', () => {
+    // Prefill rates 100..1000 (per turn isl/ttft); p90 of 10 values (linear) = 910.
+    const turns = Array.from({ length: 10 }, (_, i) =>
+      rec('s1', i, {
+        isl: (i + 1) * 100, // 100, 200, ..., 1000 tokens
+        osl: 10,
+        ttft_ms: 1000, // 1 second → rates: 100..1000 tps
+        latency_ms: 1500,
+      }),
+    );
+    const out = computeDerivedFromBlob(turns.join('\n'));
+    expect(out.mean_p90_prefill_tps_per_user).toBeCloseTo(910, 6);
+  });
+});
diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts
new file mode 100644
index 00000000..14f3adcf
--- /dev/null
+++ b/packages/db/src/queries/derived-agentic-metrics.ts
@@ -0,0 +1,224 @@
+/**
+ * Live-computed per-point metrics derived from the stored aiperf
+ * `profile_export.jsonl` blob. These aren't precomputed in the metrics JSONB
+ * because they require grouping by `conversation_id` and aggregating per
+ * session — work that is cheap for a single agentic point but adds up across
+ * many points, so it is only worth doing when the metric is actually plotted.
+ *
+ * - normalized_session_time_s: per the "Mean Normalized Session Time" proposal
+ *   (https://gist.github.com/xinli-sw/115d370c17f6d1b977878b68530981fa). Sum of
+ *   per-turn `request_latency` per session (inter-turn tool/thinking gaps are
+ *   inherently excluded since we only sum the active GPU time, not wallclock).
+ *   Each session's time is rescaled by `mean_load / session_load`, where load
+ *   is Σ(ISL+OSL) across turns. The plotted value is the mean across sessions.
+ *
+ * - mean_p90_prefill_tps_per_user: per the same gist's "Prefill" Pareto chart.
+ *   Per turn: prefill_tps = ISL / TTFT_seconds. Per session: P90 across its
+ *   turns. Across sessions: arithmetic mean. Captures the worst-turn prefill
+ *   responsiveness from the end-user perspective.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+import type { DbClient } from '../connection.js';
+
+export interface DerivedAgenticMetric {
+  /** benchmark_results.id this entry belongs to. */
+  id: number;
+  /** Mean normalized session time in seconds. */
+  normalized_session_time_s: number | null;
+  /** Mean across sessions of (P90 prefill tps/user across the session's turns). */
+  mean_p90_prefill_tps_per_user: number | null;
+}
+
+export type DerivedAgenticMetricMap = Record<number, DerivedAgenticMetric>;
+
+/**
+ * JSONL blobs can be ~1-2 MB compressed (~5-10 MB raw) and Neon's serverless
+ * HTTP driver caps responses at 64 MB — chunk to stay well under.
+ */
+const QUERY_CHUNK_SIZE = 6;
+
+interface RecordMetrics {
+  request_latency?: { value?: number; unit?: string } | number;
+  time_to_first_token?: { value?: number; unit?: string } | number;
+  input_sequence_length?: { value?: number } | number;
+  output_sequence_length?: { value?: number } | number;
+}
+
+interface RecordMetadata {
+  conversation_id?: string;
+  turn_index?: number;
+  benchmark_phase?: string;
+}
+
+interface ProfileRecord {
+  metadata?: RecordMetadata;
+  metrics?: RecordMetrics;
+}
+
+interface TurnFields {
+  request_latency_ms: number;
+  ttft_ms: number;
+  isl: number;
+  osl: number;
+}
+
+/** Pull a numeric metric out of the {value, unit} envelope (or a bare number). */
+function readNum(v: unknown): number | undefined {
+  if (typeof v === 'number') return v;
+  if (v && typeof v === 'object' && 'value' in v) {
+    const inner = (v as { value?: unknown }).value;
+    if (typeof inner === 'number' && Number.isFinite(inner)) return inner;
+  }
+  return undefined;
+}
+
+function extractTurn(rec: ProfileRecord): TurnFields | null {
+  const m = rec.metrics ?? {};
+  const rl = readNum(m.request_latency);
+  const tt = readNum(m.time_to_first_token);
+  const isl = readNum(m.input_sequence_length);
+  const osl = readNum(m.output_sequence_length);
+  if (rl === undefined || tt === undefined || isl === undefined || osl === undefined) return null;
+  if (rl <= 0 || tt <= 0 || isl <= 0) return null;
+  return { request_latency_ms: rl, ttft_ms: tt, isl, osl };
+}
+
+/** Linear-interpolated percentile (matches numpy's default linear method). */
+function quantile(sortedAsc: number[], q: number): number {
+  if (sortedAsc.length === 0) return Number.NaN;
+  if (sortedAsc.length === 1) return sortedAsc[0]!;
+  const pos = (sortedAsc.length - 1) * q;
+  const lo = Math.floor(pos);
+  const hi = Math.ceil(pos);
+  if (lo === hi) return sortedAsc[lo]!;
+  return sortedAsc[lo]! + (sortedAsc[hi]! - sortedAsc[lo]!) * (pos - lo);
+}
+
+function meanOf(xs: number[]): number {
+  if (xs.length === 0) return Number.NaN;
+  let s = 0;
+  for (const x of xs) s += x;
+  return s / xs.length;
+}
+
+/**
+ * Parse one point's JSONL and return the two derived metrics. Both fields
+ * of the returned object are `null` if the blob has no usable records.
+ */
+export function computeDerivedFromBlob(jsonl: string): {
+  normalized_session_time_s: number | null;
+  mean_p90_prefill_tps_per_user: number | null;
+} {
+  // Group records by conversation_id, filter to the profiling phase.
+  const bySession = new Map<string, TurnFields[]>();
+  for (const line of jsonl.split('\n')) {
+    if (!line) continue;
+    let rec: ProfileRecord;
+    try {
+      rec = JSON.parse(line) as ProfileRecord;
+    } catch {
+      continue;
+    }
+    if (rec.metadata?.benchmark_phase && rec.metadata.benchmark_phase !== 'profiling') continue;
+    const sid = rec.metadata?.conversation_id;
+    if (!sid) continue;
+    const turn = extractTurn(rec);
+    if (!turn) continue;
+    let list = bySession.get(sid);
+    if (!list) {
+      list = [];
+      bySession.set(sid, list);
+    }
+    list.push(turn);
+  }
+  if (bySession.size === 0) {
+    return { normalized_session_time_s: null, mean_p90_prefill_tps_per_user: null };
+  }
+
+  // Per-session aggregates.
+  const sessionTimesS: number[] = [];
+  const sessionLoads: number[] = [];
+  const sessionP90Prefill: number[] = [];
+  for (const turns of bySession.values()) {
+    let timeMs = 0;
+    let load = 0;
+    const prefillRates: number[] = [];
+    for (const t of turns) {
+      timeMs += t.request_latency_ms;
+      load += t.isl + t.osl;
+      const ttftSec = t.ttft_ms / 1000;
+      if (ttftSec > 0) prefillRates.push(t.isl / ttftSec);
+    }
+    if (load > 0) {
+      sessionTimesS.push(timeMs / 1000);
+      sessionLoads.push(load);
+    }
+    if (prefillRates.length > 0) {
+      prefillRates.sort((a, b) => a - b);
+      sessionP90Prefill.push(quantile(prefillRates, 0.9));
+    }
+  }
+
+  // Normalized session time: T̃_i = T_i × (mean_load / load_i), then mean.
+  let normalized: number | null = null;
+  if (sessionTimesS.length > 0) {
+    const meanLoad = meanOf(sessionLoads);
+    if (meanLoad > 0) {
+      const scaled: number[] = [];
+      for (let i = 0; i < sessionTimesS.length; i++) {
+        const ti = sessionTimesS[i]!;
+        const li = sessionLoads[i]!;
+        if (li > 0) scaled.push(ti * (meanLoad / li));
+      }
+      normalized = scaled.length > 0 ? meanOf(scaled) : null;
+    }
+  }
+
+  const prefill = sessionP90Prefill.length > 0 ? meanOf(sessionP90Prefill) : null;
+
+  return {
+    normalized_session_time_s: normalized,
+    mean_p90_prefill_tps_per_user: prefill,
+  };
+}
+
+export async function getDerivedAgenticMetrics(
+  sql: DbClient,
+  benchmarkResultIds: number[],
+): Promise<DerivedAgenticMetricMap> {
+  if (benchmarkResultIds.length === 0) return {};
+
+  const rows: { benchmark_result_id: number; blob: Buffer }[] = [];
+  for (let i = 0; i < benchmarkResultIds.length; i += QUERY_CHUNK_SIZE) {
+    const chunk = benchmarkResultIds.slice(i, i + QUERY_CHUNK_SIZE);
+    const chunkRows = (await sql`
+      select
+        br.id as benchmark_result_id,
+        atr.profile_export_jsonl_gz as blob
+      from benchmark_results br
+      join agentic_trace_replay atr on atr.id = br.trace_replay_id
+      where br.id = any(${chunk}::bigint[])
+        and atr.profile_export_jsonl_gz is not null
+    `) as { benchmark_result_id: number; blob: Buffer }[];
+    rows.push(...chunkRows);
+  }
+
+  const result: DerivedAgenticMetricMap = {};
+  for (const row of rows) {
+    try {
+      const jsonl = gunzipSync(row.blob).toString('utf8');
+      const { normalized_session_time_s, mean_p90_prefill_tps_per_user } =
+        computeDerivedFromBlob(jsonl);
+      result[Number(row.benchmark_result_id)] = {
+        id: Number(row.benchmark_result_id),
+        normalized_session_time_s,
+        mean_p90_prefill_tps_per_user,
+      };
+    } catch {
+      // Skip malformed blobs silently — frontend treats missing ids as "no data".
+    }
+  }
+  return result;
+}

From 8af1f5cd42f6d423ded91c04310345a09343fa34 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Thu, 21 May 2026 01:20:29 -0400
Subject: [PATCH 32/96] fix(inference): show Mean Normalized Session Time in
 minutes

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/app/src/components/inference/ui/ChartDisplay.tsx | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 63953b30..6be524b4 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -350,7 +350,7 @@ export default function ChartDisplay() {
     if (!derivedMetrics) return visibleGraphs.map((g) => ({ ...g, data: [] }));
     const isSession = selectedXAxisMode === 'session-time';
     const xLabel = isSession
-      ? 'Mean Normalized Session Time (s)'
+      ? 'Mean Normalized Session Time (min)'
       : 'Mean P90 Prefill TPS per user (tok/s)';
     // Roofline corner = which corner the curve sweeps from / toward, matching
     // existing chart-config convention:
@@ -373,8 +373,9 @@ export default function ChartDisplay() {
         .map((d) => {
           if (typeof d.id !== 'number') return null;
           const m = derivedMetrics[d.id];
-          const v = isSession ? m?.normalized_session_time_s : m?.mean_p90_prefill_tps_per_user;
-          if (v === null || v === undefined || !Number.isFinite(v)) return null;
+          const raw = isSession ? m?.normalized_session_time_s : m?.mean_p90_prefill_tps_per_user;
+          if (raw === null || raw === undefined || !Number.isFinite(raw)) return null;
+          const v = isSession ? raw / 60 : raw;
           return { ...d, x: v };
         })
         .filter((d): d is NonNullable<typeof d> => d !== null);

From be34e97dd07ca02de674be04c312f62f779cc95a Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Thu, 21 May 2026 01:23:34 -0400
Subject: [PATCH 33/96] fix(inference): use global P90 of per-turn prefill
 TPS/user
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop the per-session P90 + cross-session mean sandwich; pool every turn
into one array and take a single P90 so the tail isn't dampened. Field
renamed mean_p90_prefill_tps_per_user → p90_prefill_tps_per_user across
DB query, API, frontend hook, and chart labels.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../api/v1/derived-agentic-metrics/route.ts   |  4 +-
 .../components/inference/ui/ChartDisplay.tsx  |  6 +--
 .../hooks/api/use-derived-agentic-metrics.ts  |  6 +--
 .../queries/derived-agentic-metrics.test.ts   | 10 ++---
 .../db/src/queries/derived-agentic-metrics.ts | 41 +++++++++----------
 5 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
index e5f6e0b2..c45173e5 100644
--- a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
+++ b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
@@ -28,8 +28,8 @@ const MAX_IDS_PER_REQUEST = 200;
  * profile_export.jsonl blobs:
  *  - normalized_session_time_s: mean across sessions of session e2e time
  *    (Σ per-turn request_latency) rescaled by mean_load / session_load.
- *  - mean_p90_prefill_tps_per_user: mean across sessions of P90 (over the
- *    session's turns) prefill TPS/user (ISL / TTFT).
+ *  - p90_prefill_tps_per_user: P90 of per-turn prefill TPS/user (ISL / TTFT)
+ *    across every turn in every session.
  *
  * Ids without a trace_replay blob or with unparseable records are omitted.
  */
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 6be524b4..bd3064d0 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -351,7 +351,7 @@ export default function ChartDisplay() {
     const isSession = selectedXAxisMode === 'session-time';
     const xLabel = isSession
       ? 'Mean Normalized Session Time (min)'
-      : 'Mean P90 Prefill TPS per user (tok/s)';
+      : 'P90 Prefill TPS per user (tok/s)';
     // Roofline corner = which corner the curve sweeps from / toward, matching
     // existing chart-config convention:
     //  - session-time: as concurrency rises, session time AND throughput both
@@ -373,7 +373,7 @@ export default function ChartDisplay() {
         .map((d) => {
           if (typeof d.id !== 'number') return null;
           const m = derivedMetrics[d.id];
-          const raw = isSession ? m?.normalized_session_time_s : m?.mean_p90_prefill_tps_per_user;
+          const raw = isSession ? m?.normalized_session_time_s : m?.p90_prefill_tps_per_user;
           if (raw === null || raw === undefined || !Number.isFinite(raw)) return null;
           const v = isSession ? raw / 60 : raw;
           return { ...d, x: v };
@@ -477,7 +477,7 @@ export default function ChartDisplay() {
                                 return 'vs. Mean Normalized Session Time';
                               }
                               if (selectedXAxisMode === 'prefill-tps') {
-                                return 'vs. Mean P90 Prefill TPS / user';
+                                return 'vs. P90 Prefill TPS / user';
                               }
                               const isAgentic = sequenceKind(selectedSequence) === 'agentic';
                               if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
diff --git a/packages/app/src/hooks/api/use-derived-agentic-metrics.ts b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
index 108312ee..6bc7ae5e 100644
--- a/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
+++ b/packages/app/src/hooks/api/use-derived-agentic-metrics.ts
@@ -5,9 +5,9 @@ export interface DerivedAgenticMetric {
   /** Mean across sessions of e2e time (Σ per-turn request_latency) rescaled
    *  by mean_load / session_load. Null when the JSONL had no usable records. */
   normalized_session_time_s: number | null;
-  /** Mean across sessions of (P90 over turns of ISL/TTFT). Null when no
-   *  prefill rates could be computed. */
-  mean_p90_prefill_tps_per_user: number | null;
+  /** P90 of per-turn ISL/TTFT across every turn in every session.
+   *  Null when no prefill rates could be computed. */
+  p90_prefill_tps_per_user: number | null;
 }
 
 export type DerivedAgenticMetricMap = Record<number, DerivedAgenticMetric>;
diff --git a/packages/db/src/queries/derived-agentic-metrics.test.ts b/packages/db/src/queries/derived-agentic-metrics.test.ts
index 795be28a..321434be 100644
--- a/packages/db/src/queries/derived-agentic-metrics.test.ts
+++ b/packages/db/src/queries/derived-agentic-metrics.test.ts
@@ -23,7 +23,7 @@ describe('computeDerivedFromBlob', () => {
   it('returns nulls when no usable records', () => {
     const out = computeDerivedFromBlob('');
     expect(out.normalized_session_time_s).toBeNull();
-    expect(out.mean_p90_prefill_tps_per_user).toBeNull();
+    expect(out.p90_prefill_tps_per_user).toBeNull();
   });
 
   it('rescales single-session time and computes P90 prefill', () => {
@@ -35,8 +35,8 @@ describe('computeDerivedFromBlob', () => {
     ].join('\n');
     const out = computeDerivedFromBlob(jsonl);
     expect(out.normalized_session_time_s).toBeCloseTo(3, 6);
-    // Prefill TPS per turn: 100/0.5=200, 200/1.0=200 → P90 within session = 200.
-    expect(out.mean_p90_prefill_tps_per_user).toBeCloseTo(200, 6);
+    // Prefill TPS per turn: 100/0.5=200, 200/1.0=200 → global P90 = 200.
+    expect(out.p90_prefill_tps_per_user).toBeCloseTo(200, 6);
   });
 
   it('rescales times across sessions with unequal load', () => {
@@ -77,7 +77,7 @@ describe('computeDerivedFromBlob', () => {
     ];
     const out = computeDerivedFromBlob(lines.join('\n'));
     expect(out.normalized_session_time_s).toBeCloseTo(1, 6);
-    expect(out.mean_p90_prefill_tps_per_user).toBeCloseTo(200, 6);
+    expect(out.p90_prefill_tps_per_user).toBeCloseTo(200, 6);
   });
 
   it('p90 across turns: 10-turn session picks the right rank', () => {
@@ -91,6 +91,6 @@ describe('computeDerivedFromBlob', () => {
       }),
     );
     const out = computeDerivedFromBlob(turns.join('\n'));
-    expect(out.mean_p90_prefill_tps_per_user).toBeCloseTo(910, 6);
+    expect(out.p90_prefill_tps_per_user).toBeCloseTo(910, 6);
   });
 });
diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts
index 14f3adcf..ac6fd38d 100644
--- a/packages/db/src/queries/derived-agentic-metrics.ts
+++ b/packages/db/src/queries/derived-agentic-metrics.ts
@@ -12,10 +12,10 @@
  *   Each session's time is rescaled by `mean_load / session_load`, where load
  *   is Σ(ISL+OSL) across turns. The plotted value is the mean across sessions.
  *
- * - mean_p90_prefill_tps_per_user: per the same gist's "Prefill" Pareto chart.
- *   Per turn: prefill_tps = ISL / TTFT_seconds. Per session: P90 across its
- *   turns. Across sessions: arithmetic mean. Captures the worst-turn prefill
- *   responsiveness from the end-user perspective.
+ * - p90_prefill_tps_per_user: per the same gist's "Prefill" Pareto chart.
+ *   Per turn: prefill_tps = ISL / TTFT_seconds. Single P90 across every turn
+ *   in every session — the per-session percentile + cross-session mean
+ *   sandwich was discarded because it just dampens tail behavior.
  */
 
 import { gunzipSync } from 'node:zlib';
@@ -27,8 +27,8 @@ export interface DerivedAgenticMetric {
   id: number;
   /** Mean normalized session time in seconds. */
   normalized_session_time_s: number | null;
-  /** Mean across sessions of (P90 prefill tps/user across the session's turns). */
-  mean_p90_prefill_tps_per_user: number | null;
+  /** P90 of per-turn prefill tps/user (ISL / TTFT) across every turn in every session. */
+  p90_prefill_tps_per_user: number | null;
 }
 
 export type DerivedAgenticMetricMap = Record<number, DerivedAgenticMetric>;
@@ -109,7 +109,7 @@ function meanOf(xs: number[]): number {
  */
 export function computeDerivedFromBlob(jsonl: string): {
   normalized_session_time_s: number | null;
-  mean_p90_prefill_tps_per_user: number | null;
+  p90_prefill_tps_per_user: number | null;
 } {
   // Group records by conversation_id, filter to the profiling phase.
   const bySession = new Map<string, TurnFields[]>();
@@ -134,31 +134,27 @@ export function computeDerivedFromBlob(jsonl: string): {
     list.push(turn);
   }
   if (bySession.size === 0) {
-    return { normalized_session_time_s: null, mean_p90_prefill_tps_per_user: null };
+    return { normalized_session_time_s: null, p90_prefill_tps_per_user: null };
   }
 
-  // Per-session aggregates.
+  // Per-session aggregates for session time; per-turn prefill rates pool into
+  // a single global array so the percentile sees the full distribution.
   const sessionTimesS: number[] = [];
   const sessionLoads: number[] = [];
-  const sessionP90Prefill: number[] = [];
+  const allPrefillRates: number[] = [];
   for (const turns of bySession.values()) {
     let timeMs = 0;
     let load = 0;
-    const prefillRates: number[] = [];
     for (const t of turns) {
       timeMs += t.request_latency_ms;
       load += t.isl + t.osl;
       const ttftSec = t.ttft_ms / 1000;
-      if (ttftSec > 0) prefillRates.push(t.isl / ttftSec);
+      if (ttftSec > 0) allPrefillRates.push(t.isl / ttftSec);
     }
     if (load > 0) {
       sessionTimesS.push(timeMs / 1000);
       sessionLoads.push(load);
     }
-    if (prefillRates.length > 0) {
-      prefillRates.sort((a, b) => a - b);
-      sessionP90Prefill.push(quantile(prefillRates, 0.9));
-    }
   }
 
   // Normalized session time: T̃_i = T_i × (mean_load / load_i), then mean.
@@ -176,11 +172,15 @@ export function computeDerivedFromBlob(jsonl: string): {
     }
   }
 
-  const prefill = sessionP90Prefill.length > 0 ? meanOf(sessionP90Prefill) : null;
+  let prefill: number | null = null;
+  if (allPrefillRates.length > 0) {
+    allPrefillRates.sort((a, b) => a - b);
+    prefill = quantile(allPrefillRates, 0.9);
+  }
 
   return {
     normalized_session_time_s: normalized,
-    mean_p90_prefill_tps_per_user: prefill,
+    p90_prefill_tps_per_user: prefill,
   };
 }
 
@@ -209,12 +209,11 @@ export async function getDerivedAgenticMetrics(
   for (const row of rows) {
     try {
       const jsonl = gunzipSync(row.blob).toString('utf8');
-      const { normalized_session_time_s, mean_p90_prefill_tps_per_user } =
-        computeDerivedFromBlob(jsonl);
+      const { normalized_session_time_s, p90_prefill_tps_per_user } = computeDerivedFromBlob(jsonl);
       result[Number(row.benchmark_result_id)] = {
         id: Number(row.benchmark_result_id),
         normalized_session_time_s,
-        mean_p90_prefill_tps_per_user,
+        p90_prefill_tps_per_user,
       };
     } catch {
       // Skip malformed blobs silently — frontend treats missing ids as "no data".

From c774c005f7c2dfc1fa451e293df5d6456ba5be71 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Thu, 21 May 2026 01:29:27 -0400
Subject: [PATCH 34/96] fix(inference): no-data flash on session-time /
 prefill-tps modes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two root causes for "No data available" when flipping to these modes:

1. Stale blob-cache: the v1 cache key still holds responses with the
   pre-rename `mean_p90_prefill_tps_per_user` field. The frontend's new
   `p90_prefill_tps_per_user` lookup misses → every row filters out.
   Bump the cache key to `derived-agentic-metrics-v2` to force a refresh.

2. Loading flicker: while the derived-metrics fetch is in flight we were
   passing empty `data: []` to ScatterGraph, which surfaces the misleading
   "change your filters" empty-state. Gate skeleton rendering on the
   derived query's pending/fetching state instead.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../api/v1/derived-agentic-metrics/route.ts   |   5 +-
 .../components/inference/ui/ChartDisplay.tsx  | 439 +++++++++---------
 2 files changed, 230 insertions(+), 214 deletions(-)

diff --git a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
index c45173e5..6ce7c017 100644
--- a/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
+++ b/packages/app/src/app/api/v1/derived-agentic-metrics/route.ts
@@ -13,9 +13,12 @@ export const dynamic = 'force-dynamic';
 // blobOnly: the response is one entry per id with two numbers, but the
 // derivation work parses thousands of JSONL records per blob — cache the
 // computed result so a chart-refresh hits the warm path.
+// Bumped to v2 when mean_p90_prefill_tps_per_user → p90_prefill_tps_per_user.
+// Stale v1 cache entries return undefined for the new field and silently
+// blank the chart with "No data available".
 const getCachedDerivedAgenticMetrics = cachedQuery(
   (ids: number[]): Promise<DerivedAgenticMetricMap> => getDerivedAgenticMetrics(getDb(), ids),
-  'derived-agentic-metrics',
+  'derived-agentic-metrics-v2',
   { blobOnly: true },
 );
 
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index bd3064d0..fd6cd9c1 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -344,6 +344,14 @@ export default function ChartDisplay() {
   }, [useDerived, visibleGraphs]);
   const derivedQuery = useDerivedAgenticMetrics(derivedTargetIds, useDerived);
   const derivedMetrics = derivedQuery.data;
+  // Show skeleton (not "No data available") while the derived-metrics query
+  // is in flight. Without this gate, every flip to session-time / prefill-tps
+  // briefly blanks the chart and surfaces a misleading empty-state.
+  const isDerivedLoading =
+    useDerived &&
+    derivedTargetIds.length > 0 &&
+    (derivedQuery.isPending || derivedQuery.isFetching) &&
+    !derivedMetrics;
 
   const renderableGraphs = useMemo(() => {
     if (!useDerived) return visibleGraphs;
@@ -383,191 +391,181 @@ export default function ChartDisplay() {
     });
   }, [useDerived, visibleGraphs, derivedMetrics, selectedXAxisMode, selectedYAxisMetric]);
 
-  const displayGraphs = isFirstLoad
-    ? [
-        <Card key="skeleton-0">
-          <Skeleton className="h-7 w-2/4 mb-1" />
-          <Skeleton className="h-5 w-3/4 mb-2" />
-          <Skeleton className="h-[600px] w-full" />
-        </Card>,
-      ]
-    : renderableGraphs.length === 0
-      ? []
-      : renderableGraphs.map((graph, graphIndex) => {
-          const isTimelineMode = Boolean(
-            selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0,
-          );
-          const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode;
-          return (
-            <section key={graphIndex} className="pt-8 md:pt-0">
-              <figure data-testid="chart-figure" className="relative rounded-lg">
-                <ChartButtons
-                  chartId={`chart-${graphIndex}`}
-                  analyticsPrefix={
-                    isTimelineMode
-                      ? 'gpu_timeseries'
-                      : graph.chartDefinition.chartType === 'e2e'
-                        ? 'latency'
-                        : 'interactivity'
-                  }
-                  leadingControls={
-                    <SegmentedToggle
-                      value={getViewMode(graphIndex)}
-                      options={VIEW_MODE_OPTIONS}
-                      onValueChange={(v) => handleViewModeChange(graphIndex, v)}
-                      ariaLabel="View mode"
-                      testId={`inference-view-toggle-${graphIndex}`}
-                    />
-                  }
-                  hideImageExport={getViewMode(graphIndex) === 'table'}
-                  setIsLegendExpanded={setIsLegendExpanded}
-                  exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`}
-                  onExportMp4={
-                    replayAvailable ? () => replayHandlesRef.current[graphIndex]?.open() : undefined
-                  }
-                  onExportCsv={() => {
-                    const visibleData = graph.data.filter((d) =>
+  const displayGraphs =
+    isFirstLoad || isDerivedLoading
+      ? [
+          <Card key="skeleton-0">
+            <Skeleton className="h-7 w-2/4 mb-1" />
+            <Skeleton className="h-5 w-3/4 mb-2" />
+            <Skeleton className="h-[600px] w-full" />
+          </Card>,
+        ]
+      : renderableGraphs.length === 0
+        ? []
+        : renderableGraphs.map((graph, graphIndex) => {
+            const isTimelineMode = Boolean(
+              selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0,
+            );
+            const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode;
+            return (
+              <section key={graphIndex} className="pt-8 md:pt-0">
+                <figure data-testid="chart-figure" className="relative rounded-lg">
+                  <ChartButtons
+                    chartId={`chart-${graphIndex}`}
+                    analyticsPrefix={
                       isTimelineMode
-                        ? activeDates.has(`${d.date}_${d.hwKey}`)
-                        : activeHwTypes.has(d.hwKey as string) &&
-                          selectedPrecisions.includes(d.precision),
-                    );
-                    const { headers, rows } = inferenceChartToCsv(
-                      visibleData,
-                      graph.model,
-                      graph.sequence,
-                    );
-                    exportToCsv(
-                      `InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`,
-                      headers,
-                      rows,
-                    );
-                  }}
-                />
-                <Card>
-                  {(() => {
-                    const chartCaption = (
-                      <>
-                        <h2 className="text-lg font-semibold">
-                          {
-                            graph.chartDefinition[
-                              `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
-                            ]
-                          }{' '}
-                          {(() => {
-                            // For Input metrics with dynamic x-axis, use dynamic heading
-                            const metricTitle =
-                              (graph.chartDefinition[
+                        ? 'gpu_timeseries'
+                        : graph.chartDefinition.chartType === 'e2e'
+                          ? 'latency'
+                          : 'interactivity'
+                    }
+                    leadingControls={
+                      <SegmentedToggle
+                        value={getViewMode(graphIndex)}
+                        options={VIEW_MODE_OPTIONS}
+                        onValueChange={(v) => handleViewModeChange(graphIndex, v)}
+                        ariaLabel="View mode"
+                        testId={`inference-view-toggle-${graphIndex}`}
+                      />
+                    }
+                    hideImageExport={getViewMode(graphIndex) === 'table'}
+                    setIsLegendExpanded={setIsLegendExpanded}
+                    exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`}
+                    onExportMp4={
+                      replayAvailable
+                        ? () => replayHandlesRef.current[graphIndex]?.open()
+                        : undefined
+                    }
+                    onExportCsv={() => {
+                      const visibleData = graph.data.filter((d) =>
+                        isTimelineMode
+                          ? activeDates.has(`${d.date}_${d.hwKey}`)
+                          : activeHwTypes.has(d.hwKey as string) &&
+                            selectedPrecisions.includes(d.precision),
+                      );
+                      const { headers, rows } = inferenceChartToCsv(
+                        visibleData,
+                        graph.model,
+                        graph.sequence,
+                      );
+                      exportToCsv(
+                        `InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`,
+                        headers,
+                        rows,
+                      );
+                    }}
+                  />
+                  <Card>
+                    {(() => {
+                      const chartCaption = (
+                        <>
+                          <h2 className="text-lg font-semibold">
+                            {
+                              graph.chartDefinition[
                                 `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
-                              ] as string) || '';
-                            const isInputMetric = metricTitle.toLowerCase().includes('input');
-                            if (
-                              graph.chartDefinition.chartType === 'interactivity' &&
-                              isInputMetric &&
-                              selectedXAxisMetric === 'p90_ttft'
-                            ) {
-                              return 'vs. P90 Time To First Token';
-                            }
-
-                            // For e2e chart: heading is driven by the buttons above the
-                            // card. Derived-metric modes win first; otherwise the metric
-                            // carries the percentile prefix (e.g. p90_ttft, median_ttft).
-                            if (graph.chartDefinition.chartType === 'e2e') {
-                              if (selectedXAxisMode === 'session-time') {
-                                return 'vs. Mean Normalized Session Time';
+                              ]
+                            }{' '}
+                            {(() => {
+                              // For Input metrics with dynamic x-axis, use dynamic heading
+                              const metricTitle =
+                                (graph.chartDefinition[
+                                  `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
+                                ] as string) || '';
+                              const isInputMetric = metricTitle.toLowerCase().includes('input');
+                              if (
+                                graph.chartDefinition.chartType === 'interactivity' &&
+                                isInputMetric &&
+                                selectedXAxisMetric === 'p90_ttft'
+                              ) {
+                                return 'vs. P90 Time To First Token';
                               }
-                              if (selectedXAxisMode === 'prefill-tps') {
-                                return 'vs. P90 Prefill TPS / user';
-                              }
-                              const isAgentic = sequenceKind(selectedSequence) === 'agentic';
-                              if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
-                                const pctl = selectedE2eXAxisMetric.replace(/_ttft$/u, '');
-                                const word = pctl === 'median' ? 'Median' : pctl.toUpperCase();
-                                return `vs. ${word} Time To First Token`;
+
+                              // For e2e chart: heading is driven by the buttons above the
+                              // card. Derived-metric modes win first; otherwise the metric
+                              // carries the percentile prefix (e.g. p90_ttft, median_ttft).
+                              if (graph.chartDefinition.chartType === 'e2e') {
+                                if (selectedXAxisMode === 'session-time') {
+                                  return 'vs. Mean Normalized Session Time';
+                                }
+                                if (selectedXAxisMode === 'prefill-tps') {
+                                  return 'vs. P90 Prefill TPS / user';
+                                }
+                                const isAgentic = sequenceKind(selectedSequence) === 'agentic';
+                                if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
+                                  const pctl = selectedE2eXAxisMetric.replace(/_ttft$/u, '');
+                                  const word = pctl === 'median' ? 'Median' : pctl.toUpperCase();
+                                  return `vs. ${word} Time To First Token`;
+                                }
+                                const pctlWord = selectedPercentile.toUpperCase();
+                                return isAgentic
+                                  ? `vs. ${pctlWord} End-to-end Latency`
+                                  : 'vs. End-to-end Latency';
                               }
-                              const pctlWord = selectedPercentile.toUpperCase();
-                              return isAgentic
-                                ? `vs. ${pctlWord} End-to-end Latency`
-                                : 'vs. End-to-end Latency';
-                            }
 
-                            // Fall back to the heading baked into chartDefinition
-                            // by useChartData (already resolves per-metric overrides
-                            // and applies the agentic percentile rewrite).
-                            return graph.chartDefinition.heading;
-                          })()}
-                        </h2>
-                        <p className="text-sm text-muted-foreground mb-2">
-                          {getModelLabel(graph.model as Model)} •{' '}
-                          {selectedPrecisions
-                            .map((prec) => getPrecisionLabel(prec as Precision))
-                            .join(', ')}{' '}
-                          • {getSequenceLabel(graph.sequence as Sequence)} •{' '}
-                          {isUnofficialRun
-                            ? 'Source: UNOFFICIAL'
-                            : 'Source: SemiAnalysis InferenceX™'}
-                          {selectedRunDate && (
-                            <>
-                              {' '}
-                              • Updated:{' '}
-                              {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString(
-                                'en-US',
-                                {
-                                  year: 'numeric',
-                                  month: '2-digit',
-                                  day: '2-digit',
-                                  timeZone: 'UTC',
-                                },
-                              )}
-                            </>
-                          )}
-                        </p>
-                        <MetricAssumptionNotes selectedYAxisMetric={selectedYAxisMetric} />
-                        <UnofficialDomainNotice />
-                      </>
-                    );
-
-                    if (getViewMode(graphIndex) === 'table') {
-                      const overlay =
-                        graph.chartDefinition.chartType === 'e2e'
-                          ? overlayDataByChartType.e2e
-                          : overlayDataByChartType.interactivity;
-                      const overlayRows = (overlay?.data ?? []).filter((p) =>
-                        selectedPrecisions.includes(p.precision),
-                      );
-                      return (
-                        <>
-                          {chartCaption}
-                          <InferenceTable
-                            data={
-                              overlayRows.length > 0 ? [...graph.data, ...overlayRows] : graph.data
-                            }
-                            chartDefinition={graph.chartDefinition}
-                            selectedYAxisMetric={selectedYAxisMetric}
-                          />
+                              // Fall back to the heading baked into chartDefinition
+                              // by useChartData (already resolves per-metric overrides
+                              // and applies the agentic percentile rewrite).
+                              return graph.chartDefinition.heading;
+                            })()}
+                          </h2>
+                          <p className="text-sm text-muted-foreground mb-2">
+                            {getModelLabel(graph.model as Model)} •{' '}
+                            {selectedPrecisions
+                              .map((prec) => getPrecisionLabel(prec as Precision))
+                              .join(', ')}{' '}
+                            • {getSequenceLabel(graph.sequence as Sequence)} •{' '}
+                            {isUnofficialRun
+                              ? 'Source: UNOFFICIAL'
+                              : 'Source: SemiAnalysis InferenceX™'}
+                            {selectedRunDate && (
+                              <>
+                                {' '}
+                                • Updated:{' '}
+                                {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString(
+                                  'en-US',
+                                  {
+                                    year: 'numeric',
+                                    month: '2-digit',
+                                    day: '2-digit',
+                                    timeZone: 'UTC',
+                                  },
+                                )}
+                              </>
+                            )}
+                          </p>
+                          <MetricAssumptionNotes selectedYAxisMetric={selectedYAxisMetric} />
+                          <UnofficialDomainNotice />
                         </>
                       );
-                    }
 
-                    return selectedDateRange.startDate &&
-                      selectedDateRange.endDate &&
-                      selectedGPUs.length > 0 ? (
-                      <GPUGraph
-                        chartId={`chart-${graphIndex}`}
-                        modelLabel={graph.model}
-                        data={graph.data}
-                        xLabel={graph.chartDefinition.x_label}
-                        yLabel={`${
-                          graph.chartDefinition[
-                            `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
-                          ]
-                        }`}
-                        chartDefinition={graph.chartDefinition}
-                        caption={chartCaption}
-                      />
-                    ) : (
-                      <div className="relative">
-                        <ScatterGraph
+                      if (getViewMode(graphIndex) === 'table') {
+                        const overlay =
+                          graph.chartDefinition.chartType === 'e2e'
+                            ? overlayDataByChartType.e2e
+                            : overlayDataByChartType.interactivity;
+                        const overlayRows = (overlay?.data ?? []).filter((p) =>
+                          selectedPrecisions.includes(p.precision),
+                        );
+                        return (
+                          <>
+                            {chartCaption}
+                            <InferenceTable
+                              data={
+                                overlayRows.length > 0
+                                  ? [...graph.data, ...overlayRows]
+                                  : graph.data
+                              }
+                              chartDefinition={graph.chartDefinition}
+                              selectedYAxisMetric={selectedYAxisMetric}
+                            />
+                          </>
+                        );
+                      }
+
+                      return selectedDateRange.startDate &&
+                        selectedDateRange.endDate &&
+                        selectedGPUs.length > 0 ? (
+                        <GPUGraph
                           chartId={`chart-${graphIndex}`}
                           modelLabel={graph.model}
                           data={graph.data}
@@ -579,43 +577,58 @@ export default function ChartDisplay() {
                           }`}
                           chartDefinition={graph.chartDefinition}
                           caption={chartCaption}
-                          overlayData={
-                            graph.chartDefinition.chartType === 'e2e'
-                              ? (overlayDataByChartType.e2e ?? undefined)
-                              : (overlayDataByChartType.interactivity ?? undefined)
-                          }
                         />
-                        {selectedGPUs.length > 0 &&
-                          (!selectedDateRange.startDate || !selectedDateRange.endDate) && (
-                            <div className="absolute inset-0 flex items-center justify-center bg-background/60 backdrop-blur-[2px] rounded-lg z-10">
-                              <p className="text-sm font-medium text-muted-foreground bg-background/90 border border-border rounded-md px-4 py-2 shadow-sm">
-                                Select a date range to view GPU comparison
-                              </p>
-                            </div>
-                          )}
-                      </div>
-                    );
-                  })()}
-                  {replayAvailable && (
-                    <ReplayLauncher
-                      ref={(handle) => {
-                        replayHandlesRef.current[graphIndex] = handle;
-                      }}
-                      parentChartId={`chart-${graphIndex}`}
-                      chartDefinition={graph.chartDefinition}
-                      yLabel={`${
-                        graph.chartDefinition[
-                          `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
-                        ]
-                      }`}
-                      xLabel={graph.chartDefinition.x_label}
-                    />
-                  )}
-                </Card>
-              </figure>
-            </section>
-          );
-        });
+                      ) : (
+                        <div className="relative">
+                          <ScatterGraph
+                            chartId={`chart-${graphIndex}`}
+                            modelLabel={graph.model}
+                            data={graph.data}
+                            xLabel={graph.chartDefinition.x_label}
+                            yLabel={`${
+                              graph.chartDefinition[
+                                `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
+                              ]
+                            }`}
+                            chartDefinition={graph.chartDefinition}
+                            caption={chartCaption}
+                            overlayData={
+                              graph.chartDefinition.chartType === 'e2e'
+                                ? (overlayDataByChartType.e2e ?? undefined)
+                                : (overlayDataByChartType.interactivity ?? undefined)
+                            }
+                          />
+                          {selectedGPUs.length > 0 &&
+                            (!selectedDateRange.startDate || !selectedDateRange.endDate) && (
+                              <div className="absolute inset-0 flex items-center justify-center bg-background/60 backdrop-blur-[2px] rounded-lg z-10">
+                                <p className="text-sm font-medium text-muted-foreground bg-background/90 border border-border rounded-md px-4 py-2 shadow-sm">
+                                  Select a date range to view GPU comparison
+                                </p>
+                              </div>
+                            )}
+                        </div>
+                      );
+                    })()}
+                    {replayAvailable && (
+                      <ReplayLauncher
+                        ref={(handle) => {
+                          replayHandlesRef.current[graphIndex] = handle;
+                        }}
+                        parentChartId={`chart-${graphIndex}`}
+                        chartDefinition={graph.chartDefinition}
+                        yLabel={`${
+                          graph.chartDefinition[
+                            `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
+                          ]
+                        }`}
+                        xLabel={graph.chartDefinition.x_label}
+                      />
+                    )}
+                  </Card>
+                </figure>
+              </section>
+            );
+          });
 
   return (
     <div data-testid="inference-chart-display" className="flex flex-col gap-4">

From d5dbda773ef653d715cb1d0634c2b70cc94a826f Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 21 May 2026 15:40:32 -0500
Subject: [PATCH 35/96] feat(agentic-detail): aggregates-across-configs view

Adds a 'Per-point / Aggregates across configs' toggle near the SKU header
on /inference/agentic/[id]. The aggregates view replaces the per-point
charts with four multi-line charts (ISL, OSL, KV cache util, prefix cache
hit rate) showing how mean/P50/P75/P90/P99 vary across every sibling
config in the SKU. X-axis is sibling labels matching SiblingNav chips
(parallelism + concurrency); each percentile gets its own colored line.

Plumbing:
- `getAgenticAggregates(sql, ids)` in packages/db parses both the
  profile_export.jsonl (per-request ISL/OSL) and the server_metrics_json
  (KV cache util + prefix hit rate time-series) per id, then computes the
  five summary statistics (mean, P50, P75, P90, P99). A 6-case unit suite
  covers the percentile math, JSONL parsing, and the prefix-hit derivation.
- /api/v1/agentic-aggregates blob-cached like trace-histograms.
- New `useAgenticAggregates` hook + new AggregateChart component (multi-
  line with hover + ExpandableChart parity).

Memory + transport handling:
- Each row pulls TWO compressed blobs and `server_metrics_json_gz` can be
  up to ~17 MB compressed per high-conc row. Chunked query at size 2
  keeps each Neon HTTP response under the 64 MB cap and limits Node heap
  to ~one chunk's worth of decompressed JSON at a time (parallel chunks
  OOM'd on a 12-sibling SKU).
- Slow path runs ~20s on a 12-sibling SKU; cached afterwards (blobOnly).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../app/api/v1/agentic-aggregates/route.ts    |  64 +++
 .../agentic-point/agentic-point-detail.tsx    | 479 ++++++++++++------
 .../agentic-point/aggregate-chart.tsx         | 230 +++++++++
 .../inference/agentic-point/sibling-nav.tsx   |   2 +-
 .../src/hooks/api/use-agentic-aggregates.ts   |  45 ++
 .../db/src/queries/agentic-aggregates.test.ts | 113 +++++
 packages/db/src/queries/agentic-aggregates.ts | 255 ++++++++++
 7 files changed, 1020 insertions(+), 168 deletions(-)
 create mode 100644 packages/app/src/app/api/v1/agentic-aggregates/route.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
 create mode 100644 packages/app/src/hooks/api/use-agentic-aggregates.ts
 create mode 100644 packages/db/src/queries/agentic-aggregates.test.ts
 create mode 100644 packages/db/src/queries/agentic-aggregates.ts

diff --git a/packages/app/src/app/api/v1/agentic-aggregates/route.ts b/packages/app/src/app/api/v1/agentic-aggregates/route.ts
new file mode 100644
index 00000000..63cb2dc0
--- /dev/null
+++ b/packages/app/src/app/api/v1/agentic-aggregates/route.ts
@@ -0,0 +1,64 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getAgenticAggregates,
+  type AgenticAggregateMap,
+} from '@semianalysisai/inferencex-db/queries/agentic-aggregates';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+// blobOnly: response stays small (a few numbers per id), but generating it
+// parses ~5-10 MB of decompressed JSONL + JSON per id. Cache so the
+// "Aggregates" toggle stays snappy.
+const getCachedAgenticAggregates = cachedQuery(
+  (ids: number[]): Promise<AgenticAggregateMap> => getAgenticAggregates(getDb(), ids),
+  'agentic-aggregates',
+  { blobOnly: true },
+);
+
+const MAX_IDS_PER_REQUEST = 200;
+
+/**
+ * GET /api/v1/agentic-aggregates?ids=1,2,3
+ *
+ * Returns per-id mean/p50/p75/p90/p99 for ISL, OSL, KV cache utilization, and
+ * prefix cache hit rate, computed from the stored aiperf profile_export.jsonl +
+ * server_metrics_json blobs. `ids` must be positive integers (anything else is
+ * dropped). Ids without a trace_replay blob (or usable samples) get nulls.
+ */
+export async function GET(request: NextRequest) {
+  const raw = request.nextUrl.searchParams.get('ids');
+  if (!raw) {
+    return NextResponse.json({ error: 'ids query param is required' }, { status: 400 });
+  }
+
+  const ids = [
+    ...new Set(
+      raw
+        .split(',')
+        .map((s) => Number(s.trim()))
+        .filter((n) => Number.isSafeInteger(n) && n > 0),
+    ),
+  ];
+  if (ids.length === 0) {
+    return NextResponse.json({ error: 'no valid ids provided' }, { status: 400 });
+  }
+  if (ids.length > MAX_IDS_PER_REQUEST) {
+    return NextResponse.json(
+      { error: `too many ids (max ${MAX_IDS_PER_REQUEST})` },
+      { status: 400 },
+    );
+  }
+
+  try {
+    const sorted = ids.toSorted((a, b) => a - b);
+    const result = await getCachedAgenticAggregates(sorted);
+    return cachedJson(result);
+  } catch (error) {
+    console.error('Error fetching agentic aggregates:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index ee58332d..a5bca4e0 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -2,8 +2,10 @@
 
 import Link from 'next/link';
 import { useRouter } from 'next/navigation';
+import { useState } from 'react';
 import { ArrowLeft } from 'lucide-react';
 
+import { useAgenticAggregates, type AgenticAggregateMap } from '@/hooks/api/use-agentic-aggregates';
 import { useTraceHistograms } from '@/hooks/api/use-trace-histograms';
 import {
   useTraceServerMetrics,
@@ -12,10 +14,12 @@ import {
   type TimeSeriesPoint,
 } from '@/hooks/api/use-trace-server-metrics';
 import { useBenchmarkSiblings } from '@/hooks/api/use-benchmark-siblings';
+import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
 
+import { AggregateChart, type AggregatePoint, type PercentileKey } from './aggregate-chart';
 import { Distribution } from './distribution';
 import { ExpandableChart } from './expandable-chart';
-import { SiblingNav } from './sibling-nav';
+import { SiblingNav, chipLabel } from './sibling-nav';
 import {
   StackedAreaChart,
   TimeSeriesChart,
@@ -78,6 +82,28 @@ const CHART_SIZES = {
   expanded: { width: 1300, height: 520 },
 };
 
+type DetailView = 'point' | 'aggregates';
+const VIEW_OPTIONS: SegmentedToggleOption<DetailView>[] = [
+  { value: 'point', label: 'Per-point', testId: 'detail-view-point' },
+  { value: 'aggregates', label: 'Aggregates across configs', testId: 'detail-view-aggregates' },
+];
+
+/** Bundle one sibling's stats for AggregateChart; a null/undefined `pct` leaves `values` empty. */
+function toAggPoint(
+  sibling: { id: number; label: string },
+  pct: { mean: number; p50: number; p75: number; p90: number; p99: number } | null | undefined,
+): AggregatePoint {
+  const values: Partial<Record<PercentileKey, number>> = {};
+  if (pct) {
+    values.mean = pct.mean;
+    values.p50 = pct.p50;
+    values.p75 = pct.p75;
+    values.p90 = pct.p90;
+    values.p99 = pct.p99;
+  }
+  return { id: sibling.id, label: sibling.label, values };
+}
+
 export function AgenticPointDetail({ id }: Props) {
   const router = useRouter();
   const histQuery = useTraceHistograms([id], true);
@@ -88,6 +114,13 @@ export function AgenticPointDetail({ id }: Props) {
   const metrics = metricsQuery.data;
   const siblingsData = siblingsQuery.data;
 
+  const [view, setView] = useState<DetailView>('point');
+  // Fetch aggregates only when the aggregates view is active. Uses the full
+  // sibling set (across parallelism + concurrency configs) so each chart
+  // shows how the metric varies across the SKU.
+  const siblingIds = siblingsData?.siblings.map((s) => s.id) ?? [];
+  const aggregatesQuery = useAgenticAggregates(siblingIds, view === 'aggregates');
+
   return (
     <div className="container mx-auto px-4 lg:px-8 flex flex-col gap-4 py-6">
       <div className="flex items-center gap-2">
@@ -128,180 +161,292 @@ export function AgenticPointDetail({ id }: Props) {
         </div>
       )}
 
-      <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
-        <ExpandableChart
-          title="Input sequence length distribution"
-          render={(expanded) => {
-            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-            if (hist) return <Distribution values={hist.isl} unit="tokens" {...size} />;
-            return histQuery.isLoading ? <Skeleton /> : <Empty />;
-          }}
-        />
-        <ExpandableChart
-          title="Output sequence length distribution"
-          render={(expanded) => {
-            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-            if (hist) return <Distribution values={hist.osl} unit="tokens" {...size} />;
-            return histQuery.isLoading ? <Skeleton /> : <Empty />;
-          }}
+      <div className="flex items-center justify-between gap-3">
+        <SegmentedToggle
+          value={view}
+          options={VIEW_OPTIONS}
+          onValueChange={setView}
+          ariaLabel="Detail view"
+          testId="detail-view-toggle"
+          buttonClassName="px-3 py-1.5 text-sm"
         />
+        {view === 'aggregates' && (
+          <span className="text-xs text-muted-foreground">
+            {siblingIds.length} configs in SKU
+            {aggregatesQuery.isLoading ? ' · loading…' : ''}
+          </span>
+        )}
+      </div>
 
-        <ExpandableChart
-          title="KV cache utilization over time"
-          render={(expanded) => {
-            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-            if (!metrics) return <Skeleton />;
-            return (
-              <TimeSeriesChart
-                series={[
-                  {
-                    name: 'GPU KV cache (avg n=50)',
-                    data: rollingAverage(metrics.kvCacheUsage, 50),
-                    rawData: metrics.kvCacheUsage,
-                    color: '#3b82f6',
-                    strokeWidth: 2,
-                  },
-                ]}
-                durationS={metrics.durationS}
-                yMax={1}
-                yFmt={(v) => `${(v * 100).toFixed(0)}%`}
-                yAxisLabel="KV cache (%)"
-                {...size}
-              />
-            );
-          }}
+      {view === 'aggregates' ? (
+        <AggregatesGrid
+          siblings={siblingsData?.siblings ?? []}
+          aggregates={aggregatesQuery.data}
+          isLoading={aggregatesQuery.isLoading}
         />
+      ) : (
+        <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
+          <ExpandableChart
+            title="Input sequence length distribution"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (hist) return <Distribution values={hist.isl} unit="tokens" {...size} />;
+              return histQuery.isLoading ? <Skeleton /> : <Empty />;
+            }}
+          />
+          <ExpandableChart
+            title="Output sequence length distribution"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (hist) return <Distribution values={hist.osl} unit="tokens" {...size} />;
+              return histQuery.isLoading ? <Skeleton /> : <Empty />;
+            }}
+          />
 
-        <ExpandableChart
-          title="Request queue depth"
-          render={(expanded) => {
-            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-            if (!metrics) return <Skeleton />;
-            return (
-              <TimeSeriesChart
-                series={[
-                  {
-                    name: 'Running (avg n=50)',
-                    data: rollingAverage(
-                      metrics.queueDepth.map((p: QueueDepthPoint) => ({
-                        t: p.t,
-                        value: p.running,
-                      })),
-                      50,
-                    ),
-                    color: '#22c55e',
-                    strokeWidth: 2,
-                  },
-                  {
-                    name: 'Waiting (avg n=50)',
-                    data: rollingAverage(
-                      metrics.queueDepth.map((p: QueueDepthPoint) => ({
-                        t: p.t,
-                        value: p.waiting,
-                      })),
-                      50,
-                    ),
-                    color: '#ef4444',
-                    strokeWidth: 2,
-                  },
-                  {
-                    name: 'Total (avg n=50)',
-                    data: rollingAverage(
-                      metrics.queueDepth.map((p: QueueDepthPoint) => ({
-                        t: p.t,
-                        value: p.total,
-                      })),
-                      50,
-                    ),
-                    color: '#3b82f6',
-                    strokeWidth: 2,
-                  },
-                ]}
-                durationS={metrics.durationS}
-                yAxisLabel="Requests"
-                {...size}
-              />
-            );
-          }}
-        />
+          <ExpandableChart
+            title="KV cache utilization over time"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (!metrics) return <Skeleton />;
+              return (
+                <TimeSeriesChart
+                  series={[
+                    {
+                      name: 'GPU KV cache (avg n=50)',
+                      data: rollingAverage(metrics.kvCacheUsage, 50),
+                      rawData: metrics.kvCacheUsage,
+                      color: '#3b82f6',
+                      strokeWidth: 2,
+                    },
+                  ]}
+                  durationS={metrics.durationS}
+                  yMax={1}
+                  yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+                  yAxisLabel="KV cache (%)"
+                  {...size}
+                />
+              );
+            }}
+          />
 
-        <ExpandableChart
-          title="Prefix cache hit rate per interval"
-          render={(expanded) => {
-            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-            if (!metrics) return <Skeleton />;
-            return (
-              <TimeSeriesChart
-                series={[
-                  {
-                    name: 'GPU (HBM, avg n=50)',
-                    data: rollingAverage(metrics.prefixCacheHitRate, 50),
-                    rawData: metrics.prefixCacheHitRate,
-                    color: '#a855f7',
-                    strokeWidth: 2,
-                  },
-                ]}
-                durationS={metrics.durationS}
-                yMax={1}
-                yFmt={(v) => `${(v * 100).toFixed(0)}%`}
-                yAxisLabel="Hit rate (%)"
-                {...size}
-              />
-            );
-          }}
-        />
+          <ExpandableChart
+            title="Request queue depth"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (!metrics) return <Skeleton />;
+              return (
+                <TimeSeriesChart
+                  series={[
+                    {
+                      name: 'Running (avg n=50)',
+                      data: rollingAverage(
+                        metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                          t: p.t,
+                          value: p.running,
+                        })),
+                        50,
+                      ),
+                      color: '#22c55e',
+                      strokeWidth: 2,
+                    },
+                    {
+                      name: 'Waiting (avg n=50)',
+                      data: rollingAverage(
+                        metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                          t: p.t,
+                          value: p.waiting,
+                        })),
+                        50,
+                      ),
+                      color: '#ef4444',
+                      strokeWidth: 2,
+                    },
+                    {
+                      name: 'Total (avg n=50)',
+                      data: rollingAverage(
+                        metrics.queueDepth.map((p: QueueDepthPoint) => ({
+                          t: p.t,
+                          value: p.total,
+                        })),
+                        50,
+                      ),
+                      color: '#3b82f6',
+                      strokeWidth: 2,
+                    },
+                  ]}
+                  durationS={metrics.durationS}
+                  yAxisLabel="Requests"
+                  {...size}
+                />
+              );
+            }}
+          />
 
-        <ExpandableChart
-          title="Throughput (total & decode)"
-          render={(expanded) => {
-            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-            if (!metrics) return <Skeleton />;
-            const total = sumSeries(metrics.prefillTps, metrics.decodeTps);
-            return (
-              <TimeSeriesChart
-                series={[
-                  {
-                    name: 'Total (avg n=50)',
-                    data: rollingAverage(total, 50),
-                    color: '#3b82f6',
-                    strokeWidth: 1.6,
-                  },
-                  {
-                    name: 'Decode (avg n=50)',
-                    data: rollingAverage(metrics.decodeTps, 50),
-                    color: '#f97316',
-                    strokeWidth: 1.6,
-                  },
-                  {
-                    name: 'Total running avg',
-                    data: cumulativeAverage(total),
-                    color: '#ef4444',
-                    strokeWidth: 3,
-                  },
-                ]}
-                durationS={metrics.durationS}
-                yAxisLabel="Tokens / sec"
-                {...size}
-              />
-            );
-          }}
-        />
+          <ExpandableChart
+            title="Prefix cache hit rate per interval"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (!metrics) return <Skeleton />;
+              return (
+                <TimeSeriesChart
+                  series={[
+                    {
+                      name: 'GPU (HBM, avg n=50)',
+                      data: rollingAverage(metrics.prefixCacheHitRate, 50),
+                      rawData: metrics.prefixCacheHitRate,
+                      color: '#a855f7',
+                      strokeWidth: 2,
+                    },
+                  ]}
+                  durationS={metrics.durationS}
+                  yMax={1}
+                  yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+                  yAxisLabel="Hit rate (%)"
+                  {...size}
+                />
+              );
+            }}
+          />
 
-        <ExpandableChart
-          title="Cumulative prompt token source breakdown"
-          render={(expanded) => {
-            const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
-            if (!metrics) return <Skeleton />;
-            return (
-              <StackedAreaChart
-                sourceSeries={metrics.promptTokensBySource}
-                durationS={metrics.durationS}
-                {...size}
-              />
-            );
-          }}
-        />
+          <ExpandableChart
+            title="Throughput (total & decode)"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (!metrics) return <Skeleton />;
+              const total = sumSeries(metrics.prefillTps, metrics.decodeTps);
+              return (
+                <TimeSeriesChart
+                  series={[
+                    {
+                      name: 'Total (avg n=50)',
+                      data: rollingAverage(total, 50),
+                      color: '#3b82f6',
+                      strokeWidth: 1.6,
+                    },
+                    {
+                      name: 'Decode (avg n=50)',
+                      data: rollingAverage(metrics.decodeTps, 50),
+                      color: '#f97316',
+                      strokeWidth: 1.6,
+                    },
+                    {
+                      name: 'Total running avg',
+                      data: cumulativeAverage(total),
+                      color: '#ef4444',
+                      strokeWidth: 3,
+                    },
+                  ]}
+                  durationS={metrics.durationS}
+                  yAxisLabel="Tokens / sec"
+                  {...size}
+                />
+              );
+            }}
+          />
+
+          <ExpandableChart
+            title="Cumulative prompt token source breakdown"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (!metrics) return <Skeleton />;
+              return (
+                <StackedAreaChart
+                  sourceSeries={metrics.promptTokensBySource}
+                  durationS={metrics.durationS}
+                  {...size}
+                />
+              );
+            }}
+          />
+        </div>
+      )}
+    </div>
+  );
+}
+
+function AggregatesGrid({
+  siblings,
+  aggregates,
+  isLoading,
+}: {
+  siblings: {
+    id: number;
+    conc: number;
+    decode_tp: number;
+    decode_ep: number;
+    disagg: boolean;
+    num_prefill_gpu: number;
+    num_decode_gpu: number;
+    offload_mode?: string | null;
+  }[];
+  aggregates: AgenticAggregateMap | undefined;
+  isLoading: boolean;
+}) { // Renders four AggregateCharts (ISL, OSL, KV cache, prefix hit rate), one x-position per sibling.
+  if (siblings.length === 0) { // no siblings passed in (caller falls back to [] while the list is undefined)
+    return (
+      <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+        SKU sibling list not loaded yet — open a point to populate.
       </div>
+    );
+  }
+  if (isLoading && !aggregates) { // fetch in flight and nothing to plot yet
+    return (
+      <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+        Computing aggregates across {siblings.length} configs… (parsing trace blobs)
+      </div>
+    );
+  }
+  const labeled = siblings.map((s) => ({ id: s.id, label: chipLabel(s as any) })); // NOTE(review): `as any` bypasses the BenchmarkSibling check on chipLabel — confirm the prop shape matches
+  const islPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.isl)); // missing entry → point with empty values
+  const oslPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.osl));
+  const kvPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.kvCacheUtil));
+  const prefixPoints = labeled.map((s) => toAggPoint(s, aggregates?.[s.id]?.prefixCacheHitRate));
+  return (
+    <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
+      <ExpandableChart
+        title="ISL distribution (across configs)"
+        render={(expanded) => (
+          <AggregateChart
+            points={islPoints}
+            unit="tokens"
+            {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)}
+          />
+        )}
+      />
+      <ExpandableChart
+        title="OSL distribution (across configs)"
+        render={(expanded) => (
+          <AggregateChart
+            points={oslPoints}
+            unit="tokens"
+            {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)}
+          />
+        )}
+      />
+      <ExpandableChart
+        title="KV cache utilization (across configs)"
+        render={(expanded) => (
+          <AggregateChart
+            points={kvPoints}
+            unit="%"
+            yMax={1}
+            yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+            {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)}
+          />
+        )}
+      />
+      <ExpandableChart
+        title="Prefix cache hit rate (across configs)"
+        render={(expanded) => (
+          <AggregateChart
+            points={prefixPoints}
+            unit="%"
+            yMax={1}
+            yFmt={(v) => `${(v * 100).toFixed(0)}%`}
+            {...(expanded ? CHART_SIZES.expanded : CHART_SIZES.inline)}
+          />
+        )}
+      />
     </div>
   );
 }
diff --git a/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx b/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
new file mode 100644
index 00000000..446677ad
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
@@ -0,0 +1,230 @@
+'use client';
+
+import { useMemo } from 'react';
+
+import { ChartHover, type HoverItem } from './chart-hover';
+
+export type PercentileKey = 'mean' | 'p50' | 'p75' | 'p90' | 'p99';
+
+interface PercentileLine {
+  key: PercentileKey; // which entry of AggregatePoint.values this line plots
+  /** Display label in legend / tooltip. */
+  label: string;
+  color: string; // used for the line stroke, marker fill, legend swatch and tooltip item
+}
+
+const PERCENTILE_LINES: PercentileLine[] = [ // array order is the legend, tooltip and draw order
+  { key: 'mean', label: 'Mean', color: '#ef4444' },
+  { key: 'p50', label: 'P50', color: '#3b82f6' },
+  { key: 'p75', label: 'P75', color: '#22c55e' },
+  { key: 'p90', label: 'P90', color: '#f59e0b' },
+  { key: 'p99', label: 'P99', color: '#a855f7' },
+];
+
+export interface AggregatePoint {
+  /** Sibling label rendered on x-axis (e.g. "TP8 • c=8"); also used as the tooltip title. */
+  label: string;
+  /** Per-percentile value; missing or non-finite entries are skipped when plotting. */
+  values: Partial<Record<PercentileKey, number>>;
+  /** Sibling id — informational only; AggregateChart itself does not read it. */
+  id?: number;
+}
+
+/**
+ * Multi-line chart for the "Aggregates across configs" view: one evenly
+ * spaced x-position per sibling config, one line per percentile
+ * (mean/p50/p75/p90/p99). Shows "No data" when `points` is empty.
+ */
+export function AggregateChart({
+  points,
+  unit,
+  yMax,
+  yFmt,
+  width = 720,
+  height = 320,
+}: {
+  points: readonly AggregatePoint[];
+  unit: string; // only displayed in the legend row ("units: …")
+  /** Optional fixed y-axis upper bound (e.g. 1 for percentages); larger values are not clamped. */
+  yMax?: number;
+  /** Optional value formatter (e.g. percentage → "30%"); used for y ticks and tooltip values. */
+  yFmt?: (v: number) => string;
+  width?: number;
+  height?: number;
+}) {
+  const W = width;
+  const H = height;
+  const PAD = { top: 16, right: 16, bottom: 90, left: 64 }; // large bottom pad: presumably room for the rotated x labels
+  const fmt = (v: number) => // yFmt wins; else ≥10000 → grouped integer, <10 → 2 decimals, otherwise 0 decimals
+    yFmt
+      ? yFmt(v)
+      : v >= 10000
+        ? new Intl.NumberFormat('en-US').format(Math.round(v))
+        : v.toFixed(v < 10 ? 2 : 0);
+
+  const computed = useMemo(() => { // null when there are no points → "No data" placeholder below
+    if (points.length === 0) return null;
+    let yMaxComputed = 0;
+    for (const p of points) {
+      for (const line of PERCENTILE_LINES) {
+        const v = p.values[line.key];
+        if (typeof v === 'number' && Number.isFinite(v) && v > yMaxComputed) yMaxComputed = v;
+      }
+    }
+    const yTop = yMax ?? (yMaxComputed === 0 ? 1 : yMaxComputed * 1.05); // explicit yMax wins; else 5% headroom, or 1 if no positive value
+    const innerW = W - PAD.left - PAD.right;
+    const innerH = H - PAD.top - PAD.bottom;
+    return { yTop, innerW, innerH };
+  }, [points, W, H, PAD.left, PAD.right, PAD.top, PAD.bottom, yMax]);
+
+  if (!computed) {
+    return (
+      <div className="grid place-items-center text-xs text-muted-foreground" style={{ height: H }}>
+        No data
+      </div>
+    );
+  }
+  const { yTop, innerW, innerH } = computed;
+
+  // X positions: evenly spaced across the inner width; a single point is centered.
+  const xOf = (i: number) =>
+    points.length === 1 ? PAD.left + innerW / 2 : PAD.left + (i / (points.length - 1)) * innerW;
+  const yOf = (v: number) => PAD.top + (1 - v / yTop) * innerH; // not clamped: v > yTop lands above the plot area
+
+  // 5 y-axis ticks evenly between 0 and yTop.
+  const yTicks = Array.from({ length: 5 }, (_, i) => (yTop * i) / 4);
+
+  // Resolve hover: snap `fraction` (presumably 0–1 across the plot width) to the
+  // nearest sibling index, clamped, and emit all percentiles with a finite value there.
+  const resolve = (fraction: number) => {
+    const idx = Math.round(fraction * (points.length - 1));
+    const p = points[Math.max(0, Math.min(points.length - 1, idx))];
+    if (!p) return null;
+    const items: HoverItem[] = [];
+    for (const line of PERCENTILE_LINES) {
+      const v = p.values[line.key];
+      if (typeof v !== 'number' || !Number.isFinite(v)) continue;
+      items.push({ color: line.color, label: line.label, value: fmt(v) });
+    }
+    return { items, title: p.label };
+  };
+
+  return (
+    <div className="w-full">
+      <div className="mb-2 flex flex-wrap items-center gap-x-3 gap-y-1 text-xs">
+        {PERCENTILE_LINES.map((line) => (
+          <div key={line.key} className="flex items-center gap-1.5">
+            <span className="inline-block w-3 h-0.5" style={{ backgroundColor: line.color }} />
+            <span className="text-muted-foreground">{line.label}</span>
+          </div>
+        ))}
+        <span className="ml-auto text-muted-foreground">
+          {points.length} configs · units: {unit}
+        </span>
+      </div>
+      <ChartHover pad={PAD} width={W} height={H} resolve={resolve}>
+        {/* y-axis ticks + gridlines, labelled with fmt */}
+        {yTicks.map((v, i) => {
+          const y = yOf(v);
+          return (
+            <g key={`y${i}`}>
+              <line
+                x1={PAD.left}
+                x2={PAD.left + innerW}
+                y1={y}
+                y2={y}
+                stroke="currentColor"
+                opacity={0.08}
+              />
+              <text
+                x={PAD.left - 8}
+                y={y + 3}
+                fontSize={10}
+                fill="currentColor"
+                opacity={0.55}
+                textAnchor="end"
+              >
+                {fmt(v)}
+              </text>
+            </g>
+          );
+        })}
+
+        {/* X-axis tick labels — one per sibling, end-anchored and rotated -30° about the tick. */}
+        {points.map((p, i) => {
+          const x = xOf(i);
+          return (
+            <g key={`x${i}`}>
+              <line
+                x1={x}
+                x2={x}
+                y1={PAD.top + innerH}
+                y2={PAD.top + innerH + 4}
+                stroke="currentColor"
+                opacity={0.4}
+              />
+              <text
+                x={x}
+                y={PAD.top + innerH + 8}
+                fontSize={10}
+                fill="currentColor"
+                opacity={0.7}
+                textAnchor="end"
+                transform={`rotate(-30 ${x} ${PAD.top + innerH + 8})`}
+              >
+                {p.label}
+              </text>
+            </g>
+          );
+        })}
+
+        {/* X axis baseline */}
+        <line
+          x1={PAD.left}
+          x2={PAD.left + innerW}
+          y1={PAD.top + innerH}
+          y2={PAD.top + innerH}
+          stroke="currentColor"
+          opacity={0.25}
+        />
+
+        {/* Percentile line segments + markers; a missing value breaks the line at that sibling */}
+        {PERCENTILE_LINES.map((line) => {
+          const segments: { x1: number; y1: number; x2: number; y2: number }[] = [];
+          const markers: { x: number; y: number }[] = [];
+          let prev: { x: number; y: number } | null = null;
+          for (let i = 0; i < points.length; i++) {
+            const v = points[i]!.values[line.key];
+            if (typeof v !== 'number' || !Number.isFinite(v)) {
+              prev = null;
+              continue;
+            }
+            const x = xOf(i);
+            const y = yOf(v);
+            markers.push({ x, y });
+            if (prev) segments.push({ x1: prev.x, y1: prev.y, x2: x, y2: y });
+            prev = { x, y };
+          }
+          return (
+            <g key={line.key}>
+              {segments.map((s, j) => (
+                <line
+                  key={`s${j}`}
+                  x1={s.x1}
+                  y1={s.y1}
+                  x2={s.x2}
+                  y2={s.y2}
+                  stroke={line.color}
+                  strokeWidth={1.5}
+                />
+              ))}
+              {markers.map((m, j) => (
+                <circle key={`m${j}`} cx={m.x} cy={m.y} r={3} fill={line.color} />
+              ))}
+            </g>
+          );
+        })}
+      </ChartHover>
+    </div>
+  );
+}
diff --git a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
index 776c8ba2..aa727fdc 100644
--- a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
+++ b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
@@ -48,7 +48,7 @@ function frameworkLabel(fw: string) {
 }
 
 /** Short label for a sibling chip: parallelism + concurrency. */
-function chipLabel(s: BenchmarkSibling): string {
+export function chipLabel(s: BenchmarkSibling): string {
   const parallel = s.disagg
     ? `${s.num_prefill_gpu}P+${s.num_decode_gpu}D`
     : `TP${s.decode_tp}${s.decode_ep > 1 ? `EP${s.decode_ep}` : ''}`;
diff --git a/packages/app/src/hooks/api/use-agentic-aggregates.ts b/packages/app/src/hooks/api/use-agentic-aggregates.ts
new file mode 100644
index 00000000..4ca25ee2
--- /dev/null
+++ b/packages/app/src/hooks/api/use-agentic-aggregates.ts
@@ -0,0 +1,45 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface MetricPercentiles {
+  mean: number;
+  p50: number;
+  p75: number;
+  p90: number;
+  p99: number;
+  n: number; // sample count behind the percentiles
+}
+
+export interface AgenticAggregate {
+  id: number;
+  isl: MetricPercentiles | null; // null means no data for that metric (same for the fields below)
+  osl: MetricPercentiles | null;
+  kvCacheUtil: MetricPercentiles | null;
+  prefixCacheHitRate: MetricPercentiles | null;
+}
+
+export type AgenticAggregateMap = Record<number, AgenticAggregate>; // looked up by sibling id in AggregatesGrid
+
+async function fetchAgenticAggregates(
+  ids: number[],
+  signal?: AbortSignal,
+): Promise<AgenticAggregateMap> {
+  if (ids.length === 0) return {}; // nothing to ask for — skip the request
+  const res = await fetch(`/api/v1/agentic-aggregates?ids=${ids.join(',')}`, { signal }); // one GET, ids comma-joined
+  if (!res.ok) throw new Error(`agentic-aggregates ${res.status}`); // non-2xx → rejected promise
+  return (await res.json()) as AgenticAggregateMap; // unvalidated cast: the response shape is trusted
+}
+
+/**
+ * Fetch per-id aggregate stats (mean/p50/p75/p90/p99) for ISL, OSL, KV cache
+ * utilization and prefix cache hit rate. Ids are de-duplicated and sorted so
+ * the query key ignores input order; disabled when `enabled` is false or ids is empty.
+ */
+export function useAgenticAggregates(ids: number[], enabled = true) {
+  const sortedKey = [...new Set(ids)].toSorted((a, b) => a - b); // unique ids, ascending
+  return useQuery({
+    queryKey: ['agentic-aggregates', sortedKey.join(',')] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) => fetchAgenticAggregates(sortedKey, signal),
+    enabled: enabled && sortedKey.length > 0,
+    staleTime: 5 * 60 * 1000, // 5 minutes
+  });
+}
diff --git a/packages/db/src/queries/agentic-aggregates.test.ts b/packages/db/src/queries/agentic-aggregates.test.ts
new file mode 100644
index 00000000..2a0305bf
--- /dev/null
+++ b/packages/db/src/queries/agentic-aggregates.test.ts
@@ -0,0 +1,113 @@
+import { describe, expect, it } from 'vitest';
+
+import { extractIslOsl, extractServerMetricSamples, percentilesOf } from './agentic-aggregates.js';
+
+describe('percentilesOf', () => {
+  it('returns null for empty input', () => {
+    expect(percentilesOf([])).toBeNull();
+    expect(percentilesOf([Number.NaN, Number.POSITIVE_INFINITY])).toBeNull(); // only non-finite values counts as empty
+  });
+
+  it('computes percentiles for a simple integer range', () => {
+    // 1..100, evenly spaced, so linearly interpolated quantiles are easy to derive by hand.
+    const xs = Array.from({ length: 100 }, (_, i) => i + 1);
+    const p = percentilesOf(xs);
+    expect(p).not.toBeNull();
+    expect(p!.n).toBe(100);
+    expect(p!.mean).toBeCloseTo(50.5, 6);
+    expect(p!.p50).toBeCloseTo(50.5, 6);
+    // For 100 sorted values p75 sits at index 0.75 * 99 = 74.25 → between 75 and 76 → 75.25.
+    expect(p!.p75).toBeCloseTo(75.25, 6);
+    expect(p!.p90).toBeCloseTo(90.1, 6);
+    expect(p!.p99).toBeCloseTo(99.01, 6);
+  });
+
+  it('filters out non-finite values before computing', () => {
+    const p = percentilesOf([1, 2, Number.NaN, 3, Number.POSITIVE_INFINITY, 4]);
+    expect(p?.n).toBe(4); // NaN and Infinity are not counted
+    expect(p?.mean).toBeCloseTo(2.5, 6);
+  });
+});
+
+describe('extractIslOsl', () => {
+  it('reads input/output sequence length from profiling records', () => {
+    const lines = [ // one JSON record per line (JSONL), joined with '\n' below
+      JSON.stringify({
+        metadata: { benchmark_phase: 'profiling' },
+        metrics: {
+          input_sequence_length: { value: 100, unit: 'tokens' },
+          output_sequence_length: { value: 50, unit: 'tokens' },
+        },
+      }),
+      JSON.stringify({
+        metadata: { benchmark_phase: 'profiling' },
+        metrics: {
+          input_sequence_length: { value: 200, unit: 'tokens' },
+          output_sequence_length: { value: 75, unit: 'tokens' },
+        },
+      }),
+      // warmup record — must not contribute to either array
+      JSON.stringify({
+        metadata: { benchmark_phase: 'warmup' },
+        metrics: {
+          input_sequence_length: { value: 9999, unit: 'tokens' },
+          output_sequence_length: { value: 9999, unit: 'tokens' },
+        },
+      }),
+    ];
+    const { isl, osl } = extractIslOsl(lines.join('\n'));
+    expect(isl).toEqual([100, 200]); // profiling records only, in input order
+    expect(osl).toEqual([50, 75]);
+  });
+});
+
+describe('extractServerMetricSamples', () => {
+  it('extracts KV cache util gauge and computes per-interval prefix hit rate', () => {
+    const json = JSON.stringify({
+      metrics: {
+        'vllm:kv_cache_usage_perc': {
+          series: [
+            {
+              timeslices: [
+                { start_ns: 0, end_ns: 1, avg: 0.1 },
+                { start_ns: 1, end_ns: 2, avg: 0.5 },
+                { start_ns: 2, end_ns: 3, avg: 0.9 },
+              ],
+            },
+          ],
+        },
+        'vllm:gpu_prefix_cache_hits': {
+          series: [
+            {
+              timeslices: [
+                { start_ns: 0, rate: 80 },
+                { start_ns: 1, rate: 50 },
+                { start_ns: 2, rate: 0 }, // skipped because matching queries.rate is 0
+              ],
+            },
+          ],
+        },
+        'vllm:gpu_prefix_cache_queries': {
+          series: [
+            {
+              timeslices: [
+                { start_ns: 0, rate: 100 }, // hit rate = 0.8
+                { start_ns: 1, rate: 100 }, // hit rate = 0.5
+                { start_ns: 2, rate: 0 },
+              ],
+            },
+          ],
+        },
+      },
+    });
+    const { kvCacheUtil, prefixCacheHitRate } = extractServerMetricSamples(json);
+    expect(kvCacheUtil).toEqual([0.1, 0.5, 0.9]); // one sample per timeslice `avg`
+    expect(prefixCacheHitRate).toEqual([0.8, 0.5]); // hits.rate / queries.rate; the zero-query interval is dropped
+  });
+
+  it('returns empty arrays when the JSON lacks the expected metric series', () => {
+    const out = extractServerMetricSamples(JSON.stringify({ metrics: {} }));
+    expect(out.kvCacheUtil).toEqual([]);
+    expect(out.prefixCacheHitRate).toEqual([]);
+  });
+});
diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts
new file mode 100644
index 00000000..49ae6900
--- /dev/null
+++ b/packages/db/src/queries/agentic-aggregates.ts
@@ -0,0 +1,255 @@
+/**
+ * Per-id aggregate stats for the "Aggregates across configs" view on the
+ * agentic detail page. Each id contributes one summary number per metric per
+ * percentile so the frontend can plot how each metric varies across the
+ * SKU's parallelism + concurrency configs.
+ *
+ * Sources:
+ *  - `profile_export.jsonl` → ISL / OSL per request (filtered to profiling phase)
+ *  - `server_metrics_json` → time-series of KV cache utilization +
+ *     prefix-cache hit rate per scrape interval
+ *
+ * Returns mean/p50/p75/p90/p99 per metric. Nulls when the blob is missing
+ * or has no usable samples — frontend treats those as "no data".
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+import type { DbClient } from '../connection.js';
+
+export interface MetricPercentiles {
+  mean: number;
+  p50: number;
+  p75: number;
+  p90: number;
+  p99: number;
+  /** Sample count used to compute the percentiles. */
+  n: number;
+}
+
+export interface AgenticAggregate {
+  id: number;
+  isl: MetricPercentiles | null;
+  osl: MetricPercentiles | null;
+  kvCacheUtil: MetricPercentiles | null;
+  prefixCacheHitRate: MetricPercentiles | null;
+}
+
+export type AgenticAggregateMap = Record<number, AgenticAggregate>;
+
+/**
+ * Each row pulls TWO compressed blobs (profile_export + server_metrics).
+ * `server_metrics_json_gz` can be up to ~17 MB compressed for high-conc
+ * runs, so even 3 rows can clear Neon's 64 MB cap. Stay conservative at 2.
+ * Chunks are issued in parallel below, so the wall-clock impact is small.
+ */
+const QUERY_CHUNK_SIZE = 2;
+
+/** Linear-interpolated percentile (matches numpy default). */
+function quantile(sortedAsc: number[], q: number): number {
+  if (sortedAsc.length === 0) return Number.NaN;
+  if (sortedAsc.length === 1) return sortedAsc[0]!;
+  const pos = (sortedAsc.length - 1) * q;
+  const lo = Math.floor(pos);
+  const hi = Math.ceil(pos);
+  if (lo === hi) return sortedAsc[lo]!;
+  return sortedAsc[lo]! + (sortedAsc[hi]! - sortedAsc[lo]!) * (pos - lo);
+}
+
+function meanOf(xs: number[]): number {
+  let s = 0;
+  for (const x of xs) s += x;
+  return s / xs.length;
+}
+
+/** Compute the percentile bundle for an array of samples; null if empty. */
+export function percentilesOf(samples: number[]): MetricPercentiles | null {
+  const clean = samples.filter((v) => Number.isFinite(v));
+  if (clean.length === 0) return null;
+  const sorted = [...clean].toSorted((a, b) => a - b);
+  return {
+    mean: meanOf(sorted),
+    p50: quantile(sorted, 0.5),
+    p75: quantile(sorted, 0.75),
+    p90: quantile(sorted, 0.9),
+    p99: quantile(sorted, 0.99),
+    n: sorted.length,
+  };
+}
+
+/** Pull a numeric metric out of the {value, unit} envelope (or a bare number). */
+function readNum(v: unknown): number | undefined {
+  if (typeof v === 'number') return v;
+  if (v && typeof v === 'object' && 'value' in v) {
+    const inner = (v as { value?: unknown }).value;
+    if (typeof inner === 'number' && Number.isFinite(inner)) return inner;
+  }
+  return undefined;
+}
+
+interface ProfileRecord {
+  metadata?: { benchmark_phase?: string };
+  metrics?: {
+    input_sequence_length?: { value?: number } | number;
+    output_sequence_length?: { value?: number } | number;
+  };
+}
+
+/** Parse the profile_export.jsonl → per-request ISL + OSL arrays. */
+export function extractIslOsl(jsonl: string): { isl: number[]; osl: number[] } {
+  const isl: number[] = [];
+  const osl: number[] = [];
+  for (const line of jsonl.split('\n')) {
+    if (!line) continue;
+    let rec: ProfileRecord;
+    try {
+      rec = JSON.parse(line) as ProfileRecord;
+    } catch {
+      continue;
+    }
+    if (rec.metadata?.benchmark_phase && rec.metadata.benchmark_phase !== 'profiling') continue;
+    const m = rec.metrics ?? {};
+    const i = readNum(m.input_sequence_length);
+    const o = readNum(m.output_sequence_length);
+    if (typeof i === 'number') isl.push(i);
+    if (typeof o === 'number') osl.push(o);
+  }
+  return { isl, osl };
+}
+
+interface TimeSlice {
+  start_ns?: number;
+  end_ns?: number;
+  avg?: number;
+  rate?: number;
+  count?: number;
+  sum?: number;
+}
+interface Series {
+  labels?: Record<string, string>;
+  timeslices?: TimeSlice[];
+}
+interface MetricMeta {
+  series?: Series[];
+}
+interface MetricsJson {
+  metrics?: Record<string, MetricMeta>;
+}
+
+/**
+ * Parse the server_metrics_json → time-series arrays for KV cache util and
+ * prefix cache hit rate (per-interval, computed from the prometheus
+ * counters the same way trace-server-metrics does it).
+ */
+export function extractServerMetricSamples(json: string): {
+  kvCacheUtil: number[];
+  prefixCacheHitRate: number[];
+} {
+  const parsed = JSON.parse(json) as MetricsJson;
+  const metrics = parsed.metrics ?? {};
+  const firstSeries = (name: string): Series | undefined => {
+    const s = metrics[name]?.series;
+    return s && s.length > 0 ? s[0] : undefined;
+  };
+
+  // KV cache util — gauge in [0, 1].
+  const kvSeries =
+    firstSeries('vllm:kv_cache_usage_perc') ?? firstSeries('vllm:gpu_cache_usage_perc');
+  const kvCacheUtil: number[] = [];
+  for (const ts of kvSeries?.timeslices ?? []) {
+    if (typeof ts.avg === 'number') kvCacheUtil.push(ts.avg);
+  }
+
+  // Prefix cache hit rate per interval = hits.rate / queries.rate.
+  // Matches the derivation in queries/trace-server-metrics.ts.
+  const prefixCacheHitRate: number[] = [];
+  const hitsSeries = firstSeries('vllm:gpu_prefix_cache_hits');
+  const queriesSeries = firstSeries('vllm:gpu_prefix_cache_queries');
+  if (hitsSeries && queriesSeries) {
+    const qByStart = new Map<number, TimeSlice>();
+    for (const q of queriesSeries.timeslices ?? []) {
+      if (typeof q.start_ns === 'number') qByStart.set(q.start_ns, q);
+    }
+    for (const h of hitsSeries.timeslices ?? []) {
+      if (typeof h.start_ns !== 'number' || typeof h.rate !== 'number') continue;
+      const q = qByStart.get(h.start_ns);
+      if (!q || typeof q.rate !== 'number' || q.rate === 0) continue;
+      prefixCacheHitRate.push(h.rate / q.rate);
+    }
+  }
+
+  return { kvCacheUtil, prefixCacheHitRate };
+}
+
+export async function getAgenticAggregates(
+  sql: DbClient,
+  benchmarkResultIds: number[],
+): Promise<AgenticAggregateMap> {
+  if (benchmarkResultIds.length === 0) return {};
+
+  // Serial chunks so we never have more than ~`QUERY_CHUNK_SIZE` blobs in
+  // memory at once. Some `server_metrics` blobs decompress to >100 MB; running
+  // all chunks in parallel OOMs the Node process. The aggregator is fronted by
+  // a blob cache (`blobOnly: true`), so the slow path runs at most once per
+  // sibling set.
+  const result: AgenticAggregateMap = {};
+  for (let i = 0; i < benchmarkResultIds.length; i += QUERY_CHUNK_SIZE) {
+    const chunk = benchmarkResultIds.slice(i, i + QUERY_CHUNK_SIZE);
+    const chunkRows = (await sql`
+      select
+        br.id as benchmark_result_id,
+        atr.profile_export_jsonl_gz as profile_blob,
+        atr.server_metrics_json_gz as server_blob
+      from benchmark_results br
+      join agentic_trace_replay atr on atr.id = br.trace_replay_id
+      where br.id = any(${chunk}::bigint[])
+    `) as {
+      benchmark_result_id: number;
+      profile_blob: Buffer | null;
+      server_blob: Buffer | null;
+    }[];
+    for (const row of chunkRows) {
+      processRow(row, result);
+    }
+  }
+  return result;
+}
+
+function processRow(
+  row: { benchmark_result_id: number; profile_blob: Buffer | null; server_blob: Buffer | null },
+  result: AgenticAggregateMap,
+): void {
+  let islPct: MetricPercentiles | null = null;
+  let oslPct: MetricPercentiles | null = null;
+  let kvPct: MetricPercentiles | null = null;
+  let prefixPct: MetricPercentiles | null = null;
+
+  if (row.profile_blob) {
+    try {
+      const jsonl = gunzipSync(row.profile_blob).toString('utf8');
+      const { isl, osl } = extractIslOsl(jsonl);
+      islPct = percentilesOf(isl);
+      oslPct = percentilesOf(osl);
+    } catch {
+      // ignore malformed blob
+    }
+  }
+  if (row.server_blob) {
+    try {
+      const json = gunzipSync(row.server_blob).toString('utf8');
+      const { kvCacheUtil, prefixCacheHitRate } = extractServerMetricSamples(json);
+      kvPct = percentilesOf(kvCacheUtil);
+      prefixPct = percentilesOf(prefixCacheHitRate);
+    } catch {
+      // ignore malformed blob
+    }
+  }
+
+  result[Number(row.benchmark_result_id)] = {
+    id: Number(row.benchmark_result_id),
+    isl: islPct,
+    osl: oslPct,
+    kvCacheUtil: kvPct,
+    prefixCacheHitRate: prefixPct,
+  };
+}

From 41ef33b21e6a34430be20e812e6eedbd7b8f90cf Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 21 May 2026 16:13:49 -0500
Subject: [PATCH 36/96] fix(agentic-aggregates): metric name + stream-parse
 oversized blobs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two issues left the Aggregates view mostly empty for the worst-case rows:

1. Prefix cache hit rate was null for EVERY row because the parser looked
   up `vllm:gpu_prefix_cache_*` but the actual metric names are
   `vllm:prefix_cache_*` (no `gpu_` prefix). The parser now tries the
   unprefixed names first and falls back to the `gpu_`-prefixed ones, so
   both spellings work.

2. KV cache util + prefix cache hit rate were null for high-conc TP+EP
   rows. Their server_metrics_json decompresses past Node's max string
   length (0x1fffffe8 / 512 MB) because vllm dumps cache_config_info into
   every scrape interval, repeated thousands of times. `gunzipSync().toString()`
   threw ERR_STRING_TOO_LONG and the silent catch left both metrics null.

   Added stream-json fallback: pipe Buffer → gunzip → JSON parser →
   pick('metrics') → streamObject; only the metric keys we care about are
   kept. Avoids ever materializing the 500+ MB JSON string. The fast path
   stays — sync gunzip + JSON.parse runs first, and the stream fallback is
   used only when that throws ERR_STRING_TOO_LONG; any other failure still
   leaves both metrics null.

Also split the DB fetch into two passes (profile blobs in batches of 8,
server blobs one at a time) so the server query response stays under
Neon's 64 MB HTTP cap on rows where the compressed server blob alone is
~17 MB and Neon's bytea-over-HTTP encoding inflates it ~1.6×.

Chart redesign: AggregateChart now draws a vertical bar per sibling
spanning the percentile range, with colored ticks at each percentile and
a diamond at the mean. Horizontal connecting lines per percentile remain
as a faint backdrop so the reader can still follow trends across configs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/aggregate-chart.tsx         |  72 ++++++-
 packages/db/package.json                      |   5 +-
 .../db/src/queries/agentic-aggregates.test.ts |   4 +-
 packages/db/src/queries/agentic-aggregates.ts | 197 ++++++++++++------
 pnpm-lock.yaml                                |  36 ++++
 5 files changed, 242 insertions(+), 72 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx b/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
index 446677ad..55ac8061 100644
--- a/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/aggregate-chart.tsx
@@ -188,10 +188,10 @@ export function AggregateChart({
           opacity={0.25}
         />
 
-        {/* Percentile polylines + markers */}
+        {/* Horizontal connecting lines per percentile — faint backdrop so the
+            eye can follow how each percentile changes across configs. */}
         {PERCENTILE_LINES.map((line) => {
           const segments: { x1: number; y1: number; x2: number; y2: number }[] = [];
-          const markers: { x: number; y: number }[] = [];
           let prev: { x: number; y: number } | null = null;
           for (let i = 0; i < points.length; i++) {
             const v = points[i]!.values[line.key];
@@ -201,12 +201,11 @@ export function AggregateChart({
             }
             const x = xOf(i);
             const y = yOf(v);
-            markers.push({ x, y });
             if (prev) segments.push({ x1: prev.x, y1: prev.y, x2: x, y2: y });
             prev = { x, y };
           }
           return (
-            <g key={line.key}>
+            <g key={`hline-${line.key}`} opacity={0.35}>
               {segments.map((s, j) => (
                 <line
                   key={`s${j}`}
@@ -215,12 +214,69 @@ export function AggregateChart({
                   x2={s.x2}
                   y2={s.y2}
                   stroke={line.color}
-                  strokeWidth={1.5}
+                  strokeWidth={1}
                 />
               ))}
-              {markers.map((m, j) => (
-                <circle key={`m${j}`} cx={m.x} cy={m.y} r={3} fill={line.color} />
-              ))}
+            </g>
+          );
+        })}
+
+        {/* Per-sibling vertical bar spanning the percentile range, with a
+            colored tick at each percentile level. Mean rendered as a small
+            diamond to distinguish from the percentile ticks. */}
+        {points.map((p, i) => {
+          const x = xOf(i);
+          // Collect percentile values present for this sibling.
+          const present = PERCENTILE_LINES.filter(
+            (line) =>
+              typeof p.values[line.key] === 'number' && Number.isFinite(p.values[line.key]!),
+          ).map((line) => ({ ...line, value: p.values[line.key]! }));
+          if (present.length === 0) return null;
+          // Only the *percentile* values define the bar extent; mean might be
+          // outside the percentile span on weird distributions.
+          const pctlOnly = present.filter((p2) => p2.key !== 'mean');
+          const bandValues = pctlOnly.length > 0 ? pctlOnly : present;
+          const bandYs = bandValues.map((b) => yOf(b.value));
+          const yLo = Math.min(...bandYs);
+          const yHi = Math.max(...bandYs);
+          return (
+            <g key={`bar-${i}`}>
+              <line
+                x1={x}
+                x2={x}
+                y1={yLo}
+                y2={yHi}
+                stroke="currentColor"
+                strokeWidth={1}
+                opacity={0.35}
+              />
+              {present.map((b) => {
+                const ty = yOf(b.value);
+                if (b.key === 'mean') {
+                  // Diamond marker for mean.
+                  const s = 4;
+                  return (
+                    <polygon
+                      key={`m-${b.key}`}
+                      points={`${x},${ty - s} ${x + s},${ty} ${x},${ty + s} ${x - s},${ty}`}
+                      fill={b.color}
+                      stroke={b.color}
+                    />
+                  );
+                }
+                // Horizontal tick at each percentile.
+                return (
+                  <line
+                    key={`tk-${b.key}`}
+                    x1={x - 6}
+                    x2={x + 6}
+                    y1={ty}
+                    y2={ty}
+                    stroke={b.color}
+                    strokeWidth={2.5}
+                  />
+                );
+              })}
             </g>
           );
         })}
diff --git a/packages/db/package.json b/packages/db/package.json
index c849ea26..d7caf34d 100644
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -30,11 +30,14 @@
     "@neondatabase/serverless": "^1.1.0",
     "@noble/ciphers": "^2.2.0",
     "@semianalysisai/inferencex-constants": "workspace:*",
-    "postgres": "^3.4.9"
+    "postgres": "^3.4.9",
+    "stream-chain": "^3.4.0",
+    "stream-json": "^2.1.0"
   },
   "devDependencies": {
     "@types/adm-zip": "^0.5.8",
     "@types/node": "^25.7.0",
+    "@types/stream-json": "^1.7.8",
     "@vitest/coverage-v8": "^4.1.6",
     "adm-zip": "^0.5.17",
     "dotenv-cli": "^11.0.0",
diff --git a/packages/db/src/queries/agentic-aggregates.test.ts b/packages/db/src/queries/agentic-aggregates.test.ts
index 2a0305bf..8c712323 100644
--- a/packages/db/src/queries/agentic-aggregates.test.ts
+++ b/packages/db/src/queries/agentic-aggregates.test.ts
@@ -76,7 +76,7 @@ describe('extractServerMetricSamples', () => {
             },
           ],
         },
-        'vllm:gpu_prefix_cache_hits': {
+        'vllm:prefix_cache_hits': {
           series: [
             {
               timeslices: [
@@ -87,7 +87,7 @@ describe('extractServerMetricSamples', () => {
             },
           ],
         },
-        'vllm:gpu_prefix_cache_queries': {
+        'vllm:prefix_cache_queries': {
           series: [
             {
               timeslices: [
diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts
index 49ae6900..22ec7b28 100644
--- a/packages/db/src/queries/agentic-aggregates.ts
+++ b/packages/db/src/queries/agentic-aggregates.ts
@@ -13,7 +13,14 @@
  * or has no usable samples — frontend treats those as "no data".
  */
 
-import { gunzipSync } from 'node:zlib';
+import { Readable } from 'node:stream';
+import { createGunzip, gunzipSync } from 'node:zlib';
+
+import { chain } from 'stream-chain';
+
+import { parser } from 'stream-json';
+import { pick } from 'stream-json/filters/pick.js';
+import { streamObject } from 'stream-json/streamers/stream-object.js';
 
 import type { DbClient } from '../connection.js';
 
@@ -38,12 +45,15 @@ export interface AgenticAggregate {
 export type AgenticAggregateMap = Record<number, AgenticAggregate>;
 
 /**
- * Each row pulls TWO compressed blobs (profile_export + server_metrics).
- * `server_metrics_json_gz` can be up to ~17 MB compressed for high-conc
- * runs, so even 3 rows can clear Neon's 64 MB cap. Stay conservative at 2.
- * Chunks are issued in parallel below, so the wall-clock impact is small.
+ * `profile_export_jsonl_gz` is small (~1-3 MB) so we can batch many per
+ * round-trip. `server_metrics_json_gz` is much bigger (~17 MB compressed
+ * for high-conc TP+EP runs; Neon encodes bytea over HTTP at ~1.6× wire
+ * size, so two of those = ~50 MB and three already trips the 64 MB cap).
+ * We fetch the two blob types in separate queries with different chunk
+ * sizes.
  */
-const QUERY_CHUNK_SIZE = 2;
+const PROFILE_CHUNK_SIZE = 8;
+const SERVER_CHUNK_SIZE = 1;
 
 /** Linear-interpolated percentile (matches numpy default). */
 function quantile(sortedAsc: number[], q: number): number {
@@ -162,9 +172,14 @@ export function extractServerMetricSamples(json: string): {
 
   // Prefix cache hit rate per interval = hits.rate / queries.rate.
   // Matches the derivation in queries/trace-server-metrics.ts.
+  // Metric names: vllm exposes these as `vllm:prefix_cache_*` (no `gpu_`
+  // prefix); falls back to the `gpu_`-prefixed names in case a future
+  // vllm version renames them.
   const prefixCacheHitRate: number[] = [];
-  const hitsSeries = firstSeries('vllm:gpu_prefix_cache_hits');
-  const queriesSeries = firstSeries('vllm:gpu_prefix_cache_queries');
+  const hitsSeries =
+    firstSeries('vllm:prefix_cache_hits') ?? firstSeries('vllm:gpu_prefix_cache_hits');
+  const queriesSeries =
+    firstSeries('vllm:prefix_cache_queries') ?? firstSeries('vllm:gpu_prefix_cache_queries');
   if (hitsSeries && queriesSeries) {
     const qByStart = new Map<number, TimeSlice>();
     for (const q of queriesSeries.timeslices ?? []) {
@@ -181,75 +196,135 @@ export function extractServerMetricSamples(json: string): {
   return { kvCacheUtil, prefixCacheHitRate };
 }
 
+/** Metrics our aggregates pipeline cares about. Anything else in the blob is skipped. */
+const TARGET_METRIC_KEYS = new Set([
+  'vllm:kv_cache_usage_perc',
+  'vllm:gpu_cache_usage_perc', // older fallback name
+  'vllm:prefix_cache_hits',
+  'vllm:prefix_cache_queries',
+  'vllm:gpu_prefix_cache_hits', // legacy alias (used in pre-fix code paths)
+  'vllm:gpu_prefix_cache_queries',
+]);
+
+/**
+ * Stream-parse the gzipped server_metrics_json and collect ONLY the metrics
+ * we need. Avoids the Node 512 MB string cap that JSON.parse hits on
+ * server_metrics blobs from high-conc TP+EP runs (which can decompress to
+ * >500 MB because vllm dumps `cache_config_info` every scrape interval).
+ *
+ * Pipeline: Buffer → gunzip → JSON parser → Pick('metrics') →
+ * StreamObject (one metric per chunk) → keep only the keys we care about.
+ *
+ * Returns the same `{ kvCacheUtil, prefixCacheHitRate }` shape as the
+ * synchronous fast path so callers can use either interchangeably.
+ */
+async function streamExtractServerMetricSamples(
+  buffer: Buffer,
+): Promise<{ kvCacheUtil: number[]; prefixCacheHitRate: number[] }> {
+  const collected: Record<string, MetricMeta> = {};
+  // stream-json's TypeScript types don't compose cleanly with node:stream's
+  // pipeline() generic, and several `.pipe()`/event APIs are typed loosely —
+  // cast to any for this local pipe chain. It works at runtime.
+  // stream-json composes transforms via stream-chain. `pick`/`streamObject`
+  // each return a Transform when called; `chain([...])` wires them.
+  /* eslint-disable @typescript-eslint/no-explicit-any */
+  const pipeline = chain([
+    Readable.from(buffer),
+    createGunzip(),
+    parser(),
+    pick({ filter: 'metrics' }),
+    streamObject(),
+  ]);
+  await new Promise<void>((resolve, reject) => {
+    (pipeline as any).on('data', (chunk: unknown) => {
+      const { key, value } = chunk as { key: string; value: MetricMeta };
+      if (TARGET_METRIC_KEYS.has(key)) collected[key] = value;
+    });
+    (pipeline as any).on('end', resolve);
+    (pipeline as any).on('error', reject);
+  });
+  /* eslint-enable @typescript-eslint/no-explicit-any */
+  return extractServerMetricSamples(JSON.stringify({ metrics: collected }));
+}
+
 export async function getAgenticAggregates(
   sql: DbClient,
   benchmarkResultIds: number[],
 ): Promise<AgenticAggregateMap> {
   if (benchmarkResultIds.length === 0) return {};
 
-  // Serial chunks so we never have more than ~`QUERY_CHUNK_SIZE` blobs in
-  // memory at once. Some `server_metrics` blobs decompress to >100 MB; running
-  // all chunks in parallel OOMs the Node process. The aggregator is fronted by
-  // a blob cache (`blobOnly: true`), so the slow path runs at most once per
-  // sibling set.
   const result: AgenticAggregateMap = {};
-  for (let i = 0; i < benchmarkResultIds.length; i += QUERY_CHUNK_SIZE) {
-    const chunk = benchmarkResultIds.slice(i, i + QUERY_CHUNK_SIZE);
-    const chunkRows = (await sql`
+  // ── Pass 1: profile_export blobs (cheap; large batches). ────────────────
+  for (let i = 0; i < benchmarkResultIds.length; i += PROFILE_CHUNK_SIZE) {
+    const chunk = benchmarkResultIds.slice(i, i + PROFILE_CHUNK_SIZE);
+    const rows = (await sql`
+      select
+        br.id as benchmark_result_id,
+        atr.profile_export_jsonl_gz as profile_blob
+      from benchmark_results br
+      join agentic_trace_replay atr on atr.id = br.trace_replay_id
+      where br.id = any(${chunk}::bigint[])
+    `) as { benchmark_result_id: number; profile_blob: Buffer | null }[];
+    for (const row of rows) {
+      const id = Number(row.benchmark_result_id);
+      result[id] ??= blankAggregate(id);
+      if (row.profile_blob) {
+        try {
+          const jsonl = gunzipSync(row.profile_blob).toString('utf8');
+          const { isl, osl } = extractIslOsl(jsonl);
+          result[id].isl = percentilesOf(isl);
+          result[id].osl = percentilesOf(osl);
+        } catch {
+          // ignore malformed blob
+        }
+      }
+    }
+  }
+  // ── Pass 2: server_metrics blobs (huge; one at a time). ────────────────
+  // Serial to avoid OOM on the decompressed JSON of a high-conc TP+EP row
+  // (>500 MB raw). The aggregator is fronted by a blob cache, so the slow
+  // path runs at most once per sibling set.
+  for (let i = 0; i < benchmarkResultIds.length; i += SERVER_CHUNK_SIZE) {
+    const chunk = benchmarkResultIds.slice(i, i + SERVER_CHUNK_SIZE);
+    const rows = (await sql`
       select
         br.id as benchmark_result_id,
-        atr.profile_export_jsonl_gz as profile_blob,
         atr.server_metrics_json_gz as server_blob
       from benchmark_results br
       join agentic_trace_replay atr on atr.id = br.trace_replay_id
       where br.id = any(${chunk}::bigint[])
-    `) as {
-      benchmark_result_id: number;
-      profile_blob: Buffer | null;
-      server_blob: Buffer | null;
-    }[];
-    for (const row of chunkRows) {
-      processRow(row, result);
+    `) as { benchmark_result_id: number; server_blob: Buffer | null }[];
+    for (const row of rows) {
+      const id = Number(row.benchmark_result_id);
+      result[id] ??= blankAggregate(id);
+      if (!row.server_blob) continue;
+      let parsed: { kvCacheUtil: number[]; prefixCacheHitRate: number[] } | null = null;
+      try {
+        const json = gunzipSync(row.server_blob).toString('utf8');
+        parsed = extractServerMetricSamples(json);
+      } catch (error) {
+        // ERR_STRING_TOO_LONG (>512 MB) hits on high-conc TP+EP rows whose
+        // server_metrics_json decompresses past Node's max string length.
+        // Stream-parse to extract just the metric subtrees we care about.
+        const code = error && (error as NodeJS.ErrnoException).code;
+        const msg = error instanceof Error ? error.message : String(error);
+        if (code === 'ERR_STRING_TOO_LONG' || msg.includes('longer than 0x1fffffe8')) {
+          try {
+            parsed = await streamExtractServerMetricSamples(row.server_blob);
+          } catch {
+            // stream fallback failed too — leave nulls
+          }
+        }
+      }
+      if (parsed) {
+        result[id].kvCacheUtil = percentilesOf(parsed.kvCacheUtil);
+        result[id].prefixCacheHitRate = percentilesOf(parsed.prefixCacheHitRate);
+      }
     }
   }
   return result;
 }
 
-function processRow(
-  row: { benchmark_result_id: number; profile_blob: Buffer | null; server_blob: Buffer | null },
-  result: AgenticAggregateMap,
-): void {
-  let islPct: MetricPercentiles | null = null;
-  let oslPct: MetricPercentiles | null = null;
-  let kvPct: MetricPercentiles | null = null;
-  let prefixPct: MetricPercentiles | null = null;
-
-  if (row.profile_blob) {
-    try {
-      const jsonl = gunzipSync(row.profile_blob).toString('utf8');
-      const { isl, osl } = extractIslOsl(jsonl);
-      islPct = percentilesOf(isl);
-      oslPct = percentilesOf(osl);
-    } catch {
-      // ignore malformed blob
-    }
-  }
-  if (row.server_blob) {
-    try {
-      const json = gunzipSync(row.server_blob).toString('utf8');
-      const { kvCacheUtil, prefixCacheHitRate } = extractServerMetricSamples(json);
-      kvPct = percentilesOf(kvCacheUtil);
-      prefixPct = percentilesOf(prefixCacheHitRate);
-    } catch {
-      // ignore malformed blob
-    }
-  }
-
-  result[Number(row.benchmark_result_id)] = {
-    id: Number(row.benchmark_result_id),
-    isl: islPct,
-    osl: oslPct,
-    kvCacheUtil: kvPct,
-    prefixCacheHitRate: prefixPct,
-  };
+function blankAggregate(id: number): AgenticAggregate {
+  return { id, isl: null, osl: null, kvCacheUtil: null, prefixCacheHitRate: null };
 }
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 14505e57..717ffc5c 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -249,6 +249,12 @@ importers:
       postgres:
         specifier: ^3.4.9
         version: 3.4.9
+      stream-chain:
+        specifier: ^3.4.0
+        version: 3.6.3
+      stream-json:
+        specifier: ^2.1.0
+        version: 2.1.0
     devDependencies:
       '@types/adm-zip':
         specifier: ^0.5.8
@@ -256,6 +262,9 @@ importers:
       '@types/node':
         specifier: ^25.7.0
         version: 25.7.0
+      '@types/stream-json':
+        specifier: ^1.7.8
+        version: 1.7.8
       '@vitest/coverage-v8':
         specifier: ^4.1.6
         version: 4.1.6(vitest@4.1.6)
@@ -2334,6 +2343,12 @@ packages:
   '@types/stats.js@0.17.4':
     resolution: {integrity: sha512-jIBvWWShCvlBqBNIZt0KAshWpvSjhkwkEu4ZUcASoAvhmrgAUI2t1dXrjSL4xXVLB4FznPrIsX3nKXFl/Dt4vA==}
 
+  '@types/stream-chain@2.1.0':
+    resolution: {integrity: sha512-guDyAl6s/CAzXUOWpGK2bHvdiopLIwpGu8v10+lb9hnQOyo4oj/ZUQFOvqFjKGsE3wJP1fpIesCcMvbXuWsqOg==}
+
+  '@types/stream-json@1.7.8':
+    resolution: {integrity: sha512-MU1OB1eFLcYWd1LjwKXrxdoPtXSRzRmAnnxs4Js/ayB5O/NvHraWwuOaqMWIebpYwM6khFlsJOHEhI9xK/ab4Q==}
+
   '@types/three@0.184.1':
     resolution: {integrity: sha512-6q4VdiqVsrTRqmk62/BnlcAvIrnDM0zf2ZDVKI5kZiniWrSaOHaQzmbp+BNzoggc/8tgW412pL//wZIxu2PPTA==}
 
@@ -5074,9 +5089,15 @@ packages:
     resolution: {integrity: sha512-eLoXW/DHyl62zxY4SCaIgnRhuMr6ri4juEYARS8E6sCEqzKpOiE521Ucofdx+KnDZl5xmvGYaaKCk5FEOxJCoQ==}
     engines: {node: '>= 0.4'}
 
+  stream-chain@3.6.3:
+    resolution: {integrity: sha512-JZuELdHUuiZL4Olcr4EllGUvj9VKEaDkGHA6QAP5SruD0bgrr8TwtNXwRfH+fCncysEII7HhWll1+aOwvHYyRw==}
+
   stream-combiner@0.2.2:
     resolution: {integrity: sha512-6yHMqgLYDzQDcAkL+tjJDC5nSNuNIx0vZtRZeiPh7Saef7VHX9H5Ijn9l2VIol2zaNYlYEX6KyuT/237A58qEQ==}
 
+  stream-json@2.1.0:
+    resolution: {integrity: sha512-9gV/ywtebMn3DdKnNKYCb9iESvgR1dHbucNV+bRGvdvy+jV4c9FFgYKmENhpKv58jSwvs90Wk80RhfKk1KxHPg==}
+
   string-width@4.2.3:
     resolution: {integrity: sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==}
     engines: {node: '>=8'}
@@ -7392,6 +7413,15 @@ snapshots:
 
   '@types/stats.js@0.17.4': {}
 
+  '@types/stream-chain@2.1.0':
+    dependencies:
+      '@types/node': 25.7.0
+
+  '@types/stream-json@1.7.8':
+    dependencies:
+      '@types/node': 25.7.0
+      '@types/stream-chain': 2.1.0
+
   '@types/three@0.184.1':
     dependencies:
       '@dimforge/rapier3d-compat': 0.12.0
@@ -10752,11 +10782,17 @@ snapshots:
       es-errors: 1.3.0
       internal-slot: 1.1.0
 
+  stream-chain@3.6.3: {}
+
   stream-combiner@0.2.2:
     dependencies:
       duplexer: 0.1.2
       through: 2.3.8
 
+  stream-json@2.1.0:
+    dependencies:
+      stream-chain: 3.6.3
+
   string-width@4.2.3:
     dependencies:
       emoji-regex: 8.0.0

From 1cedd240e95b52789690919cc4b13600920d842f Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 21 May 2026 17:04:02 -0500
Subject: [PATCH 37/96] feat(agentic-aggregates): pre-compute stats at ingest
 time

Detail page was decompressing + parsing every trace_replay blob on each
request, sometimes hitting Node's 512 MB string cap on high-conc TP+EP
server_metrics_json. Pre-compute the percentile + derived bundles into
a versioned `aggregate_stats` JSONB column, mirroring the pattern Alec
suggested. APIs read the column first and only fall back to the slow
blob-parse path for rows whose `aggregate_stats` is still null (not yet
backfilled).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../008_agentic_aggregate_stats.sql           |  18 +++
 packages/db/package.json                      |   1 +
 packages/db/src/backfill-aggregate-stats.ts   | 150 ++++++++++++++++++
 .../src/etl/compute-aggregate-stats.test.ts   | 123 ++++++++++++++
 .../db/src/etl/compute-aggregate-stats.ts     | 147 +++++++++++++++++
 packages/db/src/etl/trace-replay-ingest.ts    |  17 +-
 packages/db/src/queries/agentic-aggregates.ts |  77 ++++++++-
 .../db/src/queries/derived-agentic-metrics.ts |  47 +++++-
 8 files changed, 569 insertions(+), 11 deletions(-)
 create mode 100644 packages/db/migrations/008_agentic_aggregate_stats.sql
 create mode 100644 packages/db/src/backfill-aggregate-stats.ts
 create mode 100644 packages/db/src/etl/compute-aggregate-stats.test.ts
 create mode 100644 packages/db/src/etl/compute-aggregate-stats.ts

diff --git a/packages/db/migrations/008_agentic_aggregate_stats.sql b/packages/db/migrations/008_agentic_aggregate_stats.sql
new file mode 100644
index 00000000..d55533b9
--- /dev/null
+++ b/packages/db/migrations/008_agentic_aggregate_stats.sql
@@ -0,0 +1,18 @@
+-- Pre-computed aggregate stats for each agentic_trace_replay row.
+--
+-- Previously the agentic detail page parsed the (huge) profile_export.jsonl
+-- and server_metrics_json blobs on every request to compute distribution
+-- stats for ISL/OSL/KV-util/prefix-hit-rate, plus the per-point derived
+-- metrics (session-time, p90 prefill TPS). That took ~20s per row and the
+-- worst rows (high-conc TP+EP server_metrics blobs that decompress past
+-- Node's 512 MB string cap) couldn't be parsed without a stream fallback.
+--
+-- This column holds the computed stats so the API serves the page from a
+-- single SQL row read. Shape mirrors the existing benchmark_results.metrics
+-- JSONB convention; an inner `version` field lets the backfill script
+-- detect rows whose stats were computed by an older algorithm and
+-- recompute them. Null when stats haven't been computed yet (existing
+-- rows pre-backfill; the API has a slow-path fallback for that case).
+
+alter table agentic_trace_replay
+  add column aggregate_stats jsonb;
diff --git a/packages/db/package.json b/packages/db/package.json
index d7caf34d..f3f92311 100644
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -19,6 +19,7 @@
     "db:ingest:supplemental": "dotenv -e ../../.env -- tsx src/ingest-supplemental.ts",
     "db:migrate": "dotenv -e ../../.env -- tsx src/migrate.ts",
     "db:apply-overrides": "dotenv -e ../../.env -- tsx src/apply-overrides.ts",
+    "db:backfill-aggregate-stats": "dotenv -e ../../.env -- tsx src/backfill-aggregate-stats.ts",
     "db:dump": "dotenv -e ../../.env -- tsx src/dump-db.ts",
     "db:load-dump": "dotenv -e ../../.env -- tsx src/load-dump.ts",
     "db:reset": "dotenv -e ../../.env -- tsx src/reset-db.ts",
diff --git a/packages/db/src/backfill-aggregate-stats.ts b/packages/db/src/backfill-aggregate-stats.ts
new file mode 100644
index 00000000..8dd42dce
--- /dev/null
+++ b/packages/db/src/backfill-aggregate-stats.ts
@@ -0,0 +1,150 @@
+/**
+ * Backfill `agentic_trace_replay.aggregate_stats` for rows that are missing it
+ * or were computed by an older `STATS_VERSION`.
+ *
+ * The ingest path now computes stats inline, but existing rows (and rows
+ * whose computation logic has since changed) still need this pass. Run after
+ * applying migration 008 and any time `STATS_VERSION` bumps.
+ *
+ * Strategy:
+ *   - Fetch rows one at a time (server_metrics_json_gz can be hundreds of
+ *     MB decompressed for TP+EP / high-conc points — keeping one in memory
+ *     at a time avoids OOM).
+ *   - Skip rows whose stored `aggregate_stats.version` already matches.
+ *   - Recompute via the same `computeAggregateStats()` helper the ingest
+ *     path uses, so behavior cannot drift.
+ *
+ * Usage:
+ *   pnpm --filter @semianalysisai/inferencex-db db:backfill-aggregate-stats
+ *     [--limit N]   only process the first N candidate rows (useful for
+ *                   smoke-tests on a fresh deploy)
+ *     [--force]     recompute every row, even if version already matches
+ *     [--yes]       skip the confirmation prompt
+ */
+
+import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js';
+import { computeAggregateStats, STATS_VERSION } from './etl/compute-aggregate-stats.js';
+import { createAdminSql } from './etl/db-utils.js';
+
+interface CliFlags {
+  limit: number | null;
+  force: boolean;
+}
+
+function parseFlags(): CliFlags {
+  let limit: number | null = null;
+  let force = false;
+  for (let i = 2; i < process.argv.length; i++) {
+    const arg = process.argv[i]!;
+    if (arg === '--force') force = true;
+    else if (arg === '--limit') {
+      const next = process.argv[++i];
+      if (!next || Number.isNaN(Number(next))) {
+        console.error('--limit requires a numeric argument');
+        process.exit(1);
+      }
+      limit = Number(next);
+    }
+  }
+  return { limit, force };
+}
+
+const flags = parseFlags();
+
+const sql = createAdminSql({
+  noSsl: hasNoSslFlag(),
+  max: 1,
+  onnotice: () => {},
+});
+
+async function main(): Promise<void> {
+  console.log('=== backfill-aggregate-stats ===');
+  console.log(`  STATS_VERSION = ${STATS_VERSION}`);
+  console.log(`  force = ${flags.force}`);
+  console.log(`  limit = ${flags.limit ?? 'none'}`);
+
+  // Find candidates: rows missing stats, or whose stored version is stale.
+  // A non-null stats object without a `version` key yields a NULL cast, and
+  // `NULL <> n` is never true in SQL; coalesce to -1 so such rows count as stale.
+  const candidates = flags.force
+    ? await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `
+    : await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where aggregate_stats is null
+           or coalesce((aggregate_stats->>'version')::int, -1) <> ${STATS_VERSION}
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `;
+
+  if (candidates.length === 0) {
+    console.log('\n  Nothing to do — all rows up to date.');
+    return;
+  }
+
+  console.log(`\n  ${candidates.length} candidate row(s).`);
+  if (!hasYesFlag()) {
+    const ok = await confirm('\nProceed? (y/N) ');
+    if (!ok) {
+      console.log('Aborted.');
+      return;
+    }
+  }
+
+  let ok = 0;
+  let failed = 0;
+  const t0 = Date.now();
+  for (const { id } of candidates) {
+    const start = Date.now();
+    try {
+      // Fetch one row at a time — the json_gz blob is the heavy field.
+      const [row] = await sql<
+        { profile_export_jsonl_gz: Buffer | null; server_metrics_json_gz: Buffer | null }[]
+      >`
+        select profile_export_jsonl_gz, server_metrics_json_gz
+        from agentic_trace_replay
+        where id = ${id}
+      `;
+      if (!row) {
+        console.warn(`  id=${id}: row vanished, skipping`);
+        continue;
+      }
+
+      const stats = await computeAggregateStats({
+        profileBlob: row.profile_export_jsonl_gz,
+        serverBlob: row.server_metrics_json_gz,
+      });
+
+      await sql`
+        update agentic_trace_replay
+        set aggregate_stats = ${sql.json(structuredClone(stats) as unknown as Parameters<typeof sql.json>[0])}
+        where id = ${id}
+      `;
+      ok++;
+      const elapsed = Math.round((Date.now() - start) / 1000);
+      const elapsedTotal = Math.round((Date.now() - t0) / 1000);
+      console.log(
+        `  ✓ id=${id} (${elapsed}s, ${ok}/${candidates.length} done, ${elapsedTotal}s total)`,
+      );
+    } catch (error) {
+      failed++;
+      console.error(`  ✗ id=${id}: ${error instanceof Error ? error.message : String(error)}`);
+    }
+  }
+
+  const totalSec = Math.round((Date.now() - t0) / 1000);
+  console.log(`\n=== backfill complete: ${ok} ok, ${failed} failed in ${totalSec}s ===`);
+  if (failed > 0) process.exitCode = 1;
+}
+
+main()
+  .catch((error) => {
+    console.error('backfill-aggregate-stats failed:', error);
+    process.exitCode = 1;
+  })
+  .finally(() => sql.end());
diff --git a/packages/db/src/etl/compute-aggregate-stats.test.ts b/packages/db/src/etl/compute-aggregate-stats.test.ts
new file mode 100644
index 00000000..de0009de
--- /dev/null
+++ b/packages/db/src/etl/compute-aggregate-stats.test.ts
@@ -0,0 +1,123 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import { STATS_VERSION, computeAggregateStats } from './compute-aggregate-stats.js';
+
+/** Build a minimal `profile_export.jsonl` from a few synthetic requests. */
+function makeProfileBlob(requests: { isl: number; osl: number; rl?: number; ttft?: number }[]) {
+  const lines = requests.map((r, i) =>
+    JSON.stringify({
+      metadata: {
+        benchmark_phase: 'profiling',
+        conversation_id: `conv-${i}`,
+        turn_index: 0,
+      },
+      metrics: {
+        input_sequence_length: { value: r.isl, unit: 'tokens' },
+        output_sequence_length: { value: r.osl, unit: 'tokens' },
+        request_latency: { value: r.rl ?? 1000, unit: 'ms' },
+        time_to_first_token: { value: r.ttft ?? 100, unit: 'ms' },
+      },
+    }),
+  );
+  return gzipSync(Buffer.from(lines.join('\n')));
+}
+
+/** Build a tiny server_metrics_json blob with KV util + prefix cache series. */
+function makeServerBlob() {
+  const json = JSON.stringify({
+    metrics: {
+      'vllm:kv_cache_usage_perc': {
+        series: [
+          {
+            timeslices: [
+              { start_ns: 0, end_ns: 1, avg: 0.2 },
+              { start_ns: 1, end_ns: 2, avg: 0.5 },
+              { start_ns: 2, end_ns: 3, avg: 0.8 },
+            ],
+          },
+        ],
+      },
+      'vllm:prefix_cache_hits': {
+        series: [{ timeslices: [{ start_ns: 0, rate: 80 }] }],
+      },
+      'vllm:prefix_cache_queries': {
+        series: [{ timeslices: [{ start_ns: 0, rate: 100 }] }],
+      },
+    },
+  });
+  return gzipSync(Buffer.from(json));
+}
+
+describe('computeAggregateStats', () => {
+  it('returns the current STATS_VERSION in the bundle', async () => {
+    const stats = await computeAggregateStats({ profileBlob: null, serverBlob: null });
+    expect(stats.version).toBe(STATS_VERSION);
+  });
+
+  it('leaves every metric null when both blobs are null', async () => {
+    const stats = await computeAggregateStats({ profileBlob: null, serverBlob: null });
+    expect(stats.isl).toBeNull();
+    expect(stats.osl).toBeNull();
+    expect(stats.kvCacheUtil).toBeNull();
+    expect(stats.prefixCacheHitRate).toBeNull();
+    expect(stats.normalizedSessionTimeS).toBeNull();
+    expect(stats.p90PrefillTpsPerUser).toBeNull();
+  });
+
+  it('computes ISL/OSL percentiles + derived metrics from the profile blob', async () => {
+    const profileBlob = makeProfileBlob([
+      { isl: 100, osl: 50, rl: 1000, ttft: 100 },
+      { isl: 200, osl: 75, rl: 2000, ttft: 200 },
+      { isl: 300, osl: 100, rl: 3000, ttft: 300 },
+    ]);
+    const stats = await computeAggregateStats({ profileBlob, serverBlob: null });
+
+    expect(stats.isl?.n).toBe(3);
+    expect(stats.isl?.mean).toBeCloseTo(200, 6);
+    expect(stats.osl?.n).toBe(3);
+    expect(stats.osl?.mean).toBeCloseTo(75, 6);
+
+    // Server-side metrics still null when there's no server blob.
+    expect(stats.kvCacheUtil).toBeNull();
+    expect(stats.prefixCacheHitRate).toBeNull();
+
+    // Derived: prefill TPS per turn = isl / (ttft/1000) = 1000 for each, so p90 = 1000.
+    expect(stats.p90PrefillTpsPerUser).toBeCloseTo(1000, 6);
+    // Normalized session time: T̃_i = T_i × (mean_load / load_i), then mean.
+    //   loads = [150, 275, 400], mean_load = 275
+    //   scaled times (s) = [1×275/150, 2×275/275, 3×275/400] = [1.8333, 2, 2.0625]
+    //   mean ≈ 1.9653
+    expect(stats.normalizedSessionTimeS).toBeCloseTo(1.9653, 3);
+  });
+
+  it('computes KV util + prefix hit rate from the server blob alone', async () => {
+    const stats = await computeAggregateStats({
+      profileBlob: null,
+      serverBlob: makeServerBlob(),
+    });
+    expect(stats.kvCacheUtil?.n).toBe(3);
+    expect(stats.kvCacheUtil?.mean).toBeCloseTo(0.5, 6);
+    expect(stats.prefixCacheHitRate?.n).toBe(1);
+    expect(stats.prefixCacheHitRate?.mean).toBeCloseTo(0.8, 6);
+
+    // Profile-derived metrics absent.
+    expect(stats.isl).toBeNull();
+    expect(stats.osl).toBeNull();
+    expect(stats.normalizedSessionTimeS).toBeNull();
+    expect(stats.p90PrefillTpsPerUser).toBeNull();
+  });
+
+  it('tolerates a malformed profile blob by leaving its metrics null', async () => {
+    // A random non-gzip buffer triggers a gunzip error — code path swallows it.
+    const garbage = Buffer.from('not-gzip-data');
+    const stats = await computeAggregateStats({ profileBlob: garbage, serverBlob: null });
+    expect(stats.isl).toBeNull();
+    expect(stats.osl).toBeNull();
+    expect(stats.normalizedSessionTimeS).toBeNull();
+    expect(stats.p90PrefillTpsPerUser).toBeNull();
+    // Version still set so the row is considered "computed".
+    expect(stats.version).toBe(STATS_VERSION);
+  });
+});
diff --git a/packages/db/src/etl/compute-aggregate-stats.ts b/packages/db/src/etl/compute-aggregate-stats.ts
new file mode 100644
index 00000000..a422cfec
--- /dev/null
+++ b/packages/db/src/etl/compute-aggregate-stats.ts
@@ -0,0 +1,147 @@
+/**
+ * Pre-compute the per-row aggregate stats for an `agentic_trace_replay`
+ * blob pair. The output lands in the `aggregate_stats` JSONB column so the
+ * detail page can serve the "Aggregates across configs" view and the
+ * derived chart x-axis modes from a single SQL row read, instead of
+ * parsing the raw blobs on demand.
+ *
+ * Shape is intentionally versioned — bump `STATS_VERSION` whenever the
+ * computation changes so the backfill script knows which rows to recompute.
+ */
+
+import { Readable } from 'node:stream';
+import { createGunzip, gunzipSync } from 'node:zlib';
+
+import { chain } from 'stream-chain';
+
+import { parser } from 'stream-json';
+import { pick } from 'stream-json/filters/pick.js';
+import { streamObject } from 'stream-json/streamers/stream-object.js';
+
+import { computeDerivedFromBlob } from '../queries/derived-agentic-metrics.js';
+import {
+  STATS_VERSION,
+  extractIslOsl,
+  extractServerMetricSamples,
+  percentilesOf,
+  type MetricPercentiles,
+} from '../queries/agentic-aggregates.js';
+
+export { STATS_VERSION };
+
+export interface AggregateStats {
+  version: number;
+  isl: MetricPercentiles | null;
+  osl: MetricPercentiles | null;
+  kvCacheUtil: MetricPercentiles | null;
+  prefixCacheHitRate: MetricPercentiles | null;
+  /** Mean of (per-session e2e time × mean_load / session_load) across sessions. */
+  normalizedSessionTimeS: number | null;
+  /** P90 of per-turn ISL/TTFT pooled across every session's turns. */
+  p90PrefillTpsPerUser: number | null;
+}
+
+/** Metric subtrees we extract via stream-parse on oversized server blobs. */
+const TARGET_METRIC_KEYS = new Set([
+  'vllm:kv_cache_usage_perc',
+  'vllm:gpu_cache_usage_perc',
+  'vllm:prefix_cache_hits',
+  'vllm:prefix_cache_queries',
+  'vllm:gpu_prefix_cache_hits',
+  'vllm:gpu_prefix_cache_queries',
+]);
+
+/**
+ * Stream-parse the gzipped server_metrics_json and collect just the metric
+ * subtrees we care about. Avoids Node's 512 MB max-string-length cap that
+ * `gunzipSync().toString('utf8')` hits on high-conc TP+EP rows.
+ */
+async function streamExtractServer(
+  buffer: Buffer,
+): Promise<{ kvCacheUtil: number[]; prefixCacheHitRate: number[] }> {
+  /* eslint-disable @typescript-eslint/no-explicit-any */
+  const collected: Record<string, unknown> = {};
+  const pipelineStream = chain([
+    Readable.from(buffer),
+    createGunzip(),
+    parser(),
+    pick({ filter: 'metrics' }),
+    streamObject(),
+  ]);
+  await new Promise<void>((resolve, reject) => {
+    (pipelineStream as any).on('data', (chunk: unknown) => {
+      const { key, value } = chunk as { key: string; value: unknown };
+      if (TARGET_METRIC_KEYS.has(key)) collected[key] = value;
+    });
+    (pipelineStream as any).on('end', resolve);
+    (pipelineStream as any).on('error', reject);
+  });
+  /* eslint-enable @typescript-eslint/no-explicit-any */
+  return extractServerMetricSamples(JSON.stringify({ metrics: collected }));
+}
+
+/**
+ * Compute the full versioned stats bundle from a (profile, server-metrics)
+ * blob pair. Either blob may be null (e.g. only the server file existed) —
+ * the corresponding stats just come back null.
+ */
+export async function computeAggregateStats(args: {
+  profileBlob: Buffer | null;
+  serverBlob: Buffer | null;
+}): Promise<AggregateStats> {
+  let islPct: MetricPercentiles | null = null;
+  let oslPct: MetricPercentiles | null = null;
+  let normalized: number | null = null;
+  let prefillP90: number | null = null;
+
+  if (args.profileBlob) {
+    try {
+      const jsonl = gunzipSync(args.profileBlob).toString('utf8');
+      const { isl, osl } = extractIslOsl(jsonl);
+      islPct = percentilesOf(isl);
+      oslPct = percentilesOf(osl);
+      const derived = computeDerivedFromBlob(jsonl);
+      normalized = derived.normalized_session_time_s;
+      prefillP90 = derived.p90_prefill_tps_per_user;
+    } catch {
+      // ignore malformed blob — leave nulls
+    }
+  }
+
+  let kvPct: MetricPercentiles | null = null;
+  let prefixPct: MetricPercentiles | null = null;
+  if (args.serverBlob) {
+    let server: { kvCacheUtil: number[]; prefixCacheHitRate: number[] } | null = null;
+    try {
+      const json = gunzipSync(args.serverBlob).toString('utf8');
+      server = extractServerMetricSamples(json);
+    } catch (error) {
+      const code = error && (error as NodeJS.ErrnoException).code;
+      const msg = error instanceof Error ? error.message : String(error);
+      // ERR_STRING_TOO_LONG hits on high-conc TP+EP rows. Stream-parse to
+      // pull just the metric subtrees we need without materializing the
+      // full 500+ MB JSON string.
+      if (code === 'ERR_STRING_TOO_LONG' || msg.includes('longer than 0x1fffffe8')) {
+        try {
+          server = await streamExtractServer(args.serverBlob);
+        } catch {
+          // stream fallback failed too — leave nulls
+        }
+      }
+    }
+    if (server) {
+      kvPct = percentilesOf(server.kvCacheUtil);
+      prefixPct = percentilesOf(server.prefixCacheHitRate);
+    }
+  }
+
+  return {
+    version: STATS_VERSION,
+    isl: islPct,
+    osl: oslPct,
+    kvCacheUtil: kvPct,
+    prefixCacheHitRate: prefixPct,
+    normalizedSessionTimeS: normalized,
+    p90PrefillTpsPerUser: prefillP90,
+  };
+}
diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
index 8c6d92b6..423f70e7 100644
--- a/packages/db/src/etl/trace-replay-ingest.ts
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -12,6 +12,8 @@ import { gzipSync } from 'node:zlib';
 
 import type postgres from 'postgres';
 
+import { computeAggregateStats } from './compute-aggregate-stats.js';
+
 type Sql = ReturnType<typeof postgres>;
 
 /**
@@ -55,6 +57,15 @@ export async function insertTraceReplay(
   const metricsJsonGz = serverMetricsJson ? gzipSync(serverMetricsJson) : null;
   const metricsJsonSize = serverMetricsJson ? serverMetricsJson.length : null;
 
+  // Pre-compute the aggregate stats so the detail page / aggregates view
+  // doesn't have to re-parse these blobs on every request. The compute
+  // function tolerates one-or-both blobs being null and falls back to a
+  // streaming parser for oversized server_metrics blobs.
+  const aggregateStats = await computeAggregateStats({
+    profileBlob: profileGz,
+    serverBlob: metricsJsonGz,
+  });
+
   const [{ id: traceReplayId }] = await sql<{ id: number }[]>`
     insert into agentic_trace_replay (
       profile_export_jsonl_gz,
@@ -62,7 +73,8 @@ export async function insertTraceReplay(
       server_metrics_csv,
       server_metrics_csv_size,
       server_metrics_json_gz,
-      server_metrics_json_uncompressed_size
+      server_metrics_json_uncompressed_size,
+      aggregate_stats
     )
     values (
       ${profileGz},
@@ -70,7 +82,8 @@ export async function insertTraceReplay(
       ${serverMetricsCsv},
       ${csvSize},
       ${metricsJsonGz},
-      ${metricsJsonSize}
+      ${metricsJsonSize},
+      ${sql.json(structuredClone(aggregateStats) as unknown as Parameters<typeof sql.json>[0])}
     )
     returning id
   `;
diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts
index 22ec7b28..8ac4f678 100644
--- a/packages/db/src/queries/agentic-aggregates.ts
+++ b/packages/db/src/queries/agentic-aggregates.ts
@@ -24,6 +24,14 @@ import { streamObject } from 'stream-json/streamers/stream-object.js';
 
 import type { DbClient } from '../connection.js';
 
+/**
+ * Bump when the aggregate-stats computation algorithm changes — the backfill
+ * script recomputes any row whose stored `aggregate_stats.version` is older.
+ * Lives here (rather than in compute-aggregate-stats.ts) to avoid a circular
+ * import: the compute helper depends on the percentile utilities below.
+ */
+export const STATS_VERSION = 1;
+
 export interface MetricPercentiles {
   mean: number;
   p50: number;
@@ -254,9 +262,55 @@ export async function getAgenticAggregates(
   if (benchmarkResultIds.length === 0) return {};
 
   const result: AgenticAggregateMap = {};
-  // ── Pass 1: profile_export blobs (cheap; large batches). ────────────────
-  for (let i = 0; i < benchmarkResultIds.length; i += PROFILE_CHUNK_SIZE) {
-    const chunk = benchmarkResultIds.slice(i, i + PROFILE_CHUNK_SIZE);
+
+  // Fast path: read the pre-computed `aggregate_stats` JSONB written by the
+  // ingest pipeline (and back-filled by `backfill-aggregate-stats.ts`). One
+  // round-trip pulls everything we need for every requested id with no blob
+  // decompression, so the slow blob-parsing fallback only runs for ids
+  // whose stats are missing or were produced by an older `STATS_VERSION`.
+  const statsRows = (await sql`
+    select
+      br.id as benchmark_result_id,
+      atr.aggregate_stats as stats
+    from benchmark_results br
+    join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = any(${benchmarkResultIds}::bigint[])
+  `) as {
+    benchmark_result_id: number;
+    stats: AggregateStatsRow | null;
+  }[];
+
+  const idsNeedingProfile: number[] = [];
+  const idsNeedingServer: number[] = [];
+  for (const row of statsRows) {
+    const id = Number(row.benchmark_result_id);
+    const agg = blankAggregate(id);
+    if (row.stats && Number(row.stats.version) === STATS_VERSION) {
+      agg.isl = row.stats.isl ?? null;
+      agg.osl = row.stats.osl ?? null;
+      agg.kvCacheUtil = row.stats.kvCacheUtil ?? null;
+      agg.prefixCacheHitRate = row.stats.prefixCacheHitRate ?? null;
+    } else {
+      // No stats (or stale version) — schedule the blob-parse fallback below
+      // so the response still surfaces data. Backfill should drain these.
+      idsNeedingProfile.push(id);
+      idsNeedingServer.push(id);
+    }
+    result[id] = agg;
+  }
+  // Ids that returned no row at all (no trace_replay link) get a blank
+  // aggregate — keep the caller contract: every requested id lands in the map.
+  for (const id of benchmarkResultIds) {
+    if (!(id in result)) result[id] = blankAggregate(id);
+  }
+
+  if (idsNeedingProfile.length === 0 && idsNeedingServer.length === 0) {
+    return result;
+  }
+
+  // ── Fallback Pass 1: profile_export blobs (cheap; large batches). ──────
+  for (let i = 0; i < idsNeedingProfile.length; i += PROFILE_CHUNK_SIZE) {
+    const chunk = idsNeedingProfile.slice(i, i + PROFILE_CHUNK_SIZE);
     const rows = (await sql`
       select
         br.id as benchmark_result_id,
@@ -280,12 +334,12 @@ export async function getAgenticAggregates(
       }
     }
   }
-  // ── Pass 2: server_metrics blobs (huge; one at a time). ────────────────
+  // ── Fallback Pass 2: server_metrics blobs (huge; one at a time). ───────
   // Serial to avoid OOM on the decompressed JSON of a high-conc TP+EP row
   // (>500 MB raw). The aggregator is fronted by a blob cache, so the slow
   // path runs at most once per sibling set.
-  for (let i = 0; i < benchmarkResultIds.length; i += SERVER_CHUNK_SIZE) {
-    const chunk = benchmarkResultIds.slice(i, i + SERVER_CHUNK_SIZE);
+  for (let i = 0; i < idsNeedingServer.length; i += SERVER_CHUNK_SIZE) {
+    const chunk = idsNeedingServer.slice(i, i + SERVER_CHUNK_SIZE);
     const rows = (await sql`
       select
         br.id as benchmark_result_id,
@@ -325,6 +379,17 @@ export async function getAgenticAggregates(
   return result;
 }
 
+/** Shape of the JSONB column when read back via postgres-js. */
+interface AggregateStatsRow {
+  version: number;
+  isl: MetricPercentiles | null;
+  osl: MetricPercentiles | null;
+  kvCacheUtil: MetricPercentiles | null;
+  prefixCacheHitRate: MetricPercentiles | null;
+  normalizedSessionTimeS: number | null;
+  p90PrefillTpsPerUser: number | null;
+}
+
 function blankAggregate(id: number): AgenticAggregate {
   return { id, isl: null, osl: null, kvCacheUtil: null, prefixCacheHitRate: null };
 }
diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts
index ac6fd38d..a14a1727 100644
--- a/packages/db/src/queries/derived-agentic-metrics.ts
+++ b/packages/db/src/queries/derived-agentic-metrics.ts
@@ -21,6 +21,7 @@
 import { gunzipSync } from 'node:zlib';
 
 import type { DbClient } from '../connection.js';
+import { STATS_VERSION } from './agentic-aggregates.js';
 
 export interface DerivedAgenticMetric {
   /** benchmark_results.id this entry belongs to. */
@@ -190,9 +191,50 @@ export async function getDerivedAgenticMetrics(
 ): Promise<DerivedAgenticMetricMap> {
   if (benchmarkResultIds.length === 0) return {};
 
+  const result: DerivedAgenticMetricMap = {};
+
+  // Fast path: read the pre-computed values out of `aggregate_stats`. The
+  // ingest pipeline computes both metrics in the same pass that produces the
+  // percentile bundles, so a single SQL round-trip covers most ids without
+  // touching the gzipped profile blob.
+  const statsRows = (await sql`
+    select
+      br.id as benchmark_result_id,
+      atr.aggregate_stats as stats
+    from benchmark_results br
+    join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = any(${benchmarkResultIds}::bigint[])
+  `) as {
+    benchmark_result_id: number;
+    stats: {
+      version?: number;
+      normalizedSessionTimeS?: number | null;
+      p90PrefillTpsPerUser?: number | null;
+    } | null;
+  }[];
+
+  const idsNeedingBlob: number[] = [];
+  for (const row of statsRows) {
+    const id = Number(row.benchmark_result_id);
+    if (row.stats && Number(row.stats.version) === STATS_VERSION) {
+      result[id] = {
+        id,
+        normalized_session_time_s: row.stats.normalizedSessionTimeS ?? null,
+        p90_prefill_tps_per_user: row.stats.p90PrefillTpsPerUser ?? null,
+      };
+    } else {
+      idsNeedingBlob.push(id);
+    }
+  }
+
+  if (idsNeedingBlob.length === 0) return result;
+
+  // Fallback: parse the profile blob directly. Used for rows whose
+  // `aggregate_stats` is null or computed by an older STATS_VERSION; the
+  // backfill script drains the population so this path should be rare.
   const rows: { benchmark_result_id: number; blob: Buffer }[] = [];
-  for (let i = 0; i < benchmarkResultIds.length; i += QUERY_CHUNK_SIZE) {
-    const chunk = benchmarkResultIds.slice(i, i + QUERY_CHUNK_SIZE);
+  for (let i = 0; i < idsNeedingBlob.length; i += QUERY_CHUNK_SIZE) {
+    const chunk = idsNeedingBlob.slice(i, i + QUERY_CHUNK_SIZE);
     const chunkRows = (await sql`
       select
         br.id as benchmark_result_id,
@@ -205,7 +247,6 @@ export async function getDerivedAgenticMetrics(
     rows.push(...chunkRows);
   }
 
-  const result: DerivedAgenticMetricMap = {};
   for (const row of rows) {
     try {
       const jsonl = gunzipSync(row.blob).toString('utf8');

From 9d9c7c13413c16a147b176691782827d5ee8d21d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 21 May 2026 17:07:30 -0500
Subject: [PATCH 38/96] fix(agentic-aggregates): drop .js extension on
 app-route-traced import
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Turbopack doesn't do TypeScript's `.js → .ts` substitution when an
app-route bundles an intra-package value import, so the new
`STATS_VERSION` import broke the /api/v1/derived-agentic-metrics
route. The same `.js` value-import pattern works for files not pulled
into an app route (e.g. workflow-run.ts → run-overrides.ts) so the
existing intra-package imports are left alone.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/db/src/queries/derived-agentic-metrics.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/db/src/queries/derived-agentic-metrics.ts b/packages/db/src/queries/derived-agentic-metrics.ts
index a14a1727..35a4b76c 100644
--- a/packages/db/src/queries/derived-agentic-metrics.ts
+++ b/packages/db/src/queries/derived-agentic-metrics.ts
@@ -21,7 +21,7 @@
 import { gunzipSync } from 'node:zlib';
 
 import type { DbClient } from '../connection.js';
-import { STATS_VERSION } from './agentic-aggregates.js';
+import { STATS_VERSION } from './agentic-aggregates';
 
 export interface DerivedAgenticMetric {
   /** benchmark_results.id this entry belongs to. */

From 6063d01e2d563951d70dea699edd30a6b06df81a Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 21 May 2026 17:18:31 -0500
Subject: [PATCH 39/96] feat(agentic-detail): pre-compute chart_series at
 ingest time
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Detail page was parsing the entire server_metrics_json_gz blob on every
request — fine for small rows, but TP+EP high-conc rows decompress past
Node's 512 MB max-string-length cap and threw ERR_STRING_TOO_LONG,
killing the page for point 206242 et al.

Extends the Alec-pattern to the time-series path: new `chart_series`
JSONB column holds pre-extracted kvCacheUsage, prefixCacheHitRate,
queueDepth, prefillTps, decodeTps, and promptTokensBySource arrays.
The API fast-path is a single SQL row read; the slow path (compute
from blob, with stream-parse fallback for oversized rows) only runs
for rows whose chart_series is missing or stale-versioned.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../migrations/009_agentic_chart_series.sql   |  19 ++
 packages/db/package.json                      |   1 +
 packages/db/src/backfill-chart-series.ts      | 154 ++++++++++
 .../db/src/etl/compute-chart-series.test.ts   | 129 +++++++++
 packages/db/src/etl/compute-chart-series.ts   | 268 ++++++++++++++++++
 packages/db/src/etl/trace-replay-ingest.ts    |  21 +-
 .../db/src/queries/trace-server-metrics.ts    | 261 +++++------------
 7 files changed, 654 insertions(+), 199 deletions(-)
 create mode 100644 packages/db/migrations/009_agentic_chart_series.sql
 create mode 100644 packages/db/src/backfill-chart-series.ts
 create mode 100644 packages/db/src/etl/compute-chart-series.test.ts
 create mode 100644 packages/db/src/etl/compute-chart-series.ts

diff --git a/packages/db/migrations/009_agentic_chart_series.sql b/packages/db/migrations/009_agentic_chart_series.sql
new file mode 100644
index 00000000..b42718b9
--- /dev/null
+++ b/packages/db/migrations/009_agentic_chart_series.sql
@@ -0,0 +1,19 @@
+-- Pre-computed time-series for the agentic detail page chart.
+--
+-- Sibling to `aggregate_stats` (migration 008): that column stores
+-- per-row percentile/derived *summaries*, this one stores the full
+-- chart-ready time-series arrays (kvCacheUsage, prefixCacheHitRate,
+-- queueDepth, prefillTps, decodeTps, promptTokensBySource).
+--
+-- Without this, the detail page parsed the entire `server_metrics_json_gz`
+-- blob on every request and blew up with ERR_STRING_TOO_LONG on high-conc
+-- TP+EP rows (the blob decompresses past Node's 512 MB max-string-length).
+-- With pre-computed series the page is a single SQL row read.
+--
+-- Shape includes an inner `version` field so the backfill script can
+-- recompute rows whose stored series were produced by an older algorithm.
+-- Null when the series haven't been computed yet; the API has a slow-path
+-- fallback (with stream-parse for oversized blobs) for that case.
+
+alter table agentic_trace_replay
+  add column chart_series jsonb;
diff --git a/packages/db/package.json b/packages/db/package.json
index f3f92311..f97c442a 100644
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -20,6 +20,7 @@
     "db:migrate": "dotenv -e ../../.env -- tsx src/migrate.ts",
     "db:apply-overrides": "dotenv -e ../../.env -- tsx src/apply-overrides.ts",
     "db:backfill-aggregate-stats": "dotenv -e ../../.env -- tsx src/backfill-aggregate-stats.ts",
+    "db:backfill-chart-series": "dotenv -e ../../.env -- tsx src/backfill-chart-series.ts",
     "db:dump": "dotenv -e ../../.env -- tsx src/dump-db.ts",
     "db:load-dump": "dotenv -e ../../.env -- tsx src/load-dump.ts",
     "db:reset": "dotenv -e ../../.env -- tsx src/reset-db.ts",
diff --git a/packages/db/src/backfill-chart-series.ts b/packages/db/src/backfill-chart-series.ts
new file mode 100644
index 00000000..66156b45
--- /dev/null
+++ b/packages/db/src/backfill-chart-series.ts
@@ -0,0 +1,154 @@
+/**
+ * Backfill `agentic_trace_replay.chart_series` for rows that are missing it
+ * or were computed by an older `CHART_SERIES_VERSION`.
+ *
+ * The ingest path now computes the time-series inline, but existing rows
+ * (and rows whose computation logic has since changed) still need this
+ * pass. Run after applying migration 009 and any time `CHART_SERIES_VERSION`
+ * bumps.
+ *
+ * Strategy:
+ *   - Select candidate ids up front, then load each row's blob one at a
+ *     time (server_metrics_json_gz can decompress past 500 MB on high-conc
+ *     TP+EP points — holding one in memory at a time avoids OOM).
+ *   - Skip rows whose stored version already matches.
+ *   - Recompute via the same `computeChartSeries()` helper the ingest
+ *     path uses, so behavior cannot drift.
+ *
+ * Usage:
+ *   pnpm --filter @semianalysisai/inferencex-db db:backfill-chart-series
+ *     [--limit N]   only process the first N candidate rows
+ *     [--force]     recompute every row, even if version already matches
+ *     [--yes]       skip the confirmation prompt
+ */
+
+import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js';
+import { CHART_SERIES_VERSION, computeChartSeries } from './etl/compute-chart-series.js';
+import { createAdminSql } from './etl/db-utils.js';
+
+interface CliFlags {
+  limit: number | null;
+  force: boolean;
+}
+
+/** Parse `--force` and `--limit N`; N must be a positive integer (0 would mean "no limit" downstream). */
+function parseFlags(): CliFlags {
+  let limit: number | null = null;
+  let force = false;
+  for (let i = 2; i < process.argv.length; i++) {
+    const arg = process.argv[i]!;
+    if (arg === '--force') force = true;
+    else if (arg === '--limit') {
+      limit = Number(process.argv[++i]);
+      if (!Number.isSafeInteger(limit) || limit <= 0) {
+        console.error('--limit requires a positive integer argument');
+        process.exit(1);
+      }
+    }
+  }
+  return { limit, force };
+}
+
+const flags = parseFlags();
+
+const sql = createAdminSql({
+  noSsl: hasNoSslFlag(),
+  max: 1,
+  onnotice: () => {},
+});
+
+/** Select stale or missing chart_series rows, confirm, then recompute them one row at a time. */
+async function main(): Promise<void> {
+  console.log('=== backfill-chart-series ===');
+  console.log(`  CHART_SERIES_VERSION = ${CHART_SERIES_VERSION}`);
+  console.log(`  force = ${flags.force}`);
+  console.log(`  limit = ${flags.limit ?? 'none'}`);
+
+  // Only rows that actually have a server_metrics blob can produce a
+  // chart_series. Rows without the blob legitimately keep `chart_series`
+  // null, and the API returns null for them as well (it bails out when
+  // there is no blob to parse), so the page falls into the
+  // "no stored trace_replay blob" branch.
+  const candidates = flags.force
+    ? await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where server_metrics_json_gz is not null
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `
+    : await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where server_metrics_json_gz is not null
+          and (
+            chart_series is null
+            or coalesce((chart_series->>'version')::int, -1) <> ${CHART_SERIES_VERSION}
+          )
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `;
+
+  if (candidates.length === 0) {
+    console.log('\n  Nothing to do — all rows up to date.');
+    return;
+  }
+
+  console.log(`\n  ${candidates.length} candidate row(s).`);
+  if (!hasYesFlag()) {
+    const ok = await confirm('\nProceed? (y/N) ');
+    if (!ok) {
+      console.log('Aborted.');
+      return;
+    }
+  }
+
+  let ok = 0;
+  let failed = 0;
+  const t0 = Date.now();
+  for (const { id } of candidates) {
+    const start = Date.now();
+    try {
+      const [row] = await sql<{ server_metrics_json_gz: Buffer | null }[]>`
+        select server_metrics_json_gz
+        from agentic_trace_replay
+        where id = ${id}
+      `;
+      if (!row) {
+        console.warn(`  id=${id}: row vanished, skipping`);
+        continue;
+      }
+
+      const series = await computeChartSeries(row.server_metrics_json_gz);
+      // Candidates always have a blob, so null means it could not be parsed. Count that as a
+      // failure instead of writing null, reporting success, and re-selecting the row next run.
+      if (series === null) throw new Error('server_metrics blob could not be parsed');
+
+      await sql`
+        update agentic_trace_replay
+        set chart_series = ${sql.json(structuredClone(series) as unknown as Parameters<typeof sql.json>[0])}
+        where id = ${id}
+      `;
+      ok++;
+      const elapsed = Math.round((Date.now() - start) / 1000);
+      const elapsedTotal = Math.round((Date.now() - t0) / 1000);
+      console.log(
+        `  ✓ id=${id} (${elapsed}s, ${ok}/${candidates.length} done, ${elapsedTotal}s total)`,
+      );
+    } catch (error) {
+      failed++;
+      console.error(`  ✗ id=${id}: ${error instanceof Error ? error.message : String(error)}`);
+    }
+  }
+
+  const totalSec = Math.round((Date.now() - t0) / 1000);
+  console.log(`\n=== backfill complete: ${ok} ok, ${failed} failed in ${totalSec}s ===`);
+  if (failed > 0) process.exitCode = 1;
+}
+
+main()
+  .catch((error) => {
+    console.error('backfill-chart-series failed:', error);
+    process.exitCode = 1;
+  })
+  .finally(() => sql.end());
diff --git a/packages/db/src/etl/compute-chart-series.test.ts b/packages/db/src/etl/compute-chart-series.test.ts
new file mode 100644
index 00000000..dafc7200
--- /dev/null
+++ b/packages/db/src/etl/compute-chart-series.test.ts
@@ -0,0 +1,129 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import { CHART_SERIES_VERSION, computeChartSeries } from './compute-chart-series.js';
+
+/**
+ * Build a minimal server_metrics_json blob covering the metrics the chart
+ * consumes. Each timeslice is one second long starting at t=0.
+ */
+function makeBlob(opts?: {
+  prefixHits?: number;
+  prefixQueries?: number;
+  promptTokensRate?: number;
+}) {
+  const json = JSON.stringify({
+    metrics: {
+      'vllm:kv_cache_usage_perc': {
+        series: [
+          {
+            timeslices: [
+              { start_ns: 0, end_ns: 1e9, avg: 0.1 },
+              { start_ns: 1e9, end_ns: 2e9, avg: 0.4 },
+              { start_ns: 2e9, end_ns: 3e9, avg: 0.7 },
+            ],
+          },
+        ],
+      },
+      'vllm:prefix_cache_hits': {
+        series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.prefixHits ?? 75 }] }],
+      },
+      'vllm:prefix_cache_queries': {
+        series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.prefixQueries ?? 100 }] }],
+      },
+      'vllm:num_requests_running': {
+        series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, avg: 5 }] }],
+      },
+      'vllm:num_requests_waiting': {
+        series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, avg: 2 }] }],
+      },
+      'vllm:prompt_tokens': {
+        series: [
+          { timeslices: [{ start_ns: 0, end_ns: 1e9, rate: opts?.promptTokensRate ?? 1000 }] },
+        ],
+      },
+      'vllm:generation_tokens': {
+        series: [{ timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 500 }] }],
+      },
+      'vllm:prompt_tokens_by_source': {
+        series: [
+          {
+            labels: { source: 'local_cache_hit' },
+            timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 200 }],
+          },
+          {
+            labels: { source: 'miss' },
+            timeslices: [{ start_ns: 0, end_ns: 1e9, rate: 800 }],
+          },
+        ],
+      },
+    },
+  });
+  return gzipSync(Buffer.from(json));
+}
+
+describe('computeChartSeries', () => {
+  it('returns null when the blob is null', async () => {
+    expect(await computeChartSeries(null)).toBeNull();
+  });
+
+  it('returns the current CHART_SERIES_VERSION in the bundle', async () => {
+    const series = await computeChartSeries(makeBlob());
+    expect(series?.version).toBe(CHART_SERIES_VERSION);
+  });
+
+  it('extracts kvCacheUsage points with t=seconds-from-start', async () => {
+    const series = await computeChartSeries(makeBlob());
+    expect(series?.kvCacheUsage).toEqual([
+      { t: 0, value: 0.1 },
+      { t: 1, value: 0.4 },
+      { t: 2, value: 0.7 },
+    ]);
+  });
+
+  it('computes prefixCacheHitRate as hits.rate / queries.rate', async () => {
+    const series = await computeChartSeries(makeBlob({ prefixHits: 80, prefixQueries: 100 }));
+    expect(series?.prefixCacheHitRate).toEqual([{ t: 0, value: 0.8 }]);
+  });
+
+  it('drops prefixCacheHitRate windows where queries.rate is 0', async () => {
+    const series = await computeChartSeries(makeBlob({ prefixHits: 5, prefixQueries: 0 }));
+    expect(series?.prefixCacheHitRate).toEqual([]);
+  });
+
+  it('pairs running + waiting into queueDepth points', async () => {
+    const series = await computeChartSeries(makeBlob());
+    expect(series?.queueDepth).toEqual([{ t: 0, running: 5, waiting: 2, total: 7 }]);
+  });
+
+  it('extracts prefillTps + decodeTps from counter rates', async () => {
+    const series = await computeChartSeries(makeBlob());
+    expect(series?.prefillTps).toEqual([{ t: 0, value: 1000 }]);
+    expect(series?.decodeTps).toEqual([{ t: 0, value: 500 }]);
+  });
+
+  it('splits promptTokensBySource by label and skips empty series', async () => {
+    const series = await computeChartSeries(makeBlob());
+    expect(Object.keys(series!.promptTokensBySource).toSorted()).toEqual([
+      'local_cache_hit',
+      'miss',
+    ]);
+    expect(series!.promptTokensBySource['local_cache_hit']).toEqual([{ t: 0, value: 200 }]);
+    expect(series!.promptTokensBySource['miss']).toEqual([{ t: 0, value: 800 }]);
+  });
+
+  it('computes timing metadata from the widest metric window', async () => {
+    const series = await computeChartSeries(makeBlob());
+    // kvCacheUsage has the widest window (0 → 3e9), so startNs=0, endNs=3e9.
+    expect(series?.startNs).toBe(0);
+    expect(series?.endNs).toBe(3e9);
+    expect(series?.durationS).toBeCloseTo(3, 6);
+    expect(series?.timeslicesCount).toBe(3);
+  });
+
+  it('returns null on a malformed (non-gzip) blob', async () => {
+    const result = await computeChartSeries(Buffer.from('not-gzip-data'));
+    expect(result).toBeNull();
+  });
+});
diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
new file mode 100644
index 00000000..3cb4181b
--- /dev/null
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -0,0 +1,268 @@
+/**
+ * Pre-compute the time-series for the agentic detail page chart, so the
+ * API doesn't have to gunzip + JSON-parse a multi-hundred-MB blob on every
+ * request. The output lands in `agentic_trace_replay.chart_series` and is
+ * read directly by `getTraceServerMetrics`.
+ *
+ * Versioned so the backfill script knows which rows are stale — bump
+ * `CHART_SERIES_VERSION` whenever the extraction algorithm changes.
+ */
+
+import { Readable } from 'node:stream';
+import { createGunzip, gunzipSync } from 'node:zlib';
+
+import { chain } from 'stream-chain';
+
+import { parser } from 'stream-json';
+import { pick } from 'stream-json/filters/pick.js';
+import { streamObject } from 'stream-json/streamers/stream-object.js';
+
+/** Bump when the extraction algorithm changes — backfill recomputes anything older. */
+export const CHART_SERIES_VERSION = 1;
+
+export interface TimeSeriesPoint {
+  /** Seconds from benchmark start. */
+  t: number;
+  value: number;
+}
+
+export interface QueueDepthPoint {
+  t: number;
+  running: number;
+  waiting: number;
+  total: number;
+}
+
+export interface ChartSeries {
+  version: number;
+  /** Smallest first-slice `start_ns` (ns) over all metric series, 0 if none; the origin for every `t`. */
+  startNs: number;
+  /** Largest last-slice `end_ns` (ns) over all metric series, 0 if none. */
+  endNs: number;
+  /** `(endNs - startNs) / 1e9` in seconds, or 0 when `endNs` does not exceed `startNs`. */
+  durationS: number;
+  /** Length of the longest `timeslices` array found in any single metric series. */
+  timeslicesCount: number;
+  kvCacheUsage: TimeSeriesPoint[];
+  prefixCacheHitRate: TimeSeriesPoint[];
+  queueDepth: QueueDepthPoint[];
+  promptTokensBySource: Record<string, TimeSeriesPoint[]>;
+  prefillTps: TimeSeriesPoint[];
+  decodeTps: TimeSeriesPoint[];
+}
+
+// ── Raw blob shapes (subset we read) ────────────────────────────────────
+
+interface RawSlice {
+  start_ns?: number;
+  end_ns?: number;
+  avg?: number;
+  rate?: number;
+}
+
+interface RawSeries {
+  labels?: Record<string, string>;
+  timeslices?: RawSlice[];
+}
+
+interface RawMetric {
+  series?: RawSeries[];
+}
+
+type MetricsMap = Record<string, RawMetric>;
+
+/** The set of metric subtrees the chart consumes. */
+const CHART_METRIC_KEYS = new Set([
+  'vllm:kv_cache_usage_perc',
+  'vllm:gpu_cache_usage_perc',
+  'vllm:prefix_cache_hits',
+  'vllm:prefix_cache_queries',
+  'vllm:num_requests_running',
+  'vllm:num_requests_waiting',
+  'vllm:prompt_tokens',
+  'vllm:generation_tokens',
+  'vllm:prompt_tokens_by_source',
+]);
+
+/**
+ * Stream-parse the gzipped server_metrics_json and collect only the metric
+ * subtrees the chart needs. Avoids Node's 512 MB max-string-length cap that
+ * `gunzipSync(buffer).toString('utf8')` trips on high-conc TP+EP rows.
+ */
+async function streamCollectMetrics(buffer: Buffer): Promise<MetricsMap> {
+  /* eslint-disable @typescript-eslint/no-explicit-any */
+  const collected: MetricsMap = {};
+  const pipeline = chain([
+    Readable.from(buffer),
+    createGunzip(),
+    parser(),
+    pick({ filter: 'metrics' }),
+    streamObject(),
+  ]);
+  await new Promise<void>((resolve, reject) => {
+    (pipeline as any).on('data', (chunk: unknown) => {
+      const { key, value } = chunk as { key: string; value: RawMetric };
+      if (CHART_METRIC_KEYS.has(key)) collected[key] = value;
+    });
+    (pipeline as any).on('end', resolve);
+    (pipeline as any).on('error', reject);
+  });
+  /* eslint-enable @typescript-eslint/no-explicit-any */
+  return collected;
+}
+
+/**
+ * Parse the gzipped server_metrics blob into the metric map. Tries the
+ * synchronous fast path first; falls back to stream-parse on
+ * ERR_STRING_TOO_LONG so high-conc TP+EP rows succeed.
+ */
+async function parseMetrics(buffer: Buffer): Promise<MetricsMap> {
+  try {
+    const obj = JSON.parse(gunzipSync(buffer).toString('utf8')) as { metrics?: MetricsMap };
+    return obj.metrics ?? {};
+  } catch (error) {
+    const code = error && (error as NodeJS.ErrnoException).code;
+    const msg = error instanceof Error ? error.message : String(error);
+    if (code === 'ERR_STRING_TOO_LONG' || msg.includes('longer than 0x1fffffe8')) {
+      return await streamCollectMetrics(buffer);
+    }
+    throw error;
+  }
+}
+
+/**
+ * Build chart-ready time-series arrays from a gzipped server_metrics blob.
+ * Returns null for a null blob or one that fails to gunzip/parse. Ingest,
+ * backfill, and the API slow path all call this so their output matches.
+ */
+export async function computeChartSeries(blob: Buffer | null): Promise<ChartSeries | null> {
+  if (!blob) return null;
+  let metrics: MetricsMap;
+  try {
+    metrics = await parseMetrics(blob);
+  } catch {
+    // Malformed blob → no series (caller treats null as "no data").
+    return null;
+  }
+  return buildSeriesFromMetrics(metrics);
+}
+
+/** Pull the first series under a metric key, or undefined. */
+function firstSeries(metrics: MetricsMap, name: string): RawSeries | undefined {
+  const s = metrics[name]?.series;
+  return s && s.length > 0 ? s[0] : undefined;
+}
+
+function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
+  // Timing reference: smallest start_ns and largest end_ns across every
+  // timeslice we extracted. (Same logic as the original getTraceServerMetrics
+  // — looking at every metric gives the widest possible window even if some
+  // series start late.)
+  let startNs = Number.POSITIVE_INFINITY;
+  let endNs = 0;
+  let timeslicesCount = 0;
+  for (const metricMeta of Object.values(metrics)) {
+    for (const s of metricMeta?.series ?? []) {
+      const ts = s.timeslices ?? [];
+      if (ts.length === 0) continue;
+      timeslicesCount = Math.max(timeslicesCount, ts.length);
+      const first = ts[0]!;
+      const last = ts.at(-1)!;
+      if (typeof first.start_ns === 'number' && first.start_ns < startNs) startNs = first.start_ns;
+      if (typeof last.end_ns === 'number' && last.end_ns > endNs) endNs = last.end_ns;
+    }
+  }
+  if (!Number.isFinite(startNs)) startNs = 0;
+  const tOf = (ns: number) => (ns - startNs) / 1e9;
+
+  // KV cache usage (gauge, 0..1)
+  const kvCacheUsage: TimeSeriesPoint[] = [];
+  const kvSeries =
+    firstSeries(metrics, 'vllm:kv_cache_usage_perc') ??
+    firstSeries(metrics, 'vllm:gpu_cache_usage_perc');
+  for (const ts of kvSeries?.timeslices ?? []) {
+    if (typeof ts.avg === 'number' && typeof ts.start_ns === 'number') {
+      kvCacheUsage.push({ t: tOf(ts.start_ns), value: ts.avg });
+    }
+  }
+
+  // Prefix cache hit rate per scrape (Δhits / Δqueries from counter rate).
+  const hitsTs = firstSeries(metrics, 'vllm:prefix_cache_hits')?.timeslices ?? [];
+  const qsTs = firstSeries(metrics, 'vllm:prefix_cache_queries')?.timeslices ?? [];
+  const prefixCacheHitRate: TimeSeriesPoint[] = [];
+  const minLen = Math.min(hitsTs.length, qsTs.length);
+  for (let i = 0; i < minLen; i++) {
+    const h = hitsTs[i]!;
+    const q = qsTs[i]!;
+    if (
+      typeof q.rate === 'number' &&
+      q.rate > 0 &&
+      typeof h.rate === 'number' &&
+      typeof h.start_ns === 'number'
+    ) {
+      prefixCacheHitRate.push({ t: tOf(h.start_ns), value: h.rate / q.rate });
+    }
+  }
+
+  // Queue depth: pair running + waiting by index.
+  const runTs = firstSeries(metrics, 'vllm:num_requests_running')?.timeslices ?? [];
+  const waitTs = firstSeries(metrics, 'vllm:num_requests_waiting')?.timeslices ?? [];
+  const queueDepth: QueueDepthPoint[] = [];
+  const qlen = Math.min(runTs.length, waitTs.length);
+  for (let i = 0; i < qlen; i++) {
+    const r = runTs[i]!;
+    const w = waitTs[i]!;
+    if (typeof r.start_ns !== 'number') continue;
+    const running = typeof r.avg === 'number' ? r.avg : 0;
+    const waiting = typeof w.avg === 'number' ? w.avg : 0;
+    queueDepth.push({
+      t: tOf(r.start_ns),
+      running,
+      waiting,
+      total: running + waiting,
+    });
+  }
+
+  // Throughput: extract counter `rate` (already per-second from aiperf).
+  const counterRate = (name: string): TimeSeriesPoint[] => {
+    const s = firstSeries(metrics, name);
+    if (!s) return [];
+    const out: TimeSeriesPoint[] = [];
+    for (const ts of s.timeslices ?? []) {
+      if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {
+        out.push({ t: tOf(ts.start_ns), value: ts.rate });
+      }
+    }
+    return out;
+  };
+  const prefillTps = counterRate('vllm:prompt_tokens');
+  const decodeTps = counterRate('vllm:generation_tokens');
+
+  // Per-source prompt tokens — emit one TS array per source label.
+  const promptTokensBySource: Record<string, TimeSeriesPoint[]> = {};
+  for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) {
+    const labels = series.labels ?? {};
+    const source = labels['source'] ?? labels['reason'] ?? labels['kind'] ?? JSON.stringify(labels);
+    const arr: TimeSeriesPoint[] = [];
+    for (const ts of series.timeslices ?? []) {
+      if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {
+        arr.push({ t: tOf(ts.start_ns), value: ts.rate });
+      }
+    }
+    if (arr.length > 0) promptTokensBySource[source] = arr;
+  }
+
+  return {
+    version: CHART_SERIES_VERSION,
+    startNs,
+    endNs,
+    durationS: endNs > startNs ? (endNs - startNs) / 1e9 : 0,
+    timeslicesCount,
+    kvCacheUsage,
+    prefixCacheHitRate,
+    queueDepth,
+    promptTokensBySource,
+    prefillTps,
+    decodeTps,
+  };
+}
diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
index 423f70e7..f70200ff 100644
--- a/packages/db/src/etl/trace-replay-ingest.ts
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -13,6 +13,7 @@ import { gzipSync } from 'node:zlib';
 import type postgres from 'postgres';
 
 import { computeAggregateStats } from './compute-aggregate-stats.js';
+import { computeChartSeries } from './compute-chart-series.js';
 
 type Sql = ReturnType<typeof postgres>;
 
@@ -57,14 +58,14 @@ export async function insertTraceReplay(
   const metricsJsonGz = serverMetricsJson ? gzipSync(serverMetricsJson) : null;
   const metricsJsonSize = serverMetricsJson ? serverMetricsJson.length : null;
 
-  // Pre-compute the aggregate stats so the detail page / aggregates view
-  // doesn't have to re-parse these blobs on every request. The compute
-  // function tolerates one-or-both blobs being null and falls back to a
+  // Pre-compute the aggregate stats + chart-ready time-series so the
+  // detail page / aggregates view doesn't have to re-parse these blobs on
+  // every request. Both helpers tolerate a null blob and fall back to a
   // streaming parser for oversized server_metrics blobs.
-  const aggregateStats = await computeAggregateStats({
-    profileBlob: profileGz,
-    serverBlob: metricsJsonGz,
-  });
+  const [aggregateStats, chartSeries] = await Promise.all([
+    computeAggregateStats({ profileBlob: profileGz, serverBlob: metricsJsonGz }),
+    computeChartSeries(metricsJsonGz),
+  ]);
 
   const [{ id: traceReplayId }] = await sql<{ id: number }[]>`
     insert into agentic_trace_replay (
@@ -74,7 +75,8 @@ export async function insertTraceReplay(
       server_metrics_csv_size,
       server_metrics_json_gz,
       server_metrics_json_uncompressed_size,
-      aggregate_stats
+      aggregate_stats,
+      chart_series
     )
     values (
       ${profileGz},
@@ -83,7 +85,8 @@ export async function insertTraceReplay(
       ${csvSize},
       ${metricsJsonGz},
       ${metricsJsonSize},
-      ${sql.json(structuredClone(aggregateStats) as unknown as Parameters<typeof sql.json>[0])}
+      ${sql.json(structuredClone(aggregateStats) as unknown as Parameters<typeof sql.json>[0])},
+      ${chartSeries === null ? null : sql.json(structuredClone(chartSeries) as unknown as Parameters<typeof sql.json>[0])}
     )
     returning id
   `;
diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts
index 822ae633..624b6ed3 100644
--- a/packages/db/src/queries/trace-server-metrics.ts
+++ b/packages/db/src/queries/trace-server-metrics.ts
@@ -1,73 +1,26 @@
 /**
- * Parse aiperf's `server_metrics_export.json` blob (gzipped in
- * `agentic_trace_replay.server_metrics_json_gz`) and return a slim, chart-ready
- * time-series for one benchmark point.
+ * Time-series view of one agentic benchmark point: chart-ready arrays for
+ * KV utilization, prefix-cache hit rate, queue depth, prefill + decode TPS,
+ * and per-source prompt-token counts.
  *
- * The raw JSON has shape:
- *   metrics: {
- *     "<metric_name>": {
- *       series: [
- *         {
- *           labels: { ... },
- *           stats: { ... summary ... },
- *           timeslices: [
- *             { start_ns, end_ns, avg, min, max }            // gauges
- *             { start_ns, end_ns, total, rate }              // counters
- *           ]
- *         }
- *       ]
- *     }
- *   }
- *
- * Timeslices are ~1 Hz windows. The benchmark window can be tens of minutes
- * (1800+ windows). We return them as `[{ t, ...}]` arrays with `t` measured
- * in seconds from the benchmark start so the frontend doesn't need to
- * shuffle bigint nanoseconds around.
+ * Backed by `agentic_trace_replay.chart_series` (pre-computed at ingest
+ * time, see `etl/compute-chart-series.ts`). The fast path is a single SQL
+ * row read; the slow path re-computes from `server_metrics_json_gz` and is
+ * only taken when the column is missing or the stored
+ * `CHART_SERIES_VERSION` is stale (the backfill script should drain that).
  */
 
-import { gunzipSync } from 'node:zlib';
+import {
+  CHART_SERIES_VERSION,
+  computeChartSeries,
+  type ChartSeries,
+  type QueueDepthPoint,
+  type TimeSeriesPoint,
+} from '../etl/compute-chart-series';
 
 import type { DbClient } from '../connection.js';
 
-interface GaugeSlice {
-  start_ns: number;
-  end_ns: number;
-  avg?: number;
-  min?: number;
-  max?: number;
-}
-
-interface CounterSlice {
-  start_ns: number;
-  end_ns: number;
-  total?: number;
-  rate?: number;
-}
-
-interface Series {
-  endpoint_url?: string;
-  labels?: Record<string, string>;
-  stats?: Record<string, unknown>;
-  timeslices?: (GaugeSlice & CounterSlice)[];
-}
-
-interface MetricsJson {
-  metrics?: Record<string, { type?: string; description?: string; series?: Series[] }>;
-}
-
-export interface TimeSeriesPoint {
-  /** Seconds from benchmark start. */
-  t: number;
-  value: number;
-}
-
-export interface QueueDepthPoint {
-  t: number;
-  running: number;
-  waiting: number;
-  /** Optional total — frontend can compute too. */
-  total: number;
-}
+export type { TimeSeriesPoint, QueueDepthPoint } from '../etl/compute-chart-series';
 
 export interface PointMeta {
   id: number;
@@ -120,30 +73,13 @@ export interface TraceServerMetrics {
   decodeTps: TimeSeriesPoint[];
 }
 
-export async function getTraceServerMetrics(
-  sql: DbClient,
-  benchmarkResultId: number,
-): Promise<TraceServerMetrics | null> {
-  const rows = (await sql`
-    select
-      atr.server_metrics_json_gz as blob,
-      br.id, c.hardware, c.framework, c.model, c.precision, c.spec_method, c.disagg,
-      br.conc, br.offload_mode, br.isl, br.osl, br.benchmark_type,
-      br.date::text,
-      case when wr.html_url is not null then wr.html_url || '/attempts/' || wr.run_attempt else null end as run_url,
-      (br.metrics ->> 'server_gpu_cache_hit_rate')::numeric as server_gpu_cache_hit_rate,
-      (br.metrics ->> 'server_cpu_cache_hit_rate')::numeric as server_cpu_cache_hit_rate
-    from benchmark_results br
-    join configs c on c.id = br.config_id
-    join workflow_runs wr on wr.id = br.workflow_run_id
-    left join agentic_trace_replay atr on atr.id = br.trace_replay_id
-    where br.id = ${benchmarkResultId}
-  `) as unknown as ({ blob: Buffer | null } & PointMeta)[];
-  const row = rows[0];
-  if (!row) return null;
-  const blob = row.blob;
-  if (!blob) return null;
-  const pointMeta: PointMeta = {
+interface RawMetaRow extends PointMeta {
+  blob: Buffer | null;
+  chart_series: ChartSeries | null;
+}
+
+function buildMeta(row: RawMetaRow): PointMeta {
+  return {
     id: Number(row.id),
     hardware: row.hardware,
     framework: row.framework,
@@ -163,113 +99,58 @@ export async function getTraceServerMetrics(
     server_cpu_cache_hit_rate:
       row.server_cpu_cache_hit_rate === null ? null : Number(row.server_cpu_cache_hit_rate),
   };
+}
 
-  const parsed = JSON.parse(gunzipSync(blob).toString('utf8')) as MetricsJson;
-  const metrics = parsed.metrics ?? {};
-
-  const firstSeries = (name: string): Series | undefined => {
-    const s = metrics[name]?.series;
-    return s && s.length > 0 ? s[0] : undefined;
+function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics {
+  return {
+    meta,
+    startNs: series.startNs,
+    endNs: series.endNs,
+    durationS: series.durationS,
+    timeslicesCount: series.timeslicesCount,
+    kvCacheUsage: series.kvCacheUsage,
+    prefixCacheHitRate: series.prefixCacheHitRate,
+    queueDepth: series.queueDepth,
+    promptTokensBySource: series.promptTokensBySource,
+    prefillTps: series.prefillTps,
+    decodeTps: series.decodeTps,
   };
+}
 
-  // Compute timing reference from the first gauge metric we can find.
-  let startNs = Number.POSITIVE_INFINITY;
-  let endNs = 0;
-  let timeslicesCount = 0;
-  for (const metricMeta of Object.values(metrics)) {
-    for (const s of metricMeta?.series ?? []) {
-      const ts = s.timeslices ?? [];
-      if (ts.length === 0) continue;
-      timeslicesCount = Math.max(timeslicesCount, ts.length);
-      const first = ts[0]!;
-      const last = ts.at(-1)!;
-      if (typeof first.start_ns === 'number' && first.start_ns < startNs) startNs = first.start_ns;
-      if (typeof last.end_ns === 'number' && last.end_ns > endNs) endNs = last.end_ns;
-    }
-  }
-  if (!Number.isFinite(startNs)) startNs = 0;
-  const tOf = (ns: number) => (ns - startNs) / 1e9;
-
-  // KV cache usage (gauge, 0..1)
-  const kvCacheUsage: TimeSeriesPoint[] = [];
-  const kvSeries =
-    firstSeries('vllm:kv_cache_usage_perc') ?? firstSeries('vllm:gpu_cache_usage_perc');
-  for (const ts of kvSeries?.timeslices ?? []) {
-    if (typeof ts.avg === 'number') {
-      kvCacheUsage.push({ t: tOf(ts.start_ns), value: ts.avg });
-    }
-  }
-
-  // Prefix cache hit rate per scrape (Δhits / Δqueries from counter rate).
-  // `rate` is already per-window delta; we just divide.
-  const hitsTs = firstSeries('vllm:prefix_cache_hits')?.timeslices ?? [];
-  const qsTs = firstSeries('vllm:prefix_cache_queries')?.timeslices ?? [];
-  const prefixCacheHitRate: TimeSeriesPoint[] = [];
-  const minLen = Math.min(hitsTs.length, qsTs.length);
-  for (let i = 0; i < minLen; i++) {
-    const h = hitsTs[i]!;
-    const q = qsTs[i]!;
-    if (typeof q.rate === 'number' && q.rate > 0 && typeof h.rate === 'number') {
-      prefixCacheHitRate.push({ t: tOf(h.start_ns), value: h.rate / q.rate });
-    }
-  }
-
-  // Queue depth: pair running + waiting by index.
-  const runTs = firstSeries('vllm:num_requests_running')?.timeslices ?? [];
-  const waitTs = firstSeries('vllm:num_requests_waiting')?.timeslices ?? [];
-  const queueDepth: QueueDepthPoint[] = [];
-  const qlen = Math.min(runTs.length, waitTs.length);
-  for (let i = 0; i < qlen; i++) {
-    const r = runTs[i]!;
-    const w = waitTs[i]!;
-    const running = typeof r.avg === 'number' ? r.avg : 0;
-    const waiting = typeof w.avg === 'number' ? w.avg : 0;
-    queueDepth.push({
-      t: tOf(r.start_ns),
-      running,
-      waiting,
-      total: running + waiting,
-    });
-  }
-
-  // Throughput: extract counter `rate` (already per-second delta from aiperf).
-  const counterRateSeries = (name: string): TimeSeriesPoint[] => {
-    const s = firstSeries(name);
-    if (!s) return [];
-    const out: TimeSeriesPoint[] = [];
-    for (const ts of s.timeslices ?? []) {
-      if (typeof ts.rate === 'number') out.push({ t: tOf(ts.start_ns), value: ts.rate });
-    }
-    return out;
-  };
-  const prefillTps = counterRateSeries('vllm:prompt_tokens');
-  const decodeTps = counterRateSeries('vllm:generation_tokens');
+/** Load one benchmark point's chart series plus metadata; null when the row is missing or no series can be produced. */
+export async function getTraceServerMetrics(
+  sql: DbClient,
+  benchmarkResultId: number,
+): Promise<TraceServerMetrics | null> {
+  // The blob is selected only when chart_series is missing or stale, so the fast path never transfers it.
+  const rows = (await sql`
+    select
+      case when (atr.chart_series->>'version') = ${String(CHART_SERIES_VERSION)} then null else atr.server_metrics_json_gz end as blob,
+      atr.chart_series,
+      br.id, c.hardware, c.framework, c.model, c.precision, c.spec_method, c.disagg,
+      br.conc, br.offload_mode, br.isl, br.osl, br.benchmark_type,
+      br.date::text,
+      case when wr.html_url is not null then wr.html_url || '/attempts/' || wr.run_attempt else null end as run_url,
+      (br.metrics ->> 'server_gpu_cache_hit_rate')::numeric as server_gpu_cache_hit_rate,
+      (br.metrics ->> 'server_cpu_cache_hit_rate')::numeric as server_cpu_cache_hit_rate
+    from benchmark_results br
+    join configs c on c.id = br.config_id
+    join workflow_runs wr on wr.id = br.workflow_run_id
+    left join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = ${benchmarkResultId}
+  `) as unknown as RawMetaRow[];
+  const row = rows[0];
+  if (!row) return null;
+  const meta = buildMeta(row);
 
-  // Per-source prompt tokens — emit one TS array per source label.
-  const promptTokensBySource: Record<string, TimeSeriesPoint[]> = {};
-  for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) {
-    const labels = series.labels ?? {};
-    const source = labels['source'] ?? labels['reason'] ?? labels['kind'] ?? JSON.stringify(labels);
-    const arr: TimeSeriesPoint[] = [];
-    for (const ts of series.timeslices ?? []) {
-      if (typeof ts.rate === 'number') {
-        arr.push({ t: tOf(ts.start_ns), value: ts.rate });
-      }
-    }
-    if (arr.length > 0) promptTokensBySource[source] = arr;
+  // Fast path: pre-computed chart_series at the current version.
+  if (row.chart_series && Number(row.chart_series.version) === CHART_SERIES_VERSION) {
+    return merge(meta, row.chart_series);
   }
 
-  return {
-    meta: pointMeta,
-    startNs,
-    endNs,
-    durationS: endNs > startNs ? (endNs - startNs) / 1e9 : 0,
-    timeslicesCount,
-    kvCacheUsage,
-    prefixCacheHitRate,
-    queueDepth,
-    promptTokensBySource,
-    prefillTps,
-    decodeTps,
-  };
+  // Slow path: no usable chart_series. Without a blob there is nothing to compute from;
+  // otherwise `computeChartSeries` stream-parses blobs that trip ERR_STRING_TOO_LONG.
+  if (!row.blob) return null;
+  const series = await computeChartSeries(row.blob);
+  if (!series) return null;
+  return merge(meta, series);
 }

From 24fe8feae5175d80a53002fd4f3b3b77bb42e8c4 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 22 May 2026 14:00:37 -0500
Subject: [PATCH 40/96] feat(agentic-detail): per-request Gantt timeline view
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a "Request timeline" view on the agentic point detail page, modeled
after the agent-timeline in semianalysis-claude-code-proxy. Each row is
a conversation (with sub-agent rows nested + indented under their
parent), each bar is one HTTP request from request_start → request_end
with a thin lead-in showing credit_issued → request_start queue wait.

Hover any bar for per-request stats (TTFT, ISL/OSL, queue wait, phase,
worker, agent depth). Move anywhere over the chart for a crosshair
that shows the cursor time + how many requests are running / waiting /
completed at that instant — O(log n) sweep counts so it stays smooth
on big runs.

Same Alec pattern as 008/009: migration 010 adds a `request_timeline`
JSONB column on agentic_trace_replay, computed at ingest time and
backfilled for existing rows. ~30 KB per row vs the ~1-3 MB raw blob.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../src/app/api/v1/request-timeline/route.ts  |  40 +
 .../agentic-point/agentic-point-detail.tsx    |  25 +-
 .../agentic-point/request-timeline.tsx        | 821 ++++++++++++++++++
 .../app/src/hooks/api/use-request-timeline.ts |  59 ++
 .../010_agentic_request_timeline.sql          |  15 +
 packages/db/package.json                      |   1 +
 packages/db/src/backfill-request-timeline.ts  | 144 +++
 .../src/etl/compute-request-timeline.test.ts  | 153 ++++
 .../db/src/etl/compute-request-timeline.ts    | 182 ++++
 packages/db/src/etl/trace-replay-ingest.ts    |  18 +-
 packages/db/src/queries/request-timeline.ts   |  48 +
 11 files changed, 1498 insertions(+), 8 deletions(-)
 create mode 100644 packages/app/src/app/api/v1/request-timeline/route.ts
 create mode 100644 packages/app/src/components/inference/agentic-point/request-timeline.tsx
 create mode 100644 packages/app/src/hooks/api/use-request-timeline.ts
 create mode 100644 packages/db/migrations/010_agentic_request_timeline.sql
 create mode 100644 packages/db/src/backfill-request-timeline.ts
 create mode 100644 packages/db/src/etl/compute-request-timeline.test.ts
 create mode 100644 packages/db/src/etl/compute-request-timeline.ts
 create mode 100644 packages/db/src/queries/request-timeline.ts

diff --git a/packages/app/src/app/api/v1/request-timeline/route.ts b/packages/app/src/app/api/v1/request-timeline/route.ts
new file mode 100644
index 00000000..6c884fb2
--- /dev/null
+++ b/packages/app/src/app/api/v1/request-timeline/route.ts
@@ -0,0 +1,40 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getRequestTimeline,
+  type RequestTimeline,
+} from '@semianalysisai/inferencex-db/queries/request-timeline';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedRequestTimeline = cachedQuery(
+  (id: number): Promise<RequestTimeline | null> => getRequestTimeline(getDb(), id),
+  'request-timeline',
+  { blobOnly: true },
+);
+
+/**
+ * GET /api/v1/request-timeline?id=N
+ *
+ * Returns the per-request Gantt timeline for one agentic benchmark point.
+ * Each request entry has ns-from-start offsets for credit/start/ack/end,
+ * plus TTFT, ISL, OSL, conversation id, turn index, worker id. 400 unless
+ * `id` is a positive integer; 404 if no profile_export.jsonl blob is stored.
+ */
+export async function GET(request: NextRequest) {
+  const id = Number(request.nextUrl.searchParams.get('id'));
+  if (!Number.isSafeInteger(id) || id <= 0) {
+    return NextResponse.json({ error: 'id is required (benchmark_result_id)' }, { status: 400 });
+  }
+  try {
+    const data = await getCachedRequestTimeline(id);
+    if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
+    return cachedJson(data);
+  } catch (error) {
+    console.error('Error fetching request timeline:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index a5bca4e0..2e43b4fb 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -6,6 +6,7 @@ import { useState } from 'react';
 import { ArrowLeft } from 'lucide-react';
 
 import { useAgenticAggregates, type AgenticAggregateMap } from '@/hooks/api/use-agentic-aggregates';
+import { useRequestTimeline } from '@/hooks/api/use-request-timeline';
 import { useTraceHistograms } from '@/hooks/api/use-trace-histograms';
 import {
   useTraceServerMetrics,
@@ -19,6 +20,7 @@ import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/seg
 import { AggregateChart, type AggregatePoint, type PercentileKey } from './aggregate-chart';
 import { Distribution } from './distribution';
 import { ExpandableChart } from './expandable-chart';
+import { RequestTimelineView } from './request-timeline';
 import { SiblingNav, chipLabel } from './sibling-nav';
 import {
   StackedAreaChart,
@@ -82,9 +84,10 @@ const CHART_SIZES = {
   expanded: { width: 1300, height: 520 },
 };
 
-type DetailView = 'point' | 'aggregates';
+type DetailView = 'point' | 'timeline' | 'aggregates';
 const VIEW_OPTIONS: SegmentedToggleOption<DetailView>[] = [
   { value: 'point', label: 'Per-point', testId: 'detail-view-point' },
+  { value: 'timeline', label: 'Request timeline', testId: 'detail-view-timeline' },
   { value: 'aggregates', label: 'Aggregates across configs', testId: 'detail-view-aggregates' },
 ];
 
@@ -120,6 +123,8 @@ export function AgenticPointDetail({ id }: Props) {
   // shows how the metric varies across the SKU.
   const siblingIds = siblingsData?.siblings.map((s) => s.id) ?? [];
   const aggregatesQuery = useAgenticAggregates(siblingIds, view === 'aggregates');
+  // Per-request timeline fetched only when the timeline view is active.
+  const timelineQuery = useRequestTimeline(id, view === 'timeline');
 
   return (
     <div className="container mx-auto px-4 lg:px-8 flex flex-col gap-4 py-6">
@@ -176,6 +181,11 @@ export function AgenticPointDetail({ id }: Props) {
             {aggregatesQuery.isLoading ? ' · loading…' : ''}
           </span>
         )}
+        {view === 'timeline' && timelineQuery.data && (
+          <span className="text-xs text-muted-foreground">
+            {timelineQuery.data.requests.length} requests
+          </span>
+        )}
       </div>
 
       {view === 'aggregates' ? (
@@ -184,6 +194,19 @@ export function AgenticPointDetail({ id }: Props) {
           aggregates={aggregatesQuery.data}
           isLoading={aggregatesQuery.isLoading}
         />
+      ) : view === 'timeline' ? (
+        timelineQuery.isLoading ? (
+          <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+            Loading request timeline…
+          </div>
+        ) : timelineQuery.data ? (
+          <RequestTimelineView data={timelineQuery.data} />
+        ) : (
+          <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+            No per-request timeline for benchmark point #{id} — the profile_export.jsonl artifact
+            isn&apos;t stored for this row.
+          </div>
+        )
       ) : (
         <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
           <ExpandableChart
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
new file mode 100644
index 00000000..bcbe105a
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -0,0 +1,821 @@
+'use client';
+
+import { useCallback, useMemo, useRef, useState } from 'react';
+
+import { type RequestRecord, type RequestTimeline } from '@/hooks/api/use-request-timeline';
+import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
+
+/**
+ * Gantt-style request timeline for one agentic benchmark point.
+ *
+ * Rows are conversations (or workers — toggle in the header). Bars are
+ * individual HTTP requests, drawn from request_start to request_end with a
+ * thin lead-in segment from credit_issued (load gen queue). Scroll-wheel
+ * zooms, drag pans, hover shows per-request stats.
+ *
+ * The reference for this layout is the agent-timeline in semianalysis-claude-code-proxy.
+ */
+
+type RowMode = 'conversation' | 'worker';
+
+const ROW_MODE_OPTIONS: SegmentedToggleOption<RowMode>[] = [
+  { value: 'conversation', label: 'By conversation', testId: 'timeline-mode-conversation' },
+  { value: 'worker', label: 'By worker', testId: 'timeline-mode-worker' },
+];
+
+type PhaseFilter = 'all' | 'profiling';
+
+const PHASE_OPTIONS: SegmentedToggleOption<PhaseFilter>[] = [
+  { value: 'profiling', label: 'Profiling', testId: 'timeline-phase-profiling' },
+  { value: 'all', label: 'All (incl. warmup)', testId: 'timeline-phase-all' },
+];
+
+/** Row color palette; buildRows cycles through it by row index (modulo length). */
+const ROW_COLORS = [
+  '#3b82f6',
+  '#ef4444',
+  '#10b981',
+  '#f59e0b',
+  '#a855f7',
+  '#06b6d4',
+  '#f97316',
+  '#84cc16',
+  '#ec4899',
+  '#14b8a6',
+  '#8b5cf6',
+  '#eab308',
+];
+
+/** Phase color overlay drawn as a thin strip at the bottom of each bar. */
+const PHASE_COLORS: Record<string, string> = {
+  profiling: '#22c55e',
+  warmup: '#94a3b8',
+  unknown: '#64748b',
+};
+
+interface Row {
+  key: string;
+  label: string;
+  color: string;
+  requests: RequestRecord[];
+  /** 0 = top-level conversation/worker, 1+ = sub-agent under its parent. */
+  depth: number;
+  /** True if this row is a sub-agent ("Subagent N of parent X"). */
+  isSubagent: boolean;
+}
+
+/**
+ * Conversation ids for subagent calls look like
+ *   <parent_cid>::sa:subagent_<N>_<hash>
+ * Split into the parent cid and a sub-agent label (or the whole thing if
+ * this is a top-level conversation).
+ */
+function splitCid(cid: string): { parent: string; subagent: string | null } {
+  const sep = cid.indexOf('::sa:');
+  if (sep === -1) return { parent: cid, subagent: null };
+  return { parent: cid.slice(0, sep), subagent: cid.slice(sep + 5) };
+}
+
+/** Group requests into rows; in conversation mode subagents nest under parents. */
+function buildRows(requests: RequestRecord[], mode: RowMode): Row[] {
+  const groups = new Map<string, RequestRecord[]>();
+  for (const r of requests) {
+    const key = mode === 'conversation' ? r.cid : r.wid;
+    let list = groups.get(key);
+    if (!list) {
+      list = [];
+      groups.set(key, list);
+    }
+    list.push(r);
+  }
+
+  if (mode !== 'conversation') {
+    // Worker mode: flat rows, sorted by first activity.
+    const rows: Row[] = [];
+    let i = 0;
+    for (const [key, list] of groups) {
+      list.sort((a, b) => a.start - b.start);
+      rows.push({
+        key,
+        label: shortenWid(key),
+        color: ROW_COLORS[i % ROW_COLORS.length]!,
+        requests: list,
+        depth: 0,
+        isSubagent: false,
+      });
+      i++;
+    }
+    rows.sort((a, b) => a.requests[0]!.start - b.requests[0]!.start);
+    return rows;
+  }
+
+  // Conversation mode: build a parent → [subagents] tree so each parent
+  // group renders as one parent row followed by its sub-agent rows. Color
+  // is shared inside a tree so the visual grouping reads.
+  interface Tree {
+    parentCid: string;
+    parentRow: { key: string; requests: RequestRecord[] } | null;
+    subagents: Map<string, RequestRecord[]>; // subagent label → requests
+    firstStart: number;
+  }
+  const trees = new Map<string, Tree>();
+  for (const [cid, list] of groups) {
+    list.sort((a, b) => a.start - b.start);
+    const { parent, subagent } = splitCid(cid);
+    let tree = trees.get(parent);
+    if (!tree) {
+      tree = {
+        parentCid: parent,
+        parentRow: null,
+        subagents: new Map(),
+        firstStart: Number.POSITIVE_INFINITY,
+      };
+      trees.set(parent, tree);
+    }
+    if (subagent === null) {
+      tree.parentRow = { key: cid, requests: list };
+    } else {
+      tree.subagents.set(subagent, list);
+    }
+    const earliest = list[0]!.start;
+    if (earliest < tree.firstStart) tree.firstStart = earliest;
+  }
+
+  const sortedTrees = [...trees.values()].toSorted((a, b) => a.firstStart - b.firstStart);
+  const rows: Row[] = [];
+  let colorIdx = 0;
+  for (const tree of sortedTrees) {
+    const color = ROW_COLORS[colorIdx % ROW_COLORS.length]!;
+    colorIdx++;
+    if (tree.parentRow) {
+      rows.push({
+        key: tree.parentRow.key,
+        label: shortenCid(tree.parentCid),
+        color,
+        requests: tree.parentRow.requests,
+        depth: 0,
+        isSubagent: false,
+      });
+    } else {
+      // Pseudo-parent header so orphan subagents still render under
+      // something they belong to.
+      rows.push({
+        key: `__parent_${tree.parentCid}`,
+        label: shortenCid(tree.parentCid),
+        color,
+        requests: [],
+        depth: 0,
+        isSubagent: false,
+      });
+    }
+    const subagentEntries = [...tree.subagents.entries()].toSorted(
+      (a, b) => a[1][0]!.start - b[1][0]!.start,
+    );
+    for (const [saLabel, list] of subagentEntries) {
+      rows.push({
+        key: `${tree.parentCid}::${saLabel}`,
+        label: `↳ ${formatSubagentLabel(saLabel)}`,
+        color,
+        requests: list,
+        depth: 1,
+        isSubagent: true,
+      });
+    }
+  }
+  return rows;
+}
+
+/** `subagent_001_bf1c5c16` → `subagent 001 · bf1c` (compact, readable). */
+function formatSubagentLabel(raw: string): string {
+  const m = /^subagent_(\d+)_([0-9a-f]+)$/i.exec(raw);
+  if (!m) return raw;
+  return `subagent ${m[1]} · ${m[2]!.slice(0, 4)}`;
+}
+
+function shortenCid(cid: string): string {
+  if (cid.length <= 12) return cid;
+  return `${cid.slice(0, 8)}…${cid.slice(-4)}`;
+}
+
+function shortenWid(wid: string): string {
+  // worker_4ae87bea → w_4ae87bea (prefix shortened, then capped at 12 chars)
+  return wid.replace(/^worker_/, 'w_').slice(0, 12);
+}
+
+/** Format ns offset → "+250ms" / "+1.2s" / "+12s" / "+1.2m". */
+function formatTickLabel(ns: number): string {
+  const ms = ns / 1e6;
+  if (ms < 1000) return `+${ms.toFixed(0)}ms`;
+  if (ms < 60_000) return `+${(ms / 1000).toFixed(ms < 10_000 ? 1 : 0)}s`;
+  return `+${(ms / 60_000).toFixed(1)}m`;
+}
+
+function formatDuration(ms: number): string {
+  if (ms < 1000) return `${ms.toFixed(0)}ms`;
+  if (ms < 60_000) return `${(ms / 1000).toFixed(2)}s`;
+  return `${(ms / 60_000).toFixed(2)}m`;
+}
+
+/** Number of values in a sorted ascending array that are <= target. */
+function countLeq(sorted: number[], target: number): number {
+  let lo = 0;
+  let hi = sorted.length;
+  while (lo < hi) {
+    const mid = (lo + hi) >>> 1;
+    if (sorted[mid]! <= target) lo = mid + 1;
+    else hi = mid;
+  }
+  return lo;
+}
+/** Number of values in a sorted ascending array that are < target. */
+function countLt(sorted: number[], target: number): number {
+  let lo = 0;
+  let hi = sorted.length;
+  while (lo < hi) {
+    const mid = (lo + hi) >>> 1;
+    if (sorted[mid]! < target) lo = mid + 1;
+    else hi = mid;
+  }
+  return lo;
+}
+
+interface TooltipData {
+  x: number;
+  y: number;
+  row: Row;
+  req: RequestRecord;
+}
+
+function Tooltip({ data }: { data: TooltipData }) {
+  const { row, req } = data;
+  const totalMs = (req.end - req.start) / 1e6;
+  const queueMs = (req.start - req.credit) / 1e6;
+  return (
+    <div
+      className="fixed z-50 pointer-events-none rounded-md border border-border bg-card p-2.5 shadow-lg text-[11px]"
+      style={{ left: data.x + 12, top: data.y - 10, maxWidth: 280 }}
+    >
+      <div className="flex items-center gap-2 font-medium text-foreground">
+        <span className="inline-block w-2 h-2 rounded-sm" style={{ backgroundColor: row.color }} />
+        <span className="truncate">{row.label}</span>
+        <span className="text-muted-foreground">· turn {req.ti}</span>
+        {req.cancelled && <span className="text-destructive">· cancelled</span>}
+      </div>
+      <div className="mt-1.5 grid grid-cols-2 gap-x-3 gap-y-0.5 text-muted-foreground">
+        <span>Total</span>
+        <span className="text-foreground text-right tabular-nums">{formatDuration(totalMs)}</span>
+        <span>Queue wait</span>
+        <span className="text-foreground text-right tabular-nums">
+          {queueMs > 0.5 ? formatDuration(queueMs) : '—'}
+        </span>
+        {req.ttftMs !== null && (
+          <>
+            <span>TTFT</span>
+            <span className="text-foreground text-right tabular-nums">
+              {formatDuration(req.ttftMs)}
+            </span>
+          </>
+        )}
+        {req.isl !== null && (
+          <>
+            <span>ISL</span>
+            <span className="text-foreground text-right tabular-nums">
+              {req.isl.toLocaleString()}
+            </span>
+          </>
+        )}
+        {req.osl !== null && (
+          <>
+            <span>OSL</span>
+            <span className="text-foreground text-right tabular-nums">
+              {req.osl.toLocaleString()}
+            </span>
+          </>
+        )}
+        <span>Phase</span>
+        <span className="text-foreground text-right">{req.phase}</span>
+        {req.ad > 0 && (
+          <>
+            <span>Agent depth</span>
+            <span className="text-foreground text-right tabular-nums">{req.ad}</span>
+          </>
+        )}
+        <span>Worker</span>
+        <span className="text-foreground text-right truncate">{shortenWid(req.wid)}</span>
+      </div>
+      <div className="mt-1.5 pt-1 border-t border-border/40 text-[10px] text-muted-foreground">
+        Started at {formatTickLabel(req.start)}
+      </div>
+    </div>
+  );
+}
+
+export function RequestTimelineView({ data }: { data: RequestTimeline }) {
+  const [rowMode, setRowMode] = useState<RowMode>('conversation');
+  const [phaseFilter, setPhaseFilter] = useState<PhaseFilter>('profiling');
+  const [tooltip, setTooltip] = useState<TooltipData | null>(null);
+  const dragRef = useRef<{ startX: number; vs: number; ve: number } | null>(null);
+
+  // Apply phase filter, then group into rows.
+  const filtered = useMemo(
+    () =>
+      phaseFilter === 'all' ? data.requests : data.requests.filter((r) => r.phase === 'profiling'),
+    [data.requests, phaseFilter],
+  );
+  const rows = useMemo(() => buildRows(filtered, rowMode), [filtered, rowMode]);
+
+  // Pre-sort the timestamp columns so the cursor-time stats popover can
+  // count "running / waiting at time t" in O(log n). With a few hundred
+  // requests this is overkill — but it stays smooth on huge runs too.
+  const sortedTimes = useMemo(() => {
+    const credits = filtered.map((r) => r.credit).toSorted((a, b) => a - b);
+    const starts = filtered.map((r) => r.start).toSorted((a, b) => a - b);
+    const ends = filtered.map((r) => r.end).toSorted((a, b) => a - b);
+    return { credits, starts, ends };
+  }, [filtered]);
+
+  // Cursor state (vertical line + stats popover). null when the mouse
+  // isn't over the chart. xPx is svg-local; tNs is the ns offset from
+  // dataStart that the cursor is pointing at.
+  const [cursor, setCursor] = useState<{
+    xPx: number;
+    tNs: number;
+    clientX: number;
+    clientY: number;
+  } | null>(null);
+
+  // Timeline extent (clamped to actual data — if we filtered out warmup
+  // the visible window should shrink to just the profiling phase).
+  const dataStart = filtered.length === 0 ? 0 : Math.min(...filtered.map((r) => r.credit));
+  const dataEnd = filtered.length === 0 ? 1 : Math.max(...filtered.map((r) => r.end));
+  const totalNs = Math.max(dataEnd - dataStart, 1);
+
+  // Visible window state (ns offsets, relative to dataStart).
+  const [viewStart, setViewStart] = useState(0);
+  const [viewEnd, setViewEnd] = useState<number | null>(null);
+  const vStart = viewStart;
+  const vEnd = viewEnd ?? totalNs;
+  const visibleDur = Math.max(vEnd - vStart, 1);
+  const isZoomed = viewEnd !== null;
+
+  // Layout
+  const LABEL_WIDTH = 160;
+  const ROW_HEIGHT = 22;
+  const ROW_GAP = 3;
+  const HEADER_HEIGHT = 24;
+  const PADDING_RIGHT = 12;
+  const chartWidth = 920;
+  const svgHeight = HEADER_HEIGHT + rows.length * (ROW_HEIGHT + ROW_GAP) + 6;
+  const scale = (chartWidth - PADDING_RIGHT) / visibleDur;
+  // Local coords: convert ns offset from dataStart to x px.
+  const xOf = (ns: number) => (ns - dataStart - vStart) * scale;
+
+  // Time-axis ticks (~8 across visible window, snapped up to a nice step, 100ms–30min).
+  const niceMs = [
+    100, 250, 500, 1000, 2000, 5000, 10_000, 30_000, 60_000, 120_000, 300_000, 600_000, 1_800_000,
+  ];
+  const targetMs = visibleDur / 1e6 / 8;
+  const tickMs = niceMs.find((n) => n >= targetMs) ?? targetMs;
+  const tickNs = tickMs * 1e6;
+  const ticks: number[] = [];
+  const tickStart = Math.floor(vStart / tickNs) * tickNs;
+  for (let t = tickStart; t <= vEnd + tickNs; t += tickNs) {
+    if (t >= vStart && t <= vEnd) ticks.push(t);
+  }
+
+  const handleWheel = useCallback(
+    (e: React.WheelEvent<SVGSVGElement>) => {
+      e.preventDefault();
+      const rect = e.currentTarget.getBoundingClientRect();
+      const mouseX = e.clientX - rect.left;
+      const mouseRatio = Math.max(0, Math.min(1, mouseX / (chartWidth - PADDING_RIGHT)));
+      const curStart = vStart;
+      const curEnd = vEnd;
+      const curDur = curEnd - curStart;
+      const factor = e.deltaY > 0 ? 1.2 : 1 / 1.2;
+      const newDur = Math.min(Math.max(curDur * factor, totalNs * 0.001), totalNs);
+      const pivot = curStart + mouseRatio * curDur;
+      let newStart = pivot - mouseRatio * newDur;
+      let newEnd = pivot + (1 - mouseRatio) * newDur;
+      if (newStart < 0) {
+        newEnd -= newStart;
+        newStart = 0;
+      }
+      if (newEnd > totalNs) {
+        newStart -= newEnd - totalNs;
+        newEnd = totalNs;
+        if (newStart < 0) newStart = 0;
+      }
+      if (newEnd - newStart >= totalNs * 0.99) {
+        setViewStart(0);
+        setViewEnd(null);
+      } else {
+        setViewStart(newStart);
+        setViewEnd(newEnd);
+      }
+    },
+    [vStart, vEnd, totalNs, chartWidth],
+  );
+
+  const handleMouseDown = useCallback(
+    (e: React.MouseEvent<SVGSVGElement>) => {
+      if (e.button !== 0) return;
+      dragRef.current = { startX: e.clientX, vs: vStart, ve: vEnd };
+    },
+    [vStart, vEnd],
+  );
+
+  const handleMouseMove = useCallback(
+    (e: React.MouseEvent<SVGSVGElement>) => {
+      // Dragging takes precedence over cursor tracking — panning the view.
+      if (dragRef.current) {
+        const dx = e.clientX - dragRef.current.startX;
+        const nsPerPx = visibleDur / (chartWidth - PADDING_RIGHT);
+        const delta = -dx * nsPerPx;
+        let ns = dragRef.current.vs + delta;
+        let ne = dragRef.current.ve + delta;
+        const dur = ne - ns;
+        if (ns < 0) {
+          ns = 0;
+          ne = dur;
+        }
+        if (ne > totalNs) {
+          ne = totalNs;
+          ns = totalNs - dur;
+          if (ns < 0) ns = 0;
+        }
+        setViewStart(ns);
+        setViewEnd(ne);
+        setTooltip(null);
+        setCursor(null);
+        return;
+      }
+      // Track the cursor position in svg-local px and the matching ns offset
+      // so the crosshair + stats popover can render. Clamped to the chart
+      // plot area (don't show a cursor on the axis labels gutter).
+      const rect = e.currentTarget.getBoundingClientRect();
+      const xPx = Math.max(0, Math.min(chartWidth - PADDING_RIGHT, e.clientX - rect.left));
+      const nsPerPx = visibleDur / (chartWidth - PADDING_RIGHT);
+      const tNs = vStart + xPx * nsPerPx;
+      setCursor({ xPx, tNs, clientX: e.clientX, clientY: e.clientY });
+    },
+    [visibleDur, chartWidth, totalNs, vStart],
+  );
+
+  const handleMouseUp = useCallback(() => {
+    dragRef.current = null;
+  }, []);
+
+  const handleMouseLeave = useCallback(() => {
+    dragRef.current = null;
+    setCursor(null);
+  }, []);
+
+  const resetZoom = useCallback(() => {
+    setViewStart(0);
+    setViewEnd(null);
+  }, []);
+
+  if (rows.length === 0) {
+    return (
+      <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
+        No requests in the current filter.
+      </div>
+    );
+  }
+
+  const totalRequests = filtered.length;
+
+  return (
+    <div className="space-y-3">
+      {/* Controls */}
+      <div className="flex flex-wrap items-center gap-2">
+        <SegmentedToggle
+          value={rowMode}
+          options={ROW_MODE_OPTIONS}
+          onValueChange={setRowMode}
+          ariaLabel="Row mode"
+          testId="timeline-row-mode"
+          buttonClassName="px-2.5 py-1 text-xs"
+        />
+        <SegmentedToggle
+          value={phaseFilter}
+          options={PHASE_OPTIONS}
+          onValueChange={setPhaseFilter}
+          ariaLabel="Phase filter"
+          testId="timeline-phase-filter"
+          buttonClassName="px-2.5 py-1 text-xs"
+        />
+        <span className="ml-auto text-xs text-muted-foreground">
+          {totalRequests} request{totalRequests === 1 ? '' : 's'} · {rows.length}{' '}
+          {rowMode === 'conversation' ? 'conversations' : 'workers'} · span{' '}
+          {formatDuration((dataEnd - dataStart) / 1e6)}
+          {isZoomed && (
+            <>
+              {' · '}
+              <button type="button" onClick={resetZoom} className="text-foreground hover:underline">
+                reset zoom
+              </button>
+            </>
+          )}
+        </span>
+      </div>
+
+      {/* Chart container */}
+      <div className="rounded-md border border-border/60 bg-card overflow-hidden">
+        <div className="flex">
+          {/* Label column — sticky, doesn't scroll horizontally with the chart. */}
+          <div
+            className="flex-shrink-0 border-r border-border/60 bg-card/80"
+            style={{ width: LABEL_WIDTH }}
+          >
+            <div
+              className="border-b border-border/60 flex items-end px-2 pb-1"
+              style={{ height: HEADER_HEIGHT }}
+            >
+              <span className="text-[9px] font-mono font-bold uppercase tracking-[0.15em] text-muted-foreground">
+                {rowMode === 'conversation' ? 'Conversation' : 'Worker'}
+              </span>
+            </div>
+            {rows.map((row) => (
+              <div
+                key={row.key}
+                className="flex items-center gap-1.5 overflow-hidden pr-2"
+                style={{
+                  height: ROW_HEIGHT + ROW_GAP,
+                  paddingLeft: 8 + row.depth * 12,
+                }}
+              >
+                <span
+                  className="inline-block w-1 h-3 rounded-sm flex-shrink-0"
+                  style={{
+                    backgroundColor: row.color,
+                    opacity: row.isSubagent ? 0.55 : 1,
+                  }}
+                />
+                <span
+                  className="text-[10px] font-mono truncate"
+                  style={{ color: row.color, opacity: row.isSubagent ? 0.85 : 1 }}
+                >
+                  {row.label}
+                </span>
+                <span className="text-[9px] font-mono text-muted-foreground ml-auto shrink-0">
+                  {row.requests.length > 0 ? row.requests.length : '—'}
+                </span>
+              </div>
+            ))}
+          </div>
+
+          {/* Scrollable SVG */}
+          <div className="flex-1 overflow-x-auto">
+            <svg
+              width={chartWidth}
+              height={svgHeight}
+              className="block"
+              style={{ cursor: isZoomed ? 'grab' : 'crosshair' }}
+              onWheel={handleWheel}
+              onMouseDown={handleMouseDown}
+              onMouseMove={handleMouseMove}
+              onMouseUp={handleMouseUp}
+              onMouseLeave={handleMouseLeave}
+            >
+              {/* Header / time-axis baseline */}
+              <line
+                x1={0}
+                y1={HEADER_HEIGHT}
+                x2={chartWidth}
+                y2={HEADER_HEIGHT}
+                stroke="currentColor"
+                opacity={0.15}
+              />
+
+              {/* Time axis ticks */}
+              {ticks.map((t) => {
+                // Convert visible-window ns offset → x px (the tick array
+                // is already in dataStart-relative coords).
+                const x = (t - vStart) * scale;
+                return (
+                  <g key={t}>
+                    <line
+                      x1={x}
+                      y1={HEADER_HEIGHT}
+                      x2={x}
+                      y2={svgHeight}
+                      stroke="currentColor"
+                      opacity={0.08}
+                      strokeDasharray="2 4"
+                    />
+                    <text
+                      x={x + 2}
+                      y={HEADER_HEIGHT - 6}
+                      fill="currentColor"
+                      opacity={0.55}
+                      fontSize={9}
+                      fontFamily="ui-monospace, SFMono-Regular, monospace"
+                    >
+                      {formatTickLabel(t)}
+                    </text>
+                  </g>
+                );
+              })}
+
+              {/* Row separators */}
+              {rows.map((row, idx) => (
+                <line
+                  key={`sep-${row.key}`}
+                  x1={0}
+                  y1={HEADER_HEIGHT + idx * (ROW_HEIGHT + ROW_GAP)}
+                  x2={chartWidth}
+                  y2={HEADER_HEIGHT + idx * (ROW_HEIGHT + ROW_GAP)}
+                  stroke="currentColor"
+                  opacity={0.04}
+                />
+              ))}
+
+              {/* Request bars */}
+              {rows.map((row, rowIdx) => {
+                const yTop = HEADER_HEIGHT + rowIdx * (ROW_HEIGHT + ROW_GAP) + 2;
+                const barH = ROW_HEIGHT - 4;
+                return row.requests.map((req) => {
+                  const xCredit = xOf(req.credit);
+                  const xStart = xOf(req.start);
+                  const xEnd = xOf(req.end);
+                  // Cull bars entirely outside the visible window so big
+                  // benchmarks don't render thousands of zero-width rects.
+                  if (xEnd < -2 || xCredit > chartWidth + 2) return null;
+                  const runW = Math.max(xEnd - xStart, 1);
+                  const queueW = Math.max(xStart - xCredit, 0);
+                  const phaseColor = PHASE_COLORS[req.phase] ?? PHASE_COLORS.unknown!;
+                  return (
+                    <g
+                      key={`${req.cid}-${req.ti}-${req.start}`}
+                      onMouseMove={(e) => setTooltip({ x: e.clientX, y: e.clientY, row, req })}
+                      onMouseLeave={() => setTooltip(null)}
+                    >
+                      {/* Queue lead-in (faint) — only drawn when noticeable. */}
+                      {queueW >= 1 && (
+                        <rect
+                          x={xCredit}
+                          y={yTop + barH / 2 - 1}
+                          width={queueW}
+                          height={2}
+                          fill={row.color}
+                          opacity={0.35}
+                        />
+                      )}
+                      {/* Main bar */}
+                      <rect
+                        x={xStart}
+                        y={yTop}
+                        width={runW}
+                        height={barH}
+                        rx={2}
+                        fill={row.color}
+                        opacity={req.cancelled ? 0.35 : row.isSubagent ? 0.6 : 0.85}
+                      />
+                      {/* Phase strip at bottom */}
+                      <rect
+                        x={xStart}
+                        y={yTop + barH - 2}
+                        width={runW}
+                        height={2}
+                        rx={1}
+                        fill={phaseColor}
+                        opacity={0.85}
+                      />
+                      {/* Cancelled X overlay */}
+                      {req.cancelled && runW > 6 && (
+                        <line
+                          x1={xStart + 1}
+                          y1={yTop + 1}
+                          x2={xStart + runW - 1}
+                          y2={yTop + barH - 1}
+                          stroke="currentColor"
+                          strokeWidth={0.7}
+                          opacity={0.6}
+                        />
+                      )}
+                    </g>
+                  );
+                });
+              })}
+
+              {/* Cursor crosshair — drawn on top of bars so it stays visible
+                  through dense rows. Stats popover is rendered as fixed
+                  HTML below the SVG block. */}
+              {cursor && (
+                <line
+                  x1={cursor.xPx}
+                  x2={cursor.xPx}
+                  y1={0}
+                  y2={svgHeight}
+                  stroke="currentColor"
+                  strokeWidth={1}
+                  opacity={0.45}
+                  pointerEvents="none"
+                />
+              )}
+            </svg>
+          </div>
+        </div>
+      </div>
+
+      {/* Footer / legend */}
+      <div className="flex flex-wrap items-center gap-x-4 gap-y-1 px-1 text-[11px] text-muted-foreground">
+        <span className="inline-flex items-center gap-1.5">
+          <span className="inline-block w-3 h-2 rounded-sm bg-current opacity-30" />
+          queue wait
+        </span>
+        <span className="inline-flex items-center gap-1.5">
+          <span className="inline-block w-3 h-2 rounded-sm" style={{ background: '#22c55e' }} />
+          profiling
+        </span>
+        <span className="inline-flex items-center gap-1.5">
+          <span className="inline-block w-3 h-2 rounded-sm" style={{ background: '#94a3b8' }} />
+          warmup
+        </span>
+        <span className="ml-auto opacity-70">scroll to zoom · drag to pan</span>
+      </div>
+
+      {/* Cursor stats popover: count of in-flight / waiting at the cursor's
+          ns offset. Hidden when the user is hovering an individual bar
+          (per-request tooltip wins). */}
+      {cursor && !tooltip && (
+        <CursorPopover
+          cursor={cursor}
+          dataStart={dataStart}
+          startTimes={sortedTimes.starts}
+          endTimes={sortedTimes.ends}
+          creditTimes={sortedTimes.credits}
+        />
+      )}
+
+      {/* Tooltip */}
+      {tooltip && <Tooltip data={tooltip} />}
+    </div>
+  );
+}
+
+function CursorPopover({
+  cursor,
+  dataStart,
+  startTimes,
+  endTimes,
+  creditTimes,
+}: {
+  cursor: { xPx: number; tNs: number; clientX: number; clientY: number };
+  dataStart: number;
+  startTimes: number[];
+  endTimes: number[];
+  creditTimes: number[];
+}) {
+  // Stats shown for cursor time t = tNs (ns from dataStart):
+  //   running  = #(start <= t) - #(end < t)
+  //   waiting  = #(credit <= t) - #(start <= t)
+  //   completed= #(end <= t)
+  const t = cursor.tNs;
+  const startsLeq = countLeq(startTimes, t);
+  const endsLt = countLt(endTimes, t);
+  const creditsLeq = countLeq(creditTimes, t);
+  const endsLeq = countLeq(endTimes, t);
+  const running = Math.max(0, startsLeq - endsLt);
+  const waiting = Math.max(0, creditsLeq - startsLeq);
+  const completed = endsLeq;
+  const inflight = running + waiting;
+  // Seconds elapsed since the timeline origin (dataStart) — relative, not wall-clock.
+  const tSec = t / 1e9;
+  // Position the popover 14 px right of the cursor; only horizontal overflow is handled.
+  // Budgets 220 px of width and flips to the left of the cursor if that would clip the right.
+  const wantLeft = cursor.clientX + 14;
+  const left =
+    typeof window === 'undefined' || wantLeft + 220 < window.innerWidth
+      ? wantLeft
+      : cursor.clientX - 220;
+  return (
+    <div
+      className="fixed z-40 pointer-events-none rounded-md border border-border bg-card/95 backdrop-blur p-2 shadow-lg text-[11px] font-mono"
+      style={{ left, top: cursor.clientY - 60, minWidth: 180 }}
+    >
+      <div className="flex justify-between gap-3 text-foreground">
+        <span className="text-muted-foreground">t =</span>
+        <span className="tabular-nums">
+          {tSec < 60 ? `${tSec.toFixed(3)} s` : `${(tSec / 60).toFixed(3)} m`}
+        </span>
+      </div>
+      <div className="mt-1 pt-1 border-t border-border/40 grid grid-cols-2 gap-x-3 gap-y-0.5 text-muted-foreground">
+        <span>In flight</span>
+        <span className="text-foreground text-right tabular-nums">{inflight}</span>
+        <span className="pl-3 text-[10px]">running</span>
+        <span className="text-foreground text-right tabular-nums">{running}</span>
+        <span className="pl-3 text-[10px]">waiting</span>
+        <span className="text-foreground text-right tabular-nums">{waiting}</span>
+        <span>Completed</span>
+        <span className="text-foreground text-right tabular-nums">{completed}</span>
+      </div>
+      {/* dataStart is informational — the displayed t is relative to it. */}
+      <div className="mt-1 pt-1 border-t border-border/40 text-[9px] text-muted-foreground">
+        relative to t₀ ({(dataStart / 1e9).toFixed(0)}s wall-clock)
+      </div>
+    </div>
+  );
+}
diff --git a/packages/app/src/hooks/api/use-request-timeline.ts b/packages/app/src/hooks/api/use-request-timeline.ts
new file mode 100644
index 00000000..d3ceaab8
--- /dev/null
+++ b/packages/app/src/hooks/api/use-request-timeline.ts
@@ -0,0 +1,59 @@
+import { useQuery } from '@tanstack/react-query';
+
+export interface RequestRecord {
+  /** Conversation id (groups turns of one agent session). */
+  cid: string;
+  /** Zero-based turn index within the conversation. */
+  ti: number;
+  /** Worker id (concurrency slot that handled this request). */
+  wid: string;
+  /** Sub-agent depth (0 = top-level). */
+  ad: number;
+  /** `warmup` or `profiling`; the ETL emits `unknown` when the record has no phase. */
+  phase: string;
+  /** ns offset from timeline.startNs. Load gen decided to dispatch. */
+  credit: number;
+  /** ns offset from timeline.startNs. HTTP send started. */
+  start: number;
+  /** ns offset from timeline.startNs. First server acknowledgement (or null). */
+  ack: number | null;
+  /** ns offset from timeline.startNs. Last byte received. */
+  end: number;
+  ttftMs: number | null; // time-to-first-token in ms, null when not reported
+  isl: number | null; // input sequence length in tokens, null when not reported
+  osl: number | null; // output sequence length in tokens, null when not reported
+  cancelled: boolean;
+}
+
+export interface RequestTimeline {
+  version: number; // extraction-algorithm version that produced this payload
+  startNs: number; // wall-clock ns used as the origin for every request offset
+  endNs: number; // wall-clock ns of the latest request end
+  durationS: number; // total span in seconds
+  requests: RequestRecord[];
+}
+
+async function fetchRequestTimeline(
+  id: number,
+  signal?: AbortSignal,
+): Promise<RequestTimeline | null> {
+  const res = await fetch(`/api/v1/request-timeline?id=${id}`, { signal });
+  if (res.status === 404) return null; // 404 is reported as "no timeline", not as an error
+  if (!res.ok) throw new Error(`request-timeline ${res.status}`); // any other failure rejects
+  return (await res.json()) as RequestTimeline; // NOTE(review): cast only — body is not validated
+}
+
+/**
+ * Lazy-fetch the per-request Gantt timeline for one agentic point.
+ * Runs only when `enabled` is true and `id` is truthy (e.g. the timeline view
+ * is active), so the payload (~30 KB per point) isn't paid for every page load.
+ */
+export function useRequestTimeline(id: number | null, enabled = false) {
+  return useQuery({
+    queryKey: ['request-timeline', id] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) =>
+      id ? fetchRequestTimeline(id, signal) : Promise.resolve(null),
+    enabled: enabled && Boolean(id),
+    staleTime: 5 * 60 * 1000, // 5 minutes
+  });
+}
diff --git a/packages/db/migrations/010_agentic_request_timeline.sql b/packages/db/migrations/010_agentic_request_timeline.sql
new file mode 100644
index 00000000..756b775e
--- /dev/null
+++ b/packages/db/migrations/010_agentic_request_timeline.sql
@@ -0,0 +1,15 @@
+-- Pre-computed per-request timeline for the agentic detail page.
+--
+-- Sibling to `aggregate_stats` (008) and `chart_series` (009). This one
+-- holds a thin per-request array extracted from `profile_export_jsonl_gz`
+-- so the detail page can render a Gantt-style swimlane of every request
+-- (one bar per conversation turn) without re-parsing the JSONL on every
+-- page load.
+--
+-- Shape includes an inner `version` field so the backfill script can
+-- recompute rows whose stored timeline was produced by an older
+-- algorithm. Null when the timeline hasn't been computed yet or the row has
+-- no parseable blob; the API then falls back to re-parsing the blob, if any.
+
+alter table agentic_trace_replay
+  add column request_timeline jsonb;
diff --git a/packages/db/package.json b/packages/db/package.json
index f97c442a..710089f1 100644
--- a/packages/db/package.json
+++ b/packages/db/package.json
@@ -21,6 +21,7 @@
     "db:apply-overrides": "dotenv -e ../../.env -- tsx src/apply-overrides.ts",
     "db:backfill-aggregate-stats": "dotenv -e ../../.env -- tsx src/backfill-aggregate-stats.ts",
     "db:backfill-chart-series": "dotenv -e ../../.env -- tsx src/backfill-chart-series.ts",
+    "db:backfill-request-timeline": "dotenv -e ../../.env -- tsx src/backfill-request-timeline.ts",
     "db:dump": "dotenv -e ../../.env -- tsx src/dump-db.ts",
     "db:load-dump": "dotenv -e ../../.env -- tsx src/load-dump.ts",
     "db:reset": "dotenv -e ../../.env -- tsx src/reset-db.ts",
diff --git a/packages/db/src/backfill-request-timeline.ts b/packages/db/src/backfill-request-timeline.ts
new file mode 100644
index 00000000..327099d0
--- /dev/null
+++ b/packages/db/src/backfill-request-timeline.ts
@@ -0,0 +1,144 @@
+/**
+ * Backfill `agentic_trace_replay.request_timeline` for rows that are
+ * missing it or were computed by an older `REQUEST_TIMELINE_VERSION`.
+ *
+ * The ingest path now computes the timeline inline, but existing rows
+ * (and rows whose computation logic has since changed) still need this
+ * pass. Run after applying migration 010 and any time the version bumps.
+ *
+ * Usage:
+ *   pnpm --filter @semianalysisai/inferencex-db db:backfill-request-timeline
+ *     [--limit N]   only process the first N candidate rows
+ *     [--force]     recompute every row, even if version already matches
+ *     [--yes]       skip the confirmation prompt
+ */
+
+import { confirm, hasNoSslFlag, hasYesFlag } from './cli-utils.js';
+import {
+  REQUEST_TIMELINE_VERSION,
+  computeRequestTimeline,
+} from './etl/compute-request-timeline.js';
+import { createAdminSql } from './etl/db-utils.js';
+
+interface CliFlags {
+  limit: number | null; // --limit N: cap on candidate rows; null = no cap
+  force: boolean; // --force: recompute every row, even if its version already matches
+}
+
+/** Parse `--force` and `--limit N` from process.argv; exits with code 1 on an invalid limit. */
+function parseFlags(): CliFlags {
+  let limit: number | null = null;
+  let force = false;
+  for (let i = 2; i < process.argv.length; i++) {
+    if (process.argv[i] === '--force') force = true;
+    else if (process.argv[i] === '--limit') {
+      // Reject missing, fractional, zero and negative values here: 0 would silently mean "no limit".
+      limit = Number(process.argv[++i] || Number.NaN);
+      if (!Number.isInteger(limit) || limit <= 0) {
+        console.error('--limit requires a positive integer argument');
+        process.exit(1);
+      }
+    }
+  }
+  return { limit, force };
+}
+
+const flags = parseFlags();
+
+const sql = createAdminSql({
+  noSsl: hasNoSslFlag(),
+  max: 1,
+  onnotice: () => {},
+});
+
+async function main(): Promise<void> {
+  console.log('=== backfill-request-timeline ===');
+  console.log(`  REQUEST_TIMELINE_VERSION = ${REQUEST_TIMELINE_VERSION}`);
+  console.log(`  force = ${flags.force}`);
+  console.log(`  limit = ${flags.limit ?? 'none'}`);
+
+  // Only rows with a profile_export blob can produce a timeline; rows without
+  // one keep `request_timeline` null. NOTE(review): a blob that fails to parse is
+  // written back as null too, so it is re-selected on every non-force run.
+  const candidates = flags.force
+    ? await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where profile_export_jsonl_gz is not null
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `
+    : await sql<{ id: number }[]>`
+        select id
+        from agentic_trace_replay
+        where profile_export_jsonl_gz is not null
+          and (
+            request_timeline is null
+            or coalesce((request_timeline->>'version')::int, -1) <> ${REQUEST_TIMELINE_VERSION}
+          )
+        order by id
+        ${flags.limit ? sql`limit ${flags.limit}` : sql``}
+      `;
+
+  if (candidates.length === 0) {
+    console.log('\n  Nothing to do — all rows up to date.');
+    return;
+  }
+
+  console.log(`\n  ${candidates.length} candidate row(s).`);
+  if (!hasYesFlag()) {
+    const ok = await confirm('\nProceed? (y/N) ');
+    if (!ok) {
+      console.log('Aborted.');
+      return;
+    }
+  }
+
+  let ok = 0;
+  let failed = 0;
+  const t0 = Date.now();
+  for (const { id } of candidates) {
+    const start = Date.now();
+    try {
+      const [row] = await sql<{ profile_export_jsonl_gz: Buffer | null }[]>`
+        select profile_export_jsonl_gz
+        from agentic_trace_replay
+        where id = ${id}
+      `;
+      if (!row) {
+        console.warn(`  id=${id}: row vanished, skipping`);
+        continue;
+      }
+      const timeline = computeRequestTimeline(row.profile_export_jsonl_gz);
+      await sql`
+        update agentic_trace_replay
+        set request_timeline = ${
+          timeline === null
+            ? null
+            : sql.json(structuredClone(timeline) as unknown as Parameters<typeof sql.json>[0])
+        }
+        where id = ${id}
+      `;
+      ok++;
+      const elapsed = Math.round((Date.now() - start) / 1000);
+      const elapsedTotal = Math.round((Date.now() - t0) / 1000);
+      console.log(
+        `  ✓ id=${id} (${elapsed}s, ${ok}/${candidates.length} done, ${elapsedTotal}s total)`,
+      );
+    } catch (error) {
+      failed++;
+      console.error(`  ✗ id=${id}: ${error instanceof Error ? error.message : String(error)}`);
+    }
+  }
+
+  const totalSec = Math.round((Date.now() - t0) / 1000);
+  console.log(`\n=== backfill complete: ${ok} ok, ${failed} failed in ${totalSec}s ===`);
+  if (failed > 0) process.exitCode = 1; // per-row failures don't abort the loop but fail the run
+}
+
+main()
+  .catch((error) => {
+    console.error('backfill-request-timeline failed:', error);
+    process.exitCode = 1;
+  })
+  .finally(() => sql.end());
diff --git a/packages/db/src/etl/compute-request-timeline.test.ts b/packages/db/src/etl/compute-request-timeline.test.ts
new file mode 100644
index 00000000..64512aca
--- /dev/null
+++ b/packages/db/src/etl/compute-request-timeline.test.ts
@@ -0,0 +1,153 @@
+import { gzipSync } from 'node:zlib';
+
+import { describe, expect, it } from 'vitest';
+
+import { REQUEST_TIMELINE_VERSION, computeRequestTimeline } from './compute-request-timeline.js';
+
+interface SyntheticRequest {
+  cid: string;
+  ti: number;
+  wid?: string;
+  ad?: number;
+  phase?: string;
+  credit: number;
+  start: number;
+  end: number;
+  ack?: number | null;
+  ttftMs?: number | null;
+  isl?: number | null;
+  osl?: number | null;
+  cancelled?: boolean;
+}
+
+/** Build a gzipped JSONL blob (one record per request); explicit `null` metrics stay null. */
+function makeBlob(requests: SyntheticRequest[]) {
+  const toLine = (r: SyntheticRequest) =>
+    JSON.stringify({
+      metadata: {
+        conversation_id: r.cid,
+        turn_index: r.ti,
+        worker_id: r.wid ?? 'worker_default',
+        agent_depth: r.ad ?? 0,
+        benchmark_phase: r.phase ?? 'profiling',
+        credit_issued_ns: r.credit,
+        request_start_ns: r.start,
+        ...(r.ack === undefined ? {} : { request_ack_ns: r.ack }),
+        request_end_ns: r.end,
+        was_cancelled: r.cancelled ?? false,
+      },
+      metrics: {
+        time_to_first_token: r.ttftMs === null ? null : { value: r.ttftMs ?? 50, unit: 'ms' },
+        input_sequence_length: r.isl === null ? null : { value: r.isl ?? 100, unit: 'tokens' },
+        output_sequence_length: r.osl === null ? null : { value: r.osl ?? 10, unit: 'tokens' },
+      },
+    });
+  return gzipSync(Buffer.from(requests.map(toLine).join('\n')));
+}
+
+describe('computeRequestTimeline', () => {
+  it('returns null when the blob is null', () => {
+    expect(computeRequestTimeline(null)).toBeNull();
+  });
+
+  it('returns null on a malformed (non-gzip) blob', () => {
+    expect(computeRequestTimeline(Buffer.from('not-gzip'))).toBeNull();
+  });
+
+  it('returns null when the blob has no parseable records', () => {
+    expect(computeRequestTimeline(gzipSync(Buffer.from('\n\n')))).toBeNull();
+  });
+
+  it('returns the current REQUEST_TIMELINE_VERSION in the bundle', () => {
+    const tl = computeRequestTimeline(
+      makeBlob([{ cid: 'a', ti: 0, credit: 1000, start: 2000, end: 3000 }]),
+    );
+    expect(tl?.version).toBe(REQUEST_TIMELINE_VERSION);
+  });
+
+  it('shifts ns timestamps to be relative to the earliest credit_issued', () => {
+    // Two requests with absolute ns starting at 1_000_000_000.
+    const tl = computeRequestTimeline(
+      makeBlob([
+        { cid: 'a', ti: 0, credit: 1_000_000_000, start: 1_001_000_000, end: 1_010_000_000 },
+        { cid: 'a', ti: 1, credit: 1_020_000_000, start: 1_021_000_000, end: 1_030_000_000 },
+      ]),
+    );
+    expect(tl?.startNs).toBe(1_000_000_000);
+    expect(tl?.endNs).toBe(1_030_000_000);
+    expect(tl?.durationS).toBeCloseTo(0.03, 6);
+    expect(tl?.requests[0]?.credit).toBe(0);
+    expect(tl?.requests[0]?.end).toBe(10_000_000);
+    expect(tl?.requests[1]?.start).toBe(21_000_000);
+  });
+
+  it('sorts requests by start time, regardless of input order', () => {
+    const tl = computeRequestTimeline(
+      makeBlob([
+        { cid: 'a', ti: 0, credit: 30, start: 50, end: 60 },
+        { cid: 'a', ti: 1, credit: 0, start: 10, end: 20 },
+        { cid: 'a', ti: 2, credit: 80, start: 90, end: 100 },
+      ]),
+    );
+    expect(tl?.requests.map((r) => r.start)).toEqual([10, 50, 90]);
+  });
+
+  it('preserves conversation/worker grouping fields', () => {
+    const tl = computeRequestTimeline(
+      makeBlob([
+        {
+          cid: 'conv-A',
+          ti: 5,
+          wid: 'worker_abcd1234',
+          ad: 2,
+          phase: 'profiling',
+          credit: 0,
+          start: 10,
+          end: 100,
+        },
+      ]),
+    );
+    const r = tl?.requests[0]!;
+    expect(r.cid).toBe('conv-A');
+    expect(r.ti).toBe(5);
+    expect(r.wid).toBe('worker_abcd1234');
+    expect(r.ad).toBe(2);
+    expect(r.phase).toBe('profiling');
+  });
+
+  it('preserves the cancelled flag and TTFT/ISL/OSL metrics', () => {
+    const tl = computeRequestTimeline(
+      makeBlob([
+        {
+          cid: 'a',
+          ti: 0,
+          credit: 0,
+          start: 10,
+          end: 100,
+          ttftMs: 25.5,
+          isl: 1024,
+          osl: 256,
+          cancelled: true,
+        },
+      ]),
+    );
+    const r = tl?.requests[0]!;
+    expect(r.cancelled).toBe(true);
+    expect(r.ttftMs).toBeCloseTo(25.5, 6);
+    expect(r.isl).toBe(1024);
+    expect(r.osl).toBe(256);
+  });
+
+  it('skips records missing both credit_issued_ns and request_start_ns', () => {
+    // Build a record with only request_end_ns — the helper rejects it.
+    const broken = gzipSync(
+      Buffer.from(
+        JSON.stringify({
+          metadata: { conversation_id: 'a', turn_index: 0, request_end_ns: 1234 },
+          metrics: {},
+        }),
+      ),
+    );
+    expect(computeRequestTimeline(broken)).toBeNull();
+  });
+});
diff --git a/packages/db/src/etl/compute-request-timeline.ts b/packages/db/src/etl/compute-request-timeline.ts
new file mode 100644
index 00000000..a1134f7a
--- /dev/null
+++ b/packages/db/src/etl/compute-request-timeline.ts
@@ -0,0 +1,182 @@
+/**
+ * Pre-compute the per-request timeline for the agentic detail page's
+ * Gantt view. Output lands in `agentic_trace_replay.request_timeline`
+ * and is read directly by the timeline API route.
+ *
+ * Shape is a thin array — ~150 bytes per request × ~200 requests per
+ * point ≈ 30 KB per row before JSONB compression. Trivial vs the raw
+ * gzipped JSONL blob (~1-3 MB).
+ *
+ * Versioned so the backfill script knows which rows are stale — bump
+ * `REQUEST_TIMELINE_VERSION` whenever the extraction algorithm changes.
+ */
+
+import { gunzipSync } from 'node:zlib';
+
+/** Bump when the extraction algorithm changes — backfill recomputes anything older. */
+export const REQUEST_TIMELINE_VERSION = 1;
+
+export interface RequestRecord {
+  /** Conversation id (groups turns of one agent session); `unknown` when absent. */
+  cid: string;
+  /** Zero-based turn index within the conversation (0 when absent). */
+  ti: number;
+  /** Worker id (concurrency slot that handled this request); `unknown` when absent. */
+  wid: string;
+  /** Sub-agent depth (0 = top-level, also the value used when absent). */
+  ad: number;
+  /** `warmup` or `profiling`; `unknown` when the record has no phase. */
+  phase: string;
+  /** ns offset from timeline.startNs. Load gen decided to dispatch (falls back to `start`). */
+  credit: number;
+  /** ns offset from timeline.startNs. HTTP send started (falls back to `credit`). */
+  start: number;
+  /** ns offset from timeline.startNs. First server acknowledgement (or null). */
+  ack: number | null;
+  /** ns offset from timeline.startNs. Last byte received. */
+  end: number;
+  /** Time-to-first-token in ms (null when missing or not a finite number). */
+  ttftMs: number | null;
+  /** Input sequence length in tokens (null when missing or not a finite number). */
+  isl: number | null;
+  /** Output sequence length in tokens (null when missing or not a finite number). */
+  osl: number | null;
+  cancelled: boolean; // true only when the record's `was_cancelled` is exactly `true`
+}
+
+export interface RequestTimeline {
+  version: number; // REQUEST_TIMELINE_VERSION at the time this timeline was computed
+  /** Wall-clock ns of the earliest credit/start event (used as the relative-time origin). */
+  startNs: number;
+  /** Wall-clock ns of the latest `request_end_ns`. */
+  endNs: number;
+  /** Total span in seconds: (endNs - startNs) / 1e9, or 0 when endNs <= startNs. */
+  durationS: number;
+  requests: RequestRecord[]; // sorted by `start`, ascending
+}
+
+interface RawMetadata {
+  conversation_id?: string;
+  turn_index?: number;
+  worker_id?: string;
+  agent_depth?: number;
+  benchmark_phase?: string;
+  credit_issued_ns?: number; // preferred lifecycle start; request_start_ns is the fallback
+  request_start_ns?: number;
+  request_ack_ns?: number; // optional — mapped to `ack: null` when not a number
+  request_end_ns?: number; // required in practice: records without it are skipped
+  was_cancelled?: boolean;
+}
+
+interface RawMetricValue {
+  value?: number; // numeric payload of a `{value, unit}` metric envelope
+}
+
+interface RawRecord {
+  metadata?: RawMetadata; // absent metadata means no timestamps, so the record is skipped
+  metrics?: {
+    time_to_first_token?: RawMetricValue | number; // envelope or bare number
+    input_sequence_length?: RawMetricValue | number;
+    output_sequence_length?: RawMetricValue | number;
+  };
+}
+
+/** Pull a finite number out of a `{value, unit}` envelope or a bare number; else undefined. */
+function readNum(v: unknown): number | undefined {
+  if (typeof v === 'number') return Number.isFinite(v) ? v : undefined;
+  if (v && typeof v === 'object' && 'value' in v) {
+    const inner = (v as { value?: unknown }).value;
+    if (typeof inner === 'number' && Number.isFinite(inner)) return inner;
+  }
+  return undefined;
+}
+
+/**
+ * Parse the gzipped `profile_export.jsonl` blob into a chart-ready timeline.
+ * Returns null when the blob is missing, is not valid gzip, or holds no record
+ * with both a start (credit or request start) and an end timestamp. Lines that
+ * are not JSON objects (including a bare `null`) are skipped, never thrown on.
+ *
+ * Offsets are relative to the earliest start; requests are sorted by `start`.
+ */
+export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | null {
+  if (!blob) return null;
+  let text: string;
+  try {
+    text = gunzipSync(blob).toString('utf8');
+  } catch {
+    return null;
+  }
+
+  // First pass: parse + collect raw turns; find timeline origin.
+  const raw: ({ meta: RawMetadata } & Pick<RequestRecord, 'ttftMs' | 'isl' | 'osl'>)[] = [];
+  let originNs = Number.POSITIVE_INFINITY;
+  let endNs = 0;
+
+  for (const line of text.split('\n')) {
+    if (!line) continue;
+    let rec: RawRecord | null;
+    try {
+      rec = JSON.parse(line) as RawRecord | null;
+    } catch {
+      continue;
+    }
+    // `null` is valid JSON, so `rec` may be null here; treat it as a record without metadata.
+    const meta = rec?.metadata ?? {};
+    // Use credit_issued_ns when available (the true start of the request's
+    // lifecycle), falling back to request_start_ns. Skip rows missing both.
+    const cStart = meta.credit_issued_ns ?? meta.request_start_ns;
+    const cEnd = meta.request_end_ns;
+    if (typeof cStart !== 'number' || typeof cEnd !== 'number') continue;
+
+    if (cStart < originNs) originNs = cStart;
+    if (cEnd > endNs) endNs = cEnd;
+
+    raw.push({
+      meta,
+      ttftMs: readNum(rec?.metrics?.time_to_first_token) ?? null,
+      isl: readNum(rec?.metrics?.input_sequence_length) ?? null,
+      osl: readNum(rec?.metrics?.output_sequence_length) ?? null,
+    });
+  }
+
+  if (raw.length === 0) return null;
+  if (!Number.isFinite(originNs)) originNs = 0;
+
+  // Second pass: shift timestamps to be relative to originNs (smaller
+  // numbers fit in JSON nicely and the frontend doesn't need bigint math).
+  const requests: RequestRecord[] = [];
+  for (const r of raw) {
+    const m = r.meta;
+    const credit = (m.credit_issued_ns ?? m.request_start_ns ?? originNs) - originNs;
+    const start = (m.request_start_ns ?? m.credit_issued_ns ?? originNs) - originNs;
+    const ack = typeof m.request_ack_ns === 'number' ? m.request_ack_ns - originNs : null;
+    const end = (m.request_end_ns ?? originNs) - originNs;
+    requests.push({
+      cid: m.conversation_id ?? 'unknown',
+      ti: typeof m.turn_index === 'number' ? m.turn_index : 0,
+      wid: m.worker_id ?? 'unknown',
+      ad: typeof m.agent_depth === 'number' ? m.agent_depth : 0,
+      phase: m.benchmark_phase ?? 'unknown',
+      credit,
+      start,
+      ack,
+      end,
+      ttftMs: r.ttftMs,
+      isl: r.isl,
+      osl: r.osl,
+      cancelled: m.was_cancelled === true,
+    });
+  }
+
+  // Stable order so backfill output is deterministic.
+  requests.sort((a, b) => a.start - b.start);
+
+  return {
+    version: REQUEST_TIMELINE_VERSION,
+    startNs: originNs,
+    endNs,
+    durationS: endNs > originNs ? (endNs - originNs) / 1e9 : 0,
+    requests,
+  };
+}
diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
index f70200ff..8cc03f2a 100644
--- a/packages/db/src/etl/trace-replay-ingest.ts
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -14,6 +14,7 @@ import type postgres from 'postgres';
 
 import { computeAggregateStats } from './compute-aggregate-stats.js';
 import { computeChartSeries } from './compute-chart-series.js';
+import { computeRequestTimeline } from './compute-request-timeline.js';
 
 type Sql = ReturnType<typeof postgres>;
 
@@ -58,13 +59,14 @@ export async function insertTraceReplay(
   const metricsJsonGz = serverMetricsJson ? gzipSync(serverMetricsJson) : null;
   const metricsJsonSize = serverMetricsJson ? serverMetricsJson.length : null;
 
-  // Pre-compute the aggregate stats + chart-ready time-series so the
-  // detail page / aggregates view doesn't have to re-parse these blobs on
-  // every request. Both helpers tolerate a null blob and fall back to a
-  // streaming parser for oversized server_metrics blobs.
-  const [aggregateStats, chartSeries] = await Promise.all([
+  // Pre-compute aggregate stats + chart-ready time-series + per-request
+  // timeline so the detail page doesn't have to re-parse these blobs on
+  // every request. Each helper tolerates a null blob; the server_metrics
+  // helpers also fall back to a streaming parser for oversized blobs.
+  const [aggregateStats, chartSeries, requestTimeline] = await Promise.all([
     computeAggregateStats({ profileBlob: profileGz, serverBlob: metricsJsonGz }),
     computeChartSeries(metricsJsonGz),
+    Promise.resolve(computeRequestTimeline(profileGz)),
   ]);
 
   const [{ id: traceReplayId }] = await sql<{ id: number }[]>`
@@ -76,7 +78,8 @@ export async function insertTraceReplay(
       server_metrics_json_gz,
       server_metrics_json_uncompressed_size,
       aggregate_stats,
-      chart_series
+      chart_series,
+      request_timeline
     )
     values (
       ${profileGz},
@@ -86,7 +89,8 @@ export async function insertTraceReplay(
       ${metricsJsonGz},
       ${metricsJsonSize},
       ${sql.json(structuredClone(aggregateStats) as unknown as Parameters<typeof sql.json>[0])},
-      ${chartSeries === null ? null : sql.json(structuredClone(chartSeries) as unknown as Parameters<typeof sql.json>[0])}
+      ${chartSeries === null ? null : sql.json(structuredClone(chartSeries) as unknown as Parameters<typeof sql.json>[0])},
+      ${requestTimeline === null ? null : sql.json(structuredClone(requestTimeline) as unknown as Parameters<typeof sql.json>[0])}
     )
     returning id
   `;
diff --git a/packages/db/src/queries/request-timeline.ts b/packages/db/src/queries/request-timeline.ts
new file mode 100644
index 00000000..2bd3e251
--- /dev/null
+++ b/packages/db/src/queries/request-timeline.ts
@@ -0,0 +1,48 @@
+/**
+ * Per-request timeline for the agentic detail page's Gantt view.
+ *
+ * Backed by `agentic_trace_replay.request_timeline` (pre-computed at
+ * ingest time, see `etl/compute-request-timeline.ts`). The fast path is
+ * a single SQL row read; the slow path re-computes from
+ * `profile_export_jsonl_gz` and is only taken when the column is NULL
+ * or its stored `version` differs from `REQUEST_TIMELINE_VERSION`.
+ */
+
+import {
+  REQUEST_TIMELINE_VERSION,
+  computeRequestTimeline,
+  type RequestTimeline,
+} from '../etl/compute-request-timeline';
+
+import type { DbClient } from '../connection.js';
+
+export type { RequestTimeline, RequestRecord } from '../etl/compute-request-timeline';
+
+interface RawRow {
+  blob: Buffer | null;
+  request_timeline: RequestTimeline | null;
+}
+
+export async function getRequestTimeline(
+  sql: DbClient,
+  benchmarkResultId: number,
+): Promise<RequestTimeline | null> {
+  const rows = (await sql`
+    select
+      atr.profile_export_jsonl_gz as blob,
+      atr.request_timeline
+    from benchmark_results br
+    join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = ${benchmarkResultId}
+  `) as unknown as RawRow[];
+  const row = rows[0];
+  if (!row) return null; // no benchmark_results row with a joined trace replay for this id
+
+  // Fast path: stored timeline whose version equals REQUEST_TIMELINE_VERSION — returned as-is.
+  if (row.request_timeline && Number(row.request_timeline.version) === REQUEST_TIMELINE_VERSION) {
+    return row.request_timeline;
+  }
+
+  // Slow path: NULL or version-mismatched timeline — recompute from the profile blob (result is not written back here).
+  return computeRequestTimeline(row.blob);
+}

From f2618f44d6eafa38bffb3b9b9ec39c5224d62b76 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 22 May 2026 14:21:24 -0500
Subject: [PATCH 41/96] fix(agentic-detail): aggregate vllm metrics across all
 engine series
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The chart_series + aggregate_stats helpers were only reading series[0]
for each metric, which under-counted by Nx on multi-engine DP/PP vllm
deployments (each engine reports its own series in
vllm:num_requests_running, kv_cache_usage_perc, prompt_tokens, etc.).

Worst-case visible effect: for point 206032 (b200, dsv4, conc=24,
8-engine cluster), the queue-depth chart maxed at ~3 while the
per-request timeline correctly showed ~22 concurrent. Other metrics
were similarly clipped — prefix-cache hit rate, throughput, KV util.

Now we sum gauges + counter rates across all engines, and average
kv_cache_usage_perc (since it's a per-engine fraction). After fix, the
same row's peak queue depth reads 24 (running 21 + waiting 3), matching
the timeline.

Bumps STATS_VERSION + CHART_SERIES_VERSION to 2 so the backfill scripts
recompute existing rows; both were re-run against 130/26 rows.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../db/src/etl/compute-chart-series.test.ts   |  80 +++++++++
 packages/db/src/etl/compute-chart-series.ts   | 154 ++++++++++--------
 packages/db/src/queries/agentic-aggregates.ts |  90 ++++++----
 3 files changed, 226 insertions(+), 98 deletions(-)

diff --git a/packages/db/src/etl/compute-chart-series.test.ts b/packages/db/src/etl/compute-chart-series.test.ts
index dafc7200..4c6f8791 100644
--- a/packages/db/src/etl/compute-chart-series.test.ts
+++ b/packages/db/src/etl/compute-chart-series.test.ts
@@ -63,6 +63,48 @@ function makeBlob(opts?: {
   return gzipSync(Buffer.from(json));
 }
 
+/** Build one engine's synthetic vLLM series for the multi-engine test: five metrics, two timeslices each (start_ns 0 and 1e9), labelled `engine: <id>`. */
+function buildEngineSeries(engineId: number, baseRunning: number) {
+  const labels = { engine: String(engineId) };
+  return {
+    runningSlice: {
+      labels,
+      timeslices: [
+        { start_ns: 0, avg: baseRunning },
+        { start_ns: 1e9, avg: baseRunning + 1 },
+      ],
+    },
+    waitingSlice: {
+      labels,
+      timeslices: [
+        { start_ns: 0, avg: 0 },
+        { start_ns: 1e9, avg: 0 },
+      ],
+    },
+    kvSlice: {
+      labels,
+      timeslices: [
+        { start_ns: 0, avg: 0.25 },
+        { start_ns: 1e9, avg: 0.5 },
+      ],
+    },
+    promptSlice: {
+      labels,
+      timeslices: [
+        { start_ns: 0, rate: 100 },
+        { start_ns: 1e9, rate: 200 },
+      ],
+    },
+    genSlice: {
+      labels,
+      timeslices: [
+        { start_ns: 0, rate: 50 },
+        { start_ns: 1e9, rate: 75 },
+      ],
+    },
+  };
+}
+
 describe('computeChartSeries', () => {
   it('returns null when the blob is null', async () => {
     expect(await computeChartSeries(null)).toBeNull();
@@ -126,4 +168,42 @@ describe('computeChartSeries', () => {
     const result = await computeChartSeries(Buffer.from('not-gzip-data'));
     expect(result).toBeNull();
   });
+
+  it('aggregates gauges + counters across all engine series (DP/PP fix)', async () => {
+    // Simulate a 4-engine deployment: each engine reports its own series for
+    // every metric. Cluster-wide value should be SUM for running/waiting and
+    // counter rates, AVG for kv_cache_usage_perc (per-engine fraction).
+    const engines = [0, 1, 2, 3].map((id) => buildEngineSeries(id, 3)); // running=3 per engine
+    const json = JSON.stringify({
+      metrics: {
+        'vllm:num_requests_running': { series: engines.map((e) => e.runningSlice) },
+        'vllm:num_requests_waiting': { series: engines.map((e) => e.waitingSlice) },
+        'vllm:kv_cache_usage_perc': { series: engines.map((e) => e.kvSlice) },
+        'vllm:prompt_tokens': { series: engines.map((e) => e.promptSlice) },
+        'vllm:generation_tokens': { series: engines.map((e) => e.genSlice) },
+      },
+    });
+    const blob = gzipSync(Buffer.from(json));
+    const cs = await computeChartSeries(blob);
+    expect(cs).not.toBeNull();
+    // queueDepth.running = Σ engines = 4 × 3 = 12 at t=0; 4 × 4 = 16 at t=1
+    expect(cs!.queueDepth).toEqual([
+      { t: 0, running: 12, waiting: 0, total: 12 },
+      { t: 1, running: 16, waiting: 0, total: 16 },
+    ]);
+    // kvCacheUsage stays 0.25, 0.5 (average across engines, all engines reported same value)
+    expect(cs!.kvCacheUsage).toEqual([
+      { t: 0, value: 0.25 },
+      { t: 1, value: 0.5 },
+    ]);
+    // prefillTps = Σ rates = 4 × 100 = 400; then 4 × 200 = 800
+    expect(cs!.prefillTps).toEqual([
+      { t: 0, value: 400 },
+      { t: 1, value: 800 },
+    ]);
+    expect(cs!.decodeTps).toEqual([
+      { t: 0, value: 200 },
+      { t: 1, value: 300 },
+    ]);
+  });
 });
diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 3cb4181b..530600cf 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -17,8 +17,16 @@ import { parser } from 'stream-json';
 import { pick } from 'stream-json/filters/pick.js';
 import { streamObject } from 'stream-json/streamers/stream-object.js';
 
-/** Bump when the extraction algorithm changes — backfill recomputes anything older. */
-export const CHART_SERIES_VERSION = 1;
+/**
+ * Bump when the extraction algorithm changes — backfill recomputes anything
+ * older.
+ *
+ * v2: aggregate vllm gauges/counters across all engine series (was reading
+ * only series[0], which under-counted by Nx on multi-engine DP/PP
+ * deployments — most visible as a request-queue-depth chart that maxed out
+ * at ~3 when the timeline clearly showed 20+ in-flight).
+ */
+export const CHART_SERIES_VERSION = 2;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -147,17 +155,44 @@ export async function computeChartSeries(blob: Buffer | null): Promise<ChartSeri
   return buildSeriesFromMetrics(metrics);
 }
 
-/** Pull the first series under a metric key, or undefined. */
-function firstSeries(metrics: MetricsMap, name: string): RawSeries | undefined {
-  const s = metrics[name]?.series;
-  return s && s.length > 0 ? s[0] : undefined;
+/**
+ * Aggregate one timeslice field across all series of a metric, keyed by
+ * `start_ns`. Multi-engine vllm deployments report one series per engine —
+ * the cluster value is the sum (for running/waiting/throughput counters)
+ * or the average (for kv_cache_usage_perc, a per-engine fraction).
+ */
+function aggregateByStart(
+  series: readonly RawSeries[] | undefined,
+  field: 'avg' | 'rate',
+  combine: 'sum' | 'avg',
+): Map<number, number> {
+  const sums = new Map<number, number>(); // start_ns → running total of `field`
+  const counts = new Map<number, number>(); // start_ns → number of samples added to that total
+  for (const s of series ?? []) {
+    for (const ts of s.timeslices ?? []) {
+      if (typeof ts.start_ns !== 'number') continue; // no key to bucket this slice under
+      const v = ts[field];
+      if (typeof v !== 'number' || !Number.isFinite(v)) continue; // skip missing, NaN and ±Infinity samples
+      sums.set(ts.start_ns, (sums.get(ts.start_ns) ?? 0) + v);
+      counts.set(ts.start_ns, (counts.get(ts.start_ns) ?? 0) + 1);
+    }
+  }
+  if (combine === 'sum') return sums;
+  const out = new Map<number, number>();
+  for (const [t, s] of sums) out.set(t, s / (counts.get(t) ?? 1)); // mean over the samples present at t, not over all series
+  return out;
+}
+
+/** Entries of `m` as [start_ns, value] pairs in ascending start_ns order (one per unique key); returns a new array. */
+function sortedEntries(m: Map<number, number>): [number, number][] {
+  return [...m.entries()].toSorted((a, b) => a[0] - b[0]);
 }
 
 function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   // Timing reference: smallest start_ns and largest end_ns across every
-  // timeslice we extracted. (Same logic as the original getTraceServerMetrics
-  // — looking at every metric gives the widest possible window even if some
-  // series start late.)
+  // timeslice we extracted. timeslicesCount is the length of any single
+  // series (engines are scraped on the same cadence), so picking the max
+  // length across all series of all metrics is safe.
   let startNs = Number.POSITIVE_INFINITY;
   let endNs = 0;
   let timeslicesCount = 0;
@@ -175,83 +210,70 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   if (!Number.isFinite(startNs)) startNs = 0;
   const tOf = (ns: number) => (ns - startNs) / 1e9;
 
-  // KV cache usage (gauge, 0..1)
-  const kvCacheUsage: TimeSeriesPoint[] = [];
+  // KV cache usage (gauge, 0..1) — average across engines so the value
+  // stays a fraction (each engine has its own KV pool).
   const kvSeries =
-    firstSeries(metrics, 'vllm:kv_cache_usage_perc') ??
-    firstSeries(metrics, 'vllm:gpu_cache_usage_perc');
-  for (const ts of kvSeries?.timeslices ?? []) {
-    if (typeof ts.avg === 'number' && typeof ts.start_ns === 'number') {
-      kvCacheUsage.push({ t: tOf(ts.start_ns), value: ts.avg });
-    }
-  }
+    metrics['vllm:kv_cache_usage_perc']?.series ?? metrics['vllm:gpu_cache_usage_perc']?.series;
+  const kvCacheUsage: TimeSeriesPoint[] = sortedEntries(
+    aggregateByStart(kvSeries, 'avg', 'avg'),
+  ).map(([t, v]) => ({ t: tOf(t), value: v }));
 
-  // Prefix cache hit rate per scrape (Δhits / Δqueries from counter rate).
-  const hitsTs = firstSeries(metrics, 'vllm:prefix_cache_hits')?.timeslices ?? [];
-  const qsTs = firstSeries(metrics, 'vllm:prefix_cache_queries')?.timeslices ?? [];
+  // Prefix cache hit rate per scrape: Σhits.rate / Σqueries.rate across
+  // engines, joined on start_ns.
+  const hitsByT = aggregateByStart(metrics['vllm:prefix_cache_hits']?.series, 'rate', 'sum');
+  const qsByT = aggregateByStart(metrics['vllm:prefix_cache_queries']?.series, 'rate', 'sum');
   const prefixCacheHitRate: TimeSeriesPoint[] = [];
-  const minLen = Math.min(hitsTs.length, qsTs.length);
-  for (let i = 0; i < minLen; i++) {
-    const h = hitsTs[i]!;
-    const q = qsTs[i]!;
-    if (
-      typeof q.rate === 'number' &&
-      q.rate > 0 &&
-      typeof h.rate === 'number' &&
-      typeof h.start_ns === 'number'
-    ) {
-      prefixCacheHitRate.push({ t: tOf(h.start_ns), value: h.rate / q.rate });
-    }
+  for (const [t, h] of [...hitsByT.entries()].toSorted((a, b) => a[0] - b[0])) {
+    const q = qsByT.get(t);
+    if (q !== undefined && q > 0) prefixCacheHitRate.push({ t: tOf(t), value: h / q });
   }
 
-  // Queue depth: pair running + waiting by index.
-  const runTs = firstSeries(metrics, 'vllm:num_requests_running')?.timeslices ?? [];
-  const waitTs = firstSeries(metrics, 'vllm:num_requests_waiting')?.timeslices ?? [];
+  // Queue depth: sum running + waiting across engines per timeslice.
+  const runByT = aggregateByStart(metrics['vllm:num_requests_running']?.series, 'avg', 'sum');
+  const waitByT = aggregateByStart(metrics['vllm:num_requests_waiting']?.series, 'avg', 'sum');
   const queueDepth: QueueDepthPoint[] = [];
-  const qlen = Math.min(runTs.length, waitTs.length);
-  for (let i = 0; i < qlen; i++) {
-    const r = runTs[i]!;
-    const w = waitTs[i]!;
-    if (typeof r.start_ns !== 'number') continue;
-    const running = typeof r.avg === 'number' ? r.avg : 0;
-    const waiting = typeof w.avg === 'number' ? w.avg : 0;
-    queueDepth.push({
-      t: tOf(r.start_ns),
-      running,
-      waiting,
-      total: running + waiting,
-    });
+  // Union of timestamps so we surface activity even if one of the gauges
+  // didn't report a sample on a given tick.
+  const allTimes = new Set<number>([...runByT.keys(), ...waitByT.keys()]);
+  for (const t of [...allTimes].toSorted((a, b) => a - b)) {
+    const running = runByT.get(t) ?? 0;
+    const waiting = waitByT.get(t) ?? 0;
+    queueDepth.push({ t: tOf(t), running, waiting, total: running + waiting });
   }
 
-  // Throughput: extract counter `rate` (already per-second from aiperf).
-  const counterRate = (name: string): TimeSeriesPoint[] => {
-    const s = firstSeries(metrics, name);
-    if (!s) return [];
-    const out: TimeSeriesPoint[] = [];
-    for (const ts of s.timeslices ?? []) {
-      if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {
-        out.push({ t: tOf(ts.start_ns), value: ts.rate });
-      }
-    }
-    return out;
-  };
+  // Throughput: sum the counter `rate` (already per-second) across engines.
+  const counterRate = (name: string): TimeSeriesPoint[] =>
+    sortedEntries(aggregateByStart(metrics[name]?.series, 'rate', 'sum')).map(([t, v]) => ({
+      t: tOf(t),
+      value: v,
+    }));
   const prefillTps = counterRate('vllm:prompt_tokens');
   const decodeTps = counterRate('vllm:generation_tokens');
 
-  // Per-source prompt tokens — emit one TS array per source label.
-  const promptTokensBySource: Record<string, TimeSeriesPoint[]> = {};
+  // Per-source prompt tokens — sum across engines per source label.
+  const promptBySrcByT = new Map<string, Map<number, number>>();
   for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) {
     const labels = series.labels ?? {};
     const source = labels['source'] ?? labels['reason'] ?? labels['kind'] ?? JSON.stringify(labels);
-    const arr: TimeSeriesPoint[] = [];
+    let byT = promptBySrcByT.get(source);
+    if (!byT) {
+      byT = new Map<number, number>();
+      promptBySrcByT.set(source, byT);
+    }
     for (const ts of series.timeslices ?? []) {
       if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {
-        arr.push({ t: tOf(ts.start_ns), value: ts.rate });
+        byT.set(ts.start_ns, (byT.get(ts.start_ns) ?? 0) + ts.rate);
       }
     }
+  }
+  const promptTokensBySource: Record<string, TimeSeriesPoint[]> = {};
+  for (const [source, byT] of promptBySrcByT) {
+    const arr: TimeSeriesPoint[] = [];
+    for (const [t, v] of [...byT.entries()].toSorted((a, b) => a[0] - b[0])) {
+      if (v > 0) arr.push({ t: tOf(t), value: v });
+    }
     if (arr.length > 0) promptTokensBySource[source] = arr;
   }
-
   return {
     version: CHART_SERIES_VERSION,
     startNs,
diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts
index 8ac4f678..1ad7fd7f 100644
--- a/packages/db/src/queries/agentic-aggregates.ts
+++ b/packages/db/src/queries/agentic-aggregates.ts
@@ -29,8 +29,11 @@ import type { DbClient } from '../connection.js';
  * script recomputes any row whose stored `aggregate_stats.version` is older.
  * Lives here (rather than in compute-aggregate-stats.ts) to avoid a circular
  * import: the compute helper depends on the percentile utilities below.
+ *
+ * v2: aggregate vllm gauges/counters across all engine series (was reading
+ * only series[0], which under-counted by Nx on multi-engine DP/PP deployments).
  */
-export const STATS_VERSION = 1;
+export const STATS_VERSION = 2;
 
 export interface MetricPercentiles {
   mean: number;
@@ -154,10 +157,47 @@ interface MetricsJson {
   metrics?: Record<string, MetricMeta>;
 }
 
+/**
+ * Aggregate a per-timeslice field across all series of a metric, indexed by
+ * the timeslice's `start_ns`. vllm reports one series per engine on
+ * multi-engine DP/PP deployments, so we sum (or average) across engines to
+ * get the cluster-wide value at each timeslice.
+ *
+ * `field` selects which numeric field on a timeslice to read (`avg` for
+ * gauges, `rate` for counters). `combine` controls cross-engine math:
+ * 'sum' for running/waiting/throughput counters where the cluster total is
+ * the sum; 'avg' for KV cache utilization, which is bounded [0, 1] per
+ * engine and should be averaged across engines for the cluster view.
+ */
+function aggregateSeriesByStart(
+  metricSeries: readonly Series[] | undefined,
+  field: 'avg' | 'rate',
+  combine: 'sum' | 'avg',
+): Map<number, number> {
+  const sums = new Map<number, number>(); // start_ns → running total of `field`
+  const counts = new Map<number, number>(); // start_ns → number of samples added to that total
+  for (const s of metricSeries ?? []) {
+    for (const ts of s.timeslices ?? []) {
+      if (typeof ts.start_ns !== 'number') continue; // slice cannot be keyed — ignore it
+      const v = ts[field];
+      if (typeof v !== 'number' || !Number.isFinite(v)) continue; // ignore missing or non-finite samples
+      sums.set(ts.start_ns, (sums.get(ts.start_ns) ?? 0) + v);
+      counts.set(ts.start_ns, (counts.get(ts.start_ns) ?? 0) + 1);
+    }
+  }
+  if (combine === 'sum') return sums;
+  const out = new Map<number, number>();
+  for (const [t, s] of sums) out.set(t, s / (counts.get(t) ?? 1)); // divide by samples seen at t (series that skipped t are excluded)
+  return out;
+}
+
 /**
  * Parse the server_metrics_json → time-series arrays for KV cache util and
  * prefix cache hit rate (per-interval, computed from the prometheus
  * counters the same way trace-server-metrics does it).
+ *
+ * Aggregates across all engine series so multi-engine DP/PP deployments are
+ * counted correctly (previously we only read engine 0).
  */
 export function extractServerMetricSamples(json: string): {
   kvCacheUtil: number[];
@@ -165,40 +205,26 @@ export function extractServerMetricSamples(json: string): {
 } {
   const parsed = JSON.parse(json) as MetricsJson;
   const metrics = parsed.metrics ?? {};
-  const firstSeries = (name: string): Series | undefined => {
-    const s = metrics[name]?.series;
-    return s && s.length > 0 ? s[0] : undefined;
-  };
 
-  // KV cache util — gauge in [0, 1].
-  const kvSeries =
-    firstSeries('vllm:kv_cache_usage_perc') ?? firstSeries('vllm:gpu_cache_usage_perc');
-  const kvCacheUtil: number[] = [];
-  for (const ts of kvSeries?.timeslices ?? []) {
-    if (typeof ts.avg === 'number') kvCacheUtil.push(ts.avg);
-  }
+  // KV cache util — per-engine gauge in [0, 1]. Average across engines so the
+  // value stays a percentage; summing would give meaningless 0..N.
+  const kvSeriesAll =
+    metrics['vllm:kv_cache_usage_perc']?.series ?? metrics['vllm:gpu_cache_usage_perc']?.series;
+  const kvCacheUtil = [...aggregateSeriesByStart(kvSeriesAll, 'avg', 'avg').values()];
 
-  // Prefix cache hit rate per interval = hits.rate / queries.rate.
-  // Matches the derivation in queries/trace-server-metrics.ts.
-  // Metric names: vllm exposes these as `vllm:prefix_cache_*` (no `gpu_`
-  // prefix); falls back to the `gpu_`-prefixed names in case a future
-  // vllm version renames them.
+  // Prefix cache hit rate per interval = Σhits.rate / Σqueries.rate across
+  // all engines. Sum first, then divide.
+  const hitsAll =
+    metrics['vllm:prefix_cache_hits']?.series ?? metrics['vllm:gpu_prefix_cache_hits']?.series;
+  const queriesAll =
+    metrics['vllm:prefix_cache_queries']?.series ??
+    metrics['vllm:gpu_prefix_cache_queries']?.series;
+  const hitsByT = aggregateSeriesByStart(hitsAll, 'rate', 'sum');
+  const qByT = aggregateSeriesByStart(queriesAll, 'rate', 'sum');
   const prefixCacheHitRate: number[] = [];
-  const hitsSeries =
-    firstSeries('vllm:prefix_cache_hits') ?? firstSeries('vllm:gpu_prefix_cache_hits');
-  const queriesSeries =
-    firstSeries('vllm:prefix_cache_queries') ?? firstSeries('vllm:gpu_prefix_cache_queries');
-  if (hitsSeries && queriesSeries) {
-    const qByStart = new Map<number, TimeSlice>();
-    for (const q of queriesSeries.timeslices ?? []) {
-      if (typeof q.start_ns === 'number') qByStart.set(q.start_ns, q);
-    }
-    for (const h of hitsSeries.timeslices ?? []) {
-      if (typeof h.start_ns !== 'number' || typeof h.rate !== 'number') continue;
-      const q = qByStart.get(h.start_ns);
-      if (!q || typeof q.rate !== 'number' || q.rate === 0) continue;
-      prefixCacheHitRate.push(h.rate / q.rate);
-    }
+  for (const [t, h] of hitsByT) {
+    const q = qByT.get(t);
+    if (q !== undefined && q > 0) prefixCacheHitRate.push(h / q);
   }
 
   return { kvCacheUtil, prefixCacheHitRate };

From b3e315ccd66bfc5476fc7bf28b1b3c52628ffd8d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 26 May 2026 18:28:33 -0500
Subject: [PATCH 42/96] fix(scenario-selector): wrap "Deprecated" in
 SelectLabel + lead with agentic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two visual issues in the Scenario dropdown:
1. The "Deprecated" sub-header rendered as a bare span while sibling
   group labels ("Fixed Sequence Length") use SelectLabel — so
   "Deprecated" came out in body-text size, looking out of place.
2. Agentic Traces sat below the deprecated fixed-seq entries, visually
   implying it was part of the deprecated section.

Wraps DeprecatedSectionTitle in SelectLabel so the styling matches its
peers across all selectors (Scenario, Model, Hardware) that use it.
Moves the Agentic group to the top of the Scenario dropdown so it's
visually distinct from the fixed-seq + deprecated entries.

Agentic Traces was already the preferred default when available
(GlobalFilterContext.tsx); no behavior change there.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../app/src/components/ui/chart-selectors.tsx | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx
index 19b4bfb0..8b91059a 100644
--- a/packages/app/src/components/ui/chart-selectors.tsx
+++ b/packages/app/src/components/ui/chart-selectors.tsx
@@ -33,7 +33,7 @@ import {
 
 function DeprecatedSectionTitle({ reason }: { reason: string }) {
   return (
-    <span className="flex items-center gap-1">
+    <SelectLabel className="flex items-center gap-1">
       Deprecated
       <TooltipRoot>
         <TooltipTrigger asChild>
@@ -43,7 +43,7 @@ function DeprecatedSectionTitle({ reason }: { reason: string }) {
           <span>{reason}</span>
         </TooltipContent>
       </TooltipRoot>
-    </span>
+    </SelectLabel>
   );
 }
 
@@ -261,6 +261,17 @@ export function ScenarioSelector({
           <SelectValue />
         </SelectTrigger>
         <SelectContent>
+          {/* Agentic first — preferred default scenario when available. */}
+          {agentic.length > 0 && (
+            <SelectGroup>
+              <SelectLabel>Agentic</SelectLabel>
+              {agentic.map((seq) => (
+                <SelectItem key={seq} value={seq}>
+                  {getSequenceLabel(seq as Sequence)}
+                </SelectItem>
+              ))}
+            </SelectGroup>
+          )}
           {fixedSeq.length > 0 && (
             <SelectGroup>
               <SelectLabel>Fixed Sequence Length</SelectLabel>
@@ -281,11 +292,6 @@ export function ScenarioSelector({
               )}
             </SelectGroup>
           )}
-          {agentic.map((seq) => (
-            <SelectItem key={seq} value={seq}>
-              {getSequenceLabel(seq as Sequence)}
-            </SelectItem>
-          ))}
         </SelectContent>
       </Select>
     </div>

From 19b99586353cd39bccd4072bd6e2a2afcaf73367 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 26 May 2026 18:32:26 -0500
Subject: [PATCH 43/96] fix(scenario-selector): wrap Deprecated header in
 SelectLabel only inside Select
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previous commit (b3e315c) changed DeprecatedSectionTitle to render
SelectLabel internally, which throws at runtime ("SelectLabel must be
used within SelectGroup") in callsites that render the header via
MultiSelect — MultiSelect wraps the header in its own div, not a Radix
SelectGroup.

Revert the component to a plain styled span (MultiSelect's div wrapper
supplies the small/muted styling), and wrap with SelectLabel only at
the ScenarioSelector callsite, where the header sits directly inside
a SelectGroup.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../app/src/components/ui/chart-selectors.tsx     | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx
index 8b91059a..49ea3f1a 100644
--- a/packages/app/src/components/ui/chart-selectors.tsx
+++ b/packages/app/src/components/ui/chart-selectors.tsx
@@ -31,9 +31,16 @@ import {
   sequenceKind,
 } from '@/lib/data-mappings';
 
+/**
+ * "Deprecated" sub-header used by selectors. Rendered as a span (not
+ * SelectLabel) because some callsites use `MultiSelect`, which wraps
+ * headers in its own div and isn't a SelectGroup. The span only sets flex
+ * layout — the parent context supplies the muted/small text treatment.
+ * ScenarioSelector wraps it in a SelectLabel at the callsite.
+ */
 function DeprecatedSectionTitle({ reason }: { reason: string }) {
   return (
-    <SelectLabel className="flex items-center gap-1">
+    <span className="flex items-center gap-1">
       Deprecated
       <TooltipRoot>
         <TooltipTrigger asChild>
@@ -43,7 +50,7 @@ function DeprecatedSectionTitle({ reason }: { reason: string }) {
           <span>{reason}</span>
         </TooltipContent>
       </TooltipRoot>
-    </SelectLabel>
+    </span>
   );
 }
 
@@ -282,7 +289,9 @@ export function ScenarioSelector({
               ))}
               {fixedGroups.deprecated.length > 0 && (
                 <>
-                  <DeprecatedSectionTitle reason="CI capacity was reallocated to agentic coding and multi-turn chat scenarios." />
+                  <SelectLabel>
+                    <DeprecatedSectionTitle reason="CI capacity was reallocated to agentic coding and multi-turn chat scenarios." />
+                  </SelectLabel>
                   {fixedGroups.deprecated.map((seq) => (
                     <SelectItem key={seq} value={seq}>
                       {getSequenceLabel(seq as Sequence)}

From 7114833409b92a206f7c22b80846db527e01da43 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 27 May 2026 13:22:13 -0500
Subject: [PATCH 44/96] feat(agentic-detail): add cumulative input tokens chart
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Surfaces a new chart on the agentic detail page showing the running
total of input (prompt) tokens served over the course of the run —
useful for seeing how the load actually accumulates vs the
instantaneous prefill_tps line we already plot.

Adds a `cumulativeSum` helper alongside the existing `cumulativeAverage`
and `sumSeries` time-series utilities. No backfill needed — the source
data (`chart_series.prefillTps`) is already pre-computed at ingest time
for every blob-bearing row.

(Input throughput as a Pareto axis is already wired via the existing
`y_inputTputPerGpu` y-axis option; no change there.)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/agentic-point-detail.tsx    | 24 +++++++++++++++++++
 .../agentic-point/time-series-chart.tsx       | 17 +++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 2e43b4fb..1a61b93b 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -26,6 +26,7 @@ import {
   StackedAreaChart,
   TimeSeriesChart,
   cumulativeAverage,
+  cumulativeSum,
   rollingAverage,
   sumSeries,
 } from './time-series-chart';
@@ -381,6 +382,29 @@ export function AgenticPointDetail({ id }: Props) {
               );
             }}
           />
+
+          <ExpandableChart
+            title="Total input tokens over time"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (!metrics) return <Skeleton />;
+              return (
+                <TimeSeriesChart
+                  series={[
+                    {
+                      name: 'Cumulative input tokens',
+                      data: cumulativeSum(metrics.prefillTps),
+                      color: '#3b82f6',
+                      strokeWidth: 2,
+                    },
+                  ]}
+                  durationS={metrics.durationS}
+                  yAxisLabel="Tokens"
+                  {...size}
+                />
+              );
+            }}
+          />
         </div>
       )}
     </div>
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index cd10aff7..042c4331 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -58,6 +58,23 @@ export function cumulativeAverage(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
   return out;
 }
 
+/**
+ * Running total of a per-interval rate series: output point i keeps `data[i].t`
+ * and carries the sum of `value` over points 0..i. Values are added as-is (no Δt
+ * weighting), so the total is a true integral only when samples are 1 second apart
+ * (presumably aiperf's scrape interval — confirm). Use for "total tokens served so far".
+ */
+export function cumulativeSum(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
+  if (data.length === 0) return data; // empty input is handed back as the same array, not a copy
+  const out: TimeSeriesPoint[] = Array.from({ length: data.length });
+  let sum = 0;
+  for (let i = 0; i < data.length; i++) {
+    sum += data[i]!.value;
+    out[i] = { t: data[i]!.t, value: sum };
+  }
+  return out;
+}
+
 /** Pointwise sum of two arrays sharing the same t index. */
 export function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSeriesPoint[] {
   const n = Math.min(a.length, b.length);

From c6697de8ff3d8263924986fd71b4622f1369f9a3 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 27 May 2026 14:44:19 -0500
Subject: [PATCH 45/96] feat(agentic-detail): plot cumulative unique input
 tokens
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the "Total input tokens over time" chart with "Total unique
input tokens over time" — cumsum of (prompt-token rate − prefix-cache-
hit rate per second), which equals the cumulative tokens vllm actually
had to prefill from scratch (= vllm:request_prefill_kv_computed_tokens).

Adds `prefixCacheHitsTps` to the chart_series JSONB (extracted by
summing vllm:prefix_cache_hits.rate across all engine series, same DP-
aware path as prefillTps). Bumps CHART_SERIES_VERSION to 3; the
existing trace-server-metrics query defaults the field to [] for any
older v2 rows so reads stay safe before backfill catches up.

Backfilled 62 rows to v3.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/agentic-point-detail.tsx       | 14 +++++++++++---
 .../src/hooks/api/use-trace-server-metrics.ts    |  2 ++
 packages/db/src/etl/compute-chart-series.ts      | 16 +++++++++++++++-
 packages/db/src/queries/trace-server-metrics.ts  |  4 ++++
 4 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 1a61b93b..4bebd37c 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -384,16 +384,24 @@ export function AgenticPointDetail({ id }: Props) {
           />
 
           <ExpandableChart
-            title="Total input tokens over time"
+            title="Total unique input tokens over time"
             render={(expanded) => {
               const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
               if (!metrics) return <Skeleton />;
+              // Unique = total prompt tokens vllm received minus the tokens
+              // it served from the prefix cache. The cache-miss portion is
+              // what actually constitutes "new content" the GPU had to
+              // process — equivalent to cumsum of vllm:request_prefill_kv_computed_tokens.
+              const unique = sumSeries(
+                metrics.prefillTps,
+                metrics.prefixCacheHitsTps.map((p) => ({ t: p.t, value: -p.value })),
+              );
               return (
                 <TimeSeriesChart
                   series={[
                     {
-                      name: 'Cumulative input tokens',
-                      data: cumulativeSum(metrics.prefillTps),
+                      name: 'Cumulative unique input tokens',
+                      data: cumulativeSum(unique),
                       color: '#3b82f6',
                       strokeWidth: 2,
                     },
diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts
index 8418aa4f..664bc6c7 100644
--- a/packages/app/src/hooks/api/use-trace-server-metrics.ts
+++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts
@@ -42,6 +42,8 @@ export interface TraceServerMetrics {
   promptTokensBySource: Record<string, TimeSeriesPoint[]>;
   prefillTps: TimeSeriesPoint[];
   decodeTps: TimeSeriesPoint[];
+  /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */
+  prefixCacheHitsTps: TimeSeriesPoint[];
 }
 
 async function fetchTraceServerMetrics(
diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 530600cf..91e89521 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -25,8 +25,11 @@ import { streamObject } from 'stream-json/streamers/stream-object.js';
  * only series[0], which under-counted by Nx on multi-engine DP/PP
  * deployments — most visible as a request-queue-depth chart that maxed out
  * at ~3 when the timeline clearly showed 20+ in-flight).
+ *
+ * v3: extract `prefixCacheHitsTps` so the detail page can derive cumulative
+ * unique input tokens as cumsum(prefillTps - prefixCacheHitsTps).
  */
-export const CHART_SERIES_VERSION = 2;
+export const CHART_SERIES_VERSION = 3;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -57,6 +60,13 @@ export interface ChartSeries {
   promptTokensBySource: Record<string, TimeSeriesPoint[]>;
   prefillTps: TimeSeriesPoint[];
   decodeTps: TimeSeriesPoint[];
+  /**
+   * Per-scrape rate (tokens/sec) of vllm:prefix_cache_hits, summed across
+   * engines. Detail page derives "cumulative unique input tokens" as
+   * cumsum(prefillTps - prefixCacheHitsTps) — the prompt tokens that were
+   * NOT served from the prefix cache (total received minus cache hits).
+   */
+  prefixCacheHitsTps: TimeSeriesPoint[];
 }
 
 // ── Raw blob shapes (subset we read) ────────────────────────────────────
@@ -249,6 +259,9 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
     }));
   const prefillTps = counterRate('vllm:prompt_tokens');
   const decodeTps = counterRate('vllm:generation_tokens');
+  // Tokens served from prefix cache per scrape. Lets the frontend derive
+  // "cumulative unique input tokens" = cumsum(prefillTps) − cumsum(hits).
+  const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits');
 
   // Per-source prompt tokens — sum across engines per source label.
   const promptBySrcByT = new Map<string, Map<number, number>>();
@@ -286,5 +299,6 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
     promptTokensBySource,
     prefillTps,
     decodeTps,
+    prefixCacheHitsTps,
   };
 }
diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts
index 624b6ed3..76775e77 100644
--- a/packages/db/src/queries/trace-server-metrics.ts
+++ b/packages/db/src/queries/trace-server-metrics.ts
@@ -71,6 +71,8 @@ export interface TraceServerMetrics {
   prefillTps: TimeSeriesPoint[];
   /** Decode throughput: vllm:generation_tokens rate (tokens/sec) per scrape. */
   decodeTps: TimeSeriesPoint[];
+  /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */
+  prefixCacheHitsTps: TimeSeriesPoint[];
 }
 
 interface RawMetaRow extends PointMeta {
@@ -114,6 +116,8 @@ function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics {
     promptTokensBySource: series.promptTokensBySource,
     prefillTps: series.prefillTps,
     decodeTps: series.decodeTps,
+    // v2 chart_series rows pre-backfill don't have this field — default to []
+    prefixCacheHitsTps: series.prefixCacheHitsTps ?? [],
   };
 }
 

From b5679bb10acfd6a6765b48a5864b2a0ec73d4915 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 27 May 2026 15:00:12 -0500
Subject: [PATCH 46/96] feat(request-timeline): expandable subagent -> stream
 rows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The harness fans a single subagent into multiple parallel ":sN" streams
when its inner requests overlap in time (weka_trace._pack_into_streams).
Previously each :sN got its own swimlane row, which made one parent
conversation with 5 subagents (each fanned into 2-8 streams) render as
23 separate rows — visually implying 23 distinct subagent invocations
when really there are 5.

Now: each subagent shows as one row by default with a chevron + stream
count chip ("subagent 003 · f1e7 ×8"). The collapsed row draws the
union of all stream bars overlaid, so the concurrency burst is still
visible at a glance. Click the chevron to fan into per-stream rows;
click again to collapse.

For conv 0f5b266f in benchmark 206360: 23 rows → 5 rows by default.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/request-timeline.tsx        | 325 ++++++++++++------
 1 file changed, 226 insertions(+), 99 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
index bcbe105a..8762a158 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -53,44 +53,84 @@ const PHASE_COLORS: Record<string, string> = {
   unknown: '#64748b',
 };
 
+/**
+ * Row kinds:
+ *   parent           — top-level conversation (depth 0)
+ *   worker           — worker swimlane (depth 0, worker mode)
+ *   subagent         — a subagent invocation (depth 1). Either a single
+ *                      stream (renders its own bars), or a multi-stream
+ *                      container whose bars are the union of its streams
+ *                      when collapsed.
+ *   stream           — one :sN stream of a multi-stream subagent (depth 2).
+ *                      Hidden by default; toggled via the owning subagent row's chevron.
+ */
+type RowKind = 'parent' | 'worker' | 'subagent' | 'stream';
+
 interface Row {
   key: string;
   label: string;
   color: string;
   requests: RequestRecord[];
-  /** 0 = top-level conversation/worker, 1+ = sub-agent under its parent. */
   depth: number;
-  /** True if this row is a sub-agent ("Subagent N of parent X"). */
-  isSubagent: boolean;
+  kind: RowKind;
+  /** Number of streams under this subagent (>=1). Only set for subagent rows. */
+  streamCount?: number;
+  /** For stream rows: the parent subagent's row key (drives expand/collapse). */
+  parentRowKey?: string;
 }
 
 /**
  * Conversation ids for subagent calls look like
- *   <parent_cid>::sa:subagent_<N>_<hash>
- * Split into the parent cid and a sub-agent label (or the whole thing if
- * this is a top-level conversation).
+ *   <parent_cid>::sa:<agent_id>[:s<stream_idx>]
+ * The optional `:s<N>` suffix is set when the harness fans a single
+ * subagent into multiple parallel "streams" (interval-graph
+ * decomposition in weka_trace._pack_into_streams). We split it off so
+ * we can group all streams of one subagent under a single header row.
  */
-function splitCid(cid: string): { parent: string; subagent: string | null } {
+function splitCid(cid: string): {
+  parent: string;
+  subagentBase: string | null;
+  stream: number | null;
+} {
   const sep = cid.indexOf('::sa:');
-  if (sep === -1) return { parent: cid, subagent: null };
-  return { parent: cid.slice(0, sep), subagent: cid.slice(sep + 5) };
+  if (sep === -1) return { parent: cid, subagentBase: null, stream: null };
+  const parent = cid.slice(0, sep);
+  const raw = cid.slice(sep + 5);
+  const m = /^(.*):s(\d+)$/.exec(raw);
+  if (m) return { parent, subagentBase: m[1]!, stream: Number(m[2]) };
+  return { parent, subagentBase: raw, stream: null };
 }
 
-/** Group requests into rows; in conversation mode subagents nest under parents. */
-function buildRows(requests: RequestRecord[], mode: RowMode): Row[] {
-  const groups = new Map<string, RequestRecord[]>();
-  for (const r of requests) {
-    const key = mode === 'conversation' ? r.cid : r.wid;
-    let list = groups.get(key);
-    if (!list) {
-      list = [];
-      groups.set(key, list);
-    }
-    list.push(r);
-  }
-
+/**
+ * Group requests into rows. In conversation mode, output order is:
+ *   parent_conv
+ *     subagent_001                  (collapsed by default, container)
+ *       :s0                         (hidden unless expanded)
+ *       :s1
+ *     subagent_002
+ *     ...
+ *
+ * `expandedSubagents` controls which subagent containers reveal their
+ * stream children. Bars on a collapsed subagent are the UNION of all its
+ * streams' requests — overlapping bars visually communicate the
+ * stream-level parallelism without expanding.
+ */
+function buildRows(
+  requests: RequestRecord[],
+  mode: RowMode,
+  expandedSubagents: ReadonlySet<string>,
+): Row[] {
   if (mode !== 'conversation') {
     // Worker mode: flat rows, sorted by first activity.
+    const groups = new Map<string, RequestRecord[]>();
+    for (const r of requests) {
+      let list = groups.get(r.wid);
+      if (!list) {
+        list = [];
+        groups.set(r.wid, list);
+      }
+      list.push(r);
+    }
     const rows: Row[] = [];
     let i = 0;
     for (const [key, list] of groups) {
@@ -101,7 +141,7 @@ function buildRows(requests: RequestRecord[], mode: RowMode): Row[] {
         color: ROW_COLORS[i % ROW_COLORS.length]!,
         requests: list,
         depth: 0,
-        isSubagent: false,
+        kind: 'worker',
       });
       i++;
     }
@@ -109,36 +149,40 @@ function buildRows(requests: RequestRecord[], mode: RowMode): Row[] {
     return rows;
   }
 
-  // Conversation mode: build a parent → [subagents] tree so each parent
-  // group renders as one parent row followed by its sub-agent rows. Color
-  // is shared inside a tree so the visual grouping reads.
+  // Conversation mode — tree: parent → subagent → stream.
   interface Tree {
     parentCid: string;
-    parentRow: { key: string; requests: RequestRecord[] } | null;
-    subagents: Map<string, RequestRecord[]>; // subagent label → requests
+    parentReqs: RequestRecord[];
+    // subagentBase → (streamIndex|null → requests)
+    subagents: Map<string, Map<number | null, RequestRecord[]>>;
     firstStart: number;
   }
   const trees = new Map<string, Tree>();
-  for (const [cid, list] of groups) {
-    list.sort((a, b) => a.start - b.start);
-    const { parent, subagent } = splitCid(cid);
+  for (const r of requests) {
+    const { parent, subagentBase, stream } = splitCid(r.cid);
     let tree = trees.get(parent);
     if (!tree) {
       tree = {
         parentCid: parent,
-        parentRow: null,
+        parentReqs: [],
         subagents: new Map(),
         firstStart: Number.POSITIVE_INFINITY,
       };
       trees.set(parent, tree);
     }
-    if (subagent === null) {
-      tree.parentRow = { key: cid, requests: list };
+    if (subagentBase === null) {
+      tree.parentReqs.push(r);
     } else {
-      tree.subagents.set(subagent, list);
+      let saMap = tree.subagents.get(subagentBase);
+      if (!saMap) {
+        saMap = new Map();
+        tree.subagents.set(subagentBase, saMap);
+      }
+      const list = saMap.get(stream);
+      if (list) list.push(r);
+      else saMap.set(stream, [r]);
     }
-    const earliest = list[0]!.start;
-    if (earliest < tree.firstStart) tree.firstStart = earliest;
+    if (r.start < tree.firstStart) tree.firstStart = r.start;
   }
 
   const sortedTrees = [...trees.values()].toSorted((a, b) => a.firstStart - b.firstStart);
@@ -147,39 +191,66 @@ function buildRows(requests: RequestRecord[], mode: RowMode): Row[] {
   for (const tree of sortedTrees) {
     const color = ROW_COLORS[colorIdx % ROW_COLORS.length]!;
     colorIdx++;
-    if (tree.parentRow) {
+    // Parent row (use a placeholder key if the parent itself wasn't replayed).
+    tree.parentReqs.sort((a, b) => a.start - b.start);
+    rows.push({
+      key: tree.parentReqs.length > 0 ? tree.parentCid : `__parent_${tree.parentCid}`,
+      label: tree.parentCid,
+      color,
+      requests: tree.parentReqs,
+      depth: 0,
+      kind: 'parent',
+    });
+
+    // One subagent row per base (which may contain N streams).
+    const subagentEntries = [...tree.subagents.entries()].toSorted((a, b) => {
+      const aStart = Math.min(
+        ...[...a[1].values()].map((reqs) => reqs[0]?.start ?? Number.POSITIVE_INFINITY),
+      );
+      const bStart = Math.min(
+        ...[...b[1].values()].map((reqs) => reqs[0]?.start ?? Number.POSITIVE_INFINITY),
+      );
+      return aStart - bStart;
+    });
+    for (const [saBase, streams] of subagentEntries) {
+      const subagentKey = `${tree.parentCid}::sa:${saBase}`;
+      // Union of all stream requests for collapsed-view bars.
+      const allReqs: RequestRecord[] = [];
+      for (const reqs of streams.values()) allReqs.push(...reqs);
+      allReqs.sort((a, b) => a.start - b.start);
+      const streamCount = streams.size;
       rows.push({
-        key: tree.parentRow.key,
-        label: shortenCid(tree.parentCid),
+        key: subagentKey,
+        label: `↳ ${formatSubagentLabel(saBase)}`,
         color,
-        requests: tree.parentRow.requests,
-        depth: 0,
-        isSubagent: false,
-      });
-    } else {
-      // Pseudo-parent header so orphan subagents still render under
-      // something they belong to.
-      rows.push({
-        key: `__parent_${tree.parentCid}`,
-        label: shortenCid(tree.parentCid),
-        color,
-        requests: [],
-        depth: 0,
-        isSubagent: false,
-      });
-    }
-    const subagentEntries = [...tree.subagents.entries()].toSorted(
-      (a, b) => a[1][0]!.start - b[1][0]!.start,
-    );
-    for (const [saLabel, list] of subagentEntries) {
-      rows.push({
-        key: `${tree.parentCid}::${saLabel}`,
-        label: `↳ ${formatSubagentLabel(saLabel)}`,
-        color,
-        requests: list,
+        requests: allReqs,
         depth: 1,
-        isSubagent: true,
+        kind: 'subagent',
+        streamCount,
       });
+
+      // Stream children only when expanded AND there's more than one
+      // stream (a single-stream subagent has nothing extra to show).
+      if (streamCount > 1 && expandedSubagents.has(subagentKey)) {
+        const streamEntries = [...streams.entries()].toSorted((a, b) => {
+          // Sort by stream index (null first as the "default" stream)
+          const ai = a[0] ?? -1;
+          const bi = b[0] ?? -1;
+          return ai - bi;
+        });
+        for (const [streamIdx, reqs] of streamEntries) {
+          reqs.sort((a, b) => a.start - b.start);
+          rows.push({
+            key: `${subagentKey}:s${streamIdx ?? '∅'}`,
+            label: `stream ${streamIdx ?? '∅'}`,
+            color,
+            requests: reqs,
+            depth: 2,
+            kind: 'stream',
+            parentRowKey: subagentKey,
+          });
+        }
+      }
     }
   }
   return rows;
@@ -192,11 +263,6 @@ function formatSubagentLabel(raw: string): string {
   return `subagent ${m[1]} · ${m[2]!.slice(0, 4)}`;
 }
 
-function shortenCid(cid: string): string {
-  if (cid.length <= 12) return cid;
-  return `${cid.slice(0, 8)}…${cid.slice(-4)}`;
-}
-
 function shortenWid(wid: string): string {
   // worker_4ae87bea → w_4ae8
   return wid.replace(/^worker_/, 'w_').slice(0, 12);
@@ -314,6 +380,17 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
   const [rowMode, setRowMode] = useState<RowMode>('conversation');
   const [phaseFilter, setPhaseFilter] = useState<PhaseFilter>('profiling');
   const [tooltip, setTooltip] = useState<TooltipData | null>(null);
+  // Which multi-stream subagents currently have their per-stream rows
+  // expanded. Key is the subagent row's `key` (parent_cid::sa:agent_id).
+  const [expandedSubagents, setExpandedSubagents] = useState<ReadonlySet<string>>(() => new Set());
+  const toggleSubagent = useCallback((key: string) => {
+    setExpandedSubagents((prev) => {
+      const next = new Set(prev);
+      if (next.has(key)) next.delete(key);
+      else next.add(key);
+      return next;
+    });
+  }, []);
   const dragRef = useRef<{ startX: number; vs: number; ve: number } | null>(null);
 
   // Apply phase filter, then group into rows.
@@ -322,7 +399,10 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
       phaseFilter === 'all' ? data.requests : data.requests.filter((r) => r.phase === 'profiling'),
     [data.requests, phaseFilter],
   );
-  const rows = useMemo(() => buildRows(filtered, rowMode), [filtered, rowMode]);
+  const rows = useMemo(
+    () => buildRows(filtered, rowMode, expandedSubagents),
+    [filtered, rowMode, expandedSubagents],
+  );
 
   // Pre-sort the timestamp columns so the cursor-time stats popover can
   // count "running / waiting at time t" in O(log n). With a few hundred
@@ -359,7 +439,10 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
   const isZoomed = viewEnd !== null;
 
   // Layout
-  const LABEL_WIDTH = 160;
+  // Wide enough for a full 36-char conversation id at 10px font, plus the
+  // indent + color stripe + count badge. Subagent rows inherit the same
+  // width but truncate the longer "↳ subagent N · hash" tail with ellipsis.
+  const LABEL_WIDTH = 360;
   const ROW_HEIGHT = 22;
   const ROW_GAP = 3;
   const HEADER_HEIGHT = 24;
@@ -537,33 +620,58 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
                 {rowMode === 'conversation' ? 'Conversation' : 'Worker'}
               </span>
             </div>
-            {rows.map((row) => (
-              <div
-                key={row.key}
-                className="flex items-center gap-1.5 overflow-hidden pr-2"
-                style={{
-                  height: ROW_HEIGHT + ROW_GAP,
-                  paddingLeft: 8 + row.depth * 12,
-                }}
-              >
-                <span
-                  className="inline-block w-1 h-3 rounded-sm flex-shrink-0"
+            {rows.map((row) => {
+              const isSubagentRow = row.kind === 'subagent';
+              const isStreamRow = row.kind === 'stream';
+              const isExpandable = isSubagentRow && (row.streamCount ?? 1) > 1;
+              const isExpanded = isExpandable && expandedSubagents.has(row.key);
+              return (
+                <div
+                  key={row.key}
+                  className="flex items-center gap-1 overflow-hidden pr-2"
                   style={{
-                    backgroundColor: row.color,
-                    opacity: row.isSubagent ? 0.55 : 1,
+                    height: ROW_HEIGHT + ROW_GAP,
+                    paddingLeft: 4 + row.depth * 10,
                   }}
-                />
-                <span
-                  className="text-[10px] font-mono truncate"
-                  style={{ color: row.color, opacity: row.isSubagent ? 0.85 : 1 }}
                 >
-                  {row.label}
-                </span>
-                <span className="text-[9px] font-mono text-muted-foreground ml-auto shrink-0">
-                  {row.requests.length > 0 ? row.requests.length : '—'}
-                </span>
-              </div>
-            ))}
+                  {isExpandable ? (
+                    <button
+                      type="button"
+                      onClick={() => toggleSubagent(row.key)}
+                      className="size-3.5 flex items-center justify-center text-muted-foreground hover:text-foreground shrink-0"
+                      aria-label={isExpanded ? 'Collapse streams' : 'Expand streams'}
+                      title={isExpanded ? 'Collapse streams' : 'Expand streams'}
+                    >
+                      <span className="text-[10px] leading-none">{isExpanded ? '▾' : '▸'}</span>
+                    </button>
+                  ) : (
+                    <span className="size-3.5 shrink-0" />
+                  )}
+                  <span
+                    className="inline-block w-1 h-3 rounded-sm flex-shrink-0"
+                    style={{
+                      backgroundColor: row.color,
+                      opacity: isStreamRow ? 0.4 : isSubagentRow ? 0.55 : 1,
+                    }}
+                  />
+                  <span
+                    className="text-[10px] font-mono truncate"
+                    style={{
+                      color: row.color,
+                      opacity: isStreamRow ? 0.7 : isSubagentRow ? 0.85 : 1,
+                    }}
+                  >
+                    {row.label}
+                    {isExpandable && (
+                      <span className="text-muted-foreground ml-1">×{row.streamCount}</span>
+                    )}
+                  </span>
+                  <span className="text-[9px] font-mono text-muted-foreground ml-auto shrink-0">
+                    {row.requests.length > 0 ? row.requests.length : '—'}
+                  </span>
+                </div>
+              );
+            })}
           </div>
 
           {/* Scrollable SVG */}
@@ -636,6 +744,16 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
               {rows.map((row, rowIdx) => {
                 const yTop = HEADER_HEIGHT + rowIdx * (ROW_HEIGHT + ROW_GAP) + 2;
                 const barH = ROW_HEIGHT - 4;
+                // For multi-stream subagent containers, suppress the union
+                // bars when expanded — the child stream rows draw them
+                // individually instead, so we'd double-draw otherwise.
+                if (
+                  row.kind === 'subagent' &&
+                  (row.streamCount ?? 1) > 1 &&
+                  expandedSubagents.has(row.key)
+                ) {
+                  return null;
+                }
                 return row.requests.map((req) => {
                   const xCredit = xOf(req.credit);
                   const xStart = xOf(req.start);
@@ -663,7 +781,8 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
                           opacity={0.35}
                         />
                       )}
-                      {/* Main bar */}
+                      {/* Main bar — opacity stepped down with depth so
+                          parent > subagent > stream reads visually. */}
                       <rect
                         x={xStart}
                         y={yTop}
@@ -671,7 +790,15 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
                         height={barH}
                         rx={2}
                         fill={row.color}
-                        opacity={req.cancelled ? 0.35 : row.isSubagent ? 0.6 : 0.85}
+                        opacity={
+                          req.cancelled
+                            ? 0.35
+                            : row.kind === 'stream'
+                              ? 0.5
+                              : row.kind === 'subagent'
+                                ? 0.6
+                                : 0.85
+                        }
                       />
                       {/* Phase strip at bottom */}
                       <rect

From 2e1f1ce33da85dbc8058bf41feffffc04ba7ee26 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 27 May 2026 15:07:27 -0500
Subject: [PATCH 47/96] fix(agentic-detail): make unique-input-tokens chart
 monotonic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

vllm's per-scrape prompt_tokens.rate and prefix_cache_hits.rate counters
can lag each other by several seconds across scrapes (we see prefill=0
at one tick with hits=1.1M, then prefill=1.5M with hits=452K six ticks
later — lifetime totals agree but per-tick they don't). Computing
cumsum(prefill - hits) per tick made the chart dip well negative at
the start.

Replaces the per-tick subtraction with `cumulativeDifferenceMonotonic`:
union the two series by timestamp, accumulate each independently, take
the diff, then enforce a running max so the curve never decreases.
End-of-run totals are unchanged (both counters converge to the right
value); transient skew just looks like a brief plateau instead of a
negative dip.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/agentic-point-detail.tsx    | 21 ++++++-----
 .../agentic-point/time-series-chart.tsx       | 37 +++++++++++++++++++
 2 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 4bebd37c..1abf64e6 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -26,7 +26,7 @@ import {
   StackedAreaChart,
   TimeSeriesChart,
   cumulativeAverage,
-  cumulativeSum,
+  cumulativeDifferenceMonotonic,
   rollingAverage,
   sumSeries,
 } from './time-series-chart';
@@ -388,20 +388,21 @@ export function AgenticPointDetail({ id }: Props) {
             render={(expanded) => {
               const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
               if (!metrics) return <Skeleton />;
-              // Unique = total prompt tokens vllm received minus the tokens
-              // it served from the prefix cache. The cache-miss portion is
-              // what actually constitutes "new content" the GPU had to
-              // process — equivalent to cumsum of vllm:request_prefill_kv_computed_tokens.
-              const unique = sumSeries(
-                metrics.prefillTps,
-                metrics.prefixCacheHitsTps.map((p) => ({ t: p.t, value: -p.value })),
-              );
+              // Unique = total prompt tokens received minus tokens served
+              // from the prefix cache. Equivalent to cumsum of
+              // vllm:request_prefill_kv_computed_tokens. We compute it as
+              // monotonic-non-decreasing cumulative-diff so per-scrape
+              // timing skew between the prompt_tokens and prefix_cache_hits
+              // counters can't make the line dip negative.
               return (
                 <TimeSeriesChart
                   series={[
                     {
                       name: 'Cumulative unique input tokens',
-                      data: cumulativeSum(unique),
+                      data: cumulativeDifferenceMonotonic(
+                        metrics.prefillTps,
+                        metrics.prefixCacheHitsTps,
+                      ),
                       color: '#3b82f6',
                       strokeWidth: 2,
                     },
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index 042c4331..25d5a672 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -75,6 +75,43 @@ export function cumulativeSum(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
   return out;
 }
 
+/**
+ * Monotonic-non-decreasing cumulative difference of two rate series:
+ * for each unique timestamp, compute Σa[0..t] − Σb[0..t], then enforce a
+ * running max (seeded at 0) so the curve never goes negative or decreases.
+ *
+ * Use this to plot things like "cumulative cache-missed tokens" where the
+ * true value can only ever grow, but the underlying per-tick rates can
+ * temporarily look negative due to counter timing skew between scrapes
+ * (vllm's `prefix_cache_hits` and `prompt_tokens` counters can lag each
+ * other by ~5-10 s in our data even though their lifetime totals agree).
+ *
+ * `a` and `b` may have different (or overlapping) timestamp sets — both
+ * are unioned and walked in time order. Output has one point per unique
+ * timestamp present in either input.
+ */
+export function cumulativeDifferenceMonotonic(
+  a: TimeSeriesPoint[],
+  b: TimeSeriesPoint[],
+): TimeSeriesPoint[] {
+  const aByT = new Map(a.map((p) => [p.t, p.value]));
+  const bByT = new Map(b.map((p) => [p.t, p.value]));
+  const allT = [...new Set([...aByT.keys(), ...bByT.keys()])].toSorted((x, y) => x - y);
+  const out: TimeSeriesPoint[] = Array.from({ length: allT.length });
+  let cumA = 0;
+  let cumB = 0;
+  let runningMax = 0;
+  for (let i = 0; i < allT.length; i++) {
+    const t = allT[i]!;
+    cumA += aByT.get(t) ?? 0;
+    cumB += bByT.get(t) ?? 0;
+    const diff = cumA - cumB;
+    if (diff > runningMax) runningMax = diff;
+    out[i] = { t, value: runningMax };
+  }
+  return out;
+}
+
 /** Pointwise sum of two arrays sharing the same t index. */
 export function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSeriesPoint[] {
   const n = Math.min(a.length, b.length);

From 08bbe6650c73935d7ac7a9fa29a722b141911bc9 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 27 May 2026 15:15:05 -0500
Subject: [PATCH 48/96] feat(agentic-detail): add unique input tokens in flight
 chart

New chart on the per-point view that plots the deduped count of
input tokens currently held by in-flight requests, as a 30s time-
weighted rolling average with the raw step series rendered as faint
scatter behind it. Useful for seeing the working set the model has
to hold KV cache for at any instant.

Computation (frontend, from request_timeline):
  - At each request start/end event, maintain active ISL per cid
    (within one cid turns are sequential, so each cid contributes
    at most one in-flight ISL at a time)
  - total_in_flight(t) = sum over cids with active request of that
    cid's current ISL
  - Across cids we treat content as independent (cross-conv prefix
    sharing measured at <1 pp, so summing is a tight approximation)

Adds timeRollingAverage helper: time-weighted (vs sample-count)
moving average suitable for irregularly-sampled event series like
this one.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/agentic-point-detail.tsx    | 43 ++++++++-
 .../agentic-point/time-series-chart.tsx       | 96 +++++++++++++++++++
 2 files changed, 137 insertions(+), 2 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 1abf64e6..2db2809b 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -27,8 +27,10 @@ import {
   TimeSeriesChart,
   cumulativeAverage,
   cumulativeDifferenceMonotonic,
+  inflightUniqueTokens,
   rollingAverage,
   sumSeries,
+  timeRollingAverage,
 } from './time-series-chart';
 
 interface Props {
@@ -124,8 +126,10 @@ export function AgenticPointDetail({ id }: Props) {
   // shows how the metric varies across the SKU.
   const siblingIds = siblingsData?.siblings.map((s) => s.id) ?? [];
   const aggregatesQuery = useAgenticAggregates(siblingIds, view === 'aggregates');
-  // Per-request timeline fetched only when the timeline view is active.
-  const timelineQuery = useRequestTimeline(id, view === 'timeline');
+  // Per-request timeline used by both the timeline view AND the per-point
+  // "Unique input tokens in flight" chart, so fetch whenever we're on
+  // either view.
+  const timelineQuery = useRequestTimeline(id, view === 'timeline' || view === 'point');
 
   return (
     <div className="container mx-auto px-4 lg:px-8 flex flex-col gap-4 py-6">
@@ -414,6 +418,41 @@ export function AgenticPointDetail({ id }: Props) {
               );
             }}
           />
+
+          <ExpandableChart
+            title="Unique input tokens in flight"
+            render={(expanded) => {
+              const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+              if (!timelineQuery.data) {
+                return timelineQuery.isLoading ? <Skeleton /> : <Empty />;
+              }
+              // Step function: at each request start/end, sum the ISLs of
+              // currently-active requests across distinct cids. Within one
+              // cid turns are sequential so each cid contributes at most
+              // one in-flight ISL; across cids we treat content as
+              // independent (cross-conv prefix sharing adds <1pp in
+              // practice). Smooth with a 30s time-weighted rolling average
+              // so brief turn-handoff dips don't dominate the chart.
+              const raw = inflightUniqueTokens(timelineQuery.data.requests);
+              const smoothed = timeRollingAverage(raw, 30);
+              return (
+                <TimeSeriesChart
+                  series={[
+                    {
+                      name: 'In flight (avg 30s)',
+                      data: smoothed,
+                      rawData: raw,
+                      color: '#a855f7',
+                      strokeWidth: 2,
+                    },
+                  ]}
+                  durationS={timelineQuery.data.durationS}
+                  yAxisLabel="Tokens"
+                  {...size}
+                />
+              );
+            }}
+          />
         </div>
       )}
     </div>
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index 25d5a672..520b3ed6 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -27,6 +27,39 @@ interface TimeSeriesChartProps {
   height?: number;
 }
 
+/**
+ * Time-weighted rolling average over a `windowS`-second trailing window.
+ * Treats the input as a step function (value held constant between
+ * samples) and integrates over the trailing window, dividing by the
+ * window length. Good for smoothing irregularly-sampled event series
+ * (e.g. request start/end events) where the regular sample-count
+ * `rollingAverage` would over-weight bursts of close-together events.
+ */
+export function timeRollingAverage(data: TimeSeriesPoint[], windowS: number): TimeSeriesPoint[] {
+  if (data.length === 0 || windowS <= 0) return data;
+  const out: TimeSeriesPoint[] = Array.from({ length: data.length });
+  for (let i = 0; i < data.length; i++) {
+    const tEnd = data[i]!.t;
+    const tStart = Math.max(0, tEnd - windowS);
+    // Find the first sample j whose t is >= tStart; the step value at
+    // tStart is data[j-1].value if j > 0, else data[0].value.
+    let j = 0;
+    while (j < data.length && data[j]!.t < tStart) j++;
+    let prevT = tStart;
+    let prevV = j > 0 ? data[j - 1]!.value : data[0]!.value;
+    let area = 0;
+    for (; j <= i; j++) {
+      const curT = data[j]!.t;
+      area += prevV * (curT - prevT);
+      prevT = curT;
+      prevV = data[j]!.value;
+    }
+    const dur = tEnd - tStart;
+    out[i] = { t: tEnd, value: dur > 0 ? area / dur : data[i]!.value };
+  }
+  return out;
+}
+
 /** Centered rolling average over `windowSize` samples. */
 export function rollingAverage(data: TimeSeriesPoint[], windowSize: number): TimeSeriesPoint[] {
   if (data.length === 0 || windowSize <= 1) return data;
@@ -75,6 +108,69 @@ export function cumulativeSum(data: TimeSeriesPoint[]): TimeSeriesPoint[] {
   return out;
 }
 
+/**
+ * Per-event step series: at each request start/end, sum the ISLs of
+ * currently-active requests across distinct `cid`s. Within a single
+ * `cid` aiperf dispatches turns sequentially (turn N+1 waits for N),
+ * so each cid contributes at most one in-flight ISL at a time. Across
+ * different cids we assume content is independent (parent ↔ subagent
+ * and conv ↔ conv share negligible prefix in practice — cross-conv
+ * dedup added ~0.25 pp to theoretical hit rate, so treating them as
+ * independent is a tight approximation of the true in-flight unique
+ * token count).
+ *
+ * Output is a step function: a leading `{ t: 0, value: 0 }` point, then
+ * one point per event, value held constant until the next event. Time
+ * axis is seconds from the timeline's own origin (ns / 1e9, not re-based).
+ */
+export function inflightUniqueTokens(
+  requests: readonly { cid: string; start: number; end: number; isl: number | null }[],
+): TimeSeriesPoint[] {
+  if (requests.length === 0) return [];
+  // The request_timeline timestamps are ns-relative to its own origin.
+  // Convert events to seconds and emit a step series.
+  interface Event {
+    tNs: number;
+    kind: 'start' | 'end';
+    cid: string;
+    isl: number;
+  }
+  const events: Event[] = [];
+  for (const r of requests) {
+    const isl = r.isl ?? 0;
+    if (isl <= 0) continue;
+    events.push({ tNs: r.start, kind: 'start', cid: r.cid, isl });
+    events.push({ tNs: r.end, kind: 'end', cid: r.cid, isl });
+  }
+  if (events.length === 0) return [];
+  // Sort by time; on ties, process 'end' before 'start' so a same-instant
+  // turn handoff within one cid doesn't transiently double-count.
+  events.sort((a, b) => a.tNs - b.tNs || (a.kind === 'end' ? -1 : 1));
+
+  // Active ISL per cid (max in case the same cid somehow has overlapping
+  // events; in practice it's always 0 or 1 request at a time per cid).
+  const activeByCid = new Map<string, number>();
+  let total = 0;
+  const out: TimeSeriesPoint[] = [{ t: 0, value: 0 }];
+  for (const e of events) {
+    const tSec = e.tNs / 1e9;
+    if (e.kind === 'start') {
+      const prev = activeByCid.get(e.cid) ?? 0;
+      const next = Math.max(prev, e.isl);
+      activeByCid.set(e.cid, next);
+      total += next - prev;
+    } else {
+      const cur = activeByCid.get(e.cid) ?? 0;
+      if (cur > 0) {
+        total -= cur;
+        activeByCid.delete(e.cid);
+      }
+    }
+    out.push({ t: tSec, value: Math.max(0, total) });
+  }
+  return out;
+}
+
 /**
  * Monotonic-non-decreasing cumulative difference of two rate series:
  * for each unique timestamp, compute Σa[0..t] − Σb[0..t], then enforce

From 7561deb1cc5a210ce6cd074ab0d4771b3b9f8342 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 27 May 2026 20:30:39 -0500
Subject: [PATCH 49/96] feat(chart-series): extract SGLang metrics alongside
 vllm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Our chart_series + aggregate_stats extractors hardcoded vllm:* metric
names, so SGLang runs (e.g. qwen3.5/h100/sglang) ingested cleanly but
the per-point detail page rendered empty charts — chart_series fields
were all zero-length arrays.

Adds fallback chains in each extractor:

  KV cache util      vllm:kv_cache_usage_perc  → sglang:token_usage
  Prefix cache hits  vllm:prefix_cache_hits    → sglang:cached_tokens
  Prefix cache qrys  vllm:prefix_cache_queries → sglang:prompt_tokens
  Requests running   vllm:num_requests_running → sglang:num_running_reqs
  Requests waiting   vllm:num_requests_waiting → sglang:num_queue_reqs
  Prompt tokens rate vllm:prompt_tokens        → sglang:prompt_tokens
  Generation rate    vllm:generation_tokens    → sglang:generation_tokens

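The `pickSeries` (compute-chart-series) and `pickFirstNonEmpty`
(agentic-aggregates) helpers walk the chain and use whichever series
has data, so a future framework (mori-sglang, dynamo, etc.) can plug in
by adding its names to each chain — no per-framework branching. The KV
util chain keeps the `vllm:gpu_cache_usage_perc` alias, and the prefix
cache queries chain also tries `vllm:prompt_tokens` before falling back
to the SGLang name.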

CHART_SERIES_VERSION → 4, STATS_VERSION → 3. Both backfills re-ran (86
chart_series rows, 190 aggregate_stats rows). SGLang chart_series for
qwen3.5 run 944 verified — was 0-length arrays before, now ~1800
samples each.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/db/src/etl/compute-chart-series.ts   | 67 +++++++++++++++----
 packages/db/src/queries/agentic-aggregates.ts | 56 +++++++++++++---
 2 files changed, 98 insertions(+), 25 deletions(-)

diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 91e89521..86b79925 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -28,8 +28,11 @@ import { streamObject } from 'stream-json/streamers/stream-object.js';
  *
  * v3: extract `prefixCacheHitsTps` so the detail page can derive cumulative
  * unique input tokens as cumsum(prefillTps - prefixCacheHitsTps).
+ *
+ * v4: extract sglang:* metrics too (fallback chain in each picker), so
+ * SGLang runs populate the chart_series the same way vllm runs do.
  */
-export const CHART_SERIES_VERSION = 3;
+export const CHART_SERIES_VERSION = 4;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -89,8 +92,13 @@ interface RawMetric {
 
 type MetricsMap = Record<string, RawMetric>;
 
-/** The set of metric subtrees the chart consumes. */
+/**
+ * The set of metric subtrees the chart consumes. Includes both vllm:* and
+ * sglang:* names so the stream-parse fallback collects whichever framework
+ * the blob was emitted by — `buildSeriesFromMetrics` then picks per metric.
+ */
 const CHART_METRIC_KEYS = new Set([
+  // vLLM
   'vllm:kv_cache_usage_perc',
   'vllm:gpu_cache_usage_perc',
   'vllm:prefix_cache_hits',
@@ -100,6 +108,13 @@ const CHART_METRIC_KEYS = new Set([
   'vllm:prompt_tokens',
   'vllm:generation_tokens',
   'vllm:prompt_tokens_by_source',
+  // SGLang
+  'sglang:token_usage',
+  'sglang:cached_tokens',
+  'sglang:prompt_tokens',
+  'sglang:generation_tokens',
+  'sglang:num_running_reqs',
+  'sglang:num_queue_reqs',
 ]);
 
 /**
@@ -220,18 +235,37 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   if (!Number.isFinite(startNs)) startNs = 0;
   const tOf = (ns: number) => (ns - startNs) / 1e9;
 
+  // Pick the first metric name whose series array has any data; fallback
+  // chain lets the same code path serve both vllm:* and sglang:* blobs.
+  const pickSeries = (...names: string[]): readonly RawSeries[] | undefined => {
+    for (const name of names) {
+      const s = metrics[name]?.series;
+      if (s && s.length > 0) return s;
+    }
+    return undefined;
+  };
+
   // KV cache usage (gauge, 0..1) — average across engines so the value
   // stays a fraction (each engine has its own KV pool).
-  const kvSeries =
-    metrics['vllm:kv_cache_usage_perc']?.series ?? metrics['vllm:gpu_cache_usage_perc']?.series;
+  const kvSeries = pickSeries(
+    'vllm:kv_cache_usage_perc',
+    'vllm:gpu_cache_usage_perc',
+    'sglang:token_usage',
+  );
   const kvCacheUsage: TimeSeriesPoint[] = sortedEntries(
     aggregateByStart(kvSeries, 'avg', 'avg'),
   ).map(([t, v]) => ({ t: tOf(t), value: v }));
 
   // Prefix cache hit rate per scrape: Σhits.rate / Σqueries.rate across
-  // engines, joined on start_ns.
-  const hitsByT = aggregateByStart(metrics['vllm:prefix_cache_hits']?.series, 'rate', 'sum');
-  const qsByT = aggregateByStart(metrics['vllm:prefix_cache_queries']?.series, 'rate', 'sum');
+  // engines, joined on start_ns. SGLang names: cached_tokens / prompt_tokens.
+  const hitsSeries = pickSeries('vllm:prefix_cache_hits', 'sglang:cached_tokens');
+  const qsSeries = pickSeries(
+    'vllm:prefix_cache_queries',
+    'vllm:prompt_tokens',
+    'sglang:prompt_tokens',
+  );
+  const hitsByT = aggregateByStart(hitsSeries, 'rate', 'sum');
+  const qsByT = aggregateByStart(qsSeries, 'rate', 'sum');
   const prefixCacheHitRate: TimeSeriesPoint[] = [];
   for (const [t, h] of [...hitsByT.entries()].toSorted((a, b) => a[0] - b[0])) {
     const q = qsByT.get(t);
@@ -239,8 +273,10 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   }
 
   // Queue depth: sum running + waiting across engines per timeslice.
-  const runByT = aggregateByStart(metrics['vllm:num_requests_running']?.series, 'avg', 'sum');
-  const waitByT = aggregateByStart(metrics['vllm:num_requests_waiting']?.series, 'avg', 'sum');
+  const runSeries = pickSeries('vllm:num_requests_running', 'sglang:num_running_reqs');
+  const waitSeries = pickSeries('vllm:num_requests_waiting', 'sglang:num_queue_reqs');
+  const runByT = aggregateByStart(runSeries, 'avg', 'sum');
+  const waitByT = aggregateByStart(waitSeries, 'avg', 'sum');
   const queueDepth: QueueDepthPoint[] = [];
   // Union of timestamps so we surface activity even if one of the gauges
   // didn't report a sample on a given tick.
@@ -252,16 +288,19 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   }
 
   // Throughput: sum the counter `rate` (already per-second) across engines.
-  const counterRate = (name: string): TimeSeriesPoint[] =>
-    sortedEntries(aggregateByStart(metrics[name]?.series, 'rate', 'sum')).map(([t, v]) => ({
+  // Takes a fallback chain so vllm:* and sglang:* both work.
+  const counterRate = (...names: string[]): TimeSeriesPoint[] => {
+    const s = pickSeries(...names);
+    return sortedEntries(aggregateByStart(s, 'rate', 'sum')).map(([t, v]) => ({
       t: tOf(t),
       value: v,
     }));
-  const prefillTps = counterRate('vllm:prompt_tokens');
-  const decodeTps = counterRate('vllm:generation_tokens');
+  };
+  const prefillTps = counterRate('vllm:prompt_tokens', 'sglang:prompt_tokens');
+  const decodeTps = counterRate('vllm:generation_tokens', 'sglang:generation_tokens');
   // Tokens served from prefix cache per scrape. Lets the frontend derive
   // "cumulative unique input tokens served" = cumsum(prefillTps) − cumsum(hits).
-  const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits');
+  const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits', 'sglang:cached_tokens');
 
   // Per-source prompt tokens — sum across engines per source label.
   const promptBySrcByT = new Map<string, Map<number, number>>();
diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts
index 1ad7fd7f..da5d18a0 100644
--- a/packages/db/src/queries/agentic-aggregates.ts
+++ b/packages/db/src/queries/agentic-aggregates.ts
@@ -32,8 +32,12 @@ import type { DbClient } from '../connection.js';
  *
  * v2: aggregate vllm gauges/counters across all engine series (was reading
  * only series[0], which under-counted by Nx on multi-engine DP/PP deployments).
+ *
+ * v3: extract sglang:* metrics too — kv_cache_util + prefix_cache_hit_rate
+ * populate for SGLang runs (qwen3.5/h100, mi355x sglang, etc.) the same way
+ * they do for vllm runs.
  */
-export const STATS_VERSION = 2;
+export const STATS_VERSION = 3;
 
 export interface MetricPercentiles {
   mean: number;
@@ -199,6 +203,18 @@ function aggregateSeriesByStart(
  * Aggregates across all engine series so multi-engine DP/PP deployments are
  * counted correctly (previously we only read engine 0).
  */
+/** First metric whose series array is non-empty; supports vllm/sglang fallback. */
+function pickFirstNonEmpty(
+  metrics: Record<string, MetricMeta>,
+  ...names: string[]
+): Series[] | undefined {
+  for (const name of names) {
+    const s = metrics[name]?.series;
+    if (s && s.length > 0) return s;
+  }
+  return undefined;
+}
+
 export function extractServerMetricSamples(json: string): {
   kvCacheUtil: number[];
   prefixCacheHitRate: number[];
@@ -208,17 +224,29 @@ export function extractServerMetricSamples(json: string): {
 
   // KV cache util — per-engine gauge in [0, 1]. Average across engines so the
   // value stays a percentage; summing would give meaningless 0..N.
-  const kvSeriesAll =
-    metrics['vllm:kv_cache_usage_perc']?.series ?? metrics['vllm:gpu_cache_usage_perc']?.series;
+  const kvSeriesAll = pickFirstNonEmpty(
+    metrics,
+    'vllm:kv_cache_usage_perc',
+    'vllm:gpu_cache_usage_perc',
+    'sglang:token_usage',
+  );
   const kvCacheUtil = [...aggregateSeriesByStart(kvSeriesAll, 'avg', 'avg').values()];
 
   // Prefix cache hit rate per interval = Σhits.rate / Σqueries.rate across
-  // all engines. Sum first, then divide.
-  const hitsAll =
-    metrics['vllm:prefix_cache_hits']?.series ?? metrics['vllm:gpu_prefix_cache_hits']?.series;
-  const queriesAll =
-    metrics['vllm:prefix_cache_queries']?.series ??
-    metrics['vllm:gpu_prefix_cache_queries']?.series;
+  // all engines. Sum first, then divide. SGLang names: cached_tokens / prompt_tokens.
+  const hitsAll = pickFirstNonEmpty(
+    metrics,
+    'vllm:prefix_cache_hits',
+    'vllm:gpu_prefix_cache_hits',
+    'sglang:cached_tokens',
+  );
+  const queriesAll = pickFirstNonEmpty(
+    metrics,
+    'vllm:prefix_cache_queries',
+    'vllm:gpu_prefix_cache_queries',
+    'vllm:prompt_tokens',
+    'sglang:prompt_tokens',
+  );
   const hitsByT = aggregateSeriesByStart(hitsAll, 'rate', 'sum');
   const qByT = aggregateSeriesByStart(queriesAll, 'rate', 'sum');
   const prefixCacheHitRate: number[] = [];
@@ -232,12 +260,18 @@ export function extractServerMetricSamples(json: string): {
 
 /** Metrics our aggregates pipeline cares about. Anything else in the blob is skipped. */
 const TARGET_METRIC_KEYS = new Set([
+  // vLLM
   'vllm:kv_cache_usage_perc',
-  'vllm:gpu_cache_usage_perc', // older fallback name
+  'vllm:gpu_cache_usage_perc',
   'vllm:prefix_cache_hits',
   'vllm:prefix_cache_queries',
-  'vllm:gpu_prefix_cache_hits', // legacy alias (used in pre-fix code paths)
+  'vllm:gpu_prefix_cache_hits',
   'vllm:gpu_prefix_cache_queries',
+  'vllm:prompt_tokens',
+  // SGLang
+  'sglang:token_usage',
+  'sglang:cached_tokens',
+  'sglang:prompt_tokens',
 ]);
 
 /**

From 625d6e85e411cf8081977d3b76ad98d1805ad3c5 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 27 May 2026 20:48:58 -0500
Subject: [PATCH 50/96] fix(ingest): derive GPU cache hit rate for SGLang at
 ingest time
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SGLang runs' harness JSON doesn't populate server_gpu_cache_hit_rate
(vLLM runs do), so the detail-page header and inference chart tooltip
showed "—" for SGLang points. Now at trace_replay ingest, if any of
the linked benchmark_results rows has a null server_gpu_cache_hit_rate
and the computed chart_series has a non-empty prefill time-series with
a positive total, derive the lifetime cluster ratio as
sum(hits.rate) / sum(prompt.rate) and write it into those rows' metrics
JSONB. Only the prefill series is checked: an empty hits series yields
a stored rate of 0, not null.

Already-stored SGLang rows from runs 944/945 backfilled via a one-off
UPDATE earlier in this session (8 rows, mostly ~87-89% hit rate, one
high-conc outlier at 2.4%).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/db/src/etl/trace-replay-ingest.ts | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
index 8cc03f2a..8d1e01b8 100644
--- a/packages/db/src/etl/trace-replay-ingest.ts
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -100,4 +100,23 @@ export async function insertTraceReplay(
     set trace_replay_id = ${traceReplayId}
     where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
   `;
+
+  // Derive a lifetime GPU cache hit rate from chart_series for any linked
+  // row whose harness JSON didn't set one (SGLang runs don't populate
+  // server_gpu_cache_hit_rate; vLLM runs do). Skip when chart_series has
+  // no usable prefill data — leaves the field null in that case, matching
+  // legacy "no trace_replay" behavior.
+  if (chartSeries && chartSeries.prefillTps.length > 0) {
+    const sumPrompts = chartSeries.prefillTps.reduce((s, p) => s + p.value, 0);
+    const sumHits = chartSeries.prefixCacheHitsTps.reduce((s, p) => s + p.value, 0);
+    if (sumPrompts > 0) {
+      const rate = sumHits / sumPrompts;
+      await sql`
+        update benchmark_results
+        set metrics = jsonb_set(metrics, '{server_gpu_cache_hit_rate}', to_jsonb(${rate}::numeric))
+        where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
+          and (metrics->>'server_gpu_cache_hit_rate') is null
+      `;
+    }
+  }
 }

From aa76e9eca423d3ab2c7079ff28d74b70adefae1c Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 28 May 2026 14:38:52 -0500
Subject: [PATCH 51/96] feat(chart-series): map sglang:realtime_tokens to
 promptTokensBySource
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The "Cumulative prompt token source breakdown" chart was empty for
SGLang runs because the vllm-specific vllm:prompt_tokens_by_source
metric doesn't exist on SGLang. Maps sglang:realtime_tokens (which has
mode={prefill_cache, prefill_compute, decode}) into the same source
breakdown when no vllm series is present, filtered to prefill_* modes
(decode tokens are output throughput, not prompt-token volume).

CHART_SERIES_VERSION → 5. Backfilled 128 rows; SGLang rows from runs
944/946/947 now have prefill_cache + prefill_compute sources populated.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/db/src/etl/compute-chart-series.ts | 31 ++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 86b79925..0807e238 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -31,8 +31,12 @@ import { streamObject } from 'stream-json/streamers/stream-object.js';
  *
  * v4: extract sglang:* metrics too (fallback chain in each picker), so
  * SGLang runs populate the chart_series the same way vllm runs do.
+ *
+ * v5: map sglang:realtime_tokens (mode={prefill_cache,prefill_compute,decode})
+ * into promptTokensBySource so the cumulative prompt-token-source-breakdown
+ * chart shows useful splits for SGLang runs (filtered to prefill_* modes).
  */
-export const CHART_SERIES_VERSION = 4;
+export const CHART_SERIES_VERSION = 5;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -115,6 +119,7 @@ const CHART_METRIC_KEYS = new Set([
   'sglang:generation_tokens',
   'sglang:num_running_reqs',
   'sglang:num_queue_reqs',
+  'sglang:realtime_tokens',
 ]);
 
 /**
@@ -303,6 +308,12 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits', 'sglang:cached_tokens');
 
   // Per-source prompt tokens — sum across engines per source label.
+  //   vllm: vllm:prompt_tokens_by_source has one series per source label
+  //         (local_cache_hit, external_cache_hit, miss, ...). Use the
+  //         `source`/`reason`/`kind` label as the breakdown key.
+  //   sglang: sglang:realtime_tokens uses a `mode` label with values
+  //         {prefill_cache, prefill_compute, decode}. Filter to prefill_*
+  //         since decode isn't prompt-token volume.
   const promptBySrcByT = new Map<string, Map<number, number>>();
   for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) {
     const labels = series.labels ?? {};
@@ -318,6 +329,24 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
       }
     }
   }
+  // SGLang fallback: only consider when the vllm metric wasn't found.
+  if (promptBySrcByT.size === 0) {
+    for (const series of metrics['sglang:realtime_tokens']?.series ?? []) {
+      const labels = series.labels ?? {};
+      const mode = labels['mode'] ?? 'unknown';
+      if (!mode.startsWith('prefill')) continue; // skip 'decode' (output tokens)
+      let byT = promptBySrcByT.get(mode);
+      if (!byT) {
+        byT = new Map<number, number>();
+        promptBySrcByT.set(mode, byT);
+      }
+      for (const ts of series.timeslices ?? []) {
+        if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {
+          byT.set(ts.start_ns, (byT.get(ts.start_ns) ?? 0) + ts.rate);
+        }
+      }
+    }
+  }
   const promptTokensBySource: Record<string, TimeSeriesPoint[]> = {};
   for (const [source, byT] of promptBySrcByT) {
     const arr: TimeSeriesPoint[] = [];

From 5872a3d8d3c6f5e6feee879e2f8f6f5d0ddd04ac Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 28 May 2026 14:48:27 -0500
Subject: [PATCH 52/96] feat(chart-series): break out SGLang cache hits by
 cache_source
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously SGLang detail pages showed two stacked-area layers in the
prompt-token source breakdown: prefill_cache (everything that hit the
cache) + prefill_compute (cache miss). The user wanted finer
granularity — specifically a distinction between on-GPU HBM cache and
CPU-offloaded (hicache) host cache.

SGLang's sglang:cached_tokens metric carries a cache_source label that
varies per cache tier:
  - "device" → on-GPU HBM cache hit
  - "host"   → CPU-offload (hicache) cache hit
  - "total"  → older sglang, single series with no tier breakdown

Switches the cache-hit portion of the breakdown from the coarse
`prefill_cache` mode label to per-cache_source series:
  - device → "cache hit (HBM)"
  - host   → "cache hit (CPU offload)"
  - total  → "cache hit"
  - other  → "cache hit (<src>)"

Cache misses still come from realtime_tokens[mode=prefill_compute],
relabeled "compute (miss)" for symmetry.

Current data only contains device/total (no hicache runs ingested
yet) — when hicache runs come in, the chart will automatically split
cache hits into HBM + CPU-offload layers with no further code change.

CHART_SERIES_VERSION → 6. Backfilled 128 rows.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/db/src/etl/compute-chart-series.ts | 47 +++++++++++++++++++--
 1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 0807e238..1996708f 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -35,8 +35,13 @@ import { streamObject } from 'stream-json/streamers/stream-object.js';
  * v5: map sglang:realtime_tokens (mode={prefill_cache,prefill_compute,decode})
  * into promptTokensBySource so the cumulative prompt-token-source-breakdown
  * chart shows useful splits for SGLang runs (filtered to prefill_* modes).
+ *
+ * v6: for SGLang, swap the coarse "prefill_cache" bucket for per-cache_source
+ * breakdown from sglang:cached_tokens — current runs always have one
+ * cache_source ("device" / HBM) but hicache (CPU offload) runs would
+ * split into "device" + "host" automatically once ingested.
  */
-export const CHART_SERIES_VERSION = 5;
+export const CHART_SERIES_VERSION = 6;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -330,15 +335,49 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
     }
   }
   // SGLang fallback: only consider when the vllm metric wasn't found.
+  //   - Cache misses (fresh prefill): `sglang:realtime_tokens[mode=prefill_compute]`
+  //   - Cache hits, split by tier: per-series `sglang:cached_tokens` where each
+  //     series carries a `cache_source` label ("device" = HBM, "host" = CPU
+  //     offload via hicache). Current runs have only `device`; when hicache
+  //     runs land, additional series will appear and the chart will split.
   if (promptBySrcByT.size === 0) {
     for (const series of metrics['sglang:realtime_tokens']?.series ?? []) {
       const labels = series.labels ?? {};
       const mode = labels['mode'] ?? 'unknown';
-      if (!mode.startsWith('prefill')) continue; // skip 'decode' (output tokens)
-      let byT = promptBySrcByT.get(mode);
+      // Only carry the cache-miss line over — cache hits come from
+      // sglang:cached_tokens broken out by cache_source below, so we'd
+      // double-count if we kept `prefill_cache` here too.
+      if (mode !== 'prefill_compute') continue;
+      const label = 'compute (miss)';
+      let byT = promptBySrcByT.get(label);
+      if (!byT) {
+        byT = new Map<number, number>();
+        promptBySrcByT.set(label, byT);
+      }
+      for (const ts of series.timeslices ?? []) {
+        if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {
+          byT.set(ts.start_ns, (byT.get(ts.start_ns) ?? 0) + ts.rate);
+        }
+      }
+    }
+    // Cache hits broken out per cache_source. Strip the noisy "total" label
+    // (older sglang versions emit a single un-broken-out series labelled
+    // total — show that as just "cache hit").
+    for (const series of metrics['sglang:cached_tokens']?.series ?? []) {
+      const labels = series.labels ?? {};
+      const src = labels['cache_source'] ?? 'cache hit';
+      const label =
+        src === 'device'
+          ? 'cache hit (HBM)'
+          : src === 'host'
+            ? 'cache hit (CPU offload)'
+            : src === 'total'
+              ? 'cache hit'
+              : `cache hit (${src})`;
+      let byT = promptBySrcByT.get(label);
       if (!byT) {
         byT = new Map<number, number>();
-        promptBySrcByT.set(mode, byT);
+        promptBySrcByT.set(label, byT);
       }
       for (const ts of series.timeslices ?? []) {
         if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') {

From 94a3e8b1986e54165c062e2a14eda60d9e9dd146 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 28 May 2026 15:01:24 -0500
Subject: [PATCH 53/96] feat(chart-series): host cache util line + fix SGLang
 stacked-area colors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two related fixes for SGLang hicache rendering on the agentic detail page:

1. KV cache utilization chart was GPU-HBM-only. SGLang hicache runs also
   expose sglang:hicache_host_{used,total}_tokens — the CPU offload
   pool's tokens-in-use over its capacity. Extracted as a new
   `hostKvCacheUsage` time series; frontend overlays it as a second
   orange line on the existing chart when the row has hicache data.

2. The cumulative-prompt-token-source-breakdown chart rendered ALL
   three SGLang sources in the same color, because the colors dict
   only knew vllm-style names (local_compute, local_cache_hit, etc.).
   Added explicit colors for the SGLang label names ('cache hit
   (HBM)', 'cache hit (CPU offload)', 'cache hit', 'compute (miss)')
   plus a memoized fallback palette so any future unknown source name
   gets a distinct color rather than falling through to gray.

CHART_SERIES_VERSION → 7. Backfilled 128 rows; hicache rows from
workflow_run 947 (8 rows) now have ~1830 hostKvCacheUsage samples
matching their HBM samples.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/agentic-point-detail.tsx    | 16 ++++++++-
 .../agentic-point/time-series-chart.tsx       | 30 ++++++++++++++--
 .../src/hooks/api/use-trace-server-metrics.ts |  2 ++
 packages/db/src/etl/compute-chart-series.ts   | 36 ++++++++++++++++++-
 .../db/src/queries/trace-server-metrics.ts    |  3 ++
 5 files changed, 83 insertions(+), 4 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 2db2809b..b047ea8f 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -236,16 +236,30 @@ export function AgenticPointDetail({ id }: Props) {
             render={(expanded) => {
               const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
               if (!metrics) return <Skeleton />;
+              // For SGLang hicache rows we have both GPU (HBM) util and
+              // host (CPU offload pool) util — overlay them as two lines.
+              const hasHost = metrics.hostKvCacheUsage.length > 0;
               return (
                 <TimeSeriesChart
                   series={[
                     {
-                      name: 'GPU KV cache (avg n=50)',
+                      name: hasHost ? 'GPU HBM (avg n=50)' : 'GPU KV cache (avg n=50)',
                       data: rollingAverage(metrics.kvCacheUsage, 50),
                       rawData: metrics.kvCacheUsage,
                       color: '#3b82f6',
                       strokeWidth: 2,
                     },
+                    ...(hasHost
+                      ? [
+                          {
+                            name: 'CPU offload pool (avg n=50)',
+                            data: rollingAverage(metrics.hostKvCacheUsage, 50),
+                            rawData: metrics.hostKvCacheUsage,
+                            color: '#f97316',
+                            strokeWidth: 2,
+                          },
+                        ]
+                      : []),
                   ]}
                   durationS={metrics.durationS}
                   yMax={1}
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index 520b3ed6..15a15869 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -485,10 +485,16 @@ export function StackedAreaChart({
   }, [sourceSeries]);
 
   const colors: Record<string, string> = {
+    // vLLM source names
     local_compute: '#f97316',
     local_cache_hit: '#3b82f6',
     external_kv_transfer: '#22c55e',
     miss: '#f97316',
+    // SGLang source names (set by compute-chart-series for sglang rows)
+    'cache hit (HBM)': '#3b82f6',
+    'cache hit (CPU offload)': '#22c55e',
+    'cache hit': '#3b82f6',
+    'compute (miss)': '#f97316',
   };
   const labelFor: Record<string, string> = {
     local_compute: 'Prefill',
@@ -496,6 +502,26 @@ export function StackedAreaChart({
     external_kv_transfer: 'Offload Cache Hit',
     miss: 'Miss',
   };
+  // Fallback palette for any source name not in `colors` so we never
+  // emit two layers in the same shade. Cycles by insertion order.
+  const fallbackPalette = [
+    '#3b82f6',
+    '#f97316',
+    '#22c55e',
+    '#a855f7',
+    '#ef4444',
+    '#06b6d4',
+    '#f59e0b',
+    '#ec4899',
+  ];
+  let fallbackIdx = 0;
+  const colorFor = (name: string): string => {
+    if (colors[name]) return colors[name]!;
+    const c = fallbackPalette[fallbackIdx % fallbackPalette.length]!;
+    fallbackIdx++;
+    colors[name] = c; // memoize so the SAME unknown name always gets the same color
+    return c;
+  };
 
   if (!computed) {
     return (
@@ -522,7 +548,7 @@ export function StackedAreaChart({
       .toReversed()
       .map(([x, y]) => `L${x.toFixed(2)},${y.toFixed(2)}`)
       .join(' ')} Z`;
-    const color = colors[name] ?? '#6b7280';
+    const color = colorFor(name);
     for (let i = 0; i < tValues.length; i++) lower[i] = upper[i]!;
     return { name, color, d };
   });
@@ -540,7 +566,7 @@ export function StackedAreaChart({
       }
     }
     const items: HoverItem[] = stackOrder.map((name) => ({
-      color: colors[name] ?? '#6b7280',
+      color: colorFor(name),
       label: labelFor[name] ?? name,
       value: `${((shares[name]?.[idx] ?? 0) * 100).toFixed(1)}%`,
     }));
diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts
index 664bc6c7..bac67a50 100644
--- a/packages/app/src/hooks/api/use-trace-server-metrics.ts
+++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts
@@ -44,6 +44,8 @@ export interface TraceServerMetrics {
   decodeTps: TimeSeriesPoint[];
   /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */
   prefixCacheHitsTps: TimeSeriesPoint[];
+  /** Host (CPU offload) KV cache utilization, 0..1. SGLang hicache only. */
+  hostKvCacheUsage: TimeSeriesPoint[];
 }
 
 async function fetchTraceServerMetrics(
diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 1996708f..8105961e 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -40,8 +40,12 @@ import { streamObject } from 'stream-json/streamers/stream-object.js';
  * breakdown from sglang:cached_tokens — current runs always have one
  * cache_source ("device" / HBM) but hicache (CPU offload) runs would
  * split into "device" + "host" automatically once ingested.
+ *
+ * v7: extract sglang:hicache_host_{used,total}_tokens into a new
+ * hostKvCacheUsage series so the KV cache utilization chart can plot
+ * the CPU offload pool's usage alongside the on-GPU HBM line.
  */
-export const CHART_SERIES_VERSION = 6;
+export const CHART_SERIES_VERSION = 7;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -79,6 +83,12 @@ export interface ChartSeries {
    * saved vs the raw queries that came in.
    */
   prefixCacheHitsTps: TimeSeriesPoint[];
+  /**
+   * Host (CPU offload) KV cache utilization, 0..1. Only populated for
+   * SGLang hicache runs (derived as hicache_host_used / hicache_host_total).
+   * Frontend overlays this on the KV cache util chart as a second line.
+   */
+  hostKvCacheUsage: TimeSeriesPoint[];
 }
 
 // ── Raw blob shapes (subset we read) ────────────────────────────────────
@@ -125,6 +135,8 @@ const CHART_METRIC_KEYS = new Set([
   'sglang:num_running_reqs',
   'sglang:num_queue_reqs',
   'sglang:realtime_tokens',
+  'sglang:hicache_host_used_tokens',
+  'sglang:hicache_host_total_tokens',
 ]);
 
 /**
@@ -312,6 +324,27 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   // "cumulative unique input tokens served" = cumsum(prefillTps) − cumsum(hits).
   const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits', 'sglang:cached_tokens');
 
+  // SGLang hicache: host-pool KV cache utilization as used/total per
+  // timeslice. Both metrics are gauges in absolute tokens. Total stays
+  // constant (it's the pool size), used fluctuates.
+  const hostUsedByT = aggregateByStart(
+    metrics['sglang:hicache_host_used_tokens']?.series,
+    'avg',
+    'sum',
+  );
+  const hostTotalByT = aggregateByStart(
+    metrics['sglang:hicache_host_total_tokens']?.series,
+    'avg',
+    'sum',
+  );
+  const hostKvCacheUsage: TimeSeriesPoint[] = [];
+  for (const [t, used] of [...hostUsedByT.entries()].toSorted((a, b) => a[0] - b[0])) {
+    const total = hostTotalByT.get(t);
+    if (total !== undefined && total > 0) {
+      hostKvCacheUsage.push({ t: tOf(t), value: used / total });
+    }
+  }
+
   // Per-source prompt tokens — sum across engines per source label.
   //   vllm: vllm:prompt_tokens_by_source has one series per source label
   //         (local_cache_hit, external_cache_hit, miss, ...). Use the
@@ -407,5 +440,6 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
     prefillTps,
     decodeTps,
     prefixCacheHitsTps,
+    hostKvCacheUsage,
   };
 }
diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts
index 76775e77..eccb0a0c 100644
--- a/packages/db/src/queries/trace-server-metrics.ts
+++ b/packages/db/src/queries/trace-server-metrics.ts
@@ -73,6 +73,8 @@ export interface TraceServerMetrics {
   decodeTps: TimeSeriesPoint[];
   /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */
   prefixCacheHitsTps: TimeSeriesPoint[];
+  /** Host (CPU offload) KV cache utilization, 0..1. SGLang hicache only. */
+  hostKvCacheUsage: TimeSeriesPoint[];
 }
 
 interface RawMetaRow extends PointMeta {
@@ -118,6 +120,7 @@ function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics {
     decodeTps: series.decodeTps,
     // v2 chart_series rows pre-backfill don't have this field — default to []
     prefixCacheHitsTps: series.prefixCacheHitsTps ?? [],
+    hostKvCacheUsage: series.hostKvCacheUsage ?? [],
   };
 }
 

From 93e197b7e54d140acfe65b61aeb4f5c48ca27091 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 28 May 2026 15:19:20 -0500
Subject: [PATCH 54/96] fix(stacked-area): align sources by timestamp before
 computing shares
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The cumulative-prompt-token-source-breakdown chart was showing huge
"100% compute (miss)" plateaus around minute 20-24 of many SGLang runs.

Root cause: the chart computed cumulative shares per ARRAY INDEX (not
timestamp), but in SGLang's per-scrape metrics, cache hits and misses
fire on different ticks — one scrape reports 193K hits + 0 miss, the
next reports 0 hits + 8K miss. So each source has a different timestamp
array. Indexing them in lockstep mixed values from different moments
and made the share calculation flap to 100% one side or the other.

Fix: union timestamps across all sources, then for each unique
timestamp carry forward each source's cumulative sum (a source that
didn't report at time t holds its previous cumulative value rather
than appearing as 0).

After fix: shares change smoothly over time as each source's cumulative
sum grows; transient single-tick gaps no longer drive the visible
share to either extreme.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/time-series-chart.tsx       | 31 ++++++++++++++++---
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index 15a15869..75d7bb1e 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -464,15 +464,36 @@ export function StackedAreaChart({
   const computed = useMemo(() => {
     const entries = Object.entries(sourceSeries).filter(([, v]) => v.length > 0);
     if (entries.length === 0) return null;
-    const tValues = entries[0]![1].map((p) => p.t);
+
+    // Different sources can land on different scrape timestamps
+    // (SGLang's hits/misses fire on alternating ticks), so we MUST
+    // align across all sources before computing shares — otherwise the
+    // share calculation indexes into each source's own time axis and
+    // mixes values from different moments.
+    //
+    // Approach: union all timestamps across sources, then for each
+    // unique timestamp carry forward the cumulative sum for every
+    // source (a source that didn't report at time t holds its previous
+    // cumulative value rather than dropping to 0).
+    const tValues = [...new Set(entries.flatMap(([, arr]) => arr.map((p) => p.t)))].toSorted(
+      (a, b) => a - b,
+    );
+
+    // For each source, walk its (sorted) array and produce a parallel
+    // cumulative-sum array indexed against `tValues` via carry-forward.
     const cum: Record<string, number[]> = {};
     for (const [name, arr] of entries) {
+      const valByT = new Map(arr.map((p) => [p.t, p.value]));
+      const out: number[] = Array.from({ length: tValues.length });
       let acc = 0;
-      cum[name] = arr.map((p) => {
-        acc += p.value;
-        return acc;
-      });
+      for (let i = 0; i < tValues.length; i++) {
+        const v = valByT.get(tValues[i]!);
+        if (v !== undefined) acc += v;
+        out[i] = acc;
+      }
+      cum[name] = out;
     }
+
     const shares: Record<string, number[]> = {};
     for (const name of Object.keys(cum)) shares[name] = [];
     for (let i = 0; i < tValues.length; i++) {

From c14e19e277930495e4a43c3a6d6f42a611fec336 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 28 May 2026 15:44:07 -0500
Subject: [PATCH 55/96] fix(ingest): split GPU vs CPU cache hit rate for SGLang
 hicache rows

Previous inline derivation (commit 625d6e8) summed ALL cache hit
sources into server_gpu_cache_hit_rate, which conflated GPU HBM hits
with CPU offload hits on SGLang hicache rows. The harness JSON also
never sets server_cpu_cache_hit_rate.

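Now derives both metrics from chart_series.promptTokensBySource, with
sum(prompts) taken from chart_series.prefillTps:
  server_gpu_cache_hit_rate = sum('cache hit (HBM)' + 'cache hit') / sum(prompts)
  server_cpu_cache_hit_rate = sum('cache hit (CPU offload)') / sum(prompts)
                              (only written when CPU offload hits > 0;
                              otherwise the key is left untouched)

The GPU rate falls back to the prefixCacheHitsTps total when the
breakdown has no HBM hits (e.g. vLLM rows where promptTokensBySource
isn't broken out by cache source). The GPU rate always overwrites any
pre-existing value so the derivation stays consistent with what the
detail-page charts plot; the CPU rate is overwritten only when a
non-zero CPU offload hit count is derived.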

Backfilled all existing rows via two-phase SQL update earlier in the
session:
  - 8 hicache rows in workflow_run 947 now show GPU ~1-2% / CPU ~87-91%
  - Other SGLang rows show GPU ~87% / CPU null
  - vLLM rows restored to their original GPU hit rates

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/db/src/etl/trace-replay-ingest.ts | 40 +++++++++++++++++-----
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
index 8d1e01b8..43655d9a 100644
--- a/packages/db/src/etl/trace-replay-ingest.ts
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -101,21 +101,43 @@ export async function insertTraceReplay(
     where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
   `;
 
-  // Derive a lifetime GPU cache hit rate from chart_series for any linked
-  // row whose harness JSON didn't set one (SGLang runs don't populate
-  // server_gpu_cache_hit_rate; vLLM runs do). Skip when chart_series has
-  // no usable prefill data — leaves the field null in that case, matching
-  // legacy "no trace_replay" behavior.
+  // Derive lifetime GPU + CPU cache hit rates from chart_series. SGLang
+  // runs don't populate these in the harness JSON; vLLM runs do but only
+  // for GPU. We always recompute to keep the derivation consistent with
+  // what the detail-page charts plot — overwriting any pre-existing value.
+  //
+  // For hicache (CPU offload) rows the chart_series.promptTokensBySource
+  // breakdown has separate "cache hit (HBM)" + "cache hit (CPU offload)"
+  // sources, letting us split GPU vs CPU hit rate. Other rows just have
+  // a single cache-hit source (either "cache hit (HBM)" / "cache hit"
+  // for sglang, or no breakdown for vllm — falls back to prefixCacheHitsTps
+  // sum which equals the single cache source's total).
   if (chartSeries && chartSeries.prefillTps.length > 0) {
     const sumPrompts = chartSeries.prefillTps.reduce((s, p) => s + p.value, 0);
-    const sumHits = chartSeries.prefixCacheHitsTps.reduce((s, p) => s + p.value, 0);
     if (sumPrompts > 0) {
-      const rate = sumHits / sumPrompts;
+      const sumOf = (name: string): number =>
+        (chartSeries.promptTokensBySource[name] ?? []).reduce((s, p) => s + p.value, 0);
+      const cpuHits = sumOf('cache hit (CPU offload)');
+      const hbmFromBreakdown = sumOf('cache hit (HBM)') + sumOf('cache hit');
+      // If the source breakdown has a HBM entry, use it (covers SGLang).
+      // Otherwise fall back to total prefixCacheHitsTps sum (vLLM path).
+      const gpuHits =
+        hbmFromBreakdown > 0
+          ? hbmFromBreakdown
+          : chartSeries.prefixCacheHitsTps.reduce((s, p) => s + p.value, 0);
+      const gpuRate = gpuHits / sumPrompts;
+      const cpuRate = cpuHits > 0 ? cpuHits / sumPrompts : null;
       await sql`
         update benchmark_results
-        set metrics = jsonb_set(metrics, '{server_gpu_cache_hit_rate}', to_jsonb(${rate}::numeric))
+        set metrics = jsonb_set(
+          case when ${cpuRate}::numeric is not null
+            then jsonb_set(metrics, '{server_cpu_cache_hit_rate}', to_jsonb(${cpuRate}::numeric))
+            else metrics
+          end,
+          '{server_gpu_cache_hit_rate}',
+          to_jsonb(${gpuRate}::numeric)
+        )
         where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[])
-          and (metrics->>'server_gpu_cache_hit_rate') is null
       `;
     }
   }

From 268617ccd85ccc8aea6ed12dd4bd61273c8a37c1 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 3 Jun 2026 10:40:04 -0500
Subject: [PATCH 56/96] fix(ingest): recognize vLLM LMCache
 external_kv_transfer as CPU hit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Inline cache-hit-rate derivation only handled SGLang's hicache label
('cache hit (CPU offload)'). vLLM with LMCache uses 'external_kv_transfer'
in its prompt_tokens_by_source breakdown for the same concept (CPU
offload pool serving tokens to GPU). Those vLLM rows had cpu rate
null even when external_kv_transfer was the dominant source.

Adds external_kv_transfer + local_cache_hit to the source name aliases:
  GPU hits  = local_cache_hit + cache hit (HBM) + cache hit
  CPU hits  = external_kv_transfer + cache hit (CPU offload)
  fallback  = prefixCacheHitsTps total (for single-source rows)

Backfilled 132 affected rows via SQL — vLLM LMCache rows now show CPU
rate where present (e.g. dsv4 b300 conc=128 offload=on shows GPU ~1%
+ CPU ~87%, matching the actual cache topology).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 packages/db/src/etl/trace-replay-ingest.ts | 23 ++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts
index 43655d9a..cb022ca9 100644
--- a/packages/db/src/etl/trace-replay-ingest.ts
+++ b/packages/db/src/etl/trace-replay-ingest.ts
@@ -106,21 +106,24 @@ export async function insertTraceReplay(
   // for GPU. We always recompute to keep the derivation consistent with
   // what the detail-page charts plot — overwriting any pre-existing value.
   //
-  // For hicache (CPU offload) rows the chart_series.promptTokensBySource
-  // breakdown has separate "cache hit (HBM)" + "cache hit (CPU offload)"
-  // sources, letting us split GPU vs CPU hit rate. Other rows just have
-  // a single cache-hit source (either "cache hit (HBM)" / "cache hit"
-  // for sglang, or no breakdown for vllm — falls back to prefixCacheHitsTps
-  // sum which equals the single cache source's total).
+  // Source label naming differs by framework / cache topology:
+  //   SGLang hicache: 'cache hit (HBM)' + 'cache hit (CPU offload)'
+  //   SGLang older:   'cache hit'      (no tier breakdown)
+  //   vLLM LMCache:   'local_cache_hit' + 'external_kv_transfer'  (+ 'local_compute' for miss)
+  //   vLLM single:    falls back to prefixCacheHitsTps total (= local cache only)
   if (chartSeries && chartSeries.prefillTps.length > 0) {
     const sumPrompts = chartSeries.prefillTps.reduce((s, p) => s + p.value, 0);
     if (sumPrompts > 0) {
       const sumOf = (name: string): number =>
         (chartSeries.promptTokensBySource[name] ?? []).reduce((s, p) => s + p.value, 0);
-      const cpuHits = sumOf('cache hit (CPU offload)');
-      const hbmFromBreakdown = sumOf('cache hit (HBM)') + sumOf('cache hit');
-      // If the source breakdown has a HBM entry, use it (covers SGLang).
-      // Otherwise fall back to total prefixCacheHitsTps sum (vLLM path).
+      // CPU-offload hits: SGLang hicache + vLLM LMCache external transfer.
+      const cpuHits = sumOf('cache hit (CPU offload)') + sumOf('external_kv_transfer');
+      // GPU/HBM hits from source breakdown, summed across known aliases.
+      const hbmFromBreakdown =
+        sumOf('cache hit (HBM)') + sumOf('cache hit') + sumOf('local_cache_hit');
+      // If the source breakdown has any GPU entry, use it. Otherwise fall back
+      // to total prefixCacheHitsTps sum (single-source vLLM path with no
+      // by_source metric — equals the lone cache counter's lifetime).
       const gpuHits =
         hbmFromBreakdown > 0
           ? hbmFromBreakdown

From 7fc6b4f7b5a49aa370d912d6df36b40d80b813a6 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 4 Jun 2026 13:02:34 -0500
Subject: [PATCH 57/96] fix(scatter): use lightweight presence endpoint for
 View charts button
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The chart pre-fetched full trace_replay JSONL blobs for every visible
agentic point just to decide whether to render the "View charts" button
in pinned tooltips. With the latest run's 8x8 conc=512 rows pushing up
to 13 MB compressed per blob, 12-id chunks blew past Neon's 64 MB
per-HTTP-response cap and 500'd — hiding the button for every point.

New /api/v1/trace-availability returns {id: true} for ids that have a
stored blob; ScatterGraph uses that boolean instead. trace-histograms
is still used by the detail page (single id, no chunking issue).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../app/api/v1/trace-availability/route.ts    | 59 +++++++++++++++++++
 .../components/inference/ui/ScatterGraph.tsx  | 23 ++++----
 .../inference/utils/tooltipUtils.ts           | 15 ++---
 .../src/hooks/api/use-trace-availability.ts   | 29 +++++++++
 packages/db/src/queries/trace-availability.ts | 34 +++++++++++
 5 files changed, 143 insertions(+), 17 deletions(-)
 create mode 100644 packages/app/src/app/api/v1/trace-availability/route.ts
 create mode 100644 packages/app/src/hooks/api/use-trace-availability.ts
 create mode 100644 packages/db/src/queries/trace-availability.ts

diff --git a/packages/app/src/app/api/v1/trace-availability/route.ts b/packages/app/src/app/api/v1/trace-availability/route.ts
new file mode 100644
index 00000000..2484ceaf
--- /dev/null
+++ b/packages/app/src/app/api/v1/trace-availability/route.ts
@@ -0,0 +1,59 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getTraceAvailability,
+  type TraceAvailabilityMap,
+} from '@semianalysisai/inferencex-db/queries/trace-availability';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedTraceAvailability = cachedQuery(
+  (ids: number[]): Promise<TraceAvailabilityMap> => getTraceAvailability(getDb(), ids),
+  'trace-availability',
+);
+
+const MAX_IDS_PER_REQUEST = 500;
+
+/**
+ * GET /api/v1/trace-availability?ids=1,2,3
+ *
+ * Returns `{[id]: true}` for ids that have a stored trace_replay blob.
+ * Lightweight presence check used by the scatter tooltip to decide whether
+ * to render the "View charts" button — see queries/trace-availability.ts.
+ */
+export async function GET(request: NextRequest) {
+  const raw = request.nextUrl.searchParams.get('ids');
+  if (!raw) {
+    return NextResponse.json({ error: 'ids query param is required' }, { status: 400 });
+  }
+
+  const ids = [
+    ...new Set(
+      raw
+        .split(',')
+        .map((s) => Number(s.trim()))
+        .filter((n) => Number.isFinite(n) && n > 0),
+    ),
+  ];
+  if (ids.length === 0) {
+    return NextResponse.json({ error: 'no valid ids provided' }, { status: 400 });
+  }
+  if (ids.length > MAX_IDS_PER_REQUEST) {
+    return NextResponse.json(
+      { error: `too many ids (max ${MAX_IDS_PER_REQUEST})` },
+      { status: 400 },
+    );
+  }
+
+  try {
+    const sorted = [...ids].toSorted((a, b) => a - b);
+    const availability = await getCachedTraceAvailability(sorted);
+    return cachedJson(availability);
+  } catch (error) {
+    console.error('Error fetching trace availability:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index fdcf8952..b93799db 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -6,7 +6,7 @@ import React, { useCallback, useEffect, useMemo, useRef } from 'react';
 
 import { GRADIENT_NUDGE_EVENT } from '@/lib/nudges/registry';
 import { useInference } from '@/components/inference/InferenceContext';
-import { useTraceHistograms } from '@/hooks/api/use-trace-histograms';
+import { useTraceAvailability } from '@/hooks/api/use-trace-availability';
 import { useRouter } from 'next/navigation';
 import ChartLegend from '@/components/ui/chart-legend';
 import { useUnofficialRun } from '@/components/unofficial-run-provider';
@@ -497,8 +497,11 @@ const ScatterGraph = React.memo(
     // All official points for rendering (unfiltered — visibility via opacity)
     const pointsData = useMemo(() => Object.values(groupedData).flat(), [groupedData]);
 
-    // Trace-replay histograms (ISL / OSL distributions) for agentic points.
-    // Pre-fetch the whole visible set so tooltip render stays synchronous.
+    // Bulk presence lookup for agentic points: which ids have a stored
+    // trace_replay blob → controls the "View charts" button in the pinned
+    // tooltip. We deliberately don't fetch the histograms themselves here;
+    // a 95-point dsv4-b300 dashboard would pull GB of profile blobs through
+    // Neon's HTTP API and trip its 64 MB per-response cap.
     const agenticIds = useMemo(() => {
       const ids: number[] = [];
       for (const p of pointsData) {
@@ -506,7 +509,7 @@ const ScatterGraph = React.memo(
       }
       return ids;
     }, [pointsData]);
-    const { data: traceHistograms } = useTraceHistograms(agenticIds);
+    const { data: traceAvailability } = useTraceAvailability(agenticIds);
     const router = useRouter();
 
     // Gradient label data
@@ -774,8 +777,7 @@ const ScatterGraph = React.memo(
             hardwareConfig,
             isTracked: trackedConfigIdsRef.current.has(buildPointConfigId(d)),
             runUrl: d.run_url ? updateRepoUrl(d.run_url) : undefined,
-            traceHistogram:
-              typeof d.id === 'number' ? (traceHistograms?.[d.id] ?? undefined) : undefined,
+            hasTrace: typeof d.id === 'number' ? traceAvailability?.[d.id] === true : false,
           }),
         getRulerX: (d: InferenceData, xScale: any) => (xScale as ContinuousScale)(d.x),
         getRulerY: (d: InferenceData, yScale: any) => (yScale as ContinuousScale)(d.y),
@@ -842,10 +844,11 @@ const ScatterGraph = React.memo(
         removeTrackedConfig,
         chartDefinition.chartType,
         selectedPrecisions,
-        // Tooltip content closure reads traceHistograms to decide whether to
-        // show the "View charts" button — rebuild config when the histogram
-        // fetch resolves so the button appears for points that have data.
-        traceHistograms,
+        // Tooltip content closure reads traceAvailability to decide whether
+        // to render the "View charts" button — rebuild config when the
+        // presence fetch resolves so the button appears for points that
+        // have a trace_replay blob.
+        traceAvailability,
         router,
       ],
     );
diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts
index ccc371f9..ed68c41b 100644
--- a/packages/app/src/components/inference/utils/tooltipUtils.ts
+++ b/packages/app/src/components/inference/utils/tooltipUtils.ts
@@ -20,12 +20,13 @@ export interface TooltipConfig {
   /** URL to the GitHub Actions workflow run */
   runUrl?: string;
   /**
-   * Per-request ISL/OSL arrays for agentic points, sourced from the stored
-   * aiperf `profile_export.jsonl`. Used to detect whether the point has any
-   * trace data (so the "View charts" button can appear); the actual
-   * distributions are rendered on the detail page, not inline.
+   * Whether this agentic point has a stored trace_replay blob. Controls
+   * visibility of the "View charts" button — the actual distributions are
+   * rendered on the detail page, not inline, so all the tooltip needs is a
+   * presence boolean (sourced from the bulk `/api/v1/trace-availability`
+   * call so we don't ship megabytes of profile JSONL just for this check).
    */
-  traceHistogram?: { isl: number[]; osl: number[] } | undefined;
+  hasTrace?: boolean;
 }
 
 export interface OverlayTooltipConfig extends TooltipConfig {
@@ -221,7 +222,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
     selectedYAxisMetric,
     hardwareConfig,
     runUrl,
-    traceHistogram,
+    hasTrace,
   } = config;
 
   return `
@@ -271,7 +272,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
       </div>
       ${generateAgenticHTML(d)}
       ${runLinkHTML(runUrl)}
-      ${viewChartsButtonHTML(isPinned, Boolean(traceHistogram))}
+      ${viewChartsButtonHTML(isPinned, Boolean(hasTrace))}
       ${
         isPinned
           ? `<button data-action="track-over-time" style="
diff --git a/packages/app/src/hooks/api/use-trace-availability.ts b/packages/app/src/hooks/api/use-trace-availability.ts
new file mode 100644
index 00000000..02176d59
--- /dev/null
+++ b/packages/app/src/hooks/api/use-trace-availability.ts
@@ -0,0 +1,29 @@
+import { useQuery } from '@tanstack/react-query';
+
+export type TraceAvailabilityMap = Record<number, true>;
+
+async function fetchTraceAvailability(
+  ids: number[],
+  signal?: AbortSignal,
+): Promise<TraceAvailabilityMap> {
+  if (ids.length === 0) return {};
+  const res = await fetch(`/api/v1/trace-availability?ids=${ids.join(',')}`, { signal });
+  if (!res.ok) throw new Error(`trace-availability ${res.status}`);
+  return (await res.json()) as TraceAvailabilityMap;
+}
+
+/**
+ * Bulk presence lookup: which of the given `benchmark_results.id`s have a
+ * stored trace_replay blob. Used by the scatter chart to decide whether to
+ * surface the "View charts" button — cheap boolean per id instead of
+ * shipping multi-MB profile blobs just for the check.
+ */
+export function useTraceAvailability(ids: number[], enabled = true) {
+  const sortedKey = [...new Set(ids)].toSorted((a, b) => a - b);
+  return useQuery({
+    queryKey: ['trace-availability', sortedKey.join(',')] as const,
+    queryFn: ({ signal }: { signal: AbortSignal }) => fetchTraceAvailability(sortedKey, signal),
+    enabled: enabled && sortedKey.length > 0,
+    staleTime: 5 * 60 * 1000,
+  });
+}
diff --git a/packages/db/src/queries/trace-availability.ts b/packages/db/src/queries/trace-availability.ts
new file mode 100644
index 00000000..155b3d4c
--- /dev/null
+++ b/packages/db/src/queries/trace-availability.ts
@@ -0,0 +1,34 @@
+/**
+ * Bulk "does this point have a trace_replay blob?" lookup. Used by the
+ * inference scatter chart to decide whether to render a "View charts"
+ * button in the pinned tooltip — a pure presence check that doesn't need
+ * the multi-megabyte blob payload `getTraceHistograms` ships.
+ *
+ * Going through `trace-histograms` for this trips Neon's 64 MB
+ * per-HTTP-response cap as soon as one chunk's combined gzip payload
+ * exceeds the cap (high-conc 8×8 rows can be 13 MB compressed each).
+ */
+
+import type { DbClient } from '../connection.js';
+
+/** Map of `benchmark_results.id` → true for each id that has a trace_replay blob. */
+export type TraceAvailabilityMap = Record<number, true>;
+
+export async function getTraceAvailability(
+  sql: DbClient,
+  benchmarkResultIds: number[],
+): Promise<TraceAvailabilityMap> {
+  if (benchmarkResultIds.length === 0) return {};
+
+  const rows = (await sql`
+    select br.id
+    from benchmark_results br
+    join agentic_trace_replay atr on atr.id = br.trace_replay_id
+    where br.id = any(${benchmarkResultIds}::bigint[])
+      and atr.profile_export_jsonl_gz is not null
+  `) as { id: number }[];
+
+  const result: TraceAvailabilityMap = {};
+  for (const row of rows) result[Number(row.id)] = true;
+  return result;
+}

From 80468ebbb3f733db613de9241b82b6c159685b4d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 4 Jun 2026 13:02:50 -0500
Subject: [PATCH 58/96] feat(chart-series): per-DP-rank KV cache utilization
 overlay
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The cluster-average KV util line hides load skew on DEP configs — 8
ranks averaging 20% can hide one rank at 12% and another at 23%.

Bump CHART_SERIES_VERSION 7 -> 8 to keep one entry per engine in
kvCacheUsageByEngine. The detail page draws each rank in the
request-timeline palette (so DP indices read as the same color in
both views) and overlays the bold red "Avg" line on top.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../agentic-point/agentic-point-detail.tsx    | 87 ++++++++++++++-----
 .../agentic-point/time-series-chart.tsx       | 15 +++-
 .../src/hooks/api/use-trace-server-metrics.ts |  5 ++
 packages/db/src/etl/compute-chart-series.ts   | 41 ++++++++-
 .../db/src/queries/trace-server-metrics.ts    |  7 ++
 5 files changed, 131 insertions(+), 24 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index b047ea8f..1ce321ee 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -87,6 +87,25 @@ const CHART_SIZES = {
   expanded: { width: 1300, height: 520 },
 };
 
+// Per-DP-rank color palette for DEP runs (one distinct color per rank in
+// the KV cache utilization overlay). Mirrors the request-timeline row
+// palette so the same DP index reads as the same color across both views.
+// Wraps mod-N if more than 12 ranks ever land.
+const DP_RANK_PALETTE = [
+  '#3b82f6',
+  '#ef4444',
+  '#10b981',
+  '#f59e0b',
+  '#a855f7',
+  '#06b6d4',
+  '#f97316',
+  '#84cc16',
+  '#ec4899',
+  '#14b8a6',
+  '#8b5cf6',
+  '#eab308',
+];
+
 type DetailView = 'point' | 'timeline' | 'aggregates';
 const VIEW_OPTIONS: SegmentedToggleOption<DetailView>[] = [
   { value: 'point', label: 'Per-point', testId: 'detail-view-point' },
@@ -239,28 +258,56 @@ export function AgenticPointDetail({ id }: Props) {
               // For SGLang hicache rows we have both GPU (HBM) util and
               // host (CPU offload pool) util — overlay them as two lines.
               const hasHost = metrics.hostKvCacheUsage.length > 0;
+              // DEP runs report one series per engine. When there's more
+              // than one, draw one line per rank in distinct colors so
+              // load skew is visible at a glance; cluster-average sits on
+              // top in bold red so it stands out.
+              const perEngine = metrics.kvCacheUsageByEngine ?? [];
+              const hasPerEngine = perEngine.length > 1;
+              // Render order matters: per-engine first → average drawn on top.
+              const series = [
+                ...(hasPerEngine
+                  ? perEngine.map((e, i) => ({
+                      name: `DP ${e.engineLabel}`,
+                      data: rollingAverage(e.points, 50),
+                      color: DP_RANK_PALETTE[i % DP_RANK_PALETTE.length]!,
+                      // Thin + translucent so the Avg line on top reads as
+                      // the headline number, not just one more series.
+                      strokeWidth: 1,
+                      strokeOpacity: 0.5,
+                    }))
+                  : []),
+                {
+                  name: hasHost
+                    ? 'GPU HBM (avg n=50)'
+                    : hasPerEngine
+                      ? 'Avg'
+                      : 'GPU KV cache (avg n=50)',
+                  data: rollingAverage(metrics.kvCacheUsage, 50),
+                  // Skip raw scatter when per-engine overlay is on — the
+                  // DP-rank lines already convey the spread, dots would be noise.
+                  rawData: hasPerEngine ? undefined : metrics.kvCacheUsage,
+                  // Bold red Avg sits on top of the translucent per-DP lines.
+                  // DP 1 in the palette is #ef4444 (lighter red); the darker
+                  // #dc2626 here plus the heavier stroke keeps it distinct.
+                  color: hasPerEngine ? '#dc2626' : '#3b82f6',
+                  strokeWidth: hasPerEngine ? 3.5 : 2,
+                },
+                ...(hasHost
+                  ? [
+                      {
+                        name: 'CPU offload pool (avg n=50)',
+                        data: rollingAverage(metrics.hostKvCacheUsage, 50),
+                        rawData: metrics.hostKvCacheUsage,
+                        color: '#f97316',
+                        strokeWidth: 2,
+                      },
+                    ]
+                  : []),
+              ];
               return (
                 <TimeSeriesChart
-                  series={[
-                    {
-                      name: hasHost ? 'GPU HBM (avg n=50)' : 'GPU KV cache (avg n=50)',
-                      data: rollingAverage(metrics.kvCacheUsage, 50),
-                      rawData: metrics.kvCacheUsage,
-                      color: '#3b82f6',
-                      strokeWidth: 2,
-                    },
-                    ...(hasHost
-                      ? [
-                          {
-                            name: 'CPU offload pool (avg n=50)',
-                            data: rollingAverage(metrics.hostKvCacheUsage, 50),
-                            rawData: metrics.hostKvCacheUsage,
-                            color: '#f97316',
-                            strokeWidth: 2,
-                          },
-                        ]
-                      : []),
-                  ]}
+                  series={series}
                   durationS={metrics.durationS}
                   yMax={1}
                   yFmt={(v) => `${(v * 100).toFixed(0)}%`}
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index 75d7bb1e..399f965d 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -15,6 +15,11 @@ interface Series {
   color: string;
   /** Override default stroke width (1.8). Use higher values for emphasis lines. */
   strokeWidth?: number;
+  /** Stroke opacity (0..1). Use < 1 for background/underlay lines. */
+  strokeOpacity?: number;
+  /** Hide from the hover tooltip and the legend chip row (e.g. per-engine
+   *  underlay lines that would clutter them). The path still renders. */
+  hideFromHover?: boolean;
 }
 
 interface TimeSeriesChartProps {
@@ -287,6 +292,7 @@ export function TimeSeriesChart({
     const t = fraction * xMax;
     const items: HoverItem[] = [];
     for (const s of series) {
+      if (s.hideFromHover) continue;
       const v = interpAt(s.data, t);
       if (v === null || !Number.isFinite(v)) continue;
       items.push({ color: s.color, label: s.name, value: yFmt(v) });
@@ -363,6 +369,7 @@ export function TimeSeriesChart({
             fill="none"
             stroke={s.color}
             strokeWidth={s.strokeWidth ?? 1.8}
+            strokeOpacity={s.strokeOpacity ?? 1}
           />
         );
       })}
@@ -418,11 +425,13 @@ export function TimeSeriesChart({
         </text>
       )}
 
-      {/* Legend */}
+      {/* Legend — skip series flagged hideFromHover so per-engine
+          underlays don't clutter the chip row. */}
       {(() => {
+        const visible = series.filter((s) => !s.hideFromHover);
         const chipY = H - 8;
-        const chipW = innerW / Math.max(1, series.length);
-        return series.map((s, i) => {
+        const chipW = innerW / Math.max(1, visible.length);
+        return visible.map((s, i) => {
           const x = PAD.left + i * chipW;
           return (
             <g key={`leg${i}`}>
diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts
index bac67a50..11905aaa 100644
--- a/packages/app/src/hooks/api/use-trace-server-metrics.ts
+++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts
@@ -46,6 +46,11 @@ export interface TraceServerMetrics {
   prefixCacheHitsTps: TimeSeriesPoint[];
   /** Host (CPU offload) KV cache utilization, 0..1. SGLang hicache only. */
   hostKvCacheUsage: TimeSeriesPoint[];
+  /**
+   * Per-DP-rank KV cache utilization. Empty for single-engine deployments —
+   * the cluster-average `kvCacheUsage` line covers that case alone.
+   */
+  kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
 }
 
 async function fetchTraceServerMetrics(
diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts
index 8105961e..46600f7d 100644
--- a/packages/db/src/etl/compute-chart-series.ts
+++ b/packages/db/src/etl/compute-chart-series.ts
@@ -44,8 +44,13 @@ import { streamObject } from 'stream-json/streamers/stream-object.js';
  * v7: extract sglang:hicache_host_{used,total}_tokens into a new
  * hostKvCacheUsage series so the KV cache utilization chart can plot
  * the CPU offload pool's usage alongside the on-GPU HBM line.
+ *
+ * v8: keep the per-engine dimension on kv_cache_usage_perc as
+ * `kvCacheUsageByEngine` (one entry per DP rank). The cluster-average
+ * line hides load skew on DEP configs; the detail page overlays the
+ * per-rank lines so a hot rank is visible at a glance.
  */
-export const CHART_SERIES_VERSION = 7;
+export const CHART_SERIES_VERSION = 8;
 
 export interface TimeSeriesPoint {
   /** Seconds from benchmark start. */
@@ -89,6 +94,15 @@ export interface ChartSeries {
    * Frontend overlays this on the KV cache util chart as a second line.
    */
   hostKvCacheUsage: TimeSeriesPoint[];
+  /**
+   * Per-DP-rank KV cache utilization (0..1 each). One entry per engine
+   * series found in the raw metric, sorted by the numeric value of the
+   * `engine` / `engine_idx` / `dp_rank` label, falling back to the
+   * series-array index when that label is absent or non-numeric. Empty
+   * for single-engine deployments — the average `kvCacheUsage` line covers
+   * that case alone. The detail page overlays these to expose DEP load skew.
+   */
+  kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
 }
 
 // ── Raw blob shapes (subset we read) ────────────────────────────────────
@@ -277,6 +291,30 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
   const kvCacheUsage: TimeSeriesPoint[] = sortedEntries(
     aggregateByStart(kvSeries, 'avg', 'avg'),
   ).map(([t, v]) => ({ t: tOf(t), value: v }));
+  // Per-engine breakdown of the same metric. We only emit it when there's
+  // more than one series — single-engine deployments would just duplicate
+  // the cluster-average line.
+  const kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[] = [];
+  if (kvSeries && kvSeries.length > 1) {
+    // Sort by numeric engine label when present so rank 0..N renders in
+    // order; fall back to series-array index otherwise.
+    const decorated = kvSeries.map((s, idx) => {
+      const raw =
+        s.labels?.['engine'] ?? s.labels?.['engine_idx'] ?? s.labels?.['dp_rank'] ?? String(idx);
+      const numeric = Number(raw);
+      return { series: s, idx, label: raw, sortKey: Number.isFinite(numeric) ? numeric : idx };
+    });
+    decorated.sort((a, b) => a.sortKey - b.sortKey);
+    for (const { series, label } of decorated) {
+      const pts: TimeSeriesPoint[] = [];
+      for (const ts of series.timeslices ?? []) {
+        if (typeof ts.start_ns !== 'number' || typeof ts.avg !== 'number') continue;
+        if (!Number.isFinite(ts.avg)) continue;
+        pts.push({ t: tOf(ts.start_ns), value: ts.avg });
+      }
+      if (pts.length > 0) kvCacheUsageByEngine.push({ engineLabel: label, points: pts });
+    }
+  }
 
   // Prefix cache hit rate per scrape: Σhits.rate / Σqueries.rate across
   // engines, joined on start_ns. SGLang names: cached_tokens / prompt_tokens.
@@ -441,5 +479,6 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries {
     decodeTps,
     prefixCacheHitsTps,
     hostKvCacheUsage,
+    kvCacheUsageByEngine,
   };
 }
diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts
index eccb0a0c..5594d514 100644
--- a/packages/db/src/queries/trace-server-metrics.ts
+++ b/packages/db/src/queries/trace-server-metrics.ts
@@ -75,6 +75,11 @@ export interface TraceServerMetrics {
   prefixCacheHitsTps: TimeSeriesPoint[];
   /** Host (CPU offload) KV cache utilization, 0..1. SGLang hicache only. */
   hostKvCacheUsage: TimeSeriesPoint[];
+  /**
+   * Per-DP-rank KV cache utilization. Empty for single-engine deployments —
+   * the cluster-average `kvCacheUsage` line covers that case alone.
+   */
+  kvCacheUsageByEngine: { engineLabel: string; points: TimeSeriesPoint[] }[];
 }
 
 interface RawMetaRow extends PointMeta {
@@ -121,6 +126,8 @@ function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics {
     // v2 chart_series rows pre-backfill don't have this field — default to []
     prefixCacheHitsTps: series.prefixCacheHitsTps ?? [],
     hostKvCacheUsage: series.hostKvCacheUsage ?? [],
+    // v8+ field; older chart_series rows lack it → omit per-engine overlay.
+    kvCacheUsageByEngine: series.kvCacheUsageByEngine ?? [],
   };
 }
 

From 3a5ef158f615ba2177e7b911639d3ccd832159f2 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 4 Jun 2026 13:24:56 -0500
Subject: [PATCH 59/96] feat(scatter): restrict non-e2e xmodes to e2e-pareto
 points
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The TTFT, interactivity, session-time, and prefill-tps charts used to
compute their own Pareto frontiers on the swapped x metric. That let a
vendor benchmark-hack: tune a config to top TTFT while quietly tanking
decode (or vice versa), and post a chart-topping point that didn't
reflect real e2e performance.

When xmode != 'e2e', filter the displayed point set to those that sit
on the (e2e_latency, y) Pareto frontier — same set of points across
every non-e2e chart, just rendered at the chosen x metric. The e2e
chart itself is unchanged and remains the source of truth.

Per Oren's review:
  "all and only the points that show up on e2e latency pareto should
   show up on ttft & interactivity & prefill tok/s/user pareto."

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../components/inference/InferenceContext.tsx |   1 +
 .../inference/hooks/useChartData.ts           | 104 ++++++++++++++++++
 2 files changed, 105 insertions(+)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 2e5a245f..c446dc71 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -285,6 +285,7 @@ export function InferenceProvider({
     selectedPercentile,
     compareGpuPair ?? null,
     benchmarkRunId,
+    selectedXAxisMode,
   );
 
   // For GPU comparison date picker — use shared availability data from global filters
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 328750f0..397572df 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -22,6 +22,84 @@ import {
 import { transformBenchmarkRows, withPercentile } from '@/lib/benchmark-transform';
 import { Sequence, type Model } from '@/lib/data-mappings';
 import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils';
+import {
+  paretoFrontLowerLeft,
+  paretoFrontLowerRight,
+  paretoFrontUpperLeft,
+  paretoFrontUpperRight,
+} from '@/lib/chart-utils';
+
+type XAxisMode = 'ttft' | 'e2e' | 'interactivity' | 'session-time' | 'prefill-tps';
+
+/**
+ * Resolve the percentile-prefixed e2e-latency field name for the given
+ * percentile (e.g. 'median_e2el', 'p90_e2el').
+ */
+function e2elFieldFor(percentile: string): string {
+  return withPercentile('median_e2el', percentile);
+}
+
+/**
+ * Compute the set of benchmark_results.id values that sit on the
+ * (e2e_latency, y) Pareto frontier within each (hwKey, precision, date)
+ * group. Used to restrict the non-e2e xmode charts (ttft, interactivity,
+ * session-time, prefill-tps) so they show *only* the points that win on
+ * end-to-end latency — preventing benchmark-hacking where a config tops
+ * one axis while tanking the other.
+ *
+ * Returns null when the y-metric has no roofline direction declared on
+ * the e2e chart (caller falls back to no filtering in that case).
+ */
+function e2eParetoIds(
+  points: InferenceData[],
+  selectedYAxisMetric: string,
+  percentile: string,
+): Set<number> | null {
+  const e2eChartDef = (chartDefinitions as ChartDefinition[]).find((c) => c.chartType === 'e2e');
+  if (!e2eChartDef) return null;
+  const dir = e2eChartDef[`${selectedYAxisMetric}_roofline` as keyof ChartDefinition] as
+    | 'upper_right'
+    | 'upper_left'
+    | 'lower_left'
+    | 'lower_right'
+    | undefined;
+  if (!dir) return null;
+  const frontierFn =
+    dir === 'upper_right'
+      ? paretoFrontUpperRight
+      : dir === 'upper_left'
+        ? paretoFrontUpperLeft
+        : dir === 'lower_left'
+          ? paretoFrontLowerLeft
+          : paretoFrontLowerRight;
+  const e2elField = e2elFieldFor(percentile);
+  const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey;
+
+  // Re-frame each candidate point in (e2el, y) space, then compute the
+  // pareto per (hwKey, precision, date) bucket — frontiers don't span dates
+  // (a May 17 point can't dominate a May 15 plot).
+  const byGroup = new Map<string, InferenceData[]>();
+  for (const p of points) {
+    const yValue = (p[metricKey] as { y?: number } | undefined)?.y;
+    const xValue = (p as unknown as Record<string, unknown>)[e2elField];
+    if (typeof xValue !== 'number' || !Number.isFinite(xValue)) continue;
+    if (typeof yValue !== 'number' || !Number.isFinite(yValue)) continue;
+    const key = `${p.hwKey}|${p.precision}|${p.date}`;
+    let bucket = byGroup.get(key);
+    if (!bucket) {
+      bucket = [];
+      byGroup.set(key, bucket);
+    }
+    bucket.push({ ...p, x: xValue, y: yValue });
+  }
+  const ids = new Set<number>();
+  for (const bucket of byGroup.values()) {
+    for (const f of frontierFn(bucket)) {
+      if (typeof f.id === 'number') ids.add(f.id);
+    }
+  }
+  return ids;
+}
 
 /** Build deduplicated comparison dates, excluding the main run date. */
 export function buildComparisonDates(
@@ -92,6 +170,15 @@ export function useChartData(
    * config — disambiguates when two runs land on the same date.
    */
   selectedRunId?: string,
+  /**
+   * Current x-axis mode. When set to anything other than 'e2e', the displayed
+   * data is filtered to the (e2e-latency, y) Pareto frontier so the ttft /
+   * interactivity / session-time / prefill-tps charts show only points that
+   * also win on end-to-end latency — preventing benchmark-hacking where a
+   * config tops one metric while tanking the other. The 'e2e' mode is the
+   * source of truth and keeps the full point set.
+   */
+  selectedXAxisMode: XAxisMode = 'e2e',
 ) {
   // When the selected date is the latest available, use '' (empty string) to match
   // the initial no-date query key, reusing the eagerly-fetched benchmarks from the
@@ -357,6 +444,21 @@ export function useChartData(
 
         filteredData = filterDataByCostLimit(filteredData, chartDefinition, selectedYAxisMetric);
 
+        // When the user is NOT viewing the e2e latency chart, restrict the
+        // displayed points to those that sit on the (e2e_latency, y) Pareto
+        // frontier — i.e. "this is the e2e chart, we're just plotting the
+        // ttft value." Prevents benchmark-hacking where a config tops one
+        // axis (TTFT, interactivity, prefill-tps) while quietly tanking
+        // end-to-end latency.
+        if (selectedXAxisMode !== 'e2e') {
+          const paretoIds = e2eParetoIds(filteredData, selectedYAxisMetric, selectedPercentile);
+          if (paretoIds) {
+            filteredData = filteredData.filter(
+              (d) => typeof d.id === 'number' && paretoIds.has(d.id),
+            );
+          }
+        }
+
         // Filter to points that have the selected metric, then remap x/y
         const hasMetric = filteredData.some((d) => metricKey in d);
         const isTtftX = typeof xAxisField === 'string' && xAxisField.endsWith('_ttft');
@@ -407,6 +509,8 @@ export function useChartData(
     userPowers,
     stableChartDefinitions,
     compareGpuPair,
+    selectedXAxisMode,
+    selectedPercentile,
   ]);
 
   return { graphs, loading, error, hardwareConfig };

From 5035e17a8fdc2b6c9b86511075d22687a2b1f731 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 4 Jun 2026 13:36:22 -0500
Subject: [PATCH 60/96] fix(scatter): keep non-pareto points visible on non-e2e
 xmodes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previous change filtered the displayed data down to e2e-Pareto winners,
which hid every dominated config from the TTFT / interactivity /
session-time / prefill-tps views. Users couldn't see where the
non-optimal configs actually sit on the alternative axes — losing
diagnostic visibility just to enforce the anti-benchmark-hack rule.

Switch from hard filter to a per-point `isOnE2eFrontier` flag: every
point still renders as scatter, only the e2e-Pareto winners feed the
frontier line. ScatterGraph honors the flag in its roofline compute
so the line stays restricted to non-hackable configs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../inference/hooks/useChartData.ts           | 30 ++++++++++---------
 .../app/src/components/inference/types.ts     | 11 +++++++
 .../components/inference/ui/ScatterGraph.tsx  | 14 ++++++++-
 3 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 397572df..50e6d87d 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -444,20 +444,17 @@ export function useChartData(
 
         filteredData = filterDataByCostLimit(filteredData, chartDefinition, selectedYAxisMetric);
 
-        // When the user is NOT viewing the e2e latency chart, restrict the
-        // displayed points to those that sit on the (e2e_latency, y) Pareto
-        // frontier — i.e. "this is the e2e chart, we're just plotting the
-        // ttft value." Prevents benchmark-hacking where a config tops one
-        // axis (TTFT, interactivity, prefill-tps) while quietly tanking
-        // end-to-end latency.
-        if (selectedXAxisMode !== 'e2e') {
-          const paretoIds = e2eParetoIds(filteredData, selectedYAxisMetric, selectedPercentile);
-          if (paretoIds) {
-            filteredData = filteredData.filter(
-              (d) => typeof d.id === 'number' && paretoIds.has(d.id),
-            );
-          }
-        }
+        // When the user is NOT viewing the e2e latency chart, mark each
+        // point with whether it sits on the (e2e_latency, y) Pareto
+        // frontier for its (hwKey, precision, date) group. The chart
+        // still renders every point as scatter — only e2e-Pareto winners
+        // feed the roofline (ScatterGraph honors the flag). Prevents
+        // benchmark-hacking the TTFT / interactivity line by tanking
+        // decode (or vice versa) without hiding non-optimal configs.
+        const e2eParetoSet =
+          selectedXAxisMode === 'e2e'
+            ? null
+            : e2eParetoIds(filteredData, selectedYAxisMetric, selectedPercentile);
 
         // Filter to points that have the selected metric, then remap x/y
         const hasMetric = filteredData.some((d) => metricKey in d);
@@ -470,11 +467,16 @@ export function useChartData(
                 const yValue = (d[metricKey] as { y: number })?.y ?? d.y;
                 const roof = (d[metricKey] as { roof: boolean })?.roof ?? false;
                 const xValue = (d as any)[xAxisField] ?? d.x;
+                const isOnE2eFrontier =
+                  e2eParetoSet === null
+                    ? undefined
+                    : typeof d.id === 'number' && e2eParetoSet.has(d.id);
                 return {
                   ...d,
                   x: xValue,
                   y: yValue,
                   roof,
+                  isOnE2eFrontier,
                 };
               })
               // When TTFT is on the x-axis, apply the latency limit to filter
diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts
index bedded40..219e6bd7 100644
--- a/packages/app/src/components/inference/types.ts
+++ b/packages/app/src/components/inference/types.ts
@@ -158,6 +158,17 @@ export interface InferenceData extends Partial<Omit<AggDataEntry, AggDataConflic
   x: number;
   y: number;
   hidden?: boolean;
+  /**
+   * Whether this point sits on the (e2e_latency, y-metric) Pareto frontier.
+   * Set by useChartData when `selectedXAxisMode !== 'e2e'`. The TTFT /
+   * interactivity / session-time / prefill-tps charts use this flag to
+   * restrict their roofline computation to e2e-Pareto winners — vendors
+   * can't benchmark-hack TTFT by tanking decode (or vice versa) and still
+   * appear on the frontier line — while keeping every point visible as
+   * scatter so the user can see where dominated configs actually sit.
+   * Undefined when the chart is in e2e mode (no remapping needed).
+   */
+  isOnE2eFrontier?: boolean;
 
   // Overridden fields with narrower types
   hwKey: string;
diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index b93799db..a5cbc9cf 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -383,7 +383,19 @@ const ScatterGraph = React.memo(
         }
         const combined: InferenceData[] = [];
         for (const datePoints of byDate.values()) {
-          combined.push(...frontierFn(datePoints));
+          // In non-e2e xmodes, useChartData stamps every point with an
+          // `isOnE2eFrontier` flag so the line is restricted to the
+          // e2e-Pareto winners — same set of points across every chart,
+          // just re-plotted at the chosen x metric. When the flag is
+          // present on ANY point in the bucket, narrow to the winners
+          // before paretoing (otherwise we'd recompute a fresh frontier
+          // on the swapped x axis and reintroduce the benchmark hack).
+          const flagged = datePoints.some((p) => p.isOnE2eFrontier !== undefined);
+          const seedPoints = flagged
+            ? datePoints.filter((p) => p.isOnE2eFrontier === true)
+            : datePoints;
+          if (seedPoints.length === 0) continue;
+          combined.push(...frontierFn(seedPoints));
         }
         combined.sort((a, b) => a.x - b.x);
         result[hwKey] = combined;

From 2bfea38c9b200c4fbd09592cddf8b0e788e4a580 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 4 Jun 2026 13:43:09 -0500
Subject: [PATCH 61/96] fix(scatter): scope e2e-pareto restriction to agentic
 only
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixed-seq workloads don't have the multi-turn / session-time framing
that motivated the anti-hack rule — their e2e IS the request latency,
so a TTFT hack there reads honestly on e2e too. Reverting fixed-seq
to the prior per-axis Pareto avoids changing established leaderboard
semantics for non-agentic runs.

Agentic continues to mark `isOnE2eFrontier` on each point so the TTFT,
interactivity, session-time and prefill-tps lines stay restricted to
e2e-winning configs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../inference/hooks/useChartData.ts           | 30 ++++++++++++-------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 50e6d87d..3c67ff90 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -444,22 +444,30 @@ export function useChartData(
 
         filteredData = filterDataByCostLimit(filteredData, chartDefinition, selectedYAxisMetric);
 
-        // When the user is NOT viewing the e2e latency chart, mark each
-        // point with whether it sits on the (e2e_latency, y) Pareto
-        // frontier for its (hwKey, precision, date) group. The chart
-        // still renders every point as scatter — only e2e-Pareto winners
-        // feed the roofline (ScatterGraph honors the flag). Prevents
-        // benchmark-hacking the TTFT / interactivity line by tanking
-        // decode (or vice versa) without hiding non-optimal configs.
+        // For AGENTIC workloads only: when the user is NOT viewing the
+        // e2e latency chart, mark each point with whether it sits on the
+        // (e2e_latency, y) Pareto frontier for its (hwKey, precision,
+        // date) group. The chart still renders every point as scatter —
+        // only e2e-Pareto winners feed the roofline (ScatterGraph honors
+        // the flag). Prevents benchmark-hacking the TTFT / interactivity
+        // line by tanking decode (or vice versa) without hiding the
+        // non-optimal configs from view.
+        //
+        // Fixed-seq workloads keep the existing per-axis Pareto since
+        // there's no separate "session-time" notion of total latency —
+        // their e2e IS the request latency, so a TTFT hack there reads
+        // honestly on e2e too. The anti-hack constraint is specifically
+        // about multi-turn agentic where TTFT measures a tiny fraction
+        // of the user-visible session time.
+        const isAgentic = selectedSequence === Sequence.AgenticTraces;
         const e2eParetoSet =
-          selectedXAxisMode === 'e2e'
-            ? null
-            : e2eParetoIds(filteredData, selectedYAxisMetric, selectedPercentile);
+          isAgentic && selectedXAxisMode !== 'e2e'
+            ? e2eParetoIds(filteredData, selectedYAxisMetric, selectedPercentile)
+            : null;
 
         // Filter to points that have the selected metric, then remap x/y
         const hasMetric = filteredData.some((d) => metricKey in d);
         const isTtftX = typeof xAxisField === 'string' && xAxisField.endsWith('_ttft');
-        const isAgentic = selectedSequence === Sequence.AgenticTraces;
         const processedData = hasMetric
           ? filteredData
               .filter((d) => metricKey in d)

From cbeeb695a15391fde615792bf9ad9e3e4233b220 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 4 Jun 2026 13:56:40 -0500
Subject: [PATCH 62/96] feat(legend): info tooltip on Optimal Only for agentic
 non-e2e modes

Add an optional infoTooltip field to LegendSwitchConfig that renders a
small info icon next to the switch label. On agentic + non-e2e xmodes,
hovering it explains that "optimal" means on the end-to-end Pareto
frontier (not a per-axis Pareto), so users understand why off-frontier
points may appear above the line.

Hit target widened (-m-1.5 p-1.5) and delay dropped to 100ms so the
tiny icon isn't flaky to hover.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../components/inference/ui/ScatterGraph.tsx  | 15 ++++++++++-
 .../app/src/components/ui/chart-legend.tsx    | 26 +++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index a5cbc9cf..2552a334 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -12,7 +12,7 @@ import ChartLegend from '@/components/ui/chart-legend';
 import { useUnofficialRun } from '@/components/unofficial-run-provider';
 import { computeToggle } from '@/hooks/useTogglableSet';
 import { getHardwareConfig, getModelSortIndex } from '@/lib/constants';
-import { getChartWatermark } from '@/lib/data-mappings';
+import { getChartWatermark, Sequence } from '@/lib/data-mappings';
 import { formatNumber, getDisplayLabel, updateRepoUrl } from '@/lib/utils';
 import { D3Chart } from '@/lib/d3-chart/D3Chart';
 import type {
@@ -242,6 +242,8 @@ const ScatterGraph = React.memo(
       trackedConfigs,
       addTrackedConfig,
       removeTrackedConfig,
+      selectedXAxisMode,
+      selectedSequence,
     } = useInference();
 
     const {
@@ -2281,6 +2283,17 @@ const ScatterGraph = React.memo(
                   setHideNonOptimal(checked);
                   track('latency_hide_non_optimal_toggled', { enabled: checked });
                 },
+                // On agentic + non-e2e chart, "optimal" means "on the
+                // e2e-latency Pareto frontier" (not a per-axis Pareto on the
+                // current x metric). Explain that so users don't wonder why
+                // a point sitting above the line is still considered
+                // dominated.
+                ...(selectedSequence === Sequence.AgenticTraces && selectedXAxisMode !== 'e2e'
+                  ? {
+                      infoTooltip:
+                        "On agentic, optimal = on the end-to-end latency Pareto frontier, so a config can't win this axis by tanking e2e. Off-frontier points may appear above the line.",
+                    }
+                  : {}),
               },
               {
                 id: 'scatter-hide-point-labels',
diff --git a/packages/app/src/components/ui/chart-legend.tsx b/packages/app/src/components/ui/chart-legend.tsx
index 81d5f261..a20c9959 100644
--- a/packages/app/src/components/ui/chart-legend.tsx
+++ b/packages/app/src/components/ui/chart-legend.tsx
@@ -6,6 +6,7 @@ import {
   ArrowRightToLine,
   Circle,
   Diamond,
+  Info,
   Square,
   Triangle,
   X,
@@ -36,6 +37,8 @@ export interface LegendSwitchConfig {
   label: string;
   checked: boolean;
   onCheckedChange: (checked: boolean) => void;
+  /** Optional explainer rendered as an info-icon tooltip next to the label. */
+  infoTooltip?: React.ReactNode;
 }
 
 export interface LegendActionConfig {
@@ -273,6 +276,29 @@ export default function ChartLegend({
             >
               {sw.label}
             </Label>
+            {sw.infoTooltip && (
+              <TooltipProvider delayDuration={100}>
+                <TooltipRoot>
+                  <TooltipTrigger asChild>
+                    <button
+                      type="button"
+                      data-testid={`${sw.id}-info`}
+                      aria-label={`More info about ${sw.label}`}
+                      className="text-muted-foreground hover:text-foreground cursor-help -m-1.5 p-1.5 inline-flex items-center"
+                    >
+                      <Info size={14} />
+                    </button>
+                  </TooltipTrigger>
+                  <TooltipContent
+                    side="top"
+                    sideOffset={6}
+                    className="max-w-[260px] text-xs leading-snug"
+                  >
+                    {sw.infoTooltip}
+                  </TooltipContent>
+                </TooltipRoot>
+              </TooltipProvider>
+            )}
           </div>
         ))}
       </div>

From de5e51a1330d7c24f51850e729a19a2d8802d990 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 4 Jun 2026 14:50:42 -0500
Subject: [PATCH 63/96] fix(inference): don't scope chart to one run when runs
 cover different hardware
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two workflow runs landing on the same date for the same model+precision
but DIFFERENT hardware (e.g. a B300 dsv4 run and a B200 dsv4 run) each
get their own changelog entry. The single-run scoping guard matched runs
by model+precision only, so both counted as "runs with a changelog for
this model", the `runIdsWithModelChangelog.length > 1` check tripped, and
selecting either run scoped the benchmarks query to that one workflow
run — hiding the other GPU's curve entirely (carry-forward across
hardware silently broke).

Scope to a single run only when two runs contest the SAME full config_key
(model-precision-hardware-framework) — a genuine same-day re-run of one
hardware, where a DISTINCT ON merge could mix them. Complementary
different-hardware runs now both render via the normal date carry-forward.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../components/inference/InferenceContext.tsx | 41 +++++++++++++------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index c446dc71..244c713c 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -242,25 +242,42 @@ export function InferenceProvider({
   const modelPrefixesEarly = Object.entries(MODEL_PREFIX_MAPPING)
     .filter(([, model]) => model === selectedModel)
     .map(([prefix]) => prefix);
-  const runIdsWithModelChangelog: string[] = [];
+  // Map each FULL config_key (model-precision-hardware-framework) a run's
+  // changelog claims to the set of runs claiming it. Single-run scoping should
+  // only kick in when two runs contest the SAME full key — e.g. a same-day
+  // re-run of one hardware — because then a DISTINCT ON merge could mix them
+  // and the user needs to pick which run wins. Runs covering DIFFERENT hardware
+  // of the same model (e.g. a B300 run and a B200 run on the same date) are
+  // complementary: both must render via carry-forward. Matching on model+
+  // precision alone (the old behavior) wrongly treated those as alternatives
+  // and scoped the chart to one run, hiding the other GPU's curve.
+  const runsByConfigKey = new Map<string, Set<string>>();
   if (availableRuns) {
     for (const [runId, runInfo] of Object.entries(availableRuns)) {
       if (!runInfo.changelog) continue;
-      const matches = runInfo.changelog.entries.some((entry) =>
-        entry.config_keys.some((key) => {
+      for (const entry of runInfo.changelog.entries) {
+        for (const key of entry.config_keys) {
           const parts = key.split('-');
-          return modelPrefixesEarly.includes(parts[0]!) && effectivePrecisions.includes(parts[1]!);
-        }),
-      );
-      if (matches) runIdsWithModelChangelog.push(runId);
+          if (modelPrefixesEarly.includes(parts[0]!) && effectivePrecisions.includes(parts[1]!)) {
+            let runs = runsByConfigKey.get(key);
+            if (!runs) {
+              runs = new Set<string>();
+              runsByConfigKey.set(key, runs);
+            }
+            runs.add(runId);
+          }
+        }
+      }
     }
   }
+  // A run is "contested" only if some full config_key it claims is also claimed
+  // by another run. Only then does picking a run disambiguate anything.
+  const contestedRunIds = new Set<string>();
+  for (const runs of runsByConfigKey.values()) {
+    if (runs.size > 1) for (const r of runs) contestedRunIds.add(r);
+  }
   const benchmarkRunId =
-    selectedRunId &&
-    runIdsWithModelChangelog.length > 1 &&
-    runIdsWithModelChangelog.includes(selectedRunId)
-      ? String(selectedRunId)
-      : undefined;
+    selectedRunId && contestedRunIds.has(String(selectedRunId)) ? String(selectedRunId) : undefined;
 
   const {
     graphs,

From af8766ddbe9a3077b9a226cd3487f4f4e040e58b Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 11 Jun 2026 11:24:29 -0500
Subject: [PATCH 64/96] fix(inference): carry forward un-contested configs when
 a run is selected
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Selecting a workflow run in the picker scoped the ENTIRE benchmarks query
to that run, so any same-day config living in a different workflow run
vanished — e.g. with two vLLM runs and one SGLang run on the same date,
picking either vLLM run (contested, so scoping kicks in) hid the SGLang
curve entirely, while picking the SGLang run (uncontested, no scoping)
showed everything.

Fetch both the normal latest-per-config rows and the run-scoped rows, then
merge them. The selected run wins for every (model, precision, hardware,
framework, benchmark_type) group it actually produced, and ALL base rows
in those groups are dropped — including concurrency levels the run didn't
cover — so the DISTINCT-ON mixing that scoping exists to prevent can't
sneak back in. Every other config carries forward from the base rows.
benchmark_type is part of the replacement key so an agentic-only run can't
hide the same config's fixed-seq carry-forward.

The base query is the default view query so it's effectively always
cached; run selection adds no extra latency in practice.

Verified live: Jun 10, DSv4 B300, run 3/3 (vLLM affinity run) now renders
both b300_vllm (run-scoped) and b300_sglang (carried forward).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../components/inference/InferenceContext.tsx |  4 ++
 .../inference/hooks/useChartData.ts           | 41 ++++++++++---
 .../app/src/lib/benchmark-transform.test.ts   | 60 ++++++++++++++++++-
 packages/app/src/lib/benchmark-transform.ts   | 29 +++++++++
 4 files changed, 125 insertions(+), 9 deletions(-)

diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 3b994367..5d165e60 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -286,6 +286,10 @@ export function InferenceProvider({
   }
   // A run is "contested" only if some full config_key it claims is also claimed
   // by another run. Only then does picking a run disambiguate anything.
+  // Downstream (useChartData / mergeRunScopedRows) this no longer scopes the
+  // WHOLE chart to the run: only the configs the run actually produced are
+  // pinned to it, and every other config (e.g. another framework's same-day
+  // run) still carries forward from the normal latest-per-config rows.
   const contestedRunIds = new Set<string>();
   for (const runs of runsByConfigKey.values()) {
     if (runs.size > 1) for (const r of runs) contestedRunIds.add(r);
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 019d0691..e76c3123 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -19,7 +19,11 @@ import {
   getModelSortIndex,
   hardwareKeyMatchesAnyBase,
 } from '@/lib/constants';
-import { transformBenchmarkRows, withPercentile } from '@/lib/benchmark-transform';
+import {
+  mergeRunScopedRows,
+  transformBenchmarkRows,
+  withPercentile,
+} from '@/lib/benchmark-transform';
 import { Sequence, type Model } from '@/lib/data-mappings';
 import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils';
 import {
@@ -183,19 +187,40 @@ export function useChartData(
   // When the selected date is the latest available, use '' (empty string) to match
   // the initial no-date query key, reusing the eagerly-fetched benchmarks from the
   // materialized view instead of firing a redundant second fetch with identical data.
-  // When a specific run is selected, we always go through the runId branch and the
-  // date is effectively ignored — keep queryDate set so React Query still has a
-  // distinct cache key per date if the user navigates back to "latest".
   const queryDate =
     selectedRunDate && latestAvailableDate && selectedRunDate === latestAvailableDate
       ? ''
       : selectedRunDate;
 
+  // Two queries: the normal latest-per-config view (always), plus the
+  // run-scoped rows when a specific workflow run is selected. The merged
+  // result pins ONLY the configs the selected run produced to that run, and
+  // carries every other config forward from the base rows — selecting one of
+  // two same-day vLLM runs must not hide the day's SGLang curve just because
+  // it lives in a different workflow run. The base query is the default view
+  // query, so it's almost always already in the React Query cache.
+  const {
+    data: baseRows,
+    isLoading: baseLoading,
+    error: baseError,
+  } = useBenchmarks(selectedModel, queryDate, enabled);
   const {
-    data: allRows,
-    isLoading: queryLoading,
-    error: queryError,
-  } = useBenchmarks(selectedModel, queryDate, enabled, selectedRunId);
+    data: runRows,
+    isLoading: runLoading,
+    error: runError,
+  } = useBenchmarks(selectedModel, queryDate, enabled && Boolean(selectedRunId), selectedRunId);
+
+  const allRows = useMemo(() => {
+    if (!selectedRunId) return baseRows;
+    // Wait for the run rows before rendering a scoped view — rendering base
+    // rows first would flash the un-scoped chart, then swap contested points.
+    if (!runRows) return undefined;
+    if (!baseRows) return runRows;
+    return mergeRunScopedRows(runRows, baseRows);
+  }, [selectedRunId, runRows, baseRows]);
+
+  const queryLoading = baseLoading || (Boolean(selectedRunId) && runLoading);
+  const queryError = baseError ?? (selectedRunId ? runError : null);
 
   // GPU comparison: fetch data for each additional comparison date
   const comparisonDates = useMemo(
diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts
index 62cc1809..077e8c3e 100644
--- a/packages/app/src/lib/benchmark-transform.test.ts
+++ b/packages/app/src/lib/benchmark-transform.test.ts
@@ -2,7 +2,11 @@ import { describe, it, expect, vi } from 'vitest';
 
 import type { BenchmarkRow } from '@/lib/api';
 
-import { rowToAggDataEntry, transformBenchmarkRows } from './benchmark-transform';
+import {
+  mergeRunScopedRows,
+  rowToAggDataEntry,
+  transformBenchmarkRows,
+} from './benchmark-transform';
 
 function makeRow(overrides: Partial<BenchmarkRow> = {}): BenchmarkRow {
   return {
@@ -776,3 +780,57 @@ describe('transformBenchmarkRows — dp_attention narrowing', () => {
     expect(point.decode_dp_attention).toBe(true);
   });
 });
+
+describe('mergeRunScopedRows', () => {
+  const vllmRun = (over: Partial<BenchmarkRow> = {}) =>
+    makeRow({ model: 'dsv4', hardware: 'b300', framework: 'vllm', precision: 'fp4', ...over });
+  const sglangBase = (over: Partial<BenchmarkRow> = {}) =>
+    makeRow({ model: 'dsv4', hardware: 'b300', framework: 'sglang', precision: 'fp4', ...over });
+
+  it('pins configs the run covers to the run rows, replacing base rows', () => {
+    const runRows = [vllmRun({ id: 10, conc: 32 }), vllmRun({ id: 11, conc: 64 })];
+    const baseRows = [vllmRun({ id: 90, conc: 32 }), vllmRun({ id: 91, conc: 128 })];
+    const merged = mergeRunScopedRows(runRows, baseRows);
+    // All vllm base rows dropped (incl. conc=128 the run didn't cover) — a
+    // partial-sweep run must fully own its config or the DISTINCT-ON mixing
+    // the scoping exists to prevent comes right back.
+    expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 11]);
+  });
+
+  it('carries forward configs the run does not cover (the same-day other-framework curve)', () => {
+    const runRows = [vllmRun({ id: 10 })];
+    const baseRows = [
+      vllmRun({ id: 90 }),
+      sglangBase({ id: 91 }),
+      sglangBase({ id: 92, conc: 128 }),
+    ];
+    const merged = mergeRunScopedRows(runRows, baseRows);
+    expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 91, 92]);
+  });
+
+  it('keeps base rows of other hardware / precision / model untouched', () => {
+    const runRows = [vllmRun({ id: 10 })];
+    const baseRows = [
+      vllmRun({ id: 90, hardware: 'b200' }),
+      vllmRun({ id: 91, precision: 'fp8' }),
+      vllmRun({ id: 92, model: 'kimik2.5' }),
+    ];
+    const merged = mergeRunScopedRows(runRows, baseRows);
+    expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 90, 91, 92]);
+  });
+
+  it('scopes per benchmark_type — an agentic run does not hide fixed-seq carry-forward', () => {
+    const runRows = [vllmRun({ id: 10, benchmark_type: 'agentic_traces' })];
+    const baseRows = [
+      vllmRun({ id: 90, benchmark_type: 'agentic_traces' }),
+      vllmRun({ id: 91, benchmark_type: 'single_turn' }),
+    ];
+    const merged = mergeRunScopedRows(runRows, baseRows);
+    expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 91]);
+  });
+
+  it('returns base rows unchanged when the run produced nothing', () => {
+    const baseRows = [vllmRun({ id: 90 }), sglangBase({ id: 91 })];
+    expect(mergeRunScopedRows([], baseRows)).toBe(baseRows);
+  });
+});
diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts
index 9f6b43d1..8329c84b 100644
--- a/packages/app/src/lib/benchmark-transform.ts
+++ b/packages/app/src/lib/benchmark-transform.ts
@@ -172,6 +172,35 @@ export function withPercentile(key: string, percentile: string): string {
   return key.replace(/^(?:mean|median|p75|p90|p95|p99|p99\.9)_/u, `${percentile}_`);
 }
 
+// Replacement granularity for single-run scoping: the changelog config_key
+// tuple (model-precision-hardware-framework) plus benchmark_type, so an
+// agentic-only run never hides the same config's fixed-seq carry-forward.
+const runScopeKey = (r: BenchmarkRow): string =>
+  `${r.model}|${r.precision}|${r.hardware}|${r.framework}|${r.benchmark_type}`;
+
+/**
+ * Merge run-scoped benchmark rows with the normal latest-per-config rows.
+ *
+ * When the user picks a specific workflow run (to disambiguate two same-day
+ * sweeps of the same config), only the configs that run actually produced
+ * should be pinned to it — every other config must keep its normal
+ * carry-forward rows. Scoping the whole chart to the run (the old behavior)
+ * silently hid complementary configs that happened to land on the same date,
+ * e.g. selecting one of two same-day vLLM runs made the day's SGLang curve
+ * vanish because it lived in a different workflow run.
+ *
+ * Run rows win for every (model, precision, hardware, framework,
+ * benchmark_type) group they cover; base rows fill in the rest.
+ */
+export function mergeRunScopedRows(
+  runRows: BenchmarkRow[],
+  baseRows: BenchmarkRow[],
+): BenchmarkRow[] {
+  if (runRows.length === 0) return baseRows;
+  const claimed = new Set(runRows.map(runScopeKey));
+  return [...runRows, ...baseRows.filter((r) => !claimed.has(runScopeKey(r)))];
+}
+
 /**
  * Transform raw BenchmarkRow[] into chart-ready InferenceData[][] and HardwareConfig.
  * Returns one InferenceData[] per chart definition (e2e, interactivity).

From d6d31436abf38eb32e6383ab692ff0b8519ca32c Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 17 Jun 2026 11:25:49 -0500
Subject: [PATCH 65/96] fix: reconcile agentic data after master merge

---
 .../component/inference-chart-controls.cy.tsx |   4 +-
 .../inference/hooks/useChartData.ts           |   8 +-
 .../components/inference/ui/ChartDisplay.tsx  | 481 +++++++++---------
 .../components/inference/ui/ScatterGraph.tsx  |   5 +-
 .../components/unofficial-run-provider.tsx    |  10 +-
 packages/app/src/lib/api.ts                   |  15 +-
 packages/db/src/queries/benchmarks.ts         |  21 +-
 7 files changed, 282 insertions(+), 262 deletions(-)

diff --git a/packages/app/cypress/component/inference-chart-controls.cy.tsx b/packages/app/cypress/component/inference-chart-controls.cy.tsx
index 03e6a50c..5a6311f4 100644
--- a/packages/app/cypress/component/inference-chart-controls.cy.tsx
+++ b/packages/app/cypress/component/inference-chart-controls.cy.tsx
@@ -14,8 +14,8 @@ describe('Inference ChartControls', () => {
 
   it('renders the sequence selector with the current sequence', () => {
     // Default mock: selectedSequence = Sequence.EightK_OneK -> label "8K / 1K"
-    cy.get('#sequence-select').should('be.visible');
-    cy.get('#sequence-select').should('contain.text', '8K / 1K');
+    cy.get('#scenario-select').should('be.visible');
+    cy.get('#scenario-select').should('contain.text', '8K / 1K');
   });
 
   it('renders the precision multi-select with the current precision', () => {
diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts
index 0d1eac64..ee5acb88 100644
--- a/packages/app/src/components/inference/hooks/useChartData.ts
+++ b/packages/app/src/components/inference/hooks/useChartData.ts
@@ -220,13 +220,7 @@ export function useChartData(
     data: runRows,
     isLoading: runLoading,
     error: runError,
-  } = useBenchmarks(
-    selectedModel,
-    '',
-    enabled && Boolean(selectedRunId),
-    selectedRunId,
-    true,
-  );
+  } = useBenchmarks(selectedModel, '', enabled && Boolean(selectedRunId), selectedRunId, true);
 
   const allRows = useMemo(() => {
     if (!selectedRunId) return baseRows;
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx
index 3a431440..caf713cc 100644
--- a/packages/app/src/components/inference/ui/ChartDisplay.tsx
+++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx
@@ -429,217 +429,206 @@ export default function ChartDisplay() {
     });
   }, [useDerived, visibleGraphs, derivedMetrics, selectedXAxisMode, selectedYAxisMetric]);
 
-  const displayGraphs = isFirstLoad || isDerivedLoading
-    ? [
-        <Card key="skeleton-0">
-          <Skeleton className="h-7 w-2/4 mb-1" />
-          <Skeleton className="h-5 w-3/4 mb-2" />
-          <Skeleton className="h-[600px] w-full" />
-        </Card>,
-      ]
-    : renderableGraphs.length === 0
-      ? []
-      : renderableGraphs.map((graph, graphIndex) => {
-          const isTimelineMode = Boolean(
-            selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0,
-          );
-          const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode;
-          return (
-            <section key={graphIndex} className="pt-8 md:pt-0">
-              <figure data-testid="chart-figure" className="relative rounded-lg">
-                <ChartButtons
-                  chartId={`chart-${graphIndex}`}
-                  analyticsPrefix={
-                    isTimelineMode
-                      ? 'gpu_timeseries'
-                      : graph.chartDefinition.chartType === 'e2e'
-                        ? 'latency'
-                        : 'interactivity'
-                  }
-                  leadingControls={
-                    <SegmentedToggle
-                      value={getViewMode(graphIndex)}
-                      options={VIEW_MODE_OPTIONS}
-                      onValueChange={(v) => handleViewModeChange(graphIndex, v)}
-                      ariaLabel="View mode"
-                      testId={`inference-view-toggle-${graphIndex}`}
-                    />
-                  }
-                  hideImageExport={getViewMode(graphIndex) === 'table'}
-                  setIsLegendExpanded={setIsLegendExpanded}
-                  exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`}
-                  onExportMp4={
-                    replayAvailable ? () => replayHandlesRef.current[graphIndex]?.open() : undefined
-                  }
-                  onExportCsv={() => {
-                    const visibleData = graph.data.filter((d) =>
+  const displayGraphs =
+    isFirstLoad || isDerivedLoading
+      ? [
+          <Card key="skeleton-0">
+            <Skeleton className="h-7 w-2/4 mb-1" />
+            <Skeleton className="h-5 w-3/4 mb-2" />
+            <Skeleton className="h-[600px] w-full" />
+          </Card>,
+        ]
+      : renderableGraphs.length === 0
+        ? []
+        : renderableGraphs.map((graph, graphIndex) => {
+            const isTimelineMode = Boolean(
+              selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0,
+            );
+            const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode;
+            return (
+              <section key={graphIndex} className="pt-8 md:pt-0">
+                <figure data-testid="chart-figure" className="relative rounded-lg">
+                  <ChartButtons
+                    chartId={`chart-${graphIndex}`}
+                    analyticsPrefix={
                       isTimelineMode
-                        ? activeDates.has(`${d.date}_${d.hwKey}`)
-                        : activeHwTypes.has(d.hwKey as string) &&
-                          selectedPrecisions.includes(d.precision),
-                    );
-                    const { headers, rows } = inferenceChartToCsv(
-                      visibleData,
-                      graph.model,
-                      graph.sequence,
-                    );
-                    // Match warnings against the same series the chart annotates,
-                    // including visible unofficial-run overlay series.
-                    const overlay =
-                      graph.chartDefinition.chartType === 'e2e'
-                        ? overlayDataByChartType.e2e
-                        : overlayDataByChartType.interactivity;
-                    const visibleOverlayRows = isTimelineMode
-                      ? []
-                      : (overlay?.data ?? []).filter(
-                          (p) =>
-                            activeOverlayHwTypes.has(p.hwKey as string) &&
-                            selectedPrecisions.includes(p.precision),
-                        );
-                    const issueNotes = matchKnownConfigIssues(graph.model, [
-                      ...visibleData,
-                      ...visibleOverlayRows,
-                    ]).map((issue) =>
-                      knownIssueCsvNote(issue, getDisplayLabel(getHardwareConfig(issue.hwKey))),
-                    );
-                    exportToCsv(
-                      `InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`,
-                      headers,
-                      rows,
-                      issueNotes,
-                    );
-                  }}
-                />
-                <Card>
-                  {(() => {
-                    const chartCaption = (
-                      <>
-                        <h2 className="text-lg font-semibold">
-                          {
-                            graph.chartDefinition[
-                              `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
-                            ]
-                          }{' '}
-                          {(() => {
-                            // For Input metrics with dynamic x-axis, use dynamic heading
-                            const metricTitle =
-                              (graph.chartDefinition[
-                                `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
-                              ] as string) || '';
-                            const isInputMetric = metricTitle.toLowerCase().includes('input');
-                            if (
-                              graph.chartDefinition.chartType === 'interactivity' &&
-                              isInputMetric &&
-                              selectedXAxisMetric
-                            ) {
-                              if (selectedXAxisMetric === 'p99_ttft') {
-                                return 'vs. P99 Time To First Token';
-                              } else if (selectedXAxisMetric === 'median_ttft') {
-                                return 'vs. Median Time To First Token';
-                              }
-                            }
-
-                            // The e2e chart heading follows the branch-level x-axis mode
-                            // selector, including agentic-only derived metrics.
-                            if (graph.chartDefinition.chartType === 'e2e') {
-                              if (selectedXAxisMode === 'session-time') {
-                                return 'vs. Mean Normalized Session Time';
-                              }
-                              if (selectedXAxisMode === 'prefill-tps') {
-                                return 'vs. P90 Prefill TPS / user';
-                              }
-                              const isAgentic = sequenceKind(selectedSequence) === 'agentic';
-                              if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
-                                const percentile = selectedE2eXAxisMetric.replace(/_ttft$/u, '');
-                                const word =
-                                  percentile === 'median' ? 'Median' : percentile.toUpperCase();
-                                return `vs. ${word} Time To First Token`;
-                              }
-                              return isAgentic
-                                ? `vs. ${selectedPercentile.toUpperCase()} End-to-end Latency`
-                                : 'vs. End-to-end Latency';
-                            }
-
-                            // Fall back to configured heading
-                            return (
-                              graph.chartDefinition[
-                                `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition
-                              ] || graph.chartDefinition.heading
-                            );
-                          })()}
-                        </h2>
-                        <p className="text-sm text-muted-foreground mb-2">
-                          {getModelLabel(graph.model as Model)} •{' '}
-                          {selectedPrecisions
-                            .map((prec) => getPrecisionLabel(prec as Precision))
-                            .join(', ')}{' '}
-                          • {getSequenceLabel(graph.sequence as Sequence)} •{' '}
-                          {isUnofficialRun
-                            ? 'Source: UNOFFICIAL'
-                            : 'Source: SemiAnalysis InferenceX™'}
-                          {selectedRunDate && (
-                            <>
-                              {' '}
-                              • Updated:{' '}
-                              {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString(
-                                'en-US',
-                                {
-                                  year: 'numeric',
-                                  month: '2-digit',
-                                  day: '2-digit',
-                                  timeZone: 'UTC',
-                                },
-                              )}
-                            </>
-                          )}
-                        </p>
-                        <MetricAssumptionNotes selectedYAxisMetric={selectedYAxisMetric} />
-                        <UnofficialDomainNotice />
-                      </>
-                    );
-
-                    if (getViewMode(graphIndex) === 'table') {
+                        ? 'gpu_timeseries'
+                        : graph.chartDefinition.chartType === 'e2e'
+                          ? 'latency'
+                          : 'interactivity'
+                    }
+                    leadingControls={
+                      <SegmentedToggle
+                        value={getViewMode(graphIndex)}
+                        options={VIEW_MODE_OPTIONS}
+                        onValueChange={(v) => handleViewModeChange(graphIndex, v)}
+                        ariaLabel="View mode"
+                        testId={`inference-view-toggle-${graphIndex}`}
+                      />
+                    }
+                    hideImageExport={getViewMode(graphIndex) === 'table'}
+                    setIsLegendExpanded={setIsLegendExpanded}
+                    exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`}
+                    onExportMp4={
+                      replayAvailable
+                        ? () => replayHandlesRef.current[graphIndex]?.open()
+                        : undefined
+                    }
+                    onExportCsv={() => {
+                      const visibleData = graph.data.filter((d) =>
+                        isTimelineMode
+                          ? activeDates.has(`${d.date}_${d.hwKey}`)
+                          : activeHwTypes.has(d.hwKey as string) &&
+                            selectedPrecisions.includes(d.precision),
+                      );
+                      const { headers, rows } = inferenceChartToCsv(
+                        visibleData,
+                        graph.model,
+                        graph.sequence,
+                      );
+                      // Match warnings against the same series the chart annotates,
+                      // including visible unofficial-run overlay series.
                       const overlay =
                         graph.chartDefinition.chartType === 'e2e'
                           ? overlayDataByChartType.e2e
                           : overlayDataByChartType.interactivity;
-                      const overlayRows = (overlay?.data ?? []).filter((p) =>
-                        selectedPrecisions.includes(p.precision),
+                      const visibleOverlayRows = isTimelineMode
+                        ? []
+                        : (overlay?.data ?? []).filter(
+                            (p) =>
+                              activeOverlayHwTypes.has(p.hwKey as string) &&
+                              selectedPrecisions.includes(p.precision),
+                          );
+                      const issueNotes = matchKnownConfigIssues(graph.model, [
+                        ...visibleData,
+                        ...visibleOverlayRows,
+                      ]).map((issue) =>
+                        knownIssueCsvNote(issue, getDisplayLabel(getHardwareConfig(issue.hwKey))),
                       );
-                      return (
+                      exportToCsv(
+                        `InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`,
+                        headers,
+                        rows,
+                        issueNotes,
+                      );
+                    }}
+                  />
+                  <Card>
+                    {(() => {
+                      const chartCaption = (
                         <>
-                          {chartCaption}
-                          <InferenceTable
-                            data={
-                              overlayRows.length > 0 ? [...graph.data, ...overlayRows] : graph.data
-                            }
-                            chartDefinition={graph.chartDefinition}
-                            selectedYAxisMetric={selectedYAxisMetric}
-                          />
+                          <h2 className="text-lg font-semibold">
+                            {
+                              graph.chartDefinition[
+                                `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
+                              ]
+                            }{' '}
+                            {(() => {
+                              // For Input metrics with dynamic x-axis, use dynamic heading
+                              const metricTitle =
+                                (graph.chartDefinition[
+                                  `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition
+                                ] as string) || '';
+                              const isInputMetric = metricTitle.toLowerCase().includes('input');
+                              if (
+                                graph.chartDefinition.chartType === 'interactivity' &&
+                                isInputMetric &&
+                                selectedXAxisMetric
+                              ) {
+                                if (selectedXAxisMetric === 'p99_ttft') {
+                                  return 'vs. P99 Time To First Token';
+                                } else if (selectedXAxisMetric === 'median_ttft') {
+                                  return 'vs. Median Time To First Token';
+                                }
+                              }
+
+                              // The e2e chart heading follows the branch-level x-axis mode
+                              // selector, including agentic-only derived metrics.
+                              if (graph.chartDefinition.chartType === 'e2e') {
+                                if (selectedXAxisMode === 'session-time') {
+                                  return 'vs. Mean Normalized Session Time';
+                                }
+                                if (selectedXAxisMode === 'prefill-tps') {
+                                  return 'vs. P90 Prefill TPS / user';
+                                }
+                                const isAgentic = sequenceKind(selectedSequence) === 'agentic';
+                                if (selectedE2eXAxisMetric?.endsWith('_ttft')) {
+                                  const percentile = selectedE2eXAxisMetric.replace(/_ttft$/u, '');
+                                  const word =
+                                    percentile === 'median' ? 'Median' : percentile.toUpperCase();
+                                  return `vs. ${word} Time To First Token`;
+                                }
+                                return isAgentic
+                                  ? `vs. ${selectedPercentile.toUpperCase()} End-to-end Latency`
+                                  : 'vs. End-to-end Latency';
+                              }
+
+                              // Fall back to configured heading
+                              return (
+                                graph.chartDefinition[
+                                  `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition
+                                ] || graph.chartDefinition.heading
+                              );
+                            })()}
+                          </h2>
+                          <p className="text-sm text-muted-foreground mb-2">
+                            {getModelLabel(graph.model as Model)} •{' '}
+                            {selectedPrecisions
+                              .map((prec) => getPrecisionLabel(prec as Precision))
+                              .join(', ')}{' '}
+                            • {getSequenceLabel(graph.sequence as Sequence)} •{' '}
+                            {isUnofficialRun
+                              ? 'Source: UNOFFICIAL'
+                              : 'Source: SemiAnalysis InferenceX™'}
+                            {selectedRunDate && (
+                              <>
+                                {' '}
+                                • Updated:{' '}
+                                {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString(
+                                  'en-US',
+                                  {
+                                    year: 'numeric',
+                                    month: '2-digit',
+                                    day: '2-digit',
+                                    timeZone: 'UTC',
+                                  },
+                                )}
+                              </>
+                            )}
+                          </p>
+                          <MetricAssumptionNotes selectedYAxisMetric={selectedYAxisMetric} />
+                          <UnofficialDomainNotice />
                         </>
                       );
-                    }
 
-                    return selectedGPUs.length > 0 &&
-                      ((selectedDateRange.startDate && selectedDateRange.endDate) ||
-                        selectedDates.length > 0) ? (
-                      <GPUGraph
-                        chartId={`chart-${graphIndex}`}
-                        modelLabel={graph.model}
-                        data={graph.data}
-                        xLabel={graph.chartDefinition.x_label}
-                        yLabel={`${
-                          graph.chartDefinition[
-                            `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
-                          ]
-                        }`}
-                        chartDefinition={graph.chartDefinition}
-                        caption={chartCaption}
-                        runNumbering={runNumbering}
-                      />
-                    ) : (
-                      <div className="relative">
-                        <ScatterGraph
+                      if (getViewMode(graphIndex) === 'table') {
+                        const overlay =
+                          graph.chartDefinition.chartType === 'e2e'
+                            ? overlayDataByChartType.e2e
+                            : overlayDataByChartType.interactivity;
+                        const overlayRows = (overlay?.data ?? []).filter((p) =>
+                          selectedPrecisions.includes(p.precision),
+                        );
+                        return (
+                          <>
+                            {chartCaption}
+                            <InferenceTable
+                              data={
+                                overlayRows.length > 0
+                                  ? [...graph.data, ...overlayRows]
+                                  : graph.data
+                              }
+                              chartDefinition={graph.chartDefinition}
+                              selectedYAxisMetric={selectedYAxisMetric}
+                            />
+                          </>
+                        );
+                      }
+
+                      return selectedGPUs.length > 0 &&
+                        ((selectedDateRange.startDate && selectedDateRange.endDate) ||
+                          selectedDates.length > 0) ? (
+                        <GPUGraph
                           chartId={`chart-${graphIndex}`}
                           modelLabel={graph.model}
                           data={graph.data}
@@ -651,44 +640,60 @@ export default function ChartDisplay() {
                           }`}
                           chartDefinition={graph.chartDefinition}
                           caption={chartCaption}
-                          overlayData={
-                            graph.chartDefinition.chartType === 'e2e'
-                              ? (overlayDataByChartType.e2e ?? undefined)
-                              : (overlayDataByChartType.interactivity ?? undefined)
-                          }
+                          runNumbering={runNumbering}
                         />
-                        {selectedGPUs.length > 0 &&
-                          (!selectedDateRange.startDate || !selectedDateRange.endDate) &&
-                          selectedDates.length === 0 && (
-                            <div className="absolute inset-0 flex items-center justify-center bg-background/60 backdrop-blur-[2px] rounded-lg z-10">
-                              <p className="text-sm font-medium text-muted-foreground bg-background/90 border border-border rounded-md px-4 py-2 shadow-sm">
-                                Select a date range or add a run to view GPU comparison
-                              </p>
-                            </div>
-                          )}
-                      </div>
-                    );
-                  })()}
-                  {replayAvailable && (
-                    <ReplayLauncher
-                      ref={(handle) => {
-                        replayHandlesRef.current[graphIndex] = handle;
-                      }}
-                      parentChartId={`chart-${graphIndex}`}
-                      chartDefinition={graph.chartDefinition}
-                      yLabel={`${
-                        graph.chartDefinition[
-                          `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
-                        ]
-                      }`}
-                      xLabel={graph.chartDefinition.x_label}
-                    />
-                  )}
-                </Card>
-              </figure>
-            </section>
-          );
-        });
+                      ) : (
+                        <div className="relative">
+                          <ScatterGraph
+                            chartId={`chart-${graphIndex}`}
+                            modelLabel={graph.model}
+                            data={graph.data}
+                            xLabel={graph.chartDefinition.x_label}
+                            yLabel={`${
+                              graph.chartDefinition[
+                                `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
+                              ]
+                            }`}
+                            chartDefinition={graph.chartDefinition}
+                            caption={chartCaption}
+                            overlayData={
+                              graph.chartDefinition.chartType === 'e2e'
+                                ? (overlayDataByChartType.e2e ?? undefined)
+                                : (overlayDataByChartType.interactivity ?? undefined)
+                            }
+                          />
+                          {selectedGPUs.length > 0 &&
+                            (!selectedDateRange.startDate || !selectedDateRange.endDate) &&
+                            selectedDates.length === 0 && (
+                              <div className="absolute inset-0 flex items-center justify-center bg-background/60 backdrop-blur-[2px] rounded-lg z-10">
+                                <p className="text-sm font-medium text-muted-foreground bg-background/90 border border-border rounded-md px-4 py-2 shadow-sm">
+                                  Select a date range or add a run to view GPU comparison
+                                </p>
+                              </div>
+                            )}
+                        </div>
+                      );
+                    })()}
+                    {replayAvailable && (
+                      <ReplayLauncher
+                        ref={(handle) => {
+                          replayHandlesRef.current[graphIndex] = handle;
+                        }}
+                        parentChartId={`chart-${graphIndex}`}
+                        chartDefinition={graph.chartDefinition}
+                        yLabel={`${
+                          graph.chartDefinition[
+                            `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition
+                          ]
+                        }`}
+                        xLabel={graph.chartDefinition.x_label}
+                      />
+                    )}
+                  </Card>
+                </figure>
+              </section>
+            );
+          });
 
   return (
     <div data-testid="inference-chart-display" className="flex flex-col gap-4">
diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 982c24d2..e1cad1a4 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -7,7 +7,6 @@ import React, { useCallback, useEffect, useMemo, useRef } from 'react';
 import { GRADIENT_NUDGE_EVENT } from '@/lib/nudges/registry';
 import { useInference } from '@/components/inference/InferenceContext';
 import { useTraceAvailability } from '@/hooks/api/use-trace-availability';
-import { useRouter } from 'next/navigation';
 import { pointNearestX } from '@/components/inference/ui/line-label-anchor';
 import ChartLegend from '@/components/ui/chart-legend';
 import { useUnofficialRun } from '@/components/unofficial-run-provider';
@@ -582,7 +581,6 @@ const ScatterGraph = React.memo(
       return ids;
     }, [pointsData]);
     const { data: traceAvailability } = useTraceAvailability(agenticIds);
-    const router = useRouter();
 
     // Gradient label data
     const allPointLabelsByKey = useMemo(() => {
@@ -902,7 +900,7 @@ const ScatterGraph = React.memo(
               });
               chartRef.current?.dismissTooltip();
               chartRef.current?.hideTooltip();
-              router.push(`/inference/agentic/${pointId}`);
+              window.location.assign(`/inference/agentic/${pointId}`);
             });
           }
         },
@@ -923,7 +921,6 @@ const ScatterGraph = React.memo(
         // presence fetch resolves so the button appears for points that
         // have a trace_replay blob.
         traceAvailability,
-        router,
       ],
     );
 
diff --git a/packages/app/src/components/unofficial-run-provider.tsx b/packages/app/src/components/unofficial-run-provider.tsx
index b8e76f38..54b470ff 100644
--- a/packages/app/src/components/unofficial-run-provider.tsx
+++ b/packages/app/src/components/unofficial-run-provider.tsx
@@ -279,11 +279,11 @@ export function UnofficialRunProvider({ children }: { children: ReactNode }) {
       // Filter chart data by stamped `run_url`. A row belongs to the dismissed
       // run if its URL matches exactly OR the numeric id parses to the same.
       const belongsToDismissed = (rowUrl?: string | null) => {
-          if (!rowUrl) return false;
-          if (rowUrl === target.url) return true;
-          const m = rowUrl.match(/\/runs\/(?<runId>\d+)/u);
-          return m?.groups?.runId === runId;
-        };
+        if (!rowUrl) return false;
+        if (rowUrl === target.url) return true;
+        const m = rowUrl.match(/\/runs\/(?<runId>\d+)/u);
+        return m?.groups?.runId === runId;
+      };
 
       // Compute the filtered chart data BEFORE any setState so we can pass the
       // same value to setUnofficialChartData and parseAvailableModelsAndSequences.
diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts
index 0dac5883..a9d66715 100644
--- a/packages/app/src/lib/api.ts
+++ b/packages/app/src/lib/api.ts
@@ -8,6 +8,8 @@ import type { WorkerPower } from '@/components/inference/types';
 import type { SubmissionsResponse } from './submissions-types';
 
 export interface BenchmarkRow {
+  /** Stable per-point id from benchmark_results; used for agentic detail lookups. */
+  id: number;
   hardware: string;
   framework: string;
   model: string;
@@ -25,9 +27,13 @@ export interface BenchmarkRow {
   decode_num_workers: number;
   num_prefill_gpu: number;
   num_decode_gpu: number;
-  isl: number;
-  osl: number;
+  benchmark_type: string;
+  // Null for agentic_traces rows; numeric for single_turn fixed-seq rows.
+  isl: number | null;
+  osl: number | null;
   conc: number;
+  /** KV-cache offload mode. Defaults to 'off' for fixed-sequence rows. */
+  offload_mode: string;
   image: string | null;
   metrics: Record<string, number>;
   /**
@@ -176,13 +182,14 @@ export function fetchWorkflowInfo(date: string, signal?: AbortSignal) {
 
 export interface AvailabilityRow {
   model: string;
-  isl: number;
-  osl: number;
+  isl: number | null;
+  osl: number | null;
   precision: string;
   hardware: string;
   framework: string;
   spec_method: string;
   disagg: boolean;
+  benchmark_type: string;
   date: string;
 }
 
diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts
index 49c60604..6833756a 100644
--- a/packages/db/src/queries/benchmarks.ts
+++ b/packages/db/src/queries/benchmarks.ts
@@ -11,6 +11,8 @@ import type { WorkerPower } from '../etl/benchmark-mapper.js';
 export type BenchmarkWorkerRow = WorkerPower;
 
 export interface BenchmarkRow {
+  /** Stable benchmark_results id used for agentic detail lookups. */
+  id: number;
   hardware: string;
   framework: string;
   model: string;
@@ -28,9 +30,11 @@ export interface BenchmarkRow {
   decode_num_workers: number;
   num_prefill_gpu: number;
   num_decode_gpu: number;
-  isl: number;
-  osl: number;
+  benchmark_type: string;
+  isl: number | null;
+  osl: number | null;
   conc: number;
+  offload_mode: string;
   image: string | null;
   metrics: Record<string, number>;
   /**
@@ -95,6 +99,7 @@ export async function getLatestBenchmarks(
         : sql``;
     const rows = await sql`
       SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl)
+        br.id,
         c.hardware,
         c.framework,
         c.model,
@@ -112,6 +117,8 @@ export async function getLatestBenchmarks(
         c.decode_num_workers,
         c.num_prefill_gpu,
         c.num_decode_gpu,
+        br.benchmark_type,
+        br.offload_mode,
         br.isl,
         br.osl,
         br.conc,
@@ -136,6 +143,7 @@ export async function getLatestBenchmarks(
   // No date filter: use materialized view for instant lookups
   const rows = await sql`
     SELECT
+      lb.id,
       c.hardware,
       c.framework,
       c.model,
@@ -153,6 +161,8 @@ export async function getLatestBenchmarks(
       c.decode_num_workers,
       c.num_prefill_gpu,
       c.num_decode_gpu,
+      lb.benchmark_type,
+      lb.offload_mode,
       lb.isl,
       lb.osl,
       lb.conc,
@@ -185,6 +195,7 @@ export async function getBenchmarksForRun(
   const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey];
   const rows = await sql`
     SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl)
+      br.id,
       c.hardware,
       c.framework,
       c.model,
@@ -202,6 +213,8 @@ export async function getBenchmarksForRun(
       c.decode_num_workers,
       c.num_prefill_gpu,
       c.num_decode_gpu,
+      br.benchmark_type,
+      br.offload_mode,
       br.isl,
       br.osl,
       br.conc,
@@ -235,6 +248,7 @@ export async function getAllBenchmarksForHistory(
   const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey];
   const rows = await sql`
     SELECT
+      br.id,
       c.hardware,
       c.framework,
       c.model,
@@ -252,9 +266,12 @@ export async function getAllBenchmarksForHistory(
       c.decode_num_workers,
       c.num_prefill_gpu,
       c.num_decode_gpu,
+      br.benchmark_type,
+      br.offload_mode,
       br.isl,
       br.osl,
       br.conc,
+      br.image,
       br.metrics - '{std_ttft,std_tpot,std_e2el,std_intvty,std_itl,mean_ttft,mean_tpot,mean_e2el,mean_intvty,mean_itl}'::text[] as metrics,
       br.workers,
       br.date::text,

From f60ef9c7f18a1782edd5542510328b242048a2de Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 17 Jun 2026 11:34:00 -0500
Subject: [PATCH 66/96] fix(gpu-compare): show concurrency (C=) over points

GPU compare mode (GPUGraph) labeled points with only the parallelism/tp
string, dropping the C=<conc> suffix that the single-run scatter chart
(ScatterGraph) shows. Append it so compare-mode points are annotated the
same way.

Verified live in compare mode: points now read e.g. 'DEP8 / C=2048',
'TP4 / C=64'.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/app/src/components/inference/ui/GPUGraph.tsx | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/packages/app/src/components/inference/ui/GPUGraph.tsx b/packages/app/src/components/inference/ui/GPUGraph.tsx
index e7737a2e..24b1266f 100644
--- a/packages/app/src/components/inference/ui/GPUGraph.tsx
+++ b/packages/app/src/components/inference/ui/GPUGraph.tsx
@@ -759,7 +759,11 @@ const GPUGraph = React.memo(
             config: {
               getColor,
               hideLabels: hidePointLabels,
-              getLabelText: (d) => (useAdvancedLabels ? getPointLabel(d) : String(d.tp)),
+              // Match ScatterGraph: append the concurrency (C=) to the
+              // parallelism/tp label so compare-mode points are annotated the
+              // same way as the single-run scatter chart.
+              getLabelText: (d) =>
+                useAdvancedLabels ? `${getPointLabel(d)}\nC=${d.conc}` : `${d.tp}\nC=${d.conc}`,
               foreground: 'var(--foreground)',
               dataAttrs: {
                 series: (d) => `${d.date}_${d.hwKey}`,

From 22028ccfe3141aa632b4c23aaca26b9c4bd51b58 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 17 Jun 2026 11:43:42 -0500
Subject: [PATCH 67/96] fix(agentic-timeline): hide no-op phase toggle;
 fixed-height scroll window
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two fixes to the conversation/request-timeline view:

1. The Profiling vs 'All (incl. warmup)' toggle never did anything —
   aiperf's profile_export only contains profiling-phase requests, so
   every stored record has phase='profiling' (verified: 297k/297k rows).
   Hide the toggle unless a non-profiling request actually exists, so it
   reappears and works only if warmup is ever exported.

2. The timeline grew to fit every conversation/worker, making the card
   arbitrarily tall. Cap the body at a maximum height (480px) and scroll
   the rows vertically inside it once they exceed that. Runs with few
   rows still size to their content (no empty space); the label column
   and bars scroll together since they share the one scroll container.

Verified live on a 3475-request point: phase toggle absent, row-mode
toggle still present, window clientHeight 480 with ~3745px scrolling
inside.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../agentic-point/request-timeline.tsx        | 474 +++++++++---------
 1 file changed, 249 insertions(+), 225 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
index 3c032fdd..2313775e 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -30,6 +30,11 @@ const PHASE_OPTIONS: SegmentedToggleOption<PhaseFilter>[] = [
   { value: 'all', label: 'All (incl. warmup)', testId: 'timeline-phase-all' },
 ];
 
+// The timeline body is capped at this height and scrolls internally, so a run
+// with many conversations/workers doesn't make the card grow unbounded and push
+// the rest of the detail page down. Sized to show ~16 rows + the header.
+const TIMELINE_BODY_MAX_HEIGHT = 480;
+
 /** A stable color palette indexed by row-key hash. */
 const ROW_COLORS = [
   '#3b82f6',
@@ -393,11 +398,24 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
   }, []);
   const dragRef = useRef<{ startX: number; vs: number; ve: number } | null>(null);
 
-  // Apply phase filter, then group into rows.
+  // The phase toggle only means something when warmup requests are actually
+  // present. aiperf's profile_export only contains profiling-phase requests, so
+  // in practice every record is `profiling` and the toggle is a no-op — hide it
+  // unless a non-profiling request exists (keeps it working if warmup is ever
+  // exported).
+  const hasWarmup = useMemo(
+    () => data.requests.some((r) => r.phase !== 'profiling'),
+    [data.requests],
+  );
+
+  // Apply phase filter, then group into rows. With no warmup data the filter
+  // collapses to "profiling" regardless of the (hidden) toggle state.
   const filtered = useMemo(
     () =>
-      phaseFilter === 'all' ? data.requests : data.requests.filter((r) => r.phase === 'profiling'),
-    [data.requests, phaseFilter],
+      phaseFilter === 'all' && hasWarmup
+        ? data.requests
+        : data.requests.filter((r) => r.phase === 'profiling'),
+    [data.requests, phaseFilter, hasWarmup],
   );
   const rows = useMemo(
     () => buildRows(filtered, rowMode, expandedSubagents),
@@ -581,14 +599,16 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
           testId="timeline-row-mode"
           buttonClassName="px-2.5 py-1 text-xs"
         />
-        <SegmentedToggle
-          value={phaseFilter}
-          options={PHASE_OPTIONS}
-          onValueChange={setPhaseFilter}
-          ariaLabel="Phase filter"
-          testId="timeline-phase-filter"
-          buttonClassName="px-2.5 py-1 text-xs"
-        />
+        {hasWarmup && (
+          <SegmentedToggle
+            value={phaseFilter}
+            options={PHASE_OPTIONS}
+            onValueChange={setPhaseFilter}
+            ariaLabel="Phase filter"
+            testId="timeline-phase-filter"
+            buttonClassName="px-2.5 py-1 text-xs"
+          />
+        )}
         <span className="ml-auto text-xs text-muted-foreground">
           {totalRequests} request{totalRequests === 1 ? '' : 's'} · {rows.length}{' '}
           {rowMode === 'conversation' ? 'conversations' : 'workers'} · span{' '}
@@ -606,243 +626,247 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
 
       {/* Chart container */}
       <div className="rounded-md border border-border/60 bg-card overflow-hidden">
-        <div className="flex">
-          {/* Label column — sticky, doesn't scroll horizontally with the chart. */}
-          <div
-            className="flex-shrink-0 border-r border-border/60 bg-card/80"
-            style={{ width: LABEL_WIDTH }}
-          >
+        {/* Fixed-height window: the rows scroll vertically inside it instead of
+            the card growing to fit every conversation/worker. */}
+        <div className="overflow-y-auto" style={{ maxHeight: TIMELINE_BODY_MAX_HEIGHT }}>
+          <div className="flex">
+            {/* Label column — sticky, doesn't scroll horizontally with the chart. */}
             <div
-              className="border-b border-border/60 flex items-end px-2 pb-1"
-              style={{ height: HEADER_HEIGHT }}
+              className="flex-shrink-0 border-r border-border/60 bg-card/80"
+              style={{ width: LABEL_WIDTH }}
             >
-              <span className="text-[9px] font-mono font-bold uppercase tracking-[0.15em] text-muted-foreground">
-                {rowMode === 'conversation' ? 'Conversation' : 'Worker'}
-              </span>
-            </div>
-            {rows.map((row) => {
-              const isSubagentRow = row.kind === 'subagent';
-              const isStreamRow = row.kind === 'stream';
-              const isExpandable = isSubagentRow && (row.streamCount ?? 1) > 1;
-              const isExpanded = isExpandable && expandedSubagents.has(row.key);
-              return (
-                <div
-                  key={row.key}
-                  className="flex items-center gap-1 overflow-hidden pr-2"
-                  style={{
-                    height: ROW_HEIGHT + ROW_GAP,
-                    paddingLeft: 4 + row.depth * 10,
-                  }}
-                >
-                  {isExpandable ? (
-                    <button
-                      type="button"
-                      onClick={() => toggleSubagent(row.key)}
-                      className="size-3.5 flex items-center justify-center text-muted-foreground hover:text-foreground shrink-0"
-                      aria-label={isExpanded ? 'Collapse streams' : 'Expand streams'}
-                      title={isExpanded ? 'Collapse streams' : 'Expand streams'}
-                    >
-                      <span className="text-[10px] leading-none">{isExpanded ? '▾' : '▸'}</span>
-                    </button>
-                  ) : (
-                    <span className="size-3.5 shrink-0" />
-                  )}
-                  <span
-                    className="inline-block w-1 h-3 rounded-sm flex-shrink-0"
-                    style={{
-                      backgroundColor: row.color,
-                      opacity: isStreamRow ? 0.4 : isSubagentRow ? 0.55 : 1,
-                    }}
-                  />
-                  <span
-                    className="text-[10px] font-mono truncate"
+              <div
+                className="border-b border-border/60 flex items-end px-2 pb-1"
+                style={{ height: HEADER_HEIGHT }}
+              >
+                <span className="text-[9px] font-mono font-bold uppercase tracking-[0.15em] text-muted-foreground">
+                  {rowMode === 'conversation' ? 'Conversation' : 'Worker'}
+                </span>
+              </div>
+              {rows.map((row) => {
+                const isSubagentRow = row.kind === 'subagent';
+                const isStreamRow = row.kind === 'stream';
+                const isExpandable = isSubagentRow && (row.streamCount ?? 1) > 1;
+                const isExpanded = isExpandable && expandedSubagents.has(row.key);
+                return (
+                  <div
+                    key={row.key}
+                    className="flex items-center gap-1 overflow-hidden pr-2"
                     style={{
-                      color: row.color,
-                      opacity: isStreamRow ? 0.7 : isSubagentRow ? 0.85 : 1,
+                      height: ROW_HEIGHT + ROW_GAP,
+                      paddingLeft: 4 + row.depth * 10,
                     }}
                   >
-                    {row.label}
-                    {isExpandable && (
-                      <span className="text-muted-foreground ml-1">×{row.streamCount}</span>
+                    {isExpandable ? (
+                      <button
+                        type="button"
+                        onClick={() => toggleSubagent(row.key)}
+                        className="size-3.5 flex items-center justify-center text-muted-foreground hover:text-foreground shrink-0"
+                        aria-label={isExpanded ? 'Collapse streams' : 'Expand streams'}
+                        title={isExpanded ? 'Collapse streams' : 'Expand streams'}
+                      >
+                        <span className="text-[10px] leading-none">{isExpanded ? '▾' : '▸'}</span>
+                      </button>
+                    ) : (
+                      <span className="size-3.5 shrink-0" />
                     )}
-                  </span>
-                  <span className="text-[9px] font-mono text-muted-foreground ml-auto shrink-0">
-                    {row.requests.length > 0 ? row.requests.length : '—'}
-                  </span>
-                </div>
-              );
-            })}
-          </div>
-
-          {/* Scrollable SVG */}
-          <div className="flex-1 overflow-x-auto">
-            <svg
-              width={chartWidth}
-              height={svgHeight}
-              className="block"
-              style={{ cursor: isZoomed ? 'grab' : 'crosshair' }}
-              onWheel={handleWheel}
-              onMouseDown={handleMouseDown}
-              onMouseMove={handleMouseMove}
-              onMouseUp={handleMouseUp}
-              onMouseLeave={handleMouseLeave}
-            >
-              {/* Header / time-axis baseline */}
-              <line
-                x1={0}
-                y1={HEADER_HEIGHT}
-                x2={chartWidth}
-                y2={HEADER_HEIGHT}
-                stroke="currentColor"
-                opacity={0.15}
-              />
-
-              {/* Time axis ticks */}
-              {ticks.map((t) => {
-                // Convert visible-window ns offset → x px (the tick array
-                // is already in dataStart-relative coords).
-                const x = (t - vStart) * scale;
-                return (
-                  <g key={t}>
-                    <line
-                      x1={x}
-                      y1={HEADER_HEIGHT}
-                      x2={x}
-                      y2={svgHeight}
-                      stroke="currentColor"
-                      opacity={0.08}
-                      strokeDasharray="2 4"
+                    <span
+                      className="inline-block w-1 h-3 rounded-sm flex-shrink-0"
+                      style={{
+                        backgroundColor: row.color,
+                        opacity: isStreamRow ? 0.4 : isSubagentRow ? 0.55 : 1,
+                      }}
                     />
-                    <text
-                      x={x + 2}
-                      y={HEADER_HEIGHT - 6}
-                      fill="currentColor"
-                      opacity={0.55}
-                      fontSize={9}
-                      fontFamily="ui-monospace, SFMono-Regular, monospace"
+                    <span
+                      className="text-[10px] font-mono truncate"
+                      style={{
+                        color: row.color,
+                        opacity: isStreamRow ? 0.7 : isSubagentRow ? 0.85 : 1,
+                      }}
                     >
-                      {formatTickLabel(t)}
-                    </text>
-                  </g>
+                      {row.label}
+                      {isExpandable && (
+                        <span className="text-muted-foreground ml-1">×{row.streamCount}</span>
+                      )}
+                    </span>
+                    <span className="text-[9px] font-mono text-muted-foreground ml-auto shrink-0">
+                      {row.requests.length > 0 ? row.requests.length : '—'}
+                    </span>
+                  </div>
                 );
               })}
+            </div>
 
-              {/* Row separators */}
-              {rows.map((row, idx) => (
+            {/* Scrollable SVG */}
+            <div className="flex-1 overflow-x-auto">
+              <svg
+                width={chartWidth}
+                height={svgHeight}
+                className="block"
+                style={{ cursor: isZoomed ? 'grab' : 'crosshair' }}
+                onWheel={handleWheel}
+                onMouseDown={handleMouseDown}
+                onMouseMove={handleMouseMove}
+                onMouseUp={handleMouseUp}
+                onMouseLeave={handleMouseLeave}
+              >
+                {/* Header / time-axis baseline */}
                 <line
-                  key={`sep-${row.key}`}
                   x1={0}
-                  y1={HEADER_HEIGHT + idx * (ROW_HEIGHT + ROW_GAP)}
+                  y1={HEADER_HEIGHT}
                   x2={chartWidth}
-                  y2={HEADER_HEIGHT + idx * (ROW_HEIGHT + ROW_GAP)}
+                  y2={HEADER_HEIGHT}
                   stroke="currentColor"
-                  opacity={0.04}
+                  opacity={0.15}
                 />
-              ))}
-
-              {/* Request bars */}
-              {rows.map((row, rowIdx) => {
-                const yTop = HEADER_HEIGHT + rowIdx * (ROW_HEIGHT + ROW_GAP) + 2;
-                const barH = ROW_HEIGHT - 4;
-                // For multi-stream subagent containers, suppress the union
-                // bars when expanded — the child stream rows draw them
-                // individually instead, so we'd double-draw otherwise.
-                if (
-                  row.kind === 'subagent' &&
-                  (row.streamCount ?? 1) > 1 &&
-                  expandedSubagents.has(row.key)
-                ) {
-                  return null;
-                }
-                return row.requests.map((req) => {
-                  const xCredit = xOf(req.credit);
-                  const xStart = xOf(req.start);
-                  const xEnd = xOf(req.end);
-                  // Cull bars entirely outside the visible window so big
-                  // benchmarks don't render thousands of zero-width rects.
-                  if (xEnd < -2 || xCredit > chartWidth + 2) return null;
-                  const runW = Math.max(xEnd - xStart, 1);
-                  const queueW = Math.max(xStart - xCredit, 0);
-                  const phaseColor = PHASE_COLORS[req.phase] ?? PHASE_COLORS.unknown!;
+
+                {/* Time axis ticks */}
+                {ticks.map((t) => {
+                  // Convert visible-window ns offset → x px (the tick array
+                  // is already in dataStart-relative coords).
+                  const x = (t - vStart) * scale;
                   return (
-                    <g
-                      key={`${req.cid}-${req.ti}-${req.start}`}
-                      onMouseMove={(e) => setTooltip({ x: e.clientX, y: e.clientY, row, req })}
-                      onMouseLeave={() => setTooltip(null)}
-                    >
-                      {/* Queue lead-in (faint) — only drawn when noticeable. */}
-                      {queueW >= 1 && (
+                    <g key={t}>
+                      <line
+                        x1={x}
+                        y1={HEADER_HEIGHT}
+                        x2={x}
+                        y2={svgHeight}
+                        stroke="currentColor"
+                        opacity={0.08}
+                        strokeDasharray="2 4"
+                      />
+                      <text
+                        x={x + 2}
+                        y={HEADER_HEIGHT - 6}
+                        fill="currentColor"
+                        opacity={0.55}
+                        fontSize={9}
+                        fontFamily="ui-monospace, SFMono-Regular, monospace"
+                      >
+                        {formatTickLabel(t)}
+                      </text>
+                    </g>
+                  );
+                })}
+
+                {/* Row separators */}
+                {rows.map((row, idx) => (
+                  <line
+                    key={`sep-${row.key}`}
+                    x1={0}
+                    y1={HEADER_HEIGHT + idx * (ROW_HEIGHT + ROW_GAP)}
+                    x2={chartWidth}
+                    y2={HEADER_HEIGHT + idx * (ROW_HEIGHT + ROW_GAP)}
+                    stroke="currentColor"
+                    opacity={0.04}
+                  />
+                ))}
+
+                {/* Request bars */}
+                {rows.map((row, rowIdx) => {
+                  const yTop = HEADER_HEIGHT + rowIdx * (ROW_HEIGHT + ROW_GAP) + 2;
+                  const barH = ROW_HEIGHT - 4;
+                  // For multi-stream subagent containers, suppress the union
+                  // bars when expanded — the child stream rows draw them
+                  // individually instead, so we'd double-draw otherwise.
+                  if (
+                    row.kind === 'subagent' &&
+                    (row.streamCount ?? 1) > 1 &&
+                    expandedSubagents.has(row.key)
+                  ) {
+                    return null;
+                  }
+                  return row.requests.map((req) => {
+                    const xCredit = xOf(req.credit);
+                    const xStart = xOf(req.start);
+                    const xEnd = xOf(req.end);
+                    // Cull bars entirely outside the visible window so big
+                    // benchmarks don't render thousands of zero-width rects.
+                    if (xEnd < -2 || xCredit > chartWidth + 2) return null;
+                    const runW = Math.max(xEnd - xStart, 1);
+                    const queueW = Math.max(xStart - xCredit, 0);
+                    const phaseColor = PHASE_COLORS[req.phase] ?? PHASE_COLORS.unknown!;
+                    return (
+                      <g
+                        key={`${req.cid}-${req.ti}-${req.start}`}
+                        onMouseMove={(e) => setTooltip({ x: e.clientX, y: e.clientY, row, req })}
+                        onMouseLeave={() => setTooltip(null)}
+                      >
+                        {/* Queue lead-in (faint) — only drawn when noticeable. */}
+                        {queueW >= 1 && (
+                          <rect
+                            x={xCredit}
+                            y={yTop + barH / 2 - 1}
+                            width={queueW}
+                            height={2}
+                            fill={row.color}
+                            opacity={0.35}
+                          />
+                        )}
+                        {/* Main bar — opacity stepped down with depth so
+                          parent > subagent > stream reads visually. */}
                         <rect
-                          x={xCredit}
-                          y={yTop + barH / 2 - 1}
-                          width={queueW}
-                          height={2}
+                          x={xStart}
+                          y={yTop}
+                          width={runW}
+                          height={barH}
+                          rx={2}
                           fill={row.color}
-                          opacity={0.35}
+                          opacity={
+                            req.cancelled
+                              ? 0.35
+                              : row.kind === 'stream'
+                                ? 0.5
+                                : row.kind === 'subagent'
+                                  ? 0.6
+                                  : 0.85
+                          }
                         />
-                      )}
-                      {/* Main bar — opacity stepped down with depth so
-                          parent > subagent > stream reads visually. */}
-                      <rect
-                        x={xStart}
-                        y={yTop}
-                        width={runW}
-                        height={barH}
-                        rx={2}
-                        fill={row.color}
-                        opacity={
-                          req.cancelled
-                            ? 0.35
-                            : row.kind === 'stream'
-                              ? 0.5
-                              : row.kind === 'subagent'
-                                ? 0.6
-                                : 0.85
-                        }
-                      />
-                      {/* Phase strip at bottom */}
-                      <rect
-                        x={xStart}
-                        y={yTop + barH - 2}
-                        width={runW}
-                        height={2}
-                        rx={1}
-                        fill={phaseColor}
-                        opacity={0.85}
-                      />
-                      {/* Cancelled X overlay */}
-                      {req.cancelled && runW > 6 && (
-                        <line
-                          x1={xStart + 1}
-                          y1={yTop + 1}
-                          x2={xStart + runW - 1}
-                          y2={yTop + barH - 1}
-                          stroke="currentColor"
-                          strokeWidth={0.7}
-                          opacity={0.6}
+                        {/* Phase strip at bottom */}
+                        <rect
+                          x={xStart}
+                          y={yTop + barH - 2}
+                          width={runW}
+                          height={2}
+                          rx={1}
+                          fill={phaseColor}
+                          opacity={0.85}
                         />
-                      )}
-                    </g>
-                  );
-                });
-              })}
-
-              {/* Cursor crosshair — drawn on top of bars so it stays visible
+                        {/* Cancelled X overlay */}
+                        {req.cancelled && runW > 6 && (
+                          <line
+                            x1={xStart + 1}
+                            y1={yTop + 1}
+                            x2={xStart + runW - 1}
+                            y2={yTop + barH - 1}
+                            stroke="currentColor"
+                            strokeWidth={0.7}
+                            opacity={0.6}
+                          />
+                        )}
+                      </g>
+                    );
+                  });
+                })}
+
+                {/* Cursor crosshair — drawn on top of bars so it stays visible
                   through dense rows. Stats popover is rendered as fixed
                   HTML below the SVG block. */}
-              {cursor && (
-                <line
-                  x1={cursor.xPx}
-                  x2={cursor.xPx}
-                  y1={0}
-                  y2={svgHeight}
-                  stroke="currentColor"
-                  strokeWidth={1}
-                  opacity={0.45}
-                  pointerEvents="none"
-                />
-              )}
-            </svg>
+                {cursor && (
+                  <line
+                    x1={cursor.xPx}
+                    x2={cursor.xPx}
+                    y1={0}
+                    y2={svgHeight}
+                    stroke="currentColor"
+                    strokeWidth={1}
+                    opacity={0.45}
+                    pointerEvents="none"
+                  />
+                )}
+              </svg>
+            </div>
           </div>
         </div>
       </div>

From 28d25a53b7e3543a3d91e9c19f05b2409c20c032 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Wed, 17 Jun 2026 11:50:26 -0500
Subject: [PATCH 68/96] feat(agentic-timeline): sticky bottom h-scroll +
 double-click to reset zoom

The fixed-height window put the chart's horizontal scrollbar at the
bottom of the tall (full-height) content, below the fold and unreachable.
Make the window itself the single scroll container (overflow-auto, both
axes) and pin the label column with position:sticky left-0, so the
horizontal scrollbar stays at the window's bottom edge while the label
column stays put during horizontal scroll and scrolls with the rows
vertically.

Also add double-click anywhere on the timeline to reset zoom/pan (same
resetZoom the existing button calls) and note it in the hint text.

Verified live: window scrollW 1280 > clientW 879 (h-scroll present and
working), label column sticky, rows scroll vertically.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../agentic-point/request-timeline.tsx        | 26 ++++++++++++-------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
index 2313775e..7c5fdab0 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -626,13 +626,16 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
 
       {/* Chart container */}
       <div className="rounded-md border border-border/60 bg-card overflow-hidden">
-        {/* Fixed-height window: the rows scroll vertically inside it instead of
-            the card growing to fit every conversation/worker. */}
-        <div className="overflow-y-auto" style={{ maxHeight: TIMELINE_BODY_MAX_HEIGHT }}>
-          <div className="flex">
-            {/* Label column — sticky, doesn't scroll horizontally with the chart. */}
+        {/* Fixed-height window: rows scroll vertically and the chart scrolls
+            horizontally inside it, so the card doesn't grow to fit every
+            conversation/worker AND the horizontal scrollbar stays pinned to the
+            window's bottom edge (rather than the bottom of the tall content). */}
+        <div className="overflow-auto" style={{ maxHeight: TIMELINE_BODY_MAX_HEIGHT }}>
+          <div className="flex w-max">
+            {/* Label column — pinned left (sticky) so it stays put during
+                horizontal scroll, while scrolling vertically with the rows. */}
             <div
-              className="flex-shrink-0 border-r border-border/60 bg-card/80"
+              className="sticky left-0 z-10 flex-shrink-0 border-r border-border/60 bg-card"
               style={{ width: LABEL_WIDTH }}
             >
               <div
@@ -697,8 +700,10 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
               })}
             </div>
 
-            {/* Scrollable SVG */}
-            <div className="flex-1 overflow-x-auto">
+            {/* Chart column — horizontal scrolling is handled by the window
+                container above so its scrollbar stays pinned to the window's
+                bottom edge; double-click anywhere resets the zoom. */}
+            <div className="flex-shrink-0">
               <svg
                 width={chartWidth}
                 height={svgHeight}
@@ -709,6 +714,7 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
                 onMouseMove={handleMouseMove}
                 onMouseUp={handleMouseUp}
                 onMouseLeave={handleMouseLeave}
+                onDoubleClick={resetZoom}
               >
                 {/* Header / time-axis baseline */}
                 <line
@@ -885,7 +891,9 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
           <span className="inline-block w-3 h-2 rounded-sm" style={{ background: '#94a3b8' }} />
           warmup
         </span>
-        <span className="ml-auto opacity-70">scroll to zoom · drag to pan</span>
+        <span className="ml-auto opacity-70">
+          scroll to zoom · drag to pan · double-click to reset
+        </span>
       </div>
 
       {/* Cursor stats popover: count of in-flight / waiting at the cursor's

From 6e56bbfb2a29c6ffad2e4d4484bfcb6673fdacfd Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 18 Jun 2026 09:29:18 -0500
Subject: [PATCH 69/96] fix(gpu-compare): show CPU-offload halo on points
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The dashed offload-mode ring (drawn in ScatterGraph's onRender for every
point with offload_mode='on') was missing from GPU compare mode
(GPUGraph), so the CPU-offloading indicator never appeared there. Mirror
it in GPUGraph's onRender — same dashed var(--foreground) ring at
POINT_SIZE+4, appended inside each .dot-group so it travels with the
point on zoom/pan.

Verified live in compare mode (DSv4 B200/B300 agentic): offload points
now render the dashed halo (5 rings, r=7.5, dash 3 2).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../src/components/inference/ui/GPUGraph.tsx  | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/packages/app/src/components/inference/ui/GPUGraph.tsx b/packages/app/src/components/inference/ui/GPUGraph.tsx
index 24b1266f..19ba574f 100644
--- a/packages/app/src/components/inference/ui/GPUGraph.tsx
+++ b/packages/app/src/components/inference/ui/GPUGraph.tsx
@@ -26,6 +26,7 @@ import {
   formatLargeNumber,
   getShapeKeyForPrecision,
   logTickFormat,
+  POINT_SIZE,
 } from '@/lib/chart-rendering';
 import {
   paretoFrontLowerLeft,
@@ -827,6 +828,28 @@ const GPUGraph = React.memo(
           }
           // Set foreground color on scatter point labels
           ctx.layout.zoomGroup.selectAll('.point-label').style('fill', 'var(--foreground)');
+
+          // Offload halo: dashed ring on every point that used KV offload
+          // (mirrors ScatterGraph so compare mode shows the same CPU-offload
+          // indicator). The ring is a child of the dot-group, so it travels
+          // with the point on zoom/pan without a separate onZoom pass.
+          ctx.layout.zoomGroup
+            .selectAll<SVGGElement, InferenceData>('.dot-group')
+            .each(function (d) {
+              const showHalo = d.offload_mode === 'on';
+              d3.select(this)
+                .selectAll<SVGCircleElement, boolean>('.offload-halo')
+                .data(showHalo ? [true] : [])
+                .join('circle')
+                .attr('class', 'offload-halo')
+                .attr('r', POINT_SIZE + 4)
+                .attr('fill', 'none')
+                .attr('stroke', 'var(--foreground)')
+                .attr('stroke-width', 1.5)
+                .attr('stroke-dasharray', '3 2')
+                .attr('opacity', 0.9)
+                .attr('pointer-events', 'none');
+            });
         }}
         legendElement={
           <ChartLegend

From 2c060090278d660f1ad59e01646f5cdf0950e7d4 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Thu, 18 Jun 2026 12:56:08 -0500
Subject: [PATCH 70/96] fix(high-contrast): use full hue wheel for
 single-vendor comparisons
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

generateHighContrastColors clamps each vendor's series into its brand hue
zone (NVIDIA=green, AMD=red) at <=PREFERRED_MAX items. The point of that
clamp is to keep DIFFERENT vendors apart at a glance — but when only one
vendor is present (the common all-NVIDIA agentic comparison: B200/B300 x
vLLM/SGLang), there's no rival to separate from, so every series collapses
into the same narrow green band and high-contrast mode looks like it does
nothing.

When a single vendor is present, skip the brand zone and rival-ban and use
the full hue wheel for maximum separation. Verified on an all-NVIDIA
agentic view: HC now spreads pink/blue/gold/green (hues 45/99/227/330,
min adjacent gap 54deg) instead of four near-identical greens. Multi-vendor
behavior is unchanged — vendors keep their brand zones so they stay
distinguishable. The non-HC palette still carries vendor identity.

Updated the single-vendor color tests to assert separability across the
full wheel rather than brand-zone confinement.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/app/src/lib/chart-utils.test.ts | 39 ++++++++++--------------
 packages/app/src/lib/chart-utils.ts      | 19 ++++++++++--
 2 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/packages/app/src/lib/chart-utils.test.ts b/packages/app/src/lib/chart-utils.test.ts
index 061037ed..f6828ce2 100644
--- a/packages/app/src/lib/chart-utils.test.ts
+++ b/packages/app/src/lib/chart-utils.test.ts
@@ -353,30 +353,29 @@ describe('generateHighContrastColors', () => {
     expect(Object.values(dark).join(',')).not.toEqual(Object.values(light).join(','));
   });
 
-  // ---------- Tier 1: few items → brand zone ----------
-
-  it('3 NVIDIA GPUs are not red', () => {
+  // ---------- Single vendor: full wheel for maximum contrast ----------
+  // Brand-zone / rival-ban only apply when MULTIPLE vendors are present (so the
+  // vendors stay visually separable). With a single vendor there's no rival to
+  // distinguish from, so HC opens the full hue wheel — brand hue is sacrificed
+  // for the contrast HC exists to provide (fixes the all-NVIDIA agentic case
+  // where every series otherwise collapsed into the green brand band).
+
+  it('3 NVIDIA GPUs (single vendor) are distinguishable across the full wheel', () => {
     const result = generateHighContrastColors(['h100_vllm', 'h200_vllm', 'b200_vllm'], 'dark');
-    for (const color of Object.values(result)) {
-      expect(isNotReddish(parseRgb(color))).toBe(true);
-    }
+    expect(Object.keys(result)).toHaveLength(3);
     assertMinDist(result, 30);
   });
 
-  it('2 AMD GPUs are not green', () => {
+  it('2 AMD GPUs (single vendor) are distinguishable across the full wheel', () => {
     const result = generateHighContrastColors(['mi300x_sglang', 'mi325x_sglang'], 'dark');
-    for (const color of Object.values(result)) {
-      expect(isNotGreenish(parseRgb(color))).toBe(true);
-    }
+    expect(Object.keys(result)).toHaveLength(2);
     assertMinDist(result, 30);
   });
 
-  it('4 NVIDIA GPUs stay in brand zone and are distinguishable', () => {
+  it('4 NVIDIA GPUs (single vendor) use the full wheel and stay well-separated', () => {
     const keys = ['h100_vllm', 'h200_vllm', 'b200_vllm', 'b300_vllm'];
     const result = generateHighContrastColors(keys, 'dark');
-    for (const color of Object.values(result)) {
-      expect(isNotReddish(parseRgb(color))).toBe(true);
-    }
+    expect(Object.keys(result)).toHaveLength(4);
     assertMinDist(result, 25);
   });
 
@@ -401,19 +400,13 @@ describe('generateHighContrastColors', () => {
     assertMinDist(result, 25);
   });
 
-  // ---------- Tier 2: moderate items → full wheel minus rival color ----------
+  // ---------- Single vendor, many items → full wheel, best spacing ----------
 
-  it('10 NVIDIA GPUs: no red hues, still distinguishable', () => {
+  it('10 NVIDIA GPUs (single vendor) are well-separated across the full wheel', () => {
     const gpus = ['h100', 'h200', 'b200', 'b300', 'gb200'];
     const keys = gpus.flatMap((g) => [`${g}_vllm`, `${g}_sglang`]);
     const result = generateHighContrastColors(keys, 'dark');
-    // Should not be reddish (banned)
-    for (const color of Object.values(result)) {
-      const rgb = parseRgb(color);
-      // Not red-dominant with low green — i.e. not in the red/pink zone
-      const isRedPink = rgb[0] > 150 && rgb[1] < 80 && rgb[2] < 150;
-      expect(isRedPink).toBe(false);
-    }
+    expect(Object.keys(result)).toHaveLength(10);
     assertMinDist(result, 20);
   });
 
diff --git a/packages/app/src/lib/chart-utils.ts b/packages/app/src/lib/chart-utils.ts
index 33a5b4e3..3eeda15b 100644
--- a/packages/app/src/lib/chart-utils.ts
+++ b/packages/app/src/lib/chart-utils.ts
@@ -61,10 +61,17 @@ const PALETTE_CACHE = new Map<string, string[]>();
 /**
  * Generates high-contrast colors using iwanthue (k-means in CIELab space).
  *
- * Tiered strategy per vendor:
+ * Tiered strategy per vendor (only when >1 vendor is present):
  *   ≤ PREFERRED_MAX → constrain to brand zone (NVIDIA=green, AMD=red)
  *   ≤ BAN_MAX       → full wheel minus rival's brand color
  *   > BAN_MAX       → full wheel, no restrictions, best spacing wins
+ *
+ * Single-vendor case (e.g. an all-NVIDIA agentic comparison of B200/B300 ×
+ * vLLM/SGLang): the brand zone and rival-ban exist to keep vendors apart at a
+ * glance, but with one vendor there's no rival — clamping every series into the
+ * same narrow hue band just collapses the contrast HC is supposed to maximize.
+ * So skip both restrictions and use the full wheel, giving the series the widest
+ * possible separation.
  */
 export const generateHighContrastColors = (
   keys: string[],
@@ -91,6 +98,12 @@ export const generateHighContrastColors = (
     list.push(key);
   }
 
+  // Brand-zone / rival-ban only serve to keep DIFFERENT vendors apart. With a
+  // single vendor present there's nothing to separate from, so those
+  // restrictions only shrink the usable hue range and kill contrast — open the
+  // full wheel instead (the common all-NVIDIA agentic comparison case).
+  const multiVendor = groups.size > 1;
+
   for (const [vendor, vendorKeys] of groups) {
     const count = vendorKeys.length;
     const isBanned = BANNED_HUE_TEST[vendor] ?? null;
@@ -99,8 +112,8 @@ export const generateHighContrastColors = (
     // Tier 1: few items → brand zone only
     // Tier 2: moderate  → full wheel minus rival color
     // Tier 3: many      → full wheel, no restrictions
-    const usePreferred = preferred && count <= PREFERRED_MAX;
-    const useBan = !usePreferred && isBanned && count <= BAN_MAX;
+    const usePreferred = multiVendor && preferred && count <= PREFERRED_MAX;
+    const useBan = multiVendor && !usePreferred && isBanned && count <= BAN_MAX;
 
     // Everything iwanthue's output depends on (the ban filter and preferred
     // zone are functions of vendor; the seed is vendor+theme).

From 6275aa70bf0162cd83762ff79a2e0a5c053270e2 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 10:17:42 -0500
Subject: [PATCH 71/96] feat(inference): default line labels off, parallelism
 labels + high contrast on

Change the inference chart's default toggle states:
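- Line Labels: on -> off (i_linelabel=1 turns them back on)
- Parallelism Labels: off -> on, which also defaults point labels on since
  parallelism labels ARE point labels (i_advlabel=0 turns parallelism
  labels off; i_label=0 hides point labels, and is now the only value
  written to the URL for that param)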
- High Contrast: off -> on, via a new opt-in defaultHighContrast on
  useChartUIState so reliability/evaluation (r_/e_ prefixes) stay off;
  i_hc=0 overrides off. Historical trends shares the inference context so
  it inherits the high-contrast default too.

URL serialization flipped to omit each param at its new default and only
write the override value, so share links stay clean. Updated line-labels,
gradient-labels, and url-params E2E specs to the new defaults.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../app/cypress/e2e/gradient-labels.cy.ts     | 16 +++++-----
 packages/app/cypress/e2e/line-labels.cy.ts    | 31 ++++++++++++-------
 packages/app/cypress/e2e/url-params.cy.ts     | 14 +++++++--
 .../components/inference/InferenceContext.tsx | 25 ++++++++-------
 packages/app/src/hooks/useChartContext.ts     | 12 +++++--
 5 files changed, 61 insertions(+), 37 deletions(-)

diff --git a/packages/app/cypress/e2e/gradient-labels.cy.ts b/packages/app/cypress/e2e/gradient-labels.cy.ts
index 333baa6d..a0753e90 100644
--- a/packages/app/cypress/e2e/gradient-labels.cy.ts
+++ b/packages/app/cypress/e2e/gradient-labels.cy.ts
@@ -24,8 +24,8 @@ describe('Gradient Labels Toggle', () => {
     cy.get('label[for="scatter-parallelism-labels"]').should('contain.text', 'Parallelism Labels');
   });
 
-  it('Parallelism Labels toggle is off by default', () => {
-    cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'unchecked');
+  it('Parallelism Labels toggle is on by default', () => {
+    cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked');
   });
 
   it('per-point labels are visible by default (gradient labels off)', () => {
@@ -60,21 +60,19 @@ describe('Gradient Labels Toggle', () => {
   });
 
   it('both toggles can be enabled simultaneously', () => {
-    // Turn on Gradient Labels (off by default)
+    // Parallelism Labels is on by default; ensure it's on, then turn on Gradient.
+    cy.get('#scatter-parallelism-labels').then(($el) => {
+      if ($el.attr('data-state') !== 'checked') cy.wrap($el).click();
+    });
     cy.get('#scatter-gradient-labels').click();
     cy.get('#scatter-gradient-labels').should('have.attr', 'data-state', 'checked');
 
-    // Turn on Parallelism Labels
-    cy.get('#scatter-parallelism-labels').click();
-    cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked');
-
     // Both should be checked
     cy.get('#scatter-gradient-labels').should('have.attr', 'data-state', 'checked');
     cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked');
 
-    // Reset for next tests
+    // Reset gradient for next tests (parallelism stays at its default-on).
     cy.get('#scatter-gradient-labels').click();
-    cy.get('#scatter-parallelism-labels').click();
   });
 
   it('URL param i_gradlabel=1 enables gradient labels on load', () => {
diff --git a/packages/app/cypress/e2e/line-labels.cy.ts b/packages/app/cypress/e2e/line-labels.cy.ts
index 84e655f8..23b372df 100644
--- a/packages/app/cypress/e2e/line-labels.cy.ts
+++ b/packages/app/cypress/e2e/line-labels.cy.ts
@@ -15,26 +15,30 @@ describe('Line Labels Toggle', () => {
     cy.get('label[for="scatter-line-labels"]').should('contain.text', 'Line Labels');
   });
 
-  it('Line Labels toggle is on by default', () => {
-    cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'checked');
-
-    // Line labels render without any interaction
-    cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0);
-  });
-
-  it('toggling Line Labels off then back on removes and restores label elements', () => {
-    // On by default — turn it off first.
-    cy.get('#scatter-line-labels').click();
+  it('Line Labels toggle is off by default', () => {
     cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'unchecked');
+
+    // No line labels render without interaction
     cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length', 0);
+  });
 
-    // Turn it back on — labels return.
+  it('toggling Line Labels on then back off adds and removes label elements', () => {
+    // Off by default — turn it on first.
     cy.get('#scatter-line-labels').click();
     cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'checked');
     cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0);
+
+    // Turn it back off — labels disappear.
+    cy.get('#scatter-line-labels').click();
+    cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'unchecked');
+    cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length', 0);
   });
 
   it('line labels have colored background rects and text', () => {
+    // Off by default — ensure on (idempotent; prior test left them off).
+    cy.get('#scatter-line-labels').then(($el) => {
+      if ($el.attr('data-state') !== 'checked') cy.wrap($el).click();
+    });
     // Each line label group should contain a background rect and text
     cy.get('[data-testid="scatter-graph"] svg g.line-label .ll-bg').should(
       'have.length.greaterThan',
@@ -47,7 +51,10 @@ describe('Line Labels Toggle', () => {
   });
 
   it('line labels render in the foreground, after the scatter points', () => {
-    // Labels were toggled on in the test above and remain on here.
+    // Off by default — ensure on (idempotent; previous test leaves them on).
+    cy.get('#scatter-line-labels').then(($el) => {
+      if ($el.attr('data-state') !== 'checked') cy.wrap($el).click();
+    });
     cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0);
 
     cy.get('[data-testid="scatter-graph"] svg').then(($svg) => {
diff --git a/packages/app/cypress/e2e/url-params.cy.ts b/packages/app/cypress/e2e/url-params.cy.ts
index 33282b9c..3c480686 100644
--- a/packages/app/cypress/e2e/url-params.cy.ts
+++ b/packages/app/cypress/e2e/url-params.cy.ts
@@ -236,9 +236,15 @@ describe('URL Parameter Persistence', () => {
   });
 
   describe('High contrast mode', () => {
-    it('page loads without high contrast by default', () => {
+    it('inference loads with high contrast on by default', () => {
       visitWithDismissedModal('/inference');
       cy.get('[data-testid="scatter-graph"]').should('exist');
+      cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'checked');
+    });
+
+    it('i_hc=0 disables high contrast on load', () => {
+      visitWithDismissedModal('/inference?i_hc=0');
+      cy.get('[data-testid="scatter-graph"]').should('exist');
       cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'unchecked');
     });
 
@@ -267,10 +273,12 @@ describe('URL Parameter Persistence', () => {
       cy.get('#eval-high-contrast').first().should('have.attr', 'data-state', 'checked');
     });
 
-    it('historical trends tab has high contrast switch off by default', () => {
+    it('historical trends tab shares the inference high-contrast default (on)', () => {
+      // Historical reads highContrast from the same InferenceContext as the
+      // scatter chart, so it inherits the default-on behavior.
       visitWithDismissedModal('/historical');
       cy.get('[data-testid="historical-trends-display"]').should('exist');
-      cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'unchecked');
+      cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'checked');
     });
 
     it('i_hc=1 enables historical trends high contrast', () => {
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index d66febd0..c2c599ff 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -195,6 +195,8 @@ export function InferenceProvider({
   );
   const { highContrast, setHighContrast, isLegendExpanded, setIsLegendExpanded } = useChartUIState({
     urlPrefix: 'i_',
+    // Inference chart defaults to high contrast (?i_hc=0 overrides off).
+    defaultHighContrast: true,
   });
 
   const [hideNonOptimal, setHideNonOptimal] = useState(() => getUrlParam('i_optimal') !== '0');
@@ -202,21 +204,22 @@ export function InferenceProvider({
     // Legacy `?i_nolabel=1` from before the rename: keep hiding point labels
     // explicitly so the share link's intent survives future default changes.
     if (getUrlParam('i_nolabel') === '1') return false;
+    if (getUrlParam('i_label') === '0') return false;
     if (getUrlParam('i_label') === '1') return true;
-    // Old share links set `?i_advlabel=1` while keeping the labels default
-    // (shown). Mirror the toggle's auto-enable side-effect on load so those
-    // links still render advanced labels under the new default-off behavior.
-    if (getUrlParam('i_advlabel') === '1') return true;
-    return false;
+    // Default on: parallelism labels (also default on) are point labels and
+    // are pointless without them shown.
+    return true;
   });
   const [logScale, setLogScale] = useState(() => getUrlParam('i_log') === '1');
+  // Parallelism labels default on (?i_advlabel=0 overrides off).
   const [useAdvancedLabels, setUseAdvancedLabels] = useState(
-    () => getUrlParam('i_advlabel') === '1',
+    () => getUrlParam('i_advlabel') !== '0',
   );
   const [showGradientLabels, setShowGradientLabels] = useState(
     () => getUrlParam('i_gradlabel') === '1',
   );
-  const [showLineLabels, setShowLineLabels] = useState(() => getUrlParam('i_linelabel') !== '0');
+  // Line labels default off (?i_linelabel=1 overrides on).
+  const [showLineLabels, setShowLineLabels] = useState(() => getUrlParam('i_linelabel') === '1');
   const [showSpeedOverlay, setShowSpeedOverlay] = useState(() => getUrlParam('i_speed') === '1');
   const [showMinecraftOverlay, setShowMinecraftOverlay] = useState(
     () => getUrlParam('i_mc') === '1',
@@ -983,17 +986,17 @@ export function InferenceProvider({
       i_dstart: selectedDateRange.startDate,
       i_dend: selectedDateRange.endDate,
       i_optimal: hideNonOptimal ? '' : '0',
-      i_label: showPointLabels ? '1' : '',
-      i_hc: highContrast ? '1' : '',
+      i_label: showPointLabels ? '' : '0',
+      i_hc: highContrast ? '' : '0',
       i_log: logScale ? '1' : '',
       i_xmetric: selectedXAxisMetric || '',
       i_e2e_xmetric: selectedE2eXAxisMetric || '',
       i_xmode: selectedXAxisMode,
       i_scale: scaleType,
       i_legend: isLegendExpanded ? '' : '0',
-      i_advlabel: useAdvancedLabels ? '1' : '',
+      i_advlabel: useAdvancedLabels ? '' : '0',
       i_gradlabel: showGradientLabels ? '1' : '',
-      i_linelabel: showLineLabels ? '' : '0',
+      i_linelabel: showLineLabels ? '1' : '',
       i_speed: showSpeedOverlay ? '1' : '',
       i_mc: showMinecraftOverlay ? '1' : '',
       i_active: iActiveStr,
diff --git a/packages/app/src/hooks/useChartContext.ts b/packages/app/src/hooks/useChartContext.ts
index 49812c3e..be095430 100644
--- a/packages/app/src/hooks/useChartContext.ts
+++ b/packages/app/src/hooks/useChartContext.ts
@@ -37,6 +37,12 @@ export function reconcileActiveSet<T>(
 interface UseChartStateConfig {
   /** URL parameter prefix (e.g., 'i_' for inference, 'r_' for reliability, 'e_' for evaluation) */
   urlPrefix: string;
+  /**
+   * Initial high-contrast value when the URL has no `<prefix>hc` param.
+   * Defaults to false; the inference chart opts in to true. A `<prefix>hc=0`
+   * URL param overrides it back off.
+   */
+  defaultHighContrast?: boolean;
 }
 
 /**
@@ -44,7 +50,7 @@ interface UseChartStateConfig {
  * Includes mobile-specific legend collapse behavior.
  */
 export function useChartUIState(config: UseChartStateConfig) {
-  const { urlPrefix } = config;
+  const { urlPrefix, defaultHighContrast = false } = config;
   const { getUrlParam } = useUrlState();
 
   const hcParam = `${urlPrefix}hc` as any;
@@ -52,7 +58,7 @@ export function useChartUIState(config: UseChartStateConfig) {
 
   // Initialize with safe defaults that match SSR output to avoid hydration mismatches.
   // URL-param values are applied in a mount effect so the state is only set client-side.
-  const [highContrast, setHighContrast] = useState(false);
+  const [highContrast, setHighContrast] = useState(defaultHighContrast);
   const [isLegendExpanded, setIsLegendExpanded] = useState(true);
   const didInit = useRef(false);
 
@@ -60,7 +66,9 @@ export function useChartUIState(config: UseChartStateConfig) {
     if (didInit.current) return;
     didInit.current = true;
     const hcVal = getUrlParam(hcParam);
+    // Respect both overrides so the toggle round-trips regardless of the default.
     if (hcVal === '1') setHighContrast(true);
+    else if (hcVal === '0') setHighContrast(false);
     const legendVal = getUrlParam(legendParam);
     if (legendVal === '0') setIsLegendExpanded(false);
   }, [getUrlParam, hcParam, legendParam]);

From 5c290a49f50d7a0834a544d3e837bc1d1ccad5de Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 14:30:44 -0500
Subject: [PATCH 72/96] feat(agentic): use the chart's TP/EP/DEP/TEP
 parallelism labels on sibling chips
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The agentic detail page's sibling navigator labeled configs with an ad-hoc
`TP{n}EP{n}` / `{p}P+{d}D` scheme that ignored dp-attention and the
TEP/DEP collapse, so a DEP4 config read as plain TP4EP4 (and, mid-deploy
before the API carried dp_attention, as TEP4).

Extract the scatter chart's labeler into a shared parallelism-label module
(configSegmentLabel + parallelismLabel) and route both getPointLabel and the
sibling chipLabel through it, so the two surfaces describe a config
identically (TP/EP/TEP/DEP/DPA…, multinode-disagg worker segments).

Carry the fields the labeler needs through the siblings query/API/hook:
decode/prefill dp_attention + num_workers + is_multinode.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../inference/agentic-point/sibling-nav.tsx   | 20 ++++-
 .../inference/utils/parallelism-label.test.ts | 58 ++++++++++++++
 .../inference/utils/parallelism-label.ts      | 79 +++++++++++++++++++
 .../inference/utils/tooltipUtils.ts           | 69 ++++++----------
 .../src/hooks/api/use-benchmark-siblings.ts   |  5 ++
 packages/db/src/queries/benchmark-siblings.ts | 20 ++++-
 6 files changed, 202 insertions(+), 49 deletions(-)
 create mode 100644 packages/app/src/components/inference/utils/parallelism-label.test.ts
 create mode 100644 packages/app/src/components/inference/utils/parallelism-label.ts

diff --git a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
index aa727fdc..f92d6b63 100644
--- a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
+++ b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
@@ -4,6 +4,7 @@ import { useRouter } from 'next/navigation';
 import { ChevronLeft, ChevronRight } from 'lucide-react';
 
 import type { BenchmarkSibling, BenchmarkSku } from '@/hooks/api/use-benchmark-siblings';
+import { parallelismLabel } from '@/components/inference/utils/parallelism-label';
 
 const HW_LABELS: Record<string, string> = {
   b200: 'B200',
@@ -49,9 +50,22 @@ function frameworkLabel(fw: string) {
 
 /** Short label for a sibling chip: parallelism + concurrency. */
 export function chipLabel(s: BenchmarkSibling): string {
-  const parallel = s.disagg
-    ? `${s.num_prefill_gpu}P+${s.num_decode_gpu}D`
-    : `TP${s.decode_tp}${s.decode_ep > 1 ? `EP${s.decode_ep}` : ''}`;
+  // Same parallelism labeler the chart points use (TP/EP/TEP/DEP/DPA…).
+  const parallel = parallelismLabel({
+    tp: s.decode_tp,
+    ep: s.decode_ep,
+    dpAttention: s.decode_dp_attention,
+    disagg: s.disagg,
+    isMultinode: s.is_multinode,
+    prefillTp: s.prefill_tp,
+    prefillEp: s.prefill_ep,
+    prefillDpAttention: s.prefill_dp_attention,
+    prefillNumWorkers: s.prefill_num_workers,
+    decodeTp: s.decode_tp,
+    decodeEp: s.decode_ep,
+    decodeDpAttention: s.decode_dp_attention,
+    decodeNumWorkers: s.decode_num_workers,
+  });
   const offload = s.offload_mode === 'on' ? ' • off=ON' : '';
   return `${parallel} • c=${s.conc}${offload}`;
 }
diff --git a/packages/app/src/components/inference/utils/parallelism-label.test.ts b/packages/app/src/components/inference/utils/parallelism-label.test.ts
new file mode 100644
index 00000000..aaf715d3
--- /dev/null
+++ b/packages/app/src/components/inference/utils/parallelism-label.test.ts
@@ -0,0 +1,58 @@
+import { describe, expect, it } from 'vitest';
+
+import { configSegmentLabel, parallelismLabel } from './parallelism-label';
+
+describe('configSegmentLabel', () => {
+  it('collapses symmetric tp===ep to TEP / DEP by dp-attention', () => {
+    expect(configSegmentLabel(8, 8, false)).toBe('TEP8');
+    expect(configSegmentLabel(8, 8, true)).toBe('DEP8');
+  });
+
+  it('uses EP / DPAEP when ep>1 and tp!==ep', () => {
+    expect(configSegmentLabel(4, 16, false)).toBe('EP16');
+    expect(configSegmentLabel(4, 16, true)).toBe('DPAEP16');
+  });
+
+  it('uses TP / DPATP when ep<=1 or absent', () => {
+    expect(configSegmentLabel(8, 1, false)).toBe('TP8');
+    expect(configSegmentLabel(8, undefined, false)).toBe('TP8');
+    expect(configSegmentLabel(8, 1, true)).toBe('DPATP8');
+  });
+});
+
+describe('parallelismLabel', () => {
+  it('falls back to bare tp when no ep data', () => {
+    expect(parallelismLabel({ tp: 8 })).toBe('8');
+  });
+
+  it('labels a single-segment config', () => {
+    expect(parallelismLabel({ tp: 8, ep: 8, dpAttention: true })).toBe('DEP8');
+    expect(parallelismLabel({ tp: 4, ep: 8, dpAttention: false })).toBe('EP8');
+  });
+
+  it('builds multinode-disagg per-role worker segments', () => {
+    expect(
+      parallelismLabel({
+        tp: 8,
+        ep: 4,
+        disagg: true,
+        isMultinode: true,
+        prefillTp: 4,
+        prefillEp: 4,
+        prefillDpAttention: false,
+        prefillNumWorkers: 2,
+        decodeTp: 8,
+        decodeEp: 8,
+        decodeDpAttention: true,
+        decodeNumWorkers: 1,
+      }),
+    ).toBe('2xTEP4+1xDEP8');
+  });
+
+  it('single-node disagg uses the single (decode) segment, not worker syntax', () => {
+    // is_multinode false → no "NxPrefill+MxDecode" expansion.
+    expect(
+      parallelismLabel({ tp: 8, ep: 8, dpAttention: false, disagg: true, isMultinode: false }),
+    ).toBe('TEP8');
+  });
+});
diff --git a/packages/app/src/components/inference/utils/parallelism-label.ts b/packages/app/src/components/inference/utils/parallelism-label.ts
new file mode 100644
index 00000000..98207110
--- /dev/null
+++ b/packages/app/src/components/inference/utils/parallelism-label.ts
@@ -0,0 +1,79 @@
+/**
+ * Shared parallelism-config labeling — the single source of truth for the
+ * short "TP8 / EP8 / TEP8 / DEP8 / DPAEP8 / 2xEP4+1xDPAEP32" labels.
+ *
+ * Used by the scatter/GPU chart point labels (via getPointLabel) and the
+ * agentic detail page's sibling navigator chips, so both surfaces describe a
+ * config identically.
+ */
+
+/**
+ * Generates a short config segment label from parallelism params.
+ * - tp == ep and dp-attn false: "TEP{N}"
+ * - tp == ep and dp-attn true: "DEP{N}"
+ * - ep > 1 (tp != ep): "EP{ep}" or "DPAEP{ep}"
+ * - ep <= 1 (or no EP): "TP{tp}" or "DPATP{tp}"
+ */
+export const configSegmentLabel = (
+  tp: number,
+  ep: number | undefined,
+  dpAttention: boolean | undefined,
+): string => {
+  if (ep !== null && ep !== undefined && ep > 1 && tp === ep) {
+    return dpAttention ? `DEP${tp}` : `TEP${tp}`;
+  }
+  const dpaPrefix = dpAttention ? 'DPA' : '';
+  if (ep === null || ep === undefined || ep <= 1) return `${dpaPrefix}TP${tp}`;
+  return `${dpaPrefix}EP${ep}`;
+};
+
+/** Parallelism params for one benchmark config, framework-agnostic. */
+export interface ParallelismFields {
+  tp: number;
+  ep?: number;
+  dpAttention?: boolean;
+  disagg?: boolean;
+  isMultinode?: boolean;
+  prefillTp?: number;
+  prefillEp?: number;
+  prefillDpAttention?: boolean;
+  prefillNumWorkers?: number;
+  decodeTp?: number;
+  decodeEp?: number;
+  decodeDpAttention?: boolean;
+  decodeNumWorkers?: number;
+}
+
+/**
+ * Returns the short parallelism label for a config.
+ * - No EP data (old rows): falls back to the bare tp value (e.g. "8").
+ * - Multinode disagg: per-role segments with worker counts,
+ *   e.g. "2xEP4+1xDPAEP32".
+ * - Otherwise: a single segment from (tp, ep, dpAttention).
+ */
+export const parallelismLabel = (f: ParallelismFields): string => {
+  if (
+    (f.ep === null || f.ep === undefined) &&
+    (f.prefillEp === null || f.prefillEp === undefined)
+  ) {
+    return String(f.tp);
+  }
+
+  if (f.isMultinode && f.disagg) {
+    const prefillLabel = configSegmentLabel(
+      f.prefillTp ?? f.tp,
+      f.prefillEp ?? f.ep,
+      f.prefillDpAttention ?? f.dpAttention,
+    );
+    const decodeLabel = configSegmentLabel(
+      f.decodeTp ?? f.tp,
+      f.decodeEp ?? f.ep,
+      f.decodeDpAttention ?? f.dpAttention,
+    );
+    const pw = f.prefillNumWorkers ?? 1;
+    const dw = f.decodeNumWorkers ?? 1;
+    return `${pw}x${prefillLabel}+${dw}x${decodeLabel}`;
+  }
+
+  return configSegmentLabel(f.tp, f.ep, f.dpAttention);
+};
diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts
index 14d3b553..ea039336 100644
--- a/packages/app/src/components/inference/utils/tooltipUtils.ts
+++ b/packages/app/src/components/inference/utils/tooltipUtils.ts
@@ -1,6 +1,7 @@
 import { formatNumber, getDisplayLabel } from '@/lib/utils';
 
 import type { HardwareConfig, InferenceData, OverlayData } from '@/components/inference/types';
+import { parallelismLabel } from '@/components/inference/utils/parallelism-label';
 
 export interface TooltipConfig {
   /** The data point to display */
@@ -34,57 +35,37 @@ export interface OverlayTooltipConfig extends TooltipConfig {
   overlayData: OverlayData;
 }
 
-/**
- * Generates a short config segment label from parallelism params.
- * - tp == ep and dp-attn false: "TEP{N}"
- * - tp == ep and dp-attn true: "DEP{N}"
- * - ep > 1 (tp != ep): "EP{ep}" or "DPAEP{ep}"
- * - ep <= 1 (or no EP): "TP{tp}" or "DPATP{tp}"
- */
-const configSegmentLabel = (
-  tp: number,
-  ep: number | undefined,
-  dpAttention: boolean | undefined,
-): string => {
-  if (ep !== null && ep !== undefined && ep > 1 && tp === ep) {
-    return dpAttention ? `DEP${tp}` : `TEP${tp}`;
-  }
-  const dpaPrefix = dpAttention ? 'DPA' : '';
-  if (ep === null || ep === undefined || ep <= 1) return `${dpaPrefix}TP${tp}`;
-  return `${dpaPrefix}EP${ep}`;
-};
+// `dp_attention` is `boolean | string` on InferenceData (DB sends raw, the
+// transform narrows "true"/"false" → boolean). Coerce for the shared labeler:
+// only the exact string 'true' is true; booleans/undefined pass through as-is.
+const asBool = (v: boolean | string | undefined): boolean | undefined =>
+  typeof v === 'string' ? v === 'true' : v;
 
 /**
  * Returns the short label for a data point on the chart.
  * - Non-multinode: e.g. "TP8", "EP8", "TEP8", "DEP8", "DPAEP8"
  * - Multinode disagg: e.g. "2xEP4+1xDPAEP32"
  * - Old data (no ep field): falls back to tp value
+ *
+ * Delegates to the shared {@link parallelismLabel} so the chart points and the
+ * agentic sibling navigator describe a config identically.
  */
-export const getPointLabel = (d: InferenceData): string => {
-  if (
-    (d.ep === null || d.ep === undefined) &&
-    (d.prefill_ep === null || d.prefill_ep === undefined)
-  )
-    return String(d.tp);
-
-  if (d.is_multinode && d.disagg) {
-    const prefillLabel = configSegmentLabel(
-      d.prefill_tp ?? d.tp,
-      d.prefill_ep ?? d.ep,
-      d.prefill_dp_attention ?? d.dp_attention,
-    );
-    const decodeLabel = configSegmentLabel(
-      d.decode_tp ?? d.tp,
-      d.decode_ep ?? d.ep,
-      d.decode_dp_attention ?? d.dp_attention,
-    );
-    const pw = d.prefill_num_workers ?? 1;
-    const dw = d.decode_num_workers ?? 1;
-    return `${pw}x${prefillLabel}+${dw}x${decodeLabel}`;
-  }
-
-  return configSegmentLabel(d.tp, d.ep, d.dp_attention);
-};
+export const getPointLabel = (d: InferenceData): string =>
+  parallelismLabel({
+    tp: d.tp,
+    ep: d.ep,
+    dpAttention: asBool(d.dp_attention), // may still be the "true"/"false" string form
+    disagg: d.disagg,
+    isMultinode: d.is_multinode,
+    prefillTp: d.prefill_tp,
+    prefillEp: d.prefill_ep,
+    prefillDpAttention: asBool(d.prefill_dp_attention),
+    prefillNumWorkers: d.prefill_num_workers,
+    decodeTp: d.decode_tp,
+    decodeEp: d.decode_ep,
+    decodeDpAttention: asBool(d.decode_dp_attention),
+    decodeNumWorkers: d.decode_num_workers,
+  });
 
 const runLinkHTML = (runUrl?: string) =>
   runUrl
diff --git a/packages/app/src/hooks/api/use-benchmark-siblings.ts b/packages/app/src/hooks/api/use-benchmark-siblings.ts
index 1ea90c0d..e6bc4906 100644
--- a/packages/app/src/hooks/api/use-benchmark-siblings.ts
+++ b/packages/app/src/hooks/api/use-benchmark-siblings.ts
@@ -6,11 +6,16 @@ export interface BenchmarkSibling {
   offload_mode: string | null;
   decode_tp: number;
   decode_ep: number;
+  decode_dp_attention: boolean;
+  decode_num_workers: number;
   prefill_tp: number;
   prefill_ep: number;
+  prefill_dp_attention: boolean;
+  prefill_num_workers: number;
   num_prefill_gpu: number;
   num_decode_gpu: number;
   disagg: boolean;
+  is_multinode: boolean;
   is_current: boolean;
   has_trace: boolean;
 }
diff --git a/packages/db/src/queries/benchmark-siblings.ts b/packages/db/src/queries/benchmark-siblings.ts
index 245a1170..241a48ba 100644
--- a/packages/db/src/queries/benchmark-siblings.ts
+++ b/packages/db/src/queries/benchmark-siblings.ts
@@ -14,11 +14,16 @@ export interface BenchmarkSibling {
   offload_mode: string | null;
   decode_tp: number;
   decode_ep: number;
+  decode_dp_attention: boolean;
+  decode_num_workers: number;
   prefill_tp: number;
   prefill_ep: number;
+  prefill_dp_attention: boolean;
+  prefill_num_workers: number;
   num_prefill_gpu: number;
   num_decode_gpu: number;
   disagg: boolean;
+  is_multinode: boolean;
   /** True if this row IS the point passed in. */
   is_current: boolean;
   /** Whether the row has a stored trace_replay blob (for navigation hint). */
@@ -74,8 +79,9 @@ export async function getBenchmarkSiblings(
   const rows = (await sql`
     select
       br.id, br.conc, br.offload_mode,
-      c.decode_tp, c.decode_ep, c.prefill_tp, c.prefill_ep,
-      c.num_prefill_gpu, c.num_decode_gpu, c.disagg,
+      c.decode_tp, c.decode_ep, c.decode_dp_attention, c.decode_num_workers,
+      c.prefill_tp, c.prefill_ep, c.prefill_dp_attention, c.prefill_num_workers,
+      c.num_prefill_gpu, c.num_decode_gpu, c.disagg, c.is_multinode,
       (br.trace_replay_id is not null) as has_trace
     from benchmark_results br
     join configs c on c.id = br.config_id
@@ -93,11 +99,16 @@ export async function getBenchmarkSiblings(
     offload_mode: string | null;
     decode_tp: number;
     decode_ep: number;
+    decode_dp_attention: boolean;
+    decode_num_workers: number;
     prefill_tp: number;
     prefill_ep: number;
+    prefill_dp_attention: boolean;
+    prefill_num_workers: number;
     num_prefill_gpu: number;
     num_decode_gpu: number;
     disagg: boolean;
+    is_multinode: boolean;
     has_trace: boolean;
   }[];
 
@@ -107,11 +118,16 @@ export async function getBenchmarkSiblings(
     offload_mode: r.offload_mode,
     decode_tp: r.decode_tp,
     decode_ep: r.decode_ep,
+    decode_dp_attention: r.decode_dp_attention,
+    decode_num_workers: r.decode_num_workers,
     prefill_tp: r.prefill_tp,
     prefill_ep: r.prefill_ep,
+    prefill_dp_attention: r.prefill_dp_attention,
+    prefill_num_workers: r.prefill_num_workers,
     num_prefill_gpu: r.num_prefill_gpu,
     num_decode_gpu: r.num_decode_gpu,
     disagg: r.disagg,
+    is_multinode: r.is_multinode,
     is_current: Number(r.id) === benchmarkResultId,
     has_trace: r.has_trace,
   }));

From 32adf6bec66f41ffe2cfa4f08251afcb333c007d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 14:53:17 -0500
Subject: [PATCH 73/96] feat(agentic): sort dropdown for the sibling point
 navigator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a 'Sort by' dropdown to the agentic detail page's point navigator:
- Default (DB order)
- Concurrency ↑
- Parallelism (groups all TP, then TEP/DEP/EP… by ep→tp→dpa, conc within)
- Throughput/GPU ↓
- Total requests ↓

Carry tput_per_gpu and total_requests (total_requests_completed, falling
back to legacy num_requests_total) through the siblings query/API/hook.

prev/next follow the sorted order, and the chosen sort is persisted in the
URL (?sort=) — read on mount and threaded through every point link plus a
router.replace — so navigating to another point no longer resets it.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../inference/agentic-point/sibling-nav.tsx   | 131 ++++++++++++++++--
 .../src/hooks/api/use-benchmark-siblings.ts   |   2 +
 packages/db/src/queries/benchmark-siblings.ts |  16 +++
 3 files changed, 141 insertions(+), 8 deletions(-)

diff --git a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
index f92d6b63..a1a5d1ab 100644
--- a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
+++ b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx
@@ -1,10 +1,19 @@
 'use client';
 
+import { useMemo, useState } from 'react';
 import { useRouter } from 'next/navigation';
 import { ChevronLeft, ChevronRight } from 'lucide-react';
 
 import type { BenchmarkSibling, BenchmarkSku } from '@/hooks/api/use-benchmark-siblings';
 import { parallelismLabel } from '@/components/inference/utils/parallelism-label';
+import {
+  Select,
+  SelectContent,
+  SelectItem,
+  SelectTrigger,
+  SelectValue,
+} from '@/components/ui/select';
+import { track } from '@/lib/analytics';
 
 const HW_LABELS: Record<string, string> = {
   b200: 'B200',
@@ -70,12 +79,83 @@ export function chipLabel(s: BenchmarkSibling): string {
   return `${parallel} • c=${s.conc}${offload}`;
 }
 
+type SortMode = 'default' | 'conc' | 'parallelism' | 'tput' | 'requests';
+
+const SORT_OPTIONS: { value: SortMode; label: string }[] = [
+  { value: 'default', label: 'Default' }, // keeps the order of the `siblings` prop
+  { value: 'conc', label: 'Concurrency ↑' },
+  { value: 'parallelism', label: 'Parallelism' },
+  { value: 'tput', label: 'Throughput/GPU ↓' },
+  { value: 'requests', label: 'Total requests ↓' },
+];
+
+// Group key for the "parallelism" sort, compared element by element: decode ep
+// first (so TP/EP1 sorts ahead of EP/TEP/DEP groups), then decode tp, then decode
+// dp-attention, then disagg. Ties are broken in sortSiblings (offload, then conc).
+const parallelRank = (s: BenchmarkSibling): [number, number, number, number] => [
+  s.decode_ep ?? 0,
+  s.decode_tp ?? 0,
+  s.decode_dp_attention ? 1 : 0,
+  s.disagg ? 1 : 0,
+];
+
+function sortSiblings(siblings: BenchmarkSibling[], mode: SortMode): BenchmarkSibling[] {
+  if (mode === 'default') return siblings; // input order, same array instance
+  const out = [...siblings]; // copy: Array.prototype.sort mutates in place
+  if (mode === 'conc') {
+    out.sort((a, b) => a.conc - b.conc);
+  } else if (mode === 'tput') {
+    // Highest throughput/GPU first; rows missing the metric sink to the end.
+    out.sort((a, b) => (b.tput_per_gpu ?? -Infinity) - (a.tput_per_gpu ?? -Infinity));
+  } else if (mode === 'requests') {
+    // Most total requests first; rows missing the metric sink to the end.
+    out.sort((a, b) => (b.total_requests ?? -Infinity) - (a.total_requests ?? -Infinity));
+  } else {
+    out.sort((a, b) => {
+      const ra = parallelRank(a);
+      const rb = parallelRank(b);
+      for (let i = 0; i < ra.length; i++) {
+        if (ra[i] !== rb[i]) return ra[i] - rb[i];
+      }
+      // Within a parallelism group: offload_mode 'on' sorts last, then concurrency.
+      const oa = a.offload_mode === 'on' ? 1 : 0;
+      const ob = b.offload_mode === 'on' ? 1 : 0;
+      return oa - ob || a.conc - b.conc;
+    });
+  }
+  return out;
+}
+
+const isSortMode = (v: string | null): v is SortMode =>
+  v !== null && SORT_OPTIONS.some((o) => o.value === v); // guards the raw ?sort= value
+
 export function SiblingNav({ sku, siblings }: { sku: BenchmarkSku; siblings: BenchmarkSibling[] }) {
   const router = useRouter();
-  const currentIdx = siblings.findIndex((s) => s.is_current);
-  const prev = currentIdx > 0 ? siblings[currentIdx - 1] : null;
-  const next =
-    currentIdx !== -1 && currentIdx < siblings.length - 1 ? siblings[currentIdx + 1] : null;
+  // Persist the sort in the URL so clicking a point (which remounts this
+  // component on the new route) keeps the chosen order instead of resetting.
+  // Read it once from the URL on mount — this component only renders after the
+  // client-side siblings query resolves, so `window` is always available here
+  // (no SSR/hydration mismatch). Matches the app's window-based url-state read.
+  const [sortMode, setSortMode] = useState<SortMode>(() => {
+    if (typeof window === 'undefined') return 'default';
+    const v = new URLSearchParams(window.location.search).get('sort');
+    return isSortMode(v) ? v : 'default';
+  });
+
+  const sorted = useMemo(() => sortSiblings(siblings, sortMode), [siblings, sortMode]);
+
+  // prev/next follow the displayed (sorted) order so navigation matches the row.
+  const currentIdx = sorted.findIndex((s) => s.is_current);
+  const prev = currentIdx > 0 ? sorted[currentIdx - 1] : null;
+  const next = currentIdx !== -1 && currentIdx < sorted.length - 1 ? sorted[currentIdx + 1] : null;
+
+  // Carry the active sort through every point-to-point link.
+  const hrefFor = (id: number) =>
+    sortMode === 'default'
+      ? `/inference/agentic/${id}`
+      : `/inference/agentic/${id}?sort=${sortMode}`;
+
+  const currentId = siblings.find((s) => s.is_current)?.id;
 
   const skuLabel = `${hwLabel(sku.hardware)} · ${modelLabel(sku.model)} · ${sku.precision.toUpperCase()} · ${frameworkLabel(sku.framework)}`;
 
@@ -88,23 +168,58 @@ export function SiblingNav({ sku, siblings }: { sku: BenchmarkSku; siblings: Ben
         </span>
       </div>
       <div className="flex items-center gap-2 flex-wrap">
+        <div className="flex items-center gap-1.5">
+          <span className="text-xs text-muted-foreground">Sort by</span>
+          <Select
+            value={sortMode}
+            onValueChange={(v) => {
+              const mode = v as SortMode;
+              setSortMode(mode);
+              track('agentic_siblings_sorted', { mode });
+              // Mirror into the URL (replace, no history spam) so a refresh —
+              // and the next point's mount — keep the chosen order.
+              if (currentId !== undefined) {
+                const href =
+                  mode === 'default'
+                    ? `/inference/agentic/${currentId}`
+                    : `/inference/agentic/${currentId}?sort=${mode}`;
+                router.replace(href, { scroll: false });
+              }
+            }}
+          >
+            <SelectTrigger
+              className="h-7 w-[10rem] text-xs"
+              aria-label="Sort points"
+              data-testid="sibling-sort-select"
+            >
+              <SelectValue />
+            </SelectTrigger>
+            <SelectContent>
+              {SORT_OPTIONS.map((o) => (
+                <SelectItem key={o.value} value={o.value} className="text-xs">
+                  {o.label}
+                </SelectItem>
+              ))}
+            </SelectContent>
+          </Select>
+        </div>
         <button
           type="button"
           disabled={!prev}
-          onClick={() => prev && router.push(`/inference/agentic/${prev.id}`)}
+          onClick={() => prev && router.push(hrefFor(prev.id))}
           className="inline-flex items-center gap-1 px-2 py-1 rounded-md text-xs border border-border/40 hover:bg-accent disabled:opacity-30 disabled:cursor-not-allowed"
           aria-label="Previous point"
         >
           <ChevronLeft className="size-3.5" /> prev
         </button>
         <div className="flex items-center gap-1 flex-wrap">
-          {siblings.map((s) => {
+          {sorted.map((s) => {
             const active = s.is_current;
             return (
               <button
                 key={s.id}
                 type="button"
-                onClick={() => !active && router.push(`/inference/agentic/${s.id}`)}
+                onClick={() => !active && router.push(hrefFor(s.id))}
                 className={`px-2 py-1 rounded-md text-xs border transition-colors ${
                   active
                     ? 'border-primary bg-primary text-primary-foreground font-medium'
@@ -120,7 +235,7 @@ export function SiblingNav({ sku, siblings }: { sku: BenchmarkSku; siblings: Ben
         <button
           type="button"
           disabled={!next}
-          onClick={() => next && router.push(`/inference/agentic/${next.id}`)}
+          onClick={() => next && router.push(hrefFor(next.id))}
           className="inline-flex items-center gap-1 px-2 py-1 rounded-md text-xs border border-border/40 hover:bg-accent disabled:opacity-30 disabled:cursor-not-allowed"
           aria-label="Next point"
         >
diff --git a/packages/app/src/hooks/api/use-benchmark-siblings.ts b/packages/app/src/hooks/api/use-benchmark-siblings.ts
index e6bc4906..55720bdf 100644
--- a/packages/app/src/hooks/api/use-benchmark-siblings.ts
+++ b/packages/app/src/hooks/api/use-benchmark-siblings.ts
@@ -16,6 +16,8 @@ export interface BenchmarkSibling {
   num_decode_gpu: number;
   disagg: boolean;
   is_multinode: boolean;
+  tput_per_gpu: number | null;
+  total_requests: number | null;
   is_current: boolean;
   has_trace: boolean;
 }
diff --git a/packages/db/src/queries/benchmark-siblings.ts b/packages/db/src/queries/benchmark-siblings.ts
index 241a48ba..c7e4a317 100644
--- a/packages/db/src/queries/benchmark-siblings.ts
+++ b/packages/db/src/queries/benchmark-siblings.ts
@@ -24,6 +24,13 @@ export interface BenchmarkSibling {
   num_decode_gpu: number;
   disagg: boolean;
   is_multinode: boolean;
+  /** Throughput per GPU (tok/s/gpu) for this point; null if the metric is absent. */
+  tput_per_gpu: number | null;
+  /**
+   * Total requests for this point — `total_requests_completed` (aiperf runner)
+   * falling back to the legacy `num_requests_total`; null if neither is present.
+   */
+  total_requests: number | null;
   /** True if this row IS the point passed in. */
   is_current: boolean;
   /** Whether the row has a stored trace_replay blob (for navigation hint). */
@@ -82,6 +89,11 @@ export async function getBenchmarkSiblings(
       c.decode_tp, c.decode_ep, c.decode_dp_attention, c.decode_num_workers,
       c.prefill_tp, c.prefill_ep, c.prefill_dp_attention, c.prefill_num_workers,
       c.num_prefill_gpu, c.num_decode_gpu, c.disagg, c.is_multinode,
+      (br.metrics->>'tput_per_gpu')::float8 as tput_per_gpu,
+      coalesce(
+        (br.metrics->>'total_requests_completed')::float8,
+        (br.metrics->>'num_requests_total')::float8
+      ) as total_requests,
       (br.trace_replay_id is not null) as has_trace
     from benchmark_results br
     join configs c on c.id = br.config_id
@@ -109,6 +121,8 @@ export async function getBenchmarkSiblings(
     num_decode_gpu: number;
     disagg: boolean;
     is_multinode: boolean;
+    tput_per_gpu: number | null;
+    total_requests: number | null;
     has_trace: boolean;
   }[];
 
@@ -128,6 +142,8 @@ export async function getBenchmarkSiblings(
     num_decode_gpu: r.num_decode_gpu,
     disagg: r.disagg,
     is_multinode: r.is_multinode,
+    tput_per_gpu: r.tput_per_gpu === null ? null : Number(r.tput_per_gpu),
+    total_requests: r.total_requests === null ? null : Number(r.total_requests),
     is_current: Number(r.id) === benchmarkResultId,
     has_trace: r.has_trace,
   }));

From 60c5c2db0d73e9858e2cab84bb5e507be18ebf1e Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 15:49:57 -0500
Subject: [PATCH 74/96] feat(datasets): add 011 schema for datasets +
 dataset_conversations

Additive migration backing the new /datasets area: a registry of ingested
HF cc-traces-weka dataset versions (summary + precomputed chart_data) and one
row per conversation holding a flamegraph-ready structure JSONB. The migration
header includes a drop snippet for reverting.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/db/migrations/011_datasets.sql | 55 +++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 packages/db/migrations/011_datasets.sql

diff --git a/packages/db/migrations/011_datasets.sql b/packages/db/migrations/011_datasets.sql
new file mode 100644
index 00000000..7a70d83f
--- /dev/null
+++ b/packages/db/migrations/011_datasets.sql
@@ -0,0 +1,55 @@
+-- Agentic benchmarking source datasets (the HuggingFace cc-traces-weka corpora
+-- the agentic benchmarks replay) + their per-conversation trace structure.
+--
+-- The app already stores benchmark *replay* artifacts (agentic_trace_replay) but
+-- not the source traces. These two tables back the new /datasets area: a
+-- registry of ingested dataset versions with precomputed summary + chart data,
+-- and one row per conversation holding a flamegraph-ready `structure` (turns +
+-- subagent groups with input split into cached-prefix vs uncached-suffix). The
+-- raw hash_ids are NOT stored — they're only needed at ingest to derive the
+-- cached/uncached split, so the runtime read is a single small JSONB.
+--
+-- Additive only. To revert this migration:
+--   drop table if exists dataset_conversations;
+--   drop table if exists datasets;
+--   delete from schema_migrations where filename = '011_datasets.sql';
+
+create table datasets (
+  -- HuggingFace dataset id, e.g. 'semianalysisai/cc-traces-weka-062126'.
+  id          text primary key,
+  -- URL key, e.g. 'cc-traces-weka-062126'.
+  slug        text not null unique,
+  label       text not null,
+  -- 'full' | '256k' | 'no-subagents' (published variants; no check constraint).
+  variant     text not null default 'full',
+  description text,
+  hf_url      text,
+  license     text,
+  conversation_count integer not null default 0,
+  -- Token totals, main_turns, subagent_groups, model mix, date range, etc.
+  summary     jsonb not null default '{}'::jsonb,
+  -- Precomputed distributions for the dataset-detail cards (input/output length,
+  -- turns per conversation, subagent fan-out, …). Versioned via an inner field.
+  chart_data  jsonb not null default '{}'::jsonb,
+  dataset_version integer not null default 1,
+  ingested_at timestamptz not null default now()
+);
+
+create table dataset_conversations (
+  id          bigserial primary key,
+  dataset_id  text not null references datasets(id) on delete cascade,
+  -- The conversation id from the dataset record (trace id).
+  conv_id     text not null,
+  models      text[] not null default '{}',
+  num_turns           integer not null default 0,
+  num_subagent_groups integer not null default 0,
+  total_in    bigint not null default 0,
+  total_out   bigint not null default 0,
+  total_cached bigint not null default 0,
+  -- Flamegraph-ready ordered node tree (turns + subagent groups, each with
+  -- in/out/cached/uncached token counts). See packages/db/src/etl/weka-structure.ts.
+  structure   jsonb not null,
+  unique (dataset_id, conv_id) -- at most one row per conversation per dataset
+);
+
+create index dataset_conversations_dataset_idx on dataset_conversations (dataset_id); -- NOTE(review): the unique (dataset_id, conv_id) index already leads with dataset_id; confirm this extra index is needed

From 71e388f83c8d20f76738daa2b877962c9e3533bd Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 15:49:57 -0500
Subject: [PATCH 75/96] feat(datasets): weka trace structure + cached-prefix
 builder

Pure transforms (no DB) turning a raw cc-traces-weka conversation into a
flamegraph-ready structure: ordered turn/subagent nodes with input split into
cached-prefix vs uncached-suffix. Ports _count_seen_prefix_blocks from the
aiperf weka loader; subagents run against a spawn-time snapshot of the parent
prefix cache. Includes linear/log histogram helpers for the detail cards and
13 unit tests.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/db/src/etl/weka-structure.test.ts | 158 ++++++++++++
 packages/db/src/etl/weka-structure.ts      | 275 +++++++++++++++++++++
 2 files changed, 433 insertions(+)
 create mode 100644 packages/db/src/etl/weka-structure.test.ts
 create mode 100644 packages/db/src/etl/weka-structure.ts

diff --git a/packages/db/src/etl/weka-structure.test.ts b/packages/db/src/etl/weka-structure.test.ts
new file mode 100644
index 00000000..95bfef38
--- /dev/null
+++ b/packages/db/src/etl/weka-structure.test.ts
@@ -0,0 +1,158 @@
+import { describe, it, expect } from 'vitest';
+import {
+  countSeenPrefixBlocks,
+  buildConversationStructure,
+  linearHistogram,
+  logHistogram,
+  type RawWekaConversation,
+  type SubagentNode,
+  type TurnNode,
+} from './weka-structure.js';
+
+describe('countSeenPrefixBlocks', () => {
+  it('counts only the contiguous leading run already seen', () => {
+    const seen = new Set([1, 2, 3, 9]);
+    // 1,2,3 seen contiguously; 4 breaks the run even though 9 is seen later.
+    expect(countSeenPrefixBlocks([1, 2, 3, 4, 9], seen)).toBe(3);
+  });
+
+  it('returns 0 when the first block is unseen', () => {
+    expect(countSeenPrefixBlocks([7, 1, 2], new Set([1, 2]))).toBe(0); // 7 unseen: run is empty
+  });
+
+  it('returns the full length when every block is seen', () => {
+    expect(countSeenPrefixBlocks([1, 2], new Set([1, 2, 3]))).toBe(2); // extra seen block 3 is irrelevant
+  });
+
+  it('handles empty hash list', () => {
+    expect(countSeenPrefixBlocks([], new Set([1]))).toBe(0);
+  });
+});
+
+describe('buildConversationStructure', () => {
+  it('splits input into cached-prefix vs uncached as the prefix cache warms', () => {
+    const conv: RawWekaConversation = {
+      id: 'c1',
+      block_size: 64,
+      requests: [
+        // Turn 0: nothing seen yet → all uncached.
+        { type: 'n', model: 'm', in: 128, out: 10, hash_ids: [1, 2] },
+        // Turn 1: blocks 1,2 already seen, 3 is new → 2 blocks cached.
+        { type: 'n', model: 'm', in: 192, out: 20, hash_ids: [1, 2, 3] },
+      ],
+    };
+    const s = buildConversationStructure(conv);
+    const t0 = s.nodes[0] as TurnNode;
+    const t1 = s.nodes[1] as TurnNode;
+    expect(t0).toMatchObject({ kind: 'turn', in: 128, cached: 0, uncached: 128, out: 10 });
+    expect(t1.cached).toBe(128); // 2 blocks × 64
+    expect(t1.uncached).toBe(64); // 192 - 128
+    expect(s.totals).toMatchObject({
+      in: 320, // 128 + 192
+      out: 30, // 10 + 20
+      cached: 128, // 0 + 128
+      uncached: 192, // 128 + 64
+      numTurns: 2,
+      numSubagentGroups: 0,
+    });
+  });
+
+  it('clamps cached to the effective input on a partial last block', () => {
+    const conv: RawWekaConversation = {
+      id: 'c2',
+      block_size: 64,
+      requests: [
+        { type: 'n', in: 100, out: 0, hash_ids: [1, 2] }, // 2 blocks but in=100 (partial)
+        { type: 'n', in: 100, out: 0, hash_ids: [1, 2] }, // both seen → cached clamped to 100
+      ],
+    };
+    const s = buildConversationStructure(conv);
+    const t1 = s.nodes[1] as TurnNode;
+    expect(t1.cached).toBe(100);
+    expect(t1.uncached).toBe(0);
+  });
+
+  it('treats turns with no hash_ids as fully uncached', () => {
+    const conv: RawWekaConversation = {
+      id: 'c3',
+      requests: [{ type: 'n', in: 50, out: 5 }],
+    };
+    const t0 = buildConversationStructure(conv).nodes[0] as TurnNode;
+    expect(t0).toMatchObject({ cached: 0, uncached: 50 });
+  });
+
+  it('nests subagent groups with aggregated children and runs them against a spawn-time snapshot', () => {
+    const conv: RawWekaConversation = {
+      id: 'c4',
+      block_size: 64,
+      requests: [
+        { type: 'n', model: 'main', in: 64, out: 10, hash_ids: [1] },
+        {
+          type: 'subagent',
+          agent_id: 'a1',
+          subagent_type: 'Explore',
+          duration_ms: 1234,
+          requests: [
+            // sees parent block 1 (snapshot at spawn) → 1 block cached
+            { type: 'n', model: 'sub', in: 128, out: 7, hash_ids: [1, 5] },
+            // now block 5 is also seen within the subagent → 2 cached
+            { type: 'n', model: 'sub', in: 128, out: 3, hash_ids: [1, 5] },
+          ],
+        },
+        // Parent turn after subagent: block 5 must NOT be cached (subagent
+        // context not folded back); only block 1 is in the parent seen set.
+        { type: 'n', model: 'main', in: 128, out: 1, hash_ids: [1, 5] },
+      ],
+    };
+    const s = buildConversationStructure(conv);
+    expect(s.totals.numTurns).toBe(2); // two top-level normal turns
+    expect(s.totals.numSubagentGroups).toBe(1);
+
+    const sub = s.nodes[1] as SubagentNode;
+    expect(sub.kind).toBe('subagent');
+    expect(sub.label).toBe('Explore');
+    expect(sub.agentId).toBe('a1');
+    expect(sub.durationMs).toBe(1234);
+    expect(sub.children).toHaveLength(2);
+    expect(sub.children[0].cached).toBe(64); // block 1 from parent snapshot
+    expect(sub.children[1].cached).toBe(128); // blocks 1 & 5 now seen in child
+    expect(sub.in).toBe(256); // 128 + 128 summed over children
+    expect(sub.out).toBe(10); // 7 + 3 summed over children
+
+    const afterSub = s.nodes[2] as TurnNode;
+    expect(afterSub.cached).toBe(64); // only block 1; block 5 not folded back
+  });
+
+  it('falls back to the default block size and a generic subagent label', () => {
+    const conv: RawWekaConversation = {
+      id: 'c5',
+      requests: [{ type: 'subagent', requests: [{ type: 'n', in: 10, out: 1, hash_ids: [1] }] }],
+    };
+    const s = buildConversationStructure(conv);
+    expect(s.blockSize).toBe(64); // no block_size on the record
+    expect((s.nodes[0] as SubagentNode).label).toBe('Subagent'); // no subagent_type on the record
+  });
+});
+
+describe('histograms', () => {
+  it('linearHistogram buckets across [0, max] and totals the count', () => {
+    const bins = linearHistogram([0, 1, 2, 3, 4], 4);
+    expect(bins).toHaveLength(4);
+    expect(bins.reduce((acc, b) => acc + b.count, 0)).toBe(5); // all 5 inputs are binned
+    expect(bins[0].x0).toBe(0);
+  });
+
+  it('linearHistogram handles all-zero input', () => {
+    expect(linearHistogram([0, 0])).toEqual([{ x0: 0, x1: 1, count: 2 }]);
+  });
+
+  it('logHistogram drops non-positive values and preserves the positive total', () => {
+    const bins = logHistogram([1, 10, 100, 1000, 0, -5], 3);
+    expect(bins.reduce((acc, b) => acc + b.count, 0)).toBe(4); // 1, 10, 100, 1000; 0 and -5 dropped
+  });
+
+  it('both return [] for empty input', () => {
+    expect(linearHistogram([])).toEqual([]);
+    expect(logHistogram([])).toEqual([]);
+  });
+});
diff --git a/packages/db/src/etl/weka-structure.ts b/packages/db/src/etl/weka-structure.ts
new file mode 100644
index 00000000..e4113c68
--- /dev/null
+++ b/packages/db/src/etl/weka-structure.ts
@@ -0,0 +1,275 @@
+/**
+ * Pure transforms for the HuggingFace cc-traces-weka datasets.
+ *
+ * Turns a raw conversation record (`{ id, block_size, requests[] }`, where each
+ * request is a normal turn or a subagent group) into a compact, flamegraph-ready
+ * `structure`: ordered nodes with input split into cached-prefix vs
+ * uncached-suffix. The cached split ports `_count_seen_prefix_blocks` from the
+ * aiperf weka loader (contiguous leading hash_ids already seen under an infinite
+ * KV cache). No DB access — safe to import anywhere and unit-test directly.
+ */
+
+export const DEFAULT_BLOCK_SIZE = 64;
+
+// ── Raw record shapes (subset we read) ──────────────────────────────────────
+
+export interface RawWekaRequest {
+  t?: number;
+  type?: string; // 'n' | 's'
+  model?: string;
+  in?: number;
+  out?: number;
+  hash_ids?: number[];
+  api_time?: number;
+}
+
+export interface RawWekaSubagent {
+  t?: number;
+  type: 'subagent';
+  agent_id?: string;
+  subagent_type?: string;
+  duration_ms?: number;
+  requests?: RawWekaRequest[];
+  models?: string[];
+}
+
+export type RawWekaEntry = RawWekaRequest | RawWekaSubagent;
+
+export interface RawWekaConversation {
+  id: string;
+  models?: string[];
+  block_size?: number;
+  hash_id_scope?: string;
+  requests?: RawWekaEntry[];
+}
+
+// ── Output structure (stored in dataset_conversations.structure) ─────────────
+
+export interface TurnNode {
+  kind: 'turn';
+  turnIndex: number;
+  model?: string;
+  in: number;
+  out: number;
+  /** Input tokens served from the prefix cache (≤ in). */
+  cached: number;
+  /** Input tokens that must be (re)computed (in - cached). */
+  uncached: number;
+}
+
+export interface SubagentNode {
+  kind: 'subagent';
+  label: string;
+  agentId?: string;
+  durationMs?: number;
+  in: number;
+  out: number;
+  cached: number;
+  uncached: number;
+  children: TurnNode[];
+}
+
+export type StructureNode = TurnNode | SubagentNode;
+
+export interface ConversationStructure {
+  blockSize: number;
+  nodes: StructureNode[];
+  totals: {
+    in: number;
+    out: number;
+    cached: number;
+    uncached: number;
+    numTurns: number;
+    numSubagentGroups: number;
+  };
+}
+
+// Subagent groups are tagged `type: 'subagent'`; every other entry is a plain request.
+const isSubagent = (e: RawWekaEntry): e is RawWekaSubagent => e.type === 'subagent';
+
+/**
+ * Length of the longest leading run of `hashIds` whose ids are all in `seen`
+ * (port of aiperf `_count_seen_prefix_blocks`). Scanning stops at the first miss.
+ */
+export function countSeenPrefixBlocks(
+  hashIds: readonly number[],
+  seen: ReadonlySet<number>,
+): number {
+  // Position of the first id that is NOT in `seen` == number of leading hits.
+  const firstMiss = hashIds.findIndex((id) => !seen.has(id));
+  if (firstMiss === -1) {
+    return hashIds.length; // no miss: the entire list is a seen prefix
+  }
+  return firstMiss;
+}
+
+/**
+ * Split one request's input tokens into {cached, uncached} and then add all of
+ * its blocks to `seen`. `cached` is block-derived but capped at the request's
+ * effective `in`, so cached+uncached === in even when the final block is
+ * partial (in = hash_token_count, not always a multiple of blockSize).
+ */
+function splitInput(
+  req: RawWekaRequest,
+  seen: Set<number>,
+  blockSize: number,
+): { in: number; cached: number; uncached: number } {
+  const tokensIn = Math.max(0, Math.round(req.in ?? 0));
+  const blocks = req.hash_ids ?? [];
+  // No block hashes: nothing can be matched, and `seen` is left as it was.
+  if (blocks.length === 0) return { in: tokensIn, cached: 0, uncached: tokensIn };
+  // Measure the hit prefix BEFORE recording this request's own blocks.
+  const hitBlocks = countSeenPrefixBlocks(blocks, seen);
+  blocks.forEach((id) => seen.add(id));
+  const cached = Math.min(tokensIn, hitBlocks * blockSize);
+  return { in: tokensIn, cached, uncached: tokensIn - cached };
+}
+
+function subagentLabel(s: RawWekaSubagent): string {
+  const trimmed = (s.subagent_type ?? '').trim(); // missing type → ''
+  return trimmed === '' ? 'Subagent' : trimmed;
+}
+
+/**
+ * Build the flamegraph structure for one conversation. Main turns share one
+ * accumulating `seen` set; each subagent group runs on a *copy* of it taken at
+ * spawn and never merged back. `turnIndex` is one counter across main and child
+ * turns; `totals.numTurns` counts top-level turns only (children add to in/out).
+ */
+export function buildConversationStructure(
+  conv: RawWekaConversation,
+  blockSizeOverride?: number,
+): ConversationStructure {
+  const blockSize = blockSizeOverride ?? conv.block_size ?? DEFAULT_BLOCK_SIZE;
+  const seen = new Set<number>();
+  const nodes: StructureNode[] = [];
+  let totalIn = 0;
+  let totalOut = 0;
+  let totalCached = 0;
+  let totalUncached = 0;
+  let numTurns = 0;
+  let numSubagentGroups = 0;
+  let turnIndex = 0;
+
+  for (const entry of conv.requests ?? []) {
+    if (isSubagent(entry)) {
+      const childSeen = new Set(seen); // snapshot at spawn; not merged back
+      const children: TurnNode[] = [];
+      let gin = 0;
+      let gout = 0;
+      let gcached = 0;
+      let guncached = 0;
+      for (const inner of entry.requests ?? []) {
+        const split = splitInput(inner, childSeen, blockSize);
+        const out = Math.max(0, Math.round(inner.out ?? 0));
+        children.push({
+          kind: 'turn',
+          turnIndex: turnIndex++,
+          model: inner.model,
+          in: split.in,
+          out,
+          cached: split.cached,
+          uncached: split.uncached,
+        });
+        gin += split.in;
+        gout += out;
+        gcached += split.cached;
+        guncached += split.uncached;
+      }
+      nodes.push({
+        kind: 'subagent',
+        label: subagentLabel(entry),
+        agentId: entry.agent_id,
+        durationMs: entry.duration_ms,
+        in: gin,
+        out: gout,
+        cached: gcached,
+        uncached: guncached,
+        children,
+      });
+      numSubagentGroups += 1;
+      totalIn += gin;
+      totalOut += gout;
+      totalCached += gcached;
+      totalUncached += guncached;
+    } else {
+      const split = splitInput(entry, seen, blockSize);
+      const out = Math.max(0, Math.round(entry.out ?? 0));
+      nodes.push({
+        kind: 'turn',
+        turnIndex: turnIndex++,
+        model: entry.model,
+        in: split.in,
+        out,
+        cached: split.cached,
+        uncached: split.uncached,
+      });
+      numTurns += 1;
+      totalIn += split.in;
+      totalOut += out;
+      totalCached += split.cached;
+      totalUncached += split.uncached;
+    }
+  }
+
+  return {
+    blockSize,
+    nodes,
+    totals: {
+      in: totalIn,
+      out: totalOut,
+      cached: totalCached,
+      uncached: totalUncached,
+      numTurns,
+      numSubagentGroups,
+    },
+  };
+}
+
+// ── Distribution binning (for the dataset-detail cards) ──────────────────────
+
+export interface HistogramBin {
+  x0: number;
+  x1: number;
+  count: number;
+}
+
+/** Linear-width histogram over [0, max] (values < 0 land in bin 0). Empty input → []. */
+export function linearHistogram(values: readonly number[], bins = 40): HistogramBin[] {
+  if (values.length === 0) return [];
+  // Fold, not Math.max(...values): spreading a very large array can throw RangeError.
+  const max = values.reduce((m, v) => (v > m ? v : m), -Infinity);
+  if (max <= 0) return [{ x0: 0, x1: 1, count: values.length }];
+  const width = max / bins;
+  const out: HistogramBin[] = Array.from({ length: bins }, (_, i) => ({
+    x0: i * width,
+    x1: (i + 1) * width,
+    count: 0,
+  }));
+  for (const v of values) {
+    out[Math.min(bins - 1, Math.max(0, Math.floor(v / width)))].count += 1;
+  }
+  return out;
+}
+
+/** Log-width histogram over positive values (values ≤ 0 are dropped). */
+export function logHistogram(values: readonly number[], bins = 40): HistogramBin[] {
+  const pos = values.filter((v) => v > 0);
+  if (pos.length === 0) return [];
+  // Folds, not Math.min/max(...pos): spreading a very large array can throw RangeError.
+  const min = pos.reduce((m, v) => (v < m ? v : m), Infinity);
+  const max = pos.reduce((m, v) => (v > m ? v : m), -Infinity);
+  const lo = Math.log10(min);
+  const hi = Math.log10(max);
+  if (hi <= lo) return [{ x0: min, x1: max <= min ? min * 10 : max, count: pos.length }];
+  const width = (hi - lo) / bins;
+  const out: HistogramBin[] = Array.from({ length: bins }, (_, i) => ({
+    x0: 10 ** (lo + i * width),
+    x1: 10 ** (lo + (i + 1) * width),
+    count: 0,
+  }));
+  for (const v of pos) {
+    out[Math.min(bins - 1, Math.max(0, Math.floor((Math.log10(v) - lo) / width)))].count += 1;
+  }
+  return out;
+}

From 9fbc7160057f60945adaa4bf3bc98b645f0c25f2 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 15:54:11 -0500
Subject: [PATCH 76/96] feat(datasets): HF cc-traces-weka ingest script

Pages the HF datasets-server rows API (adaptive page length for the ~3.5MB
rows), builds the flamegraph structure + cached-prefix split per conversation,
accumulates dataset-level distributions (input/output length, turns/conv,
subagent fan-out, cached fraction) into datasets.chart_data, and upserts
datasets + dataset_conversations. DATABASE_WRITE_URL must be provided. Verified
the cached split against a hand computation on raw hash_ids.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/db/src/ingest-weka-dataset.ts | 386 +++++++++++++++++++++++++
 1 file changed, 386 insertions(+)
 create mode 100644 packages/db/src/ingest-weka-dataset.ts

diff --git a/packages/db/src/ingest-weka-dataset.ts b/packages/db/src/ingest-weka-dataset.ts
new file mode 100644
index 00000000..4ef5328e
--- /dev/null
+++ b/packages/db/src/ingest-weka-dataset.ts
@@ -0,0 +1,386 @@
+/**
+ * Ingest a HuggingFace cc-traces-weka dataset into the `datasets` +
+ * `dataset_conversations` tables that back the /datasets area.
+ *
+ * Public dataset, no token needed — fetched via the HF datasets-server rows API
+ * (rows are large, ~3.5 MB each, so we page in small chunks with adaptive
+ * backoff). Per conversation we build a flamegraph-ready `structure` (turns +
+ * subagent groups, input split into cached-prefix vs uncached) and accumulate
+ * dataset-level distributions for the detail cards. Raw hash_ids are discarded
+ * after the cached/uncached split is computed.
+ *
+ * Usage (DATABASE_WRITE_URL must be provided — never hardcoded):
+ *   DATABASE_WRITE_URL='postgres://…' pnpm exec tsx src/ingest-weka-dataset.ts \
+ *     semianalysisai/cc-traces-weka-062126 [--label "…"] [--variant full|256k] \
+ *     [--description "…"] [--limit N]
+ *
+ * Upsert: re-running replaces the dataset's rows (delete + re-insert).
+ * Remember to purge the API cache afterwards (POST /api/v1/invalidate).
+ */
+
+import { createAdminSql } from './etl/db-utils';
+import { hasNoSslFlag } from './cli-utils';
+import {
+  buildConversationStructure,
+  linearHistogram,
+  logHistogram,
+  type ConversationStructure,
+  type RawWekaConversation,
+  type TurnNode,
+} from './etl/weka-structure';
+
+const ROWS_API = 'https://datasets-server.huggingface.co/rows';
+const INFO_API = 'https://datasets-server.huggingface.co/info';
+
+interface CliArgs {
+  dataset: string;
+  label?: string;
+  variant?: string;
+  description?: string;
+  limit?: number;
+}
+
+function parseArgs(): CliArgs {
+  const argv = process.argv.slice(2);
+  const dataset = argv.find((a) => !a.startsWith('--'));
+  const getFlag = (name: string): string | undefined => {
+    const i = argv.indexOf(`--${name}`);
+    return i !== -1 && i + 1 < argv.length ? argv[i + 1] : undefined;
+  };
+  // Validate --limit now: a NaN/fractional/non-positive cap breaks paging after rows are deleted.
+  const limit = argv.includes('--limit') ? Number(getFlag('limit')) : undefined;
+  if (!dataset || (limit !== undefined && !(Number.isInteger(limit) && limit > 0))) {
+    console.error(
+      'Usage: tsx src/ingest-weka-dataset.ts <hf-dataset-id> [--label …] [--variant full|256k] [--description …] [--limit N]',
+    );
+    process.exit(1);
+  }
+  return {
+    dataset,
+    label: getFlag('label'),
+    variant: getFlag('variant'),
+    description: getFlag('description'),
+    limit,
+  };
+}
+
+async function fetchJson(url: string): Promise<unknown> {
+  const res = await fetch(url);
+  if (!res.ok) {
+    throw new Error(`${res.status} ${res.statusText} for ${url}`);
+  }
+  return res.json();
+}
+
+/** Row count of the `default` config's `train` split from the HF info API (0 if absent). */
+async function getRowCount(dataset: string): Promise<number> {
+  const info = (await fetchJson(`${INFO_API}?dataset=${encodeURIComponent(dataset)}`)) as {
+    dataset_info?: Record<string, { splits?: Record<string, { num_examples?: number }> }>;
+  };
+  const examples = info.dataset_info?.['default']?.splits?.['train']?.num_examples;
+  return typeof examples === 'number' ? examples : 0;
+}
+
+/** Page through rows with adaptive length (halve on "too big"/error). */
+async function* iterRows(
+  dataset: string,
+  total: number,
+  limit?: number,
+): AsyncGenerator<RawWekaConversation> {
+  const cap = limit ? Math.min(limit, total) : total;
+  let offset = 0;
+  let length = 5; // ~18 MB/page at ~3.5 MB/row; backs off on failure
+  while (offset < cap) {
+    const want = Math.min(length, cap - offset);
+    const url = `${ROWS_API}?dataset=${encodeURIComponent(dataset)}&config=default&split=train&offset=${offset}&length=${want}`;
+    let payload: { rows?: { row: RawWekaConversation }[] };
+    try {
+      payload = (await fetchJson(url)) as { rows?: { row: RawWekaConversation }[] };
+    } catch (error) {
+      if (want > 1) {
+        length = Math.max(1, Math.floor(want / 2));
+        console.warn(
+          `  page @${offset} (len ${want}) failed (${String(error)}); retrying with len ${length}`,
+        );
+        continue;
+      }
+      throw error;
+    }
+    const rows = payload.rows ?? [];
+    if (rows.length === 0) break;
+    for (const r of rows) yield r.row;
+    offset += rows.length;
+    process.stdout.write(`\r  fetched ${Math.min(offset, cap)}/${cap} conversations`);
+  }
+  process.stdout.write('\n');
+}
+
+interface Accumulator {
+  inputPerTurn: number[]; // effective input tokens, every turn (incl. subagent children)
+  outputPerTurn: number[];
+  cachedFractionPerTurn: number[]; // cached/in, for turns with in>0
+  turnsPerConv: number[]; // main (top-level) turns
+  subagentGroupsPerConv: number[];
+  subagentTurnsPerGroup: number[];
+  totalIn: number;
+  totalOut: number;
+  totalCached: number;
+  mainTurns: number;
+  subagentGroups: number;
+  subagentTurns: number;
+  modelCounts: Record<string, number>;
+}
+
+function newAccumulator(): Accumulator {
+  return {
+    inputPerTurn: [],
+    outputPerTurn: [],
+    cachedFractionPerTurn: [],
+    turnsPerConv: [],
+    subagentGroupsPerConv: [],
+    subagentTurnsPerGroup: [],
+    totalIn: 0,
+    totalOut: 0,
+    totalCached: 0,
+    mainTurns: 0,
+    subagentGroups: 0,
+    subagentTurns: 0,
+    modelCounts: {},
+  };
+}
+
+function recordTurn(acc: Accumulator, t: TurnNode): void {
+  acc.inputPerTurn.push(t.in);
+  acc.outputPerTurn.push(t.out);
+  if (t.in > 0) acc.cachedFractionPerTurn.push(t.cached / t.in);
+  if (t.model) acc.modelCounts[t.model] = (acc.modelCounts[t.model] ?? 0) + 1;
+}
+
+function accumulate(acc: Accumulator, s: ConversationStructure): void {
+  const { totals, nodes } = s;
+  acc.totalIn += totals.in;
+  acc.totalOut += totals.out;
+  acc.totalCached += totals.cached;
+  acc.mainTurns += totals.numTurns;
+  acc.subagentGroups += totals.numSubagentGroups;
+  acc.turnsPerConv.push(totals.numTurns);
+  acc.subagentGroupsPerConv.push(totals.numSubagentGroups);
+  for (const node of nodes) {
+    const turns = node.kind === 'turn' ? [node] : node.children;
+    if (node.kind === 'subagent') {
+      acc.subagentTurnsPerGroup.push(turns.length);
+      acc.subagentTurns += turns.length;
+    }
+    turns.forEach((turn) => recordTurn(acc, turn));
+  }
+}
+
+interface NumberSummary {
+  count: number;
+  min: number;
+  max: number;
+  mean: number;
+  median: number;
+  p90: number;
+}
+
+/** Count/min/max/mean/median/p90 of `values`; every field is 0 for empty input. */
+function summarize(values: number[]): NumberSummary {
+  const n = values.length;
+  if (n === 0) return { count: 0, min: 0, max: 0, mean: 0, median: 0, p90: 0 };
+  const ascending = values.toSorted((a, b) => a - b);
+  // Quantile by position floor(p·(n−1)): p=0 → first, p=1 → last element. Plain
+  // indexing (not `.at()`) keeps the element type `number` without `| undefined`.
+  const rank = (p: number) => Math.min(n - 1, Math.max(0, Math.floor(p * (n - 1))));
+  let total = 0;
+  for (const v of ascending) total += v;
+  return {
+    count: n,
+    min: ascending[rank(0)],
+    max: ascending[rank(1)],
+    mean: total / n,
+    median: ascending[rank(0.5)],
+    p90: ascending[rank(0.9)],
+  };
+}
+
+function buildChartData(acc: Accumulator) {
+  return {
+    version: 1,
+    inputTokensPerTurn: {
+      bins: logHistogram(acc.inputPerTurn),
+      stats: summarize(acc.inputPerTurn),
+    },
+    outputTokensPerTurn: {
+      bins: logHistogram(acc.outputPerTurn),
+      stats: summarize(acc.outputPerTurn),
+    },
+    turnsPerConversation: {
+      bins: linearHistogram(acc.turnsPerConv),
+      stats: summarize(acc.turnsPerConv),
+    },
+    subagentGroupsPerConversation: {
+      bins: linearHistogram(acc.subagentGroupsPerConv),
+      stats: summarize(acc.subagentGroupsPerConv),
+    },
+    cachedFractionPerTurn: {
+      bins: linearHistogram(acc.cachedFractionPerTurn, 20),
+      stats: summarize(acc.cachedFractionPerTurn),
+    },
+  };
+}
+
+function buildSummary(acc: Accumulator, blockSize: number, hashIdScope: string | null) {
+  const { totalIn, totalOut, totalCached, mainTurns, subagentGroups, subagentTurns } = acc;
+  return {
+    version: 1,
+    blockSize,
+    hashIdScope,
+    totalIn,
+    totalOut,
+    totalCached,
+    cachedPct: totalIn > 0 ? totalCached / totalIn : 0, // 0 rather than NaN when no input
+    mainTurns,
+    subagentGroups,
+    subagentTurns,
+    modelMix: acc.modelCounts,
+  };
+}
+
+function slugFromDataset(dataset: string): string {
+  return dataset.slice(dataset.indexOf('/') + 1); // no '/' → indexOf -1 → slice(0) → whole id
+}
+
+function inferVariant(slug: string): string {
+  // The '-256k' suffix takes precedence over a 'no-subagent' substring.
+  if (slug.endsWith('-256k')) return '256k';
+  return slug.includes('no-subagent') ? 'no-subagents' : 'full';
+}
+
+function defaultLabel(slug: string): string {
+  // e.g. cc-traces-weka-062126 → "CC Traces Weka 062126"
+  const words: string[] = [];
+  for (const part of slug.split('-')) {
+    words.push(/^\d+$/u.test(part) ? part : part.toUpperCase());
+  }
+  return words.join(' ').replace(/^CC TRACES WEKA/u, 'CC Traces Weka');
+}
+
+/** CLI entry: rebuild one dataset's conversation rows from HF, then store its summary/charts. */
+async function main(): Promise<void> {
+  const args = parseArgs();
+  const slug = slugFromDataset(args.dataset);
+  const variant = args.variant ?? inferVariant(slug);
+  const label = args.label ?? defaultLabel(slug);
+  const hfUrl = `https://huggingface.co/datasets/${args.dataset}`;
+
+  console.log(`=== ingest-weka-dataset: ${args.dataset} ===`);
+  console.log(`  slug=${slug} variant=${variant} label="${label}"`);
+  const total = await getRowCount(args.dataset);
+  console.log(`  ${total} conversations on HF`);
+  // Stop before touching the DB: with 0 rows the delete below would only wipe the dataset.
+  if (total <= 0) throw new Error(`HF reports no rows for ${args.dataset}; nothing ingested`);
+
+  const sql = createAdminSql({ noSsl: hasNoSslFlag(), max: 1 });
+  const acc = newAccumulator();
+  let blockSize = 64;
+  let hashIdScope: string | null = null;
+
+  // Buffer the per-conversation rows; flush in batches to keep memory bounded.
+  interface ConvRow {
+    dataset_id: string;
+    conv_id: string;
+    models: string[];
+    num_turns: number;
+    num_subagent_groups: number;
+    total_in: number;
+    total_out: number;
+    total_cached: number;
+    structure: ConversationStructure;
+  }
+  const pending: ConvRow[] = [];
+
+  try {
+    // Upsert the dataset shell first (FK target). Counts/summary filled at the end.
+    await sql`
+      insert into datasets (id, slug, label, variant, description, hf_url, license)
+      values (${args.dataset}, ${slug}, ${label}, ${variant}, ${args.description ?? null}, ${hfUrl}, 'apache-2.0')
+      on conflict (id) do update set
+        slug = excluded.slug, label = excluded.label, variant = excluded.variant,
+        description = coalesce(excluded.description, datasets.description),
+        hf_url = excluded.hf_url, license = excluded.license, ingested_at = now()
+    `;
+    // Clear prior conversations for a clean re-ingest.
+    await sql`delete from dataset_conversations where dataset_id = ${args.dataset}`;
+
+    const flush = async () => {
+      if (pending.length === 0) return;
+      // postgres.js row-helper insert: serializes `structure` to jsonb and `models` to text[]
+      // per row (unnest can't carry a text[] column — a 2D array would flatten into scalar rows).
+      const rows = pending.map((p) => ({
+        dataset_id: args.dataset,
+        conv_id: p.conv_id,
+        models: p.models,
+        num_turns: p.num_turns,
+        num_subagent_groups: p.num_subagent_groups,
+        total_in: p.total_in,
+        total_out: p.total_out,
+        total_cached: p.total_cached,
+        structure: sql.json(p.structure as unknown as Parameters<typeof sql.json>[0]),
+      }));
+      await sql`insert into dataset_conversations ${sql(rows)}`;
+      pending.length = 0;
+    };
+
+    let count = 0;
+    for await (const conv of iterRows(args.dataset, total, args.limit)) {
+      blockSize = conv.block_size ?? blockSize;
+      hashIdScope = conv.hash_id_scope ?? hashIdScope;
+      const structure = buildConversationStructure(conv);
+      accumulate(acc, structure);
+      pending.push({
+        dataset_id: args.dataset,
+        conv_id: conv.id,
+        models: Array.isArray(conv.models) ? conv.models : [],
+        num_turns: structure.totals.numTurns,
+        num_subagent_groups: structure.totals.numSubagentGroups,
+        total_in: structure.totals.in,
+        total_out: structure.totals.out,
+        total_cached: structure.totals.cached,
+        structure,
+      });
+      count += 1;
+      if (pending.length >= 25) await flush();
+    }
+    await flush();
+
+    const summary = buildSummary(acc, blockSize, hashIdScope);
+    const chartData = buildChartData(acc);
+    await sql`
+      update datasets set
+        conversation_count = ${count},
+        summary = ${sql.json(summary as unknown as Parameters<typeof sql.json>[0])},
+        chart_data = ${sql.json(chartData as unknown as Parameters<typeof sql.json>[0])},
+        ingested_at = now()
+      where id = ${args.dataset}
+    `;
+
+    console.log(`\n  ingested ${count} conversations`);
+    console.log(
+      `  main turns=${acc.mainTurns} subagent groups=${acc.subagentGroups} subagent turns=${acc.subagentTurns}`,
+    );
+    console.log(
+      `  totals: in=${acc.totalIn.toLocaleString()} out=${acc.totalOut.toLocaleString()} ` +
+        `cached=${acc.totalCached.toLocaleString()} (${(summary.cachedPct * 100).toFixed(1)}% of input)`,
+    );
+    console.log('\n=== done ===');
+    console.log('  Purge the API cache: POST /api/v1/invalidate');
+  } finally {
+    await sql.end({ timeout: 5 });
+  }
+}
+
+main().catch((error) => {
+  console.error(error);
+  process.exit(1);
+});

From b6be5a8d06f6f0ff118d8eee2d8c4a509d8be3ee Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 16:06:25 -0500
Subject: [PATCH 77/96] fix(datasets): handle HF 429 rate-limiting in ingest

Retry 429/5xx with exponential backoff (honoring Retry-After) instead of
shrinking page size, plus a 400ms inter-page delay. Lets the full 393-row
ingest complete without tripping the datasets-server rate limit.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/db/src/ingest-weka-dataset.ts | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/packages/db/src/ingest-weka-dataset.ts b/packages/db/src/ingest-weka-dataset.ts
index 4ef5328e..22069419 100644
--- a/packages/db/src/ingest-weka-dataset.ts
+++ b/packages/db/src/ingest-weka-dataset.ts
@@ -64,8 +64,30 @@ function parseArgs(): CliArgs {
   };
 }
 
-async function fetchJson(url: string): Promise<unknown> {
+const sleep = (ms: number) =>
+  new Promise<void>((resolve) => {
+    setTimeout(resolve, ms);
+  });
+
+/**
+ * Fetch JSON, transparently retrying on HF rate-limiting (429) and transient
+ * 5xx with exponential backoff. Honors a Retry-After header when present.
+ */
+async function fetchJson(url: string, attempt = 0): Promise<unknown> {
   const res = await fetch(url);
+  if (res.status === 429 || res.status >= 500) {
+    if (attempt >= 6) {
+      throw new Error(`${res.status} ${res.statusText} after ${attempt} retries for ${url}`);
+    }
+    const retryAfter = Number(res.headers.get('retry-after'));
+    const waitMs =
+      Number.isFinite(retryAfter) && retryAfter > 0 ? retryAfter * 1000 : 2000 * 2 ** attempt;
+    console.warn(
+      `  ${res.status} ${res.statusText}; waiting ${Math.round(waitMs / 1000)}s (attempt ${attempt + 1})`,
+    );
+    await sleep(waitMs);
+    return fetchJson(url, attempt + 1);
+  }
   if (!res.ok) {
     throw new Error(`${res.status} ${res.statusText} for ${url}`);
   }
@@ -111,6 +133,7 @@ async function* iterRows(
     for (const r of rows) yield r.row;
     offset += rows.length;
     process.stdout.write(`\r  fetched ${Math.min(offset, cap)}/${cap} conversations`);
+    if (offset < cap) await sleep(400); // be polite to the HF datasets-server
   }
   process.stdout.write('\n');
 }

From a376b5ba826463d447dcade4c5cc990ce7f22143 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 16:10:08 -0500
Subject: [PATCH 78/96] feat(datasets): DB queries, API routes, and React Query
 hooks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

queries/datasets.ts: listDatasets, getDataset (incl chart_data),
listConversations (paginated, searchable, 4 sort modes — separate per-sort
queries since the neon HTTP driver can't compose order-by fragments),
getConversation (flamegraph structure). Routes under /api/v1/datasets/* with
cachedQuery + gzip cachedJson. Hooks use-datasets.ts mirror the existing
benchmark-siblings hook style. Verified all four routes against the live branch.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../[slug]/conversations/[convId]/route.ts    |  33 +++
 .../v1/datasets/[slug]/conversations/route.ts |  53 +++++
 .../src/app/api/v1/datasets/[slug]/route.ts   |  29 +++
 packages/app/src/app/api/v1/datasets/route.ts |  24 ++
 packages/app/src/hooks/api/use-datasets.ts    | 183 +++++++++++++++
 packages/db/src/queries/datasets.ts           | 209 ++++++++++++++++++
 6 files changed, 531 insertions(+)
 create mode 100644 packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts
 create mode 100644 packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts
 create mode 100644 packages/app/src/app/api/v1/datasets/[slug]/route.ts
 create mode 100644 packages/app/src/app/api/v1/datasets/route.ts
 create mode 100644 packages/app/src/hooks/api/use-datasets.ts
 create mode 100644 packages/db/src/queries/datasets.ts

diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts
new file mode 100644
index 00000000..84cc15e3
--- /dev/null
+++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/[convId]/route.ts
@@ -0,0 +1,33 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  getConversation,
+  type ConversationDetail,
+} from '@semianalysisai/inferencex-db/queries/datasets';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedConversation = cachedQuery(
+  (slug: string, convId: string): Promise<ConversationDetail | null> =>
+    getConversation(getDb(), slug, convId),
+  'dataset-conversation',
+);
+
+/** GET /api/v1/datasets/[slug]/conversations/[convId] — flamegraph structure. */
+export async function GET(
+  _request: NextRequest,
+  { params }: { params: Promise<{ slug: string; convId: string }> },
+) {
+  const { slug, convId } = await params;
+  try {
+    const data = await getCachedConversation(slug, decodeURIComponent(convId));
+    if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
+    return cachedJson(data);
+  } catch (error) {
+    console.error('Error fetching dataset conversation:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts
new file mode 100644
index 00000000..62b9e5b7
--- /dev/null
+++ b/packages/app/src/app/api/v1/datasets/[slug]/conversations/route.ts
@@ -0,0 +1,53 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import {
+  listConversations,
+  type ConversationList,
+  type ListConversationsOpts,
+} from '@semianalysisai/inferencex-db/queries/datasets';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const SORTS = new Set(['tokens', 'turns', 'subagents', 'id']);
+
+const getCachedConversations = cachedQuery(
+  (
+    slug: string,
+    search: string,
+    limit: number,
+    offset: number,
+    sort: string,
+  ): Promise<ConversationList | null> =>
+    listConversations(getDb(), slug, {
+      search: search || undefined,
+      limit,
+      offset,
+      sort: sort as ListConversationsOpts['sort'],
+    }),
+  'dataset-conversations',
+);
+
+/**
+ * GET /api/v1/datasets/[slug]/conversations?search=&limit=&offset=&sort=
+ * Paginated conversation list (counts only, no structure); limit/offset coerced to whole numbers.
+ */
+export async function GET(request: NextRequest, { params }: { params: Promise<{ slug: string }> }) {
+  const { slug } = await params;
+  const sp = request.nextUrl.searchParams;
+  const limit = Math.min(200, Math.max(1, Math.trunc(Number(sp.get('limit'))) || 50));
+  const offsetRaw = Math.trunc(Number(sp.get('offset')));
+  const offset = Number.isFinite(offsetRaw) && offsetRaw > 0 ? offsetRaw : 0;
+  const sortParam = sp.get('sort') ?? 'tokens';
+  const sort = SORTS.has(sortParam) ? sortParam : 'tokens';
+  try {
+    const data = await getCachedConversations(slug, sp.get('search') ?? '', limit, offset, sort);
+    if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
+    return cachedJson(data);
+  } catch (error) {
+    console.error('Error fetching dataset conversations:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/app/api/v1/datasets/[slug]/route.ts b/packages/app/src/app/api/v1/datasets/[slug]/route.ts
new file mode 100644
index 00000000..9e4af580
--- /dev/null
+++ b/packages/app/src/app/api/v1/datasets/[slug]/route.ts
@@ -0,0 +1,29 @@
+import { type NextRequest, NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import { getDataset, type DatasetDetail } from '@semianalysisai/inferencex-db/queries/datasets';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedDataset = cachedQuery(
+  (slug: string): Promise<DatasetDetail | null> => getDataset(getDb(), slug),
+  'dataset',
+);
+
+/** GET /api/v1/datasets/[slug] — one dataset incl. precomputed chart_data. */
+export async function GET(
+  _request: NextRequest,
+  { params }: { params: Promise<{ slug: string }> },
+) {
+  const { slug } = await params;
+  try {
+    const data = await getCachedDataset(slug);
+    if (!data) return NextResponse.json({ error: 'Not found' }, { status: 404 });
+    return cachedJson(data);
+  } catch (error) {
+    console.error('Error fetching dataset:', error);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/app/api/v1/datasets/route.ts b/packages/app/src/app/api/v1/datasets/route.ts
new file mode 100644
index 00000000..f0acca3c
--- /dev/null
+++ b/packages/app/src/app/api/v1/datasets/route.ts
@@ -0,0 +1,24 @@
+import { NextResponse } from 'next/server';
+
+import { getDb } from '@semianalysisai/inferencex-db/connection';
+import { listDatasets, type DatasetRecord } from '@semianalysisai/inferencex-db/queries/datasets';
+
+import { cachedJson, cachedQuery } from '@/lib/api-cache';
+
+export const dynamic = 'force-dynamic';
+
+const getCachedDatasets = cachedQuery(
+  (): Promise<DatasetRecord[]> => listDatasets(getDb()),
+  'datasets',
+);
+
+/** GET /api/v1/datasets — every ingested dataset row, as shown on the registry cards. */
+export async function GET() {
+  try {
+    // Awaited inside the try so a rejected query becomes the 500 response below.
+    return cachedJson(await getCachedDatasets());
+  } catch (err) {
+    console.error('Error fetching datasets:', err);
+    return NextResponse.json({ error: 'Internal server error' }, { status: 500 });
+  }
+}
diff --git a/packages/app/src/hooks/api/use-datasets.ts b/packages/app/src/hooks/api/use-datasets.ts
new file mode 100644
index 00000000..3ce61a85
--- /dev/null
+++ b/packages/app/src/hooks/api/use-datasets.ts
@@ -0,0 +1,183 @@
+import { useQuery, keepPreviousData } from '@tanstack/react-query';
+
+import type {
+  ConversationStructure,
+  StructureNode,
+} from '@semianalysisai/inferencex-db/etl/weka-structure';
+
+export type { ConversationStructure, StructureNode };
+
+export interface DatasetSummary {
+  blockSize?: number;
+  hashIdScope?: string | null;
+  totalIn?: number;
+  totalOut?: number;
+  totalCached?: number;
+  cachedPct?: number;
+  mainTurns?: number;
+  subagentGroups?: number;
+  subagentTurns?: number;
+  modelMix?: Record<string, number>;
+  [k: string]: unknown;
+}
+
+export interface DatasetRecord {
+  id: string;
+  slug: string;
+  label: string;
+  variant: string;
+  description: string | null;
+  hf_url: string | null;
+  license: string | null;
+  conversation_count: number;
+  summary: DatasetSummary;
+  ingested_at: string;
+}
+
+export interface HistogramBin {
+  x0: number;
+  x1: number;
+  count: number;
+}
+
+export interface DistributionStats {
+  count: number;
+  min: number;
+  max: number;
+  mean: number;
+  median: number;
+  p90: number;
+}
+
+export interface Distribution {
+  bins: HistogramBin[];
+  stats: DistributionStats;
+}
+
+export interface DatasetChartData {
+  version?: number;
+  inputTokensPerTurn?: Distribution;
+  outputTokensPerTurn?: Distribution;
+  turnsPerConversation?: Distribution;
+  subagentGroupsPerConversation?: Distribution;
+  cachedFractionPerTurn?: Distribution;
+  [k: string]: unknown;
+}
+
+export interface DatasetDetail extends DatasetRecord {
+  chart_data: DatasetChartData;
+}
+
+export interface ConversationListItem {
+  conv_id: string;
+  models: string[];
+  num_turns: number;
+  num_subagent_groups: number;
+  total_in: number;
+  total_out: number;
+  total_cached: number;
+}
+
+export interface ConversationList {
+  total: number;
+  items: ConversationListItem[];
+}
+
+export interface ConversationDetail {
+  conv_id: string;
+  models: string[];
+  num_turns: number;
+  num_subagent_groups: number;
+  total_in: number;
+  total_out: number;
+  total_cached: number;
+  structure: ConversationStructure;
+}
+
+export type ConversationSort = 'tokens' | 'turns' | 'subagents' | 'id';
+
+const DAY = 24 * 60 * 60 * 1000;
+
+/** Query hook for the dataset registry: fetches every dataset; staleTime is one day. */
+export function useDatasets() {
+  return useQuery({
+    queryKey: ['datasets'] as const,
+    staleTime: DAY,
+    queryFn: async ({ signal }) => {
+      const response = await fetch('/api/v1/datasets', { signal });
+      if (!response.ok) throw new Error(`datasets ${response.status}`);
+      return (await response.json()) as DatasetRecord[];
+    },
+  });
+}
+
+/** Query hook for one dataset incl. chart_data; null on a 404, disabled without a slug. */
+export function useDataset(slug: string | null) {
+  return useQuery({
+    queryKey: ['dataset', slug] as const,
+    enabled: Boolean(slug),
+    staleTime: DAY,
+    queryFn: async ({ signal }) => {
+      const response = await fetch(`/api/v1/datasets/${slug}`, { signal });
+      if (response.status === 404) return null;
+      if (!response.ok) throw new Error(`dataset ${response.status}`);
+      return (await response.json()) as DatasetDetail;
+    },
+  });
+}
+
+export interface UseConversationsArgs {
+  slug: string | null;
+  search?: string;
+  limit?: number;
+  offset?: number;
+  sort?: ConversationSort;
+}
+
+/** Query hook for one page of a dataset's conversations (counts only); null on a 404. */
+export function useDatasetConversations({
+  slug,
+  search = '',
+  limit = 50,
+  offset = 0,
+  sort = 'tokens',
+}: UseConversationsArgs) {
+  return useQuery({
+    queryKey: ['dataset-conversations', slug, search, limit, offset, sort] as const,
+    enabled: Boolean(slug),
+    // Previous result is used as placeholder data while a new page/sort/search loads.
+    placeholderData: keepPreviousData,
+    staleTime: DAY,
+    queryFn: async ({ signal }) => {
+      const query = new URLSearchParams({
+        limit: String(limit),
+        offset: String(offset),
+        sort,
+      });
+      if (search) query.set('search', search);
+      const url = `/api/v1/datasets/${slug}/conversations?${query.toString()}`;
+      const response = await fetch(url, { signal });
+      if (response.status === 404) return null;
+      if (!response.ok) throw new Error(`dataset-conversations ${response.status}`);
+      return (await response.json()) as ConversationList;
+    },
+  });
+}
+
+/** Query hook for one conversation (counts + flamegraph structure); null on a 404. */
+export function useDatasetConversation(slug: string | null, convId: string | null) {
+  return useQuery({
+    queryKey: ['dataset-conversation', slug, convId] as const,
+    enabled: Boolean(slug && convId),
+    staleTime: DAY,
+    queryFn: async ({ signal }) => {
+      // The id is percent-encoded so it always occupies a single path segment.
+      const encodedId = encodeURIComponent(convId ?? '');
+      const url = `/api/v1/datasets/${slug}/conversations/${encodedId}`;
+      const response = await fetch(url, { signal });
+      if (response.status === 404) return null;
+      if (!response.ok) throw new Error(`dataset-conversation ${response.status}`);
+      return (await response.json()) as ConversationDetail;
+    },
+  });
+}
diff --git a/packages/db/src/queries/datasets.ts b/packages/db/src/queries/datasets.ts
new file mode 100644
index 00000000..89c6ca5e
--- /dev/null
+++ b/packages/db/src/queries/datasets.ts
@@ -0,0 +1,209 @@
+/**
+ * Read queries for the agentic-benchmark source datasets (the HF cc-traces-weka
+ * corpora ingested by ingest-weka-dataset.ts). Back the /datasets area:
+ *   - listDatasets      → registry cards (no per-conversation rows)
+ *   - getDataset        → one dataset incl. precomputed chart_data
+ *   - listConversations → paginated conversation list (counts only, no structure)
+ *   - getConversation   → one conversation's flamegraph structure
+ */
+
+import type { DbClient } from '../connection.js';
+import type { ConversationStructure } from '../etl/weka-structure.js';
+
+export interface DatasetSummary {
+  blockSize?: number;
+  hashIdScope?: string | null;
+  totalIn?: number;
+  totalOut?: number;
+  totalCached?: number;
+  cachedPct?: number;
+  mainTurns?: number;
+  subagentGroups?: number;
+  subagentTurns?: number;
+  modelMix?: Record<string, number>;
+  [k: string]: unknown;
+}
+
+export interface DatasetRecord {
+  id: string;
+  slug: string;
+  label: string;
+  variant: string;
+  description: string | null;
+  hf_url: string | null;
+  license: string | null;
+  conversation_count: number;
+  summary: DatasetSummary;
+  ingested_at: string;
+}
+
+export interface DatasetDetail extends DatasetRecord {
+  /** Precomputed distribution bins + stats keyed by metric (see ingest buildChartData). */
+  chart_data: Record<string, unknown>;
+}
+
+export interface ConversationListItem {
+  conv_id: string;
+  models: string[];
+  num_turns: number;
+  num_subagent_groups: number;
+  total_in: number;
+  total_out: number;
+  total_cached: number;
+}
+
+export interface ConversationList {
+  total: number;
+  items: ConversationListItem[];
+}
+
+export interface ConversationDetail {
+  conv_id: string;
+  models: string[];
+  num_turns: number;
+  num_subagent_groups: number;
+  total_in: number;
+  total_out: number;
+  total_cached: number;
+  structure: ConversationStructure;
+}
+
+/** All ingested datasets, newest first (ties by slug). Excludes the (large) chart_data blob. */
+export async function listDatasets(sql: DbClient): Promise<DatasetRecord[]> {
+  const rows = (await sql`
+    select id, slug, label, variant, description, hf_url, license,
+           conversation_count, summary, ingested_at::text
+    from datasets
+    order by ingested_at desc, slug asc
+  `) as unknown as DatasetRecord[];
+  return rows.map((r) => ({ ...r, conversation_count: Number(r.conversation_count) }));
+}
+
+/** One dataset by slug, incl. chart_data; conversation_count goes through Number(). Null if not found. */
+export async function getDataset(sql: DbClient, slug: string): Promise<DatasetDetail | null> {
+  const rows = (await sql`
+    select id, slug, label, variant, description, hf_url, license,
+           conversation_count, summary, chart_data, ingested_at::text
+    from datasets
+    where slug = ${slug}
+  `) as unknown as DatasetDetail[];
+  const row = rows[0]; // only the first matching row is used
+  if (!row) return null;
+  return { ...row, conversation_count: Number(row.conversation_count) };
+}
+
+export interface ListConversationsOpts {
+  search?: string; // trimmed, then matched case-insensitively against conv_id
+  limit?: number; // default 50, clamped to 1..MAX_LIMIT
+  offset?: number; // default 0, floored at 0
+  /** 'tokens' = total_in desc, 'turns' = num_turns desc, 'subagents' = groups desc, 'id' = conv_id asc. */
+  sort?: 'tokens' | 'turns' | 'subagents' | 'id';
+}
+
+const MAX_LIMIT = 200;
+
+/**
+ * Paginated conversation list for a dataset (by slug); null if the slug is unknown.
+ * Counts only — the `structure` blob is fetched separately by getConversation so the
+ * list stays light. `search` is a literal, case-insensitive substring of conv_id.
+ */
+export async function listConversations(
+  sql: DbClient,
+  slug: string,
+  opts: ListConversationsOpts = {},
+): Promise<ConversationList | null> {
+  const ds = (await sql`select id from datasets where slug = ${slug}`) as unknown as {
+    id: string;
+  }[];
+  const datasetId = ds[0]?.id;
+  if (!datasetId) return null;
+
+  const limit = Math.min(MAX_LIMIT, Math.max(1, opts.limit ?? 50));
+  const offset = Math.max(0, opts.offset ?? 0);
+  const search = opts.search?.trim();
+  // Escape LIKE metacharacters (\, %, _) so the term only matches as literal text.
+  const like = search ? `%${search.replace(/[\\%_]/g, '\\$&')}%` : null;
+
+  const totalRows = (await sql`
+    select count(*)::int as n
+    from dataset_conversations
+    where dataset_id = ${datasetId}
+      and (${like}::text is null or conv_id ilike ${like})
+  `) as unknown as { n: number }[];
+  const total = totalRows[0]?.n ?? 0;
+
+  // One literal query per sort: the neon HTTP driver can't splice a nested sql ORDER BY
+  // fragment the way postgres.js does. The sort key is an enum, never raw user input.
+  const sort = opts.sort ?? 'tokens';
+  let items: ConversationListItem[];
+  if (sort === 'turns') {
+    items = (await sql`
+      select conv_id, models, num_turns, num_subagent_groups, total_in, total_out, total_cached
+      from dataset_conversations
+      where dataset_id = ${datasetId} and (${like}::text is null or conv_id ilike ${like})
+      order by num_turns desc, conv_id asc
+      limit ${limit} offset ${offset}
+    `) as unknown as ConversationListItem[];
+  } else if (sort === 'subagents') {
+    items = (await sql`
+      select conv_id, models, num_turns, num_subagent_groups, total_in, total_out, total_cached
+      from dataset_conversations
+      where dataset_id = ${datasetId} and (${like}::text is null or conv_id ilike ${like})
+      order by num_subagent_groups desc, conv_id asc
+      limit ${limit} offset ${offset}
+    `) as unknown as ConversationListItem[];
+  } else if (sort === 'id') {
+    items = (await sql`
+      select conv_id, models, num_turns, num_subagent_groups, total_in, total_out, total_cached
+      from dataset_conversations
+      where dataset_id = ${datasetId} and (${like}::text is null or conv_id ilike ${like})
+      order by conv_id asc
+      limit ${limit} offset ${offset}
+    `) as unknown as ConversationListItem[];
+  } else {
+    items = (await sql`
+      select conv_id, models, num_turns, num_subagent_groups, total_in, total_out, total_cached
+      from dataset_conversations
+      where dataset_id = ${datasetId} and (${like}::text is null or conv_id ilike ${like})
+      order by total_in desc, conv_id asc
+      limit ${limit} offset ${offset}
+    `) as unknown as ConversationListItem[];
+  }
+
+  return {
+    total,
+    items: items.map((r) => ({
+      ...r,
+      num_turns: Number(r.num_turns),
+      num_subagent_groups: Number(r.num_subagent_groups),
+      total_in: Number(r.total_in),
+      total_out: Number(r.total_out),
+      total_cached: Number(r.total_cached),
+    })),
+  };
+}
+
+/** One conversation's counts plus full flamegraph structure, by slug + conv_id. Null if either is missing. */
+export async function getConversation(
+  sql: DbClient,
+  slug: string,
+  convId: string,
+): Promise<ConversationDetail | null> {
+  const rows = (await sql`
+    select dc.conv_id, dc.models, dc.num_turns, dc.num_subagent_groups,
+           dc.total_in, dc.total_out, dc.total_cached, dc.structure
+    from dataset_conversations dc
+    join datasets d on d.id = dc.dataset_id
+    where d.slug = ${slug} and dc.conv_id = ${convId}
+  `) as unknown as ConversationDetail[];
+  const row = rows[0];
+  if (!row) return null;
+  return {
+    ...row,
+    num_turns: Number(row.num_turns),
+    num_subagent_groups: Number(row.num_subagent_groups),
+    total_in: Number(row.total_in),
+    total_out: Number(row.total_out),
+    total_cached: Number(row.total_cached),
+  };
+}

From 574dfcc8a832fe081167cfd55af586463a29e546 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 16:16:54 -0500
Subject: [PATCH 79/96] feat(datasets): /datasets pages, distribution cards,
 flamegraph, nav
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- /datasets: methodology prose + dataset registry cards (DatasetList)
- /datasets/[slug]: summary stats, model mix, 5 precomputed-histogram
  distribution cards (DistributionCard, log/linear), and a
  searchable/sortable/paginated conversation table
- /datasets/[slug]/conversations/[convId]: per-conversation TraceFlamegraph —
  one bar per turn (cached prefix + uncached input + output), subagent groups
  collapsible (collapsed by default) with expand/collapse-all
- header nav 'Datasets' link
- query-layer test (mock DbClient): not-found paths + numeric coercion

Verified end-to-end against the live branch DB: both datasets list with real
stats, distributions render, flamegraph shows the prefix-reuse signature
(turn 2 fully uncached, later turns mostly cached), expand-all surfaces
subagent subturns. Zero console errors.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../[slug]/conversations/[convId]/page.tsx    |  32 ++
 packages/app/src/app/datasets/[slug]/page.tsx |  32 ++
 packages/app/src/app/datasets/page.tsx        |  99 ++++++
 .../components/datasets/conversation-view.tsx | 101 ++++++
 .../components/datasets/dataset-detail.tsx    | 305 ++++++++++++++++++
 .../src/components/datasets/dataset-list.tsx  |  85 +++++
 .../components/datasets/distribution-card.tsx | 220 +++++++++++++
 .../components/datasets/trace-flamegraph.tsx  | 273 ++++++++++++++++
 packages/app/src/components/header/header.tsx |   6 +
 packages/db/src/queries/datasets.test.ts      | 102 ++++++
 10 files changed, 1255 insertions(+)
 create mode 100644 packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
 create mode 100644 packages/app/src/app/datasets/[slug]/page.tsx
 create mode 100644 packages/app/src/app/datasets/page.tsx
 create mode 100644 packages/app/src/components/datasets/conversation-view.tsx
 create mode 100644 packages/app/src/components/datasets/dataset-detail.tsx
 create mode 100644 packages/app/src/components/datasets/dataset-list.tsx
 create mode 100644 packages/app/src/components/datasets/distribution-card.tsx
 create mode 100644 packages/app/src/components/datasets/trace-flamegraph.tsx
 create mode 100644 packages/db/src/queries/datasets.test.ts

diff --git a/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
new file mode 100644
index 00000000..75702c1b
--- /dev/null
+++ b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
@@ -0,0 +1,32 @@
+import type { Metadata } from 'next';
+
+import { ConversationView } from '@/components/datasets/conversation-view';
+import { SITE_URL } from '@semianalysisai/inferencex-constants';
+
+interface Props {
+  params: Promise<{ slug: string; convId: string }>;
+}
+
+/** Builds noindex metadata for one conversation page; the id is shown decoded and truncated. */
+export async function generateMetadata({ params }: Props): Promise<Metadata> {
+  const { slug, convId } = await params;
+  const short = decodeURIComponent(convId).slice(0, 12); // decoded like the page body does
+  const description = `Per-turn token flamegraph (cached prefix vs uncached input vs output) for conversation ${short} in the ${slug} agentic trace dataset.`;
+  return {
+    title: `Conversation ${short} | ${slug}`,
+    description,
+    alternates: { canonical: `${SITE_URL}/datasets/${slug}/conversations/${convId}` },
+    robots: { index: false }, // per-conversation pages are too numerous to index
+  };
+}
+
+export default async function ConversationPage({ params }: Props) {
+  const { slug, convId } = await params;
+  return (
+    <main className="relative">
+      <div className="container mx-auto px-4 pb-8 lg:px-8">
+        <ConversationView slug={slug} convId={decodeURIComponent(convId)} />
+      </div>
+    </main>
+  );
+}
diff --git a/packages/app/src/app/datasets/[slug]/page.tsx b/packages/app/src/app/datasets/[slug]/page.tsx
new file mode 100644
index 00000000..f32e3fa6
--- /dev/null
+++ b/packages/app/src/app/datasets/[slug]/page.tsx
@@ -0,0 +1,32 @@
+import type { Metadata } from 'next';
+
+import { DatasetDetail } from '@/components/datasets/dataset-detail';
+import { SITE_URL } from '@semianalysisai/inferencex-constants';
+
+interface Props {
+  params: Promise<{ slug: string }>;
+}
+
+export async function generateMetadata({ params }: Props): Promise<Metadata> {
+  const { slug } = await params;
+  const title = `${slug} | Agentic Datasets`;
+  const description = `Distributions, token statistics, and per-conversation flamegraphs for the ${slug} agentic trace dataset.`;
+  return {
+    title,
+    description,
+    alternates: { canonical: `${SITE_URL}/datasets/${slug}` },
+    openGraph: { title: `${title} | InferenceX`, description, url: `${SITE_URL}/datasets/${slug}` },
+    twitter: { title: `${title} | InferenceX`, description },
+  };
+}
+
+export default async function DatasetDetailPage({ params }: Props) {
+  const { slug } = await params;
+  return (
+    <main className="relative">
+      <div className="container mx-auto px-4 pb-8 lg:px-8">
+        <DatasetDetail slug={slug} />
+      </div>
+    </main>
+  );
+}
diff --git a/packages/app/src/app/datasets/page.tsx b/packages/app/src/app/datasets/page.tsx
new file mode 100644
index 00000000..7fe46b93
--- /dev/null
+++ b/packages/app/src/app/datasets/page.tsx
@@ -0,0 +1,99 @@
+import type { Metadata } from 'next';
+
+import { Card } from '@/components/ui/card';
+import { JsonLd } from '@/components/json-ld';
+import { DatasetList } from '@/components/datasets/dataset-list';
+import { SITE_URL } from '@semianalysisai/inferencex-constants';
+
+const DESCRIPTION =
+  'The real Claude Code agentic conversation traces that the InferenceX agentic benchmark replays — methodology, distributions, and per-conversation flamegraphs.';
+
+export const metadata: Metadata = {
+  title: 'Agentic Datasets',
+  description: DESCRIPTION,
+  alternates: { canonical: `${SITE_URL}/datasets` },
+  openGraph: {
+    title: 'Agentic Datasets | InferenceX',
+    description: DESCRIPTION,
+    url: `${SITE_URL}/datasets`,
+  },
+  twitter: { title: 'Agentic Datasets | InferenceX', description: DESCRIPTION },
+};
+
+const jsonLd = {
+  '@context': 'https://schema.org',
+  '@type': 'CollectionPage',
+  name: 'InferenceX Agentic Datasets',
+  description: DESCRIPTION,
+  url: `${SITE_URL}/datasets`,
+};
+
+export default function DatasetsPage() {
+  return (
+    <main className="relative">
+      <JsonLd data={jsonLd} />
+      <div className="container mx-auto flex flex-col gap-6 px-4 pb-8 lg:px-8">
+        <section>
+          <Card>
+            <h1 className="mb-2 text-xl font-semibold text-foreground">
+              Agentic Benchmark Datasets
+            </h1>
+            <p className="mb-3 text-sm text-muted-foreground">
+              InferenceX&apos;s agentic benchmark doesn&apos;t replay synthetic prompts — it replays
+              real Claude Code coding sessions captured as <strong>conversation traces</strong>.
+              Each trace is a full multi-turn session: the main agent&apos;s turns plus any
+              subagents it spawned, with per-turn input/output token counts and the 64-token
+              KV-cache block hashes needed to reconstruct prefix-cache reuse. The traces are
+              published openly on HuggingFace under <code>semianalysisai/cc-traces-weka-*</code>{' '}
+              (apache-2.0).
+            </p>
+
+            <h2 className="mb-1.5 mt-4 text-sm font-semibold text-foreground">
+              How traces are captured
+            </h2>
+            <p className="mb-3 text-sm text-muted-foreground">
+              Production Claude Code sessions are recorded through a logging proxy that captures
+              every API request: its input and output token counts, the model used, timing (TTFT,
+              inter-token latency), and a list of <code>hash_ids</code> — one per 64-token KV block
+              of the request&apos;s input. Subagent invocations are grouped under their parent turn.
+              No prompt or completion text is stored; only token counts and block hashes, so the
+              corpus is shareable while remaining a faithful workload for replay.
+            </p>
+
+            <h2 className="mb-1.5 mt-4 text-sm font-semibold text-foreground">
+              Cached prefix vs uncached suffix
+            </h2>
+            <p className="mb-3 text-sm text-muted-foreground">
+              Agentic workloads are dominated by prefix reuse: each turn resends the growing
+              conversation, so most of its input is already in the KV cache from prior turns. We
+              reconstruct this exactly. Walking a conversation in order under an idealized infinite
+              cache, a turn&apos;s <strong>cached prefix</strong> is its longest run of leading{' '}
+              <code>hash_ids</code> already seen; the rest is the <strong>uncached suffix</strong>{' '}
+              that must be (re)computed. Blocks are 64 tokens; the split is clamped so cached +
+              uncached equals the turn&apos;s effective input even on a partial final block.
+              Subagents run against a snapshot of the parent cache at spawn (their context is
+              separate and is not folded back into the parent).
+            </p>
+
+            <h2 className="mb-1.5 mt-4 text-sm font-semibold text-foreground">Dataset variants</h2>
+            <ul className="mb-1 list-disc space-y-1 pl-5 text-sm text-muted-foreground">
+              <li>
+                <strong>full</strong> — every captured request, unmodified.
+              </li>
+              <li>
+                <strong>256k</strong> — requests whose input + output exceeds 256,000 tokens are
+                dropped so every turn fits a 256k context window (used when benchmarking engines
+                configured for a 256k max context).
+              </li>
+            </ul>
+          </Card>
+        </section>
+
+        <section className="flex flex-col gap-3">
+          <h2 className="text-lg font-semibold text-foreground">Datasets</h2>
+          <DatasetList />
+        </section>
+      </div>
+    </main>
+  );
+}
diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx
new file mode 100644
index 00000000..43992c41
--- /dev/null
+++ b/packages/app/src/components/datasets/conversation-view.tsx
@@ -0,0 +1,101 @@
+'use client';
+
+import Link from 'next/link';
+
+import { Card } from '@/components/ui/card';
+import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph';
+import { useDatasetConversation } from '@/hooks/api/use-datasets';
+
+/** Formats a count with a k/M/B suffix and one decimal; magnitudes below 999.5 print as integers. */
+function compact(n: number): string {
+  const abs = Math.abs(n); // each cutoff is where toFixed(1) would otherwise print "1000.0"
+  if (abs >= 999.95e6) return `${(n / 1e9).toFixed(1)}B`;
+  if (abs >= 999.95e3) return `${(n / 1e6).toFixed(1)}M`;
+  return abs >= 999.5 ? `${(n / 1e3).toFixed(1)}k` : String(Math.round(n));
+}
+
+export function ConversationView({ slug, convId }: { slug: string; convId: string }) {
+  const { data, isLoading, isError } = useDatasetConversation(slug, convId);
+
+  if (isLoading) {
+    return (
+      <div className="py-12 text-center text-sm text-muted-foreground">Loading conversation…</div>
+    );
+  }
+  if (isError || !data) {
+    return (
+      <div className="py-12 text-center text-sm text-destructive">
+        Conversation not found.{' '}
+        <Link href={`/datasets/${slug}`} className="text-primary underline">
+          Back to dataset
+        </Link>
+      </div>
+    );
+  }
+
+  const cachedPct =
+    data.total_in > 0 ? `${((data.total_cached / data.total_in) * 100).toFixed(0)}%` : '—';
+
+  return (
+    <div className="flex flex-col gap-6">
+      <div>
+        <div className="mb-1 flex flex-wrap items-center gap-2 text-xs text-muted-foreground">
+          <Link href="/datasets" className="hover:text-foreground">
+            Datasets
+          </Link>
+          <span>/</span>
+          <Link href={`/datasets/${slug}`} className="hover:text-foreground">
+            {slug}
+          </Link>
+          <span>/</span>
+          <span className="text-foreground">conversation</span>
+        </div>
+        <h1 className="break-all font-mono text-lg font-semibold text-foreground">
+          {data.conv_id}
+        </h1>
+        {data.models.length > 0 && (
+          <div className="mt-2 flex flex-wrap gap-2">
+            {data.models.map((m) => (
+              <span
+                key={m}
+                className="rounded-md border border-border/40 px-2 py-0.5 text-xs text-foreground"
+              >
+                {m}
+              </span>
+            ))}
+          </div>
+        )}
+      </div>
+
+      <Card className="p-4">
+        <dl className="grid grid-cols-2 gap-4 sm:grid-cols-3 lg:grid-cols-6">
+          <Stat label="Main turns" value={String(data.num_turns)} />
+          <Stat label="Subagent groups" value={String(data.num_subagent_groups)} />
+          <Stat label="Input" value={`${compact(data.total_in)} tok`} />
+          <Stat label="Output" value={`${compact(data.total_out)} tok`} />
+          <Stat label="Cached" value={`${compact(data.total_cached)} tok`} />
+          <Stat label="Cached %" value={cachedPct} />
+        </dl>
+      </Card>
+
+      <Card className="p-4">
+        <h2 className="mb-3 text-lg font-semibold text-foreground">Flamegraph</h2>
+        <p className="mb-4 text-xs text-muted-foreground">
+          One bar per turn, scaled to the largest turn. Subagent groups are collapsed by default —
+          click a group to expand it. Each bar splits input into cached prefix and uncached suffix,
+          plus generated output.
+        </p>
+        <TraceFlamegraph structure={data.structure} />
+      </Card>
+    </div>
+  );
+}
+
+function Stat({ label, value }: { label: string; value: string }) {
+  return (
+    <div>
+      <dt className="text-xs text-muted-foreground">{label}</dt>
+      <dd className="text-lg font-semibold tabular-nums text-foreground">{value}</dd>
+    </div>
+  );
+}
diff --git a/packages/app/src/components/datasets/dataset-detail.tsx b/packages/app/src/components/datasets/dataset-detail.tsx
new file mode 100644
index 00000000..57c50649
--- /dev/null
+++ b/packages/app/src/components/datasets/dataset-detail.tsx
@@ -0,0 +1,305 @@
+'use client';
+
+import { useState } from 'react';
+import Link from 'next/link';
+
+import { Card } from '@/components/ui/card';
+import {
+  Select,
+  SelectContent,
+  SelectItem,
+  SelectTrigger,
+  SelectValue,
+} from '@/components/ui/select';
+import { DistributionCard } from '@/components/datasets/distribution-card';
+import {
+  useDataset,
+  useDatasetConversations,
+  type ConversationSort,
+} from '@/hooks/api/use-datasets';
+import { track } from '@/lib/analytics';
+
+/** Shortens a count for display: k/M/B suffix with one decimal from 1,000 up, else rounded. */
+function compact(n: number): string {
+  const magnitude = Math.abs(n);
+  if (!(magnitude >= 1e3)) return String(Math.round(n)); // also the NaN path
+  if (magnitude >= 1e9) return `${(n / 1e9).toFixed(1)}B`;
+  return magnitude >= 1e6 ? `${(n / 1e6).toFixed(1)}M` : `${(n / 1e3).toFixed(1)}k`;
+}
+
+const PAGE = 50; // Rows per page: used as the query `limit` and as the offset stride.
+// Options for the sort dropdown; `value` is forwarded as the conversations query `sort`.
+const SORTS: { value: ConversationSort; label: string }[] = [
+  { value: 'tokens', label: 'Total input ↓' },
+  { value: 'turns', label: 'Turns ↓' },
+  { value: 'subagents', label: 'Subagent groups ↓' },
+  { value: 'id', label: 'Conversation ID' },
+];
+
+/** Dataset detail view: header, summary stats, distribution charts, paged conversation table. */
+export function DatasetDetail({ slug }: { slug: string }) {
+  const { data: dataset, isLoading, isError } = useDataset(slug);
+  const [search, setSearch] = useState('');
+  const [sort, setSort] = useState<ConversationSort>('tokens');
+  const [page, setPage] = useState(0);
+  // Search text, sort mode, and the current page window all feed the conversations query.
+  const { data: convs, isFetching } = useDatasetConversations({
+    slug,
+    search,
+    sort,
+    limit: PAGE,
+    offset: page * PAGE,
+  });
+  // Only the dataset query gates the page; the table below renders whatever `convs` holds.
+  if (isLoading) {
+    return <div className="py-12 text-center text-sm text-muted-foreground">Loading dataset…</div>;
+  }
+  if (isError || !dataset) {
+    return (
+      <div className="py-12 text-center text-sm text-destructive">
+        Dataset not found.{' '}
+        <Link href="/datasets" className="text-primary underline">
+          Back to datasets
+        </Link>
+      </div>
+    );
+  }
+  // `summary` / `chart_data` may be absent; the empty-object fallback lets every lookup default.
+  const s = dataset.summary ?? {};
+  const cd = dataset.chart_data ?? {};
+  const total = convs?.total ?? 0;
+  const pageCount = Math.ceil(total / PAGE);
+  return (
+    <div className="flex flex-col gap-6">
+      {/* header */}
+      <div>
+        <div className="mb-1 flex items-center gap-2">
+          <Link href="/datasets" className="text-xs text-muted-foreground hover:text-foreground">
+            ← Datasets
+          </Link>
+        </div>
+        <div className="flex flex-wrap items-baseline justify-between gap-2">
+          <h1 className="text-2xl font-semibold text-foreground">{dataset.label}</h1>
+          <div className="flex items-center gap-2 text-xs">
+            <span className="rounded-full border border-border/50 px-2 py-0.5 uppercase tracking-wide text-muted-foreground">
+              {dataset.variant}
+            </span>
+            {dataset.hf_url && (
+              <a
+                href={dataset.hf_url}
+                target="_blank"
+                rel="noopener noreferrer"
+                onClick={() => track('datasets_hf_link_clicked', { slug })}
+                className="text-primary hover:underline"
+              >
+                View on HuggingFace ↗
+              </a>
+            )}
+          </div>
+        </div>
+        {dataset.description && (
+          <p className="mt-2 max-w-3xl text-sm text-muted-foreground">{dataset.description}</p>
+        )}
+      </div>
+
+      {/* summary stats */}
+      <Card className="p-4">
+        <dl className="grid grid-cols-2 gap-4 sm:grid-cols-3 lg:grid-cols-6">
+          <Stat label="Conversations" value={dataset.conversation_count.toLocaleString()} />
+          <Stat label="Main turns" value={compact(s.mainTurns ?? 0)} />
+          <Stat label="Subagent groups" value={compact(s.subagentGroups ?? 0)} />
+          <Stat label="Subagent turns" value={compact(s.subagentTurns ?? 0)} />
+          <Stat
+            label="Cached input"
+            value={typeof s.cachedPct === 'number' ? `${(s.cachedPct * 100).toFixed(0)}%` : '—'}
+          />
+          <Stat label="Total tokens" value={compact((s.totalIn ?? 0) + (s.totalOut ?? 0))} />
+        </dl>
+        {s.modelMix && Object.keys(s.modelMix).length > 0 && (
+          <div className="mt-4 border-t border-border/40 pt-3">
+            <div className="mb-1.5 text-xs font-medium text-muted-foreground">
+              Model mix (turns)
+            </div>
+            <div className="flex flex-wrap gap-2">
+              {Object.entries(s.modelMix)
+                .toSorted((a, b) => b[1] - a[1])
+                .map(([model, count]) => (
+                  <span
+                    key={model}
+                    className="rounded-md border border-border/40 px-2 py-0.5 text-xs text-foreground"
+                  >
+                    {model} <span className="text-muted-foreground">{compact(count)}</span>
+                  </span>
+                ))}
+            </div>
+          </div>
+        )}
+      </Card>
+
+      {/* distribution cards */}
+      <section className="flex flex-col gap-3">
+        <h2 className="text-lg font-semibold text-foreground">Distributions</h2>
+        <div className="grid gap-4 lg:grid-cols-2">
+          <DistributionCard
+            title="Input tokens per turn"
+            unit="tokens"
+            scale="log"
+            distribution={cd.inputTokensPerTurn}
+          />
+          <DistributionCard
+            title="Output tokens per turn"
+            unit="tokens"
+            scale="log"
+            distribution={cd.outputTokensPerTurn}
+          />
+          <DistributionCard
+            title="Turns per conversation"
+            unit="turns"
+            distribution={cd.turnsPerConversation}
+          />
+          <DistributionCard
+            title="Subagent groups per conversation"
+            unit="groups"
+            distribution={cd.subagentGroupsPerConversation}
+          />
+          <DistributionCard
+            title="Cached fraction per turn"
+            unit=""
+            distribution={cd.cachedFractionPerTurn}
+            formatValue={(v) => `${(v * 100).toFixed(0)}%`}
+          />
+        </div>
+      </section>
+
+      {/* conversation list */}
+      <section className="flex flex-col gap-3">
+        <div className="flex flex-wrap items-center justify-between gap-3">
+          <h2 className="text-lg font-semibold text-foreground">
+            Conversations{' '}
+            <span className="text-sm font-normal text-muted-foreground">({total})</span>
+          </h2>
+          <div className="flex items-center gap-2">
+            <input
+              type="text"
+              value={search}
+              onChange={(e) => {
+                setSearch(e.target.value);
+                setPage(0);
+              }}
+              placeholder="Search by ID…"
+              className="h-8 w-40 rounded-md border border-border/40 bg-background px-2 text-xs outline-none focus:border-primary"
+            />
+            <Select
+              value={sort}
+              onValueChange={(v) => {
+                setSort(v as ConversationSort);
+                setPage(0);
+                track('datasets_conversations_sorted', { mode: v });
+              }}
+            >
+              <SelectTrigger className="h-8 w-[12rem] text-xs" aria-label="Sort conversations">
+                <SelectValue />
+              </SelectTrigger>
+              <SelectContent>
+                {SORTS.map((o) => (
+                  <SelectItem key={o.value} value={o.value} className="text-xs">
+                    {o.label}
+                  </SelectItem>
+                ))}
+              </SelectContent>
+            </Select>
+          </div>
+        </div>
+
+        <Card className="overflow-hidden p-0">
+          <table className="w-full text-sm">
+            <thead className="border-b border-border/40 bg-muted/30 text-xs text-muted-foreground">
+              <tr>
+                <th className="px-3 py-2 text-left font-medium">Conversation</th>
+                <th className="px-3 py-2 text-right font-medium">Turns</th>
+                <th className="px-3 py-2 text-right font-medium">Subagents</th>
+                <th className="px-3 py-2 text-right font-medium">Input</th>
+                <th className="px-3 py-2 text-right font-medium">Output</th>
+                <th className="px-3 py-2 text-right font-medium">Cached</th>
+              </tr>
+            </thead>
+            <tbody>
+              {(convs?.items ?? []).map((c) => {
+                const cachedPct =
+                  c.total_in > 0 ? `${((c.total_cached / c.total_in) * 100).toFixed(0)}%` : '—';
+                return (
+                  <tr
+                    key={c.conv_id}
+                    className="border-b border-border/20 last:border-0 hover:bg-accent/40"
+                  >
+                    <td className="px-3 py-2">
+                      <Link
+                        href={`/datasets/${slug}/conversations/${c.conv_id}`}
+                        onClick={() => track('datasets_conversation_clicked', { slug })}
+                        className="font-mono text-xs text-primary hover:underline"
+                      >
+                        {c.conv_id.length > 20 ? `${c.conv_id.slice(0, 20)}…` : c.conv_id}
+                      </Link>
+                      {c.models.length > 0 && (
+                        <span className="ml-2 text-[11px] text-muted-foreground">
+                          {c.models.length} model{c.models.length === 1 ? '' : 's'}
+                        </span>
+                      )}
+                    </td>
+                    <td className="px-3 py-2 text-right tabular-nums">{c.num_turns}</td>
+                    <td className="px-3 py-2 text-right tabular-nums">{c.num_subagent_groups}</td>
+                    <td className="px-3 py-2 text-right tabular-nums">{compact(c.total_in)}</td>
+                    <td className="px-3 py-2 text-right tabular-nums">{compact(c.total_out)}</td>
+                    <td className="px-3 py-2 text-right tabular-nums text-muted-foreground">
+                      {cachedPct}
+                    </td>
+                  </tr>
+                );
+              })}
+              {!isFetching && (convs?.items.length ?? 0) === 0 && (
+                <tr>
+                  <td colSpan={6} className="px-3 py-8 text-center text-xs text-muted-foreground">
+                    No conversations match.
+                  </td>
+                </tr>
+              )}
+            </tbody>
+          </table>
+        </Card>
+
+        {pageCount > 1 && (
+          <div className="flex items-center justify-center gap-3 text-xs">
+            <button
+              type="button"
+              disabled={page === 0}
+              onClick={() => setPage((p) => Math.max(0, p - 1))}
+              className="rounded-md border border-border/40 px-2 py-1 hover:bg-accent disabled:opacity-30"
+            >
+              ← Prev
+            </button>
+            <span className="text-muted-foreground">
+              Page {page + 1} of {pageCount}
+            </span>
+            <button
+              type="button"
+              disabled={page >= pageCount - 1}
+              onClick={() => setPage((p) => Math.min(pageCount - 1, p + 1))}
+              className="rounded-md border border-border/40 px-2 py-1 hover:bg-accent disabled:opacity-30"
+            >
+              Next →
+            </button>
+          </div>
+        )}
+      </section>
+    </div>
+  );
+}
+
+function Stat({ label, value }: { label: string; value: string }) {
+  // Single entry of the summary <dl>: muted term element plus its bold, tabular-figure value.
+  const term = <dt className="text-xs text-muted-foreground">{label}</dt>;
+  const detail = <dd className="text-lg font-semibold tabular-nums text-foreground">{value}</dd>;
+
+  // Term first, then the value, inside a plain wrapper.
+  return <div>{term}{detail}</div>;
+}
diff --git a/packages/app/src/components/datasets/dataset-list.tsx b/packages/app/src/components/datasets/dataset-list.tsx
new file mode 100644
index 00000000..5fcc0dfe
--- /dev/null
+++ b/packages/app/src/components/datasets/dataset-list.tsx
@@ -0,0 +1,85 @@
+'use client';
+
+import Link from 'next/link';
+
+import { Card } from '@/components/ui/card';
+import { useDatasets, type DatasetRecord } from '@/hooks/api/use-datasets';
+import { track } from '@/lib/analytics';
+
+/** Abbreviates a count: one-decimal k / M / B suffix at or above 1,000, otherwise rounded. */
+function compact(n: number): string {
+  const size = Math.abs(n);
+  if (!(size >= 1e3)) return String(Math.round(n)); // below 1,000 (NaN lands here too)
+  if (!(size >= 1e6)) return `${(n / 1e3).toFixed(1)}k`;
+  return size >= 1e9 ? `${(n / 1e9).toFixed(1)}B` : `${(n / 1e6).toFixed(1)}M`;
+}
+/** Clickable summary card for one dataset; links to `/datasets/<slug>` and tracks the click. */
+function DatasetCard({ d }: { d: DatasetRecord }) {
+  const s = d.summary ?? {}; // missing summary → counts show 0, cached input shows "—"
+  const cachedPct = typeof s.cachedPct === 'number' ? `${(s.cachedPct * 100).toFixed(0)}%` : '—';
+  return (
+    <Link
+      href={`/datasets/${d.slug}`}
+      onClick={() => track('datasets_card_clicked', { slug: d.slug })}
+      className="block transition-colors hover:[&_*]:border-primary/40"
+    >
+      <Card className="h-full p-4 transition-colors hover:border-primary/40">
+        <div className="mb-1 flex items-baseline justify-between gap-2">
+          <h3 className="text-base font-semibold text-foreground">{d.label}</h3>
+          <span className="rounded-full border border-border/50 px-2 py-0.5 text-[10px] uppercase tracking-wide text-muted-foreground">
+            {d.variant}
+          </span>
+        </div>
+        {d.description && (
+          <p className="mb-3 line-clamp-2 text-xs text-muted-foreground">{d.description}</p>
+        )}
+        <dl className="grid grid-cols-2 gap-x-4 gap-y-1.5 text-xs">
+          <Stat label="Conversations" value={d.conversation_count.toLocaleString()} />
+          <Stat label="Main turns" value={compact(s.mainTurns ?? 0)} />
+          <Stat label="Subagent groups" value={compact(s.subagentGroups ?? 0)} />
+          <Stat label="Cached input" value={cachedPct} />
+          <Stat label="Total input" value={`${compact(s.totalIn ?? 0)} tok`} />
+          <Stat label="Total output" value={`${compact(s.totalOut ?? 0)} tok`} />
+        </dl>
+        <div className="mt-3 text-xs font-medium text-primary">View dataset →</div>
+      </Card>
+    </Link>
+  );
+}
+
+function Stat({ label, value }: { label: string; value: string }) {
+  // Compact row for the card's <dl>: caption and figure share one baseline-aligned flex line.
+  const caption = <dt className="text-muted-foreground">{label}</dt>;
+  const figure = <dd className="tabular-nums font-medium text-foreground">{value}</dd>;
+  return (
+    <div className="flex items-baseline justify-between gap-2">{caption}{figure}</div>
+  );
+}
+
+/**
+ * Landing grid for the datasets section: one `DatasetCard` per record from `useDatasets()`.
+ *
+ * While the query is pending, after it fails (or yields no payload), or when it returns an
+ * empty array, a single centered notice is rendered instead of the grid.
+ */
+export function DatasetList() {
+  const { data, isLoading, isError } = useDatasets();
+
+  // Shared shell for the three non-grid states; only the text tone class and the
+  // message differ between them.
+  const notice = (tone: string, message: string) => (
+    <div className={`py-12 text-center text-sm ${tone}`}>{message}</div>
+  );
+
+  if (isLoading) return notice('text-muted-foreground', 'Loading datasets…');
+  if (isError || !data) return notice('text-destructive', 'Failed to load datasets.');
+  if (data.length === 0) return notice('text-muted-foreground', 'No datasets ingested yet.');
+
+  return (
+    <div className="grid gap-4 sm:grid-cols-2">
+      {data.map((dataset) => (
+        <DatasetCard key={dataset.id} d={dataset} />
+      ))}
+    </div>
+  );
+}
diff --git a/packages/app/src/components/datasets/distribution-card.tsx b/packages/app/src/components/datasets/distribution-card.tsx
new file mode 100644
index 00000000..7abc367f
--- /dev/null
+++ b/packages/app/src/components/datasets/distribution-card.tsx
@@ -0,0 +1,220 @@
+'use client';
+
+import { useMemo } from 'react';
+
+import { Card } from '@/components/ui/card';
+import { ChartHover, type HoverItem } from '@/components/inference/agentic-point/chart-hover';
+import type { Distribution } from '@/hooks/api/use-datasets';
+
+/** Short numeric label: k/M/B suffix from 1e3 up, two decimals for nonzero |n| < 1, else rounded. */
+function compact(n: number): string {
+  const size = Math.abs(n);
+  if (size >= 1e3) {
+    if (size >= 1e9) return `${(n / 1e9).toFixed(1)}B`;
+    return size >= 1e6 ? `${(n / 1e6).toFixed(1)}M` : `${(n / 1e3).toFixed(1)}k`;
+  }
+  return size > 0 && size < 1 ? n.toFixed(2) : String(Math.round(n));
+}
+
+interface DistributionCardProps {
+  title: string; // Card heading.
+  subtitle?: string; // Optional line rendered under the heading.
+  unit: string; // Suffix for the stats line and tooltip range; also the x-axis caption.
+  distribution?: Distribution; // Precomputed bins + stats; absent or empty bins render "No data".
+  scale?: 'log' | 'linear'; // 'log' only adds a "log scale" badge; bar layout is unchanged.
+  /** Format the x value (defaults to compact). e.g. percent for cached fraction. */
+  formatValue?: (v: number) => string;
+}
+
+const W = 720; // Chart width handed to ChartHover (presumably SVG user units — TODO confirm).
+const H = 240; // Chart height, in the same coordinate space as W.
+const PAD = { top: 12, right: 16, bottom: 48, left: 52 }; // Margins around the bar area.
+
+/**
+ * Renders a precomputed histogram (bins + stats from datasets.chart_data) as a
+ * themeable bar chart with median/p90 guide lines and a hover tooltip. Bars are
+ * drawn at equal visual width; for log-scaled bins the edge labels are already
+ * log-spaced so the shape reads as a log histogram.
+ */
+export function DistributionCard({
+  title,
+  subtitle,
+  unit,
+  distribution,
+  scale = 'linear',
+  formatValue = compact,
+}: DistributionCardProps) {
+  const computed = useMemo(() => {
+    const bins = distribution?.bins ?? [];
+    if (bins.length === 0) return null;
+    const maxCount = Math.max(1, ...bins.map((b) => b.count));
+    const innerW = W - PAD.left - PAD.right;
+    const innerH = H - PAD.top - PAD.bottom;
+    const n = bins.length;
+    const barW = innerW / n;
+    // Map a data value to an x pixel by locating its bin (positional — works for
+    // both linear and log bins since the edges are precomputed at ingest).
+    const valueToX = (v: number): number | null => {
+      for (let i = 0; i < n; i++) {
+        if (v >= bins[i].x0 && (v < bins[i].x1 || i === n - 1)) {
+          return PAD.left + (i + 0.5) * barW;
+        }
+      }
+      if (v <= bins[0].x0) return PAD.left + 0.5 * barW;
+      return PAD.left + (n - 0.5) * barW;
+    };
+    return { bins, maxCount, innerW, innerH, n, barW, valueToX };
+  }, [distribution]);
+
+  if (!computed) {
+    return (
+      <Card className="p-4">
+        <div className="mb-1 text-sm font-medium text-foreground">{title}</div>
+        <div className="grid h-[240px] place-items-center text-xs text-muted-foreground">
+          No data
+        </div>
+      </Card>
+    );
+  }
+
+  const { bins, maxCount, innerW, innerH, n, barW, valueToX } = computed;
+  const stats = distribution?.stats;
+  // Guide lines (and their legend) are only drawn when summary stats accompany the bins.
+  const guides = stats
+    ? ([
+        { label: 'median', value: stats.median, color: '#3b82f6' },
+        { label: 'p90', value: stats.p90, color: '#f59e0b' },
+      ] as const)
+    : [];
+
+  // X tick labels from a few bin edges, de-duplicated so ≤3 bins never repeat a label/key.
+  const tickIdxs = [...new Set([0, Math.floor(n / 3), Math.floor((2 * n) / 3), n - 1])];
+
+  const resolve = (fraction: number) => {
+    const i = Math.min(n - 1, Math.max(0, Math.floor(fraction * n)));
+    const b = bins[i];
+    const items: HoverItem[] = [
+      {
+        color: 'currentColor',
+        label: 'Range',
+        value: `${formatValue(b.x0)}–${formatValue(b.x1)} ${unit}`,
+      },
+      { color: 'currentColor', label: 'Count', value: b.count.toLocaleString() },
+    ];
+    return { items };
+  };
+
+  return (
+    <Card className="p-4">
+      <div className="mb-0.5 flex items-baseline justify-between gap-2">
+        <span className="text-sm font-medium text-foreground">{title}</span>
+        {scale === 'log' && (
+          <span className="text-[10px] uppercase tracking-wide text-muted-foreground">
+            log scale
+          </span>
+        )}
+      </div>
+      {subtitle && <div className="mb-1 text-xs text-muted-foreground">{subtitle}</div>}
+      {stats && (
+        <div className="mb-2 text-xs text-muted-foreground">
+          n={stats.count.toLocaleString()} · median {formatValue(stats.median)} · p90{' '}
+          {formatValue(stats.p90)} · max {formatValue(stats.max)} {unit}
+        </div>
+      )}
+      <div className="w-full text-muted-foreground">
+        <ChartHover pad={PAD} width={W} height={H} resolve={resolve}>
+          {/* bars */}
+          {bins.map((b, i) => {
+            const h = (b.count / maxCount) * innerH;
+            const x = PAD.left + i * barW;
+            const y = PAD.top + (innerH - h);
+            return (
+              <rect
+                key={i}
+                x={x}
+                y={y}
+                width={Math.max(0, barW - 1)}
+                height={h}
+                className="fill-primary/55"
+              />
+            );
+          })}
+
+          {/* guide lines */}
+          {guides.map((g) => {
+            const x = valueToX(g.value);
+            if (x === null) return null;
+            return (
+              <line
+                key={g.label}
+                x1={x}
+                x2={x}
+                y1={PAD.top}
+                y2={PAD.top + innerH}
+                stroke={g.color}
+                strokeWidth={2}
+                strokeDasharray="5 3"
+                opacity={0.95}
+              />
+            );
+          })}
+
+          {/* x axis */}
+          <line
+            x1={PAD.left}
+            x2={PAD.left + innerW}
+            y1={PAD.top + innerH}
+            y2={PAD.top + innerH}
+            stroke="currentColor"
+            opacity={0.2}
+          />
+          {tickIdxs.map((i, k) => {
+            const anchor = k === 0 ? 'start' : k === tickIdxs.length - 1 ? 'end' : 'middle';
+            const x = PAD.left + (i + 0.5) * barW;
+            return (
+              <text
+                key={i}
+                x={x}
+                y={PAD.top + innerH + 14}
+                fontSize={11}
+                fill="currentColor"
+                opacity={0.7}
+                textAnchor={anchor}
+              >
+                {formatValue(bins[i].x0)}
+              </text>
+            );
+          })}
+          <text
+            x={W / 2}
+            y={H - 16}
+            fontSize={11}
+            fill="currentColor"
+            opacity={0.55}
+            textAnchor="middle"
+          >
+            {unit}
+          </text>
+
+          {/* guide legend */}
+          {guides.map((g, i) => (
+            <g key={g.label} transform={`translate(${PAD.left + i * 110}, ${PAD.top})`}>
+              <line
+                x1={0}
+                x2={12}
+                y1={4}
+                y2={4}
+                stroke={g.color}
+                strokeWidth={2}
+                strokeDasharray="5 3"
+              />
+              <text x={16} y={7} fontSize={10} fill="currentColor" opacity={0.85}>
+                {g.label} {formatValue(g.value)}
+              </text>
+            </g>
+          ))}
+        </ChartHover>
+      </div>
+    </Card>
+  );
+}
diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx
new file mode 100644
index 00000000..12588582
--- /dev/null
+++ b/packages/app/src/components/datasets/trace-flamegraph.tsx
@@ -0,0 +1,273 @@
+'use client';
+
+import { useCallback, useMemo, useState } from 'react';
+
+import type { ConversationStructure, StructureNode } from '@/hooks/api/use-datasets';
+
+/** Compact token formatter: 1234 → "1.2k", 1_200_000 → "1.2M", 3_400_000_000 → "3.4B". */
+function compact(n: number): string {
+  const abs = Math.abs(n);
+  if (!(abs >= 1e3)) return String(Math.round(n)); // small values (and NaN) are just rounded
+  if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`; // same "B" tier as the sibling formatters
+  return abs >= 1e6 ? `${(n / 1e6).toFixed(1)}M` : `${(n / 1e3).toFixed(1)}k`;
+}
+
+// Stacked-bar segment colors. Cached prefix vs uncached input vs output —
+// fixed hues (theme-independent) so the meaning is stable in light/dark.
+const SEG = {
+  cached: '#10b981', // emerald-500 — input served from prefix cache
+  uncached: '#f59e0b', // amber-500 — input that must be (re)computed
+  output: '#8b5cf6', // violet-500 — generated tokens
+} as const;
+// Legend entries, listed in the same left-to-right order the segments are stacked in each bar.
+const LEGEND = [
+  { key: 'cached', label: 'Cached prefix', color: SEG.cached },
+  { key: 'uncached', label: 'Uncached input', color: SEG.uncached },
+  { key: 'output', label: 'Output', color: SEG.output },
+] as const;
+
+interface VisibleRow {
+  key: string; // React key: `t-<i>` turn, `g-<i>` group header, `g-<i>-c-<ci>` expanded child
+  label: string; // Text in the left-hand label column
+  sublabel?: string; // Model name or group summary; shown only in the hover tooltip
+  cached: number; // Width basis of the cached-prefix segment
+  uncached: number; // Width basis of the uncached-input segment
+  output: number; // Width basis of the output segment
+  total: number; // Node `in + out`; sets bar length and the right-hand figure
+  indent: number; // Nesting depth (0 top level, 1 expanded child); 20px of left padding each
+  isGroup: boolean; // True for a subagent group header (rendered as a toggle button)
+  isExpanded: boolean; // Whether this group's children are currently listed
+  groupIndex?: number; // Index into `structure.nodes`; set only on group rows, used to toggle
+}
+
+interface TooltipState {
+  x: number; // Pointer clientX captured on mouse move
+  y: number; // Pointer clientY captured on mouse move
+  row: VisibleRow; // Row under the pointer whose figures the tooltip shows
+}
+
+/**
+ * Per-conversation flamegraph driven by the precomputed `structure` JSONB.
+ * One row per turn; subagent groups render a collapsible header with indented
+ * children (collapsed by default). Each bar stacks cached-prefix + uncached
+ * input + output, scaled to the widest visible turn (group bars cap at 100%).
+ */
+export function TraceFlamegraph({ structure }: { structure: ConversationStructure }) {
+  const nodes = structure.nodes;
+
+  // Subagent groups collapsed by default.
+  const [expanded, setExpanded] = useState<Set<number>>(() => new Set());
+  const [tooltip, setTooltip] = useState<TooltipState | null>(null);
+
+  const groupIndexes = useMemo(() => {
+    const out: number[] = [];
+    nodes.forEach((node, i) => {
+      if (node.kind === 'subagent') out.push(i);
+    });
+    return out;
+  }, [nodes]);
+
+  const toggle = useCallback((i: number) => {
+    setExpanded((prev) => {
+      const next = new Set(prev);
+      if (next.has(i)) next.delete(i);
+      else next.add(i);
+      return next;
+    });
+  }, []);
+
+  const expandAll = useCallback(() => setExpanded(new Set(groupIndexes)), [groupIndexes]);
+  const collapseAll = useCallback(() => setExpanded(new Set()), []);
+
+  const rows = useMemo<VisibleRow[]>(() => {
+    const out: VisibleRow[] = [];
+    let turnNo = 0;
+    nodes.forEach((node: StructureNode, i) => {
+      if (node.kind === 'turn') {
+        turnNo += 1;
+        out.push({
+          key: `t-${i}`,
+          label: `Turn ${turnNo}`,
+          sublabel: node.model ?? undefined,
+          cached: node.cached,
+          uncached: node.uncached,
+          output: node.out,
+          total: node.in + node.out,
+          indent: 0,
+          isGroup: false,
+          isExpanded: false,
+        });
+      } else {
+        const isExpanded = expanded.has(i);
+        out.push({
+          key: `g-${i}`,
+          label: `${node.label}`,
+          sublabel: `${node.children.length} turn${node.children.length === 1 ? '' : 's'}${
+            node.durationMs ? ` · ${(node.durationMs / 1000).toFixed(0)}s` : ''
+          }`,
+          cached: node.cached,
+          uncached: node.uncached,
+          output: node.out,
+          total: node.in + node.out,
+          indent: 0,
+          isGroup: true,
+          isExpanded,
+          groupIndex: i,
+        });
+        if (isExpanded) {
+          node.children.forEach((child, ci) => {
+            out.push({
+              key: `g-${i}-c-${ci}`,
+              label: `↳ subturn ${ci + 1}`,
+              sublabel: child.model ?? undefined,
+              cached: child.cached,
+              uncached: child.uncached,
+              output: child.out,
+              total: child.in + child.out,
+              indent: 1,
+              isGroup: false,
+              isExpanded: false,
+            });
+          });
+        }
+      }
+    });
+    return out;
+  }, [nodes, expanded]);
+  // Scale reference: the largest non-group row currently visible (floored at 1 to avoid /0).
+  const maxTotal = useMemo(
+    () => Math.max(1, ...rows.filter((r) => !r.isGroup).map((r) => r.total)),
+    [rows],
+  );
+
+  const onMove = (e: React.MouseEvent, row: VisibleRow) => {
+    setTooltip({ x: e.clientX, y: e.clientY, row });
+  };
+
+  return (
+    <div className="relative">
+      <div className="mb-3 flex flex-wrap items-center justify-between gap-3">
+        <div className="flex items-center gap-3 text-xs">
+          {LEGEND.map((l) => (
+            <span key={l.key} className="inline-flex items-center gap-1.5">
+              <span
+                className="inline-block size-3 rounded-sm"
+                style={{ backgroundColor: l.color }}
+              />
+              <span className="text-muted-foreground">{l.label}</span>
+            </span>
+          ))}
+        </div>
+        {groupIndexes.length > 0 && (
+          <div className="flex items-center gap-1.5">
+            <button
+              type="button"
+              onClick={expandAll}
+              className="rounded-md border border-border/40 px-2 py-1 text-xs hover:bg-accent"
+            >
+              Expand all
+            </button>
+            <button
+              type="button"
+              onClick={collapseAll}
+              className="rounded-md border border-border/40 px-2 py-1 text-xs hover:bg-accent"
+            >
+              Collapse all
+            </button>
+          </div>
+        )}
+      </div>
+
+      <div className="flex flex-col gap-0.5">
+        {rows.map((row) => {
+          const barFrac = Math.min(1, row.total / maxTotal); // groups may exceed the widest turn
+          const cw = row.total > 0 ? (row.cached / row.total) * 100 : 0; // guard 0/0 → "NaN%"
+          const uw = row.total > 0 ? (row.uncached / row.total) * 100 : 0;
+          const ow = row.total > 0 ? (row.output / row.total) * 100 : 0;
+          return (
+            <div
+              key={row.key}
+              className="flex items-center gap-2"
+              style={{ paddingLeft: row.indent * 20 }}
+            >
+              {/* label / group toggle */}
+              <div className="flex w-44 shrink-0 items-center gap-1 truncate">
+                {row.isGroup ? (
+                  <button
+                    type="button"
+                    onClick={() => row.groupIndex !== undefined && toggle(row.groupIndex)}
+                    className="flex items-center gap-1 truncate text-left text-xs font-medium text-foreground hover:text-primary"
+                  >
+                    <span className="inline-block w-3 text-muted-foreground">
+                      {row.isExpanded ? '▾' : '▸'}
+                    </span>
+                    <span className="truncate">{row.label}</span>
+                  </button>
+                ) : (
+                  <span className="truncate pl-4 text-xs text-foreground">{row.label}</span>
+                )}
+              </div>
+
+              {/* stacked bar */}
+              <div
+                className="relative h-5 flex-1 cursor-default"
+                onMouseMove={(e) => onMove(e, row)}
+                onMouseLeave={() => setTooltip(null)}
+              >
+                <div
+                  className={`flex h-full overflow-hidden rounded-sm ${
+                    row.isGroup ? 'opacity-70 ring-1 ring-border/50' : ''
+                  }`}
+                  style={{ width: `${Math.max(0.5, barFrac * 100)}%` }}
+                >
+                  <div style={{ width: `${cw}%`, backgroundColor: SEG.cached }} />
+                  <div style={{ width: `${uw}%`, backgroundColor: SEG.uncached }} />
+                  <div style={{ width: `${ow}%`, backgroundColor: SEG.output }} />
+                </div>
+              </div>
+
+              {/* total */}
+              <div className="w-16 shrink-0 text-right text-[11px] tabular-nums text-muted-foreground">
+                {compact(row.total)}
+              </div>
+            </div>
+          );
+        })}
+      </div>
+
+      {tooltip && (
+        <div
+          className="pointer-events-none fixed z-50 rounded-md border border-border bg-popover px-2.5 py-1.5 text-xs shadow-md"
+          style={{ left: tooltip.x + 12, top: tooltip.y + 12 }}
+        >
+          <div className="mb-1 font-medium text-foreground">
+            {tooltip.row.label}
+            {tooltip.row.sublabel ? (
+              <span className="ml-1 font-normal text-muted-foreground">{tooltip.row.sublabel}</span>
+            ) : null}
+          </div>
+          <div className="grid grid-cols-[auto_auto] gap-x-3 gap-y-0.5 text-muted-foreground">
+            <span style={{ color: SEG.cached }}>Cached prefix</span>
+            <span className="text-right tabular-nums text-foreground">
+              {compact(tooltip.row.cached)}
+            </span>
+            <span style={{ color: SEG.uncached }}>Uncached input</span>
+            <span className="text-right tabular-nums text-foreground">
+              {compact(tooltip.row.uncached)}
+            </span>
+            <span style={{ color: SEG.output }}>Output</span>
+            <span className="text-right tabular-nums text-foreground">
+              {compact(tooltip.row.output)}
+            </span>
+            <span>Cached %</span>
+            <span className="text-right tabular-nums text-foreground">
+              {tooltip.row.cached + tooltip.row.uncached > 0
+                ? `${((tooltip.row.cached / (tooltip.row.cached + tooltip.row.uncached)) * 100).toFixed(0)}%`
+                : '—'}
+            </span>
+          </div>
+        </div>
+      )}
+    </div>
+  );
+}
diff --git a/packages/app/src/components/header/header.tsx b/packages/app/src/components/header/header.tsx
index 57965518..5725d99f 100644
--- a/packages/app/src/components/header/header.tsx
+++ b/packages/app/src/components/header/header.tsx
@@ -46,6 +46,12 @@ const NAV_LINKS = [
     testId: 'nav-link-supporters',
     event: 'header_supporters_clicked',
   },
+  {
+    href: '/datasets',
+    label: 'Datasets',
+    testId: 'nav-link-datasets',
+    event: 'header_datasets_clicked',
+  },
   { href: '/blog', label: 'Articles', testId: 'nav-link-blog', event: 'header_blog_clicked' },
   { href: '/about', label: 'About', testId: 'nav-link-about', event: 'header_about_clicked' },
 ] as const;
diff --git a/packages/db/src/queries/datasets.test.ts b/packages/db/src/queries/datasets.test.ts
new file mode 100644
index 00000000..c1676445
--- /dev/null
+++ b/packages/db/src/queries/datasets.test.ts
@@ -0,0 +1,102 @@
+import { describe, expect, it } from 'vitest';
+
+import type { DbClient } from '../connection.js';
+import { getConversation, listConversations, listDatasets } from './datasets.js';
+
+/**
+ * Mock DbClient: returns canned result sets in call order. Each call to the
+ * tagged-template `sql` shifts the next queued rows array. The query text is
+ * ignored — these tests assert the JS-side shaping/coercion, not SQL.
+ */
+function mockSql(queue: unknown[][]): DbClient {
+  const responses = [...queue];
+  return (() => Promise.resolve(responses.shift() ?? [])) as unknown as DbClient;
+}
+
+describe('listDatasets', () => {
+  it('coerces conversation_count to a number', async () => {
+    const sql = mockSql([
+      [
+        {
+          id: 'a/b',
+          slug: 'b',
+          label: 'B',
+          variant: 'full',
+          conversation_count: '393',
+          summary: {},
+        },
+      ],
+    ]);
+    const out = await listDatasets(sql);
+    expect(out).toHaveLength(1);
+    expect(out[0].conversation_count).toBe(393);
+    expect(typeof out[0].conversation_count).toBe('number');
+  });
+});
+
+describe('listConversations', () => {
+  it('returns null when the dataset slug is unknown', async () => {
+    const sql = mockSql([[]]); // datasets lookup → no rows
+    expect(await listConversations(sql, 'missing')).toBeNull();
+  });
+
+  it('returns total + numerically-coerced items', async () => {
+    const sql = mockSql([
+      [{ id: 'ds-id' }], // datasets lookup
+      [{ n: 2 }], // count
+      [
+        {
+          conv_id: 'c1',
+          models: ['m'],
+          num_turns: '5',
+          num_subagent_groups: '1',
+          total_in: '1000',
+          total_out: '200',
+          total_cached: '900',
+        },
+      ], // items
+    ]);
+    const out = await listConversations(sql, 'b', { sort: 'tokens' });
+    expect(out).not.toBeNull();
+    expect(out!.total).toBe(2);
+    expect(out!.items[0]).toMatchObject({
+      conv_id: 'c1',
+      num_turns: 5,
+      num_subagent_groups: 1,
+      total_in: 1000,
+      total_out: 200,
+      total_cached: 900,
+    });
+    expect(typeof out!.items[0].total_in).toBe('number');
+  });
+});
+
+describe('getConversation', () => {
+  it('returns null when the conversation is missing', async () => {
+    const sql = mockSql([[]]);
+    expect(await getConversation(sql, 'b', 'nope')).toBeNull();
+  });
+
+  it('coerces counts and passes through the structure', async () => {
+    const structure = { blockSize: 64, nodes: [], totals: {} };
+    const sql = mockSql([
+      [
+        {
+          conv_id: 'c1',
+          models: ['m'],
+          num_turns: '3',
+          num_subagent_groups: '0',
+          total_in: '500',
+          total_out: '100',
+          total_cached: '450',
+          structure,
+        },
+      ],
+    ]);
+    const out = await getConversation(sql, 'b', 'c1');
+    expect(out).not.toBeNull();
+    expect(out!.num_turns).toBe(3);
+    expect(out!.total_cached).toBe(450);
+    expect(out!.structure).toBe(structure);
+  });
+});

From 0c50139594a99adcc43f558d0b80ae08870af20e Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 16:18:38 -0500
Subject: [PATCH 80/96] docs(ingest): note the separate agentic-dataset ingest
 script

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .claude/agents/ingest.md | 188 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 188 insertions(+)
 create mode 100644 .claude/agents/ingest.md

diff --git a/.claude/agents/ingest.md b/.claude/agents/ingest.md
new file mode 100644
index 00000000..aa0099ac
--- /dev/null
+++ b/.claude/agents/ingest.md
@@ -0,0 +1,188 @@
+---
+name: ingest
+description: Ingest a benchmark run from GitHub Actions into the Neon DB used by the feat/agentx deployment. The target DB write URL must be provided in the invocation. Handles standard ingest, delete+reingest, and changelog entries. Invoke when the user asks to ingest a workflow run URL.
+tools: Bash, Read, Edit, Write
+---
+
+You ingest benchmark runs from `SemiAnalysisAI/InferenceX` GitHub Actions into the Neon branch used by the `feat/agentx` deployment of this dashboard. Operate on `/Users/quilicic/InferenceX-app`.
+
+## Environment
+
+- **Repo root**: `/Users/quilicic/InferenceX-app`
+- **DB write URL — MUST be provided by the invoker.** There is no default: the target Neon branch changes over time, and ingesting into the wrong one silently corrupts a live deployment. If the prompt does not include a `postgresql://` write URL, STOP and ask for it before touching anything. Requirements:
+  - Use the **direct (non-pooled)** host for ingest/migrations — no `-pooler` in the hostname.
+  - For psql diagnostics you may use the same URL directly: `psql "$DATABASE_WRITE_URL" -c "..."`.
+- **Local dev server**: usually `http://localhost:3002` (port 3000 is a different project on this machine — never purge port 3000)
+- **Preview URL**: `https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app`
+- **INVALIDATE_SECRET** lives in repo root `.env` under that key.
+- **GitHub auth**: `gh auth token` for `gh` calls and the GITHUB_TOKEN env var.
+
+## Standard ingest
+
+```bash
+cd /Users/quilicic/InferenceX-app/packages/db
+DATABASE_WRITE_URL='<provided direct non-pooled write URL>' \
+GITHUB_TOKEN=$(gh auth token) \
+pnpm exec tsx src/ingest-ci-run.ts --download <RUN_ID> SemiAnalysisAI/InferenceX
+```
+
+Then refresh the materialized view (the script's auto-refresh sometimes races):
+`REFRESH MATERIALIZED VIEW latest_benchmarks;`
+
+## Cache purge (always do after any DB mutation)
+
+```bash
+SECRET=$(grep "^INVALIDATE_SECRET" /Users/quilicic/InferenceX-app/.env | cut -d= -f2 | tr -d '"')
+# Localhost (port 3002, NOT 3000)
+curl -s -X POST -H "Authorization: Bearer $SECRET" http://localhost:3002/api/v1/invalidate
+# Preview
+mkdir -p /tmp/vp && cd /tmp/vp \
+  && vercel link --project inferencemax-app --scope semianalysisai --yes >/dev/null 2>&1 \
+  && vercel curl /api/v1/invalidate \
+       --deployment https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app \
+       --yes -- -sS -X POST -H "Authorization: Bearer $SECRET"
+rm -rf /tmp/vp
+```
+
+## Delete + reingest (use only when user explicitly says "delete and reingest" OR when the run supersedes prior data with the same (model, hw, framework, precision))
+
+```sql
+BEGIN;
+DELETE FROM benchmark_results br USING configs c
+WHERE c.id = br.config_id
+  AND c.model = '<model>' AND c.hardware = '<hw>' AND c.framework = '<framework>'
+  AND c.precision = '<prec>' AND br.benchmark_type = '<bt>';
+DELETE FROM availability
+WHERE model = '<model>' AND hardware = '<hw>' AND framework = '<framework>'
+  AND precision = '<prec>' AND benchmark_type = '<bt>';
+COMMIT;
+```
+
+If the user says "replace ONLY the points this run produces", scope the DELETE to `AND br.conc IN (...)` so untouched conc levels survive. Don't do this unless asked.
+
+## AIPerf tagging — DO NOT use by default
+
+AIPerf is no longer a separate harness from the user's perspective. **Always** ingest with `spec_method='none'` (the standard path above), regardless of run name. Run names that include the word "aiperf" do NOT mean you should set `spec_decoding='aiperf'` — the user wants those runs to merge into the standard legend entry alongside other runs of the same (model, hw, framework, precision).
+
+Only override this if the user **explicitly** asks for the run to appear as a separate legend line. If they do, the patching procedure is preserved below. Otherwise, use the standard ingest section above and do not touch `spec_decoding`.
+
+<details>
+<summary>Explicit-request-only: how to tag a run as `spec_decoding='aiperf'`</summary>
+
+```bash
+RID=<run_id>
+TMPDIR=$(mktemp -d -t aiperf-$RID-XXXX)
+cd $TMPDIR
+
+# 1. Logical-name dedup + download
+gh api "repos/SemiAnalysisAI/InferenceX/actions/runs/$RID/artifacts" --paginate \
+  --jq '.artifacts[] | "\(.name)\t\(.archive_download_url)\t\(.created_at)"' \
+  | python3 -c "
+import sys, re, collections
+seen = collections.OrderedDict()
+for line in sys.stdin:
+    name, url, created = line.rstrip('\n').split('\t')
+    key = re.sub(r'_[a-zA-Z][a-zA-Z0-9.-]*_\d+$', '', name)
+    if key not in seen or seen[key][2] < created:
+        seen[key] = (name, url, created)
+for _, (name, url, _) in seen.items():
+    print(f'{name}\t{url}')
+" > artifacts.tsv
+while IFS=$'\t' read -r name url; do
+  mkdir -p "$name"
+  gh api "$url" > "$name/a.zip" 2>/dev/null
+  unzip -oq "$name/a.zip" -d "$name" 2>/dev/null
+  rm "$name/a.zip"
+done < artifacts.tsv
+
+# 2. Patch every benchmark JSON to set spec_decoding=aiperf
+find $TMPDIR -name "*.json" | python3 -c "
+import sys, json
+for fn in (l.strip() for l in sys.stdin):
+    try:
+        with open(fn) as f: d = json.load(f)
+    except Exception: continue
+    rows = d if isinstance(d, list) else [d]
+    if not rows or not isinstance(rows[0], dict): continue
+    changed = False
+    for row in rows:
+        if isinstance(row, dict) and ('scenario_type' in row or 'infmax_model_prefix' in row or 'tput_per_gpu' in row):
+            row['spec_decoding'] = 'aiperf'
+            changed = True
+    if changed:
+        with open(fn, 'w') as f: json.dump(d if isinstance(d, list) else rows[0], f)
+"
+
+# 3. Ingest in CI mode (reads INGEST_* env vars)
+cd /Users/quilicic/InferenceX-app/packages/db
+INGEST_RUN_ID=$RID INGEST_RUN_ATTEMPT=1 INGEST_ARTIFACTS_PATH=$TMPDIR INGEST_REPO=SemiAnalysisAI/InferenceX \
+DATABASE_WRITE_URL='<provided direct non-pooled write URL>' \
+GITHUB_TOKEN=$(gh auth token) \
+pnpm exec tsx src/ingest-ci-run.ts
+rm -rf $TMPDIR
+```
+
+The `spec_method` column has a lowercase check constraint — always lowercase.
+
+</details>
+
+## Don't auto-mention "AIPerf" in changelog entries
+
+Changelog descriptions used to include "AIPerf harness" wording. Don't add this anymore — the user considers AIPerf the standard harness now. A run named "e2e Test - kimi aiperf w/ live assistant" should become a changelog entry like `B200 Kimi Ingest #N (live assistant)`, not `... (AIPerf harness, live assistant)`.
+
+## Adding a perf changelog entry
+
+Run AFTER ingest. The popover filters by `config_keys[].split('-')[1] === selected_precision` and drops entries with empty `config_keys`, so you MUST provide at least one config_key in the format `<model>-<precision>-<hw>-<framework>` (matches what the user actually sees in the filter chain).
+
+```sql
+INSERT INTO changelog_entries (workflow_run_id, date, base_ref, head_ref, config_keys, description, pr_link)
+SELECT id, date, '', '', ARRAY['<model>-<precision>-<hw>-<framework>'], '<description>', NULL
+FROM latest_workflow_runs WHERE github_run_id = <RUN_ID>
+RETURNING id, workflow_run_id, date::text, description;
+```
+
+Description convention from prior entries: `<HW upper> <Model> Ingest #<N> (<note>)` — e.g.
+
+- `B200 Kimi Ingest #1`
+- `MI355X Kimi Ingest #2`
+- `H200 Kimi Ingest #1 (mmap cache)`
+
+If the user doesn't specify a description, ask for one or derive it from the run name.
+
+## Common gotchas
+
+- **`conclusion IS NULL` filter**: availability hides runs whose `latest_workflow_runs.conclusion` is null (still in_progress). If a user wants in-progress data shown, you can `UPDATE workflow_runs SET conclusion='success', status='completed' WHERE id = <wr_id>` then `REFRESH MATERIALIZED VIEW latest_benchmarks`.
+- **failed_run filter**: rows where `num_requests_successful === 0 AND num_requests_total > 0` get skipped on purpose — they have null metrics and would overwrite good rows via ON CONFLICT.
+- **Aggregated `results_bmk` artifact** contains rows from all runner attempts merged together — pair the artifact-level logical-name dedup with the row-level failed-run skip to avoid empty-row overwrites.
+- **Multi-attempt artifacts**: a single GitHub run can spill across runners (`h200-cw_00` + `h200-dgxc-slurm_1`); the logical-name dedup strips the `_<runner>_<attempt>` suffix.
+- **Materialized view dedup tiebreaker**: `latest_benchmarks` picks rows by `date DESC, wr.run_started_at DESC`. Backfilling old data may not surface unless dates align with the user's date picker selection.
+- **Date alignment for partial runs**: when a re-run only covers a subset of concs (`replace ONLY the points this run produces`), align dates with the prior full sweep via `UPDATE benchmark_results SET date = '<full-sweep-date>' WHERE workflow_run_id = <wr_id>` so the frontend's max-date-per-group dedup doesn't drop the older sweep.
+
+## Process
+
+1. **Always start by checking the run** with `gh api repos/SemiAnalysisAI/InferenceX/actions/runs/<RID> --jq '{name, status, conclusion}'`. Note the model/hw/precision from the name. If `status != "completed"`, ask the user if they want to ingest in-progress data (will likely have failed_run skips).
+2. **Check the DB** for any pre-existing rows for this run or the same (model, hw, framework, precision) combo if the user mentioned superseding.
+3. **Ingest** via the standard path. Do NOT use AIPerf tagging unless the user explicitly asks for a separate legend line.
+4. **Refresh materialized view**.
+5. **Add changelog entry** if the user asked or if the run is a "marker" worth surfacing.
+6. **Purge both caches** (localhost 3002 + preview).
+7. **Report** the row count, date, hardware, run id, and changelog id (if added).
+
+## Related: ingesting agentic _datasets_ (not benchmark runs)
+
+This agent ingests **benchmark runs**. The HF agentic trace **datasets** (`semianalysisai/cc-traces-weka-*`) that the agentic benchmark replays are ingested by a separate script, not this flow:
+
+```bash
+cd packages/db && DATABASE_WRITE_URL='<direct write url>' \
+  pnpm exec tsx src/ingest-weka-dataset.ts <hf-dataset-id> \
+  [--label "…"] [--variant full|256k] [--description "…"] [--limit N]
+```
+
+It populates the `datasets` + `dataset_conversations` tables (migration `011_datasets.sql`) that back the `/datasets` pages — upsert/replace per dataset, then purge the API cache like any other ingest. Same write-URL rule applies (direct, non-pooled, provided by the invoker).
+
+## Don't
+
+- Don't push to git unless the user asked.
+- Don't ingest without permission if it's a delete+reingest of existing data.
+- Don't hit port 3000 for cache purge — it's a different project.
+- Don't capitalize `spec_method` values (DB has a lowercase check constraint).

From 2ae6ebaab06b27bd65f0601aa6ae7905cbd01d79 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 16:24:01 -0500
Subject: [PATCH 81/96] fix(datasets): flamegraph scroll box + dual-scale group
 bars
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wrap rows in a fixed-height (max-h-[520px]) vertically scrollable bordered box.
Subagent group headers carry aggregate token totals that dwarf any single turn,
which made their bars overflow the row (width >> 100%). Now turns/subturns use a
per-turn scale while group headers use a separate group-aggregate scale (slim
muted strips), both clamped to the track — groups stay comparable to each other
and nothing overflows.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../components/datasets/trace-flamegraph.tsx  | 111 ++++++++++--------
 1 file changed, 63 insertions(+), 48 deletions(-)

diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx
index 12588582..12cc14ec 100644
--- a/packages/app/src/components/datasets/trace-flamegraph.tsx
+++ b/packages/app/src/components/datasets/trace-flamegraph.tsx
@@ -135,10 +135,19 @@ export function TraceFlamegraph({ structure }: { structure: ConversationStructur
     return out;
   }, [nodes, expanded]);
 
+  // Two scales: leaf turns/subturns share a per-turn axis (the primary signal —
+  // how cached/uncached evolves), while subagent group headers carry aggregates
+  // orders of magnitude larger, so they get their own axis to stay comparable to
+  // each other. Group bars render slim + muted, so the mixed scale reads as a
+  // distinct "group summary" track rather than a contradiction.
   const maxTotal = useMemo(
     () => Math.max(1, ...rows.filter((r) => !r.isGroup).map((r) => r.total)),
     [rows],
   );
+  const maxGroupTotal = useMemo(
+    () => Math.max(1, ...rows.filter((r) => r.isGroup).map((r) => r.total)),
+    [rows],
+  );
 
   const onMove = (e: React.MouseEvent, row: VisibleRow) => {
     setTooltip({ x: e.clientX, y: e.clientY, row });
@@ -178,61 +187,67 @@ export function TraceFlamegraph({ structure }: { structure: ConversationStructur
         )}
       </div>
 
-      <div className="flex flex-col gap-0.5">
-        {rows.map((row) => {
-          const barFrac = row.total / maxTotal;
-          const cw = (row.cached / row.total) * 100;
-          const uw = (row.uncached / row.total) * 100;
-          const ow = (row.output / row.total) * 100;
-          return (
-            <div
-              key={row.key}
-              className="flex items-center gap-2"
-              style={{ paddingLeft: row.indent * 20 }}
-            >
-              {/* label / group toggle */}
-              <div className="flex w-44 shrink-0 items-center gap-1 truncate">
-                {row.isGroup ? (
-                  <button
-                    type="button"
-                    onClick={() => row.groupIndex !== undefined && toggle(row.groupIndex)}
-                    className="flex items-center gap-1 truncate text-left text-xs font-medium text-foreground hover:text-primary"
-                  >
-                    <span className="inline-block w-3 text-muted-foreground">
-                      {row.isExpanded ? '▾' : '▸'}
-                    </span>
-                    <span className="truncate">{row.label}</span>
-                  </button>
-                ) : (
-                  <span className="truncate pl-4 text-xs text-foreground">{row.label}</span>
-                )}
-              </div>
-
-              {/* stacked bar */}
+      <div className="max-h-[520px] overflow-y-auto overflow-x-hidden rounded-md border border-border/40 bg-muted/10 p-2">
+        <div className="flex flex-col gap-0.5">
+          {rows.map((row) => {
+            // Group headers use the group axis; turns/subturns use the per-turn
+            // axis. Clamp to the track width either way.
+            const denom = row.isGroup ? maxGroupTotal : maxTotal;
+            const widthPct = Math.min(100, Math.max(0.5, (row.total / denom) * 100));
+            const cw = row.total > 0 ? (row.cached / row.total) * 100 : 0;
+            const uw = row.total > 0 ? (row.uncached / row.total) * 100 : 0;
+            const ow = row.total > 0 ? (row.output / row.total) * 100 : 0;
+            return (
               <div
-                className="relative h-5 flex-1 cursor-default"
-                onMouseMove={(e) => onMove(e, row)}
-                onMouseLeave={() => setTooltip(null)}
+                key={row.key}
+                className="flex items-center gap-2"
+                style={{ paddingLeft: row.indent * 20 }}
               >
+                {/* label / group toggle */}
+                <div className="flex w-44 shrink-0 items-center gap-1 truncate">
+                  {row.isGroup ? (
+                    <button
+                      type="button"
+                      onClick={() => row.groupIndex !== undefined && toggle(row.groupIndex)}
+                      className="flex items-center gap-1 truncate text-left text-xs font-medium text-foreground hover:text-primary"
+                    >
+                      <span className="inline-block w-3 text-muted-foreground">
+                        {row.isExpanded ? '▾' : '▸'}
+                      </span>
+                      <span className="truncate">{row.label}</span>
+                    </button>
+                  ) : (
+                    <span className="truncate pl-4 text-xs text-foreground">{row.label}</span>
+                  )}
+                </div>
+
+                {/* stacked bar — group headers render as a slim muted summary
+                    strip so they read as aggregates, not individual turns. */}
                 <div
-                  className={`flex h-full overflow-hidden rounded-sm ${
-                    row.isGroup ? 'opacity-70 ring-1 ring-border/50' : ''
-                  }`}
-                  style={{ width: `${Math.max(0.5, barFrac * 100)}%` }}
+                  className="relative flex h-5 flex-1 items-center"
+                  onMouseMove={(e) => onMove(e, row)}
+                  onMouseLeave={() => setTooltip(null)}
                 >
-                  <div style={{ width: `${cw}%`, backgroundColor: SEG.cached }} />
-                  <div style={{ width: `${uw}%`, backgroundColor: SEG.uncached }} />
-                  <div style={{ width: `${ow}%`, backgroundColor: SEG.output }} />
+                  <div
+                    className={`flex overflow-hidden rounded-sm ${
+                      row.isGroup ? 'h-2.5 opacity-80' : 'h-5'
+                    }`}
+                    style={{ width: `${widthPct}%` }}
+                  >
+                    <div style={{ width: `${cw}%`, backgroundColor: SEG.cached }} />
+                    <div style={{ width: `${uw}%`, backgroundColor: SEG.uncached }} />
+                    <div style={{ width: `${ow}%`, backgroundColor: SEG.output }} />
+                  </div>
                 </div>
-              </div>
 
-              {/* total */}
-              <div className="w-16 shrink-0 text-right text-[11px] tabular-nums text-muted-foreground">
-                {compact(row.total)}
+                {/* total */}
+                <div className="w-16 shrink-0 text-right text-[11px] tabular-nums text-muted-foreground">
+                  {compact(row.total)}
+                </div>
               </div>
-            </div>
-          );
-        })}
+            );
+          })}
+        </div>
       </div>
 
       {tooltip && (

From c749f8f271bcfa46293b1ce2ec29adac1907231d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 16:31:40 -0500
Subject: [PATCH 82/96] feat(datasets): link request timeline to source-dataset
 conversation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add run_datasets (workflow_run → dataset slug) mapping (migration 012) and
surface it through the benchmark-siblings sku. The agentic detail page's request
timeline now deep-links each request bar to its exact conversation in the
/datasets viewer — the request cid, stripped of any ::sa:/::fa: suffix, is the
dataset conv_id. Tooltip shows a 'click to view in dataset' hint; bars get a
pointer cursor only when a mapping exists. Backfilled workflow_run 27915787191
(the dsv4/b300/vllm run incl. point 422083) → cc-traces-weka-062126.

Verified: clicking a timeline bar on /inference/agentic/422083 navigates to the
matching /datasets/cc-traces-weka-062126/conversations/<conv_id>.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../agentic-point/agentic-point-detail.tsx    |  5 ++-
 .../agentic-point/dataset-conv-id.test.ts     | 27 ++++++++++++
 .../agentic-point/request-timeline.tsx        | 43 +++++++++++++++++--
 .../src/hooks/api/use-benchmark-siblings.ts   |  1 +
 packages/db/migrations/012_run_datasets.sql   | 19 ++++++++
 packages/db/src/queries/benchmark-siblings.ts |  7 ++-
 6 files changed, 97 insertions(+), 5 deletions(-)
 create mode 100644 packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts
 create mode 100644 packages/db/migrations/012_run_datasets.sql

diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 278ad8f7..4a076955 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -225,7 +225,10 @@ export function AgenticPointDetail({ id }: Props) {
             Loading request timeline…
           </div>
         ) : timelineQuery.data ? (
-          <RequestTimelineView data={timelineQuery.data} />
+          <RequestTimelineView
+            data={timelineQuery.data}
+            datasetSlug={siblingsQuery.data?.sku.dataset_slug}
+          />
         ) : (
           <div className="rounded-lg border border-border/40 bg-card/40 p-4 text-sm text-muted-foreground">
             No per-request timeline for benchmark point #{id} — the profile_export.jsonl artifact
diff --git a/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts
new file mode 100644
index 00000000..a7ebbd8c
--- /dev/null
+++ b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts
@@ -0,0 +1,27 @@
+import { describe, expect, it } from 'vitest';
+
+import { datasetConvId } from './request-timeline';
+
+describe('datasetConvId', () => {
+  it('returns a plain conversation id unchanged', () => {
+    expect(datasetConvId('002001296e8a8c38ad9d7cc436d691afc602')).toBe(
+      '002001296e8a8c38ad9d7cc436d691afc602',
+    );
+  });
+
+  it('strips a ::sa: subagent suffix to the parent conv id', () => {
+    expect(datasetConvId('002001296e8a8c38ad9d7cc436d691afc602::sa:subagent_004_27c95af7')).toBe(
+      '002001296e8a8c38ad9d7cc436d691afc602',
+    );
+  });
+
+  it('strips a ::fa: forked-agent suffix', () => {
+    expect(datasetConvId('02bc0afb13f7a2d9efa86c28511261d85c0e::fa:007')).toBe(
+      '02bc0afb13f7a2d9efa86c28511261d85c0e',
+    );
+  });
+
+  it('strips at the first :: even with a trailing stream index', () => {
+    expect(datasetConvId('abc::sa:agent_1:s2')).toBe('abc');
+  });
+});
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
index 7c5fdab0..655556fb 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -1,9 +1,21 @@
 'use client';
 
 import { useCallback, useMemo, useRef, useState } from 'react';
+import { useRouter } from 'next/navigation';
 
 import { type RequestRecord, type RequestTimeline } from '@/hooks/api/use-request-timeline';
 import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
+import { track } from '@/lib/analytics';
+
+/**
+ * The dataset conversation id for a request: the cid with any subagent/forked
+ * suffix (`::sa:…`, `::fa:…`) stripped. This is exactly the `conv_id` stored in
+ * dataset_conversations, so it deep-links into /datasets/<slug>/conversations/.
+ */
+export function datasetConvId(cid: string): string {
+  const i = cid.indexOf('::');
+  return i === -1 ? cid : cid.slice(0, i);
+}
 
 /**
  * Gantt-style request timeline for one agentic benchmark point.
@@ -317,7 +329,7 @@ interface TooltipData {
   req: RequestRecord;
 }
 
-function Tooltip({ data }: { data: TooltipData }) {
+function Tooltip({ data, linkable }: { data: TooltipData; linkable?: boolean }) {
   const { row, req } = data;
   const totalMs = (req.end - req.start) / 1e6;
   const queueMs = (req.start - req.credit) / 1e6;
@@ -377,14 +389,37 @@ function Tooltip({ data }: { data: TooltipData }) {
       <div className="mt-1.5 pt-1 border-t border-border/40 text-[10px] text-muted-foreground">
         Started at {formatTickLabel(req.start)}
       </div>
+      {linkable && (
+        <div className="mt-1 text-[10px] font-medium text-primary">
+          Click to view this conversation in the dataset →
+        </div>
+      )}
     </div>
   );
 }
 
-export function RequestTimelineView({ data }: { data: RequestTimeline }) {
+export function RequestTimelineView({
+  data,
+  datasetSlug,
+}: {
+  data: RequestTimeline;
+  /** Source dataset slug for this run; enables click-to-conversation deep links. */
+  datasetSlug?: string | null;
+}) {
+  const router = useRouter();
   const [rowMode, setRowMode] = useState<RowMode>('conversation');
   const [phaseFilter, setPhaseFilter] = useState<PhaseFilter>('profiling');
   const [tooltip, setTooltip] = useState<TooltipData | null>(null);
+
+  const openConversation = useCallback(
+    (cid: string) => {
+      if (!datasetSlug) return;
+      const convId = datasetConvId(cid);
+      track('agentic_timeline_to_dataset', { slug: datasetSlug });
+      router.push(`/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}`);
+    },
+    [datasetSlug, router],
+  );
   // Which multi-stream subagents currently have their per-stream rows
   // expanded. Key is the subagent row's `key` (parent_cid::sa:agent_id).
   const [expandedSubagents, setExpandedSubagents] = useState<ReadonlySet<string>>(() => new Set());
@@ -798,6 +833,8 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
                         key={`${req.cid}-${req.ti}-${req.start}`}
                         onMouseMove={(e) => setTooltip({ x: e.clientX, y: e.clientY, row, req })}
                         onMouseLeave={() => setTooltip(null)}
+                        onClick={datasetSlug ? () => openConversation(req.cid) : undefined}
+                        style={datasetSlug ? { cursor: 'pointer' } : undefined}
                       >
                         {/* Queue lead-in (faint) — only drawn when noticeable. */}
                         {queueW >= 1 && (
@@ -910,7 +947,7 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) {
       )}
 
       {/* Tooltip */}
-      {tooltip && <Tooltip data={tooltip} />}
+      {tooltip && <Tooltip data={tooltip} linkable={Boolean(datasetSlug)} />}
     </div>
   );
 }
diff --git a/packages/app/src/hooks/api/use-benchmark-siblings.ts b/packages/app/src/hooks/api/use-benchmark-siblings.ts
index 55720bdf..f8bef99e 100644
--- a/packages/app/src/hooks/api/use-benchmark-siblings.ts
+++ b/packages/app/src/hooks/api/use-benchmark-siblings.ts
@@ -31,6 +31,7 @@ export interface BenchmarkSku {
   benchmark_type: string;
   github_run_id: number;
   date: string;
+  dataset_slug: string | null;
 }
 
 export interface BenchmarkSiblings {
diff --git a/packages/db/migrations/012_run_datasets.sql b/packages/db/migrations/012_run_datasets.sql
new file mode 100644
index 00000000..58dd9f88
--- /dev/null
+++ b/packages/db/migrations/012_run_datasets.sql
@@ -0,0 +1,19 @@
+-- Maps a benchmark workflow_run to the source dataset it replayed, so the
+-- agentic detail page can deep-link each request in the timeline to the exact
+-- conversation in the /datasets viewer (the request's conversation_id, with any
+-- ::sa:/::fa: suffix stripped, is the dataset conv_id).
+--
+-- One row per workflow_run (every benchmark in a run replays the same dataset).
+-- dataset_slug is a plain slug (matches datasets.slug / the /datasets/<slug>
+-- URL) rather than an FK, so the mapping can be recorded before/independent of
+-- the dataset being ingested; the UI degrades gracefully if the slug is absent.
+--
+-- Additive only. To revert:
+--   drop table if exists run_datasets;
+--   delete from schema_migrations where filename = '012_run_datasets.sql';
+
+create table run_datasets (
+  workflow_run_id bigint primary key references workflow_runs(id) on delete cascade, -- at most one row per run; removed with its workflow_run
+  dataset_slug    text not null, -- plain slug, deliberately not an FK to datasets
+  created_at      timestamptz not null default now() -- defaults to the insert time
+);
diff --git a/packages/db/src/queries/benchmark-siblings.ts b/packages/db/src/queries/benchmark-siblings.ts
index c7e4a317..2d36eb22 100644
--- a/packages/db/src/queries/benchmark-siblings.ts
+++ b/packages/db/src/queries/benchmark-siblings.ts
@@ -47,6 +47,8 @@ export interface BenchmarkSku {
   /** Human-readable workflow_run summary so the page header can hint at provenance. */
   github_run_id: number;
   date: string;
+  /** Slug of the source dataset this run replayed (run_datasets), or null. */
+  dataset_slug: string | null;
 }
 
 export interface BenchmarkSiblings {
@@ -63,10 +65,11 @@ export async function getBenchmarkSiblings(
     select
       c.hardware, c.framework, c.model, c.precision, c.spec_method,
       br.benchmark_type, br.workflow_run_id, br.date::text,
-      wr.github_run_id
+      wr.github_run_id, rd.dataset_slug
     from benchmark_results br
     join configs c on c.id = br.config_id
     join workflow_runs wr on wr.id = br.workflow_run_id
+    left join run_datasets rd on rd.workflow_run_id = br.workflow_run_id
     where br.id = ${benchmarkResultId}
   `) as unknown as {
     hardware: string;
@@ -78,6 +81,7 @@ export async function getBenchmarkSiblings(
     workflow_run_id: number;
     date: string;
     github_run_id: number;
+    dataset_slug: string | null;
   }[];
   const root = seed[0];
   if (!root) return null;
@@ -158,6 +162,7 @@ export async function getBenchmarkSiblings(
       benchmark_type: root.benchmark_type,
       github_run_id: Number(root.github_run_id),
       date: root.date,
+      dataset_slug: root.dataset_slug ?? null,
     },
     siblings,
   };

From 6b700a3ccbc53fbc7e109360a2e5baa582e588c9 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 17:31:14 -0500
Subject: [PATCH 83/96] feat(datasets): deep-link request-timeline bar to the
 exact turn
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The timeline link now carries ?turn=<ti> (and &sa=<agentId> for subagent
requests). The flamegraph resolves the target node — main turns by ordinal,
subagent turns by matching the group's agentId then the ti-th child — expands
the subagent group if needed, scrolls the row into view, and flashes a ring.

subagentIdOf strips the harness stream suffix (:s<n> and :aux:<n>) so the cid's
agent id matches the dataset SubagentNode.agentId. Verified end-to-end: clicking
a subagent bar on /inference/agentic/422083 opens the conversation, expands the
right group, and highlights the exact subturn.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../components/datasets/conversation-view.tsx | 18 +++++-
 .../components/datasets/trace-flamegraph.tsx  | 60 +++++++++++++++++--
 .../agentic-point/dataset-conv-id.test.ts     | 28 ++++++++-
 .../agentic-point/request-timeline.tsx        | 30 ++++++++--
 4 files changed, 125 insertions(+), 11 deletions(-)

diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx
index 43992c41..ba1d0532 100644
--- a/packages/app/src/components/datasets/conversation-view.tsx
+++ b/packages/app/src/components/datasets/conversation-view.tsx
@@ -1,5 +1,6 @@
 'use client';
 
+import { useState } from 'react';
 import Link from 'next/link';
 
 import { Card } from '@/components/ui/card';
@@ -17,6 +18,17 @@ function compact(n: number): string {
 export function ConversationView({ slug, convId }: { slug: string; convId: string }) {
   const { data, isLoading, isError } = useDatasetConversation(slug, convId);
 
+  // Deep-link target from a request-timeline click: ?turn=<ti>[&sa=<agentId>].
+  // Read once from the URL on mount (matches the app's window-based url-state
+  // reads; avoids a Suspense boundary for useSearchParams).
+  const [highlight] = useState<{ turn: number | null; agent: string | null }>(() => {
+    if (typeof window === 'undefined') return { turn: null, agent: null };
+    const p = new URLSearchParams(window.location.search);
+    const turnRaw = p.get('turn');
+    const turn = turnRaw !== null && /^\d+$/u.test(turnRaw) ? Number(turnRaw) : null;
+    return { turn, agent: p.get('sa') };
+  });
+
   if (isLoading) {
     return (
       <div className="py-12 text-center text-sm text-muted-foreground">Loading conversation…</div>
@@ -85,7 +97,11 @@ export function ConversationView({ slug, convId }: { slug: string; convId: strin
           click a group to expand it. Each bar splits input into cached prefix and uncached suffix,
           plus generated output.
         </p>
-        <TraceFlamegraph structure={data.structure} />
+        <TraceFlamegraph
+          structure={data.structure}
+          highlightTurn={highlight.turn}
+          highlightAgentId={highlight.agent}
+        />
       </Card>
     </div>
   );
diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx
index 12cc14ec..3995a9c5 100644
--- a/packages/app/src/components/datasets/trace-flamegraph.tsx
+++ b/packages/app/src/components/datasets/trace-flamegraph.tsx
@@ -1,6 +1,6 @@
 'use client';
 
-import { useCallback, useMemo, useState } from 'react';
+import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
 
 import type { ConversationStructure, StructureNode } from '@/hooks/api/use-datasets';
 
@@ -52,12 +52,58 @@ interface TooltipState {
  * children (collapsed by default). Each bar stacks cached-prefix + uncached
  * input + output, scaled to the widest visible turn.
  */
-export function TraceFlamegraph({ structure }: { structure: ConversationStructure }) {
+export function TraceFlamegraph({
+  structure,
+  highlightTurn,
+  highlightAgentId,
+}: {
+  structure: ConversationStructure;
+  /** Turn index to scroll to / highlight (from a request-timeline deep link). */
+  highlightTurn?: number | null;
+  /** Subagent id when the highlighted turn is inside a subagent group. */
+  highlightAgentId?: string | null;
+}) {
   const nodes = structure.nodes;
 
-  // Subagent groups collapsed by default.
-  const [expanded, setExpanded] = useState<Set<number>>(() => new Set());
+  // Resolve the deep-link target to a row key (+ the group that must be open to
+  // show it). Main turns match by their main-turn ordinal; subagent turns match
+  // the group by agentId, then the ti-th child.
+  const target = useMemo(() => {
+    if (typeof highlightTurn !== 'number' || highlightTurn < 0) return null;
+    if (highlightAgentId) {
+      const gi = nodes.findIndex((n) => n.kind === 'subagent' && n.agentId === highlightAgentId);
+      if (gi === -1) return null;
+      const group = nodes[gi] as Extract<StructureNode, { kind: 'subagent' }>;
+      if (highlightTurn >= group.children.length) return null;
+      return { rowKey: `g-${gi}-c-${highlightTurn}`, expandGroup: gi };
+    }
+    let ordinal = 0;
+    for (let i = 0; i < nodes.length; i++) {
+      if (nodes[i].kind === 'turn') {
+        if (ordinal === highlightTurn) return { rowKey: `t-${i}`, expandGroup: null };
+        ordinal += 1;
+      }
+    }
+    return null;
+  }, [nodes, highlightTurn, highlightAgentId]);
+
+  // Subagent groups collapsed by default — except the deep-link target's group.
+  const [expanded, setExpanded] = useState<Set<number>>(() =>
+    typeof target?.expandGroup === 'number' ? new Set([target.expandGroup]) : new Set(),
+  );
   const [tooltip, setTooltip] = useState<TooltipState | null>(null);
+  const scrollRef = useRef<HTMLDivElement>(null);
+
+  // Scroll the target row into view and flash a highlight once it's rendered.
+  useEffect(() => {
+    if (!target) return;
+    const el = scrollRef.current?.querySelector<HTMLElement>(`[data-rowkey="${target.rowKey}"]`);
+    if (!el) return;
+    el.scrollIntoView({ block: 'center', behavior: 'smooth' });
+    el.classList.add('ring-2', 'ring-primary', 'rounded-sm');
+    const t = setTimeout(() => el.classList.remove('ring-2', 'ring-primary', 'rounded-sm'), 2600);
+    return () => clearTimeout(t);
+  }, [target]);
 
   const groupIndexes = useMemo(() => {
     const out: number[] = [];
@@ -187,7 +233,10 @@ export function TraceFlamegraph({ structure }: { structure: ConversationStructur
         )}
       </div>
 
-      <div className="max-h-[520px] overflow-y-auto overflow-x-hidden rounded-md border border-border/40 bg-muted/10 p-2">
+      <div
+        ref={scrollRef}
+        className="max-h-[520px] overflow-y-auto overflow-x-hidden rounded-md border border-border/40 bg-muted/10 p-2"
+      >
         <div className="flex flex-col gap-0.5">
           {rows.map((row) => {
             // Group headers use the group axis; turns/subturns use the per-turn
@@ -200,6 +249,7 @@ export function TraceFlamegraph({ structure }: { structure: ConversationStructur
             return (
               <div
                 key={row.key}
+                data-rowkey={row.key}
                 className="flex items-center gap-2"
                 style={{ paddingLeft: row.indent * 20 }}
               >
diff --git a/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts
index a7ebbd8c..f55d6131 100644
--- a/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts
+++ b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts
@@ -1,6 +1,6 @@
 import { describe, expect, it } from 'vitest';
 
-import { datasetConvId } from './request-timeline';
+import { datasetConvId, subagentIdOf } from './request-timeline';
 
 describe('datasetConvId', () => {
   it('returns a plain conversation id unchanged', () => {
@@ -25,3 +25,29 @@ describe('datasetConvId', () => {
     expect(datasetConvId('abc::sa:agent_1:s2')).toBe('abc');
   });
 });
+
+describe('subagentIdOf', () => {
+  it('returns null for a main-conversation cid', () => {
+    expect(subagentIdOf('002001296e8a8c38ad9d7cc436d691afc602')).toBeNull();
+  });
+
+  it('extracts the subagent id from a ::sa: cid', () => {
+    expect(subagentIdOf('002001296e8a8c38ad9d7cc436d691afc602::sa:subagent_004_27c95af7')).toBe(
+      'subagent_004_27c95af7',
+    );
+  });
+
+  it('drops a trailing :s<stream> index from the subagent id', () => {
+    expect(subagentIdOf('abc::sa:subagent_001_f552fe6f:s3')).toBe('subagent_001_f552fe6f');
+  });
+
+  it('drops an :aux:<n> stream suffix from the subagent id', () => {
+    expect(subagentIdOf('04dba6fe::sa:subagent_001_b00fdc12:aux:011')).toBe(
+      'subagent_001_b00fdc12',
+    );
+  });
+
+  it('returns null for a ::fa: forked-agent cid (no matching subagent group)', () => {
+    expect(subagentIdOf('02bc0afb13f7a2d9efa86c28511261d85c0e::fa:007')).toBeNull();
+  });
+});
diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
index 655556fb..baf3dc1f 100644
--- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx
+++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx
@@ -17,6 +17,21 @@ export function datasetConvId(cid: string): string {
   return i === -1 ? cid : cid.slice(0, i);
 }
 
+/**
+ * Subagent id encoded in a cid (`…::sa:<agent_id>[:s<n>|:aux:<n>]`), or null when
+ * the cid has no `::sa:` marker (main-conversation and `::fa:` requests). The
+ * harness fans one subagent into parallel streams via a `:s<n>` / `:aux:<n>`
+ * suffix; the dataset's SubagentNode.agentId is the bare base. Agent ids never
+ * contain a colon, so the base is everything up to the first one after `::sa:`.
+ */
+export function subagentIdOf(cid: string): string | null {
+  const i = cid.indexOf('::sa:');
+  if (i === -1) return null;
+  const raw = cid.slice(i + '::sa:'.length);
+  const colon = raw.indexOf(':');
+  return colon === -1 ? raw : raw.slice(0, colon);
+}
+
 /**
  * Gantt-style request timeline for one agentic benchmark point.
  *
@@ -412,11 +427,18 @@ export function RequestTimelineView({
   const [tooltip, setTooltip] = useState<TooltipData | null>(null);
 
   const openConversation = useCallback(
-    (cid: string) => {
+    (req: RequestRecord) => {
       if (!datasetSlug) return;
-      const convId = datasetConvId(cid);
+      const convId = datasetConvId(req.cid);
+      // Carry the turn (and, for subagent requests, the subagent id) so the
+      // flamegraph can scroll to / highlight the exact node this bar maps to.
+      const params = new URLSearchParams({ turn: String(req.ti) });
+      const sa = subagentIdOf(req.cid);
+      if (sa) params.set('sa', sa);
       track('agentic_timeline_to_dataset', { slug: datasetSlug });
-      router.push(`/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}`);
+      router.push(
+        `/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}?${params.toString()}`,
+      );
     },
     [datasetSlug, router],
   );
@@ -833,7 +855,7 @@ export function RequestTimelineView({
                         key={`${req.cid}-${req.ti}-${req.start}`}
                         onMouseMove={(e) => setTooltip({ x: e.clientX, y: e.clientY, row, req })}
                         onMouseLeave={() => setTooltip(null)}
-                        onClick={datasetSlug ? () => openConversation(req.cid) : undefined}
+                        onClick={datasetSlug ? () => openConversation(req) : undefined}
                         style={datasetSlug ? { cursor: 'pointer' } : undefined}
                       >
                         {/* Queue lead-in (faint) — only drawn when noticeable. */}

From 83fcd04e16649ca7a8fb3b1b78231c8588f274e8 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 17:44:05 -0500
Subject: [PATCH 84/96] fix(datasets): visible turn highlight +
 pointer-tracking flamegraph tooltip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Deep-link highlight is now state-driven (bg-primary/20 + ring, fades over
  700ms) instead of fragile classList mutation, so it's clearly visible and
  survives re-renders. Subagent groups still auto-expand and scroll into view.
- Portal the hover tooltip to document.body so its position:fixed is
  viewport-relative — an ancestor transform was offsetting it away from the
  cursor. Now it sits at pointer+12px.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../components/datasets/trace-flamegraph.tsx  | 96 +++++++++++--------
 1 file changed, 57 insertions(+), 39 deletions(-)

diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx
index 3995a9c5..53f13b6a 100644
--- a/packages/app/src/components/datasets/trace-flamegraph.tsx
+++ b/packages/app/src/components/datasets/trace-flamegraph.tsx
@@ -1,6 +1,7 @@
 'use client';
 
 import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
+import { createPortal } from 'react-dom';
 
 import type { ConversationStructure, StructureNode } from '@/hooks/api/use-datasets';
 
@@ -94,14 +95,23 @@ export function TraceFlamegraph({
   const [tooltip, setTooltip] = useState<TooltipState | null>(null);
   const scrollRef = useRef<HTMLDivElement>(null);
 
-  // Scroll the target row into view and flash a highlight once it's rendered.
+  // Portal target only exists after mount (the tooltip is portaled to body so
+  // its position:fixed is viewport-relative, immune to ancestor transforms).
+  const [mounted, setMounted] = useState(false);
+  useEffect(() => setMounted(true), []);
+
+  // The deep-link target row gets a state-driven highlight (ring + bg flash)
+  // that fades out — state-driven so a re-render can't clobber it, and so the
+  // fade is a real CSS transition rather than an abrupt classList removal.
+  const [highlightKey, setHighlightKey] = useState<string | null>(target?.rowKey ?? null);
+
+  // Scroll the target row into view once it's rendered, then fade the highlight.
   useEffect(() => {
     if (!target) return;
+    setHighlightKey(target.rowKey);
     const el = scrollRef.current?.querySelector<HTMLElement>(`[data-rowkey="${target.rowKey}"]`);
-    if (!el) return;
-    el.scrollIntoView({ block: 'center', behavior: 'smooth' });
-    el.classList.add('ring-2', 'ring-primary', 'rounded-sm');
-    const t = setTimeout(() => el.classList.remove('ring-2', 'ring-primary', 'rounded-sm'), 2600);
+    el?.scrollIntoView({ block: 'center', behavior: 'smooth' });
+    const t = setTimeout(() => setHighlightKey(null), 2200);
     return () => clearTimeout(t);
   }, [target]);
 
@@ -246,11 +256,14 @@ export function TraceFlamegraph({
             const cw = row.total > 0 ? (row.cached / row.total) * 100 : 0;
             const uw = row.total > 0 ? (row.uncached / row.total) * 100 : 0;
             const ow = row.total > 0 ? (row.output / row.total) * 100 : 0;
+            const isHighlighted = row.key === highlightKey;
             return (
               <div
                 key={row.key}
                 data-rowkey={row.key}
-                className="flex items-center gap-2"
+                className={`flex items-center gap-2 rounded-sm transition-colors duration-700 ${
+                  isHighlighted ? 'bg-primary/20 ring-2 ring-primary' : 'ring-0'
+                }`}
                 style={{ paddingLeft: row.indent * 20 }}
               >
                 {/* label / group toggle */}
@@ -300,39 +313,44 @@ export function TraceFlamegraph({
         </div>
       </div>
 
-      {tooltip && (
-        <div
-          className="pointer-events-none fixed z-50 rounded-md border border-border bg-popover px-2.5 py-1.5 text-xs shadow-md"
-          style={{ left: tooltip.x + 12, top: tooltip.y + 12 }}
-        >
-          <div className="mb-1 font-medium text-foreground">
-            {tooltip.row.label}
-            {tooltip.row.sublabel ? (
-              <span className="ml-1 font-normal text-muted-foreground">{tooltip.row.sublabel}</span>
-            ) : null}
-          </div>
-          <div className="grid grid-cols-[auto_auto] gap-x-3 gap-y-0.5 text-muted-foreground">
-            <span style={{ color: SEG.cached }}>Cached prefix</span>
-            <span className="text-right tabular-nums text-foreground">
-              {compact(tooltip.row.cached)}
-            </span>
-            <span style={{ color: SEG.uncached }}>Uncached input</span>
-            <span className="text-right tabular-nums text-foreground">
-              {compact(tooltip.row.uncached)}
-            </span>
-            <span style={{ color: SEG.output }}>Output</span>
-            <span className="text-right tabular-nums text-foreground">
-              {compact(tooltip.row.output)}
-            </span>
-            <span>Cached %</span>
-            <span className="text-right tabular-nums text-foreground">
-              {tooltip.row.cached + tooltip.row.uncached > 0
-                ? `${((tooltip.row.cached / (tooltip.row.cached + tooltip.row.uncached)) * 100).toFixed(0)}%`
-                : '—'}
-            </span>
-          </div>
-        </div>
-      )}
+      {tooltip &&
+        mounted &&
+        createPortal(
+          <div
+            className="pointer-events-none fixed z-50 rounded-md border border-border bg-popover px-2.5 py-1.5 text-xs shadow-md"
+            style={{ left: tooltip.x + 12, top: tooltip.y + 12 }}
+          >
+            <div className="mb-1 font-medium text-foreground">
+              {tooltip.row.label}
+              {tooltip.row.sublabel ? (
+                <span className="ml-1 font-normal text-muted-foreground">
+                  {tooltip.row.sublabel}
+                </span>
+              ) : null}
+            </div>
+            <div className="grid grid-cols-[auto_auto] gap-x-3 gap-y-0.5 text-muted-foreground">
+              <span style={{ color: SEG.cached }}>Cached prefix</span>
+              <span className="text-right tabular-nums text-foreground">
+                {compact(tooltip.row.cached)}
+              </span>
+              <span style={{ color: SEG.uncached }}>Uncached input</span>
+              <span className="text-right tabular-nums text-foreground">
+                {compact(tooltip.row.uncached)}
+              </span>
+              <span style={{ color: SEG.output }}>Output</span>
+              <span className="text-right tabular-nums text-foreground">
+                {compact(tooltip.row.output)}
+              </span>
+              <span>Cached %</span>
+              <span className="text-right tabular-nums text-foreground">
+                {tooltip.row.cached + tooltip.row.uncached > 0
+                  ? `${((tooltip.row.cached / (tooltip.row.cached + tooltip.row.uncached)) * 100).toFixed(0)}%`
+                  : '—'}
+              </span>
+            </div>
+          </div>,
+          document.body,
+        )}
     </div>
   );
 }

From 3c40d31172cce46f5e150223bcfa092ff573288f Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 17:58:47 -0500
Subject: [PATCH 85/96] fix(datasets): deep-link highlight fires on first
 navigation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The conversation page read ?turn/&sa from window.location.search in a useState
initializer, which captures stale/empty params during a client-side navigation —
so scroll+highlight+expand only worked after a manual reload. Switch to the
reactive useSearchParams (page wrapped in Suspense) so the params are present on
the first nav. Also make the flamegraph expand the target subagent group via an
effect (reacting to target changes), and defer the scroll one frame so the
just-expanded child row exists. Verified via a real timeline click — no reload.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../[slug]/conversations/[convId]/page.tsx    |  5 ++++-
 .../components/datasets/conversation-view.tsx | 19 ++++++++--------
 .../components/datasets/trace-flamegraph.tsx  | 22 +++++++++++++++----
 3 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
index 75702c1b..83eb56a0 100644
--- a/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
+++ b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx
@@ -1,3 +1,4 @@
+import { Suspense } from 'react';
 import type { Metadata } from 'next';
 
 import { ConversationView } from '@/components/datasets/conversation-view';
@@ -25,7 +26,9 @@ export default async function ConversationPage({ params }: Props) {
   return (
     <main className="relative">
       <div className="container mx-auto px-4 pb-8 lg:px-8">
-        <ConversationView slug={slug} convId={decodeURIComponent(convId)} />
+        <Suspense>
+          <ConversationView slug={slug} convId={decodeURIComponent(convId)} />
+        </Suspense>
       </div>
     </main>
   );
diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx
index ba1d0532..739d3bb2 100644
--- a/packages/app/src/components/datasets/conversation-view.tsx
+++ b/packages/app/src/components/datasets/conversation-view.tsx
@@ -1,7 +1,7 @@
 'use client';
 
-import { useState } from 'react';
 import Link from 'next/link';
+import { useSearchParams } from 'next/navigation';
 
 import { Card } from '@/components/ui/card';
 import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph';
@@ -19,15 +19,14 @@ export function ConversationView({ slug, convId }: { slug: string; convId: strin
   const { data, isLoading, isError } = useDatasetConversation(slug, convId);
 
   // Deep-link target from a request-timeline click: ?turn=<ti>[&sa=<agentId>].
-  // Read once from the URL on mount (matches the app's window-based url-state
-  // reads; avoids a Suspense boundary for useSearchParams).
-  const [highlight] = useState<{ turn: number | null; agent: string | null }>(() => {
-    if (typeof window === 'undefined') return { turn: null, agent: null };
-    const p = new URLSearchParams(window.location.search);
-    const turnRaw = p.get('turn');
-    const turn = turnRaw !== null && /^\d+$/u.test(turnRaw) ? Number(turnRaw) : null;
-    return { turn, agent: p.get('sa') };
-  });
+  // useSearchParams (not a one-shot window.location read) so the params are
+  // present on the very first client-side navigation, not just after a reload.
+  const params = useSearchParams();
+  const turnRaw = params.get('turn');
+  const highlight = {
+    turn: turnRaw !== null && /^\d+$/u.test(turnRaw) ? Number(turnRaw) : null,
+    agent: params.get('sa'),
+  };
 
   if (isLoading) {
     return (
diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx
index 53f13b6a..a577193b 100644
--- a/packages/app/src/components/datasets/trace-flamegraph.tsx
+++ b/packages/app/src/components/datasets/trace-flamegraph.tsx
@@ -105,14 +105,28 @@ export function TraceFlamegraph({
   // fade is a real CSS transition rather than an abrupt classList removal.
   const [highlightKey, setHighlightKey] = useState<string | null>(target?.rowKey ?? null);
 
-  // Scroll the target row into view once it's rendered, then fade the highlight.
+  // When the deep-link target resolves/changes: expand its subagent group, then
+  // (after the row renders) scroll it into view and flash the highlight. Runs on
+  // first load and on any later target change (e.g. clicking another bar into
+  // the same conversation). The row query/scroll is deferred to the next frame
+  // so the just-expanded child row exists in the DOM.
   useEffect(() => {
     if (!target) return;
+    if (typeof target.expandGroup === 'number') {
+      const gi = target.expandGroup;
+      setExpanded((prev) => (prev.has(gi) ? prev : new Set(prev).add(gi)));
+    }
     setHighlightKey(target.rowKey);
-    const el = scrollRef.current?.querySelector<HTMLElement>(`[data-rowkey="${target.rowKey}"]`);
-    el?.scrollIntoView({ block: 'center', behavior: 'smooth' });
+    const raf = requestAnimationFrame(() => {
+      scrollRef.current
+        ?.querySelector<HTMLElement>(`[data-rowkey="${target.rowKey}"]`)
+        ?.scrollIntoView({ block: 'center', behavior: 'smooth' });
+    });
     const t = setTimeout(() => setHighlightKey(null), 2200);
-    return () => clearTimeout(t);
+    return () => {
+      cancelAnimationFrame(raf);
+      clearTimeout(t);
+    };
   }, [target]);
 
   const groupIndexes = useMemo(() => {

From e460ea2300f57912eff46d92fbb6fb447fc435e4 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Mon, 22 Jun 2026 22:34:55 -0500
Subject: [PATCH 86/96] fix(high-contrast): stable line colors when deselecting
 legend items
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In HC mode the iwanthue palette is sized and indexed by the key set it's
generated over. ScatterGraph generated it from the *active* (selected) hw set,
so deselecting a line shrank the set, re-sized the palette, and shifted every
remaining line's hue — most visible on single-vendor agentic runs (which span
the full hue wheel since 2c06009), where deselecting B300 could recolor B200
from red to blue.

Pass the stable full set of hw-types-with-data as hcKeys so the palette and
per-key index are fixed; toggling now only hides/shows lines without recoloring
the rest. Adds a useThemeColors regression test asserting a line's HC color is
identical across active subsets when hcKeys is the full set.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../components/inference/ui/ScatterGraph.tsx  |  8 ++++++
 packages/app/src/hooks/useThemeColors.test.ts | 28 +++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx
index 76231522..77770ec0 100644
--- a/packages/app/src/components/inference/ui/ScatterGraph.tsx
+++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx
@@ -381,10 +381,18 @@ const ScatterGraph = React.memo(
       () => [...effectiveOfficialHwTypes],
       [effectiveOfficialHwTypes],
     );
+    // High-contrast palette is keyed off the FULL set of official hw types with
+    // data, not the active subset. Otherwise deselecting a line shrinks the key
+    // set, which re-sizes the iwanthue palette and shifts every remaining line's
+    // hue (most visible for single-vendor agentic runs that span the full wheel —
+    // e.g. deselecting B300 would recolor B200 from red to blue). Keying off the
+    // stable full set fixes each hw's color so toggling only hides/shows lines.
+    const stableHcKeys = useMemo(() => [...hwTypesWithData], [hwTypesWithData]);
     const { resolveColor, getCssColor } = useThemeColors({
       highContrast,
       identifiers: activeHwKeys,
       activeKeys: activeOfficialKeys,
+      hcKeys: stableHcKeys,
     });
 
     // --- Changelog ---
diff --git a/packages/app/src/hooks/useThemeColors.test.ts b/packages/app/src/hooks/useThemeColors.test.ts
index 7275e384..11050d19 100644
--- a/packages/app/src/hooks/useThemeColors.test.ts
+++ b/packages/app/src/hooks/useThemeColors.test.ts
@@ -170,4 +170,32 @@ describe('useThemeColors color maps', () => {
     }
     unmountOn();
   });
+
+  // Regression: deselecting a legend line must not recolor the remaining lines.
+  // The HC palette is sized/indexed by the key set it's generated over, so when
+  // it was generated over the *active* subset (no hcKeys), shrinking the
+  // selection re-sized the palette and shifted every remaining line's hue (most
+  // visible on single-vendor agentic runs spanning the full wheel). Passing a
+  // stable `hcKeys` (the full set with data) fixes each line's color.
+  it('keeps a line HC color stable across active subsets when hcKeys is the full set', () => {
+    const FULL = ['b200', 'b300']; // single-vendor (NVIDIA) agentic comparison
+
+    const all = renderHook<UseThemeColorsResult>(() =>
+      useThemeColors({ highContrast: true, activeKeys: ['b200', 'b300'], hcKeys: FULL }),
+    );
+    const b200WithBoth = all.result.current.resolveColor('b200');
+    const b300Color = all.result.current.resolveColor('b300');
+    all.unmount();
+
+    // b300 deselected → only b200 active, but hcKeys is still the full set.
+    const subset = renderHook<UseThemeColorsResult>(() =>
+      useThemeColors({ highContrast: true, activeKeys: ['b200'], hcKeys: FULL }),
+    );
+    const b200Alone = subset.result.current.resolveColor('b200');
+    subset.unmount();
+
+    expect(b200WithBoth).toMatch(/^#[0-9a-f]{6}$/iu);
+    expect(b200WithBoth).not.toBe(b300Color); // HC still produces distinct hues
+    expect(b200Alone).toBe(b200WithBoth); // deselecting b300 did NOT recolor b200
+  });
 });

From a912eab780a76ba015b21590d3c162e0fd4c37ea Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Tue, 23 Jun 2026 01:04:28 -0500
Subject: [PATCH 87/96] chore(security): bump dompurify override to >=3.4.11
 (GHSA-cmwh-pvxp-8882)

---
 pnpm-lock.yaml      | 52 ++++++++++++++++++++++++++++++++-------------
 pnpm-workspace.yaml |  2 +-
 2 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index cdd8a01d..bb7bb824 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -5,7 +5,7 @@ settings:
   excludeLinksFromLockfile: false
 
 overrides:
-  dompurify@<3.4.9: '>=3.4.9'
+  dompurify@<=3.4.10: '>=3.4.11'
   esbuild@>=0.27.3 <0.28.1: '>=0.28.1'
   form-data@>=4.0.0 <4.0.6: '>=4.0.6'
   hono@<4.12.21: '>=4.12.21'
@@ -20,7 +20,7 @@ importers:
     devDependencies:
       '@babel/core':
         specifier: ^7.29.6
-        version: 7.29.7
+        version: 7.29.7(supports-color@8.1.1)
       audit-ci:
         specifier: ^7.1.0
         version: 7.1.0
@@ -2994,9 +2994,6 @@ packages:
     resolution: {integrity: sha512-WkrWp9GR4KXfKGYzOLmTuGVi1UWFfws377n9cc55/tb6DuqyF6pcQ5AbiHEshaDpY9v6oaSr2XCDidGmMwdzIA==}
     engines: {node: '>=8'}
 
-  dompurify@3.4.10:
-    resolution: {integrity: sha512-0xzNv0e7oYC6yyuOGZIABPM4qtg3QxLFniDNPP4ZP90wR8Yq3zgwpRbrNiT4N3IKqDbbYFEJLV+JWEs19aZ//w==}
-
   dompurify@3.4.11:
     resolution: {integrity: sha512-zhlUV12GsaRzMsf9q5M254YhA4+VuF0fG+QFqu6aYpoGlKtz+w8//jBcGVYBgQkR5GHjUomejY84AV+/uPbWdw==}
 
@@ -5538,7 +5535,27 @@ snapshots:
       '@babel/helpers': 7.29.7
       '@babel/parser': 7.29.7
       '@babel/template': 7.29.7
-      '@babel/traverse': 7.29.7
+      '@babel/traverse': 7.29.7(supports-color@8.1.1)
+      '@babel/types': 7.29.7
+      '@jridgewell/remapping': 2.3.5
+      convert-source-map: 2.0.0
+      debug: 4.4.3(supports-color@8.1.1)
+      gensync: 1.0.0-beta.2
+      json5: 2.2.3
+      semver: 6.3.1
+    transitivePeerDependencies:
+      - supports-color
+
+  '@babel/core@7.29.7(supports-color@8.1.1)':
+    dependencies:
+      '@babel/code-frame': 7.29.7
+      '@babel/generator': 7.29.7
+      '@babel/helper-compilation-targets': 7.29.7
+      '@babel/helper-module-transforms': 7.29.7(@babel/core@7.29.7(supports-color@8.1.1))
+      '@babel/helpers': 7.29.7
+      '@babel/parser': 7.29.7
+      '@babel/template': 7.29.7
+      '@babel/traverse': 7.29.7(supports-color@8.1.1)
       '@babel/types': 7.29.7
       '@jridgewell/remapping': 2.3.5
       convert-source-map: 2.0.0
@@ -5569,17 +5586,26 @@ snapshots:
 
   '@babel/helper-module-imports@7.29.7':
     dependencies:
-      '@babel/traverse': 7.29.7
+      '@babel/traverse': 7.29.7(supports-color@8.1.1)
       '@babel/types': 7.29.7
     transitivePeerDependencies:
       - supports-color
 
+  '@babel/helper-module-transforms@7.29.7(@babel/core@7.29.7(supports-color@8.1.1))':
+    dependencies:
+      '@babel/core': 7.29.7(supports-color@8.1.1)
+      '@babel/helper-module-imports': 7.29.7
+      '@babel/helper-validator-identifier': 7.29.7
+      '@babel/traverse': 7.29.7(supports-color@8.1.1)
+    transitivePeerDependencies:
+      - supports-color
+
   '@babel/helper-module-transforms@7.29.7(@babel/core@7.29.7)':
     dependencies:
       '@babel/core': 7.29.7
       '@babel/helper-module-imports': 7.29.7
       '@babel/helper-validator-identifier': 7.29.7
-      '@babel/traverse': 7.29.7
+      '@babel/traverse': 7.29.7(supports-color@8.1.1)
     transitivePeerDependencies:
       - supports-color
 
@@ -5621,7 +5647,7 @@ snapshots:
       '@babel/parser': 7.29.7
       '@babel/types': 7.29.7
 
-  '@babel/traverse@7.29.7':
+  '@babel/traverse@7.29.7(supports-color@8.1.1)':
     dependencies:
       '@babel/code-frame': 7.29.7
       '@babel/generator': 7.29.7
@@ -7981,10 +8007,6 @@ snapshots:
     dependencies:
       path-type: 4.0.0
 
-  dompurify@3.4.10:
-    optionalDependencies:
-      '@types/trusted-types': 2.0.7
-
   dompurify@3.4.11:
     optionalDependencies:
       '@types/trusted-types': 2.0.7
@@ -8812,7 +8834,7 @@ snapshots:
 
   jest-worker@27.5.1:
     dependencies:
-      '@types/node': 25.9.3
+      '@types/node': 26.0.0
       merge-stream: 2.0.0
       supports-color: 8.1.1
 
@@ -9790,7 +9812,7 @@ snapshots:
       '@posthog/core': 1.35.3
       '@posthog/types': 1.390.2
       core-js: 3.49.0
-      dompurify: 3.4.10
+      dompurify: 3.4.11
       fflate: 0.4.8
       preact: 10.29.2
       query-selector-shadow-dom: 1.0.1
diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml
index c6ea723c..361059bb 100644
--- a/pnpm-workspace.yaml
+++ b/pnpm-workspace.yaml
@@ -25,7 +25,7 @@ auditConfig:
     - GHSA-h67p-54hq-rp68
 
 overrides:
-  dompurify@<3.4.9: '>=3.4.9'
+  dompurify@<=3.4.10: '>=3.4.11'
   esbuild@>=0.27.3 <0.28.1: '>=0.28.1'
   form-data@>=4.0.0 <4.0.6: '>=4.0.6'
   hono@<4.12.21: '>=4.12.21'

From ba6bc1ce6cedce56d45c8fcd96a74c3cd53879dc Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Tue, 23 Jun 2026 01:28:18 -0500
Subject: [PATCH 88/96] test(e2e): align selector testid with scenario-selector
 rename; rewrite x-axis toggle test for single-chart mode buttons

---
 .../app/cypress/e2e/dropdown-switching.cy.ts  |  4 +-
 .../app/cypress/e2e/historical-trends.cy.ts   |  4 +-
 .../app/cypress/e2e/ttft-x-axis-toggle.cy.ts  | 64 +++++++++----------
 packages/app/cypress/e2e/url-params.cy.ts     | 10 +--
 4 files changed, 39 insertions(+), 43 deletions(-)

diff --git a/packages/app/cypress/e2e/dropdown-switching.cy.ts b/packages/app/cypress/e2e/dropdown-switching.cy.ts
index ac88dc84..4bc8b695 100644
--- a/packages/app/cypress/e2e/dropdown-switching.cy.ts
+++ b/packages/app/cypress/e2e/dropdown-switching.cy.ts
@@ -17,10 +17,10 @@ describe('Dropdown one-click switching', () => {
     cy.get('[data-testid="model-selector"]').should('have.attr', 'aria-expanded', 'true');
     cy.get('[role="option"]').should('have.length.greaterThan', 0);
 
-    cy.get('[data-testid="sequence-selector"]').click();
+    cy.get('[data-testid="scenario-selector"]').click();
 
     cy.get('[data-testid="model-selector"]').should('have.attr', 'aria-expanded', 'false');
-    cy.get('[data-testid="sequence-selector"]').should('have.attr', 'aria-expanded', 'true');
+    cy.get('[data-testid="scenario-selector"]').should('have.attr', 'aria-expanded', 'true');
     cy.get('[role="option"]').should('have.length.greaterThan', 0);
   });
 
diff --git a/packages/app/cypress/e2e/historical-trends.cy.ts b/packages/app/cypress/e2e/historical-trends.cy.ts
index f0a70a56..55b0e274 100644
--- a/packages/app/cypress/e2e/historical-trends.cy.ts
+++ b/packages/app/cypress/e2e/historical-trends.cy.ts
@@ -88,8 +88,8 @@ describe('Historical Trends — Content & Interactions', () => {
       delete doc.body.dataset.scrollLocked;
       doc.body.style.removeProperty('pointer-events');
     });
-    cy.get('[data-testid="sequence-selector"]').should('be.visible');
-    cy.get('[data-testid="sequence-selector"]').click();
+    cy.get('[data-testid="scenario-selector"]').should('be.visible');
+    cy.get('[data-testid="scenario-selector"]').click();
     cy.get('[role="option"]').should('have.length.greaterThan', 0);
     cy.get('body').type('{esc}');
   });
diff --git a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
index e17a4aff..636a7ccf 100644
--- a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
+++ b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
@@ -1,46 +1,42 @@
-describe('TTFT X-Axis Toggle (E2E chart)', () => {
+describe('X-Axis Mode Toggle (inference chart)', () => {
   before(() => {
-    cy.window().then((win) => {
-      win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now()));
+    cy.visit('/inference', {
+      onBeforeLoad(win) {
+        win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now()));
+      },
     });
-    cy.visit('/inference');
-    cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 2);
+    cy.get('[data-testid="x-axis-mode-buttons"]').should('be.visible');
+    cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 1);
   });
 
-  it('shows the x-axis dropdown in the e2e chart heading', () => {
-    cy.get('[data-testid="chart-figure"]')
-      .eq(1)
-      .find('h2 button')
-      .should('contain.text', 'vs.')
-      .and('contain.text', 'Latency');
+  it('shows the x-axis mode buttons with Interactivity active by default', () => {
+    cy.get('[data-testid="x-axis-mode-ttft"]').should('be.visible');
+    cy.get('[data-testid="x-axis-mode-e2e"]').should('be.visible');
+    cy.get('[data-testid="x-axis-mode-interactivity"]')
+      .should('be.visible')
+      .and('have.attr', 'aria-selected', 'true');
+    cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'vs. Interactivity');
   });
 
-  it('opens popover with three x-axis options', () => {
-    cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click();
-    cy.get('[data-slot="popover-content"]').within(() => {
-      cy.contains('End-to-end Latency').should('exist');
-      cy.contains('P99 TTFT').should('exist');
-      cy.contains('Median TTFT').should('exist');
-    });
-  });
-
-  it('switches x-axis to P99 TTFT and updates the heading', () => {
-    cy.get('[data-slot="popover-content"]').contains('P99 TTFT').click();
-    cy.get('[data-testid="chart-figure"]').eq(1).find('h2').should('contain.text', 'P99 TTFT');
+  it('switches the x-axis to TTFT and updates the heading', () => {
+    cy.get('[data-testid="x-axis-mode-ttft"]').click();
+    cy.get('[data-testid="x-axis-mode-ttft"]').should('have.attr', 'aria-selected', 'true');
+    cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Time To First Token');
   });
 
-  it('switches x-axis to Median TTFT and updates the heading', () => {
-    cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click();
-    cy.get('[data-slot="popover-content"]').contains('Median TTFT').click();
-    cy.get('[data-testid="chart-figure"]').eq(1).find('h2').should('contain.text', 'Median TTFT');
+  it('switches the x-axis to E2E Latency and updates the heading', () => {
+    cy.get('[data-testid="x-axis-mode-e2e"]').click();
+    cy.get('[data-testid="x-axis-mode-e2e"]').should('have.attr', 'aria-selected', 'true');
+    cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'End-to-end Latency');
   });
 
-  it('switches back to End-to-end Latency', () => {
-    cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click();
-    cy.get('[data-slot="popover-content"]').contains('End-to-end Latency').click();
-    cy.get('[data-testid="chart-figure"]')
-      .eq(1)
-      .find('h2')
-      .should('contain.text', 'End-to-end Latency');
+  it('switches back to Interactivity', () => {
+    cy.get('[data-testid="x-axis-mode-interactivity"]').click();
+    cy.get('[data-testid="x-axis-mode-interactivity"]').should(
+      'have.attr',
+      'aria-selected',
+      'true',
+    );
+    cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'vs. Interactivity');
   });
 });
diff --git a/packages/app/cypress/e2e/url-params.cy.ts b/packages/app/cypress/e2e/url-params.cy.ts
index 3c480686..927aee5f 100644
--- a/packages/app/cypress/e2e/url-params.cy.ts
+++ b/packages/app/cypress/e2e/url-params.cy.ts
@@ -21,7 +21,7 @@ const visitWithErrorSpy = (path: string) => {
 };
 
 const assertNoHydrationMismatch = () => {
-  cy.get('[data-testid="sequence-selector"]').should('be.visible');
+  cy.get('[data-testid="scenario-selector"]').should('be.visible');
   cy.get('@consoleError').then((spy) => {
     const calls = (spy as unknown as { args: unknown[][] }).args;
     const hydration = calls.filter((args) =>
@@ -152,7 +152,7 @@ describe('URL Parameter Persistence', () => {
 
     it('/inference?i_seq=1k/1k seeds the sequence without a hydration error', () => {
       visitWithErrorSpy('/inference?i_seq=1k/1k');
-      cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K');
+      cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K');
       assertNoHydrationMismatch();
     });
 
@@ -160,13 +160,13 @@ describe('URL Parameter Persistence', () => {
       // Visit the canonical model-prefixed slug so the assertion is directly
       // about the rendered page, not about a bare-slug redirect interleaving.
       visitWithErrorSpy('/compare/deepseek-r1-h100-vs-h200?i_seq=1k/1k');
-      cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K');
+      cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K');
       assertNoHydrationMismatch();
     });
 
     it('/compare/[slug] with invalid ?i_seq=junk falls back to the seeded default', () => {
       visitWithErrorSpy('/compare/deepseek-r1-h100-vs-h200?i_seq=junk');
-      cy.get('[data-testid="sequence-selector"]')
+      cy.get('[data-testid="scenario-selector"]')
         .invoke('text')
         .should('not.contain', 'junk')
         .and('match', /[18]K . [18]K/u);
@@ -228,7 +228,7 @@ describe('URL Parameter Persistence', () => {
       // `effectivePrecisions` intersects the selection with available precisions
       // and the UI may render the fallback. dsr1 + fp8 + 1k/1k is supported.
       visitWithErrorSpy('/inference?i_seq=1k/1k&g_model=DeepSeek-R1-0528&i_prec=fp8');
-      cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K');
+      cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K');
       cy.get('[data-testid="model-selector"]').should('contain.text', 'DeepSeek');
       cy.get('[data-testid="precision-multiselect"]').should('contain.text', 'FP8');
       assertNoHydrationMismatch();

From ada19b54e41ea3ad87cdfc22dd3d27e1a3d7df44 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Tue, 23 Jun 2026 01:41:03 -0500
Subject: [PATCH 89/96] test(datasets): component tests for distribution card,
 trace flamegraph (incl deep-link), and dataset list states

---
 .../app/cypress/component/dataset-list.cy.tsx | 93 +++++++++++++++++++
 .../component/distribution-card.cy.tsx        | 45 +++++++++
 .../cypress/component/trace-flamegraph.cy.tsx | 86 +++++++++++++++++
 3 files changed, 224 insertions(+)
 create mode 100644 packages/app/cypress/component/dataset-list.cy.tsx
 create mode 100644 packages/app/cypress/component/distribution-card.cy.tsx
 create mode 100644 packages/app/cypress/component/trace-flamegraph.cy.tsx

diff --git a/packages/app/cypress/component/dataset-list.cy.tsx b/packages/app/cypress/component/dataset-list.cy.tsx
new file mode 100644
index 00000000..f7cfcb9a
--- /dev/null
+++ b/packages/app/cypress/component/dataset-list.cy.tsx
@@ -0,0 +1,93 @@
+import { QueryClient, QueryClientProvider } from '@tanstack/react-query';
+import { AppRouterContext } from 'next/dist/shared/lib/app-router-context.shared-runtime';
+
+import { DatasetList } from '@/components/datasets/dataset-list';
+import type { DatasetRecord } from '@/hooks/api/use-datasets';
+
+const datasets: DatasetRecord[] = [
+  {
+    id: 'ds-1',
+    slug: 'cc-traces-weka-full',
+    label: 'cc-traces-weka (full)',
+    variant: 'full',
+    description: 'Every captured request, unmodified.',
+    hf_url: 'https://huggingface.co/datasets/semianalysisai/cc-traces-weka-full',
+    license: 'apache-2.0',
+    conversation_count: 1234,
+    summary: {
+      totalIn: 5_000_000,
+      totalOut: 250_000,
+      cachedPct: 0.82,
+      mainTurns: 9800,
+      subagentGroups: 540,
+    },
+    ingested_at: '2026-06-20T00:00:00Z',
+  },
+  {
+    id: 'ds-2',
+    slug: 'cc-traces-weka-256k',
+    label: 'cc-traces-weka (256k)',
+    variant: '256k',
+    description: 'Turns trimmed to a 256k context window.',
+    hf_url: null,
+    license: 'apache-2.0',
+    conversation_count: 980,
+    summary: {
+      totalIn: 3_200_000,
+      totalOut: 180_000,
+      cachedPct: 0.79,
+      mainTurns: 7600,
+      subagentGroups: 410,
+    },
+    ingested_at: '2026-06-19T00:00:00Z',
+  },
+];
+
+function createMockRouter() {
+  return {
+    push: cy.stub(),
+    replace: cy.stub(),
+    refresh: cy.stub(),
+    back: cy.stub(),
+    forward: cy.stub(),
+    prefetch: cy.stub().resolves(),
+  };
+}
+
+function mountList() {
+  const queryClient = new QueryClient({ defaultOptions: { queries: { retry: false } } });
+  cy.mount(
+    <AppRouterContext.Provider value={createMockRouter()}>
+      <QueryClientProvider client={queryClient}>
+        <DatasetList />
+      </QueryClientProvider>
+    </AppRouterContext.Provider>,
+  );
+}
+
+describe('DatasetList', () => {
+  it('renders a card per dataset with its summary stats', () => {
+    cy.intercept('GET', '/api/v1/datasets', { statusCode: 200, body: datasets }).as('list');
+    mountList();
+    cy.wait('@list');
+    cy.contains('cc-traces-weka (full)').should('be.visible');
+    cy.contains('cc-traces-weka (256k)').should('be.visible');
+    cy.contains('1,234').should('be.visible'); // conversation_count, localized
+    cy.contains('82%').should('be.visible'); // cachedPct
+    cy.get('a[href="/datasets/cc-traces-weka-full"]').should('exist');
+  });
+
+  it('shows the empty state when no datasets are ingested', () => {
+    cy.intercept('GET', '/api/v1/datasets', { statusCode: 200, body: [] }).as('empty');
+    mountList();
+    cy.wait('@empty');
+    cy.contains('No datasets ingested yet.').should('be.visible');
+  });
+
+  it('shows the error state when the request fails', () => {
+    cy.intercept('GET', '/api/v1/datasets', { statusCode: 500, body: { error: 'boom' } }).as('err');
+    mountList();
+    cy.wait('@err');
+    cy.contains('Failed to load datasets.').should('be.visible');
+  });
+});
diff --git a/packages/app/cypress/component/distribution-card.cy.tsx b/packages/app/cypress/component/distribution-card.cy.tsx
new file mode 100644
index 00000000..fb7e5461
--- /dev/null
+++ b/packages/app/cypress/component/distribution-card.cy.tsx
@@ -0,0 +1,45 @@
+import { DistributionCard } from '@/components/datasets/distribution-card';
+import type { Distribution } from '@/hooks/api/use-datasets';
+
+const distribution: Distribution = {
+  bins: [
+    { x0: 0, x1: 100, count: 5 },
+    { x0: 100, x1: 200, count: 20 },
+    { x0: 200, x1: 300, count: 12 },
+    { x0: 300, x1: 400, count: 3 },
+  ],
+  stats: { count: 40, min: 10, max: 390, mean: 180, median: 175, p90: 320 },
+};
+
+describe('DistributionCard', () => {
+  it('renders the title, summary stats, and one bar per bin', () => {
+    cy.mount(
+      <DistributionCard title="Input tokens per turn" unit="tok" distribution={distribution} />,
+    );
+    cy.contains('Input tokens per turn').should('be.visible');
+    cy.contains('n=40').should('be.visible');
+    cy.contains('median 175').should('be.visible');
+    cy.contains('p90 320').should('be.visible');
+    // One filled bar rect per bin (ChartHover may add a transparent overlay rect).
+    cy.get('rect[class*="fill-primary"]').should('have.length', distribution.bins.length);
+  });
+
+  it('shows a "No data" placeholder when no distribution is provided', () => {
+    cy.mount(<DistributionCard title="Empty metric" unit="tok" />);
+    cy.contains('Empty metric').should('be.visible');
+    cy.contains('No data').should('be.visible');
+    cy.get('rect[class*="fill-primary"]').should('not.exist');
+  });
+
+  it('marks the chart as log scale when scale="log"', () => {
+    cy.mount(
+      <DistributionCard
+        title="Output tokens per turn"
+        unit="tok"
+        scale="log"
+        distribution={distribution}
+      />,
+    );
+    cy.contains('log scale').should('be.visible');
+  });
+});
diff --git a/packages/app/cypress/component/trace-flamegraph.cy.tsx b/packages/app/cypress/component/trace-flamegraph.cy.tsx
new file mode 100644
index 00000000..1be90e0c
--- /dev/null
+++ b/packages/app/cypress/component/trace-flamegraph.cy.tsx
@@ -0,0 +1,86 @@
+import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph';
+import type { ConversationStructure } from '@/hooks/api/use-datasets';
+
+// Two main turns followed by one subagent group with two child turns.
+// Node indices: 0 = turn, 1 = turn, 2 = subagent (so its rows key off `g-2`).
+const structure: ConversationStructure = {
+  blockSize: 64,
+  nodes: [
+    { kind: 'turn', turnIndex: 0, model: 'claude', in: 1000, out: 200, cached: 600, uncached: 400 },
+    {
+      kind: 'turn',
+      turnIndex: 1,
+      model: 'claude',
+      in: 2000,
+      out: 300,
+      cached: 1500,
+      uncached: 500,
+    },
+    {
+      kind: 'subagent',
+      label: 'Subagent: search',
+      agentId: 'agent-1',
+      durationMs: 12000,
+      in: 5000,
+      out: 800,
+      cached: 3000,
+      uncached: 2000,
+      children: [
+        {
+          kind: 'turn',
+          turnIndex: 0,
+          model: 'claude',
+          in: 2500,
+          out: 400,
+          cached: 1500,
+          uncached: 1000,
+        },
+        {
+          kind: 'turn',
+          turnIndex: 1,
+          model: 'claude',
+          in: 2500,
+          out: 400,
+          cached: 1500,
+          uncached: 1000,
+        },
+      ],
+    },
+  ],
+  totals: { in: 8000, out: 1300, cached: 5100, uncached: 2900, numTurns: 2, numSubagentGroups: 1 },
+};
+
+describe('TraceFlamegraph', () => {
+  it('renders the legend, main-turn rows, and the subagent group header', () => {
+    cy.mount(<TraceFlamegraph structure={structure} />);
+    cy.contains('Cached prefix').should('be.visible');
+    cy.contains('Uncached input').should('be.visible');
+    cy.contains('Output').should('be.visible');
+    cy.get('[data-rowkey="t-0"]').should('contain.text', 'Turn 1');
+    cy.get('[data-rowkey="t-1"]').should('contain.text', 'Turn 2');
+    cy.contains('Subagent: search').should('be.visible');
+  });
+
+  it('keeps subagent children collapsed until the group is expanded', () => {
+    cy.mount(<TraceFlamegraph structure={structure} />);
+    cy.get('[data-rowkey="g-2-c-0"]').should('not.exist');
+    cy.contains('button', 'Subagent: search').click();
+    cy.get('[data-rowkey="g-2-c-0"]').should('be.visible');
+    cy.get('[data-rowkey="g-2-c-1"]').should('be.visible');
+  });
+
+  it('expand all / collapse all toggles every subagent group', () => {
+    cy.mount(<TraceFlamegraph structure={structure} />);
+    cy.contains('button', 'Expand all').click();
+    cy.get('[data-rowkey="g-2-c-0"]').should('be.visible');
+    cy.contains('button', 'Collapse all').click();
+    cy.get('[data-rowkey="g-2-c-0"]').should('not.exist');
+  });
+
+  it('auto-expands and highlights the target group child for a request-timeline deep link', () => {
+    cy.mount(
+      <TraceFlamegraph structure={structure} highlightAgentId="agent-1" highlightTurn={1} />,
+    );
+    cy.get('[data-rowkey="g-2-c-1"]').should('be.visible').and('have.class', 'ring-primary');
+  });
+});

From 1c61ee3f597e22d33e891b73f7f95511a73844d3 Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Tue, 23 Jun 2026 01:47:02 -0500
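Subject: [PATCH 90/96] refactor(datasets): extract shared compact() formatter,
 dedupe 5 local copies

The shared helper is the distribution-card variant, so this is not a pure
no-op for the other four call sites: values with 0 < |n| < 1 now render with
two decimals (e.g. 0.82 -> "0.82") instead of being rounded to an integer,
and trace-flamegraph additionally gains the >= 1e9 "B" tier it lacked.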
---
 .../src/components/datasets/conversation-view.tsx    |  9 +--------
 .../app/src/components/datasets/dataset-detail.tsx   |  9 +--------
 .../app/src/components/datasets/dataset-list.tsx     |  9 +--------
 .../src/components/datasets/distribution-card.tsx    | 11 +----------
 packages/app/src/components/datasets/format.ts       | 12 ++++++++++++
 .../app/src/components/datasets/trace-flamegraph.tsx |  9 +--------
 6 files changed, 17 insertions(+), 42 deletions(-)
 create mode 100644 packages/app/src/components/datasets/format.ts

diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx
index 739d3bb2..d39b83d9 100644
--- a/packages/app/src/components/datasets/conversation-view.tsx
+++ b/packages/app/src/components/datasets/conversation-view.tsx
@@ -6,14 +6,7 @@ import { useSearchParams } from 'next/navigation';
 import { Card } from '@/components/ui/card';
 import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph';
 import { useDatasetConversation } from '@/hooks/api/use-datasets';
-
-function compact(n: number): string {
-  const abs = Math.abs(n);
-  if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`;
-  if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`;
-  if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`;
-  return String(Math.round(n));
-}
+import { compact } from './format';
 
 export function ConversationView({ slug, convId }: { slug: string; convId: string }) {
   const { data, isLoading, isError } = useDatasetConversation(slug, convId);
diff --git a/packages/app/src/components/datasets/dataset-detail.tsx b/packages/app/src/components/datasets/dataset-detail.tsx
index 57c50649..9410a505 100644
--- a/packages/app/src/components/datasets/dataset-detail.tsx
+++ b/packages/app/src/components/datasets/dataset-detail.tsx
@@ -18,14 +18,7 @@ import {
   type ConversationSort,
 } from '@/hooks/api/use-datasets';
 import { track } from '@/lib/analytics';
-
-function compact(n: number): string {
-  const abs = Math.abs(n);
-  if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`;
-  if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`;
-  if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`;
-  return String(Math.round(n));
-}
+import { compact } from './format';
 
 const PAGE = 50;
 
diff --git a/packages/app/src/components/datasets/dataset-list.tsx b/packages/app/src/components/datasets/dataset-list.tsx
index 5fcc0dfe..84b279db 100644
--- a/packages/app/src/components/datasets/dataset-list.tsx
+++ b/packages/app/src/components/datasets/dataset-list.tsx
@@ -5,14 +5,7 @@ import Link from 'next/link';
 import { Card } from '@/components/ui/card';
 import { useDatasets, type DatasetRecord } from '@/hooks/api/use-datasets';
 import { track } from '@/lib/analytics';
-
-function compact(n: number): string {
-  const abs = Math.abs(n);
-  if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`;
-  if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`;
-  if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`;
-  return String(Math.round(n));
-}
+import { compact } from './format';
 
 function DatasetCard({ d }: { d: DatasetRecord }) {
   const s = d.summary ?? {};
diff --git a/packages/app/src/components/datasets/distribution-card.tsx b/packages/app/src/components/datasets/distribution-card.tsx
index 7abc367f..d0c0f166 100644
--- a/packages/app/src/components/datasets/distribution-card.tsx
+++ b/packages/app/src/components/datasets/distribution-card.tsx
@@ -5,16 +5,7 @@ import { useMemo } from 'react';
 import { Card } from '@/components/ui/card';
 import { ChartHover, type HoverItem } from '@/components/inference/agentic-point/chart-hover';
 import type { Distribution } from '@/hooks/api/use-datasets';
-
-/** Compact token/count formatter: 1234 → "1.2k", 1_200_000 → "1.2M". */
-function compact(n: number): string {
-  const abs = Math.abs(n);
-  if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`;
-  if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`;
-  if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`;
-  if (abs > 0 && abs < 1) return n.toFixed(2);
-  return String(Math.round(n));
-}
+import { compact } from './format';
 
 interface DistributionCardProps {
   title: string;
diff --git a/packages/app/src/components/datasets/format.ts b/packages/app/src/components/datasets/format.ts
new file mode 100644
index 00000000..f6f5530c
--- /dev/null
+++ b/packages/app/src/components/datasets/format.ts
@@ -0,0 +1,12 @@
+/**
+ * Compact number formatter for dataset token/count displays:
+ * 1234 → "1.2k", 1_200_000 → "1.2M", 3.2e9 → "3.2B", 0.82 → "0.82".
+ */
+export function compact(n: number): string {
+  const abs = Math.abs(n);
+  if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`;
+  if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`;
+  if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`;
+  if (abs > 0 && abs < 1) return n.toFixed(2);
+  return String(Math.round(n));
+}
diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx
index a577193b..12ecb4a4 100644
--- a/packages/app/src/components/datasets/trace-flamegraph.tsx
+++ b/packages/app/src/components/datasets/trace-flamegraph.tsx
@@ -4,14 +4,7 @@ import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
 import { createPortal } from 'react-dom';
 
 import type { ConversationStructure, StructureNode } from '@/hooks/api/use-datasets';
-
-/** Compact token formatter: 1234 → "1.2k", 1_200_000 → "1.2M". */
-function compact(n: number): string {
-  const abs = Math.abs(n);
-  if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`;
-  if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`;
-  return String(Math.round(n));
-}
+import { compact } from './format';
 
 // Stacked-bar segment colors. Cached prefix vs uncached input vs output —
 // fixed hues (theme-independent) so the meaning is stable in light/dark.

From e2e5424e7071d380d05b7c1bcfddfc5bccfc3c5b Mon Sep 17 00:00:00 2001
From: adibarra <93070681+adibarra@users.noreply.github.com>
Date: Tue, 23 Jun 2026 10:26:34 -0500
Subject: [PATCH 91/96] refactor(db): squash agentic migrations into
 007_agentic.sql so numbering doesn't collide with master

---
 .claude/agents/ingest.md                      |   2 +-
 .../db/migrations/002_agentic_scenario.sql    |  30 --
 .../migrations/003_agentic_availability.sql   |  21 --
 packages/db/migrations/004_offload_mode.sql   |  42 ---
 .../migrations/006_agentic_trace_replay.sql   |  34 --
 packages/db/migrations/007_agentic.sql        | 326 ++++++++++++++++++
 .../007_agentic_trace_server_metrics_json.sql |  17 -
 .../008_agentic_aggregate_stats.sql           |  18 -
 .../migrations/009_agentic_chart_series.sql   |  19 -
 .../010_agentic_request_timeline.sql          |  15 -
 packages/db/migrations/011_datasets.sql       |  55 ---
 packages/db/migrations/012_run_datasets.sql   |  19 -
 12 files changed, 327 insertions(+), 271 deletions(-)
 delete mode 100644 packages/db/migrations/002_agentic_scenario.sql
 delete mode 100644 packages/db/migrations/003_agentic_availability.sql
 delete mode 100644 packages/db/migrations/004_offload_mode.sql
 delete mode 100644 packages/db/migrations/006_agentic_trace_replay.sql
 create mode 100644 packages/db/migrations/007_agentic.sql
 delete mode 100644 packages/db/migrations/007_agentic_trace_server_metrics_json.sql
 delete mode 100644 packages/db/migrations/008_agentic_aggregate_stats.sql
 delete mode 100644 packages/db/migrations/009_agentic_chart_series.sql
 delete mode 100644 packages/db/migrations/010_agentic_request_timeline.sql
 delete mode 100644 packages/db/migrations/011_datasets.sql
 delete mode 100644 packages/db/migrations/012_run_datasets.sql

diff --git a/.claude/agents/ingest.md b/.claude/agents/ingest.md
index aa0099ac..4ecbc1dd 100644
--- a/.claude/agents/ingest.md
+++ b/.claude/agents/ingest.md
@@ -178,7 +178,7 @@ cd packages/db && DATABASE_WRITE_URL='<direct write url>' \
   [--label "…"] [--variant full|256k] [--description "…"] [--limit N]
 ```
 
-It populates the `datasets` + `dataset_conversations` tables (migration `011_datasets.sql`) that back the `/datasets` pages — upsert/replace per dataset, then purge the API cache like any other ingest. Same write-URL rule applies (direct, non-pooled, provided by the invoker).
+It populates the `datasets` + `dataset_conversations` tables (migration `007_agentic.sql`) that back the `/datasets` pages — upsert/replace per dataset, then purge the API cache like any other ingest. Same write-URL rule applies (direct, non-pooled, provided by the invoker).
 
 ## Don't
 
diff --git a/packages/db/migrations/002_agentic_scenario.sql b/packages/db/migrations/002_agentic_scenario.sql
deleted file mode 100644
index c143914e..00000000
--- a/packages/db/migrations/002_agentic_scenario.sql
+++ /dev/null
@@ -1,30 +0,0 @@
--- Support agentic scenarios in benchmark_results.
---
--- Scenarios are discriminated by benchmark_type:
---   'single_turn'     — fixed-seq-len runs (1k1k, 1k8k, 8k1k, …). isl/osl set.
---   'agentic_traces'  — trace-replay agentic runs. isl/osl NULL.
---
--- conc retains its meaning (concurrent users/requests) for both.
-
--- 1) isl/osl become nullable for agentic rows
-alter table benchmark_results
-  alter column isl drop not null,
-  alter column osl drop not null;
-
--- 2) CHECK constraints: positive-or-null
-alter table benchmark_results
-  drop constraint benchmark_results_isl_positive,
-  drop constraint benchmark_results_osl_positive;
-
-alter table benchmark_results
-  add constraint benchmark_results_isl_positive check (isl is null or isl > 0),
-  add constraint benchmark_results_osl_positive check (osl is null or osl > 0);
-
--- 3) Uniqueness must treat (NULL, NULL) pairs as equal so agentic rows
---    can't duplicate on (workflow_run_id, config_id, benchmark_type, conc).
-alter table benchmark_results
-  drop constraint benchmark_results_unique;
-
-alter table benchmark_results
-  add constraint benchmark_results_unique unique nulls not distinct
-    (workflow_run_id, config_id, benchmark_type, isl, osl, conc);
diff --git a/packages/db/migrations/003_agentic_availability.sql b/packages/db/migrations/003_agentic_availability.sql
deleted file mode 100644
index e96cbd50..00000000
--- a/packages/db/migrations/003_agentic_availability.sql
+++ /dev/null
@@ -1,21 +0,0 @@
--- Extend the availability table to cover agentic scenarios.
---
--- The 002 migration relaxed benchmark_results.isl/osl to nullable; do the same
--- for availability and add benchmark_type so the frontend can enumerate
--- agentic vs single_turn scenarios per model/date.
---
--- Postgres primary keys require every column to be NOT NULL, so we drop the PK
--- and replace it with a UNIQUE NULLS NOT DISTINCT constraint — functionally
--- equivalent except it allows isl/osl to be NULL for agentic rows.
-
-alter table availability
-  drop constraint availability_pkey;
-
-alter table availability
-  alter column isl drop not null,
-  alter column osl drop not null,
-  add column benchmark_type text not null default 'single_turn';
-
-alter table availability
-  add constraint availability_natural_key unique nulls not distinct
-    (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date);
diff --git a/packages/db/migrations/004_offload_mode.sql b/packages/db/migrations/004_offload_mode.sql
deleted file mode 100644
index 24b617f1..00000000
--- a/packages/db/migrations/004_offload_mode.sql
+++ /dev/null
@@ -1,42 +0,0 @@
--- Add offload_mode as a first-class dimension on benchmark_results.
---
--- KV-cache offload (on/off) is a meaningful sweep dimension for agentic-trace
--- runs: a single run may emit two rows for the same (config, isl, osl, conc)
--- — one with offload disabled, one enabled. The pre-existing unique key
--- collapsed those into one row, forcing the ingest to skip variants.
---
--- For fixed-seq runs `offload_mode` defaults to 'off', which matches the
--- assumption baked into the existing 5,500+ rows.
-
-alter table benchmark_results
-  add column offload_mode text not null default 'off';
-
--- Backfill agentic rows from the offload_mode value already living in metrics
--- JSONB (set during the earlier agentic ingest backfill).
-update benchmark_results
-   set offload_mode = metrics->>'offload_mode'
- where benchmark_type = 'agentic_traces'
-   and metrics ? 'offload_mode';
-
--- Replace the unique constraint so on/off variants can coexist.
-alter table benchmark_results
-  drop constraint benchmark_results_unique;
-
-alter table benchmark_results
-  add constraint benchmark_results_unique unique nulls not distinct
-    (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode);
-
--- Rebuild the latest-per-config materialized view to dedupe by offload_mode too.
-drop materialized view if exists latest_benchmarks cascade;
-
-create materialized view latest_benchmarks as
-select distinct on (br.config_id, br.conc, br.isl, br.osl, br.offload_mode)
-  br.*
-from benchmark_results br
-join latest_workflow_runs wr on wr.id = br.workflow_run_id
-where br.error is null
-order by br.config_id, br.conc, br.isl, br.osl, br.offload_mode, br.date desc;
-
-create unique index latest_benchmarks_pk
-  on latest_benchmarks (config_id, conc, isl, osl, offload_mode) nulls not distinct;
-create index latest_benchmarks_model_idx on latest_benchmarks (config_id);
diff --git a/packages/db/migrations/006_agentic_trace_replay.sql b/packages/db/migrations/006_agentic_trace_replay.sql
deleted file mode 100644
index 398bc725..00000000
--- a/packages/db/migrations/006_agentic_trace_replay.sql
+++ /dev/null
@@ -1,34 +0,0 @@
--- Capture raw aiperf trace files per agentic benchmark point.
---
--- The aiperf harness produces two per-point export files inside each
--- `agentic_<suffix>` artifact:
---   - profile_export.jsonl         (~2 MB raw, per-request data)
---   - server_metrics_export.csv    (~20 KB raw, periodic Prometheus snapshots)
---
--- We persist them so the dashboard can later show per-request distributions,
--- KV cache utilization over time, and conversation traces without needing to
--- re-download the GitHub artifacts. Storage stays in Postgres (TOASTed) — at
--- ~500 KB per point post-gzip the total fits comfortably without a separate
--- blob service.
---
--- Mirrors the existing `server_logs` pattern (id-keyed sibling table + FK
--- column on benchmark_results). Older, non-aiperf agentic runs simply have a
--- NULL `trace_replay_id`.
-
-create table agentic_trace_replay (
-  id                                bigserial   primary key,
-  -- gzip(profile_export.jsonl); null when only the server metrics file existed
-  profile_export_jsonl_gz           bytea,
-  profile_export_uncompressed_size  bigint,
-  -- raw csv bytes; null when only the profile file existed
-  server_metrics_csv                bytea,
-  server_metrics_csv_size           bigint,
-  created_at                        timestamptz not null default now()
-);
-
-alter table benchmark_results
-  add column trace_replay_id bigint references agentic_trace_replay(id);
-
-create index benchmark_results_trace_replay_idx
-  on benchmark_results (trace_replay_id)
-  where trace_replay_id is not null;
diff --git a/packages/db/migrations/007_agentic.sql b/packages/db/migrations/007_agentic.sql
new file mode 100644
index 00000000..eceea82e
--- /dev/null
+++ b/packages/db/migrations/007_agentic.sql
@@ -0,0 +1,326 @@
+-- 007_agentic.sql
+--
+-- Squashed agentic-benchmark + datasets schema. Collapses the feat/agentx
+-- migrations 002_agentic_scenario .. 012_run_datasets into one file that sorts
+-- after master's highest migration (006_benchmark_results_workers), so the
+-- branch's numbering no longer collides with master's 002-006. None of the
+-- collapsed migrations had been applied to any deployed database.
+--
+-- Statement order is preserved exactly. The latest_benchmarks recreate uses
+-- 'select br.*', so it retains every benchmark_results column added earlier
+-- (including master's 'workers' from 006) and re-keys the view on offload_mode.
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 002_agentic_scenario.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Support agentic scenarios in benchmark_results.
+--
+-- Scenarios are discriminated by benchmark_type:
+--   'single_turn'     — fixed-seq-len runs (1k1k, 1k8k, 8k1k, …). isl/osl set.
+--   'agentic_traces'  — trace-replay agentic runs. isl/osl NULL.
+--
+-- conc retains its meaning (concurrent users/requests) for both.
+
+-- 1) isl/osl become nullable for agentic rows
+alter table benchmark_results
+  alter column isl drop not null,
+  alter column osl drop not null;
+
+-- 2) CHECK constraints: positive-or-null
+alter table benchmark_results
+  drop constraint benchmark_results_isl_positive,
+  drop constraint benchmark_results_osl_positive;
+
+alter table benchmark_results
+  add constraint benchmark_results_isl_positive check (isl is null or isl > 0),
+  add constraint benchmark_results_osl_positive check (osl is null or osl > 0);
+
+-- 3) Uniqueness must treat (NULL, NULL) pairs as equal so agentic rows
+--    can't duplicate on (workflow_run_id, config_id, benchmark_type, conc).
+alter table benchmark_results
+  drop constraint benchmark_results_unique;
+
+alter table benchmark_results
+  add constraint benchmark_results_unique unique nulls not distinct
+    (workflow_run_id, config_id, benchmark_type, isl, osl, conc);
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 003_agentic_availability.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Extend the availability table to cover agentic scenarios.
+--
+-- The former 002 section above relaxed benchmark_results.isl/osl to nullable; do the same
+-- for availability and add benchmark_type so the frontend can enumerate
+-- agentic vs single_turn scenarios per model/date.
+--
+-- Postgres primary keys require every column to be NOT NULL, so we drop the PK
+-- and replace it with a UNIQUE NULLS NOT DISTINCT constraint — functionally
+-- equivalent except it allows isl/osl to be NULL for agentic rows.
+
+alter table availability
+  drop constraint availability_pkey;
+
+alter table availability
+  alter column isl drop not null,
+  alter column osl drop not null,
+  add column benchmark_type text not null default 'single_turn';
+
+alter table availability
+  add constraint availability_natural_key unique nulls not distinct
+    (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date);
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 004_offload_mode.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Add offload_mode as a first-class dimension on benchmark_results.
+--
+-- KV-cache offload (on/off) is a meaningful sweep dimension for agentic-trace
+-- runs: a single run may emit two rows for the same (config, isl, osl, conc)
+-- — one with offload disabled, one enabled. The pre-existing unique key
+-- collapsed those into one row, forcing the ingest to skip variants.
+--
+-- For fixed-seq runs `offload_mode` defaults to 'off', which matches the
+-- assumption baked into the existing 5,500+ rows.
+
+alter table benchmark_results
+  add column offload_mode text not null default 'off';
+
+-- Backfill agentic rows from the offload_mode value already living in metrics
+-- JSONB (set during the earlier agentic ingest backfill).
+update benchmark_results
+   set offload_mode = metrics->>'offload_mode'
+ where benchmark_type = 'agentic_traces'
+   and metrics ? 'offload_mode';
+
+-- Replace the unique constraint so on/off variants can coexist.
+alter table benchmark_results
+  drop constraint benchmark_results_unique;
+
+alter table benchmark_results
+  add constraint benchmark_results_unique unique nulls not distinct
+    (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode);
+
+-- Rebuild the latest-per-config materialized view to dedupe by offload_mode too.
+drop materialized view if exists latest_benchmarks cascade;
+
+create materialized view latest_benchmarks as
+select distinct on (br.config_id, br.conc, br.isl, br.osl, br.offload_mode)
+  br.*
+from benchmark_results br
+join latest_workflow_runs wr on wr.id = br.workflow_run_id
+where br.error is null
+order by br.config_id, br.conc, br.isl, br.osl, br.offload_mode, br.date desc;
+
+create unique index latest_benchmarks_pk
+  on latest_benchmarks (config_id, conc, isl, osl, offload_mode) nulls not distinct;
+create index latest_benchmarks_model_idx on latest_benchmarks (config_id);
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 006_agentic_trace_replay.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Capture raw aiperf trace files per agentic benchmark point.
+--
+-- The aiperf harness produces two per-point export files inside each
+-- `agentic_<suffix>` artifact:
+--   - profile_export.jsonl         (~2 MB raw, per-request data)
+--   - server_metrics_export.csv    (~20 KB raw, periodic Prometheus snapshots)
+--
+-- We persist them so the dashboard can later show per-request distributions,
+-- KV cache utilization over time, and conversation traces without needing to
+-- re-download the GitHub artifacts. Storage stays in Postgres (TOASTed) — at
+-- ~500 KB per point post-gzip the total fits comfortably without a separate
+-- blob service.
+--
+-- Mirrors the existing `server_logs` pattern (id-keyed sibling table + FK
+-- column on benchmark_results). Older, non-aiperf agentic runs simply have a
+-- NULL `trace_replay_id`.
+
+create table agentic_trace_replay (
+  id                                bigserial   primary key,
+  -- gzip(profile_export.jsonl); null when only the server metrics file existed
+  profile_export_jsonl_gz           bytea,
+  profile_export_uncompressed_size  bigint,
+  -- raw csv bytes; null when only the profile file existed
+  server_metrics_csv                bytea,
+  server_metrics_csv_size           bigint,
+  created_at                        timestamptz not null default now()
+);
+
+alter table benchmark_results
+  add column trace_replay_id bigint references agentic_trace_replay(id);
+
+create index benchmark_results_trace_replay_idx
+  on benchmark_results (trace_replay_id)
+  where trace_replay_id is not null;
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 007_agentic_trace_server_metrics_json.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Add the full server-metrics time-series JSON to agentic_trace_replay.
+--
+-- The existing `server_metrics_csv` column holds aiperf's summary export —
+-- one row per metric with avg/min/max/std/p1..p99 across the entire run.
+-- That's enough for the cumulative cache-hit number but not for any
+-- "metric over time" view (KV cache utilization curve, queue depth, prefix
+-- hit rate per interval, cumulative prefill token source).
+--
+-- The harness also writes `server_metrics_export.json` which contains the
+-- raw per-scrape (~1Hz) values for every Prometheus metric over the whole
+-- benchmark window. Raw size is ~250 MB per point but it compresses ~42x
+-- to ~6 MB gzipped (text with repeated metric names + numeric values).
+-- That's the file we store here for any future time-series chart.
+
+alter table agentic_trace_replay
+  add column server_metrics_json_gz bytea,
+  add column server_metrics_json_uncompressed_size bigint;
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 008_agentic_aggregate_stats.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Pre-computed aggregate stats for each agentic_trace_replay row.
+--
+-- Previously the agentic detail page parsed the (huge) profile_export.jsonl
+-- and server_metrics_json blobs on every request to compute distribution
+-- stats for ISL/OSL/KV-util/prefix-hit-rate, plus the per-point derived
+-- metrics (session-time, p90 prefill TPS). That took ~20s per row and the
+-- worst rows (high-conc TP+EP server_metrics blobs that decompress past
+-- Node's 512 MB string cap) couldn't be parsed without a stream fallback.
+--
+-- This column holds the computed stats so the API serves the page from a
+-- single SQL row read. Shape mirrors the existing benchmark_results.metrics
+-- JSONB convention; an inner `version` field lets the backfill script
+-- detect rows whose stats were computed by an older algorithm and
+-- recompute them. Null when stats haven't been computed yet (existing
+-- rows pre-backfill; the API has a slow-path fallback for that case).
+
+alter table agentic_trace_replay
+  add column aggregate_stats jsonb;
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 009_agentic_chart_series.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Pre-computed time-series for the agentic detail page chart.
+--
+-- Sibling to `aggregate_stats` (former migration 008, above): that column stores
+-- per-row percentile/derived *summaries*, this one stores the full
+-- chart-ready time-series arrays (kvCacheUsage, prefixCacheHitRate,
+-- queueDepth, prefillTps, decodeTps, promptTokensBySource).
+--
+-- Without this, the detail page parsed the entire `server_metrics_json_gz`
+-- blob on every request and blew up with ERR_STRING_TOO_LONG on high-conc
+-- TP+EP rows (the blob decompresses past Node's 512 MB max-string-length).
+-- With pre-computed series the page is a single SQL row read.
+--
+-- Shape includes an inner `version` field so the backfill script can
+-- recompute rows whose stored series were produced by an older algorithm.
+-- Null when the series haven't been computed yet; the API has a slow-path
+-- fallback (with stream-parse for oversized blobs) for that case.
+
+alter table agentic_trace_replay
+  add column chart_series jsonb;
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 010_agentic_request_timeline.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Pre-computed per-request timeline for the agentic detail page.
+--
+-- Sibling to `aggregate_stats` (was 008) and `chart_series` (was 009). This one
+-- holds a thin per-request array extracted from `profile_export_jsonl_gz`
+-- so the detail page can render a Gantt-style swimlane of every request
+-- (one bar per conversation turn) without re-parsing the JSONL on every
+-- page load.
+--
+-- Shape includes an inner `version` field so the backfill script can
+-- recompute rows whose stored timeline was produced by an older
+-- algorithm. Null when the timeline hasn't been computed yet; the API
+-- falls back to parsing the blob in that case.
+
+alter table agentic_trace_replay
+  add column request_timeline jsonb;
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 011_datasets.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Agentic benchmarking source datasets (the HuggingFace cc-traces-weka corpora
+-- the agentic benchmarks replay) + their per-conversation trace structure.
+--
+-- The app already stores benchmark *replay* artifacts (agentic_trace_replay) but
+-- not the source traces. These two tables back the new /datasets area: a
+-- registry of ingested dataset versions with precomputed summary + chart data,
+-- and one row per conversation holding a flamegraph-ready `structure` (turns +
+-- subagent groups with input split into cached-prefix vs uncached-suffix). The
+-- raw hash_ids are NOT stored — they're only needed at ingest to derive the
+-- cached/uncached split, so the runtime read is a single small JSONB.
+--
+-- This section is additive only. To revert just this section:
+--   drop table if exists dataset_conversations;
+--   drop table if exists datasets;
+--   (and see the run_datasets revert below; this is all one migration now:
+--    delete from schema_migrations where filename = '007_agentic.sql';)
+
+create table datasets (
+  -- HuggingFace dataset id, e.g. 'semianalysisai/cc-traces-weka-062126'.
+  id          text primary key,
+  -- URL key, e.g. 'cc-traces-weka-062126'.
+  slug        text not null unique,
+  label       text not null,
+  -- 'full' | '256k' | 'no-subagents' (the published variants).
+  variant     text not null default 'full',
+  description text,
+  hf_url      text,
+  license     text,
+  conversation_count integer not null default 0,
+  -- Token totals, main_turns, subagent_groups, model mix, date range, etc.
+  summary     jsonb not null default '{}'::jsonb,
+  -- Precomputed distributions for the dataset-detail cards (input/output length,
+  -- turns per conversation, subagent fan-out, …). Versioned via an inner field.
+  chart_data  jsonb not null default '{}'::jsonb,
+  dataset_version integer not null default 1,
+  ingested_at timestamptz not null default now()
+);
+
+create table dataset_conversations (
+  id          bigserial primary key,
+  dataset_id  text not null references datasets(id) on delete cascade,
+  -- The conversation id from the dataset record (trace id).
+  conv_id     text not null,
+  models      text[] not null default '{}',
+  num_turns           integer not null default 0,
+  num_subagent_groups integer not null default 0,
+  total_in    bigint not null default 0,
+  total_out   bigint not null default 0,
+  total_cached bigint not null default 0,
+  -- Flamegraph-ready ordered node tree (turns + subagent groups, each with
+  -- in/out/cached/uncached token counts). See packages/db/src/etl/weka-structure.ts.
+  structure   jsonb not null,
+  unique (dataset_id, conv_id)
+);
+
+create index dataset_conversations_dataset_idx on dataset_conversations (dataset_id);
+
+-- ───────────────────────────────────────────────────────────────────────
+-- (was 012_run_datasets.sql)
+-- ───────────────────────────────────────────────────────────────────────
+-- Maps a benchmark workflow_run to the source dataset it replayed, so the
+-- agentic detail page can deep-link each request in the timeline to the exact
+-- conversation in the /datasets viewer (the request's conversation_id, with any
+-- ::sa:/::fa: suffix stripped, is the dataset conv_id).
+--
+-- One row per workflow_run (every benchmark in a run replays the same dataset).
+-- dataset_slug is a plain slug (matches datasets.slug / the /datasets/<slug>
+-- URL) rather than an FK, so the mapping can be recorded before/independent of
+-- the dataset being ingested; the UI degrades gracefully if the slug is absent.
+--
+-- This section is additive only (the squashed file as a whole is not). To revert it all:
+--   drop table if exists run_datasets;
+--   drop table if exists dataset_conversations;
+--   drop table if exists datasets;
+--   drop table if exists agentic_trace_replay cascade;
+--   (plus the benchmark_results/availability column + constraint changes above)
+--   delete from schema_migrations where filename = '007_agentic.sql';
+
+create table run_datasets (
+  workflow_run_id bigint primary key references workflow_runs(id) on delete cascade,
+  dataset_slug    text not null,
+  created_at      timestamptz not null default now()
+);
diff --git a/packages/db/migrations/007_agentic_trace_server_metrics_json.sql b/packages/db/migrations/007_agentic_trace_server_metrics_json.sql
deleted file mode 100644
index ba7bd095..00000000
--- a/packages/db/migrations/007_agentic_trace_server_metrics_json.sql
+++ /dev/null
@@ -1,17 +0,0 @@
--- Add the full server-metrics time-series JSON to agentic_trace_replay.
---
--- The existing `server_metrics_csv` column holds aiperf's summary export —
--- one row per metric with avg/min/max/std/p1..p99 across the entire run.
--- That's enough for the cumulative cache-hit number but not for any
--- "metric over time" view (KV cache utilization curve, queue depth, prefix
--- hit rate per interval, cumulative prefill token source).
---
--- The harness also writes `server_metrics_export.json` which contains the
--- raw per-scrape (~1Hz) values for every Prometheus metric over the whole
--- benchmark window. Raw size is ~250 MB per point but it compresses ~42x
--- to ~6 MB gzipped (text with repeated metric names + numeric values).
--- That's the file we store here for any future time-series chart.
-
-alter table agentic_trace_replay
-  add column server_metrics_json_gz bytea,
-  add column server_metrics_json_uncompressed_size bigint;
diff --git a/packages/db/migrations/008_agentic_aggregate_stats.sql b/packages/db/migrations/008_agentic_aggregate_stats.sql
deleted file mode 100644
index d55533b9..00000000
--- a/packages/db/migrations/008_agentic_aggregate_stats.sql
+++ /dev/null
@@ -1,18 +0,0 @@
--- Pre-computed aggregate stats for each agentic_trace_replay row.
---
--- Previously the agentic detail page parsed the (huge) profile_export.jsonl
--- and server_metrics_json blobs on every request to compute distribution
--- stats for ISL/OSL/KV-util/prefix-hit-rate, plus the per-point derived
--- metrics (session-time, p90 prefill TPS). That took ~20s per row and the
--- worst rows (high-conc TP+EP server_metrics blobs that decompress past
--- Node's 512 MB string cap) couldn't be parsed without a stream fallback.
---
--- This column holds the computed stats so the API serves the page from a
--- single SQL row read. Shape mirrors the existing benchmark_results.metrics
--- JSONB convention; an inner `version` field lets the backfill script
--- detect rows whose stats were computed by an older algorithm and
--- recompute them. Null when stats haven't been computed yet (existing
--- rows pre-backfill; the API has a slow-path fallback for that case).
-
-alter table agentic_trace_replay
-  add column aggregate_stats jsonb;
diff --git a/packages/db/migrations/009_agentic_chart_series.sql b/packages/db/migrations/009_agentic_chart_series.sql
deleted file mode 100644
index b42718b9..00000000
--- a/packages/db/migrations/009_agentic_chart_series.sql
+++ /dev/null
@@ -1,19 +0,0 @@
--- Pre-computed time-series for the agentic detail page chart.
---
--- Sibling to `aggregate_stats` (migration 008): that column stores
--- per-row percentile/derived *summaries*, this one stores the full
--- chart-ready time-series arrays (kvCacheUsage, prefixCacheHitRate,
--- queueDepth, prefillTps, decodeTps, promptTokensBySource).
---
--- Without this, the detail page parsed the entire `server_metrics_json_gz`
--- blob on every request and blew up with ERR_STRING_TOO_LONG on high-conc
--- TP+EP rows (the blob decompresses past Node's 512 MB max-string-length).
--- With pre-computed series the page is a single SQL row read.
---
--- Shape includes an inner `version` field so the backfill script can
--- recompute rows whose stored series were produced by an older algorithm.
--- Null when the series haven't been computed yet; the API has a slow-path
--- fallback (with stream-parse for oversized blobs) for that case.
-
-alter table agentic_trace_replay
-  add column chart_series jsonb;
diff --git a/packages/db/migrations/010_agentic_request_timeline.sql b/packages/db/migrations/010_agentic_request_timeline.sql
deleted file mode 100644
index 756b775e..00000000
--- a/packages/db/migrations/010_agentic_request_timeline.sql
+++ /dev/null
@@ -1,15 +0,0 @@
--- Pre-computed per-request timeline for the agentic detail page.
---
--- Sibling to `aggregate_stats` (008) and `chart_series` (009). This one
--- holds a thin per-request array extracted from `profile_export_jsonl_gz`
--- so the detail page can render a Gantt-style swimlane of every request
--- (one bar per conversation turn) without re-parsing the JSONL on every
--- page load.
---
--- Shape includes an inner `version` field so the backfill script can
--- recompute rows whose stored timeline was produced by an older
--- algorithm. Null when the timeline hasn't been computed yet; the API
--- falls back to parsing the blob in that case.
-
-alter table agentic_trace_replay
-  add column request_timeline jsonb;
diff --git a/packages/db/migrations/011_datasets.sql b/packages/db/migrations/011_datasets.sql
deleted file mode 100644
index 7a70d83f..00000000
--- a/packages/db/migrations/011_datasets.sql
+++ /dev/null
@@ -1,55 +0,0 @@
--- Agentic benchmarking source datasets (the HuggingFace cc-traces-weka corpora
--- the agentic benchmarks replay) + their per-conversation trace structure.
---
--- The app already stores benchmark *replay* artifacts (agentic_trace_replay) but
--- not the source traces. These two tables back the new /datasets area: a
--- registry of ingested dataset versions with precomputed summary + chart data,
--- and one row per conversation holding a flamegraph-ready `structure` (turns +
--- subagent groups with input split into cached-prefix vs uncached-suffix). The
--- raw hash_ids are NOT stored — they're only needed at ingest to derive the
--- cached/uncached split, so the runtime read is a single small JSONB.
---
--- Additive only. To revert this migration:
---   drop table if exists dataset_conversations;
---   drop table if exists datasets;
---   delete from schema_migrations where filename = '011_datasets.sql';
-
-create table datasets (
-  -- HuggingFace dataset id, e.g. 'semianalysisai/cc-traces-weka-062126'.
-  id          text primary key,
-  -- URL key, e.g. 'cc-traces-weka-062126'.
-  slug        text not null unique,
-  label       text not null,
-  -- 'full' | '256k' | 'no-subagents' (the published variants).
-  variant     text not null default 'full',
-  description text,
-  hf_url      text,
-  license     text,
-  conversation_count integer not null default 0,
-  -- Token totals, main_turns, subagent_groups, model mix, date range, etc.
-  summary     jsonb not null default '{}'::jsonb,
-  -- Precomputed distributions for the dataset-detail cards (input/output length,
-  -- turns per conversation, subagent fan-out, …). Versioned via an inner field.
-  chart_data  jsonb not null default '{}'::jsonb,
-  dataset_version integer not null default 1,
-  ingested_at timestamptz not null default now()
-);
-
-create table dataset_conversations (
-  id          bigserial primary key,
-  dataset_id  text not null references datasets(id) on delete cascade,
-  -- The conversation id from the dataset record (trace id).
-  conv_id     text not null,
-  models      text[] not null default '{}',
-  num_turns           integer not null default 0,
-  num_subagent_groups integer not null default 0,
-  total_in    bigint not null default 0,
-  total_out   bigint not null default 0,
-  total_cached bigint not null default 0,
-  -- Flamegraph-ready ordered node tree (turns + subagent groups, each with
-  -- in/out/cached/uncached token counts). See packages/db/src/etl/weka-structure.ts.
-  structure   jsonb not null,
-  unique (dataset_id, conv_id)
-);
-
-create index dataset_conversations_dataset_idx on dataset_conversations (dataset_id);
diff --git a/packages/db/migrations/012_run_datasets.sql b/packages/db/migrations/012_run_datasets.sql
deleted file mode 100644
index 58dd9f88..00000000
--- a/packages/db/migrations/012_run_datasets.sql
+++ /dev/null
@@ -1,19 +0,0 @@
--- Maps a benchmark workflow_run to the source dataset it replayed, so the
--- agentic detail page can deep-link each request in the timeline to the exact
--- conversation in the /datasets viewer (the request's conversation_id, with any
--- ::sa:/::fa: suffix stripped, is the dataset conv_id).
---
--- One row per workflow_run (every benchmark in a run replays the same dataset).
--- dataset_slug is a plain slug (matches datasets.slug / the /datasets/<slug>
--- URL) rather than an FK, so the mapping can be recorded before/independent of
--- the dataset being ingested; the UI degrades gracefully if the slug is absent.
---
--- Additive only. To revert:
---   drop table if exists run_datasets;
---   delete from schema_migrations where filename = '012_run_datasets.sql';
-
-create table run_datasets (
-  workflow_run_id bigint primary key references workflow_runs(id) on delete cascade,
-  dataset_slug    text not null,
-  created_at      timestamptz not null default now()
-);

From 772dfef5cde7a79d02963a9f151cb43b6592920e Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 23 Jun 2026 10:57:37 -0500
Subject: [PATCH 92/96] add agentic time-series and dataset timing

---
 .../e2e/agentic-point-time-series.cy.ts       | 98 +++++++++++++++++++
 .../e2e/datasets-flamegraph-time.cy.ts        | 85 ++++++++++++++++
 .../components/datasets/conversation-view.tsx |  3 +-
 .../datasets/trace-flamegraph.test.ts         | 16 +++
 .../components/datasets/trace-flamegraph.tsx  | 35 +++++++
 .../agentic-point/agentic-point-detail.tsx    | 97 +++++++++++++++++-
 .../agentic-point/expandable-chart.tsx        | 30 ++++--
 .../agentic-point/time-series-chart.test.ts   | 73 +++++++++++++-
 .../agentic-point/time-series-chart.tsx       | 60 ++++++++++++
 .../app/src/hooks/api/use-request-timeline.ts |  2 +
 .../src/etl/compute-request-timeline.test.ts  | 25 ++++-
 .../db/src/etl/compute-request-timeline.ts    | 12 ++-
 packages/db/src/etl/weka-structure.test.ts    | 28 +++++-
 packages/db/src/etl/weka-structure.ts         | 40 ++++++++
 14 files changed, 586 insertions(+), 18 deletions(-)
 create mode 100644 packages/app/cypress/e2e/agentic-point-time-series.cy.ts
 create mode 100644 packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts
 create mode 100644 packages/app/src/components/datasets/trace-flamegraph.test.ts

diff --git a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
new file mode 100644
index 00000000..b0cfb60d
--- /dev/null
+++ b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
@@ -0,0 +1,98 @@
+const timelineRequest = (
+  index: number,
+  ttftMs: number,
+  tpotMs: number,
+  overrides: Record<string, unknown> = {},
+) => ({
+  cid: 'conversation-1',
+  ti: index,
+  wid: 'worker-1',
+  ad: 0,
+  phase: 'profiling',
+  credit: index * 1_000_000_000,
+  start: index * 1_000_000_000,
+  ack: null,
+  end: (index + 1) * 1_000_000_000,
+  ttftMs,
+  tpotMs,
+  isl: 1024,
+  osl: 128,
+  cancelled: false,
+  ...overrides,
+});
+
+describe('Agentic point request metric time series', () => {
+  before(() => {
+    cy.intercept('GET', '/api/v1/trace-histograms*', { body: {} });
+    cy.intercept('GET', '/api/v1/trace-server-metrics*', { body: null });
+    cy.intercept('GET', '/api/v1/benchmark-siblings*', { statusCode: 404 });
+    cy.intercept('GET', '/api/v1/request-timeline*', {
+      body: {
+        version: 3,
+        startNs: 0,
+        endNs: 7_000_000_000,
+        durationS: 7,
+        requests: [
+          timelineRequest(0, 100, 10),
+          timelineRequest(1, 200, 20),
+          timelineRequest(2, 400, 25),
+          timelineRequest(3, 800, 40),
+          timelineRequest(4, 1600, 80),
+          timelineRequest(5, 3200, 160, { phase: 'warmup' }),
+          timelineRequest(6, 6400, 320, { cancelled: true }),
+        ],
+      },
+    });
+    cy.visit('/inference/agentic/206885');
+  });
+
+  it('renders rolling P75 interactivity and TTFT using profiling requests only', () => {
+    cy.get('[data-testid="interactivity-over-time-chart"]').within(() => {
+      cy.contains('h2', 'Interactivity over time').should('be.visible');
+      cy.get('[data-testid="interactivity-percentile-toggle"]')
+        .find('[role="tab"][aria-selected="true"]')
+        .should('have.text', 'P75');
+      cy.get('svg circle').should('have.length', 5);
+      cy.get('svg').should('contain.text', 'P75 (rolling 50 req)');
+      cy.get('svg').should('contain.text', '1 / cumulative mean TPOT');
+      cy.get('svg path[stroke="#ef4444"]').should('have.length', 1);
+    });
+
+    cy.get('[data-testid="ttft-over-time-chart"]').within(() => {
+      cy.contains('h2', 'TTFT over time').should('be.visible');
+      cy.get('svg circle').should('have.length', 5);
+      cy.get('svg').should('contain.text', 'TTFT (s)');
+      cy.get('svg').should('contain.text', 'Cumulative mean TTFT');
+      cy.get('svg path[stroke="#ef4444"]').should('have.length', 1);
+    });
+  });
+
+  it('switches each chart independently from P75 to P90', () => {
+    cy.get('[data-testid="interactivity-over-time-chart"]').within(() => {
+      cy.contains('svg', 'P75 (rolling 50 req)')
+        .find('path')
+        .first()
+        .invoke('attr', 'd')
+        .as('p75Path');
+      cy.contains('button', 'P90').click();
+      cy.get('[data-testid="interactivity-percentile-toggle"]')
+        .find('[role="tab"][aria-selected="true"]')
+        .should('have.text', 'P90');
+      cy.contains('svg', 'P90 (rolling 50 req)')
+        .find('path')
+        .first()
+        .invoke('attr', 'd')
+        .then(function (p90Path) {
+          expect(p90Path).not.to.equal(this.p75Path);
+        });
+    });
+
+    cy.get('[data-testid="ttft-over-time-chart"]').within(() => {
+      cy.get('[data-testid="ttft-percentile-toggle"]')
+        .find('[role="tab"][aria-selected="true"]')
+        .should('have.text', 'P75');
+      cy.contains('button', 'P90').click();
+      cy.get('svg').should('contain.text', 'P90 (rolling 50 req)');
+    });
+  });
+});
diff --git a/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts
new file mode 100644
index 00000000..672675a3
--- /dev/null
+++ b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts
@@ -0,0 +1,85 @@
+describe('Dataset conversation flamegraph timing', () => {
+  before(() => {
+    cy.intercept('GET', '/api/v1/datasets/test-dataset/conversations/conversation-1', {
+      body: {
+        conv_id: 'conversation-1',
+        models: ['model-a'],
+        num_turns: 2,
+        num_subagent_groups: 1,
+        total_in: 1000,
+        total_out: 100,
+        total_cached: 500,
+        structure: {
+          blockSize: 64,
+          totals: {
+            in: 1000,
+            out: 100,
+            cached: 500,
+            uncached: 500,
+            numTurns: 2,
+            numSubagentGroups: 1,
+          },
+          nodes: [
+            {
+              kind: 'turn',
+              turnIndex: 0,
+              startS: 0,
+              model: 'model-a',
+              in: 100,
+              out: 10,
+              cached: 0,
+              uncached: 100,
+            },
+            {
+              kind: 'subagent',
+              label: 'Explore',
+              agentId: 'agent-1',
+              startS: 3661.2,
+              endS: 3782.6,
+              durationMs: 121_400,
+              in: 800,
+              out: 80,
+              cached: 500,
+              uncached: 300,
+              children: [
+                {
+                  kind: 'turn',
+                  turnIndex: 1,
+                  startS: 3661.2,
+                  model: 'model-a',
+                  in: 800,
+                  out: 80,
+                  cached: 500,
+                  uncached: 300,
+                },
+              ],
+            },
+            {
+              kind: 'turn',
+              turnIndex: 2,
+              startS: 65.4,
+              model: 'model-a',
+              in: 100,
+              out: 10,
+              cached: 0,
+              uncached: 100,
+            },
+          ],
+        },
+      },
+    });
+    cy.visit('/datasets/test-dataset/conversations/conversation-1');
+  });
+
+  it('shows turn offsets and a collapsed subagent time range', () => {
+    cy.get('[data-testid="flamegraph-time-t-0"]').should('have.text', '+00:00');
+    cy.get('[data-testid="flamegraph-time-t-2"]').should('have.text', '+01:05');
+    cy.get('[data-testid="flamegraph-time-g-1"]').should('have.text', '+1:01:01–1:03:03');
+    cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('not.exist');
+  });
+
+  it('shows subturn offsets when the subagent group is expanded', () => {
+    cy.contains('button', 'Explore').click();
+    cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('have.text', '+1:01:01');
+  });
+});
diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx
index d39b83d9..57aaa0c3 100644
--- a/packages/app/src/components/datasets/conversation-view.tsx
+++ b/packages/app/src/components/datasets/conversation-view.tsx
@@ -87,7 +87,8 @@ export function ConversationView({ slug, convId }: { slug: string; convId: strin
         <p className="mb-4 text-xs text-muted-foreground">
           One bar per turn, scaled to the largest turn. Subagent groups are collapsed by default —
           click a group to expand it. Each bar splits input into cached prefix and uncached suffix,
-          plus generated output.
+          plus generated output. Timestamps are elapsed from conversation start; subagent headers
+          show their full active range.
         </p>
         <TraceFlamegraph
           structure={data.structure}
diff --git a/packages/app/src/components/datasets/trace-flamegraph.test.ts b/packages/app/src/components/datasets/trace-flamegraph.test.ts
new file mode 100644
index 00000000..00293c00
--- /dev/null
+++ b/packages/app/src/components/datasets/trace-flamegraph.test.ts
@@ -0,0 +1,16 @@
+import { describe, expect, it } from 'vitest';
+
+import { formatElapsedTime } from './trace-flamegraph';
+
+describe('formatElapsedTime', () => {
+  it('formats elapsed seconds below and above one hour', () => {
+    expect(formatElapsedTime(0)).toBe('00:00');
+    expect(formatElapsedTime(65.4)).toBe('01:05');
+    expect(formatElapsedTime(3661.6)).toBe('1:01:02');
+    expect(formatElapsedTime(86_541.149)).toBe('24:02:21');
+  });
+
+  it('clamps negative offsets to the conversation origin', () => {
+    expect(formatElapsedTime(-5)).toBe('00:00');
+  });
+});
diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx
index 12ecb4a4..d0bbb01f 100644
--- a/packages/app/src/components/datasets/trace-flamegraph.tsx
+++ b/packages/app/src/components/datasets/trace-flamegraph.tsx
@@ -24,6 +24,7 @@ interface VisibleRow {
   key: string;
   label: string;
   sublabel?: string;
+  timeLabel?: string;
   cached: number;
   uncached: number;
   output: number;
@@ -34,6 +35,24 @@ interface VisibleRow {
   groupIndex?: number;
 }
 
+/** Format elapsed seconds as `mm:ss` or `h:mm:ss`; negative or non-finite input renders as 00:00. */
+export function formatElapsedTime(seconds: number): string {
+  const total = Number.isFinite(seconds) ? Math.max(0, Math.round(seconds)) : 0;
+  const hours = Math.floor(total / 3600);
+  const minutes = Math.floor((total % 3600) / 60);
+  const secs = total % 60;
+  const mm = String(minutes).padStart(2, '0');
+  const ss = String(secs).padStart(2, '0');
+  return hours > 0 ? `${hours}:${mm}:${ss}` : `${mm}:${ss}`;
+}
+
+function timeLabel(startS?: number, endS?: number): string | undefined {
+  const isTime = (s?: number): s is number => s !== undefined && Number.isFinite(s);
+  if (!isTime(startS)) return undefined;
+  const from = `+${formatElapsedTime(startS)}`;
+  return isTime(endS) && endS > startS ? `${from}–${formatElapsedTime(endS)}` : from;
+}
+
 interface TooltipState {
   x: number;
   y: number;
@@ -152,6 +171,7 @@ export function TraceFlamegraph({
           key: `t-${i}`,
           label: `Turn ${turnNo}`,
           sublabel: node.model ?? undefined,
+          timeLabel: timeLabel(node.startS),
           cached: node.cached,
           uncached: node.uncached,
           output: node.out,
@@ -168,6 +188,7 @@ export function TraceFlamegraph({
           sublabel: `${node.children.length} turn${node.children.length === 1 ? '' : 's'}${
             node.durationMs ? ` · ${(node.durationMs / 1000).toFixed(0)}s` : ''
           }`,
+          timeLabel: timeLabel(node.startS, node.endS),
           cached: node.cached,
           uncached: node.uncached,
           output: node.out,
@@ -183,6 +204,7 @@ export function TraceFlamegraph({
               key: `g-${i}-c-${ci}`,
               label: `↳ subturn ${ci + 1}`,
               sublabel: child.model ?? undefined,
+              timeLabel: timeLabel(child.startS),
               cached: child.cached,
               uncached: child.uncached,
               output: child.out,
@@ -291,6 +313,15 @@ export function TraceFlamegraph({
                   )}
                 </div>
 
+                {/* Offset from conversation start. Group rows span the full
+                    subagent lifetime; leaf rows show their start instant. */}
+                <div
+                  className="w-36 shrink-0 text-[11px] tabular-nums text-muted-foreground"
+                  data-testid={`flamegraph-time-${row.key}`}
+                >
+                  {row.timeLabel ?? '—'}
+                </div>
+
                 {/* stacked bar — group headers render as a slim muted summary
                     strip so they read as aggregates, not individual turns. */}
                 <div
@@ -354,6 +385,10 @@ export function TraceFlamegraph({
                   ? `${((tooltip.row.cached / (tooltip.row.cached + tooltip.row.uncached)) * 100).toFixed(0)}%`
                   : '—'}
               </span>
+              <span>From start</span>
+              <span className="text-right tabular-nums text-foreground">
+                {tooltip.row.timeLabel ?? '—'}
+              </span>
             </div>
           </div>,
           document.body,
diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index 4a076955..e24b7e6b 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -6,7 +6,7 @@ import { useState } from 'react';
 import { ArrowLeft } from 'lucide-react';
 
 import { useAgenticAggregates, type AgenticAggregateMap } from '@/hooks/api/use-agentic-aggregates';
-import { useRequestTimeline } from '@/hooks/api/use-request-timeline';
+import { useRequestTimeline, type RequestTimeline } from '@/hooks/api/use-request-timeline';
 import { useTraceHistograms } from '@/hooks/api/use-trace-histograms';
 import {
   useTraceServerMetrics,
@@ -16,6 +16,7 @@ import {
 } from '@/hooks/api/use-trace-server-metrics';
 import { useBenchmarkSiblings } from '@/hooks/api/use-benchmark-siblings';
 import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle';
+import { track } from '@/lib/analytics';
 
 import { AggregateChart, type AggregatePoint, type PercentileKey } from './aggregate-chart';
 import { Distribution } from './distribution';
@@ -30,8 +31,11 @@ import {
   cumulativeUniqueInputTokens,
   inflightUniqueTokens,
   rollingAverage,
+  rollingRequestMetric,
   sumSeries,
   timeRollingAverage,
+  type RequestMetric,
+  type RequestPercentile,
 } from './time-series-chart';
 
 interface Props {
@@ -114,6 +118,83 @@ const VIEW_OPTIONS: SegmentedToggleOption<DetailView>[] = [
   { value: 'aggregates', label: 'Aggregates across configs', testId: 'detail-view-aggregates' },
 ];
 
+const REQUEST_PERCENTILE_OPTIONS: SegmentedToggleOption<RequestPercentile>[] = [
+  { value: 'p75', label: 'P75' },
+  { value: 'p90', label: 'P90' },
+];
+
+// Per-request metric chart: rolling P75/P90 trend plus cumulative mean. Unofficial-run
+// overlays cannot open this persisted point-detail route (no benchmark_results id or
+// stored request timeline), so these charts are intentionally limited to DB-backed points.
+function RequestMetricOverTime({
+  title,
+  metric,
+  timeline,
+  isLoading,
+}: {
+  title: string;
+  metric: RequestMetric;
+  timeline: RequestTimeline | null | undefined;
+  isLoading: boolean;
+}) {
+  const [percentile, setPercentile] = useState<RequestPercentile>('p75');
+  const result = timeline ? rollingRequestMetric(timeline.requests, metric, percentile, 50) : null;
+  const metricLabel = metric === 'ttft' ? 'TTFT' : 'Interactivity';
+  const color = metric === 'ttft' ? '#f59e0b' : '#06b6d4';
+
+  const controls = (
+    <SegmentedToggle
+      value={percentile}
+      options={REQUEST_PERCENTILE_OPTIONS}
+      onValueChange={(value) => {
+        setPercentile(value);
+        track('inference_agentic_percentile_changed', { metric, percentile: value });
+      }}
+      ariaLabel={`${metricLabel} percentile`}
+      testId={`${metric}-percentile-toggle`}
+    />
+  );
+
+  return (
+    <ExpandableChart
+      title={title}
+      controls={controls}
+      testId={`${metric}-over-time-chart`}
+      render={(expanded) => {
+        const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline;
+        if (!timeline) return isLoading ? <Skeleton /> : <Empty />;
+        return (
+          <TimeSeriesChart
+            series={[
+              {
+                name: `${percentile.toUpperCase()} (rolling 50 req)`,
+                data: result?.trend ?? [],
+                rawData: result?.raw,
+                color,
+                strokeWidth: 2.5,
+              },
+              {
+                name: metric === 'ttft' ? 'Cumulative mean TTFT' : '1 / cumulative mean TPOT',
+                data: result?.cumulative ?? [],
+                color: '#ef4444',
+                strokeWidth: 3,
+              },
+            ]}
+            durationS={timeline.durationS}
+            yFmt={
+              metric === 'ttft'
+                ? (value) => `${value < 10 ? value.toFixed(1) : value.toFixed(0)}s`
+                : (value) => `${value.toFixed(0)}`
+            }
+            yAxisLabel={metric === 'ttft' ? 'TTFT (s)' : 'Interactivity (tok/s/user)'}
+            {...size}
+          />
+        );
+      }}
+    />
+  );
+}
+
 /** Bundle per-percentile values for one sibling into the shape AggregateChart wants. */
 function toAggPoint(
   sibling: { id: number; label: string },
@@ -254,6 +335,20 @@ export function AgenticPointDetail({ id }: Props) {
             }}
           />
 
+          <RequestMetricOverTime
+            title="Interactivity over time"
+            metric="interactivity"
+            timeline={timelineQuery.data}
+            isLoading={timelineQuery.isLoading}
+          />
+
+          <RequestMetricOverTime
+            title="TTFT over time"
+            metric="ttft"
+            timeline={timelineQuery.data}
+            isLoading={timelineQuery.isLoading}
+          />
+
           <ExpandableChart
             title="KV cache utilization over time"
             render={(expanded) => {
diff --git a/packages/app/src/components/inference/agentic-point/expandable-chart.tsx b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx
index 7c8e4538..cb5987ec 100644
--- a/packages/app/src/components/inference/agentic-point/expandable-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx
@@ -13,30 +13,40 @@ import { Dialog, DialogContent, DialogHeader, DialogTitle } from '@/components/u
 export function ExpandableChart({
   title,
   render,
+  controls,
+  testId,
 }: {
   title: string;
   render: (expanded: boolean) => ReactNode;
+  controls?: ReactNode;
+  testId?: string;
 }) {
   const [open, setOpen] = useState(false);
 
   return (
-    <div className="rounded-lg border border-border/40 bg-card/40 p-4">
+    <div className="rounded-lg border border-border/40 bg-card/40 p-4" data-testid={testId}>
       <div className="flex items-start justify-between mb-3 gap-2">
         <h2 className="text-sm font-semibold text-foreground">{title}</h2>
-        <button
-          type="button"
-          aria-label="Expand chart"
-          onClick={() => setOpen(true)}
-          className="text-muted-foreground hover:text-foreground transition-colors"
-        >
-          <Maximize2 className="size-4" />
-        </button>
+        <div className="flex items-center gap-2">
+          {controls}
+          <button
+            type="button"
+            aria-label="Expand chart"
+            onClick={() => setOpen(true)}
+            className="text-muted-foreground hover:text-foreground transition-colors"
+          >
+            <Maximize2 className="size-4" />
+          </button>
+        </div>
       </div>
       {render(false)}
       <Dialog open={open} onOpenChange={setOpen}>
         <DialogContent className="max-w-[min(96vw,1400px)] w-[min(96vw,1400px)]">
           <DialogHeader>
-            <DialogTitle>{title}</DialogTitle>
+            <div className="flex items-center justify-between gap-3 pr-8">
+              <DialogTitle>{title}</DialogTitle>
+              {controls}
+            </div>
           </DialogHeader>
           <div className="w-full">{render(true)}</div>
         </DialogContent>
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts
index 64deace4..926772db 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts
@@ -1,6 +1,77 @@
 import { describe, expect, it } from 'vitest';
 
-import { cumulativeUniqueInputTokens } from './time-series-chart';
+import type { RequestRecord } from '@/hooks/api/use-request-timeline';
+
+import { cumulativeUniqueInputTokens, rollingRequestMetric } from './time-series-chart';
+
+const request = (
+  endS: number,
+  ttftMs: number | null,
+  tpotMs: number | null,
+  overrides: Partial<RequestRecord> = {},
+): RequestRecord => ({
+  cid: 'conversation',
+  ti: endS,
+  wid: 'worker',
+  ad: 0,
+  phase: 'profiling',
+  credit: 0,
+  start: 0,
+  ack: null,
+  end: endS * 1e9,
+  ttftMs,
+  tpotMs,
+  isl: 100,
+  osl: 10,
+  cancelled: false,
+  ...overrides,
+});
+
+describe('rollingRequestMetric', () => {
+  it('computes a trailing P75 TTFT over the requested window', () => {
+    const result = rollingRequestMetric(
+      [request(1, 100, 10), request(2, 200, 20), request(3, 300, 30), request(4, 400, 40)],
+      'ttft',
+      'p75',
+      3,
+    );
+
+    expect(result.raw.at(-1)).toEqual({ t: 4, value: 0.4 });
+    expect(result.trend.map((point) => point.value)).toEqual([0.1, 0.175, 0.25, 0.35]);
+    expect(result.cumulative.map((point) => point.value)).toEqual([0.1, 0.15, 0.2, 0.25]);
+  });
+
+  it('inverts the rolling TPOT percentile for interactivity', () => {
+    const result = rollingRequestMetric(
+      [request(1, 100, 10), request(2, 200, 20), request(3, 300, 30)],
+      'interactivity',
+      'p90',
+      3,
+    );
+
+    expect(result.raw.map((point) => point.value)).toEqual([100, 50, 1000 / 30]);
+    expect(result.trend.at(-1)?.value).toBeCloseTo(1000 / 28, 8);
+    expect(result.cumulative.map((point) => point.value)).toEqual([100, 1000 / 15, 50]);
+  });
+
+  it('drops warmup, cancelled, missing, and non-positive samples', () => {
+    const result = rollingRequestMetric(
+      [
+        request(1, 100, 10),
+        request(2, 200, 20, { phase: 'warmup' }),
+        request(3, 300, 30, { cancelled: true }),
+        request(4, null, null),
+        request(5, 0, 0),
+      ],
+      'ttft',
+      'p90',
+    );
+
+    expect(result.raw).toEqual([{ t: 1, value: 0.1 }]);
+    expect(result.trend).toEqual([{ t: 1, value: 0.1 }]);
+    expect(result.cumulative).toEqual([{ t: 1, value: 0.1 }]);
+  });
+});
 
 describe('cumulativeUniqueInputTokens', () => {
   it('cumulates only the freshly-computed buckets, ignoring cache tiers', () => {
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index 6b00b1e6..749a17e4 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -3,6 +3,7 @@
 import { useMemo } from 'react';
 
 import type { TimeSeriesPoint } from '@/hooks/api/use-trace-server-metrics';
+import type { RequestRecord } from '@/hooks/api/use-request-timeline';
 
 import { ChartHover, type HoverItem } from './chart-hover';
 
@@ -32,6 +33,65 @@ interface TimeSeriesChartProps {
   height?: number;
 }
 
+export type RequestMetric = 'interactivity' | 'ttft';
+export type RequestPercentile = 'p75' | 'p90';
+
+/** Percentile by linear interpolation between neighbours (numpy's default method). */
+function quantile(sortedAsc: number[], q: number): number {
+  if (sortedAsc.length === 1) return sortedAsc[0]!;
+  const rank = (sortedAsc.length - 1) * q;
+  const below = Math.floor(rank);
+  const above = Math.ceil(rank);
+  const base = sortedAsc[below]!;
+  return below === above ? base : base + (sortedAsc[above]! - base) * (rank - below);
+}
+
+/**
+ * Build raw per-request samples, a trailing `windowSize`-request percentile trend,
+ * and a running mean, all keyed by request end time (s). Uses only profiling,
+ * non-cancelled requests with finite latency > 0. Stats are computed in latency
+ * space (TTFT in s); interactivity inverts the TPOT result, matching the aggregate
+ * chart convention: P90 interactivity = 1 / P90 TPOT (a conservative tail view).
+ */
+export function rollingRequestMetric(
+  requests: readonly RequestRecord[],
+  metric: RequestMetric,
+  percentile: RequestPercentile,
+  windowSize = 50,
+): { raw: TimeSeriesPoint[]; trend: TimeSeriesPoint[]; cumulative: TimeSeriesPoint[] } {
+  const q = percentile === 'p75' ? 0.75 : 0.9;
+  const samples = requests
+    .filter((request) => request.phase === 'profiling' && !request.cancelled)
+    .flatMap((request) => {
+      const latencyMs = metric === 'ttft' ? request.ttftMs : request.tpotMs;
+      if (latencyMs === null || !Number.isFinite(latencyMs) || latencyMs <= 0) return [];
+      return [{ t: request.end / 1e9, latencyMs }];
+    })
+    .toSorted((a, b) => a.t - b.t); // chronological by request end
+
+  const raw = samples.map(({ t, latencyMs }) => ({
+    t,
+    value: metric === 'ttft' ? latencyMs / 1000 : 1000 / latencyMs,
+  }));
+  const trend = samples.map(({ t }, i) => {
+    const start = Math.max(0, i - Math.max(1, windowSize) + 1); // window holds >= 1 sample
+    const sorted = samples
+      .slice(start, i + 1)
+      .map((sample) => sample.latencyMs)
+      .toSorted((a, b) => a - b);
+    const latencyMs = quantile(sorted, q);
+    return { t, value: metric === 'ttft' ? latencyMs / 1000 : 1000 / latencyMs };
+  });
+  let latencySumMs = 0;
+  const cumulative = samples.map(({ t, latencyMs }, i) => {
+    latencySumMs += latencyMs;
+    const meanLatencyMs = latencySumMs / (i + 1);
+    return { t, value: metric === 'ttft' ? meanLatencyMs / 1000 : 1000 / meanLatencyMs };
+  });
+
+  return { raw, trend, cumulative };
+}
+
 /**
  * Time-weighted rolling average over a `windowS`-second trailing window.
  * Treats the input as a step function (value held constant between
diff --git a/packages/app/src/hooks/api/use-request-timeline.ts b/packages/app/src/hooks/api/use-request-timeline.ts
index d3ceaab8..094d2230 100644
--- a/packages/app/src/hooks/api/use-request-timeline.ts
+++ b/packages/app/src/hooks/api/use-request-timeline.ts
@@ -20,6 +20,8 @@ export interface RequestRecord {
   /** ns offset from timeline.startNs. Last byte received. */
   end: number;
   ttftMs: number | null;
+  /** Time per output token in ms. */
+  tpotMs: number | null;
   isl: number | null;
   osl: number | null;
   cancelled: boolean;
diff --git a/packages/db/src/etl/compute-request-timeline.test.ts b/packages/db/src/etl/compute-request-timeline.test.ts
index 64512aca..61e69fe8 100644
--- a/packages/db/src/etl/compute-request-timeline.test.ts
+++ b/packages/db/src/etl/compute-request-timeline.test.ts
@@ -15,6 +15,8 @@ interface SyntheticRequest {
   end: number;
   ack?: number | null;
   ttftMs?: number | null;
+  tpotMs?: number | null;
+  tpotKey?: 'inter_token_latency' | 'time_per_output_token';
   isl?: number | null;
   osl?: number | null;
   cancelled?: boolean;
@@ -37,6 +39,8 @@ function makeBlob(requests: SyntheticRequest[]) {
       },
       metrics: {
         time_to_first_token: r.ttftMs === null ? null : { value: r.ttftMs ?? 50, unit: 'ms' },
+        [r.tpotKey ?? 'inter_token_latency']:
+          r.tpotMs === null ? null : { value: r.tpotMs ?? 10, unit: 'ms' },
         input_sequence_length: { value: r.isl ?? 100, unit: 'tokens' },
         output_sequence_length: { value: r.osl ?? 10, unit: 'tokens' },
       },
@@ -115,7 +119,7 @@ describe('computeRequestTimeline', () => {
     expect(r.phase).toBe('profiling');
   });
 
-  it('preserves the cancelled flag and TTFT/ISL/OSL metrics', () => {
+  it('preserves the cancelled flag and TTFT/TPOT/ISL/OSL metrics', () => {
     const tl = computeRequestTimeline(
       makeBlob([
         {
@@ -125,6 +129,7 @@ describe('computeRequestTimeline', () => {
           start: 10,
           end: 100,
           ttftMs: 25.5,
+          tpotMs: 12.5,
           isl: 1024,
           osl: 256,
           cancelled: true,
@@ -134,10 +139,28 @@ describe('computeRequestTimeline', () => {
     const r = tl?.requests[0]!;
     expect(r.cancelled).toBe(true);
     expect(r.ttftMs).toBeCloseTo(25.5, 6);
+    expect(r.tpotMs).toBeCloseTo(12.5, 6);
     expect(r.isl).toBe(1024);
     expect(r.osl).toBe(256);
   });
 
+  it('accepts time_per_output_token as a TPOT alias', () => {
+    const tl = computeRequestTimeline(
+      makeBlob([
+        {
+          cid: 'a',
+          ti: 0,
+          credit: 0,
+          start: 10,
+          end: 100,
+          tpotMs: 8.25,
+          tpotKey: 'time_per_output_token',
+        },
+      ]),
+    );
+    expect(tl?.requests[0]?.tpotMs).toBeCloseTo(8.25, 6);
+  });
+
   it('skips records missing both credit_issued_ns and request_start_ns', () => {
     // Build a record with only request_end_ns — the helper rejects it.
     const broken = gzipSync(
diff --git a/packages/db/src/etl/compute-request-timeline.ts b/packages/db/src/etl/compute-request-timeline.ts
index a1134f7a..707e8c54 100644
--- a/packages/db/src/etl/compute-request-timeline.ts
+++ b/packages/db/src/etl/compute-request-timeline.ts
@@ -14,7 +14,7 @@
 import { gunzipSync } from 'node:zlib';
 
 /** Bump when the extraction algorithm changes — backfill recomputes anything older. */
-export const REQUEST_TIMELINE_VERSION = 1;
+export const REQUEST_TIMELINE_VERSION = 3;
 
 export interface RequestRecord {
   /** Conversation id (groups turns of one agent session). */
@@ -37,6 +37,8 @@ export interface RequestRecord {
   end: number;
   /** Time-to-first-token in ms. */
   ttftMs: number | null;
+  /** Time per output token in ms. */
+  tpotMs: number | null;
   /** Input sequence length (tokens). */
   isl: number | null;
   /** Output sequence length (tokens). */
@@ -76,6 +78,8 @@ interface RawRecord {
   metadata?: RawMetadata;
   metrics?: {
     time_to_first_token?: RawMetricValue | number;
+    time_per_output_token?: RawMetricValue | number;
+    inter_token_latency?: RawMetricValue | number;
     input_sequence_length?: RawMetricValue | number;
     output_sequence_length?: RawMetricValue | number;
   };
@@ -108,6 +112,7 @@ export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | n
   const raw: {
     meta: RawMetadata;
     ttftMs: number | null;
+    tpotMs: number | null;
     isl: number | null;
     osl: number | null;
   }[] = [];
@@ -135,6 +140,10 @@ export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | n
     raw.push({
       meta,
       ttftMs: readNum(rec.metrics?.time_to_first_token) ?? null,
+      tpotMs:
+        readNum(rec.metrics?.time_per_output_token) ??
+        readNum(rec.metrics?.inter_token_latency) ??
+        null,
       isl: readNum(rec.metrics?.input_sequence_length) ?? null,
       osl: readNum(rec.metrics?.output_sequence_length) ?? null,
     });
@@ -163,6 +172,7 @@ export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | n
       ack,
       end,
       ttftMs: r.ttftMs,
+      tpotMs: r.tpotMs,
       isl: r.isl,
       osl: r.osl,
       cancelled: m.was_cancelled === true,
diff --git a/packages/db/src/etl/weka-structure.test.ts b/packages/db/src/etl/weka-structure.test.ts
index 95bfef38..5287b682 100644
--- a/packages/db/src/etl/weka-structure.test.ts
+++ b/packages/db/src/etl/weka-structure.test.ts
@@ -86,17 +86,18 @@ describe('buildConversationStructure', () => {
       id: 'c4',
       block_size: 64,
       requests: [
-        { type: 'n', model: 'main', in: 64, out: 10, hash_ids: [1] },
+        { type: 'n', model: 'main', t: 0, in: 64, out: 10, hash_ids: [1] },
         {
           type: 'subagent',
           agent_id: 'a1',
           subagent_type: 'Explore',
+          t: 12.5,
           duration_ms: 1234,
           requests: [
             // sees parent block 1 (snapshot at spawn) → 1 block cached
-            { type: 'n', model: 'sub', in: 128, out: 7, hash_ids: [1, 5] },
+            { type: 'n', model: 'sub', t: 12.5, in: 128, out: 7, hash_ids: [1, 5] },
             // now block 5 is also seen within the subagent → 2 cached
-            { type: 'n', model: 'sub', in: 128, out: 3, hash_ids: [1, 5] },
+            { type: 'n', model: 'sub', t: 13.1, in: 128, out: 3, hash_ids: [1, 5] },
           ],
         },
         // Parent turn after subagent: block 5 must NOT be cached (subagent
@@ -113,7 +114,10 @@ describe('buildConversationStructure', () => {
     expect(sub.label).toBe('Explore');
     expect(sub.agentId).toBe('a1');
     expect(sub.durationMs).toBe(1234);
+    expect(sub.startS).toBe(12.5);
+    expect(sub.endS).toBeCloseTo(13.734, 6);
     expect(sub.children).toHaveLength(2);
+    expect(sub.children.map((child) => child.startS)).toEqual([12.5, 13.1]);
     expect(sub.children[0].cached).toBe(64); // block 1 from parent snapshot
     expect(sub.children[1].cached).toBe(128); // blocks 1 & 5 now seen in child
     expect(sub.in).toBe(256);
@@ -132,6 +136,24 @@ describe('buildConversationStructure', () => {
     expect(s.blockSize).toBe(64);
     expect((s.nodes[0] as SubagentNode).label).toBe('Subagent');
   });
+
+  it('derives a subagent time range from child timings when group timing is absent', () => {
+    const conv: RawWekaConversation = {
+      id: 'c6',
+      requests: [
+        {
+          type: 'subagent',
+          requests: [
+            { type: 'n', t: 5, api_time: 2.5, in: 10, out: 1 },
+            { type: 'n', t: 9, api_time: 3, in: 10, out: 1 },
+          ],
+        },
+      ],
+    };
+    const sub = buildConversationStructure(conv).nodes[0] as SubagentNode;
+    expect(sub.startS).toBe(5);
+    expect(sub.endS).toBe(12);
+  });
 });
 
 describe('histograms', () => {
diff --git a/packages/db/src/etl/weka-structure.ts b/packages/db/src/etl/weka-structure.ts
index e4113c68..33e222b4 100644
--- a/packages/db/src/etl/weka-structure.ts
+++ b/packages/db/src/etl/weka-structure.ts
@@ -48,6 +48,8 @@ export interface RawWekaConversation {
 export interface TurnNode {
   kind: 'turn';
   turnIndex: number;
+  /** Seconds from the start of the conversation. */
+  startS?: number;
   model?: string;
   in: number;
   out: number;
@@ -61,6 +63,10 @@ export interface SubagentNode {
   kind: 'subagent';
   label: string;
   agentId?: string;
+  /** Seconds from the start of the conversation. */
+  startS?: number;
+  /** Seconds from the start of the conversation. */
+  endS?: number;
   durationMs?: number;
   in: number;
   out: number;
@@ -130,6 +136,51 @@ function subagentLabel(s: RawWekaSubagent): string {
   return base && base.length > 0 ? base : 'Subagent';
 }
 
+/**
+ * Normalize an optional raw timing value: returns it unchanged when it is a
+ * finite, non-negative number, and `undefined` otherwise.
+ */
+function finiteTime(value: number | undefined): number | undefined {
+  return typeof value === 'number' && Number.isFinite(value) && value >= 0 ? value : undefined;
+}
+
+/**
+ * Derive the time range of one subagent group. `duration_ms` is converted to
+ * seconds (divided by 1000) before being added to `t`.
+ *
+ * - `startS`: the group's own `t` when valid, else the earliest valid child `t`.
+ * - `endS`: `startS + duration_ms / 1000` when both are known; otherwise the
+ *   latest child end (child `t` plus its `api_time`, or plus 0 when `api_time`
+ *   is missing/invalid), falling back to `startS` when no child has a valid `t`.
+ *
+ * Both fields are `undefined` when neither the group nor any child has a valid `t`.
+ */
+function subagentTimeRange(entry: RawWekaSubagent): { startS?: number; endS?: number } {
+  const children = entry.requests ?? [];
+  const childStarts = children
+    .map((child) => finiteTime(child.t))
+    .filter((value): value is number => value !== undefined);
+  const startS =
+    finiteTime(entry.t) ?? (childStarts.length > 0 ? Math.min(...childStarts) : undefined);
+  const durationMs = finiteTime(entry.duration_ms);
+  if (startS !== undefined && durationMs !== undefined) {
+    return { startS, endS: startS + durationMs / 1000 };
+  }
+
+  // Start or duration unknown: fall back to the latest child end time.
+  const childEnds = children
+    .map((child) => {
+      const childStart = finiteTime(child.t);
+      if (childStart === undefined) return undefined;
+      return childStart + (finiteTime(child.api_time) ?? 0);
+    })
+    .filter((value): value is number => value !== undefined);
+  return {
+    startS,
+    endS: childEnds.length > 0 ? Math.max(...childEnds) : startS,
+  };
+}
+
 /**
  * Build the flamegraph structure for one conversation. Main turns share a single
  * accumulating prefix-cache `seen` set; each subagent group runs against a
@@ -153,6 +188,7 @@ export function buildConversationStructure(
 
   for (const entry of conv.requests ?? []) {
     if (isSubagent(entry)) {
+      const { startS, endS } = subagentTimeRange(entry);
       const childSeen = new Set(seen); // snapshot at spawn; not merged back
       const children: TurnNode[] = [];
       let gin = 0;
@@ -165,6 +201,7 @@ export function buildConversationStructure(
         children.push({
           kind: 'turn',
           turnIndex: turnIndex++,
+          startS: finiteTime(inner.t),
           model: inner.model,
           in: split.in,
           out,
@@ -180,6 +217,8 @@ export function buildConversationStructure(
         kind: 'subagent',
         label: subagentLabel(entry),
         agentId: entry.agent_id,
+        startS,
+        endS,
         durationMs: entry.duration_ms,
         in: gin,
         out: gout,
@@ -198,6 +237,7 @@ export function buildConversationStructure(
       nodes.push({
         kind: 'turn',
         turnIndex: turnIndex++,
+        startS: finiteTime(entry.t),
         model: entry.model,
         in: split.in,
         out,

From 13471d75072d574d42be008a462dbfce9467c95d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 23 Jun 2026 13:44:55 -0500
Subject: [PATCH 93/96] add dataset percentile distributions

---
 .../component/distribution-card.cy.tsx        | 41 ++++++++-
 .../cypress/e2e/datasets-distributions.cy.ts  | 90 +++++++++++++++++++
 .../components/datasets/dataset-detail.tsx    |  6 ++
 .../components/datasets/distribution-card.tsx | 23 +++--
 packages/app/src/hooks/api/use-datasets.ts    |  5 ++
 packages/db/src/etl/weka-structure.test.ts    | 18 ++++
 packages/db/src/etl/weka-structure.ts         | 46 ++++++++++
 packages/db/src/ingest-weka-dataset.ts        | 50 ++++-------
 8 files changed, 235 insertions(+), 44 deletions(-)
 create mode 100644 packages/app/cypress/e2e/datasets-distributions.cy.ts

diff --git a/packages/app/cypress/component/distribution-card.cy.tsx b/packages/app/cypress/component/distribution-card.cy.tsx
index fb7e5461..511505b9 100644
--- a/packages/app/cypress/component/distribution-card.cy.tsx
+++ b/packages/app/cypress/component/distribution-card.cy.tsx
@@ -8,7 +8,16 @@ const distribution: Distribution = {
     { x0: 200, x1: 300, count: 12 },
     { x0: 300, x1: 400, count: 3 },
   ],
-  stats: { count: 40, min: 10, max: 390, mean: 180, median: 175, p90: 320 },
+  stats: {
+    count: 40,
+    min: 10,
+    max: 390,
+    mean: 180,
+    median: 175,
+    p75: 250,
+    p90: 320,
+    p95: 360,
+  },
 };
 
 describe('DistributionCard', () => {
@@ -18,8 +27,13 @@ describe('DistributionCard', () => {
     );
     cy.contains('Input tokens per turn').should('be.visible');
     cy.contains('n=40').should('be.visible');
-    cy.contains('median 175').should('be.visible');
+    cy.contains('p50 175').should('be.visible');
+    cy.contains('p75 250').should('be.visible');
     cy.contains('p90 320').should('be.visible');
+    cy.contains('p95 360').should('be.visible');
+    cy.get(
+      'line[stroke="#3b82f6"], line[stroke="#22c55e"], line[stroke="#f59e0b"], line[stroke="#ef4444"]',
+    ).should('have.length', 8);
     // One filled bar rect per bin (ChartHover may add a transparent overlay rect).
     cy.get('rect[class*="fill-primary"]').should('have.length', distribution.bins.length);
   });
@@ -42,4 +56,27 @@ describe('DistributionCard', () => {
     );
     cy.contains('log scale').should('be.visible');
   });
+
+  it('renders older v1 stats without unavailable percentile guides', () => {
+    cy.mount(
+      <DistributionCard
+        title="Legacy metric"
+        unit="tok"
+        distribution={{
+          bins: distribution.bins,
+          stats: {
+            count: 40,
+            min: 10,
+            max: 390,
+            mean: 180,
+            median: 175,
+            p90: 320,
+          },
+        }}
+      />,
+    );
+    cy.contains('p50 175').should('be.visible');
+    cy.contains('p90 320').should('be.visible');
+    cy.contains('NaN').should('not.exist');
+  });
 });
diff --git a/packages/app/cypress/e2e/datasets-distributions.cy.ts b/packages/app/cypress/e2e/datasets-distributions.cy.ts
new file mode 100644
index 00000000..7edda341
--- /dev/null
+++ b/packages/app/cypress/e2e/datasets-distributions.cy.ts
@@ -0,0 +1,96 @@
+/**
+ * Build a two-bin chart_data distribution fixture whose stats combine fixed
+ * count/min/mean values with the supplied percentiles and max.
+ */
+const distribution = (values: {
+  median: number;
+  p75: number;
+  p90: number;
+  p95: number;
+  max: number;
+}) => ({
+  bins: [
+    { x0: 0, x1: 10, count: 5 },
+    { x0: 10, x1: 100, count: 15 },
+  ],
+  stats: {
+    count: 20,
+    min: 0,
+    mean: 40,
+    ...values,
+  },
+});
+
+describe('Dataset distribution percentiles', () => {
+  // Cypress clears intercepts (and, with test isolation, the page) before every
+  // test, so stub the API and visit per test rather than once in `before`.
+  beforeEach(() => {
+    cy.intercept('GET', '/api/v1/datasets/test-dataset', {
+      body: {
+        id: 'test-dataset',
+        slug: 'test-dataset',
+        label: 'Test dataset',
+        variant: 'full',
+        description: null,
+        hf_url: null,
+        license: 'apache-2.0',
+        conversation_count: 1,
+        summary: {
+          mainTurns: 20,
+          subagentGroups: 0,
+          subagentTurns: 0,
+          cachedPct: 0.5,
+          totalIn: 1000,
+          totalOut: 200,
+        },
+        chart_data: {
+          version: 2,
+          inputTokensPerTurn: distribution({
+            median: 100,
+            p75: 200,
+            p90: 300,
+            p95: 400,
+            max: 500,
+          }),
+          outputTokensPerTurn: distribution({
+            median: 10,
+            p75: 20,
+            p90: 30,
+            p95: 40,
+            max: 50,
+          }),
+          uncachedInputTokensPerTurn: distribution({
+            median: 0,
+            p75: 64,
+            p90: 128,
+            p95: 256,
+            max: 512,
+          }),
+        },
+        ingested_at: '2026-06-23T00:00:00Z',
+      },
+    });
+    cy.intercept('GET', '/api/v1/datasets/test-dataset/conversations*', {
+      body: { total: 0, items: [] },
+    });
+    cy.visit('/datasets/test-dataset');
+  });
+
+  it('shows P50/P75/P90/P95 for ISL, OSL, and uncached input', () => {
+    const expected = [
+      ['Input tokens per turn', ['p50 100', 'p75 200', 'p90 300', 'p95 400']],
+      ['Output tokens per turn', ['p50 10', 'p75 20', 'p90 30', 'p95 40']],
+      ['Uncached input tokens per request', ['p50 0', 'p75 64', 'p90 128', 'p95 256']],
+    ] as const;
+
+    for (const [title, percentiles] of expected) {
+      cy.contains('[data-slot="card"]', title).within(() => {
+        for (const percentile of percentiles) cy.contains(percentile).should('be.visible');
+        cy.get('svg line[stroke="#3b82f6"]').should('exist');
+        cy.get('svg line[stroke="#22c55e"]').should('exist');
+        cy.get('svg line[stroke="#f59e0b"]').should('exist');
+        cy.get('svg line[stroke="#ef4444"]').should('exist');
+      });
+    }
+  });
+});
diff --git a/packages/app/src/components/datasets/dataset-detail.tsx b/packages/app/src/components/datasets/dataset-detail.tsx
index 9410a505..ac8b2de5 100644
--- a/packages/app/src/components/datasets/dataset-detail.tsx
+++ b/packages/app/src/components/datasets/dataset-detail.tsx
@@ -145,6 +145,12 @@ export function DatasetDetail({ slug }: { slug: string }) {
             scale="log"
             distribution={cd.outputTokensPerTurn}
           />
+          <DistributionCard
+            title="Uncached input tokens per request"
+            unit="tokens"
+            scale="log"
+            distribution={cd.uncachedInputTokensPerTurn}
+          />
           <DistributionCard
             title="Turns per conversation"
             unit="turns"
diff --git a/packages/app/src/components/datasets/distribution-card.tsx b/packages/app/src/components/datasets/distribution-card.tsx
index d0c0f166..3d0e45d7 100644
--- a/packages/app/src/components/datasets/distribution-card.tsx
+++ b/packages/app/src/components/datasets/distribution-card.tsx
@@ -23,7 +23,7 @@ const PAD = { top: 12, right: 16, bottom: 48, left: 52 };
 
 /**
  * Renders a precomputed histogram (bins + stats from datasets.chart_data) as a
- * themeable bar chart with median/p90 guide lines and a hover tooltip. Bars are
+ * themeable bar chart with p50/p75/p90/p95 guide lines and a hover tooltip. Bars are
  * drawn at equal visual width; for log-scaled bins the edge labels are already
  * log-spaced so the shape reads as a log histogram.
  */
@@ -71,11 +71,17 @@ export function DistributionCard({
   const { bins, maxCount, innerW, innerH, n, barW, valueToX } = computed;
   const stats = distribution?.stats;
 
-  const guides = stats
-    ? ([
-        { label: 'median', value: stats.median, color: '#3b82f6' },
+  const guides: { label: string; value: number; color: string }[] = stats
+    ? [
+        { label: 'p50', value: stats.median, color: '#3b82f6' },
+        ...(typeof stats.p75 === 'number'
+          ? [{ label: 'p75', value: stats.p75, color: '#22c55e' }]
+          : []),
         { label: 'p90', value: stats.p90, color: '#f59e0b' },
-      ] as const)
+        ...(typeof stats.p95 === 'number'
+          ? [{ label: 'p95', value: stats.p95, color: '#ef4444' }]
+          : []),
+      ]
     : [];
 
   // X tick labels from a few bin edges.
@@ -108,8 +114,11 @@ export function DistributionCard({
       {subtitle && <div className="mb-1 text-xs text-muted-foreground">{subtitle}</div>}
       {stats && (
         <div className="mb-2 text-xs text-muted-foreground">
-          n={stats.count.toLocaleString()} · median {formatValue(stats.median)} · p90{' '}
-          {formatValue(stats.p90)} · max {formatValue(stats.max)} {unit}
+          n={stats.count.toLocaleString()} · p50 {formatValue(stats.median)}
+          {typeof stats.p75 === 'number' && <> · p75 {formatValue(stats.p75)}</>} · p90{' '}
+          {formatValue(stats.p90)}
+          {typeof stats.p95 === 'number' && <> · p95 {formatValue(stats.p95)}</>} · max{' '}
+          {formatValue(stats.max)} {unit}
         </div>
       )}
       <div className="w-full text-muted-foreground">
diff --git a/packages/app/src/hooks/api/use-datasets.ts b/packages/app/src/hooks/api/use-datasets.ts
index 3ce61a85..96b0f59f 100644
--- a/packages/app/src/hooks/api/use-datasets.ts
+++ b/packages/app/src/hooks/api/use-datasets.ts
@@ -46,7 +46,11 @@ export interface DistributionStats {
   max: number;
   mean: number;
   median: number;
+  /** Added in chart_data v2. */
+  p75?: number;
   p90: number;
+  /** Added in chart_data v2. */
+  p95?: number;
 }
 
 export interface Distribution {
@@ -57,6 +61,7 @@ export interface Distribution {
 export interface DatasetChartData {
   version?: number;
   inputTokensPerTurn?: Distribution;
+  uncachedInputTokensPerTurn?: Distribution;
   outputTokensPerTurn?: Distribution;
   turnsPerConversation?: Distribution;
   subagentGroupsPerConversation?: Distribution;
diff --git a/packages/db/src/etl/weka-structure.test.ts b/packages/db/src/etl/weka-structure.test.ts
index 5287b682..4debf1ae 100644
--- a/packages/db/src/etl/weka-structure.test.ts
+++ b/packages/db/src/etl/weka-structure.test.ts
@@ -4,6 +4,8 @@ import {
   buildConversationStructure,
   linearHistogram,
   logHistogram,
+  logHistogramWithZero,
+  summarizeValues,
   type RawWekaConversation,
   type SubagentNode,
   type TurnNode,
@@ -177,4 +179,20 @@ describe('histograms', () => {
     expect(linearHistogram([])).toEqual([]);
     expect(logHistogram([])).toEqual([]);
   });
+
+  it('preserves zero-valued samples in a dedicated log histogram bin', () => {
+    const bins = logHistogramWithZero([0, 0, 1, 10, 100], 4);
+    expect(bins[0]).toEqual({ x0: 0, x1: 1, count: 2 });
+    expect(bins.reduce((total, bin) => total + bin.count, 0)).toBe(5);
+  });
+});
+
+describe('summarizeValues', () => {
+  it('computes the same linearly-interpolated percentile set as request distributions', () => {
+    const summary = summarizeValues(Array.from({ length: 100 }, (_, i) => i + 1));
+    expect(summary.median).toBeCloseTo(50.5, 6);
+    expect(summary.p75).toBeCloseTo(75.25, 6);
+    expect(summary.p90).toBeCloseTo(90.1, 6);
+    expect(summary.p95).toBeCloseTo(95.05, 6);
+  });
 });
diff --git a/packages/db/src/etl/weka-structure.ts b/packages/db/src/etl/weka-structure.ts
index 33e222b4..ac7a6eab 100644
--- a/packages/db/src/etl/weka-structure.ts
+++ b/packages/db/src/etl/weka-structure.ts
@@ -274,6 +274,50 @@ export interface HistogramBin {
   count: number;
 }
 
+/** Summary statistics for a numeric sample, as returned by `summarizeValues`. */
+export interface NumberSummary {
+  count: number;
+  min: number;
+  max: number;
+  mean: number;
+  median: number;
+  p75: number;
+  p90: number;
+  p95: number;
+}
+
+/**
+ * Distribution summary with linear-interpolated percentiles.
+ *
+ * Quantile `q` is read at fractional rank `(n - 1) * q` of an ascending-sorted
+ * copy of `values`, interpolating linearly between the two neighbouring samples
+ * when the rank is not an integer. Empty input yields a summary of all zeros.
+ */
+export function summarizeValues(values: readonly number[]): NumberSummary {
+  if (values.length === 0) {
+    return { count: 0, min: 0, max: 0, mean: 0, median: 0, p75: 0, p90: 0, p95: 0 };
+  }
+  const sorted = [...values].toSorted((a, b) => a - b);
+  const quantile = (q: number): number => {
+    // Fractional rank in `sorted`; `lo`/`hi` are the samples on either side of it.
+    const pos = (sorted.length - 1) * q;
+    const lo = Math.floor(pos);
+    const hi = Math.ceil(pos);
+    if (lo === hi) return sorted[lo]!;
+    return sorted[lo]! + (sorted[hi]! - sorted[lo]!) * (pos - lo);
+  };
+  return {
+    count: sorted.length,
+    min: sorted[0]!,
+    max: sorted.at(-1)!,
+    mean: sorted.reduce((sum, value) => sum + value, 0) / sorted.length,
+    median: quantile(0.5),
+    p75: quantile(0.75),
+    p90: quantile(0.9),
+    p95: quantile(0.95),
+  };
+}
+
 /** Linear-width histogram over [0, max]. Empty input → []. */
 export function linearHistogram(values: readonly number[], bins = 40): HistogramBin[] {
   if (values.length === 0) return [];
@@ -313,3 +349,22 @@ export function logHistogram(values: readonly number[], bins = 40): HistogramBin
   }
   return out;
 }
+
+/**
+ * Log-width histogram that preserves zero as a dedicated first bin.
+ *
+ * Exact zeros are counted in a leading bin that starts at 0 and ends at the lower
+ * edge of the first positive bin (or at 1 when there is none). Positive values are
+ * binned by `logHistogram` using `bins - 1` bins (at least 1), so the zero bin
+ * takes one slot of the budget. With no zeros present this is `logHistogram` over
+ * the positive values with the full `bins`. Negative and NaN values match neither
+ * filter and are dropped.
+ */
+export function logHistogramWithZero(values: readonly number[], bins = 40): HistogramBin[] {
+  const zeroCount = values.filter((value) => value === 0).length;
+  const positive = values.filter((value) => value > 0);
+  if (zeroCount === 0) return logHistogram(positive, bins);
+  if (positive.length === 0) return [{ x0: 0, x1: 1, count: zeroCount }];
+  const positiveBins = logHistogram(positive, Math.max(1, bins - 1));
+  return [{ x0: 0, x1: positiveBins[0]?.x0 ?? 1, count: zeroCount }, ...positiveBins];
+}
diff --git a/packages/db/src/ingest-weka-dataset.ts b/packages/db/src/ingest-weka-dataset.ts
index 22069419..e00471d7 100644
--- a/packages/db/src/ingest-weka-dataset.ts
+++ b/packages/db/src/ingest-weka-dataset.ts
@@ -24,6 +24,8 @@ import {
   buildConversationStructure,
   linearHistogram,
   logHistogram,
+  logHistogramWithZero,
+  summarizeValues,
   type ConversationStructure,
   type RawWekaConversation,
   type TurnNode,
@@ -140,6 +142,7 @@ async function* iterRows(
 
 interface Accumulator {
   inputPerTurn: number[]; // effective input tokens, every turn (incl. subagent children)
+  uncachedInputPerTurn: number[];
   outputPerTurn: number[];
   cachedFractionPerTurn: number[]; // cached/in, for turns with in>0
   turnsPerConv: number[]; // main (top-level) turns
@@ -157,6 +160,7 @@ interface Accumulator {
 function newAccumulator(): Accumulator {
   return {
     inputPerTurn: [],
+    uncachedInputPerTurn: [],
     outputPerTurn: [],
     cachedFractionPerTurn: [],
     turnsPerConv: [],
@@ -174,6 +178,7 @@ function newAccumulator(): Accumulator {
 
 function recordTurn(acc: Accumulator, t: TurnNode): void {
   acc.inputPerTurn.push(t.in);
+  acc.uncachedInputPerTurn.push(t.uncached);
   acc.outputPerTurn.push(t.out);
   if (t.in > 0) acc.cachedFractionPerTurn.push(t.cached / t.in);
   if (t.model) acc.modelCounts[t.model] = (acc.modelCounts[t.model] ?? 0) + 1;
@@ -198,57 +203,32 @@ function accumulate(acc: Accumulator, s: ConversationStructure): void {
   }
 }
 
-interface NumberSummary {
-  count: number;
-  min: number;
-  max: number;
-  mean: number;
-  median: number;
-  p90: number;
-}
-
-function summarize(values: number[]): NumberSummary {
-  if (values.length === 0) {
-    return { count: 0, min: 0, max: 0, mean: 0, median: 0, p90: 0 };
-  }
-  const sorted = [...values].toSorted((a, b) => a - b);
-  const n = sorted.length;
-  // Quantile by position; q(0)=min, q(1)=max — avoids array-tail indexing that
-  // the linter rewrites to `.at(-1)` (which widens the type to `| undefined`).
-  const q = (p: number) => sorted[Math.min(n - 1, Math.max(0, Math.floor(p * (n - 1))))];
-  const sum = sorted.reduce((a, b) => a + b, 0);
-  return {
-    count: n,
-    min: q(0),
-    max: q(1),
-    mean: sum / n,
-    median: q(0.5),
-    p90: q(0.9),
-  };
-}
-
 function buildChartData(acc: Accumulator) {
   return {
-    version: 1,
+    version: 2,
     inputTokensPerTurn: {
       bins: logHistogram(acc.inputPerTurn),
-      stats: summarize(acc.inputPerTurn),
+      stats: summarizeValues(acc.inputPerTurn),
+    },
+    uncachedInputTokensPerTurn: {
+      bins: logHistogramWithZero(acc.uncachedInputPerTurn),
+      stats: summarizeValues(acc.uncachedInputPerTurn),
     },
     outputTokensPerTurn: {
       bins: logHistogram(acc.outputPerTurn),
-      stats: summarize(acc.outputPerTurn),
+      stats: summarizeValues(acc.outputPerTurn),
     },
     turnsPerConversation: {
       bins: linearHistogram(acc.turnsPerConv),
-      stats: summarize(acc.turnsPerConv),
+      stats: summarizeValues(acc.turnsPerConv),
     },
     subagentGroupsPerConversation: {
       bins: linearHistogram(acc.subagentGroupsPerConv),
-      stats: summarize(acc.subagentGroupsPerConv),
+      stats: summarizeValues(acc.subagentGroupsPerConv),
     },
     cachedFractionPerTurn: {
       bins: linearHistogram(acc.cachedFractionPerTurn, 20),
-      stats: summarize(acc.cachedFractionPerTurn),
+      stats: summarizeValues(acc.cachedFractionPerTurn),
     },
   };
 }

From 8bfe66408d6b8514031e47af1b94ede19c369d97 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 23 Jun 2026 16:10:02 -0500
Subject: [PATCH 94/96] use cumulative percentiles for agentic charts

---
 .../e2e/agentic-point-time-series.cy.ts       | 34 ++++++++++---------
 .../agentic-point/agentic-point-detail.tsx    |  7 ++--
 .../agentic-point/time-series-chart.test.ts   |  4 +--
 .../agentic-point/time-series-chart.tsx       | 20 ++++++++---
 4 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
index b0cfb60d..db59dda2 100644
--- a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
+++ b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts
@@ -46,15 +46,15 @@ describe('Agentic point request metric time series', () => {
     cy.visit('/inference/agentic/206885');
   });
 
-  it('renders rolling P75 interactivity and TTFT using profiling requests only', () => {
+  it('renders rolling P90 interactivity and TTFT by default using profiling requests only', () => {
     cy.get('[data-testid="interactivity-over-time-chart"]').within(() => {
       cy.contains('h2', 'Interactivity over time').should('be.visible');
       cy.get('[data-testid="interactivity-percentile-toggle"]')
         .find('[role="tab"][aria-selected="true"]')
-        .should('have.text', 'P75');
+        .should('have.text', 'P90');
       cy.get('svg circle').should('have.length', 5);
-      cy.get('svg').should('contain.text', 'P75 (rolling 50 req)');
-      cy.get('svg').should('contain.text', '1 / cumulative mean TPOT');
+      cy.get('svg').should('contain.text', 'P90 (rolling 50 req)');
+      cy.get('svg').should('contain.text', '1 / cumulative P90 TPOT');
       cy.get('svg path[stroke="#ef4444"]').should('have.length', 1);
     });
 
@@ -62,37 +62,39 @@ describe('Agentic point request metric time series', () => {
       cy.contains('h2', 'TTFT over time').should('be.visible');
       cy.get('svg circle').should('have.length', 5);
       cy.get('svg').should('contain.text', 'TTFT (s)');
-      cy.get('svg').should('contain.text', 'Cumulative mean TTFT');
+      cy.get('svg').should('contain.text', 'Cumulative P90 TTFT');
       cy.get('svg path[stroke="#ef4444"]').should('have.length', 1);
     });
   });
 
-  it('switches each chart independently from P75 to P90', () => {
+  it('switches each chart independently from P90 to P75', () => {
     cy.get('[data-testid="interactivity-over-time-chart"]').within(() => {
-      cy.contains('svg', 'P75 (rolling 50 req)')
+      cy.contains('svg', 'P90 (rolling 50 req)')
         .find('path')
         .first()
         .invoke('attr', 'd')
-        .as('p75Path');
-      cy.contains('button', 'P90').click();
+        .as('p90Path');
+      cy.contains('button', 'P75').click();
       cy.get('[data-testid="interactivity-percentile-toggle"]')
         .find('[role="tab"][aria-selected="true"]')
-        .should('have.text', 'P90');
-      cy.contains('svg', 'P90 (rolling 50 req)')
+        .should('have.text', 'P75');
+      cy.get('svg').should('contain.text', '1 / cumulative P75 TPOT');
+      cy.contains('svg', 'P75 (rolling 50 req)')
         .find('path')
         .first()
         .invoke('attr', 'd')
-        .then(function (p90Path) {
-          expect(p90Path).not.to.equal(this.p75Path);
+        .then(function (p75Path) {
+          expect(p75Path).not.to.equal(this.p90Path);
         });
     });
 
     cy.get('[data-testid="ttft-over-time-chart"]').within(() => {
       cy.get('[data-testid="ttft-percentile-toggle"]')
         .find('[role="tab"][aria-selected="true"]')
-        .should('have.text', 'P75');
-      cy.contains('button', 'P90').click();
-      cy.get('svg').should('contain.text', 'P90 (rolling 50 req)');
+        .should('have.text', 'P90');
+      cy.contains('button', 'P75').click();
+      cy.get('svg').should('contain.text', 'P75 (rolling 50 req)');
+      cy.get('svg').should('contain.text', 'Cumulative P75 TTFT');
     });
   });
 });
diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
index e24b7e6b..e1bc1524 100644
--- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
+++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx
@@ -137,7 +137,7 @@ function RequestMetricOverTime({
   timeline: RequestTimeline | null | undefined;
   isLoading: boolean;
 }) {
-  const [percentile, setPercentile] = useState<RequestPercentile>('p75');
+  const [percentile, setPercentile] = useState<RequestPercentile>('p90');
   const result = timeline ? rollingRequestMetric(timeline.requests, metric, percentile, 50) : null;
   const metricLabel = metric === 'ttft' ? 'TTFT' : 'Interactivity';
   const color = metric === 'ttft' ? '#f59e0b' : '#06b6d4';
@@ -174,7 +174,10 @@ function RequestMetricOverTime({
                 strokeWidth: 2.5,
               },
               {
-                name: metric === 'ttft' ? 'Cumulative mean TTFT' : '1 / cumulative mean TPOT',
+                name:
+                  metric === 'ttft'
+                    ? `Cumulative ${percentile.toUpperCase()} TTFT`
+                    : `1 / cumulative ${percentile.toUpperCase()} TPOT`,
                 data: result?.cumulative ?? [],
                 color: '#ef4444',
                 strokeWidth: 3,
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts
index 926772db..3506ff45 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts
@@ -38,7 +38,7 @@ describe('rollingRequestMetric', () => {
 
     expect(result.raw.at(-1)).toEqual({ t: 4, value: 0.4 });
     expect(result.trend.map((point) => point.value)).toEqual([0.1, 0.175, 0.25, 0.35]);
-    expect(result.cumulative.map((point) => point.value)).toEqual([0.1, 0.15, 0.2, 0.25]);
+    expect(result.cumulative.map((point) => point.value)).toEqual([0.1, 0.175, 0.25, 0.325]);
   });
 
   it('inverts the rolling TPOT percentile for interactivity', () => {
@@ -51,7 +51,7 @@ describe('rollingRequestMetric', () => {
 
     expect(result.raw.map((point) => point.value)).toEqual([100, 50, 1000 / 30]);
     expect(result.trend.at(-1)?.value).toBeCloseTo(1000 / 28, 8);
-    expect(result.cumulative.map((point) => point.value)).toEqual([100, 1000 / 15, 50]);
+    expect(result.cumulative.map((point) => point.value)).toEqual([100, 1000 / 19, 1000 / 28]);
   });
 
   it('drops warmup, cancelled, missing, and non-positive samples', () => {
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
index 749a17e4..0c0b5739 100644
--- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
+++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx
@@ -82,11 +82,21 @@ export function rollingRequestMetric(
     const latencyMs = quantile(sorted, q);
     return { t, value: metric === 'ttft' ? latencyMs / 1000 : 1000 / latencyMs };
   });
-  let latencySumMs = 0;
-  const cumulative = samples.map(({ t, latencyMs }, i) => {
-    latencySumMs += latencyMs;
-    const meanLatencyMs = latencySumMs / (i + 1);
-    return { t, value: metric === 'ttft' ? meanLatencyMs / 1000 : 1000 / meanLatencyMs };
+  const prefixLatencies: number[] = [];
+  const cumulative = samples.map(({ t, latencyMs }) => {
+    let lo = 0;
+    let hi = prefixLatencies.length;
+    while (lo < hi) {
+      const mid = (lo + hi) >> 1;
+      if (prefixLatencies[mid]! <= latencyMs) lo = mid + 1;
+      else hi = mid;
+    }
+    prefixLatencies.splice(lo, 0, latencyMs);
+    const cumulativeLatencyMs = quantile(prefixLatencies, q);
+    return {
+      t,
+      value: metric === 'ttft' ? cumulativeLatencyMs / 1000 : 1000 / cumulativeLatencyMs,
+    };
   });
 
   return { raw, trend, cumulative };

From e3e0bf43ddec5dd8c1d4f21e1c3f9baff469f8f9 Mon Sep 17 00:00:00 2001
From: Alec Ibarra <93070681+adibarra@users.noreply.github.com>
Date: Tue, 23 Jun 2026 18:34:16 -0500
Subject: [PATCH 95/96] fix(db): build each chart line from a single run, no
 cross-run/date stitching (#491)

---
 ..._latest_benchmarks_single_run_per_line.sql |  49 +++++
 .../src/json-provider.line-single-run.test.ts | 203 ++++++++++++++++++
 packages/db/src/json-provider.ts              |  50 +++--
 packages/db/src/queries/benchmarks.ts         |  58 +++--
 4 files changed, 323 insertions(+), 37 deletions(-)
 create mode 100644 packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql
 create mode 100644 packages/db/src/json-provider.line-single-run.test.ts

diff --git a/packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql b/packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql
new file mode 100644
index 00000000..039dfe09
--- /dev/null
+++ b/packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql
@@ -0,0 +1,49 @@
+-- ============================================================
+-- LATEST_BENCHMARKS — one run per line (no cross-run stitching)
+-- ============================================================
+--
+-- Previously the view did `distinct on (config_id, conc, isl, osl)` ordered by
+-- date desc — resolved INDEPENDENTLY per concurrency. So if a newer run
+-- re-measured only some concurrencies (a partial re-sweep), the concurrencies it
+-- skipped fell back to an older run that did measure them, and a single chart line
+-- ended up stitched from points produced by different runs on different dates.
+--
+-- A line is one config + sequence + offload mode
+-- (config_id, benchmark_type, isl, osl, offload_mode) plotted
+-- across concurrencies, and it must come from a SINGLE workflow run. We pick the
+-- newest run per line (newest date, then latest sweep by run_started_at, then
+-- highest workflow_run_id so exactly one run wins even on a same-day / null tie),
+-- then keep EVERY concurrency that one run measured. A partial re-sweep therefore
+-- truncates the line to its own concurrencies rather than borrowing an older run's.
+
+drop materialized view if exists latest_benchmarks;
+
+create materialized view latest_benchmarks as
+with winners as (
+  select distinct on (br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode)
+         br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode,
+         br.workflow_run_id as winning_run_id
+  from benchmark_results br
+  join latest_workflow_runs wr on wr.id = br.workflow_run_id
+  where br.error is null
+  order by br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode,
+           br.date desc, wr.run_started_at desc nulls last, br.workflow_run_id desc
+)
+select br.*
+from benchmark_results br
+join winners w
+  on  w.config_id      = br.config_id
+  and w.benchmark_type = br.benchmark_type
+  and w.isl is not distinct from br.isl
+  and w.osl is not distinct from br.osl
+  and w.offload_mode = br.offload_mode
+  and w.winning_run_id = br.workflow_run_id
+where br.error is null;
+
+-- Unique key now includes benchmark_type (part of the line key). One run per line
+-- guarantees one row per concurrency, so this stays unique and keeps
+-- REFRESH MATERIALIZED VIEW CONCURRENTLY working.
+create unique index latest_benchmarks_pk
+  on latest_benchmarks (config_id, conc, isl, osl, benchmark_type, offload_mode)
+  nulls not distinct;
+create index latest_benchmarks_model_idx on latest_benchmarks (config_id);
diff --git a/packages/db/src/json-provider.line-single-run.test.ts b/packages/db/src/json-provider.line-single-run.test.ts
new file mode 100644
index 00000000..b75fa26a
--- /dev/null
+++ b/packages/db/src/json-provider.line-single-run.test.ts
@@ -0,0 +1,203 @@
+import { mkdtempSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+
+import { afterAll, beforeAll, describe, expect, it } from 'vitest';
+
+import type { getLatestBenchmarks as GetLatestBenchmarks } from './json-provider.js';
+
+/**
+ * A chart line is one config + sequence + offload mode
+ * (config_id, benchmark_type, isl, osl, offload_mode) plotted across concurrencies, and it must
+ * come from a SINGLE workflow run. getLatestBenchmarks picks the
+ * newest run per line (date, then run_started_at, then workflow_run_id) and returns EVERY
+ * concurrency that one run measured — never stitching skipped concurrencies from an older run.
+ *
+ * These fixtures exercise the multi-concurrency cases the as-of test can't (it is single-conc):
+ * a partial re-sweep that must truncate the line, per-sequence line independence, and the
+ * same-day workflow_run_id tiebreak.
+ */
+
+const cfg = (id: number) => ({
+  id,
+  hardware: 'h100',
+  framework: 'vllm',
+  model: 'testm',
+  precision: 'fp8',
+  spec_method: 'none',
+  disagg: false,
+  is_multinode: false,
+  prefill_tp: 1,
+  prefill_ep: 1,
+  prefill_dp_attention: false,
+  prefill_num_workers: 1,
+  decode_tp: 1,
+  decode_ep: 1,
+  decode_dp_attention: false,
+  decode_num_workers: 1,
+  num_prefill_gpu: 0,
+  num_decode_gpu: 8,
+});
+
+const run = (id: number, githubId: number, startedAt: string | null, date: string) => ({
+  id,
+  github_run_id: githubId,
+  run_attempt: 1,
+  name: `run ${githubId}`,
+  status: 'completed',
+  conclusion: 'success',
+  head_sha: 'sha',
+  head_branch: 'main',
+  html_url: `https://github.com/x/runs/${githubId}`,
+  created_at: startedAt ?? `${date}T00:00:00Z`,
+  run_started_at: startedAt,
+  date,
+});
+
+let nextResultId = 1000; // shared counter: every result() call consumes the next unique row id
+const result = (
+  runDbId: number,
+  configId: number,
+  date: string,
+  conc: number,
+  tpot: number,
+  isl = 1024,
+  osl = 1024,
+  offloadMode = 'off',
+) => ({
+  id: nextResultId++,
+  workflow_run_id: runDbId,
+  config_id: configId,
+  benchmark_type: 'latency',
+  date,
+  isl,
+  osl,
+  conc,
+  offload_mode: offloadMode,
+  image: null,
+  metrics: { median_tpot: tpot },
+  error: null,
+  server_log_id: null,
+});
+
+const OLD = '2026-06-10'; // date of run A, the older full sweep
+const NEW = '2026-06-14'; // date of runs B, E and F (the newer sweeps); also the query date
+let getLatestBenchmarks: typeof GetLatestBenchmarks;
+
+beforeAll(async () => {
+  const dir = mkdtempSync(join(tmpdir(), 'infx-line-'));
+  writeFileSync(join(dir, 'configs.json'), JSON.stringify([cfg(1), cfg(2)]));
+  writeFileSync(
+    join(dir, 'workflow_runs.json'),
+    JSON.stringify([
+      run(10, 100, `${OLD}T04:00:00Z`, OLD), // run A: older full sweep
+      run(11, 101, `${NEW}T05:00:00Z`, NEW), // run B: newer partial re-sweep
+      run(20, 200, `${NEW}T07:00:00Z`, NEW), // run E: same-day, lower run id
+      run(21, 201, `${NEW}T07:00:00Z`, NEW), // run F: same-day, SAME timestamp, higher run id
+    ]),
+  );
+  writeFileSync(
+    join(dir, 'benchmark_results.json'),
+    JSON.stringify([
+      // config 1, seq (1024,1024): run A full sweep, run B partial re-sweep.
+      result(10, 1, OLD, 1, 0.1),
+      result(10, 1, OLD, 8, 0.18),
+      result(10, 1, OLD, 64, 0.5),
+      result(11, 1, NEW, 1, 0.09),
+      result(11, 1, NEW, 8, 0.16),
+      // config 1, seq (8192,1024): only run A measured it (run B skipped this sequence).
+      result(10, 1, OLD, 1, 0.2, 8192, 1024),
+      result(10, 1, OLD, 8, 0.3, 8192, 1024),
+      // Offload mode is an independent line dimension. A newer off-mode run must not hide
+      // the older on-mode line for the same config and sequence.
+      result(10, 1, OLD, 4, 0.25, 4096, 4096, 'on'),
+      result(11, 1, NEW, 4, 0.2, 4096, 4096, 'off'),
+      // config 2, seq (1024,1024): two same-day runs with identical run_started_at.
+      result(20, 2, NEW, 1, 0.5),
+      result(20, 2, NEW, 8, 0.6),
+      result(20, 2, NEW, 64, 0.7),
+      result(21, 2, NEW, 1, 0.4),
+      result(21, 2, NEW, 8, 0.45),
+    ]),
+  );
+  process.env.DUMP_DIR = dir;
+  const mod = await import('./json-provider.js');
+  getLatestBenchmarks = mod.getLatestBenchmarks;
+});
+
+afterAll(() => {
+  delete process.env.DUMP_DIR;
+});
+
+/** {conc, runUrl} points of one line: rows at (isl, osl) whose run_url matches, sorted by conc. */
+function line(
+  rows: { isl: number | null; osl: number | null; conc: number; run_url: string | null }[],
+  configRunUrlRe: RegExp,
+  isl: number,
+  osl: number,
+) {
+  return rows
+    .filter((r) => r.isl === isl && r.osl === osl && r.run_url?.match(configRunUrlRe))
+    .toSorted((a, b) => a.conc - b.conc)
+    .map((r) => ({ conc: r.conc, runUrl: r.run_url }));
+}
+
+describe('getLatestBenchmarks — one run per line', () => {
+  it('truncates a line to the newest run: a partial re-sweep hides the older run’s extra concs', () => {
+    const rows = getLatestBenchmarks('testm', NEW, false);
+    // config 1 / seq (1024,1024): run B (101) measured only conc 1 & 8. conc 64 from run A is gone.
+    const seq = line(rows, /runs\/(?:100|101)\//u, 1024, 1024);
+    expect(seq).toEqual([
+      { conc: 1, runUrl: 'https://github.com/x/runs/101/attempts/1' },
+      { conc: 8, runUrl: 'https://github.com/x/runs/101/attempts/1' },
+    ]);
+    expect(seq.some((p) => p.conc === 64)).toBe(false);
+  });
+
+  it('keeps a different sequence of the same config on its own winning run', () => {
+    const rows = getLatestBenchmarks('testm', NEW, false);
+    // seq (8192,1024) was only in run A; run B winning the other sequence must not erase it.
+    const seq = line(rows, /runs\/100\//u, 8192, 1024);
+    expect(seq).toEqual([
+      { conc: 1, runUrl: 'https://github.com/x/runs/100/attempts/1' },
+      { conc: 8, runUrl: 'https://github.com/x/runs/100/attempts/1' },
+    ]);
+  });
+
+  it('selects winning runs independently for each offload mode', () => {
+    const rows = getLatestBenchmarks('testm', NEW, false).filter(
+      (r) => r.isl === 4096 && r.osl === 4096,
+    );
+
+    expect(
+      rows
+        .map((r) => ({ offloadMode: r.offload_mode, runUrl: r.run_url }))
+        .toSorted((a, b) => a.offloadMode.localeCompare(b.offloadMode)),
+    ).toEqual([
+      { offloadMode: 'off', runUrl: 'https://github.com/x/runs/101/attempts/1' },
+      { offloadMode: 'on', runUrl: 'https://github.com/x/runs/100/attempts/1' },
+    ]);
+  });
+
+  it('breaks a same-day, same-timestamp tie by workflow_run_id (higher id wins the whole line)', () => {
+    const rows = getLatestBenchmarks('testm', NEW, false);
+    // config 2: run E (200, id 20) and run F (201, id 21) share run_started_at; F wins by id.
+    const seq = line(rows, /runs\/(?:200|201)\//u, 1024, 1024);
+    expect(seq).toEqual([
+      { conc: 1, runUrl: 'https://github.com/x/runs/201/attempts/1' },
+      { conc: 8, runUrl: 'https://github.com/x/runs/201/attempts/1' },
+    ]);
+    // run E's extra conc 64 must not bleed into run F's line.
+    expect(seq.some((p) => p.conc === 64)).toBe(false);
+  });
+
+  it('as of the older run, shows that run’s full sweep (no truncation by a later run)', () => {
+    const rows = getLatestBenchmarks('testm', NEW, false, '100');
+    const seq = line(rows, /runs\/100\//u, 1024, 1024);
+    expect(seq).toEqual([
+      { conc: 1, runUrl: 'https://github.com/x/runs/100/attempts/1' },
+      { conc: 8, runUrl: 'https://github.com/x/runs/100/attempts/1' },
+      { conc: 64, runUrl: 'https://github.com/x/runs/100/attempts/1' },
+    ]);
+  });
+});
diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts
index c23e5f48..4e548efe 100644
--- a/packages/db/src/json-provider.ts
+++ b/packages/db/src/json-provider.ts
@@ -72,6 +72,8 @@ interface RawBenchmarkResult {
   isl: number;
   osl: number;
   conc: number;
+  /** Added by the AgentX schema; older dumps omit it and are treated as off. */
+  offload_mode?: string;
   image: string | null;
   metrics: Record<string, number>;
   /** Added in migration 006; older dumps omit this field — surfaced as undefined. */
@@ -333,12 +335,11 @@ const STRIP_HISTORY_KEYS = new Set([
 ]);
 
 /**
- * Comparator for DISTINCT ON (config, conc, isl, osl) selection: latest calendar
- * day first, then — for sweeps on the same day — the latest workflow run first by
- * `run_started_at` (NULLS LAST). Mirrors the SQL date-filtered query and the
- * `latest_benchmarks` view (migration 003): a calendar day alone ties two same-day
- * sweeps, so without this an older run's points can shadow a same-day re-sweep.
- * `run_started_at` is an ISO-8601 string, so localeCompare orders it chronologically.
+ * Run-recency comparator used to pick the newest run per line: latest calendar day first,
+ * then — for sweeps on the same day — the latest workflow run first by `run_started_at`
+ * (NULLS LAST). Mirrors the `br.date DESC, wr.run_started_at DESC NULLS LAST` portion of the
+ * SQL ORDER BY; callers apply a `workflow_run_id` DESC final tiebreak on top so exactly one
+ * run wins. `run_started_at` is an ISO-8601 string, so localeCompare orders it chronologically.
  * Exported so the same-day tiebreak is unit-tested in parity with the SQL.
  */
 export function compareBenchmarkRecency(
@@ -355,6 +356,10 @@ export function compareBenchmarkRecency(
   return bStarted.localeCompare(aStarted);
 }
 
+/** Chart-line identity: config + benchmark type + sequence + offload mode (missing -> 'off'). */
+const lineKey = (br: RawBenchmarkResult): string =>
+  `${br.config_id}:${br.benchmark_type}:${br.isl}:${br.osl}:${br.offload_mode ?? 'off'}`;
+
 export function getLatestBenchmarks(
   modelKey: string | string[],
   date?: string,
@@ -390,27 +395,32 @@ export function getLatestBenchmarks(
     return true;
   });
 
-  // DISTINCT ON (config_id, conc, isl, osl) — keep the one with the latest date,
-  // tiebreaking same-day runs by run_started_at so the latest sweep wins.
-  const seen = new Map<string, RawBenchmarkResult>();
-  candidates.sort((a, b) =>
-    compareBenchmarkRecency(
+  // Single run per LINE (config_id, benchmark_type, isl, osl, offload_mode): pick the newest run that
+  // produced data for the line, then keep EVERY concurrency that one run measured. Sort by
+  // recency (date, then run_started_at) with a final workflow_run_id DESC tiebreak so exactly
+  // one run wins even when run_started_at is equal/null — matching the SQL ORDER BY.
+  candidates.sort((a, b) => {
+    const recency = compareBenchmarkRecency(
       toDateString(a.date),
       toDateString(b.date),
       s.latestRunsById.get(a.workflow_run_id)?.run_started_at ?? null,
       s.latestRunsById.get(b.workflow_run_id)?.run_started_at ?? null,
-    ),
-  );
+    );
+    return recency === 0 ? b.workflow_run_id - a.workflow_run_id : recency;
+  });
+  const winningRun = new Map<string, number>();
   for (const br of candidates) {
-    const key = `${br.config_id}:${br.conc}:${br.isl}:${br.osl}`;
-    if (!seen.has(key)) seen.set(key, br);
+    const key = lineKey(br);
+    if (!winningRun.has(key)) winningRun.set(key, br.workflow_run_id);
   }
 
-  return [...seen.values()].map((br) => {
-    const c = s.configs.get(br.config_id)!;
-    const wr = s.latestRunsById.get(br.workflow_run_id)!;
-    return toBenchmarkRow(br, c, wr);
-  });
+  return candidates
+    .filter((br) => winningRun.get(lineKey(br)) === br.workflow_run_id)
+    .map((br) => {
+      const c = s.configs.get(br.config_id)!;
+      const wr = s.latestRunsById.get(br.workflow_run_id)!;
+      return toBenchmarkRow(br, c, wr);
+    });
 }
 
 /** In-memory mirror of {@link import('./queries/benchmarks.js').getBenchmarksForRun}. */
diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts
index 6833756a..37301e2b 100644
--- a/packages/db/src/queries/benchmarks.ts
+++ b/packages/db/src/queries/benchmarks.ts
@@ -51,9 +51,14 @@ export interface BenchmarkRow {
 /**
  * Fetch the latest benchmark results for one or more model DB keys across ALL sequences,
  * up to a given date. Multiple keys support point-release grouping — e.g. passing
- * `['glm5', 'glm5.1']` unions both buckets under the one display. Returns the most recent
- * result per (config, concurrency, isl, osl) — so every GPU/framework + sequence combo
- * that has been benchmarked appears, with the newest data winning.
+ * `['glm5', 'glm5.1']` unions both buckets under the one display.
+ *
+ * Selection unit is the LINE, not the point: for each line
+ * `(config_id, benchmark_type, isl, osl, offload_mode)` we pick the single newest workflow run that
+ * produced data for it (newest date, then latest sweep, then highest run id) and return
+ * EVERY concurrency that one run measured — and nothing from any other run. A partial
+ * re-sweep therefore truncates the line to its own concurrencies rather than stitching the
+ * skipped ones from an older run. This guarantees a line never mixes runs/dates.
  *
  * The frontend filters by sequence client-side. This eliminates API round-trips when
  * switching sequences — the data is already cached by React Query.
@@ -74,13 +79,8 @@ export async function getLatestBenchmarks(
 ): Promise<BenchmarkRow[]> {
   const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey];
   if (date) {
-    // Date-filtered: use base table with DISTINCT ON (the view only has the absolute latest)
-    // exact=true: only return data from this exact date (for GPU comparison)
-    // exact=false (default): return latest data as of this date (for main chart)
-    // Same-day tiebreak by wr.run_started_at (latest sweep wins), mirroring the
-    // latest_benchmarks view (migration 003). br.date is a calendar day, so two
-    // sweeps on the same day tie on date alone and Postgres would otherwise pick
-    // an arbitrary one — leaving an older run's points shadowing a same-day re-sweep.
+    // Date-filtered: use the base table (the view only has the absolute latest).
+    // exact=true: only this exact date (GPU comparison); exact=false (default): as of this date.
     const dateFilter = exact ? sql`br.date = ${date}::date` : sql`br.date <= ${date}::date`;
     // "As of run" filter (main chart only): keep results whose run started no later
     // than the selected run. run_started_at is an absolute timestamp, so this also
@@ -97,8 +97,29 @@ export async function getLatestBenchmarks(
             )
           )`
         : sql``;
+    // winners: the single newest run per LINE
+    // (config_id, benchmark_type, isl, osl, offload_mode) under the
+    // date/run cutoff. br.date is a calendar day, so two same-day sweeps tie on date — break
+    // by wr.run_started_at (latest sweep wins), then br.workflow_run_id so exactly one run wins
+    // even when run_started_at is equal/null. The outer join then pulls EVERY concurrency that
+    // winning run measured for the line, so the line is built from one run only (no carry-forward
+    // of concurrencies a partial re-sweep skipped).
     const rows = await sql`
-      SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl)
+      WITH winners AS (
+        SELECT DISTINCT ON (br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode)
+          br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode,
+          br.workflow_run_id AS winning_run_id
+        FROM benchmark_results br
+        JOIN configs c ON c.id = br.config_id
+        JOIN latest_workflow_runs wr ON wr.id = br.workflow_run_id
+        WHERE c.model = ANY(${modelKeys})
+          AND br.error IS NULL
+          AND ${dateFilter}
+          ${runFilter}
+        ORDER BY br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode,
+                 br.date DESC, wr.run_started_at DESC NULLS LAST, br.workflow_run_id DESC
+      )
+      SELECT
         br.id,
         c.hardware,
         c.framework,
@@ -130,12 +151,15 @@ export async function getLatestBenchmarks(
       FROM benchmark_results br
       JOIN configs c ON c.id = br.config_id
       JOIN latest_workflow_runs wr ON wr.id = br.workflow_run_id
-      WHERE c.model = ANY(${modelKeys})
-        AND br.error IS NULL
-        AND ${dateFilter}
-        ${runFilter}
-      ORDER BY br.config_id, br.conc, br.isl, br.osl,
-               br.date DESC, wr.run_started_at DESC NULLS LAST
+      JOIN winners w
+        ON w.config_id = br.config_id
+        AND w.benchmark_type = br.benchmark_type
+        AND w.isl IS NOT DISTINCT FROM br.isl
+        AND w.osl IS NOT DISTINCT FROM br.osl
+        AND w.offload_mode = br.offload_mode
+        AND w.winning_run_id = br.workflow_run_id
+      WHERE br.error IS NULL
+      ORDER BY br.config_id, br.conc, br.isl, br.osl
     `;
     return rows as unknown as BenchmarkRow[];
   }

From 2c3bb6dcaaff6c04ec56928cc08843b267c464bb Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 23 Jun 2026 23:08:36 -0500
Subject: [PATCH 96/96] Default agentic charts to interactivity

---
 packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts          | 7 ++++---
 packages/app/src/components/inference/InferenceContext.tsx | 4 ++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
index 636a7ccf..df199b81 100644
--- a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
+++ b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts
@@ -9,13 +9,14 @@ describe('X-Axis Mode Toggle (inference chart)', () => {
     cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 1);
   });
 
-  it('shows the x-axis mode buttons with Interactivity active by default', () => {
+  it('shows Interactivity by default for the agentic view', () => {
+    cy.get('[data-testid="scenario-selector"]').should('contain.text', 'Agentic Traces');
     cy.get('[data-testid="x-axis-mode-ttft"]').should('be.visible');
     cy.get('[data-testid="x-axis-mode-e2e"]').should('be.visible');
     cy.get('[data-testid="x-axis-mode-interactivity"]')
       .should('be.visible')
       .and('have.attr', 'aria-selected', 'true');
-    cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'vs. Interactivity');
+    cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Interactivity');
   });
 
   it('switches the x-axis to TTFT and updates the heading', () => {
@@ -37,6 +38,6 @@ describe('X-Axis Mode Toggle (inference chart)', () => {
       'aria-selected',
       'true',
     );
-    cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'vs. Interactivity');
+    cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Interactivity');
   });
 });
diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx
index 839afeed..ddb923b8 100644
--- a/packages/app/src/components/inference/InferenceContext.tsx
+++ b/packages/app/src/components/inference/InferenceContext.tsx
@@ -533,7 +533,7 @@ export function InferenceProvider({
 
   // Reconcile the x-axis mode with the scenario kind:
   //  - On mount with no `i_xmode` URL param: snap to the kind's natural default
-  //    (agentic → ttft, fixed → interactivity). The state itself was initialized
+  //    (interactivity for both agentic and fixed-sequence scenarios). The state was initialized
   //    to a SSR-stable constant so server and client render the same DOM; this
   //    effect fixes it up after hydration.
   //  - When the user later switches sequence kinds: snap to the new kind's
@@ -565,7 +565,7 @@ export function InferenceProvider({
       // — fall through to the default snap below.
       return;
     }
-    handleSetXAxisMode(kind === 'agentic' ? 'ttft' : 'interactivity');
+    handleSetXAxisMode('interactivity');
   }, [effectiveSequence, selectedXAxisMode, handleSetXAxisMode]);
 
   // Reconcile selectedE2eXAxisMetric whenever the mode, sequence kind, or