From 0e35e5f0b10c2c9db10094031a2ac92e59fff9f3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 23 Apr 2026 13:40:27 -0500 Subject: [PATCH 01/96] feat: agentic benchmark ingest + UI with offload-mode halo Adds agentic_traces scenario end-to-end: - Schema migrations for agentic scenario, availability, and KV offload mode - DB ingest/ETL + query updates to carry scenario, offload_mode, and server/theoretical cache-hit rates through to the API layer - Frontend types, filters (GlobalFilterContext / InferenceContext / ChartControls), URL state, and tooltip rows for agentic-only fields - ScatterGraph: subtle dashed halo on Pareto-frontier points that used KV offload so the tradeoff is visible at a glance --- packages/app/cypress/support/mock-data.ts | 2 + .../app/src/app/api/unofficial-run/route.ts | 2 + .../src/components/GlobalFilterContext.tsx | 12 +- .../components/inference/InferenceContext.tsx | 15 ++- .../inference/hooks/useChartData.ts | 34 +++-- .../app/src/components/inference/types.ts | 26 ++++ .../components/inference/ui/ChartControls.tsx | 27 +++- .../components/inference/ui/ScatterGraph.tsx | 21 +++ .../inference/utils/tooltipUtils.ts | 54 +++++++- .../app/src/components/ui/chart-selectors.tsx | 124 ++++++++++++++++++ .../unofficial-run-provider.test.ts | 2 + .../components/unofficial-run-provider.tsx | 4 +- packages/app/src/lib/api.ts | 14 +- .../app/src/lib/benchmark-transform.test.ts | 2 + packages/app/src/lib/benchmark-transform.ts | 65 ++++++++- packages/app/src/lib/data-mappings.ts | 72 +++++++++- packages/app/src/lib/url-state.ts | 2 + packages/constants/src/models.ts | 17 +++ .../db/migrations/002_agentic_scenario.sql | 30 +++++ .../migrations/003_agentic_availability.sql | 21 +++ packages/db/migrations/004_offload_mode.sql | 42 ++++++ packages/db/src/etl/benchmark-ingest.ts | 28 ++-- packages/db/src/etl/benchmark-mapper.ts | 45 ++++++- packages/db/src/ingest-ci-run.ts | 6 +- packages/db/src/ingest-gcs-backup.ts | 6 +- 
packages/db/src/ingest-supplemental.ts | 14 +- packages/db/src/json-provider.ts | 8 +- packages/db/src/queries/benchmarks.ts | 13 +- packages/db/src/queries/workflow-info.ts | 15 ++- 29 files changed, 645 insertions(+), 78 deletions(-) create mode 100644 packages/db/migrations/002_agentic_scenario.sql create mode 100644 packages/db/migrations/003_agentic_availability.sql create mode 100644 packages/db/migrations/004_offload_mode.sql diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts index e6720c0b..7a4f59a9 100644 --- a/packages/app/cypress/support/mock-data.ts +++ b/packages/app/cypress/support/mock-data.ts @@ -189,6 +189,8 @@ export function createMockInferenceContext( workflowInfo: null, selectedYAxisMetric: 'y_tpPerGpu', setSelectedYAxisMetric: namedStub('setSelectedYAxisMetric'), + selectedPercentile: 'median', + setSelectedPercentile: namedStub('setSelectedPercentile'), selectedXAxisMetric: null, setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'), selectedE2eXAxisMetric: null, diff --git a/packages/app/src/app/api/unofficial-run/route.ts b/packages/app/src/app/api/unofficial-run/route.ts index 79ac0665..dbfb9c33 100644 --- a/packages/app/src/app/api/unofficial-run/route.ts +++ b/packages/app/src/app/api/unofficial-run/route.ts @@ -49,6 +49,8 @@ export function normalizeArtifactRows( decode_num_workers: config.decodeNumWorkers, num_prefill_gpu: config.numPrefillGpu, num_decode_gpu: config.numDecodeGpu, + benchmark_type: params.benchmarkType, + offload_mode: params.offloadMode, isl: params.isl, osl: params.osl, conc: params.conc, diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx index 65f510cd..f603081a 100644 --- a/packages/app/src/components/GlobalFilterContext.tsx +++ b/packages/app/src/components/GlobalFilterContext.tsx @@ -11,7 +11,7 @@ import { useState, } from 'react'; -import { DISPLAY_MODEL_TO_DB, islOslToSequence } from 
'@semianalysisai/inferencex-constants'; +import { DISPLAY_MODEL_TO_DB, rowToSequence } from '@semianalysisai/inferencex-constants'; import { useAvailability } from '@/hooks/api/use-availability'; import { useWorkflowInfo } from '@/hooks/api/use-workflow-info'; @@ -172,11 +172,7 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) { const availableSequences = useMemo(() => { if (!availabilityRows) return SEQUENCE_OPTIONS; const seqs = [ - ...new Set( - modelRows - .map((r) => islOslToSequence(r.isl, r.osl)) - .filter((s): s is Sequence => s !== null), - ), + ...new Set(modelRows.map((r) => rowToSequence(r)).filter((s): s is Sequence => s !== null)), ]; return seqs.length > 0 ? seqs : SEQUENCE_OPTIONS; }, [availabilityRows, modelRows]); @@ -190,7 +186,7 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) { // Precisions available for the selected model + sequence const availablePrecisions = useMemo(() => { if (!availabilityRows) return ['fp4']; - const rows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence); + const rows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence); const precs = [...new Set(rows.map((r) => r.precision))].toSorted(); return precs.length > 0 ? 
precs : ['fp4']; }, [availabilityRows, modelRows, effectiveSequence]); @@ -205,7 +201,7 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) { // Dates available for selected model + sequence + precisions const availableDates = useMemo(() => { if (!availabilityRows) return []; - const seqRows = modelRows.filter((r) => islOslToSequence(r.isl, r.osl) === effectiveSequence); + const seqRows = modelRows.filter((r) => rowToSequence(r) === effectiveSequence); const rows = seqRows.filter((r) => effectivePrecisions.includes(r.precision)); if (rows.length === 0) { return [...new Set(seqRows.map((r) => r.date))].toSorted(); diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index 7fa416fd..6f45d8d7 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -11,7 +11,7 @@ import { useState, } from 'react'; -import { DISPLAY_MODEL_TO_DB, islOslToSequence } from '@semianalysisai/inferencex-constants'; +import { DISPLAY_MODEL_TO_DB, rowToSequence } from '@semianalysisai/inferencex-constants'; import { track } from '@/lib/analytics'; import { FAVORITE_PRESETS, type FavoritePreset } from '@/components/favorites/favorite-presets'; @@ -110,6 +110,11 @@ export function InferenceProvider({ const [selectedE2eXAxisMetric, setSelectedE2eXAxisMetric] = useState( () => getUrlParam('i_e2e_xmetric') || null, ); + // Latency percentile applied to the chart x-axis for agentic scenarios. + // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore. 
+ const [selectedPercentile, setSelectedPercentile] = useState( + () => getUrlParam('i_pctl') || 'median', + ); const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>( () => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto', ); @@ -163,6 +168,7 @@ export function InferenceProvider({ effectiveRunDate, isActive, latestDate, + selectedPercentile, ); // For GPU comparison date picker — use shared availability data from global filters @@ -176,7 +182,7 @@ export function InferenceProvider({ if (!availabilityRows) return availableDates; const rows = availabilityRows.filter((r) => { if (!dbModelKeys.includes(r.model)) return false; - if (islOslToSequence(r.isl, r.osl) !== effectiveSequence) return false; + if (rowToSequence(r) !== effectiveSequence) return false; if (!effectivePrecisions.includes(r.precision)) return false; if (!r.hardware) return false; const hwKey = buildAvailabilityHwKey(r.hardware, r.framework, r.spec_method, r.disagg); @@ -201,7 +207,7 @@ export function InferenceProvider({ const hwKeys = new Set(); for (const r of availabilityRows) { if (!dbModelKeys.includes(r.model)) continue; - if (islOslToSequence(r.isl, r.osl) !== effectiveSequence) continue; + if (rowToSequence(r) !== effectiveSequence) continue; if (!effectivePrecisions.includes(r.precision)) continue; if (!r.hardware) continue; const hwKey = buildAvailabilityHwKey(r.hardware, r.framework, r.spec_method, r.disagg); @@ -589,6 +595,7 @@ export function InferenceProvider({ useUrlStateSync( { i_metric: selectedYAxisMetric, + i_pctl: selectedPercentile, i_gpus: selectedGPUs.join(','), i_dates: selectedDates.join(','), i_dstart: selectedDateRange.startDate, @@ -783,6 +790,8 @@ export function InferenceProvider({ workflowInfo, selectedYAxisMetric, setSelectedYAxisMetric: setSelectedYAxisMetricAndClear, + selectedPercentile, + setSelectedPercentile, selectedGPUs, setSelectedGPUs: setSelectedGPUsAndClear, availableGPUs, diff --git 
a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts index 625e63ab..81ab0780 100644 --- a/packages/app/src/components/inference/hooks/useChartData.ts +++ b/packages/app/src/components/inference/hooks/useChartData.ts @@ -1,7 +1,7 @@ import { useMemo, useRef } from 'react'; import { useQueries } from '@tanstack/react-query'; -import { sequenceToIslOsl } from '@semianalysisai/inferencex-constants'; +import { rowToSequence } from '@semianalysisai/inferencex-constants'; import chartDefinitions from '@/components/inference/inference-chart-config.json'; import type { @@ -15,7 +15,7 @@ import type { import { filterDataByCostLimit } from '@/components/inference/utils'; import { useBenchmarks, benchmarkQueryOptions } from '@/hooks/api/use-benchmarks'; import { GPU_ALIAS_TO_CANONICAL, getModelSortIndex } from '@/lib/constants'; -import { transformBenchmarkRows } from '@/lib/benchmark-transform'; +import { transformBenchmarkRows, withPercentile } from '@/lib/benchmark-transform'; import type { Model, Sequence } from '@/lib/data-mappings'; import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils'; @@ -79,6 +79,7 @@ export function useChartData( selectedRunDate?: string, enabled = true, latestAvailableDate?: string, + selectedPercentile = 'median', ) { // When the selected date is the latest available, use '' (empty string) to match // the initial no-date query key, reusing the eagerly-fetched benchmarks from the @@ -119,11 +120,13 @@ export function useChartData( // Merge main rows with comparison date rows. // Stamp each row with the *requested* date (not the actual DB date) so that // GPUGraph's activeDates filter (keyed by user-selected date) matches the points. 
- const sequenceIslOsl = useMemo(() => sequenceToIslOsl(selectedSequence), [selectedSequence]); + // + // rowToSequence handles both fixed-seq (via isl/osl) and agentic (via + // benchmark_type), so one filter covers every scenario. const rows = useMemo(() => { - if (!allRows || !sequenceIslOsl) return []; - const seqFilter = (r: { isl: number; osl: number }) => - r.isl === sequenceIslOsl.isl && r.osl === sequenceIslOsl.osl; + if (!allRows) return []; + const seqFilter = (r: { isl: number | null; osl: number | null; benchmark_type: string }) => + rowToSequence(r) === selectedSequence; const seqFiltered = allRows.filter(seqFilter); // For each (hw, framework, spec_method, disagg, precision) group, keep only @@ -150,14 +153,14 @@ export function useChartData( .map((r) => ({ ...r, date: comparisonDates[i], actualDate: r.date })), ); return [...mainRows, ...extraRows]; - }, [allRows, sequenceIslOsl, comparisonDates, comparisonDataKey, selectedRunDate]); + }, [allRows, selectedSequence, comparisonDates, comparisonDataKey, selectedRunDate]); // Transform filtered rows into chart data const { chartData, hardwareConfig: rawHardwareConfig } = useMemo(() => { if (rows.length === 0) return { chartData: [] as InferenceData[][], hardwareConfig: {} as HardwareConfig }; - return transformBenchmarkRows(rows); - }, [rows]); + return transformBenchmarkRows(rows, selectedPercentile); + }, [rows, selectedPercentile]); // Sort hardware config — stabilize reference when keys haven't changed. // Different sequences for the same model often have the same GPU configs, @@ -192,8 +195,11 @@ export function useChartData( (chartDefinitions as ChartDefinition[]).map((chartDef) => { const metricKey = selectedYAxisMetric.replace('y_', '') as YAxisMetricKey; - // Determine dynamic x-axis - let xAxisField: keyof AggDataEntry = chartDef.x; + // Default x-axis = chart's natural latency metric, percentile-adjusted + // for the agentic case (median_e2el → p99_e2el etc.). 
For non-agentic + // scenarios `withPercentile` is a no-op when percentile === 'median'. + const naturalX = withPercentile(chartDef.x, selectedPercentile) as keyof AggDataEntry; + let xAxisField: keyof AggDataEntry = naturalX; let xAxisLabel = chartDef.x_label; const metricTitle = @@ -232,8 +238,10 @@ export function useChartData( // (e.g. interactivity → TTFT: "higher is better" → "lower is better"). // E2EL → TTFT keeps the same direction ("lower is better" for both), // so no roofline flip is needed for the e2e chart. + // Compare against `naturalX` (percentile-adjusted) — switching the + // percentile of the same logical metric is NOT a flip. const xAxisFlipped = - xAxisField !== chartDef.x && !(chartDef.chartType === 'e2e' && isTtftOverride); + xAxisField !== naturalX && !(chartDef.chartType === 'e2e' && isTtftOverride); const yLabelKey = `${selectedYAxisMetric}_label` as keyof ChartDefinition; const dynamicYLabel = chartDef[yLabelKey]; @@ -261,7 +269,7 @@ export function useChartData( xAxisField, }; }), - [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric], + [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric, selectedPercentile], ); // Build renderable graphs (data processing + stable chart definitions) diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts index a23707ba..53c8d84c 100644 --- a/packages/app/src/components/inference/types.ts +++ b/packages/app/src/components/inference/types.ts @@ -88,6 +88,29 @@ export interface AggDataEntry { actualDate?: string; /** URL to the GitHub Actions workflow run that produced this data point. */ run_url?: string; + /** Benchmark scenario: `single_turn` (fixed-seq isl/osl) or `agentic_traces`. */ + benchmark_type?: string; + /** ISL in tokens — null for agentic_traces. */ + isl?: number | null; + /** OSL in tokens — null for agentic_traces. 
*/ + osl?: number | null; + // ── Agentic-only fields (populated from metrics JSONB for `agentic_traces` rows) ── + /** "on" | "off" — whether KV cache offload to CPU was enabled. */ + offload_mode?: string; + /** Actual server-observed GPU prefix-cache hit rate (0..1). */ + server_gpu_cache_hit_rate?: number; + /** Actual server-observed CPU prefix-cache hit rate (0..1). */ + server_cpu_cache_hit_rate?: number; + /** Infinite-cache theoretical hit rate (0..1) computed from trace. */ + theoretical_cache_hit_rate?: number; + /** Total requests attempted during the window. */ + num_requests_total?: number; + /** Requests that completed successfully. */ + num_requests_successful?: number; + /** Total prompt tokens served. */ + total_prompt_tokens?: number; + /** Total generated (output) tokens. */ + total_generation_tokens?: number; } /** @@ -468,6 +491,9 @@ export interface InferenceChartContextType { workflowInfo: any; selectedYAxisMetric: string; setSelectedYAxisMetric: (metric: string) => void; + /** Latency percentile for the x-axis under agentic scenarios (median/p90/p99/p99.9). 
*/ + selectedPercentile: string; + setSelectedPercentile: (p: string) => void; selectedXAxisMetric: string | null; setSelectedXAxisMetric: (metric: string | null) => void; selectedE2eXAxisMetric: string | null; diff --git a/packages/app/src/components/inference/ui/ChartControls.tsx b/packages/app/src/components/inference/ui/ChartControls.tsx index 5f8e7787..e4f55ad7 100644 --- a/packages/app/src/components/inference/ui/ChartControls.tsx +++ b/packages/app/src/components/inference/ui/ChartControls.tsx @@ -1,11 +1,14 @@ 'use client'; +import { useEffect, useState } from 'react'; + import { track } from '@/lib/analytics'; import { useInference } from '@/components/inference/InferenceContext'; import { ModelSelector, - SequenceSelector, + ScenarioSelector, + PercentileSelector, PrecisionSelector, } from '@/components/ui/chart-selectors'; import { DateRangePicker } from '@/components/ui/date-range-picker'; @@ -23,7 +26,7 @@ import { import { TooltipProvider } from '@/components/ui/tooltip'; import chartDefinitions from '@/components/inference/inference-chart-config.json'; import type { ChartDefinition } from '@/components/inference/types'; -import type { Model, Sequence } from '@/lib/data-mappings'; +import { Sequence, type Model, type Percentile } from '@/lib/data-mappings'; // Build Y-axis metric options from static chart config JSON — available immediately, no API wait const METRIC_GROUPS = [ @@ -78,6 +81,13 @@ interface ChartControlsProps { } export default function ChartControls({ hideGpuComparison = false }: ChartControlsProps) { + // The percentile selector is rendered conditionally on `selectedSequence`, + // which on the client is hydrated from URL params. SSR doesn't see the URL, + // so deferring the conditional until after mount keeps the initial DOM + // identical between server and client (avoids hydration warnings). 
+ const [mounted, setMounted] = useState(false); + useEffect(() => setMounted(true), []); + const { selectedModel, setSelectedModel, @@ -87,6 +97,8 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro setSelectedPrecisions, selectedYAxisMetric, setSelectedYAxisMetric, + selectedPercentile, + setSelectedPercentile, graphs, selectedGPUs, setSelectedGPUs, @@ -203,12 +215,19 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro availableModels={availableModels} data-testid="model-selector" /> - + {mounted && selectedSequence === Sequence.AgenticTraces && ( + setSelectedPercentile(p)} + data-testid="percentile-selector" + /> + )} ('.dot-group').each(function (d) { + const onFrontier = optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`); + const showHalo = onFrontier && d.offload_mode === 'on'; + d3.select(this) + .selectAll('.offload-halo') + .data(showHalo ? [true] : []) + .join('circle') + .attr('class', 'offload-halo') + .attr('r', POINT_SIZE + 4) + .attr('fill', 'none') + .attr('stroke', 'var(--foreground)') + .attr('stroke-width', 1.5) + .attr('stroke-dasharray', '3 2') + .attr('opacity', 0.9) + .attr('pointer-events', 'none'); + }); + // Double-click to track/untrack zoomGroup .selectAll('.dot-group') @@ -1567,6 +1585,9 @@ const ScatterGraph = React.memo( chartDefinition.chartType, xScaleConfig._isLog, yScaleConfig.type, + optimalPointKeys, + getCssColor, + resolveColor, ], ); diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts index e88e9930..7391225e 100644 --- a/packages/app/src/components/inference/utils/tooltipUtils.ts +++ b/packages/app/src/components/inference/utils/tooltipUtils.ts @@ -88,6 +88,51 @@ const runLinkHTML = (runUrl?: string) => const tooltipLine = (label: string, value: string | number) => `
${label}: ${value}
`; +const formatPct = (v: number | undefined): string | null => + v === undefined || v === null || Number.isNaN(v) ? null : `${(v * 100).toFixed(1)}%`; + +/** + * Agentic-only tooltip rows: offload mode, KV cache hit rates, request + * success, token totals. Returns an empty string for non-agentic rows. + */ +const generateAgenticHTML = (d: InferenceData): string => { + if (d.benchmark_type !== 'agentic_traces') return ''; + + const parts: string[] = []; + if (d.offload_mode) { + parts.push(tooltipLine('Offload Mode', d.offload_mode.toUpperCase())); + } + + const gpuHit = formatPct(d.server_gpu_cache_hit_rate); + const cpuHit = formatPct(d.server_cpu_cache_hit_rate); + const theoHit = formatPct(d.theoretical_cache_hit_rate); + if (gpuHit) parts.push(tooltipLine('GPU Cache Hit Rate', gpuHit)); + if (cpuHit) parts.push(tooltipLine('CPU Cache Hit Rate', cpuHit)); + if (theoHit) parts.push(tooltipLine('Theoretical Cache Hit Rate', theoHit)); + + if (d.num_requests_total !== undefined && d.num_requests_successful !== undefined) { + const successPct = + d.num_requests_total > 0 + ? ` (${((d.num_requests_successful / d.num_requests_total) * 100).toFixed(0)}%)` + : ''; + parts.push( + tooltipLine( + 'Requests', + `${d.num_requests_successful} / ${d.num_requests_total}${successPct}`, + ), + ); + } + + if (d.total_prompt_tokens !== undefined) { + parts.push(tooltipLine('Prompt Tokens', formatNumber(d.total_prompt_tokens))); + } + if (d.total_generation_tokens !== undefined) { + parts.push(tooltipLine('Generated Tokens', formatNumber(d.total_generation_tokens))); + } + + return parts.join(''); +}; + /** * Generates HTML for the parallelism configuration section of a tooltip. * Falls back to GPU count for old data without parallelism fields. @@ -177,9 +222,10 @@ export const generateTooltipContent = (config: TooltipConfig): string => {
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)} ${runLinkHTML(runUrl)} ${ isPinned @@ -231,9 +277,10 @@ export const generateOverlayTooltipContent = (config: OverlayTooltipConfig): str
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)}
`; }; @@ -292,9 +339,10 @@ export const generateGPUGraphTooltipContent = (config: TooltipConfig): string =>
Concurrency: ${d.conc}
-
+
Precision: ${d.precision.toUpperCase()}
+ ${generateAgenticHTML(d)} ${runLinkHTML(runUrl)}
`; diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx index 75e2f257..1c843e12 100644 --- a/packages/app/src/components/ui/chart-selectors.tsx +++ b/packages/app/src/components/ui/chart-selectors.tsx @@ -19,12 +19,16 @@ import { type Model, type Precision, type Sequence, + type Percentile, + PERCENTILE_OPTIONS, getModelCategory, getModelLabel, + getPercentileLabel, getPrecisionLabel, getSequenceCategory, getSequenceLabel, groupByCategory, + sequenceKind, } from '@/lib/data-mappings'; function DeprecatedLabel({ reason }: { reason: string }) { @@ -167,6 +171,126 @@ export function SequenceSelector({ ); } +interface ScenarioSelectorProps { + id?: string; + value: string; + onChange: (value: Sequence) => void; + availableSequences: string[]; + 'data-testid'?: string; +} + +/** + * Scenario selector — fixed-seq-len rows grouped under "Fixed Sequence Length", + * agentic-trace rows rendered flat below. Label is "Scenario" (the ISL/OSL + * framing only applies to the fixed-seq subset). + */ +export function ScenarioSelector({ + id = 'scenario-select', + value, + onChange, + availableSequences, + 'data-testid': testId, +}: ScenarioSelectorProps) { + const fixedSeq = availableSequences.filter((s) => sequenceKind(s as Sequence) === 'fixed-seq'); + const agentic = availableSequences.filter((s) => sequenceKind(s as Sequence) === 'agentic'); + const fixedGroups = groupByCategory(fixedSeq, (s) => getSequenceCategory(s as Sequence)); + + return ( +
+ + +
+ ); +} + +interface PercentileSelectorProps { + id?: string; + value: string; + onChange: (value: Percentile) => void; + 'data-testid'?: string; +} + +/** + * Latency percentile selector for agentic-trace charts. The selected value + * rewrites the chart x-axis metric from `median_*` to `{percentile}_*`, so + * picking p99 plots p99 e2e latency / interactivity instead of the median. + */ +export function PercentileSelector({ + id = 'percentile-select', + value, + onChange, + 'data-testid': testId, +}: PercentileSelectorProps) { + return ( +
+ + +
+ ); +} + interface PrecisionSelectorProps { id?: string; value: string[]; diff --git a/packages/app/src/components/unofficial-run-provider.test.ts b/packages/app/src/components/unofficial-run-provider.test.ts index f4263d2c..05b522c5 100644 --- a/packages/app/src/components/unofficial-run-provider.test.ts +++ b/packages/app/src/components/unofficial-run-provider.test.ts @@ -29,6 +29,8 @@ function stubRow(overrides: Partial = {}): BenchmarkRow { decode_num_workers: 0, num_prefill_gpu: 8, num_decode_gpu: 8, + benchmark_type: 'single_turn', + offload_mode: 'off', isl: 1024, osl: 1024, conc: 128, diff --git a/packages/app/src/components/unofficial-run-provider.tsx b/packages/app/src/components/unofficial-run-provider.tsx index 2dccdf7f..42530a51 100644 --- a/packages/app/src/components/unofficial-run-provider.tsx +++ b/packages/app/src/components/unofficial-run-provider.tsx @@ -12,7 +12,7 @@ import { import type { ChartDefinition, HardwareConfig, InferenceData } from '@/components/inference/types'; import { UnofficialBanner } from '@/components/ui/unofficial-banner'; -import { DB_MODEL_TO_DISPLAY, islOslToSequence } from '@semianalysisai/inferencex-constants'; +import { DB_MODEL_TO_DISPLAY, rowToSequence } from '@semianalysisai/inferencex-constants'; import { computeToggle } from '@/hooks/useTogglableSet'; import type { BenchmarkRow, EvalRow } from '@/lib/api'; import { normalizeEvalHardwareKey } from '@/lib/chart-utils'; @@ -93,7 +93,7 @@ export function buildChartData(benchmarks: BenchmarkRow[]): UnofficialChartData const groups = new Map(); for (const row of benchmarks) { const displayModel = DB_MODEL_TO_DISPLAY[row.model] ?? 
row.model; - const sequence = islOslToSequence(row.isl, row.osl); + const sequence = rowToSequence(row); if (!sequence) continue; const key = `${displayModel}_${sequence}`; if (!groups.has(key)) groups.set(key, []); diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts index 11ba4521..240251c3 100644 --- a/packages/app/src/lib/api.ts +++ b/packages/app/src/lib/api.ts @@ -23,9 +23,13 @@ export interface BenchmarkRow { decode_num_workers: number; num_prefill_gpu: number; num_decode_gpu: number; - isl: number; - osl: number; + benchmark_type: string; + // Null for agentic_traces rows; numeric for single_turn fixed-seq rows. + isl: number | null; + osl: number | null; conc: number; + /** KV-cache offload mode: 'on' | 'off'. Defaults to 'off' for fixed-seq. */ + offload_mode: string; image: string | null; metrics: Record; date: string; @@ -140,13 +144,15 @@ export function fetchWorkflowInfo(date: string, signal?: AbortSignal) { export interface AvailabilityRow { model: string; - isl: number; - osl: number; + // Null for agentic_traces rows; numeric for single_turn fixed-seq rows. 
+ isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; spec_method: string; disagg: boolean; + benchmark_type: string; date: string; } diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts index be76438e..6a6c97c8 100644 --- a/packages/app/src/lib/benchmark-transform.test.ts +++ b/packages/app/src/lib/benchmark-transform.test.ts @@ -23,6 +23,8 @@ function makeRow(overrides: Partial = {}): BenchmarkRow { decode_num_workers: 0, num_prefill_gpu: 8, num_decode_gpu: 8, + benchmark_type: 'single_turn', + offload_mode: 'off', isl: 1024, osl: 1024, conc: 64, diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts index 107f0b12..69745da2 100644 --- a/packages/app/src/lib/benchmark-transform.ts +++ b/packages/app/src/lib/benchmark-transform.ts @@ -15,9 +15,39 @@ import { createChartDataPoint, getHardwareKey } from '@/lib/chart-utils'; import { getHardwareConfig } from '@/lib/constants'; import type { BenchmarkRow } from '@/lib/api'; +/** + * Agentic trace-replay runs (`benchmark_type === 'agentic_traces'`) emit ttft/ttlt/itl + * but not the intvty/e2el/tpot keys the chart pipeline expects. Bridge them here: + * e2el ≡ ttlt (time-to-last-token == end-to-end latency) + * tpot ≡ itl (time-per-output-token == inter-token-latency for single-output) + * intvty ≡ 1/itl (tok/s from the user's perspective) + * Existing fields win if present; we only fill in the gaps. 
+ */ +function agenticAliases(m: Record): Record { + const out: Record = {}; + for (const suffix of ['mean', 'median', 'p90', 'p99']) { + const itl = m[`${suffix}_itl`]; + const ttlt = m[`${suffix}_ttlt`]; + if (m[`${suffix}_e2el`] === undefined && ttlt !== undefined) out[`${suffix}_e2el`] = ttlt; + if (m[`${suffix}_tpot`] === undefined && itl !== undefined) out[`${suffix}_tpot`] = itl; + if (m[`${suffix}_intvty`] === undefined && itl !== undefined && itl > 0) { + out[`${suffix}_intvty`] = 1 / itl; + } + } + return out; +} + /** Convert a DB benchmark row to an AggDataEntry. */ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry { - const m = row.metrics; + const isAgentic = row.benchmark_type === 'agentic_traces'; + const m = isAgentic ? { ...row.metrics, ...agenticAliases(row.metrics) } : row.metrics; + // Prefer the dedicated column (added in migration 004); fall back to the + // legacy stash inside `metrics` for any rows ingested before that column + // existed. + const rawMetrics = row.metrics as Record; + const offloadMode = + row.offload_mode ?? + (typeof rawMetrics.offload_mode === 'string' ? rawMetrics.offload_mode : undefined); return { hw: row.hardware, framework: row.framework, @@ -68,6 +98,17 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry { date: row.date, actualDate: (row as any).actualDate ?? row.date, run_url: row.run_url ?? 
undefined, + benchmark_type: row.benchmark_type, + isl: row.isl, + osl: row.osl, + offload_mode: offloadMode, + server_gpu_cache_hit_rate: m.server_gpu_cache_hit_rate, + server_cpu_cache_hit_rate: m.server_cpu_cache_hit_rate, + theoretical_cache_hit_rate: m.theoretical_cache_hit_rate, + num_requests_total: m.num_requests_total, + num_requests_successful: m.num_requests_successful, + total_prompt_tokens: m.total_prompt_tokens, + total_generation_tokens: m.total_generation_tokens, }; } @@ -77,13 +118,30 @@ interface PreparedEntry { date: string; } +/** + * Rewrite a chart x-axis key to use a different latency percentile prefix + * (`median_` → `p99_` etc). Only touches keys that start with a known + * percentile prefix; leaves everything else alone. + */ +export function withPercentile(key: string, percentile: string): string { + return key.replace(/^(mean|median|p90|p99|p99\.9)_/, `${percentile}_`); +} + /** * Transform raw BenchmarkRow[] into chart-ready InferenceData[][] and HardwareConfig. * Returns one InferenceData[] per chart definition (e2e, interactivity). * * Converts rows to AggDataEntry once, then reuses for each chart definition. + * + * @param percentile Optional latency percentile for the chart x-axis + * (default 'median'). Swaps `median_intvty`/`median_e2el` in the chart + * definition for the chosen percentile — only agentic rows carry the + * full set (median/p90/p99/p99.9) so this mainly affects that scenario. 
*/ -export function transformBenchmarkRows(rows: BenchmarkRow[]): { +export function transformBenchmarkRows( + rows: BenchmarkRow[], + percentile = 'median', +): { chartData: InferenceData[][]; hardwareConfig: HardwareConfig; } { @@ -109,13 +167,14 @@ export function transformBenchmarkRows(rows: BenchmarkRow[]): { // Phase 2: Build chart data per chart definition (reusing prepared entries) const chartData = (chartDefinitions as ChartDefinition[]).map((chartDef) => { + const xKey = withPercentile(chartDef.x, percentile); const groupedByHw: Record = {}; for (const { entry, hwKey, date } of prepared) { const dataPoint = createChartDataPoint( date, entry, - chartDef.x as keyof AggDataEntry, + xKey as keyof AggDataEntry, chartDef.y as keyof AggDataEntry, hwKey, ); diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts index 823b6823..8900f50e 100644 --- a/packages/app/src/lib/data-mappings.ts +++ b/packages/app/src/lib/data-mappings.ts @@ -102,17 +102,77 @@ export enum Sequence { OneK_OneK = '1k/1k', OneK_EightK = '1k/8k', EightK_OneK = '8k/1k', + AgenticTraces = 'agentic-traces', } -const SEQUENCE_CONFIG: Record = - { - [Sequence.OneK_OneK]: { label: '1K / 1K', compact: '1k1k', category: 'default' }, - [Sequence.OneK_EightK]: { label: '1K / 8K', compact: '1k8k', category: 'deprecated' }, - [Sequence.EightK_OneK]: { label: '8K / 1K', compact: '8k1k', category: 'default' }, - }; +/** + * Top-level scenario kind. Fixed-seq sequences cluster under a single group + * in the selector; agentic traces sit alongside as their own kind. + */ +export type ScenarioKind = 'fixed-seq' | 'agentic'; + +export function sequenceKind(seq: Sequence): ScenarioKind { + return seq === Sequence.AgenticTraces ? 
'agentic' : 'fixed-seq'; +} + +const SEQUENCE_CONFIG: Record< + Sequence, + { label: string; compact: string; category: CategoryTag; kind: ScenarioKind } +> = { + [Sequence.OneK_OneK]: { + label: '1K / 1K', + compact: '1k1k', + category: 'default', + kind: 'fixed-seq', + }, + [Sequence.OneK_EightK]: { + label: '1K / 8K', + compact: '1k8k', + category: 'deprecated', + kind: 'fixed-seq', + }, + [Sequence.EightK_OneK]: { + label: '8K / 1K', + compact: '8k1k', + category: 'default', + kind: 'fixed-seq', + }, + [Sequence.AgenticTraces]: { + label: 'Agentic Traces', + compact: 'agentic', + category: 'default', + kind: 'agentic', + }, +}; export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[]; +/** + * Percentile of the latency distribution used for the chart x-axis when + * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants + * for ttft, ttlt (=e2el), and itl (and intvty derived from itl) — pick which + * slice to plot. + */ +export enum Percentile { + Median = 'median', + P90 = 'p90', + P99 = 'p99', + P99_9 = 'p99.9', +} + +const PERCENTILE_CONFIG: Record = { + [Percentile.Median]: { label: 'p50 (median)' }, + [Percentile.P90]: { label: 'p90' }, + [Percentile.P99]: { label: 'p99' }, + [Percentile.P99_9]: { label: 'p99.9' }, +}; + +export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[]; + +export function getPercentileLabel(p: Percentile): string { + return PERCENTILE_CONFIG[p]?.label ?? 
p; +} + export const DEPRECATED_SEQUENCES: ReadonlySet = new Set( (Object.entries(SEQUENCE_CONFIG) as [Sequence, (typeof SEQUENCE_CONFIG)[Sequence]][]) .filter(([, c]) => c.category === 'deprecated') diff --git a/packages/app/src/lib/url-state.ts b/packages/app/src/lib/url-state.ts index 3947488f..fb2e9d70 100644 --- a/packages/app/src/lib/url-state.ts +++ b/packages/app/src/lib/url-state.ts @@ -22,6 +22,7 @@ const URL_STATE_KEYS = [ 'i_seq', 'i_prec', 'i_metric', + 'i_pctl', 'i_xmetric', 'i_e2e_xmetric', 'i_scale', @@ -61,6 +62,7 @@ export const PARAM_DEFAULTS: Record = { i_seq: '8k/1k', i_prec: 'fp4', i_metric: 'y_tpPerGpu', + i_pctl: 'median', i_xmetric: 'p99_ttft', i_e2e_xmetric: '', i_scale: 'auto', diff --git a/packages/constants/src/models.ts b/packages/constants/src/models.ts index 6d646f08..d9a3d2d1 100644 --- a/packages/constants/src/models.ts +++ b/packages/constants/src/models.ts @@ -53,3 +53,20 @@ export function islOslToSequence(isl: number, osl: number): string | null { }; return map[`${isl}_${osl}`] ?? null; } + +/** + * Map a benchmark/availability row to its sequence (scenario) string. + * - `agentic_traces` rows map to `'agentic-traces'` regardless of isl/osl. + * - Other rows (today: `single_turn`) fall back to `islOslToSequence`. + * Returns `null` for rows that can't be classified (e.g. `single_turn` with + * unmapped isl/osl values). + */ +export function rowToSequence(row: { + isl: number | null; + osl: number | null; + benchmark_type: string; +}): string | null { + if (row.benchmark_type === 'agentic_traces') return 'agentic-traces'; + if (row.isl === null || row.osl === null) return null; + return islOslToSequence(row.isl, row.osl); +} diff --git a/packages/db/migrations/002_agentic_scenario.sql b/packages/db/migrations/002_agentic_scenario.sql new file mode 100644 index 00000000..c143914e --- /dev/null +++ b/packages/db/migrations/002_agentic_scenario.sql @@ -0,0 +1,30 @@ +-- Support agentic scenarios in benchmark_results. 
+-- +-- Scenarios are discriminated by benchmark_type: +-- 'single_turn' — fixed-seq-len runs (1k1k, 1k8k, 8k1k, …). isl/osl set. +-- 'agentic_traces' — trace-replay agentic runs. isl/osl NULL. +-- +-- conc retains its meaning (concurrent users/requests) for both. + +-- 1) isl/osl become nullable for agentic rows +alter table benchmark_results + alter column isl drop not null, + alter column osl drop not null; + +-- 2) CHECK constraints: positive-or-null +alter table benchmark_results + drop constraint benchmark_results_isl_positive, + drop constraint benchmark_results_osl_positive; + +alter table benchmark_results + add constraint benchmark_results_isl_positive check (isl is null or isl > 0), + add constraint benchmark_results_osl_positive check (osl is null or osl > 0); + +-- 3) Uniqueness must treat (NULL, NULL) pairs as equal so agentic rows +-- can't duplicate on (workflow_run_id, config_id, benchmark_type, conc). +alter table benchmark_results + drop constraint benchmark_results_unique; + +alter table benchmark_results + add constraint benchmark_results_unique unique nulls not distinct + (workflow_run_id, config_id, benchmark_type, isl, osl, conc); diff --git a/packages/db/migrations/003_agentic_availability.sql b/packages/db/migrations/003_agentic_availability.sql new file mode 100644 index 00000000..e96cbd50 --- /dev/null +++ b/packages/db/migrations/003_agentic_availability.sql @@ -0,0 +1,21 @@ +-- Extend the availability table to cover agentic scenarios. +-- +-- The 002 migration relaxed benchmark_results.isl/osl to nullable; do the same +-- for availability and add benchmark_type so the frontend can enumerate +-- agentic vs single_turn scenarios per model/date. +-- +-- Postgres primary keys require every column to be NOT NULL, so we drop the PK +-- and replace it with a UNIQUE NULLS NOT DISTINCT constraint — functionally +-- equivalent except it allows isl/osl to be NULL for agentic rows. 
+ +alter table availability + drop constraint availability_pkey; + +alter table availability + alter column isl drop not null, + alter column osl drop not null, + add column benchmark_type text not null default 'single_turn'; + +alter table availability + add constraint availability_natural_key unique nulls not distinct + (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date); diff --git a/packages/db/migrations/004_offload_mode.sql b/packages/db/migrations/004_offload_mode.sql new file mode 100644 index 00000000..24b617f1 --- /dev/null +++ b/packages/db/migrations/004_offload_mode.sql @@ -0,0 +1,42 @@ +-- Add offload_mode as a first-class dimension on benchmark_results. +-- +-- KV-cache offload (on/off) is a meaningful sweep dimension for agentic-trace +-- runs: a single run may emit two rows for the same (config, isl, osl, conc) +-- — one with offload disabled, one enabled. The pre-existing unique key +-- collapsed those into one row, forcing the ingest to skip variants. +-- +-- For fixed-seq runs `offload_mode` defaults to 'off', which matches the +-- assumption baked into the existing 5,500+ rows. + +alter table benchmark_results + add column offload_mode text not null default 'off'; + +-- Backfill agentic rows from the offload_mode value already living in metrics +-- JSONB (set during the earlier agentic ingest backfill). +update benchmark_results + set offload_mode = metrics->>'offload_mode' + where benchmark_type = 'agentic_traces' + and metrics ? 'offload_mode'; + +-- Replace the unique constraint so on/off variants can coexist. +alter table benchmark_results + drop constraint benchmark_results_unique; + +alter table benchmark_results + add constraint benchmark_results_unique unique nulls not distinct + (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode); + +-- Rebuild the latest-per-config materialized view to dedupe by offload_mode too. 
+drop materialized view if exists latest_benchmarks cascade; + +create materialized view latest_benchmarks as +select distinct on (br.config_id, br.conc, br.isl, br.osl, br.offload_mode) + br.* +from benchmark_results br +join latest_workflow_runs wr on wr.id = br.workflow_run_id +where br.error is null +order by br.config_id, br.conc, br.isl, br.osl, br.offload_mode, br.date desc; + +create unique index latest_benchmarks_pk + on latest_benchmarks (config_id, conc, isl, osl, offload_mode) nulls not distinct; +create index latest_benchmarks_model_idx on latest_benchmarks (config_id); diff --git a/packages/db/src/etl/benchmark-ingest.ts b/packages/db/src/etl/benchmark-ingest.ts index 67173c64..ea802d3f 100644 --- a/packages/db/src/etl/benchmark-ingest.ts +++ b/packages/db/src/etl/benchmark-ingest.ts @@ -29,12 +29,19 @@ export async function bulkIngestBenchmarkRows( // Postgres rejects ON CONFLICT DO UPDATE if the same conflict key appears // more than once in a single batch. Deduplicate within the batch, keeping - // the last occurrence (last metrics for each unique config/isl/osl/conc). + // the last occurrence (last metrics for each unique config/benchmark_type/isl/osl/conc/offload_mode). const seen = new Map(); - for (const r of rows) seen.set(`${r.configId}-${r.isl}-${r.osl}-${r.conc}`, r); + for (const r of rows) { + seen.set( + `${r.configId}-${r.benchmarkType}-${r.isl ?? ''}-${r.osl ?? 
''}-${r.conc}-${r.offloadMode}`, + r, + ); + } const deduped = [...seen.values()]; const configIds = deduped.map((r) => r.configId); + const benchmarkTypes = deduped.map((r) => r.benchmarkType); + const offloadModes = deduped.map((r) => r.offloadMode); const isls = deduped.map((r) => r.isl); const osls = deduped.map((r) => r.osl); const concs = deduped.map((r) => r.conc); @@ -43,20 +50,21 @@ export async function bulkIngestBenchmarkRows( const result = await sql<{ inserted: boolean; id: number }[]>` insert into benchmark_results ( - workflow_run_id, config_id, benchmark_type, date, + workflow_run_id, config_id, benchmark_type, offload_mode, date, isl, osl, conc, image, metrics ) select ${workflowRunId}, unnest(${sql.array(configIds)}::int[]), - 'single_turn', + unnest(${sql.array(benchmarkTypes)}::text[]), + unnest(${sql.array(offloadModes)}::text[]), ${date}::date, unnest(${sql.array(isls)}::int[]), unnest(${sql.array(osls)}::int[]), unnest(${sql.array(concs)}::int[]), unnest(${sql.array(images)}), unnest(${sql.array(metricsJsons)}::jsonb[]) - on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc) + on conflict (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode) do update set metrics = excluded.metrics, image = excluded.image @@ -147,13 +155,14 @@ export async function bulkUpsertAvailability( sql: Sql, rows: { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; specMethod: string; disagg: boolean; + benchmarkType: string; }[], date: string, ): Promise { @@ -162,7 +171,7 @@ export async function bulkUpsertAvailability( const seen = new Set(); const unique: typeof rows = []; for (const r of rows) { - const key = `${r.model}|${r.isl}|${r.osl}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${date}`; + const key = `${r.model}|${r.isl ?? ''}|${r.osl ?? 
''}|${r.precision}|${r.hardware}|${r.framework}|${r.specMethod}|${r.disagg}|${r.benchmarkType}|${date}`; if (!seen.has(key)) { seen.add(key); unique.push(r); @@ -170,7 +179,7 @@ export async function bulkUpsertAvailability( } await sql` - insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, date) + insert into availability (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date) select unnest(${sql.array(unique.map((r) => r.model))}::text[]), unnest(${sql.array(unique.map((r) => r.isl))}::int[]), @@ -180,6 +189,7 @@ export async function bulkUpsertAvailability( unnest(${sql.array(unique.map((r) => r.framework))}::text[]), unnest(${sql.array(unique.map((r) => r.specMethod))}::text[]), unnest(${sql.array(unique.map((r) => r.disagg))}::bool[]), + unnest(${sql.array(unique.map((r) => r.benchmarkType))}::text[]), ${date}::date on conflict do nothing `; diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts index 7d78e175..5b120843 100644 --- a/packages/db/src/etl/benchmark-mapper.ts +++ b/packages/db/src/etl/benchmark-mapper.ts @@ -57,8 +57,21 @@ const NON_METRIC_KEYS = new Set([ 'decode_num_workers', 'num_prefill_gpu', 'num_decode_gpu', + // agentic scenario + 'scenario_type', + 'users', + 'offload_mode', + 'num_requests_total', + 'num_requests_successful', ]); +/** + * `benchmark_type` values understood by the ingest. + * - `single_turn` — fixed sequence-length runs (isl/osl set). + * - `agentic_traces` — trace-replay agentic runs (isl/osl null, `users` → conc). + */ +export type BenchmarkType = 'single_turn' | 'agentic_traces'; + /** * METRIC_KEYS from constants is the canonical set of known metric keys. 
* Any numeric field outside this set and `NON_METRIC_KEYS` is auto-captured @@ -70,9 +83,13 @@ const _warnedMetricKeys = new Set(); export interface BenchmarkParams { config: ConfigParams; - isl: number; - osl: number; + benchmarkType: BenchmarkType; + // Null for agentic_traces; present for single_turn. + isl: number | null; + osl: number | null; conc: number; + /** 'on' | 'off' — KV cache offload to CPU. Defaults to 'off'. */ + offloadMode: string; image: string | null; metrics: Record; } @@ -114,10 +131,15 @@ export function mapBenchmarkRow( return null; } - const isl = parseInt2(row.isl) ?? islOslFallback?.isl; - const osl = parseInt2(row.osl) ?? islOslFallback?.osl; - const conc = parseInt2(row.conc); - if (!isl || !osl || !conc) { + // Agentic-trace runs emit `scenario_type: 'agentic-coding'` (and variants), + // no isl/osl, and `users` instead of `conc`. Everything else stays as-is. + const isAgentic = String(row.scenario_type ?? '').startsWith('agentic'); + const benchmarkType: BenchmarkType = isAgentic ? 'agentic_traces' : 'single_turn'; + + const isl = isAgentic ? null : (parseInt2(row.isl) ?? islOslFallback?.isl ?? null); + const osl = isAgentic ? null : (parseInt2(row.osl) ?? islOslFallback?.osl ?? null); + const conc = isAgentic ? parseInt2(row.users) : parseInt2(row.conc); + if (!conc || (!isAgentic && (!isl || !osl))) { tracker.skips.noIslOsl++; return null; } @@ -182,6 +204,12 @@ export function mapBenchmarkRow( } } + // Agentic rows emit `offload_mode: "on" | "off"` as a string — preserve it + // as a stringified metric so the frontend can expose it in tooltips. + if (isAgentic && typeof row.offload_mode === 'string') { + (metrics as Record).offload_mode = row.offload_mode; + } + // Artifact names encode '/' as '#' to avoid path separators; restore the URI. const image = row.image ? 
String(row.image).replaceAll('#', '/') : null; @@ -205,9 +233,14 @@ export function mapBenchmarkRow( numPrefillGpu, numDecodeGpu, }, + benchmarkType, isl, osl, conc, + offloadMode: + typeof row.offload_mode === 'string' && row.offload_mode.length > 0 + ? row.offload_mode + : 'off', image, metrics, }; diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts index 14c7b4d0..8cce43ca 100644 --- a/packages/db/src/ingest-ci-run.ts +++ b/packages/db/src/ingest-ci-run.ts @@ -248,13 +248,14 @@ async function main(): Promise { const availRows: { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; specMethod: string; disagg: boolean; + benchmarkType: string; }[] = []; let totalNewBmk = 0, @@ -367,6 +368,7 @@ async function main(): Promise { framework: r.config.framework, specMethod: r.config.specMethod, disagg: r.config.disagg, + benchmarkType: r.benchmarkType, }); } diff --git a/packages/db/src/ingest-gcs-backup.ts b/packages/db/src/ingest-gcs-backup.ts index e20278d6..6dc604e9 100644 --- a/packages/db/src/ingest-gcs-backup.ts +++ b/packages/db/src/ingest-gcs-backup.ts @@ -596,13 +596,14 @@ async function main(): Promise { // Upsert availability rows only for successfully resolved configs const availRows: { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; specMethod: string; disagg: boolean; + benchmarkType: string; }[] = []; for (const r of allInserted) { availRows.push({ @@ -614,6 +615,7 @@ async function main(): Promise { framework: r.config.framework, specMethod: r.config.specMethod, disagg: r.config.disagg, + benchmarkType: r.benchmarkType, }); } if (availRows.length > 0) { diff --git a/packages/db/src/ingest-supplemental.ts b/packages/db/src/ingest-supplemental.ts index 1e494e9f..43aae047 100644 --- a/packages/db/src/ingest-supplemental.ts +++ 
b/packages/db/src/ingest-supplemental.ts @@ -219,8 +219,10 @@ async function ingestSupplementalBmk( const rows: { configId: number; - isl: number; - osl: number; + benchmarkType: 'single_turn' | 'agentic_traces'; + offloadMode: string; + isl: number | null; + osl: number | null; conc: number; image: string | null; metrics: Record; @@ -271,6 +273,8 @@ async function ingestSupplementalBmk( rows.push({ configId, + benchmarkType: 'single_turn', + offloadMode: 'off', isl: entry.isl, osl: entry.osl, conc: entry.conc, @@ -294,13 +298,14 @@ async function ingestSupplementalBmk( // to `rows` are exactly the valid ones. const availRows: { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; specMethod: string; disagg: boolean; + benchmarkType: string; }[] = []; for (const entry of entries) { const modelKey = resolveModelKey({ model: entry.model, infmax_model_prefix: undefined }); @@ -317,6 +322,7 @@ async function ingestSupplementalBmk( framework, specMethod, disagg, + benchmarkType: 'single_turn', }); } if (availRows.length > 0) { diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts index 0d9373d3..f09a2686 100644 --- a/packages/db/src/json-provider.ts +++ b/packages/db/src/json-provider.ts @@ -290,6 +290,8 @@ function toBenchmarkRow( decode_num_workers: c.decode_num_workers, num_prefill_gpu: c.num_prefill_gpu, num_decode_gpu: c.num_decode_gpu, + benchmark_type: br.benchmark_type ?? 'single_turn', + offload_mode: (br as { offload_mode?: string }).offload_mode ?? 
'off', isl: br.isl, osl: br.osl, conc: br.conc, @@ -410,7 +412,11 @@ export function getAvailabilityData(): AvailabilityRow[] { for (const a of s.availability) { const key = `${a.model}|${a.hardware}|${a.framework}|${a.precision}|${a.isl}|${a.osl}|${toDateString(a.date)}`; if (validKeys.has(key)) { - rows.push({ ...a, date: toDateString(a.date) }); + rows.push({ + ...a, + benchmark_type: (a as { benchmark_type?: string }).benchmark_type ?? 'single_turn', + date: toDateString(a.date), + }); } } diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts index 1c30b1fd..74e20380 100644 --- a/packages/db/src/queries/benchmarks.ts +++ b/packages/db/src/queries/benchmarks.ts @@ -18,9 +18,13 @@ export interface BenchmarkRow { decode_num_workers: number; num_prefill_gpu: number; num_decode_gpu: number; - isl: number; - osl: number; + benchmark_type: string; + // Null for agentic_traces; numeric for single_turn fixed-seq runs. + isl: number | null; + osl: number | null; conc: number; + /** KV-cache offload mode: 'on' | 'off'. Defaults to 'off' for fixed-seq. 
*/ + offload_mode: string; image: string | null; metrics: Record; date: string; @@ -68,6 +72,8 @@ export async function getLatestBenchmarks( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + br.benchmark_type, + br.offload_mode, br.isl, br.osl, br.conc, @@ -106,6 +112,8 @@ export async function getLatestBenchmarks( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + lb.benchmark_type, + lb.offload_mode, lb.isl, lb.osl, lb.conc, @@ -153,6 +161,7 @@ export async function getAllBenchmarksForHistory( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + br.benchmark_type, br.isl, br.osl, br.conc, diff --git a/packages/db/src/queries/workflow-info.ts b/packages/db/src/queries/workflow-info.ts index b4e4f255..d5e2d933 100644 --- a/packages/db/src/queries/workflow-info.ts +++ b/packages/db/src/queries/workflow-info.ts @@ -88,20 +88,22 @@ export async function getDateConfigs(sql: DbClient, date: string): Promise { const rows = await sql` - SELECT a.model, a.isl, a.osl, a.precision, a.hardware, a.framework, a.spec_method, a.disagg, a.date::text + SELECT a.model, a.isl, a.osl, a.precision, a.hardware, a.framework, a.spec_method, a.disagg, a.benchmark_type, a.date::text FROM availability a WHERE EXISTS ( SELECT 1 @@ -112,8 +114,9 @@ export async function getAvailabilityData(sql: DbClient): Promise Date: Thu, 30 Apr 2026 19:01:56 -0500 Subject: [PATCH 02/96] =?UTF-8?q?fix:=20agentic=20offload=20variants=20?= =?UTF-8?q?=E2=80=94=20render=20both=20halos=20+=20map=20renamed=20fields?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ScatterGraph: include `offload_mode` in `buildPointConfigId` so d3's data join keeps both `on` and `off` variants for the same (config, conc). Without it, the second variant collapsed onto the first key, so FP8 offload-on points (and their halos) silently disappeared. 
- benchmark-mapper: handle older artifacts that emit `users`/`offload_mode` AND newer ones that emit `conc`/`offloading` (with 'none' → 'off' mapping). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../components/inference/ui/ScatterGraph.tsx | 4 +++ packages/db/src/etl/benchmark-mapper.ts | 27 ++++++++++++------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index 15bb60f0..55a206ce 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -295,6 +295,10 @@ const ScatterGraph = React.memo( const buildPointConfigId = useCallback((point: InferenceData): string => { let key = `${point.hwKey}|${point.precision}|${point.tp}|${point.conc}|${point.decode_ep ?? 0}|${point.prefill_tp ?? 0}|${point.prefill_ep ?? 0}`; if (point.disagg) key += `|disagg|${point.num_prefill_gpu ?? 0}|${point.num_decode_gpu ?? 0}`; + // Agentic runs emit two rows per (config, conc) — one offload=on, one off. + // Without this suffix, d3's data join treats them as the same point and + // drops one variant (along with its halo). + if (point.offload_mode) key += `|offload-${point.offload_mode}`; return key; }, []); diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts index 5b120843..d842276e 100644 --- a/packages/db/src/etl/benchmark-mapper.ts +++ b/packages/db/src/etl/benchmark-mapper.ts @@ -138,12 +138,24 @@ export function mapBenchmarkRow( const isl = isAgentic ? null : (parseInt2(row.isl) ?? islOslFallback?.isl ?? null); const osl = isAgentic ? null : (parseInt2(row.osl) ?? islOslFallback?.osl ?? null); - const conc = isAgentic ? parseInt2(row.users) : parseInt2(row.conc); + // Agentic artifacts encode concurrency as `users` in older schemas and `conc` in newer ones. + const conc = isAgentic ? (parseInt2(row.users) ?? 
parseInt2(row.conc)) : parseInt2(row.conc); if (!conc || (!isAgentic && (!isl || !osl))) { tracker.skips.noIslOsl++; return null; } + // Agentic offload signal: prefer `offload_mode` ('on'|'off'), fall back to `offloading` + // ('none' → 'off'; any other non-empty value → 'on'). + const offloadModeRaw = + typeof row.offload_mode === 'string' && row.offload_mode.length > 0 + ? row.offload_mode + : typeof row.offloading === 'string' && row.offloading.length > 0 + ? row.offloading === 'none' + ? 'off' + : 'on' + : 'off'; + const { framework, disagg } = normalizeFramework(String(row.framework ?? ''), row.disagg); const isMultinode = parseBool(row.is_multinode); const precision = normalizePrecision(String(row.precision ?? '')); @@ -204,10 +216,10 @@ export function mapBenchmarkRow( } } - // Agentic rows emit `offload_mode: "on" | "off"` as a string — preserve it - // as a stringified metric so the frontend can expose it in tooltips. - if (isAgentic && typeof row.offload_mode === 'string') { - (metrics as Record).offload_mode = row.offload_mode; + // Agentic rows emit `offload_mode: "on" | "off"` (or older `offloading: "none"|...`) + // — preserve as a stringified metric so the frontend can expose it in tooltips. + if (isAgentic) { + (metrics as Record).offload_mode = offloadModeRaw; } // Artifact names encode '/' as '#' to avoid path separators; restore the URI. @@ -237,10 +249,7 @@ export function mapBenchmarkRow( isl, osl, conc, - offloadMode: - typeof row.offload_mode === 'string' && row.offload_mode.length > 0 - ? row.offload_mode - : 'off', + offloadMode: offloadModeRaw, image, metrics, }; From 07ba10636dae87b5a819afa524d7c10322fae41b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 1 May 2026 00:29:55 -0500 Subject: [PATCH 03/96] fix: render offload halo on every offload-on point, not just frontier The halo's purpose is to surface KV-offload usage; restricting it to Pareto-frontier-only points hid the indicator on most runs. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/app/src/components/inference/ui/ScatterGraph.tsx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index 55a206ce..61ac0983 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -1516,10 +1516,9 @@ const ScatterGraph = React.memo( .attr('pointer-events', 'none'); }); - // Offload halo: dashed ring on frontier points that used KV offload + // Offload halo: dashed ring on every point that used KV offload (Pareto or not) zoomGroup.selectAll('.dot-group').each(function (d) { - const onFrontier = optimalPointKeys.has(`${d.hwKey}_${d.precision}-${d.x}-${d.y}`); - const showHalo = onFrontier && d.offload_mode === 'on'; + const showHalo = d.offload_mode === 'on'; d3.select(this) .selectAll('.offload-halo') .data(showHalo ? [true] : []) From 95e9dc77431adf5354ef0df36989816199624383 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 1 May 2026 01:13:42 -0500 Subject: [PATCH 04/96] fix: strip runner-pool suffix (-p1, -p2, ...) from hw identifier b300-p1 (and similar) artifacts were skipping ingest because the runner-pool suffix wasn't in the strip list and didn't normalize to the canonical b300 GPU key. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/db/src/etl/normalizers.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/db/src/etl/normalizers.ts b/packages/db/src/etl/normalizers.ts index ad12a454..bd497f7a 100644 --- a/packages/db/src/etl/normalizers.ts +++ b/packages/db/src/etl/normalizers.ts @@ -34,7 +34,8 @@ export function hwToGpuKey(hw: string): string | null { .replace(/-dgxc-slurm$/, '') .replace(/-dgxc$/, '') .replace(/-nb$/, '') - .replace(/-nv$/, ''); + .replace(/-nv$/, '') + .replace(/-p\d+$/, ''); // strip runner-pool suffix (e.g. 
b300-p1 → b300) return GPU_KEYS.has(base) ? base : null; } From 982106da5f4421983841304f0503b6467033852d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 1 May 2026 09:25:33 -0500 Subject: [PATCH 05/96] feat: bold scatter labels with concurrency tag + collision avoidance - Label text now includes `C=` alongside the GPU/parallelism tag (default ` C=`, advanced ` C=`) - Bumped point-label font-weight to 700 so the labels read clearly against the chart fill - Greedy collision-avoidance pass on render and zoom: tries placing each label above/below the point through 4 candidate dy offsets, hiding the label only when no slot is free Co-Authored-By: Claude Opus 4.7 (1M context) --- .../components/inference/ui/ScatterGraph.tsx | 68 ++++++++++++++++++- .../src/lib/d3-chart/layers/scatter-points.ts | 1 + 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index 61ac0983..3fbd8588 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -55,6 +55,63 @@ import { buildGradientColorMap, } from '@/components/inference/utils/paretoLabels'; +// Greedy label-collision avoidance: try positions above/below the point; +// hide labels that can't fit anywhere. Re-runs cheaply on each render/zoom. +function avoidLabelCollisions( + zoomGroup: d3.Selection, +): void { + const labels: { + el: SVGTextElement; + cx: number; + cy: number; + w: number; + h: number; + }[] = []; + zoomGroup.selectAll('.dot-group').each(function () { + const labelEl = this.querySelector('.point-label'); + if (!labelEl) return; + if ((this as SVGGElement).style.opacity === '0') return; + const transform = (this as SVGGElement).getAttribute('transform') ?? 
''; + const m = transform.match(/translate\(([^,]+),([^)]+)\)/); + if (!m) return; + const cx = parseFloat(m[1]); + const cy = parseFloat(m[2]); + labelEl.setAttribute('dy', '-8'); + labelEl.style.opacity = '1'; + const bbox = labelEl.getBBox(); + labels.push({ el: labelEl, cx, cy, w: bbox.width, h: bbox.height }); + }); + labels.sort((a, b) => a.cx - b.cx); + const placed: { left: number; right: number; top: number; bottom: number }[] = []; + const pad = 1; + const candidates = [-8, 14, -22, 28]; + for (const lab of labels) { + let chosenDy: number | null = null; + let chosenBox: { left: number; right: number; top: number; bottom: number } | null = null; + for (const dy of candidates) { + const top = lab.cy + dy - lab.h - pad; + const bottom = lab.cy + dy + pad; + const left = lab.cx - lab.w / 2 - pad; + const right = lab.cx + lab.w / 2 + pad; + const collides = placed.some( + (p) => !(right < p.left || left > p.right || bottom < p.top || top > p.bottom), + ); + if (!collides) { + chosenDy = dy; + chosenBox = { left, right, top, bottom }; + break; + } + } + if (chosenDy !== null && chosenBox) { + lab.el.setAttribute('dy', String(chosenDy)); + lab.el.style.opacity = '1'; + placed.push(chosenBox); + } else { + lab.el.style.opacity = '0'; + } + } +} + // X-shape path for overlay (unofficial) data points const X_SIZE = 5; const X_HOVER_SIZE = 7; @@ -603,6 +660,7 @@ const ScatterGraph = React.memo( d3.axisLeft(newYS).ticks(10).tickFormat(logTickFormat(newYS)) as any, ); } + avoidLabelCollisions(ctx.layout.zoomGroup); }, }), [zoomResetEventName, eventPrefix, xScaleConfig._isLog, yScaleConfig.type], @@ -1251,7 +1309,8 @@ const ScatterGraph = React.memo( getOpacity: (d) => (isPointVisible(d) ? 1 : 0), getPointerEvents: (d) => (isPointVisible(d) ? 'auto' : 'none'), hideLabels: hidePointLabels || showGradientLabels, - getLabelText: (d) => (useAdvancedLabels ? getPointLabel(d) : String(d.tp)), + getLabelText: (d) => + useAdvancedLabels ? 
`${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`, foreground: 'var(--foreground)', dataAttrs: { 'hw-key': (d) => String(d.hwKey), @@ -1353,8 +1412,11 @@ const ScatterGraph = React.memo( .attr('text-anchor', 'middle') .style('fill', 'var(--foreground)') .attr('font-size', '10px') + .attr('font-weight', '700') .attr('pointer-events', 'none') - .text(useAdvancedLabels ? getPointLabel(d) : String(d.tp)); + .text( + useAdvancedLabels ? `${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`, + ); }); // Overlay tooltip handlers @@ -1566,6 +1628,8 @@ const ScatterGraph = React.memo( }); }); + avoidLabelCollisions(zoomGroup); + // Log tick formatting on initial render if (xScaleConfig._isLog) { const xScale = ctx.xScale as d3.ScaleLogarithmic; diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts index 507654e1..9f2d2f38 100644 --- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts +++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts @@ -72,6 +72,7 @@ export function renderScatterPoints Date: Fri, 1 May 2026 09:32:44 -0500 Subject: [PATCH 06/96] fix: stack multi-line point labels upward so they don't overlap the point MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tspans now ride above the text's `dy` anchor — the LAST line sits at the anchor (just above the point) and earlier lines stack above it. Previously the second tspan landed below the anchor and crashed into the marker. Also widened collision candidates by label height so the flipped-below position fully clears the point on multi-line labels. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../components/inference/ui/ScatterGraph.tsx | 28 +++++++--- .../src/lib/d3-chart/layers/scatter-points.ts | 52 +++++++++++++------ 2 files changed, 58 insertions(+), 22 deletions(-) diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index 3fbd8588..f8ce9b8f 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -84,8 +84,11 @@ function avoidLabelCollisions( labels.sort((a, b) => a.cx - b.cx); const placed: { left: number; right: number; top: number; bottom: number }[] = []; const pad = 1; - const candidates = [-8, 14, -22, 28]; for (const lab of labels) { + // Candidates scale with the label's own height so multi-line labels don't + // overlap the point shape when flipped below. + const below = lab.h + 8; + const candidates = [-8, below, -8 - below - 4, 2 * below]; let chosenDy: number | null = null; let chosenBox: { left: number; right: number; top: number; bottom: number } | null = null; for (const dy of candidates) { @@ -1310,7 +1313,7 @@ const ScatterGraph = React.memo( getPointerEvents: (d) => (isPointVisible(d) ? 'auto' : 'none'), hideLabels: hidePointLabels || showGradientLabels, getLabelText: (d) => - useAdvancedLabels ? `${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`, + useAdvancedLabels ? `${getPointLabel(d)}\nC=${d.conc}` : `${d.tp}\nC=${d.conc}`, foreground: 'var(--foreground)', dataAttrs: { 'hw-key': (d) => String(d.hwKey), @@ -1403,7 +1406,14 @@ const ScatterGraph = React.memo( // Labels const showLabels = !hidePointLabels && !showGradientLabels; overlayPoints.each(function (d) { - d3.select(this) + const lines = showLabels + ? (useAdvancedLabels + ? `${getPointLabel(d)}\nC=${d.conc}` + : `${d.tp}\nC=${d.conc}` + ).split('\n') + : []; + const text = d3 + .select(this) .selectAll('.overlay-label') .data(showLabels ? 
[true] : []) .join('text') @@ -1413,10 +1423,14 @@ const ScatterGraph = React.memo( .style('fill', 'var(--foreground)') .attr('font-size', '10px') .attr('font-weight', '700') - .attr('pointer-events', 'none') - .text( - useAdvancedLabels ? `${getPointLabel(d)} C=${d.conc}` : `${d.tp} C=${d.conc}`, - ); + .attr('pointer-events', 'none'); + text + .selectAll('tspan') + .data(lines) + .join('tspan') + .attr('x', 0) + .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em')) + .text((l) => l); }); // Overlay tooltip handlers diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts index 9f2d2f38..13c588d8 100644 --- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts +++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts @@ -63,18 +63,30 @@ export function renderScatterPoints` element — the + // intra-stack offsets stay correct whether the label ends up above or below. if (!config.hideLabels && config.getLabelText && config.foreground) { - entered - .append('text') - .attr('class', 'point-label') - .attr('dy', -8) - .attr('text-anchor', 'middle') - .attr('fill', config.foreground) - .attr('font-size', '10px') - .attr('font-weight', '700') - .attr('pointer-events', 'none') - .text(config.getLabelText); + const labelGetter = config.getLabelText; + entered.each(function (d) { + const lines = labelGetter(d).split('\n'); + const text = d3 + .select(this) + .append('text') + .attr('class', 'point-label') + .attr('dy', -8) + .attr('text-anchor', 'middle') + .attr('fill', config.foreground!) + .attr('font-size', '10px') + .attr('font-weight', '700') + .attr('pointer-events', 'none'); + lines.forEach((line, i) => { + const tspanDy = i === 0 ? 
`-${(lines.length - 1) * 1.1}em` : '1.1em'; + text.append('tspan').attr('x', 0).attr('dy', tspanDy).text(line); + }); + }); } // Exit: remove stale points @@ -103,9 +115,12 @@ export function renderScatterPoints('.point-label') + const lines = labelGetter(d).split('\n'); + const text = d3 + .select(this) + .selectAll('.point-label') .data([true]) .join('text') .attr('class', 'point-label') @@ -113,8 +128,15 @@ export function renderScatterPoints('tspan') + .data(lines) + .join('tspan') + .attr('x', 0) + .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em')) + .text((l) => l); }); } else { points.selectAll('.point-label').remove(); From 37eecc6e28c10751ffc52c8a0d0588177e43d4d8 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 1 May 2026 09:38:39 -0500 Subject: [PATCH 07/96] fix: anchor multi-line labels via first tspan + tspan-aware collision pass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a `<text>` contains tspans, the parent's `dy` does not shift the bbox cleanly — its (unused) y=0 origin still factors in, so the rendered text ended up centered on the point. Move the absolute offset into the FIRST tspan's `dy`; later tspans cascade by 1.1em. Collision avoidance now drives the first tspan's `dy` and tries four candidate baselines (primary above, primary below, secondary above, secondary below), accounting for full label height when picking a non-overlapping slot. Labels still hidden as a last resort.
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../components/inference/ui/ScatterGraph.tsx | 72 +++++++++++++------ .../src/lib/d3-chart/layers/scatter-points.ts | 25 ++++--- 2 files changed, 66 insertions(+), 31 deletions(-) diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index f8ce9b8f..27d3680c 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -55,58 +55,88 @@ import { buildGradientColorMap, } from '@/components/inference/utils/paretoLabels'; -// Greedy label-collision avoidance: try positions above/below the point; -// hide labels that can't fit anywhere. Re-runs cheaply on each render/zoom. +// Greedy label-collision avoidance. +// Each candidate is the y-position of the FIRST baseline (relative to point +// center) which we apply via the first tspan's `dy` — later tspans cascade +// down by 1.1em. We try above/below at primary and secondary offsets, and +// hide the label if all four positions collide. function avoidLabelCollisions( zoomGroup: d3.Selection, ): void { - const labels: { + interface LabelInfo { el: SVGTextElement; + firstTspan: SVGTSpanElement; cx: number; cy: number; w: number; - h: number; - }[] = []; + nLines: number; + defaultFirstY: number; + } + const labels: LabelInfo[] = []; + const ASCENT = 9; + const DESCENT = 3; + const LINE_H = 11; + zoomGroup.selectAll('.dot-group').each(function () { const labelEl = this.querySelector('.point-label'); if (!labelEl) return; if ((this as SVGGElement).style.opacity === '0') return; + const tspans = labelEl.querySelectorAll('tspan'); + if (tspans.length === 0) return; const transform = (this as SVGGElement).getAttribute('transform') ?? 
''; const m = transform.match(/translate\(([^,]+),([^)]+)\)/); if (!m) return; const cx = parseFloat(m[1]); const cy = parseFloat(m[2]); - labelEl.setAttribute('dy', '-8'); + const nLines = tspans.length; + const defaultFirstY = -(8 + (nLines - 1) * LINE_H); // last baseline 8px above point + // Reset to default before measuring so prior positioning doesn't bias bbox + tspans[0].setAttribute('dy', `${defaultFirstY}px`); labelEl.style.opacity = '1'; const bbox = labelEl.getBBox(); - labels.push({ el: labelEl, cx, cy, w: bbox.width, h: bbox.height }); + labels.push({ + el: labelEl, + firstTspan: tspans[0], + cx, + cy, + w: bbox.width, + nLines, + defaultFirstY, + }); }); + labels.sort((a, b) => a.cx - b.cx); const placed: { left: number; right: number; top: number; bottom: number }[] = []; - const pad = 1; + const pad = 2; + for (const lab of labels) { - // Candidates scale with the label's own height so multi-line labels don't - // overlap the point shape when flipped below. - const below = lab.h + 8; - const candidates = [-8, below, -8 - below - 4, 2 * below]; - let chosenDy: number | null = null; + const blockH = (lab.nLines - 1) * LINE_H + ASCENT + DESCENT; + const aboveFirstY = lab.defaultFirstY; + const belowFirstY = 14; // first baseline 14px below point center + const candidates = [ + aboveFirstY, + belowFirstY, + aboveFirstY - blockH - 2, + belowFirstY + blockH + 2, + ]; + let chosenY: number | null = null; let chosenBox: { left: number; right: number; top: number; bottom: number } | null = null; - for (const dy of candidates) { - const top = lab.cy + dy - lab.h - pad; - const bottom = lab.cy + dy + pad; + for (const firstY of candidates) { + const top = lab.cy + firstY - ASCENT - pad; + const bottom = lab.cy + firstY + (lab.nLines - 1) * LINE_H + DESCENT + pad; const left = lab.cx - lab.w / 2 - pad; const right = lab.cx + lab.w / 2 + pad; const collides = placed.some( (p) => !(right < p.left || left > p.right || bottom < p.top || top > p.bottom), ); if 
(!collides) { - chosenDy = dy; + chosenY = firstY; chosenBox = { left, right, top, bottom }; break; } } - if (chosenDy !== null && chosenBox) { - lab.el.setAttribute('dy', String(chosenDy)); + if (chosenY !== null && chosenBox) { + lab.firstTspan.setAttribute('dy', `${chosenY}px`); lab.el.style.opacity = '1'; placed.push(chosenBox); } else { @@ -1418,18 +1448,18 @@ const ScatterGraph = React.memo( .data(showLabels ? [true] : []) .join('text') .attr('class', 'overlay-label') - .attr('dy', -10) .attr('text-anchor', 'middle') .style('fill', 'var(--foreground)') .attr('font-size', '10px') .attr('font-weight', '700') .attr('pointer-events', 'none'); + const firstDy = -(1 + (lines.length - 1) * 1.1); text .selectAll('tspan') .data(lines) .join('tspan') .attr('x', 0) - .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em')) + .attr('dy', (_l, i) => (i === 0 ? `${firstDy}em` : '1.1em')) .text((l) => l); }); diff --git a/packages/app/src/lib/d3-chart/layers/scatter-points.ts b/packages/app/src/lib/d3-chart/layers/scatter-points.ts index 13c588d8..71d1f050 100644 --- a/packages/app/src/lib/d3-chart/layers/scatter-points.ts +++ b/packages/app/src/lib/d3-chart/layers/scatter-points.ts @@ -64,10 +64,10 @@ export function renderScatterPoints` element — the - // intra-stack offsets stay correct whether the label ends up above or below. + // we anchor the entire stack via the FIRST tspan's `dy` so getBBox() doesn't + // pick up the text element's own (unused) y=0 origin. The first tspan is + // raised so the LAST line baseline lands ~8px above the point; subsequent + // tspans cascade down by 1.1em. if (!config.hideLabels && config.getLabelText && config.foreground) { const labelGetter = config.getLabelText; entered.each(function (d) { @@ -76,15 +76,18 @@ export function renderScatterPoints { - const tspanDy = i === 0 ? 
`-${(lines.length - 1) * 1.1}em` : '1.1em'; + text + .append('tspan') + .attr('x', 0) + .attr('dy', i === 0 ? `${firstDy}em` : '1.1em') + .text(line); }); }); } @@ -113,7 +116,9 @@ export function renderScatterPoints('tspan') .data(lines) .join('tspan') .attr('x', 0) - .attr('dy', (_l, i) => (i === 0 ? `-${(lines.length - 1) * 1.1}em` : '1.1em')) + .attr('dy', (_l, i) => (i === 0 ? `${firstDy}em` : '1.1em')) .text((l) => l); }); } else { From f317377dfaea35f9cb5dc435ea177966aa17fbf8 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 1 May 2026 10:21:00 -0500 Subject: [PATCH 08/96] fix: dedupe artifacts by logical name + skip 0-successful agg rows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two complementary fixes for runs whose `results_bmk` aggregated artifact ends up containing both a successful row and a failed-attempt row for the same (config, conc, offload) — the failed row's null metrics were overwriting the good row via ON CONFLICT DO UPDATE. 1. Artifact-level: strip the trailing `_<runner>_<index>` suffix from each artifact name and group by the logical name, keeping only the most recent per group. 2. Row-level: skip rows with `num_requests_successful === 0` AND `num_requests_total > 0`. The aggregated artifact merges rows from all runners — including failed ones — so artifact-level dedup alone can't reach inside it.
Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/db/src/etl/benchmark-mapper.ts | 14 +++++++++++ packages/db/src/etl/skip-tracker.ts | 10 +++++++- packages/db/src/ingest-ci-run.ts | 33 ++++++++++++++++++++----- packages/db/src/ingest-gcs-backup.ts | 1 + 4 files changed, 51 insertions(+), 7 deletions(-) diff --git a/packages/db/src/etl/benchmark-mapper.ts b/packages/db/src/etl/benchmark-mapper.ts index d842276e..1aff5ea9 100644 --- a/packages/db/src/etl/benchmark-mapper.ts +++ b/packages/db/src/etl/benchmark-mapper.ts @@ -145,6 +145,20 @@ export function mapBenchmarkRow( return null; } + // Failed-run guard: aggregated artifacts (`results_bmk`) merge rows from + // every runner, including ones with 0 successful requests and null metrics. + // Without this skip, the empty row's nulls overwrite a good row via + // ON CONFLICT DO UPDATE when both share the same (config, conc, offload). + if ( + typeof row.num_requests_successful === 'number' && + row.num_requests_successful === 0 && + typeof row.num_requests_total === 'number' && + row.num_requests_total > 0 + ) { + tracker.skips.failedRun++; + return null; + } + // Agentic offload signal: prefer `offload_mode` ('on'|'off'), fall back to `offloading` // ('none' → 'off'; any other non-empty value → 'on'). const offloadModeRaw = diff --git a/packages/db/src/etl/skip-tracker.ts b/packages/db/src/etl/skip-tracker.ts index 6166ea44..588718dd 100644 --- a/packages/db/src/etl/skip-tracker.ts +++ b/packages/db/src/etl/skip-tracker.ts @@ -8,6 +8,7 @@ export interface Skips { unmappedModel: number; unmappedHw: number; noIslOsl: number; + failedRun: number; dbError: number; } @@ -66,7 +67,14 @@ const MAX_DB_ERRORS = 10; * @returns A `SkipTracker` with zeroed counters and empty unmapped-name sets. 
*/ export function createSkipTracker(): SkipTracker { - const skips: Skips = { badZip: 0, unmappedModel: 0, unmappedHw: 0, noIslOsl: 0, dbError: 0 }; + const skips: Skips = { + badZip: 0, + unmappedModel: 0, + unmappedHw: 0, + noIslOsl: 0, + failedRun: 0, + dbError: 0, + }; const unmappedModels = new Set(); const unmappedHws = new Set(); const unmappedPrecisions = new Set(); diff --git a/packages/db/src/ingest-ci-run.ts b/packages/db/src/ingest-ci-run.ts index 8cce43ca..fb1fbbbc 100644 --- a/packages/db/src/ingest-ci-run.ts +++ b/packages/db/src/ingest-ci-run.ts @@ -101,15 +101,30 @@ if (isDownloadMode) { } catch {} } - const byName = new Map(); + // Strip the trailing `__` token from each + // artifact name, then group by the resulting logical name and keep only + // the most recent per group. Without this, two artifacts produced on + // different runners for the same logical config (e.g. `…_h200-cw_00` and + // `…_h200-dgxc-slurm_1`) both land in the DB and the failed one's empty + // metrics can overwrite the good one via ON CONFLICT DO UPDATE. + // + // The runner pool name itself has no underscores (`h200-cw`, + // `h200-dgxc-slurm`, `b200-nb`), so `[a-zA-Z0-9.-]*` keeps the strip + // bounded — using `\w` here would over-match across earlier `_` + // separators and collapse different (conc, offload) variants into the + // same logical name. 
+ const RUNNER_SUFFIX_RE = /_[a-zA-Z][a-zA-Z0-9.-]*_\d+$/; + const byLogical = new Map(); for (const a of allArtifacts) { - const existing = byName.get(a.name); + const key = a.name.replace(RUNNER_SUFFIX_RE, ''); + const existing = byLogical.get(key); if (!existing || a.created_at > existing.created_at) { - byName.set(a.name, a); + byLogical.set(key, a); } } - for (const [name, artifact] of byName) { + for (const [, artifact] of byLogical) { + const name = artifact.name; console.log(` ${name}`); const zipPath = path.join(artifactsDir, 'artifact.zip'); execSync(`gh api "${artifact.archive_download_url}" > "${zipPath}"`, { @@ -121,7 +136,7 @@ if (isDownloadMode) { fs.unlinkSync(zipPath); } - console.log(`\n Downloaded ${byName.size} artifact(s)`); + console.log(`\n Downloaded ${byLogical.size} artifact(s)`); // Fetch run attempt from API const attemptStr = execSync( @@ -510,11 +525,17 @@ async function main(): Promise { const { skips, unmappedModels, unmappedHws, unmappedPrecisions } = tracker; const totalSkips = - skips.badZip + skips.unmappedModel + skips.unmappedHw + skips.noIslOsl + skips.dbError; + skips.badZip + + skips.unmappedModel + + skips.unmappedHw + + skips.noIslOsl + + skips.failedRun + + skips.dbError; if (totalSkips > 0) { console.log(`\n Skipped: ${totalSkips} rows`); const skipLines: [string, number][] = [ ['no isl/osl (old format)', skips.noIslOsl], + ['failed run (0 successful)', skips.failedRun], ['unmapped model', skips.unmappedModel], ['unmapped hw', skips.unmappedHw], ['bad/empty zip', skips.badZip], diff --git a/packages/db/src/ingest-gcs-backup.ts b/packages/db/src/ingest-gcs-backup.ts index 6dc604e9..d67f5164 100644 --- a/packages/db/src/ingest-gcs-backup.ts +++ b/packages/db/src/ingest-gcs-backup.ts @@ -434,6 +434,7 @@ async function mapWorkflowDir( unmappedModel: local.skips.unmappedModel, unmappedHw: local.skips.unmappedHw, noIslOsl: local.skips.noIslOsl, + failedRun: local.skips.failedRun, }, localUnmappedModels: new 
Set(local.unmappedModels), localUnmappedHws: new Set(local.unmappedHws), From c2f66f62f5a1dedb6a87c7c5e58ca990b3cb0956 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 7 May 2026 08:41:26 -0500 Subject: [PATCH 09/96] feat: add AIPerf to FRAMEWORK_LABELS Tag display name for the `aiperf` spec_method suffix used by the alternate-harness runs ingested for the agentic minimax sweep. Without this entry the legend shows 'AIPERF' from the default toUpperCase fallback. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/constants/src/framework-aliases.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/constants/src/framework-aliases.ts b/packages/constants/src/framework-aliases.ts index cc5eb6b4..e23a93bc 100644 --- a/packages/constants/src/framework-aliases.ts +++ b/packages/constants/src/framework-aliases.ts @@ -44,6 +44,7 @@ export const FRAMEWORK_LABELS: Record = { ]), ), mtp: 'MTP', + aiperf: 'AIPerf', }; /** From 024797a978a2a6e2954f66a963de3205b62a149e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 12 May 2026 15:02:07 -0500 Subject: [PATCH 10/96] fix(changelog): coerce ids to string when filtering changelog by run bigint workflow_run_id sometimes deserializes as a number on the frontend depending on the postgres adapter's behavior; strict === between a number and a string silently dropped every match, so the changelog popover always reported "no changelog data available." 
Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/app/src/components/GlobalFilterContext.tsx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx index 08fc7094..11e56de7 100644 --- a/packages/app/src/components/GlobalFilterContext.tsx +++ b/packages/app/src/components/GlobalFilterContext.tsx @@ -87,7 +87,9 @@ function buildRunInfo(data: WorkflowInfoResponse): Record { const runs: Record = {}; for (const run of data.runs) { const runId = String(run.github_run_id); - const runChangelogs = data.changelogs.filter((c) => c.workflow_run_id === run.github_run_id); + const runChangelogs = data.changelogs.filter( + (c) => String(c.workflow_run_id) === String(run.github_run_id), + ); runs[runId] = { runId, runDate: run.created_at, From aa154193dfbc12535f25444cdf6fccc16a3e1382 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 12 May 2026 15:36:57 -0500 Subject: [PATCH 11/96] feat: default sequence to Agentic Traces when available If the selected model has agentic_traces data, prefer that over the default 8K/1K fixed-seq when the user hasn't explicitly chosen via URL. effectiveSequence already falls back to availableSequences[0] for models without agentic, so models with only fixed-seq data still render correctly. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/app/src/components/GlobalFilterContext.tsx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/app/src/components/GlobalFilterContext.tsx b/packages/app/src/components/GlobalFilterContext.tsx index 11e56de7..7813d079 100644 --- a/packages/app/src/components/GlobalFilterContext.tsx +++ b/packages/app/src/components/GlobalFilterContext.tsx @@ -125,7 +125,9 @@ export function GlobalFilterProvider({ children }: { children: ReactNode }) { const [selectedSequence, setSelectedSequence] = useState(() => { const urlSeq = getUrlParam('i_seq'); if (urlSeq && Object.values(Sequence).includes(urlSeq as Sequence)) return urlSeq as Sequence; - return Sequence.EightK_OneK; + // Prefer Agentic Traces by default when the selected model has it; the + // effectiveSequence fallback below handles models without agentic data. + return Sequence.AgenticTraces; }); const [selectedPrecisions, setSelectedPrecisionsRaw] = useState(() => { From 099a33efcb53f5130dc40d715a0f4b86d6136a93 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 15 May 2026 12:25:25 -0500 Subject: [PATCH 12/96] fix(agentic): respect percentile selector for input-throughput x axis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rowToAggDataEntry was only copying median/p99 metric variants — picking p90/p99.9 in the percentile selector silently fell back to 0 and collapsed every point into a vertical line at x=0. Copy the full median/p90/p99/p99.9 set into AggDataEntry. Hide the X-Axis Metric dropdown for agentic mode (it doubled up with the percentile selector) and route the input-metric chart through withPercentile so picking p99 actually plots p99_ttft instead of the hard-coded p99_ttft config default. Percentile options pared back to median + p99. 
--- .../inference/hooks/useChartData.ts | 46 +++++++++++++++++-- .../app/src/components/inference/types.ts | 10 ++++ .../components/inference/ui/ChartControls.tsx | 3 +- packages/app/src/lib/benchmark-transform.ts | 12 ++++- packages/app/src/lib/data-mappings.ts | 8 +--- packages/app/src/lib/energy-metrics.test.ts | 10 ++++ 6 files changed, 77 insertions(+), 12 deletions(-) diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts index 81ab0780..57e9a1c2 100644 --- a/packages/app/src/components/inference/hooks/useChartData.ts +++ b/packages/app/src/components/inference/hooks/useChartData.ts @@ -16,7 +16,7 @@ import { filterDataByCostLimit } from '@/components/inference/utils'; import { useBenchmarks, benchmarkQueryOptions } from '@/hooks/api/use-benchmarks'; import { GPU_ALIAS_TO_CANONICAL, getModelSortIndex } from '@/lib/constants'; import { transformBenchmarkRows, withPercentile } from '@/lib/benchmark-transform'; -import type { Model, Sequence } from '@/lib/data-mappings'; +import { Sequence, type Model } from '@/lib/data-mappings'; import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils'; /** Build deduplicated comparison dates, excluding the main run date. */ @@ -216,7 +216,14 @@ export function useChartData( ? 'P99 Time To First Token (s)' : 'Median Time To First Token (s)'; - if (effectiveXMetric && chartDef.chartType === 'interactivity' && isInputMetric) { + const isAgentic = selectedSequence === Sequence.AgenticTraces; + + if ( + effectiveXMetric && + chartDef.chartType === 'interactivity' && + isInputMetric && + !isAgentic + ) { xAxisField = effectiveXMetric as keyof AggDataEntry; const labelKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition; if (effectiveXMetric === chartDef[`${selectedYAxisMetric}_x` as keyof ChartDefinition]) { @@ -225,15 +232,40 @@ export function useChartData( xAxisLabel = isTtftOverride ? 
ttftLabel : chartDef.x_label; } } else if (chartDef.chartType === 'interactivity' && isInputMetric) { + // Agentic falls through here too — the manual X-axis dropdown is + // hidden in agentic mode (would double up with the percentile + // selector), so the config default + percentile post-processing + // below drives the x axis. const xOverrideKey = `${selectedYAxisMetric}_x` as keyof ChartDefinition; const xLabelOverrideKey = `${selectedYAxisMetric}_x_label` as keyof ChartDefinition; xAxisField = (chartDef[xOverrideKey] as keyof AggDataEntry) || chartDef.x; xAxisLabel = (chartDef[xLabelOverrideKey] as string) || chartDef.x_label; - } else if (chartDef.chartType === 'e2e' && isTtftOverride) { + } else if (chartDef.chartType === 'e2e' && isTtftOverride && !isAgentic) { xAxisField = effectiveXMetric as keyof AggDataEntry; xAxisLabel = ttftLabel; } + // Agentic: rewrite the resolved x metric to the chosen percentile, + // and relabel accordingly. naturalX is already percentile-adjusted, + // so the per-metric override path is the only one that actually + // changes here. + if (isAgentic) { + const adjusted = withPercentile( + xAxisField as string, + selectedPercentile, + ) as keyof AggDataEntry; + if (adjusted !== xAxisField) { + const pctlWord = + selectedPercentile === 'median' + ? 'Median' + : selectedPercentile === 'p99.9' + ? 'P99.9' + : selectedPercentile.toUpperCase(); + xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord); + xAxisField = adjusted; + } + } + // The x-axis is "flipped" only when the good-direction reverses // (e.g. interactivity → TTFT: "higher is better" → "lower is better"). 
// E2EL → TTFT keeps the same direction ("lower is better" for both), @@ -269,7 +301,13 @@ export function useChartData( xAxisField, }; }), - [selectedYAxisMetric, selectedXAxisMetric, selectedE2eXAxisMetric, selectedPercentile], + [ + selectedYAxisMetric, + selectedXAxisMetric, + selectedE2eXAxisMetric, + selectedPercentile, + selectedSequence, + ], ); // Build renderable graphs (data processing + stable chart definitions) diff --git a/packages/app/src/components/inference/types.ts b/packages/app/src/components/inference/types.ts index a2d9ef2e..cddeba54 100644 --- a/packages/app/src/components/inference/types.ts +++ b/packages/app/src/components/inference/types.ts @@ -50,23 +50,33 @@ export interface AggDataEntry { mean_ttft: number; median_ttft: number; std_ttft: number; + p90_ttft: number; p99_ttft: number; + 'p99.9_ttft': number; mean_tpot: number; mean_intvty: number; median_tpot: number; median_intvty: number; std_tpot: number; std_intvty: number; + p90_tpot: number; + p90_intvty: number; p99_tpot: number; p99_intvty: number; + 'p99.9_tpot': number; + 'p99.9_intvty': number; mean_itl: number; median_itl: number; std_itl: number; + p90_itl: number; p99_itl: number; + 'p99.9_itl': number; mean_e2el: number; median_e2el: number; std_e2el: number; + p90_e2el: number; p99_e2el: number; + 'p99.9_e2el': number; disagg: boolean; num_prefill_gpu: number; num_decode_gpu: number; diff --git a/packages/app/src/components/inference/ui/ChartControls.tsx b/packages/app/src/components/inference/ui/ChartControls.tsx index 6707bd9e..7b4fa08f 100644 --- a/packages/app/src/components/inference/ui/ChartControls.tsx +++ b/packages/app/src/components/inference/ui/ChartControls.tsx @@ -269,7 +269,8 @@ export default function ChartControls({ hideGpuComparison = false }: ChartContro
{graphs.some((g) => g.chartDefinition?.chartType === 'interactivity') && - isInputMetric && ( + isInputMetric && + selectedSequence !== Sequence.AgenticTraces && (
): Record { const out: Record = {}; - for (const suffix of ['mean', 'median', 'p90', 'p99']) { + for (const suffix of ['mean', 'median', 'p90', 'p99', 'p99.9']) { const itl = m[`${suffix}_itl`]; const ttlt = m[`${suffix}_ttlt`]; if (m[`${suffix}_e2el`] === undefined && ttlt !== undefined) out[`${suffix}_e2el`] = ttlt; @@ -62,23 +62,33 @@ export function rowToAggDataEntry(row: BenchmarkRow): AggDataEntry { mean_ttft: m.mean_ttft ?? 0, median_ttft: m.median_ttft ?? 0, std_ttft: m.std_ttft ?? 0, + p90_ttft: m.p90_ttft ?? 0, p99_ttft: m.p99_ttft ?? 0, + 'p99.9_ttft': m['p99.9_ttft'] ?? 0, mean_tpot: m.mean_tpot ?? 0, median_tpot: m.median_tpot ?? 0, std_tpot: m.std_tpot ?? 0, + p90_tpot: m.p90_tpot ?? 0, p99_tpot: m.p99_tpot ?? 0, + 'p99.9_tpot': m['p99.9_tpot'] ?? 0, mean_intvty: m.mean_intvty ?? 0, median_intvty: m.median_intvty ?? 0, std_intvty: m.std_intvty ?? 0, + p90_intvty: m.p90_intvty ?? 0, p99_intvty: m.p99_intvty ?? 0, + 'p99.9_intvty': m['p99.9_intvty'] ?? 0, mean_itl: m.mean_itl ?? 0, median_itl: m.median_itl ?? 0, std_itl: m.std_itl ?? 0, + p90_itl: m.p90_itl ?? 0, p99_itl: m.p99_itl ?? 0, + 'p99.9_itl': m['p99.9_itl'] ?? 0, mean_e2el: m.mean_e2el ?? 0, median_e2el: m.median_e2el ?? 0, std_e2el: m.std_e2el ?? 0, + p90_e2el: m.p90_e2el ?? 0, p99_e2el: m.p99_e2el ?? 0, + 'p99.9_e2el': m['p99.9_e2el'] ?? 0, disagg: row.disagg, num_prefill_gpu: row.num_prefill_gpu, num_decode_gpu: row.num_decode_gpu, diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts index f137875c..bf48c864 100644 --- a/packages/app/src/lib/data-mappings.ts +++ b/packages/app/src/lib/data-mappings.ts @@ -186,21 +186,17 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[]; /** * Percentile of the latency distribution used for the chart x-axis when * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants - * for ttft, ttlt (=e2el), and itl (and intvty derived from itl) — pick which - * slice to plot. 
+ * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only the + * two most commonly read slices (p50, p99) are surfaced in the UI. */ export enum Percentile { Median = 'median', - P90 = 'p90', P99 = 'p99', - P99_9 = 'p99.9', } const PERCENTILE_CONFIG: Record = { [Percentile.Median]: { label: 'p50 (median)' }, - [Percentile.P90]: { label: 'p90' }, [Percentile.P99]: { label: 'p99' }, - [Percentile.P99_9]: { label: 'p99.9' }, }; export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[]; diff --git a/packages/app/src/lib/energy-metrics.test.ts b/packages/app/src/lib/energy-metrics.test.ts index 28cc1e36..54788585 100644 --- a/packages/app/src/lib/energy-metrics.test.ts +++ b/packages/app/src/lib/energy-metrics.test.ts @@ -57,23 +57,33 @@ function makeEntry(overrides: Partial = {}): AggDataEntry { mean_ttft: 0.5, median_ttft: 0.4, std_ttft: 0.1, + p90_ttft: 0.7, p99_ttft: 0.8, + 'p99.9_ttft': 0.9, mean_tpot: 0.02, mean_intvty: 45, median_tpot: 0.02, median_intvty: 44, std_tpot: 0.005, std_intvty: 5, + p90_tpot: 0.025, + p90_intvty: 55, p99_tpot: 0.03, p99_intvty: 60, + 'p99.9_tpot': 0.035, + 'p99.9_intvty': 65, mean_itl: 0.01, median_itl: 0.01, std_itl: 0.002, + p90_itl: 0.013, p99_itl: 0.015, + 'p99.9_itl': 0.018, mean_e2el: 5, median_e2el: 4.8, std_e2el: 0.5, + p90_e2el: 5.5, p99_e2el: 6, + 'p99.9_e2el': 6.5, disagg: false, num_prefill_gpu: 0, num_decode_gpu: 0, From 50a06d1419c70ddd8d24b2c6545da44fe6be3a4d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 15 May 2026 12:27:19 -0500 Subject: [PATCH 13/96] fix(agentic): default percentile to p99 and drop median option --- packages/app/src/components/inference/InferenceContext.tsx | 2 +- packages/app/src/lib/data-mappings.ts | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index b4ccb9ef..af2d364e 100644 --- 
a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -122,7 +122,7 @@ export function InferenceProvider({ // Latency percentile applied to the chart x-axis for agentic scenarios. // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore. const [selectedPercentile, setSelectedPercentile] = useState( - () => getUrlParam('i_pctl') || 'median', + () => getUrlParam('i_pctl') || 'p99', ); const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>( () => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto', diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts index bf48c864..1b4f47c3 100644 --- a/packages/app/src/lib/data-mappings.ts +++ b/packages/app/src/lib/data-mappings.ts @@ -186,16 +186,14 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[]; /** * Percentile of the latency distribution used for the chart x-axis when * viewing agentic traces. Agentic rows carry median/p90/p99/p99.9 variants - * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only the - * two most commonly read slices (p50, p99) are surfaced in the UI. + * for ttft, ttlt (=e2el), and itl (and intvty derived from itl); only p99 + * is surfaced in the UI. 
*/ export enum Percentile { - Median = 'median', P99 = 'p99', } const PERCENTILE_CONFIG: Record = { - [Percentile.Median]: { label: 'p50 (median)' }, [Percentile.P99]: { label: 'p99' }, }; From 3c96e9137776d1c368a0acdfeee6e769d5733464 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 15 May 2026 12:31:27 -0500 Subject: [PATCH 14/96] fix(agentic): keep only p90 as the percentile option --- packages/app/src/components/inference/InferenceContext.tsx | 2 +- packages/app/src/lib/data-mappings.ts | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index 0ba14a21..accfdf9e 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -136,7 +136,7 @@ export function InferenceProvider({ // Latency percentile applied to the chart x-axis for agentic scenarios. // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore. 
const [selectedPercentile, setSelectedPercentile] = useState( - () => getUrlParam('i_pctl') || 'p99', + () => getUrlParam('i_pctl') || 'p90', ); const [scaleType, setScaleType] = useState<'auto' | 'linear' | 'log'>( () => (getUrlParam('i_scale') as 'auto' | 'linear' | 'log') || 'auto', diff --git a/packages/app/src/lib/data-mappings.ts b/packages/app/src/lib/data-mappings.ts index 0afb304a..83e6648a 100644 --- a/packages/app/src/lib/data-mappings.ts +++ b/packages/app/src/lib/data-mappings.ts @@ -191,12 +191,10 @@ export const SEQUENCE_OPTIONS = Object.keys(SEQUENCE_CONFIG) as Sequence[]; */ export enum Percentile { P90 = 'p90', - P99 = 'p99', } const PERCENTILE_CONFIG: Record = { [Percentile.P90]: { label: 'p90' }, - [Percentile.P99]: { label: 'p99' }, }; export const PERCENTILE_OPTIONS = Object.keys(PERCENTILE_CONFIG) as Percentile[]; From 642081af77c8165ac89a5177abbd6c0244dfb9c0 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Fri, 15 May 2026 13:31:30 -0400 Subject: [PATCH 15/96] fix(agentic): default percentile to p90, surface only p90/p99 Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/app/cypress/support/mock-data.ts | 2 +- .../app/src/components/inference/InferenceContext.tsx | 2 +- .../app/src/components/inference/hooks/useChartData.ts | 9 ++------- packages/app/src/components/ui/chart-selectors.tsx | 2 +- packages/app/src/lib/data-mappings.ts | 6 ++++-- packages/app/src/lib/url-state.ts | 2 +- 6 files changed, 10 insertions(+), 13 deletions(-) diff --git a/packages/app/cypress/support/mock-data.ts b/packages/app/cypress/support/mock-data.ts index f267dcc9..34b89aba 100644 --- a/packages/app/cypress/support/mock-data.ts +++ b/packages/app/cypress/support/mock-data.ts @@ -189,7 +189,7 @@ export function createMockInferenceContext( workflowInfo: null, selectedYAxisMetric: 'y_tpPerGpu', setSelectedYAxisMetric: namedStub('setSelectedYAxisMetric'), - selectedPercentile: 'median', + 
selectedPercentile: 'p90', setSelectedPercentile: namedStub('setSelectedPercentile'), selectedXAxisMetric: null, setSelectedXAxisMetric: namedStub('setSelectedXAxisMetric'), diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index accfdf9e..36dc672d 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -134,7 +134,7 @@ export function InferenceProvider({ () => getUrlParam('i_e2e_xmetric') || null, ); // Latency percentile applied to the chart x-axis for agentic scenarios. - // Values: 'median' | 'p90' | 'p99' | 'p99.9'. Non-agentic charts ignore. + // Values: 'p90' | 'p99'. Non-agentic charts ignore. const [selectedPercentile, setSelectedPercentile] = useState( () => getUrlParam('i_pctl') || 'p90', ); diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts index f2ef85ec..436fd662 100644 --- a/packages/app/src/components/inference/hooks/useChartData.ts +++ b/packages/app/src/components/inference/hooks/useChartData.ts @@ -83,7 +83,7 @@ export function useChartData( selectedRunDate?: string, enabled = true, latestAvailableDate?: string, - selectedPercentile = 'median', + selectedPercentile = 'p90', /** When set, only series for these two registry GPU keys are shown (compare pages). */ compareGpuPair?: readonly [string, string] | null, ) { @@ -261,12 +261,7 @@ export function useChartData( selectedPercentile, ) as keyof AggDataEntry; if (adjusted !== xAxisField) { - const pctlWord = - selectedPercentile === 'median' - ? 'Median' - : selectedPercentile === 'p99.9' - ? 
'P99.9' - : selectedPercentile.toUpperCase(); + const pctlWord = selectedPercentile.toUpperCase(); xAxisLabel = xAxisLabel.replace(/^(Median|Mean|P90|P99(?:\.9)?)\b/iu, pctlWord); xAxisField = adjusted; } diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx index d2940de4..e30816fa 100644 --- a/packages/app/src/components/ui/chart-selectors.tsx +++ b/packages/app/src/components/ui/chart-selectors.tsx @@ -315,7 +315,7 @@ export function PercentileSelector({ - P99 TTFT - Median TTFT + P90 TTFT
diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx index f0e1692a..78df2c37 100644 --- a/packages/app/src/components/inference/ui/ChartDisplay.tsx +++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx @@ -408,27 +408,20 @@ export default function ChartDisplay() { if ( graph.chartDefinition.chartType === 'interactivity' && isInputMetric && - selectedXAxisMetric + selectedXAxisMetric === 'p90_ttft' ) { - if (selectedXAxisMetric === 'p99_ttft') { - return 'vs. P99 Time To First Token'; - } else if (selectedXAxisMetric === 'median_ttft') { - return 'vs. Median Time To First Token'; - } + return 'vs. P90 Time To First Token'; } // For e2e chart: render clickable inline dropdown for x-axis if (graph.chartDefinition.chartType === 'e2e') { const xAxisLabel = - selectedE2eXAxisMetric === 'p99_ttft' - ? 'P99 TTFT' - : selectedE2eXAxisMetric === 'median_ttft' - ? 'Median TTFT' - : 'End-to-end Latency'; + selectedE2eXAxisMetric === 'p90_ttft' + ? 
'P90 TTFT' + : 'End-to-end Latency'; const xAxisOptions = [ { value: null, label: 'End-to-end Latency' }, - { value: 'p99_ttft', label: 'P99 TTFT' }, - { value: 'median_ttft', label: 'Median TTFT' }, + { value: 'p90_ttft', label: 'P90 TTFT' }, ]; const zoomPrefix = selectedDateRange.startDate && diff --git a/packages/app/src/components/inference/utils.test.ts b/packages/app/src/components/inference/utils.test.ts index 8f8705e1..589ba580 100644 --- a/packages/app/src/components/inference/utils.test.ts +++ b/packages/app/src/components/inference/utils.test.ts @@ -157,12 +157,12 @@ describe('processOverlayChartData', () => { }); it('remaps x to config override for input metrics on interactivity chart', () => { - // inputTputPerGpu has x override to p99_ttft on interactivity chart + // inputTputPerGpu has x override to p90_ttft on interactivity chart const data = [ pt({ x: 100, inputTputPerGpu: { y: 5, roof: false }, - p99_ttft: 0.25, + p90_ttft: 0.25, median_intvty: 50, } as any), ]; @@ -176,16 +176,11 @@ describe('processOverlayChartData', () => { pt({ x: 100, inputTputPerGpu: { y: 5, roof: false }, - median_ttft: 0.1, + p90_ttft: 0.1, median_intvty: 50, } as any), ]; - const result = processOverlayChartData( - data, - 'interactivity', - 'y_inputTputPerGpu', - 'median_ttft', - ); + const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft'); expect(result).toHaveLength(1); expect(result[0].x).toBe(0.1); }); @@ -195,76 +190,62 @@ describe('processOverlayChartData', () => { pt({ x: 100, inputTputPerGpu: { y: 5, roof: false }, - p99_ttft: 0.25, + p90_ttft: 0.25, median_e2el: 2.5, } as any), ]; const result = processOverlayChartData(data, 'e2e', 'y_inputTputPerGpu', null); expect(result).toHaveLength(1); - // e2e uses median_e2el as x (from chart config default), not p99_ttft + // e2e uses median_e2el as x (from chart config default), not p90_ttft expect(result[0].x).toBe(2.5); }); - it('remaps x to TTFT for e2e chart when 
selectedXAxisMetric is p99_ttft', () => { - const data = [ - pt({ - x: 100, - tpPerGpu: { y: 42, roof: false }, - p99_ttft: 0.35, - median_e2el: 2.5, - } as any), - ]; - const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft'); - expect(result).toHaveLength(1); - expect(result[0].x).toBe(0.35); - }); - - it('remaps x to TTFT for e2e chart when selectedXAxisMetric is median_ttft', () => { + it('remaps x to TTFT for e2e chart when selectedXAxisMetric is p90_ttft', () => { const data = [ pt({ x: 100, tpPerGpu: { y: 42, roof: false }, - median_ttft: 0.12, + p90_ttft: 0.12, median_e2el: 2.5, } as any), ]; - const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'median_ttft'); + const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft'); expect(result).toHaveLength(1); expect(result[0].x).toBe(0.12); }); it('filters e2e TTFT outliers exceeding y_latency_limit', () => { const data = [ - pt({ tpPerGpu: { y: 10, roof: false }, p99_ttft: 0.5, median_e2el: 1 } as any), - pt({ tpPerGpu: { y: 5, roof: false }, p99_ttft: 999, median_e2el: 2 } as any), + pt({ tpPerGpu: { y: 10, roof: false }, p90_ttft: 0.5, median_e2el: 1 } as any), + pt({ tpPerGpu: { y: 5, roof: false }, p90_ttft: 999, median_e2el: 2 } as any), ]; - const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p99_ttft'); + const result = processOverlayChartData(data, 'e2e', 'y_tpPerGpu', 'p90_ttft'); // y_latency_limit is 60 in the e2e chart config — the 999 outlier should be filtered expect(result).toHaveLength(1); expect(result[0].x).toBe(0.5); }); it('does not filter interactivity points by latency limit when x-axis is default', () => { - // Regression: selectedXAxisMetric defaults to 'p99_ttft' but the interactivity + // Regression: selectedXAxisMetric defaults to 'p90_ttft' but the interactivity // chart's x-axis stays median_intvty for non-input metrics. The latency limit // (60) must NOT apply to median_intvty values. 
const data = [ pt({ tpPerGpu: { y: 42, roof: false }, median_intvty: 200 } as any), pt({ tpPerGpu: { y: 10, roof: false }, median_intvty: 30 } as any), ]; - const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p99_ttft'); + const result = processOverlayChartData(data, 'interactivity', 'y_tpPerGpu', 'p90_ttft'); expect(result).toHaveLength(2); }); it('applies latency limit on interactivity only when x-axis is actually overridden', () => { - // When an input metric IS selected and x-axis overrides to p99_ttft, + // When an input metric IS selected and x-axis overrides to p90_ttft, // the latency limit should apply. const data = [ - pt({ inputTputPerGpu: { y: 5, roof: false }, p99_ttft: 0.5, median_intvty: 10 } as any), - pt({ inputTputPerGpu: { y: 3, roof: false }, p99_ttft: 999, median_intvty: 20 } as any), + pt({ inputTputPerGpu: { y: 5, roof: false }, p90_ttft: 0.5, median_intvty: 10 } as any), + pt({ inputTputPerGpu: { y: 3, roof: false }, p90_ttft: 999, median_intvty: 20 } as any), ]; - const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p99_ttft'); - // x-axis is overridden to p99_ttft for input metric — latency limit SHOULD filter 999 + const result = processOverlayChartData(data, 'interactivity', 'y_inputTputPerGpu', 'p90_ttft'); + // x-axis is overridden to p90_ttft for input metric — latency limit SHOULD filter 999 expect(result).toHaveLength(1); expect(result[0].x).toBe(0.5); }); diff --git a/packages/app/src/components/inference/utils.ts b/packages/app/src/components/inference/utils.ts index 4b5335b6..735007ab 100644 --- a/packages/app/src/components/inference/utils.ts +++ b/packages/app/src/components/inference/utils.ts @@ -88,8 +88,7 @@ export function processOverlayChartData( let xAxisField: string = chartDef.x; // selectedXAxisMetric is already the effective metric for this chart type // (interactivity uses selectedXAxisMetric, e2e uses selectedE2eXAxisMetric) - const isTtftOverride = - 
selectedXAxisMetric === 'p99_ttft' || selectedXAxisMetric === 'median_ttft'; + const isTtftOverride = selectedXAxisMetric === 'p90_ttft'; if (selectedXAxisMetric && chartDef.chartType === 'interactivity' && isInputMetric) { xAxisField = selectedXAxisMetric; diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx index e30816fa..19b4bfb0 100644 --- a/packages/app/src/components/ui/chart-selectors.tsx +++ b/packages/app/src/components/ui/chart-selectors.tsx @@ -315,7 +315,7 @@ export function PercentileSelector({ From 19b99586353cd39bccd4072bd6e2a2afcaf73367 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 26 May 2026 18:32:26 -0500 Subject: [PATCH 43/96] fix(scenario-selector): wrap Deprecated header in SelectLabel only inside Select MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous commit (b3e315c) changed DeprecatedSectionTitle to render SelectLabel internally, which throws at runtime ("SelectLabel must be used within SelectGroup") in callsites that render the header via MultiSelect — MultiSelect wraps the header in its own div, not a Radix SelectGroup. Revert the component to a plain styled span (MultiSelect's div wrapper supplies the small/muted styling), and wrap with SelectLabel only at the ScenarioSelector callsite, where the header sits directly inside a SelectGroup. Co-Authored-By: Claude Opus 4.7 --- .../app/src/components/ui/chart-selectors.tsx | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/packages/app/src/components/ui/chart-selectors.tsx b/packages/app/src/components/ui/chart-selectors.tsx index 8b91059a..49ea3f1a 100644 --- a/packages/app/src/components/ui/chart-selectors.tsx +++ b/packages/app/src/components/ui/chart-selectors.tsx @@ -31,9 +31,16 @@ import { sequenceKind, } from '@/lib/data-mappings'; +/** + * "Deprecated" sub-header used by selectors. 
Rendered as a span (not + * SelectLabel) because some callsites use `MultiSelect`, which wraps + * headers in its own div and isn't a SelectGroup. The span carries no + * styling of its own — the parent context supplies the muted/small + * treatment. ScenarioSelector renders this inside a SelectLabel directly. + */ function DeprecatedSectionTitle({ reason }: { reason: string }) { return ( - + Deprecated @@ -43,7 +50,7 @@ function DeprecatedSectionTitle({ reason }: { reason: string }) { {reason} - + ); } @@ -282,7 +289,9 @@ export function ScenarioSelector({ ))} {fixedGroups.deprecated.length > 0 && ( <> - + + + {fixedGroups.deprecated.map((seq) => ( {getSequenceLabel(seq as Sequence)} From 7114833409b92a206f7c22b80846db527e01da43 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 27 May 2026 13:22:13 -0500 Subject: [PATCH 44/96] feat(agentic-detail): add cumulative input tokens chart MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Surfaces a new chart on the agentic detail page showing the running total of input (prompt) tokens served over the course of the run — useful for seeing how the load actually accumulates vs the instantaneous prefill_tps line we already plot. Adds a `cumulativeSum` helper alongside the existing `cumulativeAverage` and `sumSeries` time-series utilities. No backfill needed — the source data (`chart_series.prefillTps`) is already pre-computed at ingest time for every blob-bearing row. (Input throughput as a Pareto axis is already wired via the existing `y_inputTputPerGpu` y-axis option; no change there.) 
Co-Authored-By: Claude Opus 4.7 --- .../agentic-point/agentic-point-detail.tsx | 24 +++++++++++++++++++ .../agentic-point/time-series-chart.tsx | 17 +++++++++++++ 2 files changed, 41 insertions(+) diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index 2e43b4fb..1a61b93b 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -26,6 +26,7 @@ import { StackedAreaChart, TimeSeriesChart, cumulativeAverage, + cumulativeSum, rollingAverage, sumSeries, } from './time-series-chart'; @@ -381,6 +382,29 @@ export function AgenticPointDetail({ id }: Props) { ); }} /> + + { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!metrics) return ; + return ( + + ); + }} + /> )} diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx index cd10aff7..042c4331 100644 --- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx @@ -58,6 +58,23 @@ export function cumulativeAverage(data: TimeSeriesPoint[]): TimeSeriesPoint[] { return out; } +/** + * Running cumulative sum of a per-interval rate series. Each output point + * is the integral of the rate from start to that point, assuming the rate + * applies over a 1-second window (aiperf's scrape interval). Use for + * "total tokens served so far" from a tokens-per-second series. 
+ */ +export function cumulativeSum(data: TimeSeriesPoint[]): TimeSeriesPoint[] { + if (data.length === 0) return data; + const out: TimeSeriesPoint[] = Array.from({ length: data.length }); + let sum = 0; + for (let i = 0; i < data.length; i++) { + sum += data[i]!.value; + out[i] = { t: data[i]!.t, value: sum }; + } + return out; +} + /** Pointwise sum of two arrays sharing the same t index. */ export function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSeriesPoint[] { const n = Math.min(a.length, b.length); From c6697de8ff3d8263924986fd71b4622f1369f9a3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 27 May 2026 14:44:19 -0500 Subject: [PATCH 45/96] feat(agentic-detail): plot cumulative unique input tokens MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the "Total input tokens over time" chart with "Total unique input tokens over time" — cumsum of (prompt-token rate − prefix-cache-hit rate per second), which equals the cumulative tokens vllm actually had to prefill from scratch (= vllm:request_prefill_kv_computed_tokens). Adds `prefixCacheHitsTps` to the chart_series JSONB (extracted by summing vllm:prefix_cache_hits.rate across all engine series, same DP-aware path as prefillTps). Bumps CHART_SERIES_VERSION to 3; the existing trace-server-metrics query defaults the field to [] for any older v2 rows so reads stay safe before backfill catches up. Backfilled 62 rows to v3. 
Co-Authored-By: Claude Opus 4.7 --- .../agentic-point/agentic-point-detail.tsx | 14 +++++++++++--- .../src/hooks/api/use-trace-server-metrics.ts | 2 ++ packages/db/src/etl/compute-chart-series.ts | 16 +++++++++++++++- packages/db/src/queries/trace-server-metrics.ts | 4 ++++ 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index 1a61b93b..4bebd37c 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -384,16 +384,24 @@ export function AgenticPointDetail({ id }: Props) { /> { const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; if (!metrics) return ; + // Unique = total prompt tokens vllm received minus the tokens + // it served from the prefix cache. The cache-miss portion is + // what actually constitutes "new content" the GPU had to + // process — equivalent to cumsum of vllm:request_prefill_kv_computed_tokens. + const unique = sumSeries( + metrics.prefillTps, + metrics.prefixCacheHitsTps.map((p) => ({ t: p.t, value: -p.value })), + ); return ( ; prefillTps: TimeSeriesPoint[]; decodeTps: TimeSeriesPoint[]; + /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). 
*/ + prefixCacheHitsTps: TimeSeriesPoint[]; } async function fetchTraceServerMetrics( diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts index 530600cf..91e89521 100644 --- a/packages/db/src/etl/compute-chart-series.ts +++ b/packages/db/src/etl/compute-chart-series.ts @@ -25,8 +25,11 @@ import { streamObject } from 'stream-json/streamers/stream-object.js'; * only series[0], which under-counted by Nx on multi-engine DP/PP * deployments — most visible as a request-queue-depth chart that maxed out * at ~3 when the timeline clearly showed 20+ in-flight). + * + * v3: extract `prefixCacheHitsTps` so the detail page can derive cumulative + * unique input tokens as cumsum(prefillTps - prefixCacheHitsTps). */ -export const CHART_SERIES_VERSION = 2; +export const CHART_SERIES_VERSION = 3; export interface TimeSeriesPoint { /** Seconds from benchmark start. */ @@ -57,6 +60,13 @@ export interface ChartSeries { promptTokensBySource: Record; prefillTps: TimeSeriesPoint[]; decodeTps: TimeSeriesPoint[]; + /** + * Per-scrape rate (tokens/sec) of vllm:prefix_cache_hits, summed across + * engines. Detail page derives "cumulative unique input tokens" as + * cumsum(prefillTps - prefixCacheHitsTps) — what the cache actually + * saved vs the raw queries that came in. + */ + prefixCacheHitsTps: TimeSeriesPoint[]; } // ── Raw blob shapes (subset we read) ──────────────────────────────────── @@ -249,6 +259,9 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { })); const prefillTps = counterRate('vllm:prompt_tokens'); const decodeTps = counterRate('vllm:generation_tokens'); + // Tokens served from prefix cache per scrape. Lets the frontend derive + // "cumulative unique input tokens served" = cumsum(prefillTps) − cumsum(hits). + const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits'); // Per-source prompt tokens — sum across engines per source label. 
const promptBySrcByT = new Map>(); @@ -286,5 +299,6 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { promptTokensBySource, prefillTps, decodeTps, + prefixCacheHitsTps, }; } diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts index 624b6ed3..76775e77 100644 --- a/packages/db/src/queries/trace-server-metrics.ts +++ b/packages/db/src/queries/trace-server-metrics.ts @@ -71,6 +71,8 @@ export interface TraceServerMetrics { prefillTps: TimeSeriesPoint[]; /** Decode throughput: vllm:generation_tokens rate (tokens/sec) per scrape. */ decodeTps: TimeSeriesPoint[]; + /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */ + prefixCacheHitsTps: TimeSeriesPoint[]; } interface RawMetaRow extends PointMeta { @@ -114,6 +116,8 @@ function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics { promptTokensBySource: series.promptTokensBySource, prefillTps: series.prefillTps, decodeTps: series.decodeTps, + // v2 chart_series rows pre-backfill don't have this field — default to [] + prefixCacheHitsTps: series.prefixCacheHitsTps ?? [], }; } From b5679bb10acfd6a6765b48a5864b2a0ec73d4915 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 27 May 2026 15:00:12 -0500 Subject: [PATCH 46/96] feat(request-timeline): expandable subagent -> stream rows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The harness fans a single subagent into multiple parallel ":sN" streams when its inner requests overlap in time (weka_trace._pack_into_streams). Previously each :sN got its own swimlane row, which made one parent conversation with 5 subagents (each fanned into 2-8 streams) render as 23 separate rows — visually implying 23 distinct subagent invocations when really there are 5. Now: each subagent shows as one row by default with a chevron + stream count chip ("subagent 003 · f1e7 ×8"). 
The collapsed row draws the union of all stream bars overlaid, so the concurrency burst is still visible at a glance. Click the chevron to fan into per-stream rows; click again to collapse. For conv 0f5b266f in benchmark 206360: 23 rows → 5 rows by default. Co-Authored-By: Claude Opus 4.7 --- .../agentic-point/request-timeline.tsx | 325 ++++++++++++------ 1 file changed, 226 insertions(+), 99 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx index bcbe105a..8762a158 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx +++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx @@ -53,44 +53,84 @@ const PHASE_COLORS: Record = { unknown: '#64748b', }; +/** + * Row kinds: + * parent — top-level conversation (depth 0) + * worker — worker swimlane (depth 0, worker mode) + * subagent — a subagent invocation (depth 1). Either a single + * stream (renders its own bars), or a multi-stream + * container whose bars are the union of its streams + * when collapsed. + * stream — one :sN stream of a multi-stream subagent (depth 2). + * Hidden by default; toggled in via the parent's chevron. + */ +type RowKind = 'parent' | 'worker' | 'subagent' | 'stream'; + interface Row { key: string; label: string; color: string; requests: RequestRecord[]; - /** 0 = top-level conversation/worker, 1+ = sub-agent under its parent. */ depth: number; - /** True if this row is a sub-agent ("Subagent N of parent X"). */ - isSubagent: boolean; + kind: RowKind; + /** Number of streams under this subagent (>=1). Only set for subagent rows. */ + streamCount?: number; + /** For stream rows: the parent subagent's row key (drives expand/collapse). 
*/ + parentRowKey?: string; } /** * Conversation ids for subagent calls look like - * ::sa:subagent__ - * Split into the parent cid and a sub-agent label (or the whole thing if - * this is a top-level conversation). + * ::sa:[:s] + * The optional `:s` suffix is set when the harness fans a single + * subagent into multiple parallel "streams" (interval-graph + * decomposition in weka_trace._pack_into_streams). We split it off so + * we can group all streams of one subagent under a single header row. */ -function splitCid(cid: string): { parent: string; subagent: string | null } { +function splitCid(cid: string): { + parent: string; + subagentBase: string | null; + stream: number | null; +} { const sep = cid.indexOf('::sa:'); - if (sep === -1) return { parent: cid, subagent: null }; - return { parent: cid.slice(0, sep), subagent: cid.slice(sep + 5) }; + if (sep === -1) return { parent: cid, subagentBase: null, stream: null }; + const parent = cid.slice(0, sep); + const raw = cid.slice(sep + 5); + const m = /^(.*):s(\d+)$/.exec(raw); + if (m) return { parent, subagentBase: m[1]!, stream: Number(m[2]) }; + return { parent, subagentBase: raw, stream: null }; } -/** Group requests into rows; in conversation mode subagents nest under parents. */ -function buildRows(requests: RequestRecord[], mode: RowMode): Row[] { - const groups = new Map(); - for (const r of requests) { - const key = mode === 'conversation' ? r.cid : r.wid; - let list = groups.get(key); - if (!list) { - list = []; - groups.set(key, list); - } - list.push(r); - } - +/** + * Group requests into rows. In conversation mode, output order is: + * parent_conv + * subagent_001 (collapsed by default, container) + * :s0 (hidden unless expanded) + * :s1 + * subagent_002 + * ... + * + * `expandedSubagents` controls which subagent containers reveal their + * stream children. 
Bars on a collapsed subagent are the UNION of all its + * streams' requests — overlapping bars visually communicate the + * stream-level parallelism without expanding. + */ +function buildRows( + requests: RequestRecord[], + mode: RowMode, + expandedSubagents: ReadonlySet, +): Row[] { if (mode !== 'conversation') { // Worker mode: flat rows, sorted by first activity. + const groups = new Map(); + for (const r of requests) { + let list = groups.get(r.wid); + if (!list) { + list = []; + groups.set(r.wid, list); + } + list.push(r); + } const rows: Row[] = []; let i = 0; for (const [key, list] of groups) { @@ -101,7 +141,7 @@ function buildRows(requests: RequestRecord[], mode: RowMode): Row[] { color: ROW_COLORS[i % ROW_COLORS.length]!, requests: list, depth: 0, - isSubagent: false, + kind: 'worker', }); i++; } @@ -109,36 +149,40 @@ function buildRows(requests: RequestRecord[], mode: RowMode): Row[] { return rows; } - // Conversation mode: build a parent → [subagents] tree so each parent - // group renders as one parent row followed by its sub-agent rows. Color - // is shared inside a tree so the visual grouping reads. + // Conversation mode — tree: parent → subagent → stream. 
interface Tree { parentCid: string; - parentRow: { key: string; requests: RequestRecord[] } | null; - subagents: Map; // subagent label → requests + parentReqs: RequestRecord[]; + // subagentBase → (streamIndex|null → requests) + subagents: Map>; firstStart: number; } const trees = new Map(); - for (const [cid, list] of groups) { - list.sort((a, b) => a.start - b.start); - const { parent, subagent } = splitCid(cid); + for (const r of requests) { + const { parent, subagentBase, stream } = splitCid(r.cid); let tree = trees.get(parent); if (!tree) { tree = { parentCid: parent, - parentRow: null, + parentReqs: [], subagents: new Map(), firstStart: Number.POSITIVE_INFINITY, }; trees.set(parent, tree); } - if (subagent === null) { - tree.parentRow = { key: cid, requests: list }; + if (subagentBase === null) { + tree.parentReqs.push(r); } else { - tree.subagents.set(subagent, list); + let saMap = tree.subagents.get(subagentBase); + if (!saMap) { + saMap = new Map(); + tree.subagents.set(subagentBase, saMap); + } + const list = saMap.get(stream); + if (list) list.push(r); + else saMap.set(stream, [r]); } - const earliest = list[0]!.start; - if (earliest < tree.firstStart) tree.firstStart = earliest; + if (r.start < tree.firstStart) tree.firstStart = r.start; } const sortedTrees = [...trees.values()].toSorted((a, b) => a.firstStart - b.firstStart); @@ -147,39 +191,66 @@ function buildRows(requests: RequestRecord[], mode: RowMode): Row[] { for (const tree of sortedTrees) { const color = ROW_COLORS[colorIdx % ROW_COLORS.length]!; colorIdx++; - if (tree.parentRow) { + // Parent row (use a placeholder key if the parent itself wasn't replayed). + tree.parentReqs.sort((a, b) => a.start - b.start); + rows.push({ + key: tree.parentReqs.length > 0 ? tree.parentCid : `__parent_${tree.parentCid}`, + label: tree.parentCid, + color, + requests: tree.parentReqs, + depth: 0, + kind: 'parent', + }); + + // One subagent row per base (which may contain N streams). 
+ const subagentEntries = [...tree.subagents.entries()].toSorted((a, b) => { + const aStart = Math.min( + ...[...a[1].values()].map((reqs) => reqs[0]?.start ?? Number.POSITIVE_INFINITY), + ); + const bStart = Math.min( + ...[...b[1].values()].map((reqs) => reqs[0]?.start ?? Number.POSITIVE_INFINITY), + ); + return aStart - bStart; + }); + for (const [saBase, streams] of subagentEntries) { + const subagentKey = `${tree.parentCid}::sa:${saBase}`; + // Union of all stream requests for collapsed-view bars. + const allReqs: RequestRecord[] = []; + for (const reqs of streams.values()) allReqs.push(...reqs); + allReqs.sort((a, b) => a.start - b.start); + const streamCount = streams.size; rows.push({ - key: tree.parentRow.key, - label: shortenCid(tree.parentCid), + key: subagentKey, + label: `↳ ${formatSubagentLabel(saBase)}`, color, - requests: tree.parentRow.requests, - depth: 0, - isSubagent: false, - }); - } else { - // Pseudo-parent header so orphan subagents still render under - // something they belong to. - rows.push({ - key: `__parent_${tree.parentCid}`, - label: shortenCid(tree.parentCid), - color, - requests: [], - depth: 0, - isSubagent: false, - }); - } - const subagentEntries = [...tree.subagents.entries()].toSorted( - (a, b) => a[1][0]!.start - b[1][0]!.start, - ); - for (const [saLabel, list] of subagentEntries) { - rows.push({ - key: `${tree.parentCid}::${saLabel}`, - label: `↳ ${formatSubagentLabel(saLabel)}`, - color, - requests: list, + requests: allReqs, depth: 1, - isSubagent: true, + kind: 'subagent', + streamCount, }); + + // Stream children only when expanded AND there's more than one + // stream (a single-stream subagent has nothing extra to show). + if (streamCount > 1 && expandedSubagents.has(subagentKey)) { + const streamEntries = [...streams.entries()].toSorted((a, b) => { + // Sort by stream index (null first as the "default" stream) + const ai = a[0] ?? -1; + const bi = b[0] ?? 
-1; + return ai - bi; + }); + for (const [streamIdx, reqs] of streamEntries) { + reqs.sort((a, b) => a.start - b.start); + rows.push({ + key: `${subagentKey}:s${streamIdx ?? '∅'}`, + label: `stream ${streamIdx ?? '∅'}`, + color, + requests: reqs, + depth: 2, + kind: 'stream', + parentRowKey: subagentKey, + }); + } + } } } return rows; @@ -192,11 +263,6 @@ function formatSubagentLabel(raw: string): string { return `subagent ${m[1]} · ${m[2]!.slice(0, 4)}`; } -function shortenCid(cid: string): string { - if (cid.length <= 12) return cid; - return `${cid.slice(0, 8)}…${cid.slice(-4)}`; -} - function shortenWid(wid: string): string { // worker_4ae87bea → w_4ae8 return wid.replace(/^worker_/, 'w_').slice(0, 12); @@ -314,6 +380,17 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { const [rowMode, setRowMode] = useState('conversation'); const [phaseFilter, setPhaseFilter] = useState('profiling'); const [tooltip, setTooltip] = useState(null); + // Which multi-stream subagents currently have their per-stream rows + // expanded. Key is the subagent row's `key` (parent_cid::sa:agent_id). + const [expandedSubagents, setExpandedSubagents] = useState>(() => new Set()); + const toggleSubagent = useCallback((key: string) => { + setExpandedSubagents((prev) => { + const next = new Set(prev); + if (next.has(key)) next.delete(key); + else next.add(key); + return next; + }); + }, []); const dragRef = useRef<{ startX: number; vs: number; ve: number } | null>(null); // Apply phase filter, then group into rows. @@ -322,7 +399,10 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { phaseFilter === 'all' ? 
data.requests : data.requests.filter((r) => r.phase === 'profiling'), [data.requests, phaseFilter], ); - const rows = useMemo(() => buildRows(filtered, rowMode), [filtered, rowMode]); + const rows = useMemo( + () => buildRows(filtered, rowMode, expandedSubagents), + [filtered, rowMode, expandedSubagents], + ); // Pre-sort the timestamp columns so the cursor-time stats popover can // count "running / waiting at time t" in O(log n). With a few hundred @@ -359,7 +439,10 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { const isZoomed = viewEnd !== null; // Layout - const LABEL_WIDTH = 160; + // Wide enough for a full 36-char conversation id at 10px font, plus the + // indent + color stripe + count badge. Subagent rows inherit the same + // width but truncate the longer "↳ subagent N · hash" tail with ellipsis. + const LABEL_WIDTH = 360; const ROW_HEIGHT = 22; const ROW_GAP = 3; const HEADER_HEIGHT = 24; @@ -537,33 +620,58 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { {rowMode === 'conversation' ? 'Conversation' : 'Worker'} - {rows.map((row) => ( -
- { + const isSubagentRow = row.kind === 'subagent'; + const isStreamRow = row.kind === 'stream'; + const isExpandable = isSubagentRow && (row.streamCount ?? 1) > 1; + const isExpanded = isExpandable && expandedSubagents.has(row.key); + return ( +
- - {row.label} - - - {row.requests.length > 0 ? row.requests.length : '—'} - -
- ))} + {isExpandable ? ( + + ) : ( + + )} + + + {row.label} + {isExpandable && ( + ×{row.streamCount} + )} + + + {row.requests.length > 0 ? row.requests.length : '—'} + +
+ ); + })} {/* Scrollable SVG */} @@ -636,6 +744,16 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { {rows.map((row, rowIdx) => { const yTop = HEADER_HEIGHT + rowIdx * (ROW_HEIGHT + ROW_GAP) + 2; const barH = ROW_HEIGHT - 4; + // For multi-stream subagent containers, suppress the union + // bars when expanded — the child stream rows draw them + // individually instead, so we'd double-draw otherwise. + if ( + row.kind === 'subagent' && + (row.streamCount ?? 1) > 1 && + expandedSubagents.has(row.key) + ) { + return null; + } return row.requests.map((req) => { const xCredit = xOf(req.credit); const xStart = xOf(req.start); @@ -663,7 +781,8 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { opacity={0.35} /> )} - {/* Main bar */} + {/* Main bar — opacity stepped down with depth so + parent > subagent > stream reads visually. */} {/* Phase strip at bottom */} Date: Wed, 27 May 2026 15:07:27 -0500 Subject: [PATCH 47/96] fix(agentic-detail): make unique-input-tokens chart monotonic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vllm's per-scrape prompt_tokens.rate and prefix_cache_hits.rate counters can lag each other by several seconds across scrapes (we see prefill=0 at one tick with hits=1.1M, then prefill=1.5M with hits=452K six ticks later — lifetime totals agree but per-tick they don't). Computing cumsum(prefill - hits) per tick made the chart dip well negative at the start. Replaces the per-tick subtraction with `cumulativeDifferenceMonotonic`: union the two series by timestamp, accumulate each independently, take the diff, then enforce a running max so the curve never decreases. End-of-run totals are unchanged (both counters converge to the right value); transient skew just looks like a brief plateau instead of a negative dip. 
Co-Authored-By: Claude Opus 4.7 --- .../agentic-point/agentic-point-detail.tsx | 21 ++++++----- .../agentic-point/time-series-chart.tsx | 37 +++++++++++++++++++ 2 files changed, 48 insertions(+), 10 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index 4bebd37c..1abf64e6 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -26,7 +26,7 @@ import { StackedAreaChart, TimeSeriesChart, cumulativeAverage, - cumulativeSum, + cumulativeDifferenceMonotonic, rollingAverage, sumSeries, } from './time-series-chart'; @@ -388,20 +388,21 @@ export function AgenticPointDetail({ id }: Props) { render={(expanded) => { const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; if (!metrics) return ; - // Unique = total prompt tokens vllm received minus the tokens - // it served from the prefix cache. The cache-miss portion is - // what actually constitutes "new content" the GPU had to - // process — equivalent to cumsum of vllm:request_prefill_kv_computed_tokens. - const unique = sumSeries( - metrics.prefillTps, - metrics.prefixCacheHitsTps.map((p) => ({ t: p.t, value: -p.value })), - ); + // Unique = total prompt tokens received minus tokens served + // from the prefix cache. Equivalent to cumsum of + // vllm:request_prefill_kv_computed_tokens. We compute it as + // monotonic-non-decreasing cumulative-diff so per-scrape + // timing skew between the prompt_tokens and prefix_cache_hits + // counters can't make the line dip negative. 
return ( [p.t, p.value])); + const bByT = new Map(b.map((p) => [p.t, p.value])); + const allT = [...new Set([...aByT.keys(), ...bByT.keys()])].toSorted((x, y) => x - y); + const out: TimeSeriesPoint[] = Array.from({ length: allT.length }); + let cumA = 0; + let cumB = 0; + let runningMax = 0; + for (let i = 0; i < allT.length; i++) { + const t = allT[i]!; + cumA += aByT.get(t) ?? 0; + cumB += bByT.get(t) ?? 0; + const diff = cumA - cumB; + if (diff > runningMax) runningMax = diff; + out[i] = { t, value: runningMax }; + } + return out; +} + /** Pointwise sum of two arrays sharing the same t index. */ export function sumSeries(a: TimeSeriesPoint[], b: TimeSeriesPoint[]): TimeSeriesPoint[] { const n = Math.min(a.length, b.length); From 08bbe6650c73935d7ac7a9fa29a722b141911bc9 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 27 May 2026 15:15:05 -0500 Subject: [PATCH 48/96] feat(agentic-detail): add unique input tokens in flight chart New chart on the per-point view that plots the deduped count of input tokens currently held by in-flight requests, as a 30s time- weighted rolling average with the raw step series rendered as faint scatter behind it. Useful for seeing the working set the model has to hold KV cache for at any instant. Computation (frontend, from request_timeline): - At each request start/end event, maintain active ISL per cid (within one cid turns are sequential, so each cid contributes at most one in-flight ISL at a time) - total_in_flight(t) = sum over cids with active request of that cid's current ISL - Across cids we treat content as independent (cross-conv prefix sharing measured at <1 pp, so summing is a tight approximation) Adds timeRollingAverage helper: time-weighted (vs sample-count) moving average suitable for irregularly-sampled event series like this one. 
Co-Authored-By: Claude Opus 4.7 --- .../agentic-point/agentic-point-detail.tsx | 43 ++++++++- .../agentic-point/time-series-chart.tsx | 96 +++++++++++++++++++ 2 files changed, 137 insertions(+), 2 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index 1abf64e6..2db2809b 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -27,8 +27,10 @@ import { TimeSeriesChart, cumulativeAverage, cumulativeDifferenceMonotonic, + inflightUniqueTokens, rollingAverage, sumSeries, + timeRollingAverage, } from './time-series-chart'; interface Props { @@ -124,8 +126,10 @@ export function AgenticPointDetail({ id }: Props) { // shows how the metric varies across the SKU. const siblingIds = siblingsData?.siblings.map((s) => s.id) ?? []; const aggregatesQuery = useAgenticAggregates(siblingIds, view === 'aggregates'); - // Per-request timeline fetched only when the timeline view is active. - const timelineQuery = useRequestTimeline(id, view === 'timeline'); + // Per-request timeline used by both the timeline view AND the per-point + // "Unique input tokens in flight" chart, so fetch whenever we're on + // either view. + const timelineQuery = useRequestTimeline(id, view === 'timeline' || view === 'point'); return (
@@ -414,6 +418,41 @@ export function AgenticPointDetail({ id }: Props) { ); }} /> + + { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!timelineQuery.data) { + return timelineQuery.isLoading ? : ; + } + // Step function: at each request start/end, sum the ISLs of + // currently-active requests across distinct cids. Within one + // cid turns are sequential so each cid contributes at most + // one in-flight ISL; across cids we treat content as + // independent (cross-conv prefix sharing adds <1pp in + // practice). Smooth with a 30s time-weighted rolling average + // so brief turn-handoff dips don't dominate the chart. + const raw = inflightUniqueTokens(timelineQuery.data.requests); + const smoothed = timeRollingAverage(raw, 30); + return ( + + ); + }} + />
)} diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx index 25d5a672..520b3ed6 100644 --- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx @@ -27,6 +27,39 @@ interface TimeSeriesChartProps { height?: number; } +/** + * Time-weighted rolling average over a `windowS`-second trailing window. + * Treats the input as a step function (value held constant between + * samples) and integrates over the trailing window, dividing by the + * window length. Good for smoothing irregularly-sampled event series + * (e.g. request start/end events) where the regular sample-count + * `rollingAverage` would over-weight bursts of close-together events. + */ +export function timeRollingAverage(data: TimeSeriesPoint[], windowS: number): TimeSeriesPoint[] { + if (data.length === 0 || windowS <= 0) return data; + const out: TimeSeriesPoint[] = Array.from({ length: data.length }); + for (let i = 0; i < data.length; i++) { + const tEnd = data[i]!.t; + const tStart = Math.max(0, tEnd - windowS); + // Find the first sample j whose t is >= tStart; the step value at + // tStart is data[j-1].value if j > 0, else data[0].value. + let j = 0; + while (j < data.length && data[j]!.t < tStart) j++; + let prevT = tStart; + let prevV = j > 0 ? data[j - 1]!.value : data[0]!.value; + let area = 0; + for (; j <= i; j++) { + const curT = data[j]!.t; + area += prevV * (curT - prevT); + prevT = curT; + prevV = data[j]!.value; + } + const dur = tEnd - tStart; + out[i] = { t: tEnd, value: dur > 0 ? area / dur : data[i]!.value }; + } + return out; +} + /** Centered rolling average over `windowSize` samples. 
*/ export function rollingAverage(data: TimeSeriesPoint[], windowSize: number): TimeSeriesPoint[] { if (data.length === 0 || windowSize <= 1) return data; @@ -75,6 +108,69 @@ export function cumulativeSum(data: TimeSeriesPoint[]): TimeSeriesPoint[] { return out; } +/** + * Per-event step series: at each request start/end, sum the ISLs of + * currently-active requests across distinct `cid`s. Within a single + * `cid` aiperf dispatches turns sequentially (turn N+1 waits for N), + * so each cid contributes at most one in-flight ISL at a time. Across + * different cids we assume content is independent (parent ↔ subagent + * and conv ↔ conv share negligible prefix in practice — cross-conv + * dedup added ~0.25 pp to theoretical hit rate, so treating them as + * independent is a tight approximation of the true in-flight unique + * token count). + * + * Output is a step function: one point per event, value held constant + * until the next event. Time axis is seconds from the request_timeline + * origin (event timestamps are not re-based to the earliest event). + */ +export function inflightUniqueTokens( + requests: readonly { cid: string; start: number; end: number; isl: number | null }[], +): TimeSeriesPoint[] { + if (requests.length === 0) return []; + // The request_timeline timestamps are ns-relative to its own origin. + // Convert events to seconds and emit a step series. + interface Event { + tNs: number; + kind: 'start' | 'end'; + cid: string; + isl: number; + } + const events: Event[] = []; + for (const r of requests) { + const isl = r.isl ?? 0; + if (isl <= 0) continue; + events.push({ tNs: r.start, kind: 'start', cid: r.cid, isl }); + events.push({ tNs: r.end, kind: 'end', cid: r.cid, isl }); + } + if (events.length === 0) return []; + // Sort by time; on ties, process 'end' before 'start' so a same-instant + // turn handoff within one cid doesn't transiently double-count. + events.sort((a, b) => a.tNs - b.tNs || (a.kind === 'end' ?
-1 : 1)); + + // Active ISL per cid (max in case the same cid somehow has overlapping + // events; in practice it's always 0 or 1 request at a time per cid). + const activeByCid = new Map(); + let total = 0; + const out: TimeSeriesPoint[] = [{ t: 0, value: 0 }]; + for (const e of events) { + const tSec = e.tNs / 1e9; + if (e.kind === 'start') { + const prev = activeByCid.get(e.cid) ?? 0; + const next = Math.max(prev, e.isl); + activeByCid.set(e.cid, next); + total += next - prev; + } else { + const cur = activeByCid.get(e.cid) ?? 0; + if (cur > 0) { + total -= cur; + activeByCid.delete(e.cid); + } + } + out.push({ t: tSec, value: Math.max(0, total) }); + } + return out; +} + /** * Monotonic-non-decreasing cumulative difference of two rate series: * for each unique timestamp, compute Σa[0..t] − Σb[0..t], then enforce From 7561deb1cc5a210ce6cd074ab0d4771b3b9f8342 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 27 May 2026 20:30:39 -0500 Subject: [PATCH 49/96] feat(chart-series): extract SGLang metrics alongside vllm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Our chart_series + aggregate_stats extractors hardcoded vllm:* metric names, so SGLang runs (e.g. qwen3.5/h100/sglang) ingested cleanly but the per-point detail page rendered empty charts — chart_series fields were all zero-length arrays. Adds fallback chains in each extractor: KV cache util vllm:kv_cache_usage_perc → sglang:token_usage Prefix cache hits vllm:prefix_cache_hits → sglang:cached_tokens Prefix cache qrys vllm:prefix_cache_queries → sglang:prompt_tokens Requests running vllm:num_requests_running → sglang:num_running_reqs Requests waiting vllm:num_requests_waiting → sglang:num_queue_reqs Prompt tokens rate vllm:prompt_tokens → sglang:prompt_tokens Generation rate vllm:generation_tokens → sglang:generation_tokens The `pickFirstNonEmpty` helper walks the chain and uses whichever series has data, so a future framework (mori-sglang, dynamo, etc.) 
can plug in by adding its names to each chain — no per-framework branching. CHART_SERIES_VERSION → 4, STATS_VERSION → 3. Both backfills re-ran (86 chart_series rows, 190 aggregate_stats rows). SGLang chart_series for qwen3.5 run 944 verified — was 0-length arrays before, now ~1800 samples each. Co-Authored-By: Claude Opus 4.7 --- packages/db/src/etl/compute-chart-series.ts | 67 +++++++++++++++---- packages/db/src/queries/agentic-aggregates.ts | 56 +++++++++++++--- 2 files changed, 98 insertions(+), 25 deletions(-) diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts index 91e89521..86b79925 100644 --- a/packages/db/src/etl/compute-chart-series.ts +++ b/packages/db/src/etl/compute-chart-series.ts @@ -28,8 +28,11 @@ import { streamObject } from 'stream-json/streamers/stream-object.js'; * * v3: extract `prefixCacheHitsTps` so the detail page can derive cumulative * unique input tokens as cumsum(prefillTps - prefixCacheHitsTps). + * + * v4: extract sglang:* metrics too (fallback chain in each picker), so + * SGLang runs populate the chart_series the same way vllm runs do. */ -export const CHART_SERIES_VERSION = 3; +export const CHART_SERIES_VERSION = 4; export interface TimeSeriesPoint { /** Seconds from benchmark start. */ @@ -89,8 +92,13 @@ interface RawMetric { type MetricsMap = Record; -/** The set of metric subtrees the chart consumes. */ +/** + * The set of metric subtrees the chart consumes. Includes both vllm:* and + * sglang:* names so the stream-parse fallback collects whichever framework + * the blob was emitted by — `buildSeriesFromMetrics` then picks per metric. 
+ */ const CHART_METRIC_KEYS = new Set([ + // vLLM 'vllm:kv_cache_usage_perc', 'vllm:gpu_cache_usage_perc', 'vllm:prefix_cache_hits', @@ -100,6 +108,13 @@ const CHART_METRIC_KEYS = new Set([ 'vllm:prompt_tokens', 'vllm:generation_tokens', 'vllm:prompt_tokens_by_source', + // SGLang + 'sglang:token_usage', + 'sglang:cached_tokens', + 'sglang:prompt_tokens', + 'sglang:generation_tokens', + 'sglang:num_running_reqs', + 'sglang:num_queue_reqs', ]); /** @@ -220,18 +235,37 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { if (!Number.isFinite(startNs)) startNs = 0; const tOf = (ns: number) => (ns - startNs) / 1e9; + // Pick the first metric name whose series array has any data; fallback + // chain lets the same code path serve both vllm:* and sglang:* blobs. + const pickSeries = (...names: string[]): readonly RawSeries[] | undefined => { + for (const name of names) { + const s = metrics[name]?.series; + if (s && s.length > 0) return s; + } + return undefined; + }; + // KV cache usage (gauge, 0..1) — average across engines so the value // stays a fraction (each engine has its own KV pool). - const kvSeries = - metrics['vllm:kv_cache_usage_perc']?.series ?? metrics['vllm:gpu_cache_usage_perc']?.series; + const kvSeries = pickSeries( + 'vllm:kv_cache_usage_perc', + 'vllm:gpu_cache_usage_perc', + 'sglang:token_usage', + ); const kvCacheUsage: TimeSeriesPoint[] = sortedEntries( aggregateByStart(kvSeries, 'avg', 'avg'), ).map(([t, v]) => ({ t: tOf(t), value: v })); // Prefix cache hit rate per scrape: Σhits.rate / Σqueries.rate across - // engines, joined on start_ns. - const hitsByT = aggregateByStart(metrics['vllm:prefix_cache_hits']?.series, 'rate', 'sum'); - const qsByT = aggregateByStart(metrics['vllm:prefix_cache_queries']?.series, 'rate', 'sum'); + // engines, joined on start_ns. SGLang names: cached_tokens / prompt_tokens. 
+ const hitsSeries = pickSeries('vllm:prefix_cache_hits', 'sglang:cached_tokens'); + const qsSeries = pickSeries( + 'vllm:prefix_cache_queries', + 'vllm:prompt_tokens', + 'sglang:prompt_tokens', + ); + const hitsByT = aggregateByStart(hitsSeries, 'rate', 'sum'); + const qsByT = aggregateByStart(qsSeries, 'rate', 'sum'); const prefixCacheHitRate: TimeSeriesPoint[] = []; for (const [t, h] of [...hitsByT.entries()].toSorted((a, b) => a[0] - b[0])) { const q = qsByT.get(t); @@ -239,8 +273,10 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { } // Queue depth: sum running + waiting across engines per timeslice. - const runByT = aggregateByStart(metrics['vllm:num_requests_running']?.series, 'avg', 'sum'); - const waitByT = aggregateByStart(metrics['vllm:num_requests_waiting']?.series, 'avg', 'sum'); + const runSeries = pickSeries('vllm:num_requests_running', 'sglang:num_running_reqs'); + const waitSeries = pickSeries('vllm:num_requests_waiting', 'sglang:num_queue_reqs'); + const runByT = aggregateByStart(runSeries, 'avg', 'sum'); + const waitByT = aggregateByStart(waitSeries, 'avg', 'sum'); const queueDepth: QueueDepthPoint[] = []; // Union of timestamps so we surface activity even if one of the gauges // didn't report a sample on a given tick. @@ -252,16 +288,19 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { } // Throughput: sum the counter `rate` (already per-second) across engines. - const counterRate = (name: string): TimeSeriesPoint[] => - sortedEntries(aggregateByStart(metrics[name]?.series, 'rate', 'sum')).map(([t, v]) => ({ + // Takes a fallback chain so vllm:* and sglang:* both work. 
+ const counterRate = (...names: string[]): TimeSeriesPoint[] => { + const s = pickSeries(...names); + return sortedEntries(aggregateByStart(s, 'rate', 'sum')).map(([t, v]) => ({ t: tOf(t), value: v, })); - const prefillTps = counterRate('vllm:prompt_tokens'); - const decodeTps = counterRate('vllm:generation_tokens'); + }; + const prefillTps = counterRate('vllm:prompt_tokens', 'sglang:prompt_tokens'); + const decodeTps = counterRate('vllm:generation_tokens', 'sglang:generation_tokens'); // Tokens served from prefix cache per scrape. Lets the frontend derive // "cumulative unique input tokens served" = cumsum(prefillTps) − cumsum(hits). - const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits'); + const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits', 'sglang:cached_tokens'); // Per-source prompt tokens — sum across engines per source label. const promptBySrcByT = new Map>(); diff --git a/packages/db/src/queries/agentic-aggregates.ts b/packages/db/src/queries/agentic-aggregates.ts index 1ad7fd7f..da5d18a0 100644 --- a/packages/db/src/queries/agentic-aggregates.ts +++ b/packages/db/src/queries/agentic-aggregates.ts @@ -32,8 +32,12 @@ import type { DbClient } from '../connection.js'; * * v2: aggregate vllm gauges/counters across all engine series (was reading * only series[0], which under-counted by Nx on multi-engine DP/PP deployments). + * + * v3: extract sglang:* metrics too — kv_cache_util + prefix_cache_hit_rate + * populate for SGLang runs (qwen3.5/h100, mi355x sglang, etc.) the same way + * they do for vllm runs. */ -export const STATS_VERSION = 2; +export const STATS_VERSION = 3; export interface MetricPercentiles { mean: number; @@ -199,6 +203,18 @@ function aggregateSeriesByStart( * Aggregates across all engine series so multi-engine DP/PP deployments are * counted correctly (previously we only read engine 0). */ +/** First metric whose series array is non-empty; supports vllm/sglang fallback. 
*/ +function pickFirstNonEmpty( + metrics: Record, + ...names: string[] +): Series[] | undefined { + for (const name of names) { + const s = metrics[name]?.series; + if (s && s.length > 0) return s; + } + return undefined; +} + export function extractServerMetricSamples(json: string): { kvCacheUtil: number[]; prefixCacheHitRate: number[]; @@ -208,17 +224,29 @@ export function extractServerMetricSamples(json: string): { // KV cache util — per-engine gauge in [0, 1]. Average across engines so the // value stays a percentage; summing would give meaningless 0..N. - const kvSeriesAll = - metrics['vllm:kv_cache_usage_perc']?.series ?? metrics['vllm:gpu_cache_usage_perc']?.series; + const kvSeriesAll = pickFirstNonEmpty( + metrics, + 'vllm:kv_cache_usage_perc', + 'vllm:gpu_cache_usage_perc', + 'sglang:token_usage', + ); const kvCacheUtil = [...aggregateSeriesByStart(kvSeriesAll, 'avg', 'avg').values()]; // Prefix cache hit rate per interval = Σhits.rate / Σqueries.rate across - // all engines. Sum first, then divide. - const hitsAll = - metrics['vllm:prefix_cache_hits']?.series ?? metrics['vllm:gpu_prefix_cache_hits']?.series; - const queriesAll = - metrics['vllm:prefix_cache_queries']?.series ?? - metrics['vllm:gpu_prefix_cache_queries']?.series; + // all engines. Sum first, then divide. SGLang names: cached_tokens / prompt_tokens. + const hitsAll = pickFirstNonEmpty( + metrics, + 'vllm:prefix_cache_hits', + 'vllm:gpu_prefix_cache_hits', + 'sglang:cached_tokens', + ); + const queriesAll = pickFirstNonEmpty( + metrics, + 'vllm:prefix_cache_queries', + 'vllm:gpu_prefix_cache_queries', + 'vllm:prompt_tokens', + 'sglang:prompt_tokens', + ); const hitsByT = aggregateSeriesByStart(hitsAll, 'rate', 'sum'); const qByT = aggregateSeriesByStart(queriesAll, 'rate', 'sum'); const prefixCacheHitRate: number[] = []; @@ -232,12 +260,18 @@ export function extractServerMetricSamples(json: string): { /** Metrics our aggregates pipeline cares about. Anything else in the blob is skipped. 
*/ const TARGET_METRIC_KEYS = new Set([ + // vLLM 'vllm:kv_cache_usage_perc', - 'vllm:gpu_cache_usage_perc', // older fallback name + 'vllm:gpu_cache_usage_perc', 'vllm:prefix_cache_hits', 'vllm:prefix_cache_queries', - 'vllm:gpu_prefix_cache_hits', // legacy alias (used in pre-fix code paths) + 'vllm:gpu_prefix_cache_hits', 'vllm:gpu_prefix_cache_queries', + 'vllm:prompt_tokens', + // SGLang + 'sglang:token_usage', + 'sglang:cached_tokens', + 'sglang:prompt_tokens', ]); /** From 625d6e85e411cf8081977d3b76ad98d1805ad3c5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 27 May 2026 20:48:58 -0500 Subject: [PATCH 50/96] fix(ingest): derive GPU cache hit rate for SGLang at ingest time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SGLang runs' harness JSON doesn't populate server_gpu_cache_hit_rate (vLLM runs do), so the detail-page header and inference chart tooltip showed "—" for SGLang points. Now at trace_replay ingest, if any of the linked benchmark_results rows has a null server_gpu_cache_hit_rate and we have non-empty prefill/hits time-series in the computed chart_series, derive the lifetime cluster ratio as sum(hits.rate) / sum(prompt.rate) and write it into the row's metrics JSONB. Already-stored SGLang rows from runs 944/945 backfilled via a one-off UPDATE earlier in this session (8 rows, mostly ~87-89% hit rate, one high-conc outlier at 2.4%). 
Co-Authored-By: Claude Opus 4.7 --- packages/db/src/etl/trace-replay-ingest.ts | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts index 8cc03f2a..8d1e01b8 100644 --- a/packages/db/src/etl/trace-replay-ingest.ts +++ b/packages/db/src/etl/trace-replay-ingest.ts @@ -100,4 +100,23 @@ export async function insertTraceReplay( set trace_replay_id = ${traceReplayId} where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[]) `; + + // Derive a lifetime GPU cache hit rate from chart_series for any linked + // row whose harness JSON didn't set one (SGLang runs don't populate + // server_gpu_cache_hit_rate; vLLM runs do). Skip when chart_series has + // no usable prefill data — leaves the field null in that case, matching + // legacy "no trace_replay" behavior. + if (chartSeries && chartSeries.prefillTps.length > 0) { + const sumPrompts = chartSeries.prefillTps.reduce((s, p) => s + p.value, 0); + const sumHits = chartSeries.prefixCacheHitsTps.reduce((s, p) => s + p.value, 0); + if (sumPrompts > 0) { + const rate = sumHits / sumPrompts; + await sql` + update benchmark_results + set metrics = jsonb_set(metrics, '{server_gpu_cache_hit_rate}', to_jsonb(${rate}::numeric)) + where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[]) + and (metrics->>'server_gpu_cache_hit_rate') is null + `; + } + } } From aa76e9eca423d3ab2c7079ff28d74b70adefae1c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 28 May 2026 14:38:52 -0500 Subject: [PATCH 51/96] feat(chart-series): map sglang:realtime_tokens to promptTokensBySource MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "Cumulative prompt token source breakdown" chart was empty for SGLang runs because the vllm-specific vllm:prompt_tokens_by_source metric doesn't exist on SGLang. 
Maps sglang:realtime_tokens (which has mode={prefill_cache, prefill_compute, decode}) into the same source breakdown when no vllm series is present, filtered to prefill_* modes (decode tokens are output throughput, not prompt-token volume). CHART_SERIES_VERSION → 5. Backfilled 128 rows; SGLang rows from runs 944/946/947 now have prefill_cache + prefill_compute sources populated. Co-Authored-By: Claude Opus 4.7 --- packages/db/src/etl/compute-chart-series.ts | 31 ++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts index 86b79925..0807e238 100644 --- a/packages/db/src/etl/compute-chart-series.ts +++ b/packages/db/src/etl/compute-chart-series.ts @@ -31,8 +31,12 @@ import { streamObject } from 'stream-json/streamers/stream-object.js'; * * v4: extract sglang:* metrics too (fallback chain in each picker), so * SGLang runs populate the chart_series the same way vllm runs do. + * + * v5: map sglang:realtime_tokens (mode={prefill_cache,prefill_compute,decode}) + * into promptTokensBySource so the cumulative prompt-token-source-breakdown + * chart shows useful splits for SGLang runs (filtered to prefill_* modes). */ -export const CHART_SERIES_VERSION = 4; +export const CHART_SERIES_VERSION = 5; export interface TimeSeriesPoint { /** Seconds from benchmark start. */ @@ -115,6 +119,7 @@ const CHART_METRIC_KEYS = new Set([ 'sglang:generation_tokens', 'sglang:num_running_reqs', 'sglang:num_queue_reqs', + 'sglang:realtime_tokens', ]); /** @@ -303,6 +308,12 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits', 'sglang:cached_tokens'); // Per-source prompt tokens — sum across engines per source label. + // vllm: vllm:prompt_tokens_by_source has one series per source label + // (local_cache_hit, external_cache_hit, miss, ...). 
Use the + // `source`/`reason`/`kind` label as the breakdown key. + // sglang: sglang:realtime_tokens uses a `mode` label with values + // {prefill_cache, prefill_compute, decode}. Filter to prefill_* + // since decode isn't prompt-token volume. const promptBySrcByT = new Map>(); for (const series of metrics['vllm:prompt_tokens_by_source']?.series ?? []) { const labels = series.labels ?? {}; @@ -318,6 +329,24 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { } } } + // SGLang fallback: only consider when the vllm metric wasn't found. + if (promptBySrcByT.size === 0) { + for (const series of metrics['sglang:realtime_tokens']?.series ?? []) { + const labels = series.labels ?? {}; + const mode = labels['mode'] ?? 'unknown'; + if (!mode.startsWith('prefill')) continue; // skip 'decode' (output tokens) + let byT = promptBySrcByT.get(mode); + if (!byT) { + byT = new Map(); + promptBySrcByT.set(mode, byT); + } + for (const ts of series.timeslices ?? []) { + if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') { + byT.set(ts.start_ns, (byT.get(ts.start_ns) ?? 0) + ts.rate); + } + } + } + } const promptTokensBySource: Record = {}; for (const [source, byT] of promptBySrcByT) { const arr: TimeSeriesPoint[] = []; From 5872a3d8d3c6f5e6feee879e2f8f6f5d0ddd04ac Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 28 May 2026 14:48:27 -0500 Subject: [PATCH 52/96] feat(chart-series): break out SGLang cache hits by cache_source MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously SGLang detail pages showed two stacked-area layers in the prompt-token source breakdown: prefill_cache (everything that hit the cache) + prefill_compute (cache miss). The user wanted finer granularity — specifically a distinction between on-GPU HBM cache and CPU-offloaded (hicache) host cache. 
SGLang's sglang:cached_tokens metric carries a cache_source label that varies per cache tier: - "device" → on-GPU HBM cache hit - "host" → CPU-offload (hicache) cache hit - "total" → older sglang, single series with no tier breakdown Switches the cache-hit portion of the breakdown from the coarse `prefill_cache` mode label to per-cache_source series: - device → "cache hit (HBM)" - host → "cache hit (CPU offload)" - total → "cache hit" - other → "cache hit (<cache_source>)" Cache misses still come from realtime_tokens[mode=prefill_compute], relabeled "compute (miss)" for symmetry. Current data only contains device/total (no hicache runs ingested yet) — when hicache runs come in, the chart will automatically split cache hits into HBM + CPU-offload layers with no further code change. CHART_SERIES_VERSION → 6. Backfilled 128 rows. Co-Authored-By: Claude Opus 4.7 --- packages/db/src/etl/compute-chart-series.ts | 47 +++++++++++++++++++-- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts index 0807e238..1996708f 100644 --- a/packages/db/src/etl/compute-chart-series.ts +++ b/packages/db/src/etl/compute-chart-series.ts @@ -35,8 +35,13 @@ import { streamObject } from 'stream-json/streamers/stream-object.js'; * v5: map sglang:realtime_tokens (mode={prefill_cache,prefill_compute,decode}) * into promptTokensBySource so the cumulative prompt-token-source-breakdown * chart shows useful splits for SGLang runs (filtered to prefill_* modes). + * + * v6: for SGLang, swap the coarse "prefill_cache" bucket for per-cache_source + * breakdown from sglang:cached_tokens — current runs always have one + * cache_source ("device" / HBM) but hicache (CPU offload) runs would + * split into "device" + "host" automatically once ingested. */ -export const CHART_SERIES_VERSION = 5; +export const CHART_SERIES_VERSION = 6; export interface TimeSeriesPoint { /** Seconds from benchmark start.
*/ @@ -330,15 +335,49 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { } } // SGLang fallback: only consider when the vllm metric wasn't found. + // - Cache misses (fresh prefill): `sglang:realtime_tokens[mode=prefill_compute]` + // - Cache hits, split by tier: per-series `sglang:cached_tokens` where each + // series carries a `cache_source` label ("device" = HBM, "host" = CPU + // offload via hicache). Current runs have only `device`; when hicache + // runs land, additional series will appear and the chart will split. if (promptBySrcByT.size === 0) { for (const series of metrics['sglang:realtime_tokens']?.series ?? []) { const labels = series.labels ?? {}; const mode = labels['mode'] ?? 'unknown'; - if (!mode.startsWith('prefill')) continue; // skip 'decode' (output tokens) - let byT = promptBySrcByT.get(mode); + // Only carry the cache-miss line over — cache hits come from + // sglang:cached_tokens broken out by cache_source below, so we'd + // double-count if we kept `prefill_cache` here too. + if (mode !== 'prefill_compute') continue; + const label = 'compute (miss)'; + let byT = promptBySrcByT.get(label); + if (!byT) { + byT = new Map(); + promptBySrcByT.set(label, byT); + } + for (const ts of series.timeslices ?? []) { + if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') { + byT.set(ts.start_ns, (byT.get(ts.start_ns) ?? 0) + ts.rate); + } + } + } + // Cache hits broken out per cache_source. Strip the noisy "total" label + // (older sglang versions emit a single un-broken-out series labelled + // total — show that as just "cache hit"). + for (const series of metrics['sglang:cached_tokens']?.series ?? []) { + const labels = series.labels ?? {}; + const src = labels['cache_source'] ?? 'cache hit'; + const label = + src === 'device' + ? 'cache hit (HBM)' + : src === 'host' + ? 'cache hit (CPU offload)' + : src === 'total' + ? 
'cache hit' + : `cache hit (${src})`; + let byT = promptBySrcByT.get(label); if (!byT) { byT = new Map(); - promptBySrcByT.set(mode, byT); + promptBySrcByT.set(label, byT); } for (const ts of series.timeslices ?? []) { if (typeof ts.rate === 'number' && typeof ts.start_ns === 'number') { From 94a3e8b1986e54165c062e2a14eda60d9e9dd146 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 28 May 2026 15:01:24 -0500 Subject: [PATCH 53/96] feat(chart-series): host cache util line + fix SGLang stacked-area colors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related fixes for SGLang hicache rendering on the agentic detail page: 1. KV cache utilization chart was GPU-HBM-only. SGLang hicache runs also expose sglang:hicache_host_{used,total}_tokens — the CPU offload pool's tokens-in-use over its capacity. Extracted as a new `hostKvCacheUsage` time series; frontend overlays it as a second orange line on the existing chart when the row has hicache data. 2. The cumulative-prompt-token-source-breakdown chart rendered ALL three SGLang sources in the same color, because the colors dict only knew vllm-style names (local_compute, local_cache_hit, etc.). Added explicit colors for the SGLang label names ('cache hit (HBM)', 'cache hit (CPU offload)', 'cache hit', 'compute (miss)') plus a memoized fallback palette so any future unknown source name gets a distinct color rather than falling through to gray. CHART_SERIES_VERSION → 7. Backfilled 128 rows; hicache rows from workflow_run 947 (8 rows) now have ~1830 hostKvCacheUsage samples matching their HBM samples. 
Co-Authored-By: Claude Opus 4.7 --- .../agentic-point/agentic-point-detail.tsx | 16 ++++++++- .../agentic-point/time-series-chart.tsx | 30 ++++++++++++++-- .../src/hooks/api/use-trace-server-metrics.ts | 2 ++ packages/db/src/etl/compute-chart-series.ts | 36 ++++++++++++++++++- .../db/src/queries/trace-server-metrics.ts | 3 ++ 5 files changed, 83 insertions(+), 4 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index 2db2809b..b047ea8f 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -236,16 +236,30 @@ export function AgenticPointDetail({ id }: Props) { render={(expanded) => { const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; if (!metrics) return ; + // For SGLang hicache rows we have both GPU (HBM) util and + // host (CPU offload pool) util — overlay them as two lines. + const hasHost = metrics.hostKvCacheUsage.length > 0; return ( = { + // vLLM source names local_compute: '#f97316', local_cache_hit: '#3b82f6', external_kv_transfer: '#22c55e', miss: '#f97316', + // SGLang source names (set by compute-chart-series for sglang rows) + 'cache hit (HBM)': '#3b82f6', + 'cache hit (CPU offload)': '#22c55e', + 'cache hit': '#3b82f6', + 'compute (miss)': '#f97316', }; const labelFor: Record = { local_compute: 'Prefill', @@ -496,6 +502,26 @@ export function StackedAreaChart({ external_kv_transfer: 'Offload Cache Hit', miss: 'Miss', }; + // Fallback palette for any source name not in `colors` so we never + // emit two layers in the same shade. Cycles by insertion order. 
+ const fallbackPalette = [ + '#3b82f6', + '#f97316', + '#22c55e', + '#a855f7', + '#ef4444', + '#06b6d4', + '#f59e0b', + '#ec4899', + ]; + let fallbackIdx = 0; + const colorFor = (name: string): string => { + if (colors[name]) return colors[name]!; + const c = fallbackPalette[fallbackIdx % fallbackPalette.length]!; + fallbackIdx++; + colors[name] = c; // memoize so the SAME unknown name always gets the same color + return c; + }; if (!computed) { return ( @@ -522,7 +548,7 @@ export function StackedAreaChart({ .toReversed() .map(([x, y]) => `L${x.toFixed(2)},${y.toFixed(2)}`) .join(' ')} Z`; - const color = colors[name] ?? '#6b7280'; + const color = colorFor(name); for (let i = 0; i < tValues.length; i++) lower[i] = upper[i]!; return { name, color, d }; }); @@ -540,7 +566,7 @@ export function StackedAreaChart({ } } const items: HoverItem[] = stackOrder.map((name) => ({ - color: colors[name] ?? '#6b7280', + color: colorFor(name), label: labelFor[name] ?? name, value: `${((shares[name]?.[idx] ?? 0) * 100).toFixed(1)}%`, })); diff --git a/packages/app/src/hooks/api/use-trace-server-metrics.ts b/packages/app/src/hooks/api/use-trace-server-metrics.ts index 664bc6c7..bac67a50 100644 --- a/packages/app/src/hooks/api/use-trace-server-metrics.ts +++ b/packages/app/src/hooks/api/use-trace-server-metrics.ts @@ -44,6 +44,8 @@ export interface TraceServerMetrics { decodeTps: TimeSeriesPoint[]; /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */ prefixCacheHitsTps: TimeSeriesPoint[]; + /** Host (CPU offload) KV cache utilization, 0..1. SGLang hicache only. 
*/ + hostKvCacheUsage: TimeSeriesPoint[]; } async function fetchTraceServerMetrics( diff --git a/packages/db/src/etl/compute-chart-series.ts b/packages/db/src/etl/compute-chart-series.ts index 1996708f..8105961e 100644 --- a/packages/db/src/etl/compute-chart-series.ts +++ b/packages/db/src/etl/compute-chart-series.ts @@ -40,8 +40,12 @@ import { streamObject } from 'stream-json/streamers/stream-object.js'; * breakdown from sglang:cached_tokens — current runs always have one * cache_source ("device" / HBM) but hicache (CPU offload) runs would * split into "device" + "host" automatically once ingested. + * + * v7: extract sglang:hicache_host_{used,total}_tokens into a new + * hostKvCacheUsage series so the KV cache utilization chart can plot + * the CPU offload pool's usage alongside the on-GPU HBM line. */ -export const CHART_SERIES_VERSION = 6; +export const CHART_SERIES_VERSION = 7; export interface TimeSeriesPoint { /** Seconds from benchmark start. */ @@ -79,6 +83,12 @@ export interface ChartSeries { * saved vs the raw queries that came in. */ prefixCacheHitsTps: TimeSeriesPoint[]; + /** + * Host (CPU offload) KV cache utilization, 0..1. Only populated for + * SGLang hicache runs (derived as hicache_host_used / hicache_host_total). + * Frontend overlays this on the KV cache util chart as a second line. + */ + hostKvCacheUsage: TimeSeriesPoint[]; } // ── Raw blob shapes (subset we read) ──────────────────────────────────── @@ -125,6 +135,8 @@ const CHART_METRIC_KEYS = new Set([ 'sglang:num_running_reqs', 'sglang:num_queue_reqs', 'sglang:realtime_tokens', + 'sglang:hicache_host_used_tokens', + 'sglang:hicache_host_total_tokens', ]); /** @@ -312,6 +324,27 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { // "cumulative unique input tokens served" = cumsum(prefillTps) − cumsum(hits). 
const prefixCacheHitsTps = counterRate('vllm:prefix_cache_hits', 'sglang:cached_tokens'); + // SGLang hicache: host-pool KV cache utilization as used/total per + // timeslice. Both metrics are gauges in absolute tokens. Total stays + // constant (it's the pool size), used fluctuates. + const hostUsedByT = aggregateByStart( + metrics['sglang:hicache_host_used_tokens']?.series, + 'avg', + 'sum', + ); + const hostTotalByT = aggregateByStart( + metrics['sglang:hicache_host_total_tokens']?.series, + 'avg', + 'sum', + ); + const hostKvCacheUsage: TimeSeriesPoint[] = []; + for (const [t, used] of [...hostUsedByT.entries()].toSorted((a, b) => a[0] - b[0])) { + const total = hostTotalByT.get(t); + if (total !== undefined && total > 0) { + hostKvCacheUsage.push({ t: tOf(t), value: used / total }); + } + } + // Per-source prompt tokens — sum across engines per source label. // vllm: vllm:prompt_tokens_by_source has one series per source label // (local_cache_hit, external_cache_hit, miss, ...). Use the @@ -407,5 +440,6 @@ function buildSeriesFromMetrics(metrics: MetricsMap): ChartSeries { prefillTps, decodeTps, prefixCacheHitsTps, + hostKvCacheUsage, }; } diff --git a/packages/db/src/queries/trace-server-metrics.ts b/packages/db/src/queries/trace-server-metrics.ts index 76775e77..eccb0a0c 100644 --- a/packages/db/src/queries/trace-server-metrics.ts +++ b/packages/db/src/queries/trace-server-metrics.ts @@ -73,6 +73,8 @@ export interface TraceServerMetrics { decodeTps: TimeSeriesPoint[]; /** Tokens served from prefix cache per scrape (vllm:prefix_cache_hits rate). */ prefixCacheHitsTps: TimeSeriesPoint[]; + /** Host (CPU offload) KV cache utilization, 0..1. SGLang hicache only. 
*/ + hostKvCacheUsage: TimeSeriesPoint[]; } interface RawMetaRow extends PointMeta { @@ -118,6 +120,7 @@ function merge(meta: PointMeta, series: ChartSeries): TraceServerMetrics { decodeTps: series.decodeTps, // v2 chart_series rows pre-backfill don't have this field — default to [] prefixCacheHitsTps: series.prefixCacheHitsTps ?? [], + hostKvCacheUsage: series.hostKvCacheUsage ?? [], }; } From 93e197b7e54d140acfe65b61aeb4f5c48ca27091 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 28 May 2026 15:19:20 -0500 Subject: [PATCH 54/96] fix(stacked-area): align sources by timestamp before computing shares MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cumulative-prompt-token-source-breakdown chart was showing huge "100% compute (miss)" plateaus around minute 20-24 of many SGLang runs. Root cause: the chart computed cumulative shares per ARRAY INDEX (not timestamp), but in SGLang's per-scrape metrics, cache hits and misses fire on different ticks — one scrape reports 193K hits + 0 miss, the next reports 0 hits + 8K miss. So each source has a different timestamp array. Indexing them in lockstep mixed values from different moments and made the share calculation flap to 100% one side or the other. Fix: union timestamps across all sources, then for each unique timestamp carry forward each source's cumulative sum (a source that didn't report at time t holds its previous cumulative value rather than appearing as 0). After fix: shares change smoothly over time as each source's cumulative sum grows; transient single-tick gaps no longer drive the visible share to either extreme. 
Co-Authored-By: Claude Opus 4.7 --- .../agentic-point/time-series-chart.tsx | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx index 15a15869..75d7bb1e 100644 --- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx @@ -464,15 +464,36 @@ export function StackedAreaChart({ const computed = useMemo(() => { const entries = Object.entries(sourceSeries).filter(([, v]) => v.length > 0); if (entries.length === 0) return null; - const tValues = entries[0]![1].map((p) => p.t); + + // Different sources can land on different scrape timestamps + // (SGLang's hits/misses fire on alternating ticks), so we MUST + // align across all sources before computing shares — otherwise the + // share calculation indexes into each source's own time axis and + // mixes values from different moments. + // + // Approach: union all timestamps across sources, then for each + // unique timestamp carry forward the cumulative sum for every + // source (a source that didn't report at time t holds its previous + // cumulative value rather than dropping to 0). + const tValues = [...new Set(entries.flatMap(([, arr]) => arr.map((p) => p.t)))].toSorted( + (a, b) => a - b, + ); + + // For each source, walk its (sorted) array and produce a parallel + // cumulative-sum array indexed against `tValues` via carry-forward. 
const cum: Record = {}; for (const [name, arr] of entries) { + const valByT = new Map(arr.map((p) => [p.t, p.value])); + const out: number[] = Array.from({ length: tValues.length }); let acc = 0; - cum[name] = arr.map((p) => { - acc += p.value; - return acc; - }); + for (let i = 0; i < tValues.length; i++) { + const v = valByT.get(tValues[i]!); + if (v !== undefined) acc += v; + out[i] = acc; + } + cum[name] = out; } + const shares: Record = {}; for (const name of Object.keys(cum)) shares[name] = []; for (let i = 0; i < tValues.length; i++) { From c14e19e277930495e4a43c3a6d6f42a611fec336 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 28 May 2026 15:44:07 -0500 Subject: [PATCH 55/96] fix(ingest): split GPU vs CPU cache hit rate for SGLang hicache rows Previous inline derivation (commit 625d6e8) summed ALL cache hit sources into server_gpu_cache_hit_rate, which conflated GPU HBM hits with CPU offload hits on SGLang hicache rows. The harness JSON also never sets server_cpu_cache_hit_rate. Now derives both metrics from chart_series.promptTokensBySource: server_gpu_cache_hit_rate = sum(HBM + 'cache hit') / sum(prompts) server_cpu_cache_hit_rate = sum(CPU offload) / sum(prompts) or null (null when no CPU offload source exists) Falls back to prefixCacheHitsTps for vLLM rows where promptTokensBySource isn't broken out by cache source. Overwrites any pre-existing value so the derivation stays consistent with what the detail-page charts plot. 
Backfilled all existing rows via two-phase SQL update earlier in the session: - 8 hicache rows in workflow_run 947 now show GPU ~1-2% / CPU ~87-91% - Other SGLang rows show GPU ~87% / CPU null - vLLM rows restored to their original GPU hit rates Co-Authored-By: Claude Opus 4.7 --- packages/db/src/etl/trace-replay-ingest.ts | 40 +++++++++++++++++----- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts index 8d1e01b8..43655d9a 100644 --- a/packages/db/src/etl/trace-replay-ingest.ts +++ b/packages/db/src/etl/trace-replay-ingest.ts @@ -101,21 +101,43 @@ export async function insertTraceReplay( where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[]) `; - // Derive a lifetime GPU cache hit rate from chart_series for any linked - // row whose harness JSON didn't set one (SGLang runs don't populate - // server_gpu_cache_hit_rate; vLLM runs do). Skip when chart_series has - // no usable prefill data — leaves the field null in that case, matching - // legacy "no trace_replay" behavior. + // Derive lifetime GPU + CPU cache hit rates from chart_series. SGLang + // runs don't populate these in the harness JSON; vLLM runs do but only + // for GPU. We always recompute to keep the derivation consistent with + // what the detail-page charts plot — overwriting any pre-existing value. + // + // For hicache (CPU offload) rows the chart_series.promptTokensBySource + // breakdown has separate "cache hit (HBM)" + "cache hit (CPU offload)" + // sources, letting us split GPU vs CPU hit rate. Other rows just have + // a single cache-hit source (either "cache hit (HBM)" / "cache hit" + // for sglang, or no breakdown for vllm — falls back to prefixCacheHitsTps + // sum which equals the single cache source's total). 
if (chartSeries && chartSeries.prefillTps.length > 0) { const sumPrompts = chartSeries.prefillTps.reduce((s, p) => s + p.value, 0); - const sumHits = chartSeries.prefixCacheHitsTps.reduce((s, p) => s + p.value, 0); if (sumPrompts > 0) { - const rate = sumHits / sumPrompts; + const sumOf = (name: string): number => + (chartSeries.promptTokensBySource[name] ?? []).reduce((s, p) => s + p.value, 0); + const cpuHits = sumOf('cache hit (CPU offload)'); + const hbmFromBreakdown = sumOf('cache hit (HBM)') + sumOf('cache hit'); + // If the source breakdown has a HBM entry, use it (covers SGLang). + // Otherwise fall back to total prefixCacheHitsTps sum (vLLM path). + const gpuHits = + hbmFromBreakdown > 0 + ? hbmFromBreakdown + : chartSeries.prefixCacheHitsTps.reduce((s, p) => s + p.value, 0); + const gpuRate = gpuHits / sumPrompts; + const cpuRate = cpuHits > 0 ? cpuHits / sumPrompts : null; await sql` update benchmark_results - set metrics = jsonb_set(metrics, '{server_gpu_cache_hit_rate}', to_jsonb(${rate}::numeric)) + set metrics = jsonb_set( + case when ${cpuRate}::numeric is not null + then jsonb_set(metrics, '{server_cpu_cache_hit_rate}', to_jsonb(${cpuRate}::numeric)) + else metrics + end, + '{server_gpu_cache_hit_rate}', + to_jsonb(${gpuRate}::numeric) + ) where id = any(${sql.array(unlinked.map((r) => r.id))}::bigint[]) - and (metrics->>'server_gpu_cache_hit_rate') is null `; } } From 268617ccd85ccc8aea6ed12dd4bd61273c8a37c1 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 3 Jun 2026 10:40:04 -0500 Subject: [PATCH 56/96] fix(ingest): recognize vLLM LMCache external_kv_transfer as CPU hit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Inline cache-hit-rate derivation only handled SGLang's hicache label ('cache hit (CPU offload)'). vLLM with LMCache uses 'external_kv_transfer' in its prompt_tokens_by_source breakdown for the same concept (CPU offload pool serving tokens to GPU). 
Those vLLM rows had cpu rate null even when external_kv_transfer was the dominant source. Adds external_kv_transfer + local_cache_hit to the source name aliases: GPU hits = local_cache_hit + cache hit (HBM) + cache hit CPU hits = external_kv_transfer + cache hit (CPU offload) fallback = prefixCacheHitsTps total (for single-source rows) Backfilled 132 affected rows via SQL — vLLM LMCache rows now show CPU rate where present (e.g. dsv4 b300 conc=128 offload=on shows GPU ~1% + CPU ~87%, matching the actual cache topology). Co-Authored-By: Claude Opus 4.7 --- packages/db/src/etl/trace-replay-ingest.ts | 23 ++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/packages/db/src/etl/trace-replay-ingest.ts b/packages/db/src/etl/trace-replay-ingest.ts index 43655d9a..cb022ca9 100644 --- a/packages/db/src/etl/trace-replay-ingest.ts +++ b/packages/db/src/etl/trace-replay-ingest.ts @@ -106,21 +106,24 @@ export async function insertTraceReplay( // for GPU. We always recompute to keep the derivation consistent with // what the detail-page charts plot — overwriting any pre-existing value. // - // For hicache (CPU offload) rows the chart_series.promptTokensBySource - // breakdown has separate "cache hit (HBM)" + "cache hit (CPU offload)" - // sources, letting us split GPU vs CPU hit rate. Other rows just have - // a single cache-hit source (either "cache hit (HBM)" / "cache hit" - // for sglang, or no breakdown for vllm — falls back to prefixCacheHitsTps - // sum which equals the single cache source's total). 
+ // Source label naming differs by framework / cache topology: + // SGLang hicache: 'cache hit (HBM)' + 'cache hit (CPU offload)' + // SGLang older: 'cache hit' (no tier breakdown) + // vLLM LMCache: 'local_cache_hit' + 'external_kv_transfer' (+ 'local_compute' for miss) + // vLLM single: falls back to prefixCacheHitsTps total (= local cache only) if (chartSeries && chartSeries.prefillTps.length > 0) { const sumPrompts = chartSeries.prefillTps.reduce((s, p) => s + p.value, 0); if (sumPrompts > 0) { const sumOf = (name: string): number => (chartSeries.promptTokensBySource[name] ?? []).reduce((s, p) => s + p.value, 0); - const cpuHits = sumOf('cache hit (CPU offload)'); - const hbmFromBreakdown = sumOf('cache hit (HBM)') + sumOf('cache hit'); - // If the source breakdown has a HBM entry, use it (covers SGLang). - // Otherwise fall back to total prefixCacheHitsTps sum (vLLM path). + // CPU-offload hits: SGLang hicache + vLLM LMCache external transfer. + const cpuHits = sumOf('cache hit (CPU offload)') + sumOf('external_kv_transfer'); + // GPU/HBM hits from source breakdown, summed across known aliases. + const hbmFromBreakdown = + sumOf('cache hit (HBM)') + sumOf('cache hit') + sumOf('local_cache_hit'); + // If the source breakdown has any GPU entry, use it. Otherwise fall back + // to total prefixCacheHitsTps sum (single-source vLLM path with no + // by_source metric — equals the lone cache counter's lifetime). const gpuHits = hbmFromBreakdown > 0 ? hbmFromBreakdown From 7fc6b4f7b5a49aa370d912d6df36b40d80b813a6 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Jun 2026 13:02:34 -0500 Subject: [PATCH 57/96] fix(scatter): use lightweight presence endpoint for View charts button MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The chart pre-fetched full trace_replay JSONL blobs for every visible agentic point just to decide whether to render the "View charts" button in pinned tooltips. 
With the latest run's 8x8 conc=512 rows pushing up to 13 MB compressed per blob, 12-id chunks blew past Neon's 64 MB per-HTTP-response cap and 500'd — hiding the button for every point. New /api/v1/trace-availability returns {id: true} for ids that have a stored blob; ScatterGraph uses that boolean instead. trace-histograms is still used by the detail page (single id, no chunking issue). Co-Authored-By: Claude Opus 4.7 --- .../app/api/v1/trace-availability/route.ts | 59 +++++++++++++++++++ .../components/inference/ui/ScatterGraph.tsx | 23 ++++---- .../inference/utils/tooltipUtils.ts | 15 ++--- .../src/hooks/api/use-trace-availability.ts | 29 +++++++++ packages/db/src/queries/trace-availability.ts | 34 +++++++++++ 5 files changed, 143 insertions(+), 17 deletions(-) create mode 100644 packages/app/src/app/api/v1/trace-availability/route.ts create mode 100644 packages/app/src/hooks/api/use-trace-availability.ts create mode 100644 packages/db/src/queries/trace-availability.ts diff --git a/packages/app/src/app/api/v1/trace-availability/route.ts b/packages/app/src/app/api/v1/trace-availability/route.ts new file mode 100644 index 00000000..2484ceaf --- /dev/null +++ b/packages/app/src/app/api/v1/trace-availability/route.ts @@ -0,0 +1,59 @@ +import { type NextRequest, NextResponse } from 'next/server'; + +import { getDb } from '@semianalysisai/inferencex-db/connection'; +import { + getTraceAvailability, + type TraceAvailabilityMap, +} from '@semianalysisai/inferencex-db/queries/trace-availability'; + +import { cachedJson, cachedQuery } from '@/lib/api-cache'; + +export const dynamic = 'force-dynamic'; + +const getCachedTraceAvailability = cachedQuery( + (ids: number[]): Promise => getTraceAvailability(getDb(), ids), + 'trace-availability', +); + +const MAX_IDS_PER_REQUEST = 500; + +/** + * GET /api/v1/trace-availability?ids=1,2,3 + * + * Returns `{[id]: true}` for ids that have a stored trace_replay blob. 
+ * Lightweight presence check used by the scatter tooltip to decide whether + * to render the "View charts" button — see queries/trace-availability.ts. + */ +export async function GET(request: NextRequest) { + const raw = request.nextUrl.searchParams.get('ids'); + if (!raw) { + return NextResponse.json({ error: 'ids query param is required' }, { status: 400 }); + } + + const ids = [ + ...new Set( + raw + .split(',') + .map((s) => Number(s.trim())) + .filter((n) => Number.isFinite(n) && n > 0), + ), + ]; + if (ids.length === 0) { + return NextResponse.json({ error: 'no valid ids provided' }, { status: 400 }); + } + if (ids.length > MAX_IDS_PER_REQUEST) { + return NextResponse.json( + { error: `too many ids (max ${MAX_IDS_PER_REQUEST})` }, + { status: 400 }, + ); + } + + try { + const sorted = [...ids].toSorted((a, b) => a - b); + const availability = await getCachedTraceAvailability(sorted); + return cachedJson(availability); + } catch (error) { + console.error('Error fetching trace availability:', error); + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }); + } +} diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index fdcf8952..b93799db 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -6,7 +6,7 @@ import React, { useCallback, useEffect, useMemo, useRef } from 'react'; import { GRADIENT_NUDGE_EVENT } from '@/lib/nudges/registry'; import { useInference } from '@/components/inference/InferenceContext'; -import { useTraceHistograms } from '@/hooks/api/use-trace-histograms'; +import { useTraceAvailability } from '@/hooks/api/use-trace-availability'; import { useRouter } from 'next/navigation'; import ChartLegend from '@/components/ui/chart-legend'; import { useUnofficialRun } from '@/components/unofficial-run-provider'; @@ -497,8 +497,11 @@ const ScatterGraph = React.memo( 
// All official points for rendering (unfiltered — visibility via opacity) const pointsData = useMemo(() => Object.values(groupedData).flat(), [groupedData]); - // Trace-replay histograms (ISL / OSL distributions) for agentic points. - // Pre-fetch the whole visible set so tooltip render stays synchronous. + // Bulk presence lookup for agentic points: which ids have a stored + // trace_replay blob → controls the "View charts" button in the pinned + // tooltip. We deliberately don't fetch the histograms themselves here; + // a 95-point dsv4-b300 dashboard would pull GB of profile blobs through + // Neon's HTTP API and trip its 64 MB per-response cap. const agenticIds = useMemo(() => { const ids: number[] = []; for (const p of pointsData) { @@ -506,7 +509,7 @@ const ScatterGraph = React.memo( } return ids; }, [pointsData]); - const { data: traceHistograms } = useTraceHistograms(agenticIds); + const { data: traceAvailability } = useTraceAvailability(agenticIds); const router = useRouter(); // Gradient label data @@ -774,8 +777,7 @@ const ScatterGraph = React.memo( hardwareConfig, isTracked: trackedConfigIdsRef.current.has(buildPointConfigId(d)), runUrl: d.run_url ? updateRepoUrl(d.run_url) : undefined, - traceHistogram: - typeof d.id === 'number' ? (traceHistograms?.[d.id] ?? undefined) : undefined, + hasTrace: typeof d.id === 'number' ? traceAvailability?.[d.id] === true : false, }), getRulerX: (d: InferenceData, xScale: any) => (xScale as ContinuousScale)(d.x), getRulerY: (d: InferenceData, yScale: any) => (yScale as ContinuousScale)(d.y), @@ -842,10 +844,11 @@ const ScatterGraph = React.memo( removeTrackedConfig, chartDefinition.chartType, selectedPrecisions, - // Tooltip content closure reads traceHistograms to decide whether to - // show the "View charts" button — rebuild config when the histogram - // fetch resolves so the button appears for points that have data. 
- traceHistograms, + // Tooltip content closure reads traceAvailability to decide whether + // to render the "View charts" button — rebuild config when the + // presence fetch resolves so the button appears for points that + // have a trace_replay blob. + traceAvailability, router, ], ); diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts index ccc371f9..ed68c41b 100644 --- a/packages/app/src/components/inference/utils/tooltipUtils.ts +++ b/packages/app/src/components/inference/utils/tooltipUtils.ts @@ -20,12 +20,13 @@ export interface TooltipConfig { /** URL to the GitHub Actions workflow run */ runUrl?: string; /** - * Per-request ISL/OSL arrays for agentic points, sourced from the stored - * aiperf `profile_export.jsonl`. Used to detect whether the point has any - * trace data (so the "View charts" button can appear); the actual - * distributions are rendered on the detail page, not inline. + * Whether this agentic point has a stored trace_replay blob. Controls + * visibility of the "View charts" button — the actual distributions are + * rendered on the detail page, not inline, so all the tooltip needs is a + * presence boolean (sourced from the bulk `/api/v1/trace-availability` + * call so we don't ship megabytes of profile JSONL just for this check). */ - traceHistogram?: { isl: number[]; osl: number[] } | undefined; + hasTrace?: boolean; } export interface OverlayTooltipConfig extends TooltipConfig { @@ -221,7 +222,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => { selectedYAxisMetric, hardwareConfig, runUrl, - traceHistogram, + hasTrace, } = config; return ` @@ -271,7 +272,7 @@ export const generateTooltipContent = (config: TooltipConfig): string => { ${generateAgenticHTML(d)} ${runLinkHTML(runUrl)} - ${viewChartsButtonHTML(isPinned, Boolean(traceHistogram))} + ${viewChartsButtonHTML(isPinned, Boolean(hasTrace))} ${ isPinned ? 
` + + + {sw.infoTooltip} + + + + )} ))} From de5e51a1330d7c24f51850e729a19a2d8802d990 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Jun 2026 14:50:42 -0500 Subject: [PATCH 63/96] fix(inference): don't scope chart to one run when runs cover different hardware MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two workflow runs landing on the same date for the same model+precision but DIFFERENT hardware (e.g. a B300 dsv4 run and a B200 dsv4 run) each get their own changelog entry. The single-run scoping guard matched runs by model+precision only, so both counted as "runs with a changelog for this model", length>1 tripped, and selecting either run scoped the benchmarks query to that one workflow run — hiding the other GPU's curve entirely (carry-forward across hardware silently broke). Scope to a single run only when two runs contest the SAME full config_key (model-precision-hardware-framework) — a genuine same-day re-run of one hardware, where a DISTINCT ON merge could mix them. Complementary different-hardware runs now both render via the normal date carry-forward. Co-Authored-By: Claude Opus 4.7 --- .../components/inference/InferenceContext.tsx | 41 +++++++++++++------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index c446dc71..244c713c 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -242,25 +242,42 @@ export function InferenceProvider({ const modelPrefixesEarly = Object.entries(MODEL_PREFIX_MAPPING) .filter(([, model]) => model === selectedModel) .map(([prefix]) => prefix); - const runIdsWithModelChangelog: string[] = []; + // Map each FULL config_key (model-precision-hardware-framework) a run's + // changelog claims to the set of runs claiming it. 
Single-run scoping should + // only kick in when two runs contest the SAME full key — e.g. a same-day + // re-run of one hardware — because then a DISTINCT ON merge could mix them + // and the user needs to pick which run wins. Runs covering DIFFERENT hardware + // of the same model (e.g. a B300 run and a B200 run on the same date) are + // complementary: both must render via carry-forward. Matching on model+ + // precision alone (the old behavior) wrongly treated those as alternatives + // and scoped the chart to one run, hiding the other GPU's curve. + const runsByConfigKey = new Map>(); if (availableRuns) { for (const [runId, runInfo] of Object.entries(availableRuns)) { if (!runInfo.changelog) continue; - const matches = runInfo.changelog.entries.some((entry) => - entry.config_keys.some((key) => { + for (const entry of runInfo.changelog.entries) { + for (const key of entry.config_keys) { const parts = key.split('-'); - return modelPrefixesEarly.includes(parts[0]!) && effectivePrecisions.includes(parts[1]!); - }), - ); - if (matches) runIdsWithModelChangelog.push(runId); + if (modelPrefixesEarly.includes(parts[0]!) && effectivePrecisions.includes(parts[1]!)) { + let runs = runsByConfigKey.get(key); + if (!runs) { + runs = new Set(); + runsByConfigKey.set(key, runs); + } + runs.add(runId); + } + } + } } } + // A run is "contested" only if some full config_key it claims is also claimed + // by another run. Only then does picking a run disambiguate anything. + const contestedRunIds = new Set(); + for (const runs of runsByConfigKey.values()) { + if (runs.size > 1) for (const r of runs) contestedRunIds.add(r); + } const benchmarkRunId = - selectedRunId && - runIdsWithModelChangelog.length > 1 && - runIdsWithModelChangelog.includes(selectedRunId) - ? String(selectedRunId) - : undefined; + selectedRunId && contestedRunIds.has(String(selectedRunId)) ? 
String(selectedRunId) : undefined; const { graphs, From af8766ddbe9a3077b9a226cd3487f4f4e040e58b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 11:24:29 -0500 Subject: [PATCH 64/96] fix(inference): carry forward un-contested configs when a run is selected MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Selecting a workflow run in the picker scoped the ENTIRE benchmarks query to that run, so any same-day config living in a different workflow run vanished — e.g. with two vLLM runs and one SGLang run on the same date, picking either vLLM run (contested, so scoping kicks in) hid the SGLang curve entirely, while picking the SGLang run (uncontested, no scoping) showed everything. Fetch both the normal latest-per-config rows and the run-scoped rows, and merge: the selected run wins for every (model, precision, hardware, framework, benchmark_type) group it actually produced — preserving the disambiguation that scoping exists for, including dropping base rows for concs the run didn't cover so DISTINCT-ON mixing can't sneak back — and every other config carries forward from the base rows. benchmark_type is part of the replacement key so an agentic-only run can't hide the same config's fixed-seq carry-forward. The base query is the default view query so it's effectively always cached; run selection adds no extra latency in practice. Verified live: Jun 10, DSv4 B300, run 3/3 (vLLM affinity run) now renders both b300_vllm (run-scoped) and b300_sglang (carried forward). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../components/inference/InferenceContext.tsx | 4 ++ .../inference/hooks/useChartData.ts | 41 ++++++++++--- .../app/src/lib/benchmark-transform.test.ts | 60 ++++++++++++++++++- packages/app/src/lib/benchmark-transform.ts | 29 +++++++++ 4 files changed, 125 insertions(+), 9 deletions(-) diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index 3b994367..5d165e60 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -286,6 +286,10 @@ export function InferenceProvider({ } // A run is "contested" only if some full config_key it claims is also claimed // by another run. Only then does picking a run disambiguate anything. + // Downstream (useChartData / mergeRunScopedRows) this no longer scopes the + // WHOLE chart to the run: only the configs the run actually produced are + // pinned to it, and every other config (e.g. another framework's same-day + // run) still carries forward from the normal latest-per-config rows. 
const contestedRunIds = new Set(); for (const runs of runsByConfigKey.values()) { if (runs.size > 1) for (const r of runs) contestedRunIds.add(r); diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts index 019d0691..e76c3123 100644 --- a/packages/app/src/components/inference/hooks/useChartData.ts +++ b/packages/app/src/components/inference/hooks/useChartData.ts @@ -19,7 +19,11 @@ import { getModelSortIndex, hardwareKeyMatchesAnyBase, } from '@/lib/constants'; -import { transformBenchmarkRows, withPercentile } from '@/lib/benchmark-transform'; +import { + mergeRunScopedRows, + transformBenchmarkRows, + withPercentile, +} from '@/lib/benchmark-transform'; import { Sequence, type Model } from '@/lib/data-mappings'; import { calculateCostsForGpus, calculatePowerForGpus } from '@/lib/utils'; import { @@ -183,19 +187,40 @@ export function useChartData( // When the selected date is the latest available, use '' (empty string) to match // the initial no-date query key, reusing the eagerly-fetched benchmarks from the // materialized view instead of firing a redundant second fetch with identical data. - // When a specific run is selected, we always go through the runId branch and the - // date is effectively ignored — keep queryDate set so React Query still has a - // distinct cache key per date if the user navigates back to "latest". const queryDate = selectedRunDate && latestAvailableDate && selectedRunDate === latestAvailableDate ? '' : selectedRunDate; + // Two queries: the normal latest-per-config view (always), plus the + // run-scoped rows when a specific workflow run is selected. The merged + // result pins ONLY the configs the selected run produced to that run, and + // carries every other config forward from the base rows — selecting one of + // two same-day vLLM runs must not hide the day's SGLang curve just because + // it lives in a different workflow run. 
The base query is the default view + // query, so it's almost always already in the React Query cache. + const { + data: baseRows, + isLoading: baseLoading, + error: baseError, + } = useBenchmarks(selectedModel, queryDate, enabled); const { - data: allRows, - isLoading: queryLoading, - error: queryError, - } = useBenchmarks(selectedModel, queryDate, enabled, selectedRunId); + data: runRows, + isLoading: runLoading, + error: runError, + } = useBenchmarks(selectedModel, queryDate, enabled && Boolean(selectedRunId), selectedRunId); + + const allRows = useMemo(() => { + if (!selectedRunId) return baseRows; + // Wait for the run rows before rendering a scoped view — rendering base + // rows first would flash the un-scoped chart, then swap contested points. + if (!runRows) return undefined; + if (!baseRows) return runRows; + return mergeRunScopedRows(runRows, baseRows); + }, [selectedRunId, runRows, baseRows]); + + const queryLoading = baseLoading || (Boolean(selectedRunId) && runLoading); + const queryError = baseError ?? (selectedRunId ? 
runError : null); // GPU comparison: fetch data for each additional comparison date const comparisonDates = useMemo( diff --git a/packages/app/src/lib/benchmark-transform.test.ts b/packages/app/src/lib/benchmark-transform.test.ts index 62cc1809..077e8c3e 100644 --- a/packages/app/src/lib/benchmark-transform.test.ts +++ b/packages/app/src/lib/benchmark-transform.test.ts @@ -2,7 +2,11 @@ import { describe, it, expect, vi } from 'vitest'; import type { BenchmarkRow } from '@/lib/api'; -import { rowToAggDataEntry, transformBenchmarkRows } from './benchmark-transform'; +import { + mergeRunScopedRows, + rowToAggDataEntry, + transformBenchmarkRows, +} from './benchmark-transform'; function makeRow(overrides: Partial = {}): BenchmarkRow { return { @@ -776,3 +780,57 @@ describe('transformBenchmarkRows — dp_attention narrowing', () => { expect(point.decode_dp_attention).toBe(true); }); }); + +describe('mergeRunScopedRows', () => { + const vllmRun = (over: Partial = {}) => + makeRow({ model: 'dsv4', hardware: 'b300', framework: 'vllm', precision: 'fp4', ...over }); + const sglangBase = (over: Partial = {}) => + makeRow({ model: 'dsv4', hardware: 'b300', framework: 'sglang', precision: 'fp4', ...over }); + + it('pins configs the run covers to the run rows, replacing base rows', () => { + const runRows = [vllmRun({ id: 10, conc: 32 }), vllmRun({ id: 11, conc: 64 })]; + const baseRows = [vllmRun({ id: 90, conc: 32 }), vllmRun({ id: 91, conc: 128 })]; + const merged = mergeRunScopedRows(runRows, baseRows); + // All vllm base rows dropped (incl. conc=128 the run didn't cover) — a + // partial-sweep run must fully own its config or the DISTINCT-ON mixing + // the scoping exists to prevent comes right back. 
+ expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 11]); + }); + + it('carries forward configs the run does not cover (the same-day other-framework curve)', () => { + const runRows = [vllmRun({ id: 10 })]; + const baseRows = [ + vllmRun({ id: 90 }), + sglangBase({ id: 91 }), + sglangBase({ id: 92, conc: 128 }), + ]; + const merged = mergeRunScopedRows(runRows, baseRows); + expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 91, 92]); + }); + + it('keeps base rows of other hardware / precision / model untouched', () => { + const runRows = [vllmRun({ id: 10 })]; + const baseRows = [ + vllmRun({ id: 90, hardware: 'b200' }), + vllmRun({ id: 91, precision: 'fp8' }), + vllmRun({ id: 92, model: 'kimik2.5' }), + ]; + const merged = mergeRunScopedRows(runRows, baseRows); + expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 90, 91, 92]); + }); + + it('scopes per benchmark_type — an agentic run does not hide fixed-seq carry-forward', () => { + const runRows = [vllmRun({ id: 10, benchmark_type: 'agentic_traces' })]; + const baseRows = [ + vllmRun({ id: 90, benchmark_type: 'agentic_traces' }), + vllmRun({ id: 91, benchmark_type: 'single_turn' }), + ]; + const merged = mergeRunScopedRows(runRows, baseRows); + expect(merged.map((r) => r.id).toSorted((a, b) => a - b)).toEqual([10, 91]); + }); + + it('returns base rows unchanged when the run produced nothing', () => { + const baseRows = [vllmRun({ id: 90 }), sglangBase({ id: 91 })]; + expect(mergeRunScopedRows([], baseRows)).toBe(baseRows); + }); +}); diff --git a/packages/app/src/lib/benchmark-transform.ts b/packages/app/src/lib/benchmark-transform.ts index 9f6b43d1..8329c84b 100644 --- a/packages/app/src/lib/benchmark-transform.ts +++ b/packages/app/src/lib/benchmark-transform.ts @@ -172,6 +172,35 @@ export function withPercentile(key: string, percentile: string): string { return key.replace(/^(?:mean|median|p75|p90|p95|p99|p99\.9)_/u, `${percentile}_`); } +// 
Replacement granularity for single-run scoping: the changelog config_key +// tuple (model-precision-hardware-framework) plus benchmark_type, so an +// agentic-only run never hides the same config's fixed-seq carry-forward. +const runScopeKey = (r: BenchmarkRow): string => + `${r.model}|${r.precision}|${r.hardware}|${r.framework}|${r.benchmark_type}`; + +/** + * Merge run-scoped benchmark rows with the normal latest-per-config rows. + * + * When the user picks a specific workflow run (to disambiguate two same-day + * sweeps of the same config), only the configs that run actually produced + * should be pinned to it — every other config must keep its normal + * carry-forward rows. Scoping the whole chart to the run (the old behavior) + * silently hid complementary configs that happened to land on the same date, + * e.g. selecting one of two same-day vLLM runs made the day's SGLang curve + * vanish because it lived in a different workflow run. + * + * Run rows win for every (model, precision, hardware, framework, + * benchmark_type) group they cover; base rows fill in the rest. + */ +export function mergeRunScopedRows( + runRows: BenchmarkRow[], + baseRows: BenchmarkRow[], +): BenchmarkRow[] { + if (runRows.length === 0) return baseRows; + const claimed = new Set(runRows.map(runScopeKey)); + return [...runRows, ...baseRows.filter((r) => !claimed.has(runScopeKey(r)))]; +} + /** * Transform raw BenchmarkRow[] into chart-ready InferenceData[][] and HardwareConfig. * Returns one InferenceData[] per chart definition (e2e, interactivity). 
From d6d31436abf38eb32e6383ab692ff0b8519ca32c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 17 Jun 2026 11:25:49 -0500 Subject: [PATCH 65/96] fix: reconcile agentic data after master merge --- .../component/inference-chart-controls.cy.tsx | 4 +- .../inference/hooks/useChartData.ts | 8 +- .../components/inference/ui/ChartDisplay.tsx | 481 +++++++++--------- .../components/inference/ui/ScatterGraph.tsx | 5 +- .../components/unofficial-run-provider.tsx | 10 +- packages/app/src/lib/api.ts | 15 +- packages/db/src/queries/benchmarks.ts | 21 +- 7 files changed, 282 insertions(+), 262 deletions(-) diff --git a/packages/app/cypress/component/inference-chart-controls.cy.tsx b/packages/app/cypress/component/inference-chart-controls.cy.tsx index 03e6a50c..5a6311f4 100644 --- a/packages/app/cypress/component/inference-chart-controls.cy.tsx +++ b/packages/app/cypress/component/inference-chart-controls.cy.tsx @@ -14,8 +14,8 @@ describe('Inference ChartControls', () => { it('renders the sequence selector with the current sequence', () => { // Default mock: selectedSequence = Sequence.EightK_OneK -> label "8K / 1K" - cy.get('#sequence-select').should('be.visible'); - cy.get('#sequence-select').should('contain.text', '8K / 1K'); + cy.get('#scenario-select').should('be.visible'); + cy.get('#scenario-select').should('contain.text', '8K / 1K'); }); it('renders the precision multi-select with the current precision', () => { diff --git a/packages/app/src/components/inference/hooks/useChartData.ts b/packages/app/src/components/inference/hooks/useChartData.ts index 0d1eac64..ee5acb88 100644 --- a/packages/app/src/components/inference/hooks/useChartData.ts +++ b/packages/app/src/components/inference/hooks/useChartData.ts @@ -220,13 +220,7 @@ export function useChartData( data: runRows, isLoading: runLoading, error: runError, - } = useBenchmarks( - selectedModel, - '', - enabled && Boolean(selectedRunId), - selectedRunId, - true, - ); + } = useBenchmarks(selectedModel, '', enabled && 
Boolean(selectedRunId), selectedRunId, true); const allRows = useMemo(() => { if (!selectedRunId) return baseRows; diff --git a/packages/app/src/components/inference/ui/ChartDisplay.tsx b/packages/app/src/components/inference/ui/ChartDisplay.tsx index 3a431440..caf713cc 100644 --- a/packages/app/src/components/inference/ui/ChartDisplay.tsx +++ b/packages/app/src/components/inference/ui/ChartDisplay.tsx @@ -429,217 +429,206 @@ export default function ChartDisplay() { }); }, [useDerived, visibleGraphs, derivedMetrics, selectedXAxisMode, selectedYAxisMetric]); - const displayGraphs = isFirstLoad || isDerivedLoading - ? [ - - - - - , - ] - : renderableGraphs.length === 0 - ? [] - : renderableGraphs.map((graph, graphIndex) => { - const isTimelineMode = Boolean( - selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0, - ); - const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode; - return ( -
-
- handleViewModeChange(graphIndex, v)} - ariaLabel="View mode" - testId={`inference-view-toggle-${graphIndex}`} - /> - } - hideImageExport={getViewMode(graphIndex) === 'table'} - setIsLegendExpanded={setIsLegendExpanded} - exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`} - onExportMp4={ - replayAvailable ? () => replayHandlesRef.current[graphIndex]?.open() : undefined - } - onExportCsv={() => { - const visibleData = graph.data.filter((d) => + const displayGraphs = + isFirstLoad || isDerivedLoading + ? [ + + + + + , + ] + : renderableGraphs.length === 0 + ? [] + : renderableGraphs.map((graph, graphIndex) => { + const isTimelineMode = Boolean( + selectedDateRange.startDate && selectedDateRange.endDate && selectedGPUs.length > 0, + ); + const replayAvailable = getViewMode(graphIndex) === 'chart' && !isTimelineMode; + return ( +
+
+ - activeOverlayHwTypes.has(p.hwKey as string) && - selectedPrecisions.includes(p.precision), - ); - const issueNotes = matchKnownConfigIssues(graph.model, [ - ...visibleData, - ...visibleOverlayRows, - ]).map((issue) => - knownIssueCsvNote(issue, getDisplayLabel(getHardwareConfig(issue.hwKey))), - ); - exportToCsv( - `InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`, - headers, - rows, - issueNotes, - ); - }} - /> - - {(() => { - const chartCaption = ( - <> -

- { - graph.chartDefinition[ - `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition - ] - }{' '} - {(() => { - // For Input metrics with dynamic x-axis, use dynamic heading - const metricTitle = - (graph.chartDefinition[ - `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition - ] as string) || ''; - const isInputMetric = metricTitle.toLowerCase().includes('input'); - if ( - graph.chartDefinition.chartType === 'interactivity' && - isInputMetric && - selectedXAxisMetric - ) { - if (selectedXAxisMetric === 'p99_ttft') { - return 'vs. P99 Time To First Token'; - } else if (selectedXAxisMetric === 'median_ttft') { - return 'vs. Median Time To First Token'; - } - } - - // The e2e chart heading follows the branch-level x-axis mode - // selector, including agentic-only derived metrics. - if (graph.chartDefinition.chartType === 'e2e') { - if (selectedXAxisMode === 'session-time') { - return 'vs. Mean Normalized Session Time'; - } - if (selectedXAxisMode === 'prefill-tps') { - return 'vs. P90 Prefill TPS / user'; - } - const isAgentic = sequenceKind(selectedSequence) === 'agentic'; - if (selectedE2eXAxisMetric?.endsWith('_ttft')) { - const percentile = selectedE2eXAxisMetric.replace(/_ttft$/u, ''); - const word = - percentile === 'median' ? 'Median' : percentile.toUpperCase(); - return `vs. ${word} Time To First Token`; - } - return isAgentic - ? `vs. ${selectedPercentile.toUpperCase()} End-to-end Latency` - : 'vs. End-to-end Latency'; - } - - // Fall back to configured heading - return ( - graph.chartDefinition[ - `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition - ] || graph.chartDefinition.heading - ); - })()} -

-

- {getModelLabel(graph.model as Model)} •{' '} - {selectedPrecisions - .map((prec) => getPrecisionLabel(prec as Precision)) - .join(', ')}{' '} - • {getSequenceLabel(graph.sequence as Sequence)} •{' '} - {isUnofficialRun - ? 'Source: UNOFFICIAL' - : 'Source: SemiAnalysis InferenceX™'} - {selectedRunDate && ( - <> - {' '} - • Updated:{' '} - {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString( - 'en-US', - { - year: 'numeric', - month: '2-digit', - day: '2-digit', - timeZone: 'UTC', - }, - )} - - )} -

- - - - ); - - if (getViewMode(graphIndex) === 'table') { + ? 'gpu_timeseries' + : graph.chartDefinition.chartType === 'e2e' + ? 'latency' + : 'interactivity' + } + leadingControls={ + handleViewModeChange(graphIndex, v)} + ariaLabel="View mode" + testId={`inference-view-toggle-${graphIndex}`} + /> + } + hideImageExport={getViewMode(graphIndex) === 'table'} + setIsLegendExpanded={setIsLegendExpanded} + exportFileName={`InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`} + onExportMp4={ + replayAvailable + ? () => replayHandlesRef.current[graphIndex]?.open() + : undefined + } + onExportCsv={() => { + const visibleData = graph.data.filter((d) => + isTimelineMode + ? activeDates.has(`${d.date}_${d.hwKey}`) + : activeHwTypes.has(d.hwKey as string) && + selectedPrecisions.includes(d.precision), + ); + const { headers, rows } = inferenceChartToCsv( + visibleData, + graph.model, + graph.sequence, + ); + // Match warnings against the same series the chart annotates, + // including visible unofficial-run overlay series. const overlay = graph.chartDefinition.chartType === 'e2e' ? overlayDataByChartType.e2e : overlayDataByChartType.interactivity; - const overlayRows = (overlay?.data ?? []).filter((p) => - selectedPrecisions.includes(p.precision), + const visibleOverlayRows = isTimelineMode + ? [] + : (overlay?.data ?? []).filter( + (p) => + activeOverlayHwTypes.has(p.hwKey as string) && + selectedPrecisions.includes(p.precision), + ); + const issueNotes = matchKnownConfigIssues(graph.model, [ + ...visibleData, + ...visibleOverlayRows, + ]).map((issue) => + knownIssueCsvNote(issue, getDisplayLabel(getHardwareConfig(issue.hwKey))), ); - return ( + exportToCsv( + `InferenceX_${selectedModel}_${graph.chartDefinition.chartType}`, + headers, + rows, + issueNotes, + ); + }} + /> + + {(() => { + const chartCaption = ( <> - {chartCaption} - 0 ? 
[...graph.data, ...overlayRows] : graph.data - } - chartDefinition={graph.chartDefinition} - selectedYAxisMetric={selectedYAxisMetric} - /> +

+ { + graph.chartDefinition[ + `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition + ] + }{' '} + {(() => { + // For Input metrics with dynamic x-axis, use dynamic heading + const metricTitle = + (graph.chartDefinition[ + `${selectedYAxisMetric}_title` as keyof typeof graph.chartDefinition + ] as string) || ''; + const isInputMetric = metricTitle.toLowerCase().includes('input'); + if ( + graph.chartDefinition.chartType === 'interactivity' && + isInputMetric && + selectedXAxisMetric + ) { + if (selectedXAxisMetric === 'p99_ttft') { + return 'vs. P99 Time To First Token'; + } else if (selectedXAxisMetric === 'median_ttft') { + return 'vs. Median Time To First Token'; + } + } + + // The e2e chart heading follows the branch-level x-axis mode + // selector, including agentic-only derived metrics. + if (graph.chartDefinition.chartType === 'e2e') { + if (selectedXAxisMode === 'session-time') { + return 'vs. Mean Normalized Session Time'; + } + if (selectedXAxisMode === 'prefill-tps') { + return 'vs. P90 Prefill TPS / user'; + } + const isAgentic = sequenceKind(selectedSequence) === 'agentic'; + if (selectedE2eXAxisMetric?.endsWith('_ttft')) { + const percentile = selectedE2eXAxisMetric.replace(/_ttft$/u, ''); + const word = + percentile === 'median' ? 'Median' : percentile.toUpperCase(); + return `vs. ${word} Time To First Token`; + } + return isAgentic + ? `vs. ${selectedPercentile.toUpperCase()} End-to-end Latency` + : 'vs. End-to-end Latency'; + } + + // Fall back to configured heading + return ( + graph.chartDefinition[ + `${selectedYAxisMetric}_heading` as keyof typeof graph.chartDefinition + ] || graph.chartDefinition.heading + ); + })()} +

+

+ {getModelLabel(graph.model as Model)} •{' '} + {selectedPrecisions + .map((prec) => getPrecisionLabel(prec as Precision)) + .join(', ')}{' '} + • {getSequenceLabel(graph.sequence as Sequence)} •{' '} + {isUnofficialRun + ? 'Source: UNOFFICIAL' + : 'Source: SemiAnalysis InferenceX™'} + {selectedRunDate && ( + <> + {' '} + • Updated:{' '} + {new Date(`${selectedRunDate}T00:00:00Z`).toLocaleDateString( + 'en-US', + { + year: 'numeric', + month: '2-digit', + day: '2-digit', + timeZone: 'UTC', + }, + )} + + )} +

+ + ); - } - return selectedGPUs.length > 0 && - ((selectedDateRange.startDate && selectedDateRange.endDate) || - selectedDates.length > 0) ? ( - - ) : ( -
- + selectedPrecisions.includes(p.precision), + ); + return ( + <> + {chartCaption} + 0 + ? [...graph.data, ...overlayRows] + : graph.data + } + chartDefinition={graph.chartDefinition} + selectedYAxisMetric={selectedYAxisMetric} + /> + + ); + } + + return selectedGPUs.length > 0 && + ((selectedDateRange.startDate && selectedDateRange.endDate) || + selectedDates.length > 0) ? ( + - {selectedGPUs.length > 0 && - (!selectedDateRange.startDate || !selectedDateRange.endDate) && - selectedDates.length === 0 && ( -
-

- Select a date range or add a run to view GPU comparison -

-
- )} -
- ); - })()} - {replayAvailable && ( - { - replayHandlesRef.current[graphIndex] = handle; - }} - parentChartId={`chart-${graphIndex}`} - chartDefinition={graph.chartDefinition} - yLabel={`${ - graph.chartDefinition[ - `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition - ] - }`} - xLabel={graph.chartDefinition.x_label} - /> - )} -
-
-
- ); - }); + ) : ( +
+ + {selectedGPUs.length > 0 && + (!selectedDateRange.startDate || !selectedDateRange.endDate) && + selectedDates.length === 0 && ( +
+

+ Select a date range or add a run to view GPU comparison +

+
+ )} +
+ ); + })()} + {replayAvailable && ( + { + replayHandlesRef.current[graphIndex] = handle; + }} + parentChartId={`chart-${graphIndex}`} + chartDefinition={graph.chartDefinition} + yLabel={`${ + graph.chartDefinition[ + `${selectedYAxisMetric}_label` as keyof typeof graph.chartDefinition + ] + }`} + xLabel={graph.chartDefinition.x_label} + /> + )} + +
+
+ ); + }); return (
diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index 982c24d2..e1cad1a4 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -7,7 +7,6 @@ import React, { useCallback, useEffect, useMemo, useRef } from 'react'; import { GRADIENT_NUDGE_EVENT } from '@/lib/nudges/registry'; import { useInference } from '@/components/inference/InferenceContext'; import { useTraceAvailability } from '@/hooks/api/use-trace-availability'; -import { useRouter } from 'next/navigation'; import { pointNearestX } from '@/components/inference/ui/line-label-anchor'; import ChartLegend from '@/components/ui/chart-legend'; import { useUnofficialRun } from '@/components/unofficial-run-provider'; @@ -582,7 +581,6 @@ const ScatterGraph = React.memo( return ids; }, [pointsData]); const { data: traceAvailability } = useTraceAvailability(agenticIds); - const router = useRouter(); // Gradient label data const allPointLabelsByKey = useMemo(() => { @@ -902,7 +900,7 @@ const ScatterGraph = React.memo( }); chartRef.current?.dismissTooltip(); chartRef.current?.hideTooltip(); - router.push(`/inference/agentic/${pointId}`); + window.location.assign(`/inference/agentic/${pointId}`); }); } }, @@ -923,7 +921,6 @@ const ScatterGraph = React.memo( // presence fetch resolves so the button appears for points that // have a trace_replay blob. traceAvailability, - router, ], ); diff --git a/packages/app/src/components/unofficial-run-provider.tsx b/packages/app/src/components/unofficial-run-provider.tsx index b8e76f38..54b470ff 100644 --- a/packages/app/src/components/unofficial-run-provider.tsx +++ b/packages/app/src/components/unofficial-run-provider.tsx @@ -279,11 +279,11 @@ export function UnofficialRunProvider({ children }: { children: ReactNode }) { // Filter chart data by stamped `run_url`. 
A row belongs to the dismissed // run if its URL matches exactly OR the numeric id parses to the same. const belongsToDismissed = (rowUrl?: string | null) => { - if (!rowUrl) return false; - if (rowUrl === target.url) return true; - const m = rowUrl.match(/\/runs\/(?<runId>\d+)/u); - return m?.groups?.runId === runId; - }; + if (!rowUrl) return false; + if (rowUrl === target.url) return true; + const m = rowUrl.match(/\/runs\/(?<runId>\d+)/u); + return m?.groups?.runId === runId; + }; // Compute the filtered chart data BEFORE any setState so we can pass the // same value to setUnofficialChartData and parseAvailableModelsAndSequences. diff --git a/packages/app/src/lib/api.ts b/packages/app/src/lib/api.ts index 0dac5883..a9d66715 100644 --- a/packages/app/src/lib/api.ts +++ b/packages/app/src/lib/api.ts @@ -8,6 +8,8 @@ import type { WorkerPower } from '@/components/inference/types'; import type { SubmissionsResponse } from './submissions-types'; export interface BenchmarkRow { + /** Stable per-point id from benchmark_results; used for agentic detail lookups. */ + id: number; hardware: string; framework: string; model: string; @@ -25,9 +27,13 @@ export interface BenchmarkRow { decode_num_workers: number; num_prefill_gpu: number; num_decode_gpu: number; - isl: number; - osl: number; + benchmark_type: string; + // Null for agentic_traces rows; numeric for single_turn fixed-seq rows. + isl: number | null; + osl: number | null; conc: number; + /** KV-cache offload mode. Defaults to 'off' for fixed-sequence rows.
*/ + offload_mode: string; image: string | null; metrics: Record; /** @@ -176,13 +182,14 @@ export function fetchWorkflowInfo(date: string, signal?: AbortSignal) { export interface AvailabilityRow { model: string; - isl: number; - osl: number; + isl: number | null; + osl: number | null; precision: string; hardware: string; framework: string; spec_method: string; disagg: boolean; + benchmark_type: string; date: string; } diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts index 49c60604..6833756a 100644 --- a/packages/db/src/queries/benchmarks.ts +++ b/packages/db/src/queries/benchmarks.ts @@ -11,6 +11,8 @@ import type { WorkerPower } from '../etl/benchmark-mapper.js'; export type BenchmarkWorkerRow = WorkerPower; export interface BenchmarkRow { + /** Stable benchmark_results id used for agentic detail lookups. */ + id: number; hardware: string; framework: string; model: string; @@ -28,9 +30,11 @@ export interface BenchmarkRow { decode_num_workers: number; num_prefill_gpu: number; num_decode_gpu: number; - isl: number; - osl: number; + benchmark_type: string; + isl: number | null; + osl: number | null; conc: number; + offload_mode: string; image: string | null; metrics: Record; /** @@ -95,6 +99,7 @@ export async function getLatestBenchmarks( : sql``; const rows = await sql` SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl) + br.id, c.hardware, c.framework, c.model, @@ -112,6 +117,8 @@ export async function getLatestBenchmarks( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + br.benchmark_type, + br.offload_mode, br.isl, br.osl, br.conc, @@ -136,6 +143,7 @@ export async function getLatestBenchmarks( // No date filter: use materialized view for instant lookups const rows = await sql` SELECT + lb.id, c.hardware, c.framework, c.model, @@ -153,6 +161,8 @@ export async function getLatestBenchmarks( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + lb.benchmark_type, + lb.offload_mode, lb.isl, lb.osl, 
lb.conc, @@ -185,6 +195,7 @@ export async function getBenchmarksForRun( const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey]; const rows = await sql` SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl) + br.id, c.hardware, c.framework, c.model, @@ -202,6 +213,8 @@ export async function getBenchmarksForRun( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + br.benchmark_type, + br.offload_mode, br.isl, br.osl, br.conc, @@ -235,6 +248,7 @@ export async function getAllBenchmarksForHistory( const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey]; const rows = await sql` SELECT + br.id, c.hardware, c.framework, c.model, @@ -252,9 +266,12 @@ export async function getAllBenchmarksForHistory( c.decode_num_workers, c.num_prefill_gpu, c.num_decode_gpu, + br.benchmark_type, + br.offload_mode, br.isl, br.osl, br.conc, + br.image, br.metrics - '{std_ttft,std_tpot,std_e2el,std_intvty,std_itl,mean_ttft,mean_tpot,mean_e2el,mean_intvty,mean_itl}'::text[] as metrics, br.workers, br.date::text, From f60ef9c7f18a1782edd5542510328b242048a2de Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 17 Jun 2026 11:34:00 -0500 Subject: [PATCH 66/96] fix(gpu-compare): show concurrency (C=) over points GPU compare mode (GPUGraph) labeled points with only the parallelism/tp string, dropping the C= suffix that the single-run scatter chart (ScatterGraph) shows. Append it so compare-mode points are annotated the same way. Verified live in compare mode: points now read e.g. 'DEP8 / C=2048', 'TP4 / C=64'. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/app/src/components/inference/ui/GPUGraph.tsx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/packages/app/src/components/inference/ui/GPUGraph.tsx b/packages/app/src/components/inference/ui/GPUGraph.tsx index e7737a2e..24b1266f 100644 --- a/packages/app/src/components/inference/ui/GPUGraph.tsx +++ b/packages/app/src/components/inference/ui/GPUGraph.tsx @@ -759,7 +759,11 @@ const GPUGraph = React.memo( config: { getColor, hideLabels: hidePointLabels, - getLabelText: (d) => (useAdvancedLabels ? getPointLabel(d) : String(d.tp)), + // Match ScatterGraph: append the concurrency (C=) to the + // parallelism/tp label so compare-mode points are annotated the + // same way as the single-run scatter chart. + getLabelText: (d) => + useAdvancedLabels ? `${getPointLabel(d)}\nC=${d.conc}` : `${d.tp}\nC=${d.conc}`, foreground: 'var(--foreground)', dataAttrs: { series: (d) => `${d.date}_${d.hwKey}`, From 22028ccfe3141aa632b4c23aaca26b9c4bd51b58 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 17 Jun 2026 11:43:42 -0500 Subject: [PATCH 67/96] fix(agentic-timeline): hide no-op phase toggle; fixed-height scroll window MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes to the conversation/request-timeline view: 1. The Profiling vs 'All (incl. warmup)' toggle never did anything — aiperf's profile_export only contains profiling-phase requests, so every stored record has phase='profiling' (verified: 297k/297k rows). Hide the toggle unless a non-profiling request actually exists, so it reappears and works only if warmup is ever exported. 2. The timeline grew to fit every conversation/worker, making the card arbitrarily tall. Cap the body at a fixed height (480px) and scroll the rows vertically inside it. Few-row runs still size to content (no empty space); the label column and bars scroll together since they share the one scroll container. 
Verified live on a 3475-request point: phase toggle absent, row-mode toggle still present, window clientHeight 480 with ~3745px scrolling inside. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../agentic-point/request-timeline.tsx | 474 +++++++++--------- 1 file changed, 249 insertions(+), 225 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx index 3c032fdd..2313775e 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx +++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx @@ -30,6 +30,11 @@ const PHASE_OPTIONS: SegmentedToggleOption[] = [ { value: 'all', label: 'All (incl. warmup)', testId: 'timeline-phase-all' }, ]; +// The timeline body is capped at this height and scrolls internally, so a run +// with many conversations/workers doesn't make the card grow unbounded and push +// the rest of the detail page down. Sized to show ~16 rows + the header. +const TIMELINE_BODY_MAX_HEIGHT = 480; + /** A stable color palette indexed by row-key hash. */ const ROW_COLORS = [ '#3b82f6', @@ -393,11 +398,24 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { }, []); const dragRef = useRef<{ startX: number; vs: number; ve: number } | null>(null); - // Apply phase filter, then group into rows. + // The phase toggle only means something when warmup requests are actually + // present. aiperf's profile_export only contains profiling-phase requests, so + // in practice every record is `profiling` and the toggle is a no-op — hide it + // unless a non-profiling request exists (keeps it working if warmup is ever + // exported). + const hasWarmup = useMemo( + () => data.requests.some((r) => r.phase !== 'profiling'), + [data.requests], + ); + + // Apply phase filter, then group into rows. 
With no warmup data the filter + // collapses to "profiling" regardless of the (hidden) toggle state. const filtered = useMemo( () => - phaseFilter === 'all' ? data.requests : data.requests.filter((r) => r.phase === 'profiling'), - [data.requests, phaseFilter], + phaseFilter === 'all' && hasWarmup + ? data.requests + : data.requests.filter((r) => r.phase === 'profiling'), + [data.requests, phaseFilter, hasWarmup], ); const rows = useMemo( () => buildRows(filtered, rowMode, expandedSubagents), @@ -581,14 +599,16 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { testId="timeline-row-mode" buttonClassName="px-2.5 py-1 text-xs" /> - + {hasWarmup && ( + + )} {totalRequests} request{totalRequests === 1 ? '' : 's'} · {rows.length}{' '} {rowMode === 'conversation' ? 'conversations' : 'workers'} · span{' '} @@ -606,243 +626,247 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { {/* Chart container */}
-
- {/* Label column — sticky, doesn't scroll horizontally with the chart. */} -
+ {/* Fixed-height window: the rows scroll vertically inside it instead of + the card growing to fit every conversation/worker. */} +
+
+ {/* Label column — sticky, doesn't scroll horizontally with the chart. */}
- - {rowMode === 'conversation' ? 'Conversation' : 'Worker'} - -
- {rows.map((row) => { - const isSubagentRow = row.kind === 'subagent'; - const isStreamRow = row.kind === 'stream'; - const isExpandable = isSubagentRow && (row.streamCount ?? 1) > 1; - const isExpanded = isExpandable && expandedSubagents.has(row.key); - return ( -
- {isExpandable ? ( - - ) : ( - - )} - - + + {rowMode === 'conversation' ? 'Conversation' : 'Worker'} + +
+ {rows.map((row) => { + const isSubagentRow = row.kind === 'subagent'; + const isStreamRow = row.kind === 'stream'; + const isExpandable = isSubagentRow && (row.streamCount ?? 1) > 1; + const isExpanded = isExpandable && expandedSubagents.has(row.key); + return ( +
- {row.label} - {isExpandable && ( - ×{row.streamCount} + {isExpandable ? ( + + ) : ( + )} - - - {row.requests.length > 0 ? row.requests.length : '—'} - -
- ); - })} -
- - {/* Scrollable SVG */} -
- - {/* Header / time-axis baseline */} - - - {/* Time axis ticks */} - {ticks.map((t) => { - // Convert visible-window ns offset → x px (the tick array - // is already in dataStart-relative coords). - const x = (t - vStart) * scale; - return ( - - - - {formatTickLabel(t)} - - + {row.label} + {isExpandable && ( + ×{row.streamCount} + )} + + + {row.requests.length > 0 ? row.requests.length : '—'} + +
); })} +
- {/* Row separators */} - {rows.map((row, idx) => ( + {/* Scrollable SVG */} +
+ + {/* Header / time-axis baseline */} - ))} - - {/* Request bars */} - {rows.map((row, rowIdx) => { - const yTop = HEADER_HEIGHT + rowIdx * (ROW_HEIGHT + ROW_GAP) + 2; - const barH = ROW_HEIGHT - 4; - // For multi-stream subagent containers, suppress the union - // bars when expanded — the child stream rows draw them - // individually instead, so we'd double-draw otherwise. - if ( - row.kind === 'subagent' && - (row.streamCount ?? 1) > 1 && - expandedSubagents.has(row.key) - ) { - return null; - } - return row.requests.map((req) => { - const xCredit = xOf(req.credit); - const xStart = xOf(req.start); - const xEnd = xOf(req.end); - // Cull bars entirely outside the visible window so big - // benchmarks don't render thousands of zero-width rects. - if (xEnd < -2 || xCredit > chartWidth + 2) return null; - const runW = Math.max(xEnd - xStart, 1); - const queueW = Math.max(xStart - xCredit, 0); - const phaseColor = PHASE_COLORS[req.phase] ?? PHASE_COLORS.unknown!; + + {/* Time axis ticks */} + {ticks.map((t) => { + // Convert visible-window ns offset → x px (the tick array + // is already in dataStart-relative coords). + const x = (t - vStart) * scale; return ( - setTooltip({ x: e.clientX, y: e.clientY, row, req })} - onMouseLeave={() => setTooltip(null)} - > - {/* Queue lead-in (faint) — only drawn when noticeable. */} - {queueW >= 1 && ( + + + + {formatTickLabel(t)} + + + ); + })} + + {/* Row separators */} + {rows.map((row, idx) => ( + + ))} + + {/* Request bars */} + {rows.map((row, rowIdx) => { + const yTop = HEADER_HEIGHT + rowIdx * (ROW_HEIGHT + ROW_GAP) + 2; + const barH = ROW_HEIGHT - 4; + // For multi-stream subagent containers, suppress the union + // bars when expanded — the child stream rows draw them + // individually instead, so we'd double-draw otherwise. + if ( + row.kind === 'subagent' && + (row.streamCount ?? 
1) > 1 && + expandedSubagents.has(row.key) + ) { + return null; + } + return row.requests.map((req) => { + const xCredit = xOf(req.credit); + const xStart = xOf(req.start); + const xEnd = xOf(req.end); + // Cull bars entirely outside the visible window so big + // benchmarks don't render thousands of zero-width rects. + if (xEnd < -2 || xCredit > chartWidth + 2) return null; + const runW = Math.max(xEnd - xStart, 1); + const queueW = Math.max(xStart - xCredit, 0); + const phaseColor = PHASE_COLORS[req.phase] ?? PHASE_COLORS.unknown!; + return ( + setTooltip({ x: e.clientX, y: e.clientY, row, req })} + onMouseLeave={() => setTooltip(null)} + > + {/* Queue lead-in (faint) — only drawn when noticeable. */} + {queueW >= 1 && ( + + )} + {/* Main bar — opacity stepped down with depth so + parent > subagent > stream reads visually. */} - )} - {/* Main bar — opacity stepped down with depth so - parent > subagent > stream reads visually. */} - - {/* Phase strip at bottom */} - - {/* Cancelled X overlay */} - {req.cancelled && runW > 6 && ( - - )} - - ); - }); - })} - - {/* Cursor crosshair — drawn on top of bars so it stays visible + {/* Cancelled X overlay */} + {req.cancelled && runW > 6 && ( + + )} + + ); + }); + })} + + {/* Cursor crosshair — drawn on top of bars so it stays visible through dense rows. Stats popover is rendered as fixed HTML below the SVG block. */} - {cursor && ( - - )} - + {cursor && ( + + )} + +
From 28d25a53b7e3543a3d91e9c19f05b2409c20c032 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 17 Jun 2026 11:50:26 -0500 Subject: [PATCH 68/96] feat(agentic-timeline): sticky bottom h-scroll + double-click to reset zoom The fixed-height window put the chart's horizontal scrollbar at the bottom of the tall (full-height) content, below the fold and unreachable. Make the window itself the single scroll container (overflow-auto, both axes) and pin the label column with position:sticky left-0, so the horizontal scrollbar stays at the window's bottom edge while the label column stays put during horizontal scroll and scrolls with the rows vertically. Also add double-click anywhere on the timeline to reset zoom/pan (same resetZoom the existing button calls) and note it in the hint text. Verified live: window scrollW 1280 > clientW 879 (h-scroll present and working), label column sticky, rows scroll vertically. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../agentic-point/request-timeline.tsx | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx index 2313775e..7c5fdab0 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx +++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx @@ -626,13 +626,16 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { {/* Chart container */}
- {/* Fixed-height window: the rows scroll vertically inside it instead of - the card growing to fit every conversation/worker. */} -
-
- {/* Label column — sticky, doesn't scroll horizontally with the chart. */} + {/* Fixed-height window: rows scroll vertically and the chart scrolls + horizontally inside it, so the card doesn't grow to fit every + conversation/worker AND the horizontal scrollbar stays pinned to the + window's bottom edge (rather than the bottom of the tall content). */} +
+
+ {/* Label column — pinned left (sticky) so it stays put during + horizontal scroll, while scrolling vertically with the rows. */}
- {/* Scrollable SVG */} -
+ {/* Chart column — horizontal scrolling is handled by the window + container above so its scrollbar stays pinned to the window's + bottom edge; double-click anywhere resets the zoom. */} +
{/* Header / time-axis baseline */} warmup - scroll to zoom · drag to pan + + scroll to zoom · drag to pan · double-click to reset +
{/* Cursor stats popover: count of in-flight / waiting at the cursor's From 6e56bbfb2a29c6ffad2e4d4484bfcb6673fdacfd Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 18 Jun 2026 09:29:18 -0500 Subject: [PATCH 69/96] fix(gpu-compare): show CPU-offload halo on points MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dashed offload-mode ring (drawn in ScatterGraph's onRender for every point with offload_mode='on') was missing from GPU compare mode (GPUGraph), so the CPU-offloading indicator never appeared there. Mirror it in GPUGraph's onRender — same dashed var(--foreground) ring at POINT_SIZE+4, appended inside each .dot-group so it travels with the point on zoom/pan. Verified live in compare mode (DSv4 B200/B300 agentic): offload points now render the dashed halo (5 rings, r=7.5, dash 3 2). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/components/inference/ui/GPUGraph.tsx | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/packages/app/src/components/inference/ui/GPUGraph.tsx b/packages/app/src/components/inference/ui/GPUGraph.tsx index 24b1266f..19ba574f 100644 --- a/packages/app/src/components/inference/ui/GPUGraph.tsx +++ b/packages/app/src/components/inference/ui/GPUGraph.tsx @@ -26,6 +26,7 @@ import { formatLargeNumber, getShapeKeyForPrecision, logTickFormat, + POINT_SIZE, } from '@/lib/chart-rendering'; import { paretoFrontLowerLeft, @@ -827,6 +828,28 @@ const GPUGraph = React.memo( } // Set foreground color on scatter point labels ctx.layout.zoomGroup.selectAll('.point-label').style('fill', 'var(--foreground)'); + + // Offload halo: dashed ring on every point that used KV offload + // (mirrors ScatterGraph so compare mode shows the same CPU-offload + // indicator). The ring is a child of the dot-group, so it travels + // with the point on zoom/pan without a separate onZoom pass. 
+ ctx.layout.zoomGroup + .selectAll('.dot-group') + .each(function (d) { + const showHalo = d.offload_mode === 'on'; + d3.select(this) + .selectAll('.offload-halo') + .data(showHalo ? [true] : []) + .join('circle') + .attr('class', 'offload-halo') + .attr('r', POINT_SIZE + 4) + .attr('fill', 'none') + .attr('stroke', 'var(--foreground)') + .attr('stroke-width', 1.5) + .attr('stroke-dasharray', '3 2') + .attr('opacity', 0.9) + .attr('pointer-events', 'none'); + }); }} legendElement={ Date: Thu, 18 Jun 2026 12:56:08 -0500 Subject: [PATCH 70/96] fix(high-contrast): use full hue wheel for single-vendor comparisons MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit generateHighContrastColors clamps each vendor's series into its brand hue zone (NVIDIA=green, AMD=red) at <=PREFERRED_MAX items. The point of that clamp is to keep DIFFERENT vendors apart at a glance — but when only one vendor is present (the common all-NVIDIA agentic comparison: B200/B300 x vLLM/SGLang), there's no rival to separate from, so every series collapses into the same narrow green band and high-contrast mode looks like it does nothing. When a single vendor is present, skip the brand zone and rival-ban and use the full hue wheel for maximum separation. Verified on an all-NVIDIA agentic view: HC now spreads pink/blue/gold/green (hues 45/99/227/330, min adjacent gap 54deg) instead of four near-identical greens. Multi-vendor behavior is unchanged — vendors keep their brand zones so they stay distinguishable. The non-HC palette still carries vendor identity. Updated the single-vendor color tests to assert separability across the full wheel rather than brand-zone confinement. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/app/src/lib/chart-utils.test.ts | 39 ++++++++++-------------- packages/app/src/lib/chart-utils.ts | 19 ++++++++++-- 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/packages/app/src/lib/chart-utils.test.ts b/packages/app/src/lib/chart-utils.test.ts index 061037ed..f6828ce2 100644 --- a/packages/app/src/lib/chart-utils.test.ts +++ b/packages/app/src/lib/chart-utils.test.ts @@ -353,30 +353,29 @@ describe('generateHighContrastColors', () => { expect(Object.values(dark).join(',')).not.toEqual(Object.values(light).join(',')); }); - // ---------- Tier 1: few items → brand zone ---------- - - it('3 NVIDIA GPUs are not red', () => { + // ---------- Single vendor: full wheel for maximum contrast ---------- + // Brand-zone / rival-ban only apply when MULTIPLE vendors are present (so the + // vendors stay visually separable). With a single vendor there's no rival to + // distinguish from, so HC opens the full hue wheel — brand hue is sacrificed + // for the contrast HC exists to provide (fixes the all-NVIDIA agentic case + // where every series otherwise collapsed into the green brand band). 
+ + it('3 NVIDIA GPUs (single vendor) are distinguishable across the full wheel', () => { const result = generateHighContrastColors(['h100_vllm', 'h200_vllm', 'b200_vllm'], 'dark'); - for (const color of Object.values(result)) { - expect(isNotReddish(parseRgb(color))).toBe(true); - } + expect(Object.keys(result)).toHaveLength(3); assertMinDist(result, 30); }); - it('2 AMD GPUs are not green', () => { + it('2 AMD GPUs (single vendor) are distinguishable across the full wheel', () => { const result = generateHighContrastColors(['mi300x_sglang', 'mi325x_sglang'], 'dark'); - for (const color of Object.values(result)) { - expect(isNotGreenish(parseRgb(color))).toBe(true); - } + expect(Object.keys(result)).toHaveLength(2); assertMinDist(result, 30); }); - it('4 NVIDIA GPUs stay in brand zone and are distinguishable', () => { + it('4 NVIDIA GPUs (single vendor) use the full wheel and stay well-separated', () => { const keys = ['h100_vllm', 'h200_vllm', 'b200_vllm', 'b300_vllm']; const result = generateHighContrastColors(keys, 'dark'); - for (const color of Object.values(result)) { - expect(isNotReddish(parseRgb(color))).toBe(true); - } + expect(Object.keys(result)).toHaveLength(4); assertMinDist(result, 25); }); @@ -401,19 +400,13 @@ describe('generateHighContrastColors', () => { assertMinDist(result, 25); }); - // ---------- Tier 2: moderate items → full wheel minus rival color ---------- + // ---------- Single vendor, many items → full wheel, best spacing ---------- - it('10 NVIDIA GPUs: no red hues, still distinguishable', () => { + it('10 NVIDIA GPUs (single vendor) are well-separated across the full wheel', () => { const gpus = ['h100', 'h200', 'b200', 'b300', 'gb200']; const keys = gpus.flatMap((g) => [`${g}_vllm`, `${g}_sglang`]); const result = generateHighContrastColors(keys, 'dark'); - // Should not be reddish (banned) - for (const color of Object.values(result)) { - const rgb = parseRgb(color); - // Not red-dominant with low green — i.e. 
not in the red/pink zone - const isRedPink = rgb[0] > 150 && rgb[1] < 80 && rgb[2] < 150; - expect(isRedPink).toBe(false); - } + expect(Object.keys(result)).toHaveLength(10); assertMinDist(result, 20); }); diff --git a/packages/app/src/lib/chart-utils.ts b/packages/app/src/lib/chart-utils.ts index 33a5b4e3..3eeda15b 100644 --- a/packages/app/src/lib/chart-utils.ts +++ b/packages/app/src/lib/chart-utils.ts @@ -61,10 +61,17 @@ const PALETTE_CACHE = new Map(); /** * Generates high-contrast colors using iwanthue (k-means in CIELab space). * - * Tiered strategy per vendor: + * Tiered strategy per vendor (only when >1 vendor is present): * ≤ PREFERRED_MAX → constrain to brand zone (NVIDIA=green, AMD=red) * ≤ BAN_MAX → full wheel minus rival's brand color * > BAN_MAX → full wheel, no restrictions, best spacing wins + * + * Single-vendor case (e.g. an all-NVIDIA agentic comparison of B200/B300 × + * vLLM/SGLang): the brand zone and rival-ban exist to keep vendors apart at a + * glance, but with one vendor there's no rival — clamping every series into the + * same narrow hue band just collapses the contrast HC is supposed to maximize. + * So skip both restrictions and use the full wheel, giving the series the widest + * possible separation. */ export const generateHighContrastColors = ( keys: string[], @@ -91,6 +98,12 @@ export const generateHighContrastColors = ( list.push(key); } + // Brand-zone / rival-ban only serve to keep DIFFERENT vendors apart. With a + // single vendor present there's nothing to separate from, so those + // restrictions only shrink the usable hue range and kill contrast — open the + // full wheel instead (the common all-NVIDIA agentic comparison case). + const multiVendor = groups.size > 1; + for (const [vendor, vendorKeys] of groups) { const count = vendorKeys.length; const isBanned = BANNED_HUE_TEST[vendor] ?? 
null; @@ -99,8 +112,8 @@ export const generateHighContrastColors = ( // Tier 1: few items → brand zone only // Tier 2: moderate → full wheel minus rival color // Tier 3: many → full wheel, no restrictions - const usePreferred = preferred && count <= PREFERRED_MAX; - const useBan = !usePreferred && isBanned && count <= BAN_MAX; + const usePreferred = multiVendor && preferred && count <= PREFERRED_MAX; + const useBan = multiVendor && !usePreferred && isBanned && count <= BAN_MAX; // Everything iwanthue's output depends on (the ban filter and preferred // zone are functions of vendor; the seed is vendor+theme). From 6275aa70bf0162cd83762ff79a2e0a5c053270e2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 22 Jun 2026 10:17:42 -0500 Subject: [PATCH 71/96] feat(inference): default line labels off, parallelism labels + high contrast on Change the inference chart's default toggle states: - Line Labels: on -> off (i_linelabel=1 overrides on) - Parallelism Labels: off -> on, which also defaults point labels on since parallelism labels ARE point labels (i_advlabel=0 overrides off) - High Contrast: off -> on, via a new opt-in defaultHighContrast on useChartUIState so reliability/evaluation (r_/e_ prefixes) stay off; i_hc=0 overrides off. Historical trends shares the inference context so it inherits the high-contrast default too. URL serialization flipped to omit each param at its new default and only write the override value, so share links stay clean. Updated line-labels, gradient-labels, and url-params E2E specs to the new defaults. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../app/cypress/e2e/gradient-labels.cy.ts | 16 +++++----- packages/app/cypress/e2e/line-labels.cy.ts | 31 ++++++++++++------- packages/app/cypress/e2e/url-params.cy.ts | 14 +++++++-- .../components/inference/InferenceContext.tsx | 25 ++++++++------- packages/app/src/hooks/useChartContext.ts | 12 +++++-- 5 files changed, 61 insertions(+), 37 deletions(-) diff --git a/packages/app/cypress/e2e/gradient-labels.cy.ts b/packages/app/cypress/e2e/gradient-labels.cy.ts index 333baa6d..a0753e90 100644 --- a/packages/app/cypress/e2e/gradient-labels.cy.ts +++ b/packages/app/cypress/e2e/gradient-labels.cy.ts @@ -24,8 +24,8 @@ describe('Gradient Labels Toggle', () => { cy.get('label[for="scatter-parallelism-labels"]').should('contain.text', 'Parallelism Labels'); }); - it('Parallelism Labels toggle is off by default', () => { - cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'unchecked'); + it('Parallelism Labels toggle is on by default', () => { + cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked'); }); it('per-point labels are visible by default (gradient labels off)', () => { @@ -60,21 +60,19 @@ describe('Gradient Labels Toggle', () => { }); it('both toggles can be enabled simultaneously', () => { - // Turn on Gradient Labels (off by default) + // Parallelism Labels is on by default; ensure it's on, then turn on Gradient. 
+ cy.get('#scatter-parallelism-labels').then(($el) => { + if ($el.attr('data-state') !== 'checked') cy.wrap($el).click(); + }); cy.get('#scatter-gradient-labels').click(); cy.get('#scatter-gradient-labels').should('have.attr', 'data-state', 'checked'); - // Turn on Parallelism Labels - cy.get('#scatter-parallelism-labels').click(); - cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked'); - // Both should be checked cy.get('#scatter-gradient-labels').should('have.attr', 'data-state', 'checked'); cy.get('#scatter-parallelism-labels').should('have.attr', 'data-state', 'checked'); - // Reset for next tests + // Reset gradient for next tests (parallelism stays at its default-on). cy.get('#scatter-gradient-labels').click(); - cy.get('#scatter-parallelism-labels').click(); }); it('URL param i_gradlabel=1 enables gradient labels on load', () => { diff --git a/packages/app/cypress/e2e/line-labels.cy.ts b/packages/app/cypress/e2e/line-labels.cy.ts index 84e655f8..23b372df 100644 --- a/packages/app/cypress/e2e/line-labels.cy.ts +++ b/packages/app/cypress/e2e/line-labels.cy.ts @@ -15,26 +15,30 @@ describe('Line Labels Toggle', () => { cy.get('label[for="scatter-line-labels"]').should('contain.text', 'Line Labels'); }); - it('Line Labels toggle is on by default', () => { - cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'checked'); - - // Line labels render without any interaction - cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0); - }); - - it('toggling Line Labels off then back on removes and restores label elements', () => { - // On by default — turn it off first. 
- cy.get('#scatter-line-labels').click(); + it('Line Labels toggle is off by default', () => { cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'unchecked'); + + // No line labels render without interaction cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length', 0); + }); - // Turn it back on — labels return. + it('toggling Line Labels on then back off adds and removes label elements', () => { + // Off by default — turn it on first. cy.get('#scatter-line-labels').click(); cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'checked'); cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0); + + // Turn it back off — labels disappear. + cy.get('#scatter-line-labels').click(); + cy.get('#scatter-line-labels').should('have.attr', 'data-state', 'unchecked'); + cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length', 0); }); it('line labels have colored background rects and text', () => { + // Off by default — ensure on (idempotent; prior test left them off). + cy.get('#scatter-line-labels').then(($el) => { + if ($el.attr('data-state') !== 'checked') cy.wrap($el).click(); + }); // Each line label group should contain a background rect and text cy.get('[data-testid="scatter-graph"] svg g.line-label .ll-bg').should( 'have.length.greaterThan', @@ -47,7 +51,10 @@ describe('Line Labels Toggle', () => { }); it('line labels render in the foreground, after the scatter points', () => { - // Labels were toggled on in the test above and remain on here. + // Off by default — ensure on (idempotent; previous test leaves them on). 
+ cy.get('#scatter-line-labels').then(($el) => { + if ($el.attr('data-state') !== 'checked') cy.wrap($el).click(); + }); cy.get('[data-testid="scatter-graph"] svg g.line-label').should('have.length.greaterThan', 0); cy.get('[data-testid="scatter-graph"] svg').then(($svg) => { diff --git a/packages/app/cypress/e2e/url-params.cy.ts b/packages/app/cypress/e2e/url-params.cy.ts index 33282b9c..3c480686 100644 --- a/packages/app/cypress/e2e/url-params.cy.ts +++ b/packages/app/cypress/e2e/url-params.cy.ts @@ -236,9 +236,15 @@ describe('URL Parameter Persistence', () => { }); describe('High contrast mode', () => { - it('page loads without high contrast by default', () => { + it('inference loads with high contrast on by default', () => { visitWithDismissedModal('/inference'); cy.get('[data-testid="scatter-graph"]').should('exist'); + cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'checked'); + }); + + it('i_hc=0 disables high contrast on load', () => { + visitWithDismissedModal('/inference?i_hc=0'); + cy.get('[data-testid="scatter-graph"]').should('exist'); cy.get('#scatter-high-contrast').first().should('have.attr', 'data-state', 'unchecked'); }); @@ -267,10 +273,12 @@ describe('URL Parameter Persistence', () => { cy.get('#eval-high-contrast').first().should('have.attr', 'data-state', 'checked'); }); - it('historical trends tab has high contrast switch off by default', () => { + it('historical trends tab shares the inference high-contrast default (on)', () => { + // Historical reads highContrast from the same InferenceContext as the + // scatter chart, so it inherits the default-on behavior. 
visitWithDismissedModal('/historical'); cy.get('[data-testid="historical-trends-display"]').should('exist'); - cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'unchecked'); + cy.get('#historical-high-contrast').first().should('have.attr', 'data-state', 'checked'); }); it('i_hc=1 enables historical trends high contrast', () => { diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index d66febd0..c2c599ff 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -195,6 +195,8 @@ export function InferenceProvider({ ); const { highContrast, setHighContrast, isLegendExpanded, setIsLegendExpanded } = useChartUIState({ urlPrefix: 'i_', + // Inference chart defaults to high contrast (?i_hc=0 overrides off). + defaultHighContrast: true, }); const [hideNonOptimal, setHideNonOptimal] = useState(() => getUrlParam('i_optimal') !== '0'); @@ -202,21 +204,22 @@ export function InferenceProvider({ // Legacy `?i_nolabel=1` from before the rename: keep hiding point labels // explicitly so the share link's intent survives future default changes. if (getUrlParam('i_nolabel') === '1') return false; + if (getUrlParam('i_label') === '0') return false; if (getUrlParam('i_label') === '1') return true; - // Old share links set `?i_advlabel=1` while keeping the labels default - // (shown). Mirror the toggle's auto-enable side-effect on load so those - // links still render advanced labels under the new default-off behavior. - if (getUrlParam('i_advlabel') === '1') return true; - return false; + // Default on: parallelism labels (also default on) are point labels and + // are pointless without them shown. + return true; }); const [logScale, setLogScale] = useState(() => getUrlParam('i_log') === '1'); + // Parallelism labels default on (?i_advlabel=0 overrides off). 
const [useAdvancedLabels, setUseAdvancedLabels] = useState( - () => getUrlParam('i_advlabel') === '1', + () => getUrlParam('i_advlabel') !== '0', ); const [showGradientLabels, setShowGradientLabels] = useState( () => getUrlParam('i_gradlabel') === '1', ); - const [showLineLabels, setShowLineLabels] = useState(() => getUrlParam('i_linelabel') !== '0'); + // Line labels default off (?i_linelabel=1 overrides on). + const [showLineLabels, setShowLineLabels] = useState(() => getUrlParam('i_linelabel') === '1'); const [showSpeedOverlay, setShowSpeedOverlay] = useState(() => getUrlParam('i_speed') === '1'); const [showMinecraftOverlay, setShowMinecraftOverlay] = useState( () => getUrlParam('i_mc') === '1', @@ -983,17 +986,17 @@ export function InferenceProvider({ i_dstart: selectedDateRange.startDate, i_dend: selectedDateRange.endDate, i_optimal: hideNonOptimal ? '' : '0', - i_label: showPointLabels ? '1' : '', - i_hc: highContrast ? '1' : '', + i_label: showPointLabels ? '' : '0', + i_hc: highContrast ? '' : '0', i_log: logScale ? '1' : '', i_xmetric: selectedXAxisMetric || '', i_e2e_xmetric: selectedE2eXAxisMetric || '', i_xmode: selectedXAxisMode, i_scale: scaleType, i_legend: isLegendExpanded ? '' : '0', - i_advlabel: useAdvancedLabels ? '1' : '', + i_advlabel: useAdvancedLabels ? '' : '0', i_gradlabel: showGradientLabels ? '1' : '', - i_linelabel: showLineLabels ? '' : '0', + i_linelabel: showLineLabels ? '1' : '', i_speed: showSpeedOverlay ? '1' : '', i_mc: showMinecraftOverlay ? 
'1' : '', i_active: iActiveStr, diff --git a/packages/app/src/hooks/useChartContext.ts b/packages/app/src/hooks/useChartContext.ts index 49812c3e..be095430 100644 --- a/packages/app/src/hooks/useChartContext.ts +++ b/packages/app/src/hooks/useChartContext.ts @@ -37,6 +37,12 @@ export function reconcileActiveSet( interface UseChartStateConfig { /** URL parameter prefix (e.g., 'i_' for inference, 'r_' for reliability, 'e_' for evaluation) */ urlPrefix: string; + /** + * Initial high-contrast value when the URL has no `hc` param. + * Defaults to false; the inference chart opts in to true. A `hc=0` + * URL param overrides it back off. + */ + defaultHighContrast?: boolean; } /** @@ -44,7 +50,7 @@ interface UseChartStateConfig { * Includes mobile-specific legend collapse behavior. */ export function useChartUIState(config: UseChartStateConfig) { - const { urlPrefix } = config; + const { urlPrefix, defaultHighContrast = false } = config; const { getUrlParam } = useUrlState(); const hcParam = `${urlPrefix}hc` as any; @@ -52,7 +58,7 @@ export function useChartUIState(config: UseChartStateConfig) { // Initialize with safe defaults that match SSR output to avoid hydration mismatches. // URL-param values are applied in a mount effect so the state is only set client-side. - const [highContrast, setHighContrast] = useState(false); + const [highContrast, setHighContrast] = useState(defaultHighContrast); const [isLegendExpanded, setIsLegendExpanded] = useState(true); const didInit = useRef(false); @@ -60,7 +66,9 @@ export function useChartUIState(config: UseChartStateConfig) { if (didInit.current) return; didInit.current = true; const hcVal = getUrlParam(hcParam); + // Respect both overrides so the toggle round-trips regardless of the default. 
if (hcVal === '1') setHighContrast(true); + else if (hcVal === '0') setHighContrast(false); const legendVal = getUrlParam(legendParam); if (legendVal === '0') setIsLegendExpanded(false); }, [getUrlParam, hcParam, legendParam]); From 5c290a49f50d7a0834a544d3e837bc1d1ccad5de Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 22 Jun 2026 14:30:44 -0500 Subject: [PATCH 72/96] feat(agentic): use the chart's TP/EP/DEP/TEP parallelism labels on sibling chips MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The agentic detail page's sibling navigator labeled configs with an ad-hoc `TP{n}EP{n}` / `{p}P+{d}D` scheme that ignored dp-attention and the TEP/DEP collapse, so a DEP4 config read as plain TP4EP4 (and, mid-deploy before the API carried dp_attention, as TEP4). Extract the scatter chart's labeler into a shared parallelism-label module (configSegmentLabel + parallelismLabel) and route both getPointLabel and the sibling chipLabel through it, so the two surfaces describe a config identically (TP/EP/TEP/DEP/DPA…, multinode-disagg worker segments). Carry the fields the labeler needs through the siblings query/API/hook: decode/prefill dp_attention + num_workers + is_multinode. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../inference/agentic-point/sibling-nav.tsx | 20 ++++- .../inference/utils/parallelism-label.test.ts | 58 ++++++++++++++ .../inference/utils/parallelism-label.ts | 79 +++++++++++++++++++ .../inference/utils/tooltipUtils.ts | 69 ++++++---------- .../src/hooks/api/use-benchmark-siblings.ts | 5 ++ packages/db/src/queries/benchmark-siblings.ts | 20 ++++- 6 files changed, 202 insertions(+), 49 deletions(-) create mode 100644 packages/app/src/components/inference/utils/parallelism-label.test.ts create mode 100644 packages/app/src/components/inference/utils/parallelism-label.ts diff --git a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx index aa727fdc..f92d6b63 100644 --- a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx +++ b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx @@ -4,6 +4,7 @@ import { useRouter } from 'next/navigation'; import { ChevronLeft, ChevronRight } from 'lucide-react'; import type { BenchmarkSibling, BenchmarkSku } from '@/hooks/api/use-benchmark-siblings'; +import { parallelismLabel } from '@/components/inference/utils/parallelism-label'; const HW_LABELS: Record = { b200: 'B200', @@ -49,9 +50,22 @@ function frameworkLabel(fw: string) { /** Short label for a sibling chip: parallelism + concurrency. */ export function chipLabel(s: BenchmarkSibling): string { - const parallel = s.disagg - ? `${s.num_prefill_gpu}P+${s.num_decode_gpu}D` - : `TP${s.decode_tp}${s.decode_ep > 1 ? `EP${s.decode_ep}` : ''}`; + // Same parallelism labeler the chart points use (TP/EP/TEP/DEP/DPA…). 
+ const parallel = parallelismLabel({ + tp: s.decode_tp, + ep: s.decode_ep, + dpAttention: s.decode_dp_attention, + disagg: s.disagg, + isMultinode: s.is_multinode, + prefillTp: s.prefill_tp, + prefillEp: s.prefill_ep, + prefillDpAttention: s.prefill_dp_attention, + prefillNumWorkers: s.prefill_num_workers, + decodeTp: s.decode_tp, + decodeEp: s.decode_ep, + decodeDpAttention: s.decode_dp_attention, + decodeNumWorkers: s.decode_num_workers, + }); const offload = s.offload_mode === 'on' ? ' • off=ON' : ''; return `${parallel} • c=${s.conc}${offload}`; } diff --git a/packages/app/src/components/inference/utils/parallelism-label.test.ts b/packages/app/src/components/inference/utils/parallelism-label.test.ts new file mode 100644 index 00000000..aaf715d3 --- /dev/null +++ b/packages/app/src/components/inference/utils/parallelism-label.test.ts @@ -0,0 +1,58 @@ +import { describe, expect, it } from 'vitest'; + +import { configSegmentLabel, parallelismLabel } from './parallelism-label'; + +describe('configSegmentLabel', () => { + it('collapses symmetric tp===ep to TEP / DEP by dp-attention', () => { + expect(configSegmentLabel(8, 8, false)).toBe('TEP8'); + expect(configSegmentLabel(8, 8, true)).toBe('DEP8'); + }); + + it('uses EP / DPAEP when ep>1 and tp!==ep', () => { + expect(configSegmentLabel(4, 16, false)).toBe('EP16'); + expect(configSegmentLabel(4, 16, true)).toBe('DPAEP16'); + }); + + it('uses TP / DPATP when ep<=1 or absent', () => { + expect(configSegmentLabel(8, 1, false)).toBe('TP8'); + expect(configSegmentLabel(8, undefined, false)).toBe('TP8'); + expect(configSegmentLabel(8, 1, true)).toBe('DPATP8'); + }); +}); + +describe('parallelismLabel', () => { + it('falls back to bare tp when no ep data', () => { + expect(parallelismLabel({ tp: 8 })).toBe('8'); + }); + + it('labels a single-segment config', () => { + expect(parallelismLabel({ tp: 8, ep: 8, dpAttention: true })).toBe('DEP8'); + expect(parallelismLabel({ tp: 4, ep: 8, dpAttention: false })).toBe('EP8'); 
+ }); + + it('builds multinode-disagg per-role worker segments', () => { + expect( + parallelismLabel({ + tp: 8, + ep: 4, + disagg: true, + isMultinode: true, + prefillTp: 4, + prefillEp: 4, + prefillDpAttention: false, + prefillNumWorkers: 2, + decodeTp: 8, + decodeEp: 8, + decodeDpAttention: true, + decodeNumWorkers: 1, + }), + ).toBe('2xTEP4+1xDEP8'); + }); + + it('single-node disagg uses the single (decode) segment, not worker syntax', () => { + // is_multinode false → no "NxPrefill+MxDecode" expansion. + expect( + parallelismLabel({ tp: 8, ep: 8, dpAttention: false, disagg: true, isMultinode: false }), + ).toBe('TEP8'); + }); +}); diff --git a/packages/app/src/components/inference/utils/parallelism-label.ts b/packages/app/src/components/inference/utils/parallelism-label.ts new file mode 100644 index 00000000..98207110 --- /dev/null +++ b/packages/app/src/components/inference/utils/parallelism-label.ts @@ -0,0 +1,79 @@ +/** + * Shared parallelism-config labeling — the single source of truth for the + * short "TP8 / EP8 / TEP8 / DEP8 / DPAEP8 / 2xEP4+1xDPAEP32" labels. + * + * Used by the scatter/GPU chart point labels (via getPointLabel) and the + * agentic detail page's sibling navigator chips, so both surfaces describe a + * config identically. + */ + +/** + * Generates a short config segment label from parallelism params. + * - tp == ep and dp-attn false: "TEP{N}" + * - tp == ep and dp-attn true: "DEP{N}" + * - ep > 1 (tp != ep): "EP{ep}" or "DPAEP{ep}" + * - ep <= 1 (or no EP): "TP{tp}" or "DPATP{tp}" + */ +export const configSegmentLabel = ( + tp: number, + ep: number | undefined, + dpAttention: boolean | undefined, +): string => { + if (ep !== null && ep !== undefined && ep > 1 && tp === ep) { + return dpAttention ? `DEP${tp}` : `TEP${tp}`; + } + const dpaPrefix = dpAttention ? 
'DPA' : ''; + if (ep === null || ep === undefined || ep <= 1) return `${dpaPrefix}TP${tp}`; + return `${dpaPrefix}EP${ep}`; +}; + +/** Parallelism params for one benchmark config, framework-agnostic. */ +export interface ParallelismFields { + tp: number; + ep?: number; + dpAttention?: boolean; + disagg?: boolean; + isMultinode?: boolean; + prefillTp?: number; + prefillEp?: number; + prefillDpAttention?: boolean; + prefillNumWorkers?: number; + decodeTp?: number; + decodeEp?: number; + decodeDpAttention?: boolean; + decodeNumWorkers?: number; +} + +/** + * Returns the short parallelism label for a config. + * - No EP data (old rows): falls back to the bare tp value (e.g. "8"). + * - Multinode disagg: per-role segments with worker counts, + * e.g. "2xEP4+1xDPAEP32". + * - Otherwise: a single segment from (tp, ep, dpAttention). + */ +export const parallelismLabel = (f: ParallelismFields): string => { + if ( + (f.ep === null || f.ep === undefined) && + (f.prefillEp === null || f.prefillEp === undefined) + ) { + return String(f.tp); + } + + if (f.isMultinode && f.disagg) { + const prefillLabel = configSegmentLabel( + f.prefillTp ?? f.tp, + f.prefillEp ?? f.ep, + f.prefillDpAttention ?? f.dpAttention, + ); + const decodeLabel = configSegmentLabel( + f.decodeTp ?? f.tp, + f.decodeEp ?? f.ep, + f.decodeDpAttention ?? f.dpAttention, + ); + const pw = f.prefillNumWorkers ?? 1; + const dw = f.decodeNumWorkers ?? 
1; + return `${pw}x${prefillLabel}+${dw}x${decodeLabel}`; + } + + return configSegmentLabel(f.tp, f.ep, f.dpAttention); +}; diff --git a/packages/app/src/components/inference/utils/tooltipUtils.ts b/packages/app/src/components/inference/utils/tooltipUtils.ts index 14d3b553..ea039336 100644 --- a/packages/app/src/components/inference/utils/tooltipUtils.ts +++ b/packages/app/src/components/inference/utils/tooltipUtils.ts @@ -1,6 +1,7 @@ import { formatNumber, getDisplayLabel } from '@/lib/utils'; import type { HardwareConfig, InferenceData, OverlayData } from '@/components/inference/types'; +import { parallelismLabel } from '@/components/inference/utils/parallelism-label'; export interface TooltipConfig { /** The data point to display */ @@ -34,57 +35,37 @@ export interface OverlayTooltipConfig extends TooltipConfig { overlayData: OverlayData; } -/** - * Generates a short config segment label from parallelism params. - * - tp == ep and dp-attn false: "TEP{N}" - * - tp == ep and dp-attn true: "DEP{N}" - * - ep > 1 (tp != ep): "EP{ep}" or "DPAEP{ep}" - * - ep <= 1 (or no EP): "TP{tp}" or "DPATP{tp}" - */ -const configSegmentLabel = ( - tp: number, - ep: number | undefined, - dpAttention: boolean | undefined, -): string => { - if (ep !== null && ep !== undefined && ep > 1 && tp === ep) { - return dpAttention ? `DEP${tp}` : `TEP${tp}`; - } - const dpaPrefix = dpAttention ? 'DPA' : ''; - if (ep === null || ep === undefined || ep <= 1) return `${dpaPrefix}TP${tp}`; - return `${dpaPrefix}EP${ep}`; -}; +// `dp_attention` is `boolean | string` on InferenceData (DB sends raw, the +// transform narrows "true"/"false" → boolean). Coerce to a plain boolean for +// the shared labeler, treating the legacy string form correctly. +const asBool = (v: boolean | string | undefined): boolean | undefined => + typeof v === 'string' ? v === 'true' : v; /** * Returns the short label for a data point on the chart. * - Non-multinode: e.g. 
"TP8", "EP8", "TEP8", "DEP8", "DPAEP8" * - Multinode disagg: e.g. "2xEP4+1xDPAEP32" * - Old data (no ep field): falls back to tp value + * + * Delegates to the shared {@link parallelismLabel} so the chart points and the + * agentic sibling navigator describe a config identically. */ -export const getPointLabel = (d: InferenceData): string => { - if ( - (d.ep === null || d.ep === undefined) && - (d.prefill_ep === null || d.prefill_ep === undefined) - ) - return String(d.tp); - - if (d.is_multinode && d.disagg) { - const prefillLabel = configSegmentLabel( - d.prefill_tp ?? d.tp, - d.prefill_ep ?? d.ep, - d.prefill_dp_attention ?? d.dp_attention, - ); - const decodeLabel = configSegmentLabel( - d.decode_tp ?? d.tp, - d.decode_ep ?? d.ep, - d.decode_dp_attention ?? d.dp_attention, - ); - const pw = d.prefill_num_workers ?? 1; - const dw = d.decode_num_workers ?? 1; - return `${pw}x${prefillLabel}+${dw}x${decodeLabel}`; - } - - return configSegmentLabel(d.tp, d.ep, d.dp_attention); -}; +export const getPointLabel = (d: InferenceData): string => + parallelismLabel({ + tp: d.tp, + ep: d.ep, + dpAttention: asBool(d.dp_attention), + disagg: d.disagg, + isMultinode: d.is_multinode, + prefillTp: d.prefill_tp, + prefillEp: d.prefill_ep, + prefillDpAttention: asBool(d.prefill_dp_attention), + prefillNumWorkers: d.prefill_num_workers, + decodeTp: d.decode_tp, + decodeEp: d.decode_ep, + decodeDpAttention: asBool(d.decode_dp_attention), + decodeNumWorkers: d.decode_num_workers, + }); const runLinkHTML = (runUrl?: string) => runUrl diff --git a/packages/app/src/hooks/api/use-benchmark-siblings.ts b/packages/app/src/hooks/api/use-benchmark-siblings.ts index 1ea90c0d..e6bc4906 100644 --- a/packages/app/src/hooks/api/use-benchmark-siblings.ts +++ b/packages/app/src/hooks/api/use-benchmark-siblings.ts @@ -6,11 +6,16 @@ export interface BenchmarkSibling { offload_mode: string | null; decode_tp: number; decode_ep: number; + decode_dp_attention: boolean; + decode_num_workers: number; 
prefill_tp: number; prefill_ep: number; + prefill_dp_attention: boolean; + prefill_num_workers: number; num_prefill_gpu: number; num_decode_gpu: number; disagg: boolean; + is_multinode: boolean; is_current: boolean; has_trace: boolean; } diff --git a/packages/db/src/queries/benchmark-siblings.ts b/packages/db/src/queries/benchmark-siblings.ts index 245a1170..241a48ba 100644 --- a/packages/db/src/queries/benchmark-siblings.ts +++ b/packages/db/src/queries/benchmark-siblings.ts @@ -14,11 +14,16 @@ export interface BenchmarkSibling { offload_mode: string | null; decode_tp: number; decode_ep: number; + decode_dp_attention: boolean; + decode_num_workers: number; prefill_tp: number; prefill_ep: number; + prefill_dp_attention: boolean; + prefill_num_workers: number; num_prefill_gpu: number; num_decode_gpu: number; disagg: boolean; + is_multinode: boolean; /** True if this row IS the point passed in. */ is_current: boolean; /** Whether the row has a stored trace_replay blob (for navigation hint). 
*/ @@ -74,8 +79,9 @@ export async function getBenchmarkSiblings( const rows = (await sql` select br.id, br.conc, br.offload_mode, - c.decode_tp, c.decode_ep, c.prefill_tp, c.prefill_ep, - c.num_prefill_gpu, c.num_decode_gpu, c.disagg, + c.decode_tp, c.decode_ep, c.decode_dp_attention, c.decode_num_workers, + c.prefill_tp, c.prefill_ep, c.prefill_dp_attention, c.prefill_num_workers, + c.num_prefill_gpu, c.num_decode_gpu, c.disagg, c.is_multinode, (br.trace_replay_id is not null) as has_trace from benchmark_results br join configs c on c.id = br.config_id @@ -93,11 +99,16 @@ export async function getBenchmarkSiblings( offload_mode: string | null; decode_tp: number; decode_ep: number; + decode_dp_attention: boolean; + decode_num_workers: number; prefill_tp: number; prefill_ep: number; + prefill_dp_attention: boolean; + prefill_num_workers: number; num_prefill_gpu: number; num_decode_gpu: number; disagg: boolean; + is_multinode: boolean; has_trace: boolean; }[]; @@ -107,11 +118,16 @@ export async function getBenchmarkSiblings( offload_mode: r.offload_mode, decode_tp: r.decode_tp, decode_ep: r.decode_ep, + decode_dp_attention: r.decode_dp_attention, + decode_num_workers: r.decode_num_workers, prefill_tp: r.prefill_tp, prefill_ep: r.prefill_ep, + prefill_dp_attention: r.prefill_dp_attention, + prefill_num_workers: r.prefill_num_workers, num_prefill_gpu: r.num_prefill_gpu, num_decode_gpu: r.num_decode_gpu, disagg: r.disagg, + is_multinode: r.is_multinode, is_current: Number(r.id) === benchmarkResultId, has_trace: r.has_trace, })); From 32adf6bec66f41ffe2cfa4f08251afcb333c007d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 22 Jun 2026 14:53:17 -0500 Subject: [PATCH 73/96] feat(agentic): sort dropdown for the sibling point navigator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a 'Sort by' dropdown to the agentic detail page's point navigator: - Default (DB order) - Concurrency ↑ - Parallelism (groups all TP, then 
TEP/DEP/EP… by ep→tp→dpa, conc within) - Throughput/GPU ↓ - Total requests ↓ Carry tput_per_gpu and total_requests (total_requests_completed, falling back to legacy num_requests_total) through the siblings query/API/hook. prev/next follow the sorted order, and the chosen sort is persisted in the URL (?sort=) — read on mount and threaded through every point link plus a router.replace — so navigating to another point no longer resets it. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../inference/agentic-point/sibling-nav.tsx | 131 ++++++++++++++++-- .../src/hooks/api/use-benchmark-siblings.ts | 2 + packages/db/src/queries/benchmark-siblings.ts | 16 +++ 3 files changed, 141 insertions(+), 8 deletions(-) diff --git a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx index f92d6b63..a1a5d1ab 100644 --- a/packages/app/src/components/inference/agentic-point/sibling-nav.tsx +++ b/packages/app/src/components/inference/agentic-point/sibling-nav.tsx @@ -1,10 +1,19 @@ 'use client'; +import { useMemo, useState } from 'react'; import { useRouter } from 'next/navigation'; import { ChevronLeft, ChevronRight } from 'lucide-react'; import type { BenchmarkSibling, BenchmarkSku } from '@/hooks/api/use-benchmark-siblings'; import { parallelismLabel } from '@/components/inference/utils/parallelism-label'; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from '@/components/ui/select'; +import { track } from '@/lib/analytics'; const HW_LABELS: Record = { b200: 'B200', @@ -70,12 +79,83 @@ export function chipLabel(s: BenchmarkSibling): string { return `${parallel} • c=${s.conc}${offload}`; } +type SortMode = 'default' | 'conc' | 'parallelism' | 'tput' | 'requests'; + +const SORT_OPTIONS: { value: SortMode; label: string }[] = [ + { value: 'default', label: 'Default' }, + { value: 'conc', label: 'Concurrency ↑' }, + { value: 'parallelism', label: 'Parallelism' }, 
+ { value: 'tput', label: 'Throughput/GPU ↓' }, + { value: 'requests', label: 'Total requests ↓' }, +]; + +// Group key for the "parallelism" sort: ep first (so TP/EP1 sorts ahead of +// EP/TEP/DEP groups), then tp, then dp-attention, then disagg — every config +// of one parallelism lands together, ordered by concurrency within. +const parallelRank = (s: BenchmarkSibling): [number, number, number, number] => [ + s.decode_ep ?? 0, + s.decode_tp ?? 0, + s.decode_dp_attention ? 1 : 0, + s.disagg ? 1 : 0, +]; + +function sortSiblings(siblings: BenchmarkSibling[], mode: SortMode): BenchmarkSibling[] { + if (mode === 'default') return siblings; + const out = [...siblings]; + if (mode === 'conc') { + out.sort((a, b) => a.conc - b.conc); + } else if (mode === 'tput') { + // Highest throughput/GPU first; rows missing the metric sink to the end. + out.sort((a, b) => (b.tput_per_gpu ?? -Infinity) - (a.tput_per_gpu ?? -Infinity)); + } else if (mode === 'requests') { + // Most total requests first; rows missing the metric sink to the end. + out.sort((a, b) => (b.total_requests ?? -Infinity) - (a.total_requests ?? -Infinity)); + } else { + out.sort((a, b) => { + const ra = parallelRank(a); + const rb = parallelRank(b); + for (let i = 0; i < ra.length; i++) { + if (ra[i] !== rb[i]) return ra[i] - rb[i]; + } + // Within a parallelism group: offload off before on, then concurrency. + const oa = a.offload_mode === 'on' ? 1 : 0; + const ob = b.offload_mode === 'on' ? 1 : 0; + return oa - ob || a.conc - b.conc; + }); + } + return out; +} + +const isSortMode = (v: string | null): v is SortMode => + v !== null && SORT_OPTIONS.some((o) => o.value === v); + export function SiblingNav({ sku, siblings }: { sku: BenchmarkSku; siblings: BenchmarkSibling[] }) { const router = useRouter(); - const currentIdx = siblings.findIndex((s) => s.is_current); - const prev = currentIdx > 0 ? siblings[currentIdx - 1] : null; - const next = - currentIdx !== -1 && currentIdx < siblings.length - 1 ? 
siblings[currentIdx + 1] : null; + // Persist the sort in the URL so clicking a point (which remounts this + // component on the new route) keeps the chosen order instead of resetting. + // Read it once from the URL on mount — this component only renders after the + // client-side siblings query resolves, so `window` is always available here + // (no SSR/hydration mismatch). Matches the app's window-based url-state read. + const [sortMode, setSortMode] = useState(() => { + if (typeof window === 'undefined') return 'default'; + const v = new URLSearchParams(window.location.search).get('sort'); + return isSortMode(v) ? v : 'default'; + }); + + const sorted = useMemo(() => sortSiblings(siblings, sortMode), [siblings, sortMode]); + + // prev/next follow the displayed (sorted) order so navigation matches the row. + const currentIdx = sorted.findIndex((s) => s.is_current); + const prev = currentIdx > 0 ? sorted[currentIdx - 1] : null; + const next = currentIdx !== -1 && currentIdx < sorted.length - 1 ? sorted[currentIdx + 1] : null; + + // Carry the active sort through every point-to-point link. + const hrefFor = (id: number) => + sortMode === 'default' + ? `/inference/agentic/${id}` + : `/inference/agentic/${id}?sort=${sortMode}`; + + const currentId = siblings.find((s) => s.is_current)?.id; const skuLabel = `${hwLabel(sku.hardware)} · ${modelLabel(sku.model)} · ${sku.precision.toUpperCase()} · ${frameworkLabel(sku.framework)}`; @@ -88,23 +168,58 @@ export function SiblingNav({ sku, siblings }: { sku: BenchmarkSku; siblings: Ben
+
+ Sort by + +
- {siblings.map((s) => { + {sorted.map((s) => { const active = s.is_current; return ( + + Page {page + 1} of {pageCount} + + +
+ )} + +
+ ); +} + +function Stat({ label, value }: { label: string; value: string }) { + return ( +
+
{label}
+
{value}
+
+ ); +} diff --git a/packages/app/src/components/datasets/dataset-list.tsx b/packages/app/src/components/datasets/dataset-list.tsx new file mode 100644 index 00000000..5fcc0dfe --- /dev/null +++ b/packages/app/src/components/datasets/dataset-list.tsx @@ -0,0 +1,85 @@ +'use client'; + +import Link from 'next/link'; + +import { Card } from '@/components/ui/card'; +import { useDatasets, type DatasetRecord } from '@/hooks/api/use-datasets'; +import { track } from '@/lib/analytics'; + +function compact(n: number): string { + const abs = Math.abs(n); + if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`; + if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`; + if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`; + return String(Math.round(n)); +} + +function DatasetCard({ d }: { d: DatasetRecord }) { + const s = d.summary ?? {}; + const cachedPct = typeof s.cachedPct === 'number' ? `${(s.cachedPct * 100).toFixed(0)}%` : '—'; + return ( + track('datasets_card_clicked', { slug: d.slug })} + className="block transition-colors hover:[&_*]:border-primary/40" + > + +
+

{d.label}

+ + {d.variant} + +
+ {d.description && ( +

{d.description}

+ )} +
+ + + + + + +
+
View dataset →
+
+ + ); +} + +function Stat({ label, value }: { label: string; value: string }) { + return ( +
+
{label}
+
{value}
+
+ ); +} + +export function DatasetList() { + const { data, isLoading, isError } = useDatasets(); + + if (isLoading) { + return
Loading datasets…
; + } + if (isError || !data) { + return ( +
Failed to load datasets.
+ ); + } + if (data.length === 0) { + return ( +
+ No datasets ingested yet. +
+ ); + } + + return ( +
+ {data.map((d) => ( + + ))} +
+ ); +} diff --git a/packages/app/src/components/datasets/distribution-card.tsx b/packages/app/src/components/datasets/distribution-card.tsx new file mode 100644 index 00000000..7abc367f --- /dev/null +++ b/packages/app/src/components/datasets/distribution-card.tsx @@ -0,0 +1,220 @@ +'use client'; + +import { useMemo } from 'react'; + +import { Card } from '@/components/ui/card'; +import { ChartHover, type HoverItem } from '@/components/inference/agentic-point/chart-hover'; +import type { Distribution } from '@/hooks/api/use-datasets'; + +/** Compact token/count formatter: 1234 → "1.2k", 1_200_000 → "1.2M". */ +function compact(n: number): string { + const abs = Math.abs(n); + if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`; + if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`; + if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`; + if (abs > 0 && abs < 1) return n.toFixed(2); + return String(Math.round(n)); +} + +interface DistributionCardProps { + title: string; + subtitle?: string; + unit: string; + distribution?: Distribution; + scale?: 'log' | 'linear'; + /** Format the x value (defaults to compact). e.g. percent for cached fraction. */ + formatValue?: (v: number) => string; +} + +const W = 720; +const H = 240; +const PAD = { top: 12, right: 16, bottom: 48, left: 52 }; + +/** + * Renders a precomputed histogram (bins + stats from datasets.chart_data) as a + * themeable bar chart with median/p90 guide lines and a hover tooltip. Bars are + * drawn at equal visual width; for log-scaled bins the edge labels are already + * log-spaced so the shape reads as a log histogram. + */ +export function DistributionCard({ + title, + subtitle, + unit, + distribution, + scale = 'linear', + formatValue = compact, +}: DistributionCardProps) { + const computed = useMemo(() => { + const bins = distribution?.bins ?? 
[]; + if (bins.length === 0) return null; + const maxCount = Math.max(1, ...bins.map((b) => b.count)); + const innerW = W - PAD.left - PAD.right; + const innerH = H - PAD.top - PAD.bottom; + const n = bins.length; + const barW = innerW / n; + // Map a data value to an x pixel by locating its bin (positional — works for + // both linear and log bins since the edges are precomputed at ingest). + const valueToX = (v: number): number | null => { + for (let i = 0; i < n; i++) { + if (v >= bins[i].x0 && (v < bins[i].x1 || i === n - 1)) { + return PAD.left + (i + 0.5) * barW; + } + } + if (v <= bins[0].x0) return PAD.left + 0.5 * barW; + return PAD.left + (n - 0.5) * barW; + }; + return { bins, maxCount, innerW, innerH, n, barW, valueToX }; + }, [distribution]); + + if (!computed) { + return ( + +
{title}
+
+ No data +
+
+ ); + } + + const { bins, maxCount, innerW, innerH, n, barW, valueToX } = computed; + const stats = distribution?.stats; + + const guides = stats + ? ([ + { label: 'median', value: stats.median, color: '#3b82f6' }, + { label: 'p90', value: stats.p90, color: '#f59e0b' }, + ] as const) + : []; + + // X tick labels from a few bin edges. + const tickIdxs = [0, Math.floor(n / 3), Math.floor((2 * n) / 3), n - 1]; + + const resolve = (fraction: number) => { + const i = Math.min(n - 1, Math.max(0, Math.floor(fraction * n))); + const b = bins[i]; + const items: HoverItem[] = [ + { + color: 'currentColor', + label: 'Range', + value: `${formatValue(b.x0)}–${formatValue(b.x1)} ${unit}`, + }, + { color: 'currentColor', label: 'Count', value: b.count.toLocaleString() }, + ]; + return { items }; + }; + + return ( + +
+ {title} + {scale === 'log' && ( + + log scale + + )} +
+ {subtitle &&
{subtitle}
} + {stats && ( +
+ n={stats.count.toLocaleString()} · median {formatValue(stats.median)} · p90{' '} + {formatValue(stats.p90)} · max {formatValue(stats.max)} {unit} +
+ )} +
+ + {/* bars */} + {bins.map((b, i) => { + const h = (b.count / maxCount) * innerH; + const x = PAD.left + i * barW; + const y = PAD.top + (innerH - h); + return ( + + ); + })} + + {/* guide lines */} + {guides.map((g) => { + const x = valueToX(g.value); + if (x === null) return null; + return ( + + ); + })} + + {/* x axis */} + + {tickIdxs.map((i, k) => { + const anchor = k === 0 ? 'start' : k === tickIdxs.length - 1 ? 'end' : 'middle'; + const x = PAD.left + (i + 0.5) * barW; + return ( + + {formatValue(bins[i].x0)} + + ); + })} + + {unit} + + + {/* guide legend */} + {guides.map((g, i) => ( + + + + {g.label} {formatValue(g.value)} + + + ))} + +
+
+ ); +} diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx new file mode 100644 index 00000000..12588582 --- /dev/null +++ b/packages/app/src/components/datasets/trace-flamegraph.tsx @@ -0,0 +1,273 @@ +'use client'; + +import { useCallback, useMemo, useState } from 'react'; + +import type { ConversationStructure, StructureNode } from '@/hooks/api/use-datasets'; + +/** Compact token formatter: 1234 → "1.2k", 1_200_000 → "1.2M". */ +function compact(n: number): string { + const abs = Math.abs(n); + if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`; + if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`; + return String(Math.round(n)); +} + +// Stacked-bar segment colors. Cached prefix vs uncached input vs output — +// fixed hues (theme-independent) so the meaning is stable in light/dark. +const SEG = { + cached: '#10b981', // emerald-500 — input served from prefix cache + uncached: '#f59e0b', // amber-500 — input that must be (re)computed + output: '#8b5cf6', // violet-500 — generated tokens +} as const; + +const LEGEND = [ + { key: 'cached', label: 'Cached prefix', color: SEG.cached }, + { key: 'uncached', label: 'Uncached input', color: SEG.uncached }, + { key: 'output', label: 'Output', color: SEG.output }, +] as const; + +interface VisibleRow { + key: string; + label: string; + sublabel?: string; + cached: number; + uncached: number; + output: number; + total: number; + indent: number; + isGroup: boolean; + isExpanded: boolean; + groupIndex?: number; +} + +interface TooltipState { + x: number; + y: number; + row: VisibleRow; +} + +/** + * Per-conversation flamegraph driven by the precomputed `structure` JSONB. + * One row per turn; subagent groups render a collapsible header with indented + * children (collapsed by default). Each bar stacks cached-prefix + uncached + * input + output, scaled to the widest visible turn. 
+ */ +export function TraceFlamegraph({ structure }: { structure: ConversationStructure }) { + const nodes = structure.nodes; + + // Subagent groups collapsed by default. + const [expanded, setExpanded] = useState>(() => new Set()); + const [tooltip, setTooltip] = useState(null); + + const groupIndexes = useMemo(() => { + const out: number[] = []; + nodes.forEach((node, i) => { + if (node.kind === 'subagent') out.push(i); + }); + return out; + }, [nodes]); + + const toggle = useCallback((i: number) => { + setExpanded((prev) => { + const next = new Set(prev); + if (next.has(i)) next.delete(i); + else next.add(i); + return next; + }); + }, []); + + const expandAll = useCallback(() => setExpanded(new Set(groupIndexes)), [groupIndexes]); + const collapseAll = useCallback(() => setExpanded(new Set()), []); + + const rows = useMemo(() => { + const out: VisibleRow[] = []; + let turnNo = 0; + nodes.forEach((node: StructureNode, i) => { + if (node.kind === 'turn') { + turnNo += 1; + out.push({ + key: `t-${i}`, + label: `Turn ${turnNo}`, + sublabel: node.model ?? undefined, + cached: node.cached, + uncached: node.uncached, + output: node.out, + total: node.in + node.out, + indent: 0, + isGroup: false, + isExpanded: false, + }); + } else { + const isExpanded = expanded.has(i); + out.push({ + key: `g-${i}`, + label: `${node.label}`, + sublabel: `${node.children.length} turn${node.children.length === 1 ? '' : 's'}${ + node.durationMs ? ` · ${(node.durationMs / 1000).toFixed(0)}s` : '' + }`, + cached: node.cached, + uncached: node.uncached, + output: node.out, + total: node.in + node.out, + indent: 0, + isGroup: true, + isExpanded, + groupIndex: i, + }); + if (isExpanded) { + node.children.forEach((child, ci) => { + out.push({ + key: `g-${i}-c-${ci}`, + label: `↳ subturn ${ci + 1}`, + sublabel: child.model ?? 
undefined, + cached: child.cached, + uncached: child.uncached, + output: child.out, + total: child.in + child.out, + indent: 1, + isGroup: false, + isExpanded: false, + }); + }); + } + } + }); + return out; + }, [nodes, expanded]); + + const maxTotal = useMemo( + () => Math.max(1, ...rows.filter((r) => !r.isGroup).map((r) => r.total)), + [rows], + ); + + const onMove = (e: React.MouseEvent, row: VisibleRow) => { + setTooltip({ x: e.clientX, y: e.clientY, row }); + }; + + return ( +
+
+
+ {LEGEND.map((l) => ( + + + {l.label} + + ))} +
+ {groupIndexes.length > 0 && ( +
+ + +
+ )} +
+ +
+ {rows.map((row) => { + const barFrac = row.total / maxTotal; + const cw = (row.cached / row.total) * 100; + const uw = (row.uncached / row.total) * 100; + const ow = (row.output / row.total) * 100; + return ( +
+ {/* label / group toggle */} +
+ {row.isGroup ? ( + + ) : ( + {row.label} + )} +
+ + {/* stacked bar */} +
onMove(e, row)} + onMouseLeave={() => setTooltip(null)} + > +
+
+
+
+
+
+ + {/* total */} +
+ {compact(row.total)} +
+
+ ); + })} +
+ + {tooltip && ( +
+
+ {tooltip.row.label} + {tooltip.row.sublabel ? ( + {tooltip.row.sublabel} + ) : null} +
+
+ Cached prefix + + {compact(tooltip.row.cached)} + + Uncached input + + {compact(tooltip.row.uncached)} + + Output + + {compact(tooltip.row.output)} + + Cached % + + {tooltip.row.cached + tooltip.row.uncached > 0 + ? `${((tooltip.row.cached / (tooltip.row.cached + tooltip.row.uncached)) * 100).toFixed(0)}%` + : '—'} + +
+
+ )} +
+ ); +} diff --git a/packages/app/src/components/header/header.tsx b/packages/app/src/components/header/header.tsx index 57965518..5725d99f 100644 --- a/packages/app/src/components/header/header.tsx +++ b/packages/app/src/components/header/header.tsx @@ -46,6 +46,12 @@ const NAV_LINKS = [ testId: 'nav-link-supporters', event: 'header_supporters_clicked', }, + { + href: '/datasets', + label: 'Datasets', + testId: 'nav-link-datasets', + event: 'header_datasets_clicked', + }, { href: '/blog', label: 'Articles', testId: 'nav-link-blog', event: 'header_blog_clicked' }, { href: '/about', label: 'About', testId: 'nav-link-about', event: 'header_about_clicked' }, ] as const; diff --git a/packages/db/src/queries/datasets.test.ts b/packages/db/src/queries/datasets.test.ts new file mode 100644 index 00000000..c1676445 --- /dev/null +++ b/packages/db/src/queries/datasets.test.ts @@ -0,0 +1,102 @@ +import { describe, expect, it } from 'vitest'; + +import type { DbClient } from '../connection.js'; +import { getConversation, listConversations, listDatasets } from './datasets.js'; + +/** + * Mock DbClient: returns canned result sets in call order. Each call to the + * tagged-template `sql` shifts the next queued rows array. The query text is + * ignored — these tests assert the JS-side shaping/coercion, not SQL. + */ +function mockSql(queue: unknown[][]): DbClient { + const responses = [...queue]; + return (() => Promise.resolve(responses.shift() ?? 
[])) as unknown as DbClient; +} + +describe('listDatasets', () => { + it('coerces conversation_count to a number', async () => { + const sql = mockSql([ + [ + { + id: 'a/b', + slug: 'b', + label: 'B', + variant: 'full', + conversation_count: '393', + summary: {}, + }, + ], + ]); + const out = await listDatasets(sql); + expect(out).toHaveLength(1); + expect(out[0].conversation_count).toBe(393); + expect(typeof out[0].conversation_count).toBe('number'); + }); +}); + +describe('listConversations', () => { + it('returns null when the dataset slug is unknown', async () => { + const sql = mockSql([[]]); // datasets lookup → no rows + expect(await listConversations(sql, 'missing')).toBeNull(); + }); + + it('returns total + numerically-coerced items', async () => { + const sql = mockSql([ + [{ id: 'ds-id' }], // datasets lookup + [{ n: 2 }], // count + [ + { + conv_id: 'c1', + models: ['m'], + num_turns: '5', + num_subagent_groups: '1', + total_in: '1000', + total_out: '200', + total_cached: '900', + }, + ], // items + ]); + const out = await listConversations(sql, 'b', { sort: 'tokens' }); + expect(out).not.toBeNull(); + expect(out!.total).toBe(2); + expect(out!.items[0]).toMatchObject({ + conv_id: 'c1', + num_turns: 5, + num_subagent_groups: 1, + total_in: 1000, + total_out: 200, + total_cached: 900, + }); + expect(typeof out!.items[0].total_in).toBe('number'); + }); +}); + +describe('getConversation', () => { + it('returns null when the conversation is missing', async () => { + const sql = mockSql([[]]); + expect(await getConversation(sql, 'b', 'nope')).toBeNull(); + }); + + it('coerces counts and passes through the structure', async () => { + const structure = { blockSize: 64, nodes: [], totals: {} }; + const sql = mockSql([ + [ + { + conv_id: 'c1', + models: ['m'], + num_turns: '3', + num_subagent_groups: '0', + total_in: '500', + total_out: '100', + total_cached: '450', + structure, + }, + ], + ]); + const out = await getConversation(sql, 'b', 'c1'); + 
expect(out).not.toBeNull(); + expect(out!.num_turns).toBe(3); + expect(out!.total_cached).toBe(450); + expect(out!.structure).toBe(structure); + }); +}); From 0c50139594a99adcc43f558d0b80ae08870af20e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 22 Jun 2026 16:18:38 -0500 Subject: [PATCH 80/96] docs(ingest): note the separate agentic-dataset ingest script Co-Authored-By: Claude Opus 4.8 (1M context) --- .claude/agents/ingest.md | 188 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 .claude/agents/ingest.md diff --git a/.claude/agents/ingest.md b/.claude/agents/ingest.md new file mode 100644 index 00000000..aa0099ac --- /dev/null +++ b/.claude/agents/ingest.md @@ -0,0 +1,188 @@ +--- +name: ingest +description: Ingest a benchmark run from GitHub Actions into the Neon DB used by the feat/agentx deployment. The target DB write URL must be provided in the invocation. Handles standard ingest, delete+reingest, and changelog entries. Invoke when the user asks to ingest a workflow run URL. +tools: Bash, Read, Edit, Write +--- + +You ingest benchmark runs from `SemiAnalysisAI/InferenceX` GitHub Actions into the Neon branch used by the `feat/agentx` deployment of this dashboard. Operate on `/Users/quilicic/InferenceX-app`. + +## Environment + +- **Repo root**: `/Users/quilicic/InferenceX-app` +- **DB write URL — MUST be provided by the invoker.** There is no default: the target Neon branch changes over time, and ingesting into the wrong one silently corrupts a live deployment. If the prompt does not include a `postgresql://` write URL, STOP and ask for it before touching anything. Requirements: + - Use the **direct (non-pooled)** host for ingest/migrations — no `-pooler` in the hostname. + - For psql diagnostics you may use the same URL directly: `psql "$DATABASE_WRITE_URL" -c "..."`. 
+- **Local dev server**: usually `http://localhost:3002` (port 3000 is a different project on this machine — never purge port 3000) +- **Preview URL**: `https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app` +- **INVALIDATE_SECRET** lives in repo root `.env` under that key. +- **GitHub auth**: `gh auth token` for `gh` calls and the GITHUB_TOKEN env var. + +## Standard ingest + +```bash +cd /Users/quilicic/InferenceX-app/packages/db +DATABASE_WRITE_URL='<write-url>' \ +GITHUB_TOKEN=$(gh auth token) \ +pnpm exec tsx src/ingest-ci-run.ts --download SemiAnalysisAI/InferenceX +``` + +Then refresh the materialized view (the script's auto-refresh sometimes races): +`REFRESH MATERIALIZED VIEW latest_benchmarks;` + +## Cache purge (always do after any DB mutation) + +```bash +SECRET=$(grep "^INVALIDATE_SECRET" /Users/quilicic/InferenceX-app/.env | cut -d= -f2 | tr -d '"') +# Localhost (port 3002, NOT 3000) +curl -s -X POST -H "Authorization: Bearer $SECRET" http://localhost:3002/api/v1/invalidate +# Preview +mkdir -p /tmp/vp && cd /tmp/vp \ + && vercel link --project inferencemax-app --scope semianalysisai --yes >/dev/null 2>&1 \ + && vercel curl /api/v1/invalidate \ + --deployment https://inferencemax-app-git-feat-agentx-semianalysisai.vercel.app \ + --yes -- -sS -X POST -H "Authorization: Bearer $SECRET" +rm -rf /tmp/vp +``` + +## Delete + reingest (use only when user explicitly says "delete and reingest" OR when the run supersedes prior data with the same (model, hw, framework, precision)) + +```sql +BEGIN; +DELETE FROM benchmark_results br USING configs c +WHERE c.id = br.config_id + AND c.model = '<model>' AND c.hardware = '<hardware>' AND c.framework = '<framework>' + AND c.precision = '<precision>' AND br.benchmark_type = '<benchmark_type>'; +DELETE FROM availability +WHERE model = '<model>' AND hardware = '<hardware>' AND framework = '<framework>' + AND precision = '<precision>' AND benchmark_type = '<benchmark_type>'; +COMMIT; +``` + +If the user says "replace ONLY the points this run produces", scope the DELETE to `AND br.conc IN (...)` so untouched conc levels survive.
Don't do this unless asked. + +## AIPerf tagging — DO NOT use by default + +AIPerf is no longer a separate harness from the user's perspective. **Always** ingest with `spec_method='none'` (the standard path above), regardless of run name. Run names that include the word "aiperf" do NOT mean you should set `spec_decoding='aiperf'` — the user wants those runs to merge into the standard legend entry alongside other runs of the same (model, hw, framework, precision). + +Only override this if the user **explicitly** asks for the run to appear as a separate legend line. If they do, the patching procedure is preserved below. Otherwise, use the standard ingest section above and do not touch `spec_decoding`. + +
+Explicit-request-only: how to tag a run as `spec_decoding='aiperf'` + +```bash +RID= +TMPDIR=$(mktemp -d -t aiperf-$RID-XXXX) +cd $TMPDIR + +# 1. Logical-name dedup + download +gh api "repos/SemiAnalysisAI/InferenceX/actions/runs/$RID/artifacts" --paginate \ + --jq '.artifacts[] | "\(.name)\t\(.archive_download_url)\t\(.created_at)"' \ + | python3 -c " +import sys, re, collections +seen = collections.OrderedDict() +for line in sys.stdin: + name, url, created = line.rstrip('\n').split('\t') + key = re.sub(r'_[a-zA-Z][a-zA-Z0-9.-]*_\d+$', '', name) + if key not in seen or seen[key][2] < created: + seen[key] = (name, url, created) +for _, (name, url, _) in seen.items(): + print(f'{name}\t{url}') +" > artifacts.tsv +while IFS=$'\t' read -r name url; do + mkdir -p "$name" + gh api "$url" > "$name/a.zip" 2>/dev/null + unzip -oq "$name/a.zip" -d "$name" 2>/dev/null + rm "$name/a.zip" +done < artifacts.tsv + +# 2. Patch every benchmark JSON to set spec_decoding=aiperf +find $TMPDIR -name "*.json" | python3 -c " +import sys, json +for fn in (l.strip() for l in sys.stdin): + try: + with open(fn) as f: d = json.load(f) + except Exception: continue + rows = d if isinstance(d, list) else [d] + if not rows or not isinstance(rows[0], dict): continue + changed = False + for row in rows: + if isinstance(row, dict) and ('scenario_type' in row or 'infmax_model_prefix' in row or 'tput_per_gpu' in row): + row['spec_decoding'] = 'aiperf' + changed = True + if changed: + with open(fn, 'w') as f: json.dump(d if isinstance(d, list) else rows[0], f) +" + +# 3. Ingest in CI mode (reads INGEST_* env vars) +cd /Users/quilicic/InferenceX-app/packages/db +INGEST_RUN_ID=$RID INGEST_RUN_ATTEMPT=1 INGEST_ARTIFACTS_PATH=$TMPDIR INGEST_REPO=SemiAnalysisAI/InferenceX \ +DATABASE_WRITE_URL='' \ +GITHUB_TOKEN=$(gh auth token) \ +pnpm exec tsx src/ingest-ci-run.ts +rm -rf $TMPDIR +``` + +The `spec_method` column has a lowercase check constraint — always lowercase. + +
+ +## Don't auto-mention "AIPerf" in changelog entries + +Changelog descriptions used to include "AIPerf harness" wording. Don't add this anymore — the user considers AIPerf the standard harness now. A run named "e2e Test - kimi aiperf w/ live assistant" should become a changelog entry like `B200 Kimi Ingest #N (live assistant)`, not `... (AIPerf harness, live assistant)`. + +## Adding a perf changelog entry + +Run AFTER ingest. The popover filters by `config_keys[].split('-')[1] === selected_precision` and drops entries with empty `config_keys`, so you MUST provide at least one config_key in the format `---` (matches what the user actually sees in the filter chain). + +```sql +INSERT INTO changelog_entries (workflow_run_id, date, base_ref, head_ref, config_keys, description, pr_link) +SELECT id, date, '', '', ARRAY['---'], '', NULL +FROM latest_workflow_runs WHERE github_run_id = +RETURNING id, workflow_run_id, date::text, description; +``` + +Description convention from prior entries: ` Ingest # ()` — e.g. + +- `B200 Kimi Ingest #1` +- `MI355X Kimi Ingest #2` +- `H200 Kimi Ingest #1 (mmap cache)` + +If user doesn't specify a description, ask for one OR derive from the run name. + +## Common gotchas + +- **`conclusion IS NULL` filter**: availability hides runs whose `latest_workflow_runs.conclusion` is null (still in_progress). If a user wants in-progress data shown, you can `UPDATE workflow_runs SET conclusion='success', status='completed' WHERE id = ` then `REFRESH MATERIALIZED VIEW latest_benchmarks`. +- **failed_run filter**: rows where `num_requests_successful === 0 AND num_requests_total > 0` get skipped on purpose — they have null metrics and would overwrite good rows via ON CONFLICT. +- **Aggregated `results_bmk` artifact** contains rows from all runner attempts merged together — pair the artifact-level logical-name dedup with the row-level failed-run skip to avoid empty-row overwrites. 
+- **Multi-attempt artifacts**: a single GitHub run can spill across runners (`h200-cw_00` + `h200-dgxc-slurm_1`); the logical-name dedup strips the trailing `_<runner>_<index>` suffix. +- **Materialized view dedup tiebreaker**: `latest_benchmarks` picks rows by `date DESC, wr.run_started_at DESC`. Backfilling old data may not surface unless dates align with the user's date picker selection. +- **Date alignment for partial runs**: when a re-run only covers a subset of concs (`replace ONLY the points this run produces`), align dates with the prior full sweep via `UPDATE benchmark_results SET date = '<prior-sweep-date>' WHERE <rows from this run>` so the frontend's max-date-per-group dedup doesn't drop the older sweep. + +## Process + +1. **Always start by checking the run** with `gh api repos/SemiAnalysisAI/InferenceX/actions/runs/<run_id> --jq '{name, status, conclusion}'`. Note the model/hw/precision from the name. If `status != "completed"`, ask the user if they want to ingest in-progress data (will likely have failed_run skips). +2. **Check the DB** for any pre-existing rows for this run or the same (model, hw, framework, precision) combo if the user mentioned superseding. +3. **Ingest** via the standard path. Do NOT use AIPerf tagging unless the user explicitly asks for a separate legend line. +4. **Refresh materialized view**. +5. **Add changelog entry** if the user asked or if the run is a "marker" worth surfacing. +6. **Purge both caches** (localhost 3002 + preview). +7. **Report** the row count, date, hardware, run id, and changelog id (if added). + +## Related: ingesting agentic _datasets_ (not benchmark runs) + +This agent ingests **benchmark runs**.
The HF agentic trace **datasets** (`semianalysisai/cc-traces-weka-*`) that the agentic benchmark replays are ingested by a separate script, not this flow: + +```bash +cd packages/db && DATABASE_WRITE_URL='' \ + pnpm exec tsx src/ingest-weka-dataset.ts \ + [--label "…"] [--variant full|256k] [--description "…"] [--limit N] +``` + +It populates the `datasets` + `dataset_conversations` tables (migration `011_datasets.sql`) that back the `/datasets` pages — upsert/replace per dataset, then purge the API cache like any other ingest. Same write-URL rule applies (direct, non-pooled, provided by the invoker). + +## Don't + +- Don't push to git unless the user asked. +- Don't ingest without permission if it's a delete+reingest of existing data. +- Don't hit port 3000 for cache purge — it's a different project. +- Don't capitalize `spec_method` values (DB has a lowercase check constraint). From 2ae6ebaab06b27bd65f0601aa6ae7905cbd01d79 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 22 Jun 2026 16:24:01 -0500 Subject: [PATCH 81/96] fix(datasets): flamegraph scroll box + dual-scale group bars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wrap rows in a fixed-height (max-h-[520px]) vertically scrollable bordered box. Subagent group headers carry aggregate token totals that dwarf any single turn, which made their bars overflow the row (width >> 100%). Now turns/subturns use a per-turn scale while group headers use a separate group-aggregate scale (slim muted strips), both clamped to the track — groups stay comparable to each other and nothing overflows. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../components/datasets/trace-flamegraph.tsx | 111 ++++++++++-------- 1 file changed, 63 insertions(+), 48 deletions(-) diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx index 12588582..12cc14ec 100644 --- a/packages/app/src/components/datasets/trace-flamegraph.tsx +++ b/packages/app/src/components/datasets/trace-flamegraph.tsx @@ -135,10 +135,19 @@ export function TraceFlamegraph({ structure }: { structure: ConversationStructur return out; }, [nodes, expanded]); + // Two scales: leaf turns/subturns share a per-turn axis (the primary signal — + // how cached/uncached evolves), while subagent group headers carry aggregates + // orders of magnitude larger, so they get their own axis to stay comparable to + // each other. Group bars render slim + muted, so the mixed scale reads as a + // distinct "group summary" track rather than a contradiction. const maxTotal = useMemo( () => Math.max(1, ...rows.filter((r) => !r.isGroup).map((r) => r.total)), [rows], ); + const maxGroupTotal = useMemo( + () => Math.max(1, ...rows.filter((r) => r.isGroup).map((r) => r.total)), + [rows], + ); const onMove = (e: React.MouseEvent, row: VisibleRow) => { setTooltip({ x: e.clientX, y: e.clientY, row }); @@ -178,61 +187,67 @@ export function TraceFlamegraph({ structure }: { structure: ConversationStructur )}
-
- {rows.map((row) => { - const barFrac = row.total / maxTotal; - const cw = (row.cached / row.total) * 100; - const uw = (row.uncached / row.total) * 100; - const ow = (row.output / row.total) * 100; - return ( -
- {/* label / group toggle */} -
- {row.isGroup ? ( - - ) : ( - {row.label} - )} -
- - {/* stacked bar */} +
+
+ {rows.map((row) => { + // Group headers use the group axis; turns/subturns use the per-turn + // axis. Clamp to the track width either way. + const denom = row.isGroup ? maxGroupTotal : maxTotal; + const widthPct = Math.min(100, Math.max(0.5, (row.total / denom) * 100)); + const cw = row.total > 0 ? (row.cached / row.total) * 100 : 0; + const uw = row.total > 0 ? (row.uncached / row.total) * 100 : 0; + const ow = row.total > 0 ? (row.output / row.total) * 100 : 0; + return (
onMove(e, row)} - onMouseLeave={() => setTooltip(null)} + key={row.key} + className="flex items-center gap-2" + style={{ paddingLeft: row.indent * 20 }} > + {/* label / group toggle */} +
+ {row.isGroup ? ( + + ) : ( + {row.label} + )} +
+ + {/* stacked bar — group headers render as a slim muted summary + strip so they read as aggregates, not individual turns. */}
onMove(e, row)} + onMouseLeave={() => setTooltip(null)} > -
-
-
+
+
+
+
+
-
- {/* total */} -
- {compact(row.total)} + {/* total */} +
+ {compact(row.total)} +
-
- ); - })} + ); + })} +
{tooltip && ( From c749f8f271bcfa46293b1ce2ec29adac1907231d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 22 Jun 2026 16:31:40 -0500 Subject: [PATCH 82/96] feat(datasets): link request timeline to source-dataset conversation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add run_datasets (workflow_run → dataset slug) mapping (migration 012) and surface it through the benchmark-siblings sku. The agentic detail page's request timeline now deep-links each request bar to its exact conversation in the /datasets viewer — the request cid, stripped of any ::sa:/::fa: suffix, is the dataset conv_id. Tooltip shows a 'click to view in dataset' hint; bars get a pointer cursor only when a mapping exists. Backfilled workflow_run 27915787191 (the dsv4/b300/vllm run incl. point 422083) → cc-traces-weka-062126. Verified: clicking a timeline bar on /inference/agentic/422083 navigates to the matching /datasets/cc-traces-weka-062126/conversations/. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../agentic-point/agentic-point-detail.tsx | 5 ++- .../agentic-point/dataset-conv-id.test.ts | 27 ++++++++++++ .../agentic-point/request-timeline.tsx | 43 +++++++++++++++++-- .../src/hooks/api/use-benchmark-siblings.ts | 1 + packages/db/migrations/012_run_datasets.sql | 19 ++++++++ packages/db/src/queries/benchmark-siblings.ts | 7 ++- 6 files changed, 97 insertions(+), 5 deletions(-) create mode 100644 packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts create mode 100644 packages/db/migrations/012_run_datasets.sql diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index 278ad8f7..4a076955 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -225,7 +225,10 @@ export function 
AgenticPointDetail({ id }: Props) { Loading request timeline…
) : timelineQuery.data ? ( - + ) : (
No per-request timeline for benchmark point #{id} — the profile_export.jsonl artifact diff --git a/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts new file mode 100644 index 00000000..a7ebbd8c --- /dev/null +++ b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts @@ -0,0 +1,27 @@ +import { describe, expect, it } from 'vitest'; + +import { datasetConvId } from './request-timeline'; + +describe('datasetConvId', () => { + it('returns a plain conversation id unchanged', () => { + expect(datasetConvId('002001296e8a8c38ad9d7cc436d691afc602')).toBe( + '002001296e8a8c38ad9d7cc436d691afc602', + ); + }); + + it('strips a ::sa: subagent suffix to the parent conv id', () => { + expect(datasetConvId('002001296e8a8c38ad9d7cc436d691afc602::sa:subagent_004_27c95af7')).toBe( + '002001296e8a8c38ad9d7cc436d691afc602', + ); + }); + + it('strips a ::fa: forked-agent suffix', () => { + expect(datasetConvId('02bc0afb13f7a2d9efa86c28511261d85c0e::fa:007')).toBe( + '02bc0afb13f7a2d9efa86c28511261d85c0e', + ); + }); + + it('strips at the first :: even with a trailing stream index', () => { + expect(datasetConvId('abc::sa:agent_1:s2')).toBe('abc'); + }); +}); diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx index 7c5fdab0..655556fb 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx +++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx @@ -1,9 +1,21 @@ 'use client'; import { useCallback, useMemo, useRef, useState } from 'react'; +import { useRouter } from 'next/navigation'; import { type RequestRecord, type RequestTimeline } from '@/hooks/api/use-request-timeline'; import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle'; +import { track } from 
'@/lib/analytics'; + +/** + * The dataset conversation id for a request: the cid with any subagent/forked + * suffix (`::sa:…`, `::fa:…`) stripped. This is exactly the `conv_id` stored in + * dataset_conversations, so it deep-links into /datasets//conversations/. + */ +export function datasetConvId(cid: string): string { + const i = cid.indexOf('::'); + return i === -1 ? cid : cid.slice(0, i); +} /** * Gantt-style request timeline for one agentic benchmark point. @@ -317,7 +329,7 @@ interface TooltipData { req: RequestRecord; } -function Tooltip({ data }: { data: TooltipData }) { +function Tooltip({ data, linkable }: { data: TooltipData; linkable?: boolean }) { const { row, req } = data; const totalMs = (req.end - req.start) / 1e6; const queueMs = (req.start - req.credit) / 1e6; @@ -377,14 +389,37 @@ function Tooltip({ data }: { data: TooltipData }) {
Started at {formatTickLabel(req.start)}
+ {linkable && ( +
+ Click to view this conversation in the dataset → +
+ )}
); } -export function RequestTimelineView({ data }: { data: RequestTimeline }) { +export function RequestTimelineView({ + data, + datasetSlug, +}: { + data: RequestTimeline; + /** Source dataset slug for this run; enables click-to-conversation deep links. */ + datasetSlug?: string | null; +}) { + const router = useRouter(); const [rowMode, setRowMode] = useState('conversation'); const [phaseFilter, setPhaseFilter] = useState('profiling'); const [tooltip, setTooltip] = useState(null); + + const openConversation = useCallback( + (cid: string) => { + if (!datasetSlug) return; + const convId = datasetConvId(cid); + track('agentic_timeline_to_dataset', { slug: datasetSlug }); + router.push(`/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}`); + }, + [datasetSlug, router], + ); // Which multi-stream subagents currently have their per-stream rows // expanded. Key is the subagent row's `key` (parent_cid::sa:agent_id). const [expandedSubagents, setExpandedSubagents] = useState>(() => new Set()); @@ -798,6 +833,8 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { key={`${req.cid}-${req.ti}-${req.start}`} onMouseMove={(e) => setTooltip({ x: e.clientX, y: e.clientY, row, req })} onMouseLeave={() => setTooltip(null)} + onClick={datasetSlug ? () => openConversation(req.cid) : undefined} + style={datasetSlug ? { cursor: 'pointer' } : undefined} > {/* Queue lead-in (faint) — only drawn when noticeable. */} {queueW >= 1 && ( @@ -910,7 +947,7 @@ export function RequestTimelineView({ data }: { data: RequestTimeline }) { )} {/* Tooltip */} - {tooltip && } + {tooltip && }
); } diff --git a/packages/app/src/hooks/api/use-benchmark-siblings.ts b/packages/app/src/hooks/api/use-benchmark-siblings.ts index 55720bdf..f8bef99e 100644 --- a/packages/app/src/hooks/api/use-benchmark-siblings.ts +++ b/packages/app/src/hooks/api/use-benchmark-siblings.ts @@ -31,6 +31,7 @@ export interface BenchmarkSku { benchmark_type: string; github_run_id: number; date: string; + dataset_slug: string | null; } export interface BenchmarkSiblings { diff --git a/packages/db/migrations/012_run_datasets.sql b/packages/db/migrations/012_run_datasets.sql new file mode 100644 index 00000000..58dd9f88 --- /dev/null +++ b/packages/db/migrations/012_run_datasets.sql @@ -0,0 +1,19 @@ +-- Maps a benchmark workflow_run to the source dataset it replayed, so the +-- agentic detail page can deep-link each request in the timeline to the exact +-- conversation in the /datasets viewer (the request's conversation_id, with any +-- ::sa:/::fa: suffix stripped, is the dataset conv_id). +-- +-- One row per workflow_run (every benchmark in a run replays the same dataset). +-- dataset_slug is a plain slug (matches datasets.slug / the /datasets/ +-- URL) rather than an FK, so the mapping can be recorded before/independent of +-- the dataset being ingested; the UI degrades gracefully if the slug is absent. +-- +-- Additive only. 
To revert: +-- drop table if exists run_datasets; +-- delete from schema_migrations where filename = '012_run_datasets.sql'; + +create table run_datasets ( + workflow_run_id bigint primary key references workflow_runs(id) on delete cascade, + dataset_slug text not null, + created_at timestamptz not null default now() +); diff --git a/packages/db/src/queries/benchmark-siblings.ts b/packages/db/src/queries/benchmark-siblings.ts index c7e4a317..2d36eb22 100644 --- a/packages/db/src/queries/benchmark-siblings.ts +++ b/packages/db/src/queries/benchmark-siblings.ts @@ -47,6 +47,8 @@ export interface BenchmarkSku { /** Human-readable workflow_run summary so the page header can hint at provenance. */ github_run_id: number; date: string; + /** Slug of the source dataset this run replayed (run_datasets), or null. */ + dataset_slug: string | null; } export interface BenchmarkSiblings { @@ -63,10 +65,11 @@ export async function getBenchmarkSiblings( select c.hardware, c.framework, c.model, c.precision, c.spec_method, br.benchmark_type, br.workflow_run_id, br.date::text, - wr.github_run_id + wr.github_run_id, rd.dataset_slug from benchmark_results br join configs c on c.id = br.config_id join workflow_runs wr on wr.id = br.workflow_run_id + left join run_datasets rd on rd.workflow_run_id = br.workflow_run_id where br.id = ${benchmarkResultId} `) as unknown as { hardware: string; @@ -78,6 +81,7 @@ export async function getBenchmarkSiblings( workflow_run_id: number; date: string; github_run_id: number; + dataset_slug: string | null; }[]; const root = seed[0]; if (!root) return null; @@ -158,6 +162,7 @@ export async function getBenchmarkSiblings( benchmark_type: root.benchmark_type, github_run_id: Number(root.github_run_id), date: root.date, + dataset_slug: root.dataset_slug ?? 
null, }, siblings, }; From 6b700a3ccbc53fbc7e109360a2e5baa582e588c9 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 22 Jun 2026 17:31:14 -0500 Subject: [PATCH 83/96] feat(datasets): deep-link request-timeline bar to the exact turn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The timeline link now carries ?turn= (and &sa= for subagent requests). The flamegraph resolves the target node — main turns by ordinal, subagent turns by matching the group's agentId then the ti-th child — expands the subagent group if needed, scrolls the row into view, and flashes a ring. subagentIdOf strips the harness stream suffix (:s and :aux:) so the cid's agent id matches the dataset SubagentNode.agentId. Verified end-to-end: clicking a subagent bar on /inference/agentic/422083 opens the conversation, expands the right group, and highlights the exact subturn. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../components/datasets/conversation-view.tsx | 18 +++++- .../components/datasets/trace-flamegraph.tsx | 60 +++++++++++++++++-- .../agentic-point/dataset-conv-id.test.ts | 28 ++++++++- .../agentic-point/request-timeline.tsx | 30 ++++++++-- 4 files changed, 125 insertions(+), 11 deletions(-) diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx index 43992c41..ba1d0532 100644 --- a/packages/app/src/components/datasets/conversation-view.tsx +++ b/packages/app/src/components/datasets/conversation-view.tsx @@ -1,5 +1,6 @@ 'use client'; +import { useState } from 'react'; import Link from 'next/link'; import { Card } from '@/components/ui/card'; @@ -17,6 +18,17 @@ function compact(n: number): string { export function ConversationView({ slug, convId }: { slug: string; convId: string }) { const { data, isLoading, isError } = useDatasetConversation(slug, convId); + // Deep-link target from a request-timeline click: ?turn=[&sa=]. 
+ // Read once from the URL on mount (matches the app's window-based url-state + // reads; avoids a Suspense boundary for useSearchParams). + const [highlight] = useState<{ turn: number | null; agent: string | null }>(() => { + if (typeof window === 'undefined') return { turn: null, agent: null }; + const p = new URLSearchParams(window.location.search); + const turnRaw = p.get('turn'); + const turn = turnRaw !== null && /^\d+$/u.test(turnRaw) ? Number(turnRaw) : null; + return { turn, agent: p.get('sa') }; + }); + if (isLoading) { return (
Loading conversation…
@@ -85,7 +97,11 @@ export function ConversationView({ slug, convId }: { slug: string; convId: strin click a group to expand it. Each bar splits input into cached prefix and uncached suffix, plus generated output.

- +
); diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx index 12cc14ec..3995a9c5 100644 --- a/packages/app/src/components/datasets/trace-flamegraph.tsx +++ b/packages/app/src/components/datasets/trace-flamegraph.tsx @@ -1,6 +1,6 @@ 'use client'; -import { useCallback, useMemo, useState } from 'react'; +import { useCallback, useEffect, useMemo, useRef, useState } from 'react'; import type { ConversationStructure, StructureNode } from '@/hooks/api/use-datasets'; @@ -52,12 +52,58 @@ interface TooltipState { * children (collapsed by default). Each bar stacks cached-prefix + uncached * input + output, scaled to the widest visible turn. */ -export function TraceFlamegraph({ structure }: { structure: ConversationStructure }) { +export function TraceFlamegraph({ + structure, + highlightTurn, + highlightAgentId, +}: { + structure: ConversationStructure; + /** Turn index to scroll to / highlight (from a request-timeline deep link). */ + highlightTurn?: number | null; + /** Subagent id when the highlighted turn is inside a subagent group. */ + highlightAgentId?: string | null; +}) { const nodes = structure.nodes; - // Subagent groups collapsed by default. - const [expanded, setExpanded] = useState>(() => new Set()); + // Resolve the deep-link target to a row key (+ the group that must be open to + // show it). Main turns match by their main-turn ordinal; subagent turns match + // the group by agentId, then the ti-th child. 
+ const target = useMemo(() => { + if (typeof highlightTurn !== 'number' || highlightTurn < 0) return null; + if (highlightAgentId) { + const gi = nodes.findIndex((n) => n.kind === 'subagent' && n.agentId === highlightAgentId); + if (gi === -1) return null; + const group = nodes[gi] as Extract; + if (highlightTurn >= group.children.length) return null; + return { rowKey: `g-${gi}-c-${highlightTurn}`, expandGroup: gi }; + } + let ordinal = 0; + for (let i = 0; i < nodes.length; i++) { + if (nodes[i].kind === 'turn') { + if (ordinal === highlightTurn) return { rowKey: `t-${i}`, expandGroup: null }; + ordinal += 1; + } + } + return null; + }, [nodes, highlightTurn, highlightAgentId]); + + // Subagent groups collapsed by default — except the deep-link target's group. + const [expanded, setExpanded] = useState>(() => + typeof target?.expandGroup === 'number' ? new Set([target.expandGroup]) : new Set(), + ); const [tooltip, setTooltip] = useState(null); + const scrollRef = useRef(null); + + // Scroll the target row into view and flash a highlight once it's rendered. + useEffect(() => { + if (!target) return; + const el = scrollRef.current?.querySelector(`[data-rowkey="${target.rowKey}"]`); + if (!el) return; + el.scrollIntoView({ block: 'center', behavior: 'smooth' }); + el.classList.add('ring-2', 'ring-primary', 'rounded-sm'); + const t = setTimeout(() => el.classList.remove('ring-2', 'ring-primary', 'rounded-sm'), 2600); + return () => clearTimeout(t); + }, [target]); const groupIndexes = useMemo(() => { const out: number[] = []; @@ -187,7 +233,10 @@ export function TraceFlamegraph({ structure }: { structure: ConversationStructur )}
-
+
{rows.map((row) => { // Group headers use the group axis; turns/subturns use the per-turn @@ -200,6 +249,7 @@ export function TraceFlamegraph({ structure }: { structure: ConversationStructur return (
diff --git a/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts index a7ebbd8c..f55d6131 100644 --- a/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts +++ b/packages/app/src/components/inference/agentic-point/dataset-conv-id.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from 'vitest'; -import { datasetConvId } from './request-timeline'; +import { datasetConvId, subagentIdOf } from './request-timeline'; describe('datasetConvId', () => { it('returns a plain conversation id unchanged', () => { @@ -25,3 +25,29 @@ describe('datasetConvId', () => { expect(datasetConvId('abc::sa:agent_1:s2')).toBe('abc'); }); }); + +describe('subagentIdOf', () => { + it('returns null for a main-conversation cid', () => { + expect(subagentIdOf('002001296e8a8c38ad9d7cc436d691afc602')).toBeNull(); + }); + + it('extracts the subagent id from a ::sa: cid', () => { + expect(subagentIdOf('002001296e8a8c38ad9d7cc436d691afc602::sa:subagent_004_27c95af7')).toBe( + 'subagent_004_27c95af7', + ); + }); + + it('drops a trailing :s index from the subagent id', () => { + expect(subagentIdOf('abc::sa:subagent_001_f552fe6f:s3')).toBe('subagent_001_f552fe6f'); + }); + + it('drops an :aux: stream suffix from the subagent id', () => { + expect(subagentIdOf('04dba6fe::sa:subagent_001_b00fdc12:aux:011')).toBe( + 'subagent_001_b00fdc12', + ); + }); + + it('returns null for a ::fa: forked-agent cid (no matching subagent group)', () => { + expect(subagentIdOf('02bc0afb13f7a2d9efa86c28511261d85c0e::fa:007')).toBeNull(); + }); +}); diff --git a/packages/app/src/components/inference/agentic-point/request-timeline.tsx b/packages/app/src/components/inference/agentic-point/request-timeline.tsx index 655556fb..baf3dc1f 100644 --- a/packages/app/src/components/inference/agentic-point/request-timeline.tsx +++ b/packages/app/src/components/inference/agentic-point/request-timeline.tsx @@ 
-17,6 +17,21 @@ export function datasetConvId(cid: string): string { return i === -1 ? cid : cid.slice(0, i); } +/** + * The subagent id encoded in a cid (`…::sa:[:s|:aux:]`), or null + * for a main-conversation request. The harness fans a single subagent into + * parallel streams with a `:s` or `:aux:` suffix; the dataset + * SubagentNode.agentId is the bare base (e.g. `subagent_001_b00fdc12`). Agent + * ids never contain a colon, so the base is everything up to the first one. + */ +export function subagentIdOf(cid: string): string | null { + const i = cid.indexOf('::sa:'); + if (i === -1) return null; + const raw = cid.slice(i + '::sa:'.length); + const colon = raw.indexOf(':'); + return colon === -1 ? raw : raw.slice(0, colon); +} + /** * Gantt-style request timeline for one agentic benchmark point. * @@ -412,11 +427,18 @@ export function RequestTimelineView({ const [tooltip, setTooltip] = useState(null); const openConversation = useCallback( - (cid: string) => { + (req: RequestRecord) => { if (!datasetSlug) return; - const convId = datasetConvId(cid); + const convId = datasetConvId(req.cid); + // Carry the turn (and, for subagent requests, the subagent id) so the + // flamegraph can scroll to / highlight the exact node this bar maps to. + const params = new URLSearchParams({ turn: String(req.ti) }); + const sa = subagentIdOf(req.cid); + if (sa) params.set('sa', sa); track('agentic_timeline_to_dataset', { slug: datasetSlug }); - router.push(`/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}`); + router.push( + `/datasets/${datasetSlug}/conversations/${encodeURIComponent(convId)}?${params.toString()}`, + ); }, [datasetSlug, router], ); @@ -833,7 +855,7 @@ export function RequestTimelineView({ key={`${req.cid}-${req.ti}-${req.start}`} onMouseMove={(e) => setTooltip({ x: e.clientX, y: e.clientY, row, req })} onMouseLeave={() => setTooltip(null)} - onClick={datasetSlug ? () => openConversation(req.cid) : undefined} + onClick={datasetSlug ? 
() => openConversation(req) : undefined} style={datasetSlug ? { cursor: 'pointer' } : undefined} > {/* Queue lead-in (faint) — only drawn when noticeable. */} From 83fcd04e16649ca7a8fb3b1b78231c8588f274e8 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 22 Jun 2026 17:44:05 -0500 Subject: [PATCH 84/96] fix(datasets): visible turn highlight + pointer-tracking flamegraph tooltip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Deep-link highlight is now state-driven (bg-primary/20 + ring, fades over 700ms) instead of fragile classList mutation, so it's clearly visible and survives re-renders. Subagent groups still auto-expand and scroll into view. - Portal the hover tooltip to document.body so its position:fixed is viewport-relative — an ancestor transform was offsetting it away from the cursor. Now it sits at pointer+12px. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../components/datasets/trace-flamegraph.tsx | 96 +++++++++++-------- 1 file changed, 57 insertions(+), 39 deletions(-) diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx index 3995a9c5..53f13b6a 100644 --- a/packages/app/src/components/datasets/trace-flamegraph.tsx +++ b/packages/app/src/components/datasets/trace-flamegraph.tsx @@ -1,6 +1,7 @@ 'use client'; import { useCallback, useEffect, useMemo, useRef, useState } from 'react'; +import { createPortal } from 'react-dom'; import type { ConversationStructure, StructureNode } from '@/hooks/api/use-datasets'; @@ -94,14 +95,23 @@ export function TraceFlamegraph({ const [tooltip, setTooltip] = useState(null); const scrollRef = useRef(null); - // Scroll the target row into view and flash a highlight once it's rendered. + // Portal target only exists after mount (the tooltip is portaled to body so + // its position:fixed is viewport-relative, immune to ancestor transforms). 
+ const [mounted, setMounted] = useState(false); + useEffect(() => setMounted(true), []); + + // The deep-link target row gets a state-driven highlight (ring + bg flash) + // that fades out — state-driven so a re-render can't clobber it, and so the + // fade is a real CSS transition rather than an abrupt classList removal. + const [highlightKey, setHighlightKey] = useState(target?.rowKey ?? null); + + // Scroll the target row into view once it's rendered, then fade the highlight. useEffect(() => { if (!target) return; + setHighlightKey(target.rowKey); const el = scrollRef.current?.querySelector(`[data-rowkey="${target.rowKey}"]`); - if (!el) return; - el.scrollIntoView({ block: 'center', behavior: 'smooth' }); - el.classList.add('ring-2', 'ring-primary', 'rounded-sm'); - const t = setTimeout(() => el.classList.remove('ring-2', 'ring-primary', 'rounded-sm'), 2600); + el?.scrollIntoView({ block: 'center', behavior: 'smooth' }); + const t = setTimeout(() => setHighlightKey(null), 2200); return () => clearTimeout(t); }, [target]); @@ -246,11 +256,14 @@ export function TraceFlamegraph({ const cw = row.total > 0 ? (row.cached / row.total) * 100 : 0; const uw = row.total > 0 ? (row.uncached / row.total) * 100 : 0; const ow = row.total > 0 ? (row.output / row.total) * 100 : 0; + const isHighlighted = row.key === highlightKey; return (
{/* label / group toggle */} @@ -300,39 +313,44 @@ export function TraceFlamegraph({
- {tooltip && ( -
-
- {tooltip.row.label} - {tooltip.row.sublabel ? ( - {tooltip.row.sublabel} - ) : null} -
-
- Cached prefix - - {compact(tooltip.row.cached)} - - Uncached input - - {compact(tooltip.row.uncached)} - - Output - - {compact(tooltip.row.output)} - - Cached % - - {tooltip.row.cached + tooltip.row.uncached > 0 - ? `${((tooltip.row.cached / (tooltip.row.cached + tooltip.row.uncached)) * 100).toFixed(0)}%` - : '—'} - -
-
- )} + {tooltip && + mounted && + createPortal( +
+
+ {tooltip.row.label} + {tooltip.row.sublabel ? ( + + {tooltip.row.sublabel} + + ) : null} +
+
+ Cached prefix + + {compact(tooltip.row.cached)} + + Uncached input + + {compact(tooltip.row.uncached)} + + Output + + {compact(tooltip.row.output)} + + Cached % + + {tooltip.row.cached + tooltip.row.uncached > 0 + ? `${((tooltip.row.cached / (tooltip.row.cached + tooltip.row.uncached)) * 100).toFixed(0)}%` + : '—'} + +
+
, + document.body, + )}
); } From 3c40d31172cce46f5e150223bcfa092ff573288f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 22 Jun 2026 17:58:47 -0500 Subject: [PATCH 85/96] fix(datasets): deep-link highlight fires on first navigation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The conversation page read ?turn/&sa from window.location.search in a useState initializer, which captures stale/empty params during a client-side navigation — so scroll+highlight+expand only worked after a manual reload. Switch to the reactive useSearchParams (page wrapped in Suspense) so the params are present on the first nav. Also make the flamegraph expand the target subagent group via an effect (reacting to target changes), and defer the scroll one frame so the just-expanded child row exists. Verified via a real timeline click — no reload. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../[slug]/conversations/[convId]/page.tsx | 5 ++++- .../components/datasets/conversation-view.tsx | 19 ++++++++-------- .../components/datasets/trace-flamegraph.tsx | 22 +++++++++++++++---- 3 files changed, 31 insertions(+), 15 deletions(-) diff --git a/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx index 75702c1b..83eb56a0 100644 --- a/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx +++ b/packages/app/src/app/datasets/[slug]/conversations/[convId]/page.tsx @@ -1,3 +1,4 @@ +import { Suspense } from 'react'; import type { Metadata } from 'next'; import { ConversationView } from '@/components/datasets/conversation-view'; @@ -25,7 +26,9 @@ export default async function ConversationPage({ params }: Props) { return (
- + + +
); diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx index ba1d0532..739d3bb2 100644 --- a/packages/app/src/components/datasets/conversation-view.tsx +++ b/packages/app/src/components/datasets/conversation-view.tsx @@ -1,7 +1,7 @@ 'use client'; -import { useState } from 'react'; import Link from 'next/link'; +import { useSearchParams } from 'next/navigation'; import { Card } from '@/components/ui/card'; import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph'; @@ -19,15 +19,14 @@ export function ConversationView({ slug, convId }: { slug: string; convId: strin const { data, isLoading, isError } = useDatasetConversation(slug, convId); // Deep-link target from a request-timeline click: ?turn=[&sa=]. - // Read once from the URL on mount (matches the app's window-based url-state - // reads; avoids a Suspense boundary for useSearchParams). - const [highlight] = useState<{ turn: number | null; agent: string | null }>(() => { - if (typeof window === 'undefined') return { turn: null, agent: null }; - const p = new URLSearchParams(window.location.search); - const turnRaw = p.get('turn'); - const turn = turnRaw !== null && /^\d+$/u.test(turnRaw) ? Number(turnRaw) : null; - return { turn, agent: p.get('sa') }; - }); + // useSearchParams (not a one-shot window.location read) so the params are + // present on the very first client-side navigation, not just after a reload. + const params = useSearchParams(); + const turnRaw = params.get('turn'); + const highlight = { + turn: turnRaw !== null && /^\d+$/u.test(turnRaw) ? 
Number(turnRaw) : null, + agent: params.get('sa'), + }; if (isLoading) { return ( diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx index 53f13b6a..a577193b 100644 --- a/packages/app/src/components/datasets/trace-flamegraph.tsx +++ b/packages/app/src/components/datasets/trace-flamegraph.tsx @@ -105,14 +105,28 @@ export function TraceFlamegraph({ // fade is a real CSS transition rather than an abrupt classList removal. const [highlightKey, setHighlightKey] = useState(target?.rowKey ?? null); - // Scroll the target row into view once it's rendered, then fade the highlight. + // When the deep-link target resolves/changes: expand its subagent group, then + // (after the row renders) scroll it into view and flash the highlight. Runs on + // first load and on any later target change (e.g. clicking another bar into + // the same conversation). The row query/scroll is deferred to the next frame + // so the just-expanded child row exists in the DOM. useEffect(() => { if (!target) return; + if (typeof target.expandGroup === 'number') { + const gi = target.expandGroup; + setExpanded((prev) => (prev.has(gi) ? 
prev : new Set(prev).add(gi))); + } setHighlightKey(target.rowKey); - const el = scrollRef.current?.querySelector(`[data-rowkey="${target.rowKey}"]`); - el?.scrollIntoView({ block: 'center', behavior: 'smooth' }); + const raf = requestAnimationFrame(() => { + scrollRef.current + ?.querySelector(`[data-rowkey="${target.rowKey}"]`) + ?.scrollIntoView({ block: 'center', behavior: 'smooth' }); + }); const t = setTimeout(() => setHighlightKey(null), 2200); - return () => clearTimeout(t); + return () => { + cancelAnimationFrame(raf); + clearTimeout(t); + }; }, [target]); const groupIndexes = useMemo(() => { From e460ea2300f57912eff46d92fbb6fb447fc435e4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 22 Jun 2026 22:34:55 -0500 Subject: [PATCH 86/96] fix(high-contrast): stable line colors when deselecting legend items MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In HC mode the iwanthue palette is sized and indexed by the key set it's generated over. ScatterGraph generated it from the *active* (selected) hw set, so deselecting a line shrank the set, re-sized the palette, and shifted every remaining line's hue — most visible on single-vendor agentic runs (which span the full hue wheel since 2c06009), where deselecting B300 could recolor B200 from red to blue. Pass the stable full set of hw-types-with-data as hcKeys so the palette and per-key index are fixed; toggling now only hides/shows lines without recoloring the rest. Adds a useThemeColors regression test asserting a line's HC color is identical across active subsets when hcKeys is the full set. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../components/inference/ui/ScatterGraph.tsx | 8 ++++++ packages/app/src/hooks/useThemeColors.test.ts | 28 +++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/packages/app/src/components/inference/ui/ScatterGraph.tsx b/packages/app/src/components/inference/ui/ScatterGraph.tsx index 76231522..77770ec0 100644 --- a/packages/app/src/components/inference/ui/ScatterGraph.tsx +++ b/packages/app/src/components/inference/ui/ScatterGraph.tsx @@ -381,10 +381,18 @@ const ScatterGraph = React.memo( () => [...effectiveOfficialHwTypes], [effectiveOfficialHwTypes], ); + // High-contrast palette is keyed off the FULL set of official hw types with + // data, not the active subset. Otherwise deselecting a line shrinks the key + // set, which re-sizes the iwanthue palette and shifts every remaining line's + // hue (most visible for single-vendor agentic runs that span the full wheel — + // e.g. deselecting B300 would recolor B200 from red to blue). Keying off the + // stable full set fixes each hw's color so toggling only hides/shows lines. + const stableHcKeys = useMemo(() => [...hwTypesWithData], [hwTypesWithData]); const { resolveColor, getCssColor } = useThemeColors({ highContrast, identifiers: activeHwKeys, activeKeys: activeOfficialKeys, + hcKeys: stableHcKeys, }); // --- Changelog --- diff --git a/packages/app/src/hooks/useThemeColors.test.ts b/packages/app/src/hooks/useThemeColors.test.ts index 7275e384..11050d19 100644 --- a/packages/app/src/hooks/useThemeColors.test.ts +++ b/packages/app/src/hooks/useThemeColors.test.ts @@ -170,4 +170,32 @@ describe('useThemeColors color maps', () => { } unmountOn(); }); + + // Regression: deselecting a legend line must not recolor the remaining lines. 
+ // The HC palette is sized/indexed by the key set it's generated over, so when + // it was generated over the *active* subset (no hcKeys), shrinking the + // selection re-sized the palette and shifted every remaining line's hue (most + // visible on single-vendor agentic runs spanning the full wheel). Passing a + // stable `hcKeys` (the full set with data) fixes each line's color. + it('keeps a line HC color stable across active subsets when hcKeys is the full set', () => { + const FULL = ['b200', 'b300']; // single-vendor (NVIDIA) agentic comparison + + const all = renderHook(() => + useThemeColors({ highContrast: true, activeKeys: ['b200', 'b300'], hcKeys: FULL }), + ); + const b200WithBoth = all.result.current.resolveColor('b200'); + const b300Color = all.result.current.resolveColor('b300'); + all.unmount(); + + // b300 deselected → only b200 active, but hcKeys is still the full set. + const subset = renderHook(() => + useThemeColors({ highContrast: true, activeKeys: ['b200'], hcKeys: FULL }), + ); + const b200Alone = subset.result.current.resolveColor('b200'); + subset.unmount(); + + expect(b200WithBoth).toMatch(/^#[0-9a-f]{6}$/iu); + expect(b200WithBoth).not.toBe(b300Color); // HC still produces distinct hues + expect(b200Alone).toBe(b200WithBoth); // deselecting b300 did NOT recolor b200 + }); }); From a912eab780a76ba015b21590d3c162e0fd4c37ea Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Tue, 23 Jun 2026 01:04:28 -0500 Subject: [PATCH 87/96] chore(security): bump dompurify override to >=3.4.11 (GHSA-cmwh-pvxp-8882) --- pnpm-lock.yaml | 52 ++++++++++++++++++++++++++++++++------------- pnpm-workspace.yaml | 2 +- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index cdd8a01d..bb7bb824 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -5,7 +5,7 @@ settings: excludeLinksFromLockfile: false overrides: - dompurify@<3.4.9: '>=3.4.9' + dompurify@<=3.4.10: '>=3.4.11' 
esbuild@>=0.27.3 <0.28.1: '>=0.28.1' form-data@>=4.0.0 <4.0.6: '>=4.0.6' hono@<4.12.21: '>=4.12.21' @@ -20,7 +20,7 @@ importers: devDependencies: '@babel/core': specifier: ^7.29.6 - version: 7.29.7 + version: 7.29.7(supports-color@8.1.1) audit-ci: specifier: ^7.1.0 version: 7.1.0 @@ -2994,9 +2994,6 @@ packages: resolution: {integrity: sha512-WkrWp9GR4KXfKGYzOLmTuGVi1UWFfws377n9cc55/tb6DuqyF6pcQ5AbiHEshaDpY9v6oaSr2XCDidGmMwdzIA==} engines: {node: '>=8'} - dompurify@3.4.10: - resolution: {integrity: sha512-0xzNv0e7oYC6yyuOGZIABPM4qtg3QxLFniDNPP4ZP90wR8Yq3zgwpRbrNiT4N3IKqDbbYFEJLV+JWEs19aZ//w==} - dompurify@3.4.11: resolution: {integrity: sha512-zhlUV12GsaRzMsf9q5M254YhA4+VuF0fG+QFqu6aYpoGlKtz+w8//jBcGVYBgQkR5GHjUomejY84AV+/uPbWdw==} @@ -5538,7 +5535,27 @@ snapshots: '@babel/helpers': 7.29.7 '@babel/parser': 7.29.7 '@babel/template': 7.29.7 - '@babel/traverse': 7.29.7 + '@babel/traverse': 7.29.7(supports-color@8.1.1) + '@babel/types': 7.29.7 + '@jridgewell/remapping': 2.3.5 + convert-source-map: 2.0.0 + debug: 4.4.3(supports-color@8.1.1) + gensync: 1.0.0-beta.2 + json5: 2.2.3 + semver: 6.3.1 + transitivePeerDependencies: + - supports-color + + '@babel/core@7.29.7(supports-color@8.1.1)': + dependencies: + '@babel/code-frame': 7.29.7 + '@babel/generator': 7.29.7 + '@babel/helper-compilation-targets': 7.29.7 + '@babel/helper-module-transforms': 7.29.7(@babel/core@7.29.7(supports-color@8.1.1)) + '@babel/helpers': 7.29.7 + '@babel/parser': 7.29.7 + '@babel/template': 7.29.7 + '@babel/traverse': 7.29.7(supports-color@8.1.1) '@babel/types': 7.29.7 '@jridgewell/remapping': 2.3.5 convert-source-map: 2.0.0 @@ -5569,17 +5586,26 @@ snapshots: '@babel/helper-module-imports@7.29.7': dependencies: - '@babel/traverse': 7.29.7 + '@babel/traverse': 7.29.7(supports-color@8.1.1) '@babel/types': 7.29.7 transitivePeerDependencies: - supports-color + '@babel/helper-module-transforms@7.29.7(@babel/core@7.29.7(supports-color@8.1.1))': + dependencies: + '@babel/core': 
7.29.7(supports-color@8.1.1) + '@babel/helper-module-imports': 7.29.7 + '@babel/helper-validator-identifier': 7.29.7 + '@babel/traverse': 7.29.7(supports-color@8.1.1) + transitivePeerDependencies: + - supports-color + '@babel/helper-module-transforms@7.29.7(@babel/core@7.29.7)': dependencies: '@babel/core': 7.29.7 '@babel/helper-module-imports': 7.29.7 '@babel/helper-validator-identifier': 7.29.7 - '@babel/traverse': 7.29.7 + '@babel/traverse': 7.29.7(supports-color@8.1.1) transitivePeerDependencies: - supports-color @@ -5621,7 +5647,7 @@ snapshots: '@babel/parser': 7.29.7 '@babel/types': 7.29.7 - '@babel/traverse@7.29.7': + '@babel/traverse@7.29.7(supports-color@8.1.1)': dependencies: '@babel/code-frame': 7.29.7 '@babel/generator': 7.29.7 @@ -7981,10 +8007,6 @@ snapshots: dependencies: path-type: 4.0.0 - dompurify@3.4.10: - optionalDependencies: - '@types/trusted-types': 2.0.7 - dompurify@3.4.11: optionalDependencies: '@types/trusted-types': 2.0.7 @@ -8812,7 +8834,7 @@ snapshots: jest-worker@27.5.1: dependencies: - '@types/node': 25.9.3 + '@types/node': 26.0.0 merge-stream: 2.0.0 supports-color: 8.1.1 @@ -9790,7 +9812,7 @@ snapshots: '@posthog/core': 1.35.3 '@posthog/types': 1.390.2 core-js: 3.49.0 - dompurify: 3.4.10 + dompurify: 3.4.11 fflate: 0.4.8 preact: 10.29.2 query-selector-shadow-dom: 1.0.1 diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml index c6ea723c..361059bb 100644 --- a/pnpm-workspace.yaml +++ b/pnpm-workspace.yaml @@ -25,7 +25,7 @@ auditConfig: - GHSA-h67p-54hq-rp68 overrides: - dompurify@<3.4.9: '>=3.4.9' + dompurify@<=3.4.10: '>=3.4.11' esbuild@>=0.27.3 <0.28.1: '>=0.28.1' form-data@>=4.0.0 <4.0.6: '>=4.0.6' hono@<4.12.21: '>=4.12.21' From ba6bc1ce6cedce56d45c8fcd96a74c3cd53879dc Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Tue, 23 Jun 2026 01:28:18 -0500 Subject: [PATCH 88/96] test(e2e): align selector testid with scenario-selector rename; rewrite x-axis toggle test for single-chart mode 
buttons --- .../app/cypress/e2e/dropdown-switching.cy.ts | 4 +- .../app/cypress/e2e/historical-trends.cy.ts | 4 +- .../app/cypress/e2e/ttft-x-axis-toggle.cy.ts | 64 +++++++++---------- packages/app/cypress/e2e/url-params.cy.ts | 10 +-- 4 files changed, 39 insertions(+), 43 deletions(-) diff --git a/packages/app/cypress/e2e/dropdown-switching.cy.ts b/packages/app/cypress/e2e/dropdown-switching.cy.ts index ac88dc84..4bc8b695 100644 --- a/packages/app/cypress/e2e/dropdown-switching.cy.ts +++ b/packages/app/cypress/e2e/dropdown-switching.cy.ts @@ -17,10 +17,10 @@ describe('Dropdown one-click switching', () => { cy.get('[data-testid="model-selector"]').should('have.attr', 'aria-expanded', 'true'); cy.get('[role="option"]').should('have.length.greaterThan', 0); - cy.get('[data-testid="sequence-selector"]').click(); + cy.get('[data-testid="scenario-selector"]').click(); cy.get('[data-testid="model-selector"]').should('have.attr', 'aria-expanded', 'false'); - cy.get('[data-testid="sequence-selector"]').should('have.attr', 'aria-expanded', 'true'); + cy.get('[data-testid="scenario-selector"]').should('have.attr', 'aria-expanded', 'true'); cy.get('[role="option"]').should('have.length.greaterThan', 0); }); diff --git a/packages/app/cypress/e2e/historical-trends.cy.ts b/packages/app/cypress/e2e/historical-trends.cy.ts index f0a70a56..55b0e274 100644 --- a/packages/app/cypress/e2e/historical-trends.cy.ts +++ b/packages/app/cypress/e2e/historical-trends.cy.ts @@ -88,8 +88,8 @@ describe('Historical Trends — Content & Interactions', () => { delete doc.body.dataset.scrollLocked; doc.body.style.removeProperty('pointer-events'); }); - cy.get('[data-testid="sequence-selector"]').should('be.visible'); - cy.get('[data-testid="sequence-selector"]').click(); + cy.get('[data-testid="scenario-selector"]').should('be.visible'); + cy.get('[data-testid="scenario-selector"]').click(); cy.get('[role="option"]').should('have.length.greaterThan', 0); cy.get('body').type('{esc}'); }); diff --git 
a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts index e17a4aff..636a7ccf 100644 --- a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts +++ b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts @@ -1,46 +1,42 @@ -describe('TTFT X-Axis Toggle (E2E chart)', () => { +describe('X-Axis Mode Toggle (inference chart)', () => { before(() => { - cy.window().then((win) => { - win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now())); + cy.visit('/inference', { + onBeforeLoad(win) { + win.localStorage.setItem('inferencex-star-modal-dismissed', String(Date.now())); + }, }); - cy.visit('/inference'); - cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 2); + cy.get('[data-testid="x-axis-mode-buttons"]').should('be.visible'); + cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 1); }); - it('shows the x-axis dropdown in the e2e chart heading', () => { - cy.get('[data-testid="chart-figure"]') - .eq(1) - .find('h2 button') - .should('contain.text', 'vs.') - .and('contain.text', 'Latency'); + it('shows the x-axis mode buttons with Interactivity active by default', () => { + cy.get('[data-testid="x-axis-mode-ttft"]').should('be.visible'); + cy.get('[data-testid="x-axis-mode-e2e"]').should('be.visible'); + cy.get('[data-testid="x-axis-mode-interactivity"]') + .should('be.visible') + .and('have.attr', 'aria-selected', 'true'); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'vs. 
Interactivity'); }); - it('opens popover with three x-axis options', () => { - cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click(); - cy.get('[data-slot="popover-content"]').within(() => { - cy.contains('End-to-end Latency').should('exist'); - cy.contains('P99 TTFT').should('exist'); - cy.contains('Median TTFT').should('exist'); - }); - }); - - it('switches x-axis to P99 TTFT and updates the heading', () => { - cy.get('[data-slot="popover-content"]').contains('P99 TTFT').click(); - cy.get('[data-testid="chart-figure"]').eq(1).find('h2').should('contain.text', 'P99 TTFT'); + it('switches the x-axis to TTFT and updates the heading', () => { + cy.get('[data-testid="x-axis-mode-ttft"]').click(); + cy.get('[data-testid="x-axis-mode-ttft"]').should('have.attr', 'aria-selected', 'true'); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Time To First Token'); }); - it('switches x-axis to Median TTFT and updates the heading', () => { - cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click(); - cy.get('[data-slot="popover-content"]').contains('Median TTFT').click(); - cy.get('[data-testid="chart-figure"]').eq(1).find('h2').should('contain.text', 'Median TTFT'); + it('switches the x-axis to E2E Latency and updates the heading', () => { + cy.get('[data-testid="x-axis-mode-e2e"]').click(); + cy.get('[data-testid="x-axis-mode-e2e"]').should('have.attr', 'aria-selected', 'true'); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'End-to-end Latency'); }); - it('switches back to End-to-end Latency', () => { - cy.get('[data-testid="chart-figure"]').eq(1).find('h2 button').click(); - cy.get('[data-slot="popover-content"]').contains('End-to-end Latency').click(); - cy.get('[data-testid="chart-figure"]') - .eq(1) - .find('h2') - .should('contain.text', 'End-to-end Latency'); + it('switches back to Interactivity', () => { + cy.get('[data-testid="x-axis-mode-interactivity"]').click(); + 
cy.get('[data-testid="x-axis-mode-interactivity"]').should( + 'have.attr', + 'aria-selected', + 'true', + ); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'vs. Interactivity'); }); }); diff --git a/packages/app/cypress/e2e/url-params.cy.ts b/packages/app/cypress/e2e/url-params.cy.ts index 3c480686..927aee5f 100644 --- a/packages/app/cypress/e2e/url-params.cy.ts +++ b/packages/app/cypress/e2e/url-params.cy.ts @@ -21,7 +21,7 @@ const visitWithErrorSpy = (path: string) => { }; const assertNoHydrationMismatch = () => { - cy.get('[data-testid="sequence-selector"]').should('be.visible'); + cy.get('[data-testid="scenario-selector"]').should('be.visible'); cy.get('@consoleError').then((spy) => { const calls = (spy as unknown as { args: unknown[][] }).args; const hydration = calls.filter((args) => @@ -152,7 +152,7 @@ describe('URL Parameter Persistence', () => { it('/inference?i_seq=1k/1k seeds the sequence without a hydration error', () => { visitWithErrorSpy('/inference?i_seq=1k/1k'); - cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K'); + cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K'); assertNoHydrationMismatch(); }); @@ -160,13 +160,13 @@ describe('URL Parameter Persistence', () => { // Visit the canonical model-prefixed slug so the assertion is directly // about the rendered page, not about a bare-slug redirect interleaving. 
visitWithErrorSpy('/compare/deepseek-r1-h100-vs-h200?i_seq=1k/1k'); - cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K'); + cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K'); assertNoHydrationMismatch(); }); it('/compare/[slug] with invalid ?i_seq=junk falls back to the seeded default', () => { visitWithErrorSpy('/compare/deepseek-r1-h100-vs-h200?i_seq=junk'); - cy.get('[data-testid="sequence-selector"]') + cy.get('[data-testid="scenario-selector"]') .invoke('text') .should('not.contain', 'junk') .and('match', /[18]K . [18]K/u); @@ -228,7 +228,7 @@ describe('URL Parameter Persistence', () => { // `effectivePrecisions` intersects the selection with available precisions // and the UI may render the fallback. dsr1 + fp8 + 1k/1k is supported. visitWithErrorSpy('/inference?i_seq=1k/1k&g_model=DeepSeek-R1-0528&i_prec=fp8'); - cy.get('[data-testid="sequence-selector"]').should('contain.text', '1K / 1K'); + cy.get('[data-testid="scenario-selector"]').should('contain.text', '1K / 1K'); cy.get('[data-testid="model-selector"]').should('contain.text', 'DeepSeek'); cy.get('[data-testid="precision-multiselect"]').should('contain.text', 'FP8'); assertNoHydrationMismatch(); From ada19b54e41ea3ad87cdfc22dd3d27e1a3d7df44 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Tue, 23 Jun 2026 01:41:03 -0500 Subject: [PATCH 89/96] test(datasets): component tests for distribution card, trace flamegraph (incl deep-link), and dataset list states --- .../app/cypress/component/dataset-list.cy.tsx | 93 +++++++++++++++++++ .../component/distribution-card.cy.tsx | 45 +++++++++ .../cypress/component/trace-flamegraph.cy.tsx | 86 +++++++++++++++++ 3 files changed, 224 insertions(+) create mode 100644 packages/app/cypress/component/dataset-list.cy.tsx create mode 100644 packages/app/cypress/component/distribution-card.cy.tsx create mode 100644 packages/app/cypress/component/trace-flamegraph.cy.tsx diff 
--git a/packages/app/cypress/component/dataset-list.cy.tsx b/packages/app/cypress/component/dataset-list.cy.tsx new file mode 100644 index 00000000..f7cfcb9a --- /dev/null +++ b/packages/app/cypress/component/dataset-list.cy.tsx @@ -0,0 +1,93 @@ +import { QueryClient, QueryClientProvider } from '@tanstack/react-query'; +import { AppRouterContext } from 'next/dist/shared/lib/app-router-context.shared-runtime'; + +import { DatasetList } from '@/components/datasets/dataset-list'; +import type { DatasetRecord } from '@/hooks/api/use-datasets'; + +const datasets: DatasetRecord[] = [ + { + id: 'ds-1', + slug: 'cc-traces-weka-full', + label: 'cc-traces-weka (full)', + variant: 'full', + description: 'Every captured request, unmodified.', + hf_url: 'https://huggingface.co/datasets/semianalysisai/cc-traces-weka-full', + license: 'apache-2.0', + conversation_count: 1234, + summary: { + totalIn: 5_000_000, + totalOut: 250_000, + cachedPct: 0.82, + mainTurns: 9800, + subagentGroups: 540, + }, + ingested_at: '2026-06-20T00:00:00Z', + }, + { + id: 'ds-2', + slug: 'cc-traces-weka-256k', + label: 'cc-traces-weka (256k)', + variant: '256k', + description: 'Turns trimmed to a 256k context window.', + hf_url: null, + license: 'apache-2.0', + conversation_count: 980, + summary: { + totalIn: 3_200_000, + totalOut: 180_000, + cachedPct: 0.79, + mainTurns: 7600, + subagentGroups: 410, + }, + ingested_at: '2026-06-19T00:00:00Z', + }, +]; + +function createMockRouter() { + return { + push: cy.stub(), + replace: cy.stub(), + refresh: cy.stub(), + back: cy.stub(), + forward: cy.stub(), + prefetch: cy.stub().resolves(), + }; +} + +function mountList() { + const queryClient = new QueryClient({ defaultOptions: { queries: { retry: false } } }); + cy.mount( + + + + + , + ); +} + +describe('DatasetList', () => { + it('renders a card per dataset with its summary stats', () => { + cy.intercept('GET', '/api/v1/datasets', { statusCode: 200, body: datasets }).as('list'); + mountList(); + 
cy.wait('@list'); + cy.contains('cc-traces-weka (full)').should('be.visible'); + cy.contains('cc-traces-weka (256k)').should('be.visible'); + cy.contains('1,234').should('be.visible'); // conversation_count, localized + cy.contains('82%').should('be.visible'); // cachedPct + cy.get('a[href="/datasets/cc-traces-weka-full"]').should('exist'); + }); + + it('shows the empty state when no datasets are ingested', () => { + cy.intercept('GET', '/api/v1/datasets', { statusCode: 200, body: [] }).as('empty'); + mountList(); + cy.wait('@empty'); + cy.contains('No datasets ingested yet.').should('be.visible'); + }); + + it('shows the error state when the request fails', () => { + cy.intercept('GET', '/api/v1/datasets', { statusCode: 500, body: { error: 'boom' } }).as('err'); + mountList(); + cy.wait('@err'); + cy.contains('Failed to load datasets.').should('be.visible'); + }); +}); diff --git a/packages/app/cypress/component/distribution-card.cy.tsx b/packages/app/cypress/component/distribution-card.cy.tsx new file mode 100644 index 00000000..fb7e5461 --- /dev/null +++ b/packages/app/cypress/component/distribution-card.cy.tsx @@ -0,0 +1,45 @@ +import { DistributionCard } from '@/components/datasets/distribution-card'; +import type { Distribution } from '@/hooks/api/use-datasets'; + +const distribution: Distribution = { + bins: [ + { x0: 0, x1: 100, count: 5 }, + { x0: 100, x1: 200, count: 20 }, + { x0: 200, x1: 300, count: 12 }, + { x0: 300, x1: 400, count: 3 }, + ], + stats: { count: 40, min: 10, max: 390, mean: 180, median: 175, p90: 320 }, +}; + +describe('DistributionCard', () => { + it('renders the title, summary stats, and one bar per bin', () => { + cy.mount( + , + ); + cy.contains('Input tokens per turn').should('be.visible'); + cy.contains('n=40').should('be.visible'); + cy.contains('median 175').should('be.visible'); + cy.contains('p90 320').should('be.visible'); + // One filled bar rect per bin (ChartHover may add a transparent overlay rect). 
+ cy.get('rect[class*="fill-primary"]').should('have.length', distribution.bins.length); + }); + + it('shows a "No data" placeholder when no distribution is provided', () => { + cy.mount(); + cy.contains('Empty metric').should('be.visible'); + cy.contains('No data').should('be.visible'); + cy.get('rect[class*="fill-primary"]').should('not.exist'); + }); + + it('marks the chart as log scale when scale="log"', () => { + cy.mount( + , + ); + cy.contains('log scale').should('be.visible'); + }); +}); diff --git a/packages/app/cypress/component/trace-flamegraph.cy.tsx b/packages/app/cypress/component/trace-flamegraph.cy.tsx new file mode 100644 index 00000000..1be90e0c --- /dev/null +++ b/packages/app/cypress/component/trace-flamegraph.cy.tsx @@ -0,0 +1,86 @@ +import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph'; +import type { ConversationStructure } from '@/hooks/api/use-datasets'; + +// Two main turns followed by one subagent group with two child turns. +// Node indices: 0 = turn, 1 = turn, 2 = subagent (so its rows key off `g-2`). 
+const structure: ConversationStructure = { + blockSize: 64, + nodes: [ + { kind: 'turn', turnIndex: 0, model: 'claude', in: 1000, out: 200, cached: 600, uncached: 400 }, + { + kind: 'turn', + turnIndex: 1, + model: 'claude', + in: 2000, + out: 300, + cached: 1500, + uncached: 500, + }, + { + kind: 'subagent', + label: 'Subagent: search', + agentId: 'agent-1', + durationMs: 12000, + in: 5000, + out: 800, + cached: 3000, + uncached: 2000, + children: [ + { + kind: 'turn', + turnIndex: 0, + model: 'claude', + in: 2500, + out: 400, + cached: 1500, + uncached: 1000, + }, + { + kind: 'turn', + turnIndex: 1, + model: 'claude', + in: 2500, + out: 400, + cached: 1500, + uncached: 1000, + }, + ], + }, + ], + totals: { in: 8000, out: 1300, cached: 5100, uncached: 2900, numTurns: 2, numSubagentGroups: 1 }, +}; + +describe('TraceFlamegraph', () => { + it('renders the legend, main-turn rows, and the subagent group header', () => { + cy.mount(); + cy.contains('Cached prefix').should('be.visible'); + cy.contains('Uncached input').should('be.visible'); + cy.contains('Output').should('be.visible'); + cy.get('[data-rowkey="t-0"]').should('contain.text', 'Turn 1'); + cy.get('[data-rowkey="t-1"]').should('contain.text', 'Turn 2'); + cy.contains('Subagent: search').should('be.visible'); + }); + + it('keeps subagent children collapsed until the group is expanded', () => { + cy.mount(); + cy.get('[data-rowkey="g-2-c-0"]').should('not.exist'); + cy.contains('button', 'Subagent: search').click(); + cy.get('[data-rowkey="g-2-c-0"]').should('be.visible'); + cy.get('[data-rowkey="g-2-c-1"]').should('be.visible'); + }); + + it('expand all / collapse all toggles every subagent group', () => { + cy.mount(); + cy.contains('button', 'Expand all').click(); + cy.get('[data-rowkey="g-2-c-0"]').should('be.visible'); + cy.contains('button', 'Collapse all').click(); + cy.get('[data-rowkey="g-2-c-0"]').should('not.exist'); + }); + + it('auto-expands and highlights the target group child for a 
request-timeline deep link', () => { + cy.mount( + , + ); + cy.get('[data-rowkey="g-2-c-1"]').should('be.visible').and('have.class', 'ring-primary'); + }); +}); From 1c61ee3f597e22d33e891b73f7f95511a73844d3 Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Tue, 23 Jun 2026 01:47:02 -0500 Subject: [PATCH 90/96] refactor(datasets): extract shared compact() formatter, dedupe 5 local copies --- .../src/components/datasets/conversation-view.tsx | 9 +-------- .../app/src/components/datasets/dataset-detail.tsx | 9 +-------- .../app/src/components/datasets/dataset-list.tsx | 9 +-------- .../src/components/datasets/distribution-card.tsx | 11 +---------- packages/app/src/components/datasets/format.ts | 12 ++++++++++++ .../app/src/components/datasets/trace-flamegraph.tsx | 9 +-------- 6 files changed, 17 insertions(+), 42 deletions(-) create mode 100644 packages/app/src/components/datasets/format.ts diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx index 739d3bb2..d39b83d9 100644 --- a/packages/app/src/components/datasets/conversation-view.tsx +++ b/packages/app/src/components/datasets/conversation-view.tsx @@ -6,14 +6,7 @@ import { useSearchParams } from 'next/navigation'; import { Card } from '@/components/ui/card'; import { TraceFlamegraph } from '@/components/datasets/trace-flamegraph'; import { useDatasetConversation } from '@/hooks/api/use-datasets'; - -function compact(n: number): string { - const abs = Math.abs(n); - if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`; - if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`; - if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`; - return String(Math.round(n)); -} +import { compact } from './format'; export function ConversationView({ slug, convId }: { slug: string; convId: string }) { const { data, isLoading, isError } = useDatasetConversation(slug, convId); diff --git 
a/packages/app/src/components/datasets/dataset-detail.tsx b/packages/app/src/components/datasets/dataset-detail.tsx index 57c50649..9410a505 100644 --- a/packages/app/src/components/datasets/dataset-detail.tsx +++ b/packages/app/src/components/datasets/dataset-detail.tsx @@ -18,14 +18,7 @@ import { type ConversationSort, } from '@/hooks/api/use-datasets'; import { track } from '@/lib/analytics'; - -function compact(n: number): string { - const abs = Math.abs(n); - if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`; - if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`; - if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`; - return String(Math.round(n)); -} +import { compact } from './format'; const PAGE = 50; diff --git a/packages/app/src/components/datasets/dataset-list.tsx b/packages/app/src/components/datasets/dataset-list.tsx index 5fcc0dfe..84b279db 100644 --- a/packages/app/src/components/datasets/dataset-list.tsx +++ b/packages/app/src/components/datasets/dataset-list.tsx @@ -5,14 +5,7 @@ import Link from 'next/link'; import { Card } from '@/components/ui/card'; import { useDatasets, type DatasetRecord } from '@/hooks/api/use-datasets'; import { track } from '@/lib/analytics'; - -function compact(n: number): string { - const abs = Math.abs(n); - if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`; - if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`; - if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`; - return String(Math.round(n)); -} +import { compact } from './format'; function DatasetCard({ d }: { d: DatasetRecord }) { const s = d.summary ?? 
{}; diff --git a/packages/app/src/components/datasets/distribution-card.tsx b/packages/app/src/components/datasets/distribution-card.tsx index 7abc367f..d0c0f166 100644 --- a/packages/app/src/components/datasets/distribution-card.tsx +++ b/packages/app/src/components/datasets/distribution-card.tsx @@ -5,16 +5,7 @@ import { useMemo } from 'react'; import { Card } from '@/components/ui/card'; import { ChartHover, type HoverItem } from '@/components/inference/agentic-point/chart-hover'; import type { Distribution } from '@/hooks/api/use-datasets'; - -/** Compact token/count formatter: 1234 → "1.2k", 1_200_000 → "1.2M". */ -function compact(n: number): string { - const abs = Math.abs(n); - if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`; - if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`; - if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`; - if (abs > 0 && abs < 1) return n.toFixed(2); - return String(Math.round(n)); -} +import { compact } from './format'; interface DistributionCardProps { title: string; diff --git a/packages/app/src/components/datasets/format.ts b/packages/app/src/components/datasets/format.ts new file mode 100644 index 00000000..f6f5530c --- /dev/null +++ b/packages/app/src/components/datasets/format.ts @@ -0,0 +1,12 @@ +/** + * Compact number formatter for dataset token/count displays: + * 1234 → "1.2k", 1_200_000 → "1.2M", 3.2e9 → "3.2B", 0.82 → "0.82". 
+ */ +export function compact(n: number): string { + const abs = Math.abs(n); + if (abs >= 1e9) return `${(n / 1e9).toFixed(1)}B`; + if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`; + if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`; + if (abs > 0 && abs < 1) return n.toFixed(2); + return String(Math.round(n)); +} diff --git a/packages/app/src/components/datasets/trace-flamegraph.tsx b/packages/app/src/components/datasets/trace-flamegraph.tsx index a577193b..12ecb4a4 100644 --- a/packages/app/src/components/datasets/trace-flamegraph.tsx +++ b/packages/app/src/components/datasets/trace-flamegraph.tsx @@ -4,14 +4,7 @@ import { useCallback, useEffect, useMemo, useRef, useState } from 'react'; import { createPortal } from 'react-dom'; import type { ConversationStructure, StructureNode } from '@/hooks/api/use-datasets'; - -/** Compact token formatter: 1234 → "1.2k", 1_200_000 → "1.2M". */ -function compact(n: number): string { - const abs = Math.abs(n); - if (abs >= 1e6) return `${(n / 1e6).toFixed(1)}M`; - if (abs >= 1e3) return `${(n / 1e3).toFixed(1)}k`; - return String(Math.round(n)); -} +import { compact } from './format'; // Stacked-bar segment colors. Cached prefix vs uncached input vs output — // fixed hues (theme-independent) so the meaning is stable in light/dark. 
From e2e5424e7071d380d05b7c1bcfddfc5bccfc3c5b Mon Sep 17 00:00:00 2001 From: adibarra <93070681+adibarra@users.noreply.github.com> Date: Tue, 23 Jun 2026 10:26:34 -0500 Subject: [PATCH 91/96] refactor(db): squash agentic migrations into 007_agentic.sql so numbering doesn't collide with master --- .claude/agents/ingest.md | 2 +- .../db/migrations/002_agentic_scenario.sql | 30 -- .../migrations/003_agentic_availability.sql | 21 -- packages/db/migrations/004_offload_mode.sql | 42 --- .../migrations/006_agentic_trace_replay.sql | 34 -- packages/db/migrations/007_agentic.sql | 326 ++++++++++++++++++ .../007_agentic_trace_server_metrics_json.sql | 17 - .../008_agentic_aggregate_stats.sql | 18 - .../migrations/009_agentic_chart_series.sql | 19 - .../010_agentic_request_timeline.sql | 15 - packages/db/migrations/011_datasets.sql | 55 --- packages/db/migrations/012_run_datasets.sql | 19 - 12 files changed, 327 insertions(+), 271 deletions(-) delete mode 100644 packages/db/migrations/002_agentic_scenario.sql delete mode 100644 packages/db/migrations/003_agentic_availability.sql delete mode 100644 packages/db/migrations/004_offload_mode.sql delete mode 100644 packages/db/migrations/006_agentic_trace_replay.sql create mode 100644 packages/db/migrations/007_agentic.sql delete mode 100644 packages/db/migrations/007_agentic_trace_server_metrics_json.sql delete mode 100644 packages/db/migrations/008_agentic_aggregate_stats.sql delete mode 100644 packages/db/migrations/009_agentic_chart_series.sql delete mode 100644 packages/db/migrations/010_agentic_request_timeline.sql delete mode 100644 packages/db/migrations/011_datasets.sql delete mode 100644 packages/db/migrations/012_run_datasets.sql diff --git a/.claude/agents/ingest.md b/.claude/agents/ingest.md index aa0099ac..4ecbc1dd 100644 --- a/.claude/agents/ingest.md +++ b/.claude/agents/ingest.md @@ -178,7 +178,7 @@ cd packages/db && DATABASE_WRITE_URL='' \ [--label "…"] [--variant full|256k] [--description "…"] [--limit N] ``` -It 
populates the `datasets` + `dataset_conversations` tables (migration `011_datasets.sql`) that back the `/datasets` pages — upsert/replace per dataset, then purge the API cache like any other ingest. Same write-URL rule applies (direct, non-pooled, provided by the invoker). +It populates the `datasets` + `dataset_conversations` tables (migration `007_agentic.sql`) that back the `/datasets` pages — upsert/replace per dataset, then purge the API cache like any other ingest. Same write-URL rule applies (direct, non-pooled, provided by the invoker). ## Don't diff --git a/packages/db/migrations/002_agentic_scenario.sql b/packages/db/migrations/002_agentic_scenario.sql deleted file mode 100644 index c143914e..00000000 --- a/packages/db/migrations/002_agentic_scenario.sql +++ /dev/null @@ -1,30 +0,0 @@ --- Support agentic scenarios in benchmark_results. --- --- Scenarios are discriminated by benchmark_type: --- 'single_turn' — fixed-seq-len runs (1k1k, 1k8k, 8k1k, …). isl/osl set. --- 'agentic_traces' — trace-replay agentic runs. isl/osl NULL. --- --- conc retains its meaning (concurrent users/requests) for both. - --- 1) isl/osl become nullable for agentic rows -alter table benchmark_results - alter column isl drop not null, - alter column osl drop not null; - --- 2) CHECK constraints: positive-or-null -alter table benchmark_results - drop constraint benchmark_results_isl_positive, - drop constraint benchmark_results_osl_positive; - -alter table benchmark_results - add constraint benchmark_results_isl_positive check (isl is null or isl > 0), - add constraint benchmark_results_osl_positive check (osl is null or osl > 0); - --- 3) Uniqueness must treat (NULL, NULL) pairs as equal so agentic rows --- can't duplicate on (workflow_run_id, config_id, benchmark_type, conc). 
-alter table benchmark_results - drop constraint benchmark_results_unique; - -alter table benchmark_results - add constraint benchmark_results_unique unique nulls not distinct - (workflow_run_id, config_id, benchmark_type, isl, osl, conc); diff --git a/packages/db/migrations/003_agentic_availability.sql b/packages/db/migrations/003_agentic_availability.sql deleted file mode 100644 index e96cbd50..00000000 --- a/packages/db/migrations/003_agentic_availability.sql +++ /dev/null @@ -1,21 +0,0 @@ --- Extend the availability table to cover agentic scenarios. --- --- The 002 migration relaxed benchmark_results.isl/osl to nullable; do the same --- for availability and add benchmark_type so the frontend can enumerate --- agentic vs single_turn scenarios per model/date. --- --- Postgres primary keys require every column to be NOT NULL, so we drop the PK --- and replace it with a UNIQUE NULLS NOT DISTINCT constraint — functionally --- equivalent except it allows isl/osl to be NULL for agentic rows. - -alter table availability - drop constraint availability_pkey; - -alter table availability - alter column isl drop not null, - alter column osl drop not null, - add column benchmark_type text not null default 'single_turn'; - -alter table availability - add constraint availability_natural_key unique nulls not distinct - (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date); diff --git a/packages/db/migrations/004_offload_mode.sql b/packages/db/migrations/004_offload_mode.sql deleted file mode 100644 index 24b617f1..00000000 --- a/packages/db/migrations/004_offload_mode.sql +++ /dev/null @@ -1,42 +0,0 @@ --- Add offload_mode as a first-class dimension on benchmark_results. --- --- KV-cache offload (on/off) is a meaningful sweep dimension for agentic-trace --- runs: a single run may emit two rows for the same (config, isl, osl, conc) --- — one with offload disabled, one enabled. 
The pre-existing unique key --- collapsed those into one row, forcing the ingest to skip variants. --- --- For fixed-seq runs `offload_mode` defaults to 'off', which matches the --- assumption baked into the existing 5,500+ rows. - -alter table benchmark_results - add column offload_mode text not null default 'off'; - --- Backfill agentic rows from the offload_mode value already living in metrics --- JSONB (set during the earlier agentic ingest backfill). -update benchmark_results - set offload_mode = metrics->>'offload_mode' - where benchmark_type = 'agentic_traces' - and metrics ? 'offload_mode'; - --- Replace the unique constraint so on/off variants can coexist. -alter table benchmark_results - drop constraint benchmark_results_unique; - -alter table benchmark_results - add constraint benchmark_results_unique unique nulls not distinct - (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode); - --- Rebuild the latest-per-config materialized view to dedupe by offload_mode too. -drop materialized view if exists latest_benchmarks cascade; - -create materialized view latest_benchmarks as -select distinct on (br.config_id, br.conc, br.isl, br.osl, br.offload_mode) - br.* -from benchmark_results br -join latest_workflow_runs wr on wr.id = br.workflow_run_id -where br.error is null -order by br.config_id, br.conc, br.isl, br.osl, br.offload_mode, br.date desc; - -create unique index latest_benchmarks_pk - on latest_benchmarks (config_id, conc, isl, osl, offload_mode) nulls not distinct; -create index latest_benchmarks_model_idx on latest_benchmarks (config_id); diff --git a/packages/db/migrations/006_agentic_trace_replay.sql b/packages/db/migrations/006_agentic_trace_replay.sql deleted file mode 100644 index 398bc725..00000000 --- a/packages/db/migrations/006_agentic_trace_replay.sql +++ /dev/null @@ -1,34 +0,0 @@ --- Capture raw aiperf trace files per agentic benchmark point. 
--- --- The aiperf harness produces two per-point export files inside each --- `agentic_` artifact: --- - profile_export.jsonl (~2 MB raw, per-request data) --- - server_metrics_export.csv (~20 KB raw, periodic Prometheus snapshots) --- --- We persist them so the dashboard can later show per-request distributions, --- KV cache utilization over time, and conversation traces without needing to --- re-download the GitHub artifacts. Storage stays in Postgres (TOASTed) — at --- ~500 KB per point post-gzip the total fits comfortably without a separate --- blob service. --- --- Mirrors the existing `server_logs` pattern (id-keyed sibling table + FK --- column on benchmark_results). Older, non-aiperf agentic runs simply have a --- NULL `trace_replay_id`. - -create table agentic_trace_replay ( - id bigserial primary key, - -- gzip(profile_export.jsonl); null when only the server metrics file existed - profile_export_jsonl_gz bytea, - profile_export_uncompressed_size bigint, - -- raw csv bytes; null when only the profile file existed - server_metrics_csv bytea, - server_metrics_csv_size bigint, - created_at timestamptz not null default now() -); - -alter table benchmark_results - add column trace_replay_id bigint references agentic_trace_replay(id); - -create index benchmark_results_trace_replay_idx - on benchmark_results (trace_replay_id) - where trace_replay_id is not null; diff --git a/packages/db/migrations/007_agentic.sql b/packages/db/migrations/007_agentic.sql new file mode 100644 index 00000000..eceea82e --- /dev/null +++ b/packages/db/migrations/007_agentic.sql @@ -0,0 +1,326 @@ +-- 007_agentic.sql +-- +-- Squashed agentic-benchmark + datasets schema. Collapses the feat/agentx +-- migrations 002_agentic_scenario .. 012_run_datasets into one file that sorts +-- after master's highest migration (006_benchmark_results_workers), so the +-- branch's numbering no longer collides with master's 002-006. 
None of the +-- collapsed migrations had been applied to any deployed database. +-- +-- Statement order is preserved exactly. The latest_benchmarks recreate uses +-- 'select br.*', so it retains every benchmark_results column added earlier +-- (including master's 'workers' from 006) and re-keys the view on offload_mode. + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 002_agentic_scenario.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Support agentic scenarios in benchmark_results. +-- +-- Scenarios are discriminated by benchmark_type: +-- 'single_turn' — fixed-seq-len runs (1k1k, 1k8k, 8k1k, …). isl/osl set. +-- 'agentic_traces' — trace-replay agentic runs. isl/osl NULL. +-- +-- conc retains its meaning (concurrent users/requests) for both. + +-- 1) isl/osl become nullable for agentic rows +alter table benchmark_results + alter column isl drop not null, + alter column osl drop not null; + +-- 2) CHECK constraints: positive-or-null +alter table benchmark_results + drop constraint benchmark_results_isl_positive, + drop constraint benchmark_results_osl_positive; + +alter table benchmark_results + add constraint benchmark_results_isl_positive check (isl is null or isl > 0), + add constraint benchmark_results_osl_positive check (osl is null or osl > 0); + +-- 3) Uniqueness must treat (NULL, NULL) pairs as equal so agentic rows +-- can't duplicate on (workflow_run_id, config_id, benchmark_type, conc). +alter table benchmark_results + drop constraint benchmark_results_unique; + +alter table benchmark_results + add constraint benchmark_results_unique unique nulls not distinct + (workflow_run_id, config_id, benchmark_type, isl, osl, conc); + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 003_agentic_availability.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Extend the availability table to cover agentic scenarios. 
+-- +-- The 002 migration relaxed benchmark_results.isl/osl to nullable; do the same +-- for availability and add benchmark_type so the frontend can enumerate +-- agentic vs single_turn scenarios per model/date. +-- +-- Postgres primary keys require every column to be NOT NULL, so we drop the PK +-- and replace it with a UNIQUE NULLS NOT DISTINCT constraint — functionally +-- equivalent except it allows isl/osl to be NULL for agentic rows. + +alter table availability + drop constraint availability_pkey; + +alter table availability + alter column isl drop not null, + alter column osl drop not null, + add column benchmark_type text not null default 'single_turn'; + +alter table availability + add constraint availability_natural_key unique nulls not distinct + (model, isl, osl, precision, hardware, framework, spec_method, disagg, benchmark_type, date); + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 004_offload_mode.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Add offload_mode as a first-class dimension on benchmark_results. +-- +-- KV-cache offload (on/off) is a meaningful sweep dimension for agentic-trace +-- runs: a single run may emit two rows for the same (config, isl, osl, conc) +-- — one with offload disabled, one enabled. The pre-existing unique key +-- collapsed those into one row, forcing the ingest to skip variants. +-- +-- For fixed-seq runs `offload_mode` defaults to 'off', which matches the +-- assumption baked into the existing 5,500+ rows. + +alter table benchmark_results + add column offload_mode text not null default 'off'; + +-- Backfill agentic rows from the offload_mode value already living in metrics +-- JSONB (set during the earlier agentic ingest backfill). +update benchmark_results + set offload_mode = metrics->>'offload_mode' + where benchmark_type = 'agentic_traces' + and metrics ? 'offload_mode'; + +-- Replace the unique constraint so on/off variants can coexist. 
+alter table benchmark_results + drop constraint benchmark_results_unique; + +alter table benchmark_results + add constraint benchmark_results_unique unique nulls not distinct + (workflow_run_id, config_id, benchmark_type, isl, osl, conc, offload_mode); + +-- Rebuild the latest-per-config materialized view to dedupe by offload_mode too. +drop materialized view if exists latest_benchmarks cascade; + +create materialized view latest_benchmarks as +select distinct on (br.config_id, br.conc, br.isl, br.osl, br.offload_mode) + br.* +from benchmark_results br +join latest_workflow_runs wr on wr.id = br.workflow_run_id +where br.error is null +order by br.config_id, br.conc, br.isl, br.osl, br.offload_mode, br.date desc; + +create unique index latest_benchmarks_pk + on latest_benchmarks (config_id, conc, isl, osl, offload_mode) nulls not distinct; +create index latest_benchmarks_model_idx on latest_benchmarks (config_id); + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 006_agentic_trace_replay.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Capture raw aiperf trace files per agentic benchmark point. +-- +-- The aiperf harness produces two per-point export files inside each +-- `agentic_` artifact: +-- - profile_export.jsonl (~2 MB raw, per-request data) +-- - server_metrics_export.csv (~20 KB raw, periodic Prometheus snapshots) +-- +-- We persist them so the dashboard can later show per-request distributions, +-- KV cache utilization over time, and conversation traces without needing to +-- re-download the GitHub artifacts. Storage stays in Postgres (TOASTed) — at +-- ~500 KB per point post-gzip the total fits comfortably without a separate +-- blob service. +-- +-- Mirrors the existing `server_logs` pattern (id-keyed sibling table + FK +-- column on benchmark_results). Older, non-aiperf agentic runs simply have a +-- NULL `trace_replay_id`. 
+ +create table agentic_trace_replay ( + id bigserial primary key, + -- gzip(profile_export.jsonl); null when only the server metrics file existed + profile_export_jsonl_gz bytea, + profile_export_uncompressed_size bigint, + -- raw csv bytes; null when only the profile file existed + server_metrics_csv bytea, + server_metrics_csv_size bigint, + created_at timestamptz not null default now() +); + +alter table benchmark_results + add column trace_replay_id bigint references agentic_trace_replay(id); + +create index benchmark_results_trace_replay_idx + on benchmark_results (trace_replay_id) + where trace_replay_id is not null; + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 007_agentic_trace_server_metrics_json.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Add the full server-metrics time-series JSON to agentic_trace_replay. +-- +-- The existing `server_metrics_csv` column holds aiperf's summary export — +-- one row per metric with avg/min/max/std/p1..p99 across the entire run. +-- That's enough for the cumulative cache-hit number but not for any +-- "metric over time" view (KV cache utilization curve, queue depth, prefix +-- hit rate per interval, cumulative prefill token source). +-- +-- The harness also writes `server_metrics_export.json` which contains the +-- raw per-scrape (~1Hz) values for every Prometheus metric over the whole +-- benchmark window. Raw size is ~250 MB per point but it compresses ~42x +-- to ~6 MB gzipped (text with repeated metric names + numeric values). +-- That's the file we store here for any future time-series chart. 
+ +alter table agentic_trace_replay + add column server_metrics_json_gz bytea, + add column server_metrics_json_uncompressed_size bigint; + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 008_agentic_aggregate_stats.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Pre-computed aggregate stats for each agentic_trace_replay row. +-- +-- Previously the agentic detail page parsed the (huge) profile_export.jsonl +-- and server_metrics_json blobs on every request to compute distribution +-- stats for ISL/OSL/KV-util/prefix-hit-rate, plus the per-point derived +-- metrics (session-time, p90 prefill TPS). That took ~20s per row and the +-- worst rows (high-conc TP+EP server_metrics blobs that decompress past +-- Node's 512 MB string cap) couldn't be parsed without a stream fallback. +-- +-- This column holds the computed stats so the API serves the page from a +-- single SQL row read. Shape mirrors the existing benchmark_results.metrics +-- JSONB convention; an inner `version` field lets the backfill script +-- detect rows whose stats were computed by an older algorithm and +-- recompute them. Null when stats haven't been computed yet (existing +-- rows pre-backfill; the API has a slow-path fallback for that case). + +alter table agentic_trace_replay + add column aggregate_stats jsonb; + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 009_agentic_chart_series.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Pre-computed time-series for the agentic detail page chart. +-- +-- Sibling to `aggregate_stats` (migration 008): that column stores +-- per-row percentile/derived *summaries*, this one stores the full +-- chart-ready time-series arrays (kvCacheUsage, prefixCacheHitRate, +-- queueDepth, prefillTps, decodeTps, promptTokensBySource). 
+-- +-- Without this, the detail page parsed the entire `server_metrics_json_gz` +-- blob on every request and blew up with ERR_STRING_TOO_LONG on high-conc +-- TP+EP rows (the blob decompresses past Node's 512 MB max-string-length). +-- With pre-computed series the page is a single SQL row read. +-- +-- Shape includes an inner `version` field so the backfill script can +-- recompute rows whose stored series were produced by an older algorithm. +-- Null when the series haven't been computed yet; the API has a slow-path +-- fallback (with stream-parse for oversized blobs) for that case. + +alter table agentic_trace_replay + add column chart_series jsonb; + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 010_agentic_request_timeline.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Pre-computed per-request timeline for the agentic detail page. +-- +-- Sibling to `aggregate_stats` (008) and `chart_series` (009). This one +-- holds a thin per-request array extracted from `profile_export_jsonl_gz` +-- so the detail page can render a Gantt-style swimlane of every request +-- (one bar per conversation turn) without re-parsing the JSONL on every +-- page load. +-- +-- Shape includes an inner `version` field so the backfill script can +-- recompute rows whose stored timeline was produced by an older +-- algorithm. Null when the timeline hasn't been computed yet; the API +-- falls back to parsing the blob in that case. + +alter table agentic_trace_replay + add column request_timeline jsonb; + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 011_datasets.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Agentic benchmarking source datasets (the HuggingFace cc-traces-weka corpora +-- the agentic benchmarks replay) + their per-conversation trace structure. 
+-- +-- The app already stores benchmark *replay* artifacts (agentic_trace_replay) but +-- not the source traces. These two tables back the new /datasets area: a +-- registry of ingested dataset versions with precomputed summary + chart data, +-- and one row per conversation holding a flamegraph-ready `structure` (turns + +-- subagent groups with input split into cached-prefix vs uncached-suffix). The +-- raw hash_ids are NOT stored — they're only needed at ingest to derive the +-- cached/uncached split, so the runtime read is a single small JSONB. +-- +-- Additive only. To revert this migration: +-- drop table if exists dataset_conversations; +-- drop table if exists datasets; +-- (and see the run_datasets revert below; this is all one migration now: +-- delete from schema_migrations where filename = '007_agentic.sql';) + +create table datasets ( + -- HuggingFace dataset id, e.g. 'semianalysisai/cc-traces-weka-062126'. + id text primary key, + -- URL key, e.g. 'cc-traces-weka-062126'. + slug text not null unique, + label text not null, + -- 'full' | '256k' | 'no-subagents' (the published variants). + variant text not null default 'full', + description text, + hf_url text, + license text, + conversation_count integer not null default 0, + -- Token totals, main_turns, subagent_groups, model mix, date range, etc. + summary jsonb not null default '{}'::jsonb, + -- Precomputed distributions for the dataset-detail cards (input/output length, + -- turns per conversation, subagent fan-out, …). Versioned via an inner field. + chart_data jsonb not null default '{}'::jsonb, + dataset_version integer not null default 1, + ingested_at timestamptz not null default now() +); + +create table dataset_conversations ( + id bigserial primary key, + dataset_id text not null references datasets(id) on delete cascade, + -- The conversation id from the dataset record (trace id). 
+ conv_id text not null, + models text[] not null default '{}', + num_turns integer not null default 0, + num_subagent_groups integer not null default 0, + total_in bigint not null default 0, + total_out bigint not null default 0, + total_cached bigint not null default 0, + -- Flamegraph-ready ordered node tree (turns + subagent groups, each with + -- in/out/cached/uncached token counts). See packages/db/src/etl/weka-structure.ts. + structure jsonb not null, + unique (dataset_id, conv_id) +); + +create index dataset_conversations_dataset_idx on dataset_conversations (dataset_id); + +-- ─────────────────────────────────────────────────────────────────────── +-- (was 012_run_datasets.sql) +-- ─────────────────────────────────────────────────────────────────────── +-- Maps a benchmark workflow_run to the source dataset it replayed, so the +-- agentic detail page can deep-link each request in the timeline to the exact +-- conversation in the /datasets viewer (the request's conversation_id, with any +-- ::sa:/::fa: suffix stripped, is the dataset conv_id). +-- +-- One row per workflow_run (every benchmark in a run replays the same dataset). +-- dataset_slug is a plain slug (matches datasets.slug / the /datasets/ +-- URL) rather than an FK, so the mapping can be recorded before/independent of +-- the dataset being ingested; the UI degrades gracefully if the slug is absent. +-- +-- Additive only. 
To revert this whole squashed migration: +-- drop table if exists run_datasets; +-- drop table if exists dataset_conversations; +-- drop table if exists datasets; +-- drop table if exists agentic_trace_replay cascade; +-- (plus the benchmark_results/availability column + constraint changes above) +-- delete from schema_migrations where filename = '007_agentic.sql'; + +create table run_datasets ( + workflow_run_id bigint primary key references workflow_runs(id) on delete cascade, + dataset_slug text not null, + created_at timestamptz not null default now() +); diff --git a/packages/db/migrations/007_agentic_trace_server_metrics_json.sql b/packages/db/migrations/007_agentic_trace_server_metrics_json.sql deleted file mode 100644 index ba7bd095..00000000 --- a/packages/db/migrations/007_agentic_trace_server_metrics_json.sql +++ /dev/null @@ -1,17 +0,0 @@ --- Add the full server-metrics time-series JSON to agentic_trace_replay. --- --- The existing `server_metrics_csv` column holds aiperf's summary export — --- one row per metric with avg/min/max/std/p1..p99 across the entire run. --- That's enough for the cumulative cache-hit number but not for any --- "metric over time" view (KV cache utilization curve, queue depth, prefix --- hit rate per interval, cumulative prefill token source). --- --- The harness also writes `server_metrics_export.json` which contains the --- raw per-scrape (~1Hz) values for every Prometheus metric over the whole --- benchmark window. Raw size is ~250 MB per point but it compresses ~42x --- to ~6 MB gzipped (text with repeated metric names + numeric values). --- That's the file we store here for any future time-series chart. 
- -alter table agentic_trace_replay - add column server_metrics_json_gz bytea, - add column server_metrics_json_uncompressed_size bigint; diff --git a/packages/db/migrations/008_agentic_aggregate_stats.sql b/packages/db/migrations/008_agentic_aggregate_stats.sql deleted file mode 100644 index d55533b9..00000000 --- a/packages/db/migrations/008_agentic_aggregate_stats.sql +++ /dev/null @@ -1,18 +0,0 @@ --- Pre-computed aggregate stats for each agentic_trace_replay row. --- --- Previously the agentic detail page parsed the (huge) profile_export.jsonl --- and server_metrics_json blobs on every request to compute distribution --- stats for ISL/OSL/KV-util/prefix-hit-rate, plus the per-point derived --- metrics (session-time, p90 prefill TPS). That took ~20s per row and the --- worst rows (high-conc TP+EP server_metrics blobs that decompress past --- Node's 512 MB string cap) couldn't be parsed without a stream fallback. --- --- This column holds the computed stats so the API serves the page from a --- single SQL row read. Shape mirrors the existing benchmark_results.metrics --- JSONB convention; an inner `version` field lets the backfill script --- detect rows whose stats were computed by an older algorithm and --- recompute them. Null when stats haven't been computed yet (existing --- rows pre-backfill; the API has a slow-path fallback for that case). - -alter table agentic_trace_replay - add column aggregate_stats jsonb; diff --git a/packages/db/migrations/009_agentic_chart_series.sql b/packages/db/migrations/009_agentic_chart_series.sql deleted file mode 100644 index b42718b9..00000000 --- a/packages/db/migrations/009_agentic_chart_series.sql +++ /dev/null @@ -1,19 +0,0 @@ --- Pre-computed time-series for the agentic detail page chart. 
--- --- Sibling to `aggregate_stats` (migration 008): that column stores --- per-row percentile/derived *summaries*, this one stores the full --- chart-ready time-series arrays (kvCacheUsage, prefixCacheHitRate, --- queueDepth, prefillTps, decodeTps, promptTokensBySource). --- --- Without this, the detail page parsed the entire `server_metrics_json_gz` --- blob on every request and blew up with ERR_STRING_TOO_LONG on high-conc --- TP+EP rows (the blob decompresses past Node's 512 MB max-string-length). --- With pre-computed series the page is a single SQL row read. --- --- Shape includes an inner `version` field so the backfill script can --- recompute rows whose stored series were produced by an older algorithm. --- Null when the series haven't been computed yet; the API has a slow-path --- fallback (with stream-parse for oversized blobs) for that case. - -alter table agentic_trace_replay - add column chart_series jsonb; diff --git a/packages/db/migrations/010_agentic_request_timeline.sql b/packages/db/migrations/010_agentic_request_timeline.sql deleted file mode 100644 index 756b775e..00000000 --- a/packages/db/migrations/010_agentic_request_timeline.sql +++ /dev/null @@ -1,15 +0,0 @@ --- Pre-computed per-request timeline for the agentic detail page. --- --- Sibling to `aggregate_stats` (008) and `chart_series` (009). This one --- holds a thin per-request array extracted from `profile_export_jsonl_gz` --- so the detail page can render a Gantt-style swimlane of every request --- (one bar per conversation turn) without re-parsing the JSONL on every --- page load. --- --- Shape includes an inner `version` field so the backfill script can --- recompute rows whose stored timeline was produced by an older --- algorithm. Null when the timeline hasn't been computed yet; the API --- falls back to parsing the blob in that case. 
- -alter table agentic_trace_replay - add column request_timeline jsonb; diff --git a/packages/db/migrations/011_datasets.sql b/packages/db/migrations/011_datasets.sql deleted file mode 100644 index 7a70d83f..00000000 --- a/packages/db/migrations/011_datasets.sql +++ /dev/null @@ -1,55 +0,0 @@ --- Agentic benchmarking source datasets (the HuggingFace cc-traces-weka corpora --- the agentic benchmarks replay) + their per-conversation trace structure. --- --- The app already stores benchmark *replay* artifacts (agentic_trace_replay) but --- not the source traces. These two tables back the new /datasets area: a --- registry of ingested dataset versions with precomputed summary + chart data, --- and one row per conversation holding a flamegraph-ready `structure` (turns + --- subagent groups with input split into cached-prefix vs uncached-suffix). The --- raw hash_ids are NOT stored — they're only needed at ingest to derive the --- cached/uncached split, so the runtime read is a single small JSONB. --- --- Additive only. To revert this migration: --- drop table if exists dataset_conversations; --- drop table if exists datasets; --- delete from schema_migrations where filename = '011_datasets.sql'; - -create table datasets ( - -- HuggingFace dataset id, e.g. 'semianalysisai/cc-traces-weka-062126'. - id text primary key, - -- URL key, e.g. 'cc-traces-weka-062126'. - slug text not null unique, - label text not null, - -- 'full' | '256k' | 'no-subagents' (the published variants). - variant text not null default 'full', - description text, - hf_url text, - license text, - conversation_count integer not null default 0, - -- Token totals, main_turns, subagent_groups, model mix, date range, etc. - summary jsonb not null default '{}'::jsonb, - -- Precomputed distributions for the dataset-detail cards (input/output length, - -- turns per conversation, subagent fan-out, …). Versioned via an inner field. 
- chart_data jsonb not null default '{}'::jsonb, - dataset_version integer not null default 1, - ingested_at timestamptz not null default now() -); - -create table dataset_conversations ( - id bigserial primary key, - dataset_id text not null references datasets(id) on delete cascade, - -- The conversation id from the dataset record (trace id). - conv_id text not null, - models text[] not null default '{}', - num_turns integer not null default 0, - num_subagent_groups integer not null default 0, - total_in bigint not null default 0, - total_out bigint not null default 0, - total_cached bigint not null default 0, - -- Flamegraph-ready ordered node tree (turns + subagent groups, each with - -- in/out/cached/uncached token counts). See packages/db/src/etl/weka-structure.ts. - structure jsonb not null, - unique (dataset_id, conv_id) -); - -create index dataset_conversations_dataset_idx on dataset_conversations (dataset_id); diff --git a/packages/db/migrations/012_run_datasets.sql b/packages/db/migrations/012_run_datasets.sql deleted file mode 100644 index 58dd9f88..00000000 --- a/packages/db/migrations/012_run_datasets.sql +++ /dev/null @@ -1,19 +0,0 @@ --- Maps a benchmark workflow_run to the source dataset it replayed, so the --- agentic detail page can deep-link each request in the timeline to the exact --- conversation in the /datasets viewer (the request's conversation_id, with any --- ::sa:/::fa: suffix stripped, is the dataset conv_id). --- --- One row per workflow_run (every benchmark in a run replays the same dataset). --- dataset_slug is a plain slug (matches datasets.slug / the /datasets/ --- URL) rather than an FK, so the mapping can be recorded before/independent of --- the dataset being ingested; the UI degrades gracefully if the slug is absent. --- --- Additive only. 
To revert: --- drop table if exists run_datasets; --- delete from schema_migrations where filename = '012_run_datasets.sql'; - -create table run_datasets ( - workflow_run_id bigint primary key references workflow_runs(id) on delete cascade, - dataset_slug text not null, - created_at timestamptz not null default now() -); From 772dfef5cde7a79d02963a9f151cb43b6592920e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 23 Jun 2026 10:57:37 -0500 Subject: [PATCH 92/96] add agentic time-series and dataset timing --- .../e2e/agentic-point-time-series.cy.ts | 98 +++++++++++++++++++ .../e2e/datasets-flamegraph-time.cy.ts | 85 ++++++++++++++++ .../components/datasets/conversation-view.tsx | 3 +- .../datasets/trace-flamegraph.test.ts | 16 +++ .../components/datasets/trace-flamegraph.tsx | 35 +++++++ .../agentic-point/agentic-point-detail.tsx | 97 +++++++++++++++++- .../agentic-point/expandable-chart.tsx | 30 ++++-- .../agentic-point/time-series-chart.test.ts | 73 +++++++++++++- .../agentic-point/time-series-chart.tsx | 60 ++++++++++++ .../app/src/hooks/api/use-request-timeline.ts | 2 + .../src/etl/compute-request-timeline.test.ts | 25 ++++- .../db/src/etl/compute-request-timeline.ts | 12 ++- packages/db/src/etl/weka-structure.test.ts | 28 +++++- packages/db/src/etl/weka-structure.ts | 40 ++++++++ 14 files changed, 586 insertions(+), 18 deletions(-) create mode 100644 packages/app/cypress/e2e/agentic-point-time-series.cy.ts create mode 100644 packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts create mode 100644 packages/app/src/components/datasets/trace-flamegraph.test.ts diff --git a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts new file mode 100644 index 00000000..b0cfb60d --- /dev/null +++ b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts @@ -0,0 +1,98 @@ +const timelineRequest = ( + index: number, + ttftMs: number, + tpotMs: number, + overrides: Record = {}, +) => ({ + cid: 
'conversation-1', + ti: index, + wid: 'worker-1', + ad: 0, + phase: 'profiling', + credit: index * 1_000_000_000, + start: index * 1_000_000_000, + ack: null, + end: (index + 1) * 1_000_000_000, + ttftMs, + tpotMs, + isl: 1024, + osl: 128, + cancelled: false, + ...overrides, +}); + +describe('Agentic point request metric time series', () => { + before(() => { + cy.intercept('GET', '/api/v1/trace-histograms*', { body: {} }); + cy.intercept('GET', '/api/v1/trace-server-metrics*', { body: null }); + cy.intercept('GET', '/api/v1/benchmark-siblings*', { statusCode: 404 }); + cy.intercept('GET', '/api/v1/request-timeline*', { + body: { + version: 3, + startNs: 0, + endNs: 7_000_000_000, + durationS: 7, + requests: [ + timelineRequest(0, 100, 10), + timelineRequest(1, 200, 20), + timelineRequest(2, 400, 25), + timelineRequest(3, 800, 40), + timelineRequest(4, 1600, 80), + timelineRequest(5, 3200, 160, { phase: 'warmup' }), + timelineRequest(6, 6400, 320, { cancelled: true }), + ], + }, + }); + cy.visit('/inference/agentic/206885'); + }); + + it('renders rolling P75 interactivity and TTFT using profiling requests only', () => { + cy.get('[data-testid="interactivity-over-time-chart"]').within(() => { + cy.contains('h2', 'Interactivity over time').should('be.visible'); + cy.get('[data-testid="interactivity-percentile-toggle"]') + .find('[role="tab"][aria-selected="true"]') + .should('have.text', 'P75'); + cy.get('svg circle').should('have.length', 5); + cy.get('svg').should('contain.text', 'P75 (rolling 50 req)'); + cy.get('svg').should('contain.text', '1 / cumulative mean TPOT'); + cy.get('svg path[stroke="#ef4444"]').should('have.length', 1); + }); + + cy.get('[data-testid="ttft-over-time-chart"]').within(() => { + cy.contains('h2', 'TTFT over time').should('be.visible'); + cy.get('svg circle').should('have.length', 5); + cy.get('svg').should('contain.text', 'TTFT (s)'); + cy.get('svg').should('contain.text', 'Cumulative mean TTFT'); + cy.get('svg 
path[stroke="#ef4444"]').should('have.length', 1); + }); + }); + + it('switches each chart independently from P75 to P90', () => { + cy.get('[data-testid="interactivity-over-time-chart"]').within(() => { + cy.contains('svg', 'P75 (rolling 50 req)') + .find('path') + .first() + .invoke('attr', 'd') + .as('p75Path'); + cy.contains('button', 'P90').click(); + cy.get('[data-testid="interactivity-percentile-toggle"]') + .find('[role="tab"][aria-selected="true"]') + .should('have.text', 'P90'); + cy.contains('svg', 'P90 (rolling 50 req)') + .find('path') + .first() + .invoke('attr', 'd') + .then(function (p90Path) { + expect(p90Path).not.to.equal(this.p75Path); + }); + }); + + cy.get('[data-testid="ttft-over-time-chart"]').within(() => { + cy.get('[data-testid="ttft-percentile-toggle"]') + .find('[role="tab"][aria-selected="true"]') + .should('have.text', 'P75'); + cy.contains('button', 'P90').click(); + cy.get('svg').should('contain.text', 'P90 (rolling 50 req)'); + }); + }); +}); diff --git a/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts new file mode 100644 index 00000000..672675a3 --- /dev/null +++ b/packages/app/cypress/e2e/datasets-flamegraph-time.cy.ts @@ -0,0 +1,85 @@ +describe('Dataset conversation flamegraph timing', () => { + before(() => { + cy.intercept('GET', '/api/v1/datasets/test-dataset/conversations/conversation-1', { + body: { + conv_id: 'conversation-1', + models: ['model-a'], + num_turns: 2, + num_subagent_groups: 1, + total_in: 1000, + total_out: 100, + total_cached: 500, + structure: { + blockSize: 64, + totals: { + in: 1000, + out: 100, + cached: 500, + uncached: 500, + numTurns: 2, + numSubagentGroups: 1, + }, + nodes: [ + { + kind: 'turn', + turnIndex: 0, + startS: 0, + model: 'model-a', + in: 100, + out: 10, + cached: 0, + uncached: 100, + }, + { + kind: 'subagent', + label: 'Explore', + agentId: 'agent-1', + startS: 3661.2, + endS: 3782.6, + durationMs: 121_400, + in: 800, + 
out: 80, + cached: 500, + uncached: 300, + children: [ + { + kind: 'turn', + turnIndex: 1, + startS: 3661.2, + model: 'model-a', + in: 800, + out: 80, + cached: 500, + uncached: 300, + }, + ], + }, + { + kind: 'turn', + turnIndex: 2, + startS: 65.4, + model: 'model-a', + in: 100, + out: 10, + cached: 0, + uncached: 100, + }, + ], + }, + }, + }); + cy.visit('/datasets/test-dataset/conversations/conversation-1'); + }); + + it('shows turn offsets and a collapsed subagent time range', () => { + cy.get('[data-testid="flamegraph-time-t-0"]').should('have.text', '+00:00'); + cy.get('[data-testid="flamegraph-time-t-2"]').should('have.text', '+01:05'); + cy.get('[data-testid="flamegraph-time-g-1"]').should('have.text', '+1:01:01–1:03:03'); + cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('not.exist'); + }); + + it('shows subturn offsets when the subagent group is expanded', () => { + cy.contains('button', 'Explore').click(); + cy.get('[data-testid="flamegraph-time-g-1-c-0"]').should('have.text', '+1:01:01'); + }); +}); diff --git a/packages/app/src/components/datasets/conversation-view.tsx b/packages/app/src/components/datasets/conversation-view.tsx index d39b83d9..57aaa0c3 100644 --- a/packages/app/src/components/datasets/conversation-view.tsx +++ b/packages/app/src/components/datasets/conversation-view.tsx @@ -87,7 +87,8 @@ export function ConversationView({ slug, convId }: { slug: string; convId: strin

One bar per turn, scaled to the largest turn. Subagent groups are collapsed by default — click a group to expand it. Each bar splits input into cached prefix and uncached suffix, - plus generated output. + plus generated output. Timestamps are elapsed from conversation start; subagent headers + show their full active range.

/**
 * Format an offset from conversation start as a compact elapsed timestamp.
 *
 * The offset is rounded to the nearest whole second and rendered as `MM:SS`,
 * switching to `H:MM:SS` once it reaches one hour. Hours are neither
 * zero-padded nor wrapped at 24, so long conversations read e.g. `24:02:21`.
 * Negative and non-finite offsets (NaN, ±Infinity) are clamped to the
 * conversation origin and render as `00:00`.
 *
 * @param seconds - Elapsed seconds since the start of the conversation.
 * @returns The formatted timestamp, without a leading `+`.
 */
export function formatElapsedTime(seconds: number): string {
  // Without the finite check NaN/Infinity leak into the label ("NaN:NaN").
  const total = Number.isFinite(seconds) ? Math.max(0, Math.round(seconds)) : 0;
  const hours = Math.floor(total / 3600);
  const minutes = Math.floor((total % 3600) / 60);
  const secs = total % 60;
  const mm = String(minutes).padStart(2, '0');
  const ss = String(secs).padStart(2, '0');
  return hours > 0 ? `${hours}:${mm}:${ss}` : `${mm}:${ss}`;
}

/**
 * Build the `+offset` / `+start–end` label shown next to a flamegraph row.
 *
 * @param startS - Row start, in seconds from conversation start.
 * @param endS - Optional row end, in seconds from conversation start.
 * @returns `undefined` when there is no usable start; `+start` for an instant
 *   (missing, non-finite, or non-increasing end); otherwise `+start–end`.
 */
function timeLabel(startS?: number, endS?: number): string | undefined {
  if (startS === undefined || !Number.isFinite(startS)) return undefined;
  const start = formatElapsedTime(startS);
  if (endS === undefined || !Number.isFinite(endS) || endS <= startS) return `+${start}`;
  const end = formatElapsedTime(endS);
  // A range shorter than the display resolution rounds to the same second at
  // both ends; show it as an instant instead of a redundant "+00:01–00:01".
  return end === start ? `+${start}` : `+${start}–${end}`;
}
undefined, + timeLabel: timeLabel(node.startS), cached: node.cached, uncached: node.uncached, output: node.out, @@ -168,6 +188,7 @@ export function TraceFlamegraph({ sublabel: `${node.children.length} turn${node.children.length === 1 ? '' : 's'}${ node.durationMs ? ` · ${(node.durationMs / 1000).toFixed(0)}s` : '' }`, + timeLabel: timeLabel(node.startS, node.endS), cached: node.cached, uncached: node.uncached, output: node.out, @@ -183,6 +204,7 @@ export function TraceFlamegraph({ key: `g-${i}-c-${ci}`, label: `↳ subturn ${ci + 1}`, sublabel: child.model ?? undefined, + timeLabel: timeLabel(child.startS), cached: child.cached, uncached: child.uncached, output: child.out, @@ -291,6 +313,15 @@ export function TraceFlamegraph({ )}
+ {/* Offset from conversation start. Group rows span the full + subagent lifetime; leaf rows show their start instant. */} +
+ {row.timeLabel ?? '—'} +
+ {/* stacked bar — group headers render as a slim muted summary strip so they read as aggregates, not individual turns. */}
+ From start + + {tooltip.row.timeLabel ?? '—'} +
, document.body, diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index 4a076955..e24b7e6b 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -6,7 +6,7 @@ import { useState } from 'react'; import { ArrowLeft } from 'lucide-react'; import { useAgenticAggregates, type AgenticAggregateMap } from '@/hooks/api/use-agentic-aggregates'; -import { useRequestTimeline } from '@/hooks/api/use-request-timeline'; +import { useRequestTimeline, type RequestTimeline } from '@/hooks/api/use-request-timeline'; import { useTraceHistograms } from '@/hooks/api/use-trace-histograms'; import { useTraceServerMetrics, @@ -16,6 +16,7 @@ import { } from '@/hooks/api/use-trace-server-metrics'; import { useBenchmarkSiblings } from '@/hooks/api/use-benchmark-siblings'; import { SegmentedToggle, type SegmentedToggleOption } from '@/components/ui/segmented-toggle'; +import { track } from '@/lib/analytics'; import { AggregateChart, type AggregatePoint, type PercentileKey } from './aggregate-chart'; import { Distribution } from './distribution'; @@ -30,8 +31,11 @@ import { cumulativeUniqueInputTokens, inflightUniqueTokens, rollingAverage, + rollingRequestMetric, sumSeries, timeRollingAverage, + type RequestMetric, + type RequestPercentile, } from './time-series-chart'; interface Props { @@ -114,6 +118,83 @@ const VIEW_OPTIONS: SegmentedToggleOption[] = [ { value: 'aggregates', label: 'Aggregates across configs', testId: 'detail-view-aggregates' }, ]; +const REQUEST_PERCENTILE_OPTIONS: SegmentedToggleOption[] = [ + { value: 'p75', label: 'P75' }, + { value: 'p90', label: 'P90' }, +]; + +// Unofficial-run overlays cannot open this persisted point-detail route: they +// have no benchmark_results id or stored request timeline. 
These charts are +// therefore intentionally limited to DB-backed agentic points. +function RequestMetricOverTime({ + title, + metric, + timeline, + isLoading, +}: { + title: string; + metric: RequestMetric; + timeline: RequestTimeline | null | undefined; + isLoading: boolean; +}) { + const [percentile, setPercentile] = useState('p75'); + const result = timeline ? rollingRequestMetric(timeline.requests, metric, percentile, 50) : null; + const metricLabel = metric === 'ttft' ? 'TTFT' : 'Interactivity'; + const color = metric === 'ttft' ? '#f59e0b' : '#06b6d4'; + + const controls = ( + { + setPercentile(value); + track('inference_agentic_percentile_changed', { metric, percentile: value }); + }} + ariaLabel={`${metricLabel} percentile`} + testId={`${metric}-percentile-toggle`} + /> + ); + + return ( + { + const size = expanded ? CHART_SIZES.expanded : CHART_SIZES.inline; + if (!timeline) return isLoading ? : ; + return ( + `${value < 10 ? value.toFixed(1) : value.toFixed(0)}s` + : (value) => `${value.toFixed(0)}` + } + yAxisLabel={metric === 'ttft' ? 'TTFT (s)' : 'Interactivity (tok/s/user)'} + {...size} + /> + ); + }} + /> + ); +} + /** Bundle per-percentile values for one sibling into the shape AggregateChart wants. 
*/ function toAggPoint( sibling: { id: number; label: string }, @@ -254,6 +335,20 @@ export function AgenticPointDetail({ id }: Props) { }} /> + + + + { diff --git a/packages/app/src/components/inference/agentic-point/expandable-chart.tsx b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx index 7c8e4538..cb5987ec 100644 --- a/packages/app/src/components/inference/agentic-point/expandable-chart.tsx +++ b/packages/app/src/components/inference/agentic-point/expandable-chart.tsx @@ -13,30 +13,40 @@ import { Dialog, DialogContent, DialogHeader, DialogTitle } from '@/components/u export function ExpandableChart({ title, render, + controls, + testId, }: { title: string; render: (expanded: boolean) => ReactNode; + controls?: ReactNode; + testId?: string; }) { const [open, setOpen] = useState(false); return ( -
+

{title}

- +
+ {controls} + +
{render(false)} - {title} +
+ {title} + {controls} +
{render(true)}
diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts index 64deace4..926772db 100644 --- a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts @@ -1,6 +1,77 @@ import { describe, expect, it } from 'vitest'; -import { cumulativeUniqueInputTokens } from './time-series-chart'; +import type { RequestRecord } from '@/hooks/api/use-request-timeline'; + +import { cumulativeUniqueInputTokens, rollingRequestMetric } from './time-series-chart'; + +const request = ( + endS: number, + ttftMs: number | null, + tpotMs: number | null, + overrides: Partial = {}, +): RequestRecord => ({ + cid: 'conversation', + ti: endS, + wid: 'worker', + ad: 0, + phase: 'profiling', + credit: 0, + start: 0, + ack: null, + end: endS * 1e9, + ttftMs, + tpotMs, + isl: 100, + osl: 10, + cancelled: false, + ...overrides, +}); + +describe('rollingRequestMetric', () => { + it('computes a trailing P75 TTFT over the requested window', () => { + const result = rollingRequestMetric( + [request(1, 100, 10), request(2, 200, 20), request(3, 300, 30), request(4, 400, 40)], + 'ttft', + 'p75', + 3, + ); + + expect(result.raw.at(-1)).toEqual({ t: 4, value: 0.4 }); + expect(result.trend.map((point) => point.value)).toEqual([0.1, 0.175, 0.25, 0.35]); + expect(result.cumulative.map((point) => point.value)).toEqual([0.1, 0.15, 0.2, 0.25]); + }); + + it('inverts the rolling TPOT percentile for interactivity', () => { + const result = rollingRequestMetric( + [request(1, 100, 10), request(2, 200, 20), request(3, 300, 30)], + 'interactivity', + 'p90', + 3, + ); + + expect(result.raw.map((point) => point.value)).toEqual([100, 50, 1000 / 30]); + expect(result.trend.at(-1)?.value).toBeCloseTo(1000 / 28, 8); + expect(result.cumulative.map((point) => point.value)).toEqual([100, 1000 / 15, 50]); + }); + 
+ it('drops warmup, cancelled, missing, and non-positive samples', () => { + const result = rollingRequestMetric( + [ + request(1, 100, 10), + request(2, 200, 20, { phase: 'warmup' }), + request(3, 300, 30, { cancelled: true }), + request(4, null, null), + request(5, 0, 0), + ], + 'ttft', + 'p90', + ); + + expect(result.raw).toEqual([{ t: 1, value: 0.1 }]); + expect(result.trend).toEqual([{ t: 1, value: 0.1 }]); + expect(result.cumulative).toEqual([{ t: 1, value: 0.1 }]); + }); +}); describe('cumulativeUniqueInputTokens', () => { it('cumulates only the freshly-computed buckets, ignoring cache tiers', () => { diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx index 6b00b1e6..749a17e4 100644 --- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx @@ -3,6 +3,7 @@ import { useMemo } from 'react'; import type { TimeSeriesPoint } from '@/hooks/api/use-trace-server-metrics'; +import type { RequestRecord } from '@/hooks/api/use-request-timeline'; import { ChartHover, type HoverItem } from './chart-hover'; @@ -32,6 +33,65 @@ interface TimeSeriesChartProps { height?: number; } +export type RequestMetric = 'interactivity' | 'ttft'; +export type RequestPercentile = 'p75' | 'p90'; + +/** Linear-interpolated percentile (matches numpy's default method). */ +function quantile(sortedAsc: number[], q: number): number { + if (sortedAsc.length === 1) return sortedAsc[0]!; + const pos = (sortedAsc.length - 1) * q; + const lo = Math.floor(pos); + const hi = Math.ceil(pos); + if (lo === hi) return sortedAsc[lo]!; + return sortedAsc[lo]! + (sortedAsc[hi]! - sortedAsc[lo]!) * (pos - lo); +} + +/** + * Build raw request samples plus a trailing request-count percentile. + * + * The percentile is computed in latency space. 
Interactivity then inverts + * the selected TPOT percentile, matching the aggregate chart convention: + * P90 interactivity = 1 / P90 TPOT (a conservative tail-latency view). + */ +export function rollingRequestMetric( + requests: readonly RequestRecord[], + metric: RequestMetric, + percentile: RequestPercentile, + windowSize = 50, +): { raw: TimeSeriesPoint[]; trend: TimeSeriesPoint[]; cumulative: TimeSeriesPoint[] } { + const q = percentile === 'p75' ? 0.75 : 0.9; + const samples = requests + .filter((request) => request.phase === 'profiling' && !request.cancelled) + .flatMap((request) => { + const latencyMs = metric === 'ttft' ? request.ttftMs : request.tpotMs; + if (latencyMs === null || !Number.isFinite(latencyMs) || latencyMs <= 0) return []; + return [{ t: request.end / 1e9, latencyMs }]; + }) + .toSorted((a, b) => a.t - b.t); + + const raw = samples.map(({ t, latencyMs }) => ({ + t, + value: metric === 'ttft' ? latencyMs / 1000 : 1000 / latencyMs, + })); + const trend = samples.map(({ t }, i) => { + const start = Math.max(0, i - Math.max(1, windowSize) + 1); + const sorted = samples + .slice(start, i + 1) + .map((sample) => sample.latencyMs) + .toSorted((a, b) => a - b); + const latencyMs = quantile(sorted, q); + return { t, value: metric === 'ttft' ? latencyMs / 1000 : 1000 / latencyMs }; + }); + let latencySumMs = 0; + const cumulative = samples.map(({ t, latencyMs }, i) => { + latencySumMs += latencyMs; + const meanLatencyMs = latencySumMs / (i + 1); + return { t, value: metric === 'ttft' ? meanLatencyMs / 1000 : 1000 / meanLatencyMs }; + }); + + return { raw, trend, cumulative }; +} + /** * Time-weighted rolling average over a `windowS`-second trailing window. 
* Treats the input as a step function (value held constant between diff --git a/packages/app/src/hooks/api/use-request-timeline.ts b/packages/app/src/hooks/api/use-request-timeline.ts index d3ceaab8..094d2230 100644 --- a/packages/app/src/hooks/api/use-request-timeline.ts +++ b/packages/app/src/hooks/api/use-request-timeline.ts @@ -20,6 +20,8 @@ export interface RequestRecord { /** ns offset from timeline.startNs. Last byte received. */ end: number; ttftMs: number | null; + /** Time per output token in ms. */ + tpotMs: number | null; isl: number | null; osl: number | null; cancelled: boolean; diff --git a/packages/db/src/etl/compute-request-timeline.test.ts b/packages/db/src/etl/compute-request-timeline.test.ts index 64512aca..61e69fe8 100644 --- a/packages/db/src/etl/compute-request-timeline.test.ts +++ b/packages/db/src/etl/compute-request-timeline.test.ts @@ -15,6 +15,8 @@ interface SyntheticRequest { end: number; ack?: number | null; ttftMs?: number | null; + tpotMs?: number | null; + tpotKey?: 'inter_token_latency' | 'time_per_output_token'; isl?: number | null; osl?: number | null; cancelled?: boolean; @@ -37,6 +39,8 @@ function makeBlob(requests: SyntheticRequest[]) { }, metrics: { time_to_first_token: r.ttftMs === null ? null : { value: r.ttftMs ?? 50, unit: 'ms' }, + [r.tpotKey ?? 'inter_token_latency']: + r.tpotMs === null ? null : { value: r.tpotMs ?? 10, unit: 'ms' }, input_sequence_length: { value: r.isl ?? 100, unit: 'tokens' }, output_sequence_length: { value: r.osl ?? 
10, unit: 'tokens' }, }, @@ -115,7 +119,7 @@ describe('computeRequestTimeline', () => { expect(r.phase).toBe('profiling'); }); - it('preserves the cancelled flag and TTFT/ISL/OSL metrics', () => { + it('preserves the cancelled flag and TTFT/TPOT/ISL/OSL metrics', () => { const tl = computeRequestTimeline( makeBlob([ { @@ -125,6 +129,7 @@ describe('computeRequestTimeline', () => { start: 10, end: 100, ttftMs: 25.5, + tpotMs: 12.5, isl: 1024, osl: 256, cancelled: true, @@ -134,10 +139,28 @@ describe('computeRequestTimeline', () => { const r = tl?.requests[0]!; expect(r.cancelled).toBe(true); expect(r.ttftMs).toBeCloseTo(25.5, 6); + expect(r.tpotMs).toBeCloseTo(12.5, 6); expect(r.isl).toBe(1024); expect(r.osl).toBe(256); }); + it('accepts time_per_output_token as a TPOT alias', () => { + const tl = computeRequestTimeline( + makeBlob([ + { + cid: 'a', + ti: 0, + credit: 0, + start: 10, + end: 100, + tpotMs: 8.25, + tpotKey: 'time_per_output_token', + }, + ]), + ); + expect(tl?.requests[0]?.tpotMs).toBeCloseTo(8.25, 6); + }); + it('skips records missing both credit_issued_ns and request_start_ns', () => { // Build a record with only request_end_ns — the helper rejects it. const broken = gzipSync( diff --git a/packages/db/src/etl/compute-request-timeline.ts b/packages/db/src/etl/compute-request-timeline.ts index a1134f7a..707e8c54 100644 --- a/packages/db/src/etl/compute-request-timeline.ts +++ b/packages/db/src/etl/compute-request-timeline.ts @@ -14,7 +14,7 @@ import { gunzipSync } from 'node:zlib'; /** Bump when the extraction algorithm changes — backfill recomputes anything older. */ -export const REQUEST_TIMELINE_VERSION = 1; +export const REQUEST_TIMELINE_VERSION = 3; export interface RequestRecord { /** Conversation id (groups turns of one agent session). */ @@ -37,6 +37,8 @@ export interface RequestRecord { end: number; /** Time-to-first-token in ms. */ ttftMs: number | null; + /** Time per output token in ms. 
*/ + tpotMs: number | null; /** Input sequence length (tokens). */ isl: number | null; /** Output sequence length (tokens). */ @@ -76,6 +78,8 @@ interface RawRecord { metadata?: RawMetadata; metrics?: { time_to_first_token?: RawMetricValue | number; + time_per_output_token?: RawMetricValue | number; + inter_token_latency?: RawMetricValue | number; input_sequence_length?: RawMetricValue | number; output_sequence_length?: RawMetricValue | number; }; @@ -108,6 +112,7 @@ export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | n const raw: { meta: RawMetadata; ttftMs: number | null; + tpotMs: number | null; isl: number | null; osl: number | null; }[] = []; @@ -135,6 +140,10 @@ export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | n raw.push({ meta, ttftMs: readNum(rec.metrics?.time_to_first_token) ?? null, + tpotMs: + readNum(rec.metrics?.time_per_output_token) ?? + readNum(rec.metrics?.inter_token_latency) ?? + null, isl: readNum(rec.metrics?.input_sequence_length) ?? null, osl: readNum(rec.metrics?.output_sequence_length) ?? 
null, }); @@ -163,6 +172,7 @@ export function computeRequestTimeline(blob: Buffer | null): RequestTimeline | n ack, end, ttftMs: r.ttftMs, + tpotMs: r.tpotMs, isl: r.isl, osl: r.osl, cancelled: m.was_cancelled === true, diff --git a/packages/db/src/etl/weka-structure.test.ts b/packages/db/src/etl/weka-structure.test.ts index 95bfef38..5287b682 100644 --- a/packages/db/src/etl/weka-structure.test.ts +++ b/packages/db/src/etl/weka-structure.test.ts @@ -86,17 +86,18 @@ describe('buildConversationStructure', () => { id: 'c4', block_size: 64, requests: [ - { type: 'n', model: 'main', in: 64, out: 10, hash_ids: [1] }, + { type: 'n', model: 'main', t: 0, in: 64, out: 10, hash_ids: [1] }, { type: 'subagent', agent_id: 'a1', subagent_type: 'Explore', + t: 12.5, duration_ms: 1234, requests: [ // sees parent block 1 (snapshot at spawn) → 1 block cached - { type: 'n', model: 'sub', in: 128, out: 7, hash_ids: [1, 5] }, + { type: 'n', model: 'sub', t: 12.5, in: 128, out: 7, hash_ids: [1, 5] }, // now block 5 is also seen within the subagent → 2 cached - { type: 'n', model: 'sub', in: 128, out: 3, hash_ids: [1, 5] }, + { type: 'n', model: 'sub', t: 13.1, in: 128, out: 3, hash_ids: [1, 5] }, ], }, // Parent turn after subagent: block 5 must NOT be cached (subagent @@ -113,7 +114,10 @@ describe('buildConversationStructure', () => { expect(sub.label).toBe('Explore'); expect(sub.agentId).toBe('a1'); expect(sub.durationMs).toBe(1234); + expect(sub.startS).toBe(12.5); + expect(sub.endS).toBeCloseTo(13.734, 6); expect(sub.children).toHaveLength(2); + expect(sub.children.map((child) => child.startS)).toEqual([12.5, 13.1]); expect(sub.children[0].cached).toBe(64); // block 1 from parent snapshot expect(sub.children[1].cached).toBe(128); // blocks 1 & 5 now seen in child expect(sub.in).toBe(256); @@ -132,6 +136,24 @@ describe('buildConversationStructure', () => { expect(s.blockSize).toBe(64); expect((s.nodes[0] as SubagentNode).label).toBe('Subagent'); }); + + it('derives a subagent time 
range from child timings when group timing is absent', () => { + const conv: RawWekaConversation = { + id: 'c6', + requests: [ + { + type: 'subagent', + requests: [ + { type: 'n', t: 5, api_time: 2.5, in: 10, out: 1 }, + { type: 'n', t: 9, api_time: 3, in: 10, out: 1 }, + ], + }, + ], + }; + const sub = buildConversationStructure(conv).nodes[0] as SubagentNode; + expect(sub.startS).toBe(5); + expect(sub.endS).toBe(12); + }); }); describe('histograms', () => { diff --git a/packages/db/src/etl/weka-structure.ts b/packages/db/src/etl/weka-structure.ts index e4113c68..33e222b4 100644 --- a/packages/db/src/etl/weka-structure.ts +++ b/packages/db/src/etl/weka-structure.ts @@ -48,6 +48,8 @@ export interface RawWekaConversation { export interface TurnNode { kind: 'turn'; turnIndex: number; + /** Seconds from the start of the conversation. */ + startS?: number; model?: string; in: number; out: number; @@ -61,6 +63,10 @@ export interface SubagentNode { kind: 'subagent'; label: string; agentId?: string; + /** Seconds from the start of the conversation. */ + startS?: number; + /** Seconds from the start of the conversation. */ + endS?: number; durationMs?: number; in: number; out: number; @@ -130,6 +136,35 @@ function subagentLabel(s: RawWekaSubagent): string { return base && base.length > 0 ? base : 'Subagent'; } +function finiteTime(value: number | undefined): number | undefined { + return typeof value === 'number' && Number.isFinite(value) && value >= 0 ? value : undefined; +} + +function subagentTimeRange(entry: RawWekaSubagent): { startS?: number; endS?: number } { + const children = entry.requests ?? []; + const childStarts = children + .map((child) => finiteTime(child.t)) + .filter((value): value is number => value !== undefined); + const startS = + finiteTime(entry.t) ?? (childStarts.length > 0 ? 
Math.min(...childStarts) : undefined); + const durationMs = finiteTime(entry.duration_ms); + if (startS !== undefined && durationMs !== undefined) { + return { startS, endS: startS + durationMs / 1000 }; + } + + const childEnds = children + .map((child) => { + const childStart = finiteTime(child.t); + if (childStart === undefined) return undefined; + return childStart + (finiteTime(child.api_time) ?? 0); + }) + .filter((value): value is number => value !== undefined); + return { + startS, + endS: childEnds.length > 0 ? Math.max(...childEnds) : startS, + }; +} + /** * Build the flamegraph structure for one conversation. Main turns share a single * accumulating prefix-cache `seen` set; each subagent group runs against a @@ -153,6 +188,7 @@ export function buildConversationStructure( for (const entry of conv.requests ?? []) { if (isSubagent(entry)) { + const { startS, endS } = subagentTimeRange(entry); const childSeen = new Set(seen); // snapshot at spawn; not merged back const children: TurnNode[] = []; let gin = 0; @@ -165,6 +201,7 @@ export function buildConversationStructure( children.push({ kind: 'turn', turnIndex: turnIndex++, + startS: finiteTime(inner.t), model: inner.model, in: split.in, out, @@ -180,6 +217,8 @@ export function buildConversationStructure( kind: 'subagent', label: subagentLabel(entry), agentId: entry.agent_id, + startS, + endS, durationMs: entry.duration_ms, in: gin, out: gout, @@ -198,6 +237,7 @@ export function buildConversationStructure( nodes.push({ kind: 'turn', turnIndex: turnIndex++, + startS: finiteTime(entry.t), model: entry.model, in: split.in, out, From 13471d75072d574d42be008a462dbfce9467c95d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 23 Jun 2026 13:44:55 -0500 Subject: [PATCH 93/96] add dataset percentile distributions --- .../component/distribution-card.cy.tsx | 41 ++++++++- .../cypress/e2e/datasets-distributions.cy.ts | 90 +++++++++++++++++++ .../components/datasets/dataset-detail.tsx | 6 ++ 
.../components/datasets/distribution-card.tsx | 23 +++-- packages/app/src/hooks/api/use-datasets.ts | 5 ++ packages/db/src/etl/weka-structure.test.ts | 18 ++++ packages/db/src/etl/weka-structure.ts | 46 ++++++++++ packages/db/src/ingest-weka-dataset.ts | 50 ++++------- 8 files changed, 235 insertions(+), 44 deletions(-) create mode 100644 packages/app/cypress/e2e/datasets-distributions.cy.ts diff --git a/packages/app/cypress/component/distribution-card.cy.tsx b/packages/app/cypress/component/distribution-card.cy.tsx index fb7e5461..511505b9 100644 --- a/packages/app/cypress/component/distribution-card.cy.tsx +++ b/packages/app/cypress/component/distribution-card.cy.tsx @@ -8,7 +8,16 @@ const distribution: Distribution = { { x0: 200, x1: 300, count: 12 }, { x0: 300, x1: 400, count: 3 }, ], - stats: { count: 40, min: 10, max: 390, mean: 180, median: 175, p90: 320 }, + stats: { + count: 40, + min: 10, + max: 390, + mean: 180, + median: 175, + p75: 250, + p90: 320, + p95: 360, + }, }; describe('DistributionCard', () => { @@ -18,8 +27,13 @@ describe('DistributionCard', () => { ); cy.contains('Input tokens per turn').should('be.visible'); cy.contains('n=40').should('be.visible'); - cy.contains('median 175').should('be.visible'); + cy.contains('p50 175').should('be.visible'); + cy.contains('p75 250').should('be.visible'); cy.contains('p90 320').should('be.visible'); + cy.contains('p95 360').should('be.visible'); + cy.get( + 'line[stroke="#3b82f6"], line[stroke="#22c55e"], line[stroke="#f59e0b"], line[stroke="#ef4444"]', + ).should('have.length', 8); // One filled bar rect per bin (ChartHover may add a transparent overlay rect). 
cy.get('rect[class*="fill-primary"]').should('have.length', distribution.bins.length); }); @@ -42,4 +56,27 @@ describe('DistributionCard', () => { ); cy.contains('log scale').should('be.visible'); }); + + it('renders older v1 stats without unavailable percentile guides', () => { + cy.mount( + , + ); + cy.contains('p50 175').should('be.visible'); + cy.contains('p90 320').should('be.visible'); + cy.contains('NaN').should('not.exist'); + }); }); diff --git a/packages/app/cypress/e2e/datasets-distributions.cy.ts b/packages/app/cypress/e2e/datasets-distributions.cy.ts new file mode 100644 index 00000000..7edda341 --- /dev/null +++ b/packages/app/cypress/e2e/datasets-distributions.cy.ts @@ -0,0 +1,90 @@ +const distribution = (values: { + median: number; + p75: number; + p90: number; + p95: number; + max: number; +}) => ({ + bins: [ + { x0: 0, x1: 10, count: 5 }, + { x0: 10, x1: 100, count: 15 }, + ], + stats: { + count: 20, + min: 0, + mean: 40, + ...values, + }, +}); + +describe('Dataset distribution percentiles', () => { + before(() => { + cy.intercept('GET', '/api/v1/datasets/test-dataset', { + body: { + id: 'test-dataset', + slug: 'test-dataset', + label: 'Test dataset', + variant: 'full', + description: null, + hf_url: null, + license: 'apache-2.0', + conversation_count: 1, + summary: { + mainTurns: 20, + subagentGroups: 0, + subagentTurns: 0, + cachedPct: 0.5, + totalIn: 1000, + totalOut: 200, + }, + chart_data: { + version: 2, + inputTokensPerTurn: distribution({ + median: 100, + p75: 200, + p90: 300, + p95: 400, + max: 500, + }), + outputTokensPerTurn: distribution({ + median: 10, + p75: 20, + p90: 30, + p95: 40, + max: 50, + }), + uncachedInputTokensPerTurn: distribution({ + median: 0, + p75: 64, + p90: 128, + p95: 256, + max: 512, + }), + }, + ingested_at: '2026-06-23T00:00:00Z', + }, + }); + cy.intercept('GET', '/api/v1/datasets/test-dataset/conversations*', { + body: { total: 0, items: [] }, + }); + cy.visit('/datasets/test-dataset'); + }); + + it('shows 
P50/P75/P90/P95 for ISL, OSL, and uncached input', () => { + const expected = [ + ['Input tokens per turn', ['p50 100', 'p75 200', 'p90 300', 'p95 400']], + ['Output tokens per turn', ['p50 10', 'p75 20', 'p90 30', 'p95 40']], + ['Uncached input tokens per request', ['p50 0', 'p75 64', 'p90 128', 'p95 256']], + ] as const; + + for (const [title, percentiles] of expected) { + cy.contains('[data-slot="card"]', title).within(() => { + for (const percentile of percentiles) cy.contains(percentile).should('be.visible'); + cy.get('svg line[stroke="#3b82f6"]').should('exist'); + cy.get('svg line[stroke="#22c55e"]').should('exist'); + cy.get('svg line[stroke="#f59e0b"]').should('exist'); + cy.get('svg line[stroke="#ef4444"]').should('exist'); + }); + } + }); +}); diff --git a/packages/app/src/components/datasets/dataset-detail.tsx b/packages/app/src/components/datasets/dataset-detail.tsx index 9410a505..ac8b2de5 100644 --- a/packages/app/src/components/datasets/dataset-detail.tsx +++ b/packages/app/src/components/datasets/dataset-detail.tsx @@ -145,6 +145,12 @@ export function DatasetDetail({ slug }: { slug: string }) { scale="log" distribution={cd.outputTokensPerTurn} /> + {subtitle}
} {stats && (
- n={stats.count.toLocaleString()} · median {formatValue(stats.median)} · p90{' '} - {formatValue(stats.p90)} · max {formatValue(stats.max)} {unit} + n={stats.count.toLocaleString()} · p50 {formatValue(stats.median)} + {typeof stats.p75 === 'number' && <> · p75 {formatValue(stats.p75)}} · p90{' '} + {formatValue(stats.p90)} + {typeof stats.p95 === 'number' && <> · p95 {formatValue(stats.p95)}} · max{' '} + {formatValue(stats.max)} {unit}
)}
diff --git a/packages/app/src/hooks/api/use-datasets.ts b/packages/app/src/hooks/api/use-datasets.ts index 3ce61a85..96b0f59f 100644 --- a/packages/app/src/hooks/api/use-datasets.ts +++ b/packages/app/src/hooks/api/use-datasets.ts @@ -46,7 +46,11 @@ export interface DistributionStats { max: number; mean: number; median: number; + /** Added in chart_data v2. */ + p75?: number; p90: number; + /** Added in chart_data v2. */ + p95?: number; } export interface Distribution { @@ -57,6 +61,7 @@ export interface Distribution { export interface DatasetChartData { version?: number; inputTokensPerTurn?: Distribution; + uncachedInputTokensPerTurn?: Distribution; outputTokensPerTurn?: Distribution; turnsPerConversation?: Distribution; subagentGroupsPerConversation?: Distribution; diff --git a/packages/db/src/etl/weka-structure.test.ts b/packages/db/src/etl/weka-structure.test.ts index 5287b682..4debf1ae 100644 --- a/packages/db/src/etl/weka-structure.test.ts +++ b/packages/db/src/etl/weka-structure.test.ts @@ -4,6 +4,8 @@ import { buildConversationStructure, linearHistogram, logHistogram, + logHistogramWithZero, + summarizeValues, type RawWekaConversation, type SubagentNode, type TurnNode, @@ -177,4 +179,20 @@ describe('histograms', () => { expect(linearHistogram([])).toEqual([]); expect(logHistogram([])).toEqual([]); }); + + it('preserves zero-valued samples in a dedicated log histogram bin', () => { + const bins = logHistogramWithZero([0, 0, 1, 10, 100], 4); + expect(bins[0]).toEqual({ x0: 0, x1: 1, count: 2 }); + expect(bins.reduce((total, bin) => total + bin.count, 0)).toBe(5); + }); +}); + +describe('summarizeValues', () => { + it('computes the same linearly-interpolated percentile set as request distributions', () => { + const summary = summarizeValues(Array.from({ length: 100 }, (_, i) => i + 1)); + expect(summary.median).toBeCloseTo(50.5, 6); + expect(summary.p75).toBeCloseTo(75.25, 6); + expect(summary.p90).toBeCloseTo(90.1, 6); + expect(summary.p95).toBeCloseTo(95.05, 
6); + }); }); diff --git a/packages/db/src/etl/weka-structure.ts b/packages/db/src/etl/weka-structure.ts index 33e222b4..ac7a6eab 100644 --- a/packages/db/src/etl/weka-structure.ts +++ b/packages/db/src/etl/weka-structure.ts @@ -274,6 +274,42 @@ export interface HistogramBin { count: number; } +export interface NumberSummary { + count: number; + min: number; + max: number; + mean: number; + median: number; + p75: number; + p90: number; + p95: number; +} + +/** Distribution summary with linear-interpolated percentiles. */ +export function summarizeValues(values: readonly number[]): NumberSummary { + if (values.length === 0) { + return { count: 0, min: 0, max: 0, mean: 0, median: 0, p75: 0, p90: 0, p95: 0 }; + } + const sorted = [...values].toSorted((a, b) => a - b); + const quantile = (q: number): number => { + const pos = (sorted.length - 1) * q; + const lo = Math.floor(pos); + const hi = Math.ceil(pos); + if (lo === hi) return sorted[lo]!; + return sorted[lo]! + (sorted[hi]! - sorted[lo]!) * (pos - lo); + }; + return { + count: sorted.length, + min: sorted[0]!, + max: sorted.at(-1)!, + mean: sorted.reduce((sum, value) => sum + value, 0) / sorted.length, + median: quantile(0.5), + p75: quantile(0.75), + p90: quantile(0.9), + p95: quantile(0.95), + }; +} + /** Linear-width histogram over [0, max]. Empty input → []. */ export function linearHistogram(values: readonly number[], bins = 40): HistogramBin[] { if (values.length === 0) return []; @@ -313,3 +349,13 @@ export function logHistogram(values: readonly number[], bins = 40): HistogramBin } return out; } + +/** Log-width histogram that preserves zero as a dedicated first bin. 
*/ +export function logHistogramWithZero(values: readonly number[], bins = 40): HistogramBin[] { + const zeroCount = values.filter((value) => value === 0).length; + const positive = values.filter((value) => value > 0); + if (zeroCount === 0) return logHistogram(positive, bins); + if (positive.length === 0) return [{ x0: 0, x1: 1, count: zeroCount }]; + const positiveBins = logHistogram(positive, Math.max(1, bins - 1)); + return [{ x0: 0, x1: positiveBins[0]?.x0 ?? 1, count: zeroCount }, ...positiveBins]; +} diff --git a/packages/db/src/ingest-weka-dataset.ts b/packages/db/src/ingest-weka-dataset.ts index 22069419..e00471d7 100644 --- a/packages/db/src/ingest-weka-dataset.ts +++ b/packages/db/src/ingest-weka-dataset.ts @@ -24,6 +24,8 @@ import { buildConversationStructure, linearHistogram, logHistogram, + logHistogramWithZero, + summarizeValues, type ConversationStructure, type RawWekaConversation, type TurnNode, @@ -140,6 +142,7 @@ async function* iterRows( interface Accumulator { inputPerTurn: number[]; // effective input tokens, every turn (incl. subagent children) + uncachedInputPerTurn: number[]; outputPerTurn: number[]; cachedFractionPerTurn: number[]; // cached/in, for turns with in>0 turnsPerConv: number[]; // main (top-level) turns @@ -157,6 +160,7 @@ interface Accumulator { function newAccumulator(): Accumulator { return { inputPerTurn: [], + uncachedInputPerTurn: [], outputPerTurn: [], cachedFractionPerTurn: [], turnsPerConv: [], @@ -174,6 +178,7 @@ function newAccumulator(): Accumulator { function recordTurn(acc: Accumulator, t: TurnNode): void { acc.inputPerTurn.push(t.in); + acc.uncachedInputPerTurn.push(t.uncached); acc.outputPerTurn.push(t.out); if (t.in > 0) acc.cachedFractionPerTurn.push(t.cached / t.in); if (t.model) acc.modelCounts[t.model] = (acc.modelCounts[t.model] ?? 
0) + 1; @@ -198,57 +203,32 @@ function accumulate(acc: Accumulator, s: ConversationStructure): void { } } -interface NumberSummary { - count: number; - min: number; - max: number; - mean: number; - median: number; - p90: number; -} - -function summarize(values: number[]): NumberSummary { - if (values.length === 0) { - return { count: 0, min: 0, max: 0, mean: 0, median: 0, p90: 0 }; - } - const sorted = [...values].toSorted((a, b) => a - b); - const n = sorted.length; - // Quantile by position; q(0)=min, q(1)=max — avoids array-tail indexing that - // the linter rewrites to `.at(-1)` (which widens the type to `| undefined`). - const q = (p: number) => sorted[Math.min(n - 1, Math.max(0, Math.floor(p * (n - 1))))]; - const sum = sorted.reduce((a, b) => a + b, 0); - return { - count: n, - min: q(0), - max: q(1), - mean: sum / n, - median: q(0.5), - p90: q(0.9), - }; -} - function buildChartData(acc: Accumulator) { return { - version: 1, + version: 2, inputTokensPerTurn: { bins: logHistogram(acc.inputPerTurn), - stats: summarize(acc.inputPerTurn), + stats: summarizeValues(acc.inputPerTurn), + }, + uncachedInputTokensPerTurn: { + bins: logHistogramWithZero(acc.uncachedInputPerTurn), + stats: summarizeValues(acc.uncachedInputPerTurn), }, outputTokensPerTurn: { bins: logHistogram(acc.outputPerTurn), - stats: summarize(acc.outputPerTurn), + stats: summarizeValues(acc.outputPerTurn), }, turnsPerConversation: { bins: linearHistogram(acc.turnsPerConv), - stats: summarize(acc.turnsPerConv), + stats: summarizeValues(acc.turnsPerConv), }, subagentGroupsPerConversation: { bins: linearHistogram(acc.subagentGroupsPerConv), - stats: summarize(acc.subagentGroupsPerConv), + stats: summarizeValues(acc.subagentGroupsPerConv), }, cachedFractionPerTurn: { bins: linearHistogram(acc.cachedFractionPerTurn, 20), - stats: summarize(acc.cachedFractionPerTurn), + stats: summarizeValues(acc.cachedFractionPerTurn), }, }; } From 8bfe66408d6b8514031e47af1b94ede19c369d97 Mon Sep 17 00:00:00 2001 From: 
Cam Quilici Date: Tue, 23 Jun 2026 16:10:02 -0500 Subject: [PATCH 94/96] use cumulative percentiles for agentic charts --- .../e2e/agentic-point-time-series.cy.ts | 34 ++++++++++--------- .../agentic-point/agentic-point-detail.tsx | 7 ++-- .../agentic-point/time-series-chart.test.ts | 4 +-- .../agentic-point/time-series-chart.tsx | 20 ++++++++--- 4 files changed, 40 insertions(+), 25 deletions(-) diff --git a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts index b0cfb60d..db59dda2 100644 --- a/packages/app/cypress/e2e/agentic-point-time-series.cy.ts +++ b/packages/app/cypress/e2e/agentic-point-time-series.cy.ts @@ -46,15 +46,15 @@ describe('Agentic point request metric time series', () => { cy.visit('/inference/agentic/206885'); }); - it('renders rolling P75 interactivity and TTFT using profiling requests only', () => { + it('renders rolling P90 interactivity and TTFT by default using profiling requests only', () => { cy.get('[data-testid="interactivity-over-time-chart"]').within(() => { cy.contains('h2', 'Interactivity over time').should('be.visible'); cy.get('[data-testid="interactivity-percentile-toggle"]') .find('[role="tab"][aria-selected="true"]') - .should('have.text', 'P75'); + .should('have.text', 'P90'); cy.get('svg circle').should('have.length', 5); - cy.get('svg').should('contain.text', 'P75 (rolling 50 req)'); - cy.get('svg').should('contain.text', '1 / cumulative mean TPOT'); + cy.get('svg').should('contain.text', 'P90 (rolling 50 req)'); + cy.get('svg').should('contain.text', '1 / cumulative P90 TPOT'); cy.get('svg path[stroke="#ef4444"]').should('have.length', 1); }); @@ -62,37 +62,39 @@ describe('Agentic point request metric time series', () => { cy.contains('h2', 'TTFT over time').should('be.visible'); cy.get('svg circle').should('have.length', 5); cy.get('svg').should('contain.text', 'TTFT (s)'); - cy.get('svg').should('contain.text', 'Cumulative mean TTFT'); + 
cy.get('svg').should('contain.text', 'Cumulative P90 TTFT'); cy.get('svg path[stroke="#ef4444"]').should('have.length', 1); }); }); - it('switches each chart independently from P75 to P90', () => { + it('switches each chart independently from P90 to P75', () => { cy.get('[data-testid="interactivity-over-time-chart"]').within(() => { - cy.contains('svg', 'P75 (rolling 50 req)') + cy.contains('svg', 'P90 (rolling 50 req)') .find('path') .first() .invoke('attr', 'd') - .as('p75Path'); - cy.contains('button', 'P90').click(); + .as('p90Path'); + cy.contains('button', 'P75').click(); cy.get('[data-testid="interactivity-percentile-toggle"]') .find('[role="tab"][aria-selected="true"]') - .should('have.text', 'P90'); - cy.contains('svg', 'P90 (rolling 50 req)') + .should('have.text', 'P75'); + cy.get('svg').should('contain.text', '1 / cumulative P75 TPOT'); + cy.contains('svg', 'P75 (rolling 50 req)') .find('path') .first() .invoke('attr', 'd') - .then(function (p90Path) { - expect(p90Path).not.to.equal(this.p75Path); + .then(function (p75Path) { + expect(p75Path).not.to.equal(this.p90Path); }); }); cy.get('[data-testid="ttft-over-time-chart"]').within(() => { cy.get('[data-testid="ttft-percentile-toggle"]') .find('[role="tab"][aria-selected="true"]') - .should('have.text', 'P75'); - cy.contains('button', 'P90').click(); - cy.get('svg').should('contain.text', 'P90 (rolling 50 req)'); + .should('have.text', 'P90'); + cy.contains('button', 'P75').click(); + cy.get('svg').should('contain.text', 'P75 (rolling 50 req)'); + cy.get('svg').should('contain.text', 'Cumulative P75 TTFT'); }); }); }); diff --git a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx index e24b7e6b..e1bc1524 100644 --- a/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx +++ b/packages/app/src/components/inference/agentic-point/agentic-point-detail.tsx @@ -137,7 +137,7 @@ 
function RequestMetricOverTime({ timeline: RequestTimeline | null | undefined; isLoading: boolean; }) { - const [percentile, setPercentile] = useState('p75'); + const [percentile, setPercentile] = useState('p90'); const result = timeline ? rollingRequestMetric(timeline.requests, metric, percentile, 50) : null; const metricLabel = metric === 'ttft' ? 'TTFT' : 'Interactivity'; const color = metric === 'ttft' ? '#f59e0b' : '#06b6d4'; @@ -174,7 +174,10 @@ function RequestMetricOverTime({ strokeWidth: 2.5, }, { - name: metric === 'ttft' ? 'Cumulative mean TTFT' : '1 / cumulative mean TPOT', + name: + metric === 'ttft' + ? `Cumulative ${percentile.toUpperCase()} TTFT` + : `1 / cumulative ${percentile.toUpperCase()} TPOT`, data: result?.cumulative ?? [], color: '#ef4444', strokeWidth: 3, diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts index 926772db..3506ff45 100644 --- a/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.test.ts @@ -38,7 +38,7 @@ describe('rollingRequestMetric', () => { expect(result.raw.at(-1)).toEqual({ t: 4, value: 0.4 }); expect(result.trend.map((point) => point.value)).toEqual([0.1, 0.175, 0.25, 0.35]); - expect(result.cumulative.map((point) => point.value)).toEqual([0.1, 0.15, 0.2, 0.25]); + expect(result.cumulative.map((point) => point.value)).toEqual([0.1, 0.175, 0.25, 0.325]); }); it('inverts the rolling TPOT percentile for interactivity', () => { @@ -51,7 +51,7 @@ describe('rollingRequestMetric', () => { expect(result.raw.map((point) => point.value)).toEqual([100, 50, 1000 / 30]); expect(result.trend.at(-1)?.value).toBeCloseTo(1000 / 28, 8); - expect(result.cumulative.map((point) => point.value)).toEqual([100, 1000 / 15, 50]); + expect(result.cumulative.map((point) => point.value)).toEqual([100, 1000 / 19, 1000 / 28]); }); 
it('drops warmup, cancelled, missing, and non-positive samples', () => { diff --git a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx index 749a17e4..0c0b5739 100644 --- a/packages/app/src/components/inference/agentic-point/time-series-chart.tsx +++ b/packages/app/src/components/inference/agentic-point/time-series-chart.tsx @@ -82,11 +82,21 @@ export function rollingRequestMetric( const latencyMs = quantile(sorted, q); return { t, value: metric === 'ttft' ? latencyMs / 1000 : 1000 / latencyMs }; }); - let latencySumMs = 0; - const cumulative = samples.map(({ t, latencyMs }, i) => { - latencySumMs += latencyMs; - const meanLatencyMs = latencySumMs / (i + 1); - return { t, value: metric === 'ttft' ? meanLatencyMs / 1000 : 1000 / meanLatencyMs }; + const prefixLatencies: number[] = []; + const cumulative = samples.map(({ t, latencyMs }) => { + let lo = 0; + let hi = prefixLatencies.length; + while (lo < hi) { + const mid = (lo + hi) >> 1; + if (prefixLatencies[mid]! <= latencyMs) lo = mid + 1; + else hi = mid; + } + prefixLatencies.splice(lo, 0, latencyMs); + const cumulativeLatencyMs = quantile(prefixLatencies, q); + return { + t, + value: metric === 'ttft' ? 
cumulativeLatencyMs / 1000 : 1000 / cumulativeLatencyMs, + }; }); return { raw, trend, cumulative }; From e3e0bf43ddec5dd8c1d4f21e1c3f9baff469f8f9 Mon Sep 17 00:00:00 2001 From: Alec Ibarra <93070681+adibarra@users.noreply.github.com> Date: Tue, 23 Jun 2026 18:34:16 -0500 Subject: [PATCH 95/96] fix(db): build each chart line from a single run, no cross-run/date stitching (#491) --- ..._latest_benchmarks_single_run_per_line.sql | 49 +++++ .../src/json-provider.line-single-run.test.ts | 203 ++++++++++++++++++ packages/db/src/json-provider.ts | 50 +++-- packages/db/src/queries/benchmarks.ts | 58 +++-- 4 files changed, 323 insertions(+), 37 deletions(-) create mode 100644 packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql create mode 100644 packages/db/src/json-provider.line-single-run.test.ts diff --git a/packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql b/packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql new file mode 100644 index 00000000..039dfe09 --- /dev/null +++ b/packages/db/migrations/008_latest_benchmarks_single_run_per_line.sql @@ -0,0 +1,49 @@ +-- ============================================================ +-- LATEST_BENCHMARKS — one run per line (no cross-run stitching) +-- ============================================================ +-- +-- Previously the view did `distinct on (config_id, conc, isl, osl)` ordered by +-- date desc — resolved INDEPENDENTLY per concurrency. So if a newer run +-- re-measured only some concurrencies (a partial re-sweep), the concurrencies it +-- skipped fell back to an older run that did measure them, and a single chart line +-- ended up stitched from points produced by different runs on different dates. +-- +-- A line is one config + sequence + offload mode +-- (config_id, benchmark_type, isl, osl, offload_mode) plotted +-- across concurrencies, and it must come from a SINGLE workflow run. 
We pick the +-- newest run per line (newest date, then latest sweep by run_started_at, then +-- highest workflow_run_id so exactly one run wins even on a same-day / null tie), +-- then keep EVERY concurrency that one run measured. A partial re-sweep therefore +-- truncates the line to its own concurrencies rather than borrowing an older run's. + +drop materialized view if exists latest_benchmarks; + +create materialized view latest_benchmarks as +with winners as ( + select distinct on (br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode) + br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode, + br.workflow_run_id as winning_run_id + from benchmark_results br + join latest_workflow_runs wr on wr.id = br.workflow_run_id + where br.error is null + order by br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode, + br.date desc, wr.run_started_at desc nulls last, br.workflow_run_id desc +) +select br.* +from benchmark_results br +join winners w + on w.config_id = br.config_id + and w.benchmark_type = br.benchmark_type + and w.isl is not distinct from br.isl + and w.osl is not distinct from br.osl + and w.offload_mode = br.offload_mode + and w.winning_run_id = br.workflow_run_id +where br.error is null; + +-- Unique key now includes benchmark_type (part of the line key). One run per line +-- guarantees one row per concurrency, so this stays unique and keeps +-- REFRESH MATERIALIZED VIEW CONCURRENTLY working. 
+create unique index latest_benchmarks_pk + on latest_benchmarks (config_id, conc, isl, osl, benchmark_type, offload_mode) + nulls not distinct; +create index latest_benchmarks_model_idx on latest_benchmarks (config_id); diff --git a/packages/db/src/json-provider.line-single-run.test.ts b/packages/db/src/json-provider.line-single-run.test.ts new file mode 100644 index 00000000..b75fa26a --- /dev/null +++ b/packages/db/src/json-provider.line-single-run.test.ts @@ -0,0 +1,203 @@ +import { mkdtempSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; + +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; + +import type { getLatestBenchmarks as GetLatestBenchmarks } from './json-provider.js'; + +/** + * A chart line is one config + sequence + offload mode + * (config_id, benchmark_type, isl, osl, offload_mode) plotted across concurrencies, and it must + * come from a SINGLE workflow run. getLatestBenchmarks picks the + * newest run per line (date, then run_started_at, then workflow_run_id) and returns EVERY + * concurrency that one run measured — never stitching skipped concurrencies from an older run. + * + * These fixtures exercise the multi-concurrency cases the as-of test can't (it is single-conc): + * a partial re-sweep that must truncate the line, per-sequence line independence, and the + * same-day workflow_run_id tiebreak. 
+ */ + +const cfg = (id: number) => ({ + id, + hardware: 'h100', + framework: 'vllm', + model: 'testm', + precision: 'fp8', + spec_method: 'none', + disagg: false, + is_multinode: false, + prefill_tp: 1, + prefill_ep: 1, + prefill_dp_attention: false, + prefill_num_workers: 1, + decode_tp: 1, + decode_ep: 1, + decode_dp_attention: false, + decode_num_workers: 1, + num_prefill_gpu: 0, + num_decode_gpu: 8, +}); + +const run = (id: number, githubId: number, startedAt: string | null, date: string) => ({ + id, + github_run_id: githubId, + run_attempt: 1, + name: `run ${githubId}`, + status: 'completed', + conclusion: 'success', + head_sha: 'sha', + head_branch: 'main', + html_url: `https://github.com/x/runs/${githubId}`, + created_at: startedAt ?? `${date}T00:00:00Z`, + run_started_at: startedAt, + date, +}); + +let nextResultId = 1000; +const result = ( + runDbId: number, + configId: number, + date: string, + conc: number, + tpot: number, + isl = 1024, + osl = 1024, + offloadMode = 'off', +) => ({ + id: nextResultId++, + workflow_run_id: runDbId, + config_id: configId, + benchmark_type: 'latency', + date, + isl, + osl, + conc, + offload_mode: offloadMode, + image: null, + metrics: { median_tpot: tpot }, + error: null, + server_log_id: null, +}); + +const OLD = '2026-06-10'; +const NEW = '2026-06-14'; +let getLatestBenchmarks: typeof GetLatestBenchmarks; + +beforeAll(async () => { + const dir = mkdtempSync(join(tmpdir(), 'infx-line-')); + writeFileSync(join(dir, 'configs.json'), JSON.stringify([cfg(1), cfg(2)])); + writeFileSync( + join(dir, 'workflow_runs.json'), + JSON.stringify([ + run(10, 100, `${OLD}T04:00:00Z`, OLD), // run A: older full sweep + run(11, 101, `${NEW}T05:00:00Z`, NEW), // run B: newer partial re-sweep + run(20, 200, `${NEW}T07:00:00Z`, NEW), // run E: same-day, lower run id + run(21, 201, `${NEW}T07:00:00Z`, NEW), // run F: same-day, SAME timestamp, higher run id + ]), + ); + writeFileSync( + join(dir, 'benchmark_results.json'), + JSON.stringify([ + 
// config 1, seq (1024,1024): run A full sweep, run B partial re-sweep. + result(10, 1, OLD, 1, 0.1), + result(10, 1, OLD, 8, 0.18), + result(10, 1, OLD, 64, 0.5), + result(11, 1, NEW, 1, 0.09), + result(11, 1, NEW, 8, 0.16), + // config 1, seq (8192,1024): only run A measured it (run B skipped this sequence). + result(10, 1, OLD, 1, 0.2, 8192, 1024), + result(10, 1, OLD, 8, 0.3, 8192, 1024), + // Offload mode is an independent line dimension. A newer off-mode run must not hide + // the older on-mode line for the same config and sequence. + result(10, 1, OLD, 4, 0.25, 4096, 4096, 'on'), + result(11, 1, NEW, 4, 0.2, 4096, 4096, 'off'), + // config 2, seq (1024,1024): two same-day runs with identical run_started_at. + result(20, 2, NEW, 1, 0.5), + result(20, 2, NEW, 8, 0.6), + result(20, 2, NEW, 64, 0.7), + result(21, 2, NEW, 1, 0.4), + result(21, 2, NEW, 8, 0.45), + ]), + ); + process.env.DUMP_DIR = dir; + const mod = await import('./json-provider.js'); + getLatestBenchmarks = mod.getLatestBenchmarks; +}); + +afterAll(() => { + delete process.env.DUMP_DIR; +}); + +/** Concurrencies + their run urls for one (config sequence) line, sorted by conc. */ +function line( + rows: { isl: number | null; osl: number | null; conc: number; run_url: string | null }[], + configRunUrlRe: RegExp, + isl: number, + osl: number, +) { + return rows + .filter((r) => r.isl === isl && r.osl === osl && r.run_url?.match(configRunUrlRe)) + .toSorted((a, b) => a.conc - b.conc) + .map((r) => ({ conc: r.conc, runUrl: r.run_url })); +} + +describe('getLatestBenchmarks — one run per line', () => { + it('truncates a line to the newest run: a partial re-sweep hides the older run’s extra concs', () => { + const rows = getLatestBenchmarks('testm', NEW, false); + // config 1 / seq (1024,1024): run B (101) measured only conc 1 & 8. conc 64 from run A is gone. 
+ const seq = line(rows, /runs\/(?:100|101)\//u, 1024, 1024); + expect(seq).toEqual([ + { conc: 1, runUrl: 'https://github.com/x/runs/101/attempts/1' }, + { conc: 8, runUrl: 'https://github.com/x/runs/101/attempts/1' }, + ]); + expect(seq.some((p) => p.conc === 64)).toBe(false); + }); + + it('keeps a different sequence of the same config on its own winning run', () => { + const rows = getLatestBenchmarks('testm', NEW, false); + // seq (8192,1024) was only in run A; run B winning the other sequence must not erase it. + const seq = line(rows, /runs\/100\//u, 8192, 1024); + expect(seq).toEqual([ + { conc: 1, runUrl: 'https://github.com/x/runs/100/attempts/1' }, + { conc: 8, runUrl: 'https://github.com/x/runs/100/attempts/1' }, + ]); + }); + + it('selects winning runs independently for each offload mode', () => { + const rows = getLatestBenchmarks('testm', NEW, false).filter( + (r) => r.isl === 4096 && r.osl === 4096, + ); + + expect( + rows + .map((r) => ({ offloadMode: r.offload_mode, runUrl: r.run_url })) + .toSorted((a, b) => a.offloadMode.localeCompare(b.offloadMode)), + ).toEqual([ + { offloadMode: 'off', runUrl: 'https://github.com/x/runs/101/attempts/1' }, + { offloadMode: 'on', runUrl: 'https://github.com/x/runs/100/attempts/1' }, + ]); + }); + + it('breaks a same-day, same-timestamp tie by workflow_run_id (higher id wins the whole line)', () => { + const rows = getLatestBenchmarks('testm', NEW, false); + // config 2: run E (200, id 20) and run F (201, id 21) share run_started_at; F wins by id. + const seq = line(rows, /runs\/(?:200|201)\//u, 1024, 1024); + expect(seq).toEqual([ + { conc: 1, runUrl: 'https://github.com/x/runs/201/attempts/1' }, + { conc: 8, runUrl: 'https://github.com/x/runs/201/attempts/1' }, + ]); + // run E's extra conc 64 must not bleed into run F's line. 
+ expect(seq.some((p) => p.conc === 64)).toBe(false); + }); + + it('as of the older run, shows that run’s full sweep (no truncation by a later run)', () => { + const rows = getLatestBenchmarks('testm', NEW, false, '100'); + const seq = line(rows, /runs\/100\//u, 1024, 1024); + expect(seq).toEqual([ + { conc: 1, runUrl: 'https://github.com/x/runs/100/attempts/1' }, + { conc: 8, runUrl: 'https://github.com/x/runs/100/attempts/1' }, + { conc: 64, runUrl: 'https://github.com/x/runs/100/attempts/1' }, + ]); + }); +}); diff --git a/packages/db/src/json-provider.ts b/packages/db/src/json-provider.ts index c23e5f48..4e548efe 100644 --- a/packages/db/src/json-provider.ts +++ b/packages/db/src/json-provider.ts @@ -72,6 +72,8 @@ interface RawBenchmarkResult { isl: number; osl: number; conc: number; + /** Added by the AgentX schema; older dumps omit it and are treated as off. */ + offload_mode?: string; image: string | null; metrics: Record; /** Added in migration 006; older dumps omit this field — surfaced as undefined. */ @@ -333,12 +335,11 @@ const STRIP_HISTORY_KEYS = new Set([ ]); /** - * Comparator for DISTINCT ON (config, conc, isl, osl) selection: latest calendar - * day first, then — for sweeps on the same day — the latest workflow run first by - * `run_started_at` (NULLS LAST). Mirrors the SQL date-filtered query and the - * `latest_benchmarks` view (migration 003): a calendar day alone ties two same-day - * sweeps, so without this an older run's points can shadow a same-day re-sweep. - * `run_started_at` is an ISO-8601 string, so localeCompare orders it chronologically. + * Run-recency comparator used to pick the newest run per line: latest calendar day first, + * then — for sweeps on the same day — the latest workflow run first by `run_started_at` + * (NULLS LAST). Mirrors the `br.date DESC, wr.run_started_at DESC NULLS LAST` portion of the + * SQL ORDER BY; callers apply a `workflow_run_id` DESC final tiebreak on top so exactly one + * run wins. 
`run_started_at` is an ISO-8601 string, so localeCompare orders it chronologically. * Exported so the same-day tiebreak is unit-tested in parity with the SQL. */ export function compareBenchmarkRecency( @@ -355,6 +356,10 @@ export function compareBenchmarkRecency( return bStarted.localeCompare(aStarted); } +/** Chart-line identity: one config + sequence + offload mode. */ +const lineKey = (br: RawBenchmarkResult): string => + `${br.config_id}:${br.benchmark_type}:${br.isl}:${br.osl}:${br.offload_mode ?? 'off'}`; + export function getLatestBenchmarks( modelKey: string | string[], date?: string, @@ -390,27 +395,32 @@ export function getLatestBenchmarks( return true; }); - // DISTINCT ON (config_id, conc, isl, osl) — keep the one with the latest date, - // tiebreaking same-day runs by run_started_at so the latest sweep wins. - const seen = new Map(); - candidates.sort((a, b) => - compareBenchmarkRecency( + // Single run per LINE (config_id, benchmark_type, isl, osl, offload_mode): pick the newest run that + // produced data for the line, then keep EVERY concurrency that one run measured. Sort by + // recency (date, then run_started_at) with a final workflow_run_id DESC tiebreak so exactly + // one run wins even when run_started_at is equal/null — matching the SQL ORDER BY. + candidates.sort((a, b) => { + const recency = compareBenchmarkRecency( toDateString(a.date), toDateString(b.date), s.latestRunsById.get(a.workflow_run_id)?.run_started_at ?? null, s.latestRunsById.get(b.workflow_run_id)?.run_started_at ?? null, - ), - ); + ); + return recency === 0 ? 
b.workflow_run_id - a.workflow_run_id : recency; + }); + const winningRun = new Map(); for (const br of candidates) { - const key = `${br.config_id}:${br.conc}:${br.isl}:${br.osl}`; - if (!seen.has(key)) seen.set(key, br); + const key = lineKey(br); + if (!winningRun.has(key)) winningRun.set(key, br.workflow_run_id); } - return [...seen.values()].map((br) => { - const c = s.configs.get(br.config_id)!; - const wr = s.latestRunsById.get(br.workflow_run_id)!; - return toBenchmarkRow(br, c, wr); - }); + return candidates + .filter((br) => winningRun.get(lineKey(br)) === br.workflow_run_id) + .map((br) => { + const c = s.configs.get(br.config_id)!; + const wr = s.latestRunsById.get(br.workflow_run_id)!; + return toBenchmarkRow(br, c, wr); + }); } /** In-memory mirror of {@link import('./queries/benchmarks.js').getBenchmarksForRun}. */ diff --git a/packages/db/src/queries/benchmarks.ts b/packages/db/src/queries/benchmarks.ts index 6833756a..37301e2b 100644 --- a/packages/db/src/queries/benchmarks.ts +++ b/packages/db/src/queries/benchmarks.ts @@ -51,9 +51,14 @@ export interface BenchmarkRow { /** * Fetch the latest benchmark results for one or more model DB keys across ALL sequences, * up to a given date. Multiple keys support point-release grouping — e.g. passing - * `['glm5', 'glm5.1']` unions both buckets under the one display. Returns the most recent - * result per (config, concurrency, isl, osl) — so every GPU/framework + sequence combo - * that has been benchmarked appears, with the newest data winning. + * `['glm5', 'glm5.1']` unions both buckets under the one display. + * + * Selection unit is the LINE, not the point: for each line + * `(config_id, benchmark_type, isl, osl, offload_mode)` we pick the single newest workflow run that + * produced data for it (newest date, then latest sweep, then highest run id) and return + * EVERY concurrency that one run measured — and nothing from any other run. 
A partial + * re-sweep therefore truncates the line to its own concurrencies rather than stitching the + * skipped ones from an older run. This guarantees a line never mixes runs/dates. * * The frontend filters by sequence client-side. This eliminates API round-trips when * switching sequences — the data is already cached by React Query. @@ -74,13 +79,8 @@ export async function getLatestBenchmarks( ): Promise { const modelKeys = Array.isArray(modelKey) ? modelKey : [modelKey]; if (date) { - // Date-filtered: use base table with DISTINCT ON (the view only has the absolute latest) - // exact=true: only return data from this exact date (for GPU comparison) - // exact=false (default): return latest data as of this date (for main chart) - // Same-day tiebreak by wr.run_started_at (latest sweep wins), mirroring the - // latest_benchmarks view (migration 003). br.date is a calendar day, so two - // sweeps on the same day tie on date alone and Postgres would otherwise pick - // an arbitrary one — leaving an older run's points shadowing a same-day re-sweep. + // Date-filtered: use the base table (the view only has the absolute latest). + // exact=true: only this exact date (GPU comparison); exact=false (default): as of this date. const dateFilter = exact ? sql`br.date = ${date}::date` : sql`br.date <= ${date}::date`; // "As of run" filter (main chart only): keep results whose run started no later // than the selected run. run_started_at is an absolute timestamp, so this also @@ -97,8 +97,29 @@ export async function getLatestBenchmarks( ) )` : sql``; + // winners: the single newest run per LINE + // (config_id, benchmark_type, isl, osl, offload_mode) under the + // date/run cutoff. br.date is a calendar day, so two same-day sweeps tie on date — break + // by wr.run_started_at (latest sweep wins), then br.workflow_run_id so exactly one run wins + // even when run_started_at is equal/null. 
The outer join then pulls EVERY concurrency that + // winning run measured for the line, so the line is built from one run only (no carry-forward + // of concurrencies a partial re-sweep skipped). const rows = await sql` - SELECT DISTINCT ON (br.config_id, br.conc, br.isl, br.osl) + WITH winners AS ( + SELECT DISTINCT ON (br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode) + br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode, + br.workflow_run_id AS winning_run_id + FROM benchmark_results br + JOIN configs c ON c.id = br.config_id + JOIN latest_workflow_runs wr ON wr.id = br.workflow_run_id + WHERE c.model = ANY(${modelKeys}) + AND br.error IS NULL + AND ${dateFilter} + ${runFilter} + ORDER BY br.config_id, br.benchmark_type, br.isl, br.osl, br.offload_mode, + br.date DESC, wr.run_started_at DESC NULLS LAST, br.workflow_run_id DESC + ) + SELECT br.id, c.hardware, c.framework, @@ -130,12 +151,15 @@ export async function getLatestBenchmarks( FROM benchmark_results br JOIN configs c ON c.id = br.config_id JOIN latest_workflow_runs wr ON wr.id = br.workflow_run_id - WHERE c.model = ANY(${modelKeys}) - AND br.error IS NULL - AND ${dateFilter} - ${runFilter} - ORDER BY br.config_id, br.conc, br.isl, br.osl, - br.date DESC, wr.run_started_at DESC NULLS LAST + JOIN winners w + ON w.config_id = br.config_id + AND w.benchmark_type = br.benchmark_type + AND w.isl IS NOT DISTINCT FROM br.isl + AND w.osl IS NOT DISTINCT FROM br.osl + AND w.offload_mode = br.offload_mode + AND w.winning_run_id = br.workflow_run_id + WHERE br.error IS NULL + ORDER BY br.config_id, br.conc, br.isl, br.osl `; return rows as unknown as BenchmarkRow[]; } From 2c3bb6dcaaff6c04ec56928cc08843b267c464bb Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 23 Jun 2026 23:08:36 -0500 Subject: [PATCH 96/96] Default agentic charts to interactivity --- packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts | 7 ++++--- packages/app/src/components/inference/InferenceContext.tsx | 4 ++-- 
2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts index 636a7ccf..df199b81 100644 --- a/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts +++ b/packages/app/cypress/e2e/ttft-x-axis-toggle.cy.ts @@ -9,13 +9,14 @@ describe('X-Axis Mode Toggle (inference chart)', () => { cy.get('[data-testid="chart-figure"]').should('have.length.at.least', 1); }); - it('shows the x-axis mode buttons with Interactivity active by default', () => { + it('shows Interactivity by default for the agentic view', () => { + cy.get('[data-testid="scenario-selector"]').should('contain.text', 'Agentic Traces'); cy.get('[data-testid="x-axis-mode-ttft"]').should('be.visible'); cy.get('[data-testid="x-axis-mode-e2e"]').should('be.visible'); cy.get('[data-testid="x-axis-mode-interactivity"]') .should('be.visible') .and('have.attr', 'aria-selected', 'true'); - cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'vs. Interactivity'); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Interactivity'); }); it('switches the x-axis to TTFT and updates the heading', () => { @@ -37,6 +38,6 @@ describe('X-Axis Mode Toggle (inference chart)', () => { 'aria-selected', 'true', ); - cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'vs. 
Interactivity'); + cy.get('[data-testid="chart-figure"] h2').should('contain.text', 'Interactivity'); }); }); diff --git a/packages/app/src/components/inference/InferenceContext.tsx b/packages/app/src/components/inference/InferenceContext.tsx index 839afeed..ddb923b8 100644 --- a/packages/app/src/components/inference/InferenceContext.tsx +++ b/packages/app/src/components/inference/InferenceContext.tsx @@ -533,7 +533,7 @@ export function InferenceProvider({ // Reconcile the x-axis mode with the scenario kind: // - On mount with no `i_xmode` URL param: snap to the kind's natural default - // (agentic → ttft, fixed → interactivity). The state itself was initialized + // (interactivity for both agentic and fixed-sequence scenarios). The state was initialized // to a SSR-stable constant so server and client render the same DOM; this // effect fixes it up after hydration. // - When the user later switches sequence kinds: snap to the new kind's @@ -565,7 +565,7 @@ export function InferenceProvider({ // — fall through to the default snap below. return; } - handleSetXAxisMode(kind === 'agentic' ? 'ttft' : 'interactivity'); + handleSetXAxisMode('interactivity'); }, [effectiveSequence, selectedXAxisMode, handleSetXAxisMode]); // Reconcile selectedE2eXAxisMetric whenever the mode, sequence kind, or