From fc195b0fd2f0cfa68cd883727bcd9482f9227cd2 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 3 Jun 2026 08:03:27 +0200 Subject: [PATCH] test(cli): simplify churny local coverage --- .../test/commands/eval/pipeline/bench.test.ts | 29 +------ .../test/commands/eval/pipeline/grade.test.ts | 47 +++-------- .../test/commands/eval/pipeline/input.test.ts | 48 ++--------- apps/cli/test/commands/results/serve.test.ts | 31 +------ .../commands/results/studio-config.test.ts | 80 +++++-------------- apps/cli/test/unit/studio-navigation.test.ts | 62 -------------- apps/dashboard/src/lib/navigation.test.ts | 37 +++++++++ .../src/content/docs/docs/tools/dashboard.mdx | 20 +++-- 8 files changed, 92 insertions(+), 262 deletions(-) delete mode 100644 apps/cli/test/unit/studio-navigation.test.ts diff --git a/apps/cli/test/commands/eval/pipeline/bench.test.ts b/apps/cli/test/commands/eval/pipeline/bench.test.ts index 1fab8843..8514f5a9 100644 --- a/apps/cli/test/commands/eval/pipeline/bench.test.ts +++ b/apps/cli/test/commands/eval/pipeline/bench.test.ts @@ -59,8 +59,7 @@ describe('pipeline bench', () => { await rm(OUT_DIR, { recursive: true, force: true }); }); - it('writes grading.json with merged scores and pass_rate', async () => { - // Write LLM grader result to disk (the default flow) + it('writes grading, index, and benchmark artifacts', async () => { await writeFile( join(OUT_DIR, 'test-01', 'llm_grader_results', 'relevance.json'), JSON.stringify({ @@ -76,19 +75,6 @@ describe('pipeline bench', () => { expect(grading.summary.pass_rate).toBeGreaterThan(0); expect(grading.assertions.length).toBeGreaterThan(0); expect(grading.graders).toHaveLength(2); - }, 30_000); - - it('writes index.jsonl with one entry per test', async () => { - await writeFile( - join(OUT_DIR, 'test-01', 'llm_grader_results', 'relevance.json'), - JSON.stringify({ - score: 0.8, - assertions: [{ text: 'Relevant', passed: true }], - }), - ); - - const { execa } = await import('execa'); - await 
execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]); const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8'); const lines = indexContent @@ -98,19 +84,6 @@ describe('pipeline bench', () => { expect(lines).toHaveLength(1); expect(lines[0].test_id).toBe('test-01'); expect(lines[0].score).toBeGreaterThan(0); - }, 30_000); - - it('writes benchmark.json with run_summary', async () => { - await writeFile( - join(OUT_DIR, 'test-01', 'llm_grader_results', 'relevance.json'), - JSON.stringify({ - score: 0.8, - assertions: [{ text: 'ok', passed: true }], - }), - ); - - const { execa } = await import('execa'); - await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]); const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8')); expect(benchmark.metadata.targets).toContain('test-target'); diff --git a/apps/cli/test/commands/eval/pipeline/grade.test.ts b/apps/cli/test/commands/eval/pipeline/grade.test.ts index d2cdf802..cf9abac3 100644 --- a/apps/cli/test/commands/eval/pipeline/grade.test.ts +++ b/apps/cli/test/commands/eval/pipeline/grade.test.ts @@ -46,7 +46,7 @@ describe('pipeline grade', () => { await rm(OUT_DIR, { recursive: true, force: true }); }); - it('writes code_grader_results/.json with score', async () => { + it('writes code_grader_results/.json with score and assertions', async () => { const { execa } = await import('execa'); await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', OUT_DIR]); @@ -55,15 +55,6 @@ describe('pipeline grade', () => { ); expect(result.score).toBe(1); expect(result.name).toBe('always_pass'); - }, 30_000); - - it('includes assertions from code grader output', async () => { - const { execa } = await import('execa'); - await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', OUT_DIR]); - - const result = JSON.parse( - await readFile(join(OUT_DIR, 'test-01', 'code_grader_results', 'always_pass.json'), 'utf8'), - ); expect(result.assertions).toHaveLength(1); 
expect(result.assertions[0].passed).toBe(true); }, 30_000); @@ -83,7 +74,6 @@ describe('pipeline grade — builtin assertions', () => { JSON.stringify({ input: [{ role: 'user', content: 'say hello' }] }), ); - // contains assertion — should pass await writeFile( join(builtinGradersDir, 'has_hello.json'), JSON.stringify({ @@ -95,7 +85,6 @@ describe('pipeline grade — builtin assertions', () => { }), ); - // regex assertion — should pass await writeFile( join(builtinGradersDir, 'matches_pattern.json'), JSON.stringify({ @@ -107,7 +96,6 @@ describe('pipeline grade — builtin assertions', () => { }), ); - // contains assertion — should fail await writeFile( join(builtinGradersDir, 'has_goodbye.json'), JSON.stringify({ @@ -134,48 +122,37 @@ describe('pipeline grade — builtin assertions', () => { await rm(BUILTIN_OUT, { recursive: true, force: true }); }); - it('evaluates contains assertion and writes result', async () => { + it('evaluates builtin assertions and writes results', async () => { const { execa } = await import('execa'); await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', BUILTIN_OUT]); - const result = JSON.parse( + const containsResult = JSON.parse( await readFile(join(BUILTIN_OUT, 'test-01', 'code_grader_results', 'has_hello.json'), 'utf8'), ); - expect(result.score).toBe(1); - expect(result.type).toBe('contains'); - expect(result.assertions[0].passed).toBe(true); - }, 30_000); - - it('evaluates regex assertion and writes result', async () => { - const { execa } = await import('execa'); - await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', BUILTIN_OUT]); + expect(containsResult.score).toBe(1); + expect(containsResult.type).toBe('contains'); + expect(containsResult.assertions[0].passed).toBe(true); - const result = JSON.parse( + const regexResult = JSON.parse( await readFile( join(BUILTIN_OUT, 'test-01', 'code_grader_results', 'matches_pattern.json'), 'utf8', ), ); - expect(result.score).toBe(1); - expect(result.type).toBe('regex'); - }, 30_000); + 
expect(regexResult.score).toBe(1); + expect(regexResult.type).toBe('regex'); - it('scores 0 when contains assertion does not match', async () => { - const { execa } = await import('execa'); - await execa('bun', [CLI_ENTRY, 'pipeline', 'grade', BUILTIN_OUT]); - - const result = JSON.parse( + const failingContainsResult = JSON.parse( await readFile( join(BUILTIN_OUT, 'test-01', 'code_grader_results', 'has_goodbye.json'), 'utf8', ), ); - expect(result.score).toBe(0); - expect(result.assertions[0].passed).toBe(false); + expect(failingContainsResult.score).toBe(0); + expect(failingContainsResult.assertions[0].passed).toBe(false); }, 30_000); it('applies negate to invert score', async () => { - // Overwrite has_goodbye with negate: true — "not contains goodbye" should pass await writeFile( join(BUILTIN_OUT, 'test-01', 'code_graders', 'has_goodbye.json'), JSON.stringify({ diff --git a/apps/cli/test/commands/eval/pipeline/input.test.ts b/apps/cli/test/commands/eval/pipeline/input.test.ts index d814675e..f994207c 100644 --- a/apps/cli/test/commands/eval/pipeline/input.test.ts +++ b/apps/cli/test/commands/eval/pipeline/input.test.ts @@ -12,65 +12,41 @@ describe('pipeline input', () => { await rm(OUT_DIR, { recursive: true, force: true }); }); - it('writes manifest.json with test_ids and eval_file', async () => { + it('materializes the default input workspace', async () => { const { execa } = await import('execa'); await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]); const manifest = JSON.parse(await readFile(join(OUT_DIR, 'manifest.json'), 'utf8')); expect(manifest.test_ids).toEqual(['test-01']); expect(manifest.eval_file).toContain('input-test.eval.yaml'); - }, 30_000); - - it('writes per-test input.json with input and input_files', async () => { - const { execa } = await import('execa'); - await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]); + expect(manifest.experiment).toBeUndefined(); const input = JSON.parse( 
await readFile(join(OUT_DIR, 'input-test', 'test-01', 'input.json'), 'utf8'), ); expect(input.input).toHaveLength(1); expect(input.input[0].content).toBe('hello world'); - }, 30_000); - - it('writes code_graders/.json with resolved command', async () => { - const { execa } = await import('execa'); - await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]); - const grader = JSON.parse( + const codeGrader = JSON.parse( await readFile( join(OUT_DIR, 'input-test', 'test-01', 'code_graders', 'contains_hello.json'), 'utf8', ), ); - expect(grader.command).toBeDefined(); - expect(grader.name).toBe('contains_hello'); - }, 30_000); + expect(codeGrader.command).toBeDefined(); + expect(codeGrader.name).toBe('contains_hello'); - it('writes llm_graders/.json with prompt content', async () => { - const { execa } = await import('execa'); - await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]); - - const grader = JSON.parse( + const llmGrader = JSON.parse( await readFile( join(OUT_DIR, 'input-test', 'test-01', 'llm_graders', 'relevance.json'), 'utf8', ), ); - expect(grader.prompt_content).toBeDefined(); - expect(grader.name).toBe('relevance'); - }, 30_000); - - it('writes criteria.md', async () => { - const { execa } = await import('execa'); - await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]); + expect(llmGrader.prompt_content).toBeDefined(); + expect(llmGrader.name).toBe('relevance'); const criteria = await readFile(join(OUT_DIR, 'input-test', 'test-01', 'criteria.md'), 'utf8'); expect(criteria).toContain('Response echoes the input'); - }, 30_000); - - it('writes invoke.json', async () => { - const { execa } = await import('execa'); - await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]); const invoke = JSON.parse( await readFile(join(OUT_DIR, 'input-test', 'test-01', 'invoke.json'), 'utf8'), @@ -95,14 +71,6 @@ describe('pipeline input', () => { 
expect(manifest.experiment).toBe('without_skills'); }, 30_000); - it('omits experiment from manifest when --experiment is not provided', async () => { - const { execa } = await import('execa'); - await execa('bun', [CLI_ENTRY, 'pipeline', 'input', EVAL_PATH, '--out', OUT_DIR]); - - const manifest = JSON.parse(await readFile(join(OUT_DIR, 'manifest.json'), 'utf8')); - expect(manifest.experiment).toBeUndefined(); - }, 30_000); - it('writes code_graders/.json for deterministic assertions', async () => { const { execa } = await import('execa'); const builtinEvalPath = join(FIXTURE_DIR, 'builtin-test.eval.yaml'); diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 519382f8..8d2e9f3e 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -563,15 +563,12 @@ describe('serve app', () => { }); }); - it('computes pass_rate using the configured dashboard threshold (strict threshold yields lower rate)', async () => { + it('computes pass_rate using the configured dashboard threshold', async () => { const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); mkdirSync(runsDir, { recursive: true }); const filename = '2026-03-25T10-00-00-000Z'; const runDir = path.join(runsDir, filename); mkdirSync(runDir, { recursive: true }); - // Two results: score=0.8 and score=0.6 - // With DEFAULT_THRESHOLD=0.8: score=0.8 passes → 1/2 = 50% - // With threshold=0.9: neither passes → 0% const resultHigh = { ...RESULT_A, test_id: 'high', score: 0.8 }; const resultLow = { ...RESULT_B, test_id: 'low', score: 0.6 }; writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(resultHigh, resultLow)); @@ -584,35 +581,9 @@ describe('serve app', () => { expect(res.status).toBe(200); const data = (await res.json()) as { runs: Array<{ pass_rate: number }> }; expect(data.runs).toHaveLength(1); - // With threshold=0.9: neither 0.8 nor 0.6 passes → 0% expect(data.runs[0].pass_rate).toBe(0); 
}); - it('computes pass_rate using the configured dashboard threshold (lenient threshold yields higher rate)', async () => { - const runsDir = path.join(tempDir, '.agentv', 'results', 'runs'); - mkdirSync(runsDir, { recursive: true }); - const filename = '2026-03-25T12-00-00-000Z'; - const runDir = path.join(runsDir, filename); - mkdirSync(runDir, { recursive: true }); - // Two results: score=0.8 and score=0.6 - // With DEFAULT_THRESHOLD=0.8: score=0.8 passes → 1/2 = 50% - // With threshold=0.5: both pass → 2/2 = 100% - const resultHigh = { ...RESULT_A, test_id: 'high', score: 0.8 }; - const resultLow = { ...RESULT_B, test_id: 'low', score: 0.6 }; - writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(resultHigh, resultLow)); - - mkdirSync(path.join(tempDir, '.agentv'), { recursive: true }); - writeFileSync(path.join(tempDir, '.agentv', 'config.yaml'), 'dashboard:\n threshold: 0.5\n'); - - const app = createApp([], tempDir, tempDir, undefined, { studioDir }); - const res = await app.request('/api/runs'); - expect(res.status).toBe(200); - const data = (await res.json()) as { runs: Array<{ pass_rate: number }> }; - expect(data.runs).toHaveLength(1); - // With threshold=0.5: both 0.8 and 0.6 pass → 100% - expect(data.runs[0].pass_rate).toBe(1); - }); - it('infers the experiment name from the run id when live results have not written it yet', async () => { const runsDir = path.join(tempDir, '.agentv', 'results', 'runs', 'issue-1198-live-name'); mkdirSync(runsDir, { recursive: true }); diff --git a/apps/cli/test/commands/results/studio-config.test.ts b/apps/cli/test/commands/results/studio-config.test.ts index 5dd80b72..cb9ab2e8 100644 --- a/apps/cli/test/commands/results/studio-config.test.ts +++ b/apps/cli/test/commands/results/studio-config.test.ts @@ -24,28 +24,15 @@ describe('loadStudioConfig', () => { expect(config.threshold).toBe(DEFAULT_THRESHOLD); }); - it('reads threshold from dashboard section', () => { - writeFileSync(path.join(tempDir, 'config.yaml'), 
'dashboard:\n threshold: 0.6\n'); - const config = loadStudioConfig(tempDir); - expect(config.threshold).toBe(0.6); - }); - - it('reads pass_threshold from dashboard section as fallback', () => { - writeFileSync(path.join(tempDir, 'config.yaml'), 'dashboard:\n pass_threshold: 0.6\n'); - const config = loadStudioConfig(tempDir); - expect(config.threshold).toBe(0.6); - }); - - it('reads threshold from studio section as fallback (legacy)', () => { - writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n threshold: 0.6\n'); - const config = loadStudioConfig(tempDir); - expect(config.threshold).toBe(0.6); - }); - - it('reads pass_threshold from studio section as fallback (legacy)', () => { - writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n pass_threshold: 0.6\n'); - const config = loadStudioConfig(tempDir); - expect(config.threshold).toBe(0.6); + it.each([ + ['dashboard.threshold', 'dashboard:\n threshold: 0.6\n'], + ['dashboard.pass_threshold fallback', 'dashboard:\n pass_threshold: 0.6\n'], + ['legacy studio.threshold fallback', 'studio:\n threshold: 0.6\n'], + ['legacy studio.pass_threshold fallback', 'studio:\n pass_threshold: 0.6\n'], + ['legacy root pass_threshold fallback', 'pass_threshold: 0.6\n'], + ])('reads %s', (_name, yaml) => { + writeFileSync(path.join(tempDir, 'config.yaml'), yaml); + expect(loadStudioConfig(tempDir).threshold).toBe(0.6); }); it('prefers dashboard.threshold over dashboard.pass_threshold', () => { @@ -75,12 +62,6 @@ describe('loadStudioConfig', () => { expect(config.threshold).toBe(0.5); }); - it('falls back to root-level pass_threshold (legacy)', () => { - writeFileSync(path.join(tempDir, 'config.yaml'), 'pass_threshold: 0.7\n'); - const config = loadStudioConfig(tempDir); - expect(config.threshold).toBe(0.7); - }); - it('prefers dashboard section over root-level pass_threshold', () => { writeFileSync( path.join(tempDir, 'config.yaml'), @@ -90,16 +71,12 @@ describe('loadStudioConfig', () => { 
expect(config.threshold).toBe(0.9); }); - it('clamps threshold to 0 when negative', () => { - writeFileSync(path.join(tempDir, 'config.yaml'), 'dashboard:\n threshold: -0.5\n'); - const config = loadStudioConfig(tempDir); - expect(config.threshold).toBe(0); - }); - - it('clamps threshold to 1 when above 1', () => { - writeFileSync(path.join(tempDir, 'config.yaml'), 'dashboard:\n threshold: 1.5\n'); - const config = loadStudioConfig(tempDir); - expect(config.threshold).toBe(1); + it.each([ + ['negative', -0.5, 0], + ['above 1', 1.5, 1], + ])('clamps %s threshold', (_name, value, expected) => { + writeFileSync(path.join(tempDir, 'config.yaml'), `dashboard:\n threshold: ${value}\n`); + expect(loadStudioConfig(tempDir).threshold).toBe(expected); }); it('returns defaults for empty config.yaml', () => { @@ -140,10 +117,10 @@ describe('saveStudioConfig', () => { expect((parsed.dashboard as Record).threshold).toBe(0.9); }); - it('removes legacy root-level pass_threshold on save', () => { + it('writes canonical dashboard.threshold and removes legacy threshold fields on save', () => { writeFileSync( path.join(tempDir, 'config.yaml'), - 'required_version: ">=4.2.0"\npass_threshold: 0.8\n', + 'required_version: ">=4.2.0"\npass_threshold: 0.8\ndashboard:\n pass_threshold: 0.6\nstudio:\n theme: dark\n pass_threshold: 0.5\n', ); saveStudioConfig(tempDir, { threshold: 0.7 }); @@ -151,28 +128,9 @@ describe('saveStudioConfig', () => { const parsed = parseYaml(raw) as Record; expect(parsed.required_version).toBe('>=4.2.0'); expect(parsed.pass_threshold).toBeUndefined(); - expect((parsed.dashboard as Record).threshold).toBe(0.7); - }); - - it('removes legacy pass_threshold from dashboard section on save', () => { - writeFileSync(path.join(tempDir, 'config.yaml'), 'dashboard:\n pass_threshold: 0.8\n'); - saveStudioConfig(tempDir, { threshold: 0.7 }); - - const raw = readFileSync(path.join(tempDir, 'config.yaml'), 'utf-8'); - const parsed = parseYaml(raw) as Record; - const dashboard = 
parsed.dashboard as Record; - expect(dashboard.pass_threshold).toBeUndefined(); - expect(dashboard.threshold).toBe(0.7); - }); - - it('migrates legacy studio section to dashboard on save', () => { - writeFileSync(path.join(tempDir, 'config.yaml'), 'studio:\n pass_threshold: 0.8\n'); - saveStudioConfig(tempDir, { threshold: 0.7 }); - - const raw = readFileSync(path.join(tempDir, 'config.yaml'), 'utf-8'); - const parsed = parseYaml(raw) as Record; - const dashboard = parsed.dashboard as Record; expect(parsed.studio).toBeUndefined(); + const dashboard = parsed.dashboard as Record; + expect(dashboard.theme).toBe('dark'); expect(dashboard.pass_threshold).toBeUndefined(); expect(dashboard.threshold).toBe(0.7); }); diff --git a/apps/cli/test/unit/studio-navigation.test.ts b/apps/cli/test/unit/studio-navigation.test.ts deleted file mode 100644 index d75c729a..00000000 --- a/apps/cli/test/unit/studio-navigation.test.ts +++ /dev/null @@ -1,62 +0,0 @@ -import { describe, expect, it } from 'bun:test'; - -import { - categoryPath, - evalPath, - experimentPath, - jobPath, - projectHomePath, - resolveIndexRoute, - runPath, - runsHomePath, - suitePath, -} from '../../../dashboard/src/lib/navigation.ts'; - -describe('studio navigation helpers', () => { - it('redirects when the preferred project id matches a registered project', () => { - expect(resolveIndexRoute(['demo-project'], undefined, 'demo-project', 'analytics')).toEqual({ - kind: 'redirect', - redirectPath: '/projects/demo-project?tab=analytics', - }); - }); - - it('keeps explicit single-project mode on the legacy root home', () => { - expect(resolveIndexRoute(['demo-project'], false, 'runs')).toEqual({ - kind: 'single-project-home', - }); - }); - - it('keeps the dashboard for zero or many projects', () => { - expect(resolveIndexRoute([], true)).toEqual({ kind: 'dashboard' }); - expect(resolveIndexRoute(['one', 'two'], true)).toEqual({ kind: 'dashboard' }); - }); - - it('builds project-scoped drill-down paths', () => { - 
expect(projectHomePath('demo project', 'runs')).toBe('/projects/demo%20project?tab=runs'); - expect(runPath('run::1', 'demo project')).toBe('/projects/demo%20project/runs/run%3A%3A1'); - expect(evalPath('run::1', 'case/a', 'demo project')).toBe( - '/projects/demo%20project/evals/run%3A%3A1/case%2Fa', - ); - expect(jobPath('job/1', 'demo project')).toBe('/projects/demo%20project/jobs/job%2F1'); - expect(categoryPath('run::1', 'Safety > PII', 'demo project')).toBe( - '/projects/demo%20project/runs/run%3A%3A1/category/Safety%20%3E%20PII', - ); - expect(suitePath('run::1', 'evals/smoke.eval.yaml', 'demo project')).toBe( - '/projects/demo%20project/runs/run%3A%3A1/suite/evals%2Fsmoke.eval.yaml', - ); - expect(experimentPath('prod-baseline', 'demo project')).toBe( - '/projects/demo%20project/experiments/prod-baseline', - ); - }); - - it('keeps unscoped paths for legacy single-project routes', () => { - expect(runPath('run::1')).toBe('/runs/run%3A%3A1'); - expect(evalPath('run::1', 'case/a')).toBe('/evals/run%3A%3A1/case%2Fa'); - expect(jobPath('job/1')).toBe('/jobs/job%2F1'); - expect(categoryPath('run::1', 'Safety')).toBe('/runs/run%3A%3A1/category/Safety'); - expect(suitePath('run::1', 'evals/smoke.eval.yaml')).toBe( - '/runs/run%3A%3A1/suite/evals%2Fsmoke.eval.yaml', - ); - expect(runsHomePath()).toBe('/?tab=runs'); - }); -}); diff --git a/apps/dashboard/src/lib/navigation.test.ts b/apps/dashboard/src/lib/navigation.test.ts index 1c246750..b735b239 100644 --- a/apps/dashboard/src/lib/navigation.test.ts +++ b/apps/dashboard/src/lib/navigation.test.ts @@ -1,9 +1,16 @@ import { describe, expect, it } from 'bun:test'; import { + categoryPath, + evalPath, + experimentPath, initialProjectRedirectStorageKey, + jobPath, resolveIndexRoute, resolveInitialProjectRedirect, + runPath, + runsHomePath, + suitePath, } from './navigation'; describe('resolveInitialProjectRedirect', () => { @@ -44,3 +51,33 @@ describe('resolveIndexRoute', () => { expect(resolveIndexRoute(['alpha'], 
true)).toEqual({ kind: 'dashboard' }); }); }); + +describe('route path helpers', () => { + it('builds project-scoped drill-down paths', () => { + expect(runPath('run::1', 'demo project')).toBe('/projects/demo%20project/runs/run%3A%3A1'); + expect(evalPath('run::1', 'case/a', 'demo project')).toBe( + '/projects/demo%20project/evals/run%3A%3A1/case%2Fa', + ); + expect(jobPath('job/1', 'demo project')).toBe('/projects/demo%20project/jobs/job%2F1'); + expect(categoryPath('run::1', 'Safety > PII', 'demo project')).toBe( + '/projects/demo%20project/runs/run%3A%3A1/category/Safety%20%3E%20PII', + ); + expect(suitePath('run::1', 'evals/smoke.eval.yaml', 'demo project')).toBe( + '/projects/demo%20project/runs/run%3A%3A1/suite/evals%2Fsmoke.eval.yaml', + ); + expect(experimentPath('prod-baseline', 'demo project')).toBe( + '/projects/demo%20project/experiments/prod-baseline', + ); + }); + + it('keeps unscoped paths for legacy single-project routes', () => { + expect(runPath('run::1')).toBe('/runs/run%3A%3A1'); + expect(evalPath('run::1', 'case/a')).toBe('/evals/run%3A%3A1/case%2Fa'); + expect(jobPath('job/1')).toBe('/jobs/job%2F1'); + expect(categoryPath('run::1', 'Safety')).toBe('/runs/run%3A%3A1/category/Safety'); + expect(suitePath('run::1', 'evals/smoke.eval.yaml')).toBe( + '/runs/run%3A%3A1/suite/evals%2Fsmoke.eval.yaml', + ); + expect(runsHomePath()).toBe('/?tab=runs'); + }); +}); diff --git a/apps/web/src/content/docs/docs/tools/dashboard.mdx b/apps/web/src/content/docs/docs/tools/dashboard.mdx index a28084db..c0b458a8 100644 --- a/apps/web/src/content/docs/docs/tools/dashboard.mdx +++ b/apps/web/src/content/docs/docs/tools/dashboard.mdx @@ -62,6 +62,17 @@ agentv dashboard .agentv/results/runs/2026-03-30T11-45-56-989Z - **Analytics** — two modes: an aggregated experiment × target matrix, and a per-run view for selecting individual runs to compare side-by-side with optional retroactive tags. 
Includes a collapsible charts section with baseline comparison analytics - **Remote Results** — sync and browse runs pushed from other machines or CI (see [Remote Results](#remote-results)) +## Pass threshold + +Dashboard treats scores greater than or equal to the configured threshold as passing when it calculates pass rates. Configure this in `.agentv/config.yaml`: + +```yaml +dashboard: + threshold: 0.8 +``` + +Legacy `dashboard.pass_threshold`, `studio.threshold`, `studio.pass_threshold`, and root-level `pass_threshold` values are still read for existing projects. When Dashboard saves settings, it writes the canonical `dashboard.threshold` field, removes the legacy threshold fields, and preserves unrelated config. + ## Run Detail Click any run to see a breakdown by suite, per-test scores, target, duration, and cost. The source label (`local` or `remote`) tells you where the run came from. @@ -182,14 +193,11 @@ This satisfies the 24/7-Dashboard use case: the server stays up; projects come a ### Launching the Dashboard -Dashboard auto-detects the mode based on how many projects are registered: - -- `0` or `1` registered: single-project view -- `2+` registered: Projects dashboard +Dashboard opens the Projects dashboard by default, even when zero or one project is registered. When launched from a registered project, the UI redirects to that project's runs tab on first load. Use `--single` only when you need the legacy single-project route layout. ```bash -agentv dashboard # auto-detects -agentv dashboard --single # force single-project view +agentv dashboard # Projects dashboard +agentv dashboard --single # legacy single-project route layout ``` The landing page shows a card for each project with run count, pass rate, and last run time.