Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/validate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,5 +51,8 @@ jobs:
- name: Check evals directories have eval files
run: bun scripts/validate-eval-dirs.ts

- name: Run Phoenix adapter dry-run smoke
run: bun run phoenix:assert-smoke

- name: Validate eval schemas
run: bun apps/cli/dist/cli.js validate 'examples/features/**/evals/**/*.eval.yaml' 'examples/features/**/*.EVAL.yaml'
141 changes: 137 additions & 4 deletions bun.lock

Large diffs are not rendered by default.

15 changes: 9 additions & 6 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@
"packageManager": "bun@1.3.3",
"workspaces": ["apps/*", "packages/*"],
"scripts": {
"build": "bun --filter @agentv/core build && bun --filter @agentv/eval build && bun --filter @agentv/dashboard build && bun --filter agentv build",
"build": "bun --filter @agentv/core build && bun --filter @agentv/eval build && bun --filter @agentv/phoenix-adapter build && bun --filter @agentv/dashboard build && bun --filter agentv build",
"verify": "bun run build && bun run typecheck && bun run lint && bun run test",
"typecheck": "bun --filter @agentv/core typecheck && bun --filter agentv typecheck",
"typecheck": "bun --filter @agentv/core typecheck && bun --filter @agentv/phoenix-adapter typecheck && bun --filter agentv typecheck",
"typecheck:workspace": "tsc -b tsconfig.build.json",
"typecheck:watch": "bun --filter @agentv/core typecheck -- --watch & bun --filter agentv typecheck -- --watch",
"lint": "biome check .",
"format": "biome format --write .",
"fix": "biome check --write .",
"test": "bun --filter @agentv/core test && bun --filter @agentv/eval test && bun --filter agentv test",
"test": "bun --filter @agentv/core test && bun --filter @agentv/eval test && bun --filter @agentv/phoenix-adapter test && bun --filter agentv test",
"test:watch": "bun --filter @agentv/core test:watch & bun --filter agentv test:watch",
"agentv": "bun apps/cli/src/cli.ts",
"agentv:buildrun": "bun run build && bun apps/cli/dist/cli.js",
Expand All @@ -25,13 +25,16 @@
"examples:install": "bun scripts/install-examples.ts",
"publish": "bun run build && bun scripts/publish.ts",
"publish:next": "bun run build && bun scripts/publish.ts next",
"prepare": "test -d .git && bunx prek install -t pre-push || true"
"prepare": "test -d .git && bunx prek install -t pre-push || true",
"phoenix:dry-run": "bun --filter @agentv/phoenix-adapter phoenix:dry-run",
"phoenix:assert-smoke": "bun --filter @agentv/phoenix-adapter phoenix:assert-smoke"
},
"devDependencies": {
"@biomejs/biome": "^1.9.4",
"@j178/prek": "^0.3.0",
"@agentv/core": "workspace:*",
"@agentv/eval": "workspace:*",
"@agentv/phoenix-adapter": "workspace:*",
"@biomejs/biome": "^1.9.4",
"@j178/prek": "^0.3.0",
"@types/bun": "latest",
"@types/node": "24.1.0",
"async-mutex": "^0.5.0",
Expand Down
1 change: 1 addition & 0 deletions packages/phoenix-adapter/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
reports/
12 changes: 12 additions & 0 deletions packages/phoenix-adapter/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# @agentv/phoenix-adapter

Converts AgentV eval YAML suites into Phoenix datasets and can run Phoenix experiments while keeping AgentV eval files as the source of truth.

Current adapter support is intentionally small: deterministic `contains`, `regex`, `equals`, and `is-json` assertions run through a Phoenix CODE evaluator. LLM, code, trace, composite, metric, and custom evaluator families are reported as unsupported instead of being silently mapped.

```bash
bun --filter @agentv/phoenix-adapter phoenix:assert-smoke
bun --filter @agentv/phoenix-adapter phoenix:dry-run
```

See `docs/support-matrix.md` for evaluator coverage and `docs/e2e-verification.md` for smoke-test notes.
50 changes: 50 additions & 0 deletions packages/phoenix-adapter/docs/e2e-verification.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# E2E Verification

## Dry-Run Conversion

Dry-run mode discovers AgentV example evals, normalizes cases through `@agentv/core`, creates Phoenix dataset payloads in memory, and compares test IDs against AgentV baselines where present.

```bash
bun run phoenix:assert-smoke
bun run phoenix:dry-run
```

Current filtered smoke result against `examples/features/assert/evals/dataset.eval.yaml`:

- 1 suite discovered
- 4 tests normalized
- 1 suite passed structural parity
- 0 failed suites

Current full dry-run result against this AgentV checkout:

- 97 suites discovered
- 405 tests normalized
- 93 suites passed structural parity
- 4 suites failed baseline/loader parity

The four failing suites are mismatches between an eval source and its baseline, or unresolved references inside the eval source; none of them are Phoenix conversion crashes:

- `examples/features/matrix-evaluation/evals/dataset.eval.yaml`: baseline has 5 rows, source has 3 tests.
- `examples/features/prompt-template-sdk/evals/dataset.eval.yaml`: AgentV core skips 2 tests because `../prompts/custom-grader.ts` cannot be resolved from the eval source.
- `examples/features/tool-trajectory-simple/evals/dataset.eval.yaml`: source has 11 tests, baseline has 7 rows.
- `examples/features/weighted-graders/evals/dataset.eval.yaml`: baseline IDs use `evaluator` naming while source IDs use `grader` naming.

## Live Phoenix Smoke

Live mode creates or updates a Phoenix dataset and records a Phoenix experiment. It currently uses the deterministic adapter path, so the best smoke target is `examples/features/assert/evals/dataset.eval.yaml`.

```bash
(cd packages/phoenix-adapter && bun src/cli.ts run \
--agentv-root ../.. \
--filter examples/features/assert/evals/dataset.eval.yaml \
--out reports/live-assert-final.json \
--namespace agentv-phoenix-e2e-final)
```

The source harness was verified locally against Phoenix at `http://localhost:6006`:

- 4 Phoenix task runs
- 4 Phoenix evaluator runs
- average evaluator score: 1.0
- experiment ID: `RXhwZXJpbWVudDo2`
23 changes: 23 additions & 0 deletions packages/phoenix-adapter/docs/support-matrix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Phoenix Adapter Support Matrix

This workspace converts AgentV example evals into Phoenix dataset and experiment payloads.

| AgentV family | Phoenix status |
| --- | --- |
| `contains` | Supported by deterministic adapter |
| `regex` | Supported by deterministic adapter |
| `equals` | Supported by deterministic adapter |
| `is-json` | Supported by deterministic adapter |
| `llm-grader` | Reported as unsupported in first pass |
| `rubrics` | Reported as unsupported in first pass |
| `code-grader` | Reported as unsupported in first pass |
| `composite` | Reported as unsupported in first pass |
| `field-accuracy` | Reported as unsupported in first pass |
| `execution-metrics` | Reported as unsupported in first pass |
| `tool-trajectory` | Reported as unsupported in first pass |
| `cost` | Reported as unsupported in first pass |
| `latency` | Reported as unsupported in first pass |
| `trial-output-consistency` | Reported as unsupported in first pass |
| Other custom families | Reported as unsupported with the family name |

An unsupported family does not block conversion unless `--fail-on-unsupported` is set. The report still lists every unsupported family so that parity gaps stay explicit.
33 changes: 33 additions & 0 deletions packages/phoenix-adapter/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"name": "@agentv/phoenix-adapter",
"version": "4.31.4-next.1",
"description": "Phoenix execution and observability adapter for AgentV eval YAML suites",
"private": true,
"type": "module",
"main": "./dist/index.js",
"types": "./dist/index.d.ts",
"exports": {
".": {
"types": "./dist/index.d.ts",
"import": "./dist/index.js"
}
},
"scripts": {
"build": "(cd ../core && bun run build) && tsup",
"typecheck": "(cd ../core && bun run build) && tsc --noEmit",
"test": "(cd ../core && bun run build) && bun test",
"phoenix:dry-run": "bun src/cli.ts run --dry-run --agentv-root ../.. --out reports/dry-run.json",
"phoenix:assert-smoke": "bun src/cli.ts run --dry-run --agentv-root ../.. --filter examples/features/assert/evals/dataset.eval.yaml --out /tmp/agentv-phoenix-assert-smoke.json"
},
"files": ["dist", "README.md", "docs"],
"dependencies": {
"@agentv/core": "workspace:*",
"@arizeai/phoenix-client": "6.10.0",
"@arizeai/phoenix-evals": "1.0.3",
"yaml": "^2.8.3"
},
"devDependencies": {
"tsup": "8.3.5",
"typescript": "5.8.3"
}
}
39 changes: 39 additions & 0 deletions packages/phoenix-adapter/src/agentv/discovery.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import { readdir } from 'node:fs/promises';
import path from 'node:path';
import { relativePosix } from './path.js';
import type { AgentVSource } from './types.js';

/**
 * Matches names ending in `.eval.yaml`, `.eval.yml`, `.EVAL.yaml`, or `.EVAL.yml`.
 * Only the all-lowercase and all-uppercase spellings of `eval` are accepted; a
 * mixed-case marker such as `.Eval.yaml` does not match.
 */
const EVAL_FILE_RE = /\.(?:eval|EVAL)\.ya?ml$/;

/**
 * Recursively collect the paths of all regular files beneath `dir`.
 *
 * Directories named `node_modules` or `.git` are skipped at every level. Entries that
 * are neither directories nor regular files are ignored.
 *
 * @param dir - Directory to scan.
 * @param results - Accumulator that found paths are appended to; the same array is returned.
 * @returns The accumulator, with one `path.join(dir, ...)` entry per file found.
 */
async function walk(dir: string, results: string[] = []): Promise<string[]> {
  for (const dirent of await readdir(dir, { withFileTypes: true })) {
    const skipped = dirent.name === 'node_modules' || dirent.name === '.git';
    if (skipped) continue;

    const target = path.join(dir, dirent.name);
    if (dirent.isDirectory()) {
      // Subdirectories are traversed one at a time so results keep directory-entry order.
      await walk(target, results);
    } else if (dirent.isFile()) {
      results.push(target);
    }
  }
  return results;
}

/**
 * Find every eval source under `<agentvRoot>/examples`.
 *
 * A file qualifies when its base name matches `EVAL_FILE_RE` or is exactly `evals.json`.
 * Files named `evals.json` are tagged `agent-skills-json`; all other matches are tagged
 * `eval-yaml`.
 *
 * @param agentvRoot - Root directory that contains the `examples` folder.
 * @returns Discovered sources ordered by `relativePath` using `localeCompare`.
 */
export async function discoverAgentVEvals(agentvRoot: string): Promise<AgentVSource[]> {
  const candidates = await walk(path.join(agentvRoot, 'examples'));

  const sources: AgentVSource[] = [];
  for (const file of candidates) {
    const baseName = path.basename(file);
    const isSkillsJson = baseName === 'evals.json';
    if (!isSkillsJson && !EVAL_FILE_RE.test(baseName)) continue;

    sources.push({
      path: file,
      relativePath: relativePosix(agentvRoot, file),
      kind: isSkillsJson ? 'agent-skills-json' : 'eval-yaml',
    });
  }

  return sources.sort((left, right) => left.relativePath.localeCompare(right.relativePath));
}
123 changes: 123 additions & 0 deletions packages/phoenix-adapter/src/agentv/load-spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import { existsSync, readFileSync } from 'node:fs';
import path from 'node:path';
import { loadTestSuite } from '@agentv/core';
import YAML from 'yaml';
import type {
AgentVMessage,
AgentVSource,
JsonObject,
NormalizedAssertion,
NormalizedCase,
NormalizedSuite,
} from './types.js';

/**
 * Read a file as UTF-8 and parse it according to its extension.
 *
 * - `.json`: parsed as a single JSON document.
 * - `.jsonl`: each non-blank line (after trimming) is parsed as JSON; returns the array.
 * - anything else: parsed with `YAML.parse`.
 *
 * @param filePath - Path of the file to read.
 * @returns The parsed value; callers must narrow it before use.
 * @throws If the file cannot be read or its content fails to parse.
 */
function parseStructuredFile(filePath: string): unknown {
  const text = readFileSync(filePath, 'utf8');

  if (filePath.endsWith('.json')) {
    return JSON.parse(text);
  }
  if (!filePath.endsWith('.jsonl')) {
    return YAML.parse(text);
  }

  const records: unknown[] = [];
  for (const rawLine of text.split('\n')) {
    // Trimming also drops a trailing '\r' from CRLF-terminated lines.
    const line = rawLine.trim();
    if (line) records.push(JSON.parse(line));
  }
  return records;
}

/**
 * Convert one assertion entry into the adapter's `NormalizedAssertion` shape.
 *
 * A bare string becomes a `rubrics` assertion. For any other value the type label is the
 * entry's `type`, else its `name`, else `assertion-<index + 1>`, converted with `String`.
 * The `name` field is carried over only when it is a string. The original entry is
 * always kept in `source`.
 *
 * @param assertion - Assertion entry as provided by the loader.
 * @param index - Zero-based position of the entry, used for the fallback label.
 */
function normalizeAssertion(assertion: unknown, index: number): NormalizedAssertion {
  if (typeof assertion === 'string') return { type: 'rubrics', source: assertion };

  const fields = (assertion ?? {}) as JsonObject;
  const fallbackLabel = `assertion-${index + 1}`;
  const label = fields.type ?? fields.name ?? fallbackLabel;
  const declaredName = typeof fields.name === 'string' ? fields.name : undefined;

  return {
    name: declaredName,
    type: String(label),
    source: assertion,
  };
}

/**
 * Pick the expected-output value to store for a test.
 *
 * Returns `undefined` when `expected_output` is `undefined` or an empty array, even if
 * `reference_answer` is set. Otherwise returns `reference_answer` when it is not nullish,
 * falling back to `expected_output` itself.
 */
function normalizeExpectedOutput(test: {
  readonly reference_answer?: string;
  readonly expected_output?: unknown;
}): unknown {
  const { expected_output: expected, reference_answer: reference } = test;

  if (expected === undefined) return undefined;
  if (Array.isArray(expected) && expected.length === 0) return undefined;

  return reference ?? expected;
}

/**
 * Recover the AgentV root by climbing from `source.path` one level per `/`-separated
 * segment of `source.relativePath` (the file-name segment included).
 */
function deriveAgentVRoot(source: AgentVSource): string {
  const depth = source.relativePath.split('/').length;
  const climbs = Array.from({ length: depth }, () => '..');
  return path.resolve(source.path, ...climbs);
}

/**
 * List the suite-level features that are present and reported as unsupported.
 *
 * Checks the raw document for `workspace`, `before_all`, `after_all` and `matrix` keys,
 * then checks the loaded suite for `trials`, `workspacePath` (reported as `workspace`),
 * and non-empty `targets` / `targetRefs` (reported as `matrix`).
 *
 * @param raw - The eval file as parsed, before loader processing.
 * @param suite - The suite returned by `loadTestSuite`.
 * @returns Feature names without duplicates, in first-seen order.
 */
function collectUnsupported(
  raw: JsonObject,
  suite: Awaited<ReturnType<typeof loadTestSuite>>,
): readonly string[] {
  const found = new Set<string>();

  const rawKeys = ['workspace', 'before_all', 'after_all', 'matrix'];
  for (const key of rawKeys) {
    if (raw[key] !== undefined) found.add(key);
  }

  if (suite.trials !== undefined) found.add('trials');
  if (suite.workspacePath !== undefined) found.add('workspace');

  const hasTargets = (suite.targets?.length ?? 0) > 0;
  if (hasTargets || (suite.targetRefs?.length ?? 0) > 0) {
    found.add('matrix');
  }

  // A Set iterates in insertion order, so the first occurrence decides the position.
  return [...found];
}

/**
 * Load an AgentV-authored eval source and project it into the Phoenix adapter's
 * normalized suite shape.
 *
 * AgentV eval files stay the source of truth. Case expansion, external case files,
 * assertion parsing, Agent Skills `evals.json`, interpolation, and metadata handling are
 * all delegated to `@agentv/core`'s loader; this function only reshapes the loader's
 * result into Phoenix dataset examples. Put Phoenix-specific behavior after this
 * boundary instead of re-implementing AgentV YAML semantics in the adapter.
 *
 * The file is also parsed directly to read the top-level `skill_name`, `name` and
 * `description` fields and to detect unsupported suite-level keys.
 *
 * @param source - Discovered eval source to load.
 * @returns The normalized suite, with a warning for each case whose input is empty.
 * @throws Error when `source.path` does not exist.
 */
export async function loadAgentVEvalSuite(source: AgentVSource): Promise<NormalizedSuite> {
  const { path: sourceFile, relativePath } = source;
  if (!existsSync(sourceFile)) {
    throw new Error(`AgentV eval source does not exist: ${sourceFile}`);
  }

  const rawDocument = (parseStructuredFile(sourceFile) ?? {}) as JsonObject;
  const coreSuite = await loadTestSuite(sourceFile, deriveAgentVRoot(source));

  // Name precedence: skill_name, then the first loaded test's suite, then name, then the file name.
  const fallbackName = path.basename(sourceFile).replace(/\.ya?ml$/, '');
  const suiteName =
    rawDocument.skill_name ?? coreSuite.tests[0]?.suite ?? rawDocument.name ?? fallbackName;

  const cases = coreSuite.tests.map((test, position): NormalizedCase => {
    const assertions = (test.assertions ?? []).map((entry, entryIndex) =>
      normalizeAssertion(entry, entryIndex),
    );
    const targetMetadata = test.targets ? { targets: test.targets } : {};

    return {
      id: String(test.id ?? `case-${position + 1}`),
      criteria: test.criteria || undefined,
      input: test.input as readonly AgentVMessage[],
      expectedOutput: normalizeExpectedOutput(test),
      assertions,
      metadata: { ...(test.metadata ?? {}), ...targetMetadata },
      sourcePath: relativePath,
    };
  });

  const warnings: string[] = [];
  for (const testCase of cases) {
    if (testCase.input.length === 0) {
      warnings.push(`${relativePath}: ${testCase.id} has no input`);
    }
  }

  const description =
    typeof rawDocument.description === 'string' ? rawDocument.description : undefined;

  return {
    name: String(suiteName),
    description,
    source,
    cases,
    suiteAssertions: [],
    warnings,
    unsupportedFeatures: collectUnsupported(rawDocument, coreSuite),
  };
}
22 changes: 22 additions & 0 deletions packages/phoenix-adapter/src/agentv/path.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import { existsSync } from 'node:fs';
import path from 'node:path';

/**
 * Resolve the AgentV checkout root to an absolute path.
 *
 * Precedence: the explicit `input`, then the `AGENTV_ROOT` environment variable, then
 * the location guessed by `defaultAgentVRoot`. A blank value (empty or whitespace-only)
 * counts as "not provided" and falls through to the next source. Without that check an
 * empty `AGENTV_ROOT` would reach `path.resolve('')`, which silently returns the current
 * working directory.
 *
 * @param input - Optional root supplied by the caller (for example a CLI flag).
 * @returns Absolute path of the chosen root; relative values resolve against the
 *   current working directory.
 */
export function resolveAgentVRoot(input?: string): string {
  const configured = [input, process.env.AGENTV_ROOT].find(
    (value): value is string => value !== undefined && value.trim() !== '',
  );
  return path.resolve(configured ?? defaultAgentVRoot());
}

/**
 * Guess a sibling AgentV checkout: the first of `../agentv` and `../../agentv`
 * (relative to the current working directory) that contains an `examples` entry.
 * Returns `../agentv` when neither does.
 */
function defaultAgentVRoot(): string {
  const candidates = ['../agentv', '../../agentv'];
  const detected = candidates.find((candidate) => existsSync(path.resolve(candidate, 'examples')));
  return detected ?? '../agentv';
}

/**
 * Replace every platform path separator (`path.sep`) in `value` with `/`.
 */
export function toPosixPath(value: string): string {
  const segments = value.split(path.sep);
  return segments.join('/');
}

/**
 * Compute the path from `from` to `to` with `path.relative`, then convert it to
 * `/`-separated form via `toPosixPath`.
 */
export function relativePosix(from: string, to: string): string {
  const nativeRelative = path.relative(from, to);
  return toPosixPath(nativeRelative);
}
40 changes: 40 additions & 0 deletions packages/phoenix-adapter/src/agentv/types.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/** String-keyed object with unvalidated values; narrow each value before use. */
export type JsonObject = Record<string, unknown>;

/** Tag for the kind of eval source file a path refers to. */
export type AgentVSourceKind = 'eval-yaml' | 'agent-skills-json';

/** A discovered eval source file. */
export interface AgentVSource {
/** Filesystem path used to read the file. */
readonly path: string;
/** `/`-separated path of the file relative to the AgentV root. */
readonly relativePath: string;
/** Which kind of eval source the file is. */
readonly kind: AgentVSourceKind;
}

/** One message of a case's input; the shape of `content` is not constrained here. */
export interface AgentVMessage {
readonly role: string;
readonly content: unknown;
}

/** An assertion reduced to a type label plus the original entry it came from. */
export interface NormalizedAssertion {
/** Optional assertion name. */
readonly name?: string;
/** Assertion type label. */
readonly type: string;
/** The original assertion entry, kept unmodified. */
readonly source: unknown;
}

/** A single eval case in the adapter's normalized shape. */
export interface NormalizedCase {
readonly id: string;
readonly criteria?: string;
/** Input messages for the case. */
readonly input: readonly AgentVMessage[];
/** Expected output, when the case defines one. */
readonly expectedOutput?: unknown;
readonly assertions: readonly NormalizedAssertion[];
readonly metadata: JsonObject;
/** Path of the eval source this case came from. */
readonly sourcePath: string;
}

/** A loaded eval suite in the adapter's normalized shape. */
export interface NormalizedSuite {
readonly name: string;
readonly description?: string;
/** The source file the suite was loaded from. */
readonly source: AgentVSource;
readonly cases: readonly NormalizedCase[];
/** Assertions that apply to the suite as a whole rather than to a single case. */
readonly suiteAssertions: readonly NormalizedAssertion[];
/** Human-readable notes about problems found while loading. */
readonly warnings: readonly string[];
/** Names of suite features that were detected but are not handled. */
readonly unsupportedFeatures: readonly string[];
}
Loading
Loading