EntityProcess · christso · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml
@@ -51,5 +51,8 @@ jobs:
       - name: Check evals directories have eval files
         run: bun scripts/validate-eval-dirs.ts
 
+      - name: Run Phoenix adapter dry-run smoke
+        run: bun run phoenix:assert-smoke
+
       - name: Validate eval schemas
         run: bun apps/cli/dist/cli.js validate 'examples/features/**/evals/**/*.eval.yaml' 'examples/features/**/*.EVAL.yaml'
diff --git a/bun.lock b/bun.lock
diff --git a/package.json b/package.json
@@ -6,15 +6,15 @@
   "packageManager": "bun@1.3.3",
   "workspaces": ["apps/*", "packages/*"],
   "scripts": {
-    "build": "bun --filter @agentv/core build && bun --filter @agentv/eval build && bun --filter @agentv/dashboard build && bun --filter agentv build",
+    "build": "bun --filter @agentv/core build && bun --filter @agentv/eval build && bun --filter @agentv/phoenix-adapter build && bun --filter @agentv/dashboard build && bun --filter agentv build",
     "verify": "bun run build && bun run typecheck && bun run lint && bun run test",
-    "typecheck": "bun --filter @agentv/core typecheck && bun --filter agentv typecheck",
+    "typecheck": "bun --filter @agentv/core typecheck && bun --filter @agentv/phoenix-adapter typecheck && bun --filter agentv typecheck",
     "typecheck:workspace": "tsc -b tsconfig.build.json",
     "typecheck:watch": "bun --filter @agentv/core typecheck -- --watch & bun --filter agentv typecheck -- --watch",
     "lint": "biome check .",
     "format": "biome format --write .",
     "fix": "biome check --write .",
-    "test": "bun --filter @agentv/core test && bun --filter @agentv/eval test && bun --filter agentv test",
+    "test": "bun --filter @agentv/core test && bun --filter @agentv/eval test && bun --filter @agentv/phoenix-adapter test && bun --filter agentv test",
     "test:watch": "bun --filter @agentv/core test:watch & bun --filter agentv test:watch",
     "agentv": "bun apps/cli/src/cli.ts",
     "agentv:buildrun": "bun run build && bun apps/cli/dist/cli.js",
@@ -25,13 +25,16 @@
     "examples:install": "bun scripts/install-examples.ts",
     "publish": "bun run build && bun scripts/publish.ts",
     "publish:next": "bun run build && bun scripts/publish.ts next",
-    "prepare": "test -d .git && bunx prek install -t pre-push || true"
+    "prepare": "test -d .git && bunx prek install -t pre-push || true",
+    "phoenix:dry-run": "bun --filter @agentv/phoenix-adapter phoenix:dry-run",
+    "phoenix:assert-smoke": "bun --filter @agentv/phoenix-adapter phoenix:assert-smoke"
   },
   "devDependencies": {
-    "@biomejs/biome": "^1.9.4",
-    "@j178/prek": "^0.3.0",
     "@agentv/core": "workspace:*",
     "@agentv/eval": "workspace:*",
+    "@agentv/phoenix-adapter": "workspace:*",
+    "@biomejs/biome": "^1.9.4",
+    "@j178/prek": "^0.3.0",
     "@types/bun": "latest",
     "@types/node": "24.1.0",
     "async-mutex": "^0.5.0",

diff --git a/packages/phoenix-adapter/.gitignore b/packages/phoenix-adapter/.gitignore
@@ -0,0 +1 @@
+reports/
diff --git a/packages/phoenix-adapter/README.md b/packages/phoenix-adapter/README.md
@@ -0,0 +1,12 @@
+# @agentv/phoenix-adapter
+
+Converts AgentV eval YAML suites into Phoenix datasets and can run Phoenix experiments while keeping AgentV eval files as the source of truth.
+
+Current adapter support is intentionally small: deterministic `contains`, `regex`, `equals`, and `is-json` assertions run through a Phoenix CODE evaluator. LLM, code, trace, composite, metric, and custom evaluator families are reported as unsupported instead of being silently mapped.
+
+```bash
+bun --filter @agentv/phoenix-adapter phoenix:assert-smoke
+bun --filter @agentv/phoenix-adapter phoenix:dry-run
+```
+
+See `docs/support-matrix.md` for evaluator coverage and `docs/e2e-verification.md` for smoke-test notes.
diff --git a/packages/phoenix-adapter/docs/e2e-verification.md b/packages/phoenix-adapter/docs/e2e-verification.md
@@ -0,0 +1,50 @@
+# E2E Verification
+
+## Dry-Run Conversion
+
+Dry-run mode discovers AgentV example evals, normalizes cases through `@agentv/core`, creates Phoenix dataset payloads in memory, and compares test IDs against AgentV baselines where present.
+
+```bash
+bun run phoenix:assert-smoke
+bun run phoenix:dry-run
+```
+
+Current filtered smoke result against `examples/features/assert/evals/dataset.eval.yaml`:
+
+- 1 suite discovered
+- 4 tests normalized
+- 1 suite passed structural parity
+- 0 failed suites
+
+Current full dry-run result against this AgentV checkout:
+
+- 97 suites discovered
+- 405 tests normalized
+- 93 suites passed structural parity
+- 4 suites failed baseline/loader parity
+
+The failing suites are currently source/baseline or source-reference mismatches, not Phoenix conversion crashes:
+
+- `examples/features/matrix-evaluation/evals/dataset.eval.yaml`: baseline has 5 rows, source has 3 tests.
+- `examples/features/prompt-template-sdk/evals/dataset.eval.yaml`: AgentV core skips 2 tests because `../prompts/custom-grader.ts` cannot be resolved from the eval source.
+- `examples/features/tool-trajectory-simple/evals/dataset.eval.yaml`: baseline has 7 rows, source has 11 tests.
+- `examples/features/weighted-graders/evals/dataset.eval.yaml`: baseline IDs use `evaluator` naming while source IDs use `grader` naming.
+
+## Live Phoenix Smoke
+
+Live mode creates or updates a Phoenix dataset and records a Phoenix experiment. It currently uses the deterministic adapter path, so the best smoke target is `examples/features/assert/evals/dataset.eval.yaml`.
+
+```bash
+(cd packages/phoenix-adapter && bun src/cli.ts run \
+  --agentv-root ../.. \
+  --filter examples/features/assert/evals/dataset.eval.yaml \
+  --out reports/live-assert-final.json \
+  --namespace agentv-phoenix-e2e-final)
+```
+
+The source harness was verified locally against Phoenix at `http://localhost:6006`:
+
+- 4 Phoenix task runs
+- 4 Phoenix evaluator runs
+- average evaluator score: 1.0
+- experiment ID: `RXhwZXJpbWVudDo2`
diff --git a/packages/phoenix-adapter/docs/support-matrix.md b/packages/phoenix-adapter/docs/support-matrix.md
@@ -0,0 +1,23 @@
+# Phoenix Adapter Support Matrix
+
+This workspace converts AgentV example evals into Phoenix dataset and experiment payloads.
+
+| AgentV family | Phoenix status |
+| --- | --- |
+| `contains` | Supported by deterministic adapter |
+| `regex` | Supported by deterministic adapter |
+| `equals` | Supported by deterministic adapter |
+| `is-json` | Supported by deterministic adapter |
+| `llm-grader` | Reported as unsupported in first pass |
+| `rubrics` | Reported as unsupported in first pass |
+| `code-grader` | Reported as unsupported in first pass |
+| `composite` | Reported as unsupported in first pass |
+| `field-accuracy` | Reported as unsupported in first pass |
+| `execution-metrics` | Reported as unsupported in first pass |
+| `tool-trajectory` | Reported as unsupported in first pass |
+| `cost` | Reported as unsupported in first pass |
+| `latency` | Reported as unsupported in first pass |
+| `trial-output-consistency` | Reported as unsupported in first pass |
+| Other custom families | Reported as unsupported with the family name |
+
+Unsupported families do not block conversion unless `--fail-on-unsupported` is set. The report keeps unsupported families visible so parity gaps are explicit.
diff --git a/packages/phoenix-adapter/package.json b/packages/phoenix-adapter/package.json
@@ -0,0 +1,33 @@
+{
+  "name": "@agentv/phoenix-adapter",
+  "version": "4.31.4-next.1",
+  "description": "Phoenix execution and observability adapter for AgentV eval YAML suites",
+  "private": true,
+  "type": "module",
+  "main": "./dist/index.js",
+  "types": "./dist/index.d.ts",
+  "exports": {
+    ".": {
+      "types": "./dist/index.d.ts",
+      "import": "./dist/index.js"
+    }
+  },
+  "scripts": {
+    "build": "(cd ../core && bun run build) && tsup",
+    "typecheck": "(cd ../core && bun run build) && tsc --noEmit",
+    "test": "(cd ../core && bun run build) && bun test",
+    "phoenix:dry-run": "bun src/cli.ts run --dry-run --agentv-root ../.. --out reports/dry-run.json",
+    "phoenix:assert-smoke": "bun src/cli.ts run --dry-run --agentv-root ../.. --filter examples/features/assert/evals/dataset.eval.yaml --out /tmp/agentv-phoenix-assert-smoke.json"
+  },
+  "files": ["dist", "README.md", "docs"],
+  "dependencies": {
+    "@agentv/core": "workspace:*",
+    "@arizeai/phoenix-client": "6.10.0",
+    "@arizeai/phoenix-evals": "1.0.3",
+    "yaml": "^2.8.3"
+  },
+  "devDependencies": {
+    "tsup": "8.3.5",
+    "typescript": "5.8.3"
+  }
+}
diff --git a/packages/phoenix-adapter/src/agentv/discovery.ts b/packages/phoenix-adapter/src/agentv/discovery.ts
@@ -0,0 +1,39 @@
+import { readdir } from 'node:fs/promises';
+import path from 'node:path';
+import { relativePosix } from './path.js';
+import type { AgentVSource } from './types.js';
+
+// Matches file names ending in `.eval.yaml`, `.eval.yml`, `.EVAL.yaml`, or `.EVAL.yml`.
+const EVAL_FILE_RE = /\.(?:eval|EVAL)\.ya?ml$/;
+
+/**
+ * Recursively collect the paths of all regular files below `dir`.
+ *
+ * Entries named `node_modules` or `.git` are skipped at every level. Entries that the
+ * `Dirent` reports as neither a directory nor a regular file are ignored.
+ *
+ * @param dir - Directory to scan.
+ * @param results - Accumulator shared across the recursion; found paths are appended to it.
+ * @returns The `results` array, holding each file path joined onto its parent directory.
+ */
+async function walk(dir: string, results: string[] = []): Promise<string[]> {
+  for (const dirent of await readdir(dir, { withFileTypes: true })) {
+    const skipped = dirent.name === 'node_modules' || dirent.name === '.git';
+    if (skipped) continue;
+
+    const target = path.join(dir, dirent.name);
+    if (dirent.isDirectory()) {
+      await walk(target, results);
+    } else if (dirent.isFile()) {
+      results.push(target);
+    }
+  }
+  return results;
+}
+
+/**
+ * Find every AgentV eval source below `<agentvRoot>/examples`.
+ *
+ * A file qualifies when its base name matches `EVAL_FILE_RE` or is exactly `evals.json`.
+ * `evals.json` files are tagged `agent-skills-json`; everything else is `eval-yaml`.
+ *
+ * @param agentvRoot - Root of the AgentV checkout; `relativePath` values are computed against it.
+ * @returns The discovered sources, ordered by `relativePath` using `localeCompare`.
+ */
+export async function discoverAgentVEvals(agentvRoot: string): Promise<AgentVSource[]> {
+  const candidates = await walk(path.join(agentvRoot, 'examples'));
+
+  const sources: AgentVSource[] = [];
+  for (const filePath of candidates) {
+    const baseName = path.basename(filePath);
+    const isSkillsJson = baseName === 'evals.json';
+    if (!isSkillsJson && !EVAL_FILE_RE.test(baseName)) continue;
+
+    sources.push({
+      path: filePath,
+      relativePath: relativePosix(agentvRoot, filePath),
+      kind: isSkillsJson ? 'agent-skills-json' : 'eval-yaml',
+    });
+  }
+
+  sources.sort((left, right) => left.relativePath.localeCompare(right.relativePath));
+  return sources;
+}
diff --git a/packages/phoenix-adapter/src/agentv/load-spec.ts b/packages/phoenix-adapter/src/agentv/load-spec.ts
@@ -0,0 +1,123 @@
+import { existsSync, readFileSync } from 'node:fs';
+import path from 'node:path';
+import { loadTestSuite } from '@agentv/core';
+import YAML from 'yaml';
+import type {
+  AgentVMessage,
+  AgentVSource,
+  JsonObject,
+  NormalizedAssertion,
+  NormalizedCase,
+  NormalizedSuite,
+} from './types.js';
+
+/**
+ * Read a file as UTF-8 and parse it according to its extension.
+ *
+ * - `.jsonl`: each non-blank line (after trimming) is parsed as JSON; returns the array of values.
+ * - `.json`: the whole file is parsed as JSON.
+ * - anything else: the whole file is parsed as YAML.
+ *
+ * @param filePath - File to read.
+ * @returns The parsed value; its shape is not validated here.
+ */
+function parseStructuredFile(filePath: string): unknown {
+  const text = readFileSync(filePath, 'utf8');
+
+  if (filePath.endsWith('.jsonl')) {
+    const records: unknown[] = [];
+    for (const rawLine of text.split('\n')) {
+      const line = rawLine.trim();
+      if (line) records.push(JSON.parse(line));
+    }
+    return records;
+  }
+
+  return filePath.endsWith('.json') ? JSON.parse(text) : YAML.parse(text);
+}
+
+/**
+ * Project one loaded assertion into the adapter's `NormalizedAssertion` shape.
+ *
+ * A plain string becomes a `rubrics` assertion with no name. For any other value the
+ * type is taken from `type`, then `name`, then the positional fallback `assertion-N`
+ * (1-based), and `name` is kept only when it is a string.
+ *
+ * @param assertion - Assertion value as produced by the loader.
+ * @param index - Zero-based position of the assertion within its case.
+ * @returns The normalized assertion, with the original value preserved in `source`.
+ */
+function normalizeAssertion(assertion: unknown, index: number): NormalizedAssertion {
+  if (typeof assertion === 'string') return { type: 'rubrics', source: assertion };
+
+  const fields = (assertion ?? {}) as JsonObject;
+  const declaredName = fields.name;
+  const positionalType = `assertion-${index + 1}`;
+
+  return {
+    name: typeof declaredName === 'string' ? declaredName : undefined,
+    type: String(fields.type ?? declaredName ?? positionalType),
+    source: assertion,
+  };
+}
+
+/**
+ * Choose the expected output to carry into a normalized case.
+ *
+ * Nothing is returned when `expected_output` is `undefined` or an empty array, even if
+ * `reference_answer` is set. Otherwise `reference_answer` wins when it is not nullish,
+ * and `expected_output` is used as the fallback.
+ *
+ * @param test - Loaded test exposing `reference_answer` and `expected_output`.
+ * @returns The reference answer, the expected output, or `undefined`.
+ */
+function normalizeExpectedOutput(test: {
+  readonly reference_answer?: string;
+  readonly expected_output?: unknown;
+}): unknown {
+  const { expected_output: declared, reference_answer: reference } = test;
+  if (declared === undefined) return undefined;
+  if (Array.isArray(declared) && declared.length === 0) return undefined;
+  return reference ?? declared;
+}
+
+/**
+ * Recover the AgentV root from a source by climbing one `..` from `source.path` for
+ * every `/`-separated segment of `source.relativePath`.
+ *
+ * @param source - Source whose `relativePath` is expressed relative to the root.
+ * @returns The resolved root path.
+ */
+function deriveAgentVRoot(source: AgentVSource): string {
+  const depth = source.relativePath.split('/').length;
+  const climb = Array.from({ length: depth }, () => '..');
+  return path.resolve(source.path, ...climb);
+}
+
+/**
+ * List the suite-level AgentV features that are reported as unsupported.
+ *
+ * Flags, in order of first appearance and without duplicates:
+ * - `workspace`, `before_all`, `after_all`, `matrix` when the raw document defines that key;
+ * - `trials` when the loaded suite defines `trials`;
+ * - `workspace` when the loaded suite defines `workspacePath`;
+ * - `matrix` when the loaded suite has any `targets` or `targetRefs`.
+ *
+ * @param raw - Parsed eval document as read from disk.
+ * @param suite - The same document as loaded by `loadTestSuite`.
+ * @returns The flagged feature names.
+ */
+function collectUnsupported(
+  raw: JsonObject,
+  suite: Awaited<ReturnType<typeof loadTestSuite>>,
+): readonly string[] {
+  // A Set keeps first-insertion order, so duplicates collapse without reordering.
+  const flagged = new Set<string>();
+
+  const rawKeys = ['workspace', 'before_all', 'after_all', 'matrix'];
+  for (const key of rawKeys) {
+    if (raw[key] !== undefined) flagged.add(key);
+  }
+
+  if (suite.trials !== undefined) flagged.add('trials');
+  if (suite.workspacePath !== undefined) flagged.add('workspace');
+
+  const targetCount = suite.targets?.length ?? 0;
+  const targetRefCount = suite.targetRefs?.length ?? 0;
+  if (targetCount > 0 || targetRefCount > 0) flagged.add('matrix');
+
+  return [...flagged];
+}
+
+/**
+ * Build the Phoenix adapter's normalized suite from an AgentV eval source file.
+ *
+ * AgentV eval YAML stays the source of truth: case expansion, external case files,
+ * assertion parsing, Agent Skills `evals.json`, interpolation, and metadata handling are
+ * delegated to `@agentv/core`'s loader, and this function only projects the loaded tests
+ * into Phoenix dataset examples. Put Phoenix-specific behavior after this boundary
+ * instead of re-implementing AgentV YAML semantics in the adapter.
+ *
+ * The raw file is also parsed here, but only to read `skill_name`, `name`, `description`,
+ * and the top-level keys checked for unsupported features.
+ *
+ * @param source - Discovered eval source to load.
+ * @returns The normalized suite; `warnings` names every case whose input is empty.
+ * @throws Error when `source.path` does not exist.
+ */
+export async function loadAgentVEvalSuite(source: AgentVSource): Promise<NormalizedSuite> {
+  const { path: sourcePath, relativePath } = source;
+  if (!existsSync(sourcePath)) {
+    throw new Error(`AgentV eval source does not exist: ${sourcePath}`);
+  }
+
+  const rawDocument = (parseStructuredFile(sourcePath) ?? {}) as JsonObject;
+  const suite = await loadTestSuite(sourcePath, deriveAgentVRoot(source));
+
+  // Name precedence: skill name, suite of the first loaded test, raw name, file name.
+  const fileNameFallback = path.basename(sourcePath).replace(/\.ya?ml$/, '');
+  const suiteName =
+    rawDocument.skill_name ?? suite.tests[0]?.suite ?? rawDocument.name ?? fileNameFallback;
+
+  const cases: NormalizedCase[] = [];
+  const warnings: string[] = [];
+  for (const [position, test] of suite.tests.entries()) {
+    const targetsMetadata = test.targets ? { targets: test.targets } : {};
+    const normalized: NormalizedCase = {
+      id: String(test.id ?? `case-${position + 1}`),
+      criteria: test.criteria || undefined,
+      input: test.input as readonly AgentVMessage[],
+      expectedOutput: normalizeExpectedOutput(test),
+      assertions: (test.assertions ?? []).map((assertion, assertionIndex) =>
+        normalizeAssertion(assertion, assertionIndex),
+      ),
+      metadata: { ...(test.metadata ?? {}), ...targetsMetadata },
+      sourcePath: relativePath,
+    };
+    cases.push(normalized);
+
+    if (normalized.input.length === 0) {
+      warnings.push(`${relativePath}: ${normalized.id} has no input`);
+    }
+  }
+
+  return {
+    name: String(suiteName),
+    description:
+      typeof rawDocument.description === 'string' ? rawDocument.description : undefined,
+    source,
+    cases,
+    suiteAssertions: [],
+    warnings,
+    unsupportedFeatures: collectUnsupported(rawDocument, suite),
+  };
+}
diff --git a/packages/phoenix-adapter/src/agentv/path.ts b/packages/phoenix-adapter/src/agentv/path.ts
@@ -0,0 +1,22 @@
+import { existsSync } from 'node:fs';
+import path from 'node:path';
+
+/**
+ * Resolve the AgentV checkout root to an absolute path.
+ *
+ * Precedence: the explicit `input`, then the `AGENTV_ROOT` environment variable, then a
+ * default found by probing relative to the current working directory. Empty or
+ * whitespace-only values are treated as unset, so a blank `AGENTV_ROOT` (for example an
+ * unset CI variable that expands to `""`) falls through to the next option instead of
+ * silently resolving to the working directory itself.
+ *
+ * @param input - Optional root path, absolute or relative to the current working directory.
+ * @returns The chosen root, passed through `path.resolve`.
+ */
+export function resolveAgentVRoot(input?: string): string {
+  const configured = [input, process.env.AGENTV_ROOT].find(
+    (value) => value !== undefined && value.trim() !== '',
+  );
+  return path.resolve(configured ?? defaultAgentVRoot());
+}
+
+/**
+ * Pick the default root by probing for a nearby `agentv` checkout.
+ *
+ * Returns the first of `../agentv` and `../../agentv` (resolved against the current
+ * working directory) that contains an `examples` entry, or `../agentv` when neither does.
+ */
+function defaultAgentVRoot(): string {
+  for (const candidate of ['../agentv', '../../agentv']) {
+    if (existsSync(path.resolve(candidate, 'examples'))) return candidate;
+  }
+  return '../agentv';
+}
+
+/**
+ * Rewrite a path that uses the platform separator (`path.sep`) so that it uses `/`.
+ *
+ * @param value - Path written with the current platform's separator.
+ * @returns The same path with every `path.sep` replaced by `/`.
+ */
+export function toPosixPath(value: string): string {
+  // When the platform separator is already `/` the split/join round trip is an identity.
+  return path.sep === '/' ? value : value.split(path.sep).join('/');
+}
+
+/**
+ * Compute the path from `from` to `to` and express it with `/` separators.
+ *
+ * @param from - Base path the result is relative to.
+ * @param to - Target path.
+ * @returns `path.relative(from, to)` converted by `toPosixPath`.
+ */
+export function relativePosix(from: string, to: string): string {
+  const nativeRelative = path.relative(from, to);
+  return toPosixPath(nativeRelative);
+}
diff --git a/packages/phoenix-adapter/src/agentv/types.ts b/packages/phoenix-adapter/src/agentv/types.ts
@@ -0,0 +1,40 @@
+/** Loosely typed object with string keys; values must be narrowed before use. */
+export type JsonObject = Record<string, unknown>;
+
+/**
+ * Format of a discovered eval source. Discovery assigns `agent-skills-json` to files
+ * named `evals.json` and `eval-yaml` to every other discovered file.
+ */
+export type AgentVSourceKind = 'eval-yaml' | 'agent-skills-json';
+
+/** A single eval file that the adapter can load. */
+export interface AgentVSource {
+  /** Filesystem path of the eval file; used for the existence check and passed to the loader. */
+  readonly path: string;
+  /** Path of the file relative to the AgentV root, written with `/` separators. */
+  readonly relativePath: string;
+  /** File format, as decided during discovery. */
+  readonly kind: AgentVSourceKind;
+}
+
+/** One message of a case's input, as exposed by the loaded test. */
+export interface AgentVMessage {
+  /** Role label of the message. */
+  readonly role: string;
+  /** Message payload; left untyped, so consumers must narrow it. */
+  readonly content: unknown;
+}
+
+/** An assertion reduced to the fields the adapter needs for routing and reporting. */
+export interface NormalizedAssertion {
+  /** The assertion's `name`, present only when the source value declares it as a string. */
+  readonly name?: string;
+  /**
+   * Assertion family. `rubrics` for plain-string assertions; otherwise the source's
+   * `type`, then `name`, then a positional `assertion-N` fallback.
+   */
+  readonly type: string;
+  /** The assertion value exactly as it was received, before normalization. */
+  readonly source: unknown;
+}
+
+/** One eval test projected into the adapter's normalized shape. */
+export interface NormalizedCase {
+  /** Test ID as a string; the loader falls back to `case-N` (1-based) when the test has none. */
+  readonly id: string;
+  /** Test criteria; omitted when the loaded value is falsy (for example an empty string). */
+  readonly criteria?: string;
+  /** Input messages of the test. An empty list produces a suite warning. */
+  readonly input: readonly AgentVMessage[];
+  /**
+   * Reference answer or expected output of the test; `undefined` when the test has no
+   * expected output or it is an empty array.
+   */
+  readonly expectedOutput?: unknown;
+  /** Normalized assertions of the test, in their original order. */
+  readonly assertions: readonly NormalizedAssertion[];
+  /** The test's metadata, plus a `targets` entry when the test declares targets. */
+  readonly metadata: JsonObject;
+  /** `relativePath` of the source file the case was loaded from. */
+  readonly sourcePath: string;
+}
+
+/** A whole eval file projected into the adapter's normalized shape. */
+export interface NormalizedSuite {
+  /**
+   * Suite name, taken from the raw `skill_name`, the first loaded test's suite, the raw
+   * `name`, or the file name without a `.yaml`/`.yml` extension, in that order.
+   */
+  readonly name: string;
+  /** Raw `description` of the document, present only when it is a string. */
+  readonly description?: string;
+  /** The source this suite was loaded from. */
+  readonly source: AgentVSource;
+  /** Normalized cases, in the order the loader returned the tests. */
+  readonly cases: readonly NormalizedCase[];
+  /** Suite-level assertions. The loader currently always sets this to an empty list. */
+  readonly suiteAssertions: readonly NormalizedAssertion[];
+  /** Human-readable warnings; the loader adds one for each case with no input. */
+  readonly warnings: readonly string[];
+  /** Names of suite-level AgentV features that are reported as unsupported. */
+  readonly unsupportedFeatures: readonly string[];
+}