From 229fbbf88673c46d355e4a4ff7ef715f713e2989 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Tue, 2 Jun 2026 12:15:18 +0200 Subject: [PATCH 1/6] feat(phoenix): add AgentV eval adapter --- .github/workflows/validate.yml | 3 + bun.lock | 140 +++++++++++++- package.json | 15 +- packages/phoenix-adapter/.gitignore | 1 + packages/phoenix-adapter/README.md | 12 ++ .../phoenix-adapter/docs/e2e-verification.md | 50 +++++ .../phoenix-adapter/docs/support-matrix.md | 23 +++ packages/phoenix-adapter/package.json | 32 ++++ .../phoenix-adapter/src/agentv/discovery.ts | 39 ++++ .../phoenix-adapter/src/agentv/load-spec.ts | 119 ++++++++++++ packages/phoenix-adapter/src/agentv/path.ts | 22 +++ packages/phoenix-adapter/src/agentv/types.ts | 40 ++++ packages/phoenix-adapter/src/cli.ts | 67 +++++++ .../src/evaluators/deterministic.ts | 178 ++++++++++++++++++ .../src/evaluators/registry.ts | 134 +++++++++++++ .../phoenix-adapter/src/evaluators/types.ts | 59 ++++++ packages/phoenix-adapter/src/index.ts | 12 ++ .../phoenix-adapter/src/parity/baselines.ts | 27 +++ .../phoenix-adapter/src/parity/compare.ts | 74 ++++++++ packages/phoenix-adapter/src/parity/report.ts | 60 ++++++ packages/phoenix-adapter/src/parity/types.ts | 25 +++ .../phoenix-adapter/src/phoenix/datasets.ts | 29 +++ packages/phoenix-adapter/src/phoenix/names.ts | 12 ++ .../src/phoenix/run-experiment.ts | 174 +++++++++++++++++ packages/phoenix-adapter/src/phoenix/types.ts | 23 +++ packages/phoenix-adapter/src/run/options.ts | 9 + packages/phoenix-adapter/src/run/run-suite.ts | 73 +++++++ .../test/agentv-normalize.test.ts | 95 ++++++++++ .../test/evaluators/deterministic.test.ts | 96 ++++++++++ .../test/evaluators/registry.test.ts | 69 +++++++ packages/phoenix-adapter/test/parity.test.ts | 41 ++++ .../test/phoenix-datasets.test.ts | 35 ++++ packages/phoenix-adapter/tsconfig.json | 10 + packages/phoenix-adapter/tsup.config.ts | 12 ++ 34 files changed, 1800 insertions(+), 10 deletions(-) create mode 100644 
packages/phoenix-adapter/.gitignore create mode 100644 packages/phoenix-adapter/README.md create mode 100644 packages/phoenix-adapter/docs/e2e-verification.md create mode 100644 packages/phoenix-adapter/docs/support-matrix.md create mode 100644 packages/phoenix-adapter/package.json create mode 100644 packages/phoenix-adapter/src/agentv/discovery.ts create mode 100644 packages/phoenix-adapter/src/agentv/load-spec.ts create mode 100644 packages/phoenix-adapter/src/agentv/path.ts create mode 100644 packages/phoenix-adapter/src/agentv/types.ts create mode 100644 packages/phoenix-adapter/src/cli.ts create mode 100644 packages/phoenix-adapter/src/evaluators/deterministic.ts create mode 100644 packages/phoenix-adapter/src/evaluators/registry.ts create mode 100644 packages/phoenix-adapter/src/evaluators/types.ts create mode 100644 packages/phoenix-adapter/src/index.ts create mode 100644 packages/phoenix-adapter/src/parity/baselines.ts create mode 100644 packages/phoenix-adapter/src/parity/compare.ts create mode 100644 packages/phoenix-adapter/src/parity/report.ts create mode 100644 packages/phoenix-adapter/src/parity/types.ts create mode 100644 packages/phoenix-adapter/src/phoenix/datasets.ts create mode 100644 packages/phoenix-adapter/src/phoenix/names.ts create mode 100644 packages/phoenix-adapter/src/phoenix/run-experiment.ts create mode 100644 packages/phoenix-adapter/src/phoenix/types.ts create mode 100644 packages/phoenix-adapter/src/run/options.ts create mode 100644 packages/phoenix-adapter/src/run/run-suite.ts create mode 100644 packages/phoenix-adapter/test/agentv-normalize.test.ts create mode 100644 packages/phoenix-adapter/test/evaluators/deterministic.test.ts create mode 100644 packages/phoenix-adapter/test/evaluators/registry.test.ts create mode 100644 packages/phoenix-adapter/test/parity.test.ts create mode 100644 packages/phoenix-adapter/test/phoenix-datasets.test.ts create mode 100644 packages/phoenix-adapter/tsconfig.json create mode 100644 
packages/phoenix-adapter/tsup.config.ts diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml index 8f62d5a8a..9ae0fe5e3 100644 --- a/.github/workflows/validate.yml +++ b/.github/workflows/validate.yml @@ -51,5 +51,8 @@ jobs: - name: Check evals directories have eval files run: bun scripts/validate-eval-dirs.ts + - name: Run Phoenix adapter dry-run smoke + run: bun run phoenix:assert-smoke + - name: Validate eval schemas run: bun apps/cli/dist/cli.js validate 'examples/features/**/evals/**/*.eval.yaml' 'examples/features/**/*.EVAL.yaml' diff --git a/bun.lock b/bun.lock index 9a723e1c0..366959007 100644 --- a/bun.lock +++ b/bun.lock @@ -7,6 +7,7 @@ "devDependencies": { "@agentv/core": "workspace:*", "@agentv/eval": "workspace:*", + "@agentv/phoenix-adapter": "workspace:*", "@biomejs/biome": "^1.9.4", "@j178/prek": "^0.3.0", "@types/bun": "latest", @@ -20,7 +21,7 @@ }, "apps/cli": { "name": "agentv", - "version": "4.27.0", + "version": "4.31.4-next.1", "bin": { "agentv": "./dist/cli.js", }, @@ -84,7 +85,7 @@ }, "packages/core": { "name": "@agentv/core", - "version": "4.27.0", + "version": "4.31.4-next.1", "dependencies": { "@agentclientprotocol/sdk": "^0.14.1", "@agentv/eval": "workspace:*", @@ -120,27 +121,64 @@ }, "packages/eval": { "name": "@agentv/eval", - "version": "4.27.0", + "version": "4.31.4-next.1", "dependencies": { "zod": "^3.23.8", }, }, + "packages/phoenix-adapter": { + "name": "@agentv/phoenix-adapter", + "version": "4.31.4-next.1", + "dependencies": { + "@agentv/core": "workspace:*", + "@arizeai/phoenix-client": "6.10.0", + "@arizeai/phoenix-evals": "1.0.3", + }, + "devDependencies": { + "tsup": "8.3.5", + "typescript": "5.8.3", + }, + }, }, "packages": { "@agentclientprotocol/sdk": ["@agentclientprotocol/sdk@0.14.1", "", { "peerDependencies": { "zod": "^3.25.0 || ^4.0.0" } }, "sha512-b6r3PS3Nly+Wyw9U+0nOr47bV8tfS476EgyEMhoKvJCZLbgqoDFN7DJwkxL88RR0aiOqOYV1ZnESHqb+RmdH8w=="], "@agentv/core": 
["@agentv/core@workspace:packages/core"], + "@agentv/dashboard": ["@agentv/dashboard@workspace:apps/dashboard"], + "@agentv/eval": ["@agentv/eval@workspace:packages/eval"], - "@agentv/dashboard": ["@agentv/dashboard@workspace:apps/dashboard"], + "@agentv/phoenix-adapter": ["@agentv/phoenix-adapter@workspace:packages/phoenix-adapter"], "@agentv/web": ["@agentv/web@workspace:apps/web"], + "@ai-sdk/gateway": ["@ai-sdk/gateway@3.0.122", "", { "dependencies": { "@ai-sdk/provider": "3.0.10", "@ai-sdk/provider-utils": "4.0.27", "@vercel/oidc": "3.2.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-U1k2fk7cSH/tS5CZ3ujROiUCOLFwkzb792OqR/Org8Mfm27dKSIdRZG4ZuJUifT8alUWa61IoaRu4foXKlP5TQ=="], + + "@ai-sdk/provider": ["@ai-sdk/provider@3.0.10", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-Q3BZ27qfpYqnCYGvE3vt+Qi6LGOF9R5Nmzn+9JoM1lCRsD9mYaIhfJLkSunN48nfGXJ6n+XNV0J/XVpqGQl7Dw=="], + + "@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@4.0.27", "", { "dependencies": { "@ai-sdk/provider": "3.0.10", "@standard-schema/spec": "^1.1.0", "eventsource-parser": "^3.0.8" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-ubkAJ+xODouwtmN1tYlvTPphH1hPOBfZaEQe8U7skGvFAnIRs9PPpsq57bC2+Ky/MB4yzhd6YOsxTAx9sGpazw=="], + "@anthropic-ai/claude-agent-sdk": ["@anthropic-ai/claude-agent-sdk@0.2.49", "", { "optionalDependencies": { "@img/sharp-darwin-arm64": "^0.34.2", "@img/sharp-darwin-x64": "^0.34.2", "@img/sharp-linux-arm": "^0.34.2", "@img/sharp-linux-arm64": "^0.34.2", "@img/sharp-linux-x64": "^0.34.2", "@img/sharp-linuxmusl-arm64": "^0.34.2", "@img/sharp-linuxmusl-x64": "^0.34.2", "@img/sharp-win32-arm64": "^0.34.2", "@img/sharp-win32-x64": "^0.34.2" }, "peerDependencies": { "zod": "^4.0.0" } }, "sha512-3avi409dwuGkPEETpWa0gyJvRMr3b6LxeuW5/sAPCOtLD9WxH9fYltbA5wZoazxTw5mlbXmjDp7JqO1rlmpaIQ=="], "@anthropic-ai/sdk": ["@anthropic-ai/sdk@0.91.1", "", { "dependencies": { "json-schema-to-ts": "^3.1.1" }, "peerDependencies": { "zod": "^3.25.0 || ^4.0.0" 
}, "optionalPeers": ["zod"], "bin": { "anthropic-ai-sdk": "bin/cli" } }, "sha512-LAmu761tSN9r66ixvmciswUj/ZC+1Q4iAfpedTfSVLeswRwnY3n2Nb6Tsk+cLPP28aLOPWeMgIuTuCcMC6W/iw=="], + "@arizeai/openinference-core": ["@arizeai/openinference-core@2.2.0", "", { "dependencies": { "@arizeai/openinference-semantic-conventions": "2.5.0", "@opentelemetry/api": "^1.9.0", "@opentelemetry/core": "^1.25.1" } }, "sha512-Ix1u/nphZj1yHqmyIfeBe2AVfnilTwgtvfXemJxc/6F+4JC7Rks6VMlPCfB8NXvMOhop2IveA6EyxYMkv/PH/A=="], + + "@arizeai/openinference-genai": ["@arizeai/openinference-genai@0.1.10", "", { "dependencies": { "@arizeai/openinference-semantic-conventions": "2.5.0" }, "peerDependencies": { "@opentelemetry/api": ">=1.9.0", "@opentelemetry/semantic-conventions": ">=1.37.0" } }, "sha512-BrkTeZm57FXHZ82C50KU79NJfz/jQseW/4sTdz5IvAbZPBnwEbwua/jspckD1b3vdTS+BGEwkXWTx0fkUilb7A=="], + + "@arizeai/openinference-semantic-conventions": ["@arizeai/openinference-semantic-conventions@2.5.0", "", {}, "sha512-4ZeSwiFX3YxB0WSE6x568wM4PVHiYmz3yiOxic6WGKVrE/KIGggMFP/eqUNQhikBKP68IDV0qiILlZAIYnheAQ=="], + + "@arizeai/openinference-vercel": ["@arizeai/openinference-vercel@2.7.7", "", { "dependencies": { "@arizeai/openinference-core": "2.2.0", "@arizeai/openinference-genai": "0.1.10", "@arizeai/openinference-semantic-conventions": "2.5.0", "@opentelemetry/core": "^1.30.1" }, "peerDependencies": { "@opentelemetry/api": ">=1.7.0 <2.0.0" } }, "sha512-iShJM8C+R959Ei9sUt8/2BE301W5Px06nYKY+7tCMbm7M0MjRPibrjLyZi6n+Hnd8U3zBE4vYXzAIdE0Z6ZTiQ=="], + + "@arizeai/phoenix-client": ["@arizeai/phoenix-client@6.10.0", "", { "dependencies": { "@arizeai/openinference-semantic-conventions": "^2.1.7", "@arizeai/openinference-vercel": "^2.7.0", "@arizeai/phoenix-config": "0.1.4", "@arizeai/phoenix-otel": "1.0.2", "async": "^3.2.6", "openapi-fetch": "^0.17.0", "tiny-invariant": "^1.3.3", "zod": "^4.0.14" }, "peerDependencies": { "@anthropic-ai/sdk": "^0.35.0", "ai": "^6.0.90", "openai": "^6.10.0" }, "optionalPeers": 
["@anthropic-ai/sdk", "ai", "openai"] }, "sha512-rKvvHyhBGT5Tksckt3VhDGdPiRoaG/GCgzS64IVCPLHLdQhEHoRd88MJyud8tHgwwQ4/XyJ/4cT43z26CNCPVg=="], + + "@arizeai/phoenix-config": ["@arizeai/phoenix-config@0.1.4", "", {}, "sha512-GBgPCQWW2GIHqsV067Uqc2YLCapQTHWX2wuYQYILAos6m39+sDX4hunP4qUqBixbF8tR5zdybCO8iGRV+tEcBg=="], + + "@arizeai/phoenix-evals": ["@arizeai/phoenix-evals@1.0.3", "", { "dependencies": { "@arizeai/openinference-core": "^2.0.0", "@opentelemetry/api": "^1.9.0", "ai": "^6.0.90", "jsonpath-plus": "^10.3.0", "mustache": "^4.2.0", "zod": "^4.0.14" } }, "sha512-D4u8UVhbWkPXgtUZVlTcBdlLyowPt7yIjebqDxtglnoBnKx6u3pM+8veC3AROQCc29h1HiKzyiMFwqmQMFFNLg=="], + + "@arizeai/phoenix-otel": ["@arizeai/phoenix-otel@1.0.2", "", { "dependencies": { "@arizeai/openinference-core": "^2.0.7", "@arizeai/openinference-semantic-conventions": "^2.1.7", "@arizeai/openinference-vercel": "^2.7.0", "@opentelemetry/api": "^1.9.0", "@opentelemetry/context-async-hooks": "^2.5.1", "@opentelemetry/core": "^1.25.1", "@opentelemetry/exporter-trace-otlp-proto": "^0.205.0", "@opentelemetry/instrumentation": "^0.57.2", "@opentelemetry/resources": "^2.0.0", "@opentelemetry/sdk-trace-base": "^2.5.1", "@opentelemetry/sdk-trace-node": "^2.5.1" } }, "sha512-lYUQN1buHJM+ZGSO9uEKuoihiOqjYcOIxfm1IzqyZ4Fk6o6YKGyEKYt6fgwzoaWr82x+mn9oPUFay7Ff8m0gAw=="], + "@astrojs/compiler": ["@astrojs/compiler@2.13.0", "", {}, "sha512-mqVORhUJViA28fwHYaWmsXSzLO9osbdZ5ImUfxBarqsYdMlPbqAqGJCxsNzvppp1BEzc1mJNjOVvQqeDN8Vspw=="], "@astrojs/internal-helpers": ["@astrojs/internal-helpers@0.7.5", "", {}, "sha512-vreGnYSSKhAjFJCWAwe/CNhONvoc5lokxtRoZims+0wa3KbHBdPHSSthJsKxPd8d/aic6lWKpRTYGY/hsgK6EA=="], @@ -477,6 +515,10 @@ "@jridgewell/trace-mapping": ["@jridgewell/trace-mapping@0.3.31", "", { "dependencies": { "@jridgewell/resolve-uri": "^3.1.0", "@jridgewell/sourcemap-codec": "^1.4.14" } }, "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw=="], + "@jsep-plugin/assignment": 
["@jsep-plugin/assignment@1.3.0", "", { "peerDependencies": { "jsep": "^0.4.0||^1.0.0" } }, "sha512-VVgV+CXrhbMI3aSusQyclHkenWSAm95WaiKrMxRFam3JSUiIaQjoMIw2sEs/OX4XifnqeQUN4DYbJjlA8EfktQ=="], + + "@jsep-plugin/regex": ["@jsep-plugin/regex@1.0.4", "", { "peerDependencies": { "jsep": "^0.4.0||^1.0.0" } }, "sha512-q7qL4Mgjs1vByCaTnDFcBnV9HS7GVPJX5vyVoCgZHNSC9rjwIlmbXG5sUuorR5ndfHAIlJ8pVStxvjXHbNvtUg=="], + "@mdx-js/mdx": ["@mdx-js/mdx@3.1.1", "", { "dependencies": { "@types/estree": "^1.0.0", "@types/estree-jsx": "^1.0.0", "@types/hast": "^3.0.0", "@types/mdx": "^2.0.0", "acorn": "^8.0.0", "collapse-white-space": "^2.0.0", "devlop": "^1.0.0", "estree-util-is-identifier-name": "^3.0.0", "estree-util-scope": "^1.0.0", "estree-walker": "^3.0.0", "hast-util-to-jsx-runtime": "^2.0.0", "markdown-extensions": "^2.0.0", "recma-build-jsx": "^1.0.0", "recma-jsx": "^1.0.0", "recma-stringify": "^1.0.0", "rehype-recma": "^1.0.0", "remark-mdx": "^3.0.0", "remark-parse": "^11.0.0", "remark-rehype": "^11.0.0", "source-map": "^0.7.0", "unified": "^11.0.0", "unist-util-position-from-estree": "^2.0.0", "unist-util-stringify-position": "^4.0.0", "unist-util-visit": "^5.0.0", "vfile": "^6.0.0" } }, "sha512-f6ZO2ifpwAQIpzGWaBQT2TXxPv6z3RBzQKpVftEWN78Vl/YweF1uwussDx8ECAXVtr3Rs89fKyG9YlzUs9DyGQ=="], "@mistralai/mistralai": ["@mistralai/mistralai@2.2.1", "", { "dependencies": { "ws": "^8.18.0", "zod": "^3.25.0 || ^4.0.0", "zod-to-json-schema": "^3.25.0" } }, "sha512-uKU8CZmL2RzYKmplsU01hii4p3pe4HqJefpWNRWXm1Tcm0Sm4xXfwSLIy4k7ZCPlbETCGcp69E7hZs+WOJ5itQ=="], @@ -519,6 +561,10 @@ "@opentelemetry/exporter-trace-otlp-http": ["@opentelemetry/exporter-trace-otlp-http@0.212.0", "", { "dependencies": { "@opentelemetry/core": "2.5.1", "@opentelemetry/otlp-exporter-base": "0.212.0", "@opentelemetry/otlp-transformer": "0.212.0", "@opentelemetry/resources": "2.5.1", "@opentelemetry/sdk-trace-base": "2.5.1" }, "peerDependencies": { "@opentelemetry/api": "^1.3.0" } }, 
"sha512-v/0wMozNoiEPRolzC4YoPo4rAT0q8r7aqdnRw3Nu7IDN0CGFzNQazkfAlBJ6N5y0FYJkban7Aw5WnN73//6YlA=="], + "@opentelemetry/exporter-trace-otlp-proto": ["@opentelemetry/exporter-trace-otlp-proto@0.205.0", "", { "dependencies": { "@opentelemetry/core": "2.1.0", "@opentelemetry/otlp-exporter-base": "0.205.0", "@opentelemetry/otlp-transformer": "0.205.0", "@opentelemetry/resources": "2.1.0", "@opentelemetry/sdk-trace-base": "2.1.0" }, "peerDependencies": { "@opentelemetry/api": "^1.3.0" } }, "sha512-bGtFzqiENO2GpJk988mOBMe0MfeNpTQjbLm/LBijas6VRyEDQarUzdBHpFlu89A25k1+BCntdWGsWTa9Ai4FyA=="], + + "@opentelemetry/instrumentation": ["@opentelemetry/instrumentation@0.57.2", "", { "dependencies": { "@opentelemetry/api-logs": "0.57.2", "@types/shimmer": "^1.2.0", "import-in-the-middle": "^1.8.1", "require-in-the-middle": "^7.1.1", "semver": "^7.5.2", "shimmer": "^1.2.1" }, "peerDependencies": { "@opentelemetry/api": "^1.3.0" } }, "sha512-BdBGhQBh8IjZ2oIIX6F2/Q3LKm/FDDKi6ccYKcBTeilh6SNdNKveDOLk73BkSJjQLJk6qe4Yh+hHw1UPhCDdrg=="], + "@opentelemetry/otlp-exporter-base": ["@opentelemetry/otlp-exporter-base@0.212.0", "", { "dependencies": { "@opentelemetry/core": "2.5.1", "@opentelemetry/otlp-transformer": "0.212.0" }, "peerDependencies": { "@opentelemetry/api": "^1.3.0" } }, "sha512-HoMv5pQlzbuxiMS0hN7oiUtg8RsJR5T7EhZccumIWxYfNo/f4wFc7LPDfFK6oHdG2JF/+qTocfqIHoom+7kLpw=="], "@opentelemetry/otlp-transformer": ["@opentelemetry/otlp-transformer@0.212.0", "", { "dependencies": { "@opentelemetry/api-logs": "0.212.0", "@opentelemetry/core": "2.5.1", "@opentelemetry/resources": "2.5.1", "@opentelemetry/sdk-logs": "0.212.0", "@opentelemetry/sdk-metrics": "2.5.1", "@opentelemetry/sdk-trace-base": "2.5.1", "protobufjs": "8.0.0" }, "peerDependencies": { "@opentelemetry/api": "^1.3.0" } }, "sha512-bj7zYFOg6Db7NUwsRZQ/WoVXpAf41WY2gsd3kShSfdpZQDRKHWJiRZIg7A8HvWsf97wb05rMFzPbmSHyjEl9tw=="], @@ -847,6 +893,8 @@ "@types/semver": ["@types/semver@7.7.1", "", {}, 
"sha512-FmgJfu+MOcQ370SD0ev7EI8TlCAfKYU+B4m5T3yXc1CiRN94g/SZPtsCkk506aUDtlMnFZvasDwHHUcZUEaYuA=="], + "@types/shimmer": ["@types/shimmer@1.2.0", "", {}, "sha512-UE7oxhQLLd9gub6JKIAhDq06T0F6FnztwMNRvYgjeQSBeMc1ZG/tA47EwfduvkuQS8apbkM/lpLpWsaCeYsXVg=="], + "@types/trusted-types": ["@types/trusted-types@2.0.7", "", {}, "sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw=="], "@types/unist": ["@types/unist@3.0.3", "", {}, "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q=="], @@ -855,16 +903,22 @@ "@ungap/structured-clone": ["@ungap/structured-clone@1.3.0", "", {}, "sha512-WmoN8qaIAo7WTYWbAZuG8PYEhn5fkz7dZrqTBZ7dtt//lL2Gwms1IcnQ5yHqjDfX8Ft5j4YzDM23f87zBfDe9g=="], + "@vercel/oidc": ["@vercel/oidc@3.2.0", "", {}, "sha512-UycprH3T6n3jH0k44NHMa7pnFHGu/N05MjojYr+Mc6I7obkoLIJujSWwin1pCvdy/eOxrI/l3uDLQsmcrOb4ug=="], + "@vitejs/plugin-react": ["@vitejs/plugin-react@4.7.0", "", { "dependencies": { "@babel/core": "^7.28.0", "@babel/plugin-transform-react-jsx-self": "^7.27.1", "@babel/plugin-transform-react-jsx-source": "^7.27.1", "@rolldown/pluginutils": "1.0.0-beta.27", "@types/babel__core": "^7.20.5", "react-refresh": "^0.17.0" }, "peerDependencies": { "vite": "^4.2.0 || ^5.0.0 || ^6.0.0 || ^7.0.0" } }, "sha512-gUu9hwfWvvEDBBmgtAowQCojwZmJ5mcLn3aufeCsitijs3+f2NsrPtlAWIR6OPiqljl96GVCUbLe0HyqIpVaoA=="], "acorn": ["acorn@8.15.0", "", { "bin": { "acorn": "bin/acorn" } }, "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg=="], + "acorn-import-attributes": ["acorn-import-attributes@1.9.5", "", { "peerDependencies": { "acorn": "^8" } }, "sha512-n02Vykv5uA3eHGM/Z2dQrcD56kL8TyDb2p1+0P83PClMnC/nc+anbQRhIOWnSq4Ke/KvDPrY3C9hDtC/A3eHnQ=="], + "acorn-jsx": ["acorn-jsx@5.3.2", "", { "peerDependencies": { "acorn": "^6.0.0 || ^7.0.0 || ^8.0.0" } }, "sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ=="], "agent-base": 
["agent-base@7.1.4", "", {}, "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ=="], "agentv": ["agentv@workspace:apps/cli"], + "ai": ["ai@6.0.194", "", { "dependencies": { "@ai-sdk/gateway": "3.0.122", "@ai-sdk/provider": "3.0.10", "@ai-sdk/provider-utils": "4.0.27", "@opentelemetry/api": "^1.9.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-0MkYqrSZZuC1zTECppcaUT0i54aocXpYaUMVue3V8z/weBHCytfO5/CcwZCU80msZpfkbBUKYSSrkZFotEO5wQ=="], + "ansi-align": ["ansi-align@3.0.1", "", { "dependencies": { "string-width": "^4.1.0" } }, "sha512-IOfwwBF5iczOjp/WeY4YxyjqAFMQoZufdQWDd19SEExbVLNXqvpzSJ/M7Za4/sCPmQ0+GRquoA7bGcINcxew6w=="], "ansi-regex": ["ansi-regex@6.2.2", "", {}, "sha512-Bq3SmSpyFHaWjPk8If9yc6svM8c56dB5BAtW4Qbw5jHTwwXXcTLoRMkpDJp6VL0XzlWaCHTXrkFURMYmD0sLqg=="], @@ -893,6 +947,8 @@ "astro-expressive-code": ["astro-expressive-code@0.41.6", "", { "dependencies": { "rehype-expressive-code": "^0.41.6" }, "peerDependencies": { "astro": "^4.0.0-beta || ^5.0.0-beta || ^3.3.0 || ^6.0.0-beta" } }, "sha512-l47tb1uhmVIebHUkw+HEPtU/av0G4O8Q34g2cbkPvC7/e9ZhANcjUUciKt9Hp6gSVDdIuXBBLwJQn2LkeGMOAw=="], + "async": ["async@3.2.6", "", {}, "sha512-htCUDlxyyCLMgaM3xXg0C0LW2xqfuQ6p05pCEIsXuyQ+a1koYKTuBMzRNwmybfLgvJDMd0r1LTn4+E0Ti6C2AA=="], + "async-mutex": ["async-mutex@0.5.0", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-1A94B18jkJ3DYq284ohPxoXbfTA5HsQ7/Mf4DEhcyLx3Bz27Rh59iScbB6EPiP+B+joue6YCxcMXSbFC1tZKwA=="], "asynckit": ["asynckit@0.4.0", "", {}, "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="], @@ -965,6 +1021,8 @@ "ci-info": ["ci-info@4.4.0", "", {}, "sha512-77PSwercCZU2Fc4sX94eF8k8Pxte6JAwL4/ICZLFjJLqegs7kCuAsqqj/70NQF6TvDpgFjkubQB2FW2ZZddvQg=="], + "cjs-module-lexer": ["cjs-module-lexer@1.4.3", "", {}, "sha512-9z8TZaGM1pfswYeXrUpzPrkx8UnWYdhJclsiYMm6x/w5+nN+8Tf/LnAgfLGQCm59qAOxU8WwHEq2vNwF6i4j+Q=="], + "cli-boxes": ["cli-boxes@3.0.0", "", {}, 
"sha512-/lzGpEWL/8PfI0BmBOPRwp0c/wFNX1RdUML3jK/RcSBA9T8mZDdQpqYBKtCFTOfQbwPqWEOpjqW+Fnayc0969g=="], "cli-width": ["cli-width@4.1.0", "", {}, "sha512-ouuZd4/dm2Sw5Gmqy6bGyNNNe1qt9RpmxveLSO7KcgsTnU7RXfsw+/bukWGo1abgBiMAic068rclZsO4IWmmxQ=="], @@ -1145,6 +1203,8 @@ "eventemitter3": ["eventemitter3@5.0.4", "", {}, "sha512-mlsTRyGaPBjPedk6Bvw+aqbsXDtoAyAzm5MO7JgU+yVRyMQ5O8bD4Kcci7BS85f93veegeCPkL8R4GLClnjLFw=="], + "eventsource-parser": ["eventsource-parser@3.1.0", "", {}, "sha512-kJezFj9YFAMLeORyi7aCLxLbD5/qWMQnoMVlVPyHIll7lgRJCc3JVln9Vgl9nwQi0YkMnhdGTMNn7CkRRAptMg=="], + "execa": ["execa@9.6.1", "", { "dependencies": { "@sindresorhus/merge-streams": "^4.0.0", "cross-spawn": "^7.0.6", "figures": "^6.1.0", "get-stream": "^9.0.0", "human-signals": "^8.0.1", "is-plain-obj": "^4.1.0", "is-stream": "^4.0.1", "npm-run-path": "^6.0.0", "pretty-ms": "^9.2.0", "signal-exit": "^4.1.0", "strip-final-newline": "^4.0.0", "yoctocolors": "^2.1.1" } }, "sha512-9Be3ZoN4LmYR90tUoVu2te2BsbzHfhJyfEiAVfz7N5/zv+jduIfLrV2xdQXOHbaD6KgpGdO9PRPM1Y4Q9QkPkA=="], "expressive-code": ["expressive-code@0.41.6", "", { "dependencies": { "@expressive-code/core": "^0.41.6", "@expressive-code/plugin-frames": "^0.41.6", "@expressive-code/plugin-shiki": "^0.41.6", "@expressive-code/plugin-text-markers": "^0.41.6" } }, "sha512-W/5+IQbrpCIM5KGLjO35wlp1NCwDOOVQb+PAvzEoGkW1xjGM807ZGfBKptNWH6UECvt6qgmLyWolCMYKh7eQmA=="], @@ -1291,6 +1351,8 @@ "immer": ["immer@10.2.0", "", {}, "sha512-d/+XTN3zfODyjr89gM3mPq1WNX2B8pYsu7eORitdwyA2sBubnTl3laYlBk4sXY5FUa5qTZGBDPJICVbvqzjlbw=="], + "import-in-the-middle": ["import-in-the-middle@1.15.0", "", { "dependencies": { "acorn": "^8.14.0", "acorn-import-attributes": "^1.9.5", "cjs-module-lexer": "^1.2.2", "module-details-from-path": "^1.0.3" } }, "sha512-bpQy+CrsRmYmoPMAE/0G33iwRqwW4ouqdRg8jgbH3aKuCtOc8lxgmYXg2dMM92CRiGP660EtBcymH/eVUpCSaA=="], + "import-meta-resolve": ["import-meta-resolve@4.2.0", "", {}, 
"sha512-Iqv2fzaTQN28s/FwZAoFq0ZSs/7hMAHJVX+w8PZl3cY19Pxk6jFFalxQoIfW2826i/fDLXv8IiEZRIT0lDuWcg=="], "inline-style-parser": ["inline-style-parser@0.2.7", "", {}, "sha512-Nb2ctOyNR8DqQoR0OwRG95uNWIC0C1lCgf5Naz5H6Ji72KZ8OcFZLz2P5sNgwlyoJ8Yif11oMuYs5pBQa86csA=="], @@ -1307,6 +1369,8 @@ "is-binary-path": ["is-binary-path@2.1.0", "", { "dependencies": { "binary-extensions": "^2.0.0" } }, "sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw=="], + "is-core-module": ["is-core-module@2.16.2", "", { "dependencies": { "hasown": "^2.0.3" } }, "sha512-evOr8xfXKxE6qSR0hSXL2r3sd7ALj8+7jQEUvPYcm5sgZFdJ+AYzT6yNmJenvIYQBgIGwfwz08sL8zoL7yq2BA=="], + "is-decimal": ["is-decimal@2.0.1", "", {}, "sha512-AAB9hiomQs5DXWcRB1rqsxGUstbRroFOPPVAomNk/3XHR5JyEZChOyTWe2oayKnsSsr/kcGqF+z6yuH6HHpN0A=="], "is-docker": ["is-docker@3.0.0", "", { "bin": { "is-docker": "cli.js" } }, "sha512-eljcgEDlEns/7AXFosB5K/2nCM4P7FQPkGc/DWLy5rmFEWvZayGrik1d9/QIY5nJ4f9YsVvBkA6kJpHn9rISdQ=="], @@ -1343,14 +1407,20 @@ "js-yaml": ["js-yaml@4.1.1", "", { "dependencies": { "argparse": "^2.0.1" }, "bin": { "js-yaml": "bin/js-yaml.js" } }, "sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA=="], + "jsep": ["jsep@1.4.0", "", {}, "sha512-B7qPcEVE3NVkmSJbaYxvv4cHkVW7DQsZz13pUMrfS8z8Q/BuShN+gcTXrUlPiGqM2/t/EEaI030bpxMqY8gMlw=="], + "jsesc": ["jsesc@3.1.0", "", { "bin": { "jsesc": "bin/jsesc" } }, "sha512-/sM3dO2FOzXjKQhJuo0Q173wf2KOo8t4I8vHy6lF9poUp7bKT0/NHE8fPX23PwfhnykfqnC2xRxOnVw5XuGIaA=="], "json-bigint": ["json-bigint@1.0.0", "", { "dependencies": { "bignumber.js": "^9.0.0" } }, "sha512-SiPv/8VpZuWbvLSMtTDU8hEfrZWg/mH/nV/b4o0CYbSxu1UIQPLdwKOCIyLQX+VIPO5vrLX3i8qtqFyhdPSUSQ=="], + "json-schema": ["json-schema@0.4.0", "", {}, "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA=="], + "json-schema-to-ts": ["json-schema-to-ts@3.1.1", "", { "dependencies": { "@babel/runtime": "^7.18.3", "ts-algebra": 
"^2.0.0" } }, "sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g=="], "json5": ["json5@2.2.3", "", { "bin": { "json5": "lib/cli.js" } }, "sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg=="], + "jsonpath-plus": ["jsonpath-plus@10.4.0", "", { "dependencies": { "@jsep-plugin/assignment": "^1.3.0", "@jsep-plugin/regex": "^1.0.4", "jsep": "^1.4.0" }, "bin": { "jsonpath": "bin/jsonpath-cli.js", "jsonpath-plus": "bin/jsonpath-cli.js" } }, "sha512-T92WWatJXmhBbKsgH/0hl+jxjdXrifi5IKeMY02DWggRxX0UElcbVzPlmgLTbvsPeW1PasQ6xE2Q75stkhGbsA=="], + "jwa": ["jwa@2.0.1", "", { "dependencies": { "buffer-equal-constant-time": "^1.0.1", "ecdsa-sig-formatter": "1.0.11", "safe-buffer": "^5.0.1" } }, "sha512-hRF04fqJIP8Abbkq5NKGN0Bbr3JxlQ+qhZufXVr0DvujKy93ZCbXZMHDL4EOtodSbCWxOqR8MS1tXA5hwqCXDg=="], "jws": ["jws@4.0.1", "", { "dependencies": { "jwa": "^2.0.1", "safe-buffer": "^5.0.1" } }, "sha512-EKI/M/yqPncGUUh44xz0PxSidXFr/+r0pA70+gIYhjv+et7yxM+s29Y+VGDkovRofQem0fs7Uvf4+YmAdyRduA=="], @@ -1531,12 +1601,16 @@ "minipass": ["minipass@7.1.2", "", {}, "sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw=="], + "module-details-from-path": ["module-details-from-path@1.0.4", "", {}, "sha512-EGWKgxALGMgzvxYF1UyGTy0HXX/2vHLkw6+NvDKW2jypWbHpjQuj4UMcqQWXHERJhVGKikolT06G3bcKe4fi7w=="], + "monaco-editor": ["monaco-editor@0.55.1", "", { "dependencies": { "dompurify": "3.2.7", "marked": "14.0.0" } }, "sha512-jz4x+TJNFHwHtwuV9vA9rMujcZRb0CEilTEwG2rRSpe/A7Jdkuj8xPKttCgOh+v/lkHy7HsZ64oj+q3xoAFl9A=="], "mrmime": ["mrmime@2.0.1", "", {}, "sha512-Y3wQdFg2Va6etvQ5I82yUhGdsKrcYox6p7FfL1LbK2J4V01F9TGlepTIhnK24t7koZibmg82KGglhA1XK5IsLQ=="], "ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="], + "mustache": ["mustache@4.2.0", "", { "bin": { "mustache": "bin/mustache" } }, 
"sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ=="], + "mute-stream": ["mute-stream@3.0.0", "", {}, "sha512-dkEJPVvun4FryqBmZ5KhDo0K9iDXAwn08tMLDinNdRBNPcYEDiWYysLcc6k3mjTMlbP9KyylvRpd4wFtwrT9rw=="], "mz": ["mz@2.7.0", "", { "dependencies": { "any-promise": "^1.0.0", "object-assign": "^4.0.1", "thenify-all": "^1.0.0" } }, "sha512-z81GNO7nnYMEhrGh9LeymoE4+Yr0Wn5McHIZMK5cfQCl+NDX08sCZgUc9/6MHni9IWuFLm1Z3HTCXu2z9fN62Q=="], @@ -1577,6 +1651,10 @@ "openai": ["openai@6.26.0", "", { "peerDependencies": { "ws": "^8.18.0", "zod": "^3.25 || ^4.0" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-zd23dbWTjiJ6sSAX6s0HrCZi41JwTA1bQVs0wLQPZ2/5o2gxOJA5wh7yOAUgwYybfhDXyhwlpeQf7Mlgx8EOCA=="], + "openapi-fetch": ["openapi-fetch@0.17.0", "", { "dependencies": { "openapi-typescript-helpers": "^0.1.0" } }, "sha512-PsbZR1wAPcG91eEthKhN+Zn92FMHxv+/faECIwjXdxfTODGSGegYv0sc1Olz+HYPvKOuoXfp+0pA2XVt2cI0Ig=="], + + "openapi-typescript-helpers": ["openapi-typescript-helpers@0.1.0", "", {}, "sha512-OKTGPthhivLw/fHz6c3OPtg72vi86qaMlqbJuVJ23qOvQ+53uw1n7HdmkJFibloF7QEjDrDkzJiOJuockM/ljw=="], + "p-limit": ["p-limit@6.2.0", "", { "dependencies": { "yocto-queue": "^1.1.1" } }, "sha512-kuUqqHNUqoIWp/c467RI4X6mmyuojY5jGutNU0wVTmEOOfcuwLqyMVoAi9MKi2Ak+5i9+nhmrK4ufZE8069kHA=="], "p-queue": ["p-queue@8.1.1", "", { "dependencies": { "eventemitter3": "^5.0.1", "p-timeout": "^6.1.2" } }, "sha512-aNZ+VfjobsWryoiPnEApGGmf5WmNsCo9xu8dfaYamG5qaLP7ClhLN6NgsFe6SwJ2UbLEBK5dv9x8Mn5+RVhMWQ=="], @@ -1609,6 +1687,8 @@ "path-key": ["path-key@3.1.1", "", {}, "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q=="], + "path-parse": ["path-parse@1.0.7", "", {}, "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw=="], + "path-scurry": ["path-scurry@2.0.1", "", { "dependencies": { "lru-cache": "^11.0.0", "minipass": "^7.1.2" } }, 
"sha512-oWyT4gICAu+kaA7QWk/jvCHWarMKNs6pXOGWKDTr7cw4IGcUbW+PeTfbaQiLGheFRpjo6O9J0PmyMfQPjH71oA=="], "pathe": ["pathe@2.0.3", "", {}, "sha512-WUjGcAqP1gQacoQe+OBJsFA7Ld4DyXuUIjZ5cc75cLHvJ7dtNsTugphxIADwspS+AraAUePCKrSVtPLFj/F88w=="], @@ -1713,8 +1793,12 @@ "remark-stringify": ["remark-stringify@11.0.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "mdast-util-to-markdown": "^2.0.0", "unified": "^11.0.0" } }, "sha512-1OSmLd3awB/t8qdoEOMazZkNsfVTeY4fTsgzcQFdXNq8ToTN4ZGwrMnlda4K6smTFKD+GRV6O48i6Z4iKgPPpw=="], + "require-in-the-middle": ["require-in-the-middle@7.5.2", "", { "dependencies": { "debug": "^4.3.5", "module-details-from-path": "^1.0.3", "resolve": "^1.22.8" } }, "sha512-gAZ+kLqBdHarXB64XpAe2VCjB7rIRv+mU8tfRWziHRJ5umKsIHN2tLLv6EtMw7WCdP19S0ERVMldNvxYCHnhSQ=="], + "reselect": ["reselect@5.1.1", "", {}, "sha512-K/BG6eIky/SBpzfHZv/dd+9JBFiS4SWV7FIujVyJRux6e45+73RaUHXLmIR1f7WOMaQ0U1km6qwklRQxpJJY0w=="], + "resolve": ["resolve@1.22.12", "", { "dependencies": { "es-errors": "^1.3.0", "is-core-module": "^2.16.1", "path-parse": "^1.0.7", "supports-preserve-symlinks-flag": "^1.0.0" }, "bin": { "resolve": "bin/resolve" } }, "sha512-TyeJ1zif53BPfHootBGwPRYT1RUt6oGWsaQr8UyZW/eAm9bKoijtvruSDEmZHm92CwS9nj7/fWttqPCgzep8CA=="], + "resolve-from": ["resolve-from@5.0.0", "", {}, "sha512-qYg9KP24dD5qka9J47d0aVky0N+b4fTU89LN9iDnjB5waksiC49rvMB0PrUJQGoTmH50XPiqOvAjDfaijGxYZw=="], "resolve-pkg-maps": ["resolve-pkg-maps@1.0.0", "", {}, "sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw=="], @@ -1759,6 +1843,8 @@ "shiki": ["shiki@3.22.0", "", { "dependencies": { "@shikijs/core": "3.22.0", "@shikijs/engine-javascript": "3.22.0", "@shikijs/engine-oniguruma": "3.22.0", "@shikijs/langs": "3.22.0", "@shikijs/themes": "3.22.0", "@shikijs/types": "3.22.0", "@shikijs/vscode-textmate": "^10.0.2", "@types/hast": "^3.0.4" } }, "sha512-LBnhsoYEe0Eou4e1VgJACes+O6S6QC0w71fCSp5Oya79inkwkm15gQ1UF6VtQ8j/taMDh79hAB49WUk8ALQW3g=="], + "shimmer": 
["shimmer@1.2.1", "", {}, "sha512-sQTKC1Re/rM6XyFM6fIAGHRPVGvyXfgzIDvzoq608vM+jeyVD0Tu1E6Np0Kc2zAIFWIj963V2800iF/9LPieQw=="], + "signal-exit": ["signal-exit@4.1.0", "", {}, "sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw=="], "sisteransi": ["sisteransi@1.0.5", "", {}, "sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg=="], @@ -1799,6 +1885,8 @@ "sucrase": ["sucrase@3.35.1", "", { "dependencies": { "@jridgewell/gen-mapping": "^0.3.2", "commander": "^4.0.0", "lines-and-columns": "^1.1.6", "mz": "^2.7.0", "pirates": "^4.0.1", "tinyglobby": "^0.2.11", "ts-interface-checker": "^0.1.9" }, "bin": { "sucrase": "bin/sucrase", "sucrase-node": "bin/sucrase-node" } }, "sha512-DhuTmvZWux4H1UOnWMB3sk0sbaCVOoQZjv8u1rDoTV0HTdGem9hkAZtl4JZy8P2z4Bg0nT+YMeOFyVr4zcG5Tw=="], + "supports-preserve-symlinks-flag": ["supports-preserve-symlinks-flag@1.0.0", "", {}, "sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w=="], + "svgo": ["svgo@4.0.0", "", { "dependencies": { "commander": "^11.1.0", "css-select": "^5.1.0", "css-tree": "^3.0.1", "css-what": "^6.1.0", "csso": "^5.0.5", "picocolors": "^1.1.1", "sax": "^1.4.1" }, "bin": "./bin/svgo.js" }, "sha512-VvrHQ+9uniE+Mvx3+C9IEe/lWasXCU0nXMY2kZeLrHNICuRiC8uMPyM14UEaMOFA5mhyQqEkB02VoQ16n3DLaw=="], "tailwindcss": ["tailwindcss@4.2.2", "", {}, "sha512-KWBIxs1Xb6NoLdMVqhbhgwZf2PGBpPEiwOqgI4pFIYbNTfBXiKYyWoTsXgBQ9WFg/OlhnvHaY+AEpW7wSmFo2Q=="], @@ -1953,6 +2041,16 @@ "@anthropic-ai/claude-agent-sdk/zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="], + "@arizeai/openinference-core/@opentelemetry/core": ["@opentelemetry/core@1.30.1", "", { "dependencies": { "@opentelemetry/semantic-conventions": "1.28.0" }, "peerDependencies": { "@opentelemetry/api": ">=1.0.0 <1.10.0" } }, 
"sha512-OOCM2C/QIURhJMuKaekP3TRBxBKxG/TWWA0TL2J6nXUtDnuCtccy49LUJF8xPFXMX+0LMcxFpCo8M9cGY1W6rQ=="], + + "@arizeai/openinference-vercel/@opentelemetry/core": ["@opentelemetry/core@1.30.1", "", { "dependencies": { "@opentelemetry/semantic-conventions": "1.28.0" }, "peerDependencies": { "@opentelemetry/api": ">=1.0.0 <1.10.0" } }, "sha512-OOCM2C/QIURhJMuKaekP3TRBxBKxG/TWWA0TL2J6nXUtDnuCtccy49LUJF8xPFXMX+0LMcxFpCo8M9cGY1W6rQ=="], + + "@arizeai/phoenix-client/zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="], + + "@arizeai/phoenix-evals/zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="], + + "@arizeai/phoenix-otel/@opentelemetry/core": ["@opentelemetry/core@1.30.1", "", { "dependencies": { "@opentelemetry/semantic-conventions": "1.28.0" }, "peerDependencies": { "@opentelemetry/api": ">=1.0.0 <1.10.0" } }, "sha512-OOCM2C/QIURhJMuKaekP3TRBxBKxG/TWWA0TL2J6nXUtDnuCtccy49LUJF8xPFXMX+0LMcxFpCo8M9cGY1W6rQ=="], + "@astrojs/mdx/source-map": ["source-map@0.7.6", "", {}, "sha512-i5uvt8C3ikiWeNZSVZNWcfZPItFQOsYTUAOkcUPGd8DqDy1uOUikjt5dG+uRlwyvR108Fb9DOd4GvXfT0N2/uQ=="], "@aws-crypto/sha256-browser/@smithy/util-utf8": ["@smithy/util-utf8@2.3.0", "", { "dependencies": { "@smithy/util-buffer-from": "^2.2.0", "tslib": "^2.6.2" } }, "sha512-R8Rdn8Hy72KKcebgLiv8jQcQkXoLMOGGv5uI1/k0l+snqkOzQ1R0ChUBCxWMlBsFMekWjq0wRudIweFs7sKT5A=="], @@ -1987,6 +2085,18 @@ "@mistralai/mistralai/zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="], + "@opentelemetry/exporter-trace-otlp-proto/@opentelemetry/core": ["@opentelemetry/core@2.1.0", "", { "dependencies": { "@opentelemetry/semantic-conventions": "^1.29.0" }, "peerDependencies": { "@opentelemetry/api": ">=1.0.0 <1.10.0" } }, "sha512-RMEtHsxJs/GiHHxYT58IY57UXAQTuUnZVco6ymDEqTNlJKTimM4qPUPVe8InNFyBjhHBEAx4k3Q8LtNayBsbUQ=="], + + 
"@opentelemetry/exporter-trace-otlp-proto/@opentelemetry/otlp-exporter-base": ["@opentelemetry/otlp-exporter-base@0.205.0", "", { "dependencies": { "@opentelemetry/core": "2.1.0", "@opentelemetry/otlp-transformer": "0.205.0" }, "peerDependencies": { "@opentelemetry/api": "^1.3.0" } }, "sha512-2MN0C1IiKyo34M6NZzD6P9Nv9Dfuz3OJ3rkZwzFmF6xzjDfqqCTatc9v1EpNfaP55iDOCLHFyYNCgs61FFgtUQ=="], + + "@opentelemetry/exporter-trace-otlp-proto/@opentelemetry/otlp-transformer": ["@opentelemetry/otlp-transformer@0.205.0", "", { "dependencies": { "@opentelemetry/api-logs": "0.205.0", "@opentelemetry/core": "2.1.0", "@opentelemetry/resources": "2.1.0", "@opentelemetry/sdk-logs": "0.205.0", "@opentelemetry/sdk-metrics": "2.1.0", "@opentelemetry/sdk-trace-base": "2.1.0", "protobufjs": "^7.3.0" }, "peerDependencies": { "@opentelemetry/api": "^1.3.0" } }, "sha512-KmObgqPtk9k/XTlWPJHdMbGCylRAmMJNXIRh6VYJmvlRDMfe+DonH41G7eenG8t4FXn3fxOGh14o/WiMRR6vPg=="], + + "@opentelemetry/exporter-trace-otlp-proto/@opentelemetry/resources": ["@opentelemetry/resources@2.1.0", "", { "dependencies": { "@opentelemetry/core": "2.1.0", "@opentelemetry/semantic-conventions": "^1.29.0" }, "peerDependencies": { "@opentelemetry/api": ">=1.3.0 <1.10.0" } }, "sha512-1CJjf3LCvoefUOgegxi8h6r4B/wLSzInyhGP2UmIBYNlo4Qk5CZ73e1eEyWmfXvFtm1ybkmfb2DqWvspsYLrWw=="], + + "@opentelemetry/exporter-trace-otlp-proto/@opentelemetry/sdk-trace-base": ["@opentelemetry/sdk-trace-base@2.1.0", "", { "dependencies": { "@opentelemetry/core": "2.1.0", "@opentelemetry/resources": "2.1.0", "@opentelemetry/semantic-conventions": "^1.29.0" }, "peerDependencies": { "@opentelemetry/api": ">=1.3.0 <1.10.0" } }, "sha512-uTX9FBlVQm4S2gVQO1sb5qyBLq/FPjbp+tmGoxu4tIgtYGmBYB44+KX/725RFDe30yBSaA9Ml9fqphe1hbUyLQ=="], + + "@opentelemetry/instrumentation/@opentelemetry/api-logs": ["@opentelemetry/api-logs@0.57.2", "", { "dependencies": { "@opentelemetry/api": "^1.3.0" } }, 
"sha512-uIX52NnTM0iBh84MShlpouI7UKqkZ7MrUszTmaypHBu4r7NofznSnQRfJ+uUeDtQDj6w8eFGg5KBLDAwAPz1+A=="], + "@reduxjs/toolkit/immer": ["immer@11.1.4", "", {}, "sha512-XREFCPo6ksxVzP4E0ekD5aMdf8WMwmdNaz6vuvxgI40UaEiu6q3p8X52aU6GdyvLY3XXX/8R7JOTXStz/nBbRw=="], "@rollup/pluginutils/estree-walker": ["estree-walker@2.0.2", "", {}, "sha512-Rfkk/Mp/DL7JVje3u18FxFujQlTNR2q6QfMSMB7AvCBx91NGj/ba3kCfza0f6dVDbw7YlRf/nDrn7pQrCCyQ/w=="], @@ -2043,6 +2153,8 @@ "h3/cookie-es": ["cookie-es@1.2.2", "", {}, "sha512-+W7VmiVINB+ywl1HGXJXmrqkOhpKrIiVZV6tQuV54ZyQC7MMuBt81Vc336GMLoHBq5hV/F9eXgt5Mnx0Rha5Fg=="], + "is-core-module/hasown": ["hasown@2.0.4", "", { "dependencies": { "function-bind": "^1.1.2" } }, "sha512-T2UbfbBEF32wiepXIsMlTW9+dDYC6wMh/t/vYA4tuOMKqWz/n3vr1NFSxQiyP+zk2mXsoMA/i/7qV6LKut1t1A=="], + "magicast/@babel/parser": ["@babel/parser@7.28.6", "", { "dependencies": { "@babel/types": "^7.28.6" }, "bin": "./bin/babel-parser.js" }, "sha512-TeR9zWR18BvbfPmGbLampPMW+uW1NZnJlRuuHso8i87QZNq2JRF9i6RgxRqtEq+wQGsS19NNTWr2duhnE49mfQ=="], "node-fetch/data-uri-to-buffer": ["data-uri-to-buffer@4.0.1", "", {}, "sha512-0R9ikRb668HB7QDxT1vkpuUBtqc53YyAwMwGeUFKRojY/NWKvdZ+9UYtRfGmhqNbRkTSVpMbmyhXipFFv2cb/A=="], @@ -2075,6 +2187,12 @@ "vite/picomatch": ["picomatch@4.0.3", "", {}, "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q=="], + "@arizeai/openinference-core/@opentelemetry/core/@opentelemetry/semantic-conventions": ["@opentelemetry/semantic-conventions@1.28.0", "", {}, "sha512-lp4qAiMTD4sNWW4DbKLBkfiMZ4jbAboJIGOQr5DvciMRI494OapieI9qiODpOt0XBr1LjIDy1xAGAnVs5supTA=="], + + "@arizeai/openinference-vercel/@opentelemetry/core/@opentelemetry/semantic-conventions": ["@opentelemetry/semantic-conventions@1.28.0", "", {}, "sha512-lp4qAiMTD4sNWW4DbKLBkfiMZ4jbAboJIGOQr5DvciMRI494OapieI9qiODpOt0XBr1LjIDy1xAGAnVs5supTA=="], + + "@arizeai/phoenix-otel/@opentelemetry/core/@opentelemetry/semantic-conventions": ["@opentelemetry/semantic-conventions@1.28.0", "", {}, 
"sha512-lp4qAiMTD4sNWW4DbKLBkfiMZ4jbAboJIGOQr5DvciMRI494OapieI9qiODpOt0XBr1LjIDy1xAGAnVs5supTA=="], + "@aws-crypto/sha256-browser/@smithy/util-utf8/@smithy/util-buffer-from": ["@smithy/util-buffer-from@2.2.0", "", { "dependencies": { "@smithy/is-array-buffer": "^2.2.0", "tslib": "^2.6.2" } }, "sha512-IJdWBbTcMQ6DA0gdNhh/BwrLkDR+ADW5Kr1aZmd4k3DIF6ezMV4R2NIAmT08wQJ3yUK82thHWmC/TnK/wpMMIA=="], "@aws-crypto/util/@smithy/util-utf8/@smithy/util-buffer-from": ["@smithy/util-buffer-from@2.2.0", "", { "dependencies": { "@smithy/is-array-buffer": "^2.2.0", "tslib": "^2.6.2" } }, "sha512-IJdWBbTcMQ6DA0gdNhh/BwrLkDR+ADW5Kr1aZmd4k3DIF6ezMV4R2NIAmT08wQJ3yUK82thHWmC/TnK/wpMMIA=="], @@ -2085,6 +2203,14 @@ "@google/genai/protobufjs/@protobufjs/utf8": ["@protobufjs/utf8@1.1.1", "", {}, "sha512-oOAWABowe8EAbMyWKM0tYDKi8Yaox52D+HWZhAIJqQXbqe0xI/GV7FhLWqlEKreMkfDjshR5FKgi3mnle0h6Eg=="], + "@opentelemetry/exporter-trace-otlp-proto/@opentelemetry/otlp-transformer/@opentelemetry/api-logs": ["@opentelemetry/api-logs@0.205.0", "", { "dependencies": { "@opentelemetry/api": "^1.3.0" } }, "sha512-wBlPk1nFB37Hsm+3Qy73yQSobVn28F4isnWIBvKpd5IUH/eat8bwcL02H9yzmHyyPmukeccSl2mbN5sDQZYnPg=="], + + "@opentelemetry/exporter-trace-otlp-proto/@opentelemetry/otlp-transformer/@opentelemetry/sdk-logs": ["@opentelemetry/sdk-logs@0.205.0", "", { "dependencies": { "@opentelemetry/api-logs": "0.205.0", "@opentelemetry/core": "2.1.0", "@opentelemetry/resources": "2.1.0" }, "peerDependencies": { "@opentelemetry/api": ">=1.4.0 <1.10.0" } }, "sha512-nyqhNQ6eEzPWQU60Nc7+A5LIq8fz3UeIzdEVBQYefB4+msJZ2vuVtRuk9KxPMw1uHoHDtYEwkr2Ct0iG29jU8w=="], + + "@opentelemetry/exporter-trace-otlp-proto/@opentelemetry/otlp-transformer/@opentelemetry/sdk-metrics": ["@opentelemetry/sdk-metrics@2.1.0", "", { "dependencies": { "@opentelemetry/core": "2.1.0", "@opentelemetry/resources": "2.1.0" }, "peerDependencies": { "@opentelemetry/api": ">=1.9.0 <1.10.0" } }, 
"sha512-J9QX459mzqHLL9Y6FZ4wQPRZG4TOpMCyPOh6mkr/humxE1W2S3Bvf4i75yiMW9uyed2Kf5rxmLhTm/UK8vNkAw=="], + + "@opentelemetry/exporter-trace-otlp-proto/@opentelemetry/otlp-transformer/protobufjs": ["protobufjs@7.5.6", "", { "dependencies": { "@protobufjs/aspromise": "^1.1.2", "@protobufjs/base64": "^1.1.2", "@protobufjs/codegen": "^2.0.5", "@protobufjs/eventemitter": "^1.1.0", "@protobufjs/fetch": "^1.1.0", "@protobufjs/float": "^1.0.2", "@protobufjs/inquire": "^1.1.1", "@protobufjs/path": "^1.1.2", "@protobufjs/pool": "^1.1.0", "@protobufjs/utf8": "^1.1.1", "@types/node": ">=13.7.0", "long": "^5.0.0" } }, "sha512-M71sTMB146U3u0di3yup8iM+zv8yPRNQVr1KK4tyBitl3qFvEGucq/rGDRShD2rsJhtN02RJaJ7j5X5hmy8SJg=="], + "@tanstack/router-plugin/chokidar/readdirp": ["readdirp@3.6.0", "", { "dependencies": { "picomatch": "^2.2.1" } }, "sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA=="], "ansi-align/string-width/emoji-regex": ["emoji-regex@8.0.0", "", {}, "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A=="], @@ -2251,6 +2377,12 @@ "@aws-crypto/util/@smithy/util-utf8/@smithy/util-buffer-from/@smithy/is-array-buffer": ["@smithy/is-array-buffer@2.2.0", "", { "dependencies": { "tslib": "^2.6.2" } }, "sha512-GGP3O9QFD24uGeAXYUjwSTXARoqpZykHadOmA8G5vfJPK0/DC67qa//0qvqrJzL1xc8WQWX7/yc7fwudjPHPhA=="], + "@opentelemetry/exporter-trace-otlp-proto/@opentelemetry/otlp-transformer/protobufjs/@protobufjs/codegen": ["@protobufjs/codegen@2.0.5", "", {}, "sha512-zgXFLzW3Ap33e6d0Wlj4MGIm6Ce8O89n/apUaGNB/jx+hw+ruWEp7EwGUshdLKVRCxZW12fp9r40E1mQrf/34g=="], + + "@opentelemetry/exporter-trace-otlp-proto/@opentelemetry/otlp-transformer/protobufjs/@protobufjs/inquire": ["@protobufjs/inquire@1.1.1", "", {}, "sha512-mnzgDV26ueAvk7rsbt9L7bE0SuAoqyuys/sMMrmVcN5x9VsxpcG3rqAUSgDyLp0UZlmNfIbQ4fHfCtreVBk8Ew=="], + + "@opentelemetry/exporter-trace-otlp-proto/@opentelemetry/otlp-transformer/protobufjs/@protobufjs/utf8": 
["@protobufjs/utf8@1.1.1", "", {}, "sha512-oOAWABowe8EAbMyWKM0tYDKi8Yaox52D+HWZhAIJqQXbqe0xI/GV7FhLWqlEKreMkfDjshR5FKgi3mnle0h6Eg=="], + "ansi-align/string-width/strip-ansi/ansi-regex": ["ansi-regex@5.0.1", "", {}, "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ=="], } } diff --git a/package.json b/package.json index 3b6a67fce..fd00c79ff 100644 --- a/package.json +++ b/package.json @@ -6,15 +6,15 @@ "packageManager": "bun@1.3.3", "workspaces": ["apps/*", "packages/*"], "scripts": { - "build": "bun --filter @agentv/core build && bun --filter @agentv/eval build && bun --filter @agentv/dashboard build && bun --filter agentv build", + "build": "bun --filter @agentv/core build && bun --filter @agentv/eval build && bun --filter @agentv/phoenix-adapter build && bun --filter @agentv/dashboard build && bun --filter agentv build", "verify": "bun run build && bun run typecheck && bun run lint && bun run test", - "typecheck": "bun --filter @agentv/core typecheck && bun --filter agentv typecheck", + "typecheck": "bun --filter @agentv/core typecheck && bun --filter @agentv/phoenix-adapter typecheck && bun --filter agentv typecheck", "typecheck:workspace": "tsc -b tsconfig.build.json", "typecheck:watch": "bun --filter @agentv/core typecheck -- --watch & bun --filter agentv typecheck -- --watch", "lint": "biome check .", "format": "biome format --write .", "fix": "biome check --write .", - "test": "bun --filter @agentv/core test && bun --filter @agentv/eval test && bun --filter agentv test", + "test": "bun --filter @agentv/core test && bun --filter @agentv/eval test && bun --filter @agentv/phoenix-adapter test && bun --filter agentv test", "test:watch": "bun --filter @agentv/core test:watch & bun --filter agentv test:watch", "agentv": "bun apps/cli/src/cli.ts", "agentv:buildrun": "bun run build && bun apps/cli/dist/cli.js", @@ -25,13 +25,16 @@ "examples:install": "bun scripts/install-examples.ts", "publish": "bun run build && bun 
scripts/publish.ts", "publish:next": "bun run build && bun scripts/publish.ts next", - "prepare": "test -d .git && bunx prek install -t pre-push || true" + "prepare": "test -d .git && bunx prek install -t pre-push || true", + "phoenix:dry-run": "bun --filter @agentv/phoenix-adapter phoenix:dry-run", + "phoenix:assert-smoke": "bun --filter @agentv/phoenix-adapter phoenix:assert-smoke" }, "devDependencies": { - "@biomejs/biome": "^1.9.4", - "@j178/prek": "^0.3.0", "@agentv/core": "workspace:*", "@agentv/eval": "workspace:*", + "@agentv/phoenix-adapter": "workspace:*", + "@biomejs/biome": "^1.9.4", + "@j178/prek": "^0.3.0", "@types/bun": "latest", "@types/node": "24.1.0", "async-mutex": "^0.5.0", diff --git a/packages/phoenix-adapter/.gitignore b/packages/phoenix-adapter/.gitignore new file mode 100644 index 000000000..a9a1bd38a --- /dev/null +++ b/packages/phoenix-adapter/.gitignore @@ -0,0 +1 @@ +reports/ diff --git a/packages/phoenix-adapter/README.md b/packages/phoenix-adapter/README.md new file mode 100644 index 000000000..528400be9 --- /dev/null +++ b/packages/phoenix-adapter/README.md @@ -0,0 +1,12 @@ +# @agentv/phoenix-adapter + +Converts AgentV eval YAML suites into Phoenix datasets and can run Phoenix experiments while keeping AgentV eval files as the source of truth. + +Current adapter support is intentionally small: deterministic `contains`, `regex`, `equals`, and `is-json` assertions run through a Phoenix CODE evaluator. LLM, code, trace, composite, metric, and custom evaluator families are reported as unsupported instead of being silently mapped. + +```bash +bun --filter @agentv/phoenix-adapter phoenix:assert-smoke +bun --filter @agentv/phoenix-adapter phoenix:dry-run +``` + +See `docs/support-matrix.md` for evaluator coverage and `docs/e2e-verification.md` for smoke-test notes. 
diff --git a/packages/phoenix-adapter/docs/e2e-verification.md b/packages/phoenix-adapter/docs/e2e-verification.md new file mode 100644 index 000000000..bf0cad46c --- /dev/null +++ b/packages/phoenix-adapter/docs/e2e-verification.md @@ -0,0 +1,50 @@ +# E2E Verification + +## Dry-Run Conversion + +Dry-run mode discovers AgentV example evals, normalizes cases through `@agentv/core`, creates Phoenix dataset payloads in memory, and compares test IDs against AgentV baselines where present. + +```bash +bun run phoenix:assert-smoke +bun run phoenix:dry-run +``` + +Current filtered smoke result against `examples/features/assert/evals/dataset.eval.yaml`: + +- 1 suite discovered +- 4 tests normalized +- 1 suite passed structural parity +- 0 failed suites + +Current full dry-run result against this AgentV checkout: + +- 97 suites discovered +- 405 tests normalized +- 93 suites passed structural parity +- 4 suites failed baseline/loader parity + +The failing suites are currently source/baseline or source-reference mismatches, not Phoenix conversion crashes: + +- `examples/features/matrix-evaluation/evals/dataset.eval.yaml`: baseline has 5 rows, source has 3 tests. +- `examples/features/prompt-template-sdk/evals/dataset.eval.yaml`: AgentV core skips 2 tests because `../prompts/custom-grader.ts` cannot be resolved from the eval source. +- `examples/features/tool-trajectory-simple/evals/dataset.eval.yaml`: source has 11 tests, baseline has 7 rows. +- `examples/features/weighted-graders/evals/dataset.eval.yaml`: baseline IDs use `evaluator` naming while source IDs use `grader` naming. + +## Live Phoenix Smoke + +Live mode creates or updates a Phoenix dataset and records a Phoenix experiment. It currently uses the deterministic adapter path, so the best smoke target is `examples/features/assert/evals/dataset.eval.yaml`. + +```bash +(cd packages/phoenix-adapter && bun src/cli.ts run \ + --agentv-root ../.. 
\ + --filter examples/features/assert/evals/dataset.eval.yaml \ + --out reports/live-assert-final.json \ + --namespace agentv-phoenix-e2e-final) +``` + +The source harness was verified locally against Phoenix at `http://localhost:6006`: + +- 4 Phoenix task runs +- 4 Phoenix evaluator runs +- average evaluator score: 1.0 +- experiment ID: `RXhwZXJpbWVudDo2` diff --git a/packages/phoenix-adapter/docs/support-matrix.md b/packages/phoenix-adapter/docs/support-matrix.md new file mode 100644 index 000000000..6726bbf0c --- /dev/null +++ b/packages/phoenix-adapter/docs/support-matrix.md @@ -0,0 +1,23 @@ +# Phoenix Adapter Support Matrix + +This workspace converts AgentV example evals into Phoenix dataset and experiment payloads. + +| AgentV family | Phoenix status | +| --- | --- | +| `contains` | Supported by deterministic adapter | +| `regex` | Supported by deterministic adapter | +| `equals` | Supported by deterministic adapter | +| `is-json` | Supported by deterministic adapter | +| `llm-grader` | Reported as unsupported in first pass | +| `rubrics` | Reported as unsupported in first pass | +| `code-grader` | Reported as unsupported in first pass | +| `composite` | Reported as unsupported in first pass | +| `field-accuracy` | Reported as unsupported in first pass | +| `execution-metrics` | Reported as unsupported in first pass | +| `tool-trajectory` | Reported as unsupported in first pass | +| `cost` | Reported as unsupported in first pass | +| `latency` | Reported as unsupported in first pass | +| `trial-output-consistency` | Reported as unsupported in first pass | +| Other custom families | Reported as unsupported with the family name | + +Unsupported does not block conversion unless `--fail-on-unsupported` is set. The report keeps unsupported families visible so parity gaps are explicit. 
/**
 * Recursively collects the paths of all regular files below `dir`.
 *
 * Any entry named `node_modules` or `.git` is skipped (and, for directories, not descended
 * into). Entries whose `Dirent` is neither a directory nor a regular file are ignored.
 * Sub-directories are visited one at a time, in the order `readdir` returns them.
 *
 * @param dir - Directory to scan.
 * @param results - Accumulator the discovered paths are appended to; it is mutated in place.
 * @returns The same `results` array, holding `path.join(dir, name)` for every file found.
 * @throws Rejects with the underlying `readdir` error when a directory cannot be read.
 */
async function walk(dir: string, results: string[] = []): Promise<string[]> {
  const entries = await readdir(dir, { withFileTypes: true });
  for (const entry of entries) {
    if (entry.name === 'node_modules' || entry.name === '.git') continue;
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      // Share the accumulator so nested results land in the caller's array.
      await walk(fullPath, results);
      continue;
    }
    if (entry.isFile()) results.push(fullPath);
  }
  return results;
}
/**
 * Load an AgentV-authored eval source into the Phoenix adapter's normalized shape.
 *
 * AgentV eval YAML remains the source of truth: this adapter delegates case expansion,
 * external case files, assertion parsing, Agent Skills `evals.json`, interpolation, and
 * metadata handling to `@agentv/core`'s loader, then projects the result into Phoenix
 * dataset examples. Add Phoenix-specific behavior after this boundary rather than
 * duplicating AgentV YAML semantics in the adapter.
 *
 * The raw file is additionally parsed here, but only to read suite-level fields
 * (`skill_name`, `name`, `description`) and to detect unsupported top-level keys.
 *
 * @param source - Eval source to load; `source.path` must exist on disk.
 * @returns The normalized suite. `suiteAssertions` is always empty, and `warnings` lists
 *   every case whose `input` array is empty.
 * @throws Error when `source.path` does not exist. Failures from parsing the raw file or
 *   from `loadTestSuite` are not caught here and propagate to the caller.
 */
export async function loadAgentVEvalSuite(source: AgentVSource): Promise<NormalizedSuite> {
  if (!existsSync(source.path)) {
    throw new Error(`AgentV eval source does not exist: ${source.path}`);
  }

  const raw = (parseStructuredFile(source.path) ?? {}) as JsonObject;
  const loaded = await loadTestSuite(source.path, deriveAgentVRoot(source));
  // Name precedence: explicit skill name, suite recorded on the first loaded test,
  // raw `name`, then the file name with its YAML extension removed.
  const suiteName =
    raw.skill_name ??
    loaded.tests[0]?.suite ??
    raw.name ??
    path.basename(source.path).replace(/\.ya?ml$/, '');

  const cases = loaded.tests.map((test, index): NormalizedCase => {
    const assertions = (test.assertions ?? []).map((assertion, assertionIndex) =>
      normalizeAssertion(assertion, assertionIndex),
    );

    return {
      // Fall back to a 1-based positional id when the test has none.
      id: String(test.id ?? `case-${index + 1}`),
      criteria: test.criteria || undefined,
      input: test.input as readonly AgentVMessage[],
      expectedOutput: normalizeExpectedOutput(test),
      assertions,
      metadata: {
        ...(test.metadata ?? {}),
        ...(test.targets ? { targets: test.targets } : {}),
      },
      sourcePath: source.relativePath,
    };
  });

  return {
    name: String(suiteName),
    description: typeof raw.description === 'string' ? raw.description : undefined,
    source,
    cases,
    suiteAssertions: [],
    warnings: cases
      .filter((testCase) => testCase.input.length === 0)
      .map((testCase) => `${source.relativePath}: ${testCase.id} has no input`),
    unsupportedFeatures: collectUnsupported(raw, loaded),
  };
}
/**
 * Rewrites a path so that every platform separator (`path.sep`) becomes a forward slash.
 *
 * @param value - Path using the current platform's separator.
 * @returns The same path with `/` between segments.
 */
export function toPosixPath(value: string): string {
  const segments = value.split(path.sep);
  return segments.join('/');
}
/**
 * Parses the CLI arguments that follow the script name into run options.
 *
 * Prints the usage text and returns `undefined` when no arguments are given or when
 * `--help` / `-h` is present. Otherwise the first argument must be the `run` command.
 * Arguments that do not start with `--` are ignored.
 *
 * @param argv - Arguments after the script path (e.g. `Bun.argv.slice(2)`).
 * @returns The resolved options, or `undefined` when only usage was printed.
 * @throws Error for an unknown command, an unknown `--option`, or a value option that is
 *   not followed by a value.
 */
function parseArgs(argv: readonly string[]): RunOptions | undefined {
  if (argv.length === 0 || argv.includes('--help') || argv.includes('-h')) {
    console.log(usage());
    return undefined;
  }

  const [command, ...rest] = argv;
  if (command !== 'run') {
    throw new Error(`Unknown command: ${command}\n\n${usage()}`);
  }

  const booleanFlags = new Set(['--dry-run', '--fail-on-unsupported']);
  const valueOptions = new Set([
    '--agentv-root',
    '--eval-file',
    '--filter',
    '--out',
    '--namespace',
  ]);

  const values = new Map<string, string | boolean>();
  for (let index = 0; index < rest.length; index += 1) {
    const arg = rest[index];
    if (!arg.startsWith('--')) continue;
    if (booleanFlags.has(arg)) {
      values.set(arg, true);
      continue;
    }
    // Reject typos such as `--filtr`: silently dropping them would widen the run
    // (for example, processing every suite instead of the filtered ones).
    if (!valueOptions.has(arg)) {
      throw new Error(`Unknown option: ${arg}\n\n${usage()}`);
    }
    const value = rest[index + 1];
    if (!value || value.startsWith('--')) throw new Error(`Missing value for ${arg}`);
    values.set(arg, value);
    index += 1; // Skip the value that was just consumed.
  }

  // Narrow instead of casting: only string values are returned for value options.
  const stringOption = (name: string): string | undefined => {
    const value = values.get(name);
    return typeof value === 'string' ? value : undefined;
  };

  const agentvRoot = resolveAgentVRoot(stringOption('--agentv-root'));
  const evalFile = stringOption('--eval-file');

  return {
    agentvRoot,
    evalFile: evalFile ? path.resolve(evalFile) : undefined,
    filter: stringOption('--filter'),
    dryRun: values.get('--dry-run') === true,
    out: path.resolve(stringOption('--out') ?? 'reports/phoenix-report.json'),
    namespace: stringOption('--namespace'),
    failOnUnsupported: values.get('--fail-on-unsupported') === true,
  };
}
/**
 * Checks that the output equals the expected value.
 *
 * The expected value is the assertion's own value when it is neither `null` nor
 * `undefined`; otherwise `context.expectedOutput` is used. Both sides are compared through
 * `stableValue`, i.e. by their key-sorted JSON serialization.
 *
 * @param assertion - The `equals` assertion configuration.
 * @param context - Evaluation context carrying `output` and `expectedOutput`.
 * @returns A failing result when no expected value is available at all, otherwise a
 *   pass/fail result for the comparison.
 */
function evaluateEquals(
  assertion: NormalizedAssertionConfig,
  context: EvaluationContext,
): EvaluatorResult {
  const expected = assertionValue(assertion) ?? context.expectedOutput;

  // Without this guard a missing expectation would be compared as `undefined`, so a task
  // that produced no output would pass a misconfigured assertion. Fail explicitly instead,
  // in line with the "missing value" / "missing pattern" checks of the sibling evaluators.
  if (expected === undefined) {
    return result(assertion, false, 'equals assertion is missing an expected value');
  }

  const passed = stableValue(context.output) === stableValue(expected);

  return result(
    assertion,
    passed,
    passed ? 'Output equals expected value' : 'Output does not equal expected value',
  );
}
/**
 * Returns a copy of a JSON-like value in which every object has its keys in a canonical
 * order, so that structurally equal values serialize to the same string.
 *
 * Arrays keep their element order (each element is normalized recursively); primitives,
 * `null` and `undefined` are returned unchanged.
 *
 * @param value - Any JSON-like value.
 * @returns The normalized copy (or the value itself for non-objects).
 */
function sortJsonValue(value: unknown): unknown {
  if (Array.isArray(value)) return value.map((item) => sortJsonValue(item));

  if (value && typeof value === 'object') {
    return Object.fromEntries(
      Object.entries(value as Record<string, unknown>)
        // Compare by UTF-16 code units rather than `localeCompare`: locale collation
        // treats canonically equivalent keys (e.g. "é" vs "e" + combining accent) as
        // equal, which would leave their order dependent on insertion order.
        .sort(([left], [right]) => (left < right ? -1 : left > right ? 1 : 0))
        .map(([key, entryValue]) => [key, sortJsonValue(entryValue)]),
    );
  }

  return value;
}
/** Evaluator families this adapter can score locally without a model or trace data. */
export const deterministicEvaluatorTypes = [
  'contains',
  'regex',
  'equals',
  'is-json',
] as const satisfies readonly DeterministicEvaluatorType[];

/** Evaluator families that are recognized but reported as unsupported. */
export const unsupportedEvaluatorTypes = [
  'llm-grader',
  'rubrics',
  'code-grader',
  'composite',
  'field-accuracy',
  'execution-metrics',
  'tool-trajectory',
  'cost',
  'latency',
  'trial-output-consistency',
] as const satisfies readonly UnsupportedEvaluatorType[];

/** Human-readable explanation for each known unsupported family. */
const unsupportedReasons: Record<UnsupportedEvaluatorType, string> = {
  'llm-grader': 'Model-backed Phoenix judging is not implemented in this first-pass adapter.',
  rubrics:
    'Rubric scoring requires a model-backed or rubric-specific adapter that is not implemented yet.',
  'code-grader':
    'Code grader execution is deferred until source-relative sandboxing is implemented.',
  composite:
    'Composite evaluator aggregation is deferred until nested evaluator normalization is available.',
  'field-accuracy':
    'Field-level accuracy scoring is deferred until expected output field mapping is implemented.',
  'execution-metrics':
    'Execution metric scoring needs run or trace metric data that is not wired yet.',
  'tool-trajectory': 'Tool trajectory scoring needs trace data that is not wired yet.',
  cost: 'Cost scoring needs Phoenix or provider usage metrics that are not wired yet.',
  latency: 'Latency scoring needs Phoenix or runner timing metrics that are not wired yet.',
  'trial-output-consistency':
    'Trial consistency scoring needs multiple trial outputs that are not wired yet.',
};

/** Display name for an assertion: its explicit `name`, else its type. */
function assertionName(assertion: NormalizedAssertionConfig): string {
  return assertion.name ?? String(assertion.type);
}

/**
 * Wraps one assertion in an adapter. Deterministic families delegate to the deterministic
 * evaluator; every other family gets an adapter whose `evaluate` returns an `unsupported` result.
 */
export function createEvaluatorAdapter(assertion: NormalizedAssertionConfig): EvaluatorAdapter {
  const type = assertion.type;
  const name = assertionName(assertion);

  if (isDeterministicEvaluatorType(type)) {
    return {
      type,
      name,
      supported: true,
      evaluate: (context) => evaluateDeterministicAssertion(assertion, context),
    };
  }

  return {
    type,
    name,
    supported: false,
    evaluate: () => unsupportedResult(assertion),
  };
}

/** Builds one adapter per assertion, preserving order. */
export function createEvaluatorRegistry(
  assertions: readonly NormalizedAssertionConfig[],
): EvaluatorAdapter[] {
  return assertions.map(createEvaluatorAdapter);
}

/** Convenience: builds the adapter for `assertion` and evaluates it against `context`. */
export function evaluateAssertion(
  assertion: NormalizedAssertionConfig,
  context: EvaluationContext,
): EvaluatorResult {
  return createEvaluatorAdapter(assertion).evaluate(context);
}

/** Lists every assertion whose family is not supported, with the reason it is unsupported. */
export function unsupportedEvaluatorReports(
  assertions: readonly NormalizedAssertionConfig[],
): UnsupportedEvaluatorReport[] {
  return assertions.filter(isUnsupportedAssertion).map((assertion) => ({
    name: assertionName(assertion),
    type: assertion.type,
    reason: unsupportedReason(assertion.type),
    metadata: assertion.metadata,
  }));
}

/** Only deterministic families are supported by this adapter. */
export function isSupportedEvaluatorType(type: EvaluatorType): boolean {
  return isDeterministicEvaluatorType(type);
}

/** Type guard: `type` is one of {@link deterministicEvaluatorTypes}. */
export function isDeterministicEvaluatorType(
  type: EvaluatorType,
): type is DeterministicEvaluatorType {
  return (deterministicEvaluatorTypes as readonly string[]).includes(String(type));
}

/** Type guard: `type` is one of {@link unsupportedEvaluatorTypes}. */
export function isKnownUnsupportedEvaluatorType(
  type: EvaluatorType,
): type is UnsupportedEvaluatorType {
  return (unsupportedEvaluatorTypes as readonly string[]).includes(String(type));
}

function isUnsupportedAssertion(assertion: NormalizedAssertionConfig): boolean {
  return !isSupportedEvaluatorType(assertion.type);
}

/** Result returned for any assertion that cannot be scored: score 0, label `unsupported`. */
function unsupportedResult(assertion: NormalizedAssertionConfig): EvaluatorResult {
  return {
    name: assertionName(assertion),
    type: assertion.type,
    score: 0,
    passed: false,
    label: 'unsupported',
    explanation: unsupportedReason(assertion.type),
    unsupported: true,
    metadata: assertion.metadata,
  };
}

/** Reason text for a known unsupported family, or an "unknown family" message otherwise. */
function unsupportedReason(type: EvaluatorType): string {
  if (isKnownUnsupportedEvaluatorType(type)) return unsupportedReasons[type];

  return `Unknown evaluator family: ${String(type)}`;
}

export type DeterministicEvaluatorType = 'contains' | 'regex' | 'equals' | 'is-json';

export type UnsupportedEvaluatorType =
  | 'llm-grader'
  | 'rubrics'
  | 'code-grader'
  | 'composite'
  | 'field-accuracy'
  | 'execution-metrics'
  | 'tool-trajectory'
  | 'cost'
  | 'latency'
  | 'trial-output-consistency';

/** Known families plus any other string, so unknown families can be reported rather than rejected. */
export type EvaluatorType = DeterministicEvaluatorType | UnsupportedEvaluatorType | string;
/** One assertion as consumed by the evaluators; extra keys from the source config are preserved. */
export interface NormalizedAssertionConfig {
  type: EvaluatorType;
  name?: string;
  value?: unknown;
  expected?: unknown;
  pattern?: string;
  flags?: string;
  caseSensitive?: boolean;
  metadata?: Record<string, unknown>;
  [key: string]: unknown;
}

/** Data an evaluator receives for one example. */
export interface EvaluationContext {
  output: unknown;
  expectedOutput?: unknown;
  input?: unknown;
  metadata?: Record<string, unknown>;
}

/** Outcome of evaluating one assertion. */
export interface EvaluatorResult {
  name: string;
  type: EvaluatorType;
  score: number;
  passed: boolean;
  label: 'pass' | 'fail' | 'unsupported';
  explanation: string;
  unsupported?: boolean;
  metadata?: Record<string, unknown>;
}

/** An assertion bound to the function that evaluates it. */
export interface EvaluatorAdapter {
  type: EvaluatorType;
  name: string;
  supported: boolean;
  evaluate(context: EvaluationContext): EvaluatorResult;
}

/** Description of an assertion that cannot be evaluated, and why. */
export interface UnsupportedEvaluatorReport {
  name: string;
  type: EvaluatorType;
  reason: string;
  metadata?: Record<string, unknown>;
}

/** Test ids recorded in a baseline file, plus that file's base name. */
export interface BaselineSummary {
  readonly path: string;
  readonly testIds: readonly string[];
}

/** Maps `<name>.yaml` / `<name>.yml` to the sibling `<name>.baseline.jsonl`. */
export function baselinePathFor(evalSourcePath: string): string {
  return evalSourcePath.replace(/\.ya?ml$/i, '.baseline.jsonl');
}

/**
 * Extracts the test id from one baseline JSONL line (`test_id`, else `testId`, else `''`).
 * The line is parsed once; JSON values that are not objects (for example `null`) yield `''`.
 */
function baselineTestId(line: string): string {
  const record: unknown = JSON.parse(line);
  if (record === null || typeof record !== 'object') return '';

  const fields = record as Record<string, unknown>;

  return String(fields.test_id ?? fields.testId ?? '');
}

/**
 * Reads the baseline file that sits next to a YAML eval source.
 *
 * @returns `undefined` when the source is not YAML or no baseline file exists; otherwise the
 *   baseline's base name and one test id per non-blank line.
 * @throws SyntaxError when a non-blank line is not valid JSON; the message names the file and
 *   the 1-based line number.
 */
export function readBaselineSummary(evalSourcePath: string): BaselineSummary | undefined {
  if (!/\.ya?ml$/i.test(evalSourcePath)) return undefined;
  const baselinePath = baselinePathFor(evalSourcePath);
  if (!existsSync(baselinePath)) return undefined;

  const testIds: string[] = [];
  const rawLines = readFileSync(baselinePath, 'utf8').split('\n');
  rawLines.forEach((rawLine, index) => {
    const line = rawLine.trim(); // also strips the `\r` of CRLF files
    if (!line) return;
    try {
      testIds.push(baselineTestId(line));
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      throw new SyntaxError(
        `Invalid JSON in ${path.basename(baselinePath)} line ${index + 1}: ${message}`,
      );
    }
  });

  return { path: path.basename(baselinePath), testIds };
}

/**
 * Dry-run parity check for one suite: verifies the dataset has one example per case, that the
 * case ids and baseline ids (when a baseline exists) agree in both directions, and that the suite
 * is not empty. Unsupported features from the suite and from its assertions are merged,
 * de-duplicated, and sorted.
 */
export function compareDryRunSuite(
  suite: NormalizedSuite,
  dataset: PhoenixDatasetPayload,
): SuiteRunSummary {
  const failures: string[] = [];
  const baseline = readBaselineSummary(suite.source.path);
  const caseIds = new Set(suite.cases.map((testCase) => testCase.id));
  const unsupportedFeatures = [
    ...suite.unsupportedFeatures,
    ...unsupportedEvaluatorReports(
      suite.cases.flatMap((testCase) => testCase.assertions.map(toAssertionConfig)),
    ).map((report) => `${report.type}: ${report.name}`),
  ];

  if (dataset.examples.length !== suite.cases.length) {
    failures.push(
      `Dataset example count ${dataset.examples.length} does not match case count ${suite.cases.length}`,
    );
  }

  if (baseline) {
    const baselineIds = new Set(baseline.testIds);
    for (const id of baselineIds) {
      if (!caseIds.has(id)) {
        failures.push(`Baseline test id is missing from converted suite: ${id}`);
      }
    }
    for (const id of caseIds) {
      if (!baselineIds.has(id)) failures.push(`Converted test id is missing from baseline: ${id}`);
    }
  }

  if (suite.cases.length === 0) failures.push('Suite contains no normalized cases');

  return {
    source: suite.source.relativePath,
    datasetName: dataset.name,
    testCount: suite.cases.length,
    baselineCount: baseline?.testIds.length,
    warningCount: suite.warnings.length,
    unsupportedFeatures: [...new Set(unsupportedFeatures)].sort(),
    status: failures.length === 0 ? 'passed' : 'failed',
    failures,
  };
}
/**
 * Converts a normalized assertion into the evaluator config shape. A plain-object `source` is
 * spread into the config (with `type` and `name` taking precedence); any other `source` becomes
 * the config's `value`.
 */
function toAssertionConfig(assertion: {
  readonly type: string;
  readonly name?: string;
  readonly source: unknown;
}): NormalizedAssertionConfig {
  const { source } = assertion;
  if (source && typeof source === 'object' && !Array.isArray(source)) {
    return {
      ...(source as Record<string, unknown>),
      type: assertion.type,
      name: assertion.name,
    };
  }
  return {
    type: assertion.type,
    name: assertion.name,
    value: source,
  };
}

/**
 * Aggregates per-suite summaries into a run report: suite/test totals, pass/fail counts, and the
 * sorted union of unsupported features. `generatedAt` is the current time in ISO format.
 */
export function buildRunReport(input: {
  readonly dryRun: boolean;
  readonly agentvRoot: string;
  readonly suites: readonly SuiteRunSummary[];
}): RunReport {
  const unsupported = new Set<string>();
  for (const suite of input.suites) {
    for (const feature of suite.unsupportedFeatures) unsupported.add(feature);
  }

  return {
    generatedAt: new Date().toISOString(),
    dryRun: input.dryRun,
    agentvRoot: input.agentvRoot,
    suiteCount: input.suites.length,
    testCount: input.suites.reduce((sum, suite) => sum + suite.testCount, 0),
    passedSuites: input.suites.filter((suite) => suite.status === 'passed').length,
    failedSuites: input.suites.filter((suite) => suite.status === 'failed').length,
    unsupportedFeatures: [...unsupported].sort(),
    suites: input.suites,
  };
}

/** Writes the report as pretty-printed JSON (trailing newline), creating parent directories. */
export async function writeJsonReport(report: RunReport, outPath: string): Promise<void> {
  await mkdir(path.dirname(outPath), { recursive: true });
  await writeFile(outPath, `${JSON.stringify(report, null, 2)}\n`, 'utf8');
}

/**
 * Makes free text safe inside a Markdown table cell: line breaks become spaces and `|` is
 * backslash-escaped, so a value cannot split or terminate its row.
 */
function markdownCell(value: string): string {
  return value.replace(/\r?\n/g, ' ').replace(/\|/g, '\\|');
}

/**
 * Renders the report as Markdown: a short header followed by one table row per suite, plus an
 * extra row for suites that ran a Phoenix experiment.
 */
export function formatMarkdownReport(report: RunReport): string {
  const lines = [
    '# Phoenix AgentV Eval Report',
    '',
    `Generated: ${report.generatedAt}`,
    `Dry run: ${String(report.dryRun)}`,
    `Suites: ${report.suiteCount}`,
    `Tests: ${report.testCount}`,
    `Passed suites: ${report.passedSuites}`,
    `Failed suites: ${report.failedSuites}`,
    '',
    '| Status | Source | Tests | Baseline | Unsupported |',
    '| --- | --- | ---: | ---: | --- |',
  ];

  for (const suite of report.suites) {
    const unsupported = suite.unsupportedFeatures.map(markdownCell).join(', ');
    lines.push(
      `| ${suite.status} | \`${markdownCell(suite.source)}\` | ${suite.testCount} | ${suite.baselineCount ?? ''} | ${unsupported} |`,
    );
    if (suite.phoenixExperimentId) {
      lines.push(
        `| | Phoenix experiment \`${markdownCell(suite.phoenixExperimentId)}\` | ${suite.phoenixRunCount ?? ''} | ${suite.phoenixEvaluationRunCount ?? ''} | |`,
      );
    }
  }

  return `${lines.join('\n')}\n`;
}
/** Outcome of converting (and optionally running) one eval suite. */
export interface SuiteRunSummary {
  readonly source: string;
  readonly datasetName: string;
  readonly testCount: number;
  readonly baselineCount?: number;
  readonly warningCount: number;
  readonly unsupportedFeatures: readonly string[];
  readonly phoenixExperimentId?: string;
  readonly phoenixRunCount?: number;
  readonly phoenixEvaluationRunCount?: number;
  readonly status: 'passed' | 'failed';
  readonly failures: readonly string[];
}

/** Aggregate report over all suites of one run. */
export interface RunReport {
  readonly generatedAt: string;
  readonly dryRun: boolean;
  readonly agentvRoot: string;
  readonly suiteCount: number;
  readonly testCount: number;
  readonly passedSuites: number;
  readonly failedSuites: number;
  readonly unsupportedFeatures: readonly string[];
  readonly suites: readonly SuiteRunSummary[];
}

/**
 * Converts a normalized suite into a Phoenix dataset payload with one example per case.
 * Each example carries the raw assertion configs in both `input` and `metadata`; the metadata
 * also records the source path, test id, and assertion display names.
 */
export function createPhoenixDatasetPayload(
  suite: NormalizedSuite,
  options: { namespace?: string } = {},
): PhoenixDatasetPayload {
  return {
    name: stableDatasetName(suite.source.relativePath, options.namespace),
    description: suite.description,
    assertions: suite.suiteAssertions,
    examples: suite.cases.map((testCase) => {
      const assertionSources = testCase.assertions.map((assertion) => assertion.source);

      return {
        input: {
          messages: testCase.input,
          criteria: testCase.criteria,
          agentv_assertion_configs: assertionSources,
        },
        output: testCase.expectedOutput,
        metadata: {
          ...testCase.metadata,
          agentv_source: testCase.sourcePath,
          agentv_test_id: testCase.id,
          agentv_assertions: testCase.assertions.map(
            (assertion) => assertion.name ?? assertion.type,
          ),
          // Separate array instance so mutating one copy cannot affect the other.
          agentv_assertion_configs: [...assertionSources],
        },
      };
    }),
  };
}

/**
 * Derives a deterministic dataset name `<namespace>-<slug>-<hash>` from a source path.
 *
 * The slug is the path without its final extension, lowercased, with every run of
 * non-alphanumerics collapsed to `-`, capped at 80 characters. The hash is the first 8 hex
 * characters of the SHA-1 of the full path, which keeps names unique when slugs collide or are
 * truncated. A dash left dangling by truncation is trimmed and an empty slug is omitted, so the
 * name never contains `--`.
 */
export function stableDatasetName(sourcePath: string, namespace = 'agentv-examples'): string {
  const slug = sourcePath
    .replace(/\.[^.]+$/, '')
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-|-$/g, '')
    .slice(0, 80)
    .replace(/-$/, '');
  const hash = crypto.createHash('sha1').update(sourcePath).digest('hex').slice(0, 8);

  return [namespace, slug, hash].filter((part) => part.length > 0).join('-');
}

/** Identifiers and counts reported back after a Phoenix experiment run. */
export interface PhoenixExperimentSummary {
  readonly experimentId: string;
  readonly runCount: number;
  readonly evaluationRunCount: number;
}

/** Shape of a dataset example as read back inside the experiment task. */
type PhoenixExample = {
  readonly input: {
    readonly messages?: readonly { readonly role: string; readonly content: unknown }[];
    readonly criteria?: string;
    readonly agentv_assertion_configs?: readonly unknown[];
  };
  readonly output?: Record<string, unknown> | null;
  readonly metadata?: {
    readonly agentv_assertion_configs?: readonly unknown[];
  } | null;
};
/**
 * Uploads the payload as a Phoenix dataset and runs one experiment over it.
 *
 * The task does not call a model. It echoes the stored expected answer when there is one,
 * otherwise synthesizes an output from the example's assertion configs, otherwise falls back to
 * the last input message (or the criteria). A single CODE evaluator then scores that output with
 * the AgentV assertion evaluators.
 *
 * @returns the experiment id, the number of task runs, and the number of evaluation runs.
 */
export async function runPhoenixExperiment(
  dataset: PhoenixDatasetPayload,
): Promise<PhoenixExperimentSummary> {
  const created = await createDataset({
    name: dataset.name,
    description: dataset.description ?? dataset.name,
    examples: dataset.examples.map((example) => ({
      input: example.input,
      output: normalizeExpected(example.output),
      metadata: example.metadata,
    })) satisfies Example[],
  });

  const experiment = await runExperiment({
    dataset: { datasetId: created.datasetId },
    // Timestamp suffix keeps repeated runs against the same dataset distinguishable.
    experimentName: `${dataset.name}-${Date.now()}`,
    experimentDescription: `Phoenix equivalent run for ${dataset.name}`,
    experimentMetadata: {
      source: 'agentv-evals-phoenix',
    },
    concurrency: 2,
    task: async (example) => produceTaskOutput(example as PhoenixExample),
    evaluators: [
      asExperimentEvaluator({
        name: 'agentv-adapter',
        kind: 'CODE',
        evaluate: async ({ output, expected, metadata }) =>
          gradeExample(output, expected, metadata ?? undefined),
      }),
    ],
  });

  return {
    experimentId: experiment.id,
    runCount: Object.keys(experiment.runs).length,
    evaluationRunCount: experiment.evaluationRuns?.length ?? 0,
  };
}

/** Task body: expected answer if stored, else a synthesized output, else the last message/criteria. */
function produceTaskOutput(example: PhoenixExample): string {
  const storedAnswer = example.output?.answer;
  if (storedAnswer !== undefined && storedAnswer !== null) {
    return stringifyAnswer(storedAnswer);
  }

  const synthesized = synthesizeOutputFromAssertions(
    example.input.agentv_assertion_configs ?? example.metadata?.agentv_assertion_configs,
  );
  if (synthesized !== undefined) return synthesized;

  const lastMessage = example.input.messages?.at(-1);

  return stringifyAnswer(lastMessage?.content ?? example.input.criteria ?? '');
}

/**
 * Unwraps the `{ answer }` envelope that `normalizeExpected` puts around expected outputs.
 * A missing or `null` answer yields `undefined`; values without an `answer` key pass through.
 */
function expectedAnswerOf(expected: unknown): unknown {
  if (expected && typeof expected === 'object' && 'answer' in expected) {
    return (expected as { readonly answer?: unknown }).answer ?? undefined;
  }

  return expected ?? undefined;
}

/**
 * Scores one example against the assertion configs stored in its metadata.
 *
 * The score is the mean over supported assertions, or over all assertions when none is
 * supported. The label is `unsupported` whenever any assertion is unsupported, otherwise
 * `pass` for a perfect score and `fail` for anything less.
 */
function gradeExample(
  output: unknown,
  expected: unknown,
  metadata: Record<string, unknown> | undefined,
) {
  const configs = normalizeAssertionConfigs(metadata?.agentv_assertion_configs);
  if (configs.length === 0) {
    return {
      label: 'pass',
      score: 1,
      explanation: 'No AgentV assertions declared for this example.',
      metadata: {},
    };
  }

  // The dataset stores expected output as `{ answer }` while the task returns the stringified
  // answer. Compare like with like, otherwise `equals` assertions that rely on the expected
  // output can never pass.
  const expectedAnswer = expectedAnswerOf(expected);
  const expectedOutput = expectedAnswer === undefined ? undefined : stringifyAnswer(expectedAnswer);

  const results = configs.map((config) =>
    evaluateAssertion(config, { output, expectedOutput, metadata }),
  );
  const supportedResults = results.filter((entry) => !entry.unsupported);
  const scoredResults = supportedResults.length > 0 ? supportedResults : results;
  const score =
    scoredResults.reduce((sum, entry) => sum + entry.score, 0) / Math.max(scoredResults.length, 1);
  const unsupportedCount = results.length - supportedResults.length;

  return {
    label: unsupportedCount > 0 ? 'unsupported' : score >= 1 ? 'pass' : 'fail',
    score,
    explanation: results.map((entry) => `${entry.name}: ${entry.explanation}`).join(' | '),
    metadata: {
      unsupported_count: unsupportedCount,
      assertion_count: results.length,
    },
  };
}

/**
 * Wraps an expected output as `{ answer }` for dataset storage. A single-message array is
 * reduced to that message's `content`; a missing output is stored as `null`.
 */
function normalizeExpected(output: unknown): Record<string, unknown> {
  if (Array.isArray(output) && output.length === 1) {
    const first = output[0] as { readonly content?: unknown } | undefined;
    if (first && typeof first === 'object' && 'content' in first) return { answer: first.content };
  }
  return { answer: output ?? null };
}
/**
 * Renders an answer as text: strings pass through, a single-message array is reduced to that
 * message's `content`, and everything else is JSON-serialized.
 */
function stringifyAnswer(value: unknown): string {
  if (typeof value === 'string') return value;
  if (Array.isArray(value) && value.length === 1) {
    const first: unknown = value[0];
    if (first && typeof first === 'object' && 'content' in first) {
      return stringifyAnswer((first as { readonly content?: unknown }).content);
    }
  }
  // JSON.stringify yields undefined for undefined/functions/symbols; always return a string.
  return JSON.stringify(value) ?? '';
}

/** Keys the deterministic evaluators accept as an assertion's comparison value, in lookup order. */
const assertionValueKeys = ['value', 'expected', 'text', 'substring'] as const;

/** Text a `contains` assertion looks for, read from the first defined value key. */
function containsNeedle(config: NormalizedAssertionConfig): string | undefined {
  for (const key of assertionValueKeys) {
    const candidate = config[key];
    if (candidate !== undefined) return stringifyAnswer(candidate);
  }
  return undefined;
}

/**
 * Builds an output that is intended to satisfy an example's deterministic assertions, for
 * examples that have no expected output to echo.
 *
 * - any `is-json` assertion: a fixed JSON document;
 * - the first `equals` assertion: its expected value, stringified;
 * - otherwise the `contains` needles and `regex` samples, joined by spaces.
 *
 * @returns `undefined` when there are no configs or none contributes any text.
 */
function synthesizeOutputFromAssertions(value: unknown): string | undefined {
  const configs = normalizeAssertionConfigs(value);
  if (configs.length === 0) return undefined;
  if (configs.some((config) => config.type === 'is-json')) {
    return '{"status":"ok","code":200}';
  }

  const parts: string[] = [];
  for (const config of configs) {
    if (config.type === 'equals') return stringifyAnswer(config.value ?? config.expected ?? '');
    if (config.type === 'contains') {
      const needle = containsNeedle(config);
      if (needle !== undefined) parts.push(needle);
    }
    if (config.type === 'regex') {
      parts.push(sampleForRegex(String(config.pattern ?? config.value ?? '')));
    }
  }

  return parts.length > 0 ? parts.join(' ') : undefined;
}

/**
 * Best-effort sample text for a regex pattern. Two patterns are special-cased; for anything else
 * the metacharacters `[]()+?^$\` are dropped and `|` becomes a space.
 * NOTE(review): this is a heuristic. The result is not checked against the pattern and will not
 * match patterns that rely on character classes or quantifiers.
 */
function sampleForRegex(pattern: string): string {
  if (pattern.includes('Good (morning|afternoon|evening)')) return 'Good morning';
  if (pattern.includes('[Hh]ello')) return 'Hello';
  return pattern.replace(/[[\]()+?^$\\]/g, '').replace(/\|/g, ' ');
}

/**
 * Coerces raw assertion configs into evaluator configs.
 *
 * Strings become `rubrics` assertions. Objects keep their keys, with `type` taken from `type`,
 * then `name`, then a positional `assertion-N` fallback. Any other entry becomes a positional
 * assertion carrying the entry as `value`. Non-array input yields an empty list.
 */
function normalizeAssertionConfigs(value: unknown): NormalizedAssertionConfig[] {
  if (!Array.isArray(value)) return [];
  return value.map((entry: unknown, index) => {
    const fallbackType = `assertion-${index + 1}`;
    if (typeof entry === 'string') return { type: 'rubrics', value: entry };
    if (entry && typeof entry === 'object') {
      const record = entry as Record<string, unknown>;
      return {
        ...record,
        type: String(record.type ?? record.name ?? fallbackType),
        name: typeof record.name === 'string' ? record.name : undefined,
      };
    }
    return { type: fallbackType, value: entry };
  });
}
/** One dataset example as sent to Phoenix. */
export interface PhoenixDatasetExamplePayload {
  readonly input: {
    readonly messages: readonly AgentVMessage[];
    readonly criteria?: string;
    readonly agentv_assertion_configs: readonly unknown[];
  };
  readonly output?: unknown;
  readonly metadata: JsonObject & {
    readonly agentv_source: string;
    readonly agentv_test_id: string;
    readonly agentv_assertions: readonly string[];
    readonly agentv_assertion_configs: readonly unknown[];
  };
}

/** A whole suite converted to a Phoenix dataset. */
export interface PhoenixDatasetPayload {
  readonly name: string;
  readonly description?: string;
  readonly examples: readonly PhoenixDatasetExamplePayload[];
  readonly assertions: readonly NormalizedAssertion[];
}

/** Options accepted by `runSuite`. */
export interface RunOptions {
  readonly agentvRoot: string;
  readonly evalFile?: string;
  readonly filter?: string;
  readonly dryRun: boolean;
  readonly out: string;
  readonly namespace?: string;
  readonly failOnUnsupported: boolean;
}

/**
 * Decides whether a discovered source (identified by its root-relative POSIX path) is selected.
 *
 * With `evalFile`, the source matches when its relative path equals the requested file as
 * written, with `\` separators and a leading `./` normalized, resolved against the current
 * working directory, or resolved against `agentvRoot`. With `filter`, a substring match is used.
 * With neither, every source matches.
 */
function sourceMatches(relativePath: string, options: RunOptions): boolean {
  if (options.evalFile) {
    const requested = options.evalFile;
    const candidates = new Set<string>([
      requested,
      requested.replace(/\\/g, '/').replace(/^\.\//, ''),
      relativePosix(options.agentvRoot, path.resolve(requested)),
      relativePosix(options.agentvRoot, path.resolve(options.agentvRoot, requested)),
    ]);
    return candidates.has(relativePath);
  }
  if (options.filter) return relativePath.includes(options.filter);
  return true;
}

/**
 * Discovers, converts, checks, and (unless `dryRun`) runs every matching eval suite, then writes
 * the aggregate JSON report to `options.out`.
 *
 * A suite is marked failed when the dry-run comparison fails, when `failOnUnsupported` is set
 * and unsupported features are present, or when the Phoenix run count differs from the case
 * count.
 *
 * @throws Error when no discovered source matches the options.
 */
export async function runSuite(options: RunOptions): Promise<RunReport> {
  const sources = (await discoverAgentVEvals(options.agentvRoot)).filter((source) =>
    sourceMatches(source.relativePath, options),
  );
  if (sources.length === 0) {
    throw new Error('No AgentV eval sources matched the requested options.');
  }

  const summaries: ReturnType<typeof compareDryRunSuite>[] = [];
  // Suites are processed one at a time; each non-dry run awaits its Phoenix experiment.
  for (const source of sources) {
    const suite = await loadAgentVEvalSuite(source);
    const dataset = createPhoenixDatasetPayload(suite, { namespace: options.namespace });
    let summary = compareDryRunSuite(suite, dataset);
    if (options.failOnUnsupported && summary.unsupportedFeatures.length > 0) {
      summary = {
        ...summary,
        status: 'failed',
        failures: [
          ...summary.failures,
          `Unsupported features present: ${summary.unsupportedFeatures.join(', ')}`,
        ],
      };
    }
    if (!options.dryRun) {
      const experiment = await runPhoenixExperiment(dataset);
      summary = {
        ...summary,
        phoenixExperimentId: experiment.experimentId,
        phoenixRunCount: experiment.runCount,
        phoenixEvaluationRunCount: experiment.evaluationRunCount,
      };
      if (experiment.runCount !== suite.cases.length) {
        summary = {
          ...summary,
          status: 'failed',
          failures: [
            ...summary.failures,
            `Phoenix run count ${experiment.runCount} does not match case count ${suite.cases.length}`,
          ],
        };
      }
    }
    summaries.push(summary);
  }

  const report = buildRunReport({
    dryRun: options.dryRun,
    agentvRoot: options.agentvRoot,
    suites: summaries,
  });
  await writeJsonReport(report, options.out);
  return report;
}
/** Returns a unique scratch path under the OS temp directory for the named fixture. */
function fixtureRoot(name: string): string {
  const uniqueSuffix = crypto.randomUUID();
  return path.join(tmpdir(), `agentv-phoenix-${name}-${uniqueSuffix}`);
}

describe('AgentV eval normalization', () => {
  test('discovers yaml and agent skills eval sources', async () => {
    const root = fixtureRoot('discovery');
    const yamlDir = path.join(root, 'examples', 'features', 'basic', 'evals');
    const skillsDir = path.join(root, 'examples', 'features', 'skills');
    for (const dir of [yamlDir, skillsDir]) {
      mkdirSync(dir, { recursive: true });
    }
    writeFileSync(path.join(yamlDir, 'dataset.eval.yaml'), 'tests: []\n');
    writeFileSync(path.join(skillsDir, 'evals.json'), '{"evals": []}\n');

    const discovered = await discoverAgentVEvals(root);
    const relativePaths = discovered.map((source) => source.relativePath);

    expect(relativePaths).toEqual([
      'examples/features/basic/evals/dataset.eval.yaml',
      'examples/features/skills/evals.json',
    ]);
  });

  test('expands suite input, external yaml, jsonl, and suite assertions', async () => {
    const root = fixtureRoot('normalize');
    const evalDir = path.join(root, 'examples', 'features', 'external', 'evals');
    const suitePath = path.join(evalDir, 'dataset.eval.yaml');
    // Suite with shared system input, a suite-level assertion, one inline case and one
    // externally referenced JSONL case file.
    const suiteYaml = `name: external
input:
  - role: system
    content: shared
assertions:
  - type: contains
    value: ok
tests:
  - id: inline
    criteria: inline criteria
    input: hello
  - file://cases/more.jsonl
`;
    const externalCases =
      '{"id":"from-jsonl","criteria":"jsonl criteria","input":"hi","expected_output":"ok"}\n';

    mkdirSync(path.join(evalDir, 'cases'), { recursive: true });
    writeFileSync(suitePath, suiteYaml);
    writeFileSync(path.join(evalDir, 'cases', 'more.jsonl'), externalCases);

    const suite = await loadAgentVEvalSuite({
      path: suitePath,
      relativePath: 'examples/features/external/evals/dataset.eval.yaml',
      kind: 'eval-yaml',
    });
    const [inlineCase, jsonlCase] = suite.cases;

    expect(suite.cases).toHaveLength(2);
    expect(inlineCase?.input.map((message) => message.role)).toEqual(['system', 'user']);
    expect(jsonlCase?.expectedOutput).toBe('ok');
    expect(jsonlCase?.assertions[0]?.type).toBe('contains');
  });

  test('normalizes Agent Skills evals.json', async () => {
    const root = fixtureRoot('skills');
    const evalPath = path.join(root, 'examples', 'features', 'agent-skills-evals', 'evals.json');
    const skillsDocument = {
      skill_name: 'csv-analyzer',
      evals: [
        { id: 1, prompt: 'Read CSV', expected_output: 'Done', assertions: ['Reads the file'] },
      ],
    };
    mkdirSync(path.dirname(evalPath), { recursive: true });
    writeFileSync(evalPath, JSON.stringify(skillsDocument));

    const suite = await loadAgentVEvalSuite({
      path: evalPath,
      relativePath: 'examples/features/agent-skills-evals/evals.json',
      kind: 'agent-skills-json',
    });
    const firstCase = suite.cases[0];

    expect(suite.name).toBe('csv-analyzer');
    expect(firstCase?.id).toBe('1');
    expect(firstCase?.assertions[0]?.type).toBe('llm-grader');
  });
});
describe('deterministic evaluator adapters', () => {
  test('contains returns pass and score 1 when output includes the expected text', () => {
    const assertion = { type: 'contains', name: 'has greeting', value: 'hello' };
    const outcome = evaluateAssertion(assertion, { output: 'well hello there' });

    expect(outcome).toMatchObject({
      name: 'has greeting',
      type: 'contains',
      passed: true,
      score: 1,
      label: 'pass',
    });
  });

  test('contains returns fail and score 0 when output does not include the expected text', () => {
    const outcome = evaluateAssertion(
      { type: 'contains', value: 'goodbye' },
      { output: 'hello there' },
    );

    expect(outcome.passed).toBe(false);
    expect(outcome.score).toBe(0);
    expect(outcome.label).toBe('fail');
  });

  test('contains can compare case-insensitively', () => {
    const assertion = { type: 'contains', value: 'HELLO', caseSensitive: false };
    const outcome = evaluateAssertion(assertion, { output: 'hello there' });

    expect(outcome.passed).toBe(true);
    expect(outcome.score).toBe(1);
  });

  test('regex returns pass for matching output', () => {
    const outcome = evaluateAssertion(
      { type: 'regex', pattern: 'order-[0-9]+$' },
      { output: 'created order-123' },
    );

    expect(outcome.passed).toBe(true);
    expect(outcome.score).toBe(1);
  });

  test('regex returns fail for invalid patterns', () => {
    // '[' is an unterminated character class, so compiling it throws inside the evaluator.
    const outcome = evaluateAssertion({ type: 'regex', pattern: '[' }, { output: 'anything' });

    expect(outcome.passed).toBe(false);
    expect(outcome.score).toBe(0);
    expect(outcome.explanation).toContain('Invalid regex pattern');
  });

  test('equals performs stable deep equality for object outputs', () => {
    // Same structure, different key order on each side.
    const expected = { b: 2, a: ['x', { c: true }] };
    const actual = { a: ['x', { c: true }], b: 2 };
    const outcome = evaluateAssertion({ type: 'equals', expected }, { output: actual });

    expect(outcome.passed).toBe(true);
    expect(outcome.score).toBe(1);
  });

  test('equals can fall back to context expected output', () => {
    const outcome = evaluateAssertion(
      { type: 'equals' },
      { output: 'done', expectedOutput: 'done' },
    );

    expect(outcome.passed).toBe(true);
    expect(outcome.score).toBe(1);
  });

  test('is-json passes JSON strings and object outputs', () => {
    const assertion = { type: 'is-json' };
    const fromString = evaluateAssertion(assertion, { output: '{"ok":true}' });
    const fromObject = evaluateAssertion(assertion, { output: { ok: true } });

    expect(fromString.passed).toBe(true);
    expect(fromString.score).toBe(1);
    expect(fromObject.passed).toBe(true);
    expect(fromObject.score).toBe(1);
  });

  test('is-json fails non-JSON text', () => {
    const outcome = evaluateAssertion({ type: 'is-json' }, { output: 'not json' });

    expect(outcome.passed).toBe(false);
    expect(outcome.score).toBe(0);
    expect(outcome.explanation).toContain('Output is not valid JSON');
  });
});
describe('evaluator registry', () => {
  test('marks deterministic evaluator families as supported', () => {
    for (const family of ['contains', 'regex', 'equals', 'is-json']) {
      expect(isSupportedEvaluatorType(family)).toBe(true);
    }
  });

  test('builds adapters for supported and unsupported evaluators', () => {
    const adapters = createEvaluatorRegistry([
      { type: 'contains', value: 'ok' },
      { type: 'llm-grader', name: 'judge answer' },
    ]);
    const [containsAdapter, graderAdapter] = adapters;

    expect(adapters).toHaveLength(2);
    expect(containsAdapter?.supported).toBe(true);
    expect(graderAdapter?.supported).toBe(false);

    const graderOutcome = graderAdapter?.evaluate({ output: 'ok' });

    expect(graderOutcome).toMatchObject({
      name: 'judge answer',
      type: 'llm-grader',
      passed: false,
      score: 0,
      label: 'unsupported',
      unsupported: true,
    });
  });

  test('reports every first-pass unsupported evaluator family with a reason', () => {
    const assertions: NormalizedAssertionConfig[] = [];
    for (const type of unsupportedEvaluatorTypes) {
      assertions.push({ type, name: `${type} assertion`, metadata: { testId: type } });
    }

    const reports = unsupportedEvaluatorReports(assertions);

    expect(reports).toHaveLength(unsupportedEvaluatorTypes.length);

    for (const type of unsupportedEvaluatorTypes) {
      const match = reports.find((entry) => entry.type === type);

      expect(match?.name).toBe(`${type} assertion`);
      expect(match?.reason.length).toBeGreaterThan(0);
      expect(match?.metadata).toEqual({ testId: type });
    }
  });

  test('reports unknown evaluator families instead of silently treating them as supported', () => {
    const reports = unsupportedEvaluatorReports([{ type: 'custom-family', name: 'custom' }]);
    const firstReport = reports[0];

    expect(firstReport).toMatchObject({
      name: 'custom',
      type: 'custom-family',
      reason: 'Unknown evaluator family: custom-family',
    });
  });
});
evaluators', () => { + const registry = createEvaluatorRegistry([ + { type: 'contains', value: 'ok' }, + { type: 'llm-grader', name: 'judge answer' }, + ]); + + expect(registry).toHaveLength(2); + expect(registry[0]?.supported).toBe(true); + expect(registry[1]?.supported).toBe(false); + + const unsupportedResult = registry[1]?.evaluate({ output: 'ok' }); + + expect(unsupportedResult).toMatchObject({ + name: 'judge answer', + type: 'llm-grader', + passed: false, + score: 0, + label: 'unsupported', + unsupported: true, + }); + }); + + test('reports every first-pass unsupported evaluator family with a reason', () => { + const assertions: NormalizedAssertionConfig[] = unsupportedEvaluatorTypes.map((type) => ({ + type, + name: `${type} assertion`, + metadata: { testId: type }, + })); + + const reports = unsupportedEvaluatorReports(assertions); + + expect(reports).toHaveLength(unsupportedEvaluatorTypes.length); + + for (const type of unsupportedEvaluatorTypes) { + const report = reports.find((entry) => entry.type === type); + + expect(report?.name).toBe(`${type} assertion`); + expect(report?.reason.length).toBeGreaterThan(0); + expect(report?.metadata).toEqual({ testId: type }); + } + }); + + test('reports unknown evaluator families instead of silently treating them as supported', () => { + const [report] = unsupportedEvaluatorReports([{ type: 'custom-family', name: 'custom' }]); + + expect(report).toMatchObject({ + name: 'custom', + type: 'custom-family', + reason: 'Unknown evaluator family: custom-family', + }); + }); +}); diff --git a/packages/phoenix-adapter/test/parity.test.ts b/packages/phoenix-adapter/test/parity.test.ts new file mode 100644 index 000000000..5e0fdfe36 --- /dev/null +++ b/packages/phoenix-adapter/test/parity.test.ts @@ -0,0 +1,41 @@ +import { expect, test } from 'bun:test'; +import { mkdirSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; +import type { NormalizedSuite } from 
'../src/agentv/types.js'; +import { compareDryRunSuite } from '../src/parity/compare.js'; +import { createPhoenixDatasetPayload } from '../src/phoenix/datasets.js'; + +test('dry-run parity compares baseline ids with normalized cases', () => { + const dir = path.join(tmpdir(), `agentv-phoenix-parity-${crypto.randomUUID()}`); + mkdirSync(dir, { recursive: true }); + const evalPath = path.join(dir, 'dataset.eval.yaml'); + writeFileSync(evalPath, 'tests: []\n'); + writeFileSync(path.join(dir, 'dataset.eval.baseline.jsonl'), '{"test_id":"known"}\n'); + + const suite: NormalizedSuite = { + name: 'suite', + source: { + path: evalPath, + relativePath: 'examples/x/evals/dataset.eval.yaml', + kind: 'eval-yaml', + }, + cases: [ + { + id: 'known', + input: [{ role: 'user', content: 'hi' }], + assertions: [], + metadata: {}, + sourcePath: 'examples/x/evals/dataset.eval.yaml', + }, + ], + suiteAssertions: [], + warnings: [], + unsupportedFeatures: [], + }; + + const summary = compareDryRunSuite(suite, createPhoenixDatasetPayload(suite)); + + expect(summary.status).toBe('passed'); + expect(summary.baselineCount).toBe(1); +}); diff --git a/packages/phoenix-adapter/test/phoenix-datasets.test.ts b/packages/phoenix-adapter/test/phoenix-datasets.test.ts new file mode 100644 index 000000000..a746eb433 --- /dev/null +++ b/packages/phoenix-adapter/test/phoenix-datasets.test.ts @@ -0,0 +1,35 @@ +import { expect, test } from 'bun:test'; +import type { NormalizedSuite } from '../src/agentv/types.js'; +import { createPhoenixDatasetPayload } from '../src/phoenix/datasets.js'; + +test('creates deterministic Phoenix dataset payloads from normalized suites', () => { + const suite: NormalizedSuite = { + name: 'assert-demo', + source: { + path: '/tmp/dataset.eval.yaml', + relativePath: 'examples/features/assert/evals/dataset.eval.yaml', + kind: 'eval-yaml', + }, + cases: [ + { + id: 'contains-check', + criteria: 'Must contain Hello', + input: [{ role: 'user', content: 'Say hello' }], + 
expectedOutput: 'Hello', + assertions: [{ type: 'contains', source: { type: 'contains', value: 'Hello' } }], + metadata: { tag: 'demo' }, + sourcePath: 'examples/features/assert/evals/dataset.eval.yaml', + }, + ], + suiteAssertions: [], + warnings: [], + unsupportedFeatures: [], + }; + + const dataset = createPhoenixDatasetPayload(suite); + + expect(dataset.name).toStartWith('agentv-examples-examples-features-assert-evals-dataset-eval'); + expect(dataset.examples[0]?.input.messages[0]?.content).toBe('Say hello'); + expect(dataset.examples[0]?.metadata.agentv_test_id).toBe('contains-check'); + expect(dataset.examples[0]?.metadata.agentv_assertions).toEqual(['contains']); +}); diff --git a/packages/phoenix-adapter/tsconfig.json b/packages/phoenix-adapter/tsconfig.json new file mode 100644 index 000000000..984b50599 --- /dev/null +++ b/packages/phoenix-adapter/tsconfig.json @@ -0,0 +1,10 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "module": "NodeNext", + "moduleResolution": "NodeNext", + "noEmit": true, + "types": ["bun"] + }, + "include": ["src/**/*.ts", "test/**/*.ts"] +} diff --git a/packages/phoenix-adapter/tsup.config.ts b/packages/phoenix-adapter/tsup.config.ts new file mode 100644 index 000000000..edc4764f3 --- /dev/null +++ b/packages/phoenix-adapter/tsup.config.ts @@ -0,0 +1,12 @@ +import { defineConfig } from 'tsup'; + +export default defineConfig({ + entry: ['src/index.ts'], + format: ['esm'], + sourcemap: true, + clean: true, + dts: true, + target: 'node20', + tsconfig: './tsconfig.json', + external: ['@agentv/core'], +}); From e19177278c71bc2c14273f7e645eba52fd89a472 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Tue, 2 Jun 2026 13:29:35 +0200 Subject: [PATCH 2/6] fix(phoenix): omit empty assertion-only outputs --- .../phoenix-adapter/src/agentv/load-spec.ts | 8 +++++--- .../test/agentv-normalize.test.ts | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git 
a/packages/phoenix-adapter/src/agentv/load-spec.ts b/packages/phoenix-adapter/src/agentv/load-spec.ts index 67905ca30..46d4f0270 100644 --- a/packages/phoenix-adapter/src/agentv/load-spec.ts +++ b/packages/phoenix-adapter/src/agentv/load-spec.ts @@ -39,10 +39,12 @@ function normalizeAssertion(assertion: unknown, index: number): NormalizedAssert function normalizeExpectedOutput(test: { readonly reference_answer?: string; - readonly expected_output?: unknown; + readonly expected_output?: readonly unknown[]; }): unknown { - if (test.reference_answer !== undefined) return test.reference_answer; - return test.expected_output; + const hasExpectedOutput = (test.expected_output?.length ?? 0) > 0; + if (hasExpectedOutput) return test.reference_answer ?? test.expected_output; + if (test.reference_answer && test.reference_answer.length > 0) return test.reference_answer; + return undefined; } function deriveAgentVRoot(source: AgentVSource): string { diff --git a/packages/phoenix-adapter/test/agentv-normalize.test.ts b/packages/phoenix-adapter/test/agentv-normalize.test.ts index 9f95df869..5090c4267 100644 --- a/packages/phoenix-adapter/test/agentv-normalize.test.ts +++ b/packages/phoenix-adapter/test/agentv-normalize.test.ts @@ -68,6 +68,23 @@ tests: expect(suite.cases[1]?.assertions[0]?.type).toBe('contains'); }); + test('leaves assertion-only expected output absent for Phoenix synthesis', async () => { + const sourcePath = path.resolve('../../examples/features/assert/evals/dataset.eval.yaml'); + const suite = await loadAgentVEvalSuite({ + path: sourcePath, + relativePath: 'examples/features/assert/evals/dataset.eval.yaml', + kind: 'eval-yaml', + }); + + expect(suite.cases).toHaveLength(4); + expect(suite.cases.map((testCase) => testCase.expectedOutput)).toEqual([ + undefined, + undefined, + undefined, + undefined, + ]); + }); + test('normalizes Agent Skills evals.json', async () => { const root = fixtureRoot('skills'); const evalPath = path.join(root, 'examples', 'features', 
'agent-skills-evals', 'evals.json'); From 62c0b6d687b1a3da878b5050de5fb153405243fc Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Tue, 2 Jun 2026 13:47:44 +0200 Subject: [PATCH 3/6] chore(phoenix): write smoke report outside package --- packages/phoenix-adapter/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/phoenix-adapter/package.json b/packages/phoenix-adapter/package.json index d51f1a0ee..e816e632a 100644 --- a/packages/phoenix-adapter/package.json +++ b/packages/phoenix-adapter/package.json @@ -17,7 +17,7 @@ "typecheck": "(cd ../core && bun run build) && tsc --noEmit", "test": "(cd ../core && bun run build) && bun test", "phoenix:dry-run": "bun src/cli.ts run --dry-run --agentv-root ../.. --out reports/dry-run.json", - "phoenix:assert-smoke": "bun src/cli.ts run --dry-run --agentv-root ../.. --filter examples/features/assert/evals/dataset.eval.yaml --out reports/assert-smoke.json" + "phoenix:assert-smoke": "bun src/cli.ts run --dry-run --agentv-root ../.. 
--filter examples/features/assert/evals/dataset.eval.yaml --out /tmp/agentv-phoenix-assert-smoke.json" }, "files": ["dist", "README.md", "docs"], "dependencies": { From 787d840e15be41591d73fb46265b65ab1cd176ea Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Tue, 2 Jun 2026 13:58:11 +0200 Subject: [PATCH 4/6] fix(phoenix): unwrap expected values for graders --- packages/phoenix-adapter/src/phoenix/run-experiment.ts | 10 +++++++++- packages/phoenix-adapter/test/phoenix-datasets.test.ts | 7 +++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/packages/phoenix-adapter/src/phoenix/run-experiment.ts b/packages/phoenix-adapter/src/phoenix/run-experiment.ts index 41f83fb27..7fd0ec922 100644 --- a/packages/phoenix-adapter/src/phoenix/run-experiment.ts +++ b/packages/phoenix-adapter/src/phoenix/run-experiment.ts @@ -78,10 +78,11 @@ export async function runPhoenixExperiment( }; } + const expectedOutput = unwrapPhoenixExpectedOutput(expected); const results = configs.map((config) => evaluateAssertion(config, { output, - expectedOutput: expected, + expectedOutput, metadata: safeMetadata, }), ); @@ -115,6 +116,13 @@ export async function runPhoenixExperiment( }; } +export function unwrapPhoenixExpectedOutput(expected: unknown): unknown { + if (expected && typeof expected === 'object' && 'answer' in expected) { + return (expected as { readonly answer?: unknown }).answer; + } + return expected; +} + function normalizeExpected(output: unknown): Record { if (Array.isArray(output) && output.length === 1) { const first = output[0] as { readonly content?: unknown } | undefined; diff --git a/packages/phoenix-adapter/test/phoenix-datasets.test.ts b/packages/phoenix-adapter/test/phoenix-datasets.test.ts index a746eb433..7d15f118d 100644 --- a/packages/phoenix-adapter/test/phoenix-datasets.test.ts +++ b/packages/phoenix-adapter/test/phoenix-datasets.test.ts @@ -1,6 +1,7 @@ import { expect, test } from 'bun:test'; import type { NormalizedSuite } from 
'../src/agentv/types.js'; import { createPhoenixDatasetPayload } from '../src/phoenix/datasets.js'; +import { unwrapPhoenixExpectedOutput } from '../src/phoenix/run-experiment.js'; test('creates deterministic Phoenix dataset payloads from normalized suites', () => { const suite: NormalizedSuite = { @@ -33,3 +34,9 @@ test('creates deterministic Phoenix dataset payloads from normalized suites', () expect(dataset.examples[0]?.metadata.agentv_test_id).toBe('contains-check'); expect(dataset.examples[0]?.metadata.agentv_assertions).toEqual(['contains']); }); + +test('unwraps Phoenix expected answer payloads for AgentV deterministic graders', () => { + expect(unwrapPhoenixExpectedOutput({ answer: 'done' })).toBe('done'); + expect(unwrapPhoenixExpectedOutput({ answer: { ok: true } })).toEqual({ ok: true }); + expect(unwrapPhoenixExpectedOutput({ other: 'shape' })).toEqual({ other: 'shape' }); +}); From f82a93b4fb4df7f61b74bc76d6cc60703ab25cfe Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Tue, 2 Jun 2026 14:13:55 +0200 Subject: [PATCH 5/6] fix(phoenix): preserve null equals assertions --- bun.lock | 1 + packages/phoenix-adapter/package.json | 3 ++- .../src/evaluators/deterministic.ts | 13 ++++++++++++- .../test/evaluators/deterministic.test.ts | 16 ++++++++++++++++ 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/bun.lock b/bun.lock index 366959007..87292c44f 100644 --- a/bun.lock +++ b/bun.lock @@ -133,6 +133,7 @@ "@agentv/core": "workspace:*", "@arizeai/phoenix-client": "6.10.0", "@arizeai/phoenix-evals": "1.0.3", + "yaml": "^2.8.3", }, "devDependencies": { "tsup": "8.3.5", diff --git a/packages/phoenix-adapter/package.json b/packages/phoenix-adapter/package.json index e816e632a..87b9eb210 100644 --- a/packages/phoenix-adapter/package.json +++ b/packages/phoenix-adapter/package.json @@ -23,7 +23,8 @@ "dependencies": { "@agentv/core": "workspace:*", "@arizeai/phoenix-client": "6.10.0", - "@arizeai/phoenix-evals": "1.0.3" + "@arizeai/phoenix-evals": 
"1.0.3", + "yaml": "^2.8.3" }, "devDependencies": { "tsup": "8.3.5", diff --git a/packages/phoenix-adapter/src/evaluators/deterministic.ts b/packages/phoenix-adapter/src/evaluators/deterministic.ts index 593d03c74..5b804255a 100644 --- a/packages/phoenix-adapter/src/evaluators/deterministic.ts +++ b/packages/phoenix-adapter/src/evaluators/deterministic.ts @@ -83,7 +83,9 @@ function evaluateEquals( assertion: NormalizedAssertionConfig, context: EvaluationContext, ): EvaluatorResult { - const expected = assertionValue(assertion) ?? context.expectedOutput; + const expected = hasAssertionValue(assertion) + ? assertionValue(assertion) + : context.expectedOutput; const passed = stableValue(context.output) === stableValue(expected); return result( @@ -103,6 +105,15 @@ function evaluateIsJson( return result(assertion, passed, passed ? 'Output is valid JSON' : parsed.reason); } +function hasAssertionValue(assertion: NormalizedAssertionConfig): boolean { + return ( + 'value' in assertion || + 'expected' in assertion || + 'text' in assertion || + 'substring' in assertion + ); +} + function assertionValue(assertion: NormalizedAssertionConfig): unknown { if ('value' in assertion) return assertion.value; if ('expected' in assertion) return assertion.expected; diff --git a/packages/phoenix-adapter/test/evaluators/deterministic.test.ts b/packages/phoenix-adapter/test/evaluators/deterministic.test.ts index 527e23c74..1e233b2b9 100644 --- a/packages/phoenix-adapter/test/evaluators/deterministic.test.ts +++ b/packages/phoenix-adapter/test/evaluators/deterministic.test.ts @@ -66,6 +66,22 @@ describe('deterministic evaluator adapters', () => { expect(result.score).toBe(1); }); + test('equals preserves explicit null expected values', () => { + const pass = evaluateAssertion( + { type: 'equals', expected: null }, + { output: null, expectedOutput: 'fallback' }, + ); + const fail = evaluateAssertion( + { type: 'equals', expected: null }, + { output: 'fallback', expectedOutput: 'fallback' 
}, + ); + + expect(pass.passed).toBe(true); + expect(pass.score).toBe(1); + expect(fail.passed).toBe(false); + expect(fail.score).toBe(0); + }); + test('equals can fall back to context expected output', () => { const result = evaluateAssertion( { type: 'equals' }, From 9be0b5f9ec57d742f3bf3404ef2f852e0dac70f8 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Tue, 2 Jun 2026 14:26:01 +0200 Subject: [PATCH 6/6] fix(phoenix): preserve present expected outputs --- .../phoenix-adapter/src/agentv/load-spec.ts | 12 +++--- .../test/agentv-normalize.test.ts | 43 +++++++++++++++++++ 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/packages/phoenix-adapter/src/agentv/load-spec.ts b/packages/phoenix-adapter/src/agentv/load-spec.ts index 46d4f0270..dfb0ddd4c 100644 --- a/packages/phoenix-adapter/src/agentv/load-spec.ts +++ b/packages/phoenix-adapter/src/agentv/load-spec.ts @@ -39,12 +39,14 @@ function normalizeAssertion(assertion: unknown, index: number): NormalizedAssert function normalizeExpectedOutput(test: { readonly reference_answer?: string; - readonly expected_output?: readonly unknown[]; + readonly expected_output?: unknown; }): unknown { - const hasExpectedOutput = (test.expected_output?.length ?? 0) > 0; - if (hasExpectedOutput) return test.reference_answer ?? test.expected_output; - if (test.reference_answer && test.reference_answer.length > 0) return test.reference_answer; - return undefined; + const expectedOutput = test.expected_output; + const hasExpectedOutput = Array.isArray(expectedOutput) + ? expectedOutput.length > 0 + : expectedOutput !== undefined; + if (!hasExpectedOutput) return undefined; + return test.reference_answer ?? 
expectedOutput; } function deriveAgentVRoot(source: AgentVSource): string { diff --git a/packages/phoenix-adapter/test/agentv-normalize.test.ts b/packages/phoenix-adapter/test/agentv-normalize.test.ts index 5090c4267..89c7035f0 100644 --- a/packages/phoenix-adapter/test/agentv-normalize.test.ts +++ b/packages/phoenix-adapter/test/agentv-normalize.test.ts @@ -68,6 +68,49 @@ tests: expect(suite.cases[1]?.assertions[0]?.type).toBe('contains'); }); + test('preserves present object and empty-string expected output values', async () => { + const root = fixtureRoot('expected-values'); + const evalPath = path.join( + root, + 'examples', + 'features', + 'expected-values', + 'evals', + 'dataset.eval.yaml', + ); + mkdirSync(path.dirname(evalPath), { recursive: true }); + writeFileSync( + evalPath, + `name: expected-values +tests: + - id: object-output + input: hi + expected_output: + ok: true + - id: empty-string-output + input: hi + expected_output: "" + - id: assertion-only + input: hi + assertions: + - type: contains + value: ok +`, + ); + + const suite = await loadAgentVEvalSuite({ + path: evalPath, + relativePath: 'examples/features/expected-values/evals/dataset.eval.yaml', + kind: 'eval-yaml', + }); + + expect(suite.cases.map((testCase) => testCase.expectedOutput)).toEqual([ + '{\n "ok": true\n}', + '', + undefined, + ]); + }); + test('leaves assertion-only expected output absent for Phoenix synthesis', async () => { const sourcePath = path.resolve('../../examples/features/assert/evals/dataset.eval.yaml'); const suite = await loadAgentVEvalSuite({