diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml index 0d77335b..f34d9c30 100644 --- a/.agentv/targets.yaml +++ b/.agentv/targets.yaml @@ -112,6 +112,9 @@ targets: - name: codex provider: codex + executable: ${{ CODEX_EXECUTABLE }} + model: ${{ CODEX_MODEL }} + model_reasoning_effort: ${{ CODEX_REASONING_EFFORT }} grader_target: grader cwd: ${{ CODEX_WORKSPACE_DIR }} log_dir: ${{ CODEX_LOG_DIR }} diff --git a/apps/cli/package.json b/apps/cli/package.json index 1b691367..1917cc12 100644 --- a/apps/cli/package.json +++ b/apps/cli/package.json @@ -33,7 +33,7 @@ "@hono/node-server": "^1.19.11", "@inquirer/prompts": "^8.2.1", "@earendil-works/pi-ai": "^0.74.0", - "@openai/codex-sdk": "^0.104.0", + "@openai/codex-sdk": "^0.136.0", "cmd-ts": "^0.14.3", "dotenv": "^16.4.5", "fast-glob": "^3.3.3", diff --git a/apps/web/src/content/docs/docs/targets/coding-agents.mdx b/apps/web/src/content/docs/docs/targets/coding-agents.mdx index 51eaab50..19106a46 100644 --- a/apps/web/src/content/docs/docs/targets/coding-agents.mdx +++ b/apps/web/src/content/docs/docs/targets/coding-agents.mdx @@ -112,11 +112,17 @@ Since `cc-mirror` resolves to `claude-cli`, all Claude target fields (model, sys targets: - name: codex_target provider: codex + executable: codex-eng + model: ${{ CODEX_MODEL }} + model_reasoning_effort: ${{ CODEX_REASONING_EFFORT }} grader_target: azure-base ``` | Field | Required | Description | |-------|----------|-------------| +| `executable` | No | Codex binary or profile shim to run, such as `codex-eng` | +| `model` | No | Model to use | +| `model_reasoning_effort` | No | Codex SDK reasoning effort: `minimal`, `low`, `medium`, `high`, or `xhigh` | | `cwd` | No | Working directory | | `grader_target` | Yes | LLM target for evaluation | diff --git a/bun.lock b/bun.lock index 87292c44..4d4876a2 100644 --- a/bun.lock +++ b/bun.lock @@ -21,7 +21,7 @@ }, "apps/cli": { "name": "agentv", - "version": "4.31.4-next.1", + "version": "4.32.0-next.1", "bin": { "agentv": "./dist/cli.js", }, @@ -31,7 +31,7 @@ "@github/copilot-sdk": "^0.1.25", "@hono/node-server": "^1.19.11", "@inquirer/prompts": "^8.2.1", - "@openai/codex-sdk": "^0.104.0", + "@openai/codex-sdk": "^0.136.0", "cmd-ts": "^0.14.3", "dotenv": "^16.4.5", "fast-glob": "^3.3.3", @@ -85,13 +85,13 @@ }, "packages/core": { "name": "@agentv/core", - "version": "4.31.4-next.1", + "version": "4.32.0-next.1", "dependencies": { "@agentclientprotocol/sdk": "^0.14.1", "@agentv/eval": "workspace:*", "@earendil-works/pi-ai": "^0.74.0", "@github/copilot-sdk": "^0.1.25", - "@openai/codex-sdk": "^0.104.0", + "@openai/codex-sdk": "^0.136.0", "fast-glob": "^3.3.3", "json5": "^2.2.3", "micromatch": "^4.0.8", @@ -121,7 +121,7 @@ }, "packages/eval": { "name": "@agentv/eval", - "version": "4.31.4-next.1", + "version": "4.32.0-next.1", "dependencies": { "zod": "^3.23.8", }, @@ -536,21 +536,21 @@ "@nodelib/fs.walk": ["@nodelib/fs.walk@1.2.8", "", { "dependencies": { "@nodelib/fs.scandir": "2.1.5", "fastq": "^1.6.0" } }, "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg=="], - "@openai/codex": ["@openai/codex@0.104.0", "", { "optionalDependencies": { "@openai/codex-darwin-arm64": "npm:@openai/codex@0.104.0-darwin-arm64", "@openai/codex-darwin-x64": "npm:@openai/codex@0.104.0-darwin-x64", "@openai/codex-linux-arm64": "npm:@openai/codex@0.104.0-linux-arm64", "@openai/codex-linux-x64": "npm:@openai/codex@0.104.0-linux-x64", "@openai/codex-win32-arm64": "npm:@openai/codex@0.104.0-win32-arm64", "@openai/codex-win32-x64": "npm:@openai/codex@0.104.0-win32-x64" }, "bin": { "codex": "bin/codex.js" } }, "sha512-pPa2VGHozwjPsPOYAEXcH7nNt1QH7AZR8zV8jYx6BFi1LJlmJkan2rvIS4MYbPdi2O6cd5kWfPCAHE0fEV2ifA=="], + "@openai/codex": ["@openai/codex@0.136.0", "", { "optionalDependencies": { "@openai/codex-darwin-arm64": "npm:@openai/codex@0.136.0-darwin-arm64", "@openai/codex-darwin-x64": "npm:@openai/codex@0.136.0-darwin-x64", "@openai/codex-linux-arm64": "npm:@openai/codex@0.136.0-linux-arm64", "@openai/codex-linux-x64": "npm:@openai/codex@0.136.0-linux-x64", "@openai/codex-win32-arm64": "npm:@openai/codex@0.136.0-win32-arm64", "@openai/codex-win32-x64": "npm:@openai/codex@0.136.0-win32-x64" }, "bin": { "codex": "bin/codex.js" } }, "sha512-yu+diXznSOt8h296/CDOf2CNi55z1raA7aZ6sejjGy1QWPqn72YkBc2ByTzZG6G+1yiUqlm2G5Hid54hm3/kaA=="], - "@openai/codex-darwin-arm64": ["@openai/codex@0.104.0-darwin-arm64", "", { "os": "darwin", "cpu": "arm64" }, "sha512-Y+lifRKAgNSBcaIM5UXXYnGWAJrPORPXABZBCxxiwwB8/XzZRDwp3K+X5i7dT0GfKScGFXuul6sJ2sVSPL4w4A=="], + "@openai/codex-darwin-arm64": ["@openai/codex@0.136.0-darwin-arm64", "", { "os": "darwin", "cpu": "arm64" }, "sha512-9xnEohnUO9l6qtPSDbG0Y2UXX5mBZ9Lsn0VMgjtkREGfWL9TawNC1d7D1ERMSJtHTlVvieoaFyK9LHPLQCO9Vw=="], - "@openai/codex-darwin-x64": ["@openai/codex@0.104.0-darwin-x64", "", { "os": "darwin", "cpu": "x64" }, "sha512-TwQ9zj0XbSrtCxFWKnnSQfmWmKhNMx1rSpSaSrLNSFVohxRwOWUZ2GBciO6jCLEiJvswR6nTMy1mA0n7MyVJiw=="], + "@openai/codex-darwin-x64": ["@openai/codex@0.136.0-darwin-x64", "", { "os": "darwin", "cpu": "x64" }, "sha512-HD9YLalPg0cv+CiZSmRWOl/8sefuJlxJcAmxyzb8mSaAMchD3V+2yS6M6E1Z2I6NX1irp4Polt1fPsjGdlHYhQ=="], - "@openai/codex-linux-arm64": ["@openai/codex@0.104.0-linux-arm64", "", { "os": "linux", "cpu": "arm64" }, "sha512-3oBBjMaCnhGfijsklOzVqG0LH/IFWoDnRJkvFl1utMI+GJECUr37uL/KsSFTuC2kIjham6U57dAK6xQnQxqxPQ=="], + "@openai/codex-linux-arm64": ["@openai/codex@0.136.0-linux-arm64", "", { "os": "linux", "cpu": "arm64" }, "sha512-xDxTOk5ClEX/nzlQseqEoiubjIzVZfcWKn9nmKkHOLKknz+c7n6tbmw7eY4mwt29zTAAoFoecdRnbbsPZUHJ1g=="], - "@openai/codex-linux-x64": ["@openai/codex@0.104.0-linux-x64", "", { "os": "linux", "cpu": "x64" }, "sha512-vhYaWsEwZmxZbeu5u9/k3VO1F4aTMYaTCebRgdzux7bfeDw2nms1SAcP+AkfCStqVSz26yaPGbwcUMqaknW4gQ=="], + "@openai/codex-linux-x64": ["@openai/codex@0.136.0-linux-x64", "", { "os": "linux", "cpu": "x64" }, "sha512-Q6yZHRtUWD+27B5yAiOYyPAtM0BzW0Mm23YGaXDmfNEO5BYgHuUT80VDofUpjSyo2+ShYoZQUFVQAsNPKqd+Sw=="], - "@openai/codex-sdk": ["@openai/codex-sdk@0.104.0", "", { "dependencies": { "@openai/codex": "0.104.0" } }, "sha512-eXnGqFUh4BRASRK4f8IyLHQG7b4DUjfM7GaasLUNggneUEUVmBgEP24mTo6Qu53oIuA1t+j1QxdCQbxAlWZKPA=="], + "@openai/codex-sdk": ["@openai/codex-sdk@0.136.0", "", { "dependencies": { "@openai/codex": "0.136.0" } }, "sha512-qrSirrVxrpVHR1YIQQ4WSouneaQrQrRAAlQWJYRflvMCcruFXorsmIPeIbDqFXOa8A9H9FHHyc+2U4tdbEGGiQ=="], - "@openai/codex-win32-arm64": ["@openai/codex@0.104.0-win32-arm64", "", { "os": "win32", "cpu": "arm64" }, "sha512-2ypuM6yWcjAtq7DmEgFBsmtw7rWLcoy6Cxaq+Hn8dZfEdijASyc59AzyWhWLKYLuOxcprFn/oQitElrpPD9JOA=="], + "@openai/codex-win32-arm64": ["@openai/codex@0.136.0-win32-arm64", "", { "os": "win32", "cpu": "arm64" }, "sha512-mAHZXfygQxBg1JjZ9+bAZfBXe6fg8MabJhuDYhj5z0VF++orKSFsC1lMiv6PksQN36ikQefgVjc9EOlaE4lt7g=="], - "@openai/codex-win32-x64": ["@openai/codex@0.104.0-win32-x64", "", { "os": "win32", "cpu": "x64" }, "sha512-awyNLtfbTbj+2JzgsAIm+KFrxeAmxe/Fuqw/ZwBj8ljtO7SQWTT3kxDbf7iuA7E7IErGlQw/plgFgq/LJdsacg=="], + "@openai/codex-win32-x64": ["@openai/codex@0.136.0-win32-x64", "", { "os": "win32", "cpu": "x64" }, "sha512-zS6DAmvjdWeAB1CL9KTUMkwzTwfXtxHy8GAtePw2a93jIqawoG07fBxAXuyoHZ3QXQkwEgqBx1zEEh33gdIKAw=="], "@opentelemetry/api": ["@opentelemetry/api@1.9.0", "", {}, "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg=="], diff --git a/packages/core/package.json b/packages/core/package.json index 92408165..4357cce5 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -44,7 +44,7 @@ "@agentv/eval": "workspace:*", "@github/copilot-sdk": "^0.1.25", "@earendil-works/pi-ai": "^0.74.0", - "@openai/codex-sdk": "^0.104.0", + "@openai/codex-sdk": "^0.136.0", "fast-glob": "^3.3.3", "json5": "^2.2.3", "micromatch": "^4.0.8", diff --git a/packages/core/src/evaluation/providers/codex.ts b/packages/core/src/evaluation/providers/codex.ts index 4318ec5c..c23365f1 100644 --- a/packages/core/src/evaluation/providers/codex.ts +++ b/packages/core/src/evaluation/providers/codex.ts @@ -70,6 +70,9 @@ export class CodexProvider implements Provider { // Build Codex SDK options // biome-ignore lint/suspicious/noExplicitAny: SDK constructor options are dynamic const codexOptions: any = {}; + if (this.config.executable) { + codexOptions.codexPathOverride = this.config.executable; + } if (this.config.model) { codexOptions.config = { model: this.config.model }; } @@ -86,6 +89,9 @@ export class CodexProvider implements Provider { if (cwd) { threadOptions.workingDirectory = cwd; } + if (this.config.modelReasoningEffort) { + threadOptions.modelReasoningEffort = this.config.modelReasoningEffort; + } const thread = codex.startThread(threadOptions); diff --git a/packages/core/src/evaluation/providers/targets.ts b/packages/core/src/evaluation/providers/targets.ts index 9ed10861..b161605d 100644 --- a/packages/core/src/evaluation/providers/targets.ts +++ b/packages/core/src/evaluation/providers/targets.ts @@ -424,6 +424,7 @@ export interface GeminiResolvedConfig { export interface CodexResolvedConfig { readonly model?: string; + readonly modelReasoningEffort?: CodexModelReasoningEffort; readonly executable: string; readonly args?: readonly string[]; readonly cwd?: string; @@ -596,6 +597,17 @@ const DEPRECATED_TARGET_CAMEL_CASE_FIELDS = new Map([ ['retryMaxDelayMs', 'retry_max_delay_ms'], ['retryBackoffFactor', 'retry_backoff_factor'], ['retryStatusCodes', 'retry_status_codes'], + ['modelReasoningEffort', 'model_reasoning_effort'], +]); + +export type CodexModelReasoningEffort = 'minimal' | 'low' | 'medium' | 'high' | 'xhigh'; + +const CODEX_MODEL_REASONING_EFFORT_VALUES = new Set([ + 'minimal', + 'low', + 'medium', + 'high', + 'xhigh', ]); const DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS = new Map([ @@ -1254,6 +1266,7 @@ function resolveCodexConfig( _evalFilePath?: string, ): CodexResolvedConfig { const modelSource = target.model; + const modelReasoningEffortSource = target.model_reasoning_effort; const executableSource = target.executable ?? target.command ?? target.binary; const argsSource = target.args ?? target.arguments; const cwdSource = target.cwd; @@ -1272,6 +1285,17 @@ function resolveCodexConfig( allowLiteral: true, optionalEnv: true, }); + const modelReasoningEffort = normalizeCodexModelReasoningEffort( + resolveOptionalString( + modelReasoningEffortSource, + env, + `${target.name} codex model reasoning effort`, + { + allowLiteral: true, + optionalEnv: true, + }, + ), + ); const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, { @@ -1300,6 +1324,7 @@ function resolveCodexConfig( return { model, + modelReasoningEffort, executable, args, cwd, @@ -1311,6 +1336,23 @@ function resolveCodexConfig( }; } +function normalizeCodexModelReasoningEffort( + value: string | undefined, +): CodexModelReasoningEffort | undefined { + if (value === undefined) { + return undefined; + } + + const normalized = value.trim().toLowerCase(); + if (CODEX_MODEL_REASONING_EFFORT_VALUES.has(normalized as CodexModelReasoningEffort)) { + return normalized as CodexModelReasoningEffort; + } + + throw new Error( + `codex model_reasoning_effort must be one of: ${[...CODEX_MODEL_REASONING_EFFORT_VALUES].join(', ')}`, + ); +} + function normalizeCodexLogFormat(value: unknown): 'summary' | 'json' | undefined { if (value === undefined || value === null) { return undefined; diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts index 3670705e..dcdf2311 100644 --- a/packages/core/src/evaluation/providers/types.ts +++ b/packages/core/src/evaluation/providers/types.ts @@ -391,6 +391,7 @@ export interface TargetDefinition { readonly binary?: string | unknown | undefined; readonly args?: unknown | undefined; readonly arguments?: unknown | undefined; + readonly model_reasoning_effort?: string | unknown | undefined; readonly cwd?: string | unknown | undefined; readonly timeout_seconds?: number | unknown | undefined; readonly log_dir?: string | unknown | undefined; diff --git a/packages/core/src/evaluation/validation/targets-validator.ts b/packages/core/src/evaluation/validation/targets-validator.ts index 1f9d04c0..9395b5f5 100644 --- a/packages/core/src/evaluation/validation/targets-validator.ts +++ b/packages/core/src/evaluation/validation/targets-validator.ts @@ -96,6 +96,7 @@ const GEMINI_SETTINGS = new Set([ const CODEX_SETTINGS = new Set([ ...COMMON_SETTINGS, 'model', + 'model_reasoning_effort', 'executable', 'command', 'binary', diff --git a/packages/core/test/evaluation/providers/codex-sdk.test.ts b/packages/core/test/evaluation/providers/codex-sdk.test.ts index ea3c903d..e279cc3c 100644 --- a/packages/core/test/evaluation/providers/codex-sdk.test.ts +++ b/packages/core/test/evaluation/providers/codex-sdk.test.ts @@ -128,9 +128,42 @@ describe('CodexProvider (SDK)', () => { await provider.invoke({ question: 'Test' }); const constructorArgs = CodexMock.mock.calls[0][0]; + expect(constructorArgs.codexPathOverride).toBe('codex'); expect(constructorArgs.config.model).toBe('o4-mini'); }); + it('passes executable config to Codex constructor as codexPathOverride', async () => { + const thread = createMockThread({ + events: [ + { + type: 'item.completed', + item: { id: 'msg-1', type: 'agent_message', text: 'response' }, + }, + { + type: 'turn.completed', + usage: { input_tokens: 10, output_tokens: 5, cached_input_tokens: 0 }, + }, + ], + }); + const codexInstance = createMockCodex(thread); + + const CodexMock = mock(function Codex() { + return codexInstance; + }); + mock.module('@openai/codex-sdk', () => ({ Codex: CodexMock })); + + const { CodexProvider } = await import('../../../src/evaluation/providers/codex.js'); + + const provider = new CodexProvider('test-target', { + executable: 'codex-eng', + }); + + await provider.invoke({ question: 'Test' }); + + const constructorArgs = CodexMock.mock.calls[0][0]; + expect(constructorArgs.codexPathOverride).toBe('codex-eng'); + }); + it('passes workingDirectory to startThread', async () => { const thread = createMockThread({ events: [ @@ -163,6 +196,37 @@ describe('CodexProvider (SDK)', () => { expect(threadOptions.workingDirectory).toBe(path.resolve('/tmp/test-workspace')); }); + it('passes modelReasoningEffort to startThread', async () => { + const thread = createMockThread({ + events: [ + { + type: 'item.completed', + item: { id: 'msg-1', type: 'agent_message', text: 'response' }, + }, + { + type: 'turn.completed', + usage: { input_tokens: 10, output_tokens: 5, cached_input_tokens: 0 }, + }, + ], + }); + const codexInstance = createMockCodex(thread); + const sdkMock = mockCodexSdk(codexInstance); + + mock.module('@openai/codex-sdk', () => sdkMock); + + const { CodexProvider } = await import('../../../src/evaluation/providers/codex.js'); + + const provider = new CodexProvider('test-target', { + executable: 'codex', + modelReasoningEffort: 'low', + }); + + await provider.invoke({ question: 'Test' }); + + const threadOptions = codexInstance.startThread.mock.calls[0][0]; + expect(threadOptions.modelReasoningEffort).toBe('low'); + }); + it('handles timeout', async () => { const thread = createMockThread(); // Override runStreamed to be slow diff --git a/packages/core/test/evaluation/providers/targets.test.ts b/packages/core/test/evaluation/providers/targets.test.ts index 038dbe9e..00de80eb 100644 --- a/packages/core/test/evaluation/providers/targets.test.ts +++ b/packages/core/test/evaluation/providers/targets.test.ts @@ -626,6 +626,42 @@ describe('resolveTargetDefinition', () => { expect(target.config.args).toEqual(['--profile', 'default', '--model', 'gpt-4']); }); + it('resolves codex model_reasoning_effort from env', () => { + const target = resolveTargetDefinition( + { + name: 'codex', + provider: 'codex', + model: '${{ CODEX_MODEL }}', + model_reasoning_effort: '${{ CODEX_REASONING_EFFORT }}', + }, + { + CODEX_MODEL: 'gpt-5.5', + CODEX_REASONING_EFFORT: 'low', + }, + ); + + expect(target.kind).toBe('codex'); + if (target.kind !== 'codex') { + throw new Error('expected codex target'); + } + + expect(target.config.model).toBe('gpt-5.5'); + expect(target.config.modelReasoningEffort).toBe('low'); + }); + + it('rejects unsupported codex model_reasoning_effort values', () => { + expect(() => + resolveTargetDefinition( + { + name: 'codex', + provider: 'codex', + model_reasoning_effort: 'tiny', + }, + {}, + ), + ).toThrow(/model_reasoning_effort must be one of: minimal, low, medium, high, xhigh/); + }); + it('resolves copilot alias to copilot-cli', () => { const target = resolveTargetDefinition( { diff --git a/packages/core/test/evaluation/validation/targets-validator.test.ts b/packages/core/test/evaluation/validation/targets-validator.test.ts index 24dc65c6..41f14e4e 100644 --- a/packages/core/test/evaluation/validation/targets-validator.test.ts +++ b/packages/core/test/evaluation/validation/targets-validator.test.ts @@ -50,6 +50,7 @@ describe('validateTargetsFile', () => { timeoutSeconds: 30 logDir: ./logs systemPrompt: Be precise. + modelReasoningEffort: low - name: cli-target provider: cli command: echo {PROMPT} @@ -86,6 +87,14 @@ describe('validateTargetsFile', () => { error.message.includes("Use 'system_prompt' instead"), ), ).toBe(true); + expect( + result.errors.some( + (error) => + error.severity === 'error' && + error.location === 'targets[0].modelReasoningEffort' && + error.message.includes("Use 'model_reasoning_effort' instead"), + ), + ).toBe(true); expect( result.errors.some( (error) => @@ -96,6 +105,23 @@ describe('validateTargetsFile', () => { ).toBe(true); }); + it('accepts codex model_reasoning_effort', async () => { + const filePath = path.join(tempDir, 'codex-reasoning-effort.yaml'); + await writeFile( + filePath, + `targets: + - name: codex-target + provider: codex + model: \${{ CODEX_MODEL }} + model_reasoning_effort: \${{ CODEX_REASONING_EFFORT }} +`, + ); + + const result = await validateTargetsFile(filePath); + + expect(result.valid).toBe(true); + }); + it('rejects azure api_format with a migration error', async () => { const filePath = path.join(tempDir, 'azure-api-format.yaml'); await writeFile(