Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .agentv/targets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,9 @@ targets:

- name: codex
provider: codex
executable: ${{ CODEX_EXECUTABLE }}
model: ${{ CODEX_MODEL }}
model_reasoning_effort: ${{ CODEX_REASONING_EFFORT }}
grader_target: grader
cwd: ${{ CODEX_WORKSPACE_DIR }}
log_dir: ${{ CODEX_LOG_DIR }}
Expand Down
2 changes: 1 addition & 1 deletion apps/cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
"@hono/node-server": "^1.19.11",
"@inquirer/prompts": "^8.2.1",
"@earendil-works/pi-ai": "^0.74.0",
"@openai/codex-sdk": "^0.104.0",
"@openai/codex-sdk": "^0.136.0",
"cmd-ts": "^0.14.3",
"dotenv": "^16.4.5",
"fast-glob": "^3.3.3",
Expand Down
6 changes: 6 additions & 0 deletions apps/web/src/content/docs/docs/targets/coding-agents.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,17 @@ Since `cc-mirror` resolves to `claude-cli`, all Claude target fields (model, sys
targets:
- name: codex_target
provider: codex
executable: codex-eng
model: ${{ CODEX_MODEL }}
model_reasoning_effort: ${{ CODEX_REASONING_EFFORT }}
grader_target: azure-base
```

| Field | Required | Description |
|-------|----------|-------------|
| `executable` | No | Codex binary or profile shim to run, such as `codex-eng` |
| `model` | No | Model to use |
| `model_reasoning_effort` | No | Codex SDK reasoning effort: `minimal`, `low`, `medium`, `high`, or `xhigh` |
| `cwd` | No | Working directory |
| `grader_target` | Yes | LLM target for evaluation |

Expand Down
26 changes: 13 additions & 13 deletions bun.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
"@agentv/eval": "workspace:*",
"@github/copilot-sdk": "^0.1.25",
"@earendil-works/pi-ai": "^0.74.0",
"@openai/codex-sdk": "^0.104.0",
"@openai/codex-sdk": "^0.136.0",
"fast-glob": "^3.3.3",
"json5": "^2.2.3",
"micromatch": "^4.0.8",
Expand Down
6 changes: 6 additions & 0 deletions packages/core/src/evaluation/providers/codex.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@ export class CodexProvider implements Provider {
// Build Codex SDK options
// biome-ignore lint/suspicious/noExplicitAny: SDK constructor options are dynamic
const codexOptions: any = {};
if (this.config.executable) {
codexOptions.codexPathOverride = this.config.executable;
}
if (this.config.model) {
codexOptions.config = { model: this.config.model };
}
Expand All @@ -86,6 +89,9 @@ export class CodexProvider implements Provider {
if (cwd) {
threadOptions.workingDirectory = cwd;
}
if (this.config.modelReasoningEffort) {
threadOptions.modelReasoningEffort = this.config.modelReasoningEffort;
}

const thread = codex.startThread(threadOptions);

Expand Down
42 changes: 42 additions & 0 deletions packages/core/src/evaluation/providers/targets.ts
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,7 @@ export interface GeminiResolvedConfig {

export interface CodexResolvedConfig {
readonly model?: string;
readonly modelReasoningEffort?: CodexModelReasoningEffort;
readonly executable: string;
readonly args?: readonly string[];
readonly cwd?: string;
Expand Down Expand Up @@ -596,6 +597,17 @@ const DEPRECATED_TARGET_CAMEL_CASE_FIELDS = new Map<string, string>([
['retryMaxDelayMs', 'retry_max_delay_ms'],
['retryBackoffFactor', 'retry_backoff_factor'],
['retryStatusCodes', 'retry_status_codes'],
['modelReasoningEffort', 'model_reasoning_effort'],
]);

/** Reasoning effort levels accepted for a Codex target's `model_reasoning_effort` setting. */
export type CodexModelReasoningEffort = 'minimal' | 'low' | 'medium' | 'high' | 'xhigh';

/**
 * Runtime membership table holding every {@link CodexModelReasoningEffort} value.
 *
 * A Set iterates in insertion order, so the order written here is the order seen by
 * any code that spreads or joins the set.
 */
const CODEX_MODEL_REASONING_EFFORT_VALUES = new Set<CodexModelReasoningEffort>(
  ['minimal', 'low', 'medium', 'high', 'xhigh'] as const,
);

const DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS = new Map<string, string>([
Expand Down Expand Up @@ -1254,6 +1266,7 @@ function resolveCodexConfig(
_evalFilePath?: string,
): CodexResolvedConfig {
const modelSource = target.model;
const modelReasoningEffortSource = target.model_reasoning_effort;
const executableSource = target.executable ?? target.command ?? target.binary;
const argsSource = target.args ?? target.arguments;
const cwdSource = target.cwd;
Expand All @@ -1272,6 +1285,17 @@ function resolveCodexConfig(
allowLiteral: true,
optionalEnv: true,
});
const modelReasoningEffort = normalizeCodexModelReasoningEffort(
resolveOptionalString(
modelReasoningEffortSource,
env,
`${target.name} codex model reasoning effort`,
{
allowLiteral: true,
optionalEnv: true,
},
),
);

const executable =
resolveOptionalString(executableSource, env, `${target.name} codex executable`, {
Expand Down Expand Up @@ -1300,6 +1324,7 @@ function resolveCodexConfig(

return {
model,
modelReasoningEffort,
executable,
args,
cwd,
Expand All @@ -1311,6 +1336,23 @@ function resolveCodexConfig(
};
}

/**
 * Validates and canonicalizes a resolved `model_reasoning_effort` setting.
 *
 * Matching ignores surrounding whitespace and letter case.
 *
 * @param value - Resolved setting text, or `undefined` when the setting is absent.
 * @returns The matching lowercase effort level, or `undefined` when `value` is `undefined`.
 * @throws Error when `value` is not a supported effort level. The message lists the
 * supported levels and quotes the rejected value.
 */
function normalizeCodexModelReasoningEffort(
  value: string | undefined,
): CodexModelReasoningEffort | undefined {
  if (value === undefined) {
    return undefined;
  }

  const normalized = value.trim().toLowerCase();
  if (CODEX_MODEL_REASONING_EFFORT_VALUES.has(normalized as CodexModelReasoningEffort)) {
    return normalized as CodexModelReasoningEffort;
  }

  // Quote the rejected value so a bad substitution is visible in the error;
  // JSON.stringify keeps empty strings and stray whitespace recognizable.
  // The leading text is unchanged, so existing matches on it keep working.
  const supported = [...CODEX_MODEL_REASONING_EFFORT_VALUES].join(', ');
  throw new Error(
    `codex model_reasoning_effort must be one of: ${supported} (received ${JSON.stringify(value)})`,
  );
}

function normalizeCodexLogFormat(value: unknown): 'summary' | 'json' | undefined {
if (value === undefined || value === null) {
return undefined;
Expand Down
1 change: 1 addition & 0 deletions packages/core/src/evaluation/providers/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,7 @@ export interface TargetDefinition {
readonly binary?: string | unknown | undefined;
readonly args?: unknown | undefined;
readonly arguments?: unknown | undefined;
readonly model_reasoning_effort?: string | unknown | undefined;
readonly cwd?: string | unknown | undefined;
readonly timeout_seconds?: number | unknown | undefined;
readonly log_dir?: string | unknown | undefined;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ const GEMINI_SETTINGS = new Set([
const CODEX_SETTINGS = new Set([
...COMMON_SETTINGS,
'model',
'model_reasoning_effort',
'executable',
'command',
'binary',
Expand Down
64 changes: 64 additions & 0 deletions packages/core/test/evaluation/providers/codex-sdk.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,42 @@ describe('CodexProvider (SDK)', () => {
await provider.invoke({ question: 'Test' });

const constructorArgs = CodexMock.mock.calls[0][0];
expect(constructorArgs.codexPathOverride).toBe('codex');
expect(constructorArgs.config.model).toBe('o4-mini');
});

// Checks that a configured `executable` is handed to the Codex SDK constructor
// under the `codexPathOverride` option.
it('passes executable config to Codex constructor as codexPathOverride', async () => {
// Minimal event stream: one completed agent message followed by a turn completion.
const thread = createMockThread({
events: [
{
type: 'item.completed',
item: { id: 'msg-1', type: 'agent_message', text: 'response' },
},
{
type: 'turn.completed',
usage: { input_tokens: 10, output_tokens: 5, cached_input_tokens: 0 },
},
],
});
const codexInstance = createMockCodex(thread);

// Constructor mock: its recorded calls expose the options passed to `Codex`.
const CodexMock = mock(function Codex() {
return codexInstance;
});
mock.module('@openai/codex-sdk', () => ({ Codex: CodexMock }));

// The provider is imported only after the SDK module mock has been registered.
const { CodexProvider } = await import('../../../src/evaluation/providers/codex.js');

const provider = new CodexProvider('test-target', {
executable: 'codex-eng',
});

await provider.invoke({ question: 'Test' });

// First argument of the first constructor call is the SDK options object.
const constructorArgs = CodexMock.mock.calls[0][0];
expect(constructorArgs.codexPathOverride).toBe('codex-eng');
});

it('passes workingDirectory to startThread', async () => {
const thread = createMockThread({
events: [
Expand Down Expand Up @@ -163,6 +196,37 @@ describe('CodexProvider (SDK)', () => {
expect(threadOptions.workingDirectory).toBe(path.resolve('/tmp/test-workspace'));
});

// Checks that a configured `modelReasoningEffort` is included in the options
// passed to `startThread`.
it('passes modelReasoningEffort to startThread', async () => {
// Minimal event stream: one completed agent message followed by a turn completion.
const thread = createMockThread({
events: [
{
type: 'item.completed',
item: { id: 'msg-1', type: 'agent_message', text: 'response' },
},
{
type: 'turn.completed',
usage: { input_tokens: 10, output_tokens: 5, cached_input_tokens: 0 },
},
],
});
const codexInstance = createMockCodex(thread);
const sdkMock = mockCodexSdk(codexInstance);

mock.module('@openai/codex-sdk', () => sdkMock);

// The provider is imported only after the SDK module mock has been registered.
const { CodexProvider } = await import('../../../src/evaluation/providers/codex.js');

const provider = new CodexProvider('test-target', {
executable: 'codex',
modelReasoningEffort: 'low',
});

await provider.invoke({ question: 'Test' });

// First argument of the first `startThread` call is the thread options object.
const threadOptions = codexInstance.startThread.mock.calls[0][0];
expect(threadOptions.modelReasoningEffort).toBe('low');
});

it('handles timeout', async () => {
const thread = createMockThread();
// Override runStreamed to be slow
Expand Down
36 changes: 36 additions & 0 deletions packages/core/test/evaluation/providers/targets.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -626,6 +626,42 @@ describe('resolveTargetDefinition', () => {
expect(target.config.args).toEqual(['--profile', 'default', '--model', 'gpt-4']);
});

// Checks that `${{ VAR }}` placeholders in `model` and `model_reasoning_effort`
// are replaced with the values from the env map given as the second argument.
it('resolves codex model_reasoning_effort from env', () => {
const target = resolveTargetDefinition(
{
name: 'codex',
provider: 'codex',
model: '${{ CODEX_MODEL }}',
model_reasoning_effort: '${{ CODEX_REASONING_EFFORT }}',
},
{
CODEX_MODEL: 'gpt-5.5',
CODEX_REASONING_EFFORT: 'low',
},
);

expect(target.kind).toBe('codex');
// Guard so the assertions below only run against a codex-kind target.
if (target.kind !== 'codex') {
throw new Error('expected codex target');
}

expect(target.config.model).toBe('gpt-5.5');
// The snake_case setting is exposed on the resolved config as `modelReasoningEffort`.
expect(target.config.modelReasoningEffort).toBe('low');
});

// Checks that a literal effort value outside the supported list makes target
// resolution throw, and that the error message lists the supported values.
it('rejects unsupported codex model_reasoning_effort values', () => {
expect(() =>
resolveTargetDefinition(
{
name: 'codex',
provider: 'codex',
// 'tiny' is not among the values listed in the expected error message.
model_reasoning_effort: 'tiny',
},
{},
),
).toThrow(/model_reasoning_effort must be one of: minimal, low, medium, high, xhigh/);
});

it('resolves copilot alias to copilot-cli', () => {
const target = resolveTargetDefinition(
{
Expand Down
Loading
Loading