diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml index 78d650b43d..a4a1155bdf 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -64,7 +64,7 @@ body: attributes: value: | --- - Optional (for contributors): You can stop here if you're just proposing the improvement. + Optional: You can stop here if you're just proposing the improvement. - type: textarea id: acceptance-criteria diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml deleted file mode 100644 index b99fd7659e..0000000000 --- a/.github/workflows/evals.yml +++ /dev/null @@ -1,74 +0,0 @@ -name: Evals - -on: - pull_request: - types: [labeled] - workflow_dispatch: - -env: - DOCKER_BUILDKIT: 1 - COMPOSE_DOCKER_CLI_BUILD: 1 - -jobs: - evals: - # Run if triggered manually or if PR has 'evals' label. - if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'evals') - runs-on: blacksmith-16vcpu-ubuntu-2404 - timeout-minutes: 45 - - defaults: - run: - working-directory: packages/evals - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Create environment - run: | - cat > .env.local << EOF - OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY || 'test-key-for-build' }} - EOF - - cat > .env.development << EOF - NODE_ENV=development - DATABASE_URL=postgresql://postgres:password@db:5432/evals_development - REDIS_URL=redis://redis:6379 - HOST_EXECUTION_METHOD=docker - EOF - - - name: Build image - uses: docker/build-push-action@v6 - with: - context: . - file: packages/evals/Dockerfile.runner - tags: evals-runner:latest - cache-from: type=gha - cache-to: type=gha,mode=max - push: false - load: true - - - name: Tag image - run: docker tag evals-runner:latest evals-runner - - - name: Start containers - run: | - docker compose up -d db redis - timeout 60 bash -c 'until docker compose exec -T db pg_isready -U postgres; do sleep 2; done' - timeout 60 bash -c 'until docker compose exec -T redis redis-cli ping | grep -q PONG; do sleep 2; done' - docker compose run --rm runner sh -c 'nc -z db 5432 && echo "āœ“ Runner -> Database connection successful"' - docker compose run --rm runner sh -c 'nc -z redis 6379 && echo "āœ“ Runner -> Redis connection successful"' - docker compose run --rm runner docker ps - - - name: Run database migrations - run: docker compose run --rm runner pnpm --filter @roo-code/evals db:migrate - - - name: Run evals - run: docker compose run --rm runner pnpm --filter @roo-code/evals cli --ci - - - name: Cleanup - if: always() - run: docker compose down -v --remove-orphans diff --git a/.github/workflows/update-contributors.yml b/.github/workflows/update-contributors.yml deleted file mode 100644 index 5709bdc10a..0000000000 --- a/.github/workflows/update-contributors.yml +++ /dev/null @@ -1,67 +0,0 @@ -name: Update Contributors # Refresh contrib.rocks image cache - -on: - workflow_dispatch: - -permissions: - contents: write - pull-requests: write - -jobs: - refresh-contrib-cache: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Bump cacheBust in all README files - run: | - set -euo pipefail - TS="$(date +%s)" - # Target only the root README.md and localized READMEs under locales/*/README.md - mapfile -t FILES < <(git ls-files README.md 'locales/*/README.md' || true) - - if [ "${#FILES[@]}" -eq 0 ]; then - echo "No target README files found." >&2 - exit 1 - fi - - UPDATED=0 - for f in "${FILES[@]}"; do - if grep -q 'cacheBust=' "$f"; then - # Use portable sed in GNU environment of ubuntu-latest - sed -i -E "s/cacheBust=[0-9]+/cacheBust=${TS}/g" "$f" - echo "Updated cacheBust in $f" - UPDATED=1 - else - echo "Warning: cacheBust parameter not found in $f" >&2 - fi - done - - if [ "$UPDATED" -eq 0 ]; then - echo "No files were updated. Ensure READMEs embed contrib.rocks with cacheBust param." >&2 - exit 1 - fi - - - name: Detect changes - id: changes - run: | - if git diff --quiet; then - echo "changed=false" >> $GITHUB_OUTPUT - else - echo "changed=true" >> $GITHUB_OUTPUT - fi - - - name: Create Pull Request - if: steps.changes.outputs.changed == 'true' - uses: peter-evans/create-pull-request@v7 - with: - token: ${{ secrets.GITHUB_TOKEN }} - commit-message: "docs: update contributors list [skip ci]" - committer: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>" - branch: refresh-contrib-cache - delete-branch: true - title: "Refresh contrib.rocks image cache (all READMEs)" - body: | - Automated refresh of the contrib.rocks image cache by bumping the cacheBust parameter in README.md and locales/*/README.md. - base: main diff --git a/.github/workflows/website-deploy.yml b/.github/workflows/website-deploy.yml deleted file mode 100644 index da2d4228f5..0000000000 --- a/.github/workflows/website-deploy.yml +++ /dev/null @@ -1,59 +0,0 @@ -name: Deploy roocode.com - -on: - push: - branches: - - main - paths: - - 'apps/web-roo-code/**' - workflow_dispatch: - -concurrency: - group: deploy-roocode-com - cancel-in-progress: true - -env: - VERCEL_ORG_ID: ${{ secrets.VERCEL_ORG_ID }} - VERCEL_PROJECT_ID: ${{ secrets.VERCEL_PROJECT_ID }} - -jobs: - check-secrets: - runs-on: ubuntu-latest - outputs: - has-vercel-token: ${{ steps.check.outputs.has-vercel-token }} - steps: - - name: Check if VERCEL_TOKEN exists - id: check - run: | - if [ -n "${{ secrets.VERCEL_TOKEN }}" ]; then - echo "has-vercel-token=true" >> $GITHUB_OUTPUT - else - echo "has-vercel-token=false" >> $GITHUB_OUTPUT - fi - - deploy: - runs-on: ubuntu-latest - needs: check-secrets - if: ${{ needs.check-secrets.outputs.has-vercel-token == 'true' }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - - name: Setup Node.js and pnpm - uses: ./.github/actions/setup-node-pnpm - - name: Run lint - run: pnpm lint - working-directory: apps/web-roo-code - - name: Run type check - run: pnpm check-types - working-directory: apps/web-roo-code - - name: Run build - run: pnpm build - working-directory: apps/web-roo-code - - name: Install Vercel CLI - run: npm install --global vercel@latest - - name: Pull Vercel Environment Information - run: npx vercel pull --yes --environment=production --token=${{ secrets.VERCEL_TOKEN }} - - name: Build Project Artifacts - run: npx vercel build --prod --token=${{ secrets.VERCEL_TOKEN }} - - name: Deploy Project Artifacts to Vercel - run: npx vercel deploy --prebuilt --prod --token=${{ secrets.VERCEL_TOKEN }} diff --git a/.github/workflows/website-preview.yml b/.github/workflows/website-preview.yml deleted file mode 100644 index 9446bc7753..0000000000 --- a/.github/workflows/website-preview.yml +++ /dev/null @@ -1,102 +0,0 @@ -name: Preview roocode.com - -on: - push: - branches-ignore: - - main - paths: - - "apps/web-roo-code/**" - pull_request: - paths: - - "apps/web-roo-code/**" - workflow_dispatch: - -concurrency: - group: preview-roocode-com-${{ github.ref }} - cancel-in-progress: true - -env: - VERCEL_ORG_ID: ${{ secrets.VERCEL_ORG_ID }} - VERCEL_PROJECT_ID: ${{ secrets.VERCEL_PROJECT_ID }} - -jobs: - check-secrets: - runs-on: ubuntu-latest - outputs: - has-vercel-token: ${{ steps.check.outputs.has-vercel-token }} - steps: - - name: Check if VERCEL_TOKEN exists - id: check - run: | - if [ -n "${{ secrets.VERCEL_TOKEN }}" ]; then - echo "has-vercel-token=true" >> $GITHUB_OUTPUT - else - echo "has-vercel-token=false" >> $GITHUB_OUTPUT - fi - - preview: - runs-on: ubuntu-latest - needs: check-secrets - if: ${{ needs.check-secrets.outputs.has-vercel-token == 'true' }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - - name: Setup Node.js and pnpm - uses: ./.github/actions/setup-node-pnpm - - name: Run lint - run: pnpm lint - working-directory: apps/web-roo-code - - name: Run type check - run: pnpm check-types - working-directory: apps/web-roo-code - - name: Run build - run: pnpm build - working-directory: apps/web-roo-code - - name: Install Vercel CLI - run: npm install --global vercel@latest - - name: Pull Vercel Environment Information - run: npx vercel pull --yes --environment=preview --token=${{ secrets.VERCEL_TOKEN }} - - name: Build Project Artifacts - run: npx vercel build --token=${{ secrets.VERCEL_TOKEN }} - - name: Deploy Project Artifacts to Vercel - id: deploy - run: | - DEPLOYMENT_URL=$(npx vercel deploy --prebuilt --token=${{ secrets.VERCEL_TOKEN }}) - echo "deployment_url=$DEPLOYMENT_URL" >> $GITHUB_OUTPUT - echo "Preview deployed to: $DEPLOYMENT_URL" - - - name: Comment PR with preview link - if: github.event_name == 'pull_request' - uses: actions/github-script@v7 - with: - script: | - const deploymentUrl = '${{ steps.deploy.outputs.deployment_url }}'; - const commentIdentifier = ''; - - const { data: comments } = await github.rest.issues.listComments({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - }); - - const existingComment = comments.find(comment => - comment.body.includes(commentIdentifier) - ); - - const comment = commentIdentifier + '\nšŸš€ **Preview deployed!**\n\nYour changes have been deployed to Vercel:\n\n**Preview URL:** ' + deploymentUrl + '\n\nThis preview will be updated automatically when you push new commits to this PR.'; - - if (existingComment) { - await github.rest.issues.updateComment({ - owner: context.repo.owner, - repo: context.repo.repo, - comment_id: existingComment.id, - body: comment - }); - } else { - await github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body: comment - }); - } diff --git a/apps/cli/scripts/integration/cases/create-with-session-id-resume-loads-correct-session.ts b/apps/cli/scripts/integration/cases/create-with-session-id-resume-loads-correct-session.ts index cbefd26525..ded1656e1e 100644 --- a/apps/cli/scripts/integration/cases/create-with-session-id-resume-loads-correct-session.ts +++ b/apps/cli/scripts/integration/cases/create-with-session-id-resume-loads-correct-session.ts @@ -88,7 +88,7 @@ async function createSessionWithCustomId( "dev", "--print", "--provider", - "roo", + "openrouter", "--output-format", "stream-json", "--workspace", @@ -148,7 +148,7 @@ async function resumeSessionAndSendMarker( "--print", "--stdin-prompt-stream", "--provider", - "roo", + "openrouter", "--output-format", "stream-json", "--workspace", diff --git a/apps/cli/scripts/integration/lib/stream-harness.ts b/apps/cli/scripts/integration/lib/stream-harness.ts index 73b756c7c3..2693103309 100644 --- a/apps/cli/scripts/integration/lib/stream-harness.ts +++ b/apps/cli/scripts/integration/lib/stream-harness.ts @@ -69,7 +69,7 @@ export async function runStreamCase(options: RunStreamCaseOptions): Promise { }) }) +describe("router model extraction", () => { + // This mirrors the extraction logic in requestOpenRouterModels (list.ts:226-228) + const extractOpenRouterModels = (routerModelsRaw: unknown) => { + const routerModels = isRecord(routerModelsRaw) ? routerModelsRaw : {} + const openRouterModels = routerModels.openrouter + return isRecord(openRouterModels) ? openRouterModels : {} + } + + it("extracts openrouter models from valid routerModels", () => { + const models = { "openai/gpt-4.1": { contextWindow: 128000, supportsPromptCache: false } } + const result = extractOpenRouterModels({ openrouter: models }) + expect(result).toEqual(models) + }) + + it("returns empty object when routerModels is null", () => { + expect(extractOpenRouterModels(null)).toEqual({}) + }) + + it("returns empty object when openrouter key is missing", () => { + expect(extractOpenRouterModels({ requesty: {} })).toEqual({}) + }) + + it("returns empty object when openrouter value is not a record", () => { + expect(extractOpenRouterModels({ openrouter: "invalid" })).toEqual({}) + }) +}) + describe("listSessions", () => { const workspacePath = process.cwd() diff --git a/apps/cli/src/commands/cli/__tests__/run-provider-resolution.test.ts b/apps/cli/src/commands/cli/__tests__/run-provider-resolution.test.ts deleted file mode 100644 index 486358d0db..0000000000 --- a/apps/cli/src/commands/cli/__tests__/run-provider-resolution.test.ts +++ /dev/null @@ -1,50 +0,0 @@ -import { DEFAULT_PROVIDER } from "@/types/index.js" -import { resolveProviderPreference } from "../run.js" - -describe("run provider resolution", () => { - it("defaults to the login-free provider when nothing is configured", () => { - const result = resolveProviderPreference({}) - - expect(result).toEqual({ - provider: DEFAULT_PROVIDER, - fellBackFromStoredRooPreference: false, - fellBackFromExplicitRooRequest: false, - }) - }) - - it("falls back from saved Roo preferences", () => { - const result = resolveProviderPreference({ - settingsProvider: "roo", - }) - - expect(result).toEqual({ - provider: DEFAULT_PROVIDER, - fellBackFromStoredRooPreference: true, - fellBackFromExplicitRooRequest: false, - }) - }) - - it("falls back from an explicitly requested Roo provider selection", () => { - const result = resolveProviderPreference({ - flagProvider: "roo", - }) - - expect(result).toEqual({ - provider: DEFAULT_PROVIDER, - fellBackFromStoredRooPreference: false, - fellBackFromExplicitRooRequest: true, - }) - }) - - it("preserves supported providers", () => { - const result = resolveProviderPreference({ - settingsProvider: "anthropic", - }) - - expect(result).toEqual({ - provider: "anthropic", - fellBackFromStoredRooPreference: false, - fellBackFromExplicitRooRequest: false, - }) - }) -}) diff --git a/apps/cli/src/commands/cli/list.ts b/apps/cli/src/commands/cli/list.ts index 3a7e99170f..fbd33da2cc 100644 --- a/apps/cli/src/commands/cli/list.ts +++ b/apps/cli/src/commands/cli/list.ts @@ -6,7 +6,7 @@ import pWaitFor from "p-wait-for" import type { TaskSessionEntry } from "@roo-code/core/cli" import type { Command, ModelRecord, WebviewMessage } from "@roo-code/types" -import { getProviderDefaultModelId } from "@roo-code/types" +import { openRouterDefaultModelId } from "@roo-code/types" import { ExtensionHost, type ExtensionHostOptions } from "@/agent/index.js" import { readWorkspaceTaskSessions } from "@/lib/task-history/index.js" @@ -112,7 +112,7 @@ async function createListHost(options: BaseListOptions, hostOptions: ListHostOpt reasoningEffort: undefined, user: null, provider: "openrouter", - model: getProviderDefaultModelId("openrouter"), + model: openRouterDefaultModelId, apiKey, workspacePath, extensionPath, @@ -223,8 +223,9 @@ function requestOpenRouterModels(host: ExtensionHost): Promise { return undefined } - const routerModels = isRecord(message.routerModels) ? message.routerModels : undefined - return isRecord(routerModels?.openrouter) ? (routerModels.openrouter as ModelRecord) : {} + const routerModels = isRecord(message.routerModels) ? message.routerModels : {} + const openRouterModels = routerModels.openrouter + return isRecord(openRouterModels) ? (openRouterModels as ModelRecord) : {} }, ) } diff --git a/apps/cli/src/commands/cli/run.ts b/apps/cli/src/commands/cli/run.ts index ae263c5573..908df9938b 100644 --- a/apps/cli/src/commands/cli/run.ts +++ b/apps/cli/src/commands/cli/run.ts @@ -12,7 +12,6 @@ import { isSupportedProvider, supportedProviders, DEFAULT_FLAGS, - DEFAULT_PROVIDER, REASONING_EFFORTS, OutputFormat, } from "@/types/index.js" @@ -22,10 +21,10 @@ import { JsonEventEmitter } from "@/agent/json-event-emitter.js" import { loadSettings } from "@/lib/storage/index.js" import { readWorkspaceTaskSessions, resolveWorkspaceResumeSessionId } from "@/lib/task-history/index.js" import { getEnvVarName, getApiKeyFromEnv } from "@/lib/utils/provider.js" -import { runOnboarding } from "@/lib/utils/onboarding.js" import { validateTerminalShellPath } from "@/lib/utils/shell.js" import { getDefaultExtensionPath } from "@/lib/utils/extension.js" import { isValidSessionId } from "@/lib/utils/session-id.js" +import { runOnboarding } from "@/lib/utils/onboarding.js" import { VERSION } from "@/lib/utils/version.js" import { ExtensionHost, ExtensionHostOptions } from "@/agent/index.js" @@ -50,40 +49,6 @@ function normalizeError(error: unknown): Error { return error instanceof Error ? error : new Error(String(error)) } -export function resolveProviderPreference({ - flagProvider, - settingsProvider, -}: { - flagProvider?: string - settingsProvider?: string -}): { - provider: string - fellBackFromStoredRooPreference: boolean - fellBackFromExplicitRooRequest: boolean -} { - if (flagProvider === "roo") { - return { - provider: DEFAULT_PROVIDER, - fellBackFromStoredRooPreference: false, - fellBackFromExplicitRooRequest: true, - } - } - - if (settingsProvider === "roo") { - return { - provider: DEFAULT_PROVIDER, - fellBackFromStoredRooPreference: true, - fellBackFromExplicitRooRequest: false, - } - } - - return { - provider: flagProvider ?? settingsProvider ?? DEFAULT_PROVIDER, - fellBackFromStoredRooPreference: false, - fellBackFromExplicitRooRequest: false, - } -} - export async function run(promptArg: string | undefined, flagOptions: FlagOptions) { setLogger({ info: () => {}, @@ -157,14 +122,7 @@ export async function run(promptArg: string | undefined, flagOptions: FlagOption const effectiveModel = flagOptions.model || settings.model || DEFAULT_FLAGS.model const effectiveReasoningEffort = flagOptions.reasoningEffort || settings.reasoningEffort || DEFAULT_FLAGS.reasoningEffort - const { - provider: resolvedProvider, - fellBackFromStoredRooPreference, - fellBackFromExplicitRooRequest, - } = resolveProviderPreference({ - flagProvider: flagOptions.provider, - settingsProvider: settings.provider, - }) + const effectiveProvider = flagOptions.provider ?? settings.provider ?? "openrouter" const effectiveWorkspacePath = flagOptions.workspace ? path.resolve(flagOptions.workspace) : process.cwd() const legacyRequireApprovalFromSettings = settings.requireApproval ?? @@ -175,21 +133,9 @@ export async function run(promptArg: string | undefined, flagOptions: FlagOption flagOptions.consecutiveMistakeLimit ?? settings.consecutiveMistakeLimit ?? DEFAULT_FLAGS.consecutiveMistakeLimit const effectiveConsecutiveMistakeLimit = Number(rawConsecutiveMistakeLimit) - if (fellBackFromStoredRooPreference) { - console.warn( - `[CLI] Saved Roo Code Router preference detected in CLI settings. Continuing with the default provider (${DEFAULT_PROVIDER}).`, - ) - } - - if (fellBackFromExplicitRooRequest) { - console.warn( - `[CLI] Roo Code Router is no longer supported by the CLI. Continuing with the default provider (${DEFAULT_PROVIDER}).`, - ) - } - - if (!isSupportedProvider(resolvedProvider)) { + if (!isSupportedProvider(effectiveProvider)) { console.error( - `[CLI] Error: Invalid provider: ${resolvedProvider}; must be one of: ${supportedProviders.join(", ")}`, + `[CLI] Error: Invalid provider: ${effectiveProvider}; must be one of: ${supportedProviders.join(", ")}`, ) process.exit(1) } @@ -219,7 +165,7 @@ export async function run(promptArg: string | undefined, flagOptions: FlagOption reasoningEffort: effectiveReasoningEffort === "unspecified" ? undefined : effectiveReasoningEffort, consecutiveMistakeLimit: effectiveConsecutiveMistakeLimit, user: null, - provider: resolvedProvider, + provider: effectiveProvider, model: effectiveModel, workspacePath: effectiveWorkspacePath, extensionPath: path.resolve(flagOptions.extension || getDefaultExtensionPath(__dirname)), @@ -245,7 +191,7 @@ export async function run(promptArg: string | undefined, flagOptions: FlagOption if (!extensionHostOptions.apiKey) { console.error(`[CLI] Error: No API key provided. Use --api-key or set the appropriate environment variable.`) - console.error(`For ${extensionHostOptions.provider}, set ${getEnvVarName(extensionHostOptions.provider)}`) + console.error(`[CLI] For ${extensionHostOptions.provider}, set ${getEnvVarName(extensionHostOptions.provider)}`) process.exit(1) } diff --git a/apps/cli/src/commands/index.ts b/apps/cli/src/commands/index.ts index 717a7040ef..702b8e2938 100644 --- a/apps/cli/src/commands/index.ts +++ b/apps/cli/src/commands/index.ts @@ -1,2 +1,2 @@ -export * from "./auth/index.js" export * from "./cli/index.js" +export * from "./auth/index.js" diff --git a/apps/cli/src/index.ts b/apps/cli/src/index.ts index f4d123f218..8c368cc233 100644 --- a/apps/cli/src/index.ts +++ b/apps/cli/src/index.ts @@ -69,7 +69,7 @@ const applyListOptions = (command: Command) => command .option("-w, --workspace ", "Workspace directory path (defaults to current working directory)") .option("-e, --extension ", "Path to the extension bundle directory") - .option("-k, --api-key ", "OpenRouter API key (falls back to OPENROUTER_API_KEY)") + .option("-k, --api-key ", "API key for the LLM provider") .option("--format ", 'Output format: "json" (default) or "text"', "json") .option("-d, --debug", "Enable debug output", false) @@ -107,7 +107,7 @@ applyListOptions(listCommand.command("modes").description("List available modes" }, ) -applyListOptions(listCommand.command("models").description("List available OpenRouter models")).action( +applyListOptions(listCommand.command("models").description("List available models")).action( async (options: Parameters[0]) => { await runListAction(() => listModels(options)) }, diff --git a/apps/cli/src/lib/storage/__tests__/settings.test.ts b/apps/cli/src/lib/storage/__tests__/settings.test.ts index 60488c1cb0..b2a3b18cb4 100644 --- a/apps/cli/src/lib/storage/__tests__/settings.test.ts +++ b/apps/cli/src/lib/storage/__tests__/settings.test.ts @@ -19,7 +19,7 @@ vi.mock("../config-dir.js", () => ({ // Import after mocking import { loadSettings, saveSettings, resetOnboarding, getSettingsPath } from "../settings.js" -import { DEFAULT_PROVIDER, OnboardingProviderChoice } from "@/types/index.js" +import { OnboardingProviderChoice } from "@/types/index.js" // Re-derive the test config dir for use in tests (must match the hoisted one) const actualTestConfigDir = getTestConfigDir() @@ -70,32 +70,6 @@ describe("Settings Storage", () => { expect(loaded).toEqual(settingsData) }) - it("migrates legacy Roo provider settings to the default provider path", async () => { - const legacySettings = { - onboardingProviderChoice: "roo", - provider: "roo", - mode: "architect", - } - - await fs.mkdir(actualTestConfigDir, { recursive: true }) - await fs.writeFile(expectedSettingsFile, JSON.stringify(legacySettings), "utf-8") - - const loaded = await loadSettings() - - expect(loaded).toEqual({ - onboardingProviderChoice: OnboardingProviderChoice.Byok, - provider: DEFAULT_PROVIDER, - mode: "architect", - }) - - const rewritten = JSON.parse(await fs.readFile(expectedSettingsFile, "utf-8")) - expect(rewritten).toEqual({ - onboardingProviderChoice: OnboardingProviderChoice.Byok, - provider: DEFAULT_PROVIDER, - mode: "architect", - }) - }) - it("should load settings with only some fields set", async () => { const settingsData = { mode: "code", diff --git a/apps/cli/src/lib/storage/index.ts b/apps/cli/src/lib/storage/index.ts index 53424472c2..d5856747a1 100644 --- a/apps/cli/src/lib/storage/index.ts +++ b/apps/cli/src/lib/storage/index.ts @@ -1,4 +1,4 @@ export * from "./config-dir.js" -export * from "./settings.js" export * from "./credentials.js" +export * from "./settings.js" export * from "./ephemeral.js" diff --git a/apps/cli/src/lib/storage/settings.ts b/apps/cli/src/lib/storage/settings.ts index 541a2ad9a9..86a2d9243e 100644 --- a/apps/cli/src/lib/storage/settings.ts +++ b/apps/cli/src/lib/storage/settings.ts @@ -2,43 +2,9 @@ import fs from "fs/promises" import path from "path" import type { CliSettings } from "@/types/index.js" -import { - DEFAULT_PROVIDER, - LEGACY_ONBOARDING_PROVIDER_CHOICE_ROO, - LEGACY_PROVIDER_PREFERENCE_ROO, - OnboardingProviderChoice, -} from "@/types/index.js" -import { safeWriteJson } from "../../../../../src/utils/safeWriteJson.js" import { getConfigDir } from "./index.js" -type StoredCliSettings = CliSettings & { - provider?: string - onboardingProviderChoice?: string -} - -async function persistSettings(settingsPath: string, settings: CliSettings): Promise { - await safeWriteJson(settingsPath, settings, { prettyPrint: true }) - await fs.chmod(settingsPath, 0o600) -} - -function migrateLegacySettings(settings: StoredCliSettings): { settings: CliSettings; migrated: boolean } { - let migrated = false - const nextSettings: StoredCliSettings = { ...settings } - - if (nextSettings.provider === LEGACY_PROVIDER_PREFERENCE_ROO) { - nextSettings.provider = DEFAULT_PROVIDER - migrated = true - } - - if (nextSettings.onboardingProviderChoice === LEGACY_ONBOARDING_PROVIDER_CHOICE_ROO) { - nextSettings.onboardingProviderChoice = OnboardingProviderChoice.Byok - migrated = true - } - - return { settings: nextSettings as CliSettings, migrated } -} - export function getSettingsPath(): string { return path.join(getConfigDir(), "cli-settings.json") } @@ -47,17 +13,7 @@ export async function loadSettings(): Promise { try { const settingsPath = getSettingsPath() const data = await fs.readFile(settingsPath, "utf-8") - const parsed = JSON.parse(data) as StoredCliSettings - const { settings, migrated } = migrateLegacySettings(parsed) - - if (migrated) { - console.warn( - `[CLI] Detected legacy Roo Code Router selections in CLI settings. Migrating them to the default provider (${DEFAULT_PROVIDER}).`, - ) - await persistSettings(settingsPath, settings) - } - - return settings + return JSON.parse(data) as CliSettings } catch (error) { if ((error as NodeJS.ErrnoException).code === "ENOENT") { return {} @@ -68,10 +24,15 @@ export async function loadSettings(): Promise { } export async function saveSettings(settings: Partial): Promise { + const configDir = getConfigDir() + await fs.mkdir(configDir, { recursive: true }) + const existing = await loadSettings() const merged = { ...existing, ...settings } - await persistSettings(getSettingsPath(), merged) + await fs.writeFile(getSettingsPath(), JSON.stringify(merged, null, 2), { + mode: 0o600, + }) } export async function resetOnboarding(): Promise { diff --git a/apps/cli/src/lib/utils/__tests__/input.test.ts b/apps/cli/src/lib/utils/__tests__/input.test.ts index c346e60d6d..ef20749fe1 100644 --- a/apps/cli/src/lib/utils/__tests__/input.test.ts +++ b/apps/cli/src/lib/utils/__tests__/input.test.ts @@ -20,8 +20,9 @@ function createKey(overrides: Partial = {}): Key { backspace: false, delete: false, meta: false, + super: false, ...overrides, - } + } as Key } describe("globalInputSequences", () => { diff --git a/apps/cli/src/types/__tests__/types.test.ts b/apps/cli/src/types/__tests__/types.test.ts new file mode 100644 index 0000000000..1e54b5069e --- /dev/null +++ b/apps/cli/src/types/__tests__/types.test.ts @@ -0,0 +1,46 @@ +import { isSupportedProvider, supportedProviders } from "../types.js" + +describe("isSupportedProvider", () => { + it.each(supportedProviders)("returns true for supported provider '%s'", (provider) => { + expect(isSupportedProvider(provider)).toBe(true) + }) + + it("returns false for 'roo' (retired provider)", () => { + expect(isSupportedProvider("roo")).toBe(false) + }) + + it("returns false for unknown provider", () => { + expect(isSupportedProvider("not-a-provider")).toBe(false) + }) + + it("returns false for empty string", () => { + expect(isSupportedProvider("")).toBe(false) + }) +}) + +describe("provider resolution fallback", () => { + it("defaults to openrouter when no flag or setting is provided", () => { + const flagProvider = undefined + const settingsProvider = undefined + const effectiveProvider = flagProvider ?? settingsProvider ?? "openrouter" + + expect(effectiveProvider).toBe("openrouter") + expect(isSupportedProvider(effectiveProvider)).toBe(true) + }) + + it("uses flag provider over settings and default", () => { + const flagProvider = "anthropic" + const settingsProvider = "gemini" + const effectiveProvider = flagProvider ?? settingsProvider ?? "openrouter" + + expect(effectiveProvider).toBe("anthropic") + }) + + it("uses settings provider when flag is not provided", () => { + const flagProvider = undefined + const settingsProvider = "gemini" + const effectiveProvider = flagProvider ?? settingsProvider ?? "openrouter" + + expect(effectiveProvider).toBe("gemini") + }) +}) diff --git a/apps/cli/src/types/constants.ts b/apps/cli/src/types/constants.ts index 555d73a037..007cfa0783 100644 --- a/apps/cli/src/types/constants.ts +++ b/apps/cli/src/types/constants.ts @@ -7,8 +7,6 @@ export const DEFAULT_FLAGS = { consecutiveMistakeLimit: 10, } -export const DEFAULT_PROVIDER = "openrouter" as const - export const REASONING_EFFORTS = [...reasoningEffortsExtended, "unspecified", "disabled"] /** @@ -23,7 +21,3 @@ export const ASCII_ROO = ` _,' ___ \\,\\ / \\\\ // \\\\ ,/' \`\\_,` - -export const AUTH_BASE_URL = process.env.ROO_AUTH_BASE_URL ?? "https://app.roocode.com" - -export const SDK_BASE_URL = process.env.ROO_SDK_BASE_URL ?? "https://cloud-api.roocode.com" diff --git a/apps/cli/src/types/types.ts b/apps/cli/src/types/types.ts index f7af0a2835..0a9f3d2259 100644 --- a/apps/cli/src/types/types.ts +++ b/apps/cli/src/types/types.ts @@ -46,12 +46,6 @@ export enum OnboardingProviderChoice { Byok = "byok", } -export const LEGACY_PROVIDER_PREFERENCE_ROO = "roo" as const -export const LEGACY_ONBOARDING_PROVIDER_CHOICE_ROO = "roo" as const - -export type CliProviderPreference = SupportedProvider | typeof LEGACY_PROVIDER_PREFERENCE_ROO -export type CliOnboardingProviderChoice = OnboardingProviderChoice | typeof LEGACY_ONBOARDING_PROVIDER_CHOICE_ROO - export interface OnboardingResult { choice: OnboardingProviderChoice token?: string @@ -59,11 +53,11 @@ export interface OnboardingResult { } export interface CliSettings { - onboardingProviderChoice?: CliOnboardingProviderChoice + onboardingProviderChoice?: OnboardingProviderChoice /** Default mode to use (e.g., "code", "architect", "ask", "debug") */ mode?: string /** Default provider to use */ - provider?: CliProviderPreference + provider?: SupportedProvider /** Default model to use */ model?: string /** Default reasoning effort level */ diff --git a/apps/vscode-e2e/package.json b/apps/vscode-e2e/package.json index 4a1af856d3..4ca0c3cdb8 100644 --- a/apps/vscode-e2e/package.json +++ b/apps/vscode-e2e/package.json @@ -15,7 +15,7 @@ "@roo-code/config-eslint": "workspace:^", "@roo-code/config-typescript": "workspace:^", "@roo-code/types": "workspace:^", - "@copilotkit/aimock": "^1.15.1", + "@copilotkit/aimock": "1.15.1", "@types/mocha": "^10.0.10", "@types/node": "20.x", "@types/vscode": "^1.95.0", diff --git a/apps/vscode-e2e/src/runTest.ts b/apps/vscode-e2e/src/runTest.ts index b2550559b1..e1ba4ada77 100644 --- a/apps/vscode-e2e/src/runTest.ts +++ b/apps/vscode-e2e/src/runTest.ts @@ -99,9 +99,11 @@ async function main() { // user message starts with directly — no // wrapper. JSON fixtures use substring matching so a bare "" // match would collide with all other requests. A regex anchored to the start - // uniquely identifies this post-switch turn. + // uniquely identifies this post-switch turn. Scope this fixture to the + // OpenRouter default model so provider-specific suites (e.g. DeepSeek) + // cannot accidentally match it. mock.addFixture({ - match: { userMessage: /^/ }, + match: { model: "openai/gpt-4.1", userMessage: /^/ }, response: { toolCalls: [ { @@ -128,6 +130,7 @@ async function main() { ...(testGrep && { TEST_GREP: testGrep }), ...(testFile && { TEST_FILE: testFile }), ...(mock && { AIMOCK_URL: mock.url }), + ...(mock && { E2E_MOCK_MODEL_LIST_FALLBACK: "true" }), } // Download VS Code, unzip it and run the integration test diff --git a/apps/vscode-e2e/src/suite/anthropic-opus-4-7.test.ts b/apps/vscode-e2e/src/suite/anthropic-opus-4-7.test.ts index 64c08c2ec0..ac7e5e4f5f 100644 --- a/apps/vscode-e2e/src/suite/anthropic-opus-4-7.test.ts +++ b/apps/vscode-e2e/src/suite/anthropic-opus-4-7.test.ts @@ -3,7 +3,6 @@ import { createServer, type IncomingMessage, type ServerResponse } from "http" import { RooCodeEventName, type ClineMessage } from "@roo-code/types" -import { waitUntilCompleted } from "./utils" import { setDefaultSuiteTimeout } from "./test-utils" type CapturedAnthropicRequest = { @@ -14,6 +13,19 @@ type CapturedAnthropicRequest = { const ALLOWED_PROXY_HOSTS = new Set(["127.0.0.1", "localhost", "api.anthropic.com"]) const ANTHROPIC_MESSAGES_PATH = "/v1/messages" +const HOP_BY_HOP = new Set([ + "connection", + "keep-alive", + "transfer-encoding", + "te", + "trailer", + "upgrade", + "proxy-connection", + "proxy-authenticate", + "proxy-authorization", + "host", + "content-length", +]) function isMessagesUrl(rawUrl: string): boolean { try { @@ -35,7 +47,12 @@ function readRequestBody(req: IncomingMessage): Promise { function writeResponseHeaders(target: ServerResponse, source: Response) { const headers: Record = {} source.headers.forEach((value, key) => { - if (key.toLowerCase() !== "content-length") { + const lower = key.toLowerCase() + // fetch() automatically decompresses the body, so strip content-encoding to + // prevent the SDK from attempting a second decompression (zlib "incorrect + // header check"). Also strip content-length since the decoded body length + // differs from the compressed length. + if (lower !== "content-length" && lower !== "content-encoding") { headers[key] = value } }) @@ -111,11 +128,7 @@ async function withAnthropicProxy( const forwardHeaders: Record = {} for (const [key, value] of Object.entries(req.headers)) { - if ( - key.toLowerCase() !== "host" && - key.toLowerCase() !== "content-length" && - typeof value === "string" - ) { + if (!HOP_BY_HOP.has(key.toLowerCase()) && typeof value === "string") { forwardHeaders[key] = value } } @@ -210,7 +223,35 @@ suite("Claude Opus 4.7 (Anthropic)", function () { text: `${promptTag}: what is 2+2? Reply with only the number.`, }) - await waitUntilCompleted({ api, taskId }) + await new Promise((resolve, reject) => { + const timer = setTimeout(() => { + cleanup() + reject(new Error("Timeout after 60s")) + }, 60_000) + + const cleanup = () => { + clearTimeout(timer) + api.off(RooCodeEventName.TaskCompleted, onCompleted) + api.off(RooCodeEventName.TaskAborted, onAborted) + } + + const onCompleted = (completedId: string) => { + if (completedId === taskId) { + cleanup() + resolve() + } + } + + const onAborted = (abortedId: string) => { + if (abortedId === taskId) { + cleanup() + reject(new Error("Task was aborted - Anthropic API request failed")) + } + } + + api.on(RooCodeEventName.TaskCompleted, onCompleted) + api.on(RooCodeEventName.TaskAborted, onAborted) + }) const firstRequest = requests[0] assert.ok(firstRequest, "Anthropic provider should issue at least one /v1/messages request") diff --git a/apps/vscode-e2e/src/types/global.d.ts b/apps/vscode-e2e/src/types/global.d.ts index c2b11bf335..e25959bf19 100644 --- a/apps/vscode-e2e/src/types/global.d.ts +++ b/apps/vscode-e2e/src/types/global.d.ts @@ -1,7 +1,7 @@ import type { RooCodeAPI } from "@roo-code/types" declare global { - // eslint-disable-next-line no-var + // eslint-disable-next-line no-var -- var is required in declare global var api: RooCodeAPI } diff --git a/apps/web-evals/.env b/apps/web-evals/.env deleted file mode 100644 index 1bb6dd6dac..0000000000 --- a/apps/web-evals/.env +++ /dev/null @@ -1 +0,0 @@ -DATABASE_URL=postgres://postgres:password@localhost:5433/evals_development diff --git a/apps/web-evals/.gitignore b/apps/web-evals/.gitignore deleted file mode 100644 index 443f3159ed..0000000000 --- a/apps/web-evals/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -# .env -!.env - -# next.js -.next - -# typescript -tsconfig.tsbuildinfo diff --git a/apps/web-evals/CHANGELOG.md b/apps/web-evals/CHANGELOG.md deleted file mode 100644 index b3531905ac..0000000000 --- a/apps/web-evals/CHANGELOG.md +++ /dev/null @@ -1,3 +0,0 @@ -# @roo-code/web-evals - -## 0.0.1 diff --git a/apps/web-evals/components.json b/apps/web-evals/components.json deleted file mode 100644 index 5bcedb3141..0000000000 --- a/apps/web-evals/components.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "$schema": "https://ui.shadcn.com/schema.json", - "style": "new-york", - "rsc": true, - "tsx": true, - "tailwind": { - "config": "", - "css": "src/app/globals.css", - "baseColor": "neutral", - "cssVariables": true, - "prefix": "" - }, - "aliases": { - "components": "@/components", - "utils": "@/lib/utils", - "ui": "@/components/ui", - "lib": "@/lib", - "hooks": "@/hooks" - }, - "iconLibrary": "lucide" -} diff --git a/apps/web-evals/eslint.config.mjs b/apps/web-evals/eslint.config.mjs deleted file mode 100644 index 024d6157d4..0000000000 --- a/apps/web-evals/eslint.config.mjs +++ /dev/null @@ -1,17 +0,0 @@ -import { nextJsConfig } from "@roo-code/config-eslint/next-js" - -/** @type {import("eslint").Linter.Config} */ -export default [ - ...nextJsConfig, - { - rules: { - "no-unused-vars": "off", - "@typescript-eslint/no-unused-vars": [ - "error", - { - caughtErrorsIgnorePattern: "^_", - }, - ], - }, - }, -] diff --git a/apps/web-evals/next-env.d.ts b/apps/web-evals/next-env.d.ts deleted file mode 100644 index 7506fe6afb..0000000000 --- a/apps/web-evals/next-env.d.ts +++ /dev/null @@ -1,6 +0,0 @@ -/// -/// -import "./.next/dev/types/routes.d.ts" - -// NOTE: This file should not be edited -// see https://nextjs.org/docs/app/api-reference/config/typescript for more information. diff --git a/apps/web-evals/next.config.ts b/apps/web-evals/next.config.ts deleted file mode 100644 index b5f54a87be..0000000000 --- a/apps/web-evals/next.config.ts +++ /dev/null @@ -1,7 +0,0 @@ -import type { NextConfig } from "next" - -const nextConfig: NextConfig = { - turbopack: {}, -} - -export default nextConfig diff --git a/apps/web-evals/package.json b/apps/web-evals/package.json deleted file mode 100644 index 1723f57583..0000000000 --- a/apps/web-evals/package.json +++ /dev/null @@ -1,63 +0,0 @@ -{ - "name": "@roo-code/web-evals", - "version": "0.0.1", - "type": "module", - "scripts": { - "lint": "eslint src --ext=ts,tsx --max-warnings=0", - "check-types": "tsc -b", - "dev": "scripts/check-services.sh && next dev -p 3446", - "format": "prettier --write src", - "build": "next build", - "start": "next start -p 3446", - "clean": "rimraf tsconfig.tsbuildinfo .next .turbo" - }, - "dependencies": { - "@hookform/resolvers": "^5.1.1", - "@radix-ui/react-alert-dialog": "^1.1.7", - "@radix-ui/react-checkbox": "^1.1.5", - "@radix-ui/react-dialog": "^1.1.6", - "@radix-ui/react-dropdown-menu": "^2.1.7", - "@radix-ui/react-label": "^2.1.2", - "@radix-ui/react-popover": "^1.1.6", - "@radix-ui/react-scroll-area": "^1.2.3", - "@radix-ui/react-select": "^2.1.6", - "@radix-ui/react-separator": "^1.1.2", - "@radix-ui/react-slider": "^1.2.4", - "@radix-ui/react-slot": "^1.1.2", - "@radix-ui/react-tabs": "^1.1.3", - "@radix-ui/react-tooltip": "^1.2.8", - "@roo-code/evals": "workspace:^", - "@roo-code/types": "workspace:^", - "@tanstack/react-query": "^5.69.0", - "archiver": "^7.0.1", - "class-variance-authority": "^0.7.1", - "clsx": "^2.1.1", - "cmdk": "^1.1.0", - "fuzzysort": "^3.1.0", - "lucide-react": "^0.518.0", - "next": "^16.1.6", - "next-themes": "^0.4.6", - "p-map": "^7.0.3", - "react": "^18.3.1", - "react-dom": "^18.3.1", - "react-hook-form": "^7.57.0", - "react-use": "^17.6.0", - "redis": "^5.5.5", - "sonner": "^2.0.5", - "tailwind-merge": "^3.3.0", - "tailwindcss-animate": "^1.0.7", - "vaul": "^1.1.2", - "zod": "^3.25.61" - }, - "devDependencies": { - "@roo-code/config-eslint": "workspace:^", - "@roo-code/config-typescript": "workspace:^", - "@tailwindcss/postcss": "^4", - "@types/archiver": "^7.0.0", - "@types/ps-tree": "^1.1.6", - "@types/react": "^18.3.23", - "@types/react-dom": "^18.3.5", - "tailwindcss": "^4", - "vitest": "^3.2.3" - } -} diff --git a/apps/web-evals/postcss.config.mjs b/apps/web-evals/postcss.config.mjs deleted file mode 100644 index 78452aadce..0000000000 --- a/apps/web-evals/postcss.config.mjs +++ /dev/null @@ -1,5 +0,0 @@ -const config = { - plugins: ["@tailwindcss/postcss"], -} - -export default config diff --git a/apps/web-evals/public/.gitkeep b/apps/web-evals/public/.gitkeep deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/apps/web-evals/scripts/check-services.sh b/apps/web-evals/scripts/check-services.sh deleted file mode 100755 index d72ffd54e8..0000000000 --- a/apps/web-evals/scripts/check-services.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -if ! docker info &> /dev/null; then - echo "āŒ Docker is not running. Please start Docker Desktop and try again." - exit 1 -fi - -if ! nc -z postgres 5433 2>/dev/null; then - echo "āŒ PostgreSQL is not running on port 5432" - echo "šŸ’” Start it with: pnpm --filter @roo-code/evals db:up" - exit 1 -fi - -if ! nc -z redis 6380 2>/dev/null; then - echo "āŒ Redis is not running on port 6379" - echo "šŸ’” Start it with: pnpm --filter @roo-code/evals redis:up" - exit 1 -fi - -echo "āœ… All required services are running" diff --git a/apps/web-evals/src/actions/__tests__/killRun.spec.ts b/apps/web-evals/src/actions/__tests__/killRun.spec.ts deleted file mode 100644 index 814d70d9fc..0000000000 --- a/apps/web-evals/src/actions/__tests__/killRun.spec.ts +++ /dev/null @@ -1,207 +0,0 @@ -// npx vitest run src/actions/__tests__/killRun.spec.ts - -import { execFileSync } from "child_process" - -// Mock child_process -vi.mock("child_process", () => ({ - execFileSync: vi.fn(), - spawn: vi.fn(), -})) - -// Mock next/cache -vi.mock("next/cache", () => ({ - revalidatePath: vi.fn(), -})) - -// Mock redis client -vi.mock("@/lib/server/redis", () => ({ - redisClient: vi.fn().mockResolvedValue({ - del: vi.fn().mockResolvedValue(1), - }), -})) - -// Mock @roo-code/evals -vi.mock("@roo-code/evals", () => ({ - createRun: vi.fn(), - deleteRun: vi.fn(), - createTask: vi.fn(), - exerciseLanguages: [], - getExercisesForLanguage: vi.fn().mockResolvedValue([]), -})) - -// Mock timers to speed up tests -vi.useFakeTimers() - -// Import after mocks -import { killRun } from "../runs" - -const mockExecFileSync = execFileSync as ReturnType - -describe("killRun", () => { - beforeEach(() => { - vi.clearAllMocks() - }) - - afterEach(() => { - vi.clearAllTimers() - }) - - it("should kill controller first, wait, then kill task containers", async () => { - const runId = 123 - - // execFileSync is used for all docker commands - mockExecFileSync - .mockReturnValueOnce("") // docker kill controller - .mockReturnValueOnce("evals-task-123-456.0\nevals-task-123-789.1\n") // docker ps - .mockReturnValueOnce("") // docker kill evals-task-123-456.0 - .mockReturnValueOnce("") // docker kill evals-task-123-789.1 - - const resultPromise = killRun(runId) - - // Fast-forward past the 10 second sleep - await vi.advanceTimersByTimeAsync(10000) - - const result = await resultPromise - - expect(result.success).toBe(true) - expect(result.killedContainers).toContain("evals-controller-123") - expect(result.killedContainers).toContain("evals-task-123-456.0") - expect(result.killedContainers).toContain("evals-task-123-789.1") - expect(result.errors).toHaveLength(0) - - // Verify execFileSync was called for docker kill - expect(mockExecFileSync).toHaveBeenNthCalledWith( - 1, - "docker", - ["kill", "evals-controller-123"], - expect.any(Object), - ) - // Verify execFileSync was called for docker ps with run-specific filter - expect(mockExecFileSync).toHaveBeenNthCalledWith( - 2, - "docker", - ["ps", "--format", "{{.Names}}", "--filter", "name=evals-task-123-"], - expect.any(Object), - ) - }) - - it("should continue killing runners even if controller is not running", async () => { - const runId = 456 - - mockExecFileSync - .mockImplementationOnce(() => { - throw new Error("No such container") - }) // controller kill fails - .mockReturnValueOnce("evals-task-456-100.0\n") // docker ps - .mockReturnValueOnce("") // docker kill task - - const resultPromise = killRun(runId) - await vi.advanceTimersByTimeAsync(10000) - const result = await resultPromise - - expect(result.success).toBe(true) - expect(result.killedContainers).toContain("evals-task-456-100.0") - // Controller not in list since it failed - expect(result.killedContainers).not.toContain("evals-controller-456") - }) - - it("should clear Redis state after killing containers", async () => { - const runId = 789 - - const mockDel = vi.fn().mockResolvedValue(1) - const { redisClient } = await import("@/lib/server/redis") - vi.mocked(redisClient).mockResolvedValue({ del: mockDel } as never) - - mockExecFileSync - .mockReturnValueOnce("") // controller kill - .mockReturnValueOnce("") // docker ps (no tasks) - - const resultPromise = killRun(runId) - await vi.advanceTimersByTimeAsync(10000) - await resultPromise - - expect(mockDel).toHaveBeenCalledWith("heartbeat:789") - expect(mockDel).toHaveBeenCalledWith("runners:789") - }) - - it("should handle docker ps failure gracefully", async () => { - const runId = 111 - - mockExecFileSync - .mockReturnValueOnce("") // controller kill succeeds - .mockImplementationOnce(() => { - throw new Error("Docker error") - }) // docker ps fails - - const resultPromise = killRun(runId) - await vi.advanceTimersByTimeAsync(10000) - const result = await resultPromise - - // Should still be successful because controller was killed - expect(result.success).toBe(true) - expect(result.killedContainers).toContain("evals-controller-111") - expect(result.errors).toContain("Failed to list Docker task containers") - }) - - it("should handle individual task kill failures", async () => { - const runId = 222 - - mockExecFileSync - .mockReturnValueOnce("") // controller kill - .mockReturnValueOnce("evals-task-222-300.0\nevals-task-222-400.0\n") // docker ps - .mockImplementationOnce(() => { - throw new Error("Kill failed") - }) // first task kill fails - .mockReturnValueOnce("") // second task kill succeeds - - const resultPromise = killRun(runId) - await vi.advanceTimersByTimeAsync(10000) - const result = await resultPromise - - expect(result.success).toBe(true) - expect(result.killedContainers).toContain("evals-controller-222") - expect(result.killedContainers).toContain("evals-task-222-400.0") - expect(result.errors.length).toBe(1) - expect(result.errors[0]).toContain("evals-task-222-300.0") - }) - - it("should return success with no containers when nothing is running", async () => { - const runId = 333 - - mockExecFileSync - .mockImplementationOnce(() => { - throw new Error("No such container") - }) // controller not running - .mockReturnValueOnce("") // no task containers - - const resultPromise = killRun(runId) - await vi.advanceTimersByTimeAsync(10000) - const result = await resultPromise - - expect(result.success).toBe(true) - expect(result.killedContainers).toHaveLength(0) - expect(result.errors).toHaveLength(0) - }) - - it("should only kill containers belonging to the specific run", async () => { - const runId = 555 - - mockExecFileSync - .mockReturnValueOnce("") // controller kill - .mockReturnValueOnce("evals-task-555-100.0\n") // docker ps - .mockReturnValueOnce("") // docker kill task - - const resultPromise = killRun(runId) - await vi.advanceTimersByTimeAsync(10000) - const result = await resultPromise - - expect(result.success).toBe(true) - // Verify execFileSync was called for docker ps with run-specific filter - expect(mockExecFileSync).toHaveBeenNthCalledWith( - 2, - "docker", - ["ps", "--format", "{{.Names}}", "--filter", "name=evals-task-555-"], - expect.any(Object), - ) - }) -}) diff --git a/apps/web-evals/src/actions/exercises.ts b/apps/web-evals/src/actions/exercises.ts deleted file mode 100644 index 17eb1ff085..0000000000 --- a/apps/web-evals/src/actions/exercises.ts +++ /dev/null @@ -1,22 +0,0 @@ -"use server" - -import * as path from "path" -import { fileURLToPath } from "url" - -import { exerciseLanguages, listDirectories } from "@roo-code/evals" - -const __dirname = path.dirname(fileURLToPath(import.meta.url)) // /apps/web-evals/src/actions - -const EVALS_REPO_PATH = path.resolve(__dirname, "../../../../../evals") - -export const getExercises = async () => { - const result = await Promise.all( - exerciseLanguages.map(async (language) => { - const languagePath = path.join(EVALS_REPO_PATH, language) - const exercises = await listDirectories(__dirname, languagePath) - return exercises.map((exercise) => `${language}/${exercise}`) - }), - ) - - return result.flat() -} diff --git a/apps/web-evals/src/actions/heartbeat.ts b/apps/web-evals/src/actions/heartbeat.ts deleted file mode 100644 index a74aa8ee64..0000000000 --- a/apps/web-evals/src/actions/heartbeat.ts +++ /dev/null @@ -1,8 +0,0 @@ -"use server" - -import { redisClient } from "@/lib/server/redis" - -export const getHeartbeat = async (runId: number) => { - const redis = await redisClient() - return redis.get(`heartbeat:${runId}`) -} diff --git a/apps/web-evals/src/actions/runners.ts b/apps/web-evals/src/actions/runners.ts deleted file mode 100644 index 8b7e86b0f3..0000000000 --- a/apps/web-evals/src/actions/runners.ts +++ /dev/null @@ -1,8 +0,0 @@ -"use server" - -import { redisClient } from "@/lib/server/redis" - -export const getRunners = async (runId: number) => { - const redis = await redisClient() - return redis.sMembers(`runners:${runId}`) -} diff --git a/apps/web-evals/src/actions/runs.ts b/apps/web-evals/src/actions/runs.ts deleted file mode 100644 index f0c1578aed..0000000000 --- a/apps/web-evals/src/actions/runs.ts +++ /dev/null @@ -1,377 +0,0 @@ -"use server" - -import * as path from "path" -import fs from "fs" -import { fileURLToPath } from "url" -import { spawn, execFileSync } from "child_process" - -import { revalidatePath } from "next/cache" -import pMap from "p-map" - -import { - type ExerciseLanguage, - exerciseLanguages, - createRun as _createRun, - deleteRun as _deleteRun, - updateRun as _updateRun, - getIncompleteRuns as _getIncompleteRuns, - deleteRunsByIds as _deleteRunsByIds, - createTask, - getExercisesForLanguage, -} from "@roo-code/evals" - -import { CreateRun } from "@/lib/schemas" -import { redisClient } from "@/lib/server/redis" - -// Storage base path for eval logs -const EVALS_STORAGE_PATH = "/tmp/evals/runs" - -const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals") - -export async function createRun({ - suite, - exercises = [], - timeout, - iterations = 1, - executionMethod = "vscode", - ...values -}: CreateRun) { - const run = await _createRun({ - ...values, - timeout, - executionMethod, - socketPath: "", // TODO: Get rid of this. - }) - - if (suite === "partial") { - for (const path of exercises) { - const [language, exercise] = path.split("/") - - if (!language || !exercise) { - throw new Error("Invalid exercise path: " + path) - } - - // Create multiple tasks for each iteration - for (let iteration = 1; iteration <= iterations; iteration++) { - await createTask({ - ...values, - runId: run.id, - language: language as ExerciseLanguage, - exercise, - iteration, - }) - } - } - } else { - for (const language of exerciseLanguages) { - const languageExercises = await getExercisesForLanguage(EVALS_REPO_PATH, language) - - // Create tasks for all iterations of each exercise - const tasksToCreate: Array<{ language: ExerciseLanguage; exercise: string; iteration: number }> = [] - for (const exercise of languageExercises) { - for (let iteration = 1; iteration <= iterations; iteration++) { - tasksToCreate.push({ language, exercise, iteration }) - } - } - - await pMap( - tasksToCreate, - ({ language, exercise, iteration }) => createTask({ runId: run.id, language, exercise, iteration }), - { concurrency: 10 }, - ) - } - } - - revalidatePath("/runs") - - try { - const isRunningInDocker = fs.existsSync("/.dockerenv") - - const dockerArgs = [ - `--name evals-controller-${run.id}`, - "--rm", - "--network evals_default", - "-v /var/run/docker.sock:/var/run/docker.sock", - "-v /tmp/evals:/var/log/evals", - "-e HOST_EXECUTION_METHOD=docker", - ] - - const cliCommand = `pnpm --filter @roo-code/evals cli --runId ${run.id}` - - const command = isRunningInDocker - ? `docker run ${dockerArgs.join(" ")} evals-runner sh -c "${cliCommand}"` - : cliCommand - - console.log("spawn ->", command) - - const childProcess = spawn("sh", ["-c", command], { - detached: true, - stdio: ["ignore", "pipe", "pipe"], - }) - - const logStream = fs.createWriteStream("/tmp/roo-code-evals.log", { flags: "a" }) - - if (childProcess.stdout) { - childProcess.stdout.pipe(logStream) - } - - if (childProcess.stderr) { - childProcess.stderr.pipe(logStream) - } - - childProcess.unref() - } catch (error) { - console.error(error) - } - - return run -} - -export async function deleteRun(runId: number) { - await _deleteRun(runId) - revalidatePath("/runs") -} - -export type KillRunResult = { - success: boolean - killedContainers: string[] - errors: string[] -} - -const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)) - -/** - * Kill all Docker containers associated with a run (controller and task runners). - * Kills the controller first, waits 10 seconds, then kills runners. - * Also clears Redis state for heartbeat and runners. - * - * Container naming conventions: - * - Controller: evals-controller-{runId} - * - Task runners: evals-task-{runId}-{taskId}.{attempt} - */ -export async function killRun(runId: number): Promise { - const killedContainers: string[] = [] - const errors: string[] = [] - const controllerPattern = `evals-controller-${runId}` - const taskPattern = `evals-task-${runId}-` - - try { - // Step 1: Kill the controller first - console.log(`Killing controller: ${controllerPattern}`) - try { - execFileSync("docker", ["kill", controllerPattern], { encoding: "utf-8", timeout: 10000 }) - killedContainers.push(controllerPattern) - console.log(`Killed controller container: ${controllerPattern}`) - } catch (_error) { - // Controller might not be running - that's ok, continue to kill runners - console.log(`Controller ${controllerPattern} not running or already stopped`) - } - - // Step 2: Wait 10 seconds before killing runners - console.log("Waiting 10 seconds before killing runners...") - await sleep(10000) - - // Step 3: Find and kill all task runner containers for THIS run only - let taskContainerNames: string[] = [] - - try { - const output = execFileSync("docker", ["ps", "--format", "{{.Names}}", "--filter", `name=${taskPattern}`], { - encoding: "utf-8", - timeout: 10000, - }) - taskContainerNames = output - .split("\n") - .map((name) => name.trim()) - .filter((name) => name.length > 0 && name.startsWith(taskPattern)) - } catch (error) { - console.error("Failed to list task containers:", error) - errors.push("Failed to list Docker task containers") - } - - // Kill each task runner container - for (const containerName of taskContainerNames) { - try { - execFileSync("docker", ["kill", containerName], { encoding: "utf-8", timeout: 10000 }) - killedContainers.push(containerName) - console.log(`Killed task container: ${containerName}`) - } catch (error) { - // Container might have already stopped - console.error(`Failed to kill container ${containerName}:`, error) - errors.push(`Failed to kill container: ${containerName}`) - } - } - - // Step 4: Clear Redis state - try { - const redis = await redisClient() - const heartbeatKey = `heartbeat:${runId}` - const runnersKey = `runners:${runId}` - - await redis.del(heartbeatKey) - await redis.del(runnersKey) - console.log(`Cleared Redis keys: ${heartbeatKey}, ${runnersKey}`) - } catch (error) { - console.error("Failed to clear Redis state:", error) - errors.push("Failed to clear Redis state") - } - } catch (error) { - console.error("Error in killRun:", error) - errors.push("Unexpected error while killing containers") - } - - revalidatePath(`/runs/${runId}`) - revalidatePath("/runs") - - return { - success: killedContainers.length > 0 || errors.length === 0, - killedContainers, - errors, - } -} - -export type DeleteIncompleteRunsResult = { - success: boolean - deletedCount: number - deletedRunIds: number[] - storageErrors: string[] -} - -/** - * Delete all incomplete runs (runs without a taskMetricsId/final score). - * Removes both database records and storage folders. - */ -export async function deleteIncompleteRuns(): Promise { - const storageErrors: string[] = [] - - // Get all incomplete runs - const incompleteRuns = await _getIncompleteRuns() - const runIds = incompleteRuns.map((run) => run.id) - - if (runIds.length === 0) { - return { - success: true, - deletedCount: 0, - deletedRunIds: [], - storageErrors: [], - } - } - - // Delete storage folders for each run - for (const runId of runIds) { - const storagePath = path.join(EVALS_STORAGE_PATH, String(runId)) - try { - if (fs.existsSync(storagePath)) { - fs.rmSync(storagePath, { recursive: true, force: true }) - console.log(`Deleted storage folder: ${storagePath}`) - } - } catch (error) { - console.error(`Failed to delete storage folder ${storagePath}:`, error) - storageErrors.push(`Failed to delete storage for run ${runId}`) - } - - // Also try to clear Redis state for any potentially running incomplete runs - try { - const redis = await redisClient() - await redis.del(`heartbeat:${runId}`) - await redis.del(`runners:${runId}`) - } catch (error) { - // Non-critical error, just log it - console.error(`Failed to clear Redis state for run ${runId}:`, error) - } - } - - // Delete from database - await _deleteRunsByIds(runIds) - - revalidatePath("/runs") - - return { - success: true, - deletedCount: runIds.length, - deletedRunIds: runIds, - storageErrors, - } -} - -/** - * Get count of incomplete runs (for UI display) - */ -export async function getIncompleteRunsCount(): Promise { - const incompleteRuns = await _getIncompleteRuns() - return incompleteRuns.length -} - -/** - * Delete all runs older than 30 days. - * Removes both database records and storage folders. - */ -export async function deleteOldRuns(): Promise { - const storageErrors: string[] = [] - - // Get all runs older than 30 days - const thirtyDaysAgo = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000) - const { getRuns } = await import("@roo-code/evals") - const allRuns = await getRuns() - const oldRuns = allRuns.filter((run) => run.createdAt < thirtyDaysAgo) - const runIds = oldRuns.map((run) => run.id) - - if (runIds.length === 0) { - return { - success: true, - deletedCount: 0, - deletedRunIds: [], - storageErrors: [], - } - } - - // Delete storage folders for each run - for (const runId of runIds) { - const storagePath = path.join(EVALS_STORAGE_PATH, String(runId)) - try { - if (fs.existsSync(storagePath)) { - fs.rmSync(storagePath, { recursive: true, force: true }) - console.log(`Deleted storage folder: ${storagePath}`) - } - } catch (error) { - console.error(`Failed to delete storage folder ${storagePath}:`, error) - storageErrors.push(`Failed to delete storage for run ${runId}`) - } - - // Also try to clear Redis state - try { - const redis = await redisClient() - await redis.del(`heartbeat:${runId}`) - await redis.del(`runners:${runId}`) - } catch (error) { - // Non-critical error, just log it - console.error(`Failed to clear Redis state for run ${runId}:`, error) - } - } - - // Delete from database - await _deleteRunsByIds(runIds) - - revalidatePath("/runs") - - return { - success: true, - deletedCount: runIds.length, - deletedRunIds: runIds, - storageErrors, - } -} - -/** - * Update the description of a run. - */ -export async function updateRunDescription(runId: number, description: string | null): Promise<{ success: boolean }> { - try { - await _updateRun(runId, { description }) - revalidatePath("/runs") - revalidatePath(`/runs/${runId}`) - return { success: true } - } catch (error) { - console.error("Failed to update run description:", error) - return { success: false } - } -} diff --git a/apps/web-evals/src/actions/tasks.ts b/apps/web-evals/src/actions/tasks.ts deleted file mode 100644 index 18b428b0ca..0000000000 --- a/apps/web-evals/src/actions/tasks.ts +++ /dev/null @@ -1,11 +0,0 @@ -"use server" - -import { revalidatePath } from "next/cache" - -import { getTasks as _getTasks } from "@roo-code/evals" - -export async function getTasks(runId: number) { - const tasks = await _getTasks(runId) - revalidatePath(`/runs/${runId}`) - return tasks -} diff --git a/apps/web-evals/src/app/api/runs/[id]/logs/[taskId]/route.ts b/apps/web-evals/src/app/api/runs/[id]/logs/[taskId]/route.ts deleted file mode 100644 index e5ec8751ab..0000000000 --- a/apps/web-evals/src/app/api/runs/[id]/logs/[taskId]/route.ts +++ /dev/null @@ -1,74 +0,0 @@ -import { NextResponse } from "next/server" -import type { NextRequest } from "next/server" -import * as fs from "node:fs/promises" -import * as path from "node:path" - -import { findTask, findRun } from "@roo-code/evals" - -export const dynamic = "force-dynamic" - -const LOG_BASE_PATH = "/tmp/evals/runs" - -// Sanitize path components to prevent path traversal attacks -function sanitizePathComponent(component: string): string { - // Remove any path separators, null bytes, and other dangerous characters - return component.replace(/[/\\:\0*?"<>|]/g, "_") -} - -export async function GET(request: NextRequest, { params }: { params: Promise<{ id: string; taskId: string }> }) { - const { id, taskId } = await params - - try { - const runId = Number(id) - const taskIdNum = Number(taskId) - - if (isNaN(runId) || isNaN(taskIdNum)) { - return NextResponse.json({ error: "Invalid run ID or task ID" }, { status: 400 }) - } - - // Verify the run exists - await findRun(runId) - - // Get the task to find its language and exercise - const task = await findTask(taskIdNum) - - // Verify the task belongs to this run - if (task.runId !== runId) { - return NextResponse.json({ error: "Task does not belong to this run" }, { status: 404 }) - } - - // Sanitize language and exercise to prevent path traversal - const safeLanguage = sanitizePathComponent(task.language) - const safeExercise = sanitizePathComponent(task.exercise) - - // Construct the log file path - const logFileName = `${safeLanguage}-${safeExercise}.log` - const logFilePath = path.join(LOG_BASE_PATH, String(runId), logFileName) - - // Verify the resolved path is within the expected directory (defense in depth) - const resolvedPath = path.resolve(logFilePath) - const expectedBase = path.resolve(LOG_BASE_PATH) - if (!resolvedPath.startsWith(expectedBase)) { - return NextResponse.json({ error: "Invalid log path" }, { status: 400 }) - } - - // Check if the log file exists and read it (async) - try { - const logContent = await fs.readFile(logFilePath, "utf-8") - return NextResponse.json({ logContent }) - } catch (err) { - if ((err as NodeJS.ErrnoException).code === "ENOENT") { - return NextResponse.json({ error: "Log file not found", logContent: null }, { status: 200 }) - } - throw err - } - } catch (error) { - console.error("Error reading task log:", error) - - if (error instanceof Error && error.name === "RecordNotFoundError") { - return NextResponse.json({ error: "Task or run not found" }, { status: 404 }) - } - - return NextResponse.json({ error: "Failed to read log file" }, { status: 500 }) - } -} diff --git a/apps/web-evals/src/app/api/runs/[id]/logs/failed/route.ts b/apps/web-evals/src/app/api/runs/[id]/logs/failed/route.ts deleted file mode 100644 index 8b2760df98..0000000000 --- a/apps/web-evals/src/app/api/runs/[id]/logs/failed/route.ts +++ /dev/null @@ -1,147 +0,0 @@ -import { NextResponse } from "next/server" -import type { NextRequest } from "next/server" -import * as fs from "node:fs" -import * as path from "node:path" -import archiver from "archiver" - -import { findRun, getTasks } from "@roo-code/evals" - -export const dynamic = "force-dynamic" - -const LOG_BASE_PATH = "/tmp/evals/runs" - -// Sanitize path components to prevent path traversal attacks -function sanitizePathComponent(component: string): string { - // Remove any path separators, null bytes, and other dangerous characters - return component.replace(/[/\\:\0*?"<>|]/g, "_") -} - -export async function GET(request: NextRequest, { params }: { params: Promise<{ id: string }> }) { - const { id } = await params - - try { - const runId = Number(id) - - if (isNaN(runId)) { - return NextResponse.json({ error: "Invalid run ID" }, { status: 400 }) - } - - // Verify the run exists - await findRun(runId) - - // Get all tasks for this run - const tasks = await getTasks(runId) - - // Filter for failed tasks only - const failedTasks = tasks.filter((task) => task.passed === false) - - if (failedTasks.length === 0) { - return NextResponse.json({ error: "No failed tasks to export" }, { status: 400 }) - } - - // Create a zip archive - const archive = archiver("zip", { zlib: { level: 9 } }) - - // Collect chunks to build the response - const chunks: Buffer[] = [] - - archive.on("data", (chunk: Buffer) => { - chunks.push(chunk) - }) - - // Track archive errors - let archiveError: Error | null = null - archive.on("error", (err: Error) => { - archiveError = err - }) - - // Set up the end promise before finalizing (proper event listener ordering) - const archiveEndPromise = new Promise((resolve, reject) => { - archive.on("end", resolve) - archive.on("error", reject) - }) - - // Add each failed task's log file and history files to the archive - const logDir = path.join(LOG_BASE_PATH, String(runId)) - let filesAdded = 0 - - for (const task of failedTasks) { - // Sanitize language and exercise to prevent path traversal - const safeLanguage = sanitizePathComponent(task.language) - const safeExercise = sanitizePathComponent(task.exercise) - const expectedBase = path.resolve(LOG_BASE_PATH) - - // Add the log file - const logFileName = `${safeLanguage}-${safeExercise}.log` - const logFilePath = path.join(logDir, logFileName) - - // Verify the resolved path is within the expected directory (defense in depth) - const resolvedLogPath = path.resolve(logFilePath) - if (resolvedLogPath.startsWith(expectedBase) && fs.existsSync(logFilePath)) { - archive.file(logFilePath, { name: logFileName }) - filesAdded++ - } - - // Add the API conversation history file - // Format: {language}-{exercise}.{iteration}_api_conversation_history.json - const apiHistoryFileName = `${safeLanguage}-${safeExercise}.${task.iteration}_api_conversation_history.json` - const apiHistoryFilePath = path.join(logDir, apiHistoryFileName) - const resolvedApiHistoryPath = path.resolve(apiHistoryFilePath) - if (resolvedApiHistoryPath.startsWith(expectedBase) && fs.existsSync(apiHistoryFilePath)) { - archive.file(apiHistoryFilePath, { name: apiHistoryFileName }) - filesAdded++ - } - - // Add the UI messages file - // Format: {language}-{exercise}.{iteration}_ui_messages.json - const uiMessagesFileName = `${safeLanguage}-${safeExercise}.${task.iteration}_ui_messages.json` - const uiMessagesFilePath = path.join(logDir, uiMessagesFileName) - const resolvedUiMessagesPath = path.resolve(uiMessagesFilePath) - if (resolvedUiMessagesPath.startsWith(expectedBase) && fs.existsSync(uiMessagesFilePath)) { - archive.file(uiMessagesFilePath, { name: uiMessagesFileName }) - filesAdded++ - } - } - - // Check if any files were actually added - if (filesAdded === 0) { - archive.abort() - return NextResponse.json( - { error: "No log files found - they may have been cleared from disk" }, - { status: 404 }, - ) - } - - // Finalize the archive - await archive.finalize() - - // Wait for all data to be collected - await archiveEndPromise - - // Check for archive errors - if (archiveError) { - throw archiveError - } - - // Combine all chunks into a single buffer - const zipBuffer = Buffer.concat(chunks) - - // Return the zip file - return new NextResponse(zipBuffer, { - status: 200, - headers: { - "Content-Type": "application/zip", - "Content-Disposition": `attachment; filename="run-${runId}-failed-logs.zip"`, - "Content-Length": String(zipBuffer.length), - }, - }) - } catch (error) { - console.error("Error exporting failed logs:", error) - - if (error instanceof Error && error.name === "RecordNotFoundError") { - return NextResponse.json({ error: "Run not found" }, { status: 404 }) - } - - return NextResponse.json({ error: "Failed to export logs" }, { status: 500 }) - } -} diff --git a/apps/web-evals/src/app/api/runs/[id]/stream/route.ts b/apps/web-evals/src/app/api/runs/[id]/stream/route.ts deleted file mode 100644 index 3168974ecd..0000000000 --- a/apps/web-evals/src/app/api/runs/[id]/stream/route.ts +++ /dev/null @@ -1,71 +0,0 @@ -import type { NextRequest } from "next/server" - -import { taskEventSchema } from "@roo-code/types" -import { findRun } from "@roo-code/evals" - -import { SSEStream } from "@/lib/server/sse-stream" -import { redisClient } from "@/lib/server/redis" - -export const dynamic = "force-dynamic" - -export async function GET(request: NextRequest, { params }: { params: Promise<{ id: string }> }) { - const { id } = await params - const requestId = crypto.randomUUID() - const stream = new SSEStream() - const run = await findRun(Number(id)) - const redis = await redisClient() - - let isStreamClosed = false - const channelName = `evals:${run.id}` - - const onMessage = async (data: string) => { - if (isStreamClosed || stream.isClosed) { - return - } - - try { - const taskEvent = taskEventSchema.parse(JSON.parse(data)) - // console.log(`[stream#${requestId}] task event -> ${taskEvent.eventName}`) - const writeSuccess = await stream.write(JSON.stringify(taskEvent)) - - if (!writeSuccess) { - await disconnect() - } - } catch (_error) { - console.error(`[stream#${requestId}] invalid task event:`, data) - } - } - - const disconnect = async () => { - if (isStreamClosed) { - return - } - - isStreamClosed = true - - try { - await redis.unsubscribe(channelName) - console.log(`[stream#${requestId}] unsubscribed from ${channelName}`) - } catch (error) { - console.error(`[stream#${requestId}] error unsubscribing:`, error) - } - - try { - await stream.close() - } catch (error) { - console.error(`[stream#${requestId}] error closing stream:`, error) - } - } - - await redis.subscribe(channelName, onMessage) - - request.signal.addEventListener("abort", () => { - console.log(`[stream#${requestId}] abort`) - - disconnect().catch((error) => { - console.error(`[stream#${requestId}] cleanup error:`, error) - }) - }) - - return stream.getResponse() -} diff --git a/apps/web-evals/src/app/favicon.ico b/apps/web-evals/src/app/favicon.ico deleted file mode 100644 index 718d6fea48..0000000000 Binary files a/apps/web-evals/src/app/favicon.ico and /dev/null differ diff --git a/apps/web-evals/src/app/globals.css b/apps/web-evals/src/app/globals.css deleted file mode 100644 index 8c12f0d1d2..0000000000 --- a/apps/web-evals/src/app/globals.css +++ /dev/null @@ -1,141 +0,0 @@ -@import "tailwindcss"; - -@plugin "tailwindcss-animate"; - -@custom-variant dark (&:is(.dark *)); - -:root { - --radius: 0.625rem; - --background: oklch(1 0 0); - --foreground: oklch(0.145 0 0); - --card: oklch(1 0 0); - --card-foreground: oklch(0.145 0 0); - --popover: oklch(1 0 0); - --popover-foreground: oklch(0.145 0 0); - --primary: oklch(0.205 0 0); - --primary-foreground: oklch(0.985 0 0); - --secondary: oklch(0.97 0 0); - --secondary-foreground: oklch(0.205 0 0); - --muted: oklch(0.97 0 0); - --muted-foreground: oklch(0.556 0 0); - --accent: oklch(0.97 0 0); - --accent-foreground: oklch(0.205 0 0); - --destructive: oklch(0.577 0.245 27.325); - --border: oklch(0.922 0 0); - --input: oklch(0.922 0 0); - --ring: oklch(0.708 0 0); - --chart-1: oklch(0.646 0.222 41.116); - --chart-2: oklch(0.6 0.118 184.704); - --chart-3: oklch(0.398 0.07 227.392); - --chart-4: oklch(0.828 0.189 84.429); - --chart-5: oklch(0.769 0.188 70.08); - --sidebar: oklch(0.985 0 0); - --sidebar-foreground: oklch(0.145 0 0); - --sidebar-primary: oklch(0.205 0 0); - --sidebar-primary-foreground: oklch(0.985 0 0); - --sidebar-accent: oklch(0.97 0 0); - --sidebar-accent-foreground: oklch(0.205 0 0); - --sidebar-border: oklch(0.922 0 0); - --sidebar-ring: oklch(0.708 0 0); -} - -.dark { - --background: oklch(23.66% 0.0198 271.79); - --foreground: oklch(75.15% 0.0477 278.41); - --card: oklch(0.205 0 0); - --card-foreground: oklch(0.985 0 0); - --popover: var(--primary); - --popover-foreground: oklch(0.985 0 0); - --primary: oklch(29.33% 0.0295 276.18); - --primary-foreground: var(--accent); - --secondary: var(--primary); - --secondary-foreground: var(--foreground); - --muted: oklch(28.27% 0.0207 273.06); - --muted-foreground: oklch(75.15% 0.0477 278.41 / 75%); - --accent: oklch(70.21% 0.1813 328.71); - --accent-foreground: oklch(1 0 0 / 75%); - --destructive: oklch(72.14% 0.1616 15.49); - --border: var(--primary); - --input: var(--primary); - --ring: oklch(83.63% 0.1259 176.52); - --chart-1: oklch(0.488 0.243 264.376); - --chart-2: oklch(0.696 0.17 162.48); - --chart-3: oklch(0.769 0.188 70.08); - --chart-4: oklch(0.627 0.265 303.9); - --chart-5: oklch(0.645 0.246 16.439); - --sidebar: oklch(0.205 0 0); - --sidebar-foreground: oklch(0.985 0 0); - --sidebar-primary: oklch(0.488 0.243 264.376); - --sidebar-primary-foreground: oklch(0.985 0 0); - --sidebar-accent: oklch(0.269 0 0); - --sidebar-accent-foreground: oklch(0.985 0 0); - --sidebar-border: oklch(1 0 0 / 10%); - --sidebar-ring: oklch(0.556 0 0); -} - -@theme inline { - --color-background: var(--background); - --color-foreground: var(--foreground); - --color-sidebar-ring: var(--sidebar-ring); - --color-sidebar-border: var(--sidebar-border); - --color-sidebar-accent-foreground: var(--sidebar-accent-foreground); - --color-sidebar-accent: var(--sidebar-accent); - --color-sidebar-primary-foreground: var(--sidebar-primary-foreground); - --color-sidebar-primary: var(--sidebar-primary); - --color-sidebar-foreground: var(--sidebar-foreground); - --color-sidebar: var(--sidebar); - --color-chart-5: var(--chart-5); - --color-chart-4: var(--chart-4); - --color-chart-3: var(--chart-3); - --color-chart-2: var(--chart-2); - --color-chart-1: var(--chart-1); - --color-ring: var(--ring); - --color-input: var(--input); - --color-border: var(--border); - --color-destructive: var(--destructive); - --color-accent-foreground: var(--accent-foreground); - --color-accent: var(--accent); - --color-muted-foreground: var(--muted-foreground); - --color-muted: var(--muted); - --color-secondary-foreground: var(--secondary-foreground); - --color-secondary: var(--secondary); - --color-primary-foreground: var(--primary-foreground); - --color-primary: var(--primary); - --color-popover-foreground: var(--popover-foreground); - --color-popover: var(--popover); - --color-card-foreground: var(--card-foreground); - --color-card: var(--card); - --radius-sm: calc(var(--radius) - 4px); - --radius-md: calc(var(--radius) - 2px); - --radius-lg: var(--radius); - --radius-xl: calc(var(--radius) + 4px); - - --animate-hop: hop 0.8s ease-in-out infinite; - - @keyframes hop { - 0%, - 100% { - transform: none; - animation-timing-function: cubic-bezier(0.8, 0, 1, 1); - } - 50% { - transform: translateY(-8px); - animation-timing-function: cubic-bezier(0, 0, 0.2, 1); - } - } -} - -@layer base { - * { - @apply border-border outline-ring/50; - } - html, - body { - height: 100%; - } - body { - @apply bg-background text-foreground; - scrollbar-color: rgba(0, 0, 0, 0.2) transparent; /* Firefox */ - scrollbar-width: thin; - } -} diff --git a/apps/web-evals/src/app/layout.tsx b/apps/web-evals/src/app/layout.tsx deleted file mode 100644 index 3bb34f7dfb..0000000000 --- a/apps/web-evals/src/app/layout.tsx +++ /dev/null @@ -1,35 +0,0 @@ -import type { Metadata } from "next" -import { Geist, Geist_Mono } from "next/font/google" - -import { ThemeProvider, ReactQueryProvider } from "@/components/providers" -import { Toaster } from "@/components/ui" -import { Header } from "@/components/layout/header" - -import "./globals.css" - -const fontSans = Geist({ variable: "--font-sans", subsets: ["latin"] }) -const fontMono = Geist_Mono({ variable: "--font-mono", subsets: ["latin"] }) - -export const metadata: Metadata = { - title: "Roo Code Evals", -} - -export default function RootLayout({ - children, -}: Readonly<{ - children: React.ReactNode -}>) { - return ( - - - - -
- {children} - - - - - - ) -} diff --git a/apps/web-evals/src/app/page.tsx b/apps/web-evals/src/app/page.tsx deleted file mode 100644 index 3dcb26aebf..0000000000 --- a/apps/web-evals/src/app/page.tsx +++ /dev/null @@ -1,10 +0,0 @@ -import { getRuns } from "@roo-code/evals" - -import { Runs } from "@/components/home/runs" - -export const dynamic = "force-dynamic" - -export default async function Page() { - const runs = await getRuns() - return -} diff --git a/apps/web-evals/src/app/runs/[id]/page.tsx b/apps/web-evals/src/app/runs/[id]/page.tsx deleted file mode 100644 index 8b993eec8a..0000000000 --- a/apps/web-evals/src/app/runs/[id]/page.tsx +++ /dev/null @@ -1,14 +0,0 @@ -import { findRun } from "@roo-code/evals" - -import { Run } from "./run" - -export default async function Page({ params }: { params: Promise<{ id: string }> }) { - const { id } = await params - const run = await findRun(Number(id)) - - return ( -
- -
- ) -} diff --git a/apps/web-evals/src/app/runs/[id]/run-status.tsx b/apps/web-evals/src/app/runs/[id]/run-status.tsx deleted file mode 100644 index e05b1b51eb..0000000000 --- a/apps/web-evals/src/app/runs/[id]/run-status.tsx +++ /dev/null @@ -1,79 +0,0 @@ -"use client" - -import { Link2, Link2Off, CheckCircle2 } from "lucide-react" -import type { RunStatus as _RunStatus } from "@/hooks/use-run-status" -import { cn } from "@/lib/utils" -import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui" - -function StreamIcon({ status }: { status: "connected" | "waiting" | "error" }) { - if (status === "connected") { - return - } - return -} - -export const RunStatus = ({ - runStatus: { sseStatus, heartbeat, runners = [] }, - isComplete = false, -}: { - runStatus: _RunStatus - isComplete?: boolean -}) => { - // For completed runs, show a simple "Complete" badge - if (isComplete) { - return ( - - -
- -
-
- - Run complete - -
- ) - } - - return ( - - -
- {/* Task Stream status icon */} - - - {/* Task Controller ID */} - {heartbeat ?? "-"} - - {/* Task Runners count */} - 0 ? "text-green-500" : "text-rose-500"}> - {runners.length > 0 ? `${runners.length}r` : "0r"} - -
-
- -
-
- - Task Stream: {sseStatus} -
-
- ā— - Task Controller: {heartbeat ?? "dead"} -
-
- 0 ? "text-green-500" : "text-rose-500"}>ā— - Task Runners: {runners.length > 0 ? runners.length : "none"} -
- {runners.length > 0 && ( -
- {runners.map((runner) => ( -
{runner}
- ))} -
- )} -
-
-
- ) -} diff --git a/apps/web-evals/src/app/runs/[id]/run.tsx b/apps/web-evals/src/app/runs/[id]/run.tsx deleted file mode 100644 index badd77741e..0000000000 --- a/apps/web-evals/src/app/runs/[id]/run.tsx +++ /dev/null @@ -1,1058 +0,0 @@ -"use client" - -import { useMemo, useState, useCallback, useEffect, Fragment } from "react" -import { toast } from "sonner" -import { LoaderCircle, FileText, Copy, Check, StopCircle, List, Layers } from "lucide-react" - -import type { Run, TaskMetrics as _TaskMetrics, Task } from "@roo-code/evals" -import type { ToolName } from "@roo-code/types" - -import { formatCurrency, formatDuration, formatTokens, formatToolUsageSuccessRate } from "@/lib/formatters" -import { useRunStatus } from "@/hooks/use-run-status" -import { killRun } from "@/actions/runs" -import { - Table, - TableBody, - TableCell, - TableHead, - TableHeader, - TableRow, - Tooltip, - TooltipContent, - TooltipTrigger, - Dialog, - DialogContent, - DialogHeader, - DialogTitle, - ScrollArea, - Button, - AlertDialog, - AlertDialogAction, - AlertDialogCancel, - AlertDialogContent, - AlertDialogDescription, - AlertDialogFooter, - AlertDialogHeader, - AlertDialogTitle, -} from "@/components/ui" - -import { TaskStatus } from "./task-status" -import { RunStatus } from "./run-status" - -type TaskMetrics = Pick<_TaskMetrics, "tokensIn" | "tokensOut" | "tokensContext" | "duration" | "cost"> - -// Extended Task type with taskMetrics from useRunStatus -type TaskWithMetrics = Task & { taskMetrics: _TaskMetrics | null } - -type ToolUsageEntry = { attempts: number; failures: number } -type ToolUsage = Record - -// Generate abbreviation from tool name (e.g., "read_file" -> "RF", "list_code_definition_names" -> "LCDN") -function getToolAbbreviation(toolName: string): string { - return toolName - .split("_") - .map((word) => word[0]?.toUpperCase() ?? "") - .join("") -} - -// Pattern definitions for syntax highlighting -type HighlightPattern = { - pattern: RegExp - className: string - // If true, wraps the entire match; if a number, wraps that capture group - wrapGroup?: number -} - -const HIGHLIGHT_PATTERNS: HighlightPattern[] = [ - // Log levels - styled as badges - { pattern: /\|\s*(INFO)\s*\|/g, className: "text-green-400", wrapGroup: 1 }, - { pattern: /\|\s*(WARN|WARNING)\s*\|/g, className: "text-yellow-400", wrapGroup: 1 }, - { pattern: /\|\s*(ERROR)\s*\|/g, className: "text-red-400 font-semibold", wrapGroup: 1 }, - { pattern: /\|\s*(DEBUG)\s*\|/g, className: "text-gray-400", wrapGroup: 1 }, - // Task identifiers - important events - { - pattern: /(taskCreated|taskFocused|taskStarted|taskCompleted|taskAborted|taskResumable)/g, - className: "text-purple-400 font-medium", - }, - // Tool failures - highlight in red - { pattern: /(taskToolFailed)/g, className: "text-red-400 font-bold" }, - { pattern: /(Tool execution failed|tool.*failed|failed.*tool)/gi, className: "text-red-400" }, - { pattern: /(EvalPass)/g, className: "text-green-400 font-bold" }, - { pattern: /(EvalFail)/g, className: "text-red-400 font-bold" }, - // Message arrows - { pattern: /→/g, className: "text-cyan-400" }, - // Tool names in quotes - { pattern: /"(tool)":\s*"([^"]+)"/g, className: "text-orange-400" }, - // JSON keys - { pattern: /"([^"]+)":/g, className: "text-sky-300" }, - // Boolean values - { pattern: /:\s*(true|false)/g, className: "text-amber-400", wrapGroup: 1 }, - // Numbers - { pattern: /:\s*(-?\d+\.?\d*)/g, className: "text-emerald-400", wrapGroup: 1 }, -] - -// Extract timestamp from a log line and return elapsed time from baseline -function formatElapsedTime(timestamp: string, baselineMs: number): string { - const currentMs = new Date(timestamp).getTime() - const elapsedMs = currentMs - baselineMs - const totalSeconds = Math.floor(elapsedMs / 1000) - const minutes = Math.floor(totalSeconds / 60) - const seconds = totalSeconds % 60 - return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}` -} - -// Extract the first timestamp from the log to use as baseline -function extractFirstTimestamp(log: string): number | null { - // Match timestamp at start of line: [2025-11-28T09:35:23.187Z | ... or [2025-11-28T09:35:23.187Z] - const match = log.match(/\[(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z)[\s|\]]/) - const isoString = match?.[1] - if (!isoString) return null - return new Date(isoString).getTime() -} - -// Simplify log line by removing redundant metadata -function simplifyLogLine(line: string, baselineMs: number | null): { timestamp: string; simplified: string } { - // Extract timestamp - matches [2025-11-28T09:35:23.187Z | ... format - const timestampMatch = line.match(/\[(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z)[\s|\]]/) - const isoTimestamp = timestampMatch?.[1] - if (!isoTimestamp) { - return { timestamp: "", simplified: line } - } - - const timestamp = baselineMs !== null ? formatElapsedTime(isoTimestamp, baselineMs) : isoTimestamp.slice(11, 19) - - // Remove the timestamp from the line (handles both [timestamp] and [timestamp | formats) - let simplified = line.replace(/\[\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z\s*\|?\s*/, "") - - // Remove redundant metadata: pid, run, task IDs (they're same for entire log) - simplified = simplified.replace(/\|\s*pid:\d+\s*/g, "") - simplified = simplified.replace(/\|\s*run:\d+\s*/g, "") - simplified = simplified.replace(/\|\s*task:\d+\s*/g, "") - simplified = simplified.replace(/runTask\s*\|\s*/g, "") - - // Clean up extra pipes, spaces, and trailing brackets - simplified = simplified.replace(/\|\s*\|/g, "|") - simplified = simplified.replace(/^\s*\|\s*/, "") - simplified = simplified.replace(/\]\s*$/, "") // Remove trailing bracket if present - - return { timestamp, simplified } -} - -// Format a single line with syntax highlighting using React elements (XSS-safe) -function formatLine(line: string): React.ReactNode[] { - // Find all matches with their positions - type Match = { start: number; end: number; text: string; className: string } - const matches: Match[] = [] - - for (const { pattern, className, wrapGroup } of HIGHLIGHT_PATTERNS) { - // Reset regex state - pattern.lastIndex = 0 - let regexMatch - while ((regexMatch = pattern.exec(line)) !== null) { - const capturedText = wrapGroup !== undefined ? regexMatch[wrapGroup] : regexMatch[0] - // Skip if capture group didn't match - if (!capturedText) continue - const start = - wrapGroup !== undefined ? regexMatch.index + regexMatch[0].indexOf(capturedText) : regexMatch.index - matches.push({ - start, - end: start + capturedText.length, - text: capturedText, - className, - }) - } - } - - // Sort matches by position and filter overlapping ones - matches.sort((a, b) => a.start - b.start) - const filteredMatches: Match[] = [] - for (const m of matches) { - const lastMatch = filteredMatches[filteredMatches.length - 1] - if (!lastMatch || m.start >= lastMatch.end) { - filteredMatches.push(m) - } - } - - // Build result with highlighted spans - const result: React.ReactNode[] = [] - let currentPos = 0 - - for (const [i, m] of filteredMatches.entries()) { - // Add text before this match - if (m.start > currentPos) { - result.push(line.slice(currentPos, m.start)) - } - // Add highlighted match - result.push( - - {m.text} - , - ) - currentPos = m.end - } - - // Add remaining text - if (currentPos < line.length) { - result.push(line.slice(currentPos)) - } - - return result.length > 0 ? result : [line] -} - -// Determine the visual style for a log line based on its content -function getLineStyle(line: string): string { - if (line.includes("ERROR")) return "bg-red-950/30 border-l-2 border-red-500" - if (line.includes("WARN") || line.includes("WARNING")) return "bg-yellow-950/20 border-l-2 border-yellow-500" - if (line.includes("taskToolFailed")) return "bg-red-950/30 border-l-2 border-red-500" - if (line.includes("taskStarted") || line.includes("taskCreated")) return "bg-purple-950/20" - if (line.includes("EvalPass")) return "bg-green-950/30 border-l-2 border-green-500" - if (line.includes("EvalFail")) return "bg-red-950/30 border-l-2 border-red-500" - if (line.includes("taskCompleted") || line.includes("taskAborted")) return "bg-blue-950/20" - return "" -} - -// Format log content with basic highlighting (XSS-safe - no dangerouslySetInnerHTML) -function formatLogContent(log: string): React.ReactNode[] { - const lines = log.split("\n") - const baselineMs = extractFirstTimestamp(log) - - return lines.map((line, index) => { - if (!line.trim()) { - return ( -
- {" "} -
- ) - } - - const parsed = simplifyLogLine(line, baselineMs) - const lineStyle = getLineStyle(line) - - return ( -
- {/* Elapsed time */} - - {parsed.timestamp} - - {/* Log content - pl-12 ensures wrapped lines are indented under the timestamp */} - - {formatLine(parsed.simplified)} - -
- ) - }) -} - -export function Run({ run }: { run: Run }) { - const runStatus = useRunStatus(run) - const { tasks, tokenUsage, toolUsage, usageUpdatedAt, heartbeat, runners } = runStatus - - const [selectedTask, setSelectedTask] = useState(null) - const [taskLog, setTaskLog] = useState(null) - const [isLoadingLog, setIsLoadingLog] = useState(false) - const [copied, setCopied] = useState(false) - const [showKillDialog, setShowKillDialog] = useState(false) - const [isKilling, setIsKilling] = useState(false) - const [groupByStatus, setGroupByStatus] = useState(() => { - // Initialize from localStorage if available (client-side only) - if (typeof window !== "undefined") { - const stored = localStorage.getItem("evals-group-by-status") - return stored === "true" - } - return false - }) - - // Persist groupByStatus to localStorage - useEffect(() => { - localStorage.setItem("evals-group-by-status", String(groupByStatus)) - }, [groupByStatus]) - - // Determine if run is still active (has heartbeat or runners) - const isRunActive = !run.taskMetricsId && (!!heartbeat || (runners && runners.length > 0)) - - const onKillRun = useCallback(async () => { - setIsKilling(true) - try { - const result = await killRun(run.id) - if (result.killedContainers.length > 0) { - toast.success(`Killed ${result.killedContainers.length} container(s)`) - } else if (result.errors.length === 0) { - toast.info("No running containers found") - } else { - toast.error(result.errors.join(", ")) - } - } catch (error) { - console.error("Failed to kill run:", error) - toast.error("Failed to kill run") - } finally { - setIsKilling(false) - setShowKillDialog(false) - } - }, [run.id]) - - const onCopyLog = useCallback(async () => { - if (!taskLog) return - - try { - await navigator.clipboard.writeText(taskLog) - setCopied(true) - toast.success("Log copied to clipboard") - setTimeout(() => setCopied(false), 2000) - } catch (error) { - console.error("Failed to copy log:", error) - toast.error("Failed to copy log") - } - }, [taskLog]) - - // Handle ESC key to close the dialog - useEffect(() => { - const handleKeyDown = (e: KeyboardEvent) => { - if (e.key === "Escape" && selectedTask) { - setSelectedTask(null) - } - } - - document.addEventListener("keydown", handleKeyDown) - return () => document.removeEventListener("keydown", handleKeyDown) - }, [selectedTask]) - - const taskMetrics: Record = useMemo(() => { - // Reference usageUpdatedAt to trigger recomputation when Map contents change - void usageUpdatedAt - const metrics: Record = {} - - // Helper to calculate duration from database timestamps when streaming duration - // is unavailable (e.g., page was loaded after TaskStarted event was published) - const calculateDurationFromTimestamps = (task: TaskWithMetrics): number => { - if (!task.startedAt) return 0 - const startTime = new Date(task.startedAt).getTime() - const endTime = task.finishedAt ? new Date(task.finishedAt).getTime() : Date.now() - return endTime - startTime - } - - tasks?.forEach((task) => { - const streamingUsage = tokenUsage.get(task.id) - const dbMetrics = task.taskMetrics - - // For finished tasks, prefer DB values but fall back to streaming values - // This handles race conditions during timeout where DB might not have latest data - if (task.finishedAt) { - // Check if DB metrics have meaningful values (not just default/empty) - const dbHasData = dbMetrics && (dbMetrics.tokensIn > 0 || dbMetrics.tokensOut > 0 || dbMetrics.cost > 0) - if (dbHasData) { - // If DB duration is 0 but we have timestamps, calculate from timestamps - const duration = dbMetrics.duration || calculateDurationFromTimestamps(task) - metrics[task.id] = { ...dbMetrics, duration } - } else if (streamingUsage) { - // Fall back to streaming values if DB is empty/stale - // Use streaming duration, or calculate from timestamps if not available - const duration = streamingUsage.duration || calculateDurationFromTimestamps(task) - metrics[task.id] = { - tokensIn: streamingUsage.totalTokensIn, - tokensOut: streamingUsage.totalTokensOut, - tokensContext: streamingUsage.contextTokens, - duration, - cost: streamingUsage.totalCost, - } - } else { - // Task finished but no DB metrics and no streaming data - // (e.g., page loaded after task completed, metrics not persisted) - // Still provide duration calculated from timestamps - metrics[task.id] = { - tokensIn: 0, - tokensOut: 0, - tokensContext: 0, - duration: calculateDurationFromTimestamps(task), - cost: 0, - } - } - } else if (streamingUsage) { - // For running tasks, use streaming values - // Use streaming duration, or calculate from task.startedAt if not available - // (happens when page loads after TaskStarted event was already published) - const duration = streamingUsage.duration || calculateDurationFromTimestamps(task) - metrics[task.id] = { - tokensIn: streamingUsage.totalTokensIn, - tokensOut: streamingUsage.totalTokensOut, - tokensContext: streamingUsage.contextTokens, - duration, - cost: streamingUsage.totalCost, - } - } else if (task.startedAt) { - // Task has started (has startedAt in DB) but no streaming data yet - // This can happen when page loads after TaskStarted but before TokenUsageUpdated - metrics[task.id] = { - tokensIn: 0, - tokensOut: 0, - tokensContext: 0, - duration: calculateDurationFromTimestamps(task), - cost: 0, - } - } - }) - - return metrics - }, [tasks, tokenUsage, usageUpdatedAt]) - - const onViewTaskLog = useCallback( - async (task: Task) => { - // Only allow viewing logs for tasks that have started. - // Note: we treat presence of derived metrics as evidence of a started task, - // since this page may be rendered without streaming `tokenUsage` populated. - const hasStarted = !!task.startedAt || !!tokenUsage.get(task.id) || !!taskMetrics[task.id] - if (!hasStarted) { - toast.error("Task has not started yet") - return - } - - setSelectedTask(task) - setIsLoadingLog(true) - setTaskLog(null) - - try { - const response = await fetch(`/api/runs/${run.id}/logs/${task.id}`) - - if (!response.ok) { - const error = await response.json() - toast.error(error.error || "Failed to load log") - setSelectedTask(null) - return - } - - const data = await response.json() - setTaskLog(data.logContent) - } catch (error) { - console.error("Error loading task log:", error) - toast.error("Failed to load log") - setSelectedTask(null) - } finally { - setIsLoadingLog(false) - } - }, - [run.id, tokenUsage, taskMetrics], - ) - - // Collect all unique tool names from all tasks and sort by total attempts - const toolColumns = useMemo(() => { - // Reference usageUpdatedAt to trigger recomputation when Map contents change - void usageUpdatedAt - if (!tasks) return [] - - const toolTotals = new Map() - - for (const task of tasks) { - // Get both DB and streaming values - const dbToolUsage = task.taskMetrics?.toolUsage - const streamingToolUsage = toolUsage.get(task.id) - - // For finished tasks, prefer DB values but fall back to streaming values - // For running tasks, use streaming values - // This handles race conditions during timeout where DB might not have latest data - const taskToolUsage = task.finishedAt - ? dbToolUsage && Object.keys(dbToolUsage).length > 0 - ? dbToolUsage - : streamingToolUsage - : streamingToolUsage - - if (taskToolUsage) { - for (const [toolName, usage] of Object.entries(taskToolUsage)) { - const tool = toolName as ToolName - const current = toolTotals.get(tool) ?? 0 - toolTotals.set(tool, current + usage.attempts) - } - } - } - - // Sort by total attempts descending - return Array.from(toolTotals.entries()) - .sort((a, b) => b[1] - a[1]) - .map(([name]): ToolName => name) - // toolUsage ref is stable; usageUpdatedAt triggers recomputation when Map contents change - }, [tasks, toolUsage, usageUpdatedAt]) - - // Compute aggregate stats - const stats = useMemo(() => { - // Reference usageUpdatedAt to trigger recomputation when Map contents change - void usageUpdatedAt - if (!tasks) return null - - const passed = tasks.filter((t) => t.passed === true).length - const failed = tasks.filter((t) => t.passed === false).length - const completed = passed + failed - - let totalTokensIn = 0 - let totalTokensOut = 0 - let totalCost = 0 - let totalDuration = 0 - - // Aggregate tool usage from all tasks (both finished and running) - const toolUsageAggregate: ToolUsage = {} - - for (const task of tasks) { - const metrics = taskMetrics[task.id] - if (metrics) { - totalTokensIn += metrics.tokensIn - totalTokensOut += metrics.tokensOut - totalCost += metrics.cost - totalDuration += metrics.duration - } - - // Aggregate tool usage: prefer DB values for finished tasks, fall back to streaming values - // This handles race conditions during timeout where DB might not have latest data - const dbToolUsage = task.taskMetrics?.toolUsage - const streamingToolUsage = toolUsage.get(task.id) - const taskToolUsage = task.finishedAt - ? dbToolUsage && Object.keys(dbToolUsage).length > 0 - ? dbToolUsage - : streamingToolUsage - : streamingToolUsage - - if (taskToolUsage) { - for (const [key, usage] of Object.entries(taskToolUsage)) { - const tool = key as keyof ToolUsage - if (!toolUsageAggregate[tool]) { - toolUsageAggregate[tool] = { attempts: 0, failures: 0 } - } - toolUsageAggregate[tool].attempts += usage.attempts - toolUsageAggregate[tool].failures += usage.failures - } - } - } - - const remaining = tasks.length - completed - - return { - passed, - failed, - completed, - remaining, - passRate: completed > 0 ? ((passed / completed) * 100).toFixed(1) : null, - totalTokensIn, - totalTokensOut, - totalCost, - totalDuration, - toolUsage: toolUsageAggregate, - } - // Map refs are stable; usageUpdatedAt triggers recomputation when Map contents change - }, [tasks, taskMetrics, toolUsage, usageUpdatedAt]) - - // Calculate elapsed time (wall-clock time from run creation to completion or now) - const elapsedTime = useMemo(() => { - // Reference usageUpdatedAt to trigger recomputation for live elapsed time updates - void usageUpdatedAt - if (!tasks || tasks.length === 0) return null - - const startTime = new Date(run.createdAt).getTime() - - // If run is complete, find the latest finishedAt from tasks - if (run.taskMetricsId) { - const latestFinish = tasks.reduce((latest, task) => { - if (task.finishedAt) { - const finishTime = new Date(task.finishedAt).getTime() - return finishTime > latest ? finishTime : latest - } - return latest - }, startTime) - return latestFinish - startTime - } - - // If still running, use current time - return Date.now() - startTime - }, [tasks, run.createdAt, run.taskMetricsId, usageUpdatedAt]) - - // Task status categories - type TaskStatusCategory = "failed" | "in_progress" | "passed" | "not_started" - - const getTaskStatusCategory = useCallback( - (task: TaskWithMetrics): TaskStatusCategory => { - if (task.passed === false) return "failed" - if (task.passed === true) return "passed" - // Check streaming data, DB metrics, or startedAt timestamp - const hasStarted = !!task.startedAt || !!tokenUsage.get(task.id) || !!taskMetrics[task.id] - if (hasStarted) return "in_progress" - return "not_started" - }, - [tokenUsage, taskMetrics], - ) - - // Group tasks by status while preserving original index - const groupedTasks = useMemo(() => { - if (!tasks || !groupByStatus) return null - - const groups: Record> = { - failed: [], - in_progress: [], - passed: [], - not_started: [], - } - - tasks.forEach((task, index) => { - const status = getTaskStatusCategory(task) - groups[status].push({ task, originalIndex: index }) - }) - - return groups - }, [tasks, groupByStatus, getTaskStatusCategory]) - - const statusLabels = useMemo( - (): Record => ({ - failed: { label: "Failed", className: "text-red-500", count: groupedTasks?.failed.length ?? 0 }, - in_progress: { - label: "In Progress", - className: "text-yellow-500", - count: groupedTasks?.in_progress.length ?? 0, - }, - passed: { label: "Passed", className: "text-green-500", count: groupedTasks?.passed.length ?? 0 }, - not_started: { - label: "Not Started", - className: "text-muted-foreground", - count: groupedTasks?.not_started.length ?? 0, - }, - }), - [groupedTasks], - ) - - const statusOrder: TaskStatusCategory[] = ["failed", "in_progress", "passed", "not_started"] - - // Helper to render a task row - const renderTaskRow = (task: TaskWithMetrics, originalIndex: number) => { - const hasStarted = !!task.startedAt || !!tokenUsage.get(task.id) || !!taskMetrics[task.id] - return ( - hasStarted && onViewTaskLog(task)}> - - {originalIndex + 1} - - -
- -
- - {task.language}/{task.exercise} - {task.iteration > 1 && ( - (#{task.iteration}) - )} - - {hasStarted && ( - - - - - Click to view log - - )} -
-
-
- {taskMetrics[task.id] ? ( - <> - -
-
{formatTokens(taskMetrics[task.id]!.tokensIn)}
/ -
{formatTokens(taskMetrics[task.id]!.tokensOut)}
-
-
- - {formatTokens(taskMetrics[task.id]!.tokensContext)} - - {toolColumns.map((toolName) => { - const dbUsage = task.taskMetrics?.toolUsage?.[toolName] - const streamingUsage = toolUsage.get(task.id)?.[toolName] - const usage = task.finishedAt ? (dbUsage ?? streamingUsage) : streamingUsage - - const successRate = - usage && usage.attempts > 0 - ? ((usage.attempts - usage.failures) / usage.attempts) * 100 - : 100 - const rateColor = - successRate === 100 - ? "text-muted-foreground" - : successRate >= 80 - ? "text-yellow-500" - : "text-red-500" - return ( - - {usage ? ( -
- {usage.attempts} - {formatToolUsageSuccessRate(usage)} -
- ) : ( - - - )} -
- ) - })} - - {taskMetrics[task.id]!.duration ? formatDuration(taskMetrics[task.id]!.duration) : "-"} - - - {formatCurrency(taskMetrics[task.id]!.cost)} - - - ) : ( - - )} -
- ) - } - - return ( - <> -
- {!tasks ? ( - - ) : ( - <> - {/* View Toggle */} -
- - - - - - {groupByStatus ? "Show tasks in run order" : "Group tasks by status"} - - -
- - - {stats && ( - - - {/* Provider, Model title and status */} -
- {run.settings?.apiProvider && ( - - {run.settings.apiProvider} - - )} -
{run.model}
- - {run.description && ( - - - {run.description} - - )} - {isRunActive && ( - - - - - - Stop all containers for this run - - - )} -
- {/* Main Stats Row */} -
- {/* Pass Rate / Fail Rate / Remaining % */} -
-
- - {stats.completed > 0 - ? `${((stats.passed / stats.completed) * 100).toFixed(1)}%` - : "-"} - - / - - {stats.completed > 0 - ? `${((stats.failed / stats.completed) * 100).toFixed(1)}%` - : "-"} - - / - - {tasks.length > 0 - ? `${((stats.remaining / tasks.length) * 100).toFixed(1)}%` - : "-"} - -
-
- {stats.passed} - {" / "} - {stats.failed} - {" / "} - {stats.remaining} - {" of "} - {tasks.length} -
-
- - {/* Tokens */} -
-
- {formatTokens(stats.totalTokensIn)} - / - {formatTokens(stats.totalTokensOut)} -
-
Tokens In / Out
-
- - {/* Cost */} -
-
- {formatCurrency(stats.totalCost)} -
-
Cost
-
- - {/* Duration */} -
-
- {stats.totalDuration > 0 - ? formatDuration(stats.totalDuration) - : "-"} -
-
Duration
-
- - {/* Elapsed Time */} -
-
- {elapsedTime !== null ? formatDuration(elapsedTime) : "-"} -
-
Elapsed
-
- - {/* Estimated Time Remaining - only show if run is active and we have data */} - {!run.taskMetricsId && - elapsedTime !== null && - stats.completed > 0 && - stats.remaining > 0 && ( -
-
- ~ - {formatDuration( - (elapsedTime / stats.completed) * stats.remaining, - )} -
-
- Est. Remaining -
-
- )} -
- - {/* Tool Usage Row */} - {Object.keys(stats.toolUsage).length > 0 && ( -
- {Object.entries(stats.toolUsage) - .sort(([, a], [, b]) => b.attempts - a.attempts) - .map(([toolName, usage]) => { - const abbr = getToolAbbreviation(toolName) - const successRate = - usage.attempts > 0 - ? ((usage.attempts - usage.failures) / - usage.attempts) * - 100 - : 100 - const rateColor = - successRate === 100 - ? "text-green-500" - : successRate >= 80 - ? "text-yellow-500" - : "text-red-500" - return ( - - -
- - {abbr} - - - {usage.attempts} - - - {formatToolUsageSuccessRate(usage)} - -
-
- - {toolName} - -
- ) - })} -
- )} -
-
- )} - - # - Exercise - Tokens In / Out - Context - {toolColumns.map((toolName) => ( - - - {getToolAbbreviation(toolName)} - {toolName} - - - ))} - Duration - Cost - -
- - {groupByStatus && groupedTasks - ? // Grouped view - statusOrder.map((status) => { - const group = groupedTasks[status] - if (group.length === 0) return null - const { label, className } = statusLabels[status] - return ( - - - - - {label} ({group.length}) - - - - {group.map(({ task, originalIndex }) => - renderTaskRow(task, originalIndex), - )} - - ) - }) - : // Default order view - tasks.map((task, index) => renderTaskRow(task, index))} - -
- - )} -
- - {/* Task Log Dialog - Full Screen */} - setSelectedTask(null)}> - - -
- - - {selectedTask?.language}/{selectedTask?.exercise} - {selectedTask?.iteration && selectedTask.iteration > 1 && ( - (#{selectedTask.iteration}) - )} - - ( - {selectedTask?.passed === true - ? "Passed" - : selectedTask?.passed === false - ? "Failed" - : "Running"} - ) - - - {taskLog && ( - - )} -
-
-
- {isLoadingLog ? ( -
- -
- ) : taskLog ? ( - -
- {formatLogContent(taskLog)} -
-
- ) : ( -
- Log file not available (may have been cleared) -
- )} -
-
-
- - {/* Kill Run Confirmation Dialog */} - - - - Kill Run? - - This will stop the controller and all task runner containers for this run. Any running tasks - will be terminated immediately. This action cannot be undone. - - - - Cancel - - {isKilling ? ( - <> - - Killing... - - ) : ( - "Kill Run" - )} - - - - - - ) -} diff --git a/apps/web-evals/src/app/runs/[id]/task-status.tsx b/apps/web-evals/src/app/runs/[id]/task-status.tsx deleted file mode 100644 index bae785131a..0000000000 --- a/apps/web-evals/src/app/runs/[id]/task-status.tsx +++ /dev/null @@ -1,20 +0,0 @@ -import { CircleCheck, CircleDashed, CircleSlash, LoaderCircle } from "lucide-react" - -import type { Task } from "@roo-code/evals" - -type TaskStatusProps = { - task: Task - running: boolean -} - -export const TaskStatus = ({ task, running }: TaskStatusProps) => { - return task.passed === false ? ( - - ) : task.passed === true ? ( - - ) : running ? ( - - ) : ( - - ) -} diff --git a/apps/web-evals/src/app/runs/new/new-run.tsx b/apps/web-evals/src/app/runs/new/new-run.tsx deleted file mode 100644 index 47eabe7fe0..0000000000 --- a/apps/web-evals/src/app/runs/new/new-run.tsx +++ /dev/null @@ -1,992 +0,0 @@ -"use client" - -import { useCallback, useEffect, useMemo, useRef, useState } from "react" -import { useRouter } from "next/navigation" -import { z } from "zod" -import { useQuery } from "@tanstack/react-query" -import { useForm, FormProvider } from "react-hook-form" -import { zodResolver } from "@hookform/resolvers/zod" -import { toast } from "sonner" -import { - X, - Rocket, - Check, - ChevronsUpDown, - SlidersHorizontal, - Info, - Plus, - Minus, - Terminal, - MonitorPlay, -} from "lucide-react" - -import { - type ProviderSettings, - type GlobalSettings, - globalSettingsSchema, - providerSettingsSchema, - getModelId, - EVALS_SETTINGS, -} from "@roo-code/types" - -import { createRun } from "@/actions/runs" -import { getExercises } from "@/actions/exercises" - -import { - type CreateRun, - type ExecutionMethod, - createRunSchema, - CONCURRENCY_MIN, - CONCURRENCY_MAX, - CONCURRENCY_DEFAULT, - TIMEOUT_MIN, - TIMEOUT_MAX, - TIMEOUT_DEFAULT, - ITERATIONS_MIN, - ITERATIONS_MAX, - ITERATIONS_DEFAULT, -} from "@/lib/schemas" -import { cn } from "@/lib/utils" - -import { normalizeCreateRunForSubmit } from "@/lib/normalize-create-run" - -import { useOpenRouterModels } from "@/hooks/use-open-router-models" - -import { - Button, - FormControl, - FormField, - FormItem, - FormLabel, - FormMessage, - Textarea, - Tabs, - TabsList, - TabsTrigger, - MultiSelect, - Command, - CommandEmpty, - CommandGroup, - CommandInput, - CommandItem, - CommandList, - Popover, - PopoverContent, - PopoverTrigger, - Slider, - Label, - Tooltip, - TooltipContent, - TooltipTrigger, -} from "@/components/ui" - -import { SettingsDiff } from "./settings-diff" - -type ImportedSettings = { - apiConfigs: Record - globalSettings: GlobalSettings - currentApiConfigName: string -} - -type ModelSelection = { - id: string - model: string - popoverOpen: boolean -} - -type ConfigSelection = { - id: string - configName: string - popoverOpen: boolean -} - -export function NewRun() { - const router = useRouter() - const modelSelectionsByProviderRef = useRef>({}) - const modelValueByProviderRef = useRef>({}) - - const [provider, setModelSource] = useState<"openrouter" | "other">("openrouter") - const [executionMethod, setExecutionMethod] = useState("vscode") - const [commandExecutionTimeout, setCommandExecutionTimeout] = useState(20) - const [terminalShellIntegrationTimeout, setTerminalShellIntegrationTimeout] = useState(30) // seconds - - const [modelSelections, setModelSelections] = useState([ - { id: crypto.randomUUID(), model: "", popoverOpen: false }, - ]) - - const [importedSettings, setImportedSettings] = useState(null) - const [configSelections, setConfigSelections] = useState([ - { id: crypto.randomUUID(), configName: "", popoverOpen: false }, - ]) - - const openRouter = useOpenRouterModels() - const models = provider === "openrouter" ? openRouter.data : [] - const searchValue = provider === "openrouter" ? openRouter.searchValue : "" - const setSearchValue = provider === "openrouter" ? openRouter.setSearchValue : () => {} - const onFilter = provider === "openrouter" ? openRouter.onFilter : undefined - - const exercises = useQuery({ queryKey: ["getExercises"], queryFn: () => getExercises() }) - - const [selectedExercises, setSelectedExercises] = useState([]) - - const form = useForm({ - resolver: zodResolver(createRunSchema), - defaultValues: { - model: "", - description: "", - suite: "full", - exercises: [], - settings: undefined, - concurrency: CONCURRENCY_DEFAULT, - timeout: TIMEOUT_DEFAULT, - iterations: ITERATIONS_DEFAULT, - jobToken: "", - executionMethod: "vscode", - }, - }) - - const { - register, - setValue, - clearErrors, - watch, - getValues, - formState: { isSubmitting }, - } = form - - const [suite, settings] = watch(["suite", "settings", "concurrency"]) - - // Ensure the `exercises` field is registered so RHF always includes it in submit values. - useEffect(() => { - register("exercises") - }, [register]) - - // Load settings from localStorage on mount - useEffect(() => { - const savedConcurrency = localStorage.getItem("evals-concurrency") - - if (savedConcurrency) { - const parsed = parseInt(savedConcurrency, 10) - - if (!isNaN(parsed) && parsed >= CONCURRENCY_MIN && parsed <= CONCURRENCY_MAX) { - setValue("concurrency", parsed) - } - } - - const savedTimeout = localStorage.getItem("evals-timeout") - - if (savedTimeout) { - const parsed = parseInt(savedTimeout, 10) - - if (!isNaN(parsed) && parsed >= TIMEOUT_MIN && parsed <= TIMEOUT_MAX) { - setValue("timeout", parsed) - } - } - - const savedCommandTimeout = localStorage.getItem("evals-command-execution-timeout") - - if (savedCommandTimeout) { - const parsed = parseInt(savedCommandTimeout, 10) - - if (!isNaN(parsed) && parsed >= 20 && parsed <= 60) { - setCommandExecutionTimeout(parsed) - } - } - - const savedShellTimeout = localStorage.getItem("evals-shell-integration-timeout") - - if (savedShellTimeout) { - const parsed = parseInt(savedShellTimeout, 10) - - if (!isNaN(parsed) && parsed >= 30 && parsed <= 60) { - setTerminalShellIntegrationTimeout(parsed) - } - } - - const savedSuite = localStorage.getItem("evals-suite") - - if (savedSuite === "partial") { - setValue("suite", "partial") - const savedExercises = localStorage.getItem("evals-exercises") - if (savedExercises) { - try { - const parsed = JSON.parse(savedExercises) as string[] - if (Array.isArray(parsed)) { - setSelectedExercises(parsed) - setValue("exercises", parsed) - } - } catch { - // Invalid JSON, ignore. - } - } - } - }, [setValue]) - - // Track previous provider to detect switches - const [prevProvider, setPrevProvider] = useState(provider) - - // Preserve selections per provider; avoids cross-contamination while keeping UX stable. - useEffect(() => { - if (provider === prevProvider) return - - modelSelectionsByProviderRef.current[prevProvider] = modelSelections - modelValueByProviderRef.current[prevProvider] = getValues("model") - - const nextModelSelections = - modelSelectionsByProviderRef.current[provider] ?? - ([{ id: crypto.randomUUID(), model: "", popoverOpen: false }] satisfies ModelSelection[]) - - setModelSelections(nextModelSelections) - - const nextModelValue = - modelValueByProviderRef.current[provider] ?? - nextModelSelections.find((s) => s.model.trim().length > 0)?.model ?? - (provider === "other" && importedSettings && configSelections[0]?.configName - ? (getModelId(importedSettings.apiConfigs[configSelections[0].configName] ?? {}) ?? "") - : "") - - setValue("model", nextModelValue) - setPrevProvider(provider) - }, [provider, prevProvider, modelSelections, setValue, getValues, importedSettings, configSelections]) - - // Extract unique languages from exercises - const languages = useMemo(() => { - if (!exercises.data) { - return [] - } - - const langs = new Set() - - for (const path of exercises.data) { - const lang = path.split("/")[0] - - if (lang) { - langs.add(lang) - } - } - - return Array.from(langs).sort() - }, [exercises.data]) - - const getExercisesForLanguage = useCallback( - (lang: string) => { - if (!exercises.data) { - return [] - } - - return exercises.data.filter((path) => path.startsWith(`${lang}/`)) - }, - [exercises.data], - ) - - const toggleLanguage = useCallback( - (lang: string) => { - const langExercises = getExercisesForLanguage(lang) - const allSelected = langExercises.every((ex) => selectedExercises.includes(ex)) - - let newSelected: string[] - - if (allSelected) { - newSelected = selectedExercises.filter((ex) => !ex.startsWith(`${lang}/`)) - } else { - const existing = new Set(selectedExercises) - - for (const ex of langExercises) { - existing.add(ex) - } - - newSelected = Array.from(existing) - } - - setSelectedExercises(newSelected) - setValue("exercises", newSelected) - localStorage.setItem("evals-exercises", JSON.stringify(newSelected)) - }, - [getExercisesForLanguage, selectedExercises, setValue], - ) - - const isLanguageSelected = useCallback( - (lang: string) => { - const langExercises = getExercisesForLanguage(lang) - return langExercises.length > 0 && langExercises.every((ex) => selectedExercises.includes(ex)) - }, - [getExercisesForLanguage, selectedExercises], - ) - - const isLanguagePartiallySelected = useCallback( - (lang: string) => { - const langExercises = getExercisesForLanguage(lang) - const selectedCount = langExercises.filter((ex) => selectedExercises.includes(ex)).length - return selectedCount > 0 && selectedCount < langExercises.length - }, - [getExercisesForLanguage, selectedExercises], - ) - - const addModelSelection = useCallback(() => { - setModelSelections((prev) => [...prev, { id: crypto.randomUUID(), model: "", popoverOpen: false }]) - }, []) - - const removeModelSelection = useCallback((id: string) => { - setModelSelections((prev) => prev.filter((s) => s.id !== id)) - }, []) - - const updateModelSelection = useCallback( - (id: string, model: string) => { - setModelSelections((prev) => prev.map((s) => (s.id === id ? { ...s, model, popoverOpen: false } : s))) - // Also set the form model field for validation (use first non-empty model). - setValue("model", model) - }, - [setValue], - ) - - const toggleModelPopover = useCallback((id: string, open: boolean) => { - setModelSelections((prev) => prev.map((s) => (s.id === id ? { ...s, popoverOpen: open } : s))) - }, []) - - const addConfigSelection = useCallback(() => { - setConfigSelections((prev) => [...prev, { id: crypto.randomUUID(), configName: "", popoverOpen: false }]) - }, []) - - const removeConfigSelection = useCallback((id: string) => { - setConfigSelections((prev) => prev.filter((s) => s.id !== id)) - }, []) - - const updateConfigSelection = useCallback( - (id: string, configName: string) => { - setConfigSelections((prev) => prev.map((s) => (s.id === id ? { ...s, configName, popoverOpen: false } : s))) - - // Also update the form settings for the first config (for validation). - if (importedSettings) { - const providerSettings = importedSettings.apiConfigs[configName] ?? {} - setValue("model", getModelId(providerSettings) ?? "") - setValue("settings", { ...EVALS_SETTINGS, ...providerSettings, ...importedSettings.globalSettings }) - } - }, - [importedSettings, setValue], - ) - - const toggleConfigPopover = useCallback((id: string, open: boolean) => { - setConfigSelections((prev) => prev.map((s) => (s.id === id ? { ...s, popoverOpen: open } : s))) - }, []) - - const onSubmit = useCallback( - async (values: CreateRun) => { - try { - const baseValues = normalizeCreateRunForSubmit(values, selectedExercises, suite) - - const selectionsToLaunch: { model: string; configName?: string }[] = [] - - if (provider === "other") { - for (const config of configSelections) { - if (config.configName) { - selectionsToLaunch.push({ model: "", configName: config.configName }) - } - } - } else { - for (const selection of modelSelections) { - if (selection.model) { - selectionsToLaunch.push({ model: selection.model }) - } - } - } - - if (selectionsToLaunch.length === 0) { - toast.error("Please select at least one model or config") - return - } - - const totalRuns = selectionsToLaunch.length - toast.info(totalRuns > 1 ? `Launching ${totalRuns} runs (every 20 seconds)...` : "Launching run...") - - for (let i = 0; i < selectionsToLaunch.length; i++) { - const selection = selectionsToLaunch[i]! - - // Wait 20 seconds between runs (except for the first one). - if (i > 0) { - await new Promise((resolve) => setTimeout(resolve, 20_000)) - } - - const runValues = { ...baseValues } - - if (provider === "openrouter") { - runValues.model = selection.model - runValues.settings = { - ...(runValues.settings || {}), - apiProvider: "openrouter", - openRouterModelId: selection.model, - commandExecutionTimeout, - terminalShellIntegrationTimeout: terminalShellIntegrationTimeout * 1000, - } - } else if (provider === "other" && selection.configName && importedSettings) { - const providerSettings = importedSettings.apiConfigs[selection.configName] ?? {} - runValues.model = getModelId(providerSettings) ?? "" - runValues.settings = { - ...EVALS_SETTINGS, - ...providerSettings, - ...importedSettings.globalSettings, - commandExecutionTimeout, - terminalShellIntegrationTimeout: terminalShellIntegrationTimeout * 1000, - } - } - - try { - await createRun(runValues) - toast.success(`Run ${i + 1}/${totalRuns} launched`) - } catch (e) { - toast.error(`Run ${i + 1} failed: ${e instanceof Error ? e.message : "Unknown error"}`) - } - } - - router.push("/") - } catch (e) { - toast.error(e instanceof Error ? e.message : "An unknown error occurred.") - } - }, - [ - suite, - selectedExercises, - provider, - modelSelections, - configSelections, - importedSettings, - router, - commandExecutionTimeout, - terminalShellIntegrationTimeout, - ], - ) - - const onImportSettings = useCallback( - async (event: React.ChangeEvent) => { - const file = event.target.files?.[0] - - if (!file) { - return - } - - clearErrors("settings") - - try { - const { providerProfiles, globalSettings } = z - .object({ - providerProfiles: z.object({ - currentApiConfigName: z.string(), - apiConfigs: z.record(z.string(), providerSettingsSchema), - }), - globalSettings: globalSettingsSchema, - }) - .parse(JSON.parse(await file.text())) - - setImportedSettings({ - apiConfigs: providerProfiles.apiConfigs, - globalSettings, - currentApiConfigName: providerProfiles.currentApiConfigName, - }) - - const defaultConfigName = providerProfiles.currentApiConfigName - setConfigSelections([{ id: crypto.randomUUID(), configName: defaultConfigName, popoverOpen: false }]) - - const providerSettings = providerProfiles.apiConfigs[defaultConfigName] ?? {} - setValue("model", getModelId(providerSettings) ?? "") - setValue("settings", { ...EVALS_SETTINGS, ...providerSettings, ...globalSettings }) - - event.target.value = "" - } catch (e) { - console.error(e) - toast.error(e instanceof Error ? e.message : "An unknown error occurred.") - } - }, - [clearErrors, setValue], - ) - - return ( - <> - -
- ( - - setModelSource(value as "openrouter" | "other")}> - - Import - OpenRouter - - - - {provider === "other" ? ( -
- - - - {importedSettings && Object.keys(importedSettings.apiConfigs).length > 0 && ( -
- - {configSelections.map((selection, index) => ( -
- - toggleConfigPopover(selection.id, open) - }> - - - - - - - - No config found. - - {Object.keys( - importedSettings.apiConfigs, - ).map((configName) => ( - - updateConfigSelection( - selection.id, - configName, - ) - }> - {configName} - {configName === - importedSettings.currentApiConfigName && ( - - (default) - - )} - - - ))} - - - - - - {index === configSelections.length - 1 ? ( - - ) : ( - - )} -
- ))} -
- )} - - {settings && ( - - )} -
- ) : ( - <> -
- {modelSelections.map((selection, index) => ( -
- toggleModelPopover(selection.id, open)}> - - - - - - - - No model found. - - {models?.map(({ id, name }) => ( - - updateModelSelection( - selection.id, - id, - ) - }> - {name} - - - ))} - - - - - - {index === modelSelections.length - 1 ? ( - - ) : ( - - )} -
- ))} -
- - )} - - -
- )} - /> - - ( - - Exercises -
- { - setValue("suite", value as "full" | "partial") - localStorage.setItem("evals-suite", value) - if (value === "full") { - setSelectedExercises([]) - setValue("exercises", []) - localStorage.removeItem("evals-exercises") - } - }}> - - All - Some - - - {suite === "partial" && languages.length > 0 && ( -
- {languages.map((lang) => ( - - ))} -
- )} -
- {suite === "partial" && ( - ({ value: path, label: path })) || []} - value={selectedExercises} - onValueChange={(value) => { - setSelectedExercises(value) - setValue("exercises", value) - localStorage.setItem("evals-exercises", JSON.stringify(value)) - }} - placeholder="Select" - variant="inverted" - maxCount={4} - /> - )} - -
- )} - /> - - {/* Concurrency, Timeout, and Iterations in a 3-column row */} -
- ( - - Concurrency - -
- { - field.onChange(value[0]) - localStorage.setItem("evals-concurrency", String(value[0])) - }} - /> -
{field.value}
-
-
- -
- )} - /> - - ( - - Timeout (Minutes) - -
- { - field.onChange(value[0]) - localStorage.setItem("evals-timeout", String(value[0])) - }} - /> -
{field.value}
-
-
- -
- )} - /> - - ( - - Iterations - -
- { - field.onChange(value[0]) - }} - /> -
{field.value}
-
-
- -
- )} - /> -
- - {/* Terminal timeouts in a 2-column row */} -
- -
- - - - - - -

- Maximum time in seconds to wait for terminal command execution to complete - before timing out. This applies to commands run via the execute_command - tool. -

-
-
-
-
- { - if (value !== undefined) { - setCommandExecutionTimeout(value) - localStorage.setItem("evals-command-execution-timeout", String(value)) - } - }} - /> -
{commandExecutionTimeout}
-
-
- - -
- - - - - - -

- Maximum time in seconds to wait for shell integration to initialize when - opening a new terminal. -

-
-
-
-
- { - if (value !== undefined) { - setTerminalShellIntegrationTimeout(value) - localStorage.setItem("evals-shell-integration-timeout", String(value)) - } - }} - /> -
{terminalShellIntegrationTimeout}
-
-
-
- - {/* Execution Method */} - ( - - Execution Method - { - const newExecutionMethod = value as ExecutionMethod - setExecutionMethod(newExecutionMethod) - setValue("executionMethod", newExecutionMethod) - }}> - - - - VSCode - - - - CLI - - - - - - )} - /> - - ( - - Description / Notes - -