getsentry · dcramer · Apr 10, 2026 · Apr 9, 2026 · Apr 10, 2026
diff --git a/packages/junior-evals/README.md b/packages/junior-evals/README.md
@@ -68,6 +68,7 @@ Harness override knobs (in `EvalOverrides`):
 - `mock_image_generation`: stub the image-generation HTTP response with a valid image payload while still exercising the real attachment path.
 - `plugin_dirs`: load plugin fixtures from eval-local directories without adding workspace packages.
 - `reply_texts`: override returned reply text per call.
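+- `reply_timeout_ms`: override the per-reply harness timeout, in milliseconds, for a specific slow scenario without changing the suite-wide default. Only positive values take effect; otherwise the harness falls back to `EVAL_AGENT_REPLY_TIMEOUT_MS` (45000 when unset).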
 - `subscribed_decisions`: controls the subscribed-message reply gate in the harness. If you use it, do not claim that reply-selection behavior is being validated by the eval itself.
 
 These knobs work by overriding services on the eval-local runtime instance. They must not reintroduce mutable global runtime behavior seams.

diff --git a/packages/junior-evals/evals/behavior-harness.ts b/packages/junior-evals/evals/behavior-harness.ts
@@ -124,6 +124,7 @@ export interface EvalOverrides {
   plugin_dirs?: string[];
   plugin_packages?: string[];
   reply_results?: EvalReplyResultFixture[];
+  reply_timeout_ms?: number;
   reply_texts?: string[];
   skill_dirs?: string[];
   subscribed_decisions?: SubscribedDecisionFixture[];
@@ -797,10 +798,11 @@ function buildRuntimeServices(
   const replyResults = scenario.overrides?.reply_results ?? [];
   const replyTexts = scenario.overrides?.reply_texts ?? [];
   const subscribedDecisions = scenario.overrides?.subscribed_decisions ?? [];
-  const replyTimeoutMs = Number.parseInt(
-    process.env.EVAL_AGENT_REPLY_TIMEOUT_MS ?? "45000",
-    10,
-  );
+  const replyTimeoutMs =
+    scenario.overrides?.reply_timeout_ms &&
+    scenario.overrides.reply_timeout_ms > 0
+      ? scenario.overrides.reply_timeout_ms
+      : Number.parseInt(process.env.EVAL_AGENT_REPLY_TIMEOUT_MS ?? "45000", 10);
   let replyCallCount = 0;
   let decisionIndex = 0;
   const replyState = { successfulCount: 0 };

diff --git a/packages/junior-evals/evals/core/passive-behavior.eval.ts b/packages/junior-evals/evals/core/passive-behavior.eval.ts
@@ -143,6 +143,62 @@ describe("Conversational Evals: Passive Behavior", () => {
       "The assistant posts two replies in order. The second reply provides more detail about the deploy changes because the follow-up explicitly references Junior's last response.",
   });
 
+  const terseFollowUpThread = {
+    id: "thread-passive-terse-follow-up",
+    channel_id: "C-passive-terse-follow-up",
+    thread_ts: "17000000.passive-terse-follow-up",
+  };
+
+  slackEval(
+    "passive: terse clarification right after Junior reply gets a reply",
+    {
+      overrides: {
+        reply_texts: [
+          "The deploy changed billing, auth, and the API gateway.",
+          "The three services were billing, auth, and the API gateway.",
+        ],
+      },
+      events: [
+        mention("What changed in the deploy?", {
+          thread: terseFollowUpThread,
+        }),
+        threadMessage("Which one?", {
+          thread: terseFollowUpThread,
+        }),
+      ],
+      criteria:
+        "The assistant posts two replies in order. The second reply clarifies which services changed because the terse follow-up 'Which one?' came immediately after Junior's answer and is naturally directed at Junior.",
+    },
+  );
+
+  const humansTookFloorThread = {
+    id: "thread-passive-humans-took-floor",
+    channel_id: "C-passive-humans-took-floor",
+    thread_ts: "17000000.passive-humans-took-floor",
+  };
+
+  slackEval(
+    "passive: same-topic question is skipped after humans take the floor",
+    {
+      overrides: {
+        reply_texts: ["The deploy changed billing, auth, and the API gateway."],
+      },
+      events: [
+        mention("What changed in the deploy?", {
+          thread: humansTookFloorThread,
+        }),
+        threadMessage("I think auth should roll back first.", {
+          thread: humansTookFloorThread,
+        }),
+        threadMessage("What about the billing worker timeline?", {
+          thread: humansTookFloorThread,
+        }),
+      ],
+      criteria:
+        "The assistant posts exactly one reply: the initial deploy summary. It does not answer the later same-topic question about the billing worker timeline because humans resumed the thread and the later question does not clearly turn back to Junior.",
+    },
+  );
+
   const optOutThread = {
     id: "thread-opt-out",
     channel_id: "C-opt-out",

diff --git a/packages/junior-evals/evals/github/skill-workflows.eval.ts b/packages/junior-evals/evals/github/skill-workflows.eval.ts
@@ -18,6 +18,7 @@ describe("Conversational Evals: GitHub Skill Workflows", () => {
     overrides: {
       enable_test_credentials: true,
       plugin_packages: ["@sentry/junior-github"],
+      reply_timeout_ms: 75000,
       test_credential_token: "eval-github-token",
       skill_dirs: ["../junior/skills"],
     },