From 8a0b48201120d4437262e1416d94f90b020c8d29 Mon Sep 17 00:00:00 2001
From: harshalmore31 <harshalmore2468@gmail.com>
Date: Wed, 27 May 2026 10:40:34 +0530
Subject: [PATCH] fix(agent): widen MCP client-session timeout to 30s for cold
 first runs

The openai-agent and deep-agent runners left MCP stdio client-session
timeouts at the SDK / library defaults, so a heavy server's cold-cache
first spawn (e.g. vibration importing scipy + numpy + DSP submodules)
could exceed the budget during session.initialize() and abort the
entire agent run with "McpError: Timed out ... Waited 5.0 seconds".

Plumb a 30s timeout through both runners.  Warm handshakes are
sub-second across all six servers so there is no operational downside.

claude-agent is unaffected: its Python-side initialize timeout already
defaults to 60s, and the stdio MCP servers it spawns are managed by
the bundled Claude Code CLI subprocess.

Closes #335.

Signed-off-by: harshalmore31 <harshalmore2468@gmail.com>
---
 src/agent/deep_agent/runner.py   | 10 ++++++++++
 src/agent/openai_agent/runner.py |  7 +++++++
 2 files changed, 17 insertions(+)

diff --git a/src/agent/deep_agent/runner.py b/src/agent/deep_agent/runner.py
index 3d975d55..ad3622e8 100644
--- a/src/agent/deep_agent/runner.py
+++ b/src/agent/deep_agent/runner.py
@@ -38,6 +38,11 @@
 
 _DEFAULT_MODEL = "litellm_proxy/aws/claude-opus-4-6"
 
+# Read timeout (seconds) for stdio MCP client sessions.  Cold-cache first
+# spawns of heavy servers (vibration imports scipy + numpy + DSP) can exceed
+# the mcp library's default handshake budget; warm runs are sub-second.
+_MCP_CLIENT_TIMEOUT_SECONDS = 30
+
 
 def _build_chat_model(model_id: str):
     """Construct a LangChain chat model for *model_id*.
@@ -85,6 +90,11 @@ def _build_mcp_connections(
             "command": "uv",
             "args": ["run", cmd_arg],
             "cwd": str(_REPO_ROOT),
+            "session_kwargs": {
+                "read_timeout_seconds": _dt.timedelta(
+                    seconds=_MCP_CLIENT_TIMEOUT_SECONDS
+                ),
+            },
         }
     return connections
 
diff --git a/src/agent/openai_agent/runner.py b/src/agent/openai_agent/runner.py
index 8dfccb48..c36fa745 100644
--- a/src/agent/openai_agent/runner.py
+++ b/src/agent/openai_agent/runner.py
@@ -39,6 +39,12 @@
 
 _DEFAULT_MODEL = "litellm_proxy/azure/gpt-5.4"
 
+# MCP client-session timeout for stdio servers.  The SDK default of 5s is too
+# tight for first-spawn cold-cache cases (heavy imports like scipy/numpy push
+# vibration cold start past 10s on a freshly synced env).  Warm handshakes are
+# all sub-second so a generous value has no operational downside.
+_MCP_CLIENT_TIMEOUT_SECONDS = 30
+
 
 def _build_run_config(model_id: str) -> RunConfig | None:
     """Build a RunConfig with a LiteLLM model provider when needed.
@@ -95,6 +101,7 @@ def _build_mcp_servers(
                     "args": ["run", cmd_arg],
                 },
                 cache_tools_list=True,
+                client_session_timeout_seconds=_MCP_CLIENT_TIMEOUT_SECONDS,
             )
         )
     return servers