From 8a0b48201120d4437262e1416d94f90b020c8d29 Mon Sep 17 00:00:00 2001 From: harshalmore31 Date: Wed, 27 May 2026 10:40:34 +0530 Subject: [PATCH] fix(agent): widen MCP client-session timeout to 30s for cold first runs The openai-agent and deep-agent runners left MCP stdio client-session timeouts at the SDK / library defaults, so a heavy server's cold-cache first spawn (e.g. vibration importing scipy + numpy + DSP submodules) could exceed the budget and abort the entire agent run with McpError: Timed out ... Waited 5.0 seconds during session.initialize(). Plumb a 30s timeout through both runners. Warm handshakes are sub-second across all six servers so there is no operational downside. claude-agent is unaffected: its Python-side initialize timeout already defaults to 60s, and the stdio MCP servers it spawns are managed by the bundled Claude Code CLI subprocess. Closes #335. Signed-off-by: harshalmore31 --- src/agent/deep_agent/runner.py | 10 ++++++++++ src/agent/openai_agent/runner.py | 7 +++++++ 2 files changed, 17 insertions(+) diff --git a/src/agent/deep_agent/runner.py b/src/agent/deep_agent/runner.py index 3d975d55..ad3622e8 100644 --- a/src/agent/deep_agent/runner.py +++ b/src/agent/deep_agent/runner.py @@ -38,6 +38,11 @@ _DEFAULT_MODEL = "litellm_proxy/aws/claude-opus-4-6" +# MCP client-session timeout for stdio servers. Cold-cache first spawns of +# heavy servers (vibration imports scipy + numpy + DSP) can exceed the mcp +# library's default handshake budget. Warm runs are sub-second. +_MCP_CLIENT_TIMEOUT_SECONDS = 30 + def _build_chat_model(model_id: str): """Construct a LangChain chat model for *model_id*. @@ -85,6 +90,11 @@ def _build_mcp_connections( "command": "uv", "args": ["run", cmd_arg], "cwd": str(_REPO_ROOT), + "session_kwargs": { + "read_timeout_seconds": _dt.timedelta( + seconds=_MCP_CLIENT_TIMEOUT_SECONDS + ), + }, } return connections diff --git a/src/agent/openai_agent/runner.py b/src/agent/openai_agent/runner.py index 8dfccb48..c36fa745 100644 --- a/src/agent/openai_agent/runner.py +++ b/src/agent/openai_agent/runner.py @@ -39,6 +39,12 @@ _DEFAULT_MODEL = "litellm_proxy/azure/gpt-5.4" +# MCP client-session timeout for stdio servers. The SDK default of 5s is too +# tight for first-spawn cold-cache cases (heavy imports like scipy/numpy push +# vibration cold start past 10s on a freshly synced env). Warm handshakes are +# all sub-second so a generous value has no operational downside. +_MCP_CLIENT_TIMEOUT_SECONDS = 30 + def _build_run_config(model_id: str) -> RunConfig | None: """Build a RunConfig with a LiteLLM model provider when needed. @@ -95,6 +101,7 @@ def _build_mcp_servers( "args": ["run", cmd_arg], }, cache_tools_list=True, + client_session_timeout_seconds=_MCP_CLIENT_TIMEOUT_SECONDS, ) ) return servers