Skip to content

Commit f695d9a

Browse files
committed
fix(mcp): address bugbot/greptile review on per-server tool discovery
- Workspace-scoped `mcpKeys.serverToolsWorkspace(workspaceId)` prefix for bulk invalidations (create-server, refresh-all, SSE workspace fallback, OAuth fallback). The previous `mcpKeys.serverTools()` prefix was global and invalidated every workspace's tools cache. - `useMcpToolsQuery` folds `useMcpServers().isLoading` into the aggregate `isLoading` so mounting no longer flashes an "empty tools" state during the servers-list fetch. Aggregate `error` is suppressed when any per-server query already returned data so one slow server can't blank out the others. - `useForceRefreshMcpTools` invalidates the per-server query keys of servers whose force refresh failed, so stale tools don't linger. - `DiscoveryOutcome` error variant carries the original error, restoring the OAuth-exemption check that `getErrorMessage(...)` previously erased. - `discoverServerTools(userId, serverId, workspaceId, forceRefresh = false)` now consults the positive + negative cache by default. Per-server React Query refetches hit the cache instead of re-paying the listTools timeout; callers that explicitly bypass cache (refresh route, OAuth callback, bulk POST refresh) pass `forceRefresh: true`. Negative-cache hits throw a typed `McpConnectionError` so the route layer can surface a fast 503.
1 parent cdfe60e commit f695d9a

8 files changed

Lines changed: 116 additions & 32 deletions

File tree

apps/sim/app/api/mcp/oauth/callback/route.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,8 +168,10 @@ export const GET = withRouteHandler(async (request: NextRequest) => {
168168

169169
try {
170170
// discoverServerTools writes the result to this server's cache so the UI's
171-
// immediate refetch hits it instead of re-fetching live.
172-
await mcpService.discoverServerTools(session.user.id, server.id, server.workspaceId)
171+
// immediate refetch hits it instead of re-fetching live. Bypass any
172+
// stale positive/negative cache from before re-auth so we actually
173+
// verify the new tokens.
174+
await mcpService.discoverServerTools(session.user.id, server.id, server.workspaceId, true)
173175
} catch (e) {
174176
logger.warn('Post-auth tools refresh failed', toError(e).message)
175177
}

apps/sim/app/api/mcp/servers/[id]/refresh/route.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,12 @@ export const POST = withRouteHandler(
197197
}
198198

199199
try {
200-
discoveredTools = await mcpService.discoverServerTools(userId, serverId, workspaceId)
200+
discoveredTools = await mcpService.discoverServerTools(
201+
userId,
202+
serverId,
203+
workspaceId,
204+
true
205+
)
201206
connectionStatus = 'connected'
202207
toolCount = discoveredTools.length
203208
logger.info(`[${requestId}] Discovered ${toolCount} tools from server ${serverId}`)

apps/sim/app/api/mcp/tools/discover/route.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ export const GET = withRouteHandler(
2828
logger.info(`[${requestId}] Discovering MCP tools`, { serverId, workspaceId, forceRefresh })
2929

3030
const tools = serverId
31-
? await mcpService.discoverServerTools(userId, serverId, workspaceId)
31+
? await mcpService.discoverServerTools(userId, serverId, workspaceId, forceRefresh)
3232
: await mcpService.discoverTools(userId, workspaceId, forceRefresh)
3333

3434
const byServer: Record<string, number> = {}
@@ -76,7 +76,7 @@ export const POST = withRouteHandler(
7676

7777
const results = await Promise.allSettled(
7878
serverIds.map(async (serverId: string) => {
79-
const tools = await mcpService.discoverServerTools(userId, serverId, workspaceId)
79+
const tools = await mcpService.discoverServerTools(userId, serverId, workspaceId, true)
8080
return { serverId, toolCount: tools.length }
8181
})
8282
)

apps/sim/hooks/mcp/use-mcp-oauth-popup.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,9 @@ export function useMcpOauthPopup({ workspaceId }: UseMcpOauthPopupProps) {
8787
queryKey: mcpKeys.serverToolsList(workspaceId, data.serverId),
8888
})
8989
} else {
90-
queryClient.invalidateQueries({ queryKey: mcpKeys.serverTools() })
90+
queryClient.invalidateQueries({
91+
queryKey: mcpKeys.serverToolsWorkspace(workspaceId),
92+
})
9193
}
9294
queryClient.invalidateQueries({ queryKey: mcpKeys.storedToolsList(workspaceId) })
9395
toast.success('Server authorized')

apps/sim/hooks/mcp/use-mcp-tools.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ export function useMcpTools(workspaceId: string): UseMcpToolsResult {
6565
logger.info('Refreshing MCP tools', { forceRefresh, workspaceId })
6666

6767
await queryClient.invalidateQueries({
68-
queryKey: mcpKeys.serverTools(),
68+
queryKey: mcpKeys.serverToolsWorkspace(workspaceId),
6969
refetchType: forceRefresh ? 'active' : 'all',
7070
})
7171
},

apps/sim/hooks/queries/mcp.ts

Lines changed: 40 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,13 @@ export const mcpKeys = {
4848
tools: () => [...mcpKeys.all, 'tools'] as const,
4949
toolsList: (workspaceId?: string) => [...mcpKeys.tools(), workspaceId ?? ''] as const,
5050
serverTools: () => [...mcpKeys.all, 'serverTools'] as const,
51+
// Workspace-scoped prefix so bulk invalidations (refresh-all, create-server)
52+
// only touch the current workspace's per-server entries — not every
53+
// workspace cached in the QueryClient.
54+
serverToolsWorkspace: (workspaceId?: string) =>
55+
[...mcpKeys.serverTools(), workspaceId ?? ''] as const,
5156
serverToolsList: (workspaceId?: string, serverId?: string) =>
52-
[...mcpKeys.serverTools(), workspaceId ?? '', serverId ?? ''] as const,
57+
[...mcpKeys.serverToolsWorkspace(workspaceId), serverId ?? ''] as const,
5358
storedTools: () => [...mcpKeys.all, 'storedTools'] as const,
5459
storedToolsList: (workspaceId?: string) => [...mcpKeys.storedTools(), workspaceId ?? ''] as const,
5560
allowedDomains: () => [...mcpKeys.all, 'allowedDomains'] as const,
@@ -149,7 +154,7 @@ export function useMcpServerTools(workspaceId: string, serverId?: string) {
149154
* fast servers populate immediately.
150155
*/
151156
export function useMcpToolsQuery(workspaceId: string) {
152-
const { data: servers } = useMcpServers(workspaceId)
157+
const { data: servers, isLoading: serversLoading } = useMcpServers(workspaceId)
153158

154159
const serverIds = useMemo(() => (servers ? servers.map((s) => s.id).sort() : []), [servers])
155160

@@ -168,27 +173,31 @@ export function useMcpToolsQuery(workspaceId: string) {
168173
return useMemo(() => {
169174
const tools: McpTool[] = []
170175
let hasData = false
171-
let isLoading = false
176+
let anyServerLoading = false
172177
let firstError: Error | null = null
173178
for (const result of results) {
174179
if (result.data) {
175180
tools.push(...result.data)
176181
hasData = true
177182
}
178-
if (result.isLoading) isLoading = true
183+
if (result.isLoading) anyServerLoading = true
179184
if (!firstError && result.error instanceof Error) firstError = result.error
180185
}
181186
return {
182187
data: tools,
183-
// Match the prior semantics: "loading" means we have nothing to show yet.
184-
// Once any server has returned, the UI can render that and let slow
185-
// neighbors fill in incrementally without keeping the spinner up.
186-
isLoading: isLoading && !hasData,
187-
isFetching: results.some((r) => r.isFetching),
188-
error: firstError,
188+
// "Loading" means we have nothing to render yet — including the gap
189+
// between mount and the server-list arriving (otherwise consumers flash
190+
// an empty state). Once any per-server query has returned, we drop the
191+
// spinner and let slow neighbors fill in incrementally.
192+
isLoading: (serversLoading || anyServerLoading) && !hasData,
193+
isFetching: serversLoading || results.some((r) => r.isFetching),
194+
// Only surface an aggregate error when no server returned anything —
195+
// one slow/dead server should not blank out tools from the others.
196+
// Per-server errors are still exposed via `perServer`.
197+
error: hasData ? null : firstError,
189198
perServer: results,
190199
}
191-
}, [results])
200+
}, [results, serversLoading])
192201
}
193202

194203
export function useForceRefreshMcpTools() {
@@ -207,13 +216,27 @@ export function useForceRefreshMcpTools() {
207216
return tools
208217
})
209218
)
219+
// Invalidate the per-server keys of any server that failed so stale tools
220+
// don't linger after a force-refresh — React Query will refetch and the
221+
// negative cache (set server-side) will make that retry fail fast.
222+
results.forEach((result, index) => {
223+
if (result.status === 'rejected') {
224+
const failedServer = servers[index]
225+
if (failedServer) {
226+
queryClient.invalidateQueries({
227+
queryKey: mcpKeys.serverToolsList(workspaceId, failedServer.id),
228+
})
229+
}
230+
}
231+
})
210232
return results
211233
.filter((r): r is PromiseFulfilledResult<McpTool[]> => r.status === 'fulfilled')
212234
.flatMap((r) => r.value)
213235
},
214236
onSettled: (_data, _error, workspaceId) => {
215-
// Per-server caches were already populated inside `mutationFn` via
216-
// `setQueryData`, so we only need to refresh the dependent views.
237+
// Successful per-server caches were already populated inside `mutationFn`
238+
// via `setQueryData`; failed ones were invalidated above. Just refresh
239+
// the dependent views.
217240
queryClient.invalidateQueries({ queryKey: mcpKeys.serversList(workspaceId) })
218241
queryClient.invalidateQueries({ queryKey: mcpKeys.storedToolsList(workspaceId) })
219242
},
@@ -263,7 +286,9 @@ export function useCreateMcpServer() {
263286
},
264287
onSettled: (_data, _error, variables) => {
265288
queryClient.invalidateQueries({ queryKey: mcpKeys.serversList(variables.workspaceId) })
266-
queryClient.invalidateQueries({ queryKey: mcpKeys.serverTools() })
289+
queryClient.invalidateQueries({
290+
queryKey: mcpKeys.serverToolsWorkspace(variables.workspaceId),
291+
})
267292
},
268293
})
269294
}
@@ -486,7 +511,7 @@ export function useMcpToolsEvents(workspaceId: string) {
486511
queryKey: mcpKeys.serverToolsList(workspaceId, serverId),
487512
})
488513
} else {
489-
queryClient.invalidateQueries({ queryKey: mcpKeys.serverTools() })
514+
queryClient.invalidateQueries({ queryKey: mcpKeys.serverToolsWorkspace(workspaceId) })
490515
}
491516
queryClient.invalidateQueries({ queryKey: mcpKeys.serversList(workspaceId) })
492517
queryClient.invalidateQueries({ queryKey: mcpKeys.storedToolsList(workspaceId) })

apps/sim/lib/mcp/service.test.ts

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -291,11 +291,18 @@ describe('McpService.discoverTools per-server caching', () => {
291291
'Request timed out'
292292
)
293293

294-
// Following the failure, the negative cache would short-circuit discoverTools.
295-
// Reconnecting the server (e.g. after OAuth) should bring it back to live.
294+
// After the failure the negative cache is set, so the next default call
295+
// short-circuits without re-paying the listTools timeout.
296296
mockListTools.mockClear()
297+
await expect(mcpService.discoverServerTools(USER_ID, 'mcp-a', WORKSPACE_ID)).rejects.toThrow(
298+
'cooldown'
299+
)
300+
expect(mockListTools).not.toHaveBeenCalled()
301+
302+
// Reconnecting via the explicit-refresh path (refresh button / OAuth
303+
// callback) bypasses both caches and brings the server back to live.
297304
mockListTools.mockResolvedValueOnce([tool('a1', 'mcp-a')])
298-
const tools = await mcpService.discoverServerTools(USER_ID, 'mcp-a', WORKSPACE_ID)
305+
const tools = await mcpService.discoverServerTools(USER_ID, 'mcp-a', WORKSPACE_ID, true)
299306
expect(tools.map((t) => t.name)).toEqual(['a1'])
300307

301308
// discoverTools now sees the cleared negative cache + primed positive cache.

apps/sim/lib/mcp/service.ts

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import {
3232
type McpCacheStorageAdapter,
3333
} from '@/lib/mcp/storage'
3434
import {
35+
McpConnectionError,
3536
McpOauthAuthorizationRequiredError,
3637
type McpServerConfig,
3738
type McpServerStatusConfig,
@@ -69,7 +70,9 @@ type DiscoveryOutcome =
6970
}
7071
| { kind: 'oauth-pending' }
7172
| { kind: 'unhealthy' }
72-
| { kind: 'error'; message: string }
73+
// Carries the original error so `markServerUnhealthy` can apply its
74+
// `instanceof` exemption — `getErrorMessage(...)` erases type info.
75+
| { kind: 'error'; message: string; originalError: unknown }
7376

7477
class McpService {
7578
private cacheAdapter: McpCacheStorageAdapter
@@ -519,7 +522,11 @@ class McpService {
519522
) {
520523
return { kind: 'oauth-pending' }
521524
}
522-
return { kind: 'error', message: getErrorMessage(error, 'Unknown error') }
525+
return {
526+
kind: 'error',
527+
message: getErrorMessage(error, 'Unknown error'),
528+
originalError: error,
529+
}
523530
}
524531
})
525532
)
@@ -593,7 +600,7 @@ class McpService {
593600
)
594601
deferredSideEffects.push(
595602
this.updateServerStatus(server.id, workspaceId, false, outcome.message),
596-
this.markServerUnhealthy(workspaceId, server.id, new Error(outcome.message))
603+
this.markServerUnhealthy(workspaceId, server.id, outcome.originalError)
597604
)
598605
})
599606

@@ -630,6 +637,12 @@ class McpService {
630637
* Discover tools from a specific server with retry logic for session errors.
631638
* Retries once on session-related errors (400, 404, session ID issues).
632639
*
640+
* By default consults the positive + negative cache before going live, so
641+
* a React Query refetch for a healthy server is served from cache and a
642+
* recently-failed server short-circuits instead of re-paying the listTools
643+
* timeout. Pass `forceRefresh: true` from explicit user-initiated paths
644+
* (refresh button, OAuth callback) to bypass both caches.
645+
*
633646
* Concurrent callers for the same `(workspaceId, serverId, userId)` share a
634647
* single in-flight upstream request — important when an OAuth callback
635648
* primes the cache while a UI refetch races against it, or when multiple
@@ -638,13 +651,19 @@ class McpService {
638651
async discoverServerTools(
639652
userId: string,
640653
serverId: string,
641-
workspaceId: string
654+
workspaceId: string,
655+
forceRefresh = false
642656
): Promise<McpTool[]> {
643-
const inflightKey = `${workspaceId}:${serverId}:${userId}`
657+
const inflightKey = `${workspaceId}:${serverId}:${userId}:${forceRefresh ? 'force' : 'cache'}`
644658
const existing = this.inflightServerDiscovery.get(inflightKey)
645659
if (existing) return existing
646660

647-
const promise = this.discoverServerToolsImpl(userId, serverId, workspaceId).finally(() => {
661+
const promise = this.discoverServerToolsImpl(
662+
userId,
663+
serverId,
664+
workspaceId,
665+
forceRefresh
666+
).finally(() => {
648667
this.inflightServerDiscovery.delete(inflightKey)
649668
})
650669
this.inflightServerDiscovery.set(inflightKey, promise)
@@ -654,11 +673,35 @@ class McpService {
654673
private async discoverServerToolsImpl(
655674
userId: string,
656675
serverId: string,
657-
workspaceId: string
676+
workspaceId: string,
677+
forceRefresh: boolean
658678
): Promise<McpTool[]> {
659679
const requestId = generateRequestId()
660680
const maxRetries = 2
661681

682+
if (!forceRefresh) {
683+
// Positive cache hit — same payload the aggregate fan-out would return,
684+
// no upstream traffic.
685+
try {
686+
const cached = await this.cacheAdapter.get(serverCacheKey(workspaceId, serverId))
687+
if (cached) {
688+
logger.debug(`[${requestId}] Cache hit for server ${serverId}`)
689+
return cached.tools
690+
}
691+
} catch (error) {
692+
logger.warn(`[${requestId}] Cache read failed for server ${serverId}:`, error)
693+
}
694+
// Negative cache hit — fail fast with a typed error the caller can map
695+
// to a 503-style response, rather than waiting out the listTools timeout.
696+
if (await this.isServerUnhealthy(workspaceId, serverId)) {
697+
logger.info(`[${requestId}] Skipping recently-failed server ${serverId} (negative-cache)`)
698+
throw new McpConnectionError(
699+
'Server recently failed and is in cooldown — try again shortly.',
700+
serverId
701+
)
702+
}
703+
}
704+
662705
for (let attempt = 0; attempt < maxRetries; attempt++) {
663706
try {
664707
logger.info(

0 commit comments

Comments
 (0)