diff --git a/src/tmux-manager.ts b/src/tmux-manager.ts
index 53e53234..9284e174 100644
--- a/src/tmux-manager.ts
+++ b/src/tmux-manager.ts
@@ -1153,19 +1153,23 @@ export class TmuxManager extends EventEmitter implements TerminalMultiplexer {
     }
 
     try {
-      const psOutput = execSync(`ps -o rss=,pcpu= -p ${session.pid} 2>/dev/null || echo "0 0"`, {
-        encoding: 'utf-8',
-        timeout: EXEC_TIMEOUT_MS,
-      }).trim();
+      const psOutput = (
+        await execAsync(`ps -o rss=,pcpu= -p ${session.pid} 2>/dev/null || echo "0 0"`, {
+          encoding: 'utf-8',
+          timeout: EXEC_TIMEOUT_MS,
+        })
+      ).stdout.trim();
 
       const [rss, cpu] = psOutput.split(/\s+/).map((x) => parseFloat(x) || 0);
 
       let childCount = 0;
       try {
-        const childOutput = execSync(`pgrep -P ${session.pid} | wc -l`, {
-          encoding: 'utf-8',
-          timeout: EXEC_TIMEOUT_MS,
-        }).trim();
+        const childOutput = (
+          await execAsync(`pgrep -P ${session.pid} | wc -l`, {
+            encoding: 'utf-8',
+            timeout: EXEC_TIMEOUT_MS,
+          })
+        ).stdout.trim();
         childCount = parseInt(childOutput, 10) || 0;
       } catch {
         // No children or command failed
@@ -1202,13 +1206,15 @@ export class TmuxManager extends EventEmitter implements TerminalMultiplexer {
 
       // Step 1: Get descendant PIDs
       const descendantMap = new Map();
-      const pgrepOutput = execSync(
-        `for p in ${sessionPids.join(' ')}; do children=$(pgrep -P $p 2>/dev/null | tr '\\n' ','); echo "$p:$children"; done`,
-        {
-          encoding: 'utf-8',
-          timeout: EXEC_TIMEOUT_MS,
-        }
-      ).trim();
+      const pgrepOutput = (
+        await execAsync(
+          `for p in ${sessionPids.join(' ')}; do children=$(pgrep -P $p 2>/dev/null | tr '\\n' ','); echo "$p:$children"; done`,
+          {
+            encoding: 'utf-8',
+            timeout: EXEC_TIMEOUT_MS,
+          }
+        )
+      ).stdout.trim();
 
       for (const line of pgrepOutput.split('\n')) {
         const [pidStr, childrenStr] = line.split(':');
@@ -1233,10 +1239,12 @@ export class TmuxManager extends EventEmitter implements TerminalMultiplexer {
       // Step 3: Single ps call
       const pidArray = Array.from(allPids);
       if (pidArray.length > 0) {
-        const psOutput = execSync(`ps -o pid=,rss=,pcpu= -p ${pidArray.join(',')} 2>/dev/null || true`, {
-          encoding: 'utf-8',
-          timeout: EXEC_TIMEOUT_MS,
-        }).trim();
+        const psOutput = (
+          await execAsync(`ps -o pid=,rss=,pcpu= -p ${pidArray.join(',')} 2>/dev/null || true`, {
+            encoding: 'utf-8',
+            timeout: EXEC_TIMEOUT_MS,
+          })
+        ).stdout.trim();
 
         const processStats = new Map();
         for (const line of psOutput.split('\n')) {
@@ -1324,11 +1332,11 @@ export class TmuxManager extends EventEmitter implements TerminalMultiplexer {
       clearInterval(this.mouseSyncInterval);
     }
 
-    this.mouseSyncInterval = setInterval(() => {
+    this.mouseSyncInterval = setInterval(async () => {
       if (IS_TEST_MODE) return;
 
       for (const session of this.sessions.values()) {
-        const panes = this.listPanes(session.muxName);
+        const panes = await this.listPanes(session.muxName);
         const count = panes.length;
         if (count === 0) continue;
 
@@ -1337,12 +1345,12 @@ export class TmuxManager extends EventEmitter implements TerminalMultiplexer {
 
         // Pane count changed — toggle mouse mode
         if (count > 1) {
-          if (this.enableMouseMode(session.muxName)) {
+          if (await this.enableMouseMode(session.muxName)) {
             this.lastPaneCount.set(session.muxName, count);
           }
           // If enableMouseMode fails, DON'T update lastPaneCount — retry next poll
         } else {
-          if (this.disableMouseMode(session.muxName)) {
+          if (await this.disableMouseMode(session.muxName)) {
             this.lastPaneCount.set(session.muxName, count);
           }
         }
@@ -1473,7 +1481,7 @@ export class TmuxManager extends EventEmitter implements TerminalMultiplexer {
    * Allows clicking to select panes in agent team split-pane layouts.
    * When mouse mode is on, tmux intercepts mouse events (slow selection, no browser copy).
    */
-  enableMouseMode(muxName: string): boolean {
+  async enableMouseMode(muxName: string): Promise<boolean> {
     if (IS_TEST_MODE) return true;
     if (!isValidMuxName(muxName)) {
       console.error('[TmuxManager] Invalid session name in enableMouseMode:', muxName);
@@ -1481,7 +1489,7 @@ export class TmuxManager extends EventEmitter implements TerminalMultiplexer {
     }
 
     try {
-      execSync(`${this.tmux()} set-option -t "${muxName}" mouse on`, {
+      await execAsync(`${this.tmux()} set-option -t "${muxName}" mouse on`, {
         encoding: 'utf-8',
         timeout: EXEC_TIMEOUT_MS,
       });
@@ -1497,7 +1505,7 @@ export class TmuxManager extends EventEmitter implements TerminalMultiplexer {
    * Disable mouse mode for an existing tmux session.
    * Restores native xterm.js text selection and browser clipboard copy.
    */
-  disableMouseMode(muxName: string): boolean {
+  async disableMouseMode(muxName: string): Promise<boolean> {
     if (IS_TEST_MODE) return true;
     if (!isValidMuxName(muxName)) {
       console.error('[TmuxManager] Invalid session name in disableMouseMode:', muxName);
@@ -1505,7 +1513,7 @@ export class TmuxManager extends EventEmitter implements TerminalMultiplexer {
     }
 
     try {
-      execSync(`${this.tmux()} set-option -t "${muxName}" mouse off`, {
+      await execAsync(`${this.tmux()} set-option -t "${muxName}" mouse off`, {
         encoding: 'utf-8',
         timeout: EXEC_TIMEOUT_MS,
       });
@@ -1522,9 +1530,9 @@ export class TmuxManager extends EventEmitter implements TerminalMultiplexer {
    * Called by TeamWatcher when teammates spawn/despawn panes.
    * Uses `tmux list-panes` for bulletproof detection — counts actual panes, not config.
    */
-  syncMouseMode(muxName: string): boolean {
+  async syncMouseMode(muxName: string): Promise<boolean> {
     if (IS_TEST_MODE) return true;
-    const panes = this.listPanes(muxName);
+    const panes = await this.listPanes(muxName);
     if (panes.length > 1) {
       return this.enableMouseMode(muxName);
     } else {
@@ -1536,7 +1544,7 @@ export class TmuxManager extends EventEmitter implements TerminalMultiplexer {
    * List all panes in a tmux session.
    * Returns structured info for each pane.
    */
-  listPanes(muxName: string): PaneInfo[] {
+  async listPanes(muxName: string): Promise<PaneInfo[]> {
     if (IS_TEST_MODE) return [];
     if (!isValidMuxName(muxName)) {
       console.error('[TmuxManager] Invalid session name in listPanes:', muxName);
@@ -1544,10 +1552,12 @@ export class TmuxManager extends EventEmitter implements TerminalMultiplexer {
     }
 
     try {
-      const output = execSync(
-        `${this.tmux()} list-panes -t "${muxName}" -F '#{pane_id}:#{pane_index}:#{pane_pid}:#{pane_width}:#{pane_height}'`,
-        { encoding: 'utf-8', timeout: EXEC_TIMEOUT_MS }
-      ).trim();
+      const output = (
+        await execAsync(
+          `${this.tmux()} list-panes -t "${muxName}" -F '#{pane_id}:#{pane_index}:#{pane_pid}:#{pane_width}:#{pane_height}'`,
+          { encoding: 'utf-8', timeout: EXEC_TIMEOUT_MS }
+        )
+      ).stdout.trim();
 
       return output
         .split('\n')
diff --git a/src/utils/event-loop-monitor.ts b/src/utils/event-loop-monitor.ts
new file mode 100644
index 00000000..22b0baf1
--- /dev/null
+++ b/src/utils/event-loop-monitor.ts
@@ -0,0 +1,54 @@
+/**
+ * @fileoverview Event-loop lag monitor.
+ *
+ * Node is single-threaded: any synchronous work (e.g. a blocking `execSync`)
+ * freezes the whole event loop, so the HTTP server stops answering on its port
+ * while the process stays alive and other ports are unaffected. Such stalls
+ * self-heal and never restart the process, so a periodic loopback healthcheck
+ * misses them entirely — they leave no trace.
+ *
+ * This monitor samples how late a fixed-interval timer actually fires versus when
+ * it was scheduled; the excess is time the loop was blocked. When that exceeds a
+ * threshold it logs the measured stall, turning otherwise-invisible "port briefly
+ * unreachable" incidents into a timestamped, quantified log line.
+ *
+ * @module utils/event-loop-monitor
+ */
+
+export interface EventLoopMonitorHandle {
+  stop(): void;
+}
+
+/**
+ * Start sampling event-loop lag.
+ *
+ * @param sampleMs How often to sample (and the baseline interval lag is measured against).
+ * @param thresholdMs Only stalls at or above this many ms are logged (noise floor).
+ * @param log Sink for stall reports; defaults to console.warn (lands in the web log).
+ */
+export function startEventLoopMonitor(
+  sampleMs = 1000,
+  thresholdMs = 1000,
+  log: (msg: string) => void = (m) => console.warn(m)
+): EventLoopMonitorHandle {
+  let last = performance.now();
+
+  const timer = setInterval(() => {
+    const now = performance.now();
+    // Lag = elapsed beyond the scheduled interval = time the loop was blocked.
+    const lag = Math.round(now - last - sampleMs);
+    if (lag >= thresholdMs) {
+      log(`[EventLoopLag] event loop blocked ~${lag}ms (at ${new Date().toISOString()})`);
+    }
+    last = now;
+  }, sampleMs);
+
+  // Never keep the process alive solely for this monitor.
+  timer.unref?.();
+
+  return {
+    stop() {
+      clearInterval(timer);
+    },
+  };
+}
diff --git a/src/utils/index.ts b/src/utils/index.ts
index ded4d09b..298dd5f0 100644
--- a/src/utils/index.ts
+++ b/src/utils/index.ts
@@ -9,6 +9,8 @@
 export { BufferAccumulator } from './buffer-accumulator.js';
 export { CleanupManager } from './cleanup-manager.js';
 export { Debouncer, KeyedDebouncer } from './debouncer.js';
+export { startEventLoopMonitor } from './event-loop-monitor.js';
+export type { EventLoopMonitorHandle } from './event-loop-monitor.js';
 export { StaleExpirationMap } from './stale-expiration-map.js';
 export {
   ANSI_ESCAPE_PATTERN_FULL,
diff --git a/src/web/server.ts b/src/web/server.ts
index dc3f725a..c7000b7b 100644
--- a/src/web/server.ts
+++ b/src/web/server.ts
@@ -94,7 +94,8 @@ import {
   type ImageDetectedEvent,
   DEFAULT_NICE_CONFIG,
 } from '../types.js';
-import { CleanupManager, KeyedDebouncer, StaleExpirationMap } from '../utils/index.js';
+import { CleanupManager, KeyedDebouncer, StaleExpirationMap, startEventLoopMonitor } from '../utils/index.js';
+import type { EventLoopMonitorHandle } from '../utils/index.js';
 import { MAX_CONCURRENT_SESSIONS, MAX_SSE_CLIENTS } from '../config/map-limits.js';
 import { SseEvent } from './sse-events.js';
 import type { ScheduledRun } from './ports/index.js';
@@ -231,6 +232,7 @@ export class WebServer extends EventEmitter {
   private teamWatcher: TeamWatcher = new TeamWatcher();
   private _orchestratorLoop: import('../orchestrator-loop.js').OrchestratorLoop | null = null;
   private _pasteImageGcStop: (() => void) | null = null;
+  private _eventLoopMonitor: EventLoopMonitorHandle | null = null;
   private teamWatcherHandlers: {
     teamCreated: (config: unknown) => void;
     teamUpdated: (config: unknown) => void;
@@ -1551,6 +1553,10 @@ export class WebServer extends EventEmitter {
     // older than 7 days from each live session's .claude-images/ hourly.
     if (!this.testMode) {
       this._pasteImageGcStop = startPasteImageGc({ sessions: this.sessions });
+      // Surface event-loop stalls (e.g. a slow synchronous tmux/ps call) so the
+      // intermittent ":3000 briefly unreachable, process never restarts" class of
+      // incident leaves a quantified log line instead of vanishing silently.
+      this._eventLoopMonitor = startEventLoopMonitor();
     }
 
     await this.app.listen({ port: this.port, host: '0.0.0.0' });
@@ -1911,6 +1917,11 @@ export class WebServer extends EventEmitter {
       this._pasteImageGcStop = null;
     }
 
+    if (this._eventLoopMonitor) {
+      this._eventLoopMonitor.stop();
+      this._eventLoopMonitor = null;
+    }
+
     // Dispose all managed timers (intervals + resettable timeouts)
     this.cleanup.dispose();
 