diff --git a/src/hooks/shared/capture-gate.ts b/src/hooks/shared/capture-gate.ts index 97499e01..55ad3b96 100644 --- a/src/hooks/shared/capture-gate.ts +++ b/src/hooks/shared/capture-gate.ts @@ -1,11 +1,16 @@ /** * Allowlist gate for HIVEMIND_CAPTURE_ONLY_CLI. * - * When the env var is "true", only capture sessions whose - * CLAUDE_CODE_ENTRYPOINT contains the substring "cli". The Claude Agent SDK - * (Python / TypeScript) sets the entrypoint to "sdk-py" / "sdk-ts" when it - * spawns the CLI subprocess, so those sessions fail the check and the hook - * short-circuits. Interactive terminal sessions keep entrypoint="cli". + * When the env var is "true", only capture sessions launched from the + * interactive terminal CLI, whose CLAUDE_CODE_ENTRYPOINT is EXACTLY "cli". + * Everything else is skipped: + * - "sdk-py" / "sdk-ts" — Claude Agent SDK (Python / TypeScript) + * - "sdk-cli" — headless `claude -p` print mode + * Matching must be exact equality, NOT a substring test: `claude -p` reports + * "sdk-cli", which *contains* "cli", so an `includes("cli")` check would let + * print-mode sessions slip through and create stray capture rows even with + * the gate on. Interactive terminal sessions are the only ones that report a + * bare "cli". * * Returns true when the gate PASSES (capture should proceed), false when * the caller should skip. With the gate disabled (env var unset or != "true") @@ -20,5 +25,5 @@ export function entrypointPassesOnlyCliGate( const onlyCli = env.HIVEMIND_CAPTURE_ONLY_CLI === "true"; if (!onlyCli) return true; const entrypoint = env.CLAUDE_CODE_ENTRYPOINT ?? ""; - return entrypoint.includes("cli"); + return entrypoint === "cli"; } diff --git a/src/hooks/wiki-worker.ts b/src/hooks/wiki-worker.ts index b060f65f..ab015c0a 100644 --- a/src/hooks/wiki-worker.ts +++ b/src/hooks/wiki-worker.ts @@ -59,6 +59,26 @@ function esc(s: string): string { .replace(/[\x01-\x08\x0b\x0c\x0e-\x1f\x7f]/g, ""); } +// The capture hooks INSERT session events asynchronously, and Deeplake reads +// are eventually-consistent. Under concurrency (many SDK / `claude -p` sessions +// ending at once) those rows can lag behind SessionEnd, so the worker can read +// zero events for a session that does have them. Retry with linear backoff +// before giving up, instead of stranding the SessionStart placeholder. +/** + * Parse a non-negative integer from an env var, falling back to `fallback` + * for missing / non-numeric / negative values. Without this, a misconfigured + * env var would make `Number(...)` return NaN, the retry loop condition + * `attempt <= NaN` would be false, and retries would be silently disabled — + * reintroducing the stranded-placeholder bug under load. + */ +function parseNonNegativeInt(raw: string | undefined, fallback: number): number { + const n = Number.parseInt(raw ?? "", 10); + return Number.isFinite(n) && n >= 0 ? n : fallback; +} +const EVENT_FETCH_RETRIES = parseNonNegativeInt(process.env.HIVEMIND_WIKI_EVENT_RETRIES, 5); +const EVENT_FETCH_BACKOFF_MS = parseNonNegativeInt(process.env.HIVEMIND_WIKI_EVENT_BACKOFF_MS, 1500); +const sleep = (ms: number): Promise => new Promise(resolve => setTimeout(resolve, ms)); + async function query(sql: string, retries = 4): Promise[]> { for (let attempt = 0; attempt <= retries; attempt++) { const r = await fetch(`${cfg.apiUrl}/workspaces/${cfg.workspaceId}/tables/query`, { @@ -109,15 +129,37 @@ function cleanup(): void { async function main(): Promise { try { - // 1. Fetch session events from sessions table, reconstruct JSONL + // 1. Fetch session events from sessions table, reconstruct JSONL. + // Retry on an empty result: the async capture writes (or Deeplake read + // consistency) may simply be lagging behind SessionEnd under load. wlog("fetching session events"); - const rows = await query( + const fetchEvents = () => query( `SELECT message, creation_date FROM "${cfg.sessionsTable}" ` + `WHERE path LIKE '${esc(`/sessions/%${cfg.sessionId}%`)}' ORDER BY creation_date ASC` ); + let rows = await fetchEvents(); + for (let attempt = 1; rows.length === 0 && attempt <= EVENT_FETCH_RETRIES; attempt++) { + const delay = EVENT_FETCH_BACKOFF_MS * attempt; + wlog(`no events yet — retry ${attempt}/${EVENT_FETCH_RETRIES} in ${delay}ms`); + await sleep(delay); + rows = await fetchEvents(); + } if (rows.length === 0) { - wlog("no session events found — exiting"); + // Events never showed up. Do NOT leave the SessionStart placeholder + // stranded at 'in progress' forever — remove it. The `description = + // 'in progress'` guard means a concurrent worker that already wrote a + // real summary for this session is never clobbered. + wlog("no session events after retries — removing orphan placeholder"); + try { + await query( + `DELETE FROM "${cfg.memoryTable}" ` + + `WHERE path = '${esc(`/summaries/${cfg.userName}/${cfg.sessionId}.md`)}' ` + + `AND description = 'in progress'` + ); + } catch (e: any) { + wlog(`orphan placeholder cleanup failed: ${e.message}`); + } return; } diff --git a/tests/claude-code/wiki-worker.test.ts b/tests/claude-code/wiki-worker.test.ts index ca3618fa..07631450 100644 --- a/tests/claude-code/wiki-worker.test.ts +++ b/tests/claude-code/wiki-worker.test.ts @@ -119,18 +119,36 @@ beforeEach(() => { afterEach(() => { global.fetch = originalFetch; process.argv[2] = originalArgv2; + delete process.env.HIVEMIND_WIKI_EVENT_RETRIES; + delete process.env.HIVEMIND_WIKI_EVENT_BACKOFF_MS; try { rmSync(rootDir, { recursive: true, force: true }); } catch { /* ignore */ } vi.restoreAllMocks(); }); -// ═══ early exit: zero events ═══════════════════════════════════════════════ +// ═══ zero events: retry, then remove the orphan placeholder ════════════════ describe("wiki-worker — no events", () => { - it("exits early when the sessions table has no rows for this session", async () => { - fetchMock.mockResolvedValue(jsonResp({ columns: ["message", "creation_date"], rows: [] })); + it("removes the orphan placeholder when no events ever appear", async () => { + // retries=0 → skip the backoff loop, go straight to cleanup (keeps the + // test instant; the retry path itself is covered separately below). + process.env.HIVEMIND_WIKI_EVENT_RETRIES = "0"; + const sqls: string[] = []; + fetchMock.mockImplementation(async (_url: string, opts: any) => { + sqls.push(JSON.parse(opts.body).query); + return jsonResp({ columns: ["message", "creation_date"], rows: [] }); + }); await runWorker(); + const log = readFileSync(join(hooksDir, "wiki.log"), "utf-8"); - expect(log).toContain("no session events found — exiting"); + expect(log).toContain("removing orphan placeholder"); + // A DELETE guarded on description='in progress' must be issued for THIS + // session's summary path — never an unguarded delete that could clobber a + // real summary written by a concurrent worker. + const del = sqls.find(s => /^\s*DELETE FROM "memory"/.test(s)); + expect(del).toBeTruthy(); + expect(del).toContain("description = 'in progress'"); + expect(del).toContain("/summaries/alice/sid-worker.md"); + // It must NOT have run claude -p or written any summary. expect(execFileSyncMock).not.toHaveBeenCalled(); expect(uploadSummaryMock).not.toHaveBeenCalled(); expect(finalizeSummaryMock).not.toHaveBeenCalled(); @@ -138,12 +156,62 @@ describe("wiki-worker — no events", () => { expect(releaseLockMock).toHaveBeenCalledWith("sid-worker"); }); - it("treats a response with null rows/columns as empty", async () => { + it("treats a response with null rows/columns as empty (then cleans up)", async () => { + process.env.HIVEMIND_WIKI_EVENT_RETRIES = "0"; fetchMock.mockResolvedValue(jsonResp({})); await runWorker(); expect(execFileSyncMock).not.toHaveBeenCalled(); + expect(uploadSummaryMock).not.toHaveBeenCalled(); expect(releaseLockMock).toHaveBeenCalled(); }); + + it("falls back to default retries when the env var is non-numeric (no silent disable)", async () => { + // Regression guard: a garbage HIVEMIND_WIKI_EVENT_RETRIES must NOT become + // NaN and silently skip the retry loop (which would re-strand placeholders). + process.env.HIVEMIND_WIKI_EVENT_RETRIES = "not-a-number"; + process.env.HIVEMIND_WIKI_EVENT_BACKOFF_MS = "0"; + fetchMock.mockResolvedValue(jsonResp({ columns: ["message", "creation_date"], rows: [] })); + await runWorker(); + await new Promise(r => setTimeout(r, 50)); + await new Promise(r => setImmediate(r)); + const log = readFileSync(join(hooksDir, "wiki.log"), "utf-8"); + // It must have retried (fallback to the default of 5), not bailed immediately. + expect(log).toContain("no events yet — retry"); + }); + + it("retries and recovers when events show up on a later fetch (race)", async () => { + // Reproduces the real bug: the async capture write lags behind + // SessionEnd, so the first event SELECT returns empty. With backoff=0 the + // worker should retry, see the events on a subsequent fetch, and finalize + // normally instead of stranding the placeholder. + process.env.HIVEMIND_WIKI_EVENT_RETRIES = "5"; + process.env.HIVEMIND_WIKI_EVENT_BACKOFF_MS = "0"; + let eventSelects = 0; + fetchMock.mockImplementation(async (_url: string, opts: any) => { + const q = JSON.parse(opts.body).query as string; + if (/^\s*SELECT message, creation_date FROM "sessions"/.test(q)) { + eventSelects++; + // Empty for the first two attempts, then the events appear. + if (eventSelects <= 2) return jsonResp({ columns: ["message", "creation_date"], rows: [] }); + return jsonResp({ columns: ["message", "creation_date"], rows: [[JSON.stringify({ type: "user_message", content: "hi" }), "2026-01-01T00:00:00Z"]] }); + } + // path lookup + existing-summary lookup → empty is fine + return jsonResp({ columns: [], rows: [] }); + }); + // claude -p "writes" the summary file the worker reads back. + execFileSyncMock.mockImplementation(() => { writeFileSync(join(tmpDir, "summary.md"), "real summary body"); }); + + await runWorker(); + // Allow the backoff-0 setTimeout retries to flush. + await new Promise(r => setTimeout(r, 50)); + await new Promise(r => setImmediate(r)); + + const log = readFileSync(join(hooksDir, "wiki.log"), "utf-8"); + expect(log).toContain("no events yet — retry"); + expect(eventSelects).toBeGreaterThanOrEqual(3); + expect(execFileSyncMock).toHaveBeenCalled(); + expect(uploadSummaryMock).toHaveBeenCalled(); + }); }); // ═══ happy path: events + claude -p + upload ═══════════════════════════════ diff --git a/tests/shared/capture-gate.test.ts b/tests/shared/capture-gate.test.ts index 81c956b8..8a377561 100644 --- a/tests/shared/capture-gate.test.ts +++ b/tests/shared/capture-gate.test.ts @@ -25,17 +25,25 @@ describe("entrypointPassesOnlyCliGate", () => { })).toBe(true); }); - it("passes with gate active and entrypoint contains 'cli' as substring", () => { - // Substring match (not equality) is intentional — covers future variants - // like cli-interactive, claude-cli, etc. without code changes. + it("blocks `claude -p` print mode (entrypoint='sdk-cli')", () => { + // Regression: `claude -p` reports CLAUDE_CODE_ENTRYPOINT="sdk-cli", which + // CONTAINS "cli". A substring check let print-mode sessions through and + // created stray capture rows despite the gate being on. Exact equality + // must exclude it. expect(entrypointPassesOnlyCliGate({ HIVEMIND_CAPTURE_ONLY_CLI: "true", - CLAUDE_CODE_ENTRYPOINT: "cli-interactive", - })).toBe(true); - expect(entrypointPassesOnlyCliGate({ - HIVEMIND_CAPTURE_ONLY_CLI: "true", - CLAUDE_CODE_ENTRYPOINT: "claude-cli", - })).toBe(true); + CLAUDE_CODE_ENTRYPOINT: "sdk-cli", + })).toBe(false); + }); + + it("blocks entrypoints that merely contain 'cli' but aren't exactly 'cli'", () => { + // Exact-match semantics: only a bare "cli" (interactive terminal) passes. + for (const ep of ["sdk-cli", "cli-interactive", "claude-cli", "clip"]) { + expect(entrypointPassesOnlyCliGate({ + HIVEMIND_CAPTURE_ONLY_CLI: "true", + CLAUDE_CODE_ENTRYPOINT: ep, + })).toBe(false); + } }); it("blocks with gate active and entrypoint='sdk-py'", () => { @@ -53,8 +61,8 @@ describe("entrypointPassesOnlyCliGate", () => { }); it("blocks with gate active and entrypoint undefined", () => { - // Strict: missing entrypoint is treated as non-cli. An empty string - // doesn't contain the "cli" substring so the gate filters it out. + // Strict: missing entrypoint is treated as non-cli. It isn't exactly + // "cli" so the gate filters it out. expect(entrypointPassesOnlyCliGate({ HIVEMIND_CAPTURE_ONLY_CLI: "true", })).toBe(false);