From ea2598664c0d2572f293f0027b16c210cddcba42 Mon Sep 17 00:00:00 2001 From: Emanuele Fenocchi Date: Wed, 10 Jun 2026 21:30:03 +0000 Subject: [PATCH 1/2] fix(capture): exclude `claude -p` from only-cli gate; stop stranding placeholders MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gate: HIVEMIND_CAPTURE_ONLY_CLI used `entrypoint.includes("cli")`, which let headless `claude -p` (CLAUDE_CODE_ENTRYPOINT="sdk-cli") slip through and create capture rows even with the gate on. Tighten to exact `=== "cli"` so only the interactive terminal passes; sdk-py / sdk-ts / sdk-cli are excluded. Worker: the wiki worker exited immediately when the sessions table returned zero events, leaving the SessionStart placeholder stranded at "in progress" forever. Under concurrency the async capture writes lag behind SessionEnd, so this stranded a large fraction of short sessions. Retry the event fetch with backoff (HIVEMIND_WIKI_EVENT_RETRIES / _BACKOFF_MS), and if events still never arrive, delete the orphan placeholder — guarded on description='in progress' so a concurrent real summary is never clobbered. Co-Authored-By: Claude Opus 4.8 --- src/hooks/shared/capture-gate.ts | 17 ++++--- src/hooks/wiki-worker.ts | 37 ++++++++++++++-- tests/claude-code/wiki-worker.test.ts | 64 ++++++++++++++++++++++++--- tests/shared/capture-gate.test.ts | 30 ++++++++----- 4 files changed, 123 insertions(+), 25 deletions(-) diff --git a/src/hooks/shared/capture-gate.ts b/src/hooks/shared/capture-gate.ts index 97499e01..55ad3b96 100644 --- a/src/hooks/shared/capture-gate.ts +++ b/src/hooks/shared/capture-gate.ts @@ -1,11 +1,16 @@ /** * Allowlist gate for HIVEMIND_CAPTURE_ONLY_CLI. * - * When the env var is "true", only capture sessions whose - * CLAUDE_CODE_ENTRYPOINT contains the substring "cli". The Claude Agent SDK - * (Python / TypeScript) sets the entrypoint to "sdk-py" / "sdk-ts" when it - * spawns the CLI subprocess, so those sessions fail the check and the hook - * short-circuits. Interactive terminal sessions keep entrypoint="cli". + * When the env var is "true", only capture sessions launched from the + * interactive terminal CLI, whose CLAUDE_CODE_ENTRYPOINT is EXACTLY "cli". + * Everything else is skipped: + * - "sdk-py" / "sdk-ts" — Claude Agent SDK (Python / TypeScript) + * - "sdk-cli" — headless `claude -p` print mode + * Matching must be exact equality, NOT a substring test: `claude -p` reports + * "sdk-cli", which *contains* "cli", so an `includes("cli")` check would let + * print-mode sessions slip through and create stray capture rows even with + * the gate on. Interactive terminal sessions are the only ones that report a + * bare "cli". * * Returns true when the gate PASSES (capture should proceed), false when * the caller should skip. With the gate disabled (env var unset or != "true") @@ -20,5 +25,5 @@ export function entrypointPassesOnlyCliGate( const onlyCli = env.HIVEMIND_CAPTURE_ONLY_CLI === "true"; if (!onlyCli) return true; const entrypoint = env.CLAUDE_CODE_ENTRYPOINT ?? ""; - return entrypoint.includes("cli"); + return entrypoint === "cli"; } diff --git a/src/hooks/wiki-worker.ts b/src/hooks/wiki-worker.ts index b060f65f..a985ca99 100644 --- a/src/hooks/wiki-worker.ts +++ b/src/hooks/wiki-worker.ts @@ -59,6 +59,15 @@ function esc(s: string): string { .replace(/[\x01-\x08\x0b\x0c\x0e-\x1f\x7f]/g, ""); } +// The capture hooks INSERT session events asynchronously, and Deeplake reads +// are eventually-consistent. Under concurrency (many SDK / `claude -p` sessions +// ending at once) those rows can lag behind SessionEnd, so the worker can read +// zero events for a session that does have them. Retry with linear backoff +// before giving up, instead of stranding the SessionStart placeholder. +const EVENT_FETCH_RETRIES = Number(process.env.HIVEMIND_WIKI_EVENT_RETRIES ?? "5"); +const EVENT_FETCH_BACKOFF_MS = Number(process.env.HIVEMIND_WIKI_EVENT_BACKOFF_MS ?? "1500"); +const sleep = (ms: number): Promise => new Promise(resolve => setTimeout(resolve, ms)); + async function query(sql: string, retries = 4): Promise[]> { for (let attempt = 0; attempt <= retries; attempt++) { const r = await fetch(`${cfg.apiUrl}/workspaces/${cfg.workspaceId}/tables/query`, { @@ -109,15 +118,37 @@ function cleanup(): void { async function main(): Promise { try { - // 1. Fetch session events from sessions table, reconstruct JSONL + // 1. Fetch session events from sessions table, reconstruct JSONL. + // Retry on an empty result: the async capture writes (or Deeplake read + // consistency) may simply be lagging behind SessionEnd under load. wlog("fetching session events"); - const rows = await query( + const fetchEvents = () => query( `SELECT message, creation_date FROM "${cfg.sessionsTable}" ` + `WHERE path LIKE '${esc(`/sessions/%${cfg.sessionId}%`)}' ORDER BY creation_date ASC` ); + let rows = await fetchEvents(); + for (let attempt = 1; rows.length === 0 && attempt <= EVENT_FETCH_RETRIES; attempt++) { + const delay = EVENT_FETCH_BACKOFF_MS * attempt; + wlog(`no events yet — retry ${attempt}/${EVENT_FETCH_RETRIES} in ${delay}ms`); + await sleep(delay); + rows = await fetchEvents(); + } if (rows.length === 0) { - wlog("no session events found — exiting"); + // Events never showed up. Do NOT leave the SessionStart placeholder + // stranded at 'in progress' forever — remove it. The `description = + // 'in progress'` guard means a concurrent worker that already wrote a + // real summary for this session is never clobbered. + wlog("no session events after retries — removing orphan placeholder"); + try { + await query( + `DELETE FROM "${cfg.memoryTable}" ` + + `WHERE path = '${esc(`/summaries/${cfg.userName}/${cfg.sessionId}.md`)}' ` + + `AND description = 'in progress'` + ); + } catch (e: any) { + wlog(`orphan placeholder cleanup failed: ${e.message}`); + } return; } diff --git a/tests/claude-code/wiki-worker.test.ts b/tests/claude-code/wiki-worker.test.ts index ca3618fa..69be6bfb 100644 --- a/tests/claude-code/wiki-worker.test.ts +++ b/tests/claude-code/wiki-worker.test.ts @@ -119,18 +119,36 @@ beforeEach(() => { afterEach(() => { global.fetch = originalFetch; process.argv[2] = originalArgv2; + delete process.env.HIVEMIND_WIKI_EVENT_RETRIES; + delete process.env.HIVEMIND_WIKI_EVENT_BACKOFF_MS; try { rmSync(rootDir, { recursive: true, force: true }); } catch { /* ignore */ } vi.restoreAllMocks(); }); -// ═══ early exit: zero events ═══════════════════════════════════════════════ +// ═══ zero events: retry, then remove the orphan placeholder ════════════════ describe("wiki-worker — no events", () => { - it("exits early when the sessions table has no rows for this session", async () => { - fetchMock.mockResolvedValue(jsonResp({ columns: ["message", "creation_date"], rows: [] })); + it("removes the orphan placeholder when no events ever appear", async () => { + // retries=0 → skip the backoff loop, go straight to cleanup (keeps the + // test instant; the retry path itself is covered separately below). + process.env.HIVEMIND_WIKI_EVENT_RETRIES = "0"; + const sqls: string[] = []; + fetchMock.mockImplementation(async (_url: string, opts: any) => { + sqls.push(JSON.parse(opts.body).query); + return jsonResp({ columns: ["message", "creation_date"], rows: [] }); + }); await runWorker(); + const log = readFileSync(join(hooksDir, "wiki.log"), "utf-8"); - expect(log).toContain("no session events found — exiting"); + expect(log).toContain("removing orphan placeholder"); + // A DELETE guarded on description='in progress' must be issued for THIS + // session's summary path — never an unguarded delete that could clobber a + // real summary written by a concurrent worker. + const del = sqls.find(s => /^\s*DELETE FROM "memory"/.test(s)); + expect(del).toBeTruthy(); + expect(del).toContain("description = 'in progress'"); + expect(del).toContain("/summaries/alice/sid-worker.md"); + // It must NOT have run claude -p or written any summary. expect(execFileSyncMock).not.toHaveBeenCalled(); expect(uploadSummaryMock).not.toHaveBeenCalled(); expect(finalizeSummaryMock).not.toHaveBeenCalled(); @@ -138,12 +156,48 @@ describe("wiki-worker — no events", () => { expect(releaseLockMock).toHaveBeenCalledWith("sid-worker"); }); - it("treats a response with null rows/columns as empty", async () => { + it("treats a response with null rows/columns as empty (then cleans up)", async () => { + process.env.HIVEMIND_WIKI_EVENT_RETRIES = "0"; fetchMock.mockResolvedValue(jsonResp({})); await runWorker(); expect(execFileSyncMock).not.toHaveBeenCalled(); + expect(uploadSummaryMock).not.toHaveBeenCalled(); expect(releaseLockMock).toHaveBeenCalled(); }); + + it("retries and recovers when events show up on a later fetch (race)", async () => { + // Reproduces the real bug: the async capture write lags behind + // SessionEnd, so the first event SELECT returns empty. With backoff=0 the + // worker should retry, see the events on a subsequent fetch, and finalize + // normally instead of stranding the placeholder. + process.env.HIVEMIND_WIKI_EVENT_RETRIES = "5"; + process.env.HIVEMIND_WIKI_EVENT_BACKOFF_MS = "0"; + let eventSelects = 0; + fetchMock.mockImplementation(async (_url: string, opts: any) => { + const q = JSON.parse(opts.body).query as string; + if (/^\s*SELECT message, creation_date FROM "sessions"/.test(q)) { + eventSelects++; + // Empty for the first two attempts, then the events appear. + if (eventSelects <= 2) return jsonResp({ columns: ["message", "creation_date"], rows: [] }); + return jsonResp({ columns: ["message", "creation_date"], rows: [[JSON.stringify({ type: "user_message", content: "hi" }), "2026-01-01T00:00:00Z"]] }); + } + // path lookup + existing-summary lookup → empty is fine + return jsonResp({ columns: [], rows: [] }); + }); + // claude -p "writes" the summary file the worker reads back. + execFileSyncMock.mockImplementation(() => { writeFileSync(join(tmpDir, "summary.md"), "real summary body"); }); + + await runWorker(); + // Allow the backoff-0 setTimeout retries to flush. + await new Promise(r => setTimeout(r, 50)); + await new Promise(r => setImmediate(r)); + + const log = readFileSync(join(hooksDir, "wiki.log"), "utf-8"); + expect(log).toContain("no events yet — retry"); + expect(eventSelects).toBeGreaterThanOrEqual(3); + expect(execFileSyncMock).toHaveBeenCalled(); + expect(uploadSummaryMock).toHaveBeenCalled(); + }); }); // ═══ happy path: events + claude -p + upload ═══════════════════════════════ diff --git a/tests/shared/capture-gate.test.ts b/tests/shared/capture-gate.test.ts index 81c956b8..8a377561 100644 --- a/tests/shared/capture-gate.test.ts +++ b/tests/shared/capture-gate.test.ts @@ -25,17 +25,25 @@ describe("entrypointPassesOnlyCliGate", () => { })).toBe(true); }); - it("passes with gate active and entrypoint contains 'cli' as substring", () => { - // Substring match (not equality) is intentional — covers future variants - // like cli-interactive, claude-cli, etc. without code changes. + it("blocks `claude -p` print mode (entrypoint='sdk-cli')", () => { + // Regression: `claude -p` reports CLAUDE_CODE_ENTRYPOINT="sdk-cli", which + // CONTAINS "cli". A substring check let print-mode sessions through and + // created stray capture rows despite the gate being on. Exact equality + // must exclude it. expect(entrypointPassesOnlyCliGate({ HIVEMIND_CAPTURE_ONLY_CLI: "true", - CLAUDE_CODE_ENTRYPOINT: "cli-interactive", - })).toBe(true); - expect(entrypointPassesOnlyCliGate({ - HIVEMIND_CAPTURE_ONLY_CLI: "true", - CLAUDE_CODE_ENTRYPOINT: "claude-cli", - })).toBe(true); + CLAUDE_CODE_ENTRYPOINT: "sdk-cli", + })).toBe(false); + }); + + it("blocks entrypoints that merely contain 'cli' but aren't exactly 'cli'", () => { + // Exact-match semantics: only a bare "cli" (interactive terminal) passes. + for (const ep of ["sdk-cli", "cli-interactive", "claude-cli", "clip"]) { + expect(entrypointPassesOnlyCliGate({ + HIVEMIND_CAPTURE_ONLY_CLI: "true", + CLAUDE_CODE_ENTRYPOINT: ep, + })).toBe(false); + } }); it("blocks with gate active and entrypoint='sdk-py'", () => { @@ -53,8 +61,8 @@ describe("entrypointPassesOnlyCliGate", () => { }); it("blocks with gate active and entrypoint undefined", () => { - // Strict: missing entrypoint is treated as non-cli. An empty string - // doesn't contain the "cli" substring so the gate filters it out. + // Strict: missing entrypoint is treated as non-cli. It isn't exactly + // "cli" so the gate filters it out. expect(entrypointPassesOnlyCliGate({ HIVEMIND_CAPTURE_ONLY_CLI: "true", })).toBe(false); From 6df6ed80cf6df488cfd85c61ca4495741ee7a3b0 Mon Sep 17 00:00:00 2001 From: Emanuele Fenocchi Date: Wed, 10 Jun 2026 21:40:55 +0000 Subject: [PATCH 2/2] fix(wiki-worker): harden retry/backoff env parsing against NaN/negative MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses CodeRabbit review on PR #254. A non-numeric HIVEMIND_WIKI_EVENT_RETRIES made Number(...) return NaN, so `attempt <= NaN` was always false and retries were silently disabled — reintroducing placeholder orphaning under load. Parse via a parseNonNegativeInt helper that falls back to the defaults (5 / 1500ms) for missing/non-numeric/negative values. Adds a regression test asserting a garbage env value still retries. Co-Authored-By: Claude Opus 4.8 --- src/hooks/wiki-worker.ts | 15 +++++++++++++-- tests/claude-code/wiki-worker.test.ts | 14 ++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/src/hooks/wiki-worker.ts b/src/hooks/wiki-worker.ts index a985ca99..ab015c0a 100644 --- a/src/hooks/wiki-worker.ts +++ b/src/hooks/wiki-worker.ts @@ -64,8 +64,19 @@ function esc(s: string): string { // ending at once) those rows can lag behind SessionEnd, so the worker can read // zero events for a session that does have them. Retry with linear backoff // before giving up, instead of stranding the SessionStart placeholder. -const EVENT_FETCH_RETRIES = Number(process.env.HIVEMIND_WIKI_EVENT_RETRIES ?? "5"); -const EVENT_FETCH_BACKOFF_MS = Number(process.env.HIVEMIND_WIKI_EVENT_BACKOFF_MS ?? "1500"); +/** + * Parse a non-negative integer from an env var, falling back to `fallback` + * for missing / non-numeric / negative values. Without this, a misconfigured + * env var would make `Number(...)` return NaN, the retry loop condition + * `attempt <= NaN` would be false, and retries would be silently disabled — + * reintroducing the stranded-placeholder bug under load. + */ +function parseNonNegativeInt(raw: string | undefined, fallback: number): number { + const n = Number.parseInt(raw ?? "", 10); + return Number.isFinite(n) && n >= 0 ? n : fallback; +} +const EVENT_FETCH_RETRIES = parseNonNegativeInt(process.env.HIVEMIND_WIKI_EVENT_RETRIES, 5); +const EVENT_FETCH_BACKOFF_MS = parseNonNegativeInt(process.env.HIVEMIND_WIKI_EVENT_BACKOFF_MS, 1500); const sleep = (ms: number): Promise => new Promise(resolve => setTimeout(resolve, ms)); async function query(sql: string, retries = 4): Promise[]> { diff --git a/tests/claude-code/wiki-worker.test.ts b/tests/claude-code/wiki-worker.test.ts index 69be6bfb..07631450 100644 --- a/tests/claude-code/wiki-worker.test.ts +++ b/tests/claude-code/wiki-worker.test.ts @@ -165,6 +165,20 @@ describe("wiki-worker — no events", () => { expect(releaseLockMock).toHaveBeenCalled(); }); + it("falls back to default retries when the env var is non-numeric (no silent disable)", async () => { + // Regression guard: a garbage HIVEMIND_WIKI_EVENT_RETRIES must NOT become + // NaN and silently skip the retry loop (which would re-strand placeholders). + process.env.HIVEMIND_WIKI_EVENT_RETRIES = "not-a-number"; + process.env.HIVEMIND_WIKI_EVENT_BACKOFF_MS = "0"; + fetchMock.mockResolvedValue(jsonResp({ columns: ["message", "creation_date"], rows: [] })); + await runWorker(); + await new Promise(r => setTimeout(r, 50)); + await new Promise(r => setImmediate(r)); + const log = readFileSync(join(hooksDir, "wiki.log"), "utf-8"); + // It must have retried (fallback to the default of 5), not bailed immediately. + expect(log).toContain("no events yet — retry"); + }); + it("retries and recovers when events show up on a later fetch (race)", async () => { // Reproduces the real bug: the async capture write lags behind // SessionEnd, so the first event SELECT returns empty. With backoff=0 the