From 7476389163962c8c4498f83ce4d506d826439666 Mon Sep 17 00:00:00 2001 From: kaghni Date: Mon, 8 Jun 2026 05:07:03 +0000 Subject: [PATCH 1/3] =?UTF-8?q?fix(skillopt):=20retry=20findInvocation=20t?= =?UTF-8?q?hrough=20Deeplake=20insert=E2=86=92read=20lag?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The worker fires on a user reaction and reads the skill-invocation window from the Deeplake sessions table. That row is written by a SEPARATE process (capture.js) and lands on a short insert→read visibility lag, so a fast reaction reads stale → "invocation not found" → silent no-op. The K=3 window only retried if the user kept typing; a single fast/final reaction lost the improvement. findInvocation now polls with bounded linear backoff (5 attempts, 3s base, ~45s max) inside the already-detached worker, so the wait blocks nothing. Only a not-found (null) result retries — a query error (e.g. 402) fails fast. Bounded so a genuinely-absent invocation (e.g. capture disabled) gives up gracefully with no publish. Tests: miss-then-hit retries then publishes; never-propagates gives up with no insert; query-error fails fast without spinning. --- src/skillify/skillopt-improve.ts | 33 +++++++++++++++- tests/shared/skillopt-improve.test.ts | 57 ++++++++++++++++++++++++++- 2 files changed, 88 insertions(+), 2 deletions(-) diff --git a/src/skillify/skillopt-improve.ts b/src/skillify/skillopt-improve.ts index 881d6339..ef91b0e0 100644 --- a/src/skillify/skillopt-improve.ts +++ b/src/skillify/skillopt-improve.ts @@ -82,6 +82,37 @@ export interface ImproveOpts { prior?: (name: string, author: string) => string[]; alreadyProposed?: (name: string, author: string, edits: Edit[]) => boolean; recordEdit?: (name: string, author: string, edits: Edit[]) => void; + // Bug #1 tolerance: the invocation row is written by a SEPARATE process (capture.js) and lands + // in Deeplake on a short insert→read visibility lag, so a worker firing on a fast reaction can + // read stale. Poll findInvocation with linear backoff before giving up. Injectable for tests. + invocationRetries?: number; // extra attempts after the first (default 5) + invocationBackoffMs?: number; // linear backoff base ms: sleep = base * attempt (default 3000) + sleep?: (ms: number) => Promise; // default real timer +} + +const DEFAULT_INVOCATION_RETRIES = 5; +const DEFAULT_INVOCATION_BACKOFF_MS = 3000; +const realSleep = (ms: number): Promise => new Promise((resolve) => setTimeout(resolve, ms)); + +/** + * findInvocation, tolerant of Deeplake's insert→read visibility lag (Bug #1). The window's + * skill-invocation row is captured by a SEPARATE process and may not be queryable the instant the + * worker fires on a fast reaction. The row is near-certain to land (capture is a reliable path), so + * poll with linear backoff; but it's NOT guaranteed (capture may be disabled/errored), so the + * attempts are BOUNDED — on exhaustion we return null and the caller gives up gracefully (no + * publish) instead of spinning. Runs inside the already-detached worker, so the waits block nothing. + * Only a not-found (null) result is retried — a query ERROR (e.g. 402) propagates immediately. + */ +async function findInvocationWithRetry(opts: ImproveOpts, name: string, author: string): Promise { + const retries = opts.invocationRetries ?? DEFAULT_INVOCATION_RETRIES; + const backoffMs = opts.invocationBackoffMs ?? DEFAULT_INVOCATION_BACKOFF_MS; + const sleep = opts.sleep ?? realSleep; + for (let attempt = 0; ; attempt++) { + const inv = await findInvocation(opts.query, opts.sessionsTable, opts.sessionId, name, author, opts.toolUseId); + if (inv) return inv; + if (attempt >= retries) return null; + await sleep(backoffMs * (attempt + 1)); + } } export async function improveSkillIfFailed(opts: ImproveOpts): Promise { @@ -89,7 +120,7 @@ export async function improveSkillIfFailed(opts: ImproveOpts): Promise { it("invocation not in the session → not judged", async () => { const { query } = makeQuery(); - expect(await improveSkillIfFailed(base(query, { skillRef: "ghost--x" }))).toMatchObject({ judged: false }); + // invocationRetries:0 → skip the real Bug#1 backoff; this exercises the immediate not-found path. + expect(await improveSkillIfFailed(base(query, { skillRef: "ghost--x", invocationRetries: 0 }))).toMatchObject({ judged: false }); }); it("failed but the skill isn't in the org table → judged, not improved", async () => { @@ -130,4 +131,58 @@ describe("improveSkillIfFailed", () => { expect(r).toMatchObject({ judged: true, failed: true, improved: false, reason: expect.stringContaining("dedup") }); expect(inserts).toHaveLength(0); }); + + // Bug #1: the invocation row is written by a SEPARATE process (capture.js) and lands in + // Deeplake on a short insert→read visibility lag, so a worker that fires on a fast reaction + // reads stale and finds nothing. The window-retry (K=3) only helps if the user keeps typing; + // a single fast/final reaction would silently no-op. The worker must retry-with-backoff. + it("retries findInvocation when the row hasn't propagated yet (Deeplake lag) → then judges + improves", async () => { + let sessionsCalls = 0; + const inserts: string[] = []; + const query = vi.fn(async (sql: string) => { + if (sql.includes("INSERT INTO")) { inserts.push(sql); return []; } + if (sql.includes("/sessions/")) { sessionsCalls++; return sessionsCalls <= 2 ? [] : sessionRows("s1"); } // miss twice, then visible + if (sql.includes('FROM "skills"')) return [SKILL_ROW]; + return []; + }); + const sleeps: number[] = []; + const r = await improveSkillIfFailed(base(query, { + invocationRetries: 5, invocationBackoffMs: 1000, sleep: async (ms: number) => { sleeps.push(ms); }, + })); + expect(r).toMatchObject({ judged: true, failed: true, improved: true, version: 3 }); + expect(sessionsCalls).toBeGreaterThanOrEqual(3); // it kept polling past the stale reads + expect(sleeps).toEqual([1000, 2000]); // linear backoff between the two misses, then hit + expect(inserts).toHaveLength(1); + }); + + it("gives up gracefully (no publish) when the row never propagates — bounded retries, e.g. capture disabled", async () => { + let sessionsCalls = 0; + const inserts: string[] = []; + const query = vi.fn(async (sql: string) => { + if (sql.includes("INSERT INTO")) { inserts.push(sql); return []; } + if (sql.includes("/sessions/")) { sessionsCalls++; return []; } // never lands + if (sql.includes('FROM "skills"')) return [SKILL_ROW]; + return []; + }); + const sleeps: number[] = []; + const r = await improveSkillIfFailed(base(query, { + invocationRetries: 3, invocationBackoffMs: 1, sleep: async (ms: number) => { sleeps.push(ms); }, + })); + expect(r).toMatchObject({ judged: false, improved: false, reason: "invocation not found in session" }); + expect(sessionsCalls).toBe(4); // 1 initial + 3 bounded retries, then stop + expect(sleeps).toHaveLength(3); // backed off 3 times then gave up + expect(inserts).toHaveLength(0); // nothing inserted — graceful no-op + }); + + it("does NOT retry on a query error (e.g. 402) — fails fast, no spinning", async () => { + let sessionsCalls = 0; + const query = vi.fn(async (sql: string) => { + if (sql.includes("/sessions/")) { sessionsCalls++; throw new Error("402 insufficient balance"); } + if (sql.includes('FROM "skills"')) return [SKILL_ROW]; + return []; + }); + await expect(improveSkillIfFailed(base(query, { invocationRetries: 5, invocationBackoffMs: 1, sleep: async () => {} }))) + .rejects.toThrow(/402/); + expect(sessionsCalls).toBe(1); // threw on the first query — no retry loop + }); }); From 4b34063ed35f771ff6ee09a8ef978b8ea9a7ff82 Mon Sep 17 00:00:00 2001 From: kaghni Date: Mon, 8 Jun 2026 05:29:01 +0000 Subject: [PATCH 2/3] review: tighten retry-count assertion to exact .toBe(4) (coderabbit) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeRabbit flagged .toBeGreaterThanOrEqual(3) and suggested a precise equality. The precise value is 4, not the 3 suggested — findInvocation polls 3× (miss, miss, hit) and windowAroundInvocation issues one more /sessions/ query after the invocation is found. .toBe(4) is both precise and correct. --- tests/shared/skillopt-improve.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/shared/skillopt-improve.test.ts b/tests/shared/skillopt-improve.test.ts index c6fe4931..8d75c2c7 100644 --- a/tests/shared/skillopt-improve.test.ts +++ b/tests/shared/skillopt-improve.test.ts @@ -150,7 +150,7 @@ describe("improveSkillIfFailed", () => { invocationRetries: 5, invocationBackoffMs: 1000, sleep: async (ms: number) => { sleeps.push(ms); }, })); expect(r).toMatchObject({ judged: true, failed: true, improved: true, version: 3 }); - expect(sessionsCalls).toBeGreaterThanOrEqual(3); // it kept polling past the stale reads + expect(sessionsCalls).toBe(4); // 3 retry polls (miss, miss, hit) + 1 windowAroundInvocation query expect(sleeps).toEqual([1000, 2000]); // linear backoff between the two misses, then hit expect(inserts).toHaveLength(1); }); From 2d00789c03775b1a44d2bf3684c02f844bf20cd2 Mon Sep 17 00:00:00 2001 From: kaghni Date: Mon, 8 Jun 2026 05:33:19 +0000 Subject: [PATCH 3/3] docs(skillopt): reframe the Deeplake read-lag as expected latency, not a "bug" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The insert→read visibility lag is normal Deeplake behavior, not a defect — the worker simply needs to tolerate it. Drop the "Bug #1" labeling from the comments; the retry logic is unchanged. --- src/skillify/skillopt-improve.ts | 11 ++++++----- tests/shared/skillopt-improve.test.ts | 9 +++++---- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/skillify/skillopt-improve.ts b/src/skillify/skillopt-improve.ts index ef91b0e0..70782fa2 100644 --- a/src/skillify/skillopt-improve.ts +++ b/src/skillify/skillopt-improve.ts @@ -82,9 +82,10 @@ export interface ImproveOpts { prior?: (name: string, author: string) => string[]; alreadyProposed?: (name: string, author: string, edits: Edit[]) => boolean; recordEdit?: (name: string, author: string, edits: Edit[]) => void; - // Bug #1 tolerance: the invocation row is written by a SEPARATE process (capture.js) and lands - // in Deeplake on a short insert→read visibility lag, so a worker firing on a fast reaction can - // read stale. Poll findInvocation with linear backoff before giving up. Injectable for tests. + // Deeplake insert→read lag tolerance: the invocation row is written by a SEPARATE process + // (capture.js) and lands in Deeplake on a short visibility lag (expected, not a defect), so a + // worker firing on a fast reaction can read stale. Poll findInvocation with linear backoff + // before giving up. Injectable for tests. invocationRetries?: number; // extra attempts after the first (default 5) invocationBackoffMs?: number; // linear backoff base ms: sleep = base * attempt (default 3000) sleep?: (ms: number) => Promise; // default real timer @@ -95,8 +96,8 @@ const DEFAULT_INVOCATION_BACKOFF_MS = 3000; const realSleep = (ms: number): Promise => new Promise((resolve) => setTimeout(resolve, ms)); /** - * findInvocation, tolerant of Deeplake's insert→read visibility lag (Bug #1). The window's - * skill-invocation row is captured by a SEPARATE process and may not be queryable the instant the + * findInvocation, tolerant of Deeplake's insert→read visibility lag (expected latency, not a + * defect). The window's skill-invocation row is captured by a SEPARATE process and may not be queryable the instant the * worker fires on a fast reaction. The row is near-certain to land (capture is a reliable path), so * poll with linear backoff; but it's NOT guaranteed (capture may be disabled/errored), so the * attempts are BOUNDED — on exhaustion we return null and the caller gives up gracefully (no diff --git a/tests/shared/skillopt-improve.test.ts b/tests/shared/skillopt-improve.test.ts index 8d75c2c7..925c4c21 100644 --- a/tests/shared/skillopt-improve.test.ts +++ b/tests/shared/skillopt-improve.test.ts @@ -132,10 +132,11 @@ describe("improveSkillIfFailed", () => { expect(inserts).toHaveLength(0); }); - // Bug #1: the invocation row is written by a SEPARATE process (capture.js) and lands in - // Deeplake on a short insert→read visibility lag, so a worker that fires on a fast reaction - // reads stale and finds nothing. The window-retry (K=3) only helps if the user keeps typing; - // a single fast/final reaction would silently no-op. The worker must retry-with-backoff. + // Deeplake insert→read visibility lag (expected latency, not a defect): the invocation row is + // written by a SEPARATE process (capture.js) and lands on a short visibility lag, so a worker + // that fires on a fast reaction reads stale and finds nothing. The window-retry (K=3) only helps + // if the user keeps typing; a single fast/final reaction would silently no-op. So the worker + // must retry-with-backoff itself. it("retries findInvocation when the row hasn't propagated yet (Deeplake lag) → then judges + improves", async () => { let sessionsCalls = 0; const inserts: string[] = [];