From 06f49566abb69784ecd83fe1fc79cdb959a63454 Mon Sep 17 00:00:00 2001 From: pauldambra Date: Tue, 19 May 2026 10:57:34 +0100 Subject: [PATCH 1/3] fix(git): cap and stream untracked-file line counting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `countFileLines` previously read each untracked file's full content into memory as UTF-8 via `fs.readFile(filePath, "utf-8")`. The caller in `getChangedFilesDetailed` runs this 16-way concurrent against every untracked path returned by `streamGitStatus` (up to 50k entries). On a monorepo with multi-MB build artifacts or lockfiles, peak heap was `16 * file_bytes * 2` (V8's UTF-16 cost), easily several GB. That OOM'd the main process and froze the renderer waiting on a tRPC call that would never return — the symptom was the app appearing to freeze a few seconds after the sidebar painted, with no `[ipc-rate]` warnings, just a silent V8 `Scavenger: semi-space copy Allocation failed`. Fix: bail out on files larger than 1 MB, and stream the rest with `createReadStream`, counting `\n` bytes byte-by-byte. Per-stream memory stays at ~64 KB regardless of file size, so peak across 16 concurrent reads is ~1 MB total. Preserves the original return semantics (trailing-newline-aware line count) for files within the cap. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/git/src/queries.test.ts | 44 ++++++++++++++++++++++++++++++++ packages/git/src/queries.ts | 29 ++++++++++++++++++--- 2 files changed, 70 insertions(+), 3 deletions(-) diff --git a/packages/git/src/queries.test.ts b/packages/git/src/queries.test.ts index 6eab2e7b4..9de74d05d 100644 --- a/packages/git/src/queries.test.ts +++ b/packages/git/src/queries.test.ts @@ -6,6 +6,7 @@ import { createGitClient } from "./client"; import { detectDefaultBranch, getBranchDiffPatchesByPath, + getChangedFilesDetailed, splitUnifiedDiffByFile, } from "./queries"; @@ -242,3 +243,46 @@ describe("getBranchDiffPatchesByPath", () => { } }); }); + +describe("getChangedFilesDetailed", () => { + let repoDir: string; + + afterEach(async () => { + if (repoDir) { + await rm(repoDir, { recursive: true, force: true }); + repoDir = ""; + } + }); + + it("reports line counts for small untracked files", async () => { + repoDir = await setupRepo(); + await writeFile(path.join(repoDir, "small.txt"), "a\nb\nc\n"); + await writeFile(path.join(repoDir, "no-trailing.txt"), "a\nb\nc"); + + const files = await getChangedFilesDetailed(repoDir); + const small = files.find((f) => f.path === "small.txt"); + const noTrailing = files.find((f) => f.path === "no-trailing.txt"); + + expect(small).toMatchObject({ status: "untracked", linesAdded: 3 }); + expect(noTrailing).toMatchObject({ status: "untracked", linesAdded: 3 }); + }); + + // Regression guard for the OOM in https://github.com/PostHog/code/issues/... + // (introduced in c617988f). Before the fix `countFileLines` read each + // untracked file's full content into memory, 16-way concurrent, with no + // size cap. On a monorepo with multi-MB build artifacts / lockfiles this + // exhausted the main-process V8 heap (~3GB+) and froze the renderer + // waiting on the dead tRPC call. The fix bails on files larger than + // COUNT_FILE_LINES_MAX_BYTES (1MB) and stream-counts the rest, so peak + // memory stays ~16 * 64KB regardless of file size. + it("skips line counting for files over the size cap", async () => { + repoDir = await setupRepo(); + const oneAndAHalfMB = "a\n".repeat(800_000); + await writeFile(path.join(repoDir, "huge.txt"), oneAndAHalfMB); + + const files = await getChangedFilesDetailed(repoDir); + const huge = files.find((f) => f.path === "huge.txt"); + + expect(huge).toMatchObject({ status: "untracked", linesAdded: 0 }); + }); +}); diff --git a/packages/git/src/queries.ts b/packages/git/src/queries.ts index dc4cdcf82..140ff9a24 100644 --- a/packages/git/src/queries.ts +++ b/packages/git/src/queries.ts @@ -1,3 +1,4 @@ +import { createReadStream } from "node:fs"; import * as fs from "node:fs/promises"; import * as path from "node:path"; import type { CreateGitClientOptions } from "./client"; @@ -413,11 +414,33 @@ function matchesExcludePattern(filePath: string, patterns: string[]): boolean { }); } +const COUNT_FILE_LINES_MAX_BYTES = 1 * 1024 * 1024; + async function countFileLines(filePath: string): Promise { try { - const content = await fs.readFile(filePath, "utf-8"); - if (!content) return 0; - return content.split("\n").length - (content.endsWith("\n") ? 1 : 0); + const stat = await fs.stat(filePath); + if (!stat.isFile() || stat.size === 0) return 0; + if (stat.size > COUNT_FILE_LINES_MAX_BYTES) return 0; + return await new Promise((resolve) => { + let newlines = 0; + let lastByte = -1; + const stream = createReadStream(filePath); + stream.on("data", (chunk) => { + const buf = typeof chunk === "string" ? Buffer.from(chunk) : chunk; + for (let i = 0; i < buf.length; i++) { + if (buf[i] === 0x0a) newlines++; + } + if (buf.length > 0) lastByte = buf[buf.length - 1]; + }); + stream.on("end", () => { + if (lastByte === -1) { + resolve(0); + return; + } + resolve(lastByte === 0x0a ? newlines : newlines + 1); + }); + stream.on("error", () => resolve(0)); + }); } catch { return 0; } From 0ac56b1770bc5a1ca270e3da4f86fc3b0a4ecb8b Mon Sep 17 00:00:00 2001 From: pauldambra Date: Tue, 19 May 2026 11:06:10 +0100 Subject: [PATCH 2/3] fix(git): drop size cap, stream-count all untracked files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Earlier on this PR I gated countFileLines with a 1 MB size cap that returned `linesAdded: 0` for anything bigger. That was a correctness regression — a 5 MB untracked lockfile would show 0 lines in the diff stats instead of its real count. The streaming line counter is already enough to fix the OOM on its own: `createReadStream`'s 64 KB highWaterMark means peak per-stream memory is ~64 KB regardless of file size, so 16-way concurrent reads top out around 1 MB total whether the files are 5 KB or 5 GB. The only cost of removing the cap is wall-clock time for genuinely huge files (a 1 GB untracked file now streams in a few seconds instead of returning 0), which is the correct trade. Updated the regression test to assert the real line count for a ~1.6 MB untracked file instead of asserting it gets skipped. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/git/src/queries.test.ts | 30 ++++++++++++++++++------------ packages/git/src/queries.ts | 3 --- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/packages/git/src/queries.test.ts b/packages/git/src/queries.test.ts index 9de74d05d..bcb043d8d 100644 --- a/packages/git/src/queries.test.ts +++ b/packages/git/src/queries.test.ts @@ -267,22 +267,28 @@ describe("getChangedFilesDetailed", () => { expect(noTrailing).toMatchObject({ status: "untracked", linesAdded: 3 }); }); - // Regression guard for the OOM in https://github.com/PostHog/code/issues/... - // (introduced in c617988f). Before the fix `countFileLines` read each - // untracked file's full content into memory, 16-way concurrent, with no - // size cap. On a monorepo with multi-MB build artifacts / lockfiles this - // exhausted the main-process V8 heap (~3GB+) and froze the renderer - // waiting on the dead tRPC call. The fix bails on files larger than - // COUNT_FILE_LINES_MAX_BYTES (1MB) and stream-counts the rest, so peak - // memory stays ~16 * 64KB regardless of file size. - it("skips line counting for files over the size cap", async () => { + // Regression guard for the OOM in #2218. Before the fix `countFileLines` + // read each untracked file's full content into memory via + // `fs.readFile(..., "utf-8")`, 16-way concurrent against every untracked + // path returned by `streamGitStatus` (up to 50k). On a monorepo with + // multi-MB build artifacts this exhausted the main-process V8 heap + // (`16 * file_bytes * 2` for V8's UTF-16) and froze the renderer waiting + // on the dead tRPC call. The fix stream-counts via `createReadStream`, + // so peak per-stream memory is ~64KB regardless of file size — the + // multi-MB case below would have OOM'd pre-fix and must still report an + // accurate line count. + it("stream-counts untracked files larger than the streaming chunk size", async () => { repoDir = await setupRepo(); - const oneAndAHalfMB = "a\n".repeat(800_000); - await writeFile(path.join(repoDir, "huge.txt"), oneAndAHalfMB); + const lineCount = 800_000; // ~1.6MB at 2 bytes/line, well over default 64KB chunk + const content = "a\n".repeat(lineCount); + await writeFile(path.join(repoDir, "huge.txt"), content); const files = await getChangedFilesDetailed(repoDir); const huge = files.find((f) => f.path === "huge.txt"); - expect(huge).toMatchObject({ status: "untracked", linesAdded: 0 }); + expect(huge).toMatchObject({ + status: "untracked", + linesAdded: lineCount, + }); }); }); diff --git a/packages/git/src/queries.ts b/packages/git/src/queries.ts index 140ff9a24..373923617 100644 --- a/packages/git/src/queries.ts +++ b/packages/git/src/queries.ts @@ -414,13 +414,10 @@ function matchesExcludePattern(filePath: string, patterns: string[]): boolean { }); } -const COUNT_FILE_LINES_MAX_BYTES = 1 * 1024 * 1024; - async function countFileLines(filePath: string): Promise { try { const stat = await fs.stat(filePath); if (!stat.isFile() || stat.size === 0) return 0; - if (stat.size > COUNT_FILE_LINES_MAX_BYTES) return 0; return await new Promise((resolve) => { let newlines = 0; let lastByte = -1; From 3475526be2ddd181f66928e18ba7e95d4666f2ba Mon Sep 17 00:00:00 2001 From: pauldambra Date: Tue, 19 May 2026 11:31:09 +0100 Subject: [PATCH 3/3] fix(git): address qa-swarm feedback on countFileLines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convergent feedback from three reviewers on the streaming line counter: - thread `AbortSignal` from `getChangedFilesDetailed` through `countFileLines` into `createReadStream({ signal })` and check the signal between items in `mapWithConcurrency`. A cancelled tRPC call no longer streams every untracked file to completion before the result is discarded. - log on `stream.on("error", ...)` via `console.warn` (matches the pattern already used in `utils.ts`). Silent zeros in a diff-stats field is the exact smell that hid the original OOM bug; now there's a breadcrumb when it happens. - replace the per-byte JS scan with `Buffer.indexOf(0x0a)` — native, an order of magnitude faster on multi-MB buffers, which is the exact workload this function exists to handle. (The original `typeof chunk === "string"` branch was dead at runtime — Node's `data` listener types are wider than the actual emit — so it's now a focused cast with a comment explaining why.) - switch `fs.stat` -> `fs.lstat` so an untracked symlink to /dev/zero (or any device / dir / fifo through a symlink) short-circuits to 0 via `isFile()` instead of streaming forever. Symlinks in the untracked list are rare; preferring safety over a legitimate-symlink line count is the right trade. - comment the `lastByte === -1` branch so a future cleanup doesn't treat it as dead — it's the TOCTOU guard. Test coverage expanded to a parameterized `it.each` over the line-count contract (trailing newline, no trailing newline, single byte, lone newline, consecutive newlines, CRLF), plus an explicit empty-file case. The multi-MB regression test now uses a named constant so a future "shrink for speed" PR can't accidentally drop it below the stream's highWaterMark and silently downgrade the test. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/git/src/queries.test.ts | 42 +++++++++++++++++++------- packages/git/src/queries.ts | 51 +++++++++++++++++++++++++------- 2 files changed, 72 insertions(+), 21 deletions(-) diff --git a/packages/git/src/queries.test.ts b/packages/git/src/queries.test.ts index bcb043d8d..77d387ea6 100644 --- a/packages/git/src/queries.test.ts +++ b/packages/git/src/queries.test.ts @@ -244,7 +244,12 @@ describe("getBranchDiffPatchesByPath", () => { }); }); -describe("getChangedFilesDetailed", () => { +// Picked to land well past the default 64KB highWaterMark of +// `createReadStream` so the regression test actually exercises the +// across-chunk path of the streaming line counter. +const LINE_COUNT_LARGER_THAN_READ_STREAM_CHUNK = 800_000; + +describe("getChangedFilesDetailed > untracked line counts", () => { let repoDir: string; afterEach(async () => { @@ -254,17 +259,33 @@ describe("getChangedFilesDetailed", () => { } }); - it("reports line counts for small untracked files", async () => { + it.each([ + { name: "trailing newline", content: "a\nb\nc\n", expected: 3 }, + { name: "no trailing newline", content: "a\nb\nc", expected: 3 }, + { name: "single byte, no newline", content: "a", expected: 1 }, + { name: "lone newline", content: "\n", expected: 1 }, + { name: "consecutive newlines", content: "\n\n", expected: 2 }, + // CRLF: legacy `split("\n")` counted only `\n` separators, so + // `"a\r\nb\r\n"` -> 2 lines. Byte-counter matches. + { name: "CRLF endings", content: "a\r\nb\r\n", expected: 2 }, + ])("counts $name as $expected line(s)", async ({ content, expected }) => { + repoDir = await setupRepo(); + await writeFile(path.join(repoDir, "f.txt"), content); + + const files = await getChangedFilesDetailed(repoDir); + const f = files.find((file) => file.path === "f.txt"); + + expect(f).toMatchObject({ status: "untracked", linesAdded: expected }); + }); + + it("reports 0 lines for empty untracked files", async () => { repoDir = await setupRepo(); - await writeFile(path.join(repoDir, "small.txt"), "a\nb\nc\n"); - await writeFile(path.join(repoDir, "no-trailing.txt"), "a\nb\nc"); + await writeFile(path.join(repoDir, "empty.txt"), ""); const files = await getChangedFilesDetailed(repoDir); - const small = files.find((f) => f.path === "small.txt"); - const noTrailing = files.find((f) => f.path === "no-trailing.txt"); + const empty = files.find((f) => f.path === "empty.txt"); - expect(small).toMatchObject({ status: "untracked", linesAdded: 3 }); - expect(noTrailing).toMatchObject({ status: "untracked", linesAdded: 3 }); + expect(empty).toMatchObject({ status: "untracked", linesAdded: 0 }); }); // Regression guard for the OOM in #2218. Before the fix `countFileLines` @@ -279,8 +300,7 @@ describe("getChangedFilesDetailed", () => { // accurate line count. it("stream-counts untracked files larger than the streaming chunk size", async () => { repoDir = await setupRepo(); - const lineCount = 800_000; // ~1.6MB at 2 bytes/line, well over default 64KB chunk - const content = "a\n".repeat(lineCount); + const content = "a\n".repeat(LINE_COUNT_LARGER_THAN_READ_STREAM_CHUNK); await writeFile(path.join(repoDir, "huge.txt"), content); const files = await getChangedFilesDetailed(repoDir); @@ -288,7 +308,7 @@ describe("getChangedFilesDetailed", () => { expect(huge).toMatchObject({ status: "untracked", - linesAdded: lineCount, + linesAdded: LINE_COUNT_LARGER_THAN_READ_STREAM_CHUNK, }); }); }); diff --git a/packages/git/src/queries.ts b/packages/git/src/queries.ts index 373923617..56aa909cd 100644 --- a/packages/git/src/queries.ts +++ b/packages/git/src/queries.ts @@ -414,29 +414,54 @@ function matchesExcludePattern(filePath: string, patterns: string[]): boolean { }); } -async function countFileLines(filePath: string): Promise { +async function countFileLines( + filePath: string, + options?: { signal?: AbortSignal }, +): Promise { try { - const stat = await fs.stat(filePath); + // `lstat` instead of `stat` so an untracked symlink (rare, but legal) + // pointing at /dev/zero or a path outside the workdir doesn't stream + // forever — symlinks fail `isFile()` and short-circuit to 0. + const stat = await fs.lstat(filePath); if (!stat.isFile() || stat.size === 0) return 0; return await new Promise((resolve) => { let newlines = 0; let lastByte = -1; - const stream = createReadStream(filePath); - stream.on("data", (chunk) => { - const buf = typeof chunk === "string" ? Buffer.from(chunk) : chunk; - for (let i = 0; i < buf.length; i++) { - if (buf[i] === 0x0a) newlines++; + const stream = createReadStream(filePath, { signal: options?.signal }); + stream.on("data", (rawChunk) => { + // Node types stream chunks as `string | Buffer`; without an + // `encoding` option `createReadStream` always emits `Buffer`, + // so the cast is for the type checker, not the runtime. + const chunk = rawChunk as Buffer; + // Native `Buffer.indexOf` — ~10x faster than a per-byte JS loop + // on multi-MB buffers, which is the workload this whole function + // exists to handle. + for ( + let idx = chunk.indexOf(0x0a); + idx !== -1; + idx = chunk.indexOf(0x0a, idx + 1) + ) { + newlines++; } - if (buf.length > 0) lastByte = buf[buf.length - 1]; + if (chunk.length > 0) lastByte = chunk[chunk.length - 1]; }); stream.on("end", () => { + // Guards against TOCTOU truncation between lstat and read — + // size > 0 at stat time, zero bytes by the time we open. if (lastByte === -1) { resolve(0); return; } resolve(lastByte === 0x0a ? newlines : newlines + 1); }); - stream.on("error", () => resolve(0)); + stream.on("error", (err) => { + // Don't propagate — caller already treats any failure as 0 lines. + // But log so the next time a "shows 0 lines" mystery shows up + // there's a breadcrumb (the original OOM in #2218 hid behind the + // same silent-zero return). + console.warn(`countFileLines failed for ${filePath}:`, err); + resolve(0); + }); }); } catch { return 0; @@ -447,12 +472,14 @@ async function mapWithConcurrency( items: readonly T[], concurrency: number, mapper: (item: T) => Promise, + options?: { signal?: AbortSignal }, ): Promise { if (items.length === 0) return []; const results = new Array(items.length); let index = 0; const worker = async () => { while (index < items.length) { + if (options?.signal?.aborted) return; const i = index++; results[i] = await mapper(items[i]); } @@ -541,7 +568,11 @@ export async function getChangedFilesDetailed( const untrackedLineCounts = await mapWithConcurrency( untrackedToCount, 16, - (file) => countFileLines(path.join(baseDir, file)), + (file) => + countFileLines(path.join(baseDir, file), { + signal: gitOptions?.abortSignal, + }), + { signal: gitOptions?.abortSignal }, ); for (let i = 0; i < untrackedToCount.length; i++) { files.push({