diff --git a/src/build-stats.test.ts b/src/build-stats.test.ts new file mode 100644 index 0000000..c5e34c0 --- /dev/null +++ b/src/build-stats.test.ts @@ -0,0 +1,254 @@ +import { test, expect, beforeAll, afterAll } from "vitest"; +import { PostgreSqlContainer, type StartedPostgreSqlContainer } from "@testcontainers/postgresql"; +import { buildStatsFromDatabase } from "./build-stats.ts"; +import { connectToSource } from "./sql/postgresjs.ts"; +import { Connectable } from "./sync/connectable.ts"; +import { + IndexOptimizer, + PostgresQueryBuilder, + Statistics, + type Postgres, +} from "@query-doctor/core"; + +let pg: StartedPostgreSqlContainer; +let db: Postgres; + +beforeAll(async () => { + pg = await new PostgreSqlContainer("postgres:17") + .withCommand(["-c", "autovacuum=off"]) + .start(); + db = connectToSource(Connectable.fromString(pg.getConnectionUri())); +}, 30_000); + +afterAll(async () => { + await (db as unknown as { close(): Promise }).close(); + await pg.stop(); +}); + +async function freshSchema(sql: string) { + // Drop all user tables so each test starts clean + const tables = await db.exec<{ t: string }>( + `SELECT tablename AS "t" FROM pg_tables WHERE schemaname = 'public'`, + ); + for (const { t } of tables) { + await db.exec(`DROP TABLE IF EXISTS "${t}" CASCADE`); + } + await db.exec(sql); +} + +test("sets reltuples to 10,000 for tables below threshold, preserves real relpages", async () => { + await freshSchema(` + CREATE TABLE users(id serial PRIMARY KEY, name text, email text); + CREATE INDEX users_email_idx ON users(email); + INSERT INTO users (name, email) + SELECT 'user_' || i, 'user_' || i || '@example.com' + FROM generate_series(1, 1000) AS i; + ANALYZE; + `); + + const mode = await buildStatsFromDatabase(db); + + expect(mode.kind).toBe("fromStatisticsExport"); + if (mode.kind !== "fromStatisticsExport") throw new Error("unreachable"); + + const usersStats = mode.stats.find((s) => s.tableName === "users"); + 
expect(usersStats).toBeDefined(); + expect(usersStats!.reltuples).toBe(10_000); + expect(usersStats!.relpages).toBeGreaterThan(1); + + const emailIdx = usersStats!.indexes.find( + (i) => i.indexName === "users_email_idx", + ); + expect(emailIdx).toBeDefined(); + expect(emailIdx!.relpages).toBeGreaterThanOrEqual(1); +}); + +test("clamps relpages to at least 1 for empty tables", async () => { + await freshSchema(` + CREATE TABLE empty_table(id serial PRIMARY KEY, data text); + ANALYZE; + `); + + const mode = await buildStatsFromDatabase(db); + if (mode.kind !== "fromStatisticsExport") throw new Error("unreachable"); + + const stats = mode.stats.find((s) => s.tableName === "empty_table"); + expect(stats).toBeDefined(); + expect(stats!.reltuples).toBe(10_000); + expect(stats!.relpages).toBeGreaterThanOrEqual(1); +}); + +test("density stays realistic regardless of actual row count", async () => { + await freshSchema(` + CREATE TABLE orders(id serial PRIMARY KEY, user_id int, total numeric); + CREATE INDEX orders_user_id_idx ON orders(user_id); + INSERT INTO orders (user_id, total) + SELECT (random() * 1000)::int, random() * 100 + FROM generate_series(1, 10000); + ANALYZE; + `); + + const mode = await buildStatsFromDatabase(db); + if (mode.kind !== "fromStatisticsExport") throw new Error("unreachable"); + + const ordersStats = mode.stats.find((s) => s.tableName === "orders"); + expect(ordersStats).toBeDefined(); + + const density = ordersStats!.reltuples / ordersStats!.relpages; + expect(density).toBeLessThan(500); + expect(density).toBeGreaterThan(10); +}); + +test("groups indexes by their parent table", async () => { + await freshSchema(` + CREATE TABLE products(id serial PRIMARY KEY, name text, price numeric); + CREATE INDEX products_name_idx ON products(name); + CREATE INDEX products_price_idx ON products(price); + CREATE TABLE categories(id serial PRIMARY KEY, label text); + ANALYZE; + `); + + const mode = await buildStatsFromDatabase(db); + if (mode.kind !== 
"fromStatisticsExport") throw new Error("unreachable"); + + const products = mode.stats.find((s) => s.tableName === "products"); + expect(products).toBeDefined(); + const indexNames = products!.indexes.map((i) => i.indexName).sort(); + expect(indexNames).toContain("products_name_idx"); + expect(indexNames).toContain("products_price_idx"); + expect(indexNames).toContain("products_pkey"); + + const categories = mode.stats.find((s) => s.tableName === "categories"); + expect(categories).toBeDefined(); + const catIndexNames = categories!.indexes.map((i) => i.indexName); + expect(catIndexNames).toContain("categories_pkey"); + expect(catIndexNames).not.toContain("products_name_idx"); +}); + +test("planner estimates 10,000 rows with only 1 row seeded", async () => { + await freshSchema(` + CREATE TABLE widgets(id serial PRIMARY KEY, user_id uuid, name text); + INSERT INTO widgets (user_id, name) VALUES ('00000000-0000-0000-0000-000000000001', 'w1'); + ANALYZE; + `); + + const mode = await buildStatsFromDatabase(db); + const stats = await Statistics.fromPostgres(db, mode); + const existingIndexes = await stats.getExistingIndexes(); + const optimizer = new IndexOptimizer(db, stats, existingIndexes); + + const builder = new PostgresQueryBuilder("SELECT * FROM widgets"); + const plan = await optimizer.testQueryWithStats(builder); + + const estimatedRows = (plan.Plan as Record)["Plan Rows"]; + expect(estimatedRows).toBe(10_000); +}); + +test("planner estimates 10,000 rows with 10,000 rows seeded", async () => { + await freshSchema(` + CREATE TABLE widgets(id serial PRIMARY KEY, user_id uuid, name text); + INSERT INTO widgets (user_id, name) + SELECT gen_random_uuid(), 'widget_' || i + FROM generate_series(1, 10000) AS i; + `); + + const beforeAnalyze = await db.exec<{ relpages: number; reltuples: number }>( + `SELECT relpages, reltuples::int FROM pg_class WHERE relname = 'widgets' AND relkind = 'r'`, + ); + console.log(`10K: BEFORE ANALYZE: 
relpages=${beforeAnalyze[0].relpages}, reltuples=${beforeAnalyze[0].reltuples}`); + + await db.exec("ANALYZE widgets"); + + const afterAnalyze = await db.exec<{ relpages: number; reltuples: number }>( + `SELECT relpages, reltuples::int FROM pg_class WHERE relname = 'widgets' AND relkind = 'r'`, + ); + console.log(`10K: AFTER ANALYZE: relpages=${afterAnalyze[0].relpages}, reltuples=${afterAnalyze[0].reltuples}`); + + const mode = await buildStatsFromDatabase(db); + const stats = await Statistics.fromPostgres(db, mode); + const existingIndexes = await stats.getExistingIndexes(); + const optimizer = new IndexOptimizer(db, stats, existingIndexes); + + const builder = new PostgresQueryBuilder("SELECT * FROM widgets"); + const plan = await optimizer.testQueryWithStats(builder); + + const estimatedRows = (plan.Plan as Record)["Plan Rows"]; + expect(estimatedRows).toBe(10_000); +}); + +test("planner estimates 10,000 rows even with 50,000 rows seeded", async () => { + await freshSchema(` + CREATE TABLE widgets(id serial PRIMARY KEY, user_id uuid, name text); + INSERT INTO widgets (user_id, name) + SELECT gen_random_uuid(), 'widget_' || i + FROM generate_series(1, 50000) AS i; + ANALYZE; + `); + + const mode = await buildStatsFromDatabase(db); + const stats = await Statistics.fromPostgres(db, mode); + const existingIndexes = await stats.getExistingIndexes(); + const optimizer = new IndexOptimizer(db, stats, existingIndexes); + + const builder = new PostgresQueryBuilder("SELECT * FROM widgets"); + const plan = await optimizer.testQueryWithStats(builder); + + const estimatedRows = (plan.Plan as Record)["Plan Rows"]; + console.log(`10K: planner estimatedRows=${estimatedRows}`); + expect(estimatedRows).toBe(10_000); +}); + +test("BUG: fromAssumption(relpages=1) inflates estimates with real data", async () => { + await freshSchema(` + CREATE TABLE widgets(id serial PRIMARY KEY, user_id uuid, name text); + INSERT INTO widgets (user_id, name) + SELECT gen_random_uuid(), 'widget_' 
|| i + FROM generate_series(1, 10000) AS i; + `); + + const beforeAnalyze = await db.exec<{ relpages: number; reltuples: number }>( + `SELECT relpages, reltuples::int FROM pg_class WHERE relname = 'widgets' AND relkind = 'r'`, + ); + console.log(`BUG: BEFORE ANALYZE: relpages=${beforeAnalyze[0].relpages}, reltuples=${beforeAnalyze[0].reltuples}`); + + await db.exec("ANALYZE widgets"); + + const afterAnalyze = await db.exec<{ relpages: number; reltuples: number }>( + `SELECT relpages, reltuples::int FROM pg_class WHERE relname = 'widgets' AND relkind = 'r'`, + ); + console.log(`BUG: AFTER ANALYZE: relpages=${afterAnalyze[0].relpages}, reltuples=${afterAnalyze[0].reltuples}`); + + const brokenMode = Statistics.defaultStatsMode; + if (brokenMode.kind === "fromAssumption") { + console.log(`BUG: override writes: reltuples=${brokenMode.reltuples}, relpages=${brokenMode.relpages}`); + } + const stats = await Statistics.fromPostgres(db, brokenMode); + const existingIndexes = await stats.getExistingIndexes(); + const optimizer = new IndexOptimizer(db, stats, existingIndexes); + + const builder = new PostgresQueryBuilder("SELECT * FROM widgets"); + const plan = await optimizer.testQueryWithStats(builder); + + const estimatedRows = (plan.Plan as Record)["Plan Rows"]; + console.log(`BUG: planner estimatedRows=${estimatedRows}`); + + // With relpages=74 on disk and the override setting relpages=1, + // PostgreSQL computes: density = 10000/1 = 10000 tuples/page, + // then estimates tuples = 74 * 10000 = 740,000 instead of 10,000. 
+ expect(estimatedRows).toBe(740_000); +}); + +test("leaves columns null so ANALYZE pg_statistic entries persist", async () => { + await freshSchema(` + CREATE TABLE items(id serial PRIMARY KEY, label text); + INSERT INTO items (label) SELECT 'item_' || i FROM generate_series(1, 100) AS i; + ANALYZE; + `); + + const mode = await buildStatsFromDatabase(db); + if (mode.kind !== "fromStatisticsExport") throw new Error("unreachable"); + + const items = mode.stats.find((s) => s.tableName === "items"); + expect(items).toBeDefined(); + expect(items!.columns).toBeNull(); +}); diff --git a/src/build-stats.ts b/src/build-stats.ts new file mode 100644 index 0000000..8c270c0 --- /dev/null +++ b/src/build-stats.ts @@ -0,0 +1,92 @@ +import { + type Postgres, + Statistics, + type StatisticsMode, +} from "@query-doctor/core"; + +const DEFAULT_RELTUPLES = 10_000; + +/** + * Build a `fromStatisticsExport` stats mode from the live database. + * + * PostgreSQL's planner ignores `pg_class.relpages` for tables with data on + * disk — it reads the actual page count via `RelationGetNumberOfBlocks()`. + * It then estimates tuples as: + * + * estimated_tuples = actual_pages × pg_class.reltuples ÷ pg_class.relpages + * + * The old `fromAssumption` default (reltuples=10 000, relpages=1) causes a + * massive inflation when tables have real data (e.g. 167 pages → 1.67 M + * estimated tuples). + * + * By reading the real `relpages` from pg_class (after ANALYZE) and pairing + * it with a correct reltuples, the formula produces the correct estimate + * regardless of actual data volume. Column-level statistics (`pg_statistic`) + * are left untouched — ANALYZE already populated them. + * + * All tables are assumed to have 10,000 rows regardless of actual data. 
+export async function buildStatsFromDatabase(
+  db: Postgres,
+): Promise<StatisticsMode> {
+  type TableRow = {
+    tableName: string;
+    schemaName: string;
+    relpages: number;
+    relallvisible: number;
+  };
+  type IndexRow = TableRow & { indexName: string; reltuples: number };
+
+  const [tables, indexes] = await Promise.all([
+    db.exec<TableRow>(`
+      SELECT c.relname AS "tableName",
+             n.nspname AS "schemaName",
+             c.relpages::int AS "relpages",
+             c.relallvisible::int AS "relallvisible"
+      FROM pg_class c
+      JOIN pg_namespace n ON n.oid = c.relnamespace
+      WHERE c.relkind = 'r'
+        AND n.nspname NOT IN ('pg_catalog', 'information_schema') -- @qd_introspection
+    `),
+    db.exec<IndexRow>(`
+      SELECT t.relname AS "tableName",
+             n.nspname AS "schemaName",
+             i.relname AS "indexName",
+             i.reltuples::real AS "reltuples",
+             i.relpages::int AS "relpages",
+             i.relallvisible::int AS "relallvisible"
+      FROM pg_index ix
+      JOIN pg_class t ON t.oid = ix.indrelid
+      JOIN pg_class i ON i.oid = ix.indexrelid
+      JOIN pg_namespace n ON n.oid = t.relnamespace
+      WHERE n.nspname NOT IN ('pg_catalog', 'information_schema') -- @qd_introspection
+    `),
+  ]);
+
+  const indexesByTable = new Map<string, IndexRow[]>();
0, + })), + })); + + return Statistics.statsModeFromExport(stats); +} diff --git a/src/runner.ts b/src/runner.ts index 111c31c..5b85414 100644 --- a/src/runner.ts +++ b/src/runner.ts @@ -39,6 +39,7 @@ import { env } from "./env.ts"; import { connectToSource } from "./sql/postgresjs.ts"; import { parse } from "@libpg-query/parser"; import { Connectable } from "./sync/connectable.ts"; +import { buildStatsFromDatabase } from "./build-stats.ts"; export class Runner { private readonly seenQueries = new Set(); @@ -66,7 +67,16 @@ export class Runner { ignoredQueryHashes?: string[]; }) { const db = connectToSource(options.postgresUrl); - const statisticsMode = Runner.decideStatisticsMode(options.statisticsPath); + let statisticsMode: StatisticsMode; + if (options.statisticsPath) { + statisticsMode = Runner.decideStatisticsMode(options.statisticsPath); + } else { + // Run ANALYZE so pg_class.relpages and pg_statistic reflect the + // current data. Without this, relpages can be 0 after fresh + // inserts and column stats depend on autovacuum timing. 
+ await db.exec("ANALYZE"); + statisticsMode = await buildStatsFromDatabase(db); + } const stats = await Statistics.fromPostgres(db, statisticsMode); const existingIndexes = await stats.getExistingIndexes(); const optimizer = new IndexOptimizer(db, stats, existingIndexes); @@ -461,19 +471,12 @@ export class Runner { console.log(); } - private static decideStatisticsMode(path?: string): StatisticsMode { - if (path) { - const data = Runner.readStatisticsFile(path); - return Statistics.statsModeFromExport(data); - } else { - return Statistics.defaultStatsMode; - } - } - private static readStatisticsFile(path: string): ExportedStats[] { + private static decideStatisticsMode(path: string): StatisticsMode { const data = readFileSync(path); const json = JSON.parse(new TextDecoder().decode(data)); - return ExportedStats.array().parse(json); + return Statistics.statsModeFromExport(ExportedStats.array().parse(json)); } + } export type QueryProcessResult =