Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
254 changes: 254 additions & 0 deletions src/build-stats.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
import { test, expect, beforeAll, afterAll } from "vitest";
import { PostgreSqlContainer, type StartedPostgreSqlContainer } from "@testcontainers/postgresql";
import { buildStatsFromDatabase } from "./build-stats.ts";
import { connectToSource } from "./sql/postgresjs.ts";
import { Connectable } from "./sync/connectable.ts";
import {
IndexOptimizer,
PostgresQueryBuilder,
Statistics,
type Postgres,
} from "@query-doctor/core";

let pg: StartedPostgreSqlContainer;
let db: Postgres;

// Boot a throwaway Postgres 17 container with autovacuum disabled so a
// background vacuum/analyze cannot rewrite statistics mid-test, then wire
// the shared `db` handle to it.
beforeAll(async () => {
  const container = new PostgreSqlContainer("postgres:17").withCommand([
    "-c",
    "autovacuum=off",
  ]);
  pg = await container.start();
  db = connectToSource(Connectable.fromString(pg.getConnectionUri()));
}, 30_000);

// Tear down in dependency order: close the client before stopping the container.
// NOTE(review): the `as unknown as` cast suggests `Postgres` does not declare
// close() in its public type — consider adding it upstream and dropping the cast.
afterAll(async () => {
  await (db as unknown as { close(): Promise<void> }).close();
  await pg.stop();
});

/**
 * Reset the database to a known state: drop every table in the `public`
 * schema, then run `sql` to build the schema (and seed data) for one test.
 *
 * @param sql - DDL/DML executed after the wipe; may end with ANALYZE.
 */
async function freshSchema(sql: string) {
  // Drop all user tables so each test starts clean
  const tables = await db.exec<{ t: string }>(
    `SELECT tablename AS "t" FROM pg_tables WHERE schemaname = 'public'`,
  );
  for (const { t } of tables) {
    // Double embedded double quotes so the quoted identifier stays valid
    // even for pathological table names containing `"`.
    await db.exec(`DROP TABLE IF EXISTS "${t.replaceAll('"', '""')}" CASCADE`);
  }
  await db.exec(sql);
}

// A 1,000-row table is below the threshold: exported reltuples is pinned to
// 10,000 while relpages keeps its real on-disk value (and index pages are
// clamped to >= 1).
test("sets reltuples to 10,000 for tables below threshold, preserves real relpages", async () => {
  await freshSchema(`
    CREATE TABLE users(id serial PRIMARY KEY, name text, email text);
    CREATE INDEX users_email_idx ON users(email);
    INSERT INTO users (name, email)
    SELECT 'user_' || i, 'user_' || i || '@example.com'
    FROM generate_series(1, 1000) AS i;
    ANALYZE;
  `);

  const mode = await buildStatsFromDatabase(db);

  expect(mode.kind).toBe("fromStatisticsExport");
  if (mode.kind !== "fromStatisticsExport") throw new Error("unreachable");

  const tableStats = mode.stats.find((entry) => entry.tableName === "users");
  expect(tableStats).toBeDefined();
  expect(tableStats!.reltuples).toBe(10_000);
  expect(tableStats!.relpages).toBeGreaterThan(1);

  const indexStats = tableStats!.indexes.find(
    (entry) => entry.indexName === "users_email_idx",
  );
  expect(indexStats).toBeDefined();
  expect(indexStats!.relpages).toBeGreaterThanOrEqual(1);
});

// An empty table still exports reltuples=10,000 and a relpages floor of 1,
// so density math never divides by zero.
test("clamps relpages to at least 1 for empty tables", async () => {
  await freshSchema(`
    CREATE TABLE empty_table(id serial PRIMARY KEY, data text);
    ANALYZE;
  `);

  const mode = await buildStatsFromDatabase(db);
  if (mode.kind !== "fromStatisticsExport") throw new Error("unreachable");

  const entry = mode.stats.find((row) => row.tableName === "empty_table");
  expect(entry).toBeDefined();
  expect(entry!.reltuples).toBe(10_000);
  expect(entry!.relpages).toBeGreaterThanOrEqual(1);
});

// Because reltuples is fixed at 10,000 but relpages is real, the implied
// tuples-per-page density should land in a plausible band no matter how
// many rows actually exist.
test("density stays realistic regardless of actual row count", async () => {
  await freshSchema(`
    CREATE TABLE orders(id serial PRIMARY KEY, user_id int, total numeric);
    CREATE INDEX orders_user_id_idx ON orders(user_id);
    INSERT INTO orders (user_id, total)
    SELECT (random() * 1000)::int, random() * 100
    FROM generate_series(1, 10000);
    ANALYZE;
  `);

  const mode = await buildStatsFromDatabase(db);
  if (mode.kind !== "fromStatisticsExport") throw new Error("unreachable");

  const entry = mode.stats.find((row) => row.tableName === "orders");
  expect(entry).toBeDefined();

  const tuplesPerPage = entry!.reltuples / entry!.relpages;
  expect(tuplesPerPage).toBeLessThan(500);
  expect(tuplesPerPage).toBeGreaterThan(10);
});

// Indexes must be attached to their owning table only: products gets its
// two secondary indexes plus the pkey; categories gets just its pkey and
// nothing leaked from products.
test("groups indexes by their parent table", async () => {
  await freshSchema(`
    CREATE TABLE products(id serial PRIMARY KEY, name text, price numeric);
    CREATE INDEX products_name_idx ON products(name);
    CREATE INDEX products_price_idx ON products(price);
    CREATE TABLE categories(id serial PRIMARY KEY, label text);
    ANALYZE;
  `);

  const mode = await buildStatsFromDatabase(db);
  if (mode.kind !== "fromStatisticsExport") throw new Error("unreachable");

  const productEntry = mode.stats.find((row) => row.tableName === "products");
  expect(productEntry).toBeDefined();
  const productIndexes = productEntry!.indexes
    .map((idx) => idx.indexName)
    .sort();
  expect(productIndexes).toContain("products_name_idx");
  expect(productIndexes).toContain("products_price_idx");
  expect(productIndexes).toContain("products_pkey");

  const categoryEntry = mode.stats.find(
    (row) => row.tableName === "categories",
  );
  expect(categoryEntry).toBeDefined();
  const categoryIndexes = categoryEntry!.indexes.map((idx) => idx.indexName);
  expect(categoryIndexes).toContain("categories_pkey");
  expect(categoryIndexes).not.toContain("products_name_idx");
});

// End-to-end through the optimizer: even with a single seeded row the
// planner should report the fixed 10,000-row estimate.
test("planner estimates 10,000 rows with only 1 row seeded", async () => {
  await freshSchema(`
    CREATE TABLE widgets(id serial PRIMARY KEY, user_id uuid, name text);
    INSERT INTO widgets (user_id, name) VALUES ('00000000-0000-0000-0000-000000000001', 'w1');
    ANALYZE;
  `);

  const mode = await buildStatsFromDatabase(db);
  const stats = await Statistics.fromPostgres(db, mode);
  const optimizer = new IndexOptimizer(
    db,
    stats,
    await stats.getExistingIndexes(),
  );

  const plan = await optimizer.testQueryWithStats(
    new PostgresQueryBuilder("SELECT * FROM widgets"),
  );

  const rowEstimate = (plan.Plan as Record<string, unknown>)["Plan Rows"];
  expect(rowEstimate).toBe(10_000);
});

// When seeded rows happen to equal the fixed reltuples, the estimate is
// still 10,000 — logs capture pg_class before/after ANALYZE for debugging.
test("planner estimates 10,000 rows with 10,000 rows seeded", async () => {
  await freshSchema(`
    CREATE TABLE widgets(id serial PRIMARY KEY, user_id uuid, name text);
    INSERT INTO widgets (user_id, name)
    SELECT gen_random_uuid(), 'widget_' || i
    FROM generate_series(1, 10000) AS i;
  `);

  type RelStats = { relpages: number; reltuples: number };
  const pre = await db.exec<RelStats>(
    `SELECT relpages, reltuples::int FROM pg_class WHERE relname = 'widgets' AND relkind = 'r'`,
  );
  console.log(`10K: BEFORE ANALYZE: relpages=${pre[0].relpages}, reltuples=${pre[0].reltuples}`);

  await db.exec("ANALYZE widgets");

  const post = await db.exec<RelStats>(
    `SELECT relpages, reltuples::int FROM pg_class WHERE relname = 'widgets' AND relkind = 'r'`,
  );
  console.log(`10K: AFTER ANALYZE: relpages=${post[0].relpages}, reltuples=${post[0].reltuples}`);

  const mode = await buildStatsFromDatabase(db);
  const stats = await Statistics.fromPostgres(db, mode);
  const optimizer = new IndexOptimizer(
    db,
    stats,
    await stats.getExistingIndexes(),
  );

  const plan = await optimizer.testQueryWithStats(
    new PostgresQueryBuilder("SELECT * FROM widgets"),
  );

  const rowEstimate = (plan.Plan as Record<string, unknown>)["Plan Rows"];
  expect(rowEstimate).toBe(10_000);
});

// Even at 5x the fixed reltuples (50,000 real rows), the planner should
// still report exactly 10,000 because the export pairs the fixed reltuples
// with the real relpages.
test("planner estimates 10,000 rows even with 50,000 rows seeded", async () => {
  await freshSchema(`
    CREATE TABLE widgets(id serial PRIMARY KEY, user_id uuid, name text);
    INSERT INTO widgets (user_id, name)
    SELECT gen_random_uuid(), 'widget_' || i
    FROM generate_series(1, 50000) AS i;
    ANALYZE;
  `);

  const mode = await buildStatsFromDatabase(db);
  const stats = await Statistics.fromPostgres(db, mode);
  const existingIndexes = await stats.getExistingIndexes();
  const optimizer = new IndexOptimizer(db, stats, existingIndexes);

  const builder = new PostgresQueryBuilder("SELECT * FROM widgets");
  const plan = await optimizer.testQueryWithStats(builder);

  const estimatedRows = (plan.Plan as Record<string, unknown>)["Plan Rows"];
  // Fixed copy-paste label: this is the 50K scenario, not 10K.
  console.log(`50K: planner estimatedRows=${estimatedRows}`);
  expect(estimatedRows).toBe(10_000);
});

// Regression demonstration: the old fromAssumption default (relpages=1)
// makes the planner extrapolate density across the real page count, blowing
// the estimate up from 10,000 to 740,000.
test("BUG: fromAssumption(relpages=1) inflates estimates with real data", async () => {
  await freshSchema(`
    CREATE TABLE widgets(id serial PRIMARY KEY, user_id uuid, name text);
    INSERT INTO widgets (user_id, name)
    SELECT gen_random_uuid(), 'widget_' || i
    FROM generate_series(1, 10000) AS i;
  `);

  type RelStats = { relpages: number; reltuples: number };
  const pre = await db.exec<RelStats>(
    `SELECT relpages, reltuples::int FROM pg_class WHERE relname = 'widgets' AND relkind = 'r'`,
  );
  console.log(`BUG: BEFORE ANALYZE: relpages=${pre[0].relpages}, reltuples=${pre[0].reltuples}`);

  await db.exec("ANALYZE widgets");

  const post = await db.exec<RelStats>(
    `SELECT relpages, reltuples::int FROM pg_class WHERE relname = 'widgets' AND relkind = 'r'`,
  );
  console.log(`BUG: AFTER ANALYZE: relpages=${post[0].relpages}, reltuples=${post[0].reltuples}`);

  const brokenMode = Statistics.defaultStatsMode;
  if (brokenMode.kind === "fromAssumption") {
    console.log(`BUG: override writes: reltuples=${brokenMode.reltuples}, relpages=${brokenMode.relpages}`);
  }
  const stats = await Statistics.fromPostgres(db, brokenMode);
  const optimizer = new IndexOptimizer(
    db,
    stats,
    await stats.getExistingIndexes(),
  );

  const plan = await optimizer.testQueryWithStats(
    new PostgresQueryBuilder("SELECT * FROM widgets"),
  );

  const estimatedRows = (plan.Plan as Record<string, unknown>)["Plan Rows"];
  console.log(`BUG: planner estimatedRows=${estimatedRows}`);

  // With relpages=74 on disk and the override setting relpages=1,
  // PostgreSQL computes: density = 10000/1 = 10000 tuples/page,
  // then estimates tuples = 74 * 10000 = 740,000 instead of 10,000.
  expect(estimatedRows).toBe(740_000);
});

// columns must be exported as null so the import path leaves the
// pg_statistic rows produced by ANALYZE untouched.
test("leaves columns null so ANALYZE pg_statistic entries persist", async () => {
  await freshSchema(`
    CREATE TABLE items(id serial PRIMARY KEY, label text);
    INSERT INTO items (label) SELECT 'item_' || i FROM generate_series(1, 100) AS i;
    ANALYZE;
  `);

  const mode = await buildStatsFromDatabase(db);
  if (mode.kind !== "fromStatisticsExport") throw new Error("unreachable");

  const entry = mode.stats.find((row) => row.tableName === "items");
  expect(entry).toBeDefined();
  expect(entry!.columns).toBeNull();
});
92 changes: 92 additions & 0 deletions src/build-stats.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import {
type Postgres,
Statistics,
type StatisticsMode,
} from "@query-doctor/core";

const DEFAULT_RELTUPLES = 10_000;

/**
* Build a `fromStatisticsExport` stats mode from the live database.
*
* PostgreSQL's planner ignores `pg_class.relpages` for tables with data on
* disk — it reads the actual page count via `RelationGetNumberOfBlocks()`.
* It then estimates tuples as:
*
* estimated_tuples = actual_pages × pg_class.reltuples ÷ pg_class.relpages
*
* The old `fromAssumption` default (reltuples=10 000, relpages=1) causes a
* massive inflation when tables have real data (e.g. 167 pages → 1.67 M
* estimated tuples).
*
* By reading the real `relpages` from pg_class (after ANALYZE) and pairing
* it with a correct reltuples, the formula produces the correct estimate
* regardless of actual data volume. Column-level statistics (`pg_statistic`)
* are left untouched — ANALYZE already populated them.
*
* All tables are assumed to have 10,000 rows regardless of actual data.
*/
export async function buildStatsFromDatabase(
  db: Postgres,
): Promise<StatisticsMode> {
  type TableRow = {
    tableName: string;
    schemaName: string;
    relpages: number;
    relallvisible: number;
  };
  type IndexRow = TableRow & { indexName: string; reltuples: number };

  // Fetch table-level and index-level pg_class rows concurrently; system
  // schemas are excluded in both queries.
  const tableRowsPromise = db.exec<TableRow>(`
    SELECT c.relname AS "tableName",
           n.nspname AS "schemaName",
           c.relpages::int AS "relpages",
           c.relallvisible::int AS "relallvisible"
    FROM pg_class c
    JOIN pg_namespace n ON n.oid = c.relnamespace
    WHERE c.relkind = 'r'
      AND n.nspname NOT IN ('pg_catalog', 'information_schema') -- @qd_introspection
  `);
  const indexRowsPromise = db.exec<IndexRow>(`
    SELECT t.relname AS "tableName",
           n.nspname AS "schemaName",
           i.relname AS "indexName",
           i.reltuples::real AS "reltuples",
           i.relpages::int AS "relpages",
           i.relallvisible::int AS "relallvisible"
    FROM pg_index ix
    JOIN pg_class t ON t.oid = ix.indrelid
    JOIN pg_class i ON i.oid = ix.indexrelid
    JOIN pg_namespace n ON n.oid = t.relnamespace
    WHERE n.nspname NOT IN ('pg_catalog', 'information_schema') -- @qd_introspection
  `);
  const [tables, indexes] = await Promise.all([
    tableRowsPromise,
    indexRowsPromise,
  ]);

  // Bucket index rows under their owning table, keyed "schema.table".
  const indexesByTable = new Map<string, IndexRow[]>();
  for (const row of indexes) {
    const key = `${row.schemaName}.${row.tableName}`;
    const bucket = indexesByTable.get(key);
    if (bucket) {
      bucket.push(row);
    } else {
      indexesByTable.set(key, [row]);
    }
  }

  const stats = tables.map((table) => {
    const ownIndexes =
      indexesByTable.get(`${table.schemaName}.${table.tableName}`) ?? [];
    return {
      tableName: table.tableName,
      schemaName: table.schemaName,
      // Fixed planner-visible row count (see doc comment above).
      reltuples: DEFAULT_RELTUPLES,
      // Floor of 1 so empty tables never export zero pages.
      relpages: Math.max(1, table.relpages),
      relallvisible: table.relallvisible ?? 0,
      // null => keep the pg_statistic entries ANALYZE already wrote.
      columns: null,
      indexes: ownIndexes.map((idx) => ({
        indexName: idx.indexName,
        relpages: Math.max(1, idx.relpages),
        reltuples: idx.reltuples,
        relallvisible: idx.relallvisible ?? 0,
      })),
    };
  });

  return Statistics.statsModeFromExport(stats);
}
25 changes: 14 additions & 11 deletions src/runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import { env } from "./env.ts";
import { connectToSource } from "./sql/postgresjs.ts";
import { parse } from "@libpg-query/parser";
import { Connectable } from "./sync/connectable.ts";
import { buildStatsFromDatabase } from "./build-stats.ts";

export class Runner {
private readonly seenQueries = new Set<string>();
Expand Down Expand Up @@ -66,7 +67,16 @@ export class Runner {
ignoredQueryHashes?: string[];
}) {
const db = connectToSource(options.postgresUrl);
const statisticsMode = Runner.decideStatisticsMode(options.statisticsPath);
let statisticsMode: StatisticsMode;
if (options.statisticsPath) {
statisticsMode = Runner.decideStatisticsMode(options.statisticsPath);
} else {
// Run ANALYZE so pg_class.relpages and pg_statistic reflect the
// current data. Without this, relpages can be 0 after fresh
// inserts and column stats depend on autovacuum timing.
await db.exec("ANALYZE");
statisticsMode = await buildStatsFromDatabase(db);
}
const stats = await Statistics.fromPostgres(db, statisticsMode);
const existingIndexes = await stats.getExistingIndexes();
const optimizer = new IndexOptimizer(db, stats, existingIndexes);
Expand Down Expand Up @@ -461,19 +471,12 @@ export class Runner {
console.log();
}

private static decideStatisticsMode(path?: string): StatisticsMode {
if (path) {
const data = Runner.readStatisticsFile(path);
return Statistics.statsModeFromExport(data);
} else {
return Statistics.defaultStatsMode;
}
}
private static readStatisticsFile(path: string): ExportedStats[] {
private static decideStatisticsMode(path: string): StatisticsMode {
const data = readFileSync(path);
const json = JSON.parse(new TextDecoder().decode(data));
return ExportedStats.array().parse(json);
return Statistics.statsModeFromExport(ExportedStats.array().parse(json));
}

}

export type QueryProcessResult =
Expand Down
Loading