From 340ce60a549680eb0f3d46273a59c2407b1ebc8d Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Fri, 13 Mar 2026 00:09:09 -0600 Subject: [PATCH 1/8] refactor: domain error hierarchy replacing ad-hoc error handling (ROADMAP 3.8) Add structured domain errors (CodegraphError base + 7 subclasses) to replace the mix of process.exit(1), throw new Error, and console.error scattered across library code. - New src/errors.js with ParseError, DbError, ConfigError, ResolutionError, EngineError, AnalysisError, BoundaryError - Library code throws domain errors instead of calling process.exit(1) - CLI top-level catch formats CodegraphError with [CODE] prefix - MCP catch returns structured { isError, code } responses - CLI commands use parseAsync() so async errors propagate - CI gate commands (check, manifesto) use process.exitCode instead of exit - All error classes exported from public API (src/index.js) Impact: 52 functions changed, 215 affected --- src/ast-analysis/shared.js | 9 ++-- src/batch.js | 3 +- src/cli.js | 8 +++- src/cli/commands/ast.js | 5 +- src/cli/commands/batch.js | 9 ++-- src/cli/commands/check.js | 11 +++-- src/cli/commands/co-change.js | 5 +- src/cli/commands/registry.js | 4 +- src/cli/commands/snapshot.js | 54 +++++++-------------- src/cli/commands/triage.js | 12 ++--- src/cli/index.js | 6 +-- src/commands/check.js | 10 ++-- src/commands/manifesto.js | 6 +-- src/db/connection.js | 8 ++-- src/db/query-builder.js | 11 +++-- src/db/repository/nodes.js | 7 ++- src/embedder.js | 40 ++++++++-------- src/errors.js | 78 +++++++++++++++++++++++++++++++ src/index.js | 11 +++++ src/mcp/server.js | 20 ++++---- src/native.js | 4 +- src/snapshot.js | 11 +++-- src/watcher.js | 4 +- tests/unit/db.test.js | 27 +++++------ tests/unit/errors.test.js | 69 +++++++++++++++++++++++++++ tests/unit/prompt-install.test.js | 35 ++++++++------ 26 files changed, 316 insertions(+), 151 deletions(-) create mode 100644 src/errors.js create mode 
100644 tests/unit/errors.test.js diff --git a/src/ast-analysis/shared.js b/src/ast-analysis/shared.js index 6d7ed6dc..f5d8e0be 100644 --- a/src/ast-analysis/shared.js +++ b/src/ast-analysis/shared.js @@ -2,6 +2,7 @@ * Shared utilities for AST analysis modules (complexity, CFG, dataflow, AST nodes). */ +import { ConfigError } from '../errors.js'; import { LANGUAGE_REGISTRY } from '../parser.js'; // ─── Generic Rule Factory ───────────────────────────────────────────────── @@ -18,7 +19,7 @@ export function makeRules(defaults, overrides, label) { const validKeys = new Set(Object.keys(defaults)); for (const key of Object.keys(overrides)) { if (!validKeys.has(key)) { - throw new Error(`${label} rules: unknown key "${key}"`); + throw new ConfigError(`${label} rules: unknown key "${key}"`); } } return { ...defaults, ...overrides }; @@ -61,10 +62,10 @@ export const CFG_DEFAULTS = { export function makeCfgRules(overrides) { const rules = makeRules(CFG_DEFAULTS, overrides, 'CFG'); if (!(rules.functionNodes instanceof Set) || rules.functionNodes.size === 0) { - throw new Error('CFG rules: functionNodes must be a non-empty Set'); + throw new ConfigError('CFG rules: functionNodes must be a non-empty Set'); } if (!(rules.forNodes instanceof Set)) { - throw new Error('CFG rules: forNodes must be a Set'); + throw new ConfigError('CFG rules: forNodes must be a Set'); } return rules; } @@ -136,7 +137,7 @@ export const DATAFLOW_DEFAULTS = { export function makeDataflowRules(overrides) { const rules = makeRules(DATAFLOW_DEFAULTS, overrides, 'Dataflow'); if (!(rules.functionNodes instanceof Set) || rules.functionNodes.size === 0) { - throw new Error('Dataflow rules: functionNodes must be a non-empty Set'); + throw new ConfigError('Dataflow rules: functionNodes must be a non-empty Set'); } return rules; } diff --git a/src/batch.js b/src/batch.js index cdb25dfc..fb4ce88d 100644 --- a/src/batch.js +++ b/src/batch.js @@ -7,6 +7,7 @@ import { complexityData } from './complexity.js'; import 
{ dataflowData } from './dataflow.js'; +import { ConfigError } from './errors.js'; import { flowData } from './flow.js'; import { contextData, @@ -53,7 +54,7 @@ export const BATCH_COMMANDS = { export function batchData(command, targets, customDbPath, opts = {}) { const entry = BATCH_COMMANDS[command]; if (!entry) { - throw new Error( + throw new ConfigError( `Unknown batch command "${command}". Valid commands: ${Object.keys(BATCH_COMMANDS).join(', ')}`, ); } diff --git a/src/cli.js b/src/cli.js index 72e9ced7..6318f0e4 100644 --- a/src/cli.js +++ b/src/cli.js @@ -1,8 +1,14 @@ #!/usr/bin/env node import { run } from './cli/index.js'; +import { CodegraphError } from './errors.js'; run().catch((err) => { - console.error(`codegraph: fatal error — ${err.message || err}`); + if (err instanceof CodegraphError) { + console.error(`codegraph [${err.code}]: ${err.message}`); + if (err.file) console.error(` file: ${err.file}`); + } else { + console.error(`codegraph: fatal error — ${err.message || err}`); + } process.exit(1); }); diff --git a/src/cli/commands/ast.js b/src/cli/commands/ast.js index 92140a33..cc9124b0 100644 --- a/src/cli/commands/ast.js +++ b/src/cli/commands/ast.js @@ -1,3 +1,5 @@ +import { ConfigError } from '../../errors.js'; + export const command = { name: 'ast [pattern]', description: 'Search stored AST nodes (calls, new, string, regex, throw, await) by pattern', @@ -9,8 +11,7 @@ export const command = { async execute([pattern], opts, ctx) { const { AST_NODE_KINDS, astQuery } = await import('../../ast.js'); if (opts.kind && !AST_NODE_KINDS.includes(opts.kind)) { - console.error(`Invalid AST kind "${opts.kind}". Valid: ${AST_NODE_KINDS.join(', ')}`); - process.exit(1); + throw new ConfigError(`Invalid AST kind "${opts.kind}". 
Valid: ${AST_NODE_KINDS.join(', ')}`); } astQuery(pattern, opts.db, { kind: opts.kind, diff --git a/src/cli/commands/batch.js b/src/cli/commands/batch.js index fe75c5c3..7637b5fb 100644 --- a/src/cli/commands/batch.js +++ b/src/cli/commands/batch.js @@ -1,6 +1,7 @@ import fs from 'node:fs'; import { BATCH_COMMANDS, multiBatchData, splitTargets } from '../../batch.js'; import { batch } from '../../commands/batch.js'; +import { ConfigError } from '../../errors.js'; import { EVERY_SYMBOL_KIND } from '../../queries.js'; export const command = { @@ -40,13 +41,13 @@ export const command = { targets = splitTargets(positionalTargets); } } catch (err) { - console.error(`Failed to parse targets: ${err.message}`); - process.exit(1); + throw new ConfigError(`Failed to parse targets: ${err.message}`, { cause: err }); } if (!targets || targets.length === 0) { - console.error('No targets provided. Pass targets as arguments, --from-file, or --stdin.'); - process.exit(1); + throw new ConfigError( + 'No targets provided. Pass targets as arguments, --from-file, or --stdin.', + ); } const batchOpts = { diff --git a/src/cli/commands/check.js b/src/cli/commands/check.js index 4c79fc11..78edb0b9 100644 --- a/src/cli/commands/check.js +++ b/src/cli/commands/check.js @@ -1,3 +1,4 @@ +import { ConfigError } from '../../errors.js'; import { EVERY_SYMBOL_KIND } from '../../queries.js'; export const command = { @@ -27,8 +28,9 @@ export const command = { if (!isDiffMode && !opts.rules) { if (opts.kind && !EVERY_SYMBOL_KIND.includes(opts.kind)) { - console.error(`Invalid kind "${opts.kind}". Valid: ${EVERY_SYMBOL_KIND.join(', ')}`); - process.exit(1); + throw new ConfigError( + `Invalid kind "${opts.kind}". 
Valid: ${EVERY_SYMBOL_KIND.join(', ')}`, + ); } const { manifesto } = await import('../../commands/manifesto.js'); manifesto(opts.db, { @@ -58,8 +60,9 @@ export const command = { if (opts.rules) { if (opts.kind && !EVERY_SYMBOL_KIND.includes(opts.kind)) { - console.error(`Invalid kind "${opts.kind}". Valid: ${EVERY_SYMBOL_KIND.join(', ')}`); - process.exit(1); + throw new ConfigError( + `Invalid kind "${opts.kind}". Valid: ${EVERY_SYMBOL_KIND.join(', ')}`, + ); } const { manifesto } = await import('../../commands/manifesto.js'); manifesto(opts.db, { diff --git a/src/cli/commands/co-change.js b/src/cli/commands/co-change.js index 9118ed0b..ef4885b5 100644 --- a/src/cli/commands/co-change.js +++ b/src/cli/commands/co-change.js @@ -1,3 +1,5 @@ +import { AnalysisError } from '../../errors.js'; + export const command = { name: 'co-change [file]', description: @@ -32,8 +34,7 @@ export const command = { if (opts.json) { console.log(JSON.stringify(result, null, 2)); } else if (result.error) { - console.error(result.error); - process.exit(1); + throw new AnalysisError(result.error); } else { console.log( `\nCo-change analysis complete: ${result.pairsFound} pairs from ${result.commitsScanned} commits (since: ${result.since})\n`, diff --git a/src/cli/commands/registry.js b/src/cli/commands/registry.js index c2181556..9e516d9a 100644 --- a/src/cli/commands/registry.js +++ b/src/cli/commands/registry.js @@ -1,5 +1,6 @@ import fs from 'node:fs'; import path from 'node:path'; +import { ConfigError } from '../../errors.js'; import { listRepos, pruneRegistry, @@ -54,8 +55,7 @@ export const command = { if (removed) { console.log(`Removed "${name}" from registry.`); } else { - console.error(`Repository "${name}" not found in registry.`); - process.exit(1); + throw new ConfigError(`Repository "${name}" not found in registry.`); } }, }, diff --git a/src/cli/commands/snapshot.js b/src/cli/commands/snapshot.js index b2dc3455..8dd0093e 100644 --- a/src/cli/commands/snapshot.js +++ 
b/src/cli/commands/snapshot.js @@ -12,13 +12,8 @@ export const command = { ['--force', 'Overwrite existing snapshot'], ], execute([name], opts, ctx) { - try { - const result = snapshotSave(name, { dbPath: opts.db, force: opts.force }); - console.log(`Snapshot saved: ${result.name} (${ctx.formatSize(result.size)})`); - } catch (err) { - console.error(err.message); - process.exit(1); - } + const result = snapshotSave(name, { dbPath: opts.db, force: opts.force }); + console.log(`Snapshot saved: ${result.name} (${ctx.formatSize(result.size)})`); }, }, { @@ -26,13 +21,8 @@ export const command = { description: 'Restore a snapshot over the current graph database', options: [['-d, --db ', 'Path to graph.db']], execute([name], opts) { - try { - snapshotRestore(name, { dbPath: opts.db }); - console.log(`Snapshot "${name}" restored.`); - } catch (err) { - console.error(err.message); - process.exit(1); - } + snapshotRestore(name, { dbPath: opts.db }); + console.log(`Snapshot "${name}" restored.`); }, }, { @@ -43,23 +33,18 @@ export const command = { ['-j, --json', 'Output as JSON'], ], execute(_args, opts, ctx) { - try { - const snapshots = snapshotList({ dbPath: opts.db }); - if (opts.json) { - console.log(JSON.stringify(snapshots, null, 2)); - } else if (snapshots.length === 0) { - console.log('No snapshots found.'); - } else { - console.log(`Snapshots (${snapshots.length}):\n`); - for (const s of snapshots) { - console.log( - ` ${s.name.padEnd(30)} ${ctx.formatSize(s.size).padStart(10)} ${s.createdAt.toISOString()}`, - ); - } + const snapshots = snapshotList({ dbPath: opts.db }); + if (opts.json) { + console.log(JSON.stringify(snapshots, null, 2)); + } else if (snapshots.length === 0) { + console.log('No snapshots found.'); + } else { + console.log(`Snapshots (${snapshots.length}):\n`); + for (const s of snapshots) { + console.log( + ` ${s.name.padEnd(30)} ${ctx.formatSize(s.size).padStart(10)} ${s.createdAt.toISOString()}`, + ); } - } catch (err) { - 
console.error(err.message); - process.exit(1); } }, }, @@ -68,13 +53,8 @@ export const command = { description: 'Delete a saved snapshot', options: [['-d, --db ', 'Path to graph.db']], execute([name], opts) { - try { - snapshotDelete(name, { dbPath: opts.db }); - console.log(`Snapshot "${name}" deleted.`); - } catch (err) { - console.error(err.message); - process.exit(1); - } + snapshotDelete(name, { dbPath: opts.db }); + console.log(`Snapshot "${name}" deleted.`); }, }, ], diff --git a/src/cli/commands/triage.js b/src/cli/commands/triage.js index eb8946d6..a334475f 100644 --- a/src/cli/commands/triage.js +++ b/src/cli/commands/triage.js @@ -1,3 +1,4 @@ +import { ConfigError } from '../../errors.js'; import { EVERY_SYMBOL_KIND, VALID_ROLES } from '../../queries.js'; export const command = { @@ -46,20 +47,17 @@ export const command = { } if (opts.kind && !EVERY_SYMBOL_KIND.includes(opts.kind)) { - console.error(`Invalid kind "${opts.kind}". Valid: ${EVERY_SYMBOL_KIND.join(', ')}`); - process.exit(1); + throw new ConfigError(`Invalid kind "${opts.kind}". Valid: ${EVERY_SYMBOL_KIND.join(', ')}`); } if (opts.role && !VALID_ROLES.includes(opts.role)) { - console.error(`Invalid role "${opts.role}". Valid: ${VALID_ROLES.join(', ')}`); - process.exit(1); + throw new ConfigError(`Invalid role "${opts.role}". 
Valid: ${VALID_ROLES.join(', ')}`); } let weights; if (opts.weights) { try { weights = JSON.parse(opts.weights); - } catch { - console.error('Invalid --weights JSON'); - process.exit(1); + } catch (err) { + throw new ConfigError('Invalid --weights JSON', { cause: err }); } } const { triage } = await import('../../commands/triage.js'); diff --git a/src/cli/index.js b/src/cli/index.js index 83fecb98..52936b40 100644 --- a/src/cli/index.js +++ b/src/cli/index.js @@ -2,6 +2,7 @@ import fs from 'node:fs'; import path from 'node:path'; import { pathToFileURL } from 'node:url'; import { Command } from 'commander'; +import { ConfigError } from '../errors.js'; import { setVerbose } from '../logger.js'; import { checkForUpdates, printUpdateNotification } from '../update-check.js'; import { applyQueryOpts, config, formatSize, resolveNoTests } from './shared/options.js'; @@ -68,8 +69,7 @@ function registerCommand(parent, def) { if (def.validate) { const err = def.validate(args, opts, ctx); if (err) { - console.error(err); - process.exit(1); + throw new ConfigError(err); } } @@ -112,7 +112,7 @@ async function discoverCommands() { export async function run() { await discoverCommands(); - program.parse(); + await program.parseAsync(); } export { program, registerCommand, ctx }; diff --git a/src/commands/check.js b/src/commands/check.js index b3ae6d1b..114b09f6 100644 --- a/src/commands/check.js +++ b/src/commands/check.js @@ -1,19 +1,19 @@ import { checkData } from '../check.js'; +import { AnalysisError } from '../errors.js'; import { outputResult } from '../infrastructure/result-formatter.js'; /** - * CLI formatter — prints check results and exits with code 1 on failure. + * CLI formatter — prints check results and sets exitCode 1 on failure. 
*/ export function check(customDbPath, opts = {}) { const data = checkData(customDbPath, opts); if (data.error) { - console.error(data.error); - process.exit(1); + throw new AnalysisError(data.error); } if (outputResult(data, null, opts)) { - if (!data.passed) process.exit(1); + if (!data.passed) process.exitCode = 1; return; } @@ -77,6 +77,6 @@ export function check(customDbPath, opts = {}) { console.log(`\n ${s.total} predicates | ${s.passed} passed | ${s.failed} failed\n`); if (!data.passed) { - process.exit(1); + process.exitCode = 1; } } diff --git a/src/commands/manifesto.js b/src/commands/manifesto.js index 8044f61c..0ccf1d1e 100644 --- a/src/commands/manifesto.js +++ b/src/commands/manifesto.js @@ -2,13 +2,13 @@ import { outputResult } from '../infrastructure/result-formatter.js'; import { manifestoData } from '../manifesto.js'; /** - * CLI formatter — prints manifesto results and exits with code 1 on failure. + * CLI formatter — prints manifesto results and sets exitCode 1 on failure. 
*/ export function manifesto(customDbPath, opts = {}) { const data = manifestoData(customDbPath, opts); if (outputResult(data, 'violations', opts)) { - if (!data.passed) process.exit(1); + if (!data.passed) process.exitCode = 1; return; } @@ -72,6 +72,6 @@ export function manifesto(customDbPath, opts = {}) { console.log(); if (!data.passed) { - process.exit(1); + process.exitCode = 1; } } diff --git a/src/db/connection.js b/src/db/connection.js index beffdc41..d8b34c21 100644 --- a/src/db/connection.js +++ b/src/db/connection.js @@ -1,6 +1,7 @@ import fs from 'node:fs'; import path from 'node:path'; import Database from 'better-sqlite3'; +import { DbError } from '../errors.js'; import { warn } from '../logger.js'; function isProcessAlive(pid) { @@ -78,11 +79,10 @@ export function findDbPath(customPath) { export function openReadonlyOrFail(customPath) { const dbPath = findDbPath(customPath); if (!fs.existsSync(dbPath)) { - console.error( - `No codegraph database found at ${dbPath}.\n` + - `Run "codegraph build" first to analyze your codebase.`, + throw new DbError( + `No codegraph database found at ${dbPath}.\nRun "codegraph build" first to analyze your codebase.`, + { file: dbPath }, ); - process.exit(1); } return new Database(dbPath, { readonly: true }); } diff --git a/src/db/query-builder.js b/src/db/query-builder.js index 29b87686..2f43e754 100644 --- a/src/db/query-builder.js +++ b/src/db/query-builder.js @@ -1,3 +1,4 @@ +import { DbError } from '../errors.js'; import { EVERY_EDGE_KIND } from '../kinds.js'; // ─── Validation Helpers ───────────────────────────────────────────── @@ -12,13 +13,13 @@ const SAFE_SELECT_TOKEN_RE = function validateAlias(alias) { if (!SAFE_ALIAS_RE.test(alias)) { - throw new Error(`Invalid SQL alias: ${alias}`); + throw new DbError(`Invalid SQL alias: ${alias}`); } } function validateColumn(column) { if (!SAFE_COLUMN_RE.test(column)) { - throw new Error(`Invalid SQL column: ${column}`); + throw new DbError(`Invalid SQL column: 
${column}`); } } @@ -26,7 +27,7 @@ function validateOrderBy(clause) { const terms = clause.split(',').map((t) => t.trim()); for (const term of terms) { if (!SAFE_ORDER_TERM_RE.test(term)) { - throw new Error(`Invalid ORDER BY term: ${term}`); + throw new DbError(`Invalid ORDER BY term: ${term}`); } } } @@ -51,14 +52,14 @@ function validateSelectCols(cols) { const tokens = splitTopLevelCommas(cols); for (const token of tokens) { if (!SAFE_SELECT_TOKEN_RE.test(token)) { - throw new Error(`Invalid SELECT expression: ${token}`); + throw new DbError(`Invalid SELECT expression: ${token}`); } } } function validateEdgeKind(edgeKind) { if (!EVERY_EDGE_KIND.includes(edgeKind)) { - throw new Error( + throw new DbError( `Invalid edge kind: ${edgeKind} (expected one of ${EVERY_EDGE_KIND.join(', ')})`, ); } diff --git a/src/db/repository/nodes.js b/src/db/repository/nodes.js index 7fa3d035..af4a3475 100644 --- a/src/db/repository/nodes.js +++ b/src/db/repository/nodes.js @@ -1,3 +1,4 @@ +import { ConfigError } from '../../errors.js'; import { EVERY_SYMBOL_KIND, VALID_ROLES } from '../../kinds.js'; import { NodeQuery } from '../query-builder.js'; import { cachedStmt } from './cached-stmt.js'; @@ -37,10 +38,12 @@ export function findNodesWithFanIn(db, namePattern, opts = {}) { */ export function findNodesForTriage(db, opts = {}) { if (opts.kind && !EVERY_SYMBOL_KIND.includes(opts.kind)) { - throw new Error(`Invalid kind: ${opts.kind} (expected one of ${EVERY_SYMBOL_KIND.join(', ')})`); + throw new ConfigError( + `Invalid kind: ${opts.kind} (expected one of ${EVERY_SYMBOL_KIND.join(', ')})`, + ); } if (opts.role && !VALID_ROLES.includes(opts.role)) { - throw new Error(`Invalid role: ${opts.role} (expected one of ${VALID_ROLES.join(', ')})`); + throw new ConfigError(`Invalid role: ${opts.role} (expected one of ${VALID_ROLES.join(', ')})`); } const kindsToUse = opts.kind ? 
[opts.kind] : ['function', 'method', 'class']; diff --git a/src/embedder.js b/src/embedder.js index 3e03ee1b..f8fbc527 100644 --- a/src/embedder.js +++ b/src/embedder.js @@ -10,6 +10,7 @@ import { openDb, openReadonlyOrFail, } from './db.js'; +import { ConfigError, DbError, EngineError } from './errors.js'; import { info, warn } from './logger.js'; import { normalizeSymbol } from './queries.js'; @@ -123,8 +124,7 @@ function getModelConfig(modelKey) { const key = modelKey || DEFAULT_MODEL; const config = MODELS[key]; if (!config) { - console.error(`Unknown model: ${key}. Available: ${Object.keys(MODELS).join(', ')}`); - process.exit(1); + throw new ConfigError(`Unknown model: ${key}. Available: ${Object.keys(MODELS).join(', ')}`); } return config; } @@ -263,13 +263,14 @@ async function loadTransformers() { if (installed) { try { return await import(pkg); - } catch { - console.error(`\n${pkg} was installed but failed to load. Please check your environment.`); - process.exit(1); + } catch (loadErr) { + throw new EngineError( + `${pkg} was installed but failed to load. Please check your environment.`, + { cause: loadErr }, + ); } } - console.error(`Semantic search requires ${pkg}.\n` + `Install it with: npm install ${pkg}`); - process.exit(1); + throw new EngineError(`Semantic search requires ${pkg}.\nInstall it with: npm install ${pkg}`); } } @@ -304,20 +305,20 @@ async function loadModel(modelKey) { } catch (err) { const msg = err.message || String(err); if (msg.includes('Unauthorized') || msg.includes('401') || msg.includes('gated')) { - console.error( - `\nModel "${config.name}" requires authentication.\n` + + throw new EngineError( + `Model "${config.name}" requires authentication.\n` + `This model is gated on HuggingFace and needs an access token.\n\n` + `Options:\n` + ` 1. Set HF_TOKEN env var: export HF_TOKEN=hf_...\n` + - ` 2. 
Use a public model instead: codegraph embed --model minilm\n`, - ); - } else { - console.error( - `\nFailed to load model "${config.name}": ${msg}\n` + - `Try a different model: codegraph embed --model minilm\n`, + ` 2. Use a public model instead: codegraph embed --model minilm`, + { cause: err }, ); } - process.exit(1); + throw new EngineError( + `Failed to load model "${config.name}": ${msg}\n` + + `Try a different model: codegraph embed --model minilm`, + { cause: err }, + ); } activeModel = config.name; info('Model loaded.'); @@ -413,11 +414,10 @@ export async function buildEmbeddings(rootDir, modelKey, customDbPath, options = const dbPath = customDbPath || findDbPath(null); if (!fs.existsSync(dbPath)) { - console.error( - `No codegraph database found at ${dbPath}.\n` + - `Run "codegraph build" first to analyze your codebase.`, + throw new DbError( + `No codegraph database found at ${dbPath}.\nRun "codegraph build" first to analyze your codebase.`, + { file: dbPath }, ); - process.exit(1); } const db = openDb(dbPath); diff --git a/src/errors.js b/src/errors.js new file mode 100644 index 00000000..0a398446 --- /dev/null +++ b/src/errors.js @@ -0,0 +1,78 @@ +/** + * Domain error hierarchy for codegraph. + * + * Library code throws these instead of calling process.exit() or throwing + * bare Error instances. The CLI top-level catch formats them for humans; + * MCP returns structured { isError, code } responses. 
+ */ + +export class CodegraphError extends Error { + /** @type {string} */ + code; + + /** @type {string|undefined} */ + file; + + /** + * @param {string} message + * @param {object} [opts] + * @param {string} [opts.code] + * @param {string} [opts.file] - Related file path, if applicable + * @param {Error} [opts.cause] - Original error that triggered this one + */ + constructor(message, { code = 'CODEGRAPH_ERROR', file, cause } = {}) { + super(message, { cause }); + this.name = 'CodegraphError'; + this.code = code; + this.file = file; + } +} + +export class ParseError extends CodegraphError { + constructor(message, opts = {}) { + super(message, { code: 'PARSE_FAILED', ...opts }); + this.name = 'ParseError'; + } +} + +export class DbError extends CodegraphError { + constructor(message, opts = {}) { + super(message, { code: 'DB_ERROR', ...opts }); + this.name = 'DbError'; + } +} + +export class ConfigError extends CodegraphError { + constructor(message, opts = {}) { + super(message, { code: 'CONFIG_INVALID', ...opts }); + this.name = 'ConfigError'; + } +} + +export class ResolutionError extends CodegraphError { + constructor(message, opts = {}) { + super(message, { code: 'RESOLUTION_FAILED', ...opts }); + this.name = 'ResolutionError'; + } +} + +export class EngineError extends CodegraphError { + constructor(message, opts = {}) { + super(message, { code: 'ENGINE_UNAVAILABLE', ...opts }); + this.name = 'EngineError'; + } +} + +export class AnalysisError extends CodegraphError { + constructor(message, opts = {}) { + super(message, { code: 'ANALYSIS_FAILED', ...opts }); + this.name = 'AnalysisError'; + } +} + +export class BoundaryError extends CodegraphError { + constructor(message, opts = {}) { + super(message, { code: 'BOUNDARY_VIOLATION', ...opts }); + this.name = 'BoundaryError'; + } +} diff --git a/src/index.js b/src/index.js index b798d38a..ae0732ac 100644 --- a/src/index.js +++ b/src/index.js @@ -119,6 +119,17 @@ export { search, searchData, } from 
'./embedder.js'; +// Domain errors +export { + AnalysisError, + BoundaryError, + CodegraphError, + ConfigError, + DbError, + EngineError, + ParseError, + ResolutionError, +} from './errors.js'; // Export (DOT/Mermaid/JSON/GraphML/GraphSON/Neo4j CSV) export { exportDOT, diff --git a/src/mcp/server.js b/src/mcp/server.js index 135c08a2..3a39aed8 100644 --- a/src/mcp/server.js +++ b/src/mcp/server.js @@ -7,6 +7,7 @@ import { createRequire } from 'node:module'; import { findDbPath } from '../db.js'; +import { CodegraphError, ConfigError } from '../errors.js'; import { MCP_MAX_LIMIT } from '../paginate.js'; import { buildToolList } from './tool-registry.js'; import { TOOL_HANDLERS } from './tools/index.js'; @@ -33,11 +34,9 @@ export async function startMCPServer(customDbPath, options = {}) { ListToolsRequestSchema = types.ListToolsRequestSchema; CallToolRequestSchema = types.CallToolRequestSchema; } catch { - console.error( - 'MCP server requires @modelcontextprotocol/sdk.\n' + - 'Install it with: npm install @modelcontextprotocol/sdk', + throw new ConfigError( + 'MCP server requires @modelcontextprotocol/sdk.\nInstall it with: npm install @modelcontextprotocol/sdk', ); - process.exit(1); } // Connect transport FIRST so the server can receive the client's @@ -75,12 +74,12 @@ export async function startMCPServer(customDbPath, options = {}) { const { name, arguments: args } = request.params; try { if (!multiRepo && args.repo) { - throw new Error( + throw new ConfigError( 'Multi-repo access is disabled. Restart with `codegraph mcp --multi-repo` to access other repositories.', ); } if (!multiRepo && name === 'list_repos') { - throw new Error( + throw new ConfigError( 'Multi-repo access is disabled. 
Restart with `codegraph mcp --multi-repo` to list repositories.', ); } @@ -88,12 +87,12 @@ export async function startMCPServer(customDbPath, options = {}) { let dbPath = customDbPath || undefined; if (args.repo) { if (allowedRepos && !allowedRepos.includes(args.repo)) { - throw new Error(`Repository "${args.repo}" is not in the allowed repos list.`); + throw new ConfigError(`Repository "${args.repo}" is not in the allowed repos list.`); } const { resolveRepoDbPath } = await import('../registry.js'); const resolved = resolveRepoDbPath(args.repo); if (!resolved) - throw new Error( + throw new ConfigError( `Repository "${args.repo}" not found in registry or its database is missing.`, ); dbPath = resolved; @@ -117,7 +116,10 @@ export async function startMCPServer(customDbPath, options = {}) { if (result?.content) return result; // pass-through MCP responses return { content: [{ type: 'text', text: JSON.stringify(result, null, 2) }] }; } catch (err) { - return { content: [{ type: 'text', text: `Error: ${err.message}` }], isError: true }; + const code = err instanceof CodegraphError ? err.code : 'UNKNOWN_ERROR'; + const text = + err instanceof CodegraphError ? `[${code}] ${err.message}` : `Error: ${err.message}`; + return { content: [{ type: 'text', text }], isError: true }; } }); diff --git a/src/native.js b/src/native.js index ce435f92..7de86d9a 100644 --- a/src/native.js +++ b/src/native.js @@ -8,6 +8,7 @@ import { createRequire } from 'node:module'; import os from 'node:os'; +import { EngineError } from './errors.js'; let _cached; // undefined = not yet tried, null = failed, object = module let _loadError = null; @@ -101,9 +102,10 @@ export function getNativePackageVersion() { export function getNative() { const mod = loadNative(); if (!mod) { - throw new Error( + throw new EngineError( `Native codegraph-core not available: ${_loadError?.message || 'unknown error'}. 
` + 'Install the platform package or use --engine wasm.', + { cause: _loadError }, ); } return mod; diff --git a/src/snapshot.js b/src/snapshot.js index 43d46b09..a9f80ce9 100644 --- a/src/snapshot.js +++ b/src/snapshot.js @@ -2,6 +2,7 @@ import fs from 'node:fs'; import path from 'node:path'; import Database from 'better-sqlite3'; import { findDbPath } from './db.js'; +import { ConfigError, DbError } from './errors.js'; import { debug } from './logger.js'; const NAME_RE = /^[a-zA-Z0-9_-]+$/; @@ -12,7 +13,7 @@ const NAME_RE = /^[a-zA-Z0-9_-]+$/; */ export function validateSnapshotName(name) { if (!name || !NAME_RE.test(name)) { - throw new Error( + throw new ConfigError( `Invalid snapshot name "${name}". Use only letters, digits, hyphens, and underscores.`, ); } @@ -39,7 +40,7 @@ export function snapshotSave(name, options = {}) { validateSnapshotName(name); const dbPath = options.dbPath || findDbPath(); if (!fs.existsSync(dbPath)) { - throw new Error(`Database not found: ${dbPath}`); + throw new DbError(`Database not found: ${dbPath}`, { file: dbPath }); } const dir = snapshotsDir(dbPath); @@ -47,7 +48,7 @@ export function snapshotSave(name, options = {}) { if (fs.existsSync(dest)) { if (!options.force) { - throw new Error(`Snapshot "${name}" already exists. Use --force to overwrite.`); + throw new DbError(`Snapshot "${name}" already exists. 
Use --force to overwrite.`); } fs.unlinkSync(dest); debug(`Deleted existing snapshot: ${dest}`); @@ -82,7 +83,7 @@ export function snapshotRestore(name, options = {}) { const src = path.join(dir, `${name}.db`); if (!fs.existsSync(src)) { - throw new Error(`Snapshot "${name}" not found at ${src}`); + throw new DbError(`Snapshot "${name}" not found at ${src}`, { file: src }); } // Remove WAL/SHM sidecar files for a clean restore @@ -141,7 +142,7 @@ export function snapshotDelete(name, options = {}) { const target = path.join(dir, `${name}.db`); if (!fs.existsSync(target)) { - throw new Error(`Snapshot "${name}" not found at ${target}`); + throw new DbError(`Snapshot "${name}" not found at ${target}`, { file: target }); } fs.unlinkSync(target); diff --git a/src/watcher.js b/src/watcher.js index 32c80e53..aad62fe0 100644 --- a/src/watcher.js +++ b/src/watcher.js @@ -4,6 +4,7 @@ import { readFileSafe } from './builder.js'; import { appendChangeEvents, buildChangeEvent, diffSymbols } from './change-journal.js'; import { EXTENSIONS, IGNORE_DIRS, normalizePath } from './constants.js'; import { closeDb, getNodeId as getNodeIdQuery, initSchema, openDb } from './db.js'; +import { DbError } from './errors.js'; import { appendJournalEntries } from './journal.js'; import { info, warn } from './logger.js'; import { createParseTreeCache, getActiveEngine, parseFileIncremental } from './parser.js'; @@ -162,8 +163,7 @@ async function updateFile(_db, rootDir, filePath, stmts, engineOpts, cache) { export async function watchProject(rootDir, opts = {}) { const dbPath = path.join(rootDir, '.codegraph', 'graph.db'); if (!fs.existsSync(dbPath)) { - console.error('No graph.db found. Run `codegraph build` first.'); - process.exit(1); + throw new DbError('No graph.db found. 
Run `codegraph build` first.', { file: dbPath }); } const db = openDb(dbPath); diff --git a/tests/unit/db.test.js b/tests/unit/db.test.js index 10fcbcde..63fb5807 100644 --- a/tests/unit/db.test.js +++ b/tests/unit/db.test.js @@ -6,7 +6,7 @@ import fs from 'node:fs'; import os from 'node:os'; import path from 'node:path'; import Database from 'better-sqlite3'; -import { afterAll, beforeAll, describe, expect, it, vi } from 'vitest'; +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; import { closeDb, findDbPath, @@ -195,20 +195,17 @@ describe('build_meta', () => { }); describe('openReadonlyOrFail', () => { - it('exits with error when DB does not exist', () => { - const exitSpy = vi.spyOn(process, 'exit').mockImplementation(() => { - throw new Error('process.exit'); - }); - const stderrSpy = vi.spyOn(console, 'error').mockImplementation(() => {}); - - expect(() => openReadonlyOrFail(path.join(tmpDir, 'nonexistent.db'))).toThrow('process.exit'); - expect(exitSpy).toHaveBeenCalledWith(1); - expect(stderrSpy).toHaveBeenCalled(); - const errorMsg = stderrSpy.mock.calls[0][0]; - expect(errorMsg).toContain('No codegraph database found'); - - exitSpy.mockRestore(); - stderrSpy.mockRestore(); + it('throws DbError when DB does not exist', () => { + expect(() => openReadonlyOrFail(path.join(tmpDir, 'nonexistent.db'))).toThrow( + 'No codegraph database found', + ); + try { + openReadonlyOrFail(path.join(tmpDir, 'nonexistent.db')); + } catch (err) { + expect(err.name).toBe('DbError'); + expect(err.code).toBe('DB_ERROR'); + expect(err.file).toBeDefined(); + } }); it('returns a readonly database when DB exists', () => { diff --git a/tests/unit/errors.test.js b/tests/unit/errors.test.js new file mode 100644 index 00000000..3714df5b --- /dev/null +++ b/tests/unit/errors.test.js @@ -0,0 +1,69 @@ +/** + * Unit tests for the domain error hierarchy (src/errors.js). 
+ */ + +import { describe, expect, it } from 'vitest'; +import { + AnalysisError, + BoundaryError, + CodegraphError, + ConfigError, + DbError, + EngineError, + ParseError, + ResolutionError, +} from '../../src/errors.js'; + +describe('CodegraphError', () => { + it('sets defaults', () => { + const err = new CodegraphError('boom'); + expect(err).toBeInstanceOf(Error); + expect(err).toBeInstanceOf(CodegraphError); + expect(err.name).toBe('CodegraphError'); + expect(err.code).toBe('CODEGRAPH_ERROR'); + expect(err.message).toBe('boom'); + expect(err.file).toBeUndefined(); + expect(err.cause).toBeUndefined(); + }); + + it('accepts opts', () => { + const cause = new Error('root'); + const err = new CodegraphError('msg', { code: 'CUSTOM', file: 'foo.js', cause }); + expect(err.code).toBe('CUSTOM'); + expect(err.file).toBe('foo.js'); + expect(err.cause).toBe(cause); + }); +}); + +describe('subclasses', () => { + const cases = [ + { Class: ParseError, name: 'ParseError', code: 'PARSE_FAILED' }, + { Class: DbError, name: 'DbError', code: 'DB_ERROR' }, + { Class: ConfigError, name: 'ConfigError', code: 'CONFIG_INVALID' }, + { Class: ResolutionError, name: 'ResolutionError', code: 'RESOLUTION_FAILED' }, + { Class: EngineError, name: 'EngineError', code: 'ENGINE_UNAVAILABLE' }, + { Class: AnalysisError, name: 'AnalysisError', code: 'ANALYSIS_FAILED' }, + { Class: BoundaryError, name: 'BoundaryError', code: 'BOUNDARY_VIOLATION' }, + ]; + + for (const { Class, name, code } of cases) { + it(`${name} has correct defaults and instanceof chain`, () => { + const err = new Class('test'); + expect(err).toBeInstanceOf(Error); + expect(err).toBeInstanceOf(CodegraphError); + expect(err).toBeInstanceOf(Class); + expect(err.name).toBe(name); + expect(err.code).toBe(code); + expect(err.message).toBe('test'); + }); + + it(`${name} forwards file and cause`, () => { + const cause = new Error('root'); + const err = new Class('msg', { file: 'bar.js', cause }); + expect(err.file).toBe('bar.js'); + 
expect(err.cause).toBe(cause); + // code should stay as the subclass default + expect(err.code).toBe(code); + }); + } +}); diff --git a/tests/unit/prompt-install.test.js b/tests/unit/prompt-install.test.js index d7583508..6a36c2de 100644 --- a/tests/unit/prompt-install.test.js +++ b/tests/unit/prompt-install.test.js @@ -34,7 +34,7 @@ describe('loadTransformers install prompt', () => { vi.restoreAllMocks(); }); - test('non-TTY: prints error and exits without prompting', async () => { + test('non-TTY: throws EngineError without prompting', async () => { process.stdin.isTTY = undefined; const rlFactory = vi.fn(); @@ -46,15 +46,18 @@ describe('loadTransformers install prompt', () => { const { embed } = await import('../../src/embedder.js'); - await expect(embed(['test'], 'minilm')).rejects.toThrow('process.exit(1)'); - expect(errorSpy).toHaveBeenCalledWith( - expect.stringContaining('Semantic search requires @huggingface/transformers'), + await expect(embed(['test'], 'minilm')).rejects.toThrow( + 'Semantic search requires @huggingface/transformers', ); + await expect(embed(['test'], 'minilm')).rejects.toMatchObject({ + name: 'EngineError', + code: 'ENGINE_UNAVAILABLE', + }); // readline should NOT have been called — no prompt in non-TTY expect(rlFactory).not.toHaveBeenCalled(); }); - test('TTY + user declines: prints error and exits', async () => { + test('TTY + user declines: throws EngineError', async () => { process.stdin.isTTY = true; vi.doMock('node:readline', () => ({ @@ -70,13 +73,16 @@ describe('loadTransformers install prompt', () => { const { embed } = await import('../../src/embedder.js'); - await expect(embed(['test'], 'minilm')).rejects.toThrow('process.exit(1)'); - expect(errorSpy).toHaveBeenCalledWith( - expect.stringContaining('Semantic search requires @huggingface/transformers'), + await expect(embed(['test'], 'minilm')).rejects.toThrow( + 'Semantic search requires @huggingface/transformers', ); + await expect(embed(['test'], 
'minilm')).rejects.toMatchObject({ + name: 'EngineError', + code: 'ENGINE_UNAVAILABLE', + }); }); - test('TTY + user accepts but npm install fails: prints error and exits', async () => { + test('TTY + user accepts but npm install fails: throws EngineError', async () => { process.stdin.isTTY = true; const execMock = vi.fn(() => { @@ -95,15 +101,18 @@ describe('loadTransformers install prompt', () => { const { embed } = await import('../../src/embedder.js'); - await expect(embed(['test'], 'minilm')).rejects.toThrow('process.exit(1)'); + await expect(embed(['test'], 'minilm')).rejects.toThrow( + 'Semantic search requires @huggingface/transformers', + ); + await expect(embed(['test'], 'minilm')).rejects.toMatchObject({ + name: 'EngineError', + code: 'ENGINE_UNAVAILABLE', + }); expect(execMock).toHaveBeenCalledWith( 'npm', ['install', '@huggingface/transformers'], expect.objectContaining({ stdio: 'inherit', timeout: 300_000 }), ); - expect(errorSpy).toHaveBeenCalledWith( - expect.stringContaining('Semantic search requires @huggingface/transformers'), - ); }); test('TTY + install succeeds: retries import and loads module', async () => { From d2794cb98da6352473d4dc2574cbbf1a953d3df8 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Fri, 13 Mar 2026 00:16:48 -0600 Subject: [PATCH 2/8] fix: address Greptile review feedback on PR #431 - Use expect.assertions(4) in db.test.js to prevent silent assertion skips - Change snapshot "already exists" error from DbError to ConfigError (it's a missing --force flag, not a database failure) Impact: 1 functions changed, 0 affected --- src/snapshot.js | 2 +- tests/unit/db.test.js | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/snapshot.js b/src/snapshot.js index a9f80ce9..0ce12bf5 100644 --- a/src/snapshot.js +++ b/src/snapshot.js @@ -48,7 +48,7 @@ export function snapshotSave(name, options = {}) { if (fs.existsSync(dest)) { if (!options.force) { - throw new 
DbError(`Snapshot "${name}" already exists. Use --force to overwrite.`); + throw new ConfigError(`Snapshot "${name}" already exists. Use --force to overwrite.`); } fs.unlinkSync(dest); debug(`Deleted existing snapshot: ${dest}`); diff --git a/tests/unit/db.test.js b/tests/unit/db.test.js index 63fb5807..47dc393d 100644 --- a/tests/unit/db.test.js +++ b/tests/unit/db.test.js @@ -196,12 +196,11 @@ describe('build_meta', () => { describe('openReadonlyOrFail', () => { it('throws DbError when DB does not exist', () => { - expect(() => openReadonlyOrFail(path.join(tmpDir, 'nonexistent.db'))).toThrow( - 'No codegraph database found', - ); + expect.assertions(4); try { openReadonlyOrFail(path.join(tmpDir, 'nonexistent.db')); } catch (err) { + expect(err.message).toContain('No codegraph database found'); expect(err.name).toBe('DbError'); expect(err.code).toBe('DB_ERROR'); expect(err.file).toBeDefined(); From d625c212f2f5eccbe780a6859ed98b2e184d31d2 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Fri, 13 Mar 2026 02:15:12 -0600 Subject: [PATCH 3/8] refactor: extract embedder.js into src/embeddings/ subsystem (ROADMAP 3.10) Split the monolithic 1,100-line embedder.js into a modular subsystem with clear separation of concerns: models, generator, strategies, stores, and search modules. Uses a pluggable VectorStore JSDoc contract for future ANN backends. Reuses existing db/repository/embeddings.js for search preparation. All 9 consumer import paths updated, old file deleted. 
Impact: 26 functions changed, 16 affected --- src/cli/commands/embed.js | 2 +- src/cli/commands/models.js | 2 +- src/cli/commands/search.js | 2 +- src/embedder.js | 1097 --------------------- src/embeddings/generator.js | 163 +++ src/embeddings/index.js | 13 + src/embeddings/models.js | 217 ++++ src/embeddings/search/cli-formatter.js | 151 +++ src/embeddings/search/filters.js | 46 + src/embeddings/search/hybrid.js | 121 +++ src/embeddings/search/keyword.js | 68 ++ src/embeddings/search/prepare.js | 58 ++ src/embeddings/search/semantic.js | 145 +++ src/embeddings/stores/fts5.js | 27 + src/embeddings/stores/sqlite-blob.js | 23 + src/embeddings/strategies/source.js | 14 + src/embeddings/strategies/structured.js | 43 + src/embeddings/strategies/text-utils.js | 43 + src/index.js | 7 +- src/mcp/tools/semantic-search.js | 6 +- tests/search/embedder-search.test.js | 2 +- tests/search/embedding-regression.test.js | 2 +- tests/search/embedding-strategy.test.js | 2 +- tests/unit/prompt-install.test.js | 8 +- 24 files changed, 1151 insertions(+), 1111 deletions(-) delete mode 100644 src/embedder.js create mode 100644 src/embeddings/generator.js create mode 100644 src/embeddings/index.js create mode 100644 src/embeddings/models.js create mode 100644 src/embeddings/search/cli-formatter.js create mode 100644 src/embeddings/search/filters.js create mode 100644 src/embeddings/search/hybrid.js create mode 100644 src/embeddings/search/keyword.js create mode 100644 src/embeddings/search/prepare.js create mode 100644 src/embeddings/search/semantic.js create mode 100644 src/embeddings/stores/fts5.js create mode 100644 src/embeddings/stores/sqlite-blob.js create mode 100644 src/embeddings/strategies/source.js create mode 100644 src/embeddings/strategies/structured.js create mode 100644 src/embeddings/strategies/text-utils.js diff --git a/src/cli/commands/embed.js b/src/cli/commands/embed.js index fcd908e9..075520cd 100644 --- a/src/cli/commands/embed.js +++ b/src/cli/commands/embed.js @@ 
-1,5 +1,5 @@ import path from 'node:path'; -import { buildEmbeddings, DEFAULT_MODEL, EMBEDDING_STRATEGIES } from '../../embedder.js'; +import { buildEmbeddings, DEFAULT_MODEL, EMBEDDING_STRATEGIES } from '../../embeddings/index.js'; export const command = { name: 'embed [dir]', diff --git a/src/cli/commands/models.js b/src/cli/commands/models.js index 6773f2c2..0763650a 100644 --- a/src/cli/commands/models.js +++ b/src/cli/commands/models.js @@ -1,4 +1,4 @@ -import { DEFAULT_MODEL, MODELS } from '../../embedder.js'; +import { DEFAULT_MODEL, MODELS } from '../../embeddings/index.js'; export const command = { name: 'models', diff --git a/src/cli/commands/search.js b/src/cli/commands/search.js index 312f734d..238b59a0 100644 --- a/src/cli/commands/search.js +++ b/src/cli/commands/search.js @@ -1,4 +1,4 @@ -import { search } from '../../embedder.js'; +import { search } from '../../embeddings/index.js'; export const command = { name: 'search ', diff --git a/src/embedder.js b/src/embedder.js deleted file mode 100644 index f8fbc527..00000000 --- a/src/embedder.js +++ /dev/null @@ -1,1097 +0,0 @@ -import { execFileSync } from 'node:child_process'; -import fs from 'node:fs'; -import path from 'node:path'; -import { createInterface } from 'node:readline'; -import { - closeDb, - findCalleeNames, - findCallerNames, - findDbPath, - openDb, - openReadonlyOrFail, -} from './db.js'; -import { ConfigError, DbError, EngineError } from './errors.js'; -import { info, warn } from './logger.js'; -import { normalizeSymbol } from './queries.js'; - -/** - * Split an identifier into readable words. - * camelCase/PascalCase → "camel Case", snake_case → "snake case", kebab-case → "kebab case" - */ -function splitIdentifier(name) { - return name - .replace(/([a-z])([A-Z])/g, '$1 $2') - .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2') - .replace(/[_-]+/g, ' ') - .trim(); -} - -/** - * Match a file path against a glob pattern. - * Supports *, **, and ? wildcards. Zero dependencies. 
- */ -function globMatch(filePath, pattern) { - // Normalize separators to forward slashes - const normalized = filePath.replace(/\\/g, '/'); - // Escape regex specials except glob chars - let regex = pattern.replace(/\\/g, '/').replace(/[.+^${}()|[\]\\]/g, '\\$&'); - // Replace ** first (matches any path segment), then * and ? - regex = regex.replace(/\*\*/g, '\0'); - regex = regex.replace(/\*/g, '[^/]*'); - regex = regex.replace(/\0/g, '.*'); - regex = regex.replace(/\?/g, '[^/]'); - try { - return new RegExp(`^${regex}$`).test(normalized); - } catch { - // Malformed pattern — fall back to substring match - return normalized.includes(pattern); - } -} - -// Lazy-load transformers (heavy, optional module) -let pipeline = null; -let _cos_sim = null; -let extractor = null; -let activeModel = null; - -export const MODELS = { - minilm: { - name: 'Xenova/all-MiniLM-L6-v2', - dim: 384, - contextWindow: 256, - desc: 'Smallest, fastest (~23MB). General text.', - quantized: true, - }, - 'jina-small': { - name: 'Xenova/jina-embeddings-v2-small-en', - dim: 512, - contextWindow: 8192, - desc: 'Small, good quality (~33MB). General text.', - quantized: false, - }, - 'jina-base': { - name: 'Xenova/jina-embeddings-v2-base-en', - dim: 768, - contextWindow: 8192, - desc: 'Good quality (~137MB). General text, 8192 token context.', - quantized: false, - }, - 'jina-code': { - name: 'Xenova/jina-embeddings-v2-base-code', - dim: 768, - contextWindow: 8192, - desc: 'Code-aware (~137MB). Trained on code+text, best for code search.', - quantized: false, - }, - nomic: { - name: 'Xenova/nomic-embed-text-v1', - dim: 768, - contextWindow: 8192, - desc: 'Good local quality (~137MB). 8192 context.', - quantized: false, - }, - 'nomic-v1.5': { - name: 'nomic-ai/nomic-embed-text-v1.5', - dim: 768, - contextWindow: 8192, - desc: 'Improved nomic (~137MB). 
Matryoshka dimensions, 8192 context.', - quantized: false, - }, - 'bge-large': { - name: 'Xenova/bge-large-en-v1.5', - dim: 1024, - contextWindow: 512, - desc: 'Best general retrieval (~335MB). Top MTEB scores.', - quantized: false, - }, -}; - -export const EMBEDDING_STRATEGIES = ['structured', 'source']; - -export const DEFAULT_MODEL = 'nomic-v1.5'; -const BATCH_SIZE_MAP = { - minilm: 32, - 'jina-small': 16, - 'jina-base': 8, - 'jina-code': 8, - nomic: 8, - 'nomic-v1.5': 8, - 'bge-large': 4, -}; -const DEFAULT_BATCH_SIZE = 32; - -function getModelConfig(modelKey) { - const key = modelKey || DEFAULT_MODEL; - const config = MODELS[key]; - if (!config) { - throw new ConfigError(`Unknown model: ${key}. Available: ${Object.keys(MODELS).join(', ')}`); - } - return config; -} - -/** - * Rough token estimate (~4 chars per token for code/English). - * Conservative — avoids adding a tokenizer dependency. - */ -export function estimateTokens(text) { - return Math.ceil(text.length / 4); -} - -/** - * Extract leading comment text (JSDoc, //, #, etc.) above a function line. - * Returns the cleaned comment text or null if none found. - */ -function extractLeadingComment(lines, fnLineIndex) { - if (fnLineIndex > lines.length) return null; - const raw = []; - for (let i = fnLineIndex - 1; i >= Math.max(0, fnLineIndex - 15); i--) { - if (i >= lines.length) continue; - const trimmed = lines[i].trim(); - if (/^(\/\/|\/\*|\*\/|\*|#|\/\/\/)/.test(trimmed)) { - raw.unshift(trimmed); - } else if (trimmed === '') { - if (raw.length > 0) break; - } else { - break; - } - } - if (raw.length === 0) return null; - return raw - .map((line) => - line - .replace(/^\/\*\*?\s?|\*\/$/g, '') // opening /** or /* and closing */ - .replace(/^\*\s?/, '') // middle * lines - .replace(/^\/\/\/?\s?/, '') // // or /// - .replace(/^#\s?/, '') // # (Python/Ruby) - .trim(), - ) - .filter((l) => l.length > 0) - .join(' '); -} - -/** - * Build graph-enriched text for a symbol using dependency context. 
- * Produces compact, semantic text (~100 tokens) instead of full source code. - */ -function buildStructuredText(node, file, lines, db) { - const readable = splitIdentifier(node.name); - const parts = [`${node.kind} ${node.name} (${readable}) in ${file}`]; - const startLine = Math.max(0, node.line - 1); - - // Extract parameters from signature (best-effort, single-line) - const sigLine = lines[startLine] || ''; - const paramMatch = sigLine.match(/\(([^)]*)\)/); - if (paramMatch?.[1]?.trim()) { - parts.push(`Parameters: ${paramMatch[1].trim()}`); - } - - // Graph context: callees (capped at 10) - const callees = findCalleeNames(db, node.id); - if (callees.length > 0) { - parts.push(`Calls: ${callees.slice(0, 10).join(', ')}`); - } - - // Graph context: callers (capped at 10) - const callers = findCallerNames(db, node.id); - if (callers.length > 0) { - parts.push(`Called by: ${callers.slice(0, 10).join(', ')}`); - } - - // Leading comment (high semantic value) or first few lines of code - const comment = extractLeadingComment(lines, startLine); - if (comment) { - parts.push(comment); - } else { - const endLine = Math.min(lines.length, startLine + 4); - const snippet = lines.slice(startLine, endLine).join('\n').trim(); - if (snippet) parts.push(snippet); - } - - return parts.join('\n'); -} - -/** - * Build raw source-code text for a symbol (original strategy). - */ -function buildSourceText(node, file, lines) { - const startLine = Math.max(0, node.line - 1); - const endLine = node.end_line - ? Math.min(lines.length, node.end_line) - : Math.min(lines.length, startLine + 15); - const context = lines.slice(startLine, endLine).join('\n'); - const readable = splitIdentifier(node.name); - return `${node.kind} ${node.name} (${readable}) in ${file}\n${context}`; -} - -/** - * Prompt the user to install a missing package interactively. - * Returns true if the package was installed, false otherwise. - * Skips the prompt entirely in non-TTY environments (CI, piped stdin). 
- */ -function promptInstall(packageName) { - if (!process.stdin.isTTY) return Promise.resolve(false); - - return new Promise((resolve) => { - const rl = createInterface({ input: process.stdin, output: process.stderr }); - rl.question(`Semantic search requires ${packageName}. Install it now? [y/N] `, (answer) => { - rl.close(); - if (answer.trim().toLowerCase() !== 'y') return resolve(false); - try { - execFileSync('npm', ['install', packageName], { - stdio: 'inherit', - timeout: 300_000, - }); - resolve(true); - } catch { - resolve(false); - } - }); - }); -} - -/** - * Lazy-load @huggingface/transformers. - * If the package is missing, prompts the user to install it interactively. - * In non-TTY environments, prints an error and exits. - */ -async function loadTransformers() { - try { - return await import('@huggingface/transformers'); - } catch { - const pkg = '@huggingface/transformers'; - const installed = await promptInstall(pkg); - if (installed) { - try { - return await import(pkg); - } catch (loadErr) { - throw new EngineError( - `${pkg} was installed but failed to load. Please check your environment.`, - { cause: loadErr }, - ); - } - } - throw new EngineError(`Semantic search requires ${pkg}.\nInstall it with: npm install ${pkg}`); - } -} - -/** - * Dispose the current ONNX session and free memory. - * Safe to call when no model is loaded (no-op). 
- */ -export async function disposeModel() { - if (extractor) { - await extractor.dispose(); - extractor = null; - } - activeModel = null; -} - -async function loadModel(modelKey) { - const config = getModelConfig(modelKey); - - if (extractor && activeModel === config.name) return { extractor, config }; - - // Dispose previous model before loading a different one - await disposeModel(); - - const transformers = await loadTransformers(); - pipeline = transformers.pipeline; - _cos_sim = transformers.cos_sim; - - info(`Loading embedding model: ${config.name} (${config.dim}d)...`); - const pipelineOpts = config.quantized ? { quantized: true } : {}; - try { - extractor = await pipeline('feature-extraction', config.name, pipelineOpts); - } catch (err) { - const msg = err.message || String(err); - if (msg.includes('Unauthorized') || msg.includes('401') || msg.includes('gated')) { - throw new EngineError( - `Model "${config.name}" requires authentication.\n` + - `This model is gated on HuggingFace and needs an access token.\n\n` + - `Options:\n` + - ` 1. Set HF_TOKEN env var: export HF_TOKEN=hf_...\n` + - ` 2. Use a public model instead: codegraph embed --model minilm`, - { cause: err }, - ); - } - throw new EngineError( - `Failed to load model "${config.name}": ${msg}\n` + - `Try a different model: codegraph embed --model minilm`, - { cause: err }, - ); - } - activeModel = config.name; - info('Model loaded.'); - return { extractor, config }; -} - -/** - * Generate embeddings for an array of texts. 
- */ -export async function embed(texts, modelKey) { - const { extractor: ext, config } = await loadModel(modelKey); - const dim = config.dim; - const results = []; - const batchSize = BATCH_SIZE_MAP[modelKey || DEFAULT_MODEL] || DEFAULT_BATCH_SIZE; - - for (let i = 0; i < texts.length; i += batchSize) { - const batch = texts.slice(i, i + batchSize); - const output = await ext(batch, { pooling: 'mean', normalize: true }); - - for (let j = 0; j < batch.length; j++) { - const start = j * dim; - const vec = new Float32Array(dim); - for (let k = 0; k < dim; k++) { - vec[k] = output.data[start + k]; - } - results.push(vec); - } - - if (texts.length > batchSize) { - process.stdout.write(` Embedded ${Math.min(i + batchSize, texts.length)}/${texts.length}\r`); - } - } - - return { vectors: results, dim }; -} - -/** - * Cosine similarity between two Float32Arrays. - */ -export function cosineSim(a, b) { - let dot = 0, - normA = 0, - normB = 0; - for (let i = 0; i < a.length; i++) { - dot += a[i] * b[i]; - normA += a[i] * a[i]; - normB += b[i] * b[i]; - } - return dot / (Math.sqrt(normA) * Math.sqrt(normB)); -} - -function initEmbeddingsSchema(db) { - db.exec(` - CREATE TABLE IF NOT EXISTS embeddings ( - node_id INTEGER PRIMARY KEY, - vector BLOB NOT NULL, - text_preview TEXT, - FOREIGN KEY(node_id) REFERENCES nodes(id) - ); - CREATE TABLE IF NOT EXISTS embedding_meta ( - key TEXT PRIMARY KEY, - value TEXT - ); - `); - - // Add full_text column (idempotent — ignore if already exists) - try { - db.exec('ALTER TABLE embeddings ADD COLUMN full_text TEXT'); - } catch { - /* column already exists */ - } - - // FTS5 virtual table for BM25 keyword search - db.exec(` - CREATE VIRTUAL TABLE IF NOT EXISTS fts_index USING fts5( - name, - content, - tokenize='unicode61' - ); - `); -} - -/** - * Build embeddings for all functions/methods/classes in the graph. 
- * @param {string} rootDir - Project root directory - * @param {string} modelKey - Model identifier from MODELS registry - * @param {string} [customDbPath] - Override path to graph.db - * @param {object} [options] - Embedding options - * @param {string} [options.strategy='structured'] - 'structured' (graph-enriched) or 'source' (raw code) - */ -export async function buildEmbeddings(rootDir, modelKey, customDbPath, options = {}) { - const strategy = options.strategy || 'structured'; - const dbPath = customDbPath || findDbPath(null); - - if (!fs.existsSync(dbPath)) { - throw new DbError( - `No codegraph database found at ${dbPath}.\nRun "codegraph build" first to analyze your codebase.`, - { file: dbPath }, - ); - } - - const db = openDb(dbPath); - initEmbeddingsSchema(db); - - db.exec('DELETE FROM embeddings'); - db.exec('DELETE FROM embedding_meta'); - db.exec('DELETE FROM fts_index'); - - const nodes = db - .prepare( - `SELECT * FROM nodes WHERE kind IN ('function', 'method', 'class') ORDER BY file, line`, - ) - .all(); - - console.log(`Building embeddings for ${nodes.length} symbols (strategy: ${strategy})...`); - - const byFile = new Map(); - for (const node of nodes) { - if (!byFile.has(node.file)) byFile.set(node.file, []); - byFile.get(node.file).push(node); - } - - const texts = []; - const nodeIds = []; - const nodeNames = []; - const previews = []; - const config = getModelConfig(modelKey); - const contextWindow = config.contextWindow; - let overflowCount = 0; - - for (const [file, fileNodes] of byFile) { - const fullPath = path.join(rootDir, file); - let lines; - try { - lines = fs.readFileSync(fullPath, 'utf-8').split('\n'); - } catch (err) { - warn(`Cannot read ${file} for embeddings: ${err.message}`); - continue; - } - - for (const node of fileNodes) { - let text = - strategy === 'structured' - ? 
buildStructuredText(node, file, lines, db) - : buildSourceText(node, file, lines); - - // Detect and handle context window overflow - const tokens = estimateTokens(text); - if (tokens > contextWindow) { - overflowCount++; - const maxChars = contextWindow * 4; - text = text.slice(0, maxChars); - } - - texts.push(text); - nodeIds.push(node.id); - nodeNames.push(node.name); - previews.push(`${node.name} (${node.kind}) -- ${file}:${node.line}`); - } - } - - if (overflowCount > 0) { - warn( - `${overflowCount} symbol(s) exceeded model context window (${contextWindow} tokens) and were truncated`, - ); - } - - console.log(`Embedding ${texts.length} symbols...`); - const { vectors, dim } = await embed(texts, modelKey); - - const insert = db.prepare( - 'INSERT OR REPLACE INTO embeddings (node_id, vector, text_preview, full_text) VALUES (?, ?, ?, ?)', - ); - const insertFts = db.prepare('INSERT INTO fts_index(rowid, name, content) VALUES (?, ?, ?)'); - const insertMeta = db.prepare('INSERT OR REPLACE INTO embedding_meta (key, value) VALUES (?, ?)'); - const insertAll = db.transaction(() => { - for (let i = 0; i < vectors.length; i++) { - insert.run(nodeIds[i], Buffer.from(vectors[i].buffer), previews[i], texts[i]); - insertFts.run(nodeIds[i], nodeNames[i], texts[i]); - } - insertMeta.run('model', config.name); - insertMeta.run('dim', String(dim)); - insertMeta.run('count', String(vectors.length)); - insertMeta.run('fts_count', String(vectors.length)); - insertMeta.run('strategy', strategy); - insertMeta.run('built_at', new Date().toISOString()); - if (overflowCount > 0) { - insertMeta.run('truncated_count', String(overflowCount)); - } - }); - insertAll(); - - console.log( - `\nStored ${vectors.length} embeddings (${dim}d, ${config.name}, strategy: ${strategy}) in graph.db`, - ); - closeDb(db); -} - -/** - * Shared setup for search functions: opens DB, validates embeddings/model, loads rows. - * Returns { db, rows, modelKey, storedDim } or null on failure (prints error). 
- */ -function _prepareSearch(customDbPath, opts = {}) { - const db = openReadonlyOrFail(customDbPath); - - let count; - try { - count = db.prepare('SELECT COUNT(*) as c FROM embeddings').get().c; - } catch { - console.log('No embeddings table found. Run `codegraph embed` first.'); - db.close(); - return null; - } - if (count === 0) { - console.log('No embeddings found. Run `codegraph embed` first.'); - db.close(); - return null; - } - - let storedModel = null; - let storedDim = null; - try { - const modelRow = db.prepare("SELECT value FROM embedding_meta WHERE key = 'model'").get(); - const dimRow = db.prepare("SELECT value FROM embedding_meta WHERE key = 'dim'").get(); - if (modelRow) storedModel = modelRow.value; - if (dimRow) storedDim = parseInt(dimRow.value, 10); - } catch { - /* old DB without meta table */ - } - - let modelKey = opts.model || null; - if (!modelKey && storedModel) { - for (const [key, config] of Object.entries(MODELS)) { - if (config.name === storedModel) { - modelKey = key; - break; - } - } - } - - // Pre-filter: allow filtering by kind or file pattern to reduce search space - const noTests = opts.noTests || false; - const TEST_PATTERN = /\.(test|spec)\.|__test__|__tests__|\.stories\./; - let sql = ` - SELECT e.node_id, e.vector, e.text_preview, n.name, n.kind, n.file, n.line, n.end_line, n.role - FROM embeddings e - JOIN nodes n ON e.node_id = n.id - `; - const params = []; - const conditions = []; - if (opts.kind) { - conditions.push('n.kind = ?'); - params.push(opts.kind); - } - const isGlob = opts.filePattern && /[*?[\]]/.test(opts.filePattern); - if (opts.filePattern && !isGlob) { - conditions.push('n.file LIKE ?'); - params.push(`%${opts.filePattern}%`); - } - if (conditions.length > 0) { - sql += ` WHERE ${conditions.join(' AND ')}`; - } - - let rows = db.prepare(sql).all(...params); - if (isGlob) { - rows = rows.filter((row) => globMatch(row.file, opts.filePattern)); - } - if (noTests) { - rows = rows.filter((row) => 
!TEST_PATTERN.test(row.file)); - } - - return { db, rows, modelKey, storedDim }; -} - -/** - * Single-query semantic search — returns data instead of printing. - * Returns { results: [{ name, kind, file, line, similarity }] } or null on failure. - */ -export async function searchData(query, customDbPath, opts = {}) { - const limit = opts.limit || 15; - const minScore = opts.minScore || 0.2; - - const prepared = _prepareSearch(customDbPath, opts); - if (!prepared) return null; - const { db, rows, modelKey, storedDim } = prepared; - - try { - const { - vectors: [queryVec], - dim, - } = await embed([query], modelKey); - - if (storedDim && dim !== storedDim) { - console.log( - `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`, - ); - console.log(` Re-run \`codegraph embed\` with the same model, or use --model to match.`); - return null; - } - - const hc = new Map(); - const results = []; - for (const row of rows) { - const vec = new Float32Array(new Uint8Array(row.vector).buffer); - const sim = cosineSim(queryVec, vec); - - if (sim >= minScore) { - results.push({ - ...normalizeSymbol(row, db, hc), - similarity: sim, - }); - } - } - - results.sort((a, b) => b.similarity - a.similarity); - return { results: results.slice(0, limit) }; - } finally { - db.close(); - } -} - -/** - * Multi-query semantic search with Reciprocal Rank Fusion (RRF). - * Returns { results: [{ name, kind, file, line, rrf, queryScores }] } or null on failure. 
- */ -export async function multiSearchData(queries, customDbPath, opts = {}) { - const limit = opts.limit || 15; - const minScore = opts.minScore || 0.2; - const k = opts.rrfK || 60; - - const prepared = _prepareSearch(customDbPath, opts); - if (!prepared) return null; - const { db, rows, modelKey, storedDim } = prepared; - - try { - const { vectors: queryVecs, dim } = await embed(queries, modelKey); - - // Warn about similar queries that may bias RRF results - const SIMILARITY_WARN_THRESHOLD = 0.85; - for (let i = 0; i < queryVecs.length; i++) { - for (let j = i + 1; j < queryVecs.length; j++) { - const sim = cosineSim(queryVecs[i], queryVecs[j]); - if (sim >= SIMILARITY_WARN_THRESHOLD) { - warn( - `Queries "${queries[i]}" and "${queries[j]}" are very similar ` + - `(${(sim * 100).toFixed(0)}% cosine similarity). ` + - `This may bias RRF results toward their shared matches. ` + - `Consider using more distinct queries.`, - ); - } - } - } - - if (storedDim && dim !== storedDim) { - console.log( - `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`, - ); - console.log(` Re-run \`codegraph embed\` with the same model, or use --model to match.`); - return null; - } - - // Parse row vectors once - const rowVecs = rows.map((row) => new Float32Array(new Uint8Array(row.vector).buffer)); - - // For each query: compute similarities, filter by minScore, rank - const perQueryRanked = queries.map((_query, qi) => { - const scored = []; - for (let ri = 0; ri < rows.length; ri++) { - const sim = cosineSim(queryVecs[qi], rowVecs[ri]); - if (sim >= minScore) { - scored.push({ rowIndex: ri, similarity: sim }); - } - } - scored.sort((a, b) => b.similarity - a.similarity); - // Assign 1-indexed ranks - return scored.map((item, rank) => ({ ...item, rank: rank + 1 })); - }); - - // Fuse results using RRF: for each unique row, sum 1/(k + rank_i) across queries - const fusionMap = new Map(); // rowIndex -> { rrfScore, queryScores[] } - for (let qi = 
0; qi < queries.length; qi++) { - for (const item of perQueryRanked[qi]) { - if (!fusionMap.has(item.rowIndex)) { - fusionMap.set(item.rowIndex, { rrfScore: 0, queryScores: [] }); - } - const entry = fusionMap.get(item.rowIndex); - entry.rrfScore += 1 / (k + item.rank); - entry.queryScores.push({ - query: queries[qi], - similarity: item.similarity, - rank: item.rank, - }); - } - } - - // Build results sorted by RRF score - const hc = new Map(); - const results = []; - for (const [rowIndex, entry] of fusionMap) { - const row = rows[rowIndex]; - results.push({ - ...normalizeSymbol(row, db, hc), - rrf: entry.rrfScore, - queryScores: entry.queryScores, - }); - } - - results.sort((a, b) => b.rrf - a.rrf); - return { results: results.slice(0, limit) }; - } finally { - db.close(); - } -} - -/** - * Sanitize a user query for FTS5 MATCH syntax. - * Wraps each token as an implicit OR and escapes special FTS5 characters. - */ -function sanitizeFtsQuery(query) { - // Remove FTS5 special chars that could cause syntax errors - const cleaned = query.replace(/[*"():^{}~<>]/g, ' ').trim(); - if (!cleaned) return null; - // Split into tokens, wrap with OR for multi-token queries - const tokens = cleaned.split(/\s+/).filter((t) => t.length > 0); - if (tokens.length === 0) return null; - if (tokens.length === 1) return `"${tokens[0]}"`; - return tokens.map((t) => `"${t}"`).join(' OR '); -} - -/** - * Check if the FTS5 index exists in the database. - * Returns true if fts_index table exists and has rows, false otherwise. - */ -function hasFtsIndex(db) { - try { - const row = db.prepare('SELECT COUNT(*) as c FROM fts_index').get(); - return row.c > 0; - } catch { - return false; - } -} - -/** - * BM25 keyword search via FTS5. - * Returns { results: [{ name, kind, file, line, bm25Score }] } or null if no FTS5 index. 
- */ -export function ftsSearchData(query, customDbPath, opts = {}) { - const limit = opts.limit || 15; - const noTests = opts.noTests || false; - const TEST_PATTERN = /\.(test|spec)\.|__test__|__tests__|\.stories\./; - - const db = openReadonlyOrFail(customDbPath); - - try { - if (!hasFtsIndex(db)) { - return null; - } - - const ftsQuery = sanitizeFtsQuery(query); - if (!ftsQuery) { - return { results: [] }; - } - - let sql = ` - SELECT f.rowid AS node_id, rank AS bm25_score, - n.name, n.kind, n.file, n.line, n.end_line, n.role - FROM fts_index f - JOIN nodes n ON f.rowid = n.id - WHERE fts_index MATCH ? - `; - const params = [ftsQuery]; - - if (opts.kind) { - sql += ' AND n.kind = ?'; - params.push(opts.kind); - } - - const isGlob = opts.filePattern && /[*?[\]]/.test(opts.filePattern); - if (opts.filePattern && !isGlob) { - sql += ' AND n.file LIKE ?'; - params.push(`%${opts.filePattern}%`); - } - - sql += ' ORDER BY rank LIMIT ?'; - params.push(limit * 5); // fetch generous set for post-filtering - - let rows; - try { - rows = db.prepare(sql).all(...params); - } catch { - // Invalid FTS5 query syntax — return empty - return { results: [] }; - } - - if (isGlob) { - rows = rows.filter((row) => globMatch(row.file, opts.filePattern)); - } - if (noTests) { - rows = rows.filter((row) => !TEST_PATTERN.test(row.file)); - } - - const hc = new Map(); - const results = rows.slice(0, limit).map((row) => ({ - ...normalizeSymbol(row, db, hc), - bm25Score: -row.bm25_score, // FTS5 rank is negative; negate for display - })); - - return { results }; - } finally { - db.close(); - } -} - -/** - * Hybrid BM25 + semantic search with RRF fusion. - * Returns { results: [{ name, kind, file, line, rrf, bm25Score, bm25Rank, similarity, semanticRank }] } - * or null if no FTS5 index (caller should fall back to semantic-only). 
- */ -export async function hybridSearchData(query, customDbPath, opts = {}) { - const limit = opts.limit || 15; - const k = opts.rrfK || 60; - const topK = (opts.limit || 15) * 5; - - // Split semicolons for multi-query support - const queries = - typeof query === 'string' - ? query - .split(';') - .map((q) => q.trim()) - .filter((q) => q.length > 0) - : [query]; - - // Check FTS5 availability first (sync, cheap) - const checkDb = openReadonlyOrFail(customDbPath); - const ftsAvailable = hasFtsIndex(checkDb); - checkDb.close(); - if (!ftsAvailable) return null; - - // Collect ranked lists: for each query, one BM25 list + one semantic list - const rankedLists = []; - - for (const q of queries) { - // BM25 ranked list (sync) - const bm25Data = ftsSearchData(q, customDbPath, { ...opts, limit: topK }); - if (bm25Data?.results) { - rankedLists.push( - bm25Data.results.map((r, idx) => ({ - key: `${r.name}:${r.file}:${r.line}`, - rank: idx + 1, - source: 'bm25', - ...r, - })), - ); - } - - // Semantic ranked list (async) - const semData = await searchData(q, customDbPath, { - ...opts, - limit: topK, - minScore: opts.minScore || 0.2, - }); - if (semData?.results) { - rankedLists.push( - semData.results.map((r, idx) => ({ - key: `${r.name}:${r.file}:${r.line}`, - rank: idx + 1, - source: 'semantic', - ...r, - })), - ); - } - } - - // RRF fusion across all ranked lists - const fusionMap = new Map(); - for (const list of rankedLists) { - for (const item of list) { - if (!fusionMap.has(item.key)) { - fusionMap.set(item.key, { - name: item.name, - kind: item.kind, - file: item.file, - line: item.line, - endLine: item.endLine ?? null, - role: item.role ?? null, - fileHash: item.fileHash ?? 
null, - rrfScore: 0, - bm25Score: null, - bm25Rank: null, - similarity: null, - semanticRank: null, - }); - } - const entry = fusionMap.get(item.key); - entry.rrfScore += 1 / (k + item.rank); - if (item.source === 'bm25') { - if (entry.bm25Rank === null || item.rank < entry.bm25Rank) { - entry.bm25Score = item.bm25Score; - entry.bm25Rank = item.rank; - } - } else { - if (entry.semanticRank === null || item.rank < entry.semanticRank) { - entry.similarity = item.similarity; - entry.semanticRank = item.rank; - } - } - } - } - - const results = [...fusionMap.values()] - .sort((a, b) => b.rrfScore - a.rrfScore) - .slice(0, limit) - .map((e) => ({ - name: e.name, - kind: e.kind, - file: e.file, - line: e.line, - endLine: e.endLine, - role: e.role, - fileHash: e.fileHash, - rrf: e.rrfScore, - bm25Score: e.bm25Score, - bm25Rank: e.bm25Rank, - similarity: e.similarity, - semanticRank: e.semanticRank, - })); - - return { results }; -} - -/** - * Search with mode support — CLI wrapper with multi-query detection. - * Modes: 'hybrid' (default), 'semantic', 'keyword' - */ -export async function search(query, customDbPath, opts = {}) { - const mode = opts.mode || 'hybrid'; - - // Split by semicolons, trim, filter empties - const queries = query - .split(';') - .map((q) => q.trim()) - .filter((q) => q.length > 0); - - const kindIcon = (kind) => (kind === 'function' ? 'f' : kind === 'class' ? '*' : 'o'); - - // ─── Keyword-only mode ────────────────────────────────────────────── - if (mode === 'keyword') { - const singleQuery = queries.length === 1 ? queries[0] : query; - const data = ftsSearchData(singleQuery, customDbPath, opts); - if (!data) { - console.log('No FTS5 index found. 
Run `codegraph embed` to build the keyword index.'); - return; - } - - if (opts.json) { - console.log(JSON.stringify(data, null, 2)); - return; - } - - console.log(`\nKeyword search: "${singleQuery}" (BM25)\n`); - if (data.results.length === 0) { - console.log(' No results found.'); - } else { - for (const r of data.results) { - console.log( - ` BM25 ${r.bm25Score.toFixed(2)} ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`, - ); - } - } - console.log(`\n ${data.results.length} results shown\n`); - return; - } - - // ─── Semantic-only mode ───────────────────────────────────────────── - if (mode === 'semantic') { - if (queries.length <= 1) { - const singleQuery = queries[0] || query; - const data = await searchData(singleQuery, customDbPath, opts); - if (!data) return; - - if (opts.json) { - console.log(JSON.stringify(data, null, 2)); - return; - } - - console.log(`\nSemantic search: "${singleQuery}"\n`); - if (data.results.length === 0) { - console.log(' No results above threshold.'); - } else { - for (const r of data.results) { - const bar = '#'.repeat(Math.round(r.similarity * 20)); - console.log(` ${(r.similarity * 100).toFixed(1)}% ${bar}`); - console.log(` ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`); - } - } - console.log(`\n ${data.results.length} results shown\n`); - } else { - const data = await multiSearchData(queries, customDbPath, opts); - if (!data) return; - - if (opts.json) { - console.log(JSON.stringify(data, null, 2)); - return; - } - - console.log(`\nMulti-query semantic search (RRF, k=${opts.rrfK || 60}):`); - for (let i = 0; i < queries.length; i++) console.log(` [${i + 1}] "${queries[i]}"`); - console.log(); - if (data.results.length === 0) { - console.log(' No results above threshold.'); - } else { - for (const r of data.results) { - console.log( - ` RRF ${r.rrf.toFixed(4)} ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`, - ); - for (const qs of r.queryScores) { - const bar = '#'.repeat(Math.round(qs.similarity * 20)); - 
console.log( - ` [${queries.indexOf(qs.query) + 1}] ${(qs.similarity * 100).toFixed(1)}% ${bar} (rank ${qs.rank})`, - ); - } - } - } - console.log(`\n ${data.results.length} results shown\n`); - } - return; - } - - // ─── Hybrid mode (default) ────────────────────────────────────────── - const data = await hybridSearchData(query, customDbPath, opts); - - if (!data) { - // No FTS5 index — fall back to semantic-only - warn( - 'FTS5 index not found — using semantic search only. Re-run `codegraph embed` to enable hybrid mode.', - ); - return search(query, customDbPath, { ...opts, mode: 'semantic' }); - } - - if (opts.json) { - console.log(JSON.stringify(data, null, 2)); - return; - } - - const rrfK = opts.rrfK || 60; - if (queries.length <= 1) { - const singleQuery = queries[0] || query; - console.log(`\nHybrid search: "${singleQuery}" (BM25 + semantic, RRF k=${rrfK})\n`); - } else { - console.log(`\nHybrid multi-query search (BM25 + semantic, RRF k=${rrfK}):`); - for (let i = 0; i < queries.length; i++) console.log(` [${i + 1}] "${queries[i]}"`); - console.log(); - } - - if (data.results.length === 0) { - console.log(' No results found.'); - } else { - for (const r of data.results) { - console.log( - ` RRF ${r.rrf.toFixed(4)} ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`, - ); - const parts = []; - if (r.bm25Rank != null) { - parts.push(`BM25: rank ${r.bm25Rank} (score ${r.bm25Score.toFixed(2)})`); - } - if (r.semanticRank != null) { - parts.push(`Semantic: rank ${r.semanticRank} (${(r.similarity * 100).toFixed(1)}%)`); - } - if (parts.length > 0) { - console.log(` ${parts.join(' | ')}`); - } - } - } - - console.log(`\n ${data.results.length} results shown\n`); -} diff --git a/src/embeddings/generator.js b/src/embeddings/generator.js new file mode 100644 index 00000000..8721e2ac --- /dev/null +++ b/src/embeddings/generator.js @@ -0,0 +1,163 @@ +import fs from 'node:fs'; +import path from 'node:path'; +import { closeDb, findDbPath, openDb } from '../db.js'; 
+import { DbError } from '../errors.js'; +import { warn } from '../logger.js'; +import { embed, getModelConfig } from './models.js'; +import { buildSourceText } from './strategies/source.js'; +import { buildStructuredText } from './strategies/structured.js'; + +/** + * Rough token estimate (~4 chars per token for code/English). + * Conservative — avoids adding a tokenizer dependency. + */ +export function estimateTokens(text) { + return Math.ceil(text.length / 4); +} + +export function initEmbeddingsSchema(db) { + db.exec(` + CREATE TABLE IF NOT EXISTS embeddings ( + node_id INTEGER PRIMARY KEY, + vector BLOB NOT NULL, + text_preview TEXT, + FOREIGN KEY(node_id) REFERENCES nodes(id) + ); + CREATE TABLE IF NOT EXISTS embedding_meta ( + key TEXT PRIMARY KEY, + value TEXT + ); + `); + + // Add full_text column (idempotent — ignore if already exists) + try { + db.exec('ALTER TABLE embeddings ADD COLUMN full_text TEXT'); + } catch { + /* column already exists */ + } + + // FTS5 virtual table for BM25 keyword search + db.exec(` + CREATE VIRTUAL TABLE IF NOT EXISTS fts_index USING fts5( + name, + content, + tokenize='unicode61' + ); + `); +} + +/** + * Build embeddings for all functions/methods/classes in the graph. 
+ * @param {string} rootDir - Project root directory + * @param {string} modelKey - Model identifier from MODELS registry + * @param {string} [customDbPath] - Override path to graph.db + * @param {object} [options] - Embedding options + * @param {string} [options.strategy='structured'] - 'structured' (graph-enriched) or 'source' (raw code) + */ +export async function buildEmbeddings(rootDir, modelKey, customDbPath, options = {}) { + const strategy = options.strategy || 'structured'; + const dbPath = customDbPath || findDbPath(null); + + if (!fs.existsSync(dbPath)) { + throw new DbError( + `No codegraph database found at ${dbPath}.\nRun "codegraph build" first to analyze your codebase.`, + { file: dbPath }, + ); + } + + const db = openDb(dbPath); + initEmbeddingsSchema(db); + + db.exec('DELETE FROM embeddings'); + db.exec('DELETE FROM embedding_meta'); + db.exec('DELETE FROM fts_index'); + + const nodes = db + .prepare( + `SELECT * FROM nodes WHERE kind IN ('function', 'method', 'class') ORDER BY file, line`, + ) + .all(); + + console.log(`Building embeddings for ${nodes.length} symbols (strategy: ${strategy})...`); + + const byFile = new Map(); + for (const node of nodes) { + if (!byFile.has(node.file)) byFile.set(node.file, []); + byFile.get(node.file).push(node); + } + + const texts = []; + const nodeIds = []; + const nodeNames = []; + const previews = []; + const config = getModelConfig(modelKey); + const contextWindow = config.contextWindow; + let overflowCount = 0; + + for (const [file, fileNodes] of byFile) { + const fullPath = path.join(rootDir, file); + let lines; + try { + lines = fs.readFileSync(fullPath, 'utf-8').split('\n'); + } catch (err) { + warn(`Cannot read ${file} for embeddings: ${err.message}`); + continue; + } + + for (const node of fileNodes) { + let text = + strategy === 'structured' + ? 
buildStructuredText(node, file, lines, db) + : buildSourceText(node, file, lines); + + // Detect and handle context window overflow + const tokens = estimateTokens(text); + if (tokens > contextWindow) { + overflowCount++; + const maxChars = contextWindow * 4; + text = text.slice(0, maxChars); + } + + texts.push(text); + nodeIds.push(node.id); + nodeNames.push(node.name); + previews.push(`${node.name} (${node.kind}) -- ${file}:${node.line}`); + } + } + + if (overflowCount > 0) { + warn( + `${overflowCount} symbol(s) exceeded model context window (${contextWindow} tokens) and were truncated`, + ); + } + + console.log(`Embedding ${texts.length} symbols...`); + const { vectors, dim } = await embed(texts, modelKey); + + const insert = db.prepare( + 'INSERT OR REPLACE INTO embeddings (node_id, vector, text_preview, full_text) VALUES (?, ?, ?, ?)', + ); + const insertFts = db.prepare('INSERT INTO fts_index(rowid, name, content) VALUES (?, ?, ?)'); + const insertMeta = db.prepare('INSERT OR REPLACE INTO embedding_meta (key, value) VALUES (?, ?)'); + const insertAll = db.transaction(() => { + for (let i = 0; i < vectors.length; i++) { + insert.run(nodeIds[i], Buffer.from(vectors[i].buffer), previews[i], texts[i]); + insertFts.run(nodeIds[i], nodeNames[i], texts[i]); + } + insertMeta.run('model', config.name); + insertMeta.run('dim', String(dim)); + insertMeta.run('count', String(vectors.length)); + insertMeta.run('fts_count', String(vectors.length)); + insertMeta.run('strategy', strategy); + insertMeta.run('built_at', new Date().toISOString()); + if (overflowCount > 0) { + insertMeta.run('truncated_count', String(overflowCount)); + } + }); + insertAll(); + + console.log( + `\nStored ${vectors.length} embeddings (${dim}d, ${config.name}, strategy: ${strategy}) in graph.db`, + ); + closeDb(db); +} diff --git a/src/embeddings/index.js b/src/embeddings/index.js new file mode 100644 index 00000000..bac3c60d --- /dev/null +++ b/src/embeddings/index.js @@ -0,0 +1,13 @@ +/** + * 
Embeddings subsystem — public API barrel. + * + * Re-exports everything consumers previously imported from `../embedder.js`. + */ + +export { buildEmbeddings, estimateTokens } from './generator.js'; +export { DEFAULT_MODEL, disposeModel, EMBEDDING_STRATEGIES, embed, MODELS } from './models.js'; +export { search } from './search/cli-formatter.js'; +export { hybridSearchData } from './search/hybrid.js'; +export { ftsSearchData } from './search/keyword.js'; +export { multiSearchData, searchData } from './search/semantic.js'; +export { cosineSim } from './stores/sqlite-blob.js'; diff --git a/src/embeddings/models.js b/src/embeddings/models.js new file mode 100644 index 00000000..948ad3aa --- /dev/null +++ b/src/embeddings/models.js @@ -0,0 +1,217 @@ +import { execFileSync } from 'node:child_process'; +import { createInterface } from 'node:readline'; +import { ConfigError, EngineError } from '../errors.js'; +import { info } from '../logger.js'; + +// Lazy-load transformers (heavy, optional module) +let pipeline = null; +let _cos_sim = null; +let extractor = null; +let activeModel = null; + +export const MODELS = { + minilm: { + name: 'Xenova/all-MiniLM-L6-v2', + dim: 384, + contextWindow: 256, + desc: 'Smallest, fastest (~23MB). General text.', + quantized: true, + }, + 'jina-small': { + name: 'Xenova/jina-embeddings-v2-small-en', + dim: 512, + contextWindow: 8192, + desc: 'Small, good quality (~33MB). General text.', + quantized: false, + }, + 'jina-base': { + name: 'Xenova/jina-embeddings-v2-base-en', + dim: 768, + contextWindow: 8192, + desc: 'Good quality (~137MB). General text, 8192 token context.', + quantized: false, + }, + 'jina-code': { + name: 'Xenova/jina-embeddings-v2-base-code', + dim: 768, + contextWindow: 8192, + desc: 'Code-aware (~137MB). Trained on code+text, best for code search.', + quantized: false, + }, + nomic: { + name: 'Xenova/nomic-embed-text-v1', + dim: 768, + contextWindow: 8192, + desc: 'Good local quality (~137MB). 
8192 context.', + quantized: false, + }, + 'nomic-v1.5': { + name: 'nomic-ai/nomic-embed-text-v1.5', + dim: 768, + contextWindow: 8192, + desc: 'Improved nomic (~137MB). Matryoshka dimensions, 8192 context.', + quantized: false, + }, + 'bge-large': { + name: 'Xenova/bge-large-en-v1.5', + dim: 1024, + contextWindow: 512, + desc: 'Best general retrieval (~335MB). Top MTEB scores.', + quantized: false, + }, +}; + +export const EMBEDDING_STRATEGIES = ['structured', 'source']; + +export const DEFAULT_MODEL = 'nomic-v1.5'; +const BATCH_SIZE_MAP = { + minilm: 32, + 'jina-small': 16, + 'jina-base': 8, + 'jina-code': 8, + nomic: 8, + 'nomic-v1.5': 8, + 'bge-large': 4, +}; +const DEFAULT_BATCH_SIZE = 32; + +export function getModelConfig(modelKey) { + const key = modelKey || DEFAULT_MODEL; + const config = MODELS[key]; + if (!config) { + throw new ConfigError(`Unknown model: ${key}. Available: ${Object.keys(MODELS).join(', ')}`); + } + return config; +} + +/** + * Prompt the user to install a missing package interactively. + * Returns true if the package was installed, false otherwise. + * Skips the prompt entirely in non-TTY environments (CI, piped stdin). + */ +export function promptInstall(packageName) { + if (!process.stdin.isTTY) return Promise.resolve(false); + + return new Promise((resolve) => { + const rl = createInterface({ input: process.stdin, output: process.stderr }); + rl.question(`Semantic search requires ${packageName}. Install it now? [y/N] `, (answer) => { + rl.close(); + if (answer.trim().toLowerCase() !== 'y') return resolve(false); + try { + execFileSync('npm', ['install', packageName], { + stdio: 'inherit', + timeout: 300_000, + }); + resolve(true); + } catch { + resolve(false); + } + }); + }); +} + +/** + * Lazy-load @huggingface/transformers. + * If the package is missing, prompts the user to install it interactively. + * In non-TTY environments, prints an error and exits. 
+ */ +export async function loadTransformers() { + try { + return await import('@huggingface/transformers'); + } catch { + const pkg = '@huggingface/transformers'; + const installed = await promptInstall(pkg); + if (installed) { + try { + return await import(pkg); + } catch (loadErr) { + throw new EngineError( + `${pkg} was installed but failed to load. Please check your environment.`, + { cause: loadErr }, + ); + } + } + throw new EngineError(`Semantic search requires ${pkg}.\nInstall it with: npm install ${pkg}`); + } +} + +/** + * Dispose the current ONNX session and free memory. + * Safe to call when no model is loaded (no-op). + */ +export async function disposeModel() { + if (extractor) { + await extractor.dispose(); + extractor = null; + } + activeModel = null; +} + +async function loadModel(modelKey) { + const config = getModelConfig(modelKey); + + if (extractor && activeModel === config.name) return { extractor, config }; + + // Dispose previous model before loading a different one + await disposeModel(); + + const transformers = await loadTransformers(); + pipeline = transformers.pipeline; + _cos_sim = transformers.cos_sim; + + info(`Loading embedding model: ${config.name} (${config.dim}d)...`); + const pipelineOpts = config.quantized ? { quantized: true } : {}; + try { + extractor = await pipeline('feature-extraction', config.name, pipelineOpts); + } catch (err) { + const msg = err.message || String(err); + if (msg.includes('Unauthorized') || msg.includes('401') || msg.includes('gated')) { + throw new EngineError( + `Model "${config.name}" requires authentication.\n` + + `This model is gated on HuggingFace and needs an access token.\n\n` + + `Options:\n` + + ` 1. Set HF_TOKEN env var: export HF_TOKEN=hf_...\n` + + ` 2. 
Use a public model instead: codegraph embed --model minilm`, + { cause: err }, + ); + } + throw new EngineError( + `Failed to load model "${config.name}": ${msg}\n` + + `Try a different model: codegraph embed --model minilm`, + { cause: err }, + ); + } + activeModel = config.name; + info('Model loaded.'); + return { extractor, config }; +} + +/** + * Generate embeddings for an array of texts. + */ +export async function embed(texts, modelKey) { + const { extractor: ext, config } = await loadModel(modelKey); + const dim = config.dim; + const results = []; + const batchSize = BATCH_SIZE_MAP[modelKey || DEFAULT_MODEL] || DEFAULT_BATCH_SIZE; + + for (let i = 0; i < texts.length; i += batchSize) { + const batch = texts.slice(i, i + batchSize); + const output = await ext(batch, { pooling: 'mean', normalize: true }); + + for (let j = 0; j < batch.length; j++) { + const start = j * dim; + const vec = new Float32Array(dim); + for (let k = 0; k < dim; k++) { + vec[k] = output.data[start + k]; + } + results.push(vec); + } + + if (texts.length > batchSize) { + process.stdout.write(` Embedded ${Math.min(i + batchSize, texts.length)}/${texts.length}\r`); + } + } + + return { vectors: results, dim }; +} diff --git a/src/embeddings/search/cli-formatter.js b/src/embeddings/search/cli-formatter.js new file mode 100644 index 00000000..f79a9e27 --- /dev/null +++ b/src/embeddings/search/cli-formatter.js @@ -0,0 +1,151 @@ +import { warn } from '../../logger.js'; +import { hybridSearchData } from './hybrid.js'; +import { ftsSearchData } from './keyword.js'; +import { multiSearchData, searchData } from './semantic.js'; + +/** + * Search with mode support — CLI wrapper with multi-query detection. 
+ * Modes: 'hybrid' (default), 'semantic', 'keyword' + */ +export async function search(query, customDbPath, opts = {}) { + const mode = opts.mode || 'hybrid'; + + // Split by semicolons, trim, filter empties + const queries = query + .split(';') + .map((q) => q.trim()) + .filter((q) => q.length > 0); + + const kindIcon = (kind) => (kind === 'function' ? 'f' : kind === 'class' ? '*' : 'o'); + + // ─── Keyword-only mode ────────────────────────────────────────────── + if (mode === 'keyword') { + const singleQuery = queries.length === 1 ? queries[0] : query; + const data = ftsSearchData(singleQuery, customDbPath, opts); + if (!data) { + console.log('No FTS5 index found. Run `codegraph embed` to build the keyword index.'); + return; + } + + if (opts.json) { + console.log(JSON.stringify(data, null, 2)); + return; + } + + console.log(`\nKeyword search: "${singleQuery}" (BM25)\n`); + if (data.results.length === 0) { + console.log(' No results found.'); + } else { + for (const r of data.results) { + console.log( + ` BM25 ${r.bm25Score.toFixed(2)} ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`, + ); + } + } + console.log(`\n ${data.results.length} results shown\n`); + return; + } + + // ─── Semantic-only mode ───────────────────────────────────────────── + if (mode === 'semantic') { + if (queries.length <= 1) { + const singleQuery = queries[0] || query; + const data = await searchData(singleQuery, customDbPath, opts); + if (!data) return; + + if (opts.json) { + console.log(JSON.stringify(data, null, 2)); + return; + } + + console.log(`\nSemantic search: "${singleQuery}"\n`); + if (data.results.length === 0) { + console.log(' No results above threshold.'); + } else { + for (const r of data.results) { + const bar = '#'.repeat(Math.round(r.similarity * 20)); + console.log(` ${(r.similarity * 100).toFixed(1)}% ${bar}`); + console.log(` ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`); + } + } + console.log(`\n ${data.results.length} results shown\n`); + } else { + 
const data = await multiSearchData(queries, customDbPath, opts); + if (!data) return; + + if (opts.json) { + console.log(JSON.stringify(data, null, 2)); + return; + } + + console.log(`\nMulti-query semantic search (RRF, k=${opts.rrfK || 60}):`); + for (let i = 0; i < queries.length; i++) console.log(` [${i + 1}] "${queries[i]}"`); + console.log(); + if (data.results.length === 0) { + console.log(' No results above threshold.'); + } else { + for (const r of data.results) { + console.log( + ` RRF ${r.rrf.toFixed(4)} ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`, + ); + for (const qs of r.queryScores) { + const bar = '#'.repeat(Math.round(qs.similarity * 20)); + console.log( + ` [${queries.indexOf(qs.query) + 1}] ${(qs.similarity * 100).toFixed(1)}% ${bar} (rank ${qs.rank})`, + ); + } + } + } + console.log(`\n ${data.results.length} results shown\n`); + } + return; + } + + // ─── Hybrid mode (default) ────────────────────────────────────────── + const data = await hybridSearchData(query, customDbPath, opts); + + if (!data) { + // No FTS5 index — fall back to semantic-only + warn( + 'FTS5 index not found — using semantic search only. 
Re-run `codegraph embed` to enable hybrid mode.', + ); + return search(query, customDbPath, { ...opts, mode: 'semantic' }); + } + + if (opts.json) { + console.log(JSON.stringify(data, null, 2)); + return; + } + + const rrfK = opts.rrfK || 60; + if (queries.length <= 1) { + const singleQuery = queries[0] || query; + console.log(`\nHybrid search: "${singleQuery}" (BM25 + semantic, RRF k=${rrfK})\n`); + } else { + console.log(`\nHybrid multi-query search (BM25 + semantic, RRF k=${rrfK}):`); + for (let i = 0; i < queries.length; i++) console.log(` [${i + 1}] "${queries[i]}"`); + console.log(); + } + + if (data.results.length === 0) { + console.log(' No results found.'); + } else { + for (const r of data.results) { + console.log( + ` RRF ${r.rrf.toFixed(4)} ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`, + ); + const parts = []; + if (r.bm25Rank != null) { + parts.push(`BM25: rank ${r.bm25Rank} (score ${r.bm25Score.toFixed(2)})`); + } + if (r.semanticRank != null) { + parts.push(`Semantic: rank ${r.semanticRank} (${(r.similarity * 100).toFixed(1)}%)`); + } + if (parts.length > 0) { + console.log(` ${parts.join(' | ')}`); + } + } + } + + console.log(`\n ${data.results.length} results shown\n`); +} diff --git a/src/embeddings/search/filters.js b/src/embeddings/search/filters.js new file mode 100644 index 00000000..465e51e0 --- /dev/null +++ b/src/embeddings/search/filters.js @@ -0,0 +1,46 @@ +/** + * Match a file path against a glob pattern. + * Supports *, **, and ? wildcards. Zero dependencies. + */ +export function globMatch(filePath, pattern) { + // Normalize separators to forward slashes + const normalized = filePath.replace(/\\/g, '/'); + // Escape regex specials except glob chars + let regex = pattern.replace(/\\/g, '/').replace(/[.+^${}()|[\]\\]/g, '\\$&'); + // Replace ** first (matches any path segment), then * and ? 
+ regex = regex.replace(/\*\*/g, '\0'); + regex = regex.replace(/\*/g, '[^/]*'); + regex = regex.replace(/\0/g, '.*'); + regex = regex.replace(/\?/g, '[^/]'); + try { + return new RegExp(`^${regex}$`).test(normalized); + } catch { + // Malformed pattern — fall back to substring match + return normalized.includes(pattern); + } +} + +const TEST_PATTERN = /\.(test|spec)\.|__test__|__tests__|\.stories\./; + +/** + * Apply post-query filters (glob pattern, noTests) to a set of rows. + * Mutates nothing — returns a new filtered array. + * @param {Array} rows - Rows with at least a `file` property + * @param {object} opts + * @param {string} [opts.filePattern] - Glob pattern (only applied if it contains glob chars) + * @param {boolean} [opts.noTests] - Exclude test/spec files + * @param {boolean} [opts.isGlob] - Pre-computed: does filePattern contain glob chars? + * @returns {Array} + */ +export function applyFilters(rows, opts = {}) { + let filtered = rows; + const isGlob = + opts.isGlob !== undefined ? opts.isGlob : opts.filePattern && /[*?[\]]/.test(opts.filePattern); + if (isGlob) { + filtered = filtered.filter((row) => globMatch(row.file, opts.filePattern)); + } + if (opts.noTests) { + filtered = filtered.filter((row) => !TEST_PATTERN.test(row.file)); + } + return filtered; +} diff --git a/src/embeddings/search/hybrid.js b/src/embeddings/search/hybrid.js new file mode 100644 index 00000000..759e91c7 --- /dev/null +++ b/src/embeddings/search/hybrid.js @@ -0,0 +1,121 @@ +import { openReadonlyOrFail } from '../../db.js'; +import { hasFtsIndex } from '../stores/fts5.js'; +import { ftsSearchData } from './keyword.js'; +import { searchData } from './semantic.js'; + +/** + * Hybrid BM25 + semantic search with RRF fusion. + * Returns { results: [{ name, kind, file, line, rrf, bm25Score, bm25Rank, similarity, semanticRank }] } + * or null if no FTS5 index (caller should fall back to semantic-only). 
+ */ +export async function hybridSearchData(query, customDbPath, opts = {}) { + const limit = opts.limit || 15; + const k = opts.rrfK || 60; + const topK = (opts.limit || 15) * 5; + + // Split semicolons for multi-query support + const queries = + typeof query === 'string' + ? query + .split(';') + .map((q) => q.trim()) + .filter((q) => q.length > 0) + : [query]; + + // Check FTS5 availability first (sync, cheap) + const checkDb = openReadonlyOrFail(customDbPath); + const ftsAvailable = hasFtsIndex(checkDb); + checkDb.close(); + if (!ftsAvailable) return null; + + // Collect ranked lists: for each query, one BM25 list + one semantic list + const rankedLists = []; + + for (const q of queries) { + // BM25 ranked list (sync) + const bm25Data = ftsSearchData(q, customDbPath, { ...opts, limit: topK }); + if (bm25Data?.results) { + rankedLists.push( + bm25Data.results.map((r, idx) => ({ + key: `${r.name}:${r.file}:${r.line}`, + rank: idx + 1, + source: 'bm25', + ...r, + })), + ); + } + + // Semantic ranked list (async) + const semData = await searchData(q, customDbPath, { + ...opts, + limit: topK, + minScore: opts.minScore || 0.2, + }); + if (semData?.results) { + rankedLists.push( + semData.results.map((r, idx) => ({ + key: `${r.name}:${r.file}:${r.line}`, + rank: idx + 1, + source: 'semantic', + ...r, + })), + ); + } + } + + // RRF fusion across all ranked lists + const fusionMap = new Map(); + for (const list of rankedLists) { + for (const item of list) { + if (!fusionMap.has(item.key)) { + fusionMap.set(item.key, { + name: item.name, + kind: item.kind, + file: item.file, + line: item.line, + endLine: item.endLine ?? null, + role: item.role ?? null, + fileHash: item.fileHash ?? 
null, + rrfScore: 0, + bm25Score: null, + bm25Rank: null, + similarity: null, + semanticRank: null, + }); + } + const entry = fusionMap.get(item.key); + entry.rrfScore += 1 / (k + item.rank); + if (item.source === 'bm25') { + if (entry.bm25Rank === null || item.rank < entry.bm25Rank) { + entry.bm25Score = item.bm25Score; + entry.bm25Rank = item.rank; + } + } else { + if (entry.semanticRank === null || item.rank < entry.semanticRank) { + entry.similarity = item.similarity; + entry.semanticRank = item.rank; + } + } + } + } + + const results = [...fusionMap.values()] + .sort((a, b) => b.rrfScore - a.rrfScore) + .slice(0, limit) + .map((e) => ({ + name: e.name, + kind: e.kind, + file: e.file, + line: e.line, + endLine: e.endLine, + role: e.role, + fileHash: e.fileHash, + rrf: e.rrfScore, + bm25Score: e.bm25Score, + bm25Rank: e.bm25Rank, + similarity: e.similarity, + semanticRank: e.semanticRank, + })); + + return { results }; +} diff --git a/src/embeddings/search/keyword.js b/src/embeddings/search/keyword.js new file mode 100644 index 00000000..cc8975d3 --- /dev/null +++ b/src/embeddings/search/keyword.js @@ -0,0 +1,68 @@ +import { openReadonlyOrFail } from '../../db.js'; +import { normalizeSymbol } from '../../queries.js'; +import { hasFtsIndex, sanitizeFtsQuery } from '../stores/fts5.js'; +import { applyFilters } from './filters.js'; + +/** + * BM25 keyword search via FTS5. + * Returns { results: [{ name, kind, file, line, bm25Score }] } or null if no FTS5 index. + */ +export function ftsSearchData(query, customDbPath, opts = {}) { + const limit = opts.limit || 15; + + const db = openReadonlyOrFail(customDbPath); + + try { + if (!hasFtsIndex(db)) { + return null; + } + + const ftsQuery = sanitizeFtsQuery(query); + if (!ftsQuery) { + return { results: [] }; + } + + let sql = ` + SELECT f.rowid AS node_id, rank AS bm25_score, + n.name, n.kind, n.file, n.line, n.end_line, n.role + FROM fts_index f + JOIN nodes n ON f.rowid = n.id + WHERE fts_index MATCH ? 
/**
 * Shared setup for search functions: opens DB, validates embeddings/model, loads rows.
 * Returns { db, rows, modelKey, storedDim } or null on failure (prints error).
 *
 * Ownership: on the null path the DB is closed here; on the happy path the
 * caller owns closing `db`. If anything throws after the open, the DB is
 * closed before the error propagates, so the handle never leaks.
 */
export function prepareSearch(customDbPath, opts = {}) {
  const db = openReadonlyOrFail(customDbPath);

  try {
    if (!hasEmbeddings(db)) {
      console.log('No embeddings found. Run `codegraph embed` first.');
      db.close();
      return null;
    }

    const storedModel = getEmbeddingMeta(db, 'model') || null;
    const dimStr = getEmbeddingMeta(db, 'dim');
    const storedDim = dimStr ? parseInt(dimStr, 10) : null;

    // Resolve a model key from the stored model name unless one was given explicitly.
    let modelKey = opts.model || null;
    if (!modelKey && storedModel) {
      for (const [key, config] of Object.entries(MODELS)) {
        if (config.name === storedModel) {
          modelKey = key;
          break;
        }
      }
    }

    // Pre-filter: allow filtering by kind or file pattern to reduce search space.
    // Glob patterns cannot be expressed as LIKE, so they are applied post-query
    // by applyFilters instead.
    const isGlob = opts.filePattern && /[*?[\]]/.test(opts.filePattern);
    let sql = `
    SELECT e.node_id, e.vector, e.text_preview, n.name, n.kind, n.file, n.line, n.end_line, n.role
    FROM embeddings e
    JOIN nodes n ON e.node_id = n.id
  `;
    const params = [];
    const conditions = [];
    if (opts.kind) {
      conditions.push('n.kind = ?');
      params.push(opts.kind);
    }
    if (opts.filePattern && !isGlob) {
      conditions.push('n.file LIKE ?');
      params.push(`%${opts.filePattern}%`);
    }
    if (conditions.length > 0) {
      sql += ` WHERE ${conditions.join(' AND ')}`;
    }

    let rows = db.prepare(sql).all(...params);
    rows = applyFilters(rows, { ...opts, isGlob });

    return { db, rows, modelKey, storedDim };
  } catch (err) {
    // Don't leak the handle if anything above throws unexpectedly.
    db.close();
    throw err;
  }
}
/**
 * Single-query semantic search — returns data instead of printing.
 *
 * @param {string} query - Natural-language query text.
 * @param {string} [customDbPath] - Optional database path override.
 * @param {object} [opts] - { limit, minScore, model, kind, filePattern, ... }
 * @returns {Promise<{results: Array<object>}|null>} Matches with similarity >= minScore,
 *   sorted descending, or null on failure (no embeddings / dimension mismatch).
 */
export async function searchData(query, customDbPath, opts = {}) {
  // `??` (not `||`) so explicit 0 values are honored: minScore 0 means
  // "no threshold", and limit 0 is not silently turned into 15.
  const limit = opts.limit ?? 15;
  const minScore = opts.minScore ?? 0.2;

  const prepared = prepareSearch(customDbPath, opts);
  if (!prepared) return null;
  const { db, rows, modelKey, storedDim } = prepared;

  try {
    const {
      vectors: [queryVec],
      dim,
    } = await embed([query], modelKey);

    // A dimension mismatch means the index was built with a different model;
    // comparing vectors across models is meaningless, so bail out.
    if (storedDim && dim !== storedDim) {
      console.log(
        `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`,
      );
      console.log(`  Re-run \`codegraph embed\` with the same model, or use --model to match.`);
      return null;
    }

    const hc = new Map();
    const results = [];
    for (const row of rows) {
      // Stored vectors are raw Float32 blobs; reinterpret without copying per element.
      const vec = new Float32Array(new Uint8Array(row.vector).buffer);
      const sim = cosineSim(queryVec, vec);

      if (sim >= minScore) {
        results.push({
          ...normalizeSymbol(row, db, hc),
          similarity: sim,
        });
      }
    }

    results.sort((a, b) => b.similarity - a.similarity);
    return { results: results.slice(0, limit) };
  } finally {
    db.close();
  }
}
/**
 * Multi-query semantic search with Reciprocal Rank Fusion (RRF).
 *
 * @param {string[]} queries - Distinct query texts to fuse.
 * @param {string} [customDbPath] - Optional database path override.
 * @param {object} [opts] - { limit, minScore, rrfK, model, kind, filePattern, ... }
 * @returns {Promise<{results: Array<object>}|null>} Fused matches with rrf score and
 *   per-query score breakdown, or null on failure.
 */
export async function multiSearchData(queries, customDbPath, opts = {}) {
  // `??` (not `||`) so explicit 0 values are honored (e.g. minScore 0).
  const limit = opts.limit ?? 15;
  const minScore = opts.minScore ?? 0.2;
  const k = opts.rrfK ?? 60;

  const prepared = prepareSearch(customDbPath, opts);
  if (!prepared) return null;
  const { db, rows, modelKey, storedDim } = prepared;

  try {
    const { vectors: queryVecs, dim } = await embed(queries, modelKey);

    // Warn about similar queries that may bias RRF results
    const SIMILARITY_WARN_THRESHOLD = 0.85;
    for (let i = 0; i < queryVecs.length; i++) {
      for (let j = i + 1; j < queryVecs.length; j++) {
        const sim = cosineSim(queryVecs[i], queryVecs[j]);
        if (sim >= SIMILARITY_WARN_THRESHOLD) {
          warn(
            `Queries "${queries[i]}" and "${queries[j]}" are very similar ` +
              `(${(sim * 100).toFixed(0)}% cosine similarity). ` +
              `This may bias RRF results toward their shared matches. ` +
              `Consider using more distinct queries.`,
          );
        }
      }
    }

    // A dimension mismatch means the index was built with a different model.
    if (storedDim && dim !== storedDim) {
      console.log(
        `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`,
      );
      console.log(`  Re-run \`codegraph embed\` with the same model, or use --model to match.`);
      return null;
    }

    // Parse row vectors once (reinterpret stored Float32 blobs)
    const rowVecs = rows.map((row) => new Float32Array(new Uint8Array(row.vector).buffer));

    // For each query: compute similarities, filter by minScore, rank (1-indexed)
    const perQueryRanked = queries.map((_query, qi) => {
      const scored = [];
      for (let ri = 0; ri < rows.length; ri++) {
        const sim = cosineSim(queryVecs[qi], rowVecs[ri]);
        if (sim >= minScore) {
          scored.push({ rowIndex: ri, similarity: sim });
        }
      }
      scored.sort((a, b) => b.similarity - a.similarity);
      return scored.map((item, rank) => ({ ...item, rank: rank + 1 }));
    });

    // Fuse results using RRF: for each unique row, sum 1/(k + rank_i) across queries
    const fusionMap = new Map(); // rowIndex -> { rrfScore, queryScores[] }
    for (let qi = 0; qi < queries.length; qi++) {
      for (const item of perQueryRanked[qi]) {
        if (!fusionMap.has(item.rowIndex)) {
          fusionMap.set(item.rowIndex, { rrfScore: 0, queryScores: [] });
        }
        const entry = fusionMap.get(item.rowIndex);
        entry.rrfScore += 1 / (k + item.rank);
        entry.queryScores.push({
          query: queries[qi],
          similarity: item.similarity,
          rank: item.rank,
        });
      }
    }

    // Build results sorted by RRF score
    const hc = new Map();
    const results = [];
    for (const [rowIndex, entry] of fusionMap) {
      const row = rows[rowIndex];
      results.push({
        ...normalizeSymbol(row, db, hc),
        rrf: entry.rrfScore,
        queryScores: entry.queryScores,
      });
    }

    results.sort((a, b) => b.rrf - a.rrf);
    return { results: results.slice(0, limit) };
  } finally {
    db.close();
  }
}
/**
 * Sanitize a user query for FTS5 MATCH syntax.
 * Strips characters that are FTS5 operators, quotes each remaining token,
 * and joins multiple tokens with OR (implicit any-token match).
 *
 * @param {string} query - Raw user query.
 * @returns {string|null} A safe MATCH expression, or null if nothing searchable remains.
 */
export function sanitizeFtsQuery(query) {
  // Remove FTS5 special chars that could cause syntax errors
  const cleaned = query.replace(/[*"():^{}~<>]/g, ' ').trim();
  if (!cleaned) return null;
  // A non-empty trimmed string always splits into >= 1 non-empty token, so no
  // further emptiness checks are needed; quoting one token and OR-joining many
  // collapse into the same map/join expression.
  return cleaned
    .split(/\s+/)
    .map((t) => `"${t}"`)
    .join(' OR ');
}
/**
 * Cosine similarity between two Float32Arrays.
 * Returns 0 when either vector has zero magnitude, instead of NaN from 0/0
 * (e.g. a corrupted or all-zero stored vector). In practice embed() stores
 * L2-normalised vectors, but this makes the contract explicit.
 *
 * @param {Float32Array} a
 * @param {Float32Array} b
 * @returns {number} Similarity in [-1, 1]; 0 for a zero-magnitude input.
 */
export function cosineSim(a, b) {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  const denom = Math.sqrt(normA) * Math.sqrt(normB);
  // Guard the degenerate case explicitly rather than returning NaN.
  return denom === 0 ? 0 : dot / denom;
}
/**
 * Build graph-enriched text for a symbol using dependency context.
 * Produces compact, semantic text (~100 tokens) instead of full source code.
 *
 * @param {object} node - Symbol row ({ id, name, kind, line, end_line, ... }).
 * @param {string} file - File path for display.
 * @param {string[]} lines - File contents split into lines.
 * @param {object} db - Open database handle for graph lookups.
 * @returns {string} Newline-joined descriptive text.
 */
export function buildStructuredText(node, file, lines, db) {
  const startLine = Math.max(0, node.line - 1);
  const readable = splitIdentifier(node.name);
  const parts = [`${node.kind} ${node.name} (${readable}) in ${file}`];

  // Best-effort parameter extraction from the signature (single-line only).
  const signature = lines[startLine] || '';
  const paramText = signature.match(/\(([^)]*)\)/)?.[1]?.trim();
  if (paramText) {
    parts.push(`Parameters: ${paramText}`);
  }

  // Graph context: outgoing and incoming call edges, each capped at 10 names.
  const callees = findCalleeNames(db, node.id);
  if (callees.length > 0) {
    parts.push(`Calls: ${callees.slice(0, 10).join(', ')}`);
  }
  const callers = findCallerNames(db, node.id);
  if (callers.length > 0) {
    parts.push(`Called by: ${callers.slice(0, 10).join(', ')}`);
  }

  // Prefer the leading comment (high semantic value); otherwise fall back to
  // the first few lines of the body as a snippet.
  const comment = extractLeadingComment(lines, startLine);
  if (comment) {
    parts.push(comment);
  } else {
    const snippet = lines
      .slice(startLine, Math.min(lines.length, startLine + 4))
      .join('\n')
      .trim();
    if (snippet) {
      parts.push(snippet);
    }
  }

  return parts.join('\n');
}
/**
 * Extract leading comment text (JSDoc, //, #, etc.) above a function line.
 * Scans at most 15 lines upward; blank lines between the function and its
 * comment block are tolerated, while a blank line above the comment block
 * (or any code line) ends the scan.
 *
 * @param {string[]} lines - File contents split into lines.
 * @param {number} fnLineIndex - 0-based index of the function's first line.
 * @returns {string|null} Cleaned, space-joined comment text, or null if none found.
 */
export function extractLeadingComment(lines, fnLineIndex) {
  if (fnLineIndex > lines.length) return null;

  const commentPrefix = /^(\/\/|\/\*|\*\/|\*|#|\/\/\/)/;
  const collected = [];
  const lowest = Math.max(0, fnLineIndex - 15);

  for (let i = fnLineIndex - 1; i >= lowest; i--) {
    if (i >= lines.length) continue; // tolerate an out-of-range start index
    const text = lines[i].trim();
    if (commentPrefix.test(text)) {
      collected.unshift(text);
      continue;
    }
    if (text === '' && collected.length === 0) {
      continue; // skip blank lines below the comment block
    }
    break; // code line, or blank line above the comment block — stop scanning
  }

  if (collected.length === 0) return null;

  // Strip comment markers line by line, drop lines that were pure markup,
  // and join the rest into a single space-separated string.
  return collected
    .map((line) =>
      line
        .replace(/^\/\*\*?\s?|\*\/$/g, '') // opening /** or /* and closing */
        .replace(/^\*\s?/, '') // middle * lines
        .replace(/^\/\/\/?\s?/, '') // // or ///
        .replace(/^#\s?/, '') // # (Python/Ruby)
        .trim(),
    )
    .filter((part) => part.length > 0)
    .join(' ');
}
from './embeddings/index.js'; export { AnalysisError, BoundaryError, diff --git a/src/mcp/tools/semantic-search.js b/src/mcp/tools/semantic-search.js index 06ef8354..2fa22ea5 100644 --- a/src/mcp/tools/semantic-search.js +++ b/src/mcp/tools/semantic-search.js @@ -11,7 +11,7 @@ export async function handler(args, ctx) { }; if (mode === 'keyword') { - const { ftsSearchData } = await import('../../embedder.js'); + const { ftsSearchData } = await import('../../embeddings/index.js'); const result = ftsSearchData(args.query, ctx.dbPath, searchOpts); if (result === null) { return { @@ -28,7 +28,7 @@ export async function handler(args, ctx) { } if (mode === 'semantic') { - const { searchData } = await import('../../embedder.js'); + const { searchData } = await import('../../embeddings/index.js'); const result = await searchData(args.query, ctx.dbPath, searchOpts); if (result === null) { return { @@ -45,7 +45,7 @@ export async function handler(args, ctx) { } // hybrid (default) — falls back to semantic if no FTS5 - const { hybridSearchData, searchData } = await import('../../embedder.js'); + const { hybridSearchData, searchData } = await import('../../embeddings/index.js'); let result = await hybridSearchData(args.query, ctx.dbPath, searchOpts); if (result === null) { result = await searchData(args.query, ctx.dbPath, searchOpts); diff --git a/tests/search/embedder-search.test.js b/tests/search/embedder-search.test.js index 93ea518c..86fe5543 100644 --- a/tests/search/embedder-search.test.js +++ b/tests/search/embedder-search.test.js @@ -38,7 +38,7 @@ import { multiSearchData, search, searchData, -} from '../../src/embedder.js'; +} from '../../src/embeddings/index.js'; // ─── Helpers ─────────────────────────────────────────────────────────── diff --git a/tests/search/embedding-regression.test.js b/tests/search/embedding-regression.test.js index f1004bf3..56222875 100644 --- a/tests/search/embedding-regression.test.js +++ b/tests/search/embedding-regression.test.js @@ -23,7 
+23,7 @@ try { // Lazy-import to avoid top-level errors when transformers is missing const { buildGraph } = await import('../../src/builder.js'); -const { buildEmbeddings, searchData } = await import('../../src/embedder.js'); +const { buildEmbeddings, searchData } = await import('../../src/embeddings/index.js'); // Same ES-module fixture files used by build.test.js const FIXTURE_FILES = { diff --git a/tests/search/embedding-strategy.test.js b/tests/search/embedding-strategy.test.js index e1553678..70215559 100644 --- a/tests/search/embedding-strategy.test.js +++ b/tests/search/embedding-strategy.test.js @@ -31,7 +31,7 @@ import { EMBEDDING_STRATEGIES, estimateTokens, MODELS, -} from '../../src/embedder.js'; +} from '../../src/embeddings/index.js'; // ─── Helpers ─────────────────────────────────────────────────────────── diff --git a/tests/unit/prompt-install.test.js b/tests/unit/prompt-install.test.js index 6a36c2de..98cd926f 100644 --- a/tests/unit/prompt-install.test.js +++ b/tests/unit/prompt-install.test.js @@ -44,7 +44,7 @@ describe('loadTransformers install prompt', () => { throw new Error('Cannot find package'); }); - const { embed } = await import('../../src/embedder.js'); + const { embed } = await import('../../src/embeddings/index.js'); await expect(embed(['test'], 'minilm')).rejects.toThrow( 'Semantic search requires @huggingface/transformers', @@ -71,7 +71,7 @@ describe('loadTransformers install prompt', () => { throw new Error('Cannot find package'); }); - const { embed } = await import('../../src/embedder.js'); + const { embed } = await import('../../src/embeddings/index.js'); await expect(embed(['test'], 'minilm')).rejects.toThrow( 'Semantic search requires @huggingface/transformers', @@ -99,7 +99,7 @@ describe('loadTransformers install prompt', () => { throw new Error('Cannot find package'); }); - const { embed } = await import('../../src/embedder.js'); + const { embed } = await import('../../src/embeddings/index.js'); await expect(embed(['test'], 
'minilm')).rejects.toThrow( 'Semantic search requires @huggingface/transformers', @@ -137,7 +137,7 @@ describe('loadTransformers install prompt', () => { }; }); - const { embed } = await import('../../src/embedder.js'); + const { embed } = await import('../../src/embeddings/index.js'); const result = await embed(['test text'], 'minilm'); expect(result.vectors).toHaveLength(1); From 83678c3959553a21b6b98ff93239ea7f5c8a368e Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Fri, 13 Mar 2026 02:24:23 -0600 Subject: [PATCH 4/8] fix: address review feedback on embedder extraction - Remove dead _cos_sim variable from models.js (greptile) - Fix embedding-benchmark.js import path (greptile) - Update workflow path filters and cache keys for new directory (greptile) - Update stale file references in test comments and CLAUDE.md (greptile) Impact: 1 functions changed, 1 affected --- .github/workflows/benchmark.yml | 2 +- .github/workflows/embedding-regression.yml | 2 +- CLAUDE.md | 2 +- scripts/embedding-benchmark.js | 2 +- src/embeddings/models.js | 2 -- tests/unit/prompt-install.test.js | 2 +- 6 files changed, 5 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 0393e4b9..670b8b9e 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -228,7 +228,7 @@ jobs: uses: actions/cache@v5 with: path: ~/.cache/huggingface - key: hf-models-${{ runner.os }}-${{ hashFiles('src/embedder.js') }} + key: hf-models-${{ runner.os }}-${{ hashFiles('src/embeddings/**') }} restore-keys: hf-models-${{ runner.os }}- - name: Build graph diff --git a/.github/workflows/embedding-regression.yml b/.github/workflows/embedding-regression.yml index 7cecee3f..a42cc6e7 100644 --- a/.github/workflows/embedding-regression.yml +++ b/.github/workflows/embedding-regression.yml @@ -6,7 +6,7 @@ on: workflow_dispatch: pull_request: paths: - - 'src/embedder.js' + - 
'src/embeddings/**' - 'tests/search/**' - 'package.json' diff --git a/CLAUDE.md b/CLAUDE.md index 46119fff..1b25676b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -45,7 +45,7 @@ JS source is plain JavaScript (ES modules) in `src/`. No transpilation step. The | `builder.js` | Graph building: file collection, parsing, import resolution, incremental hashing | | `parser.js` | tree-sitter WASM wrapper; `LANGUAGE_REGISTRY` + per-language extractors for functions, classes, methods, imports, exports, call sites | | `queries.js` | Query functions: symbol search, file deps, impact analysis, diff-impact; `SYMBOL_KINDS` constant defines all node kinds | -| `embedder.js` | Semantic search with `@huggingface/transformers`; multi-query RRF ranking | +| `embeddings/` | Embedding subsystem: model management, vector generation, semantic/keyword/hybrid search, CLI formatting | | `db.js` | SQLite schema and operations (`better-sqlite3`) | | `mcp.js` | MCP server exposing graph queries to AI agents; single-repo by default, `--multi-repo` to enable cross-repo access | | `cycles.js` | Circular dependency detection | diff --git a/scripts/embedding-benchmark.js b/scripts/embedding-benchmark.js index 51738074..4bc3afec 100644 --- a/scripts/embedding-benchmark.js +++ b/scripts/embedding-benchmark.js @@ -26,7 +26,7 @@ const { version, srcDir, cleanup } = await resolveBenchmarkSource(); const dbPath = path.join(root, '.codegraph', 'graph.db'); const { buildEmbeddings, MODELS, searchData, disposeModel } = await import( - srcImport(srcDir, 'embedder.js') + srcImport(srcDir, 'embeddings/index.js') ); // Redirect console.log to stderr so only JSON goes to stdout diff --git a/src/embeddings/models.js b/src/embeddings/models.js index 948ad3aa..949f6c85 100644 --- a/src/embeddings/models.js +++ b/src/embeddings/models.js @@ -5,7 +5,6 @@ import { info } from '../logger.js'; // Lazy-load transformers (heavy, optional module) let pipeline = null; -let _cos_sim = null; let extractor = null; let activeModel = 
null; @@ -157,7 +156,6 @@ async function loadModel(modelKey) { const transformers = await loadTransformers(); pipeline = transformers.pipeline; - _cos_sim = transformers.cos_sim; info(`Loading embedding model: ${config.name} (${config.dim}d)...`); const pipelineOpts = config.quantized ? { quantized: true } : {}; diff --git a/tests/unit/prompt-install.test.js b/tests/unit/prompt-install.test.js index 98cd926f..f23a73f8 100644 --- a/tests/unit/prompt-install.test.js +++ b/tests/unit/prompt-install.test.js @@ -1,5 +1,5 @@ /** - * Unit tests for the interactive install prompt in src/embedder.js. + * Unit tests for the interactive install prompt in src/embeddings/models.js. * * Tests the promptInstall() + loadTransformers() flow when * @huggingface/transformers is missing. From b1ca54d4ce6148d275e96786da38e576c4ff297c Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Fri, 13 Mar 2026 02:47:22 -0600 Subject: [PATCH 5/8] fix: harden prepareSearch with try/catch for DB leak and use getEmbeddingCount - Wrap post-open logic in try/catch so DB is closed on unexpected exceptions - Switch from hasEmbeddings to getEmbeddingCount for clearer zero-count check Impact: 1 functions changed, 0 affected --- src/embeddings/search/prepare.js | 78 ++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/src/embeddings/search/prepare.js b/src/embeddings/search/prepare.js index fae92d0d..864bfbe9 100644 --- a/src/embeddings/search/prepare.js +++ b/src/embeddings/search/prepare.js @@ -1,4 +1,4 @@ -import { getEmbeddingMeta, hasEmbeddings } from '../../db/repository/embeddings.js'; +import { getEmbeddingCount, getEmbeddingMeta } from '../../db/repository/embeddings.js'; import { openReadonlyOrFail } from '../../db.js'; import { MODELS } from '../models.js'; import { applyFilters } from './filters.js'; @@ -6,53 +6,61 @@ import { applyFilters } from './filters.js'; /** * Shared setup for search functions: opens 
DB, validates embeddings/model, loads rows. * Returns { db, rows, modelKey, storedDim } or null on failure (prints error). + * On null return, the DB is closed. On exception, the DB is also closed + * (callers only need to close DB from the returned object on the happy path). */ export function prepareSearch(customDbPath, opts = {}) { const db = openReadonlyOrFail(customDbPath); - if (!hasEmbeddings(db)) { - console.log('No embeddings found. Run `codegraph embed` first.'); - db.close(); - return null; - } + try { + const count = getEmbeddingCount(db); + if (count === 0) { + console.log('No embeddings found. Run `codegraph embed` first.'); + db.close(); + return null; + } - const storedModel = getEmbeddingMeta(db, 'model') || null; - const dimStr = getEmbeddingMeta(db, 'dim'); - const storedDim = dimStr ? parseInt(dimStr, 10) : null; + const storedModel = getEmbeddingMeta(db, 'model') || null; + const dimStr = getEmbeddingMeta(db, 'dim'); + const storedDim = dimStr ? parseInt(dimStr, 10) : null; - let modelKey = opts.model || null; - if (!modelKey && storedModel) { - for (const [key, config] of Object.entries(MODELS)) { - if (config.name === storedModel) { - modelKey = key; - break; + let modelKey = opts.model || null; + if (!modelKey && storedModel) { + for (const [key, config] of Object.entries(MODELS)) { + if (config.name === storedModel) { + modelKey = key; + break; + } } } - } - // Pre-filter: allow filtering by kind or file pattern to reduce search space - const isGlob = opts.filePattern && /[*?[\]]/.test(opts.filePattern); - let sql = ` + // Pre-filter: allow filtering by kind or file pattern to reduce search space + const isGlob = opts.filePattern && /[*?[\]]/.test(opts.filePattern); + let sql = ` SELECT e.node_id, e.vector, e.text_preview, n.name, n.kind, n.file, n.line, n.end_line, n.role FROM embeddings e JOIN nodes n ON e.node_id = n.id `; - const params = []; - const conditions = []; - if (opts.kind) { - conditions.push('n.kind = ?'); - 
params.push(opts.kind); - } - if (opts.filePattern && !isGlob) { - conditions.push('n.file LIKE ?'); - params.push(`%${opts.filePattern}%`); - } - if (conditions.length > 0) { - sql += ` WHERE ${conditions.join(' AND ')}`; - } + const params = []; + const conditions = []; + if (opts.kind) { + conditions.push('n.kind = ?'); + params.push(opts.kind); + } + if (opts.filePattern && !isGlob) { + conditions.push('n.file LIKE ?'); + params.push(`%${opts.filePattern}%`); + } + if (conditions.length > 0) { + sql += ` WHERE ${conditions.join(' AND ')}`; + } - let rows = db.prepare(sql).all(...params); - rows = applyFilters(rows, { ...opts, isGlob }); + let rows = db.prepare(sql).all(...params); + rows = applyFilters(rows, { ...opts, isGlob }); - return { db, rows, modelKey, storedDim }; + return { db, rows, modelKey, storedDim }; + } catch (err) { + db.close(); + throw err; + } } From 9d781df8d4712555030d2814b8b03a3f996791ac Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Fri, 13 Mar 2026 03:10:04 -0600 Subject: [PATCH 6/8] fix: guard cosineSim against zero-magnitude vectors returning NaN Return 0 instead of NaN when either vector has zero magnitude (e.g. corrupted DB row). In practice embed() stores L2-normalised vectors, but this makes the contract explicit. Impact: 1 functions changed, 0 affected --- src/embeddings/stores/sqlite-blob.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/embeddings/stores/sqlite-blob.js b/src/embeddings/stores/sqlite-blob.js index e3979b2c..75037ffa 100644 --- a/src/embeddings/stores/sqlite-blob.js +++ b/src/embeddings/stores/sqlite-blob.js @@ -19,5 +19,6 @@ export function cosineSim(a, b) { normA += a[i] * a[i]; normB += b[i] * b[i]; } - return dot / (Math.sqrt(normA) * Math.sqrt(normB)); + const denom = Math.sqrt(normA) * Math.sqrt(normB); + return denom === 0 ? 
0 : dot / denom; } From 78aa1d5384357697e6767a2f450d060d89a7bc05 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Fri, 13 Mar 2026 03:21:13 -0600 Subject: [PATCH 7/8] fix: add @internal JSDoc tags to non-public model helpers Mark getModelConfig, promptInstall, and loadTransformers as @internal since they are exported only for sibling module use, not the public barrel. --- src/embeddings/models.js | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/embeddings/models.js b/src/embeddings/models.js index 949f6c85..1202dd28 100644 --- a/src/embeddings/models.js +++ b/src/embeddings/models.js @@ -74,6 +74,7 @@ const BATCH_SIZE_MAP = { }; const DEFAULT_BATCH_SIZE = 32; +/** @internal Used by generator.js — not part of the public barrel. */ export function getModelConfig(modelKey) { const key = modelKey || DEFAULT_MODEL; const config = MODELS[key]; @@ -87,6 +88,7 @@ export function getModelConfig(modelKey) { * Prompt the user to install a missing package interactively. * Returns true if the package was installed, false otherwise. * Skips the prompt entirely in non-TTY environments (CI, piped stdin). + * @internal Not part of the public barrel. */ export function promptInstall(packageName) { if (!process.stdin.isTTY) return Promise.resolve(false); @@ -113,6 +115,7 @@ export function promptInstall(packageName) { * Lazy-load @huggingface/transformers. * If the package is missing, prompts the user to install it interactively. * In non-TTY environments, prints an error and exits. + * @internal Not part of the public barrel. 
*/ export async function loadTransformers() { try { From 37ba7e13967f24c78b128e8fac14c8f37f450b57 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Fri, 13 Mar 2026 03:36:49 -0600 Subject: [PATCH 8/8] =?UTF-8?q?fix:=20unexport=20initEmbeddingsSchema=20?= =?UTF-8?q?=E2=80=94=20only=20used=20within=20generator.js?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: 1 functions changed, 1 affected --- src/embeddings/generator.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/embeddings/generator.js b/src/embeddings/generator.js index 8721e2ac..b34f5934 100644 --- a/src/embeddings/generator.js +++ b/src/embeddings/generator.js @@ -15,7 +15,7 @@ export function estimateTokens(text) { return Math.ceil(text.length / 4); } -export function initEmbeddingsSchema(db) { +function initEmbeddingsSchema(db) { db.exec(` CREATE TABLE IF NOT EXISTS embeddings ( node_id INTEGER PRIMARY KEY,