From 340ce60a549680eb0f3d46273a59c2407b1ebc8d Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Fri, 13 Mar 2026 00:09:09 -0600 Subject: [PATCH 1/8] refactor: domain error hierarchy replacing ad-hoc error handling (ROADMAP 3.8) Add structured domain errors (CodegraphError base + 7 subclasses) to replace the mix of process.exit(1), throw new Error, and console.error scattered across library code. - New src/errors.js with ParseError, DbError, ConfigError, ResolutionError, EngineError, AnalysisError, BoundaryError - Library code throws domain errors instead of calling process.exit(1) - CLI top-level catch formats CodegraphError with [CODE] prefix - MCP catch returns structured { isError, code } responses - CLI commands use parseAsync() so async errors propagate - CI gate commands (check, manifesto) use process.exitCode instead of exit - All error classes exported from public API (src/index.js) Impact: 52 functions changed, 215 affected --- src/ast-analysis/shared.js | 9 ++-- src/batch.js | 3 +- src/cli.js | 8 +++- src/cli/commands/ast.js | 5 +- src/cli/commands/batch.js | 9 ++-- src/cli/commands/check.js | 11 +++-- src/cli/commands/co-change.js | 5 +- src/cli/commands/registry.js | 4 +- src/cli/commands/snapshot.js | 54 +++++++-------------- src/cli/commands/triage.js | 12 ++--- src/cli/index.js | 6 +-- src/commands/check.js | 10 ++-- src/commands/manifesto.js | 6 +-- src/db/connection.js | 8 ++-- src/db/query-builder.js | 11 +++-- src/db/repository/nodes.js | 7 ++- src/embedder.js | 40 ++++++++-------- src/errors.js | 78 +++++++++++++++++++++++++++++++ src/index.js | 11 +++++ src/mcp/server.js | 20 ++++---- src/native.js | 4 +- src/snapshot.js | 11 +++-- src/watcher.js | 4 +- tests/unit/db.test.js | 27 +++++------ tests/unit/errors.test.js | 69 +++++++++++++++++++++++++++ tests/unit/prompt-install.test.js | 35 ++++++++------ 26 files changed, 316 insertions(+), 151 deletions(-) create mode 100644 src/errors.js create mode 
100644 tests/unit/errors.test.js diff --git a/src/ast-analysis/shared.js b/src/ast-analysis/shared.js index 6d7ed6dc..f5d8e0be 100644 --- a/src/ast-analysis/shared.js +++ b/src/ast-analysis/shared.js @@ -2,6 +2,7 @@ * Shared utilities for AST analysis modules (complexity, CFG, dataflow, AST nodes). */ +import { ConfigError } from '../errors.js'; import { LANGUAGE_REGISTRY } from '../parser.js'; // ─── Generic Rule Factory ───────────────────────────────────────────────── @@ -18,7 +19,7 @@ export function makeRules(defaults, overrides, label) { const validKeys = new Set(Object.keys(defaults)); for (const key of Object.keys(overrides)) { if (!validKeys.has(key)) { - throw new Error(`${label} rules: unknown key "${key}"`); + throw new ConfigError(`${label} rules: unknown key "${key}"`); } } return { ...defaults, ...overrides }; @@ -61,10 +62,10 @@ export const CFG_DEFAULTS = { export function makeCfgRules(overrides) { const rules = makeRules(CFG_DEFAULTS, overrides, 'CFG'); if (!(rules.functionNodes instanceof Set) || rules.functionNodes.size === 0) { - throw new Error('CFG rules: functionNodes must be a non-empty Set'); + throw new ConfigError('CFG rules: functionNodes must be a non-empty Set'); } if (!(rules.forNodes instanceof Set)) { - throw new Error('CFG rules: forNodes must be a Set'); + throw new ConfigError('CFG rules: forNodes must be a Set'); } return rules; } @@ -136,7 +137,7 @@ export const DATAFLOW_DEFAULTS = { export function makeDataflowRules(overrides) { const rules = makeRules(DATAFLOW_DEFAULTS, overrides, 'Dataflow'); if (!(rules.functionNodes instanceof Set) || rules.functionNodes.size === 0) { - throw new Error('Dataflow rules: functionNodes must be a non-empty Set'); + throw new ConfigError('Dataflow rules: functionNodes must be a non-empty Set'); } return rules; } diff --git a/src/batch.js b/src/batch.js index cdb25dfc..fb4ce88d 100644 --- a/src/batch.js +++ b/src/batch.js @@ -7,6 +7,7 @@ import { complexityData } from './complexity.js'; import 
{ dataflowData } from './dataflow.js'; +import { ConfigError } from './errors.js'; import { flowData } from './flow.js'; import { contextData, @@ -53,7 +54,7 @@ export const BATCH_COMMANDS = { export function batchData(command, targets, customDbPath, opts = {}) { const entry = BATCH_COMMANDS[command]; if (!entry) { - throw new Error( + throw new ConfigError( `Unknown batch command "${command}". Valid commands: ${Object.keys(BATCH_COMMANDS).join(', ')}`, ); } diff --git a/src/cli.js b/src/cli.js index 72e9ced7..6318f0e4 100644 --- a/src/cli.js +++ b/src/cli.js @@ -1,8 +1,14 @@ #!/usr/bin/env node import { run } from './cli/index.js'; +import { CodegraphError } from './errors.js'; run().catch((err) => { - console.error(`codegraph: fatal error — ${err.message || err}`); + if (err instanceof CodegraphError) { + console.error(`codegraph [${err.code}]: ${err.message}`); + if (err.file) console.error(` file: ${err.file}`); + } else { + console.error(`codegraph: fatal error — ${err.message || err}`); + } process.exit(1); }); diff --git a/src/cli/commands/ast.js b/src/cli/commands/ast.js index 92140a33..cc9124b0 100644 --- a/src/cli/commands/ast.js +++ b/src/cli/commands/ast.js @@ -1,3 +1,5 @@ +import { ConfigError } from '../../errors.js'; + export const command = { name: 'ast [pattern]', description: 'Search stored AST nodes (calls, new, string, regex, throw, await) by pattern', @@ -9,8 +11,7 @@ export const command = { async execute([pattern], opts, ctx) { const { AST_NODE_KINDS, astQuery } = await import('../../ast.js'); if (opts.kind && !AST_NODE_KINDS.includes(opts.kind)) { - console.error(`Invalid AST kind "${opts.kind}". Valid: ${AST_NODE_KINDS.join(', ')}`); - process.exit(1); + throw new ConfigError(`Invalid AST kind "${opts.kind}". 
Valid: ${AST_NODE_KINDS.join(', ')}`); } astQuery(pattern, opts.db, { kind: opts.kind, diff --git a/src/cli/commands/batch.js b/src/cli/commands/batch.js index fe75c5c3..7637b5fb 100644 --- a/src/cli/commands/batch.js +++ b/src/cli/commands/batch.js @@ -1,6 +1,7 @@ import fs from 'node:fs'; import { BATCH_COMMANDS, multiBatchData, splitTargets } from '../../batch.js'; import { batch } from '../../commands/batch.js'; +import { ConfigError } from '../../errors.js'; import { EVERY_SYMBOL_KIND } from '../../queries.js'; export const command = { @@ -40,13 +41,13 @@ export const command = { targets = splitTargets(positionalTargets); } } catch (err) { - console.error(`Failed to parse targets: ${err.message}`); - process.exit(1); + throw new ConfigError(`Failed to parse targets: ${err.message}`, { cause: err }); } if (!targets || targets.length === 0) { - console.error('No targets provided. Pass targets as arguments, --from-file, or --stdin.'); - process.exit(1); + throw new ConfigError( + 'No targets provided. Pass targets as arguments, --from-file, or --stdin.', + ); } const batchOpts = { diff --git a/src/cli/commands/check.js b/src/cli/commands/check.js index 4c79fc11..78edb0b9 100644 --- a/src/cli/commands/check.js +++ b/src/cli/commands/check.js @@ -1,3 +1,4 @@ +import { ConfigError } from '../../errors.js'; import { EVERY_SYMBOL_KIND } from '../../queries.js'; export const command = { @@ -27,8 +28,9 @@ export const command = { if (!isDiffMode && !opts.rules) { if (opts.kind && !EVERY_SYMBOL_KIND.includes(opts.kind)) { - console.error(`Invalid kind "${opts.kind}". Valid: ${EVERY_SYMBOL_KIND.join(', ')}`); - process.exit(1); + throw new ConfigError( + `Invalid kind "${opts.kind}". 
Valid: ${EVERY_SYMBOL_KIND.join(', ')}`, + ); } const { manifesto } = await import('../../commands/manifesto.js'); manifesto(opts.db, { @@ -58,8 +60,9 @@ export const command = { if (opts.rules) { if (opts.kind && !EVERY_SYMBOL_KIND.includes(opts.kind)) { - console.error(`Invalid kind "${opts.kind}". Valid: ${EVERY_SYMBOL_KIND.join(', ')}`); - process.exit(1); + throw new ConfigError( + `Invalid kind "${opts.kind}". Valid: ${EVERY_SYMBOL_KIND.join(', ')}`, + ); } const { manifesto } = await import('../../commands/manifesto.js'); manifesto(opts.db, { diff --git a/src/cli/commands/co-change.js b/src/cli/commands/co-change.js index 9118ed0b..ef4885b5 100644 --- a/src/cli/commands/co-change.js +++ b/src/cli/commands/co-change.js @@ -1,3 +1,5 @@ +import { AnalysisError } from '../../errors.js'; + export const command = { name: 'co-change [file]', description: @@ -32,8 +34,7 @@ export const command = { if (opts.json) { console.log(JSON.stringify(result, null, 2)); } else if (result.error) { - console.error(result.error); - process.exit(1); + throw new AnalysisError(result.error); } else { console.log( `\nCo-change analysis complete: ${result.pairsFound} pairs from ${result.commitsScanned} commits (since: ${result.since})\n`, diff --git a/src/cli/commands/registry.js b/src/cli/commands/registry.js index c2181556..9e516d9a 100644 --- a/src/cli/commands/registry.js +++ b/src/cli/commands/registry.js @@ -1,5 +1,6 @@ import fs from 'node:fs'; import path from 'node:path'; +import { ConfigError } from '../../errors.js'; import { listRepos, pruneRegistry, @@ -54,8 +55,7 @@ export const command = { if (removed) { console.log(`Removed "${name}" from registry.`); } else { - console.error(`Repository "${name}" not found in registry.`); - process.exit(1); + throw new ConfigError(`Repository "${name}" not found in registry.`); } }, }, diff --git a/src/cli/commands/snapshot.js b/src/cli/commands/snapshot.js index b2dc3455..8dd0093e 100644 --- a/src/cli/commands/snapshot.js +++ 
b/src/cli/commands/snapshot.js @@ -12,13 +12,8 @@ export const command = { ['--force', 'Overwrite existing snapshot'], ], execute([name], opts, ctx) { - try { - const result = snapshotSave(name, { dbPath: opts.db, force: opts.force }); - console.log(`Snapshot saved: ${result.name} (${ctx.formatSize(result.size)})`); - } catch (err) { - console.error(err.message); - process.exit(1); - } + const result = snapshotSave(name, { dbPath: opts.db, force: opts.force }); + console.log(`Snapshot saved: ${result.name} (${ctx.formatSize(result.size)})`); }, }, { @@ -26,13 +21,8 @@ export const command = { description: 'Restore a snapshot over the current graph database', options: [['-d, --db ', 'Path to graph.db']], execute([name], opts) { - try { - snapshotRestore(name, { dbPath: opts.db }); - console.log(`Snapshot "${name}" restored.`); - } catch (err) { - console.error(err.message); - process.exit(1); - } + snapshotRestore(name, { dbPath: opts.db }); + console.log(`Snapshot "${name}" restored.`); }, }, { @@ -43,23 +33,18 @@ export const command = { ['-j, --json', 'Output as JSON'], ], execute(_args, opts, ctx) { - try { - const snapshots = snapshotList({ dbPath: opts.db }); - if (opts.json) { - console.log(JSON.stringify(snapshots, null, 2)); - } else if (snapshots.length === 0) { - console.log('No snapshots found.'); - } else { - console.log(`Snapshots (${snapshots.length}):\n`); - for (const s of snapshots) { - console.log( - ` ${s.name.padEnd(30)} ${ctx.formatSize(s.size).padStart(10)} ${s.createdAt.toISOString()}`, - ); - } + const snapshots = snapshotList({ dbPath: opts.db }); + if (opts.json) { + console.log(JSON.stringify(snapshots, null, 2)); + } else if (snapshots.length === 0) { + console.log('No snapshots found.'); + } else { + console.log(`Snapshots (${snapshots.length}):\n`); + for (const s of snapshots) { + console.log( + ` ${s.name.padEnd(30)} ${ctx.formatSize(s.size).padStart(10)} ${s.createdAt.toISOString()}`, + ); } - } catch (err) { - 
console.error(err.message); - process.exit(1); } }, }, @@ -68,13 +53,8 @@ export const command = { description: 'Delete a saved snapshot', options: [['-d, --db ', 'Path to graph.db']], execute([name], opts) { - try { - snapshotDelete(name, { dbPath: opts.db }); - console.log(`Snapshot "${name}" deleted.`); - } catch (err) { - console.error(err.message); - process.exit(1); - } + snapshotDelete(name, { dbPath: opts.db }); + console.log(`Snapshot "${name}" deleted.`); }, }, ], diff --git a/src/cli/commands/triage.js b/src/cli/commands/triage.js index eb8946d6..a334475f 100644 --- a/src/cli/commands/triage.js +++ b/src/cli/commands/triage.js @@ -1,3 +1,4 @@ +import { ConfigError } from '../../errors.js'; import { EVERY_SYMBOL_KIND, VALID_ROLES } from '../../queries.js'; export const command = { @@ -46,20 +47,17 @@ export const command = { } if (opts.kind && !EVERY_SYMBOL_KIND.includes(opts.kind)) { - console.error(`Invalid kind "${opts.kind}". Valid: ${EVERY_SYMBOL_KIND.join(', ')}`); - process.exit(1); + throw new ConfigError(`Invalid kind "${opts.kind}". Valid: ${EVERY_SYMBOL_KIND.join(', ')}`); } if (opts.role && !VALID_ROLES.includes(opts.role)) { - console.error(`Invalid role "${opts.role}". Valid: ${VALID_ROLES.join(', ')}`); - process.exit(1); + throw new ConfigError(`Invalid role "${opts.role}". 
Valid: ${VALID_ROLES.join(', ')}`); } let weights; if (opts.weights) { try { weights = JSON.parse(opts.weights); - } catch { - console.error('Invalid --weights JSON'); - process.exit(1); + } catch (err) { + throw new ConfigError('Invalid --weights JSON', { cause: err }); } } const { triage } = await import('../../commands/triage.js'); diff --git a/src/cli/index.js b/src/cli/index.js index 83fecb98..52936b40 100644 --- a/src/cli/index.js +++ b/src/cli/index.js @@ -2,6 +2,7 @@ import fs from 'node:fs'; import path from 'node:path'; import { pathToFileURL } from 'node:url'; import { Command } from 'commander'; +import { ConfigError } from '../errors.js'; import { setVerbose } from '../logger.js'; import { checkForUpdates, printUpdateNotification } from '../update-check.js'; import { applyQueryOpts, config, formatSize, resolveNoTests } from './shared/options.js'; @@ -68,8 +69,7 @@ function registerCommand(parent, def) { if (def.validate) { const err = def.validate(args, opts, ctx); if (err) { - console.error(err); - process.exit(1); + throw new ConfigError(err); } } @@ -112,7 +112,7 @@ async function discoverCommands() { export async function run() { await discoverCommands(); - program.parse(); + await program.parseAsync(); } export { program, registerCommand, ctx }; diff --git a/src/commands/check.js b/src/commands/check.js index b3ae6d1b..114b09f6 100644 --- a/src/commands/check.js +++ b/src/commands/check.js @@ -1,19 +1,19 @@ import { checkData } from '../check.js'; +import { AnalysisError } from '../errors.js'; import { outputResult } from '../infrastructure/result-formatter.js'; /** - * CLI formatter — prints check results and exits with code 1 on failure. + * CLI formatter — prints check results and sets exitCode 1 on failure. 
*/ export function check(customDbPath, opts = {}) { const data = checkData(customDbPath, opts); if (data.error) { - console.error(data.error); - process.exit(1); + throw new AnalysisError(data.error); } if (outputResult(data, null, opts)) { - if (!data.passed) process.exit(1); + if (!data.passed) process.exitCode = 1; return; } @@ -77,6 +77,6 @@ export function check(customDbPath, opts = {}) { console.log(`\n ${s.total} predicates | ${s.passed} passed | ${s.failed} failed\n`); if (!data.passed) { - process.exit(1); + process.exitCode = 1; } } diff --git a/src/commands/manifesto.js b/src/commands/manifesto.js index 8044f61c..0ccf1d1e 100644 --- a/src/commands/manifesto.js +++ b/src/commands/manifesto.js @@ -2,13 +2,13 @@ import { outputResult } from '../infrastructure/result-formatter.js'; import { manifestoData } from '../manifesto.js'; /** - * CLI formatter — prints manifesto results and exits with code 1 on failure. + * CLI formatter — prints manifesto results and sets exitCode 1 on failure. 
*/ export function manifesto(customDbPath, opts = {}) { const data = manifestoData(customDbPath, opts); if (outputResult(data, 'violations', opts)) { - if (!data.passed) process.exit(1); + if (!data.passed) process.exitCode = 1; return; } @@ -72,6 +72,6 @@ export function manifesto(customDbPath, opts = {}) { console.log(); if (!data.passed) { - process.exit(1); + process.exitCode = 1; } } diff --git a/src/db/connection.js b/src/db/connection.js index beffdc41..d8b34c21 100644 --- a/src/db/connection.js +++ b/src/db/connection.js @@ -1,6 +1,7 @@ import fs from 'node:fs'; import path from 'node:path'; import Database from 'better-sqlite3'; +import { DbError } from '../errors.js'; import { warn } from '../logger.js'; function isProcessAlive(pid) { @@ -78,11 +79,10 @@ export function findDbPath(customPath) { export function openReadonlyOrFail(customPath) { const dbPath = findDbPath(customPath); if (!fs.existsSync(dbPath)) { - console.error( - `No codegraph database found at ${dbPath}.\n` + - `Run "codegraph build" first to analyze your codebase.`, + throw new DbError( + `No codegraph database found at ${dbPath}.\nRun "codegraph build" first to analyze your codebase.`, + { file: dbPath }, ); - process.exit(1); } return new Database(dbPath, { readonly: true }); } diff --git a/src/db/query-builder.js b/src/db/query-builder.js index 29b87686..2f43e754 100644 --- a/src/db/query-builder.js +++ b/src/db/query-builder.js @@ -1,3 +1,4 @@ +import { DbError } from '../errors.js'; import { EVERY_EDGE_KIND } from '../kinds.js'; // ─── Validation Helpers ───────────────────────────────────────────── @@ -12,13 +13,13 @@ const SAFE_SELECT_TOKEN_RE = function validateAlias(alias) { if (!SAFE_ALIAS_RE.test(alias)) { - throw new Error(`Invalid SQL alias: ${alias}`); + throw new DbError(`Invalid SQL alias: ${alias}`); } } function validateColumn(column) { if (!SAFE_COLUMN_RE.test(column)) { - throw new Error(`Invalid SQL column: ${column}`); + throw new DbError(`Invalid SQL column: 
${column}`); } } @@ -26,7 +27,7 @@ function validateOrderBy(clause) { const terms = clause.split(',').map((t) => t.trim()); for (const term of terms) { if (!SAFE_ORDER_TERM_RE.test(term)) { - throw new Error(`Invalid ORDER BY term: ${term}`); + throw new DbError(`Invalid ORDER BY term: ${term}`); } } } @@ -51,14 +52,14 @@ function validateSelectCols(cols) { const tokens = splitTopLevelCommas(cols); for (const token of tokens) { if (!SAFE_SELECT_TOKEN_RE.test(token)) { - throw new Error(`Invalid SELECT expression: ${token}`); + throw new DbError(`Invalid SELECT expression: ${token}`); } } } function validateEdgeKind(edgeKind) { if (!EVERY_EDGE_KIND.includes(edgeKind)) { - throw new Error( + throw new DbError( `Invalid edge kind: ${edgeKind} (expected one of ${EVERY_EDGE_KIND.join(', ')})`, ); } diff --git a/src/db/repository/nodes.js b/src/db/repository/nodes.js index 7fa3d035..af4a3475 100644 --- a/src/db/repository/nodes.js +++ b/src/db/repository/nodes.js @@ -1,3 +1,4 @@ +import { ConfigError } from '../../errors.js'; import { EVERY_SYMBOL_KIND, VALID_ROLES } from '../../kinds.js'; import { NodeQuery } from '../query-builder.js'; import { cachedStmt } from './cached-stmt.js'; @@ -37,10 +38,12 @@ export function findNodesWithFanIn(db, namePattern, opts = {}) { */ export function findNodesForTriage(db, opts = {}) { if (opts.kind && !EVERY_SYMBOL_KIND.includes(opts.kind)) { - throw new Error(`Invalid kind: ${opts.kind} (expected one of ${EVERY_SYMBOL_KIND.join(', ')})`); + throw new ConfigError( + `Invalid kind: ${opts.kind} (expected one of ${EVERY_SYMBOL_KIND.join(', ')})`, + ); } if (opts.role && !VALID_ROLES.includes(opts.role)) { - throw new Error(`Invalid role: ${opts.role} (expected one of ${VALID_ROLES.join(', ')})`); + throw new ConfigError(`Invalid role: ${opts.role} (expected one of ${VALID_ROLES.join(', ')})`); } const kindsToUse = opts.kind ? 
[opts.kind] : ['function', 'method', 'class']; diff --git a/src/embedder.js b/src/embedder.js index 3e03ee1b..f8fbc527 100644 --- a/src/embedder.js +++ b/src/embedder.js @@ -10,6 +10,7 @@ import { openDb, openReadonlyOrFail, } from './db.js'; +import { ConfigError, DbError, EngineError } from './errors.js'; import { info, warn } from './logger.js'; import { normalizeSymbol } from './queries.js'; @@ -123,8 +124,7 @@ function getModelConfig(modelKey) { const key = modelKey || DEFAULT_MODEL; const config = MODELS[key]; if (!config) { - console.error(`Unknown model: ${key}. Available: ${Object.keys(MODELS).join(', ')}`); - process.exit(1); + throw new ConfigError(`Unknown model: ${key}. Available: ${Object.keys(MODELS).join(', ')}`); } return config; } @@ -263,13 +263,14 @@ async function loadTransformers() { if (installed) { try { return await import(pkg); - } catch { - console.error(`\n${pkg} was installed but failed to load. Please check your environment.`); - process.exit(1); + } catch (loadErr) { + throw new EngineError( + `${pkg} was installed but failed to load. Please check your environment.`, + { cause: loadErr }, + ); } } - console.error(`Semantic search requires ${pkg}.\n` + `Install it with: npm install ${pkg}`); - process.exit(1); + throw new EngineError(`Semantic search requires ${pkg}.\nInstall it with: npm install ${pkg}`); } } @@ -304,20 +305,20 @@ async function loadModel(modelKey) { } catch (err) { const msg = err.message || String(err); if (msg.includes('Unauthorized') || msg.includes('401') || msg.includes('gated')) { - console.error( - `\nModel "${config.name}" requires authentication.\n` + + throw new EngineError( + `Model "${config.name}" requires authentication.\n` + `This model is gated on HuggingFace and needs an access token.\n\n` + `Options:\n` + ` 1. Set HF_TOKEN env var: export HF_TOKEN=hf_...\n` + - ` 2. 
Use a public model instead: codegraph embed --model minilm\n`, - ); - } else { - console.error( - `\nFailed to load model "${config.name}": ${msg}\n` + - `Try a different model: codegraph embed --model minilm\n`, + ` 2. Use a public model instead: codegraph embed --model minilm`, + { cause: err }, ); } - process.exit(1); + throw new EngineError( + `Failed to load model "${config.name}": ${msg}\n` + + `Try a different model: codegraph embed --model minilm`, + { cause: err }, + ); } activeModel = config.name; info('Model loaded.'); @@ -413,11 +414,10 @@ export async function buildEmbeddings(rootDir, modelKey, customDbPath, options = const dbPath = customDbPath || findDbPath(null); if (!fs.existsSync(dbPath)) { - console.error( - `No codegraph database found at ${dbPath}.\n` + - `Run "codegraph build" first to analyze your codebase.`, + throw new DbError( + `No codegraph database found at ${dbPath}.\nRun "codegraph build" first to analyze your codebase.`, + { file: dbPath }, ); - process.exit(1); } const db = openDb(dbPath); diff --git a/src/errors.js b/src/errors.js new file mode 100644 index 00000000..0a398446 --- /dev/null +++ b/src/errors.js @@ -0,0 +1,78 @@ +/** + * Domain error hierarchy for codegraph. + * + * Library code throws these instead of calling process.exit() or throwing + * bare Error instances. The CLI top-level catch formats them for humans; + * MCP returns structured { isError, code } responses. 
+ */ + +export class CodegraphError extends Error { + /** @type {string} */ + code; + + /** @type {string|undefined} */ + file; + + /** + * @param {string} message + * @param {object} [opts] + * @param {string} [opts.code] + * @param {string} [opts.file] - Related file path, if applicable + * @param {Error} [opts.cause] - Original error that triggered this one + */ + constructor(message, { code = 'CODEGRAPH_ERROR', file, cause } = {}) { + super(message, { cause }); + this.name = 'CodegraphError'; + this.code = code; + this.file = file; + } +} + +export class ParseError extends CodegraphError { + constructor(message, opts = {}) { + super(message, { code: 'PARSE_FAILED', ...opts }); + this.name = 'ParseError'; + } +} + +export class DbError extends CodegraphError { + constructor(message, opts = {}) { + super(message, { code: 'DB_ERROR', ...opts }); + this.name = 'DbError'; + } +} + +export class ConfigError extends CodegraphError { + constructor(message, opts = {}) { + super(message, { code: 'CONFIG_INVALID', ...opts }); + this.name = 'ConfigError'; + } +} + +export class ResolutionError extends CodegraphError { + constructor(message, opts = {}) { + super(message, { code: 'RESOLUTION_FAILED', ...opts }); + this.name = 'ResolutionError'; + } +} + +export class EngineError extends CodegraphError { + constructor(message, opts = {}) { + super(message, { code: 'ENGINE_UNAVAILABLE', ...opts }); + this.name = 'EngineError'; + } +} + +export class AnalysisError extends CodegraphError { + constructor(message, opts = {}) { + super(message, { code: 'ANALYSIS_FAILED', ...opts }); + this.name = 'AnalysisError'; + } +} + +export class BoundaryError extends CodegraphError { + constructor(message, opts = {}) { + super(message, { code: 'BOUNDARY_VIOLATION', ...opts }); + this.name = 'BoundaryError'; + } +} diff --git a/src/index.js b/src/index.js index b798d38a..ae0732ac 100644 --- a/src/index.js +++ b/src/index.js @@ -119,6 +119,17 @@ export { search, searchData, } from 
'./embedder.js'; +// Domain errors +export { + AnalysisError, + BoundaryError, + CodegraphError, + ConfigError, + DbError, + EngineError, + ParseError, + ResolutionError, +} from './errors.js'; // Export (DOT/Mermaid/JSON/GraphML/GraphSON/Neo4j CSV) export { exportDOT, diff --git a/src/mcp/server.js b/src/mcp/server.js index 135c08a2..3a39aed8 100644 --- a/src/mcp/server.js +++ b/src/mcp/server.js @@ -7,6 +7,7 @@ import { createRequire } from 'node:module'; import { findDbPath } from '../db.js'; +import { CodegraphError, ConfigError } from '../errors.js'; import { MCP_MAX_LIMIT } from '../paginate.js'; import { buildToolList } from './tool-registry.js'; import { TOOL_HANDLERS } from './tools/index.js'; @@ -33,11 +34,9 @@ export async function startMCPServer(customDbPath, options = {}) { ListToolsRequestSchema = types.ListToolsRequestSchema; CallToolRequestSchema = types.CallToolRequestSchema; } catch { - console.error( - 'MCP server requires @modelcontextprotocol/sdk.\n' + - 'Install it with: npm install @modelcontextprotocol/sdk', + throw new ConfigError( + 'MCP server requires @modelcontextprotocol/sdk.\nInstall it with: npm install @modelcontextprotocol/sdk', ); - process.exit(1); } // Connect transport FIRST so the server can receive the client's @@ -75,12 +74,12 @@ export async function startMCPServer(customDbPath, options = {}) { const { name, arguments: args } = request.params; try { if (!multiRepo && args.repo) { - throw new Error( + throw new ConfigError( 'Multi-repo access is disabled. Restart with `codegraph mcp --multi-repo` to access other repositories.', ); } if (!multiRepo && name === 'list_repos') { - throw new Error( + throw new ConfigError( 'Multi-repo access is disabled. 
Restart with `codegraph mcp --multi-repo` to list repositories.', ); } @@ -88,12 +87,12 @@ export async function startMCPServer(customDbPath, options = {}) { let dbPath = customDbPath || undefined; if (args.repo) { if (allowedRepos && !allowedRepos.includes(args.repo)) { - throw new Error(`Repository "${args.repo}" is not in the allowed repos list.`); + throw new ConfigError(`Repository "${args.repo}" is not in the allowed repos list.`); } const { resolveRepoDbPath } = await import('../registry.js'); const resolved = resolveRepoDbPath(args.repo); if (!resolved) - throw new Error( + throw new ConfigError( `Repository "${args.repo}" not found in registry or its database is missing.`, ); dbPath = resolved; @@ -117,7 +116,10 @@ export async function startMCPServer(customDbPath, options = {}) { if (result?.content) return result; // pass-through MCP responses return { content: [{ type: 'text', text: JSON.stringify(result, null, 2) }] }; } catch (err) { - return { content: [{ type: 'text', text: `Error: ${err.message}` }], isError: true }; + const code = err instanceof CodegraphError ? err.code : 'UNKNOWN_ERROR'; + const text = + err instanceof CodegraphError ? `[${code}] ${err.message}` : `Error: ${err.message}`; + return { content: [{ type: 'text', text }], isError: true }; } }); diff --git a/src/native.js b/src/native.js index ce435f92..7de86d9a 100644 --- a/src/native.js +++ b/src/native.js @@ -8,6 +8,7 @@ import { createRequire } from 'node:module'; import os from 'node:os'; +import { EngineError } from './errors.js'; let _cached; // undefined = not yet tried, null = failed, object = module let _loadError = null; @@ -101,9 +102,10 @@ export function getNativePackageVersion() { export function getNative() { const mod = loadNative(); if (!mod) { - throw new Error( + throw new EngineError( `Native codegraph-core not available: ${_loadError?.message || 'unknown error'}. 
` + 'Install the platform package or use --engine wasm.', + { cause: _loadError }, ); } return mod; diff --git a/src/snapshot.js b/src/snapshot.js index 43d46b09..a9f80ce9 100644 --- a/src/snapshot.js +++ b/src/snapshot.js @@ -2,6 +2,7 @@ import fs from 'node:fs'; import path from 'node:path'; import Database from 'better-sqlite3'; import { findDbPath } from './db.js'; +import { ConfigError, DbError } from './errors.js'; import { debug } from './logger.js'; const NAME_RE = /^[a-zA-Z0-9_-]+$/; @@ -12,7 +13,7 @@ const NAME_RE = /^[a-zA-Z0-9_-]+$/; */ export function validateSnapshotName(name) { if (!name || !NAME_RE.test(name)) { - throw new Error( + throw new ConfigError( `Invalid snapshot name "${name}". Use only letters, digits, hyphens, and underscores.`, ); } @@ -39,7 +40,7 @@ export function snapshotSave(name, options = {}) { validateSnapshotName(name); const dbPath = options.dbPath || findDbPath(); if (!fs.existsSync(dbPath)) { - throw new Error(`Database not found: ${dbPath}`); + throw new DbError(`Database not found: ${dbPath}`, { file: dbPath }); } const dir = snapshotsDir(dbPath); @@ -47,7 +48,7 @@ export function snapshotSave(name, options = {}) { if (fs.existsSync(dest)) { if (!options.force) { - throw new Error(`Snapshot "${name}" already exists. Use --force to overwrite.`); + throw new DbError(`Snapshot "${name}" already exists. 
Use --force to overwrite.`); } fs.unlinkSync(dest); debug(`Deleted existing snapshot: ${dest}`); @@ -82,7 +83,7 @@ export function snapshotRestore(name, options = {}) { const src = path.join(dir, `${name}.db`); if (!fs.existsSync(src)) { - throw new Error(`Snapshot "${name}" not found at ${src}`); + throw new DbError(`Snapshot "${name}" not found at ${src}`, { file: src }); } // Remove WAL/SHM sidecar files for a clean restore @@ -141,7 +142,7 @@ export function snapshotDelete(name, options = {}) { const target = path.join(dir, `${name}.db`); if (!fs.existsSync(target)) { - throw new Error(`Snapshot "${name}" not found at ${target}`); + throw new DbError(`Snapshot "${name}" not found at ${target}`, { file: target }); } fs.unlinkSync(target); diff --git a/src/watcher.js b/src/watcher.js index 32c80e53..aad62fe0 100644 --- a/src/watcher.js +++ b/src/watcher.js @@ -4,6 +4,7 @@ import { readFileSafe } from './builder.js'; import { appendChangeEvents, buildChangeEvent, diffSymbols } from './change-journal.js'; import { EXTENSIONS, IGNORE_DIRS, normalizePath } from './constants.js'; import { closeDb, getNodeId as getNodeIdQuery, initSchema, openDb } from './db.js'; +import { DbError } from './errors.js'; import { appendJournalEntries } from './journal.js'; import { info, warn } from './logger.js'; import { createParseTreeCache, getActiveEngine, parseFileIncremental } from './parser.js'; @@ -162,8 +163,7 @@ async function updateFile(_db, rootDir, filePath, stmts, engineOpts, cache) { export async function watchProject(rootDir, opts = {}) { const dbPath = path.join(rootDir, '.codegraph', 'graph.db'); if (!fs.existsSync(dbPath)) { - console.error('No graph.db found. Run `codegraph build` first.'); - process.exit(1); + throw new DbError('No graph.db found. 
Run `codegraph build` first.', { file: dbPath }); } const db = openDb(dbPath); diff --git a/tests/unit/db.test.js b/tests/unit/db.test.js index 10fcbcde..63fb5807 100644 --- a/tests/unit/db.test.js +++ b/tests/unit/db.test.js @@ -6,7 +6,7 @@ import fs from 'node:fs'; import os from 'node:os'; import path from 'node:path'; import Database from 'better-sqlite3'; -import { afterAll, beforeAll, describe, expect, it, vi } from 'vitest'; +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; import { closeDb, findDbPath, @@ -195,20 +195,17 @@ describe('build_meta', () => { }); describe('openReadonlyOrFail', () => { - it('exits with error when DB does not exist', () => { - const exitSpy = vi.spyOn(process, 'exit').mockImplementation(() => { - throw new Error('process.exit'); - }); - const stderrSpy = vi.spyOn(console, 'error').mockImplementation(() => {}); - - expect(() => openReadonlyOrFail(path.join(tmpDir, 'nonexistent.db'))).toThrow('process.exit'); - expect(exitSpy).toHaveBeenCalledWith(1); - expect(stderrSpy).toHaveBeenCalled(); - const errorMsg = stderrSpy.mock.calls[0][0]; - expect(errorMsg).toContain('No codegraph database found'); - - exitSpy.mockRestore(); - stderrSpy.mockRestore(); + it('throws DbError when DB does not exist', () => { + expect(() => openReadonlyOrFail(path.join(tmpDir, 'nonexistent.db'))).toThrow( + 'No codegraph database found', + ); + try { + openReadonlyOrFail(path.join(tmpDir, 'nonexistent.db')); + } catch (err) { + expect(err.name).toBe('DbError'); + expect(err.code).toBe('DB_ERROR'); + expect(err.file).toBeDefined(); + } }); it('returns a readonly database when DB exists', () => { diff --git a/tests/unit/errors.test.js b/tests/unit/errors.test.js new file mode 100644 index 00000000..3714df5b --- /dev/null +++ b/tests/unit/errors.test.js @@ -0,0 +1,69 @@ +/** + * Unit tests for the domain error hierarchy (src/errors.js). 
+ */ + +import { describe, expect, it } from 'vitest'; +import { + AnalysisError, + BoundaryError, + CodegraphError, + ConfigError, + DbError, + EngineError, + ParseError, + ResolutionError, +} from '../../src/errors.js'; + +describe('CodegraphError', () => { + it('sets defaults', () => { + const err = new CodegraphError('boom'); + expect(err).toBeInstanceOf(Error); + expect(err).toBeInstanceOf(CodegraphError); + expect(err.name).toBe('CodegraphError'); + expect(err.code).toBe('CODEGRAPH_ERROR'); + expect(err.message).toBe('boom'); + expect(err.file).toBeUndefined(); + expect(err.cause).toBeUndefined(); + }); + + it('accepts opts', () => { + const cause = new Error('root'); + const err = new CodegraphError('msg', { code: 'CUSTOM', file: 'foo.js', cause }); + expect(err.code).toBe('CUSTOM'); + expect(err.file).toBe('foo.js'); + expect(err.cause).toBe(cause); + }); +}); + +describe('subclasses', () => { + const cases = [ + { Class: ParseError, name: 'ParseError', code: 'PARSE_FAILED' }, + { Class: DbError, name: 'DbError', code: 'DB_ERROR' }, + { Class: ConfigError, name: 'ConfigError', code: 'CONFIG_INVALID' }, + { Class: ResolutionError, name: 'ResolutionError', code: 'RESOLUTION_FAILED' }, + { Class: EngineError, name: 'EngineError', code: 'ENGINE_UNAVAILABLE' }, + { Class: AnalysisError, name: 'AnalysisError', code: 'ANALYSIS_FAILED' }, + { Class: BoundaryError, name: 'BoundaryError', code: 'BOUNDARY_VIOLATION' }, + ]; + + for (const { Class, name, code } of cases) { + it(`${name} has correct defaults and instanceof chain`, () => { + const err = new Class('test'); + expect(err).toBeInstanceOf(Error); + expect(err).toBeInstanceOf(CodegraphError); + expect(err).toBeInstanceOf(Class); + expect(err.name).toBe(name); + expect(err.code).toBe(code); + expect(err.message).toBe('test'); + }); + + it(`${name} forwards file and cause`, () => { + const cause = new Error('root'); + const err = new Class('msg', { file: 'bar.js', cause }); + expect(err.file).toBe('bar.js'); + 
expect(err.cause).toBe(cause); + // code should stay as the subclass default + expect(err.code).toBe(code); + }); + } +}); diff --git a/tests/unit/prompt-install.test.js b/tests/unit/prompt-install.test.js index d7583508..6a36c2de 100644 --- a/tests/unit/prompt-install.test.js +++ b/tests/unit/prompt-install.test.js @@ -34,7 +34,7 @@ describe('loadTransformers install prompt', () => { vi.restoreAllMocks(); }); - test('non-TTY: prints error and exits without prompting', async () => { + test('non-TTY: throws EngineError without prompting', async () => { process.stdin.isTTY = undefined; const rlFactory = vi.fn(); @@ -46,15 +46,18 @@ describe('loadTransformers install prompt', () => { const { embed } = await import('../../src/embedder.js'); - await expect(embed(['test'], 'minilm')).rejects.toThrow('process.exit(1)'); - expect(errorSpy).toHaveBeenCalledWith( - expect.stringContaining('Semantic search requires @huggingface/transformers'), + await expect(embed(['test'], 'minilm')).rejects.toThrow( + 'Semantic search requires @huggingface/transformers', ); + await expect(embed(['test'], 'minilm')).rejects.toMatchObject({ + name: 'EngineError', + code: 'ENGINE_UNAVAILABLE', + }); // readline should NOT have been called — no prompt in non-TTY expect(rlFactory).not.toHaveBeenCalled(); }); - test('TTY + user declines: prints error and exits', async () => { + test('TTY + user declines: throws EngineError', async () => { process.stdin.isTTY = true; vi.doMock('node:readline', () => ({ @@ -70,13 +73,16 @@ describe('loadTransformers install prompt', () => { const { embed } = await import('../../src/embedder.js'); - await expect(embed(['test'], 'minilm')).rejects.toThrow('process.exit(1)'); - expect(errorSpy).toHaveBeenCalledWith( - expect.stringContaining('Semantic search requires @huggingface/transformers'), + await expect(embed(['test'], 'minilm')).rejects.toThrow( + 'Semantic search requires @huggingface/transformers', ); + await expect(embed(['test'], 
'minilm')).rejects.toMatchObject({ + name: 'EngineError', + code: 'ENGINE_UNAVAILABLE', + }); }); - test('TTY + user accepts but npm install fails: prints error and exits', async () => { + test('TTY + user accepts but npm install fails: throws EngineError', async () => { process.stdin.isTTY = true; const execMock = vi.fn(() => { @@ -95,15 +101,18 @@ describe('loadTransformers install prompt', () => { const { embed } = await import('../../src/embedder.js'); - await expect(embed(['test'], 'minilm')).rejects.toThrow('process.exit(1)'); + await expect(embed(['test'], 'minilm')).rejects.toThrow( + 'Semantic search requires @huggingface/transformers', + ); + await expect(embed(['test'], 'minilm')).rejects.toMatchObject({ + name: 'EngineError', + code: 'ENGINE_UNAVAILABLE', + }); expect(execMock).toHaveBeenCalledWith( 'npm', ['install', '@huggingface/transformers'], expect.objectContaining({ stdio: 'inherit', timeout: 300_000 }), ); - expect(errorSpy).toHaveBeenCalledWith( - expect.stringContaining('Semantic search requires @huggingface/transformers'), - ); }); test('TTY + install succeeds: retries import and loads module', async () => { From d2794cb98da6352473d4dc2574cbbf1a953d3df8 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Fri, 13 Mar 2026 00:16:48 -0600 Subject: [PATCH 2/8] fix: address Greptile review feedback on PR #431 - Use expect.assertions(4) in db.test.js to prevent silent assertion skips - Change snapshot "already exists" error from DbError to ConfigError (it's a missing --force flag, not a database failure) Impact: 1 functions changed, 0 affected --- src/snapshot.js | 2 +- tests/unit/db.test.js | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/snapshot.js b/src/snapshot.js index a9f80ce9..0ce12bf5 100644 --- a/src/snapshot.js +++ b/src/snapshot.js @@ -48,7 +48,7 @@ export function snapshotSave(name, options = {}) { if (fs.existsSync(dest)) { if (!options.force) { - throw new 
DbError(`Snapshot "${name}" already exists. Use --force to overwrite.`); + throw new ConfigError(`Snapshot "${name}" already exists. Use --force to overwrite.`); } fs.unlinkSync(dest); debug(`Deleted existing snapshot: ${dest}`); diff --git a/tests/unit/db.test.js b/tests/unit/db.test.js index 63fb5807..47dc393d 100644 --- a/tests/unit/db.test.js +++ b/tests/unit/db.test.js @@ -196,12 +196,11 @@ describe('build_meta', () => { describe('openReadonlyOrFail', () => { it('throws DbError when DB does not exist', () => { - expect(() => openReadonlyOrFail(path.join(tmpDir, 'nonexistent.db'))).toThrow( - 'No codegraph database found', - ); + expect.assertions(4); try { openReadonlyOrFail(path.join(tmpDir, 'nonexistent.db')); } catch (err) { + expect(err.message).toContain('No codegraph database found'); expect(err.name).toBe('DbError'); expect(err.code).toBe('DB_ERROR'); expect(err.file).toBeDefined(); From d625c212f2f5eccbe780a6859ed98b2e184d31d2 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Fri, 13 Mar 2026 02:15:12 -0600 Subject: [PATCH 3/8] refactor: extract embedder.js into src/embeddings/ subsystem (ROADMAP 3.10) Split the monolithic 1,100-line embedder.js into a modular subsystem with clear separation of concerns: models, generator, strategies, stores, and search modules. Uses a pluggable VectorStore JSDoc contract for future ANN backends. Reuses existing db/repository/embeddings.js for search preparation. All 9 consumer import paths updated, old file deleted. 
Impact: 26 functions changed, 16 affected --- src/cli/commands/embed.js | 2 +- src/cli/commands/models.js | 2 +- src/cli/commands/search.js | 2 +- src/embedder.js | 1097 --------------------- src/embeddings/generator.js | 163 +++ src/embeddings/index.js | 13 + src/embeddings/models.js | 217 ++++ src/embeddings/search/cli-formatter.js | 151 +++ src/embeddings/search/filters.js | 46 + src/embeddings/search/hybrid.js | 121 +++ src/embeddings/search/keyword.js | 68 ++ src/embeddings/search/prepare.js | 58 ++ src/embeddings/search/semantic.js | 145 +++ src/embeddings/stores/fts5.js | 27 + src/embeddings/stores/sqlite-blob.js | 23 + src/embeddings/strategies/source.js | 14 + src/embeddings/strategies/structured.js | 43 + src/embeddings/strategies/text-utils.js | 43 + src/index.js | 7 +- src/mcp/tools/semantic-search.js | 6 +- tests/search/embedder-search.test.js | 2 +- tests/search/embedding-regression.test.js | 2 +- tests/search/embedding-strategy.test.js | 2 +- tests/unit/prompt-install.test.js | 8 +- 24 files changed, 1151 insertions(+), 1111 deletions(-) delete mode 100644 src/embedder.js create mode 100644 src/embeddings/generator.js create mode 100644 src/embeddings/index.js create mode 100644 src/embeddings/models.js create mode 100644 src/embeddings/search/cli-formatter.js create mode 100644 src/embeddings/search/filters.js create mode 100644 src/embeddings/search/hybrid.js create mode 100644 src/embeddings/search/keyword.js create mode 100644 src/embeddings/search/prepare.js create mode 100644 src/embeddings/search/semantic.js create mode 100644 src/embeddings/stores/fts5.js create mode 100644 src/embeddings/stores/sqlite-blob.js create mode 100644 src/embeddings/strategies/source.js create mode 100644 src/embeddings/strategies/structured.js create mode 100644 src/embeddings/strategies/text-utils.js diff --git a/src/cli/commands/embed.js b/src/cli/commands/embed.js index fcd908e9..075520cd 100644 --- a/src/cli/commands/embed.js +++ b/src/cli/commands/embed.js @@ 
-1,5 +1,5 @@ import path from 'node:path'; -import { buildEmbeddings, DEFAULT_MODEL, EMBEDDING_STRATEGIES } from '../../embedder.js'; +import { buildEmbeddings, DEFAULT_MODEL, EMBEDDING_STRATEGIES } from '../../embeddings/index.js'; export const command = { name: 'embed [dir]', diff --git a/src/cli/commands/models.js b/src/cli/commands/models.js index 6773f2c2..0763650a 100644 --- a/src/cli/commands/models.js +++ b/src/cli/commands/models.js @@ -1,4 +1,4 @@ -import { DEFAULT_MODEL, MODELS } from '../../embedder.js'; +import { DEFAULT_MODEL, MODELS } from '../../embeddings/index.js'; export const command = { name: 'models', diff --git a/src/cli/commands/search.js b/src/cli/commands/search.js index 312f734d..238b59a0 100644 --- a/src/cli/commands/search.js +++ b/src/cli/commands/search.js @@ -1,4 +1,4 @@ -import { search } from '../../embedder.js'; +import { search } from '../../embeddings/index.js'; export const command = { name: 'search ', diff --git a/src/embedder.js b/src/embedder.js deleted file mode 100644 index f8fbc527..00000000 --- a/src/embedder.js +++ /dev/null @@ -1,1097 +0,0 @@ -import { execFileSync } from 'node:child_process'; -import fs from 'node:fs'; -import path from 'node:path'; -import { createInterface } from 'node:readline'; -import { - closeDb, - findCalleeNames, - findCallerNames, - findDbPath, - openDb, - openReadonlyOrFail, -} from './db.js'; -import { ConfigError, DbError, EngineError } from './errors.js'; -import { info, warn } from './logger.js'; -import { normalizeSymbol } from './queries.js'; - -/** - * Split an identifier into readable words. - * camelCase/PascalCase → "camel Case", snake_case → "snake case", kebab-case → "kebab case" - */ -function splitIdentifier(name) { - return name - .replace(/([a-z])([A-Z])/g, '$1 $2') - .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2') - .replace(/[_-]+/g, ' ') - .trim(); -} - -/** - * Match a file path against a glob pattern. - * Supports *, **, and ? wildcards. Zero dependencies. 
- */ -function globMatch(filePath, pattern) { - // Normalize separators to forward slashes - const normalized = filePath.replace(/\\/g, '/'); - // Escape regex specials except glob chars - let regex = pattern.replace(/\\/g, '/').replace(/[.+^${}()|[\]\\]/g, '\\$&'); - // Replace ** first (matches any path segment), then * and ? - regex = regex.replace(/\*\*/g, '\0'); - regex = regex.replace(/\*/g, '[^/]*'); - regex = regex.replace(/\0/g, '.*'); - regex = regex.replace(/\?/g, '[^/]'); - try { - return new RegExp(`^${regex}$`).test(normalized); - } catch { - // Malformed pattern — fall back to substring match - return normalized.includes(pattern); - } -} - -// Lazy-load transformers (heavy, optional module) -let pipeline = null; -let _cos_sim = null; -let extractor = null; -let activeModel = null; - -export const MODELS = { - minilm: { - name: 'Xenova/all-MiniLM-L6-v2', - dim: 384, - contextWindow: 256, - desc: 'Smallest, fastest (~23MB). General text.', - quantized: true, - }, - 'jina-small': { - name: 'Xenova/jina-embeddings-v2-small-en', - dim: 512, - contextWindow: 8192, - desc: 'Small, good quality (~33MB). General text.', - quantized: false, - }, - 'jina-base': { - name: 'Xenova/jina-embeddings-v2-base-en', - dim: 768, - contextWindow: 8192, - desc: 'Good quality (~137MB). General text, 8192 token context.', - quantized: false, - }, - 'jina-code': { - name: 'Xenova/jina-embeddings-v2-base-code', - dim: 768, - contextWindow: 8192, - desc: 'Code-aware (~137MB). Trained on code+text, best for code search.', - quantized: false, - }, - nomic: { - name: 'Xenova/nomic-embed-text-v1', - dim: 768, - contextWindow: 8192, - desc: 'Good local quality (~137MB). 8192 context.', - quantized: false, - }, - 'nomic-v1.5': { - name: 'nomic-ai/nomic-embed-text-v1.5', - dim: 768, - contextWindow: 8192, - desc: 'Improved nomic (~137MB). 
Matryoshka dimensions, 8192 context.', - quantized: false, - }, - 'bge-large': { - name: 'Xenova/bge-large-en-v1.5', - dim: 1024, - contextWindow: 512, - desc: 'Best general retrieval (~335MB). Top MTEB scores.', - quantized: false, - }, -}; - -export const EMBEDDING_STRATEGIES = ['structured', 'source']; - -export const DEFAULT_MODEL = 'nomic-v1.5'; -const BATCH_SIZE_MAP = { - minilm: 32, - 'jina-small': 16, - 'jina-base': 8, - 'jina-code': 8, - nomic: 8, - 'nomic-v1.5': 8, - 'bge-large': 4, -}; -const DEFAULT_BATCH_SIZE = 32; - -function getModelConfig(modelKey) { - const key = modelKey || DEFAULT_MODEL; - const config = MODELS[key]; - if (!config) { - throw new ConfigError(`Unknown model: ${key}. Available: ${Object.keys(MODELS).join(', ')}`); - } - return config; -} - -/** - * Rough token estimate (~4 chars per token for code/English). - * Conservative — avoids adding a tokenizer dependency. - */ -export function estimateTokens(text) { - return Math.ceil(text.length / 4); -} - -/** - * Extract leading comment text (JSDoc, //, #, etc.) above a function line. - * Returns the cleaned comment text or null if none found. - */ -function extractLeadingComment(lines, fnLineIndex) { - if (fnLineIndex > lines.length) return null; - const raw = []; - for (let i = fnLineIndex - 1; i >= Math.max(0, fnLineIndex - 15); i--) { - if (i >= lines.length) continue; - const trimmed = lines[i].trim(); - if (/^(\/\/|\/\*|\*\/|\*|#|\/\/\/)/.test(trimmed)) { - raw.unshift(trimmed); - } else if (trimmed === '') { - if (raw.length > 0) break; - } else { - break; - } - } - if (raw.length === 0) return null; - return raw - .map((line) => - line - .replace(/^\/\*\*?\s?|\*\/$/g, '') // opening /** or /* and closing */ - .replace(/^\*\s?/, '') // middle * lines - .replace(/^\/\/\/?\s?/, '') // // or /// - .replace(/^#\s?/, '') // # (Python/Ruby) - .trim(), - ) - .filter((l) => l.length > 0) - .join(' '); -} - -/** - * Build graph-enriched text for a symbol using dependency context. 
- * Produces compact, semantic text (~100 tokens) instead of full source code. - */ -function buildStructuredText(node, file, lines, db) { - const readable = splitIdentifier(node.name); - const parts = [`${node.kind} ${node.name} (${readable}) in ${file}`]; - const startLine = Math.max(0, node.line - 1); - - // Extract parameters from signature (best-effort, single-line) - const sigLine = lines[startLine] || ''; - const paramMatch = sigLine.match(/\(([^)]*)\)/); - if (paramMatch?.[1]?.trim()) { - parts.push(`Parameters: ${paramMatch[1].trim()}`); - } - - // Graph context: callees (capped at 10) - const callees = findCalleeNames(db, node.id); - if (callees.length > 0) { - parts.push(`Calls: ${callees.slice(0, 10).join(', ')}`); - } - - // Graph context: callers (capped at 10) - const callers = findCallerNames(db, node.id); - if (callers.length > 0) { - parts.push(`Called by: ${callers.slice(0, 10).join(', ')}`); - } - - // Leading comment (high semantic value) or first few lines of code - const comment = extractLeadingComment(lines, startLine); - if (comment) { - parts.push(comment); - } else { - const endLine = Math.min(lines.length, startLine + 4); - const snippet = lines.slice(startLine, endLine).join('\n').trim(); - if (snippet) parts.push(snippet); - } - - return parts.join('\n'); -} - -/** - * Build raw source-code text for a symbol (original strategy). - */ -function buildSourceText(node, file, lines) { - const startLine = Math.max(0, node.line - 1); - const endLine = node.end_line - ? Math.min(lines.length, node.end_line) - : Math.min(lines.length, startLine + 15); - const context = lines.slice(startLine, endLine).join('\n'); - const readable = splitIdentifier(node.name); - return `${node.kind} ${node.name} (${readable}) in ${file}\n${context}`; -} - -/** - * Prompt the user to install a missing package interactively. - * Returns true if the package was installed, false otherwise. - * Skips the prompt entirely in non-TTY environments (CI, piped stdin). 
- */ -function promptInstall(packageName) { - if (!process.stdin.isTTY) return Promise.resolve(false); - - return new Promise((resolve) => { - const rl = createInterface({ input: process.stdin, output: process.stderr }); - rl.question(`Semantic search requires ${packageName}. Install it now? [y/N] `, (answer) => { - rl.close(); - if (answer.trim().toLowerCase() !== 'y') return resolve(false); - try { - execFileSync('npm', ['install', packageName], { - stdio: 'inherit', - timeout: 300_000, - }); - resolve(true); - } catch { - resolve(false); - } - }); - }); -} - -/** - * Lazy-load @huggingface/transformers. - * If the package is missing, prompts the user to install it interactively. - * In non-TTY environments, prints an error and exits. - */ -async function loadTransformers() { - try { - return await import('@huggingface/transformers'); - } catch { - const pkg = '@huggingface/transformers'; - const installed = await promptInstall(pkg); - if (installed) { - try { - return await import(pkg); - } catch (loadErr) { - throw new EngineError( - `${pkg} was installed but failed to load. Please check your environment.`, - { cause: loadErr }, - ); - } - } - throw new EngineError(`Semantic search requires ${pkg}.\nInstall it with: npm install ${pkg}`); - } -} - -/** - * Dispose the current ONNX session and free memory. - * Safe to call when no model is loaded (no-op). 
- */ -export async function disposeModel() { - if (extractor) { - await extractor.dispose(); - extractor = null; - } - activeModel = null; -} - -async function loadModel(modelKey) { - const config = getModelConfig(modelKey); - - if (extractor && activeModel === config.name) return { extractor, config }; - - // Dispose previous model before loading a different one - await disposeModel(); - - const transformers = await loadTransformers(); - pipeline = transformers.pipeline; - _cos_sim = transformers.cos_sim; - - info(`Loading embedding model: ${config.name} (${config.dim}d)...`); - const pipelineOpts = config.quantized ? { quantized: true } : {}; - try { - extractor = await pipeline('feature-extraction', config.name, pipelineOpts); - } catch (err) { - const msg = err.message || String(err); - if (msg.includes('Unauthorized') || msg.includes('401') || msg.includes('gated')) { - throw new EngineError( - `Model "${config.name}" requires authentication.\n` + - `This model is gated on HuggingFace and needs an access token.\n\n` + - `Options:\n` + - ` 1. Set HF_TOKEN env var: export HF_TOKEN=hf_...\n` + - ` 2. Use a public model instead: codegraph embed --model minilm`, - { cause: err }, - ); - } - throw new EngineError( - `Failed to load model "${config.name}": ${msg}\n` + - `Try a different model: codegraph embed --model minilm`, - { cause: err }, - ); - } - activeModel = config.name; - info('Model loaded.'); - return { extractor, config }; -} - -/** - * Generate embeddings for an array of texts. 
- */ -export async function embed(texts, modelKey) { - const { extractor: ext, config } = await loadModel(modelKey); - const dim = config.dim; - const results = []; - const batchSize = BATCH_SIZE_MAP[modelKey || DEFAULT_MODEL] || DEFAULT_BATCH_SIZE; - - for (let i = 0; i < texts.length; i += batchSize) { - const batch = texts.slice(i, i + batchSize); - const output = await ext(batch, { pooling: 'mean', normalize: true }); - - for (let j = 0; j < batch.length; j++) { - const start = j * dim; - const vec = new Float32Array(dim); - for (let k = 0; k < dim; k++) { - vec[k] = output.data[start + k]; - } - results.push(vec); - } - - if (texts.length > batchSize) { - process.stdout.write(` Embedded ${Math.min(i + batchSize, texts.length)}/${texts.length}\r`); - } - } - - return { vectors: results, dim }; -} - -/** - * Cosine similarity between two Float32Arrays. - */ -export function cosineSim(a, b) { - let dot = 0, - normA = 0, - normB = 0; - for (let i = 0; i < a.length; i++) { - dot += a[i] * b[i]; - normA += a[i] * a[i]; - normB += b[i] * b[i]; - } - return dot / (Math.sqrt(normA) * Math.sqrt(normB)); -} - -function initEmbeddingsSchema(db) { - db.exec(` - CREATE TABLE IF NOT EXISTS embeddings ( - node_id INTEGER PRIMARY KEY, - vector BLOB NOT NULL, - text_preview TEXT, - FOREIGN KEY(node_id) REFERENCES nodes(id) - ); - CREATE TABLE IF NOT EXISTS embedding_meta ( - key TEXT PRIMARY KEY, - value TEXT - ); - `); - - // Add full_text column (idempotent — ignore if already exists) - try { - db.exec('ALTER TABLE embeddings ADD COLUMN full_text TEXT'); - } catch { - /* column already exists */ - } - - // FTS5 virtual table for BM25 keyword search - db.exec(` - CREATE VIRTUAL TABLE IF NOT EXISTS fts_index USING fts5( - name, - content, - tokenize='unicode61' - ); - `); -} - -/** - * Build embeddings for all functions/methods/classes in the graph. 
- * @param {string} rootDir - Project root directory - * @param {string} modelKey - Model identifier from MODELS registry - * @param {string} [customDbPath] - Override path to graph.db - * @param {object} [options] - Embedding options - * @param {string} [options.strategy='structured'] - 'structured' (graph-enriched) or 'source' (raw code) - */ -export async function buildEmbeddings(rootDir, modelKey, customDbPath, options = {}) { - const strategy = options.strategy || 'structured'; - const dbPath = customDbPath || findDbPath(null); - - if (!fs.existsSync(dbPath)) { - throw new DbError( - `No codegraph database found at ${dbPath}.\nRun "codegraph build" first to analyze your codebase.`, - { file: dbPath }, - ); - } - - const db = openDb(dbPath); - initEmbeddingsSchema(db); - - db.exec('DELETE FROM embeddings'); - db.exec('DELETE FROM embedding_meta'); - db.exec('DELETE FROM fts_index'); - - const nodes = db - .prepare( - `SELECT * FROM nodes WHERE kind IN ('function', 'method', 'class') ORDER BY file, line`, - ) - .all(); - - console.log(`Building embeddings for ${nodes.length} symbols (strategy: ${strategy})...`); - - const byFile = new Map(); - for (const node of nodes) { - if (!byFile.has(node.file)) byFile.set(node.file, []); - byFile.get(node.file).push(node); - } - - const texts = []; - const nodeIds = []; - const nodeNames = []; - const previews = []; - const config = getModelConfig(modelKey); - const contextWindow = config.contextWindow; - let overflowCount = 0; - - for (const [file, fileNodes] of byFile) { - const fullPath = path.join(rootDir, file); - let lines; - try { - lines = fs.readFileSync(fullPath, 'utf-8').split('\n'); - } catch (err) { - warn(`Cannot read ${file} for embeddings: ${err.message}`); - continue; - } - - for (const node of fileNodes) { - let text = - strategy === 'structured' - ? 
buildStructuredText(node, file, lines, db) - : buildSourceText(node, file, lines); - - // Detect and handle context window overflow - const tokens = estimateTokens(text); - if (tokens > contextWindow) { - overflowCount++; - const maxChars = contextWindow * 4; - text = text.slice(0, maxChars); - } - - texts.push(text); - nodeIds.push(node.id); - nodeNames.push(node.name); - previews.push(`${node.name} (${node.kind}) -- ${file}:${node.line}`); - } - } - - if (overflowCount > 0) { - warn( - `${overflowCount} symbol(s) exceeded model context window (${contextWindow} tokens) and were truncated`, - ); - } - - console.log(`Embedding ${texts.length} symbols...`); - const { vectors, dim } = await embed(texts, modelKey); - - const insert = db.prepare( - 'INSERT OR REPLACE INTO embeddings (node_id, vector, text_preview, full_text) VALUES (?, ?, ?, ?)', - ); - const insertFts = db.prepare('INSERT INTO fts_index(rowid, name, content) VALUES (?, ?, ?)'); - const insertMeta = db.prepare('INSERT OR REPLACE INTO embedding_meta (key, value) VALUES (?, ?)'); - const insertAll = db.transaction(() => { - for (let i = 0; i < vectors.length; i++) { - insert.run(nodeIds[i], Buffer.from(vectors[i].buffer), previews[i], texts[i]); - insertFts.run(nodeIds[i], nodeNames[i], texts[i]); - } - insertMeta.run('model', config.name); - insertMeta.run('dim', String(dim)); - insertMeta.run('count', String(vectors.length)); - insertMeta.run('fts_count', String(vectors.length)); - insertMeta.run('strategy', strategy); - insertMeta.run('built_at', new Date().toISOString()); - if (overflowCount > 0) { - insertMeta.run('truncated_count', String(overflowCount)); - } - }); - insertAll(); - - console.log( - `\nStored ${vectors.length} embeddings (${dim}d, ${config.name}, strategy: ${strategy}) in graph.db`, - ); - closeDb(db); -} - -/** - * Shared setup for search functions: opens DB, validates embeddings/model, loads rows. - * Returns { db, rows, modelKey, storedDim } or null on failure (prints error). 
- */ -function _prepareSearch(customDbPath, opts = {}) { - const db = openReadonlyOrFail(customDbPath); - - let count; - try { - count = db.prepare('SELECT COUNT(*) as c FROM embeddings').get().c; - } catch { - console.log('No embeddings table found. Run `codegraph embed` first.'); - db.close(); - return null; - } - if (count === 0) { - console.log('No embeddings found. Run `codegraph embed` first.'); - db.close(); - return null; - } - - let storedModel = null; - let storedDim = null; - try { - const modelRow = db.prepare("SELECT value FROM embedding_meta WHERE key = 'model'").get(); - const dimRow = db.prepare("SELECT value FROM embedding_meta WHERE key = 'dim'").get(); - if (modelRow) storedModel = modelRow.value; - if (dimRow) storedDim = parseInt(dimRow.value, 10); - } catch { - /* old DB without meta table */ - } - - let modelKey = opts.model || null; - if (!modelKey && storedModel) { - for (const [key, config] of Object.entries(MODELS)) { - if (config.name === storedModel) { - modelKey = key; - break; - } - } - } - - // Pre-filter: allow filtering by kind or file pattern to reduce search space - const noTests = opts.noTests || false; - const TEST_PATTERN = /\.(test|spec)\.|__test__|__tests__|\.stories\./; - let sql = ` - SELECT e.node_id, e.vector, e.text_preview, n.name, n.kind, n.file, n.line, n.end_line, n.role - FROM embeddings e - JOIN nodes n ON e.node_id = n.id - `; - const params = []; - const conditions = []; - if (opts.kind) { - conditions.push('n.kind = ?'); - params.push(opts.kind); - } - const isGlob = opts.filePattern && /[*?[\]]/.test(opts.filePattern); - if (opts.filePattern && !isGlob) { - conditions.push('n.file LIKE ?'); - params.push(`%${opts.filePattern}%`); - } - if (conditions.length > 0) { - sql += ` WHERE ${conditions.join(' AND ')}`; - } - - let rows = db.prepare(sql).all(...params); - if (isGlob) { - rows = rows.filter((row) => globMatch(row.file, opts.filePattern)); - } - if (noTests) { - rows = rows.filter((row) => 
!TEST_PATTERN.test(row.file)); - } - - return { db, rows, modelKey, storedDim }; -} - -/** - * Single-query semantic search — returns data instead of printing. - * Returns { results: [{ name, kind, file, line, similarity }] } or null on failure. - */ -export async function searchData(query, customDbPath, opts = {}) { - const limit = opts.limit || 15; - const minScore = opts.minScore || 0.2; - - const prepared = _prepareSearch(customDbPath, opts); - if (!prepared) return null; - const { db, rows, modelKey, storedDim } = prepared; - - try { - const { - vectors: [queryVec], - dim, - } = await embed([query], modelKey); - - if (storedDim && dim !== storedDim) { - console.log( - `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`, - ); - console.log(` Re-run \`codegraph embed\` with the same model, or use --model to match.`); - return null; - } - - const hc = new Map(); - const results = []; - for (const row of rows) { - const vec = new Float32Array(new Uint8Array(row.vector).buffer); - const sim = cosineSim(queryVec, vec); - - if (sim >= minScore) { - results.push({ - ...normalizeSymbol(row, db, hc), - similarity: sim, - }); - } - } - - results.sort((a, b) => b.similarity - a.similarity); - return { results: results.slice(0, limit) }; - } finally { - db.close(); - } -} - -/** - * Multi-query semantic search with Reciprocal Rank Fusion (RRF). - * Returns { results: [{ name, kind, file, line, rrf, queryScores }] } or null on failure. 
- */ -export async function multiSearchData(queries, customDbPath, opts = {}) { - const limit = opts.limit || 15; - const minScore = opts.minScore || 0.2; - const k = opts.rrfK || 60; - - const prepared = _prepareSearch(customDbPath, opts); - if (!prepared) return null; - const { db, rows, modelKey, storedDim } = prepared; - - try { - const { vectors: queryVecs, dim } = await embed(queries, modelKey); - - // Warn about similar queries that may bias RRF results - const SIMILARITY_WARN_THRESHOLD = 0.85; - for (let i = 0; i < queryVecs.length; i++) { - for (let j = i + 1; j < queryVecs.length; j++) { - const sim = cosineSim(queryVecs[i], queryVecs[j]); - if (sim >= SIMILARITY_WARN_THRESHOLD) { - warn( - `Queries "${queries[i]}" and "${queries[j]}" are very similar ` + - `(${(sim * 100).toFixed(0)}% cosine similarity). ` + - `This may bias RRF results toward their shared matches. ` + - `Consider using more distinct queries.`, - ); - } - } - } - - if (storedDim && dim !== storedDim) { - console.log( - `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`, - ); - console.log(` Re-run \`codegraph embed\` with the same model, or use --model to match.`); - return null; - } - - // Parse row vectors once - const rowVecs = rows.map((row) => new Float32Array(new Uint8Array(row.vector).buffer)); - - // For each query: compute similarities, filter by minScore, rank - const perQueryRanked = queries.map((_query, qi) => { - const scored = []; - for (let ri = 0; ri < rows.length; ri++) { - const sim = cosineSim(queryVecs[qi], rowVecs[ri]); - if (sim >= minScore) { - scored.push({ rowIndex: ri, similarity: sim }); - } - } - scored.sort((a, b) => b.similarity - a.similarity); - // Assign 1-indexed ranks - return scored.map((item, rank) => ({ ...item, rank: rank + 1 })); - }); - - // Fuse results using RRF: for each unique row, sum 1/(k + rank_i) across queries - const fusionMap = new Map(); // rowIndex -> { rrfScore, queryScores[] } - for (let qi = 
0; qi < queries.length; qi++) { - for (const item of perQueryRanked[qi]) { - if (!fusionMap.has(item.rowIndex)) { - fusionMap.set(item.rowIndex, { rrfScore: 0, queryScores: [] }); - } - const entry = fusionMap.get(item.rowIndex); - entry.rrfScore += 1 / (k + item.rank); - entry.queryScores.push({ - query: queries[qi], - similarity: item.similarity, - rank: item.rank, - }); - } - } - - // Build results sorted by RRF score - const hc = new Map(); - const results = []; - for (const [rowIndex, entry] of fusionMap) { - const row = rows[rowIndex]; - results.push({ - ...normalizeSymbol(row, db, hc), - rrf: entry.rrfScore, - queryScores: entry.queryScores, - }); - } - - results.sort((a, b) => b.rrf - a.rrf); - return { results: results.slice(0, limit) }; - } finally { - db.close(); - } -} - -/** - * Sanitize a user query for FTS5 MATCH syntax. - * Wraps each token as an implicit OR and escapes special FTS5 characters. - */ -function sanitizeFtsQuery(query) { - // Remove FTS5 special chars that could cause syntax errors - const cleaned = query.replace(/[*"():^{}~<>]/g, ' ').trim(); - if (!cleaned) return null; - // Split into tokens, wrap with OR for multi-token queries - const tokens = cleaned.split(/\s+/).filter((t) => t.length > 0); - if (tokens.length === 0) return null; - if (tokens.length === 1) return `"${tokens[0]}"`; - return tokens.map((t) => `"${t}"`).join(' OR '); -} - -/** - * Check if the FTS5 index exists in the database. - * Returns true if fts_index table exists and has rows, false otherwise. - */ -function hasFtsIndex(db) { - try { - const row = db.prepare('SELECT COUNT(*) as c FROM fts_index').get(); - return row.c > 0; - } catch { - return false; - } -} - -/** - * BM25 keyword search via FTS5. - * Returns { results: [{ name, kind, file, line, bm25Score }] } or null if no FTS5 index. 
- */ -export function ftsSearchData(query, customDbPath, opts = {}) { - const limit = opts.limit || 15; - const noTests = opts.noTests || false; - const TEST_PATTERN = /\.(test|spec)\.|__test__|__tests__|\.stories\./; - - const db = openReadonlyOrFail(customDbPath); - - try { - if (!hasFtsIndex(db)) { - return null; - } - - const ftsQuery = sanitizeFtsQuery(query); - if (!ftsQuery) { - return { results: [] }; - } - - let sql = ` - SELECT f.rowid AS node_id, rank AS bm25_score, - n.name, n.kind, n.file, n.line, n.end_line, n.role - FROM fts_index f - JOIN nodes n ON f.rowid = n.id - WHERE fts_index MATCH ? - `; - const params = [ftsQuery]; - - if (opts.kind) { - sql += ' AND n.kind = ?'; - params.push(opts.kind); - } - - const isGlob = opts.filePattern && /[*?[\]]/.test(opts.filePattern); - if (opts.filePattern && !isGlob) { - sql += ' AND n.file LIKE ?'; - params.push(`%${opts.filePattern}%`); - } - - sql += ' ORDER BY rank LIMIT ?'; - params.push(limit * 5); // fetch generous set for post-filtering - - let rows; - try { - rows = db.prepare(sql).all(...params); - } catch { - // Invalid FTS5 query syntax — return empty - return { results: [] }; - } - - if (isGlob) { - rows = rows.filter((row) => globMatch(row.file, opts.filePattern)); - } - if (noTests) { - rows = rows.filter((row) => !TEST_PATTERN.test(row.file)); - } - - const hc = new Map(); - const results = rows.slice(0, limit).map((row) => ({ - ...normalizeSymbol(row, db, hc), - bm25Score: -row.bm25_score, // FTS5 rank is negative; negate for display - })); - - return { results }; - } finally { - db.close(); - } -} - -/** - * Hybrid BM25 + semantic search with RRF fusion. - * Returns { results: [{ name, kind, file, line, rrf, bm25Score, bm25Rank, similarity, semanticRank }] } - * or null if no FTS5 index (caller should fall back to semantic-only). 
- */ -export async function hybridSearchData(query, customDbPath, opts = {}) { - const limit = opts.limit || 15; - const k = opts.rrfK || 60; - const topK = (opts.limit || 15) * 5; - - // Split semicolons for multi-query support - const queries = - typeof query === 'string' - ? query - .split(';') - .map((q) => q.trim()) - .filter((q) => q.length > 0) - : [query]; - - // Check FTS5 availability first (sync, cheap) - const checkDb = openReadonlyOrFail(customDbPath); - const ftsAvailable = hasFtsIndex(checkDb); - checkDb.close(); - if (!ftsAvailable) return null; - - // Collect ranked lists: for each query, one BM25 list + one semantic list - const rankedLists = []; - - for (const q of queries) { - // BM25 ranked list (sync) - const bm25Data = ftsSearchData(q, customDbPath, { ...opts, limit: topK }); - if (bm25Data?.results) { - rankedLists.push( - bm25Data.results.map((r, idx) => ({ - key: `${r.name}:${r.file}:${r.line}`, - rank: idx + 1, - source: 'bm25', - ...r, - })), - ); - } - - // Semantic ranked list (async) - const semData = await searchData(q, customDbPath, { - ...opts, - limit: topK, - minScore: opts.minScore || 0.2, - }); - if (semData?.results) { - rankedLists.push( - semData.results.map((r, idx) => ({ - key: `${r.name}:${r.file}:${r.line}`, - rank: idx + 1, - source: 'semantic', - ...r, - })), - ); - } - } - - // RRF fusion across all ranked lists - const fusionMap = new Map(); - for (const list of rankedLists) { - for (const item of list) { - if (!fusionMap.has(item.key)) { - fusionMap.set(item.key, { - name: item.name, - kind: item.kind, - file: item.file, - line: item.line, - endLine: item.endLine ?? null, - role: item.role ?? null, - fileHash: item.fileHash ?? 
null, - rrfScore: 0, - bm25Score: null, - bm25Rank: null, - similarity: null, - semanticRank: null, - }); - } - const entry = fusionMap.get(item.key); - entry.rrfScore += 1 / (k + item.rank); - if (item.source === 'bm25') { - if (entry.bm25Rank === null || item.rank < entry.bm25Rank) { - entry.bm25Score = item.bm25Score; - entry.bm25Rank = item.rank; - } - } else { - if (entry.semanticRank === null || item.rank < entry.semanticRank) { - entry.similarity = item.similarity; - entry.semanticRank = item.rank; - } - } - } - } - - const results = [...fusionMap.values()] - .sort((a, b) => b.rrfScore - a.rrfScore) - .slice(0, limit) - .map((e) => ({ - name: e.name, - kind: e.kind, - file: e.file, - line: e.line, - endLine: e.endLine, - role: e.role, - fileHash: e.fileHash, - rrf: e.rrfScore, - bm25Score: e.bm25Score, - bm25Rank: e.bm25Rank, - similarity: e.similarity, - semanticRank: e.semanticRank, - })); - - return { results }; -} - -/** - * Search with mode support — CLI wrapper with multi-query detection. - * Modes: 'hybrid' (default), 'semantic', 'keyword' - */ -export async function search(query, customDbPath, opts = {}) { - const mode = opts.mode || 'hybrid'; - - // Split by semicolons, trim, filter empties - const queries = query - .split(';') - .map((q) => q.trim()) - .filter((q) => q.length > 0); - - const kindIcon = (kind) => (kind === 'function' ? 'f' : kind === 'class' ? '*' : 'o'); - - // ─── Keyword-only mode ────────────────────────────────────────────── - if (mode === 'keyword') { - const singleQuery = queries.length === 1 ? queries[0] : query; - const data = ftsSearchData(singleQuery, customDbPath, opts); - if (!data) { - console.log('No FTS5 index found. 
Run `codegraph embed` to build the keyword index.'); - return; - } - - if (opts.json) { - console.log(JSON.stringify(data, null, 2)); - return; - } - - console.log(`\nKeyword search: "${singleQuery}" (BM25)\n`); - if (data.results.length === 0) { - console.log(' No results found.'); - } else { - for (const r of data.results) { - console.log( - ` BM25 ${r.bm25Score.toFixed(2)} ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`, - ); - } - } - console.log(`\n ${data.results.length} results shown\n`); - return; - } - - // ─── Semantic-only mode ───────────────────────────────────────────── - if (mode === 'semantic') { - if (queries.length <= 1) { - const singleQuery = queries[0] || query; - const data = await searchData(singleQuery, customDbPath, opts); - if (!data) return; - - if (opts.json) { - console.log(JSON.stringify(data, null, 2)); - return; - } - - console.log(`\nSemantic search: "${singleQuery}"\n`); - if (data.results.length === 0) { - console.log(' No results above threshold.'); - } else { - for (const r of data.results) { - const bar = '#'.repeat(Math.round(r.similarity * 20)); - console.log(` ${(r.similarity * 100).toFixed(1)}% ${bar}`); - console.log(` ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`); - } - } - console.log(`\n ${data.results.length} results shown\n`); - } else { - const data = await multiSearchData(queries, customDbPath, opts); - if (!data) return; - - if (opts.json) { - console.log(JSON.stringify(data, null, 2)); - return; - } - - console.log(`\nMulti-query semantic search (RRF, k=${opts.rrfK || 60}):`); - for (let i = 0; i < queries.length; i++) console.log(` [${i + 1}] "${queries[i]}"`); - console.log(); - if (data.results.length === 0) { - console.log(' No results above threshold.'); - } else { - for (const r of data.results) { - console.log( - ` RRF ${r.rrf.toFixed(4)} ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`, - ); - for (const qs of r.queryScores) { - const bar = '#'.repeat(Math.round(qs.similarity * 20)); - 
console.log( - ` [${queries.indexOf(qs.query) + 1}] ${(qs.similarity * 100).toFixed(1)}% ${bar} (rank ${qs.rank})`, - ); - } - } - } - console.log(`\n ${data.results.length} results shown\n`); - } - return; - } - - // ─── Hybrid mode (default) ────────────────────────────────────────── - const data = await hybridSearchData(query, customDbPath, opts); - - if (!data) { - // No FTS5 index — fall back to semantic-only - warn( - 'FTS5 index not found — using semantic search only. Re-run `codegraph embed` to enable hybrid mode.', - ); - return search(query, customDbPath, { ...opts, mode: 'semantic' }); - } - - if (opts.json) { - console.log(JSON.stringify(data, null, 2)); - return; - } - - const rrfK = opts.rrfK || 60; - if (queries.length <= 1) { - const singleQuery = queries[0] || query; - console.log(`\nHybrid search: "${singleQuery}" (BM25 + semantic, RRF k=${rrfK})\n`); - } else { - console.log(`\nHybrid multi-query search (BM25 + semantic, RRF k=${rrfK}):`); - for (let i = 0; i < queries.length; i++) console.log(` [${i + 1}] "${queries[i]}"`); - console.log(); - } - - if (data.results.length === 0) { - console.log(' No results found.'); - } else { - for (const r of data.results) { - console.log( - ` RRF ${r.rrf.toFixed(4)} ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`, - ); - const parts = []; - if (r.bm25Rank != null) { - parts.push(`BM25: rank ${r.bm25Rank} (score ${r.bm25Score.toFixed(2)})`); - } - if (r.semanticRank != null) { - parts.push(`Semantic: rank ${r.semanticRank} (${(r.similarity * 100).toFixed(1)}%)`); - } - if (parts.length > 0) { - console.log(` ${parts.join(' | ')}`); - } - } - } - - console.log(`\n ${data.results.length} results shown\n`); -} diff --git a/src/embeddings/generator.js b/src/embeddings/generator.js new file mode 100644 index 00000000..8721e2ac --- /dev/null +++ b/src/embeddings/generator.js @@ -0,0 +1,163 @@ +import fs from 'node:fs'; +import path from 'node:path'; +import { closeDb, findDbPath, openDb } from '../db.js'; 
+import { DbError } from '../errors.js'; +import { warn } from '../logger.js'; +import { embed, getModelConfig } from './models.js'; +import { buildSourceText } from './strategies/source.js'; +import { buildStructuredText } from './strategies/structured.js'; + +/** + * Rough token estimate (~4 chars per token for code/English). + * Conservative — avoids adding a tokenizer dependency. + */ +export function estimateTokens(text) { + return Math.ceil(text.length / 4); +} + +export function initEmbeddingsSchema(db) { + db.exec(` + CREATE TABLE IF NOT EXISTS embeddings ( + node_id INTEGER PRIMARY KEY, + vector BLOB NOT NULL, + text_preview TEXT, + FOREIGN KEY(node_id) REFERENCES nodes(id) + ); + CREATE TABLE IF NOT EXISTS embedding_meta ( + key TEXT PRIMARY KEY, + value TEXT + ); + `); + + // Add full_text column (idempotent — ignore if already exists) + try { + db.exec('ALTER TABLE embeddings ADD COLUMN full_text TEXT'); + } catch { + /* column already exists */ + } + + // FTS5 virtual table for BM25 keyword search + db.exec(` + CREATE VIRTUAL TABLE IF NOT EXISTS fts_index USING fts5( + name, + content, + tokenize='unicode61' + ); + `); +} + +/** + * Build embeddings for all functions/methods/classes in the graph. 
+ * @param {string} rootDir - Project root directory + * @param {string} modelKey - Model identifier from MODELS registry + * @param {string} [customDbPath] - Override path to graph.db + * @param {object} [options] - Embedding options + * @param {string} [options.strategy='structured'] - 'structured' (graph-enriched) or 'source' (raw code) + */ +export async function buildEmbeddings(rootDir, modelKey, customDbPath, options = {}) { + const strategy = options.strategy || 'structured'; + const dbPath = customDbPath || findDbPath(null); + + if (!fs.existsSync(dbPath)) { + throw new DbError( + `No codegraph database found at ${dbPath}.\nRun "codegraph build" first to analyze your codebase.`, + { file: dbPath }, + ); + } + + const db = openDb(dbPath); + initEmbeddingsSchema(db); + + db.exec('DELETE FROM embeddings'); + db.exec('DELETE FROM embedding_meta'); + db.exec('DELETE FROM fts_index'); + + const nodes = db + .prepare( + `SELECT * FROM nodes WHERE kind IN ('function', 'method', 'class') ORDER BY file, line`, + ) + .all(); + + console.log(`Building embeddings for ${nodes.length} symbols (strategy: ${strategy})...`); + + const byFile = new Map(); + for (const node of nodes) { + if (!byFile.has(node.file)) byFile.set(node.file, []); + byFile.get(node.file).push(node); + } + + const texts = []; + const nodeIds = []; + const nodeNames = []; + const previews = []; + const config = getModelConfig(modelKey); + const contextWindow = config.contextWindow; + let overflowCount = 0; + + for (const [file, fileNodes] of byFile) { + const fullPath = path.join(rootDir, file); + let lines; + try { + lines = fs.readFileSync(fullPath, 'utf-8').split('\n'); + } catch (err) { + warn(`Cannot read ${file} for embeddings: ${err.message}`); + continue; + } + + for (const node of fileNodes) { + let text = + strategy === 'structured' + ? 
buildStructuredText(node, file, lines, db) + : buildSourceText(node, file, lines); + + // Detect and handle context window overflow + const tokens = estimateTokens(text); + if (tokens > contextWindow) { + overflowCount++; + const maxChars = contextWindow * 4; + text = text.slice(0, maxChars); + } + + texts.push(text); + nodeIds.push(node.id); + nodeNames.push(node.name); + previews.push(`${node.name} (${node.kind}) -- ${file}:${node.line}`); + } + } + + if (overflowCount > 0) { + warn( + `${overflowCount} symbol(s) exceeded model context window (${contextWindow} tokens) and were truncated`, + ); + } + + console.log(`Embedding ${texts.length} symbols...`); + const { vectors, dim } = await embed(texts, modelKey); + + const insert = db.prepare( + 'INSERT OR REPLACE INTO embeddings (node_id, vector, text_preview, full_text) VALUES (?, ?, ?, ?)', + ); + const insertFts = db.prepare('INSERT INTO fts_index(rowid, name, content) VALUES (?, ?, ?)'); + const insertMeta = db.prepare('INSERT OR REPLACE INTO embedding_meta (key, value) VALUES (?, ?)'); + const insertAll = db.transaction(() => { + for (let i = 0; i < vectors.length; i++) { + insert.run(nodeIds[i], Buffer.from(vectors[i].buffer), previews[i], texts[i]); + insertFts.run(nodeIds[i], nodeNames[i], texts[i]); + } + insertMeta.run('model', config.name); + insertMeta.run('dim', String(dim)); + insertMeta.run('count', String(vectors.length)); + insertMeta.run('fts_count', String(vectors.length)); + insertMeta.run('strategy', strategy); + insertMeta.run('built_at', new Date().toISOString()); + if (overflowCount > 0) { + insertMeta.run('truncated_count', String(overflowCount)); + } + }); + insertAll(); + + console.log( + `\nStored ${vectors.length} embeddings (${dim}d, ${config.name}, strategy: ${strategy}) in graph.db`, + ); + closeDb(db); +} diff --git a/src/embeddings/index.js b/src/embeddings/index.js new file mode 100644 index 00000000..bac3c60d --- /dev/null +++ b/src/embeddings/index.js @@ -0,0 +1,13 @@ +/** + * 
Embeddings subsystem — public API barrel. + * + * Re-exports everything consumers previously imported from `../embedder.js`. + */ + +export { buildEmbeddings, estimateTokens } from './generator.js'; +export { DEFAULT_MODEL, disposeModel, EMBEDDING_STRATEGIES, embed, MODELS } from './models.js'; +export { search } from './search/cli-formatter.js'; +export { hybridSearchData } from './search/hybrid.js'; +export { ftsSearchData } from './search/keyword.js'; +export { multiSearchData, searchData } from './search/semantic.js'; +export { cosineSim } from './stores/sqlite-blob.js'; diff --git a/src/embeddings/models.js b/src/embeddings/models.js new file mode 100644 index 00000000..948ad3aa --- /dev/null +++ b/src/embeddings/models.js @@ -0,0 +1,217 @@ +import { execFileSync } from 'node:child_process'; +import { createInterface } from 'node:readline'; +import { ConfigError, EngineError } from '../errors.js'; +import { info } from '../logger.js'; + +// Lazy-load transformers (heavy, optional module) +let pipeline = null; +let _cos_sim = null; +let extractor = null; +let activeModel = null; + +export const MODELS = { + minilm: { + name: 'Xenova/all-MiniLM-L6-v2', + dim: 384, + contextWindow: 256, + desc: 'Smallest, fastest (~23MB). General text.', + quantized: true, + }, + 'jina-small': { + name: 'Xenova/jina-embeddings-v2-small-en', + dim: 512, + contextWindow: 8192, + desc: 'Small, good quality (~33MB). General text.', + quantized: false, + }, + 'jina-base': { + name: 'Xenova/jina-embeddings-v2-base-en', + dim: 768, + contextWindow: 8192, + desc: 'Good quality (~137MB). General text, 8192 token context.', + quantized: false, + }, + 'jina-code': { + name: 'Xenova/jina-embeddings-v2-base-code', + dim: 768, + contextWindow: 8192, + desc: 'Code-aware (~137MB). Trained on code+text, best for code search.', + quantized: false, + }, + nomic: { + name: 'Xenova/nomic-embed-text-v1', + dim: 768, + contextWindow: 8192, + desc: 'Good local quality (~137MB). 
8192 context.', + quantized: false, + }, + 'nomic-v1.5': { + name: 'nomic-ai/nomic-embed-text-v1.5', + dim: 768, + contextWindow: 8192, + desc: 'Improved nomic (~137MB). Matryoshka dimensions, 8192 context.', + quantized: false, + }, + 'bge-large': { + name: 'Xenova/bge-large-en-v1.5', + dim: 1024, + contextWindow: 512, + desc: 'Best general retrieval (~335MB). Top MTEB scores.', + quantized: false, + }, +}; + +export const EMBEDDING_STRATEGIES = ['structured', 'source']; + +export const DEFAULT_MODEL = 'nomic-v1.5'; +const BATCH_SIZE_MAP = { + minilm: 32, + 'jina-small': 16, + 'jina-base': 8, + 'jina-code': 8, + nomic: 8, + 'nomic-v1.5': 8, + 'bge-large': 4, +}; +const DEFAULT_BATCH_SIZE = 32; + +export function getModelConfig(modelKey) { + const key = modelKey || DEFAULT_MODEL; + const config = MODELS[key]; + if (!config) { + throw new ConfigError(`Unknown model: ${key}. Available: ${Object.keys(MODELS).join(', ')}`); + } + return config; +} + +/** + * Prompt the user to install a missing package interactively. + * Returns true if the package was installed, false otherwise. + * Skips the prompt entirely in non-TTY environments (CI, piped stdin). + */ +export function promptInstall(packageName) { + if (!process.stdin.isTTY) return Promise.resolve(false); + + return new Promise((resolve) => { + const rl = createInterface({ input: process.stdin, output: process.stderr }); + rl.question(`Semantic search requires ${packageName}. Install it now? [y/N] `, (answer) => { + rl.close(); + if (answer.trim().toLowerCase() !== 'y') return resolve(false); + try { + execFileSync('npm', ['install', packageName], { + stdio: 'inherit', + timeout: 300_000, + }); + resolve(true); + } catch { + resolve(false); + } + }); + }); +} + +/** + * Lazy-load @huggingface/transformers. + * If the package is missing, prompts the user to install it interactively. + * In non-TTY environments, prints an error and exits. 
+ */ +export async function loadTransformers() { + try { + return await import('@huggingface/transformers'); + } catch { + const pkg = '@huggingface/transformers'; + const installed = await promptInstall(pkg); + if (installed) { + try { + return await import(pkg); + } catch (loadErr) { + throw new EngineError( + `${pkg} was installed but failed to load. Please check your environment.`, + { cause: loadErr }, + ); + } + } + throw new EngineError(`Semantic search requires ${pkg}.\nInstall it with: npm install ${pkg}`); + } +} + +/** + * Dispose the current ONNX session and free memory. + * Safe to call when no model is loaded (no-op). + */ +export async function disposeModel() { + if (extractor) { + await extractor.dispose(); + extractor = null; + } + activeModel = null; +} + +async function loadModel(modelKey) { + const config = getModelConfig(modelKey); + + if (extractor && activeModel === config.name) return { extractor, config }; + + // Dispose previous model before loading a different one + await disposeModel(); + + const transformers = await loadTransformers(); + pipeline = transformers.pipeline; + _cos_sim = transformers.cos_sim; + + info(`Loading embedding model: ${config.name} (${config.dim}d)...`); + const pipelineOpts = config.quantized ? { quantized: true } : {}; + try { + extractor = await pipeline('feature-extraction', config.name, pipelineOpts); + } catch (err) { + const msg = err.message || String(err); + if (msg.includes('Unauthorized') || msg.includes('401') || msg.includes('gated')) { + throw new EngineError( + `Model "${config.name}" requires authentication.\n` + + `This model is gated on HuggingFace and needs an access token.\n\n` + + `Options:\n` + + ` 1. Set HF_TOKEN env var: export HF_TOKEN=hf_...\n` + + ` 2. 
Use a public model instead: codegraph embed --model minilm`, + { cause: err }, + ); + } + throw new EngineError( + `Failed to load model "${config.name}": ${msg}\n` + + `Try a different model: codegraph embed --model minilm`, + { cause: err }, + ); + } + activeModel = config.name; + info('Model loaded.'); + return { extractor, config }; +} + +/** + * Generate embeddings for an array of texts. + */ +export async function embed(texts, modelKey) { + const { extractor: ext, config } = await loadModel(modelKey); + const dim = config.dim; + const results = []; + const batchSize = BATCH_SIZE_MAP[modelKey || DEFAULT_MODEL] || DEFAULT_BATCH_SIZE; + + for (let i = 0; i < texts.length; i += batchSize) { + const batch = texts.slice(i, i + batchSize); + const output = await ext(batch, { pooling: 'mean', normalize: true }); + + for (let j = 0; j < batch.length; j++) { + const start = j * dim; + const vec = new Float32Array(dim); + for (let k = 0; k < dim; k++) { + vec[k] = output.data[start + k]; + } + results.push(vec); + } + + if (texts.length > batchSize) { + process.stdout.write(` Embedded ${Math.min(i + batchSize, texts.length)}/${texts.length}\r`); + } + } + + return { vectors: results, dim }; +} diff --git a/src/embeddings/search/cli-formatter.js b/src/embeddings/search/cli-formatter.js new file mode 100644 index 00000000..f79a9e27 --- /dev/null +++ b/src/embeddings/search/cli-formatter.js @@ -0,0 +1,151 @@ +import { warn } from '../../logger.js'; +import { hybridSearchData } from './hybrid.js'; +import { ftsSearchData } from './keyword.js'; +import { multiSearchData, searchData } from './semantic.js'; + +/** + * Search with mode support — CLI wrapper with multi-query detection. 
+ * Modes: 'hybrid' (default), 'semantic', 'keyword' + */ +export async function search(query, customDbPath, opts = {}) { + const mode = opts.mode || 'hybrid'; + + // Split by semicolons, trim, filter empties + const queries = query + .split(';') + .map((q) => q.trim()) + .filter((q) => q.length > 0); + + const kindIcon = (kind) => (kind === 'function' ? 'f' : kind === 'class' ? '*' : 'o'); + + // ─── Keyword-only mode ────────────────────────────────────────────── + if (mode === 'keyword') { + const singleQuery = queries.length === 1 ? queries[0] : query; + const data = ftsSearchData(singleQuery, customDbPath, opts); + if (!data) { + console.log('No FTS5 index found. Run `codegraph embed` to build the keyword index.'); + return; + } + + if (opts.json) { + console.log(JSON.stringify(data, null, 2)); + return; + } + + console.log(`\nKeyword search: "${singleQuery}" (BM25)\n`); + if (data.results.length === 0) { + console.log(' No results found.'); + } else { + for (const r of data.results) { + console.log( + ` BM25 ${r.bm25Score.toFixed(2)} ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`, + ); + } + } + console.log(`\n ${data.results.length} results shown\n`); + return; + } + + // ─── Semantic-only mode ───────────────────────────────────────────── + if (mode === 'semantic') { + if (queries.length <= 1) { + const singleQuery = queries[0] || query; + const data = await searchData(singleQuery, customDbPath, opts); + if (!data) return; + + if (opts.json) { + console.log(JSON.stringify(data, null, 2)); + return; + } + + console.log(`\nSemantic search: "${singleQuery}"\n`); + if (data.results.length === 0) { + console.log(' No results above threshold.'); + } else { + for (const r of data.results) { + const bar = '#'.repeat(Math.round(r.similarity * 20)); + console.log(` ${(r.similarity * 100).toFixed(1)}% ${bar}`); + console.log(` ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`); + } + } + console.log(`\n ${data.results.length} results shown\n`); + } else { + 
const data = await multiSearchData(queries, customDbPath, opts); + if (!data) return; + + if (opts.json) { + console.log(JSON.stringify(data, null, 2)); + return; + } + + console.log(`\nMulti-query semantic search (RRF, k=${opts.rrfK || 60}):`); + for (let i = 0; i < queries.length; i++) console.log(` [${i + 1}] "${queries[i]}"`); + console.log(); + if (data.results.length === 0) { + console.log(' No results above threshold.'); + } else { + for (const r of data.results) { + console.log( + ` RRF ${r.rrf.toFixed(4)} ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`, + ); + for (const qs of r.queryScores) { + const bar = '#'.repeat(Math.round(qs.similarity * 20)); + console.log( + ` [${queries.indexOf(qs.query) + 1}] ${(qs.similarity * 100).toFixed(1)}% ${bar} (rank ${qs.rank})`, + ); + } + } + } + console.log(`\n ${data.results.length} results shown\n`); + } + return; + } + + // ─── Hybrid mode (default) ────────────────────────────────────────── + const data = await hybridSearchData(query, customDbPath, opts); + + if (!data) { + // No FTS5 index — fall back to semantic-only + warn( + 'FTS5 index not found — using semantic search only. 
Re-run `codegraph embed` to enable hybrid mode.', + ); + return search(query, customDbPath, { ...opts, mode: 'semantic' }); + } + + if (opts.json) { + console.log(JSON.stringify(data, null, 2)); + return; + } + + const rrfK = opts.rrfK || 60; + if (queries.length <= 1) { + const singleQuery = queries[0] || query; + console.log(`\nHybrid search: "${singleQuery}" (BM25 + semantic, RRF k=${rrfK})\n`); + } else { + console.log(`\nHybrid multi-query search (BM25 + semantic, RRF k=${rrfK}):`); + for (let i = 0; i < queries.length; i++) console.log(` [${i + 1}] "${queries[i]}"`); + console.log(); + } + + if (data.results.length === 0) { + console.log(' No results found.'); + } else { + for (const r of data.results) { + console.log( + ` RRF ${r.rrf.toFixed(4)} ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`, + ); + const parts = []; + if (r.bm25Rank != null) { + parts.push(`BM25: rank ${r.bm25Rank} (score ${r.bm25Score.toFixed(2)})`); + } + if (r.semanticRank != null) { + parts.push(`Semantic: rank ${r.semanticRank} (${(r.similarity * 100).toFixed(1)}%)`); + } + if (parts.length > 0) { + console.log(` ${parts.join(' | ')}`); + } + } + } + + console.log(`\n ${data.results.length} results shown\n`); +} diff --git a/src/embeddings/search/filters.js b/src/embeddings/search/filters.js new file mode 100644 index 00000000..465e51e0 --- /dev/null +++ b/src/embeddings/search/filters.js @@ -0,0 +1,46 @@ +/** + * Match a file path against a glob pattern. + * Supports *, **, and ? wildcards. Zero dependencies. + */ +export function globMatch(filePath, pattern) { + // Normalize separators to forward slashes + const normalized = filePath.replace(/\\/g, '/'); + // Escape regex specials except glob chars + let regex = pattern.replace(/\\/g, '/').replace(/[.+^${}()|[\]\\]/g, '\\$&'); + // Replace ** first (matches any path segment), then * and ? 
+ regex = regex.replace(/\*\*/g, '\0'); + regex = regex.replace(/\*/g, '[^/]*'); + regex = regex.replace(/\0/g, '.*'); + regex = regex.replace(/\?/g, '[^/]'); + try { + return new RegExp(`^${regex}$`).test(normalized); + } catch { + // Malformed pattern — fall back to substring match + return normalized.includes(pattern); + } +} + +const TEST_PATTERN = /\.(test|spec)\.|__test__|__tests__|\.stories\./; + +/** + * Apply post-query filters (glob pattern, noTests) to a set of rows. + * Mutates nothing — returns a new filtered array. + * @param {Array} rows - Rows with at least a `file` property + * @param {object} opts + * @param {string} [opts.filePattern] - Glob pattern (only applied if it contains glob chars) + * @param {boolean} [opts.noTests] - Exclude test/spec files + * @param {boolean} [opts.isGlob] - Pre-computed: does filePattern contain glob chars? + * @returns {Array} + */ +export function applyFilters(rows, opts = {}) { + let filtered = rows; + const isGlob = + opts.isGlob !== undefined ? opts.isGlob : opts.filePattern && /[*?[\]]/.test(opts.filePattern); + if (isGlob) { + filtered = filtered.filter((row) => globMatch(row.file, opts.filePattern)); + } + if (opts.noTests) { + filtered = filtered.filter((row) => !TEST_PATTERN.test(row.file)); + } + return filtered; +} diff --git a/src/embeddings/search/hybrid.js b/src/embeddings/search/hybrid.js new file mode 100644 index 00000000..759e91c7 --- /dev/null +++ b/src/embeddings/search/hybrid.js @@ -0,0 +1,121 @@ +import { openReadonlyOrFail } from '../../db.js'; +import { hasFtsIndex } from '../stores/fts5.js'; +import { ftsSearchData } from './keyword.js'; +import { searchData } from './semantic.js'; + +/** + * Hybrid BM25 + semantic search with RRF fusion. + * Returns { results: [{ name, kind, file, line, rrf, bm25Score, bm25Rank, similarity, semanticRank }] } + * or null if no FTS5 index (caller should fall back to semantic-only). 
+ */ +export async function hybridSearchData(query, customDbPath, opts = {}) { + const limit = opts.limit || 15; + const k = opts.rrfK || 60; + const topK = (opts.limit || 15) * 5; + + // Split semicolons for multi-query support + const queries = + typeof query === 'string' + ? query + .split(';') + .map((q) => q.trim()) + .filter((q) => q.length > 0) + : [query]; + + // Check FTS5 availability first (sync, cheap) + const checkDb = openReadonlyOrFail(customDbPath); + const ftsAvailable = hasFtsIndex(checkDb); + checkDb.close(); + if (!ftsAvailable) return null; + + // Collect ranked lists: for each query, one BM25 list + one semantic list + const rankedLists = []; + + for (const q of queries) { + // BM25 ranked list (sync) + const bm25Data = ftsSearchData(q, customDbPath, { ...opts, limit: topK }); + if (bm25Data?.results) { + rankedLists.push( + bm25Data.results.map((r, idx) => ({ + key: `${r.name}:${r.file}:${r.line}`, + rank: idx + 1, + source: 'bm25', + ...r, + })), + ); + } + + // Semantic ranked list (async) + const semData = await searchData(q, customDbPath, { + ...opts, + limit: topK, + minScore: opts.minScore || 0.2, + }); + if (semData?.results) { + rankedLists.push( + semData.results.map((r, idx) => ({ + key: `${r.name}:${r.file}:${r.line}`, + rank: idx + 1, + source: 'semantic', + ...r, + })), + ); + } + } + + // RRF fusion across all ranked lists + const fusionMap = new Map(); + for (const list of rankedLists) { + for (const item of list) { + if (!fusionMap.has(item.key)) { + fusionMap.set(item.key, { + name: item.name, + kind: item.kind, + file: item.file, + line: item.line, + endLine: item.endLine ?? null, + role: item.role ?? null, + fileHash: item.fileHash ?? 
null, + rrfScore: 0, + bm25Score: null, + bm25Rank: null, + similarity: null, + semanticRank: null, + }); + } + const entry = fusionMap.get(item.key); + entry.rrfScore += 1 / (k + item.rank); + if (item.source === 'bm25') { + if (entry.bm25Rank === null || item.rank < entry.bm25Rank) { + entry.bm25Score = item.bm25Score; + entry.bm25Rank = item.rank; + } + } else { + if (entry.semanticRank === null || item.rank < entry.semanticRank) { + entry.similarity = item.similarity; + entry.semanticRank = item.rank; + } + } + } + } + + const results = [...fusionMap.values()] + .sort((a, b) => b.rrfScore - a.rrfScore) + .slice(0, limit) + .map((e) => ({ + name: e.name, + kind: e.kind, + file: e.file, + line: e.line, + endLine: e.endLine, + role: e.role, + fileHash: e.fileHash, + rrf: e.rrfScore, + bm25Score: e.bm25Score, + bm25Rank: e.bm25Rank, + similarity: e.similarity, + semanticRank: e.semanticRank, + })); + + return { results }; +} diff --git a/src/embeddings/search/keyword.js b/src/embeddings/search/keyword.js new file mode 100644 index 00000000..cc8975d3 --- /dev/null +++ b/src/embeddings/search/keyword.js @@ -0,0 +1,68 @@ +import { openReadonlyOrFail } from '../../db.js'; +import { normalizeSymbol } from '../../queries.js'; +import { hasFtsIndex, sanitizeFtsQuery } from '../stores/fts5.js'; +import { applyFilters } from './filters.js'; + +/** + * BM25 keyword search via FTS5. + * Returns { results: [{ name, kind, file, line, bm25Score }] } or null if no FTS5 index. + */ +export function ftsSearchData(query, customDbPath, opts = {}) { + const limit = opts.limit || 15; + + const db = openReadonlyOrFail(customDbPath); + + try { + if (!hasFtsIndex(db)) { + return null; + } + + const ftsQuery = sanitizeFtsQuery(query); + if (!ftsQuery) { + return { results: [] }; + } + + let sql = ` + SELECT f.rowid AS node_id, rank AS bm25_score, + n.name, n.kind, n.file, n.line, n.end_line, n.role + FROM fts_index f + JOIN nodes n ON f.rowid = n.id + WHERE fts_index MATCH ? 
/**
 * Shared setup for search functions: opens DB, validates embeddings/model, loads rows.
 * Returns { db, rows, modelKey, storedDim } or null on failure (prints error).
 *
 * Ownership: on the null path the DB is closed here; on the happy path the
 * caller owns closing `db`. If anything throws after the open, the DB is
 * closed before the error propagates, so the handle never leaks.
 */
export function prepareSearch(customDbPath, opts = {}) {
  const db = openReadonlyOrFail(customDbPath);

  try {
    if (!hasEmbeddings(db)) {
      console.log('No embeddings found. Run `codegraph embed` first.');
      db.close();
      return null;
    }

    const storedModel = getEmbeddingMeta(db, 'model') || null;
    const dimStr = getEmbeddingMeta(db, 'dim');
    const storedDim = dimStr ? parseInt(dimStr, 10) : null;

    // Resolve a model key from the stored model name unless one was given explicitly.
    let modelKey = opts.model || null;
    if (!modelKey && storedModel) {
      for (const [key, config] of Object.entries(MODELS)) {
        if (config.name === storedModel) {
          modelKey = key;
          break;
        }
      }
    }

    // Pre-filter: allow filtering by kind or file pattern to reduce search space.
    // Glob patterns cannot be expressed as LIKE, so they are applied post-query
    // by applyFilters instead.
    const isGlob = opts.filePattern && /[*?[\]]/.test(opts.filePattern);
    let sql = `
    SELECT e.node_id, e.vector, e.text_preview, n.name, n.kind, n.file, n.line, n.end_line, n.role
    FROM embeddings e
    JOIN nodes n ON e.node_id = n.id
  `;
    const params = [];
    const conditions = [];
    if (opts.kind) {
      conditions.push('n.kind = ?');
      params.push(opts.kind);
    }
    if (opts.filePattern && !isGlob) {
      conditions.push('n.file LIKE ?');
      params.push(`%${opts.filePattern}%`);
    }
    if (conditions.length > 0) {
      sql += ` WHERE ${conditions.join(' AND ')}`;
    }

    let rows = db.prepare(sql).all(...params);
    rows = applyFilters(rows, { ...opts, isGlob });

    return { db, rows, modelKey, storedDim };
  } catch (err) {
    // Don't leak the handle if anything above throws unexpectedly.
    db.close();
    throw err;
  }
}
/**
 * Single-query semantic search — returns data instead of printing.
 *
 * @param {string} query - Natural-language query text.
 * @param {string} [customDbPath] - Optional database path override.
 * @param {object} [opts] - { limit, minScore, model, kind, filePattern, ... }
 * @returns {Promise<{results: Array<object>}|null>} Matches with similarity >= minScore,
 *   sorted descending, or null on failure (no embeddings / dimension mismatch).
 */
export async function searchData(query, customDbPath, opts = {}) {
  // `??` (not `||`) so explicit 0 values are honored: minScore 0 means
  // "no threshold", and limit 0 is not silently turned into 15.
  const limit = opts.limit ?? 15;
  const minScore = opts.minScore ?? 0.2;

  const prepared = prepareSearch(customDbPath, opts);
  if (!prepared) return null;
  const { db, rows, modelKey, storedDim } = prepared;

  try {
    const {
      vectors: [queryVec],
      dim,
    } = await embed([query], modelKey);

    // A dimension mismatch means the index was built with a different model;
    // comparing vectors across models is meaningless, so bail out.
    if (storedDim && dim !== storedDim) {
      console.log(
        `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`,
      );
      console.log(`  Re-run \`codegraph embed\` with the same model, or use --model to match.`);
      return null;
    }

    const hc = new Map();
    const results = [];
    for (const row of rows) {
      // Stored vectors are raw Float32 blobs; reinterpret without copying per element.
      const vec = new Float32Array(new Uint8Array(row.vector).buffer);
      const sim = cosineSim(queryVec, vec);

      if (sim >= minScore) {
        results.push({
          ...normalizeSymbol(row, db, hc),
          similarity: sim,
        });
      }
    }

    results.sort((a, b) => b.similarity - a.similarity);
    return { results: results.slice(0, limit) };
  } finally {
    db.close();
  }
}
/**
 * Multi-query semantic search with Reciprocal Rank Fusion (RRF).
 *
 * @param {string[]} queries - Distinct query texts to fuse.
 * @param {string} [customDbPath] - Optional database path override.
 * @param {object} [opts] - { limit, minScore, rrfK, model, kind, filePattern, ... }
 * @returns {Promise<{results: Array<object>}|null>} Fused matches with rrf score and
 *   per-query score breakdown, or null on failure.
 */
export async function multiSearchData(queries, customDbPath, opts = {}) {
  // `??` (not `||`) so explicit 0 values are honored (e.g. minScore 0).
  const limit = opts.limit ?? 15;
  const minScore = opts.minScore ?? 0.2;
  const k = opts.rrfK ?? 60;

  const prepared = prepareSearch(customDbPath, opts);
  if (!prepared) return null;
  const { db, rows, modelKey, storedDim } = prepared;

  try {
    const { vectors: queryVecs, dim } = await embed(queries, modelKey);

    // Warn about similar queries that may bias RRF results
    const SIMILARITY_WARN_THRESHOLD = 0.85;
    for (let i = 0; i < queryVecs.length; i++) {
      for (let j = i + 1; j < queryVecs.length; j++) {
        const sim = cosineSim(queryVecs[i], queryVecs[j]);
        if (sim >= SIMILARITY_WARN_THRESHOLD) {
          warn(
            `Queries "${queries[i]}" and "${queries[j]}" are very similar ` +
              `(${(sim * 100).toFixed(0)}% cosine similarity). ` +
              `This may bias RRF results toward their shared matches. ` +
              `Consider using more distinct queries.`,
          );
        }
      }
    }

    // A dimension mismatch means the index was built with a different model.
    if (storedDim && dim !== storedDim) {
      console.log(
        `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`,
      );
      console.log(`  Re-run \`codegraph embed\` with the same model, or use --model to match.`);
      return null;
    }

    // Parse row vectors once (reinterpret stored Float32 blobs)
    const rowVecs = rows.map((row) => new Float32Array(new Uint8Array(row.vector).buffer));

    // For each query: compute similarities, filter by minScore, rank (1-indexed)
    const perQueryRanked = queries.map((_query, qi) => {
      const scored = [];
      for (let ri = 0; ri < rows.length; ri++) {
        const sim = cosineSim(queryVecs[qi], rowVecs[ri]);
        if (sim >= minScore) {
          scored.push({ rowIndex: ri, similarity: sim });
        }
      }
      scored.sort((a, b) => b.similarity - a.similarity);
      return scored.map((item, rank) => ({ ...item, rank: rank + 1 }));
    });

    // Fuse results using RRF: for each unique row, sum 1/(k + rank_i) across queries
    const fusionMap = new Map(); // rowIndex -> { rrfScore, queryScores[] }
    for (let qi = 0; qi < queries.length; qi++) {
      for (const item of perQueryRanked[qi]) {
        if (!fusionMap.has(item.rowIndex)) {
          fusionMap.set(item.rowIndex, { rrfScore: 0, queryScores: [] });
        }
        const entry = fusionMap.get(item.rowIndex);
        entry.rrfScore += 1 / (k + item.rank);
        entry.queryScores.push({
          query: queries[qi],
          similarity: item.similarity,
          rank: item.rank,
        });
      }
    }

    // Build results sorted by RRF score
    const hc = new Map();
    const results = [];
    for (const [rowIndex, entry] of fusionMap) {
      const row = rows[rowIndex];
      results.push({
        ...normalizeSymbol(row, db, hc),
        rrf: entry.rrfScore,
        queryScores: entry.queryScores,
      });
    }

    results.sort((a, b) => b.rrf - a.rrf);
    return { results: results.slice(0, limit) };
  } finally {
    db.close();
  }
}
/**
 * Sanitize a user query for FTS5 MATCH syntax.
 * Strips characters that are FTS5 operators, quotes each remaining token,
 * and joins multiple tokens with OR (implicit any-token match).
 *
 * @param {string} query - Raw user query.
 * @returns {string|null} A safe MATCH expression, or null if nothing searchable remains.
 */
export function sanitizeFtsQuery(query) {
  // Remove FTS5 special chars that could cause syntax errors
  const cleaned = query.replace(/[*"():^{}~<>]/g, ' ').trim();
  if (!cleaned) return null;
  // A non-empty trimmed string always splits into >= 1 non-empty token, so no
  // further emptiness checks are needed; quoting one token and OR-joining many
  // collapse into the same map/join expression.
  return cleaned
    .split(/\s+/)
    .map((t) => `"${t}"`)
    .join(' OR ');
}
/**
 * Cosine similarity between two Float32Arrays.
 * Returns 0 when either vector has zero magnitude, instead of NaN from 0/0
 * (e.g. a corrupted or all-zero stored vector). In practice embed() stores
 * L2-normalised vectors, but this makes the contract explicit.
 *
 * @param {Float32Array} a
 * @param {Float32Array} b
 * @returns {number} Similarity in [-1, 1]; 0 for a zero-magnitude input.
 */
export function cosineSim(a, b) {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  const denom = Math.sqrt(normA) * Math.sqrt(normB);
  // Guard the degenerate case explicitly rather than returning NaN.
  return denom === 0 ? 0 : dot / denom;
}
/**
 * Build graph-enriched text for a symbol using dependency context.
 * Produces compact, semantic text (~100 tokens) instead of full source code.
 *
 * @param {object} node - Symbol row ({ id, name, kind, line, end_line, ... }).
 * @param {string} file - File path for display.
 * @param {string[]} lines - File contents split into lines.
 * @param {object} db - Open database handle for graph lookups.
 * @returns {string} Newline-joined descriptive text.
 */
export function buildStructuredText(node, file, lines, db) {
  const startLine = Math.max(0, node.line - 1);
  const readable = splitIdentifier(node.name);
  const parts = [`${node.kind} ${node.name} (${readable}) in ${file}`];

  // Best-effort parameter extraction from the signature (single-line only).
  const signature = lines[startLine] || '';
  const paramText = signature.match(/\(([^)]*)\)/)?.[1]?.trim();
  if (paramText) {
    parts.push(`Parameters: ${paramText}`);
  }

  // Graph context: outgoing and incoming call edges, each capped at 10 names.
  const callees = findCalleeNames(db, node.id);
  if (callees.length > 0) {
    parts.push(`Calls: ${callees.slice(0, 10).join(', ')}`);
  }
  const callers = findCallerNames(db, node.id);
  if (callers.length > 0) {
    parts.push(`Called by: ${callers.slice(0, 10).join(', ')}`);
  }

  // Prefer the leading comment (high semantic value); otherwise fall back to
  // the first few lines of the body as a snippet.
  const comment = extractLeadingComment(lines, startLine);
  if (comment) {
    parts.push(comment);
  } else {
    const snippet = lines
      .slice(startLine, Math.min(lines.length, startLine + 4))
      .join('\n')
      .trim();
    if (snippet) {
      parts.push(snippet);
    }
  }

  return parts.join('\n');
}
/**
 * Extract leading comment text (JSDoc, //, #, etc.) above a function line.
 * Scans at most 15 lines upward; blank lines between the function and its
 * comment block are tolerated, while a blank line above the comment block
 * (or any code line) ends the scan.
 *
 * @param {string[]} lines - File contents split into lines.
 * @param {number} fnLineIndex - 0-based index of the function's first line.
 * @returns {string|null} Cleaned, space-joined comment text, or null if none found.
 */
export function extractLeadingComment(lines, fnLineIndex) {
  if (fnLineIndex > lines.length) return null;

  const commentPrefix = /^(\/\/|\/\*|\*\/|\*|#|\/\/\/)/;
  const collected = [];
  const lowest = Math.max(0, fnLineIndex - 15);

  for (let i = fnLineIndex - 1; i >= lowest; i--) {
    if (i >= lines.length) continue; // tolerate an out-of-range start index
    const text = lines[i].trim();
    if (commentPrefix.test(text)) {
      collected.unshift(text);
      continue;
    }
    if (text === '' && collected.length === 0) {
      continue; // skip blank lines below the comment block
    }
    break; // code line, or blank line above the comment block — stop scanning
  }

  if (collected.length === 0) return null;

  // Strip comment markers line by line, drop lines that were pure markup,
  // and join the rest into a single space-separated string.
  return collected
    .map((line) =>
      line
        .replace(/^\/\*\*?\s?|\*\/$/g, '') // opening /** or /* and closing */
        .replace(/^\*\s?/, '') // middle * lines
        .replace(/^\/\/\/?\s?/, '') // // or ///
        .replace(/^#\s?/, '') // # (Python/Ruby)
        .trim(),
    )
    .filter((part) => part.length > 0)
    .join(' ');
}
from './embeddings/index.js'; export { AnalysisError, BoundaryError, diff --git a/src/mcp/tools/semantic-search.js b/src/mcp/tools/semantic-search.js index 06ef8354..2fa22ea5 100644 --- a/src/mcp/tools/semantic-search.js +++ b/src/mcp/tools/semantic-search.js @@ -11,7 +11,7 @@ export async function handler(args, ctx) { }; if (mode === 'keyword') { - const { ftsSearchData } = await import('../../embedder.js'); + const { ftsSearchData } = await import('../../embeddings/index.js'); const result = ftsSearchData(args.query, ctx.dbPath, searchOpts); if (result === null) { return { @@ -28,7 +28,7 @@ export async function handler(args, ctx) { } if (mode === 'semantic') { - const { searchData } = await import('../../embedder.js'); + const { searchData } = await import('../../embeddings/index.js'); const result = await searchData(args.query, ctx.dbPath, searchOpts); if (result === null) { return { @@ -45,7 +45,7 @@ export async function handler(args, ctx) { } // hybrid (default) — falls back to semantic if no FTS5 - const { hybridSearchData, searchData } = await import('../../embedder.js'); + const { hybridSearchData, searchData } = await import('../../embeddings/index.js'); let result = await hybridSearchData(args.query, ctx.dbPath, searchOpts); if (result === null) { result = await searchData(args.query, ctx.dbPath, searchOpts); diff --git a/tests/search/embedder-search.test.js b/tests/search/embedder-search.test.js index 93ea518c..86fe5543 100644 --- a/tests/search/embedder-search.test.js +++ b/tests/search/embedder-search.test.js @@ -38,7 +38,7 @@ import { multiSearchData, search, searchData, -} from '../../src/embedder.js'; +} from '../../src/embeddings/index.js'; // ─── Helpers ─────────────────────────────────────────────────────────── diff --git a/tests/search/embedding-regression.test.js b/tests/search/embedding-regression.test.js index f1004bf3..56222875 100644 --- a/tests/search/embedding-regression.test.js +++ b/tests/search/embedding-regression.test.js @@ -23,7 
+23,7 @@ try { // Lazy-import to avoid top-level errors when transformers is missing const { buildGraph } = await import('../../src/builder.js'); -const { buildEmbeddings, searchData } = await import('../../src/embedder.js'); +const { buildEmbeddings, searchData } = await import('../../src/embeddings/index.js'); // Same ES-module fixture files used by build.test.js const FIXTURE_FILES = { diff --git a/tests/search/embedding-strategy.test.js b/tests/search/embedding-strategy.test.js index e1553678..70215559 100644 --- a/tests/search/embedding-strategy.test.js +++ b/tests/search/embedding-strategy.test.js @@ -31,7 +31,7 @@ import { EMBEDDING_STRATEGIES, estimateTokens, MODELS, -} from '../../src/embedder.js'; +} from '../../src/embeddings/index.js'; // ─── Helpers ─────────────────────────────────────────────────────────── diff --git a/tests/unit/prompt-install.test.js b/tests/unit/prompt-install.test.js index 6a36c2de..98cd926f 100644 --- a/tests/unit/prompt-install.test.js +++ b/tests/unit/prompt-install.test.js @@ -44,7 +44,7 @@ describe('loadTransformers install prompt', () => { throw new Error('Cannot find package'); }); - const { embed } = await import('../../src/embedder.js'); + const { embed } = await import('../../src/embeddings/index.js'); await expect(embed(['test'], 'minilm')).rejects.toThrow( 'Semantic search requires @huggingface/transformers', @@ -71,7 +71,7 @@ describe('loadTransformers install prompt', () => { throw new Error('Cannot find package'); }); - const { embed } = await import('../../src/embedder.js'); + const { embed } = await import('../../src/embeddings/index.js'); await expect(embed(['test'], 'minilm')).rejects.toThrow( 'Semantic search requires @huggingface/transformers', @@ -99,7 +99,7 @@ describe('loadTransformers install prompt', () => { throw new Error('Cannot find package'); }); - const { embed } = await import('../../src/embedder.js'); + const { embed } = await import('../../src/embeddings/index.js'); await expect(embed(['test'], 
'minilm')).rejects.toThrow( 'Semantic search requires @huggingface/transformers', @@ -137,7 +137,7 @@ describe('loadTransformers install prompt', () => { }; }); - const { embed } = await import('../../src/embedder.js'); + const { embed } = await import('../../src/embeddings/index.js'); const result = await embed(['test text'], 'minilm'); expect(result.vectors).toHaveLength(1); From 83678c3959553a21b6b98ff93239ea7f5c8a368e Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Fri, 13 Mar 2026 02:24:23 -0600 Subject: [PATCH 4/8] fix: address review feedback on embedder extraction - Remove dead _cos_sim variable from models.js (greptile) - Fix embedding-benchmark.js import path (greptile) - Update workflow path filters and cache keys for new directory (greptile) - Update stale file references in test comments and CLAUDE.md (greptile) Impact: 1 functions changed, 1 affected --- .github/workflows/benchmark.yml | 2 +- .github/workflows/embedding-regression.yml | 2 +- CLAUDE.md | 2 +- scripts/embedding-benchmark.js | 2 +- src/embeddings/models.js | 2 -- tests/unit/prompt-install.test.js | 2 +- 6 files changed, 5 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 0393e4b9..670b8b9e 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -228,7 +228,7 @@ jobs: uses: actions/cache@v5 with: path: ~/.cache/huggingface - key: hf-models-${{ runner.os }}-${{ hashFiles('src/embedder.js') }} + key: hf-models-${{ runner.os }}-${{ hashFiles('src/embeddings/**') }} restore-keys: hf-models-${{ runner.os }}- - name: Build graph diff --git a/.github/workflows/embedding-regression.yml b/.github/workflows/embedding-regression.yml index 7cecee3f..a42cc6e7 100644 --- a/.github/workflows/embedding-regression.yml +++ b/.github/workflows/embedding-regression.yml @@ -6,7 +6,7 @@ on: workflow_dispatch: pull_request: paths: - - 'src/embedder.js' + - 
'src/embeddings/**' - 'tests/search/**' - 'package.json' diff --git a/CLAUDE.md b/CLAUDE.md index 46119fff..1b25676b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -45,7 +45,7 @@ JS source is plain JavaScript (ES modules) in `src/`. No transpilation step. The | `builder.js` | Graph building: file collection, parsing, import resolution, incremental hashing | | `parser.js` | tree-sitter WASM wrapper; `LANGUAGE_REGISTRY` + per-language extractors for functions, classes, methods, imports, exports, call sites | | `queries.js` | Query functions: symbol search, file deps, impact analysis, diff-impact; `SYMBOL_KINDS` constant defines all node kinds | -| `embedder.js` | Semantic search with `@huggingface/transformers`; multi-query RRF ranking | +| `embeddings/` | Embedding subsystem: model management, vector generation, semantic/keyword/hybrid search, CLI formatting | | `db.js` | SQLite schema and operations (`better-sqlite3`) | | `mcp.js` | MCP server exposing graph queries to AI agents; single-repo by default, `--multi-repo` to enable cross-repo access | | `cycles.js` | Circular dependency detection | diff --git a/scripts/embedding-benchmark.js b/scripts/embedding-benchmark.js index 51738074..4bc3afec 100644 --- a/scripts/embedding-benchmark.js +++ b/scripts/embedding-benchmark.js @@ -26,7 +26,7 @@ const { version, srcDir, cleanup } = await resolveBenchmarkSource(); const dbPath = path.join(root, '.codegraph', 'graph.db'); const { buildEmbeddings, MODELS, searchData, disposeModel } = await import( - srcImport(srcDir, 'embedder.js') + srcImport(srcDir, 'embeddings/index.js') ); // Redirect console.log to stderr so only JSON goes to stdout diff --git a/src/embeddings/models.js b/src/embeddings/models.js index 948ad3aa..949f6c85 100644 --- a/src/embeddings/models.js +++ b/src/embeddings/models.js @@ -5,7 +5,6 @@ import { info } from '../logger.js'; // Lazy-load transformers (heavy, optional module) let pipeline = null; -let _cos_sim = null; let extractor = null; let activeModel = 
null; @@ -157,7 +156,6 @@ async function loadModel(modelKey) { const transformers = await loadTransformers(); pipeline = transformers.pipeline; - _cos_sim = transformers.cos_sim; info(`Loading embedding model: ${config.name} (${config.dim}d)...`); const pipelineOpts = config.quantized ? { quantized: true } : {}; diff --git a/tests/unit/prompt-install.test.js b/tests/unit/prompt-install.test.js index 98cd926f..f23a73f8 100644 --- a/tests/unit/prompt-install.test.js +++ b/tests/unit/prompt-install.test.js @@ -1,5 +1,5 @@ /** - * Unit tests for the interactive install prompt in src/embedder.js. + * Unit tests for the interactive install prompt in src/embeddings/models.js. * * Tests the promptInstall() + loadTransformers() flow when * @huggingface/transformers is missing. From b1ca54d4ce6148d275e96786da38e576c4ff297c Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Fri, 13 Mar 2026 02:47:22 -0600 Subject: [PATCH 5/8] fix: harden prepareSearch with try/catch for DB leak and use getEmbeddingCount - Wrap post-open logic in try/catch so DB is closed on unexpected exceptions - Switch from hasEmbeddings to getEmbeddingCount for clearer zero-count check Impact: 1 functions changed, 0 affected --- src/embeddings/search/prepare.js | 78 ++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/src/embeddings/search/prepare.js b/src/embeddings/search/prepare.js index fae92d0d..864bfbe9 100644 --- a/src/embeddings/search/prepare.js +++ b/src/embeddings/search/prepare.js @@ -1,4 +1,4 @@ -import { getEmbeddingMeta, hasEmbeddings } from '../../db/repository/embeddings.js'; +import { getEmbeddingCount, getEmbeddingMeta } from '../../db/repository/embeddings.js'; import { openReadonlyOrFail } from '../../db.js'; import { MODELS } from '../models.js'; import { applyFilters } from './filters.js'; @@ -6,53 +6,61 @@ import { applyFilters } from './filters.js'; /** * Shared setup for search functions: opens 
DB, validates embeddings/model, loads rows. * Returns { db, rows, modelKey, storedDim } or null on failure (prints error). + * On null return, the DB is closed. On exception, the DB is also closed + * (callers only need to close DB from the returned object on the happy path). */ export function prepareSearch(customDbPath, opts = {}) { const db = openReadonlyOrFail(customDbPath); - if (!hasEmbeddings(db)) { - console.log('No embeddings found. Run `codegraph embed` first.'); - db.close(); - return null; - } + try { + const count = getEmbeddingCount(db); + if (count === 0) { + console.log('No embeddings found. Run `codegraph embed` first.'); + db.close(); + return null; + } - const storedModel = getEmbeddingMeta(db, 'model') || null; - const dimStr = getEmbeddingMeta(db, 'dim'); - const storedDim = dimStr ? parseInt(dimStr, 10) : null; + const storedModel = getEmbeddingMeta(db, 'model') || null; + const dimStr = getEmbeddingMeta(db, 'dim'); + const storedDim = dimStr ? parseInt(dimStr, 10) : null; - let modelKey = opts.model || null; - if (!modelKey && storedModel) { - for (const [key, config] of Object.entries(MODELS)) { - if (config.name === storedModel) { - modelKey = key; - break; + let modelKey = opts.model || null; + if (!modelKey && storedModel) { + for (const [key, config] of Object.entries(MODELS)) { + if (config.name === storedModel) { + modelKey = key; + break; + } } } - } - // Pre-filter: allow filtering by kind or file pattern to reduce search space - const isGlob = opts.filePattern && /[*?[\]]/.test(opts.filePattern); - let sql = ` + // Pre-filter: allow filtering by kind or file pattern to reduce search space + const isGlob = opts.filePattern && /[*?[\]]/.test(opts.filePattern); + let sql = ` SELECT e.node_id, e.vector, e.text_preview, n.name, n.kind, n.file, n.line, n.end_line, n.role FROM embeddings e JOIN nodes n ON e.node_id = n.id `; - const params = []; - const conditions = []; - if (opts.kind) { - conditions.push('n.kind = ?'); - 
params.push(opts.kind); - } - if (opts.filePattern && !isGlob) { - conditions.push('n.file LIKE ?'); - params.push(`%${opts.filePattern}%`); - } - if (conditions.length > 0) { - sql += ` WHERE ${conditions.join(' AND ')}`; - } + const params = []; + const conditions = []; + if (opts.kind) { + conditions.push('n.kind = ?'); + params.push(opts.kind); + } + if (opts.filePattern && !isGlob) { + conditions.push('n.file LIKE ?'); + params.push(`%${opts.filePattern}%`); + } + if (conditions.length > 0) { + sql += ` WHERE ${conditions.join(' AND ')}`; + } - let rows = db.prepare(sql).all(...params); - rows = applyFilters(rows, { ...opts, isGlob }); + let rows = db.prepare(sql).all(...params); + rows = applyFilters(rows, { ...opts, isGlob }); - return { db, rows, modelKey, storedDim }; + return { db, rows, modelKey, storedDim }; + } catch (err) { + db.close(); + throw err; + } } From 9d781df8d4712555030d2814b8b03a3f996791ac Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Fri, 13 Mar 2026 03:10:04 -0600 Subject: [PATCH 6/8] fix: guard cosineSim against zero-magnitude vectors returning NaN Return 0 instead of NaN when either vector has zero magnitude (e.g. corrupted DB row). In practice embed() stores L2-normalised vectors, but this makes the contract explicit. Impact: 1 functions changed, 0 affected --- src/embeddings/stores/sqlite-blob.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/embeddings/stores/sqlite-blob.js b/src/embeddings/stores/sqlite-blob.js index e3979b2c..75037ffa 100644 --- a/src/embeddings/stores/sqlite-blob.js +++ b/src/embeddings/stores/sqlite-blob.js @@ -19,5 +19,6 @@ export function cosineSim(a, b) { normA += a[i] * a[i]; normB += b[i] * b[i]; } - return dot / (Math.sqrt(normA) * Math.sqrt(normB)); + const denom = Math.sqrt(normA) * Math.sqrt(normB); + return denom === 0 ? 
0 : dot / denom; } From 78aa1d5384357697e6767a2f450d060d89a7bc05 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Fri, 13 Mar 2026 03:21:13 -0600 Subject: [PATCH 7/8] fix: add @internal JSDoc tags to non-public model helpers Mark getModelConfig, promptInstall, and loadTransformers as @internal since they are exported only for sibling module use, not the public barrel. --- src/embeddings/models.js | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/embeddings/models.js b/src/embeddings/models.js index 949f6c85..1202dd28 100644 --- a/src/embeddings/models.js +++ b/src/embeddings/models.js @@ -74,6 +74,7 @@ const BATCH_SIZE_MAP = { }; const DEFAULT_BATCH_SIZE = 32; +/** @internal Used by generator.js — not part of the public barrel. */ export function getModelConfig(modelKey) { const key = modelKey || DEFAULT_MODEL; const config = MODELS[key]; @@ -87,6 +88,7 @@ export function getModelConfig(modelKey) { * Prompt the user to install a missing package interactively. * Returns true if the package was installed, false otherwise. * Skips the prompt entirely in non-TTY environments (CI, piped stdin). + * @internal Not part of the public barrel. */ export function promptInstall(packageName) { if (!process.stdin.isTTY) return Promise.resolve(false); @@ -113,6 +115,7 @@ export function promptInstall(packageName) { * Lazy-load @huggingface/transformers. * If the package is missing, prompts the user to install it interactively. * In non-TTY environments, prints an error and exits. + * @internal Not part of the public barrel. 
*/ export async function loadTransformers() { try { From 37ba7e13967f24c78b128e8fac14c8f37f450b57 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Fri, 13 Mar 2026 03:36:49 -0600 Subject: [PATCH 8/8] =?UTF-8?q?fix:=20unexport=20initEmbeddingsSchema=20?= =?UTF-8?q?=E2=80=94=20only=20used=20within=20generator.js?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: 1 functions changed, 1 affected --- src/embeddings/generator.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/embeddings/generator.js b/src/embeddings/generator.js index 8721e2ac..b34f5934 100644 --- a/src/embeddings/generator.js +++ b/src/embeddings/generator.js @@ -15,7 +15,7 @@ export function estimateTokens(text) { return Math.ceil(text.length / 4); } -export function initEmbeddingsSchema(db) { +function initEmbeddingsSchema(db) { db.exec(` CREATE TABLE IF NOT EXISTS embeddings ( node_id INTEGER PRIMARY KEY,