From f5bcfe78c5c582196342bfd2005186b2d4b092b6 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Fri, 13 Mar 2026 02:18:43 -0600 Subject: [PATCH 1/5] refactor: decompose buildGraph() into pipeline stages (ROADMAP 3.9) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split the 1,487-line monolithic buildGraph() into independently testable pipeline stages communicating through a shared PipelineContext object. New directory structure: src/builder/ context.js — PipelineContext class (shared mutable state) helpers.js — Extracted utilities (collectFiles, fileHash, batch inserts) incremental.js — Single-file rebuild for watch mode pipeline.js — Orchestrator: setup → stages → return stages/ collect-files.js — File collection + scoped rebuild detect-changes.js — Three-tier change detection + reverse-dep cascade parse-files.js — parseFilesAuto dispatch insert-nodes.js — Batch node/edge insertion in SQLite transaction resolve-imports.js— Batch import resolution + barrel map build-edges.js — Import/call/receiver/extends/implements edges build-structure.js— Directory structure + node role classification run-analyses.js — Unified AST analysis engine dispatch finalize.js — Metadata, drift detection, cleanup, registry src/builder.js is now a barrel re-export (11 lines) following the established src/db.js pattern. All existing imports continue working. src/watcher.js now delegates to builder/incremental.js instead of duplicating 140 lines of node insertion and edge building logic. 
Impact: 28 functions changed, 27 affected --- src/builder.js | 1495 +------------------------ src/builder/context.js | 85 ++ src/builder/helpers.js | 215 ++++ src/builder/incremental.js | 179 +++ src/builder/pipeline.js | 125 +++ src/builder/stages/build-edges.js | 297 +++++ src/builder/stages/build-structure.js | 113 ++ src/builder/stages/collect-files.js | 44 + src/builder/stages/detect-changes.js | 413 +++++++ src/builder/stages/finalize.js | 139 +++ src/builder/stages/insert-nodes.js | 170 +++ src/builder/stages/parse-files.js | 28 + src/builder/stages/resolve-imports.js | 143 +++ src/builder/stages/run-analyses.js | 44 + src/watcher.js | 152 +-- tests/builder/collect-files.test.js | 70 ++ tests/builder/context.test.js | 42 + tests/builder/detect-changes.test.js | 144 +++ tests/builder/pipeline.test.js | 79 ++ 19 files changed, 2346 insertions(+), 1631 deletions(-) create mode 100644 src/builder/context.js create mode 100644 src/builder/helpers.js create mode 100644 src/builder/incremental.js create mode 100644 src/builder/pipeline.js create mode 100644 src/builder/stages/build-edges.js create mode 100644 src/builder/stages/build-structure.js create mode 100644 src/builder/stages/collect-files.js create mode 100644 src/builder/stages/detect-changes.js create mode 100644 src/builder/stages/finalize.js create mode 100644 src/builder/stages/insert-nodes.js create mode 100644 src/builder/stages/parse-files.js create mode 100644 src/builder/stages/resolve-imports.js create mode 100644 src/builder/stages/run-analyses.js create mode 100644 tests/builder/collect-files.test.js create mode 100644 tests/builder/context.test.js create mode 100644 tests/builder/detect-changes.test.js create mode 100644 tests/builder/pipeline.test.js diff --git a/src/builder.js b/src/builder.js index f4ab372..134f109 100644 --- a/src/builder.js +++ b/src/builder.js @@ -1,1486 +1,11 @@ -import { createHash } from 'node:crypto'; -import fs from 'node:fs'; -import path from 'node:path'; -import 
{ performance } from 'node:perf_hooks'; -import { loadConfig } from './config.js'; -import { EXTENSIONS, IGNORE_DIRS, normalizePath } from './constants.js'; -import { - bulkNodeIdsByFile, - closeDb, - getBuildMeta, - getNodeId, - initSchema, - MIGRATIONS, - openDb, - purgeFilesData, - setBuildMeta, -} from './db.js'; -import { readJournal, writeJournalHeader } from './journal.js'; -import { debug, info, warn } from './logger.js'; -import { loadNative } from './native.js'; -import { getActiveEngine, parseFilesAuto } from './parser.js'; -import { computeConfidence, resolveImportPath, resolveImportsBatch } from './resolve.js'; - +// Barrel re-export — keeps all existing `import { ... } from './builder.js'` working. +// See src/builder/ for the pipeline implementation (ROADMAP 3.9). + +export { + collectFiles, + loadPathAliases, + purgeFilesFromGraph, + readFileSafe, +} from './builder/helpers.js'; +export { buildGraph } from './builder/pipeline.js'; export { resolveImportPath } from './resolve.js'; - -const __builderDir = path.dirname(new URL(import.meta.url).pathname.replace(/^\/([A-Z]:)/i, '$1')); -const CODEGRAPH_VERSION = JSON.parse( - fs.readFileSync(path.join(__builderDir, '..', 'package.json'), 'utf-8'), -).version; - -const BUILTIN_RECEIVERS = new Set([ - 'console', - 'Math', - 'JSON', - 'Object', - 'Array', - 'String', - 'Number', - 'Boolean', - 'Date', - 'RegExp', - 'Map', - 'Set', - 'WeakMap', - 'WeakSet', - 'Promise', - 'Symbol', - 'Error', - 'TypeError', - 'RangeError', - 'Proxy', - 'Reflect', - 'Intl', - 'globalThis', - 'window', - 'document', - 'process', - 'Buffer', - 'require', -]); - -export function collectFiles( - dir, - files = [], - config = {}, - directories = null, - _visited = new Set(), -) { - const trackDirs = directories !== null; - - // Resolve real path to detect symlink loops - let realDir; - try { - realDir = fs.realpathSync(dir); - } catch { - return trackDirs ? 
{ files, directories } : files; - } - if (_visited.has(realDir)) { - warn(`Symlink loop detected, skipping: ${dir}`); - return trackDirs ? { files, directories } : files; - } - _visited.add(realDir); - - let entries; - try { - entries = fs.readdirSync(dir, { withFileTypes: true }); - } catch (err) { - warn(`Cannot read directory ${dir}: ${err.message}`); - return trackDirs ? { files, directories } : files; - } - - // Merge config ignoreDirs with defaults - const extraIgnore = config.ignoreDirs ? new Set(config.ignoreDirs) : null; - - let hasFiles = false; - for (const entry of entries) { - if (entry.name.startsWith('.') && entry.name !== '.') { - if (IGNORE_DIRS.has(entry.name)) continue; - if (entry.isDirectory()) continue; - } - if (IGNORE_DIRS.has(entry.name)) continue; - if (extraIgnore?.has(entry.name)) continue; - - const full = path.join(dir, entry.name); - if (entry.isDirectory()) { - collectFiles(full, files, config, directories, _visited); - } else if (EXTENSIONS.has(path.extname(entry.name))) { - files.push(full); - hasFiles = true; - } - } - if (trackDirs && hasFiles) { - directories.add(dir); - } - return trackDirs ? 
{ files, directories } : files; -} - -export function loadPathAliases(rootDir) { - const aliases = { baseUrl: null, paths: {} }; - for (const configName of ['tsconfig.json', 'jsconfig.json']) { - const configPath = path.join(rootDir, configName); - if (!fs.existsSync(configPath)) continue; - try { - const raw = fs - .readFileSync(configPath, 'utf-8') - .replace(/\/\/.*$/gm, '') - .replace(/\/\*[\s\S]*?\*\//g, '') - .replace(/,\s*([\]}])/g, '$1'); - const config = JSON.parse(raw); - const opts = config.compilerOptions || {}; - if (opts.baseUrl) aliases.baseUrl = path.resolve(rootDir, opts.baseUrl); - if (opts.paths) { - for (const [pattern, targets] of Object.entries(opts.paths)) { - aliases.paths[pattern] = targets.map((t) => path.resolve(aliases.baseUrl || rootDir, t)); - } - } - break; - } catch (err) { - warn(`Failed to parse ${configName}: ${err.message}`); - } - } - return aliases; -} - -/** - * Compute MD5 hash of file contents for incremental builds. - */ -function fileHash(content) { - return createHash('md5').update(content).digest('hex'); -} - -/** - * Stat a file, returning { mtimeMs, size } or null on error. - */ -function fileStat(filePath) { - try { - const s = fs.statSync(filePath); - return { mtimeMs: s.mtimeMs, size: s.size }; - } catch { - return null; - } -} - -/** - * Read a file with retry on transient errors (EBUSY/EACCES/EPERM). - * Editors performing non-atomic saves can cause these during mid-write. - */ -const TRANSIENT_CODES = new Set(['EBUSY', 'EACCES', 'EPERM']); -const RETRY_DELAY_MS = 50; - -export function readFileSafe(filePath, retries = 2) { - for (let attempt = 0; ; attempt++) { - try { - return fs.readFileSync(filePath, 'utf-8'); - } catch (err) { - if (attempt < retries && TRANSIENT_CODES.has(err.code)) { - const end = Date.now() + RETRY_DELAY_MS; - while (Date.now() < end) {} - continue; - } - throw err; - } - } -} - -/** - * Determine which files have changed since last build. 
- * Three-tier cascade: - * Tier 0 — Journal: O(changed) when watcher was running - * Tier 1 — mtime+size: O(n) stats, O(changed) reads - * Tier 2 — Hash comparison: O(changed) reads (fallback from Tier 1) - */ -function getChangedFiles(db, allFiles, rootDir) { - // Check if file_hashes table exists - let hasTable = false; - try { - db.prepare('SELECT 1 FROM file_hashes LIMIT 1').get(); - hasTable = true; - } catch { - /* table doesn't exist */ - } - - if (!hasTable) { - return { - changed: allFiles.map((f) => ({ file: f })), - removed: [], - isFullBuild: true, - }; - } - - const existing = new Map( - db - .prepare('SELECT file, hash, mtime, size FROM file_hashes') - .all() - .map((r) => [r.file, r]), - ); - - // Build set of current files for removal detection - const currentFiles = new Set(); - for (const file of allFiles) { - currentFiles.add(normalizePath(path.relative(rootDir, file))); - } - - const removed = []; - for (const existingFile of existing.keys()) { - if (!currentFiles.has(existingFile)) { - removed.push(existingFile); - } - } - - // ── Tier 0: Journal ────────────────────────────────────────────── - const journal = readJournal(rootDir); - if (journal.valid) { - // Validate journal timestamp against DB — journal should be from after the last build - const dbMtimes = db.prepare('SELECT MAX(mtime) as latest FROM file_hashes').get(); - const latestDbMtime = dbMtimes?.latest || 0; - - // Empty journal = no watcher was running, fall to Tier 1 for safety - const hasJournalEntries = journal.changed.length > 0 || journal.removed.length > 0; - - if (hasJournalEntries && journal.timestamp >= latestDbMtime) { - debug( - `Tier 0: journal valid, ${journal.changed.length} changed, ${journal.removed.length} removed`, - ); - const changed = []; - - for (const relPath of journal.changed) { - const absPath = path.join(rootDir, relPath); - const stat = fileStat(absPath); - if (!stat) continue; - - let content; - try { - content = readFileSafe(absPath); - } catch { - 
continue; - } - const hash = fileHash(content); - const record = existing.get(relPath); - if (!record || record.hash !== hash) { - changed.push({ file: absPath, content, hash, relPath, stat }); - } - } - - // Merge journal removals with filesystem removals (dedup) - const removedSet = new Set(removed); - for (const relPath of journal.removed) { - if (existing.has(relPath)) removedSet.add(relPath); - } - - return { changed, removed: [...removedSet], isFullBuild: false }; - } - debug( - `Tier 0: skipped (${hasJournalEntries ? 'timestamp stale' : 'no entries'}), falling to Tier 1`, - ); - } - - // ── Tier 1: mtime+size fast-path ───────────────────────────────── - const needsHash = []; // Files that failed mtime+size check - const skipped = []; // Files that passed mtime+size check - - for (const file of allFiles) { - const relPath = normalizePath(path.relative(rootDir, file)); - const record = existing.get(relPath); - - if (!record) { - // New file — needs full read+hash - needsHash.push({ file, relPath }); - continue; - } - - const stat = fileStat(file); - if (!stat) continue; - - const storedMtime = record.mtime || 0; - const storedSize = record.size || 0; - - // size > 0 guard: pre-v4 rows have size=0, always fall through to hash - if (storedSize > 0 && Math.floor(stat.mtimeMs) === storedMtime && stat.size === storedSize) { - skipped.push(relPath); - continue; - } - - needsHash.push({ file, relPath, stat }); - } - - if (needsHash.length > 0) { - debug(`Tier 1: ${skipped.length} skipped by mtime+size, ${needsHash.length} need hash check`); - } - - // ── Tier 2: Hash comparison ────────────────────────────────────── - const changed = []; - - for (const item of needsHash) { - let content; - try { - content = readFileSafe(item.file); - } catch { - continue; - } - const hash = fileHash(content); - const stat = item.stat || fileStat(item.file); - const record = existing.get(item.relPath); - - if (!record || record.hash !== hash) { - changed.push({ file: item.file, 
content, hash, relPath: item.relPath, stat }); - } else if (stat) { - // Hash matches but mtime/size was stale — self-heal by updating stored metadata - changed.push({ - file: item.file, - content, - hash, - relPath: item.relPath, - stat, - metadataOnly: true, - }); - } - } - - // Filter out metadata-only updates from the "changed" list for parsing, - // but keep them so the caller can update file_hashes - const parseChanged = changed.filter((c) => !c.metadataOnly); - if (needsHash.length > 0) { - debug( - `Tier 2: ${parseChanged.length} actually changed, ${changed.length - parseChanged.length} metadata-only`, - ); - } - - return { changed, removed, isFullBuild: false }; -} - -/** - * Purge all graph data for the specified files. - * Deletes: embeddings → edges (in+out) → node_metrics → function_complexity → dataflow → nodes. - * Handles missing tables gracefully (embeddings, complexity, dataflow may not exist in older DBs). - * - * @param {import('better-sqlite3').Database} db - Open writable database - * @param {string[]} files - Relative file paths to purge - * @param {object} [options] - * @param {boolean} [options.purgeHashes=true] - Also delete file_hashes entries - */ -export function purgeFilesFromGraph(db, files, options = {}) { - purgeFilesData(db, files, options); -} - -export async function buildGraph(rootDir, opts = {}) { - const _t_buildStart = performance.now(); - rootDir = path.resolve(rootDir); - const dbPath = path.join(rootDir, '.codegraph', 'graph.db'); - const db = openDb(dbPath); - initSchema(db); - - const config = loadConfig(rootDir); - const incremental = - opts.incremental !== false && config.build && config.build.incremental !== false; - - // Engine selection: 'native', 'wasm', or 'auto' (default) - const engineOpts = { - engine: opts.engine || 'auto', - dataflow: opts.dataflow !== false, - ast: opts.ast !== false, - }; - const { name: engineName, version: engineVersion } = getActiveEngine(engineOpts); - info(`Using ${engineName} 
engine${engineVersion ? ` (v${engineVersion})` : ''}`); - - // Check for engine/schema mismatch — auto-promote to full rebuild - // Only trigger on engine change or schema version change (not every patch/minor bump) - const CURRENT_SCHEMA_VERSION = MIGRATIONS[MIGRATIONS.length - 1].version; - let forceFullRebuild = false; - if (incremental) { - const prevEngine = getBuildMeta(db, 'engine'); - if (prevEngine && prevEngine !== engineName) { - info(`Engine changed (${prevEngine} → ${engineName}), promoting to full rebuild.`); - forceFullRebuild = true; - } - const prevSchema = getBuildMeta(db, 'schema_version'); - if (prevSchema && Number(prevSchema) !== CURRENT_SCHEMA_VERSION) { - info( - `Schema version changed (${prevSchema} → ${CURRENT_SCHEMA_VERSION}), promoting to full rebuild.`, - ); - forceFullRebuild = true; - } - } - - const aliases = loadPathAliases(rootDir); - // Merge config aliases - if (config.aliases) { - for (const [key, value] of Object.entries(config.aliases)) { - const pattern = key.endsWith('/') ? `${key}*` : key; - const target = path.resolve(rootDir, value); - aliases.paths[pattern] = [target.endsWith('/') ? 
`${target}*` : `${target}/*`]; - } - } - - if (aliases.baseUrl || Object.keys(aliases.paths).length > 0) { - info( - `Loaded path aliases: baseUrl=${aliases.baseUrl || 'none'}, ${Object.keys(aliases.paths).length} path mappings`, - ); - } - - // ── Scoped rebuild: rebuild only specified files ────────────────── - let files, discoveredDirs, parseChanges, metadataUpdates, removed, isFullBuild; - - if (opts.scope) { - const scopedFiles = opts.scope.map((f) => normalizePath(f)); - const existing = []; - const missing = []; - for (const rel of scopedFiles) { - const abs = path.join(rootDir, rel); - if (fs.existsSync(abs)) { - existing.push({ file: abs, relPath: rel }); - } else { - missing.push(rel); - } - } - files = existing.map((e) => e.file); - // Derive discoveredDirs from scoped files' parent directories - discoveredDirs = new Set(existing.map((e) => path.dirname(e.file))); - parseChanges = existing; - metadataUpdates = []; - removed = missing; - isFullBuild = false; - info(`Scoped rebuild: ${existing.length} files to rebuild, ${missing.length} to purge`); - } else { - const collected = collectFiles(rootDir, [], config, new Set()); - files = collected.files; - discoveredDirs = collected.directories; - info(`Found ${files.length} files to parse`); - - // Check for incremental build - const increResult = - incremental && !forceFullRebuild - ? getChangedFiles(db, files, rootDir) - : { changed: files.map((f) => ({ file: f })), removed: [], isFullBuild: true }; - removed = increResult.removed; - isFullBuild = increResult.isFullBuild; - - // Separate metadata-only updates (mtime/size self-heal) from real changes - parseChanges = increResult.changed.filter((c) => !c.metadataOnly); - metadataUpdates = increResult.changed.filter((c) => c.metadataOnly); - } - - if (!isFullBuild && parseChanges.length === 0 && removed.length === 0) { - // Check if default analyses were never computed (e.g. 
legacy DB) - const needsCfg = - opts.cfg !== false && - (() => { - try { - return db.prepare('SELECT COUNT(*) as c FROM cfg_blocks').get().c === 0; - } catch { - return true; - } - })(); - const needsDataflow = - opts.dataflow !== false && - (() => { - try { - return db.prepare('SELECT COUNT(*) as c FROM dataflow').get().c === 0; - } catch { - return true; - } - })(); - - if (needsCfg || needsDataflow) { - info('No file changes. Running pending analysis pass...'); - const analysisOpts = { - ...engineOpts, - dataflow: needsDataflow && opts.dataflow !== false, - }; - const analysisSymbols = await parseFilesAuto(files, rootDir, analysisOpts); - if (needsCfg) { - const { buildCFGData } = await import('./cfg.js'); - await buildCFGData(db, analysisSymbols, rootDir, engineOpts); - } - if (needsDataflow) { - const { buildDataflowEdges } = await import('./dataflow.js'); - await buildDataflowEdges(db, analysisSymbols, rootDir, engineOpts); - } - closeDb(db); - writeJournalHeader(rootDir, Date.now()); - return; - } - - // Still update metadata for self-healing even when no real changes - if (metadataUpdates.length > 0) { - try { - const healHash = db.prepare( - 'INSERT OR REPLACE INTO file_hashes (file, hash, mtime, size) VALUES (?, ?, ?, ?)', - ); - const healTx = db.transaction(() => { - for (const item of metadataUpdates) { - const mtime = item.stat ? Math.floor(item.stat.mtimeMs) : 0; - const size = item.stat ? item.stat.size : 0; - healHash.run(item.relPath, item.hash, mtime, size); - } - }); - healTx(); - debug(`Self-healed mtime/size for ${metadataUpdates.length} files`); - } catch { - /* ignore heal errors */ - } - } - info('No changes detected. 
Graph is up to date.'); - closeDb(db); - writeJournalHeader(rootDir, Date.now()); - return; - } - - // Check if embeddings table exists (created by `embed`, not by initSchema) - let hasEmbeddings = false; - try { - db.prepare('SELECT 1 FROM embeddings LIMIT 1').get(); - hasEmbeddings = true; - } catch { - /* table doesn't exist */ - } - - if (isFullBuild) { - const deletions = - 'PRAGMA foreign_keys = OFF; DELETE FROM cfg_edges; DELETE FROM cfg_blocks; DELETE FROM node_metrics; DELETE FROM edges; DELETE FROM function_complexity; DELETE FROM dataflow; DELETE FROM ast_nodes; DELETE FROM nodes; PRAGMA foreign_keys = ON;'; - db.exec( - hasEmbeddings - ? `${deletions.replace('PRAGMA foreign_keys = ON;', '')} DELETE FROM embeddings; PRAGMA foreign_keys = ON;` - : deletions, - ); - } else { - // ── Reverse-dependency cascade (issue #116) ───────────────────── - // Find files with edges pointing TO changed/removed files. - // Their nodes stay intact (preserving IDs), but outgoing edges are - // deleted so they can be rebuilt during the edge-building pass. - // When opts.noReverseDeps is true (e.g. agent rollback to same version), - // skip this cascade — the agent knows exports didn't change. - const reverseDeps = new Set(); - if (!opts.noReverseDeps) { - const changedRelPaths = new Set(); - for (const item of parseChanges) { - changedRelPaths.add(item.relPath || normalizePath(path.relative(rootDir, item.file))); - } - for (const relPath of removed) { - changedRelPaths.add(relPath); - } - - if (changedRelPaths.size > 0) { - const findReverseDeps = db.prepare(` - SELECT DISTINCT n_src.file FROM edges e - JOIN nodes n_src ON e.source_id = n_src.id - JOIN nodes n_tgt ON e.target_id = n_tgt.id - WHERE n_tgt.file = ? 
AND n_src.file != n_tgt.file AND n_src.kind != 'directory' - `); - for (const relPath of changedRelPaths) { - for (const row of findReverseDeps.all(relPath)) { - if (!changedRelPaths.has(row.file) && !reverseDeps.has(row.file)) { - // Verify the file still exists on disk - const absPath = path.join(rootDir, row.file); - if (fs.existsSync(absPath)) { - reverseDeps.add(row.file); - } - } - } - } - } - } - - info( - `Incremental: ${parseChanges.length} changed, ${removed.length} removed${reverseDeps.size > 0 ? `, ${reverseDeps.size} reverse-deps` : ''}`, - ); - if (parseChanges.length > 0) - debug(`Changed files: ${parseChanges.map((c) => c.relPath).join(', ')}`); - if (removed.length > 0) debug(`Removed files: ${removed.join(', ')}`); - // Remove embeddings/metrics/edges/nodes for changed and removed files - const changePaths = parseChanges.map( - (item) => item.relPath || normalizePath(path.relative(rootDir, item.file)), - ); - purgeFilesFromGraph(db, [...removed, ...changePaths], { purgeHashes: false }); - - // Process reverse deps: delete only outgoing edges (nodes/IDs preserved) - // then add them to the parse list so they participate in edge building - const deleteOutgoingEdgesForFile = db.prepare( - 'DELETE FROM edges WHERE source_id IN (SELECT id FROM nodes WHERE file = ?)', - ); - for (const relPath of reverseDeps) { - deleteOutgoingEdgesForFile.run(relPath); - } - for (const relPath of reverseDeps) { - const absPath = path.join(rootDir, relPath); - parseChanges.push({ file: absPath, relPath, _reverseDepOnly: true }); - } - } - - const getNodeIdStmt = { - get: (name, kind, file, line) => { - const id = getNodeId(db, name, kind, file, line); - return id != null ? 
{ id } : undefined; - }, - }; - - // Batch INSERT helpers — multi-value INSERTs reduce SQLite round-trips - const BATCH_CHUNK = 200; - function batchInsertNodes(rows) { - if (!rows.length) return; - const ph = '(?,?,?,?,?,?)'; - for (let i = 0; i < rows.length; i += BATCH_CHUNK) { - const chunk = rows.slice(i, i + BATCH_CHUNK); - const vals = []; - for (const r of chunk) vals.push(r[0], r[1], r[2], r[3], r[4], r[5]); - db.prepare( - 'INSERT OR IGNORE INTO nodes (name,kind,file,line,end_line,parent_id) VALUES ' + - chunk.map(() => ph).join(','), - ).run(...vals); - } - } - function batchInsertEdges(rows) { - if (!rows.length) return; - const ph = '(?,?,?,?,?)'; - for (let i = 0; i < rows.length; i += BATCH_CHUNK) { - const chunk = rows.slice(i, i + BATCH_CHUNK); - const vals = []; - for (const r of chunk) vals.push(r[0], r[1], r[2], r[3], r[4]); - db.prepare( - 'INSERT INTO edges (source_id,target_id,kind,confidence,dynamic) VALUES ' + - chunk.map(() => ph).join(','), - ).run(...vals); - } - } - - // Prepare hash upsert (with size column from migration v4) - let upsertHash; - try { - upsertHash = db.prepare( - 'INSERT OR REPLACE INTO file_hashes (file, hash, mtime, size) VALUES (?, ?, ?, ?)', - ); - } catch { - upsertHash = null; - } - - // First pass: parse files and insert nodes - const fileSymbols = new Map(); - - // For incremental builds, also load existing symbols that aren't changing - if (!isFullBuild) { - // We need to reload ALL file symbols for edge building - const _allExistingFiles = db - .prepare("SELECT DISTINCT file FROM nodes WHERE kind = 'file'") - .all(); - // We'll fill these in during the parse pass + edge pass - } - - const filesToParse = isFullBuild ? 
files.map((f) => ({ file: f })) : parseChanges; - - // ── Phase timing ──────────────────────────────────────────────────── - const _t = {}; - _t.setupMs = performance.now() - _t_buildStart; - - // ── Unified parse via parseFilesAuto ─────────────────────────────── - const filePaths = filesToParse.map((item) => item.file); - _t.parse0 = performance.now(); - const allSymbols = await parseFilesAuto(filePaths, rootDir, engineOpts); - _t.parseMs = performance.now() - _t.parse0; - - // Build a lookup from incremental data (changed items may carry pre-computed hashes + stats) - const precomputedData = new Map(); - for (const item of filesToParse) { - if (item.relPath) { - precomputedData.set(item.relPath, item); - } - } - - // Bulk-fetch all node IDs for a file in one query (replaces per-node getNodeId calls) - const bulkGetNodeIds = { all: (file) => bulkNodeIdsByFile(db, file) }; - - const insertAll = db.transaction(() => { - // Phase 1: Batch insert all file nodes + definitions + exports - const phase1Rows = []; - for (const [relPath, symbols] of allSymbols) { - fileSymbols.set(relPath, symbols); - phase1Rows.push([relPath, 'file', relPath, 0, null, null]); - for (const def of symbols.definitions) { - phase1Rows.push([def.name, def.kind, relPath, def.line, def.endLine || null, null]); - } - for (const exp of symbols.exports) { - phase1Rows.push([exp.name, exp.kind, relPath, exp.line, null, null]); - } - } - batchInsertNodes(phase1Rows); - - // Phase 1b: Mark exported symbols - const markExported = db.prepare( - 'UPDATE nodes SET exported = 1 WHERE name = ? AND kind = ? AND file = ? 
AND line = ?', - ); - for (const [relPath, symbols] of allSymbols) { - for (const exp of symbols.exports) { - markExported.run(exp.name, exp.kind, relPath, exp.line); - } - } - - // Phase 3: Batch insert children (needs parent IDs from Phase 2) - const childRows = []; - for (const [relPath, symbols] of allSymbols) { - const nodeIdMap = new Map(); - for (const row of bulkGetNodeIds.all(relPath)) { - nodeIdMap.set(`${row.name}|${row.kind}|${row.line}`, row.id); - } - for (const def of symbols.definitions) { - if (!def.children?.length) continue; - const defId = nodeIdMap.get(`${def.name}|${def.kind}|${def.line}`); - if (!defId) continue; - for (const child of def.children) { - childRows.push([ - child.name, - child.kind, - relPath, - child.line, - child.endLine || null, - defId, - ]); - } - } - } - batchInsertNodes(childRows); - - // Phase 5: Batch insert contains/parameter_of edges - const edgeRows = []; - for (const [relPath, symbols] of allSymbols) { - // Re-fetch to include children IDs - const nodeIdMap = new Map(); - for (const row of bulkGetNodeIds.all(relPath)) { - nodeIdMap.set(`${row.name}|${row.kind}|${row.line}`, row.id); - } - const fileId = nodeIdMap.get(`${relPath}|file|0`); - for (const def of symbols.definitions) { - const defId = nodeIdMap.get(`${def.name}|${def.kind}|${def.line}`); - if (fileId && defId) { - edgeRows.push([fileId, defId, 'contains', 1.0, 0]); - } - if (def.children?.length && defId) { - for (const child of def.children) { - const childId = nodeIdMap.get(`${child.name}|${child.kind}|${child.line}`); - if (childId) { - edgeRows.push([defId, childId, 'contains', 1.0, 0]); - if (child.kind === 'parameter') { - edgeRows.push([childId, defId, 'parameter_of', 1.0, 0]); - } - } - } - } - } - - // Update file hash with real mtime+size for incremental builds - // Skip for reverse-dep files — they didn't actually change - if (upsertHash) { - const precomputed = precomputedData.get(relPath); - if (precomputed?._reverseDepOnly) { - // no-op: 
file unchanged, hash already correct - } else if (precomputed?.hash) { - const stat = precomputed.stat || fileStat(path.join(rootDir, relPath)); - const mtime = stat ? Math.floor(stat.mtimeMs) : 0; - const size = stat ? stat.size : 0; - upsertHash.run(relPath, precomputed.hash, mtime, size); - } else { - const absPath = path.join(rootDir, relPath); - let code; - try { - code = readFileSafe(absPath); - } catch { - code = null; - } - if (code !== null) { - const stat = fileStat(absPath); - const mtime = stat ? Math.floor(stat.mtimeMs) : 0; - const size = stat ? stat.size : 0; - upsertHash.run(relPath, fileHash(code), mtime, size); - } - } - } - } - batchInsertEdges(edgeRows); - - // Also update metadata-only entries (self-heal mtime/size without re-parse) - if (upsertHash) { - for (const item of metadataUpdates) { - const mtime = item.stat ? Math.floor(item.stat.mtimeMs) : 0; - const size = item.stat ? item.stat.size : 0; - upsertHash.run(item.relPath, item.hash, mtime, size); - } - } - }); - _t.insert0 = performance.now(); - insertAll(); - _t.insertMs = performance.now() - _t.insert0; - - const parsed = allSymbols.size; - const skipped = filesToParse.length - parsed; - info(`Parsed ${parsed} files (${skipped} skipped)`); - - // Clean up removed file hashes - if (upsertHash && removed.length > 0) { - const deleteHash = db.prepare('DELETE FROM file_hashes WHERE file = ?'); - for (const relPath of removed) { - deleteHash.run(relPath); - } - } - - // ── Batch import resolution ──────────────────────────────────────── - // Collect all (fromFile, importSource) pairs and resolve in one native call - _t.resolve0 = performance.now(); - const batchInputs = []; - for (const [relPath, symbols] of fileSymbols) { - const absFile = path.join(rootDir, relPath); - for (const imp of symbols.imports) { - batchInputs.push({ fromFile: absFile, importSource: imp.source }); - } - } - const batchResolved = resolveImportsBatch(batchInputs, rootDir, aliases, files); - _t.resolveMs = 
performance.now() - _t.resolve0; - - function getResolved(absFile, importSource) { - if (batchResolved) { - const key = `${absFile}|${importSource}`; - const hit = batchResolved.get(key); - if (hit !== undefined) return hit; - } - return resolveImportPath(absFile, importSource, rootDir, aliases); - } - - // Build re-export map for barrel resolution - const reexportMap = new Map(); - for (const [relPath, symbols] of fileSymbols) { - const reexports = symbols.imports.filter((imp) => imp.reexport); - if (reexports.length > 0) { - reexportMap.set( - relPath, - reexports.map((imp) => ({ - source: getResolved(path.join(rootDir, relPath), imp.source), - names: imp.names, - wildcardReexport: imp.wildcardReexport || false, - })), - ); - } - } - - // For incremental builds, load unchanged barrel files into reexportMap - // so barrel-resolved import/call edges aren't dropped for reverse-dep files. - // These files are loaded only for resolution — they must NOT be iterated - // in the edge-building loop (their existing edges are still in the DB). 
- const barrelOnlyFiles = new Set(); - if (!isFullBuild) { - const barrelCandidates = db - .prepare( - `SELECT DISTINCT n1.file FROM edges e - JOIN nodes n1 ON e.source_id = n1.id - WHERE e.kind = 'reexports' AND n1.kind = 'file'`, - ) - .all(); - for (const { file: relPath } of barrelCandidates) { - if (fileSymbols.has(relPath)) continue; - const absPath = path.join(rootDir, relPath); - try { - const symbols = await parseFilesAuto([absPath], rootDir, engineOpts); - const fileSym = symbols.get(relPath); - if (fileSym) { - fileSymbols.set(relPath, fileSym); - barrelOnlyFiles.add(relPath); - const reexports = fileSym.imports.filter((imp) => imp.reexport); - if (reexports.length > 0) { - reexportMap.set( - relPath, - reexports.map((imp) => ({ - source: getResolved(absPath, imp.source), - names: imp.names, - wildcardReexport: imp.wildcardReexport || false, - })), - ); - } - } - } catch { - /* skip if unreadable */ - } - } - } - - function isBarrelFile(relPath) { - const symbols = fileSymbols.get(relPath); - if (!symbols) return false; - const reexports = symbols.imports.filter((imp) => imp.reexport); - if (reexports.length === 0) return false; - const ownDefs = symbols.definitions.length; - return reexports.length >= ownDefs; - } - - function resolveBarrelExport(barrelPath, symbolName, visited = new Set()) { - if (visited.has(barrelPath)) return null; - visited.add(barrelPath); - const reexports = reexportMap.get(barrelPath); - if (!reexports) return null; - - for (const re of reexports) { - if (re.names.length > 0 && !re.wildcardReexport) { - if (re.names.includes(symbolName)) { - const targetSymbols = fileSymbols.get(re.source); - if (targetSymbols) { - const hasDef = targetSymbols.definitions.some((d) => d.name === symbolName); - if (hasDef) return re.source; - const deeper = resolveBarrelExport(re.source, symbolName, visited); - if (deeper) return deeper; - } - return re.source; - } - continue; - } - if (re.wildcardReexport || re.names.length === 0) { - const 
targetSymbols = fileSymbols.get(re.source); - if (targetSymbols) { - const hasDef = targetSymbols.definitions.some((d) => d.name === symbolName); - if (hasDef) return re.source; - const deeper = resolveBarrelExport(re.source, symbolName, visited); - if (deeper) return deeper; - } - } - } - return null; - } - - // N+1 optimization: pre-load all nodes into a lookup map for edge building - const allNodes = db - .prepare( - `SELECT id, name, kind, file, line FROM nodes WHERE kind IN ('function','method','class','interface','struct','type','module','enum','trait')`, - ) - .all(); - const nodesByName = new Map(); - for (const node of allNodes) { - if (!nodesByName.has(node.name)) nodesByName.set(node.name, []); - nodesByName.get(node.name).push(node); - } - const nodesByNameAndFile = new Map(); - for (const node of allNodes) { - const key = `${node.name}|${node.file}`; - if (!nodesByNameAndFile.has(key)) nodesByNameAndFile.set(key, []); - nodesByNameAndFile.get(key).push(node); - } - - // Second pass: build edges (accumulated and batch-inserted) - _t.edges0 = performance.now(); - const buildEdges = db.transaction(() => { - const allEdgeRows = []; - - for (const [relPath, symbols] of fileSymbols) { - // Skip barrel-only files — loaded for resolution, edges already in DB - if (barrelOnlyFiles.has(relPath)) continue; - const fileNodeRow = getNodeIdStmt.get(relPath, 'file', relPath, 0); - if (!fileNodeRow) continue; - const fileNodeId = fileNodeRow.id; - - // Import edges - for (const imp of symbols.imports) { - const resolvedPath = getResolved(path.join(rootDir, relPath), imp.source); - const targetRow = getNodeIdStmt.get(resolvedPath, 'file', resolvedPath, 0); - if (targetRow) { - const edgeKind = imp.reexport - ? 'reexports' - : imp.typeOnly - ? 'imports-type' - : imp.dynamicImport - ? 
'dynamic-imports' - : 'imports'; - allEdgeRows.push([fileNodeId, targetRow.id, edgeKind, 1.0, 0]); - - if (!imp.reexport && isBarrelFile(resolvedPath)) { - const resolvedSources = new Set(); - for (const name of imp.names) { - const cleanName = name.replace(/^\*\s+as\s+/, ''); - const actualSource = resolveBarrelExport(resolvedPath, cleanName); - if ( - actualSource && - actualSource !== resolvedPath && - !resolvedSources.has(actualSource) - ) { - resolvedSources.add(actualSource); - const actualRow = getNodeIdStmt.get(actualSource, 'file', actualSource, 0); - if (actualRow) { - allEdgeRows.push([ - fileNodeId, - actualRow.id, - edgeKind === 'imports-type' - ? 'imports-type' - : edgeKind === 'dynamic-imports' - ? 'dynamic-imports' - : 'imports', - 0.9, - 0, - ]); - } - } - } - } - } - } - } - - // Call/receiver/extends/implements edges — native when available - const native = engineName === 'native' ? loadNative() : null; - if (native?.buildCallEdges) { - const nativeFiles = []; - for (const [relPath, symbols] of fileSymbols) { - if (barrelOnlyFiles.has(relPath)) continue; - const fileNodeRow = getNodeIdStmt.get(relPath, 'file', relPath, 0); - if (!fileNodeRow) continue; - - // Pre-resolve imported names (including barrel resolution) - const importedNames = []; - for (const imp of symbols.imports) { - const resolvedPath = getResolved(path.join(rootDir, relPath), imp.source); - for (const name of imp.names) { - const cleanName = name.replace(/^\*\s+as\s+/, ''); - let targetFile = resolvedPath; - if (isBarrelFile(resolvedPath)) { - const actual = resolveBarrelExport(resolvedPath, cleanName); - if (actual) targetFile = actual; - } - importedNames.push({ name: cleanName, file: targetFile }); - } - } - - nativeFiles.push({ - file: relPath, - fileNodeId: fileNodeRow.id, - definitions: symbols.definitions.map((d) => ({ - name: d.name, - kind: d.kind, - line: d.line, - endLine: d.endLine ?? 
null, - })), - calls: symbols.calls, - importedNames, - classes: symbols.classes, - }); - } - - const nativeEdges = native.buildCallEdges(nativeFiles, allNodes, [...BUILTIN_RECEIVERS]); - - for (const e of nativeEdges) { - allEdgeRows.push([e.sourceId, e.targetId, e.kind, e.confidence, e.dynamic]); - } - } else { - // JS fallback — call/receiver/extends/implements edges - for (const [relPath, symbols] of fileSymbols) { - if (barrelOnlyFiles.has(relPath)) continue; - const fileNodeRow = getNodeIdStmt.get(relPath, 'file', relPath, 0); - if (!fileNodeRow) continue; - - // Build import name -> target file mapping - const importedNames = new Map(); - for (const imp of symbols.imports) { - const resolvedPath = getResolved(path.join(rootDir, relPath), imp.source); - for (const name of imp.names) { - const cleanName = name.replace(/^\*\s+as\s+/, ''); - importedNames.set(cleanName, resolvedPath); - } - } - - // Call edges with confidence scoring — using pre-loaded lookup maps (N+1 fix) - const seenCallEdges = new Set(); - for (const call of symbols.calls) { - if (call.receiver && BUILTIN_RECEIVERS.has(call.receiver)) continue; - let caller = null; - let callerSpan = Infinity; - for (const def of symbols.definitions) { - if (def.line <= call.line) { - const end = def.endLine || Infinity; - if (call.line <= end) { - const span = end - def.line; - if (span < callerSpan) { - const row = getNodeIdStmt.get(def.name, def.kind, relPath, def.line); - if (row) { - caller = row; - callerSpan = span; - } - } - } else if (!caller) { - const row = getNodeIdStmt.get(def.name, def.kind, relPath, def.line); - if (row) caller = row; - } - } - } - if (!caller) caller = fileNodeRow; - - const isDynamic = call.dynamic ? 
1 : 0; - let targets; - const importedFrom = importedNames.get(call.name); - - if (importedFrom) { - targets = nodesByNameAndFile.get(`${call.name}|${importedFrom}`) || []; - - if (targets.length === 0 && isBarrelFile(importedFrom)) { - const actualSource = resolveBarrelExport(importedFrom, call.name); - if (actualSource) { - targets = nodesByNameAndFile.get(`${call.name}|${actualSource}`) || []; - } - } - } - if (!targets || targets.length === 0) { - targets = nodesByNameAndFile.get(`${call.name}|${relPath}`) || []; - if (targets.length === 0) { - const methodCandidates = (nodesByName.get(call.name) || []).filter( - (n) => n.name.endsWith(`.${call.name}`) && n.kind === 'method', - ); - if (methodCandidates.length > 0) { - targets = methodCandidates; - } else if ( - !call.receiver || - call.receiver === 'this' || - call.receiver === 'self' || - call.receiver === 'super' - ) { - targets = (nodesByName.get(call.name) || []).filter( - (n) => computeConfidence(relPath, n.file, null) >= 0.5, - ); - } - } - } - - if (targets.length > 1) { - targets.sort((a, b) => { - const confA = computeConfidence(relPath, a.file, importedFrom); - const confB = computeConfidence(relPath, b.file, importedFrom); - return confB - confA; - }); - } - - for (const t of targets) { - const edgeKey = `${caller.id}|${t.id}`; - if (t.id !== caller.id && !seenCallEdges.has(edgeKey)) { - seenCallEdges.add(edgeKey); - const confidence = computeConfidence(relPath, t.file, importedFrom); - allEdgeRows.push([caller.id, t.id, 'calls', confidence, isDynamic]); - } - } - - // Receiver edge: caller → receiver type node - if ( - call.receiver && - !BUILTIN_RECEIVERS.has(call.receiver) && - call.receiver !== 'this' && - call.receiver !== 'self' && - call.receiver !== 'super' - ) { - const receiverKinds = new Set(['class', 'struct', 'interface', 'type', 'module']); - const samefile = nodesByNameAndFile.get(`${call.receiver}|${relPath}`) || []; - const candidates = - samefile.length > 0 ? 
samefile : nodesByName.get(call.receiver) || []; - const receiverNodes = candidates.filter((n) => receiverKinds.has(n.kind)); - if (receiverNodes.length > 0 && caller) { - const recvTarget = receiverNodes[0]; - const recvKey = `recv|${caller.id}|${recvTarget.id}`; - if (!seenCallEdges.has(recvKey)) { - seenCallEdges.add(recvKey); - allEdgeRows.push([caller.id, recvTarget.id, 'receiver', 0.7, 0]); - } - } - } - } - - // Class extends edges - for (const cls of symbols.classes) { - if (cls.extends) { - const sourceRow = (nodesByNameAndFile.get(`${cls.name}|${relPath}`) || []).find( - (n) => n.kind === 'class', - ); - const targetCandidates = nodesByName.get(cls.extends) || []; - const targetRows = targetCandidates.filter((n) => n.kind === 'class'); - if (sourceRow) { - for (const t of targetRows) { - allEdgeRows.push([sourceRow.id, t.id, 'extends', 1.0, 0]); - } - } - } - - if (cls.implements) { - const sourceRow = (nodesByNameAndFile.get(`${cls.name}|${relPath}`) || []).find( - (n) => n.kind === 'class', - ); - const targetCandidates = nodesByName.get(cls.implements) || []; - const targetRows = targetCandidates.filter( - (n) => n.kind === 'interface' || n.kind === 'class', - ); - if (sourceRow) { - for (const t of targetRows) { - allEdgeRows.push([sourceRow.id, t.id, 'implements', 1.0, 0]); - } - } - } - } - } - } - - batchInsertEdges(allEdgeRows); - }); - buildEdges(); - _t.edgesMs = performance.now() - _t.edges0; - - // Build line count map for structure metrics (prefer cached _lineCount from parser) - const lineCountMap = new Map(); - for (const [relPath, symbols] of fileSymbols) { - if (symbols.lineCount ?? symbols._lineCount) { - lineCountMap.set(relPath, symbols.lineCount ?? 
symbols._lineCount); - } else { - const absPath = path.join(rootDir, relPath); - try { - const content = fs.readFileSync(absPath, 'utf-8'); - lineCountMap.set(relPath, content.split('\n').length); - } catch { - lineCountMap.set(relPath, 0); - } - } - } - - // For incremental builds, buildStructure needs ALL files (not just changed ones) - // because it clears and rebuilds all contains edges and directory metrics. - // Load unchanged files from the DB so structure data stays complete. - if (!isFullBuild) { - const existingFiles = db.prepare("SELECT DISTINCT file FROM nodes WHERE kind = 'file'").all(); - const defsByFile = db.prepare( - "SELECT name, kind, line FROM nodes WHERE file = ? AND kind != 'file' AND kind != 'directory'", - ); - // Count imports per file — buildStructure only uses imports.length for metrics - const importCountByFile = db.prepare( - `SELECT COUNT(DISTINCT n2.file) AS cnt FROM edges e - JOIN nodes n1 ON e.source_id = n1.id - JOIN nodes n2 ON e.target_id = n2.id - WHERE n1.file = ? 
AND e.kind = 'imports'`, - ); - const lineCountByFile = db.prepare( - `SELECT n.name AS file, m.line_count - FROM node_metrics m JOIN nodes n ON m.node_id = n.id - WHERE n.kind = 'file'`, - ); - const cachedLineCounts = new Map(); - for (const row of lineCountByFile.all()) { - cachedLineCounts.set(row.file, row.line_count); - } - let loadedFromDb = 0; - for (const { file: relPath } of existingFiles) { - if (!fileSymbols.has(relPath)) { - const importCount = importCountByFile.get(relPath)?.cnt || 0; - fileSymbols.set(relPath, { - definitions: defsByFile.all(relPath), - imports: new Array(importCount), - exports: [], - }); - loadedFromDb++; - } - if (!lineCountMap.has(relPath)) { - const cached = cachedLineCounts.get(relPath); - if (cached != null) { - lineCountMap.set(relPath, cached); - } else { - const absPath = path.join(rootDir, relPath); - try { - const content = fs.readFileSync(absPath, 'utf-8'); - lineCountMap.set(relPath, content.split('\n').length); - } catch { - lineCountMap.set(relPath, 0); - } - } - } - } - debug(`Structure: ${fileSymbols.size} files (${loadedFromDb} loaded from DB)`); - } - - // Build directory structure, containment edges, and metrics - _t.structure0 = performance.now(); - const relDirs = new Set(); - for (const absDir of discoveredDirs) { - relDirs.add(normalizePath(path.relative(rootDir, absDir))); - } - try { - const { buildStructure } = await import('./structure.js'); - // Pass changed file paths so incremental builds can scope the rebuild - const changedFilePaths = isFullBuild ? 
null : [...allSymbols.keys()]; - buildStructure(db, fileSymbols, rootDir, lineCountMap, relDirs, changedFilePaths); - } catch (err) { - debug(`Structure analysis failed: ${err.message}`); - } - _t.structureMs = performance.now() - _t.structure0; - - // Classify node roles (entry, core, utility, adapter, dead, leaf) - _t.roles0 = performance.now(); - try { - const { classifyNodeRoles } = await import('./structure.js'); - const roleSummary = classifyNodeRoles(db); - debug( - `Roles: ${Object.entries(roleSummary) - .map(([r, c]) => `${r}=${c}`) - .join(', ')}`, - ); - } catch (err) { - debug(`Role classification failed: ${err.message}`); - } - _t.rolesMs = performance.now() - _t.roles0; - - // For incremental builds, filter out reverse-dep-only files from AST/complexity/CFG/dataflow - // — their content didn't change, so existing ast_nodes/function_complexity rows are valid. - let astComplexitySymbols = allSymbols; - if (!isFullBuild) { - const reverseDepFiles = new Set( - filesToParse.filter((item) => item._reverseDepOnly).map((item) => item.relPath), - ); - if (reverseDepFiles.size > 0) { - astComplexitySymbols = new Map(); - for (const [relPath, symbols] of allSymbols) { - if (!reverseDepFiles.has(relPath)) { - astComplexitySymbols.set(relPath, symbols); - } - } - debug( - `AST/complexity/CFG/dataflow: processing ${astComplexitySymbols.size} changed files (skipping ${reverseDepFiles.size} reverse-deps)`, - ); - } - } - - // ── Unified AST analysis engine ────────────────────────────────────── - // Replaces 4 sequential buildXxx calls with one coordinated pass. 
- { - const { runAnalyses } = await import('./ast-analysis/engine.js'); - try { - const analysisTiming = await runAnalyses(db, astComplexitySymbols, rootDir, opts, engineOpts); - _t.astMs = analysisTiming.astMs; - _t.complexityMs = analysisTiming.complexityMs; - _t.cfgMs = analysisTiming.cfgMs; - _t.dataflowMs = analysisTiming.dataflowMs; - } catch (err) { - debug(`Unified analysis engine failed: ${err.message}`); - } - } - - _t.finalize0 = performance.now(); - - // Release any remaining cached WASM trees — call .delete() to free WASM memory - for (const [, symbols] of allSymbols) { - if (symbols._tree && typeof symbols._tree.delete === 'function') { - try { - symbols._tree.delete(); - } catch {} - } - symbols._tree = null; - symbols._langId = null; - } - - const nodeCount = db.prepare('SELECT COUNT(*) as c FROM nodes').get().c; - const actualEdgeCount = db.prepare('SELECT COUNT(*) as c FROM edges').get().c; - info(`Graph built: ${nodeCount} nodes, ${actualEdgeCount} edges`); - info(`Stored in ${dbPath}`); - - // Verify incremental build didn't diverge significantly from previous counts - if (!isFullBuild) { - const prevNodes = getBuildMeta(db, 'node_count'); - const prevEdges = getBuildMeta(db, 'edge_count'); - if (prevNodes && prevEdges) { - const prevN = Number(prevNodes); - const prevE = Number(prevEdges); - if (prevN > 0) { - const nodeDrift = Math.abs(nodeCount - prevN) / prevN; - const edgeDrift = prevE > 0 ? Math.abs(actualEdgeCount - prevE) / prevE : 0; - const driftThreshold = config.build?.driftThreshold ?? 0.2; - if (nodeDrift > driftThreshold || edgeDrift > driftThreshold) { - warn( - `Incremental build diverged significantly from previous counts (nodes: ${prevN}→${nodeCount} [${(nodeDrift * 100).toFixed(1)}%], edges: ${prevE}→${actualEdgeCount} [${(edgeDrift * 100).toFixed(1)}%], threshold: ${(driftThreshold * 100).toFixed(0)}%). 
Consider rebuilding with --no-incremental.`, - ); - } - } - } - } - - // Warn about orphaned embeddings that no longer match any node - if (hasEmbeddings) { - try { - const orphaned = db - .prepare('SELECT COUNT(*) as c FROM embeddings WHERE node_id NOT IN (SELECT id FROM nodes)') - .get().c; - if (orphaned > 0) { - warn( - `${orphaned} embeddings are orphaned (nodes changed). Run "codegraph embed" to refresh.`, - ); - } - } catch { - /* ignore — embeddings table may have been dropped */ - } - } - - // Warn about unused exports (exported but zero cross-file consumers) - try { - const unusedCount = db - .prepare( - `SELECT COUNT(*) as c FROM nodes - WHERE exported = 1 AND kind != 'file' - AND id NOT IN ( - SELECT DISTINCT e.target_id FROM edges e - JOIN nodes caller ON e.source_id = caller.id - JOIN nodes target ON e.target_id = target.id - WHERE e.kind = 'calls' AND caller.file != target.file - )`, - ) - .get().c; - if (unusedCount > 0) { - warn( - `${unusedCount} exported symbol${unusedCount > 1 ? 's have' : ' has'} zero cross-file consumers. 
Run "codegraph exports --unused" to inspect.`, - ); - } - } catch { - /* exported column may not exist on older DBs */ - } - - // Persist build metadata for mismatch detection - try { - setBuildMeta(db, { - engine: engineName, - engine_version: engineVersion || '', - codegraph_version: CODEGRAPH_VERSION, - schema_version: String(CURRENT_SCHEMA_VERSION), - built_at: new Date().toISOString(), - node_count: nodeCount, - edge_count: actualEdgeCount, - }); - } catch (err) { - warn(`Failed to write build metadata: ${err.message}`); - } - - closeDb(db); - - // Write journal header after successful build - writeJournalHeader(rootDir, Date.now()); - - if (!opts.skipRegistry) { - const { tmpdir } = await import('node:os'); - const tmpDir = path.resolve(tmpdir()); - const resolvedRoot = path.resolve(rootDir); - if (resolvedRoot.startsWith(tmpDir)) { - debug(`Skipping auto-registration for temp directory: ${resolvedRoot}`); - } else { - try { - const { registerRepo } = await import('./registry.js'); - registerRepo(rootDir); - } catch (err) { - debug(`Auto-registration failed: ${err.message}`); - } - } - } - - _t.finalizeMs = performance.now() - _t.finalize0; - - return { - phases: { - setupMs: +_t.setupMs.toFixed(1), - parseMs: +_t.parseMs.toFixed(1), - insertMs: +_t.insertMs.toFixed(1), - resolveMs: +_t.resolveMs.toFixed(1), - edgesMs: +_t.edgesMs.toFixed(1), - structureMs: +_t.structureMs.toFixed(1), - rolesMs: +_t.rolesMs.toFixed(1), - astMs: +_t.astMs.toFixed(1), - complexityMs: +_t.complexityMs.toFixed(1), - ...(_t.cfgMs != null && { cfgMs: +_t.cfgMs.toFixed(1) }), - ...(_t.dataflowMs != null && { dataflowMs: +_t.dataflowMs.toFixed(1) }), - finalizeMs: +_t.finalizeMs.toFixed(1), - }, - }; -} diff --git a/src/builder/context.js b/src/builder/context.js new file mode 100644 index 0000000..31024d6 --- /dev/null +++ b/src/builder/context.js @@ -0,0 +1,85 @@ +/** + * PipelineContext — shared mutable state threaded through all build stages. 
/**
 * PipelineContext — shared mutable state threaded through all build stages.
 *
 * Each stage reads what it needs and writes what it produces.
 * This replaces the closure-captured locals in the old monolithic buildGraph().
 *
 * Fields declared without an initializer start out `undefined` and are filled
 * in by the stage named in their section header. Only `forceFullRebuild`,
 * `earlyExit`, `hasEmbeddings` and `timing` carry defaults.
 *
 * NOTE(review): the generic parameters in the `@type` tags below (`Set<string>`,
 * `Map<string, …>`) are reconstructed from each field's description — confirm
 * against the stages that populate them.
 */
export class PipelineContext {
  // ── Inputs (set during setup) ──────────────────────────────────────
  /** @type {string} Absolute root directory */
  rootDir;
  /** @type {import('better-sqlite3').Database} */
  db;
  /** @type {string} Absolute path to the database file */
  dbPath;
  /** @type {object} From loadConfig() */
  config;
  /** @type {object} Original buildGraph opts */
  opts;
  /** @type {{ engine: string, dataflow: boolean, ast: boolean }} */
  engineOpts;
  /** @type {string} 'native' | 'wasm' */
  engineName;
  /** @type {string|null} */
  engineVersion;
  /** @type {{ baseUrl: string|null, paths: object }} */
  aliases;
  /** @type {boolean} Whether incremental mode is enabled */
  incremental;
  /** @type {boolean} Force full rebuild (engine/schema mismatch) */
  forceFullRebuild = false;
  /** @type {number} Current schema version */
  schemaVersion;

  // ── File collection (set by collectFiles stage) ────────────────────
  /** @type {string[]} Absolute file paths */
  allFiles;
  /** @type {Set<string>} Absolute directory paths */
  discoveredDirs;

  // ── Change detection (set by detectChanges stage) ──────────────────
  /** @type {boolean} */
  isFullBuild;
  /** @type {Array<{ file: string, relPath?: string, content?: string, hash?: string, stat?: object, _reverseDepOnly?: boolean }>} */
  parseChanges;
  /** @type {Array<{ relPath: string, hash: string, stat: object }>} Metadata-only self-heal updates */
  metadataUpdates;
  /** @type {string[]} Relative paths of deleted files */
  removed;
  /** @type {boolean} True when no changes detected — skip remaining stages */
  earlyExit = false;

  // ── Parsing (set by parseFiles stage) ──────────────────────────────
  /** @type {Map<string, object>} relPath → symbols from parseFilesAuto */
  allSymbols;
  /** @type {Map<string, object>} relPath → symbols (includes incrementally loaded) */
  fileSymbols;
  /** @type {Array<{ file: string, relPath?: string }>} Files to parse this build */
  filesToParse;

  // ── Import resolution (set by resolveImports stage) ────────────────
  /** @type {Map<string, string>|null} "absFile|source" → resolved path */
  batchResolved;
  /** @type {Map<string, object[]>} relPath → re-export descriptors */
  reexportMap;
  /** @type {Set<string>} Files loaded only for barrel resolution (don't rebuild edges) */
  barrelOnlyFiles;

  // ── Node lookup (set by insertNodes / buildEdges stages) ───────────
  /** @type {Map<string, object[]>} name → node rows */
  nodesByName;
  /** @type {Map<string, object[]>} "name|file" → node rows */
  nodesByNameAndFile;

  // ── Misc state ─────────────────────────────────────────────────────
  /** @type {boolean} Whether embeddings table exists */
  hasEmbeddings = false;
  /** @type {Map<string, number>} relPath → line count */
  lineCountMap;

  // ── Phase timing ───────────────────────────────────────────────────
  /** @type {object} Per-phase timing values — presumably keyed by phase name; TODO confirm against the stages */
  timing = {};

  /** @type {number} performance.now() at build start */
  buildStart;
}
/**
 * Receiver names that are treated as runtime built-ins.
 * Call sites whose receiver is in this set are skipped when call edges are built.
 */
export const BUILTIN_RECEIVERS = new Set([
  'console',
  'Math',
  'JSON',
  'Object',
  'Array',
  'String',
  'Number',
  'Boolean',
  'Date',
  'RegExp',
  'Map',
  'Set',
  'WeakMap',
  'WeakSet',
  'Promise',
  'Symbol',
  'Error',
  'TypeError',
  'RangeError',
  'Proxy',
  'Reflect',
  'Intl',
  'globalThis',
  'window',
  'document',
  'process',
  'Buffer',
  'require',
]);

/**
 * Recursively collect all source files under `dir`.
 * When `directories` is a Set, also tracks which directories contain files.
 *
 * @param {string} dir - Directory to scan.
 * @param {string[]} [files] - Accumulator; matching file paths are pushed onto it.
 * @param {{ ignoreDirs?: Iterable<string> }} [config] - Extra entry names to skip.
 * @param {Set<string>|null} [directories] - When a Set, receives every directory
 *   that directly contains at least one matching file.
 * @param {Set<string>} [_visited] - Real paths already scanned (symlink-loop guard).
 * @returns {string[] | { files: string[], directories: Set<string> }} `files` alone,
 *   or `{ files, directories }` when `directories` is a Set.
 */
export function collectFiles(
  dir,
  files = [],
  config = {},
  directories = null,
  _visited = new Set(),
) {
  const trackDirs = directories instanceof Set;
  // Both shapes only reference the caller's accumulators, so build the result once.
  const result = trackDirs ? { files, directories } : files;

  let entries;
  try {
    entries = fs.readdirSync(dir, { withFileTypes: true });
  } catch (err) {
    warn(`Cannot read directory ${dir}: ${err.message}`);
    return result;
  }

  // Merge config ignoreDirs with defaults
  const extraIgnore = config.ignoreDirs ? new Set(config.ignoreDirs) : null;

  // Detect symlink loops: never scan the same real directory twice
  let realDir;
  try {
    realDir = fs.realpathSync(dir);
  } catch {
    return result;
  }
  if (_visited.has(realDir)) return result;
  _visited.add(realDir);

  let hasFiles = false;
  for (const entry of entries) {
    // Hidden directories are always skipped; hidden files fall through to the
    // extension check below unless their name is explicitly ignored.
    if (entry.name.startsWith('.') && entry.name !== '.') {
      if (IGNORE_DIRS.has(entry.name)) continue;
      if (entry.isDirectory()) continue;
    }
    if (IGNORE_DIRS.has(entry.name)) continue;
    if (extraIgnore?.has(entry.name)) continue;

    const full = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      collectFiles(full, files, config, directories, _visited);
    } else if (EXTENSIONS.has(path.extname(entry.name))) {
      files.push(full);
      hasFiles = true;
    }
  }
  // Only directories that directly contain a matching file are recorded.
  if (trackDirs && hasFiles) {
    directories.add(dir);
  }
  return result;
}

// Each pattern matches a complete JSON string literal first, so comment markers
// and commas that appear *inside* strings (e.g. "@/*", "src/**/*", "https://…")
// are consumed as part of the string and left untouched.
const JSONC_STRING_OR_COMMENT = /"(?:[^"\\]|\\.)*"|\/\/.*$|\/\*[\s\S]*?\*\//gm;
const JSONC_STRING_OR_TRAILING_COMMA = /"(?:[^"\\]|\\.)*"|,(\s*[\]}])/g;

/**
 * Convert JSON-with-comments text into strict JSON: removes a leading BOM,
 * line and block comments, and trailing commas — all only outside string literals.
 */
function stripJsonc(text) {
  return text
    .replace(/^\uFEFF/, '')
    .replace(JSONC_STRING_OR_COMMENT, (match) => (match.startsWith('"') ? match : ''))
    .replace(JSONC_STRING_OR_TRAILING_COMMA, (match, closer) =>
      closer === undefined ? match : closer,
    );
}

/**
 * Load path aliases from tsconfig.json / jsconfig.json.
 *
 * The first of the two files that exists and parses successfully wins. A file
 * that fails to parse is reported via `warn` and the next candidate is tried.
 *
 * @param {string} rootDir - Directory that contains the config file.
 * @returns {{ baseUrl: string|null, paths: Object<string, string[]> }} Absolute
 *   `baseUrl` (or null) and alias patterns mapped to absolute target patterns,
 *   resolved against `baseUrl` when set, otherwise against `rootDir`.
 */
export function loadPathAliases(rootDir) {
  const aliases = { baseUrl: null, paths: {} };
  for (const configName of ['tsconfig.json', 'jsconfig.json']) {
    const configPath = path.join(rootDir, configName);
    if (!fs.existsSync(configPath)) continue;
    try {
      const config = JSON.parse(stripJsonc(fs.readFileSync(configPath, 'utf-8')));
      const opts = config.compilerOptions || {};
      if (opts.baseUrl) aliases.baseUrl = path.resolve(rootDir, opts.baseUrl);
      if (opts.paths) {
        for (const [pattern, targets] of Object.entries(opts.paths)) {
          aliases.paths[pattern] = targets.map((t) => path.resolve(aliases.baseUrl || rootDir, t));
        }
      }
      break;
    } catch (err) {
      warn(`Failed to parse ${configName}: ${err.message}`);
    }
  }
  return aliases;
}

/**
 * Compute MD5 hash of file contents for incremental builds.
 * Used as a change fingerprint only — MD5 is not suitable for security purposes.
 *
 * @param {string|Buffer} content
 * @returns {string} Hex-encoded digest.
 */
export function fileHash(content) {
  return createHash('md5').update(content).digest('hex');
}
/**
 * Stat a file, returning { mtimeMs, size } or null on error.
 *
 * @param {string} filePath
 * @returns {{ mtimeMs: number, size: number } | null}
 */
export function fileStat(filePath) {
  try {
    const s = fs.statSync(filePath);
    return { mtimeMs: s.mtimeMs, size: s.size };
  } catch {
    return null;
  }
}

const TRANSIENT_CODES = new Set(['EBUSY', 'EACCES', 'EPERM']);
const RETRY_DELAY_MS = 50;

/**
 * Block the current thread for `ms` milliseconds without spinning the CPU.
 * Atomics.wait on a private buffer that nobody notifies simply times out.
 */
function sleepSync(ms) {
  Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, ms);
}

/**
 * Read a file with retry on transient errors (EBUSY/EACCES/EPERM).
 *
 * @param {string} filePath
 * @param {number} [retries=2] - Extra attempts after the first failure.
 * @returns {string} File contents decoded as UTF-8.
 * @throws The underlying fs error when it is not transient or retries are exhausted.
 */
export function readFileSafe(filePath, retries = 2) {
  for (let attempt = 0; ; attempt++) {
    try {
      return fs.readFileSync(filePath, 'utf-8');
    } catch (err) {
      if (attempt < retries && TRANSIENT_CODES.has(err.code)) {
        sleepSync(RETRY_DELAY_MS);
        continue;
      }
      throw err;
    }
  }
}

/**
 * Purge all graph data for the specified files.
 * Thin wrapper that forwards its arguments unchanged to purgeFilesData().
 */
export function purgeFilesFromGraph(db, files, options = {}) {
  purgeFilesData(db, files, options);
}

/** Batch INSERT chunk size for multi-value INSERTs. */
export const BATCH_CHUNK = 200;

/**
 * Run a multi-value INSERT over `rows` in chunks of BATCH_CHUNK.
 * The statement for a full chunk is prepared once and reused; only a trailing
 * partial chunk needs its own statement.
 *
 * @param {{ prepare(sql: string): { run(...params: any[]): any } }} db
 * @param {string} insertPrefix - SQL up to and including "VALUES ".
 * @param {number} width - Number of columns bound per row.
 * @param {Array<Array<any>>} rows
 */
function insertInChunks(db, insertPrefix, width, rows) {
  if (!rows.length) return;
  const tuple = `(${new Array(width).fill('?').join(',')})`;
  const sqlFor = (count) => insertPrefix + new Array(count).fill(tuple).join(',');

  let fullChunkStmt = null;
  for (let start = 0; start < rows.length; start += BATCH_CHUNK) {
    const chunk = rows.slice(start, start + BATCH_CHUNK);
    const params = [];
    for (const row of chunk) {
      for (let col = 0; col < width; col++) params.push(row[col]);
    }

    let stmt;
    if (chunk.length === BATCH_CHUNK) {
      if (!fullChunkStmt) fullChunkStmt = db.prepare(sqlFor(BATCH_CHUNK));
      stmt = fullChunkStmt;
    } else {
      stmt = db.prepare(sqlFor(chunk.length));
    }
    stmt.run(...params);
  }
}

/**
 * Batch-insert node rows via multi-value INSERT statements.
 * Each row: [name, kind, file, line, end_line, parent_id]
 * Uses INSERT OR IGNORE, so conflicting rows are skipped rather than raising.
 */
export function batchInsertNodes(db, rows) {
  insertInChunks(
    db,
    'INSERT OR IGNORE INTO nodes (name,kind,file,line,end_line,parent_id) VALUES ',
    6,
    rows,
  );
}

/**
 * Batch-insert edge rows via multi-value INSERT statements.
 * Each row: [source_id, target_id, kind, confidence, dynamic]
 */
export function batchInsertEdges(db, rows) {
  insertInChunks(
    db,
    'INSERT INTO edges (source_id,target_id,kind,confidence,dynamic) VALUES ',
    5,
    rows,
  );
}
/**
 * Map a parsed import record to its edge kind.
 * Mirrors the precedence used by the full build: re-export, then type-only,
 * then dynamic import, otherwise a plain import.
 */
function importEdgeKind(imp) {
  if (imp.reexport) return 'reexports';
  if (imp.typeOnly) return 'imports-type';
  if (imp.dynamicImport) return 'dynamic-imports';
  return 'imports';
}

/**
 * Find the node row of the definition that a call on `callLine` belongs to.
 *
 * Prefers the narrowest definition whose [line, endLine] range contains the
 * call. If none contains it, falls back to the first definition that starts
 * at or before the call. Definitions without a node row are ignored.
 *
 * @returns {object|null} Node row from `stmts.getNodeId`, or null when nothing matches.
 */
function findCallerNode(stmts, definitions, relPath, callLine) {
  let caller = null;
  let callerSpan = Infinity;
  for (const def of definitions) {
    if (def.line > callLine) continue;
    const end = def.endLine || Infinity;
    if (callLine <= end) {
      const span = end - def.line;
      if (span < callerSpan) {
        const row = stmts.getNodeId.get(def.name, def.kind, relPath, def.line);
        if (row) {
          caller = row;
          callerSpan = span;
        }
      }
    } else if (!caller) {
      const row = stmts.getNodeId.get(def.name, def.kind, relPath, def.line);
      if (row) caller = row;
    }
  }
  return caller;
}

/**
 * Parse a single file and update the database incrementally.
 *
 * Existing nodes and edges for the file are deleted first, then the file is
 * re-read and re-parsed, and its nodes, import edges and call edges are
 * re-inserted. NOTE: because the purge happens before the read, a read or
 * parse failure (null return) leaves the file absent from the graph.
 *
 * @param {import('better-sqlite3').Database} db - Not used directly; all access goes through `stmts`.
 * @param {string} rootDir - Absolute root directory
 * @param {string} filePath - Absolute file path
 * @param {object} stmts - Prepared DB statements. Uses: countNodes, listSymbols,
 *   deleteEdgesForFile, deleteNodes, insertNode, getNodeId, insertEdge,
 *   findNodeInFile, findNodeByName.
 * @param {object} engineOpts - Engine options
 * @param {object|null} cache - Parse tree cache (native only)
 * @param {object} [options]
 * @param {Function} [options.diffSymbols] - Symbol diff function
 * @param {{ baseUrl: string|null, paths: object }} [options.aliases] - Path aliases
 *   for import resolution; defaults to no aliases.
 * @returns {Promise<object|null>} Update result, or null when the file cannot be
 *   read or the parser yields no symbols.
 */
export async function rebuildFile(db, rootDir, filePath, stmts, engineOpts, cache, options = {}) {
  const { diffSymbols } = options;
  const aliases = options.aliases ?? { baseUrl: null, paths: {} };
  const relPath = normalizePath(path.relative(rootDir, filePath));
  const { existsSync } = await import('node:fs');

  const nodesBefore = stmts.countNodes.get(relPath)?.c || 0;
  const oldSymbols = diffSymbols ? stmts.listSymbols.all(relPath) : [];

  stmts.deleteEdgesForFile.run(relPath);
  stmts.deleteNodes.run(relPath);

  if (!existsSync(filePath)) {
    if (cache) cache.remove(filePath);
    const symbolDiff = diffSymbols ? diffSymbols(oldSymbols, []) : null;
    return {
      file: relPath,
      nodesAdded: 0,
      nodesRemoved: nodesBefore,
      edgesAdded: 0,
      deleted: true,
      event: 'deleted',
      symbolDiff,
      nodesBefore,
      nodesAfter: 0,
    };
  }

  let code;
  try {
    code = readFileSafe(filePath);
  } catch (err) {
    warn(`Cannot read ${relPath}: ${err.message}`);
    return null;
  }

  const symbols = await parseFileIncremental(cache, filePath, code, engineOpts);
  if (!symbols) return null;

  // Insert nodes: the file itself, then its definitions and exports
  stmts.insertNode.run(relPath, 'file', relPath, 0, null);
  for (const def of symbols.definitions) {
    stmts.insertNode.run(def.name, def.kind, relPath, def.line, def.endLine || null);
  }
  for (const exp of symbols.exports) {
    stmts.insertNode.run(exp.name, exp.kind, relPath, exp.line, null);
  }

  const nodesAfter = stmts.countNodes.get(relPath)?.c || 0;
  const newSymbols = diffSymbols ? stmts.listSymbols.all(relPath) : [];

  // Every non-deleted outcome reports the same result shape.
  const buildResult = (edgesAdded) => ({
    file: relPath,
    nodesAdded: nodesAfter,
    nodesRemoved: nodesBefore,
    edgesAdded,
    deleted: false,
    event: nodesBefore === 0 ? 'added' : 'modified',
    symbolDiff: diffSymbols ? diffSymbols(oldSymbols, newSymbols) : null,
    nodesBefore,
    nodesAfter,
  });

  const fileNodeRow = stmts.getNodeId.get(relPath, 'file', relPath, 0);
  if (!fileNodeRow) return buildResult(0);
  const fileNodeId = fileNodeRow.id;

  let edgesAdded = 0;

  // Import edges + imported name → resolved file mapping.
  // Each import is resolved once and the result feeds both.
  const absFile = path.join(rootDir, relPath);
  const importedNames = new Map();
  for (const imp of symbols.imports) {
    const resolvedPath = resolveImportPath(absFile, imp.source, rootDir, aliases);
    const targetRow = stmts.getNodeId.get(resolvedPath, 'file', resolvedPath, 0);
    if (targetRow) {
      stmts.insertEdge.run(fileNodeId, targetRow.id, importEdgeKind(imp), 1.0, 0);
      edgesAdded++;
    }
    for (const name of imp.names) {
      // "* as ns" namespace imports are keyed by the bare namespace name
      importedNames.set(name.replace(/^\*\s+as\s+/, ''), resolvedPath);
    }
  }

  // Call edges
  for (const call of symbols.calls) {
    if (call.receiver && BUILTIN_RECEIVERS.has(call.receiver)) continue;

    // Calls outside any definition are attributed to the file node
    const caller = findCallerNode(stmts, symbols.definitions, relPath, call.line) || fileNodeRow;

    // Target lookup order: the imported file, then this file, then any node with that name
    const importedFrom = importedNames.get(call.name);
    let targets;
    if (importedFrom) {
      targets = stmts.findNodeInFile.all(call.name, importedFrom);
    }
    if (!targets || targets.length === 0) {
      targets = stmts.findNodeInFile.all(call.name, relPath);
      if (targets.length === 0) {
        targets = stmts.findNodeByName.all(call.name);
      }
    }

    for (const t of targets) {
      if (t.id === caller.id) continue;
      const confidence = computeConfidence(relPath, t.file, importedFrom || null);
      stmts.insertEdge.run(caller.id, t.id, 'calls', confidence, call.dynamic ? 1 : 0);
      edgesAdded++;
    }
  }

  return buildResult(edgesAdded);
}
+ */ +import path from 'node:path'; +import { performance } from 'node:perf_hooks'; +import { loadConfig } from '../config.js'; +import { getBuildMeta, initSchema, MIGRATIONS, openDb } from '../db.js'; +import { info } from '../logger.js'; +import { getActiveEngine } from '../parser.js'; +import { PipelineContext } from './context.js'; +import { loadPathAliases } from './helpers.js'; +import { buildEdges } from './stages/build-edges.js'; +import { buildStructure } from './stages/build-structure.js'; +// Pipeline stages +import { collectFiles } from './stages/collect-files.js'; +import { detectChanges } from './stages/detect-changes.js'; +import { finalize } from './stages/finalize.js'; +import { insertNodes } from './stages/insert-nodes.js'; +import { parseFiles } from './stages/parse-files.js'; +import { resolveImports } from './stages/resolve-imports.js'; +import { runAnalyses } from './stages/run-analyses.js'; + +/** + * Build the dependency graph for a codebase. + * + * Signature and return value are identical to the original monolithic buildGraph(). 
/**
 * Build the dependency graph for a codebase.
 *
 * Fills a fresh PipelineContext (database handle, config, engine selection,
 * path aliases) and then passes that same context to every pipeline stage in
 * a fixed order. Signature and return value are identical to the original
 * monolithic buildGraph().
 *
 * @param {string} rootDir - Root directory to scan
 * @param {object} [opts] - Build options
 * @returns {Promise<{ phases: object } | undefined>} Per-phase timings in
 *   milliseconds (rounded to one decimal), or `undefined` when the
 *   detectChanges stage sets `ctx.earlyExit`.
 */
export async function buildGraph(rootDir, opts = {}) {
  const ctx = new PipelineContext();
  ctx.buildStart = performance.now();
  ctx.opts = opts;

  // ── Setup: database ───────────────────────────────────────────────
  const projectRoot = path.resolve(rootDir);
  ctx.rootDir = projectRoot;
  ctx.dbPath = path.join(projectRoot, '.codegraph', 'graph.db');
  ctx.db = openDb(ctx.dbPath);
  initSchema(ctx.db);

  // ── Setup: config + incremental switch ────────────────────────────
  ctx.config = loadConfig(projectRoot);
  const buildConfig = ctx.config.build;
  // Kept as the raw short-circuit result: it is falsy when the config has no
  // `build` section, exactly as before.
  ctx.incremental = opts.incremental !== false && buildConfig && buildConfig.incremental !== false;

  // ── Setup: engine ─────────────────────────────────────────────────
  ctx.engineOpts = {
    engine: opts.engine || 'auto',
    dataflow: opts.dataflow !== false,
    ast: opts.ast !== false,
  };
  const activeEngine = getActiveEngine(ctx.engineOpts);
  ctx.engineName = activeEngine.name;
  ctx.engineVersion = activeEngine.version;
  const versionSuffix = ctx.engineVersion ? ` (v${ctx.engineVersion})` : '';
  info(`Using ${ctx.engineName} engine${versionSuffix}`);

  // ── Setup: engine/schema mismatch promotes to a full rebuild ──────
  ctx.schemaVersion = MIGRATIONS[MIGRATIONS.length - 1].version;
  ctx.forceFullRebuild = false;
  if (ctx.incremental) {
    const previousEngine = getBuildMeta(ctx.db, 'engine');
    if (previousEngine && previousEngine !== ctx.engineName) {
      info(`Engine changed (${previousEngine} → ${ctx.engineName}), promoting to full rebuild.`);
      ctx.forceFullRebuild = true;
    }

    const previousSchema = getBuildMeta(ctx.db, 'schema_version');
    if (previousSchema && Number(previousSchema) !== ctx.schemaVersion) {
      info(
        `Schema version changed (${previousSchema} → ${ctx.schemaVersion}), promoting to full rebuild.`,
      );
      ctx.forceFullRebuild = true;
    }
  }

  // ── Setup: path aliases (config entries are layered over loaded ones) ──
  ctx.aliases = loadPathAliases(projectRoot);
  for (const [key, value] of Object.entries(ctx.config.aliases || {})) {
    const pattern = key.endsWith('/') ? `${key}*` : key;
    const target = path.resolve(projectRoot, value);
    const targetGlob = target.endsWith('/') ? `${target}*` : `${target}/*`;
    ctx.aliases.paths[pattern] = [targetGlob];
  }
  if (ctx.aliases.baseUrl || Object.keys(ctx.aliases.paths).length > 0) {
    const mappingCount = Object.keys(ctx.aliases.paths).length;
    info(
      `Loaded path aliases: baseUrl=${ctx.aliases.baseUrl || 'none'}, ${mappingCount} path mappings`,
    );
  }

  ctx.timing.setupMs = performance.now() - ctx.buildStart;

  // ── Pipeline stages ───────────────────────────────────────────────
  await collectFiles(ctx);
  await detectChanges(ctx);

  // detectChanges sets earlyExit when there is nothing left to build.
  if (ctx.earlyExit) return;

  const remainingStages = [
    parseFiles,
    insertNodes,
    resolveImports,
    buildEdges,
    buildStructure,
    runAnalyses,
    finalize,
  ];
  for (const runStage of remainingStages) {
    await runStage(ctx);
  }

  // ── Timing report ─────────────────────────────────────────────────
  const { timing } = ctx;
  const toTenths = (ms) => +ms.toFixed(1);
  const phases = {
    setupMs: toTenths(timing.setupMs),
    parseMs: toTenths(timing.parseMs),
    insertMs: toTenths(timing.insertMs),
    resolveMs: toTenths(timing.resolveMs),
    edgesMs: toTenths(timing.edgesMs),
    structureMs: toTenths(timing.structureMs),
    rolesMs: toTenths(timing.rolesMs),
    astMs: toTenths(timing.astMs),
    complexityMs: toTenths(timing.complexityMs),
  };
  // cfg/dataflow timings are reported only when they were recorded.
  if (timing.cfgMs != null) phases.cfgMs = toTenths(timing.cfgMs);
  if (timing.dataflowMs != null) phases.dataflowMs = toTenths(timing.dataflowMs);
  phases.finalizeMs = toTenths(timing.finalizeMs);

  return { phases };
}
/**
 * Stage: build import, call, receiver, extends and implements edges.
 *
 * Loads every symbol-kind node into two in-memory indexes (stored on `ctx` as
 * `nodesByName` and `nodesByNameAndFile`), collects all edge rows into one
 * array, and inserts them with a single `batchInsertEdges` call inside one
 * database transaction. Sets `ctx.timing.edgesMs`.
 *
 * @param {import('../context.js').PipelineContext} ctx
 */
export async function buildEdges(ctx) {
  const { db, fileSymbols, barrelOnlyFiles, rootDir, engineName } = ctx;

  // Adapter exposing getNodeId() with a statement-like `.get()` that yields
  // `{ id }` or undefined.
  const getNodeIdStmt = {
    get: (name, kind, file, line) => {
      const id = getNodeId(db, name, kind, file, line);
      return id != null ? { id } : undefined;
    },
  };

  // Pre-load all nodes into lookup maps
  const allNodes = db
    .prepare(
      `SELECT id, name, kind, file, line FROM nodes WHERE kind IN ('function','method','class','interface','struct','type','module','enum','trait')`,
    )
    .all();
  // name -> [nodes]
  ctx.nodesByName = new Map();
  for (const node of allNodes) {
    if (!ctx.nodesByName.has(node.name)) ctx.nodesByName.set(node.name, []);
    ctx.nodesByName.get(node.name).push(node);
  }
  // "name|file" -> [nodes]
  ctx.nodesByNameAndFile = new Map();
  for (const node of allNodes) {
    const key = `${node.name}|${node.file}`;
    if (!ctx.nodesByNameAndFile.has(key)) ctx.nodesByNameAndFile.set(key, []);
    ctx.nodesByNameAndFile.get(key).push(node);
  }

  const t0 = performance.now();
  const buildEdgesTx = db.transaction(() => {
    // Rows are [sourceId, targetId, kind, confidence, dynamic].
    const allEdgeRows = [];

    // ── Import edges ────────────────────────────────────────────────
    for (const [relPath, symbols] of fileSymbols) {
      if (barrelOnlyFiles.has(relPath)) continue;
      const fileNodeRow = getNodeIdStmt.get(relPath, 'file', relPath, 0);
      if (!fileNodeRow) continue;
      const fileNodeId = fileNodeRow.id;

      for (const imp of symbols.imports) {
        const resolvedPath = getResolved(ctx, path.join(rootDir, relPath), imp.source);
        const targetRow = getNodeIdStmt.get(resolvedPath, 'file', resolvedPath, 0);
        if (targetRow) {
          // Precedence: reexport > type-only > dynamic import > plain import.
          const edgeKind = imp.reexport
            ? 'reexports'
            : imp.typeOnly
              ? 'imports-type'
              : imp.dynamicImport
                ? 'dynamic-imports'
                : 'imports';
          allEdgeRows.push([fileNodeId, targetRow.id, edgeKind, 1.0, 0]);

          // Importing through a barrel: add a lower-confidence (0.9) edge to
          // each distinct file the barrel resolves the imported names to.
          if (!imp.reexport && isBarrelFile(ctx, resolvedPath)) {
            const resolvedSources = new Set();
            for (const name of imp.names) {
              // Strip a leading "* as " so namespace imports use the bare alias.
              const cleanName = name.replace(/^\*\s+as\s+/, '');
              const actualSource = resolveBarrelExport(ctx, resolvedPath, cleanName);
              if (
                actualSource &&
                actualSource !== resolvedPath &&
                !resolvedSources.has(actualSource)
              ) {
                resolvedSources.add(actualSource);
                const actualRow = getNodeIdStmt.get(actualSource, 'file', actualSource, 0);
                if (actualRow) {
                  allEdgeRows.push([
                    fileNodeId,
                    actualRow.id,
                    edgeKind === 'imports-type'
                      ? 'imports-type'
                      : edgeKind === 'dynamic-imports'
                        ? 'dynamic-imports'
                        : 'imports',
                    0.9,
                    0,
                  ]);
                }
              }
            }
          }
        }
      }
    }

    // ── Call/receiver/extends/implements edges ───────────────────────
    // The native module is consulted only when the active engine is 'native';
    // if it does not expose buildCallEdges the JS fallback below runs.
    const native = engineName === 'native' ? loadNative() : null;
    if (native?.buildCallEdges) {
      const nativeFiles = [];
      for (const [relPath, symbols] of fileSymbols) {
        if (barrelOnlyFiles.has(relPath)) continue;
        const fileNodeRow = getNodeIdStmt.get(relPath, 'file', relPath, 0);
        if (!fileNodeRow) continue;

        // Imported name -> defining file, with barrel files resolved to the
        // actual source when resolveBarrelExport finds one.
        const importedNames = [];
        for (const imp of symbols.imports) {
          const resolvedPath = getResolved(ctx, path.join(rootDir, relPath), imp.source);
          for (const name of imp.names) {
            const cleanName = name.replace(/^\*\s+as\s+/, '');
            let targetFile = resolvedPath;
            if (isBarrelFile(ctx, resolvedPath)) {
              const actual = resolveBarrelExport(ctx, resolvedPath, cleanName);
              if (actual) targetFile = actual;
            }
            importedNames.push({ name: cleanName, file: targetFile });
          }
        }

        nativeFiles.push({
          file: relPath,
          fileNodeId: fileNodeRow.id,
          definitions: symbols.definitions.map((d) => ({
            name: d.name,
            kind: d.kind,
            line: d.line,
            endLine: d.endLine ?? null,
          })),
          calls: symbols.calls,
          importedNames,
          classes: symbols.classes,
        });
      }

      const nativeEdges = native.buildCallEdges(nativeFiles, allNodes, [...BUILTIN_RECEIVERS]);
      for (const e of nativeEdges) {
        allEdgeRows.push([e.sourceId, e.targetId, e.kind, e.confidence, e.dynamic]);
      }
    } else {
      // JS fallback
      for (const [relPath, symbols] of fileSymbols) {
        if (barrelOnlyFiles.has(relPath)) continue;
        const fileNodeRow = getNodeIdStmt.get(relPath, 'file', relPath, 0);
        if (!fileNodeRow) continue;

        // Imported name -> resolved file. Unlike the native branch, barrels are
        // not unwrapped here; that happens lazily during target lookup below.
        const importedNames = new Map();
        for (const imp of symbols.imports) {
          const resolvedPath = getResolved(ctx, path.join(rootDir, relPath), imp.source);
          for (const name of imp.names) {
            const cleanName = name.replace(/^\*\s+as\s+/, '');
            importedNames.set(cleanName, resolvedPath);
          }
        }

        // Per-file dedupe of "caller|target" (and "recv|caller|target") keys.
        const seenCallEdges = new Set();
        for (const call of symbols.calls) {
          if (call.receiver && BUILTIN_RECEIVERS.has(call.receiver)) continue;
          // Caller = the definition with the narrowest line span enclosing the
          // call. A definition that starts before the call but ends before it is
          // kept only as a provisional caller (callerSpan stays Infinity, so any
          // enclosing definition found later replaces it).
          let caller = null;
          let callerSpan = Infinity;
          for (const def of symbols.definitions) {
            if (def.line <= call.line) {
              const end = def.endLine || Infinity;
              if (call.line <= end) {
                const span = end - def.line;
                if (span < callerSpan) {
                  const row = getNodeIdStmt.get(def.name, def.kind, relPath, def.line);
                  if (row) {
                    caller = row;
                    callerSpan = span;
                  }
                }
              } else if (!caller) {
                const row = getNodeIdStmt.get(def.name, def.kind, relPath, def.line);
                if (row) caller = row;
              }
            }
          }
          // No definition matched: attribute the call to the file node.
          if (!caller) caller = fileNodeRow;

          const isDynamic = call.dynamic ? 1 : 0;
          let targets;
          const importedFrom = importedNames.get(call.name);

          // 1) Name was imported: look in the imported file, then through a barrel.
          if (importedFrom) {
            targets = ctx.nodesByNameAndFile.get(`${call.name}|${importedFrom}`) || [];
            if (targets.length === 0 && isBarrelFile(ctx, importedFrom)) {
              const actualSource = resolveBarrelExport(ctx, importedFrom, call.name);
              if (actualSource) {
                targets = ctx.nodesByNameAndFile.get(`${call.name}|${actualSource}`) || [];
              }
            }
          }
          // 2) Same file, 3) method candidates, 4) global by-name lookup (only for
          //    receiver-less or this/self/super calls, filtered to confidence >= 0.5).
          if (!targets || targets.length === 0) {
            targets = ctx.nodesByNameAndFile.get(`${call.name}|${relPath}`) || [];
            if (targets.length === 0) {
              // NOTE(review): nodesByName is keyed by the exact node name, so every
              // candidate here has n.name === call.name and the
              // endsWith(`.${call.name}`) test looks like it can never pass —
              // confirm whether a suffix-based ("Class.method") index was intended.
              const methodCandidates = (ctx.nodesByName.get(call.name) || []).filter(
                (n) => n.name.endsWith(`.${call.name}`) && n.kind === 'method',
              );
              if (methodCandidates.length > 0) {
                targets = methodCandidates;
              } else if (
                !call.receiver ||
                call.receiver === 'this' ||
                call.receiver === 'self' ||
                call.receiver === 'super'
              ) {
                targets = (ctx.nodesByName.get(call.name) || []).filter(
                  (n) => computeConfidence(relPath, n.file, null) >= 0.5,
                );
              }
            }
          }

          // Highest-confidence target first. Note: sort() works in place, and
          // `targets` may be the very array stored in ctx.nodesByNameAndFile.
          if (targets.length > 1) {
            targets.sort((a, b) => {
              const confA = computeConfidence(relPath, a.file, importedFrom);
              const confB = computeConfidence(relPath, b.file, importedFrom);
              return confB - confA;
            });
          }

          for (const t of targets) {
            const edgeKey = `${caller.id}|${t.id}`;
            // Skip self-edges and duplicates within this file.
            if (t.id !== caller.id && !seenCallEdges.has(edgeKey)) {
              seenCallEdges.add(edgeKey);
              const confidence = computeConfidence(relPath, t.file, importedFrom);
              allEdgeRows.push([caller.id, t.id, 'calls', confidence, isDynamic]);
            }
          }

          // Receiver edge: link the caller to the type-like node named by the
          // call's receiver (same-file match preferred, first candidate wins).
          if (
            call.receiver &&
            !BUILTIN_RECEIVERS.has(call.receiver) &&
            call.receiver !== 'this' &&
            call.receiver !== 'self' &&
            call.receiver !== 'super'
          ) {
            const receiverKinds = new Set(['class', 'struct', 'interface', 'type', 'module']);
            const samefile = ctx.nodesByNameAndFile.get(`${call.receiver}|${relPath}`) || [];
            const candidates =
              samefile.length > 0 ? samefile : ctx.nodesByName.get(call.receiver) || [];
            const receiverNodes = candidates.filter((n) => receiverKinds.has(n.kind));
            if (receiverNodes.length > 0 && caller) {
              const recvTarget = receiverNodes[0];
              const recvKey = `recv|${caller.id}|${recvTarget.id}`;
              if (!seenCallEdges.has(recvKey)) {
                seenCallEdges.add(recvKey);
                allEdgeRows.push([caller.id, recvTarget.id, 'receiver', 0.7, 0]);
              }
            }
          }
        }

        // Class extends / implements edges. Targets are matched by name across
        // all files, so every same-named class (or interface) gets an edge.
        for (const cls of symbols.classes) {
          if (cls.extends) {
            const sourceRow = (ctx.nodesByNameAndFile.get(`${cls.name}|${relPath}`) || []).find(
              (n) => n.kind === 'class',
            );
            const targetCandidates = ctx.nodesByName.get(cls.extends) || [];
            const targetRows = targetCandidates.filter((n) => n.kind === 'class');
            if (sourceRow) {
              for (const t of targetRows) {
                allEdgeRows.push([sourceRow.id, t.id, 'extends', 1.0, 0]);
              }
            }
          }

          if (cls.implements) {
            const sourceRow = (ctx.nodesByNameAndFile.get(`${cls.name}|${relPath}`) || []).find(
              (n) => n.kind === 'class',
            );
            const targetCandidates = ctx.nodesByName.get(cls.implements) || [];
            const targetRows = targetCandidates.filter(
              (n) => n.kind === 'interface' || n.kind === 'class',
            );
            if (sourceRow) {
              for (const t of targetRows) {
                allEdgeRows.push([sourceRow.id, t.id, 'implements', 1.0, 0]);
              }
            }
          }
        }
      }
    }

    batchInsertEdges(db, allEdgeRows);
  });
  buildEdgesTx();
  // Timing covers the transaction only, not the node pre-load above.
  ctx.timing.edgesMs = performance.now() - t0;
}
/**
 * Stage: directory structure + node role classification.
 *
 * Builds `ctx.lineCountMap`, back-fills `fileSymbols` with files that were not
 * re-parsed (incremental builds only), then calls the structure and role
 * routines from `structure.js`. Failures in either routine are logged at
 * debug level and do not abort the build. Sets `ctx.timing.structureMs` and
 * `ctx.timing.rolesMs`.
 *
 * @param {import('../context.js').PipelineContext} ctx
 */
export async function buildStructure(ctx) {
  const { db, fileSymbols, rootDir, discoveredDirs, allSymbols, isFullBuild } = ctx;

  // Count lines by reading the file from disk; an unreadable file counts as 0.
  const countLinesOnDisk = (relPath) => {
    try {
      return fs.readFileSync(path.join(rootDir, relPath), 'utf-8').split('\n').length;
    } catch {
      return 0;
    }
  };

  // Line counts: use the parser-provided value when present, else read the file.
  const lineCounts = new Map();
  ctx.lineCountMap = lineCounts;
  for (const [relPath, symbols] of fileSymbols) {
    const fromParser = symbols.lineCount ?? symbols._lineCount;
    lineCounts.set(relPath, fromParser ? fromParser : countLinesOnDisk(relPath));
  }

  // Incremental builds only parsed the changed files; pull the rest from the DB
  // so the structure pass sees the complete file set.
  if (!isFullBuild) {
    const knownFiles = db.prepare("SELECT DISTINCT file FROM nodes WHERE kind = 'file'").all();
    const selectDefs = db.prepare(
      "SELECT name, kind, line FROM nodes WHERE file = ? AND kind != 'file' AND kind != 'directory'",
    );
    const selectImportCount = db.prepare(
      `SELECT COUNT(DISTINCT n2.file) AS cnt FROM edges e
       JOIN nodes n1 ON e.source_id = n1.id
       JOIN nodes n2 ON e.target_id = n2.id
       WHERE n1.file = ? AND e.kind = 'imports'`,
    );
    const selectLineCounts = db.prepare(
      `SELECT n.name AS file, m.line_count
       FROM node_metrics m JOIN nodes n ON m.node_id = n.id
       WHERE n.kind = 'file'`,
    );

    const storedLineCounts = new Map(
      selectLineCounts.all().map((row) => [row.file, row.line_count]),
    );

    let restored = 0;
    for (const { file: relPath } of knownFiles) {
      if (!fileSymbols.has(relPath)) {
        const importTotal = selectImportCount.get(relPath)?.cnt || 0;
        fileSymbols.set(relPath, {
          definitions: selectDefs.all(relPath),
          // Only the number of imports is restored, as a sparse placeholder array.
          imports: new Array(importTotal),
          exports: [],
        });
        restored += 1;
      }

      if (lineCounts.has(relPath)) continue;
      const stored = storedLineCounts.get(relPath);
      lineCounts.set(relPath, stored != null ? stored : countLinesOnDisk(relPath));
    }
    debug(`Structure: ${fileSymbols.size} files (${restored} loaded from DB)`);
  }

  // ── Directory structure ─────────────────────────────────────────
  const structureStart = performance.now();
  const relDirs = new Set(
    [...discoveredDirs].map((absDir) => normalizePath(path.relative(rootDir, absDir))),
  );
  try {
    const structure = await import('../../structure.js');
    const changedFilePaths = isFullBuild ? null : [...allSymbols.keys()];
    structure.buildStructure(db, fileSymbols, rootDir, lineCounts, relDirs, changedFilePaths);
  } catch (err) {
    debug(`Structure analysis failed: ${err.message}`);
  }
  ctx.timing.structureMs = performance.now() - structureStart;

  // ── Node roles ──────────────────────────────────────────────────
  const rolesStart = performance.now();
  try {
    const { classifyNodeRoles } = await import('../../structure.js');
    const summary = classifyNodeRoles(db);
    const rendered = Object.entries(summary)
      .map(([r, c]) => `${r}=${c}`)
      .join(', ');
    debug(`Roles: ${rendered}`);
  } catch (err) {
    debug(`Role classification failed: ${err.message}`);
  }
  ctx.timing.rolesMs = performance.now() - rolesStart;
}
/**
 * Stage: decide which source files this build will look at.
 *
 * Normal builds walk the project via the shared `collectFiles` helper and set
 * `ctx.allFiles` / `ctx.discoveredDirs`. Scoped builds (`opts.scope`) take the
 * listed paths as-is: paths that exist become the parse list, paths that do
 * not exist are recorded in `ctx.removed`, and the build is marked as not
 * being a full build.
 *
 * @param {import('../context.js').PipelineContext} ctx
 */
export async function collectFiles(ctx) {
  const { rootDir, config, opts } = ctx;

  // Normal build: discover everything under rootDir.
  if (!opts.scope) {
    const { files, directories } = collectFilesUtil(rootDir, [], config, new Set());
    ctx.allFiles = files;
    ctx.discoveredDirs = directories;
    info(`Found ${ctx.allFiles.length} files to parse`);
    return;
  }

  // Scoped rebuild: rebuild only the specified files.
  const requested = opts.scope.map((f) => normalizePath(f));
  const present = [];
  const absent = [];
  for (const relPath of requested) {
    const absPath = path.join(rootDir, relPath);
    if (fs.existsSync(absPath)) {
      present.push({ file: absPath, relPath });
    } else {
      absent.push(relPath);
    }
  }

  ctx.allFiles = present.map((entry) => entry.file);
  ctx.discoveredDirs = new Set(present.map((entry) => path.dirname(entry.file)));
  ctx.parseChanges = present;
  ctx.metadataUpdates = [];
  ctx.removed = absent;
  ctx.isFullBuild = false;
  info(`Scoped rebuild: ${present.length} files to rebuild, ${absent.length} to purge`);
}
/**
 * Determine which files have changed since last build.
 * Three-tier cascade:
 *   Tier 0 — Journal: O(changed) when watcher was running
 *   Tier 1 — mtime+size: O(n) stats, O(changed) reads
 *   Tier 2 — Hash comparison: O(changed) reads (fallback from Tier 1)
 *
 * @param {object} db - Database handle; must support `prepare(sql).get()/.all()`
 * @param {string[]} allFiles - Paths of every file in the current build; each
 *   is made relative to `rootDir` before being compared with stored rows
 * @param {string} rootDir - Project root used to relativize / join paths
 * @returns {{ changed: object[], removed: string[], isFullBuild: boolean }}
 *   `changed` entries carry `{ file, content, hash, relPath, stat }` and, when
 *   only mtime/size drifted but the hash is unchanged, `metadataOnly: true`.
 *   When the `file_hashes` table is missing, every file is returned as
 *   `{ file }` and `isFullBuild` is true.
 */
function getChangedFiles(db, allFiles, rootDir) {
  // Probe for the file_hashes table; a throwing query means it does not exist.
  let hasTable = false;
  try {
    db.prepare('SELECT 1 FROM file_hashes LIMIT 1').get();
    hasTable = true;
  } catch {
    /* table doesn't exist */
  }

  // Nothing recorded from a previous build: treat as a full build.
  if (!hasTable) {
    return {
      changed: allFiles.map((f) => ({ file: f })),
      removed: [],
      isFullBuild: true,
    };
  }

  // relPath -> stored { file, hash, mtime, size } row
  const existing = new Map(
    db
      .prepare('SELECT file, hash, mtime, size FROM file_hashes')
      .all()
      .map((r) => [r.file, r]),
  );

  const currentFiles = new Set();
  for (const file of allFiles) {
    currentFiles.add(normalizePath(path.relative(rootDir, file)));
  }

  // Files that have a stored hash but are no longer part of the build.
  const removed = [];
  for (const existingFile of existing.keys()) {
    if (!currentFiles.has(existingFile)) {
      removed.push(existingFile);
    }
  }

  // ── Tier 0: Journal ──────────────────────────────────────────────
  const journal = readJournal(rootDir);
  if (journal.valid) {
    const dbMtimes = db.prepare('SELECT MAX(mtime) as latest FROM file_hashes').get();
    const latestDbMtime = dbMtimes?.latest || 0;
    const hasJournalEntries = journal.changed.length > 0 || journal.removed.length > 0;

    // The journal is used only when it has entries and its timestamp is not
    // older than the newest mtime stored in file_hashes.
    if (hasJournalEntries && journal.timestamp >= latestDbMtime) {
      debug(
        `Tier 0: journal valid, ${journal.changed.length} changed, ${journal.removed.length} removed`,
      );
      const changed = [];

      for (const relPath of journal.changed) {
        const absPath = path.join(rootDir, relPath);
        const stat = fileStat(absPath);
        if (!stat) continue;

        let content;
        try {
          content = readFileSafe(absPath);
        } catch {
          continue;
        }
        const hash = fileHash(content);
        const record = existing.get(relPath);
        // Journal entries whose hash matches the stored one are dropped.
        if (!record || record.hash !== hash) {
          changed.push({ file: absPath, content, hash, relPath, stat });
        }
      }

      // Merge journal removals with the removals computed above; a journal
      // removal counts only if the file actually has a stored row.
      const removedSet = new Set(removed);
      for (const relPath of journal.removed) {
        if (existing.has(relPath)) removedSet.add(relPath);
      }

      return { changed, removed: [...removedSet], isFullBuild: false };
    }
    debug(
      `Tier 0: skipped (${hasJournalEntries ? 'timestamp stale' : 'no entries'}), falling to Tier 1`,
    );
  }

  // ── Tier 1: mtime+size fast-path ─────────────────────────────────
  const needsHash = [];
  const skipped = [];

  for (const file of allFiles) {
    const relPath = normalizePath(path.relative(rootDir, file));
    const record = existing.get(relPath);

    // New file (no stored row): always hash it.
    if (!record) {
      needsHash.push({ file, relPath });
      continue;
    }

    const stat = fileStat(file);
    if (!stat) continue;

    const storedMtime = record.mtime || 0;
    const storedSize = record.size || 0;

    // A stored size of 0 never takes the fast path, so such rows are re-hashed.
    if (storedSize > 0 && Math.floor(stat.mtimeMs) === storedMtime && stat.size === storedSize) {
      skipped.push(relPath);
      continue;
    }

    needsHash.push({ file, relPath, stat });
  }

  if (needsHash.length > 0) {
    debug(`Tier 1: ${skipped.length} skipped by mtime+size, ${needsHash.length} need hash check`);
  }

  // ── Tier 2: Hash comparison ──────────────────────────────────────
  const changed = [];

  for (const item of needsHash) {
    let content;
    try {
      content = readFileSafe(item.file);
    } catch {
      continue;
    }
    const hash = fileHash(content);
    const stat = item.stat || fileStat(item.file);
    const record = existing.get(item.relPath);

    if (!record || record.hash !== hash) {
      changed.push({ file: item.file, content, hash, relPath: item.relPath, stat });
    } else if (stat) {
      // Same content, different mtime/size: report as metadata-only so the
      // caller can refresh the stored mtime/size without re-parsing.
      changed.push({
        file: item.file,
        content,
        hash,
        relPath: item.relPath,
        stat,
        metadataOnly: true,
      });
    }
  }

  // Used only for the debug message below; the full list is returned.
  const parseChanged = changed.filter((c) => !c.metadataOnly);
  if (needsHash.length > 0) {
    debug(
      `Tier 2: ${parseChanged.length} actually changed, ${changed.length - parseChanged.length} metadata-only`,
    );
  }

  return { changed, removed, isFullBuild: false };
}
/**
 * Report whether an analysis table has no rows.
 * A query that throws (e.g. the table does not exist) counts as "empty".
 *
 * @param {object} db - Database handle
 * @param {string} table - Table name; only called with fixed literals below
 * @returns {boolean}
 */
function analysisTableIsEmpty(db, table) {
  try {
    return db.prepare(`SELECT COUNT(*) as c FROM ${table}`).get().c === 0;
  } catch {
    return true;
  }
}

/**
 * Run pending analysis pass when no file changes but analysis tables are empty.
 *
 * CFG data is considered pending when `opts.cfg !== false` and `cfg_blocks`
 * is empty; dataflow is pending when `opts.dataflow !== false` and `dataflow`
 * is empty. When either is pending, all files are re-parsed and the matching
 * builder(s) are run.
 *
 * @param {import('../context.js').PipelineContext} ctx
 * @returns {Promise<boolean>} true if analysis was run and we should early-exit
 */
async function runPendingAnalysis(ctx) {
  const { db, opts, engineOpts, allFiles, rootDir } = ctx;

  // `&&` keeps the table probe from running for an analysis that is disabled.
  const needsCfg = opts.cfg !== false && analysisTableIsEmpty(db, 'cfg_blocks');
  const needsDataflow = opts.dataflow !== false && analysisTableIsEmpty(db, 'dataflow');

  if (!needsCfg && !needsDataflow) return false;

  info('No file changes. Running pending analysis pass...');
  // needsDataflow already implies opts.dataflow !== false.
  const analysisOpts = { ...engineOpts, dataflow: needsDataflow };
  const analysisSymbols = await parseFilesAuto(allFiles, rootDir, analysisOpts);
  if (needsCfg) {
    const { buildCFGData } = await import('../../cfg.js');
    await buildCFGData(db, analysisSymbols, rootDir, engineOpts);
  }
  if (needsDataflow) {
    const { buildDataflowEdges } = await import('../../dataflow.js');
    await buildDataflowEdges(db, analysisSymbols, rootDir, engineOpts);
  }
  return true;
}

/**
 * Self-heal metadata-only updates (mtime/size) without re-parsing.
 *
 * Rewrites the `file_hashes` row of every entry in `ctx.metadataUpdates` with
 * its current mtime/size (0 when no stat is available). Best-effort: a failure
 * never aborts the build, but it is reported at debug level instead of being
 * dropped silently.
 *
 * @param {import('../context.js').PipelineContext} ctx
 */
function healMetadata(ctx) {
  const { db, metadataUpdates } = ctx;
  if (!metadataUpdates || metadataUpdates.length === 0) return;
  try {
    const healHash = db.prepare(
      'INSERT OR REPLACE INTO file_hashes (file, hash, mtime, size) VALUES (?, ?, ?, ?)',
    );
    const healTx = db.transaction(() => {
      for (const item of metadataUpdates) {
        const mtime = item.stat ? Math.floor(item.stat.mtimeMs) : 0;
        const size = item.stat ? item.stat.size : 0;
        healHash.run(item.relPath, item.hash, mtime, size);
      }
    });
    healTx();
    debug(`Self-healed mtime/size for ${metadataUpdates.length} files`);
  } catch (err) {
    // Still non-fatal, but leave a trace so repeated re-hashing can be diagnosed.
    debug(`Self-heal of mtime/size failed: ${err?.message ?? err}`);
  }
}
/**
 * Probe whether the `embeddings` table exists (the probe query throws when it
 * does not).
 *
 * @param {object} db - Database handle
 * @returns {boolean}
 */
function embeddingsTableExists(db) {
  try {
    db.prepare('SELECT 1 FROM embeddings LIMIT 1').get();
    return true;
  } catch {
    /* table doesn't exist */
    return false;
  }
}

/**
 * Find files that have an edge pointing into any of the changed/removed files.
 * Must run BEFORE those files are purged, because it relies on their existing
 * edges. Files that are themselves in `changedRelPaths`, or that no longer
 * exist on disk, are excluded.
 *
 * @param {object} db - Database handle
 * @param {string} rootDir - Project root used to check file existence
 * @param {Set<string>} changedRelPaths - Relative paths of changed + removed files
 * @returns {Set<string>} Relative paths of the reverse dependencies
 */
function collectReverseDeps(db, rootDir, changedRelPaths) {
  const reverseDeps = new Set();
  if (changedRelPaths.size === 0) return reverseDeps;

  const findReverseDeps = db.prepare(`
    SELECT DISTINCT n_src.file FROM edges e
    JOIN nodes n_src ON e.source_id = n_src.id
    JOIN nodes n_tgt ON e.target_id = n_tgt.id
    WHERE n_tgt.file = ? AND n_src.file != n_tgt.file AND n_src.kind != 'directory'
  `);
  for (const relPath of changedRelPaths) {
    for (const row of findReverseDeps.all(relPath)) {
      if (changedRelPaths.has(row.file) || reverseDeps.has(row.file)) continue;
      if (fs.existsSync(path.join(rootDir, row.file))) {
        reverseDeps.add(row.file);
      }
    }
  }
  return reverseDeps;
}

/**
 * Delete the outgoing edges of each reverse-dependency file and append the
 * file to `ctx.parseChanges`, flagged `_reverseDepOnly`.
 *
 * @param {import('../context.js').PipelineContext} ctx
 * @param {Set<string>} reverseDeps - Relative paths returned by collectReverseDeps
 */
function requeueReverseDeps(ctx, reverseDeps) {
  const deleteOutgoingEdgesForFile = ctx.db.prepare(
    'DELETE FROM edges WHERE source_id IN (SELECT id FROM nodes WHERE file = ?)',
  );
  for (const relPath of reverseDeps) {
    deleteOutgoingEdgesForFile.run(relPath);
  }
  for (const relPath of reverseDeps) {
    ctx.parseChanges.push({
      file: path.join(ctx.rootDir, relPath),
      relPath,
      _reverseDepOnly: true,
    });
  }
}

/**
 * Stage: decide what has to be re-parsed and clear stale graph data.
 *
 * Sets `ctx.hasEmbeddings` and, for non-scoped builds, `ctx.removed`,
 * `ctx.isFullBuild`, `ctx.parseChanges` and `ctx.metadataUpdates`. Sets
 * `ctx.earlyExit` (after closing the database and writing the journal header)
 * when an incremental build finds nothing to parse or remove.
 *
 * @param {import('../context.js').PipelineContext} ctx
 */
export async function detectChanges(ctx) {
  const { db, allFiles, rootDir, incremental, forceFullRebuild, opts } = ctx;
  const toRelPath = (item) => item.relPath || normalizePath(path.relative(rootDir, item.file));

  // ── Scoped build ─────────────────────────────────────────────────
  // collectFiles already set parseChanges/removed; still need to purge the
  // affected files and set hasEmbeddings.
  if (opts.scope) {
    ctx.hasEmbeddings = embeddingsTableExists(db);

    // Reverse-dependency cascade BEFORE purging (needs existing edges to find importers)
    const changePaths = ctx.parseChanges.map(toRelPath);
    const reverseDeps = opts.noReverseDeps
      ? new Set()
      : collectReverseDeps(db, rootDir, new Set([...changePaths, ...ctx.removed]));

    // Now purge changed + removed files
    if (changePaths.length > 0 || ctx.removed.length > 0) {
      purgeFilesFromGraph(db, [...ctx.removed, ...changePaths], { purgeHashes: false });
    }

    // Delete outgoing edges for reverse-dep files and add to parse list
    if (reverseDeps.size > 0) {
      requeueReverseDeps(ctx, reverseDeps);
      info(
        `Scoped rebuild: ${changePaths.length} changed, ${ctx.removed.length} removed, ${reverseDeps.size} reverse-deps`,
      );
    }
    return;
  }

  // ── Change detection ─────────────────────────────────────────────
  const increResult =
    incremental && !forceFullRebuild
      ? getChangedFiles(db, allFiles, rootDir)
      : { changed: allFiles.map((f) => ({ file: f })), removed: [], isFullBuild: true };

  ctx.removed = increResult.removed;
  ctx.isFullBuild = increResult.isFullBuild;
  ctx.parseChanges = increResult.changed.filter((c) => !c.metadataOnly);
  ctx.metadataUpdates = increResult.changed.filter((c) => c.metadataOnly);

  // ── Early exit: no changes detected ──────────────────────────────
  if (!ctx.isFullBuild && ctx.parseChanges.length === 0 && ctx.removed.length === 0) {
    const ranAnalysis = await runPendingAnalysis(ctx);
    if (!ranAnalysis) {
      healMetadata(ctx);
      info('No changes detected. Graph is up to date.');
    }
    closeDb(db);
    writeJournalHeader(rootDir, Date.now());
    ctx.earlyExit = true;
    return;
  }

  const hasEmbeddings = embeddingsTableExists(db);
  ctx.hasEmbeddings = hasEmbeddings;

  // ── Full build: truncate all tables ──────────────────────────────
  if (ctx.isFullBuild) {
    const deletions =
      'PRAGMA foreign_keys = OFF; DELETE FROM cfg_edges; DELETE FROM cfg_blocks; DELETE FROM node_metrics; DELETE FROM edges; DELETE FROM function_complexity; DELETE FROM dataflow; DELETE FROM ast_nodes; DELETE FROM nodes; PRAGMA foreign_keys = ON;';
    // With embeddings present, the embeddings delete is slotted in before
    // foreign keys are switched back on.
    db.exec(
      hasEmbeddings
        ? `${deletions.replace('PRAGMA foreign_keys = ON;', '')} DELETE FROM embeddings; PRAGMA foreign_keys = ON;`
        : deletions,
    );
    return;
  }

  // ── Reverse-dependency cascade (incremental) ─────────────────────
  // Computed before the purge below, which removes the edges it relies on.
  const changePaths = ctx.parseChanges.map(toRelPath);
  const reverseDeps = opts.noReverseDeps
    ? new Set()
    : collectReverseDeps(db, rootDir, new Set([...changePaths, ...ctx.removed]));

  info(
    `Incremental: ${ctx.parseChanges.length} changed, ${ctx.removed.length} removed${reverseDeps.size > 0 ? `, ${reverseDeps.size} reverse-deps` : ''}`,
  );
  if (ctx.parseChanges.length > 0)
    debug(`Changed files: ${ctx.parseChanges.map((c) => c.relPath).join(', ')}`);
  if (ctx.removed.length > 0) debug(`Removed files: ${ctx.removed.join(', ')}`);

  // Purge changed and removed files
  purgeFilesFromGraph(db, [...ctx.removed, ...changePaths], { purgeHashes: false });

  // Delete outgoing edges for reverse-dep files, then add them to parse list
  requeueReverseDeps(ctx, reverseDeps);
}
/**
 * Stage: finalize
 *
 * WASM cleanup, stats logging, drift detection, build metadata, registry, journal.
 */
import fs from 'node:fs';
import path from 'node:path';
import { performance } from 'node:perf_hooks';
import { fileURLToPath } from 'node:url';
import { closeDb, getBuildMeta, setBuildMeta } from '../../db.js';
import { writeJournalHeader } from '../../journal.js';
import { debug, info, warn } from '../../logger.js';

// fileURLToPath decodes percent-escapes (spaces, non-ASCII) and handles Windows
// drive letters; reading URL#pathname directly leaves the path URL-encoded.
const __builderDir = path.dirname(fileURLToPath(import.meta.url));
const CODEGRAPH_VERSION = JSON.parse(
  fs.readFileSync(path.join(__builderDir, '..', '..', '..', 'package.json'), 'utf-8'),
).version;

/**
 * Report whether `candidate` is `parent` itself or lives somewhere below it.
 *
 * Compares path segments rather than raw string prefixes, so `/tmp-foo` is not
 * mistaken for a child of `/tmp`.
 *
 * @param {string} candidate - Resolved absolute path to test.
 * @param {string} parent - Resolved absolute directory path.
 * @returns {boolean}
 */
function isWithinDir(candidate, parent) {
  const rel = path.relative(parent, candidate);
  if (rel === '') return true;
  // path.relative yields an absolute path when the two are on different Windows drives.
  if (path.isAbsolute(rel)) return false;
  return rel !== '..' && !rel.startsWith(`..${path.sep}`);
}

/**
 * Final pipeline stage: releases parse trees, logs graph totals, warns about
 * drift / orphaned embeddings / unused exports, persists build metadata,
 * closes the database, writes the journal header and (optionally) registers
 * the repository. Records its own duration in `ctx.timing.finalizeMs`.
 *
 * @param {import('../context.js').PipelineContext} ctx
 * @returns {Promise<void>}
 */
export async function finalize(ctx) {
  const { db, allSymbols, rootDir, isFullBuild, hasEmbeddings, config, opts, schemaVersion } = ctx;

  const t0 = performance.now();

  // Release cached WASM trees
  for (const [, symbols] of allSymbols) {
    if (symbols._tree && typeof symbols._tree.delete === 'function') {
      try {
        symbols._tree.delete();
      } catch {
        /* best-effort cleanup: a failed delete must not abort the build */
      }
    }
    symbols._tree = null;
    symbols._langId = null;
  }

  const nodeCount = db.prepare('SELECT COUNT(*) as c FROM nodes').get().c;
  const actualEdgeCount = db.prepare('SELECT COUNT(*) as c FROM edges').get().c;
  info(`Graph built: ${nodeCount} nodes, ${actualEdgeCount} edges`);
  info(`Stored in ${ctx.dbPath}`);

  // Incremental drift detection: compare against the counts stored by the previous build
  if (!isFullBuild) {
    const prevNodes = getBuildMeta(db, 'node_count');
    const prevEdges = getBuildMeta(db, 'edge_count');
    if (prevNodes && prevEdges) {
      const prevN = Number(prevNodes);
      const prevE = Number(prevEdges);
      if (prevN > 0) {
        const nodeDrift = Math.abs(nodeCount - prevN) / prevN;
        // Guard against division by zero when the previous graph had no edges
        const edgeDrift = prevE > 0 ? Math.abs(actualEdgeCount - prevE) / prevE : 0;
        const driftThreshold = config.build?.driftThreshold ?? 0.2;
        if (nodeDrift > driftThreshold || edgeDrift > driftThreshold) {
          warn(
            `Incremental build diverged significantly from previous counts (nodes: ${prevN}→${nodeCount} [${(nodeDrift * 100).toFixed(1)}%], edges: ${prevE}→${actualEdgeCount} [${(edgeDrift * 100).toFixed(1)}%], threshold: ${(driftThreshold * 100).toFixed(0)}%). Consider rebuilding with --no-incremental.`,
          );
        }
      }
    }
  }

  // Orphaned embeddings warning
  if (hasEmbeddings) {
    try {
      const orphaned = db
        .prepare('SELECT COUNT(*) as c FROM embeddings WHERE node_id NOT IN (SELECT id FROM nodes)')
        .get().c;
      if (orphaned > 0) {
        warn(
          `${orphaned} embeddings are orphaned (nodes changed). Run "codegraph embed" to refresh.`,
        );
      }
    } catch {
      /* ignore — embeddings table may have been dropped */
    }
  }

  // Unused exports warning
  try {
    const unusedCount = db
      .prepare(
        `SELECT COUNT(*) as c FROM nodes
         WHERE exported = 1 AND kind != 'file'
           AND id NOT IN (
             SELECT DISTINCT e.target_id FROM edges e
             JOIN nodes caller ON e.source_id = caller.id
             JOIN nodes target ON e.target_id = target.id
             WHERE e.kind = 'calls' AND caller.file != target.file
           )`,
      )
      .get().c;
    if (unusedCount > 0) {
      warn(
        `${unusedCount} exported symbol${unusedCount > 1 ? 's have' : ' has'} zero cross-file consumers. Run "codegraph exports --unused" to inspect.`,
      );
    }
  } catch {
    /* exported column may not exist on older DBs */
  }

  // Persist build metadata (a failure here is reported but does not fail the build)
  try {
    setBuildMeta(db, {
      engine: ctx.engineName,
      engine_version: ctx.engineVersion || '',
      codegraph_version: CODEGRAPH_VERSION,
      schema_version: String(schemaVersion),
      built_at: new Date().toISOString(),
      node_count: nodeCount,
      edge_count: actualEdgeCount,
    });
  } catch (err) {
    warn(`Failed to write build metadata: ${err.message}`);
  }

  closeDb(db);

  // Write journal header after successful build
  writeJournalHeader(rootDir, Date.now());

  // Auto-registration (skipped for repositories located under the OS temp directory)
  if (!opts.skipRegistry) {
    const { tmpdir } = await import('node:os');
    const tmpDir = path.resolve(tmpdir());
    const resolvedRoot = path.resolve(rootDir);
    if (isWithinDir(resolvedRoot, tmpDir)) {
      debug(`Skipping auto-registration for temp directory: ${resolvedRoot}`);
    } else {
      try {
        const { registerRepo } = await import('../../registry.js');
        registerRepo(rootDir);
      } catch (err) {
        debug(`Auto-registration failed: ${err.message}`);
      }
    }
  }

  ctx.timing.finalizeMs = performance.now() - t0;
}
/**
 * Stage: insertNodes
 *
 * Writes file, definition, export and child nodes plus the contains/parameter_of
 * edges for every parsed file, and refreshes the file_hashes rows used by
 * incremental builds.
 */
import path from 'node:path';
import { performance } from 'node:perf_hooks';
import { bulkNodeIdsByFile } from '../../db.js';
import {
  batchInsertEdges,
  batchInsertNodes,
  fileHash,
  fileStat,
  readFileSafe,
} from '../helpers.js';

/**
 * Insert all nodes and structural edges for `ctx.allSymbols` inside a single
 * transaction, mirror each entry into `ctx.fileSymbols`, update file hashes,
 * and record the elapsed time in `ctx.timing.insertMs`.
 *
 * @param {import('../context.js').PipelineContext} ctx
 */
export async function insertNodes(ctx) {
  const { db, allSymbols, filesToParse, metadataUpdates, rootDir, removed } = ctx;

  // Entries coming from change detection may already carry hash/stat data.
  const precomputedByPath = new Map(
    filesToParse.filter((entry) => entry.relPath).map((entry) => [entry.relPath, entry]),
  );

  // The hash statement is optional: if it cannot be prepared, hash bookkeeping is skipped.
  let upsertHash;
  try {
    upsertHash = db.prepare(
      'INSERT OR REPLACE INTO file_hashes (file, hash, mtime, size) VALUES (?, ?, ?, ?)',
    );
  } catch {
    upsertHash = null;
  }

  // Identity used to match a parsed symbol with its stored node row.
  const keyOf = ({ name, kind, line }) => `${name}|${kind}|${line}`;

  // Fresh name|kind|line -> id lookup for one file (re-read after every insert batch).
  const loadNodeIds = (relPath) => {
    const ids = new Map();
    for (const row of bulkNodeIdsByFile(db, relPath)) {
      ids.set(keyOf(row), row.id);
    }
    return ids;
  };

  // [mtime, size] columns for a stat result; zeros when the stat is unavailable.
  const statColumns = (stat) => (stat ? [Math.floor(stat.mtimeMs), stat.size] : [0, 0]);

  // Upsert the hash row for one parsed file.
  const recordFileHash = (relPath) => {
    if (!upsertHash) return;
    const known = precomputedByPath.get(relPath);
    // Reverse-dep files were not modified, so their stored hash is left alone.
    if (known?._reverseDepOnly) return;

    const absPath = path.join(rootDir, relPath);
    if (known?.hash) {
      const [mtime, size] = statColumns(known.stat || fileStat(absPath));
      upsertHash.run(relPath, known.hash, mtime, size);
      return;
    }

    let code;
    try {
      code = readFileSafe(absPath);
    } catch {
      code = null;
    }
    if (code === null) return;
    const [mtime, size] = statColumns(fileStat(absPath));
    upsertHash.run(relPath, fileHash(code), mtime, size);
  };

  const insertAll = db.transaction(() => {
    // Step 1: file nodes, definitions and exports in one batch
    const topLevelRows = [];
    for (const [relPath, symbols] of allSymbols) {
      ctx.fileSymbols.set(relPath, symbols);
      topLevelRows.push([relPath, 'file', relPath, 0, null, null]);
      for (const def of symbols.definitions) {
        topLevelRows.push([def.name, def.kind, relPath, def.line, def.endLine || null, null]);
      }
      for (const exp of symbols.exports) {
        topLevelRows.push([exp.name, exp.kind, relPath, exp.line, null, null]);
      }
    }
    batchInsertNodes(db, topLevelRows);

    // Step 2: flag exported symbols
    const markExported = db.prepare(
      'UPDATE nodes SET exported = 1 WHERE name = ? AND kind = ? AND file = ? AND line = ?',
    );
    for (const [relPath, symbols] of allSymbols) {
      for (const exp of symbols.exports) {
        markExported.run(exp.name, exp.kind, relPath, exp.line);
      }
    }

    // Step 3: child nodes — inserted after step 1 so the parent ids can be looked up
    const childRows = [];
    for (const [relPath, symbols] of allSymbols) {
      const ids = loadNodeIds(relPath);
      for (const def of symbols.definitions) {
        const parentId = def.children?.length ? ids.get(keyOf(def)) : undefined;
        if (!parentId) continue;
        for (const child of def.children) {
          childRows.push([
            child.name,
            child.kind,
            relPath,
            child.line,
            child.endLine || null,
            parentId,
          ]);
        }
      }
    }
    batchInsertNodes(db, childRows);

    // Step 4: contains / parameter_of edges, then the file's hash row
    const edgeRows = [];
    for (const [relPath, symbols] of allSymbols) {
      const ids = loadNodeIds(relPath);
      const fileId = ids.get(`${relPath}|file|0`);
      for (const def of symbols.definitions) {
        const defId = ids.get(keyOf(def));
        if (!defId) continue;
        if (fileId) {
          edgeRows.push([fileId, defId, 'contains', 1.0, 0]);
        }
        for (const child of def.children ?? []) {
          const childId = ids.get(keyOf(child));
          if (!childId) continue;
          edgeRows.push([defId, childId, 'contains', 1.0, 0]);
          if (child.kind === 'parameter') {
            edgeRows.push([childId, defId, 'parameter_of', 1.0, 0]);
          }
        }
      }
      recordFileHash(relPath);
    }
    batchInsertEdges(db, edgeRows);

    // Metadata-only entries: refresh mtime/size without re-parsing the file
    if (upsertHash) {
      for (const item of metadataUpdates) {
        const [mtime, size] = statColumns(item.stat);
        upsertHash.run(item.relPath, item.hash, mtime, size);
      }
    }
  });

  const t0 = performance.now();
  insertAll();
  ctx.timing.insertMs = performance.now() - t0;

  // Drop hash rows of files that no longer exist
  if (upsertHash && removed.length > 0) {
    const deleteHash = db.prepare('DELETE FROM file_hashes WHERE file = ?');
    for (const relPath of removed) {
      deleteHash.run(relPath);
    }
  }
}
/**
 * Stage: resolveImports
 *
 * Resolves every import of the parsed files in one batch and builds the
 * re-export map used for barrel resolution. Incremental builds additionally
 * parse barrel files that were not part of the change set.
 */
import path from 'node:path';
import { performance } from 'node:perf_hooks';
import { parseFilesAuto } from '../../parser.js';
import { resolveImportPath, resolveImportsBatch } from '../../resolve.js';

/**
 * Store the re-export entries of one file in `ctx.reexportMap`.
 * Files without re-exporting imports leave the map untouched.
 */
function recordReexports(ctx, relPath, absFile, symbols) {
  const reexports = symbols.imports.filter((imp) => imp.reexport);
  if (reexports.length === 0) return;
  ctx.reexportMap.set(
    relPath,
    reexports.map((imp) => ({
      source: getResolved(ctx, absFile, imp.source),
      names: imp.names,
      wildcardReexport: imp.wildcardReexport || false,
    })),
  );
}

/**
 * Populates `ctx.batchResolved`, `ctx.reexportMap`, `ctx.barrelOnlyFiles` and
 * `ctx.timing.resolveMs`. On incremental builds, barrel files found in the
 * database but absent from `ctx.fileSymbols` are parsed and added to it.
 *
 * @param {import('../context.js').PipelineContext} ctx
 */
export async function resolveImports(ctx) {
  const { db, fileSymbols, rootDir, aliases, allFiles, isFullBuild, engineOpts } = ctx;

  // One (fromFile, importSource) pair per import, resolved in a single batch call
  const t0 = performance.now();
  const batchInputs = [...fileSymbols].flatMap(([relPath, symbols]) => {
    const fromFile = path.join(rootDir, relPath);
    return symbols.imports.map((imp) => ({ fromFile, importSource: imp.source }));
  });
  ctx.batchResolved = resolveImportsBatch(batchInputs, rootDir, aliases, allFiles);
  ctx.timing.resolveMs = performance.now() - t0;

  // Re-export map for the files parsed in this build
  ctx.reexportMap = new Map();
  for (const [relPath, symbols] of fileSymbols) {
    recordReexports(ctx, relPath, path.join(rootDir, relPath), symbols);
  }

  ctx.barrelOnlyFiles = new Set();
  if (isFullBuild) return;

  // Incremental build: pull in barrel files that already have 'reexports' edges
  // in the database but were not parsed this run.
  const barrelCandidates = db
    .prepare(
      `SELECT DISTINCT n1.file FROM edges e
       JOIN nodes n1 ON e.source_id = n1.id
       WHERE e.kind = 'reexports' AND n1.kind = 'file'`,
    )
    .all();
  for (const { file: relPath } of barrelCandidates) {
    if (fileSymbols.has(relPath)) continue;
    const absPath = path.join(rootDir, relPath);
    try {
      const parsed = await parseFilesAuto([absPath], rootDir, engineOpts);
      const barrelSymbols = parsed.get(relPath);
      if (!barrelSymbols) continue;
      fileSymbols.set(relPath, barrelSymbols);
      ctx.barrelOnlyFiles.add(relPath);
      recordReexports(ctx, relPath, absPath, barrelSymbols);
    } catch {
      /* skip if unreadable */
    }
  }
}

/**
 * Resolve an import source, preferring the batch results stored on the context
 * and falling back to a single-path resolution on a miss.
 * Exported so other stages (build-edges) can reuse it.
 *
 * @param {import('../context.js').PipelineContext} ctx
 * @param {string} absFile - Absolute path of the importing file.
 * @param {string} importSource - Import specifier as written in the source.
 */
export function getResolved(ctx, absFile, importSource) {
  const batch = ctx.batchResolved;
  const cached = batch ? batch.get(`${absFile}|${importSource}`) : undefined;
  if (cached !== undefined) return cached;
  return resolveImportPath(absFile, importSource, ctx.rootDir, ctx.aliases);
}
/**
 * Check if a file is a barrel (re-export hub): it must have at least one
 * re-exporting import, and at least as many of those as own definitions.
 *
 * @param {{ fileSymbols: Map<string, object> }} ctx
 * @param {string} relPath - Key into `ctx.fileSymbols`.
 * @returns {boolean} false when the file is unknown to `ctx.fileSymbols`.
 */
export function isBarrelFile(ctx, relPath) {
  const fileSym = ctx.fileSymbols.get(relPath);
  if (!fileSym) return false;
  const reexportCount = fileSym.imports.reduce(
    (count, imp) => (imp.reexport ? count + 1 : count),
    0,
  );
  return reexportCount > 0 && reexportCount >= fileSym.definitions.length;
}

/**
 * Resolve a symbol through barrel re-export chains.
 *
 * Walks the re-export entries of `barrelPath` in order. A named re-export that
 * lists the symbol always yields a result (the defining file when it can be
 * found, otherwise the re-export's own source); a wildcard re-export yields a
 * result only when the symbol is actually found behind it.
 *
 * @param {{ fileSymbols: Map<string, object>, reexportMap: Map<string, object[]> }} ctx
 * @param {string} barrelPath - Key into `ctx.reexportMap`.
 * @param {string} symbolName - Name of the symbol being looked up.
 * @param {Set<string>} [visited] - Barrels already walked; guards against cycles.
 * @returns {string|null} Path of the resolved file, or null when nothing matches.
 */
export function resolveBarrelExport(ctx, barrelPath, symbolName, visited = new Set()) {
  if (visited.has(barrelPath)) return null;
  visited.add(barrelPath);

  const entries = ctx.reexportMap.get(barrelPath);
  if (!entries) return null;

  // Search the re-export target itself, then whatever it re-exports in turn.
  const searchTarget = (target) => {
    const targetSymbols = ctx.fileSymbols.get(target);
    if (!targetSymbols) return null;
    if (targetSymbols.definitions.some((def) => def.name === symbolName)) return target;
    const deeper = resolveBarrelExport(ctx, target, symbolName, visited);
    return deeper ? deeper : null;
  };

  for (const { source, names, wildcardReexport } of entries) {
    const isNamed = names.length > 0 && !wildcardReexport;
    if (isNamed && !names.includes(symbolName)) continue;

    const found = searchTarget(source);
    if (found !== null) return found;
    // A named re-export that lists the symbol falls back to its direct source.
    if (isNamed) return source;
  }
  return null;
}
/**
 * Stage: runAnalyses
 *
 * Hands the parsed symbols to the unified AST analysis engine
 * (AST nodes, complexity, CFG, dataflow). On incremental builds, files that
 * were re-parsed only as reverse dependencies are left out.
 */
import { debug } from '../../logger.js';

// Timing fields copied from the engine result onto ctx.timing.
const ANALYSIS_TIMING_KEYS = ['astMs', 'complexityMs', 'cfgMs', 'dataflowMs'];

/**
 * Runs the analysis engine and copies its timings onto `ctx.timing`.
 * An engine failure is logged at debug level and does not propagate.
 *
 * @param {import('../context.js').PipelineContext} ctx
 */
export async function runAnalyses(ctx) {
  const { db, allSymbols, rootDir, opts, engineOpts, isFullBuild, filesToParse } = ctx;

  // Full builds analyse everything; incremental builds drop reverse-dep-only files.
  let analysisTargets = allSymbols;
  if (!isFullBuild) {
    const reverseDepFiles = new Set();
    for (const item of filesToParse) {
      if (item._reverseDepOnly) reverseDepFiles.add(item.relPath);
    }
    if (reverseDepFiles.size > 0) {
      analysisTargets = new Map(
        [...allSymbols].filter(([relPath]) => !reverseDepFiles.has(relPath)),
      );
      debug(
        `AST/complexity/CFG/dataflow: processing ${analysisTargets.size} changed files (skipping ${reverseDepFiles.size} reverse-deps)`,
      );
    }
  }

  // Loaded lazily; a failing import propagates to the caller (it is outside the try).
  const { runAnalyses: runAnalysesFn } = await import('../../ast-analysis/engine.js');
  try {
    const analysisTiming = await runAnalysesFn(db, analysisTargets, rootDir, opts, engineOpts);
    for (const key of ANALYSIS_TIMING_KEYS) {
      ctx.timing[key] = analysisTiming[key];
    }
  } catch (err) {
    debug(`Unified analysis engine failed: ${err.message}`);
  }
}
'./logger.js'; -import { createParseTreeCache, getActiveEngine, parseFileIncremental } from './parser.js'; -import { resolveImportPath } from './resolve.js'; +import { info } from './logger.js'; +import { createParseTreeCache, getActiveEngine } from './parser.js'; function shouldIgnore(filePath) { const parts = filePath.split(path.sep); @@ -19,147 +18,6 @@ function isTrackedExt(filePath) { return EXTENSIONS.has(path.extname(filePath)); } -/** - * Parse a single file and update the database incrementally. - */ -async function updateFile(_db, rootDir, filePath, stmts, engineOpts, cache) { - const relPath = normalizePath(path.relative(rootDir, filePath)); - - const oldNodes = stmts.countNodes.get(relPath)?.c || 0; - const _oldEdges = stmts.countEdgesForFile.get(relPath)?.c || 0; - const oldSymbols = stmts.listSymbols.all(relPath); - - stmts.deleteEdgesForFile.run(relPath); - stmts.deleteNodes.run(relPath); - - if (!fs.existsSync(filePath)) { - if (cache) cache.remove(filePath); - const symbolDiff = diffSymbols(oldSymbols, []); - return { - file: relPath, - nodesAdded: 0, - nodesRemoved: oldNodes, - edgesAdded: 0, - deleted: true, - event: 'deleted', - symbolDiff, - nodesBefore: oldNodes, - nodesAfter: 0, - }; - } - - let code; - try { - code = readFileSafe(filePath); - } catch (err) { - warn(`Cannot read ${relPath}: ${err.message}`); - return null; - } - - const symbols = await parseFileIncremental(cache, filePath, code, engineOpts); - if (!symbols) return null; - - stmts.insertNode.run(relPath, 'file', relPath, 0, null); - - for (const def of symbols.definitions) { - stmts.insertNode.run(def.name, def.kind, relPath, def.line, def.endLine || null); - } - for (const exp of symbols.exports) { - stmts.insertNode.run(exp.name, exp.kind, relPath, exp.line, null); - } - - const newNodes = stmts.countNodes.get(relPath)?.c || 0; - const newSymbols = stmts.listSymbols.all(relPath); - - let edgesAdded = 0; - const fileNodeRow = stmts.getNodeId.get(relPath, 'file', relPath, 0); 
- if (!fileNodeRow) - return { file: relPath, nodesAdded: newNodes, nodesRemoved: oldNodes, edgesAdded: 0 }; - const fileNodeId = fileNodeRow.id; - - // Load aliases for full import resolution - const aliases = { baseUrl: null, paths: {} }; - - for (const imp of symbols.imports) { - const resolvedPath = resolveImportPath( - path.join(rootDir, relPath), - imp.source, - rootDir, - aliases, - ); - const targetRow = stmts.getNodeId.get(resolvedPath, 'file', resolvedPath, 0); - if (targetRow) { - const edgeKind = imp.reexport ? 'reexports' : imp.typeOnly ? 'imports-type' : 'imports'; - stmts.insertEdge.run(fileNodeId, targetRow.id, edgeKind, 1.0, 0); - edgesAdded++; - } - } - - const importedNames = new Map(); - for (const imp of symbols.imports) { - const resolvedPath = resolveImportPath( - path.join(rootDir, relPath), - imp.source, - rootDir, - aliases, - ); - for (const name of imp.names) { - importedNames.set(name.replace(/^\*\s+as\s+/, ''), resolvedPath); - } - } - - for (const call of symbols.calls) { - let caller = null; - for (const def of symbols.definitions) { - if (def.line <= call.line) { - const row = stmts.getNodeId.get(def.name, def.kind, relPath, def.line); - if (row) caller = row; - } - } - if (!caller) caller = fileNodeRow; - - const importedFrom = importedNames.get(call.name); - let targets; - if (importedFrom) { - targets = stmts.findNodeInFile.all(call.name, importedFrom); - } - if (!targets || targets.length === 0) { - targets = stmts.findNodeInFile.all(call.name, relPath); - if (targets.length === 0) { - targets = stmts.findNodeByName.all(call.name); - } - } - - for (const t of targets) { - if (t.id !== caller.id) { - stmts.insertEdge.run( - caller.id, - t.id, - 'calls', - importedFrom ? 1.0 : 0.5, - call.dynamic ? 1 : 0, - ); - edgesAdded++; - } - } - } - - const symbolDiff = diffSymbols(oldSymbols, newSymbols); - const event = oldNodes === 0 ? 
'added' : 'modified'; - - return { - file: relPath, - nodesAdded: newNodes, - nodesRemoved: oldNodes, - edgesAdded, - deleted: false, - event, - symbolDiff, - nodesBefore: oldNodes, - nodesAfter: newNodes, - }; -} - export async function watchProject(rootDir, opts = {}) { const dbPath = path.join(rootDir, '.codegraph', 'graph.db'); if (!fs.existsSync(dbPath)) { @@ -227,7 +85,9 @@ export async function watchProject(rootDir, opts = {}) { const results = []; for (const filePath of files) { - const result = await updateFile(db, rootDir, filePath, stmts, engineOpts, cache); + const result = await rebuildFile(db, rootDir, filePath, stmts, engineOpts, cache, { + diffSymbols, + }); if (result) results.push(result); } const updates = results; diff --git a/tests/builder/collect-files.test.js b/tests/builder/collect-files.test.js new file mode 100644 index 0000000..709ef35 --- /dev/null +++ b/tests/builder/collect-files.test.js @@ -0,0 +1,70 @@ +/** + * Unit tests for collectFiles pipeline stage. + */ +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; +import { PipelineContext } from '../../src/builder/context.js'; +import { collectFiles } from '../../src/builder/stages/collect-files.js'; + +let tmpDir; + +beforeAll(() => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-stage-collect-')); + fs.mkdirSync(path.join(tmpDir, 'src')); + fs.writeFileSync(path.join(tmpDir, 'src', 'a.js'), 'export const a = 1;'); + fs.writeFileSync(path.join(tmpDir, 'src', 'b.ts'), 'export const b = 2;'); + fs.writeFileSync(path.join(tmpDir, 'src', 'style.css'), 'body {}'); +}); + +afterAll(() => { + if (tmpDir) fs.rmSync(tmpDir, { recursive: true, force: true }); +}); + +describe('collectFiles stage', () => { + it('populates ctx.allFiles and ctx.discoveredDirs', async () => { + const ctx = new PipelineContext(); + ctx.rootDir = tmpDir; + ctx.config = {}; + ctx.opts = {}; + + await 
collectFiles(ctx); + + expect(ctx.allFiles.length).toBe(2); // a.js + b.ts, not style.css + const basenames = ctx.allFiles.map((f) => path.basename(f)); + expect(basenames).toContain('a.js'); + expect(basenames).toContain('b.ts'); + expect(basenames).not.toContain('style.css'); + expect(ctx.discoveredDirs).toBeInstanceOf(Set); + expect(ctx.discoveredDirs.size).toBeGreaterThan(0); + }); + + it('handles scoped rebuild', async () => { + const ctx = new PipelineContext(); + ctx.rootDir = tmpDir; + ctx.config = {}; + ctx.opts = { scope: ['src/a.js'] }; + + await collectFiles(ctx); + + expect(ctx.allFiles).toHaveLength(1); + expect(ctx.isFullBuild).toBe(false); + expect(ctx.parseChanges).toHaveLength(1); + expect(ctx.parseChanges[0].relPath).toBe('src/a.js'); + expect(ctx.removed).toHaveLength(0); + }); + + it('scoped rebuild with missing file marks it as removed', async () => { + const ctx = new PipelineContext(); + ctx.rootDir = tmpDir; + ctx.config = {}; + ctx.opts = { scope: ['nonexistent.js'] }; + + await collectFiles(ctx); + + expect(ctx.allFiles).toHaveLength(0); + expect(ctx.parseChanges).toHaveLength(0); + expect(ctx.removed).toContain('nonexistent.js'); + }); +}); diff --git a/tests/builder/context.test.js b/tests/builder/context.test.js new file mode 100644 index 0000000..d691ab5 --- /dev/null +++ b/tests/builder/context.test.js @@ -0,0 +1,42 @@ +/** + * Unit tests for PipelineContext. 
+ */ +import { describe, expect, it } from 'vitest'; +import { PipelineContext } from '../../src/builder/context.js'; + +describe('PipelineContext', () => { + it('creates an instance with default values', () => { + const ctx = new PipelineContext(); + expect(ctx.earlyExit).toBe(false); + expect(ctx.forceFullRebuild).toBe(false); + expect(ctx.hasEmbeddings).toBe(false); + expect(ctx.timing).toEqual({}); + }); + + it('allows setting all stage fields', () => { + const ctx = new PipelineContext(); + ctx.rootDir = '/tmp/test'; + ctx.allFiles = ['/tmp/test/a.js']; + ctx.parseChanges = []; + ctx.allSymbols = new Map(); + ctx.fileSymbols = new Map(); + ctx.reexportMap = new Map(); + ctx.barrelOnlyFiles = new Set(); + ctx.nodesByName = new Map(); + ctx.nodesByNameAndFile = new Map(); + + expect(ctx.rootDir).toBe('/tmp/test'); + expect(ctx.allFiles).toHaveLength(1); + expect(ctx.parseChanges).toHaveLength(0); + expect(ctx.allSymbols).toBeInstanceOf(Map); + expect(ctx.fileSymbols).toBeInstanceOf(Map); + }); + + it('timing accumulates across stages', () => { + const ctx = new PipelineContext(); + ctx.timing.parseMs = 10; + ctx.timing.insertMs = 20; + ctx.timing.edgesMs = 30; + expect(ctx.timing).toEqual({ parseMs: 10, insertMs: 20, edgesMs: 30 }); + }); +}); diff --git a/tests/builder/detect-changes.test.js b/tests/builder/detect-changes.test.js new file mode 100644 index 0000000..ec0734d --- /dev/null +++ b/tests/builder/detect-changes.test.js @@ -0,0 +1,144 @@ +/** + * Unit tests for detectChanges pipeline stage. 
+ */ +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; +import { PipelineContext } from '../../src/builder/context.js'; +import { detectChanges } from '../../src/builder/stages/detect-changes.js'; +import { closeDb, initSchema, openDb } from '../../src/db.js'; +import { writeJournalHeader } from '../../src/journal.js'; + +let tmpDir; + +beforeAll(() => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-stage-detect-')); + fs.writeFileSync(path.join(tmpDir, 'a.js'), 'export const a = 1;'); +}); + +afterAll(() => { + if (tmpDir) fs.rmSync(tmpDir, { recursive: true, force: true }); +}); + +describe('detectChanges stage', () => { + it('treats all files as changed when file_hashes is empty', async () => { + const dbDir = path.join(tmpDir, '.codegraph'); + fs.mkdirSync(dbDir, { recursive: true }); + const db = openDb(path.join(dbDir, 'graph.db')); + initSchema(db); + + const ctx = new PipelineContext(); + ctx.rootDir = tmpDir; + ctx.db = db; + ctx.allFiles = [path.join(tmpDir, 'a.js')]; + ctx.opts = {}; + ctx.incremental = true; + ctx.forceFullRebuild = false; + ctx.config = {}; + + await detectChanges(ctx); + + // Empty file_hashes = all files are new (incremental, not full build) + expect(ctx.isFullBuild).toBe(false); + expect(ctx.earlyExit).toBe(false); + expect(ctx.parseChanges.length).toBe(1); + closeDb(db); + }); + + it('detects early exit when no changes after initial build', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-stage-nochange-')); + const dbDir = path.join(dir, '.codegraph'); + fs.mkdirSync(dbDir, { recursive: true }); + fs.writeFileSync(path.join(dir, 'a.js'), 'export const a = 1;'); + + const db = openDb(path.join(dbDir, 'graph.db')); + initSchema(db); + + // Seed file_hashes so incremental thinks file is unchanged + const content = fs.readFileSync(path.join(dir, 'a.js'), 'utf-8'); + const { createHash } = 
await import('node:crypto'); + const hash = createHash('md5').update(content).digest('hex'); + const stat = fs.statSync(path.join(dir, 'a.js')); + db.prepare('INSERT INTO file_hashes (file, hash, mtime, size) VALUES (?, ?, ?, ?)').run( + 'a.js', + hash, + Math.floor(stat.mtimeMs), + stat.size, + ); + + // Write journal header so journal check doesn't confuse things + writeJournalHeader(dir, Date.now()); + + const ctx = new PipelineContext(); + ctx.rootDir = dir; + ctx.db = db; + ctx.allFiles = [path.join(dir, 'a.js')]; + ctx.opts = {}; + ctx.incremental = true; + ctx.forceFullRebuild = false; + ctx.config = {}; + + await detectChanges(ctx); + + expect(ctx.earlyExit).toBe(true); + // DB should be closed by detectChanges on early exit + fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('skips change detection for scoped builds', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-stage-scope-')); + const dbDir = path.join(dir, '.codegraph'); + fs.mkdirSync(dbDir, { recursive: true }); + fs.writeFileSync(path.join(dir, 'a.js'), 'export const a = 1;'); + + const db = openDb(path.join(dbDir, 'graph.db')); + initSchema(db); + + const ctx = new PipelineContext(); + ctx.rootDir = dir; + ctx.db = db; + ctx.allFiles = [path.join(dir, 'a.js')]; + ctx.opts = { scope: ['a.js'] }; + ctx.incremental = true; + ctx.forceFullRebuild = false; + ctx.config = {}; + ctx.parseChanges = [{ file: path.join(dir, 'a.js'), relPath: 'a.js' }]; + ctx.removed = []; + ctx.isFullBuild = false; + + await detectChanges(ctx); + + // Should return without modifying isFullBuild + expect(ctx.isFullBuild).toBe(false); + expect(ctx.earlyExit).toBe(false); + closeDb(db); + fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('forces full rebuild when forceFullRebuild is set', async () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-stage-force-')); + const dbDir = path.join(dir, '.codegraph'); + fs.mkdirSync(dbDir, { recursive: true }); + 
fs.writeFileSync(path.join(dir, 'a.js'), 'export const a = 1;'); + + const db = openDb(path.join(dbDir, 'graph.db')); + initSchema(db); + + const ctx = new PipelineContext(); + ctx.rootDir = dir; + ctx.db = db; + ctx.allFiles = [path.join(dir, 'a.js')]; + ctx.opts = {}; + ctx.incremental = true; + ctx.forceFullRebuild = true; + ctx.config = {}; + + await detectChanges(ctx); + + expect(ctx.isFullBuild).toBe(true); + expect(ctx.parseChanges.length).toBe(1); + closeDb(db); + fs.rmSync(dir, { recursive: true, force: true }); + }); +}); diff --git a/tests/builder/pipeline.test.js b/tests/builder/pipeline.test.js new file mode 100644 index 0000000..d6b8b76 --- /dev/null +++ b/tests/builder/pipeline.test.js @@ -0,0 +1,79 @@ +/** + * Unit tests for the pipeline orchestrator. + * + * Verifies that buildGraph from the new pipeline produces the same results + * as the integration tests expect — correct return shape, phase timing, etc. + */ +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; +import { buildGraph } from '../../src/builder/pipeline.js'; + +const FIXTURE_DIR = path.join(import.meta.dirname, '..', 'fixtures', 'sample-project'); +let tmpDir; + +beforeAll(() => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-pipeline-')); + for (const file of fs.readdirSync(FIXTURE_DIR)) { + fs.copyFileSync(path.join(FIXTURE_DIR, file), path.join(tmpDir, file)); + } +}); + +afterAll(() => { + if (tmpDir) fs.rmSync(tmpDir, { recursive: true, force: true }); +}); + +describe('buildGraph pipeline', () => { + it('returns phases timing object', async () => { + const result = await buildGraph(tmpDir, { incremental: false }); + expect(result).toBeDefined(); + expect(result.phases).toBeDefined(); + expect(typeof result.phases.setupMs).toBe('number'); + expect(typeof result.phases.parseMs).toBe('number'); + expect(typeof result.phases.insertMs).toBe('number'); + expect(typeof 
result.phases.resolveMs).toBe('number'); + expect(typeof result.phases.edgesMs).toBe('number'); + expect(typeof result.phases.structureMs).toBe('number'); + expect(typeof result.phases.rolesMs).toBe('number'); + expect(typeof result.phases.finalizeMs).toBe('number'); + }); + + it('returns undefined on early exit (no changes)', async () => { + // First build + await buildGraph(tmpDir, { incremental: false }); + // Second build — incremental, no changes + const result = await buildGraph(tmpDir, { incremental: true }); + expect(result).toBeUndefined(); + }); + + it('creates expected nodes and edges', async () => { + await buildGraph(tmpDir, { incremental: false }); + + const Database = (await import('better-sqlite3')).default; + const db = new Database(path.join(tmpDir, '.codegraph', 'graph.db'), { readonly: true }); + + const nodeCount = db.prepare('SELECT COUNT(*) as c FROM nodes').get().c; + const edgeCount = db.prepare('SELECT COUNT(*) as c FROM edges').get().c; + + expect(nodeCount).toBeGreaterThan(0); + expect(edgeCount).toBeGreaterThan(0); + + // Should have file nodes for all 3 fixture files + const fileNodes = db + .prepare("SELECT name FROM nodes WHERE kind = 'file'") + .all() + .map((r) => r.name); + expect(fileNodes).toContain('math.js'); + expect(fileNodes).toContain('utils.js'); + expect(fileNodes).toContain('index.js'); + + db.close(); + }); + + it('exports from barrel are identical to direct import', async () => { + // Verify the barrel re-export works + const { buildGraph: fromBarrel } = await import('../../src/builder.js'); + expect(fromBarrel).toBe(buildGraph); + }); +}); From d270073b721d28a0d327a5a97b87e247fd03652c Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Fri, 13 Mar 2026 02:30:24 -0600 Subject: [PATCH 2/5] fix: address review feedback on helpers.js and incremental.js - Restore symlink loop warning and move detection before readdirSync to avoid wasted I/O (greptile review) - Replace dynamic fs 
import with static top-level import in incremental.js to avoid per-call async overhead in watch mode (greptile review) Impact: 2 functions changed, 0 affected --- src/builder/helpers.js | 23 +++++++++++++---------- src/builder/incremental.js | 3 +-- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/builder/helpers.js b/src/builder/helpers.js index 5a90219..f2dd9d1 100644 --- a/src/builder/helpers.js +++ b/src/builder/helpers.js @@ -55,27 +55,30 @@ export function collectFiles( const trackDirs = directories instanceof Set; let hasFiles = false; - let entries; - try { - entries = fs.readdirSync(dir, { withFileTypes: true }); - } catch (err) { - warn(`Cannot read directory ${dir}: ${err.message}`); - return trackDirs ? { files, directories } : files; - } - // Merge config ignoreDirs with defaults const extraIgnore = config.ignoreDirs ? new Set(config.ignoreDirs) : null; - // Detect symlink loops + // Detect symlink loops (before I/O to avoid wasted readdirSync) let realDir; try { realDir = fs.realpathSync(dir); } catch { return trackDirs ? { files, directories } : files; } - if (_visited.has(realDir)) return trackDirs ? { files, directories } : files; + if (_visited.has(realDir)) { + warn(`Symlink loop detected, skipping: ${dir}`); + return trackDirs ? { files, directories } : files; + } _visited.add(realDir); + let entries; + try { + entries = fs.readdirSync(dir, { withFileTypes: true }); + } catch (err) { + warn(`Cannot read directory ${dir}: ${err.message}`); + return trackDirs ? { files, directories } : files; + } + for (const entry of entries) { if (entry.name.startsWith('.') && entry.name !== '.') { if (IGNORE_DIRS.has(entry.name)) continue; diff --git a/src/builder/incremental.js b/src/builder/incremental.js index 25a1e5e..8081b42 100644 --- a/src/builder/incremental.js +++ b/src/builder/incremental.js @@ -4,6 +4,7 @@ * Reuses pipeline helpers instead of duplicating node insertion and edge building * logic from the main builder. 
This eliminates the watcher.js divergence (ROADMAP 3.9). */ +import fs from 'node:fs'; import path from 'node:path'; import { normalizePath } from '../constants.js'; import { warn } from '../logger.js'; @@ -27,8 +28,6 @@ import { BUILTIN_RECEIVERS, readFileSafe } from './helpers.js'; export async function rebuildFile(db, rootDir, filePath, stmts, engineOpts, cache, options = {}) { const { diffSymbols } = options; const relPath = normalizePath(path.relative(rootDir, filePath)); - const fs = await import('node:fs'); - const oldNodes = stmts.countNodes.get(relPath)?.c || 0; const oldSymbols = diffSymbols ? stmts.listSymbols.all(relPath) : []; From d7355e4602fac9806671495ea8fc0fda69dc15fe Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Fri, 13 Mar 2026 02:49:11 -0600 Subject: [PATCH 3/5] fix: guard pipeline against DB leak on stage errors and undefined timings Wrap pipeline stages in try-catch to close the DB if a stage throws between parseFiles and finalize. Add ?? 0 fallback for all timing properties that may be undefined if a stage didn't run. 
Impact: 1 functions changed, 0 affected --- src/builder/pipeline.js | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/src/builder/pipeline.js b/src/builder/pipeline.js index 53f419d..e0ea3b4 100644 --- a/src/builder/pipeline.js +++ b/src/builder/pipeline.js @@ -7,7 +7,7 @@ import path from 'node:path'; import { performance } from 'node:perf_hooks'; import { loadConfig } from '../config.js'; -import { getBuildMeta, initSchema, MIGRATIONS, openDb } from '../db.js'; +import { closeDb, getBuildMeta, initSchema, MIGRATIONS, openDb } from '../db.js'; import { info } from '../logger.js'; import { getActiveEngine } from '../parser.js'; import { PipelineContext } from './context.js'; @@ -98,28 +98,33 @@ export async function buildGraph(rootDir, opts = {}) { if (ctx.earlyExit) return; - await parseFiles(ctx); - await insertNodes(ctx); - await resolveImports(ctx); - await buildEdges(ctx); - await buildStructure(ctx); - await runAnalyses(ctx); - await finalize(ctx); + try { + await parseFiles(ctx); + await insertNodes(ctx); + await resolveImports(ctx); + await buildEdges(ctx); + await buildStructure(ctx); + await runAnalyses(ctx); + await finalize(ctx); + } catch (err) { + closeDb(ctx.db); + throw err; + } return { phases: { setupMs: +ctx.timing.setupMs.toFixed(1), - parseMs: +ctx.timing.parseMs.toFixed(1), - insertMs: +ctx.timing.insertMs.toFixed(1), - resolveMs: +ctx.timing.resolveMs.toFixed(1), - edgesMs: +ctx.timing.edgesMs.toFixed(1), - structureMs: +ctx.timing.structureMs.toFixed(1), - rolesMs: +ctx.timing.rolesMs.toFixed(1), - astMs: +ctx.timing.astMs.toFixed(1), - complexityMs: +ctx.timing.complexityMs.toFixed(1), + parseMs: +(ctx.timing.parseMs ?? 0).toFixed(1), + insertMs: +(ctx.timing.insertMs ?? 0).toFixed(1), + resolveMs: +(ctx.timing.resolveMs ?? 0).toFixed(1), + edgesMs: +(ctx.timing.edgesMs ?? 0).toFixed(1), + structureMs: +(ctx.timing.structureMs ?? 0).toFixed(1), + rolesMs: +(ctx.timing.rolesMs ?? 
0).toFixed(1), + astMs: +(ctx.timing.astMs ?? 0).toFixed(1), + complexityMs: +(ctx.timing.complexityMs ?? 0).toFixed(1), ...(ctx.timing.cfgMs != null && { cfgMs: +ctx.timing.cfgMs.toFixed(1) }), ...(ctx.timing.dataflowMs != null && { dataflowMs: +ctx.timing.dataflowMs.toFixed(1) }), - finalizeMs: +ctx.timing.finalizeMs.toFixed(1), + finalizeMs: +(ctx.timing.finalizeMs ?? 0).toFixed(1), }, }; } From d229c873d149da2540bcac5397e1725401983805 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Fri, 13 Mar 2026 03:20:56 -0600 Subject: [PATCH 4/5] fix: extend DB leak guard to all stages, hoist fileSymbols out of transaction - Extend try-catch in pipeline.js to cover collectFiles and detectChanges (which run SQLite queries that can throw), with a guard to avoid double-closing on the early-exit path. - Move ctx.fileSymbols population before the db.transaction() in insert-nodes.js so a rollback doesn't leave partial JS state. Impact: 2 functions changed, 1 affected --- src/builder/pipeline.js | 10 +++++----- src/builder/stages/insert-nodes.js | 8 +++++++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/builder/pipeline.js b/src/builder/pipeline.js index e0ea3b4..3fab4ec 100644 --- a/src/builder/pipeline.js +++ b/src/builder/pipeline.js @@ -93,12 +93,12 @@ export async function buildGraph(rootDir, opts = {}) { ctx.timing.setupMs = performance.now() - ctx.buildStart; // ── Pipeline stages ─────────────────────────────────────────────── - await collectFiles(ctx); - await detectChanges(ctx); + try { + await collectFiles(ctx); + await detectChanges(ctx); - if (ctx.earlyExit) return; + if (ctx.earlyExit) return; - try { await parseFiles(ctx); await insertNodes(ctx); await resolveImports(ctx); @@ -107,7 +107,7 @@ export async function buildGraph(rootDir, opts = {}) { await runAnalyses(ctx); await finalize(ctx); } catch (err) { - closeDb(ctx.db); + if (!ctx.earlyExit) closeDb(ctx.db); throw err; } diff --git 
a/src/builder/stages/insert-nodes.js b/src/builder/stages/insert-nodes.js index cafd034..5007603 100644 --- a/src/builder/stages/insert-nodes.js +++ b/src/builder/stages/insert-nodes.js @@ -41,11 +41,17 @@ export async function insertNodes(ctx) { upsertHash = null; } + // Populate fileSymbols before the transaction so it is a pure input + // to (rather than a side-effect of) the DB write — avoids partial + // population if the transaction rolls back. + for (const [relPath, symbols] of allSymbols) { + ctx.fileSymbols.set(relPath, symbols); + } + const insertAll = db.transaction(() => { // Phase 1: Batch insert all file nodes + definitions + exports const phase1Rows = []; for (const [relPath, symbols] of allSymbols) { - ctx.fileSymbols.set(relPath, symbols); phase1Rows.push([relPath, 'file', relPath, 0, null, null]); for (const def of symbols.definitions) { phase1Rows.push([def.name, def.kind, relPath, def.line, def.endLine || null, null]); From 027af485ab8f6c65c8ec60cc603d5618f4e562c9 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Fri, 13 Mar 2026 03:38:24 -0600 Subject: [PATCH 5/5] fix: extend DB guard to setup phase, surface analysis failures, use readFileSafe - Move try-catch to start immediately after openDb so initSchema, getBuildMeta, and loadPathAliases are all covered. - Promote analysis engine failure log from debug to warn so users know when AST/complexity/CFG/dataflow data is incomplete. - Replace direct fs.readFileSync with readFileSafe in build-structure.js for consistent transient-error retry on Windows. 
Impact: 3 functions changed, 1 affected --- src/builder/pipeline.js | 88 +++++++++++++-------------- src/builder/stages/build-structure.js | 6 +- src/builder/stages/run-analyses.js | 4 +- 3 files changed, 49 insertions(+), 49 deletions(-) diff --git a/src/builder/pipeline.js b/src/builder/pipeline.js index 3fab4ec..09a97a5 100644 --- a/src/builder/pipeline.js +++ b/src/builder/pipeline.js @@ -41,59 +41,59 @@ export async function buildGraph(rootDir, opts = {}) { ctx.rootDir = path.resolve(rootDir); ctx.dbPath = path.join(ctx.rootDir, '.codegraph', 'graph.db'); ctx.db = openDb(ctx.dbPath); - initSchema(ctx.db); + try { + initSchema(ctx.db); - ctx.config = loadConfig(ctx.rootDir); - ctx.incremental = - opts.incremental !== false && ctx.config.build && ctx.config.build.incremental !== false; + ctx.config = loadConfig(ctx.rootDir); + ctx.incremental = + opts.incremental !== false && ctx.config.build && ctx.config.build.incremental !== false; - ctx.engineOpts = { - engine: opts.engine || 'auto', - dataflow: opts.dataflow !== false, - ast: opts.ast !== false, - }; - const { name: engineName, version: engineVersion } = getActiveEngine(ctx.engineOpts); - ctx.engineName = engineName; - ctx.engineVersion = engineVersion; - info(`Using ${engineName} engine${engineVersion ? ` (v${engineVersion})` : ''}`); + ctx.engineOpts = { + engine: opts.engine || 'auto', + dataflow: opts.dataflow !== false, + ast: opts.ast !== false, + }; + const { name: engineName, version: engineVersion } = getActiveEngine(ctx.engineOpts); + ctx.engineName = engineName; + ctx.engineVersion = engineVersion; + info(`Using ${engineName} engine${engineVersion ? 
` (v${engineVersion})` : ''}`); - // Engine/schema mismatch detection - ctx.schemaVersion = MIGRATIONS[MIGRATIONS.length - 1].version; - ctx.forceFullRebuild = false; - if (ctx.incremental) { - const prevEngine = getBuildMeta(ctx.db, 'engine'); - if (prevEngine && prevEngine !== engineName) { - info(`Engine changed (${prevEngine} → ${engineName}), promoting to full rebuild.`); - ctx.forceFullRebuild = true; + // Engine/schema mismatch detection + ctx.schemaVersion = MIGRATIONS[MIGRATIONS.length - 1].version; + ctx.forceFullRebuild = false; + if (ctx.incremental) { + const prevEngine = getBuildMeta(ctx.db, 'engine'); + if (prevEngine && prevEngine !== engineName) { + info(`Engine changed (${prevEngine} → ${engineName}), promoting to full rebuild.`); + ctx.forceFullRebuild = true; + } + const prevSchema = getBuildMeta(ctx.db, 'schema_version'); + if (prevSchema && Number(prevSchema) !== ctx.schemaVersion) { + info( + `Schema version changed (${prevSchema} → ${ctx.schemaVersion}), promoting to full rebuild.`, + ); + ctx.forceFullRebuild = true; + } } - const prevSchema = getBuildMeta(ctx.db, 'schema_version'); - if (prevSchema && Number(prevSchema) !== ctx.schemaVersion) { + + // Path aliases + ctx.aliases = loadPathAliases(ctx.rootDir); + if (ctx.config.aliases) { + for (const [key, value] of Object.entries(ctx.config.aliases)) { + const pattern = key.endsWith('/') ? `${key}*` : key; + const target = path.resolve(ctx.rootDir, value); + ctx.aliases.paths[pattern] = [target.endsWith('/') ? 
`${target}*` : `${target}/*`]; + } + } + if (ctx.aliases.baseUrl || Object.keys(ctx.aliases.paths).length > 0) { info( - `Schema version changed (${prevSchema} → ${ctx.schemaVersion}), promoting to full rebuild.`, + `Loaded path aliases: baseUrl=${ctx.aliases.baseUrl || 'none'}, ${Object.keys(ctx.aliases.paths).length} path mappings`, ); - ctx.forceFullRebuild = true; - } - } - - // Path aliases - ctx.aliases = loadPathAliases(ctx.rootDir); - if (ctx.config.aliases) { - for (const [key, value] of Object.entries(ctx.config.aliases)) { - const pattern = key.endsWith('/') ? `${key}*` : key; - const target = path.resolve(ctx.rootDir, value); - ctx.aliases.paths[pattern] = [target.endsWith('/') ? `${target}*` : `${target}/*`]; } - } - if (ctx.aliases.baseUrl || Object.keys(ctx.aliases.paths).length > 0) { - info( - `Loaded path aliases: baseUrl=${ctx.aliases.baseUrl || 'none'}, ${Object.keys(ctx.aliases.paths).length} path mappings`, - ); - } - ctx.timing.setupMs = performance.now() - ctx.buildStart; + ctx.timing.setupMs = performance.now() - ctx.buildStart; - // ── Pipeline stages ─────────────────────────────────────────────── - try { + // ── Pipeline stages ───────────────────────────────────────────── await collectFiles(ctx); await detectChanges(ctx); diff --git a/src/builder/stages/build-structure.js b/src/builder/stages/build-structure.js index 07827c0..9f69e45 100644 --- a/src/builder/stages/build-structure.js +++ b/src/builder/stages/build-structure.js @@ -3,11 +3,11 @@ * * Builds directory structure, containment edges, metrics, and classifies node roles. 
*/ -import fs from 'node:fs'; import path from 'node:path'; import { performance } from 'node:perf_hooks'; import { normalizePath } from '../../constants.js'; import { debug } from '../../logger.js'; +import { readFileSafe } from '../helpers.js'; /** * @param {import('../context.js').PipelineContext} ctx @@ -23,7 +23,7 @@ export async function buildStructure(ctx) { } else { const absPath = path.join(rootDir, relPath); try { - const content = fs.readFileSync(absPath, 'utf-8'); + const content = readFileSafe(absPath); ctx.lineCountMap.set(relPath, content.split('\n').length); } catch { ctx.lineCountMap.set(relPath, 0); @@ -70,7 +70,7 @@ export async function buildStructure(ctx) { } else { const absPath = path.join(rootDir, relPath); try { - const content = fs.readFileSync(absPath, 'utf-8'); + const content = readFileSafe(absPath); ctx.lineCountMap.set(relPath, content.split('\n').length); } catch { ctx.lineCountMap.set(relPath, 0); diff --git a/src/builder/stages/run-analyses.js b/src/builder/stages/run-analyses.js index 9d6a7ef..bc8db8a 100644 --- a/src/builder/stages/run-analyses.js +++ b/src/builder/stages/run-analyses.js @@ -4,7 +4,7 @@ * Dispatches to the unified AST analysis engine (AST nodes, complexity, CFG, dataflow). * Filters out reverse-dep files for incremental builds. */ -import { debug } from '../../logger.js'; +import { debug, warn } from '../../logger.js'; /** * @param {import('../context.js').PipelineContext} ctx @@ -39,6 +39,6 @@ export async function runAnalyses(ctx) { ctx.timing.cfgMs = analysisTiming.cfgMs; ctx.timing.dataflowMs = analysisTiming.dataflowMs; } catch (err) { - debug(`Unified analysis engine failed: ${err.message}`); + warn(`Analysis engine failed (AST/complexity/CFG/dataflow may be incomplete): ${err.message}`); } }