From 7d00f77372624404da9b90224031191c43deb2a7 Mon Sep 17 00:00:00 2001 From: Martin Vogel Date: Sat, 4 Jul 2026 17:23:43 +0200 Subject: [PATCH 1/3] fix(index): run background auto-index + watcher re-index in the supervised subprocess (RSS isolation) The long-lived MCP server ran the full index pipeline in-process on two background paths -- the session auto-index (autoindex_thread, mcp.c) and the watcher re-index (watcher_index_fn, main.c). The parallel pipeline's worker threads abandon their mimalloc pages at thread exit, and mimalloc v3 defaults page_reclaim_on_free=0, so those pages are never reclaimed when the main thread later frees their blocks (mi_collect cannot touch abandoned pages either). RSS then ratchets across re-index cycles and never comes back down (#832). The index_repository tool path already avoided this by running the index in a supervised worker subprocess: on child exit the kernel reclaims 100% of the child's RSS. Route the two background paths through that same supervised runner: - supervisor_invalidate_store() is now NULL-safe (the watcher path has no MCP server / cached store). - New static index_run_supervised_path() builds {"repo_path": root} and reuses the existing spawn + skip-and-continue recovery loop (no duplication). New public cbm_mcp_index_run_supervised_path() is the srv-less entry main.c calls. - autoindex_thread and watcher_index_fn gate on cbm_index_supervisor_should_wrap(): supervised on a marked host with the kill switch off, else fall through to the unchanged in-process pipeline (kill switch / spawn-failure degrade). The watcher keeps its non-blocking try_lock gate + g_shutdown check exactly; the lock still serialises re-indexes. cbm_mem_collect() stays on the in-process branch only. Also fold in the in-process allocator fallback: cbm_mem_init sets mi_option_page_reclaim_on_free=1, restoring v2 behaviour so any path that stays in-process (kill switch, degrade, embedders) also reclaims abandoned worker pages. The subprocess is the primary cure (child exit returns RSS every cycle); this is the fallback. Tests: a deterministic gating guard (test_mcp.c) proves the background paths now spawn a worker child and index the fixture (RED when routed in-process: spawn_count unchanged; GREEN after). The test-runner acts as a real in-process worker when spawned as `cli --index-worker` (public APIs only, no production seam). A non-gating prod-binary RSS reproduction (tests/repro/issue832_rss.py) shows the supervised server flat at ~10 MB vs ~131 MB in-process. Scope: JobObject, the #841 spurious-trigger fix, retention caps, budget tiering and the backpressure busy-spin are follow-ups. Refs #832, #841. Signed-off-by: Martin Vogel --- src/foundation/mem.c | 9 +++ src/main.c | 17 ++++ src/mcp/mcp.c | 57 ++++++++++++- src/mcp/mcp.h | 12 +++ tests/repro/issue832_rss.py | 135 +++++++++++++++++++++++++++++++ tests/test_main.c | 72 ++++++++++++++++- tests/test_mcp.c | 155 ++++++++++++++++++++++++++++++++++++ 7 files changed, 454 insertions(+), 3 deletions(-) create mode 100644 tests/repro/issue832_rss.py diff --git a/src/foundation/mem.c b/src/foundation/mem.c index 46494aad2..e1fef4a9d 100644 --- a/src/foundation/mem.c +++ b/src/foundation/mem.c @@ -122,6 +122,15 @@ void cbm_mem_init(double ram_fraction) { mi_option_set(mi_option_arena_eager_commit, 0); mi_option_set(mi_option_purge_decommits, SKIP_ONE); mi_option_set(mi_option_purge_delay, 0); /* immediate purge, no 1s delay */ + /* v3 (#832): reclaim abandoned pages on ANY thread's free (=1), restoring the + * v2 behaviour. mimalloc v3 defaults page_reclaim_on_free=0, so pages a worker + * thread abandons at exit are NOT reclaimed when the main thread later frees + * their blocks (and mi_collect cannot touch abandoned pages) — RSS then + * ratchets across repeated in-process index cycles. The supervised subprocess + * is the primary cure (the child returns 100% RSS on exit); this is the + * in-process fallback for any path that stays in-process (kill switch, + * spawn-fail degrade, embedders). */ + mi_option_set(mi_option_page_reclaim_on_free, 1); /* CBM_MEM_BUDGET_MB env override (memory analogue of CBM_WORKERS). * Lets users cap the budget directly without an enclosing cgroup — diff --git a/src/main.c b/src/main.c index 5bbaa65c9..474abd536 100644 --- a/src/main.c +++ b/src/main.c @@ -171,6 +171,23 @@ static int watcher_index_fn(const char *project_name, const char *root_path, voi cbm_log_info("watcher.reindex", "project", project_name, "path", root_path); + /* #832: route the re-index through the supervised worker subprocess so this + * long-lived server process hands its RSS back to the OS on every cycle + * instead of ratcheting (mimalloc v3 does not reclaim pages that worker + * threads abandon at exit). The child writes the DB; the parent only needs the + * return code. The pipeline lock (already held) still serialises re-indexes. + * Degrade to the in-process pipeline when the supervisor is off (kill switch) + * or the spawn fails. */ + if (cbm_index_supervisor_should_wrap()) { + char *resp = cbm_mcp_index_run_supervised_path(root_path); + if (resp) { + free(resp); + cbm_pipeline_unlock(); + return 0; + } + /* resp == NULL → spawn-failure degrade → fall through to in-process. */ + } + cbm_pipeline_t *p = cbm_pipeline_new(root_path, NULL, CBM_MODE_FULL); if (!p) { cbm_pipeline_unlock(); diff --git a/src/mcp/mcp.c b/src/mcp/mcp.c index 39065260e..293f9764b 100644 --- a/src/mcp/mcp.c +++ b/src/mcp/mcp.c @@ -3481,8 +3481,14 @@ static char *build_worker_failure_response(const char *args, cbm_proc_outcome_t } /* Drop the cached store so the next query reopens whatever the worker wrote (each - * worker is a fresh process that deletes + recreates the .db). */ + * worker is a fresh process that deletes + recreates the .db). NULL-safe: the + * background watcher path (main.c) has no MCP server / cached store — the child + * writes the DB and the parent only needs the return code, so there is nothing + * to invalidate. */ static void supervisor_invalidate_store(cbm_mcp_server_t *srv) { + if (!srv) { + return; + } if (srv->owns_store && srv->store) { cbm_store_close(srv->store); srv->store = NULL; @@ -3687,6 +3693,34 @@ static char *index_run_supervised(cbm_mcp_server_t *srv, const char *args) { return build_worker_failure_response(args, last_outcome); } +/* Build a minimal {"repo_path": ""} args object (path safely escaped) and + * run it through index_run_supervised. Shared by the session auto-index (srv + * present → its cached store is invalidated) and the watcher re-index (srv NULL). + * Returns the worker's response string (caller frees) or NULL to degrade. */ +static char *index_run_supervised_path(cbm_mcp_server_t *srv, const char *root_path) { + if (!root_path || !root_path[0]) { + return NULL; + } + yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); + yyjson_mut_val *root = yyjson_mut_obj(doc); + yyjson_mut_doc_set_root(doc, root); + yyjson_mut_obj_add_strcpy(doc, root, "repo_path", root_path); + char *args = yy_doc_to_str(doc); + yyjson_mut_doc_free(doc); + if (!args) { + return NULL; + } + char *resp = index_run_supervised(srv, args); + free(args); + return resp; +} + +/* Public entry (see mcp.h): the watcher re-index in main.c has no MCP server, so + * it reaches the supervised runner through this srv-less wrapper. */ +char *cbm_mcp_index_run_supervised_path(const char *root_path) { + return index_run_supervised_path(NULL, root_path); +} + static char *handle_index_repository(cbm_mcp_server_t *srv, const char *args) { /* Supervisor gate: run the index in a crash/hang-isolating worker subprocess * unless this process IS the worker or the kill switch (CBM_INDEX_SUPERVISOR=0) @@ -5638,6 +5672,25 @@ static void *autoindex_thread(void *arg) { cbm_log_info("autoindex.start", "project", srv->session_project, "path", srv->session_root); + /* #832: prefer the supervised worker subprocess. Indexing the whole session in + * this long-lived server thread ratchets RSS (mimalloc v3 does not reclaim the + * pages worker threads abandon at exit); running it in a child that exits hands + * 100% of that memory back to the OS every cycle. Degrade to the in-process + * pipeline below when the supervisor is off (kill switch) or the spawn fails. */ + if (cbm_index_supervisor_should_wrap()) { + char *resp = index_run_supervised_path(srv, srv->session_root); + if (resp) { + free(resp); + cbm_log_info("autoindex.done", "project", srv->session_project, "mode", "supervised"); + /* Register with watcher for ongoing change detection */ + if (srv->watcher) { + cbm_watcher_watch(srv->watcher, srv->session_project, srv->session_root); + } + return NULL; + } + /* resp == NULL → spawn-failure degrade → fall through to in-process. */ + } + cbm_pipeline_t *p = cbm_pipeline_new(srv->session_root, NULL, CBM_MODE_FULL); if (!p) { cbm_log_warn("autoindex.err", "msg", "pipeline_create_failed"); @@ -5650,7 +5703,7 @@ static void *autoindex_thread(void *arg) { cbm_pipeline_unlock(); cbm_pipeline_free(p); - cbm_mem_collect(); /* return mimalloc pages to OS after indexing */ + cbm_mem_collect(); /* return mimalloc pages to OS after indexing (in-process only) */ if (rc == 0) { cbm_log_info("autoindex.done", "project", srv->session_project); diff --git a/src/mcp/mcp.h b/src/mcp/mcp.h index 6f04deed0..57048f9c9 100644 --- a/src/mcp/mcp.h +++ b/src/mcp/mcp.h @@ -116,6 +116,18 @@ char *cbm_mcp_server_handle(cbm_mcp_server_t *srv, const char *line); /* Handle a tools/call request. Returns MCP tool result JSON. */ char *cbm_mcp_handle_tool(cbm_mcp_server_t *srv, const char *tool_name, const char *args_json); +/* ── Supervised background index (RSS isolation, #832) ────────── */ + +/* Run a full index of root_path in a supervised worker SUBPROCESS (the same + * crash/hang-isolating runner used by handle_index_repository), so the child + * returns 100% of its RSS to the OS on exit instead of ratcheting the long-lived + * parent. Builds {"repo_path": root_path} internally. Returns the worker's + * response string (caller frees) on success, or NULL to signal the caller must + * degrade to the in-process path (kill switch set, spawn failure, or the process + * is not a supervisor host). This is the shared entry the watcher re-index + * (main.c) and the session auto-index (mcp.c) route through. */ +char *cbm_mcp_index_run_supervised_path(const char *root_path); + /* ── Idle store eviction ──────────────────────────────────────── */ /* Evict the cached project store if idle for more than timeout_s seconds. diff --git a/tests/repro/issue832_rss.py b/tests/repro/issue832_rss.py new file mode 100644 index 000000000..e52264500 --- /dev/null +++ b/tests/repro/issue832_rss.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +""" +issue832_rss.py -- direct RSS reproduction for #832 (NON-GATING, manual repro tier). + +WHY THIS IS NOT A C UNIT TEST +----------------------------- +The RSS ratchet is a property of mimalloc v3's abandoned-page handling +(page_reclaim_on_free=0): pages a worker THREAD abandons at exit are not +reclaimed when the main thread later frees their blocks. That only manifests in +the PROD binary, which links mimalloc as the global allocator (Makefile.cbm: +MI_OVERRIDE=1). The C test-runner and the C repro-runner are built CRT+ASan +(MI_OVERRIDE=0), so mimalloc is inert there and cbm_mem_rss() falls back to +os_rss() -- a C test would be VACUOUS. Hence this drives the real +`build/c/codebase-memory-mcp` server over stdio and samples its RSS from `ps`. + +WHAT IT SHOWS +------------- +A long-lived MCP server is driven through K index_repository cycles of the same +fixture. The in-process pipeline (CBM_INDEX_SUPERVISOR=0) is the pre-#832-fix +background-path behaviour: RSS RATCHETS across cycles. The supervised subprocess +path (default) is the fix: each child returns 100% of its RSS on exit, so the +long-lived parent stays ~FLAT. The auto-index (mcp.c) and watcher re-index +(main.c) paths now route through that same supervised subprocess, so they inherit +this flat profile; the deterministic routing proof is the GATING guard +tests/test_mcp.c::index_bg_paths_route_through_supervisor_issue832. + +Inherently noisy (allocator/OS dependent) -> thresholds are generous and this is +NOT wired into `make test` / `ci-ok`. Run manually: + make -f Makefile.cbm cbm + python3 tests/repro/issue832_rss.py +""" +import json +import os +import shutil +import subprocess +import sys +import tempfile + +ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +BINARY = os.path.join(ROOT, "build", "c", "codebase-memory-mcp") +CYCLES = 10 +NUM_FILES = 120 # enough files to fan out across worker threads (abandoned heaps) + + +def rss_kb(pid): + out = subprocess.check_output(["ps", "-o", "rss=", "-p", str(pid)]) + return int(out.strip()) + + +def make_fixture(d): + for i in range(NUM_FILES): + with open(os.path.join(d, f"mod_{i}.py"), "w") as f: + for j in range(20): + f.write(f"def fn_{i}_{j}(a, b):\n") + f.write(f" x = a + b + {i} * {j}\n") + f.write(" return x\n\n") + + +def run_series(repo, cache, supervised): + env = dict(os.environ) + env["CBM_CACHE_DIR"] = cache + if supervised: + env.pop("CBM_INDEX_SUPERVISOR", None) + else: + env["CBM_INDEX_SUPERVISOR"] = "0" # in-process (pre-fix background behaviour) + env["CBM_INDEX_WORKER_TIMEOUT_S"] = "120" + + proc = subprocess.Popen( + [BINARY], stdin=subprocess.PIPE, stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, env=env, text=True, bufsize=1, + ) + + def rpc(obj): + proc.stdin.write(json.dumps(obj) + "\n") + proc.stdin.flush() + return proc.stdout.readline() + + rpc({"jsonrpc": "2.0", "id": 0, "method": "initialize", "params": {}}) + series = [] + for k in range(CYCLES): + rpc({"jsonrpc": "2.0", "id": k + 1, "method": "tools/call", + "params": {"name": "index_repository", + "arguments": {"repo_path": repo, "mode": "fast"}}}) + series.append(rss_kb(proc.pid)) + try: + proc.stdin.close() + proc.wait(timeout=15) + except Exception: + proc.kill() + return series + + +def main(): + if not os.path.exists(BINARY): + print(f"missing prod binary: {BINARY}\n build it: make -f Makefile.cbm cbm") + return 2 + base = tempfile.mkdtemp(prefix="cbm-832-") + repo = os.path.join(base, "repo") + os.makedirs(repo) + make_fixture(repo) + try: + inproc = run_series(repo, os.path.join(base, "c1"), supervised=False) + superv = run_series(repo, os.path.join(base, "c2"), supervised=True) + finally: + shutil.rmtree(base, ignore_errors=True) + + def mb(kb): + return kb / 1024.0 + + print(f"cycles={CYCLES} files={NUM_FILES}") + print("cycle | in-process(MB) | supervised(MB)") + for i in range(CYCLES): + print(f" {i:2d} | {mb(inproc[i]):8.1f} | {mb(superv[i]):8.1f}") + + ip_peak = max(mb(x) for x in inproc) + sv_peak = max(mb(x) for x in superv) + print(f"\nin-process peak resident: {ip_peak:8.1f} MB") + print(f"supervised peak resident: {sv_peak:8.1f} MB") + # The decisive, robust signal at laptop-fixture scale is the RESIDENT-LEVEL + # contrast, not cycle-over-cycle growth: the in-process server keeps the whole + # index working set resident (it never leaves the long-lived process), while + # the supervised path returns it every cycle (the child exits) -> the server + # stays near its idle baseline. The unbounded ratchet in the field (#832, GB + # over hours) is the same effect amplified by worker-thread count + cycle count + # beyond what a small fixture surfaces. Generous threshold; report-only, + # NON-GATING. + verdict = "SUPERVISED ISOLATION reproduced (server stays near baseline)" \ + if sv_peak < ip_peak / 2 \ + else "inconclusive (env-dependent; see numbers)" + print(f"verdict: {verdict}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/test_main.c b/tests/test_main.c index da7c21c50..142663c42 100644 --- a/tests/test_main.c +++ b/tests/test_main.c @@ -9,11 +9,74 @@ int tf_fail_count = 0; int tf_skip_count = 0; #include "test_framework.h" -#include "foundation/compat.h" /* cbm_setenv — #845 supervisor kill switch */ +#include "foundation/compat.h" /* cbm_setenv — #845 supervisor kill switch */ +#include "foundation/compat_fs.h" /* cbm_fopen — worker response file */ +#include "foundation/mem.h" /* cbm_mem_init — worker budget */ +#include "mcp/index_supervisor.h" /* cbm_index_set_worker_role */ +#include "mcp/mcp.h" /* cbm_mcp_handle_tool — act as a real worker */ #include #include +#include +#include #include +/* #832 guard support: when the index supervisor spawns THIS binary as + * ` cli --index-worker index_repository --response-out ` + * (exactly the argv cbm_index_spawn_worker builds), act as a faithful in-process + * index worker instead of re-running the test suites. This lets the deterministic + * gating guard (test_mcp.c) spawn a REAL worker child that indexes the fixture and + * writes its response back, using only public APIs — no production test seam. + * Returns an exit code (>=0) when it handled a worker invocation, else -1. */ +static int tf_maybe_run_index_worker(int argc, char **argv) { + if (argc < 2 || strcmp(argv[1], "cli") != 0) { + return -1; + } + bool is_worker = false; + const char *tool = NULL; + const char *args_json = "{}"; + const char *response_out = NULL; + for (int i = 2; i < argc; i++) { + if (strcmp(argv[i], "--index-worker") == 0) { + is_worker = true; + } else if (strcmp(argv[i], "--response-out") == 0) { + if (i + 1 < argc) { + response_out = argv[++i]; + } + } else if (argv[i][0] == '{') { + args_json = argv[i]; + } else if (argv[i][0] != '-' && !tool) { + tool = argv[i]; + } + } + if (!is_worker) { + return -1; + } + if (!tool) { + tool = "index_repository"; + } + + cbm_mem_init(0.5); + cbm_index_set_worker_role(true, response_out); /* worker role → index in-process */ + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + if (!srv) { + return 1; + } + char *result = cbm_mcp_handle_tool(srv, tool, args_json); + if (result) { + const char *ro = cbm_index_worker_response_out(); + if (ro) { + FILE *rf = cbm_fopen(ro, "wb"); + if (rf) { + (void)fputs(result, rf); + (void)fclose(rf); + } + } + free(result); + } + cbm_mcp_server_free(srv); + return 0; +} + static int g_suite_argc = 0; static char **g_suite_argv = NULL; @@ -135,6 +198,13 @@ extern void suite_dump_verify_io(void); extern void cbm_kind_in_set_free_cache(void); int main(int argc, char **argv) { + /* #832: if spawned as a supervised index worker, do the real work and exit + * before any suite runs (see tf_maybe_run_index_worker). */ + int worker_rc = tf_maybe_run_index_worker(argc, argv); + if (worker_rc >= 0) { + return worker_rc; + } + /* #845 belt-and-suspenders: this binary EMBEDS cbm_mcp_handle_tool. The * supervisor gate already ignores unmarked hosts, but pin the kill switch * too so even a future supervisor-marked test host can never resolve THIS diff --git a/tests/test_mcp.c b/tests/test_mcp.c index e8cc2abcb..c421ec0ab 100644 --- a/tests/test_mcp.c +++ b/tests/test_mcp.c @@ -4166,6 +4166,160 @@ TEST(index_supervisor_gate_requires_marked_host_issue845) { PASS(); } +/* ══════════════════════════════════════════════════════════════════ + * #832 — background auto-index + watcher re-index must run in the + * supervised worker SUBPROCESS (RSS isolation) + * ══════════════════════════════════════════════════════════════════ */ + +/* The long-lived server ran the full index pipeline in-process on two background + * paths (session auto-index in mcp.c, watcher re-index in main.c). Worker-thread + * mimalloc heaps abandon pages at thread exit and mimalloc v3 + * (page_reclaim_on_free=0) does not reclaim them when the main thread later frees + * their blocks, so RSS ratchets across re-index cycles (#832). The fix routes both + * paths through cbm_mcp_index_run_supervised_path() — the SAME supervised worker + * subprocess the index_repository tool uses — so the child hands 100%% of its RSS + * back to the OS on exit. + * + * This guard proves the ROUTING: on a supervisor-marked host with the kill switch + * OFF, the shared entry the watcher/auto-index now call must (a) spawn a worker + * child (cbm_index_supervisor_spawn_count() increases) and (b) actually index the + * fixture (the worker child writes the Function node). RED on the unfixed + * in-process routing: it calls cbm_pipeline_run directly, so spawn_count is + * unchanged → IDX832_NO_SPAWN. */ +enum { + IDX832_OK = 0, + IDX832_NO_SPAWN = 51, /* spawn_count unchanged — routed in-process (RED) */ + IDX832_NULL_RESP = 52, /* supervised entry degraded to NULL */ + IDX832_NOT_INDEXED = 53, /* response/store lacks the indexed Function node */ + IDX832_SERVER_FAIL = 54, +}; + +static int idx832_supervised_route_check(const char *repo_dir) { + /* Become a supervisor host with the kill switch OFF — exactly the real MCP + * server's state. Done in the FORKED CHILD only (see the harness) so the + * parent test-runner's process-wide host mark stays clear and the #845 + * unmarked-embedder guard is unaffected. Bound the recovery loop + worker + * quiet-timeout so a stuck child cannot run long under the fork+alarm net. */ + cbm_index_supervisor_mark_host(); + cbm_unsetenv("CBM_INDEX_SUPERVISOR"); + cbm_setenv("CBM_INDEX_MAX_RESTARTS", "1", 1); + cbm_setenv("CBM_INDEX_WORKER_TIMEOUT_S", "30", 1); + + int spawns_before = cbm_index_supervisor_spawn_count(); + char *resp = cbm_mcp_index_run_supervised_path(repo_dir); + int spawns_after = cbm_index_supervisor_spawn_count(); + + if (spawns_after == spawns_before) { + free(resp); + return IDX832_NO_SPAWN; /* the discriminating assertion: RED in-process */ + } + if (!resp) { + return IDX832_NULL_RESP; + } + bool indexed = response_contains_json_fragment(resp, "\"status\":\"indexed\""); + free(resp); + if (!indexed) { + return IDX832_NOT_INDEXED; + } + + /* Store-level proof the worker child did real work: the Function node it wrote + * must be queryable from a fresh server reading the DB the child produced. */ + char *project = cbm_project_name_from_path(repo_dir); + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + if (!srv) { + free(project); + return IDX832_SERVER_FAIL; + } + int code = IDX832_OK; + if (project) { + char q[512]; + snprintf(q, sizeof(q), + "{\"project\":\"%s\",\"name_pattern\":\"idx832_fn\",\"label\":\"Function\"}", + project); + char *sr = cbm_mcp_handle_tool(srv, "search_graph", q); + if (!sr || !strstr(sr, "idx832_fn")) { + code = IDX832_NOT_INDEXED; + } + free(sr); + } + cbm_mcp_server_free(srv); + free(project); + return code; +} + +TEST(index_bg_paths_route_through_supervisor_issue832) { +#ifdef _WIN32 + /* The guard marks the process as a supervisor host, which cannot be undone. + * POSIX isolates that in a forked child; without fork we would pollute the + * shared test-runner (breaking the #845 unmarked-embedder guard). The routing + * logic is platform-independent and covered on POSIX CI; Windows containment + * is covered by the end-to-end crash-containment test. */ + SKIP_PLATFORM("supervisor-host guard needs fork isolation (POSIX-only)"); +#else + char tmp_dir[256]; + snprintf(tmp_dir, sizeof(tmp_dir), "/tmp/cbm-idx832-repo-XXXXXX"); + if (!cbm_mkdtemp(tmp_dir)) { + PASS(); + } + char cache[256]; + snprintf(cache, sizeof(cache), "/tmp/cbm-idx832-cache-XXXXXX"); + if (!cbm_mkdtemp(cache)) { + cbm_rmdir(tmp_dir); + PASS(); + } + + const char *saved_cache = getenv("CBM_CACHE_DIR"); + char *saved_cache_copy = saved_cache ? strdup(saved_cache) : NULL; + cbm_setenv("CBM_CACHE_DIR", cache, 1); /* inherited by the worker child */ + + char src_path[512]; + snprintf(src_path, sizeof(src_path), "%s/main.py", tmp_dir); + FILE *fp = fopen(src_path, "w"); + ASSERT_NOT_NULL(fp); + fputs("def idx832_fn():\n return 'ok'\n", fp); + fclose(fp); + + int code = -1; + bool signalled = false; + int sig = 0; + fflush(NULL); + pid_t pid = fork(); + if (pid == 0) { + alarm(60); /* a stuck worker dies here instead of hanging the runner */ + _exit(idx832_supervised_route_check(tmp_dir)); + } + ASSERT_TRUE(pid > 0); + int status = 0; + (void)waitpid(pid, &status, 0); + if (WIFEXITED(status)) { + code = WEXITSTATUS(status); + } else if (WIFSIGNALED(status)) { + signalled = true; + sig = WTERMSIG(status); + } + + char *project = cbm_project_name_from_path(tmp_dir); + cleanup_project_db(cache, project); + free(project); + restore_cache_dir(saved_cache_copy); + free(saved_cache_copy); + remove(src_path); + cbm_rmdir(cache); + cbm_rmdir(tmp_dir); + + if (signalled) { + printf(" child killed by signal %d (alarm => worker hang)\n", sig); + } else if (code != IDX832_OK) { + printf(" child exit code %d (51=no spawn/in-process=RED, 52=null resp, " + "53=not indexed, 54=server fail)\n", + code); + } + ASSERT_FALSE(signalled); + ASSERT_EQ(code, IDX832_OK); + PASS(); +#endif +} + /* ══════════════════════════════════════════════════════════════════ * SUITE * ══════════════════════════════════════════════════════════════════ */ @@ -4288,6 +4442,7 @@ SUITE(mcp) { RUN_TEST(tool_index_repository_reports_store_backed_adr); RUN_TEST(tool_index_repository_dot_uses_absolute_project_key_and_preserves_adr); RUN_TEST(index_supervisor_gate_requires_marked_host_issue845); + RUN_TEST(index_bg_paths_route_through_supervisor_issue832); RUN_TEST(tool_manage_adr_not_found_rich_error); RUN_TEST(tool_manage_adr_get_accepts_abs_path); RUN_TEST(tool_manage_adr_get_accepts_symlink_path); From 40157834f897298a40f279470db8d37ebb14bc63 Mon Sep 17 00:00:00 2001 From: Martin Vogel Date: Sat, 4 Jul 2026 19:13:25 +0200 Subject: [PATCH 2/3] =?UTF-8?q?fix(index):=20gate=20supervised=20auto-inde?= =?UTF-8?q?x=20watcher=20registration=20on=20auto=5Fwatch=20(keystone?= =?UTF-8?q?=C3=97#849)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The #832 supervisor keystone added a second watcher-registration site in autoindex_thread's supervised-success branch, but wired it as a direct cbm_watcher_watch() guarded only by `if (srv->watcher)`. srv->watcher is set unconditionally (cbm_mcp_server_set_watcher), so that guard does NOT honour the auto_watch config. PR #849 had routed ALL watcher registration through register_watcher_if_enabled() (which checks auto_watch_enabled() and logs watcher.register.skipped reason=auto_watch_off). On a supervisor-marked prod host with a fresh (unindexed) project and `config set auto_watch false`, the supervised path therefore registered a background watcher against the user's explicit opt-out — defeating #849's multi-project containment. The in-process twin ~20 lines below already used register_watcher_if_enabled() correctly. Fix: replace the ungated block in the supervised-success branch with register_watcher_if_enabled(srv), matching the in-process branch. Surrounding supervised-success logging is unchanged. Reproduce-first: mcp_auto_watch_false_skips_supervised_autoindex_issue853 drives the real public entry initialize -> maybe_auto_index -> autoindex_thread on a supervisor-marked host (kill switch off) with a FRESH project and auto_watch=false, in a fork+alarm-bounded child (like the #832/#845 gate tests). cbm_mcp_server_free() joins the autoindex thread so the registration decision has run before the watch count is read. It asserts spawn_count advanced (so the probe cannot pass vacuously by skipping the supervised branch) and watch_count == 0. RED on the unfixed code (watch_count == 1, exit 61); GREEN after the fix (register_watcher_if_enabled skips → watcher.register.skipped reason=auto_watch_off). Signed-off-by: Martin Vogel --- src/mcp/mcp.c | 9 +-- tests/test_mcp.c | 178 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 183 insertions(+), 4 deletions(-) diff --git a/src/mcp/mcp.c b/src/mcp/mcp.c index c11a366c1..413a9d7aa 100644 --- a/src/mcp/mcp.c +++ b/src/mcp/mcp.c @@ -5706,10 +5706,11 @@ static void *autoindex_thread(void *arg) { if (resp) { free(resp); cbm_log_info("autoindex.done", "project", srv->session_project, "mode", "supervised"); - /* Register with watcher for ongoing change detection */ - if (srv->watcher) { - cbm_watcher_watch(srv->watcher, srv->session_project, srv->session_root); - } + /* Register with watcher for ongoing change detection — gated on + * auto_watch (#849), same as the in-process branch below. A bare + * `if (srv->watcher)` would register even when the user set + * `config set auto_watch false`, since srv->watcher is always set. */ + register_watcher_if_enabled(srv); return NULL; } /* resp == NULL → spawn-failure degrade → fall through to in-process. */ diff --git a/tests/test_mcp.c b/tests/test_mcp.c index 09f0f549c..9df8b9e9b 100644 --- a/tests/test_mcp.c +++ b/tests/test_mcp.c @@ -4440,6 +4440,183 @@ TEST(mcp_auto_watch_false_skips_watcher_on_connect) { PASS(); } +/* ══════════════════════════════════════════════════════════════════ + * #853 — auto_watch=false must ALSO gate the SUPERVISED fresh-index + * watcher registration (keystone × #849 merge interaction) + * ══════════════════════════════════════════════════════════════════ */ + +/* #849 routed ALL watcher registration through register_watcher_if_enabled() + * (auto_watch gate). The #832 keystone then added a SECOND registration site in + * autoindex_thread's supervised-success branch, but wired it as a DIRECT + * cbm_watcher_watch() guarded only by `if (srv->watcher)` — srv->watcher is set + * unconditionally, so that guard does NOT honour `config set auto_watch false`. + * The above tests only cover the already-indexed on-connect path + * (register_watcher_if_enabled); this guard covers the fresh-index SUPERVISED + * autoindex_thread branch that #832 introduced. + * + * Drive the real public entry initialize → maybe_auto_index → autoindex_thread on + * a supervisor-marked host (kill switch off) with a FRESH project (no prior .db) + * and auto_watch=false. cbm_mcp_server_free() joins the autoindex thread, so the + * (buggy or gated) registration decision has run before we read the watch count. + * + * RED on the unfixed ungated block: the supervised success branch calls + * cbm_watcher_watch() unconditionally → watch_count == 1 → IDX853_WATCHER_REGISTERED. + * GREEN once it calls register_watcher_if_enabled() → auto_watch_off skip → 0. + * spawn_count is asserted to have advanced so the assertion cannot pass vacuously + * (i.e. green only because the supervised branch was never entered). */ +enum { + IDX853_OK = 0, /* watch_count==0, supervised branch ran → GREEN */ + IDX853_WATCHER_REGISTERED = 61, /* watch_count==1 → RED: ungated cbm_watcher_watch */ + IDX853_NO_SPAWN = 62, /* spawn_count unchanged → supervised path not exercised */ + IDX853_SETUP_FAIL = 63, /* config/watcher/server/cwd setup failed */ + IDX853_BAD_COUNT = 64, /* unexpected watch_count (<0 or >1) */ +}; + +static int idx853_supervised_autowatch_check(const char *repo_dir, const char *cache_dir) { + /* Become a supervisor host with the kill switch OFF — the real prod MCP + * server's state. Done in the FORKED CHILD only (see harness) so the parent + * test-runner's process-wide host mark stays clear (#845 invariant). Bound the + * worker so a stuck spawn cannot run long under the fork+alarm net. */ + cbm_index_supervisor_mark_host(); + cbm_unsetenv("CBM_INDEX_SUPERVISOR"); + cbm_setenv("CBM_INDEX_MAX_RESTARTS", "1", 1); + cbm_setenv("CBM_INDEX_WORKER_TIMEOUT_S", "30", 1); + + cbm_config_t *cfg = cbm_config_open(cache_dir); + cbm_store_t *wstore = cbm_store_open_memory(); + cbm_watcher_t *watcher = wstore ? cbm_watcher_new(wstore, NULL, NULL) : NULL; + if (!cfg || !watcher) { + if (watcher) { + cbm_watcher_free(watcher); + } + if (wstore) { + cbm_store_close(wstore); + } + if (cfg) { + cbm_config_close(cfg); + } + return IDX853_SETUP_FAIL; + } + /* auto_index=true → maybe_auto_index launches autoindex_thread for the fresh + * project; auto_watch=false → the gate this guard exercises. */ + cbm_config_set(cfg, CBM_CONFIG_AUTO_INDEX, "true"); + cbm_config_set(cfg, CBM_CONFIG_AUTO_WATCH, "false"); + + /* detect_session derives session_root/session_project from the cwd. */ + char old_cwd[1024]; + if (!cbm_getcwd(old_cwd, sizeof(old_cwd)) || cbm_chdir(repo_dir) != 0) { + cbm_watcher_free(watcher); + cbm_store_close(wstore); + cbm_config_close(cfg); + return IDX853_SETUP_FAIL; + } + + int spawns_before = cbm_index_supervisor_spawn_count(); + int code = IDX853_SETUP_FAIL; + + cbm_mcp_server_t *srv = cbm_mcp_server_new(NULL); + if (srv) { + cbm_mcp_server_set_watcher(srv, watcher); + cbm_mcp_server_set_config(srv, cfg); + char *resp = cbm_mcp_server_handle( + srv, "{\"jsonrpc\":\"2.0\",\"id\":1,\"method\":\"initialize\",\"params\":{}}"); + free(resp); + /* free() joins the autoindex thread → the supervised worker has finished + * and the registration decision (buggy or gated) has executed. */ + cbm_mcp_server_free(srv); + + int spawns_after = cbm_index_supervisor_spawn_count(); + int watch_count = cbm_watcher_watch_count(watcher); + + if (spawns_after == spawns_before) { + code = IDX853_NO_SPAWN; /* supervised branch never ran — not a valid probe */ + } else if (watch_count == 1) { + code = IDX853_WATCHER_REGISTERED; /* the discriminating RED assertion */ + } else if (watch_count == 0) { + code = IDX853_OK; + } else { + code = IDX853_BAD_COUNT; + } + } + + (void)cbm_chdir(old_cwd); + cbm_watcher_free(watcher); + cbm_store_close(wstore); + cbm_config_close(cfg); + return code; +} + +TEST(mcp_auto_watch_false_skips_supervised_autoindex_issue853) { +#ifdef _WIN32 + /* Marks the process as a supervisor host (irreversible); POSIX isolates that + * in a forked child. The gate logic is platform-independent and covered on + * POSIX CI. */ + SKIP_PLATFORM("supervisor-host guard needs fork isolation (POSIX-only)"); +#else + char tmp_dir[256]; + snprintf(tmp_dir, sizeof(tmp_dir), "/tmp/cbm-idx853-repo-XXXXXX"); + if (!cbm_mkdtemp(tmp_dir)) { + PASS(); + } + char cache[256]; + snprintf(cache, sizeof(cache), "/tmp/cbm-idx853-cache-XXXXXX"); + if (!cbm_mkdtemp(cache)) { + cbm_rmdir(tmp_dir); + PASS(); + } + + const char *saved_cache = getenv("CBM_CACHE_DIR"); + char *saved_cache_copy = saved_cache ? strdup(saved_cache) : NULL; + cbm_setenv("CBM_CACHE_DIR", cache, 1); /* inherited by the worker child */ + + char src_path[512]; + snprintf(src_path, sizeof(src_path), "%s/main.py", tmp_dir); + FILE *fp = fopen(src_path, "w"); + ASSERT_NOT_NULL(fp); + fputs("def idx853_fn():\n return 'ok'\n", fp); + fclose(fp); + + int code = -1; + bool signalled = false; + int sig = 0; + fflush(NULL); + pid_t pid = fork(); + if (pid == 0) { + alarm(60); /* a stuck worker dies here instead of hanging the runner */ + _exit(idx853_supervised_autowatch_check(tmp_dir, cache)); + } + ASSERT_TRUE(pid > 0); + int status = 0; + (void)waitpid(pid, &status, 0); + if (WIFEXITED(status)) { + code = WEXITSTATUS(status); + } else if (WIFSIGNALED(status)) { + signalled = true; + sig = WTERMSIG(status); + } + + char *project = cbm_project_name_from_path(tmp_dir); + cleanup_project_db(cache, project); + free(project); + restore_cache_dir(saved_cache_copy); + free(saved_cache_copy); + remove(src_path); + cbm_rmdir(cache); + cbm_rmdir(tmp_dir); + + if (signalled) { + printf(" child killed by signal %d (alarm => worker hang)\n", sig); + } else if (code != IDX853_OK) { + printf(" child exit code %d (61=watcher registered under auto_watch=false=RED, " + "62=no spawn, 63=setup fail, 64=bad count)\n", + code); + } + ASSERT_FALSE(signalled); + ASSERT_EQ(code, IDX853_OK); + PASS(); +#endif +} + /* ══════════════════════════════════════════════════════════════════ * SUITE * ══════════════════════════════════════════════════════════════════ */ @@ -4620,4 +4797,5 @@ SUITE(mcp) { /* auto_watch gate (distilled from PR #625) */ RUN_TEST(mcp_auto_watch_default_registers_watcher_on_connect); RUN_TEST(mcp_auto_watch_false_skips_watcher_on_connect); + RUN_TEST(mcp_auto_watch_false_skips_supervised_autoindex_issue853); } From 0814af1fcfddc18132c9f88235097fe46c840416 Mon Sep 17 00:00:00 2001 From: Martin Vogel Date: Sat, 4 Jul 2026 19:34:33 +0200 Subject: [PATCH 3/3] fix(test): guard POSIX-only supervised-route helpers behind ifndef _WIN32 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The idx832/idx853 fork-harness helpers are used only in the POSIX (#else) branch of their tests; on Windows (SKIP_PLATFORM) they are unused → -Werror,-Wunused-function on the CLANG64 leg. Guard the definitions to match their call sites. Signed-off-by: Martin Vogel --- tests/test_mcp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_mcp.c b/tests/test_mcp.c index 9df8b9e9b..c1feebcf0 100644 --- a/tests/test_mcp.c +++ b/tests/test_mcp.c @@ -4196,6 +4196,7 @@ enum { IDX832_SERVER_FAIL = 54, }; +#ifndef _WIN32 /* helper used only by the POSIX fork harness below */ static int idx832_supervised_route_check(const char *repo_dir) { /* Become a supervisor host with the kill switch OFF — exactly the real MCP * server's state. Done in the FORKED CHILD only (see the harness) so the @@ -4248,6 +4249,7 @@ static int idx832_supervised_route_check(const char *repo_dir) { free(project); return code; } +#endif /* !_WIN32 */ TEST(index_bg_paths_route_through_supervisor_issue832) { #ifdef _WIN32 @@ -4472,6 +4474,7 @@ enum { IDX853_BAD_COUNT = 64, /* unexpected watch_count (<0 or >1) */ }; +#ifndef _WIN32 /* helper used only by the POSIX fork harness below */ static int idx853_supervised_autowatch_check(const char *repo_dir, const char *cache_dir) { /* Become a supervisor host with the kill switch OFF — the real prod MCP * server's state. Done in the FORKED CHILD only (see harness) so the parent @@ -4545,6 +4548,7 @@ static int idx853_supervised_autowatch_check(const char *repo_dir, const char *c cbm_config_close(cfg); return code; } +#endif /* !_WIN32 */ TEST(mcp_auto_watch_false_skips_supervised_autoindex_issue853) { #ifdef _WIN32