diff --git a/CLAUDE.md b/CLAUDE.md index 5ccac7f..b279a4b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,6 +1,6 @@ # Claude Code on Databricks -Welcome! This environment comes pre-configured with 5 AI coding agents, 39 skills, and 2 MCP servers. Hermes Agent is available alongside Claude Code, Codex, Gemini CLI, and OpenCode — launch it with `hermes chat`. +Welcome! This environment comes pre-configured with 5 AI coding agents, 43 skills, and 3 MCP servers. Hermes Agent is available alongside Claude Code, Codex, Gemini CLI, and OpenCode — launch it with `hermes chat`. ## Skills (30 total) @@ -39,6 +39,7 @@ From [obra/superpowers](https://github.com/obra/superpowers): - **DeepWiki** - AI-powered documentation for any GitHub repository - **Exa** - Web search and code context retrieval +- **CoDA** (exposed at `/mcp`) - Delegate coding tasks to AI agents via MCP. Any MCP client (Genie Code, Claude Desktop, Cursor) can call `coda_run`, `coda_inbox`, and `coda_get_result` to submit background tasks, check status, and retrieve results. See `docs/mcp-v2-background-execution.md`. ## Databricks CLI diff --git a/README.md b/README.md index aca755c..1e80205 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![Use this template](https://img.shields.io/badge/Use%20this%20template-2ea44f?logo=github)](https://github.com/datasciencemonkey/coding-agents-databricks-apps/generate) [![Deploy to Databricks](https://img.shields.io/badge/Deploy-Databricks%20Apps-FF3621?logo=databricks&logoColor=white)](docs/deployment.md) [![Agents](https://img.shields.io/badge/Agents-5%20included-green)](#whats-inside) -[![Skills](https://img.shields.io/badge/Skills-39%20built--in-blue)](#-all-39-skills) +[![Skills](https://img.shields.io/badge/Skills-43%20built--in-blue)](#-all-43-skills) > Run Claude Code, Codex, Gemini CLI, Hermes Agent, and OpenCode in your browser — zero setup, wired to your Databricks workspace. @@ -68,7 +68,7 @@ This isn't just a terminal in the cloud. 
Running coding agents on Databricks giv | ✂️ **Split Panes** | Run two sessions side by side with a draggable divider | | 🌐 **WebSocket I/O** | Real-time terminal output over WebSocket — zero-latency, eliminates polling delay | | 🔁 **HTTP Polling Fallback** | Automatic fallback via Web Worker when WebSocket is unavailable | -| 🚀 **Parallel Setup** | 7 agent setups run in parallel (~5x faster startup) | +| 🚀 **Parallel Setup** | 6 agent setups run in parallel (~5x faster startup) | | 🔍 **Search** | Find anything in your terminal history (Ctrl+Shift+F) | | 🎤 **Voice Input** | Dictate commands with your mic (Option+V) | | 📋 **Image Paste** | Paste or drag-and-drop images into the terminal — saved to `~/uploads/`, path inserted automatically | @@ -179,7 +179,7 @@ This template repo opens that vision up for every Databricks user — no IDE set ---
-🧠 All 39 Skills +🧠 All 43 Skills ### Databricks Skills (25) — [ai-dev-kit](https://github.com/databricks-solutions/ai-dev-kit) @@ -204,16 +204,100 @@ This template repo opens that vision up for every Databricks user — no IDE set | Ship | finishing-branch, git-worktrees | | Meta | dispatching-agents, writing-skills, using-superpowers | +### BDD Skills (4) + +| Category | Skills | +|----------|--------| +| Testing | bdd-features, bdd-run, bdd-scaffold, bdd-steps | +
-🔌 2 MCP Servers +🔌 MCP Servers + +### Built-in MCP Clients | Server | What it does | |--------|-------------| | **DeepWiki** | Ask questions about any GitHub repo — gets AI-powered answers from the codebase | | **Exa** | Web search and code context retrieval for up-to-date information | +### CoDA MCP Server (exposed at `/mcp`) + +CoDA itself exposes an **MCP server** that any MCP-compatible client can connect to — delegate coding tasks to AI agents running on Databricks, without needing the terminal UI. + +| Tool | Purpose | +|------|---------| +| `coda_run` | Fire-and-forget: submit a coding task, get back immediately | +| `coda_inbox` | Dashboard: see all running/completed/failed tasks at a glance | +| `coda_get_result` | Pull the full structured result of a completed task | + +**Why this matters:** Any tool that speaks MCP can use your Databricks-hosted coding agents — no custom integration needed. + +#### Example: Databricks Genie Code + +Genie Code connects to CoDA's MCP endpoint and delegates coding work to agents running in the background: + +``` +User → Genie Code: "Build me a sales pipeline using the transactions table" + +Genie Code calls coda_run(prompt="Build a sales pipeline...", email="user@company.com", + context='{"tables": ["sales.transactions"]}') + +→ Returns immediately: {task_id: "task-abc", status: "running"} +→ User keeps chatting with Genie Code while the agent works + +User → Genie Code: "How's my pipeline coming?" + +Genie Code calls coda_inbox() +→ {tasks: [{task_id: "task-abc", status: "completed", summary: "Built pipeline.py..."}]} + +Genie Code calls coda_get_result(task_id="task-abc", session_id="sess-123") +→ {summary: "Created pipeline.py with 3 stages", files_changed: ["pipeline.py"], ...} +``` + +#### Connecting MCP Clients (Claude Code, Claude Desktop, Cursor, etc.) + +Databricks Apps use OAuth — not PATs — for authentication. A static `Authorization: Bearer <token>` header will get a `302` redirect to the OAuth login page. 
To connect any MCP client, use the **stdio bridge** (`tools/coda-bridge.py`), which injects fresh OAuth tokens automatically via `databricks auth token`. + +**1. Copy the bridge script:** + +```bash +mkdir -p ~/.claude/mcp-bridges +cp tools/coda-bridge.py ~/.claude/mcp-bridges/ +``` + +**2. Add to your MCP client settings** (e.g. `~/.claude/settings.json`): + +```json +"coda-mcp": { + "type": "stdio", + "command": "python3", + "args": ["/path/to/.claude/mcp-bridges/coda-bridge.py"], + "env": { + "CODA_MCP_URL": "https://your-app.databricksapps.com/mcp", + "DATABRICKS_PROFILE": "your-profile" + } +} +``` + +**3. Restart your MCP client.** + +The bridge reads `CODA_MCP_URL` and `DATABRICKS_PROFILE` from environment — no hardcoded values. If you redeploy the app or switch workspaces, just update the `env` block. + +**Prerequisites:** `databricks` CLI installed and authenticated (`databricks auth login -p <profile>`), Python 3.8+, no pip dependencies. + +**Troubleshooting:** Bridge logs go to stderr. If you see `Auth failed (302)`, refresh your CLI session with `databricks auth login -p <profile>`. See [full setup guide](docs/mcp-client-setup.md) for details. + +#### Task Chaining + +Chain tasks by passing `previous_session_id` — the new agent reads the prior task's results for context: + +``` +coda_run(prompt="Add monitoring to the pipeline", previous_session_id="sess-123") +``` + +See [MCP v2 Design Doc](docs/mcp-v2-background-execution.md) for the full protocol reference.
@@ -221,13 +305,14 @@ This template repo opens that vision up for every Databricks user — no IDE set 🏗️ Architecture ``` -┌─────────────────────┐ WebSocket ┌─────────────────────┐ -│ Browser Client │◄═══════════►│ Gunicorn + Flask │ -│ (xterm.js) │ (primary) │ + Flask-SocketIO │ -│ │───────────►│ (PTY Manager) │ -│ │ HTTP Poll │ │ -│ │ (fallback) │ │ -└─────────────────────┘ └─────────────────────┘ +┌─────────────────────┐ WebSocket ┌──────────────────────────────────┐ +│ Browser Client │◄═══════════►│ uvicorn (ASGI) │ +│ (xterm.js) │ (fallback) │ ├─ python-socketio (Socket.IO) │ +│ │───────────►│ ├─ FastMCP /mcp │ +│ │ HTTP Poll │ └─ WSGIMiddleware(Flask + PTY) │ +│ │ (primary │ │ +│ │ under uvicorn) │ +└─────────────────────┘ └──────────────────────────────────┘ │ │ │ on first load │ on startup ▼ ▼ @@ -245,9 +330,9 @@ This template repo opens that vision up for every Databricks user — no IDE set ### Startup Flow -1. Gunicorn starts, calls `initialize_app()` via `post_worker_init` hook +1. uvicorn starts `coda_mcp.mcp_asgi:app`, which calls `initialize_app()` during ASGI lifespan startup (Flask mounted via `WSGIMiddleware`; MCP mounted at `/mcp` via native ASGI; Socket.IO wraps both) 2. App serves the terminal UI with inline setup progress -3. Background thread runs setup: 5 sequential steps (git config, micro editor, GitHub CLI, Databricks CLI upgrade, content-filter proxy), then 6 agent setups (Claude, Codex, OpenCode, Gemini, Databricks CLI config, MLflow) run in parallel via `ThreadPoolExecutor` +3. Background thread runs setup: 5 sequential steps (git config, micro editor, GitHub CLI, Databricks CLI upgrade, content-filter proxy), then 6 agent setups (`setup/setup_claude.py`, `setup/setup_codex.py`, etc.) run in parallel via `ThreadPoolExecutor` 4. `/api/setup-status` endpoint reports progress to the UI 5. 
Once complete, the terminal becomes interactive @@ -267,6 +352,7 @@ This template repo opens that vision up for every Databricks user — no IDE set | `/api/resize` | POST | Resize terminal dimensions | | `/api/upload` | POST | Upload file (clipboard image paste) | | `/api/session/close` | POST | Close terminal session | +| `/mcp` | POST | MCP JSON-RPC endpoint (CoDA tools) | ### WebSocket Events (Socket.IO) @@ -303,9 +389,9 @@ This template repo opens that vision up for every Databricks user — no IDE set Single-user app — the owner is resolved via the app's service principal and Apps API (`app.creator`), with no PAT required at deploy time. Authorization checks `X-Forwarded-Email` against `app.creator`. On first terminal session, the user pastes a short-lived PAT interactively. Tokens auto-rotate every 10 minutes (15-minute lifetime), with old tokens proactively revoked. On restart, the user re-pastes (no persistence by design). -### Gunicorn +### Server -Production uses `workers=1` (PTY state is process-local), `threads=16` (concurrent polling + WebSocket), `gthread` worker class, `timeout=60` (long-lived WebSocket connections). +Production uses `uvicorn` (single worker — PTY state is process-local) serving `coda_mcp.mcp_asgi:app`. The ASGI stack composes `python-socketio.ASGIApp` → MCP Streamable HTTP at `/mcp` → `WSGIMiddleware(Flask)` for the terminal UI. WebSocket transport falls back to HTTP polling under uvicorn — the `static/poll-worker.js` Web Worker already handles this transparently. `gunicorn.conf.py` is retained for reference and local WSGI-only dev; it is **not** used in production. 
@@ -316,27 +402,36 @@ Production uses `workers=1` (PTY state is process-local), `threads=16` (concurre coding-agents-databricks-apps/ ├── app.py # Flask backend + PTY management + setup orchestration ├── app_state.py # Shared app state (setup progress, session registry) -├── app.yaml.template # Databricks Apps deployment config template +├── app.yaml # Databricks Apps deployment config (uvicorn entrypoint) ├── cli_auth.py # Interactive PAT setup + CLI credential writer ├── content_filter_proxy.py # Proxy that sanitises empty-content blocks for OpenCode -├── gunicorn.conf.py # Gunicorn production server config +├── gunicorn.conf.py # Legacy WSGI-only config (unused in production; uvicorn is the entrypoint) ├── pat_rotator.py # Background PAT auto-rotation (10-min cycle) ├── pyproject.toml # Package metadata + uv config (supply-chain guardrails) ├── requirements.txt # Compiled from pyproject.toml (Dependabot compatibility) ├── requirements.lock # Hash-pinned lockfile (auto-regenerated by CI) ├── Makefile # Deploy, redeploy, status, and cleanup targets -├── setup_claude.py # Claude Code CLI + MCP configuration -├── setup_codex.py # Codex CLI configuration -├── setup_gemini.py # Gemini CLI configuration -├── setup_opencode.py # OpenCode configuration -├── setup_databricks.py # Databricks CLI configuration -├── setup_mlflow.py # MLflow tracing auto-configuration -├── setup_proxy.py # Content-filter proxy startup ├── sync_to_workspace.py # Post-commit hook: sync to Workspace -├── install_micro.sh # Micro editor installer -├── install_gh.sh # GitHub CLI installer (OS/arch-aware) -├── install_databricks_cli.sh # Databricks CLI upgrade script -├── utils.py # Utility functions (ensure_https) +├── utils.py # Utility functions (ensure_https, gateway discovery) +├── coda_mcp/ # MCP server package (CoDA — Coding Agents) +│ ├── __init__.py +│ ├── mcp_server.py # FastMCP tool definitions (coda_run, coda_inbox, coda_get_result) +│ ├── mcp_endpoint.py # Flask Blueprint: JSON-RPC 
/mcp endpoint +│ ├── mcp_asgi.py # ASGI bridge (optional, for native MCP SDK transport) +│ └── task_manager.py # Disk-based session/task state manager +├── setup/ # Agent setup scripts (run at boot) +│ ├── setup_claude.py # Claude Code CLI + MCP configuration +│ ├── setup_codex.py # Codex CLI configuration +│ ├── setup_gemini.py # Gemini CLI configuration +│ ├── setup_opencode.py # OpenCode configuration +│ ├── setup_hermes.py # Hermes Agent configuration +│ ├── setup_databricks.py # Databricks CLI configuration +│ ├── setup_mlflow.py # MLflow tracing auto-configuration +│ └── setup_proxy.py # Content-filter proxy startup +├── scripts/ # Shell scripts +│ ├── install_micro.sh # Micro editor installer +│ ├── install_gh.sh # GitHub CLI installer (OS/arch-aware) +│ └── install_databricks_cli.sh # Databricks CLI upgrade script ├── static/ │ ├── index.html # Terminal UI (xterm.js + split panes + WebSocket) │ ├── favicon.svg # App favicon @@ -350,8 +445,12 @@ coding-agents-databricks-apps/ │ └── workflows/ │ ├── dependency-audit.yml # Weekly CVE audit + lockfile drift check │ └── update-lockfile.yml # Auto-regenerate requirements.lock on push +├── tools/ +│ └── coda-bridge.py # Stdio-to-HTTP MCP bridge (OAuth token injection) └── docs/ ├── deployment.md # Full Databricks Apps deployment guide + ├── mcp-client-setup.md # MCP client setup guide (bridge config) + ├── mcp-v2-background-execution.md # MCP server design doc ├── prd/ # Product requirement documents └── plans/ # Design documentation ``` @@ -362,4 +461,4 @@ coding-agents-databricks-apps/ ## Technologies -Flask · Flask-SocketIO · Socket.IO · Gunicorn · xterm.js · Python PTY · uv · Databricks SDK · Databricks AI Gateway · MLflow +Flask · Flask-SocketIO · Socket.IO · uvicorn · MCP (Streamable HTTP) · xterm.js · Python PTY · uv · Databricks SDK · Databricks AI Gateway · MLflow diff --git a/app.py b/app.py index b5acb65..9a39beb 100644 --- a/app.py +++ b/app.py @@ -1,3 +1,4 @@ +import asyncio import os import pty 
# ── ASGI WebSocket support (python-socketio AsyncServer) ─────────────
# Both globals stay None until set_async_sio() is called; while they are
# unset, _emit_from_thread() uses the Flask-SocketIO (WSGI) path instead.
_async_sio = None
_event_loop = None


def set_async_sio(sio_instance, loop):
    """Record the ASGI Socket.IO server and the event loop it runs on.

    Args:
        sio_instance: Server object exposing an awaitable
            ``emit(event, data, room=...)``.
        loop: The asyncio event loop on which ``sio_instance.emit`` must be
            scheduled.
    """
    global _async_sio, _event_loop
    _async_sio = sio_instance
    _event_loop = loop


def _log_emit_result(future):
    """Done-callback that reports an exception raised by a scheduled emit."""
    if future.cancelled():
        return
    exc = future.exception()
    if exc is not None:
        logger.debug("Socket.IO emit failed on the event loop: %s", exc)


def _emit_from_thread(event, data, room=None):
    """Emit a Socket.IO event from a non-async thread, best-effort.

    When an async server and a running loop have been registered through
    ``set_async_sio``, the emit coroutine is scheduled on that loop;
    otherwise the Flask-SocketIO ``socketio`` object is used directly.
    Failures never propagate to the caller; they are logged at debug level
    so a broken emit path can still be diagnosed.

    Args:
        event: Socket.IO event name.
        data: Event payload.
        room: Optional room to target; ``None`` leaves the target unset.
    """
    # Snapshot both globals so a concurrent set_async_sio() cannot hand us a
    # server from one call and a loop from another.
    sio, loop = _async_sio, _event_loop

    if sio and loop and loop.is_running():
        coro = None
        try:
            coro = sio.emit(event, data, room=room)
            future = asyncio.run_coroutine_threadsafe(coro, loop)
        except Exception as exc:
            # Scheduling failed (e.g. the loop closed after the check above).
            # Close the coroutine so it is not left un-awaited.
            if asyncio.iscoroutine(coro):
                coro.close()
            logger.debug("Could not schedule Socket.IO emit %r: %s", event, exc)
            return
        # The emit itself runs later on the loop thread; without a callback
        # any exception it raises would be dropped with the discarded future.
        future.add_done_callback(_log_emit_result)
        return

    # WSGI mode (local dev) — use Flask-SocketIO directly
    try:
        socketio.emit(event, data, room=room)
    except Exception as exc:
        logger.debug("Flask-SocketIO emit %r failed: %s", event, exc)
Re-run Codex, OpenCode, Gemini setup scripts with token in env # They are idempotent: detect CLI already installed, just write config files - env = {**os.environ, "DATABRICKS_TOKEN": token} - for script in ["setup_codex.py", "setup_opencode.py", "setup_gemini.py", "setup_hermes.py"]: + app_dir = os.path.dirname(os.path.abspath(__file__)) + existing_pp = os.environ.get("PYTHONPATH", "") + env = { + **os.environ, + "DATABRICKS_TOKEN": token, + "PYTHONPATH": f"{app_dir}:{existing_pp}" if existing_pp else app_dir, + } + for script in ["setup/setup_codex.py", "setup/setup_opencode.py", "setup/setup_gemini.py", "setup/setup_hermes.py"]: try: result = subprocess.run( ["uv", "run", "python", script], @@ -410,26 +458,26 @@ def run_setup(): _update_step("git", status="error", completed_at=time.time(), error=str(e)) _run_step("micro", ["bash", "-c", - "mkdir -p ~/.local/bin && bash install_micro.sh && mv micro ~/.local/bin/ 2>/dev/null || true"]) + "mkdir -p ~/.local/bin && bash scripts/install_micro.sh && mv micro ~/.local/bin/ 2>/dev/null || true"]) - _run_step("gh", ["bash", "install_gh.sh"]) + _run_step("gh", ["bash", "scripts/install_gh.sh"]) # --- Upgrade Databricks CLI (runtime image ships an older version) --- - _run_step("dbcli", ["bash", "install_databricks_cli.sh"]) + _run_step("dbcli", ["bash", "scripts/install_databricks_cli.sh"]) # --- Content-filter proxy (must be running before OpenCode starts) --- # Sanitizes requests/responses between OpenCode and Databricks # (see OpenCode #5028, docs/plans/2026-03-11-litellm-empty-content-blocks-design.md) - _run_step("proxy", ["uv", "run", "python", "setup_proxy.py"]) + _run_step("proxy", ["uv", "run", "python", "setup/setup_proxy.py"]) # --- Parallel agent setup (all independent of each other) --- parallel_steps = [ - ("claude", ["uv", "run", "python", "setup_claude.py"]), - ("codex", ["uv", "run", "python", "setup_codex.py"]), - ("opencode", ["uv", "run", "python", "setup_opencode.py"]), - ("gemini", ["uv", "run", 
def _check_ws_authorization_from_environ(environ):
    """Authorize a WebSocket connection from a WSGI-style environ dict.

    Counterpart of the Flask-request-based check for the ASGI Socket.IO path,
    where headers arrive in ``environ`` under their WSGI names (uppercase,
    underscores, ``HTTP_`` prefix), e.g. ``HTTP_X_FORWARDED_EMAIL``.

    Args:
        environ: Mapping holding the connection's request headers.

    Returns:
        bool: True when the connection may proceed, False when it must be
        refused. On Databricks Apps a missing owner or missing identity is
        refused (fail-closed); outside it (local dev) both are allowed.
    """
    if not app_owner:
        if _is_databricks_apps():
            logger.error("SECURITY: app_owner not resolved — denying WebSocket (fail-closed)")
            return False
        return True  # Local dev only

    raw_user = (
        environ.get("HTTP_X_FORWARDED_EMAIL")
        or environ.get("HTTP_X_FORWARDED_USER")
        or environ.get("HTTP_X_DATABRICKS_USER_EMAIL")
    )
    # Compare case-insensitively on the caller side; app_owner is compared as-is.
    current_user = raw_user.lower() if raw_user else raw_user

    if not current_user:
        if _is_databricks_apps():
            logger.warning("No user identity in WebSocket request on Databricks Apps — denying")
            return False
        return True  # Local dev only

    if current_user != app_owner:
        logger.warning(f"WebSocket unauthorized: {current_user} (owner: {app_owner})")
        return False
    return True


def _ws_payload(data):
    """Return ``data`` when it is a dict, otherwise an empty dict."""
    return data if isinstance(data, dict) else {}


def _ws_session_id(payload):
    """Return the payload's ``session_id`` if it is a non-empty str, else None."""
    candidate = payload.get('session_id')
    return candidate if isinstance(candidate, str) and candidate else None


def register_sio_handlers(sio):
    """Register Socket.IO event handlers on an async server (ASGI mode).

    The handlers use the async API: explicit ``sid``, awaited
    ``enter_room``/``leave_room``, and an exception to refuse a connection.
    Event payloads come from the client, so every handler validates the
    payload shape before touching session state; malformed payloads are
    ignored (or answered with an error dict) instead of raising.

    Args:
        sio: Server object providing ``on(event)`` as a decorator plus
            awaitable ``enter_room`` / ``leave_room``.
    """

    @sio.on('connect')
    async def handle_connect(sid, environ, auth):
        # Hand the server and its loop to _emit_from_thread(); this runs on
        # every connect, not only the first one.
        set_async_sio(sio, asyncio.get_running_loop())

        # Diagnostic: log query string and header presence for debugging proxy behavior
        query_string = environ.get('QUERY_STRING', '')
        has_email = bool(environ.get('HTTP_X_FORWARDED_EMAIL'))
        has_user = bool(environ.get('HTTP_X_FORWARDED_USER'))
        logger.info(f"WS connect: sid={sid}, qs={query_string}, "
                    f"has_email={has_email}, has_user={has_user}")

        if not _check_ws_authorization_from_environ(environ):
            # NOTE(review): confirm this name resolves to the Socket.IO
            # library's ConnectionRefusedError rather than the builtin
            # OSError subclass — the file's imports are not visible here.
            raise ConnectionRefusedError('unauthorized')
        logger.info("WebSocket client connected (ASGI)")

    @sio.on('join_session')
    async def handle_join_session(sid, data):
        session_id = _ws_session_id(_ws_payload(data))
        if not session_id:
            return {'status': 'error', 'message': 'session_id required'}
        sess = _get_session(session_id)
        if not sess:
            return {'status': 'error', 'message': 'Session not found'}
        with sess["lock"]:
            sess["last_poll_time"] = time.time()
            sess["output_buffer"].clear()
        await sio.enter_room(sid, session_id)
        logger.info(f"WebSocket client joined session room {session_id}")
        return {'status': 'ok'}

    @sio.on('leave_session')
    async def handle_leave_session(sid, data):
        session_id = _ws_session_id(_ws_payload(data))
        if session_id:
            await sio.leave_room(sid, session_id)
            logger.info(f"WebSocket client left session room {session_id}")

    @sio.on('terminal_input')
    async def handle_terminal_input(sid, data):
        payload = _ws_payload(data)
        session_id = _ws_session_id(payload)
        if session_id is None:
            return
        raw_input = payload.get('input', '')
        if isinstance(raw_input, str):
            encoded = raw_input.encode()
        elif isinstance(raw_input, (bytes, bytearray)):
            encoded = bytes(raw_input)
        else:
            return  # Nothing writable — ignore instead of raising
        sess = _get_session(session_id)
        if not sess:
            return
        with sess["lock"]:
            sess["last_poll_time"] = time.time()
            fd = sess["master_fd"]
        try:
            os.write(fd, encoded)
        except OSError as e:
            logger.warning(f"WebSocket input write error for {session_id}: {e}")

    @sio.on('terminal_resize')
    async def handle_terminal_resize(sid, data):
        payload = _ws_payload(data)
        session_id = _ws_session_id(payload)
        if session_id is None:
            return
        cols = payload.get('cols', 80)
        rows = payload.get('rows', 24)
        sess = _get_session(session_id)
        if not sess:
            return
        with sess["lock"]:
            sess["last_poll_time"] = time.time()
            fd = sess["master_fd"]
        try:
            winsize = struct.pack("HHHH", rows, cols, 0, 0)
            fcntl.ioctl(fd, termios.TIOCSWINSZ, winsize)
        except (OSError, struct.error) as e:
            # struct.error covers non-integer or out-of-range rows/cols.
            logger.warning(f"WebSocket resize error for {session_id}: {e}")

    @sio.on('heartbeat')
    async def handle_heartbeat(sid, data):
        session_ids = _ws_payload(data).get('session_ids', [])
        if not isinstance(session_ids, (list, tuple)):
            return
        now = time.time()
        for s_id in session_ids:
            if not isinstance(s_id, str):
                continue
            sess = _get_session(s_id)
            if sess:
                with sess["lock"]:
                    sess["last_poll_time"] = now

    @sio.on('disconnect')
    async def handle_disconnect(sid):
        logger.info("WebSocket client disconnected (ASGI)")


def _tee_transcript_chunk(session, output: bytes, cap: "int | None" = None) -> None:
    """Append PTY output to the session's transcript file, up to ``cap`` bytes.

    All handle access happens under ``session["lock"]``, the same lock other
    code must hold to swap or close ``session["transcript_fh"]``. Once the cap
    would be exceeded, the part that still fits is written, a truncation
    marker is appended, and the handle is closed and cleared. This includes
    the case where an earlier chunk filled the transcript exactly to the cap:
    the next non-empty chunk then writes the marker and closes the handle.

    Args:
        session: Session dict; reads/writes the keys ``lock``,
            ``transcript_fh`` and ``transcript_bytes``.
        output: Raw bytes read from the PTY.
        cap: Maximum transcript size in bytes. ``None`` (default) uses the
            module-level ``TRANSCRIPT_CAP_BYTES`` at call time.
    """
    if cap is None:
        cap = TRANSCRIPT_CAP_BYTES

    with session["lock"]:
        fh = session.get("transcript_fh")
        if fh is None or not output:
            return
        written = session.get("transcript_bytes", 0)
        remaining = max(cap - written, 0)
        chunk = output[:remaining]
        try:
            if chunk:
                fh.write(chunk)
                fh.flush()
                session["transcript_bytes"] = written + len(chunk)
            if len(chunk) < len(output):
                # Some of this output did not fit: mark the cut and stop
                # teeing for the rest of the session.
                fh.write(b"\n[transcript truncated at %d bytes]\n" % cap)
                fh.flush()
                fh.close()
                session["transcript_fh"] = None
        except (OSError, ValueError) as exc:
            # ValueError: write on an already-closed handle.
            logger.warning("transcript write failed: %s", exc)
            try:
                fh.close()
            except Exception:
                pass
            session["transcript_fh"] = None
session: SIGHUP -> wait -> SIGKILL -> cleanup. + + Idempotent. Both the explicit close path (``mcp_close_pty_session``) and the + read-thread exit path (``read_pty_output``) call this for the same session. + We atomically *claim* the session by popping it from ``sessions`` — only the + caller that wins the pop kills the process and closes ``master_fd``. This + guarantees ``os.close()`` runs exactly once: a second close could land on a + since-reused fd (e.g. an asyncio event loop's self-pipe allocated by a later + test) and corrupt unrelated I/O, surfacing as intermittent EBADF. + """ + # Atomically claim the session. If it's already gone, the other teardown + # path handled it — bail out WITHOUT touching the (possibly reused) fd. + with sessions_lock: + sess = sessions.pop(session_id, None) + if sess is None: + return + logger.info(f"Terminating stale session {session_id} (pid={pid})") # Notify WebSocket clients that the session is closed - try: - socketio.emit('session_closed', {'session_id': session_id}, room=session_id) - except Exception: - pass + _emit_from_thread('session_closed', {'session_id': session_id}, room=session_id) + + # Close transcript handle (if any) under per-session lock; swap-then-close + # outside the lock to avoid blocking on slow filesystems. + with sess["lock"]: + transcript_fh = sess.get("transcript_fh") + sess["transcript_fh"] = None + if transcript_fh is not None: + try: + transcript_fh.close() + except Exception: + pass try: os.kill(pid, signal.SIGHUP) @@ -769,8 +998,21 @@ def terminate_session(session_id, pid, master_fd): except OSError: pass # Process or fd already gone - with sessions_lock: - sessions.pop(session_id, None) + # Clean up the project dir if coda_interactive created one. + # Done here (not in mcp_close_pty_session) so BOTH the graceful close + # path AND the idle reaper (which calls terminate_session directly) hit + # this cleanup. 
def _serve_transcript_replay(session_id: str):
    """Serve the on-disk transcript for a PTY session as a replay response.

    Looks up the task directory for ``session_id`` through
    ``task_manager.find_task_dir_by_pty_session`` and reads
    ``transcript.log`` from it.

    Args:
        session_id: PTY session id whose transcript should be replayed.

    Returns:
        A JSON response with ``replay: True`` and the decoded transcript as
        the single element of ``output``; or a ``(response, 404)`` pair when
        no task directory or transcript file exists, or the file cannot be
        read.
    """
    # Imported at call time, as in the original; presumably avoids an import
    # cycle with coda_mcp — verify before hoisting to module level.
    from coda_mcp import task_manager as _tm

    tdir = _tm.find_task_dir_by_pty_session(session_id)
    if tdir:
        transcript = os.path.join(tdir, "transcript.log")
        if os.path.isfile(transcript):
            try:
                with open(transcript, "rb") as f:
                    content = f.read()
            except OSError as exc:
                # Fall through to the 404 below, but leave a trace: without
                # this log a read failure looks like a missing session.
                logger.warning("Could not read transcript %s: %s", transcript, exc)
            else:
                return jsonify({
                    "session_id": session_id,
                    "label": "hermes-mcp (replay)",
                    # Undecodable bytes are replaced rather than failing the replay.
                    "output": [content.decode("utf-8", errors="replace")],
                    "replay": True,
                    "process": None,
                    "created_at": None,
                })
    return jsonify({"error": "Session not found or exited"}), 404
def mcp_create_pty_session(
    label: str = "hermes-mcp",
    transcript_path: str | None = None,
    replay_only: bool = False,
    cwd: str | None = None,
) -> str:
    """Create a PTY session for MCP use. Returns the PTY session_id.

    Args:
        label: Stored on the session record under ``"label"``.
        transcript_path: If given, the file is opened in unbuffered append
            mode with permissions 0o600 and its handle is stored on the
            session. If it cannot be opened, a warning is logged and the
            session is created without a transcript.
        replay_only: Stored on the session record under ``"replay_only"``.
        cwd: Working directory for the spawned bash. When ``None`` the shell
            starts in ``$HOME/projects`` (created if missing).

    Returns:
        The new session's id (a UUID4 string).

    Raises:
        RuntimeError: If ``MAX_CONCURRENT_SESSIONS`` is already reached, either
            at the early check or at the authoritative check under the lock.
        OSError: If the shell cannot be spawned (e.g. ``cwd`` does not exist).
    """
    # Quick reject before allocating a PTY (approximate; authoritative check below).
    with sessions_lock:
        active = len(sessions)
        if active >= MAX_CONCURRENT_SESSIONS:
            raise RuntimeError(
                f"Maximum {MAX_CONCURRENT_SESSIONS} concurrent sessions reached."
            )

    master_fd, slave_fd = pty.openpty()

    try:
        # Strip PAT, SP creds, registry tokens, and other secrets that must not be
        # readable from the agent's PTY. See _build_terminal_shell_env docstring
        # for the full list. (F-01)
        shell_env = _build_terminal_shell_env(os.environ)
        if not shell_env.get("HOME") or shell_env["HOME"] == "/":
            shell_env["HOME"] = "/app/python/source_code"
        local_bin = f"{shell_env['HOME']}/.local/bin"
        shell_env["PATH"] = f"{local_bin}:{shell_env.get('PATH', '')}"

        projects_dir = os.path.join(shell_env["HOME"], "projects")
        os.makedirs(projects_dir, exist_ok=True)

        # When caller passes cwd, use it; otherwise fall back to projects_dir
        # (preserves current behavior for existing callers that don't pass cwd).
        spawn_cwd = cwd if cwd is not None else projects_dir

        pid = subprocess.Popen(
            ["/bin/bash"],
            stdin=slave_fd,
            stdout=slave_fd,
            stderr=slave_fd,
            preexec_fn=os.setsid,
            env=shell_env,
            cwd=spawn_cwd,
        ).pid
    except BaseException:
        # Nothing owns the PTY pair yet, so a failed spawn (bad cwd, makedirs
        # error, ...) would otherwise leak both descriptors.
        for fd in (master_fd, slave_fd):
            try:
                os.close(fd)
            except OSError:
                pass
        raise
    # The child holds its own copies of the slave end; the parent only
    # needs the master side from here on.
    os.close(slave_fd)

    # Open transcript file (if requested) before locking the session dict.
    transcript_fh = None
    if transcript_path:
        try:
            parent_dir = os.path.dirname(transcript_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            transcript_fh = open(transcript_path, "ab", buffering=0)
            os.fchmod(transcript_fh.fileno(), 0o600)
        except OSError as exc:
            logger.warning("Could not open transcript at %s: %s", transcript_path, exc)
            # If open() succeeded but fchmod() failed, do not leak the handle.
            if transcript_fh is not None:
                try:
                    transcript_fh.close()
                except OSError:
                    pass
            transcript_fh = None

    session_id = str(uuid.uuid4())

    try:
        with sessions_lock:
            # Authoritative check under the same lock as insertion, so two
            # concurrent callers cannot both slip past the early check.
            active = len(sessions)
            if active >= MAX_CONCURRENT_SESSIONS:
                os.close(master_fd)
                try:
                    os.kill(pid, signal.SIGKILL)
                except OSError:
                    pass
                raise RuntimeError(
                    f"Maximum {MAX_CONCURRENT_SESSIONS} concurrent sessions reached."
                )
            sessions[session_id] = {
                "master_fd": master_fd,
                "pid": pid,
                "output_buffer": deque(maxlen=1000),
                "lock": threading.Lock(),
                "last_poll_time": time.time(),
                "created_at": time.time(),
                "label": label,
                # Only advertise a transcript path when the file really opened.
                "transcript_path": transcript_path if transcript_fh else None,
                "transcript_fh": transcript_fh,
                "transcript_bytes": 0,
                "replay_only": replay_only,
                "cwd": cwd,
            }

        thread = threading.Thread(
            target=read_pty_output, args=(session_id, master_fd), daemon=True
        )
        thread.start()
    except BaseException:
        # Roll back transcript open if anything below it raises before the
        # session is fully wired. The PTY itself is cleaned up by existing
        # error paths; this is just the transcript handle.
        if transcript_fh is not None:
            try:
                transcript_fh.close()
            except Exception:
                pass
        raise

    return session_id
WebSocket transport falls back to HTTP polling under uvicorn — +# acceptable because static/poll-worker.js already implements the fallback. +# gunicorn.conf.py is retained for legacy WSGI-only local dev; not used here. command: - - gunicorn - - app:app + - uvicorn + - coda_mcp.mcp_asgi:app + - --host + - 0.0.0.0 + - --port + - "8000" env: - name: HOME value: /app/python/source_code diff --git a/cli_auth.py b/cli_auth.py index 61c9f25..53c2a25 100644 --- a/cli_auth.py +++ b/cli_auth.py @@ -35,6 +35,7 @@ def _update_claude(token): settings["env"]["ANTHROPIC_AUTH_TOKEN"] = token with open(path, "w") as f: json.dump(settings, f, indent=2) + os.chmod(path, 0o600) except (OSError, json.JSONDecodeError): pass # file doesn't exist yet — initial setup hasn't run @@ -59,6 +60,7 @@ def _update_opencode(token): if changed: with open(path, "w") as f: json.dump(auth, f, indent=2) + os.chmod(path, 0o600) except (OSError, json.JSONDecodeError): pass @@ -84,6 +86,7 @@ def _update_hermes(token): if new_content != content: with open(path, "w") as f: f.write(new_content) + os.chmod(path, 0o600) except OSError: pass @@ -102,5 +105,6 @@ def _replace_dotenv_key(path, key, value): if new_content != content: with open(path, "w") as f: f.write(new_content) + os.chmod(path, 0o600) except OSError: pass diff --git a/coda_mcp/__init__.py b/coda_mcp/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/coda_mcp/databricks_preamble.py b/coda_mcp/databricks_preamble.py new file mode 100644 index 0000000..cfe2973 --- /dev/null +++ b/coda_mcp/databricks_preamble.py @@ -0,0 +1,124 @@ +"""Builders for the CoDA prompt envelope's CAPABILITIES and WORKFLOW PROTOCOL sections. + +These are injected into prompt.txt by ``task_manager.wrap_prompt`` when +``workflow_protocol=True``. Pure functions — no side effects, no I/O. +""" +from __future__ import annotations + + +_DATABRICKS_SKILLS: tuple[str, ...] 
def build_workflow_protocol() -> str:
    """Return the WORKFLOW PROTOCOL text for the prompt envelope.

    The text describes a 3-phase workflow (PLAN / EXECUTE / SYNTHESIZE) with a
    critique step in each phase, the ``info_needed`` escape hatch, the
    ``failed`` status, and how ``info_needed`` differs from
    ``needs_approval``. Pure function: no arguments, no I/O, and the same
    string on every call.
    """
    return (
        "You MUST process this task in three phases. Emit status.jsonl events as\n"
        "you go (one JSON object per line, format below).\n"
        "\n"
        "PHASE 1 — PLAN\n"
        "- Write a step-by-step plan as a status.jsonl line with step=\"plan\" and\n"
        " message containing the numbered steps.\n"
        "- Then critique your own plan as if you were a separate reviewer.\n"
        " (Spawn a sub-agent for the critique if your agent supports it; otherwise\n"
        " write the critique inline as a self-review.) Emit step=\"critique_plan\"\n"
        " with the verdict (APPROVE / BLOCK / APPROVE-WITH-FIXES) and findings.\n"
        "- If the critique surfaces blockers, revise the plan once and re-emit\n"
        " step=\"plan\". Maximum 2 plan iterations total.\n"
        "- If after 2 attempts you still cannot produce a viable plan, write\n"
        " result.json with status=\"info_needed\" (see below) and stop.\n"
        "\n"
        "PHASE 2 — EXECUTE\n"
        # The step name needs the <n> placeholder: the next line explains what
        # "n" is, which made no sense while the name was a bare "execute_".
        "- Work the plan. Emit step=\"execute_<n>\" lines after completing each plan\n"
        " step (n is 1-indexed, matches the plan's numbering).\n"
        "- After execution, emit step=\"critique_execute\" with a review of what got\n"
        " built vs what the plan said. APPROVE / BLOCK / APPROVE-WITH-FIXES.\n"
        "- If the critique surfaces correctness or scope gaps, fix them and re-emit\n"
        " step=\"critique_execute\". Maximum 2 execute iterations total.\n"
        "- If you hit a hard blocker (missing access, missing data, ambiguous\n"
        " requirements that the plan revealed only mid-execution), write\n"
        " result.json with status=\"info_needed\" and stop.\n"
        "\n"
        "PHASE 3 — SYNTHESIZE\n"
        "- Write result.json with status=\"completed\".\n"
        "- Emit step=\"critique_synthesize\" with a review of the result against the\n"
        " original TASK.\n"
        "- If the critique surfaces gaps, revise result.json. Maximum 2 synthesis\n"
        " iterations total.\n"
        "\n"
        "If at any phase you cannot proceed, use the INFO_NEEDED escape hatch:\n"
        "- Set status=\"info_needed\" in result.json.\n"
        "- Set \"feedback\" to a precise, actionable string naming exactly what is\n"
        " missing (a table name, a decision, an access grant, a clarification).\n"
        " The calling client will read this and resubmit with the missing context.\n"
        "- \"info_needed\" is NOT a failure — it is a structured request for\n"
        " iteration. Use it whenever you would otherwise have to guess.\n"
        "\n"
        "If you encounter a hard, unrecoverable failure (a command crashed, an SDK\n"
        "returned 500, a file is corrupt), use status=\"failed\" with a description\n"
        "in \"errors\".\n"
        "\n"
        "DISAMBIGUATION — two soft statuses already exist and they mean different\n"
        "things; use the right one:\n"
        "- \"info_needed\" — the CALLER must add missing context (table name,\n"
        " business decision, file contents, access grant) before the task can\n"
        " proceed. Used when ambiguity or missing input blocks you.\n"
        "- \"needs_approval\" — you have a concrete plan to do something destructive\n"
        " (drop a table, delete a job, modify permissions). You will execute it\n"
        " if and only if the caller explicitly approves. Used at the SAFETY\n"
        " boundary, never for ambiguity. See SAFETY section below.\n"
        "\n"
        "If both apply (e.g. \"I'd drop a table but I'm not sure which one\"), prefer\n"
        "\"info_needed\" — resolving the ambiguity first is cheaper than approving\n"
        "the wrong destructive action."
    )
class AppUrlCaptureMiddleware:
    """ASGI middleware that records the public host of inbound HTTP requests.

    For every scope of type ``"http"`` it takes the ``X-Forwarded-Host``
    header, falling back to ``Host``, decodes it as latin-1 and hands it to
    ``url_builder.capture_from_headers``. Other scope types pass straight
    through. Any failure while capturing is ignored so that it can never
    break the request itself.

    Caveat: /socket.io/ traffic is intercepted by socketio.ASGIApp *before*
    reaching mcp_starlette, so those requests never hit this middleware.
    """

    def __init__(self, app):
        # The wrapped ASGI application; every request is forwarded to it.
        self.app = app

    async def __call__(self, scope, receive, send):
        self._remember_host(scope)
        await self.app(scope, receive, send)

    @staticmethod
    def _remember_host(scope):
        """Best-effort capture of the request host; never raises on capture errors."""
        if scope.get("type") != "http":
            return

        header_map = dict(scope.get("headers") or [])
        raw_host = header_map.get(b"x-forwarded-host") or header_map.get(b"host")
        if not raw_host:
            return

        try:
            url_builder.capture_from_headers(raw_host.decode("latin-1"))
        except Exception:
            # Capturing the URL is an optimisation; the request must go on.
            pass
Databricks proxy handles auth, +# so Socket.IO CORS can safely allow all origins. Starlette CORSMiddleware below +# uses the same list for MCP/Flask routes. +_databricks_host = os.environ.get("DATABRICKS_HOST", "") +ALLOWED_ORIGINS = [] +if _databricks_host: + ALLOWED_ORIGINS.append(ensure_https(_databricks_host).rstrip("/")) + +# ── Import and initialize Flask app ──────────────────────────────── +from app import ( + app as flask_app, + initialize_app, + mcp_create_pty_session, + mcp_send_input, + mcp_close_pty_session, + register_sio_handlers, +) + +initialize_app() + +# Wire MCP tools to PTY infrastructure +set_app_hooks( + create_session_fn=mcp_create_pty_session, + send_input_fn=mcp_send_input, + close_session_fn=mcp_close_pty_session, +) + +# ── Async Socket.IO server (native ASGI WebSocket) ─────────────── +# python-socketio AsyncServer handles /socket.io/ with real WebSocket, +# eliminating the WSGIMiddleware limitation that forced HTTP polling fallback. +sio = socketio_lib.AsyncServer( + async_mode='asgi', + cors_allowed_origins='*', # App URL differs from DATABRICKS_HOST; proxy handles auth + logger=False, + engineio_logger=False, +) + +# Register terminal I/O event handlers (connect, join_session, terminal_input, etc.) +register_sio_handlers(sio) + +# ── Build the ASGI app per Genie Code docs ───────────────────────── +mcp_starlette = mcp_instance.streamable_http_app() + +# Mount Flask as catch-all via WSGI adapter (HTTP routes only) +flask_asgi = WSGIMiddleware(flask_app.wsgi_app) +mcp_starlette.mount("/", app=flask_asgi) + +# CORS for MCP and Flask routes +mcp_starlette.add_middleware( + CORSMiddleware, + allow_origins=ALLOWED_ORIGINS or ["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Capture X-Forwarded-Host into url_builder cache (for MCP viewer_url). +# Added AFTER CORS so it wraps the CORS-handled request. 
+mcp_starlette.add_middleware(AppUrlCaptureMiddleware) + +# ── Top-level ASGI app ──────────────────────────────────────────── +# socketio.ASGIApp intercepts /socket.io/ for WebSocket + polling, +# passes everything else to mcp_starlette (MCP at /mcp, Flask at /) +app = socketio_lib.ASGIApp(sio, other_asgi_app=mcp_starlette) diff --git a/coda_mcp/mcp_endpoint.py b/coda_mcp/mcp_endpoint.py new file mode 100644 index 0000000..d7730f1 --- /dev/null +++ b/coda_mcp/mcp_endpoint.py @@ -0,0 +1,179 @@ +"""Flask Blueprint fallback for MCP JSON-RPC. + +NOTE: This is NOT the production path. Production deployment uses +`coda_mcp.mcp_asgi:app` served by uvicorn, which mounts the native MCP +SDK Streamable HTTP transport at /mcp. This module is a Flask-native +JSON-RPC fallback used only under WSGI runtimes (gunicorn local dev, +tests that exercise the Flask test client without spinning up ASGI). + +Both paths expose the same four tools (coda_run, coda_inbox, +coda_get_result, coda_interactive) and produce equivalent JSON-RPC responses, +so switching between them is transparent to MCP clients. +""" +import asyncio +import json +import logging +from flask import Blueprint, request, jsonify + +logger = logging.getLogger(__name__) + +mcp_bp = Blueprint("mcp", __name__) + +# Import tool functions from mcp_server.py +from coda_mcp.mcp_server import ( + mcp as mcp_instance, + coda_run, + coda_inbox, + coda_get_result, + coda_interactive, +) + +# Tool function dispatch +_TOOL_DISPATCH = { + "coda_run": coda_run, + "coda_inbox": coda_inbox, + "coda_get_result": coda_get_result, + "coda_interactive": coda_interactive, +} + +SERVER_INFO = { + "name": "coda", + "version": "1.0.0", +} + +CAPABILITIES = { + "tools": {"listChanged": False}, +} + + + +def _cors_headers(): + """Build CORS response headers. + + Permissive CORS for /mcp — the Databricks Apps proxy handles auth. 
@mcp_bp.route("/mcp", methods=["POST", "OPTIONS", "GET"])
def mcp_handler():
    """Handle one MCP JSON-RPC request on the Flask (WSGI) fallback path.

    * ``OPTIONS`` answers the CORS preflight with 204.
    * ``GET`` is rejected with 405 because this path does not offer SSE.
    * ``POST`` dispatches on the JSON-RPC ``method``: ``initialize``,
      ``notifications/initialized``, ``tools/list``, ``tools/call`` and
      ``ping``. Anything else yields a ``-32601`` error.

    Malformed envelopes are answered with JSON-RPC errors instead of
    crashing the view: a body that is not a JSON object gives ``-32600``
    (HTTP 400); ``params`` or tool ``arguments`` that are not objects give
    ``-32602``.

    Returns:
        A Flask response. POST and OPTIONS responses carry the headers from
        ``_cors_headers()``.
    """

    def _with_cors(resp):
        # Shared by the preflight, the error exits and the normal exit.
        for k, v in _cors_headers().items():
            resp.headers[k] = v
        return resp

    def _rpc_error(req_id, code, message):
        return jsonify({
            "jsonrpc": "2.0", "id": req_id,
            "error": {"code": code, "message": message}
        })

    # Handle CORS preflight
    if request.method == "OPTIONS":
        resp = jsonify({})
        resp.status_code = 204
        return _with_cors(resp)

    # Handle GET for SSE (not supported in stateless mode)
    if request.method == "GET":
        resp = jsonify({"error": "SSE not supported. Use POST."})
        resp.status_code = 405
        return resp

    # Origin validation skipped — Databricks Apps proxy handles auth.

    data = request.get_json(silent=True) or {}
    if not isinstance(data, dict):
        # Batches (JSON arrays) and scalar bodies have no .get(); without
        # this guard they surfaced as an unhandled AttributeError / HTTP 500.
        resp = _rpc_error(
            None, -32600, "Invalid Request: expected a single JSON-RPC object"
        )
        resp.status_code = 400
        return _with_cors(resp)

    method = data.get("method", "")
    req_id = data.get("id")
    params = data.get("params")
    if params is None:
        # '"params": null' is treated like an omitted params member.
        params = {}
    if not isinstance(params, dict):
        return _with_cors(
            _rpc_error(req_id, -32602, "Invalid params: expected an object")
        )

    # Route by method
    if method == "initialize":
        result = {
            "protocolVersion": params.get("protocolVersion", "2025-03-26"),
            "capabilities": CAPABILITIES,
            "serverInfo": SERVER_INFO,
            "instructions": getattr(mcp_instance, "_instructions", ""),
        }
        resp = jsonify({"jsonrpc": "2.0", "id": req_id, "result": result})

    elif method == "notifications/initialized":
        # No-op acknowledgment — return empty OK
        resp = jsonify({})
        resp.status_code = 200

    elif method == "tools/list":
        tools = _build_tools_list()
        resp = jsonify({"jsonrpc": "2.0", "id": req_id, "result": {"tools": tools}})

    elif method == "tools/call":
        tool_name = params.get("name", "")
        arguments = params.get("arguments")
        if arguments is None:
            arguments = {}
        # A non-string name can be unhashable and would blow up the lookup.
        tool_fn = _TOOL_DISPATCH.get(tool_name) if isinstance(tool_name, str) else None
        if not tool_fn:
            resp = _rpc_error(req_id, -32601, f"Unknown tool: {tool_name}")
        elif not isinstance(arguments, dict):
            resp = _rpc_error(
                req_id, -32602, "Invalid params: 'arguments' must be an object"
            )
        else:
            try:
                # Tool functions are async — run them
                result_str = asyncio.run(tool_fn(**arguments))
                result_data = json.loads(result_str)
                resp = jsonify({
                    "jsonrpc": "2.0", "id": req_id,
                    "result": {
                        "content": [{"type": "text", "text": result_str}],
                        # Only an object with an "error" member marks a
                        # failure; lists/strings/numbers are plain results.
                        "isError": isinstance(result_data, dict) and "error" in result_data,
                    }
                })
            except Exception as e:
                logger.exception("MCP tool %s failed", tool_name)
                resp = _rpc_error(req_id, -32603, str(e))

    elif method == "ping":
        resp = jsonify({"jsonrpc": "2.0", "id": req_id, "result": {}})

    else:
        resp = _rpc_error(req_id, -32601, f"Method not found: {method}")

    # Add CORS headers
    return _with_cors(resp)
tool_manager = mcp_instance._tool_manager + for name, tool in tool_manager._tools.items(): + tool_dict = { + "name": tool.name, + "description": tool.description or "", + "inputSchema": tool.parameters if hasattr(tool, 'parameters') else {}, + } + if hasattr(tool, 'annotations') and tool.annotations: + tool_dict["annotations"] = {} + if tool.annotations.readOnlyHint is not None: + tool_dict["annotations"]["readOnlyHint"] = tool.annotations.readOnlyHint + if tool.annotations.destructiveHint is not None: + tool_dict["annotations"]["destructiveHint"] = tool.annotations.destructiveHint + if tool.annotations.idempotentHint is not None: + tool_dict["annotations"]["idempotentHint"] = tool.annotations.idempotentHint + tools.append(tool_dict) + return tools diff --git a/coda_mcp/mcp_server.py b/coda_mcp/mcp_server.py new file mode 100644 index 0000000..c1a812c --- /dev/null +++ b/coda_mcp/mcp_server.py @@ -0,0 +1,760 @@ +"""MCP server exposing CoDA session/task tools via FastMCP. + +v2: Background execution + inbox pattern. +- ``coda_run`` — fire-and-forget task submission (auto-creates ephemeral session) +- ``coda_inbox`` — dashboard of all background tasks +- ``coda_get_result`` — pull full structured result for a completed task + +Delegates all disk state to ``task_manager.py``. PTY operations are +handled through app hooks (create/send/close) set via ``set_app_hooks()``. 
+ +Run standalone for testing:: + + python mcp_server.py # stdio transport +""" + +import asyncio +import json +import logging +import os +import re +import shlex +import shutil +import threading +import time + +from mcp.server.fastmcp import FastMCP +from mcp.server.fastmcp.server import TransportSecuritySettings +from mcp.types import ToolAnnotations + +from coda_mcp import task_manager +from coda_mcp import url_builder + +logger = logging.getLogger(__name__) + +# ── FastMCP instance ──────────────────────────────────────────────── + +# Build allowed origins from DATABRICKS_HOST for Genie Code requests +_databricks_host = os.environ.get("DATABRICKS_HOST", "") +_allowed_origins = [] +if _databricks_host: + # Ensure https:// prefix, strip trailing slash + origin = _databricks_host if _databricks_host.startswith("https://") else f"https://{_databricks_host}" + _allowed_origins.append(origin.rstrip("/")) + +mcp = FastMCP( + "coda", + instructions=( + "CoDA MCP server — delegate coding tasks to AI agents on Databricks.\n\n" + "CRITICAL — FIRE AND FORGET:\n" + "coda_run submits work and returns IMMEDIATELY. The task runs autonomously " + "in the background. After calling coda_run, DO NOT call coda_inbox or " + "coda_get_result to check on it. Do NOT loop, poll, or wait. Simply tell " + "the user the task was submitted and MOVE ON to their next request.\n\n" + "WHEN TO CHECK INBOX:\n" + "Call coda_inbox ONLY when the user explicitly asks about background tasks " + "(e.g. 'how's my task going?', 'check on that', 'what's in my inbox'). " + "Never call it proactively, automatically, or in a loop.\n\n" + "WORKFLOW:\n" + "1) coda_run — submit work, get back task_id. Tell user it's running. Stop.\n" + "2) Continue chatting about other topics — the task runs independently.\n" + "3) coda_inbox — ONLY when user asks. 
Shows all tasks from last 24h.\n" + "4) coda_get_result — for completed tasks, get full structured output.\n\n" + "CHAINING: pass previous_session_id from a completed task's session_id " + "to give the new task context of what was done before.\n\n" + "INFO_NEEDED HANDOFF: When coda_inbox shows a task with status='info_needed', " + "the agent could not proceed because of missing context. Call coda_get_result " + "to read the 'feedback' field — it tells you exactly what the agent needs (a " + "table name, a decision, a clarification). Add that context to the prompt and " + "resubmit via coda_run with previous_session_id set to the original task's " + "session_id so the agent has the prior attempt's context. 'needs_approval' is " + "similar but means the agent has a destructive plan and is waiting for the " + "caller's explicit go/no-go.\n\n" + "SHARE THE REPLAY URL: When coda_run returns a viewer_url field (non-null), " + "mention it to the user in plain text (e.g. \"you can view the session replay " + "at \"). The URL is a read-only static replay showing the prompt, the " + "agent's work, and the final output. It reflects the task's progress while " + "running, then the full transcript once it completes — and remains valid " + "indefinitely after that. It is safe to share: it points to the same " + "Databricks App the user is already authenticated against. Do this on the " + "first mention of the task and any time the user asks where the task is or " + "how to see what it did.\n\n" + "INTERACTIVE HANDOFF (coda_interactive): When the user wants a human to " + "drive a coding agent in CoDA — not autonomous execution — call " + "coda_interactive instead of coda_run. The tool reads files from a " + "directory that already exists in the Databricks Workspace (a Git " + "Folder or a plain Workspace folder — either works). IMPORTANT: this " + "tool runs inside CoDA on Databricks and reads ONLY from the Databricks " + "Workspace — it CANNOT see your local filesystem. 
def set_app_hooks(
    create_session_fn,
    send_input_fn,
    close_session_fn,
):
    """Install the callbacks this module uses to drive PTY sessions.

    The three callables are stored in the module-level hook slots
    ``_app_create_session``, ``_app_send_input`` and ``_app_close_session``,
    replacing whatever was registered before.

    Args:
        create_session_fn: Stored as the "create PTY session" hook.
        send_input_fn: Stored as the "send input to PTY" hook.
        close_session_fn: Stored as the "close PTY session" hook.
    """
    global _app_create_session, _app_send_input, _app_close_session
    _app_create_session, _app_send_input, _app_close_session = (
        create_session_fn,
        send_input_fn,
        close_session_fn,
    )
def _abandon_failed_launch(session_id, task_id, pty_session_id, error) -> None:
    """Best-effort rollback for a ``coda_run`` launch that failed before its watcher started.

    Without this, a session that ``create_task`` already marked ``busy`` would
    stay busy (no watcher exists to complete it) and keep counting against
    ``MAX_CONCURRENT_TASKS``.

    - Closes the PTY if one was created.
    - If a task exists, records a ``failed`` result and completes the task
      (which closes the session); otherwise just closes the session.

    Never raises: rollback problems are logged at debug level so the caller can
    still report the original *error*.
    """
    if pty_session_id is not None and _app_close_session is not None:
        try:
            _app_close_session(pty_session_id)
        except Exception:
            logger.debug(
                "coda_run: could not close PTY %s", pty_session_id, exc_info=True
            )

    if session_id is None:
        return

    try:
        if task_id is None:
            task_manager.close_session(session_id)
            return
        result_path = os.path.join(
            task_manager._task_dir(session_id, task_id), "result.json"
        )
        # Same result shape the watcher writes on timeout.
        task_manager._write_json(result_path, {
            "status": "failed",
            "summary": "Task could not be launched",
            "files_changed": [],
            "artifacts": [],
            "errors": [str(error)],
        })
        task_manager.complete_task(session_id, task_id)
    except Exception:
        logger.debug(
            "coda_run: rollback failed for session %s", session_id, exc_info=True
        )


@mcp.tool(
    annotations=ToolAnnotations(
        readOnlyHint=False,
        destructiveHint=False,
        idempotentHint=False,
    ),
)
async def coda_run(
    prompt: str,
    email: str,
    context: str = "{}",
    previous_session_id: str = "",
    permissions: str = "smart",
    timeout_s: int = 3600,
    workflow_protocol: bool = True,
) -> str:
    """Submit a coding task — FIRE AND FORGET.

    Returns IMMEDIATELY with a task_id. The task runs autonomously in the
    background. After receiving the response, tell the user the task was
    submitted and move on. Do NOT follow up with coda_inbox or coda_get_result
    unless the user explicitly asks to check status later.

    ``context`` is a JSON string with Unity Catalog metadata (tables, schemas).
    ``previous_session_id`` chains to a prior task's session for context continuity.
    ``permissions`` can be ``"smart"`` (default, safe) or ``"yolo"`` (auto-approve all).

    ``workflow_protocol`` defaults to True, which injects a Databricks
    orientation block and a 3-phase workflow protocol (PLAN/EXECUTE/SYNTHESIZE
    with critique at each phase) into the agent's prompt. The protocol also
    defines the ``info_needed`` terminal status for clean handoff when the
    agent is blocked. Set False to skip — useful for non-Databricks tasks.

    Returns JSON with ``task_id``, ``session_id``, and ``status: "running"``.
    On any failure returns JSON with ``status: "error"`` and an ``error``
    message; a launch that fails part-way is rolled back so it does not keep
    occupying a concurrency slot.
    """
    # Tracked outside the try so the except branch knows what to roll back.
    session_id = None
    task_id = None
    pty_session_id = None
    watcher_started = False
    try:
        # Check concurrency limit
        running = task_manager.count_running_tasks()
        if running >= task_manager.MAX_CONCURRENT_TASKS:
            return json.dumps({
                "status": "error",
                "error": f"Concurrency limit reached ({task_manager.MAX_CONCURRENT_TASKS} "
                         f"tasks running). Try again when a task completes.",
            })

        # Parse context JSON
        try:
            ctx = json.loads(context) if context else None
        except json.JSONDecodeError:
            return json.dumps({
                "status": "error",
                "error": f"Invalid JSON in context parameter: {context!r}",
            })

        # Auto-create ephemeral session
        session_result = task_manager.create_session(email, "", label="hermes-mcp")
        session_id = session_result["session_id"]

        # Create task first (we need task_id to compute transcript_path).
        result = task_manager.create_task(
            session_id=session_id,
            prompt=prompt,
            email=email,
            context=ctx,
            timeout_s=timeout_s,
            permissions=permissions,
            previous_session_id=previous_session_id or None,
            workflow_protocol=workflow_protocol,
        )
        task_id = result["task_id"]
        tdir = task_manager._task_dir(session_id, task_id)

        if _app_create_session is not None:
            pty_session_id = _app_create_session(
                label="hermes-mcp",
                transcript_path=os.path.join(tdir, "transcript.log"),
                replay_only=True,  # coda_run URLs are post-hoc review only
            )
            task_manager._update_session_field(
                session_id, "pty_session_id", pty_session_id
            )

        # Send to PTY if hooks are wired
        if _app_send_input is not None and pty_session_id is not None:
            prompt_path = os.path.join(tdir, "prompt.txt")
            # Quote the path the same way coda_interactive quotes its shell
            # arguments, so an unusual HOME cannot break the command line.
            cmd = f"hermes -z {shlex.quote(prompt_path)}"
            if permissions == "yolo":
                cmd += " --yolo"
            _app_send_input(pty_session_id, cmd + "\n")

        # Start background watcher
        t = threading.Thread(
            target=_watch_task,
            args=(session_id, task_id, timeout_s),
            daemon=True,
        )
        t.start()
        watcher_started = True

        return json.dumps({
            "task_id": task_id,
            "session_id": session_id,
            "status": "running",
            "viewer_url": url_builder.build_viewer_url(pty_session_id) if pty_session_id else None,
        })

    except Exception as exc:
        # Once the watcher is running it owns the task's lifecycle; before
        # that, nothing else would ever release the session.
        if not watcher_started:
            _abandon_failed_launch(session_id, task_id, pty_session_id, exc)
        return json.dumps({"status": "error", "error": str(exc)})
def _normalize_workspace_path(workspace_path: str) -> str:
    """Return *workspace_path* in the unprefixed Workspace API form.

    Trailing slashes are removed first. If what remains begins with the
    ``/Workspace/`` FUSE prefix, the leading ``/Workspace`` segment is dropped
    (``/Workspace/Users/x`` -> ``/Users/x``). Any other path, including a bare
    ``/Workspace``, is returned with only the trailing slashes removed.
    """
    fuse_root = "/Workspace"
    trimmed = workspace_path.rstrip("/")
    if not trimmed.startswith(fuse_root + "/"):
        return trimmed
    return trimmed[len(fuse_root):]
def _buffer_text(chunks) -> str:
    """Decode a PTY output_buffer (list of bytes/str chunks) into one string.

    Byte chunks are decoded as UTF-8 with an *incremental* decoder, so a
    multi-byte character that a PTY read happened to split across two
    consecutive chunks is reassembled instead of turning into replacement
    characters. Invalid bytes still decode to U+FFFD. Non-bytes chunks are
    converted with ``str()``; a partial byte sequence pending in front of such
    a chunk is flushed (as U+FFFD) first so output order is preserved.
    """
    import codecs  # local import: only this helper needs it

    decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
    parts = []
    for chunk in chunks:
        if isinstance(chunk, (bytes, bytearray)):
            parts.append(decoder.decode(bytes(chunk)))
        else:
            # A text chunk ends any byte run: flush what the decoder holds.
            parts.append(decoder.decode(b"", final=True))
            parts.append(str(chunk))
    # Flush a trailing incomplete sequence, if any.
    parts.append(decoder.decode(b"", final=True))
    return "".join(parts)
def _discard_interactive_session(pty_session_id, project_dir) -> None:
    """Best-effort teardown of a half-started interactive session.

    Closes the PTY (when one exists and a close hook is wired) and removes the
    per-session project directory. Never raises.
    """
    if pty_session_id and _app_close_session is not None:
        try:
            _app_close_session(pty_session_id)
        except Exception:
            logger.debug(
                "coda_interactive: could not close PTY %s",
                pty_session_id,
                exc_info=True,
            )
    if project_dir and os.path.isdir(project_dir):
        shutil.rmtree(project_dir, ignore_errors=True)


@mcp.tool(
    annotations=ToolAnnotations(
        readOnlyHint=False,
        destructiveHint=False,
        idempotentHint=False,
    ),
)
async def coda_interactive(
    prompt: str,
    workspace_path: str,
    agent: str = "claude",
    email: str = "",
) -> str:
    """Launch an interactive agent session in CoDA, handed off via a viewer URL.

    The MCP caller passes a Databricks Workspace directory path. This path must
    already exist in the Databricks Workspace — the tool runs inside CoDA and
    CANNOT read your local filesystem. If you are a local agent and the project
    files live locally, FIRST upload them, e.g.
    ``databricks workspace import-dir <local-dir> /Workspace/Users/<you>/<project>``,
    then pass that ``/Workspace/Users/...`` path. CoDA pulls that folder onto the
    session's disk IN THE TERMINAL (authenticated as you) via
    ``databricks workspace export-dir``, launches the chosen agent (claude
    default) in the pulled directory, seeds ``prompt`` as the first user input,
    and returns a ``viewer_url`` the calling user opens to drive it.

    If the pull produces no files (bad path or no read access) the tool returns
    a ``status=error`` and does not launch the agent.

    Interactive sessions do NOT appear in ``coda_inbox`` and ``coda_get_result``
    will not return anything for them. The viewer URL is the only handle.

    ``email`` is accepted for forward-compatibility and is currently unused.

    Allowed agents: claude (default), hermes, codex, gemini, opencode.
    """
    if agent not in _ALLOWED_AGENTS:
        return json.dumps({
            "status": "error",
            "error": f"Unknown agent: {agent!r}. Allowed: {sorted(_ALLOWED_AGENTS)}",
        })

    if _app_create_session is None or _app_send_input is None:
        return json.dumps({
            "status": "error",
            "error": "PTY hook not wired",
        })

    pty_session_id = None
    project_dir = None
    try:
        source_path = _normalize_workspace_path(workspace_path)
        if not source_path.strip():
            # Nothing to export: fail before a PTY and project dir are created.
            return json.dumps({
                "status": "error",
                "error": "workspace_path must be a non-empty Workspace directory path",
            })

        # Create PTY FIRST so we have its session_id for the project_dir name.
        pty_session_id = _app_create_session(
            label=f"{agent}-interactive",
            replay_only=False,
        )
        project_dir = os.path.join(
            os.path.expanduser("~/.coda/projects"),
            pty_session_id,
        )
        os.makedirs(project_dir, exist_ok=True)

        name = _safe_dirname(workspace_path)
        target_dir = os.path.join(project_dir, name)

        # Pull the Workspace folder into ./<name> AS THE USER (terminal creds).
        # The tail echoes a completion marker so we detect success/failure WITHOUT
        # relying on output timing — the databricks CLI cold-starts silently for
        # ~2s before writing files, so a "wait for output to go quiet" heuristic
        # races it and checks the disk too early. The marker tokens are split
        # across string literals (echo "CODA""_PULL_""OK") so their contiguous
        # form appears in the PTY output ONLY when the echo runs, never in the
        # shell's echo of the typed command line. A failed export-dir
        # short-circuits the && chain, so OK never prints and || echoes FAIL.
        pull_cmd = (
            f"cd {shlex.quote(project_dir)} && "
            f"databricks workspace export-dir {shlex.quote(source_path)} "
            f"{shlex.quote('./' + name)} && "
            f"cd {shlex.quote(name)} "
            f'&& echo "CODA""_PULL_""OK" || echo "CODA""_PULL_""FAIL"'
        )
        _app_send_input(pty_session_id, pull_cmd + "\n")

        outcome = await _wait_for_pull(pty_session_id, target_dir)
        if outcome != "ok":
            _discard_interactive_session(pty_session_id, project_dir)
            if outcome == "timeout":
                msg = (
                    f"Timed out pulling files from {workspace_path} after "
                    f"{int(_PULL_MAX_WAIT_S)}s — the export may be very large or "
                    f"`databricks workspace export-dir` is hung."
                )
            else:
                msg = (
                    f"Failed to pull files from {workspace_path}. Check the path "
                    f"exists in the Workspace and that you have read access "
                    f"(ran `databricks workspace export-dir`)."
                )
            return json.dumps({"status": "error", "error": msg})

        # Kickoff prompt with a one-line context prefix naming the source. It
        # must be ONE line to be safe both as a quoted CLI arg and as typed
        # input: an embedded newline would submit the typed text early (or
        # trigger shell line-continuation inside a quote). The caller's prompt
        # and path may contain line breaks, so fold them into spaces here.
        seeded_prompt = " ".join(
            (
                f"Your working directory holds files exported from the Databricks "
                f"Workspace path {workspace_path}. {prompt}"
            ).splitlines()
        )

        # Launch the agent. Agents in _AGENT_AUTO_LAUNCH accept an auto-accept
        # flag + the prompt as a positional arg, so we launch in ONE atomic
        # command: no trust/permission dialog blocks the handoff, and the prompt
        # isn't subject to TUI cold-start timing. Other agents fall back to
        # launch -> wait-for-ready -> type the prompt.
        auto_launch = _AGENT_AUTO_LAUNCH.get(agent)
        if auto_launch is not None:
            _app_send_input(
                pty_session_id, f"{auto_launch} {shlex.quote(seeded_prompt)}\n"
            )
        else:
            _app_send_input(pty_session_id, _AGENT_LAUNCH_CMDS[agent] + "\n")
            await _wait_for_agent_ready(pty_session_id)
            _app_send_input(pty_session_id, seeded_prompt + "\n")

        viewer_url = url_builder.build_viewer_url(pty_session_id)

        return json.dumps({
            "status": "launched",
            "viewer_url": viewer_url,
            "agent": agent,
            "project_dir": target_dir,
            "workspace_path": workspace_path,
            "instructions": (
                "Open viewer_url to attach. The agent is running in a directory "
                "holding the files pulled from your Workspace folder, with your "
                "kickoff prompt typed. Type the agent's quit command (e.g. /quit) "
                "then `exit` to end the session. Note: files are a snapshot pulled "
                "via 'databricks workspace export-dir' — git history is not included."
            ),
        })
    except Exception as e:
        # Catch-all: ensure no resource leak.
        _discard_interactive_session(pty_session_id, project_dir)
        return json.dumps({
            "status": "error",
            "error": f"coda_interactive failed: {e}",
        })
@mcp.tool(
    annotations=ToolAnnotations(
        readOnlyHint=True,
        destructiveHint=False,
        idempotentHint=True,
    ),
)
async def coda_get_result(
    task_id: str,
    session_id: str,
) -> str:
    """Retrieve the structured result of a completed task.

    Call this AFTER coda_inbox shows a task as "completed", "failed",
    "info_needed", or "needs_approval".

    Returns JSON with ``task_id``, ``session_id``, ``status``, ``summary``
    (what was done or why the agent stopped), ``files_changed`` (list of
    modified files), ``artifacts`` (job IDs, commit hashes, etc.),
    ``errors`` (if any), and — when status is "info_needed" — ``feedback``
    (a precise description of what context the caller must add before
    resubmitting).

    When no result is available yet, returns the task's current ``status``
    plus a ``message``. A ``not_found`` status means no status log exists for
    the given IDs, which is reported as such rather than as "in progress".
    """
    try:
        result = task_manager.get_task_result(task_id, session_id)
        if result is None:
            # No result yet — return current status
            status = task_manager.get_task_status(task_id, session_id)
            current = status.get("status", "unknown")
            if current == "not_found":
                message = "No status recorded for this task — check task_id and session_id."
            else:
                message = "Result not yet available — task is still in progress."
            return json.dumps({
                "task_id": task_id,
                "session_id": session_id,
                "status": current,
                "message": message,
            })

        if not isinstance(result, dict):
            # The agent wrote valid JSON that is not an object; report that
            # clearly instead of failing on item assignment below.
            return json.dumps({
                "status": "error",
                "task_id": task_id,
                "error": "result.json does not contain a JSON object",
            })

        result["task_id"] = task_id
        result["session_id"] = session_id
        # Ensure standard fields exist
        result.setdefault("status", "done")
        result.setdefault("summary", "")
        result.setdefault("files_changed", [])
        result.setdefault("artifacts", [])
        result.setdefault("errors", [])
        # Decorate with viewer_url if known
        sess = task_manager._read_session_safe(session_id)
        pty = sess.get("pty_session_id") if sess else None
        if pty:
            vu = url_builder.build_viewer_url(pty)
            if vu:
                result["viewer_url"] = vu
        return json.dumps(result)
    except Exception as exc:
        return json.dumps({"status": "error", "task_id": task_id, "error": str(exc)})
+ +Layout on disk +-------------- +~/.coda/sessions/{session-id}/ + session.json – session metadata + tasks/{task-id}/ + prompt.txt – wrapped prompt sent to the agent + meta.json – task metadata (email, timestamps, chaining) + status.jsonl – append-only progress log + result.json – final output (written by the agent) +""" + +import json +import os +import secrets +import time +import logging + +from coda_mcp.databricks_preamble import build_capabilities, build_workflow_protocol + +logger = logging.getLogger(__name__) + +# ── Root directory (patched in tests) ──────────────────────────────── + +SESSIONS_DIR = os.path.join( + os.environ.get("HOME", "/app/python/source_code"), ".coda", "sessions" +) + +# ── Concurrency limit ─────────────────────────────────────────────── + +MAX_CONCURRENT_TASKS = int(os.environ.get("CODA_MAX_CONCURRENT", "5")) + +# ── Task TTL (seconds) ────────────────────────────────────────────── + +TASK_TTL_S = int(os.environ.get("CODA_TASK_TTL", str(24 * 3600))) # 24h + +# ── PTY → task-dir reverse lookup (used by attach_session replay fallback) ── + +_pty_lookup_cache: dict[str, tuple[str, float]] = {} # pty_id -> (task_dir, ts) +_PTY_LOOKUP_TTL = 60.0 # seconds + +# ── Exceptions ─────────────────────────────────────────────────────── + + +class SessionBusyError(Exception): + """Raised when a task is submitted to a session that already has one running.""" + + +class SessionNotFoundError(Exception): + """Raised when the requested session does not exist or is closed.""" + + +class ConcurrencyLimitError(Exception): + """Raised when MAX_CONCURRENT_TASKS running tasks already exist.""" + + +# ── ID generators ──────────────────────────────────────────────────── + + +def _new_session_id() -> str: + return f"sess-{secrets.token_hex(6)}" + + +def _new_task_id() -> str: + return f"task-{secrets.token_hex(4)}" + + +# ── Low-level I/O ──────────────────────────────────────────────────── + + +def _session_dir(session_id: str) -> str: + return 
def _write_json(path: str, data: dict) -> None:
    """Atomically write *data* as indented JSON to *path*.

    The JSON is written to a uniquely named temporary file next to *path* and
    then moved into place with ``os.replace``, so readers never observe a
    partially written file. Missing parent directories are created.

    If serialization or the rename fails, the temporary file is removed, an
    existing *path* is left untouched, and the exception propagates.
    """
    parent = os.path.dirname(path)
    if parent:  # a bare filename has no parent to create
        os.makedirs(parent, exist_ok=True)

    # Unique suffix: two writers targeting the same path never share a temp file.
    tmp = f"{path}.{secrets.token_hex(4)}.tmp"
    try:
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        os.replace(tmp, path)
    except BaseException:
        try:
            os.unlink(tmp)
        except OSError:
            pass  # temp file was never created or is already gone
        raise
def close_session(session_id: str) -> None:
    """Mark a session as closed.

    Reads session.json once, sets ``status`` to ``"closed"``, stamps
    ``closed_at`` (as ``complete_task`` does; an existing stamp is kept), and
    writes the file back atomically.

    Raises SessionNotFoundError if the session is missing or corrupt.
    """
    session = _read_session(session_id)  # doubles as the existence check
    session["status"] = "closed"
    session.setdefault("closed_at", time.time())
    _write_json(_session_file(session_id), session)
    logger.info("Closed session %s", session_id)
+ """ + context_block = "" + if context: + context_block = f"\nCONTEXT:\n{json.dumps(context, indent=2)}\n" + + hint_line = "" + if context_hint: + hint_line = f"context_hint: {context_hint}\n" + + prior_session_block = "" + if previous_session_id: + prior_dir = _session_dir(previous_session_id) + prior_session_block = ( + f"\nPRIOR SESSION: {previous_session_id}\n" + f"Read {prior_dir}/tasks/*/result.json for context on prior work.\n" + ) + + workflow_block = "" + if workflow_protocol: + workflow_block = ( + f"\nCAPABILITIES:\n" + f"{build_capabilities()}\n" + f"\n" + f"WORKFLOW PROTOCOL:\n" + f"{build_workflow_protocol()}\n" + ) + + return ( + f"---CODA-TASK---\n" + f"task_id: {task_id}\n" + f"session_id: {session_id}\n" + f"user: {email}\n" + f"{hint_line}" + f"{prior_session_block}" + f"{context_block}\n" + f"TASK:\n" + f"{prompt}\n" + f"{workflow_block}" + f"\n" + f"INSTRUCTIONS:\n" + f"1. As you work, append progress lines to {results_dir}/status.jsonl\n" + f' Each line must be valid JSON: {{"step": "label", "message": "what you are doing"}}\n' + f" Canonical step labels (use these when the workflow protocol is active):\n" + f" plan, critique_plan, execute_, critique_execute,\n" + f" synthesize, critique_synthesize, info_needed, failed\n" + f"\n" + f"2. 
When you are COMPLETELY DONE, write a SINGLE FILE at this exact path:\n" + f" {results_dir}/result.json\n" + f" It must contain this JSON structure (status is one of the four\n" + f" values listed below; the angle-bracketed placeholder is NOT literal\n" + f" JSON — pick exactly one of the four values):\n" + f" {{\n" + f' "status": "",\n' + f' "summary": "one paragraph describing what you did or why you stopped",\n' + f' "feedback": "REQUIRED if status=info_needed — what context the caller must add",\n' + f' "files_changed": ["list", "of", "file", "paths"],\n' + f' "artifacts": {{}},\n' + f' "errors": []\n' + f" }}\n" + f" - status=\"completed\": you finished the task.\n" + f" - status=\"failed\": unrecoverable hard error; describe in errors[].\n" + f" - status=\"info_needed\": you are blocked because something the CALLER must\n" + f" supply is missing. The feedback field is REQUIRED and must precisely\n" + f" name what is missing. The caller will resubmit with more context.\n" + f" - status=\"needs_approval\": you have a destructive action ready but need\n" + f" explicit caller approval before executing. See SAFETY section.\n" + f" IMPORTANT: result.json is a FILE not a directory. Write it with:\n" + f" echo '{{...}}' > {results_dir}/result.json\n" + f"\n" + f"3. If you delegate to a sub-agent, update status.jsonl with delegation steps.\n" + f"\n" + f"SAFETY:\n" + f"- Do NOT delete, drop, or truncate tables, schemas, catalogs, or volumes.\n" + f"- Do NOT delete files outside the current project directory.\n" + f"- Do NOT run destructive Databricks CLI commands (e.g. databricks clusters delete, " + f"databricks jobs delete, databricks pipelines delete).\n" + f"- Do NOT modify permissions, grants, or access controls unless explicitly requested.\n" + f"- Prefer CREATE OR REPLACE over DROP+CREATE. 
Prefer INSERT/MERGE over DELETE+INSERT.\n" + f"- If the task requires a destructive operation, describe what you would do in " + f"result.json with status \"needs_approval\" instead of executing it.\n" + f"---END-CODA-TASK---" + ) + + +# ── Task lifecycle ─────────────────────────────────────────────────── + + +def create_task( + session_id: str, + prompt: str, + email: str, + context: dict | None = None, + context_hint: str | None = None, + timeout_s: int | None = None, + permissions: str | None = None, + previous_session_id: str | None = None, + workflow_protocol: bool = True, +) -> dict: + """Create a task inside an existing session. + + Raises + ------ + SessionNotFoundError + If the session does not exist or is closed. + SessionBusyError + If the session already has a running task. + + Returns ``{"task_id": "task-…", "status": "running"}``. + """ + session = _read_session(session_id) + + if session.get("status") == "closed": + raise SessionNotFoundError(f"Session {session_id} is closed") + + if session.get("status") == "busy": + raise SessionBusyError( + f"Session {session_id} already has a running task: " + f"{session.get('current_task')}" + ) + + task_id = _new_task_id() + tdir = _task_dir(session_id, task_id) + os.makedirs(tdir, exist_ok=True) + + # Write wrapped prompt + results_dir = os.path.join(tdir, "results") + wrapped = wrap_prompt( + task_id=task_id, + session_id=session_id, + email=email, + prompt=prompt, + context=context, + results_dir=results_dir, + context_hint=context_hint, + previous_session_id=previous_session_id, + workflow_protocol=workflow_protocol, + ) + with open(os.path.join(tdir, "prompt.txt"), "w") as f: + f.write(wrapped) + + # Write meta.json for inbox scanning + now = time.time() + meta = { + "email": email, + "created_at": now, + "previous_session_id": previous_session_id or "", + "permissions": permissions or "smart", + "timeout_s": timeout_s or 3600, + "prompt_summary": prompt[:100], + } + _write_json(os.path.join(tdir, 
def get_task_status(task_id: str, session_id: str) -> dict:
    """Return the latest status.jsonl entry for the task, always with a ``status``.

    Progress lines appended by the agent have the form
    ``{"step": ..., "message": ...}`` and carry no ``status`` key. When the
    last entry is such a line, the most recent ``status`` seen earlier in the
    log is carried forward into the returned dict, so a running task keeps
    reporting ``"running"``.

    Lines that are blank, not valid JSON (e.g. a partially appended line), or
    not JSON objects are skipped.

    Returns ``{"status": "not_found"}`` if the status log is missing,
    unreadable, or holds no usable entry.
    """
    status_path = os.path.join(_task_dir(session_id, task_id), "status.jsonl")
    latest = None
    latest_status = None
    try:
        with open(status_path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if not isinstance(entry, dict):
                    continue
                latest = entry
                if "status" in entry:
                    latest_status = entry["status"]
    except OSError:
        return {"status": "not_found"}

    if latest is None:
        return {"status": "not_found"}
    if "status" not in latest and latest_status is not None:
        latest = {**latest, "status": latest_status}
    return latest
+ + Appends a ``done`` entry to status.jsonl, adds task_id to + ``completed_tasks``, and closes the session (v2: ephemeral sessions). + """ + session = _read_session(session_id) + + # Append done to status log + status_path = os.path.join(_task_dir(session_id, task_id), "status.jsonl") + with open(status_path, "a") as f: + f.write(json.dumps({"status": "done", "ts": time.time()}) + "\n") + + # Update session — auto-close (v2: sessions are ephemeral) + session["status"] = "closed" + session["current_task"] = None + session["closed_at"] = time.time() + if task_id not in session["completed_tasks"]: + session["completed_tasks"].append(task_id) + _write_json(_session_file(session_id), session) + + logger.info("Completed task %s in session %s (auto-closed)", task_id, session_id) + + +# ── Inbox: list all tasks across sessions ─────────────────────────── + + +def list_all_tasks(email: str = "", status_filter: str = "") -> list[dict]: + """Scan all sessions and return a flat list of tasks for the inbox. + + Returns tasks from the last ``TASK_TTL_S`` seconds, sorted most recent first. + Each entry includes task_id, session_id, status, elapsed_s, prompt_summary, + summary (if completed), progress (if running), previous_session_id, created_at. 
+ """ + now = time.time() + cutoff = now - TASK_TTL_S + tasks = [] + + if not os.path.isdir(SESSIONS_DIR): + return tasks + + for sess_name in os.listdir(SESSIONS_DIR): + sess_dir = os.path.join(SESSIONS_DIR, sess_name) + if not os.path.isdir(sess_dir): + continue + + tasks_dir = os.path.join(sess_dir, "tasks") + if not os.path.isdir(tasks_dir): + continue + + for task_name in os.listdir(tasks_dir): + task_dir = os.path.join(tasks_dir, task_name) + if not os.path.isdir(task_dir): + continue + + # Read meta.json + meta_path = os.path.join(task_dir, "meta.json") + try: + with open(meta_path) as f: + meta = json.load(f) + except (OSError, json.JSONDecodeError): + # Legacy task without meta.json — skip or build minimal entry + meta = {} + + created_at = meta.get("created_at", 0) + if created_at < cutoff: + continue + + # Filter by email + if email and meta.get("email", "") != email: + continue + + # Determine task status from status.jsonl + task_status = _read_last_status(task_dir) + + # Check for result.json to determine completion + result_path = _find_result_json(task_dir) + summary = "" + if result_path: + try: + with open(result_path) as f: + result_data = json.load(f) + task_status = result_data.get("status", "completed") + summary = result_data.get("summary", "") + except (OSError, json.JSONDecodeError): + pass + + # Filter by status + if status_filter and task_status != status_filter: + continue + + # Get progress for running tasks + progress = "" + if task_status == "running": + progress = _read_last_progress(task_dir) + + elapsed_s = round(now - created_at, 1) + + entry = { + "task_id": task_name, + "session_id": sess_name, + "status": task_status, + "elapsed_s": elapsed_s, + "prompt_summary": meta.get("prompt_summary", ""), + "previous_session_id": meta.get("previous_session_id", ""), + "created_at": created_at, + } + if summary: + entry["summary"] = summary + if progress: + entry["progress"] = progress + + tasks.append(entry) + + # Sort most recent first + 
tasks.sort(key=lambda t: t["created_at"], reverse=True) + return tasks + + +def _read_last_status(task_dir: str) -> str: + """Read the last status from status.jsonl.""" + status_path = os.path.join(task_dir, "status.jsonl") + try: + last = None + with open(status_path) as f: + for line in f: + line = line.strip() + if line: + last = json.loads(line) + return (last or {}).get("status", "unknown") + except (OSError, json.JSONDecodeError): + return "unknown" + + +def _read_last_progress(task_dir: str) -> str: + """Read the last progress message from status.jsonl.""" + status_path = os.path.join(task_dir, "status.jsonl") + try: + last = None + with open(status_path) as f: + for line in f: + line = line.strip() + if line: + last = json.loads(line) + return (last or {}).get("message", "") + except (OSError, json.JSONDecodeError): + return "" + + +# ── Concurrency check ────────────────────────────────────────────── + + +def count_running_tasks() -> int: + """Count tasks currently in 'running' state across all sessions.""" + count = 0 + if not os.path.isdir(SESSIONS_DIR): + return count + + for sess_name in os.listdir(SESSIONS_DIR): + sess_file = os.path.join(SESSIONS_DIR, sess_name, "session.json") + try: + with open(sess_file) as f: + session = json.load(f) + if session.get("status") == "busy": + count += 1 + except (OSError, json.JSONDecodeError): + continue + return count + + +# ── PTY → task-dir reverse lookup ────────────────────────────────── + + +def find_task_dir_by_pty_session(pty_session_id: str) -> str | None: + """Find the task dir whose session.json carries this pty_session_id. + + Returns the path to the active task dir, or — if the session has completed — + the most recently completed task dir. Returns None on no match. + + Cached for ``_PTY_LOOKUP_TTL`` seconds to avoid disk scans on every browser + refresh. + + Invariant: CoDA MCP sessions are ephemeral — one task per session. 
If the + lifecycle ever changes to allow multiple tasks per session, this function + must be revisited to pick the in-progress task rather than + ``completed_tasks[-1]``. + """ + now = time.time() + cached = _pty_lookup_cache.get(pty_session_id) + if cached and (now - cached[1]) < _PTY_LOOKUP_TTL: + return cached[0] + + if not os.path.isdir(SESSIONS_DIR): + return None + + for sess_name in os.listdir(SESSIONS_DIR): + sess_file = os.path.join(SESSIONS_DIR, sess_name, "session.json") + try: + with open(sess_file) as f: + data = json.load(f) + except (OSError, json.JSONDecodeError): + continue + + if data.get("pty_session_id") != pty_session_id: + continue + + candidate = data.get("current_task") or ( + data["completed_tasks"][-1] if data.get("completed_tasks") else None + ) + if candidate: + tdir = os.path.join(SESSIONS_DIR, sess_name, "tasks", candidate) + _pty_lookup_cache[pty_session_id] = (tdir, now) + return tdir + + return None + + +# ── Cleanup expired sessions ──────────────────────────────────────── + + +def cleanup_expired_tasks() -> int: + """Remove session directories older than TASK_TTL_S. 
Returns count removed.""" + import shutil + + now = time.time() + cutoff = now - TASK_TTL_S + removed = 0 + + if not os.path.isdir(SESSIONS_DIR): + return removed + + for sess_name in os.listdir(SESSIONS_DIR): + sess_dir = os.path.join(SESSIONS_DIR, sess_name) + if not os.path.isdir(sess_dir): + continue + + sess_file = os.path.join(sess_dir, "session.json") + try: + with open(sess_file) as f: + session = json.load(f) + except (OSError, json.JSONDecodeError): + continue + + # Only clean closed sessions past TTL + if session.get("status") != "closed": + continue + + closed_at = session.get("closed_at", session.get("created_at", 0)) + if closed_at < cutoff: + try: + shutil.rmtree(sess_dir) + removed += 1 + logger.info("Cleaned up expired session %s", sess_name) + except OSError: + logger.warning("Failed to clean up session %s", sess_name) + + return removed diff --git a/coda_mcp/url_builder.py b/coda_mcp/url_builder.py new file mode 100644 index 0000000..c08d2ed --- /dev/null +++ b/coda_mcp/url_builder.py @@ -0,0 +1,46 @@ +"""Builds the viewer_url returned by CoDA MCP tools. + +Resolution order: +1. ``CODA_APP_URL`` env var (explicit override for local dev / power users). +2. Module-level cache populated by ``AppUrlCaptureMiddleware`` from the + ``X-Forwarded-Host`` header (officially provided by Databricks Apps). +3. ``None`` — caller omits the field entirely. + +The cache is process-global (single uvicorn worker per app) and refreshed +on every inbound HTTP request. +""" +from __future__ import annotations + +import os +from typing import Optional + +_app_url_cache: Optional[str] = None + + +def capture_from_headers(host: Optional[str]) -> None: + """Called by the ASGI middleware on every inbound HTTP request. + + No-op when ``host`` is falsy (None or empty) to avoid wiping a good + cache value with a missing header on a probe/CORS preflight. 
def _is_plausible_host(value: str) -> bool:
    """Return True when *value* only contains characters legal in ``host[:port]``."""
    return value.isascii() and all(
        ch.isalnum() or ch in "-._:[]" for ch in value
    )


def capture_from_headers(host: Optional[str]) -> None:
    """Called by the ASGI middleware on every inbound HTTP request.

    No-op when ``host`` is falsy (None or empty) to avoid wiping a good
    cache value with a missing header on a probe/CORS preflight.

    Normalisation applied before caching:

    * only the first entry of a comma-separated list is used (a chain of
      proxies may append their own host);
    * surrounding whitespace and slashes are removed;
    * an accidental ``https://`` / ``http://`` prefix (any case) is stripped
      so build_viewer_url's unconditional ``https://`` prepend can't produce
      a double-scheme URL.

    The header is client-influenced input, so a value containing anything
    other than host/port characters is ignored and the previous cache value
    is kept.
    """
    global _app_url_cache
    if not host:
        return

    candidate = host.split(",", 1)[0].strip()
    lowered = candidate.lower()
    for scheme in ("https://", "http://"):
        if lowered.startswith(scheme):
            candidate = candidate[len(scheme):]
            break
    candidate = candidate.strip("/")

    if candidate and _is_plausible_host(candidate):
        _app_url_cache = candidate


def build_viewer_url(pty_session_id: str) -> Optional[str]:
    """Return the full viewer URL for a PTY session, or None if no base is known.

    The base is the ``CODA_APP_URL`` environment variable when set, otherwise
    ``https://`` plus the host captured by ``capture_from_headers``. The
    session id is percent-encoded so characters such as ``&``, ``#`` or spaces
    cannot break or extend the query string; ids made only of unreserved
    characters (letters, digits, ``-``, ``_``, ``.``, ``~``) are unchanged.
    """
    # Local import: keeps the block self-contained without touching the
    # module's import section.
    from urllib.parse import quote

    override = os.environ.get("CODA_APP_URL", "").strip()
    if override:
        base = override.rstrip("/")
    elif _app_url_cache:
        base = f"https://{_app_url_cache}"
    else:
        return None
    return f"{base}/?session={quote(str(pty_session_id), safe='')}"
+
+ + Databricks · CoDA + MCP Server +
+
+ +
+

Coding Agents on Databricks Apps

+

The CoDA MCP Server

+

A Model Context Protocol endpoint that lets any MCP client — Genie Code, + Claude Desktop, Cursor, or a local Claude Code / Codex — delegate coding work to AI agents + running inside CoDA on Databricks, then watch or drive them live.

+
+ Endpoint /mcp + 4 tools + 3 usage modes + FastMCP Streamable HTTP +
+
+ + +
+
01

What it is

+ one endpoint, many clients
+

CoDA (Coding agents on Databricks Apps) runs five AI coding agents — Claude Code, + Codex, Gemini CLI, OpenCode, and Hermes — inside a Databricks App, each in a real terminal (PTY) + with the Databricks CLI, 16 Databricks skills, and MCP servers (DeepWiki, Exa) pre-wired. The CoDA MCP + server exposes that capability over the Model Context Protocol at /mcp, so an upstream + agent can hand work off to CoDA — either as a fire-and-forget background task or as a live, + human-driven session — using the same Databricks identity that owns the app.

+
+ + +
+
02

The four tools

+ exposed natively & via JSON-RPC
+
+
+

coda_run

+
Autonomous · fire-and-forget
+

Submit a coding task and return immediately with a task_id. The task runs to + completion in a background terminal under a structured workflow protocol. Don’t poll — move on.

+
+
key args: prompt, context, previous_session_id, workflow_protocol
+
returns: task_id, session_id, status:"running"
+
viewer: replay-only URL (post-hoc review)
+
+
+
+

coda_interactive

+
Human handoff · live attach
+

Pull a Databricks Workspace folder onto a fresh session, launch the chosen agent (claude default) + in it with your prompt pre-seeded, and return a live viewer URL a human opens to drive it.

+
+
key args: prompt, workspace_path, agent
+
returns: status:"launched", viewer_url, project_dir
+
viewer: live, interactive (the only handle)
+
+
+
+

coda_inbox

+
Status · last 24h
+

List background tasks with live status counts. Call only when the user asks — never poll it + in a loop. Interactive sessions do not appear here.

+
+
key args: email?, status?
+
returns: tasks[], counts{running,completed,failed,info_needed,needs_approval}
+
+
+
+

coda_get_result

+
Retrieve · structured output
+

Fetch the full structured result of a finished task — what it did, what changed, and any + follow-up the caller must supply.

+
+
key args: task_id, session_id
+
returns: status, summary, files_changed, artifacts, errors, feedback?
+
+
+
+
+ + +
+
03

Three ways to reach CoDA

+ one platform, three postures
+
+
+ Mode 1 +

Direct web UI

+

Open the CoDA app in a browser and drive an agent terminal yourself. No MCP involved — the + hands-on baseline.

+
entry: the app URL
+
+
+ Mode 2 · coda_interactive +

Live handoff

+

An upstream agent stages files and launches a session; a human attaches via the viewer URL and + drives it interactively. Best when judgement or steering is needed.

+
entry: live viewer_url
+
+
+ Mode 3 · coda_run +

Autonomous task

+

An upstream agent submits work that runs to completion unattended. The viewer URL is a replay + for after-the-fact review; results come back through the inbox.

+
entry: replay viewer_url + inbox
+
+
+
+ + +
+
04

How the flows work

+ end to end
+ + +
+

Mode 2 coda_interactive — live human handoff

+

Files move up to the Workspace from the caller, then down into the CoDA + session — both as the same Databricks user, so access just works.

+
+
1
Local agent
+
Claude Code / Codex on the user’s machine holds the project files locally.
+
+
2
Upload to Workspace
+
databricks workspace import-dir pushes files to /Workspace/Users/…
+
+
3
Call the tool
+
coda_interactive(workspace_path, prompt) hits CoDA at /mcp.
+
+
4
Pull into session
+
CoDA’s terminal runs export-dir into ~/.coda/projects/<id>/ — confirmed by a completion marker.
+
+
5
Launch & seed
+
claude --enable-auto-mode "<prompt>" — no trust prompt, prompt pre-loaded.
+
+
6
Human drives
+
Returns a live viewer_url; the user opens it and steers the session to the end.
+
+
+ + +
+

Mode 3 coda_run — autonomous background task

+

Submit and forget. The agent works under a disciplined protocol and reports back + through the inbox.

+
+
1
Submit
+
coda_run(prompt) returns a task_id instantly — fire-and-forget.
+
+
2
Background run
+
A detached terminal runs the agent with a CAPABILITIES + WORKFLOW PROTOCOL envelope.
+
+
3
Plan → Execute → Synthesize
+
Three phases, each with a self-critique; max two iterations per phase.
+
+
4
Write result
+
Emits result.json with a status of completed, failed, info_needed, or needs_approval.
+
+
5
Retrieve
+
Caller checks coda_inbox and pulls full output via coda_get_result.
+
+
info_needed loop: when the agent is blocked on missing context, it stops and returns + a precise feedback string. The caller adds the missing detail and resubmits with + previous_session_id — a structured iteration loop instead of a guess.
+
+
+ + +
+
05

The workflow protocol

+ coda_run, on by default
+

Every coda_run prompt is wrapped with two sections so the background agent + acts deliberately: CAPABILITIES (the Databricks CLI, the 16 skills, and the DeepWiki / Exa / CoDA + MCP servers it can lean on) and WORKFLOW PROTOCOL — a three-phase pipeline with a critique + after each phase. Set workflow_protocol=false to opt out for non-Databricks tasks.

+
+
phase 1

Plan

+

Write a step-by-step plan to the status log.

+
critique → APPROVE / BLOCK / fix · max 2 iterations
+
phase 2

Execute

+

Work the plan step by step, emitting progress.

+
critique built vs planned · max 2 iterations
+
phase 3

Synthesize

+

Write result.json against the original task.

+
final critique vs the ask · max 2 iterations
+
+
+ + +
+
06

Identity & the file round-trip

+ why it works
+
+

The terminal is you; the server is not.

+

The MCP server process runs as the app’s service principal, which cannot read a user’s + private Workspace folders. The agent terminal, however, is authenticated as the app owner. That’s + why CoDA never reads your files server-side — it pulls them in the terminal with + databricks workspace export-dir, using the right identity. A local caller mirrors this from the + other side with import-dir to stage local files into the Workspace first.

+
+
+ + +
+
07

Result statuses

+ what coda_get_result returns
+ + + + + + + + + +
Status | Meaning | What the caller does
running | Task is still working in the background. | Wait; check the inbox later.
completed | Finished successfully. | Read summary + files_changed.
failed | Unrecoverable error (a command crashed, an API 500’d). | Inspect errors.
info_needed | Blocked on missing context the caller must supply. | Read feedback, resubmit with more context.
needs_approval | A destructive action is staged, awaiting explicit go-ahead. | Approve or decline; resubmit.
+
+ + +
+
08

Architecture

+ how it’s served
+
+
Transport
FastMCP streamable_http_app() — native MCP Streamable HTTP, mounted at /mcp.
+
Runtime
uvicorn (ASGI); Flask + Socket.IO mounted via WSGI middleware for the terminal UI.
+
Sessions
Each agent runs in a real PTY; fds are process-local, so a single worker owns them.
+
Identity
App-owner credentials (PAT or service principal); the terminal CLI acts as the user.
+
Dual surface
The same four tools are exposed both natively and over a JSON-RPC compatibility path.
+
Viewer
A browser attaches to a session over WebSocket, with automatic HTTP-polling fallback.
+
+
+ +
+
+ Databricks · CoDA MCP Server + Coding Agents on Databricks Apps +
+
+ + + diff --git a/docs/mcp-client-setup.md b/docs/mcp-client-setup.md new file mode 100644 index 0000000..f8e1bb6 --- /dev/null +++ b/docs/mcp-client-setup.md @@ -0,0 +1,73 @@ +# CoDA MCP Client Setup + +CoDA exposes an MCP endpoint at `/mcp` on the Databricks App. Databricks Apps use OAuth (not PATs) for authentication, so MCP clients need a stdio bridge that injects fresh OAuth tokens. + +## How it works + +`tools/coda-bridge.py` is a zero-dependency Python script that works as follows: + +1. Claude Code launches it as a stdio MCP server +2. It reads JSON-RPC messages from stdin +3. It fetches a fresh OAuth token via `databricks auth token` +4. It forwards requests to the App's HTTP endpoint with the token +5. It returns responses on stdout + +Tokens are cached for 30 minutes (they expire after 60 minutes). + +## Setup + +### 1. Copy the bridge script + +```bash +mkdir -p ~/.claude/mcp-bridges +cp tools/coda-bridge.py ~/.claude/mcp-bridges/ +``` + +### 2. Add to Claude Code settings + +Add this to `mcpServers` in `~/.claude/settings.json`: + +```json +"coda-mcp": { + "type": "stdio", + "command": "python3", + "args": ["/path/to/.claude/mcp-bridges/coda-bridge.py"], + "env": { + "CODA_MCP_URL": "https://<your-app>.databricksapps.com/mcp", + "DATABRICKS_PROFILE": "<your-profile>" + } +} +``` + +### 3. Restart Claude Code + +The MCP server will start automatically on next session. + +## Configuration + +| Environment Variable | Description | Example | +|---------------------|-------------|---------| +| `CODA_MCP_URL` | Full URL to the app's `/mcp` endpoint | `https://mcp-test-coda-747...com/mcp` | +| `DATABRICKS_PROFILE` | Databricks CLI profile name | `9cefok` | + +## Prerequisites + +- `databricks` CLI installed and authenticated (`databricks auth login -p <your-profile>`) +- Python 3.8+ +- No pip dependencies required (stdlib only) + +## Troubleshooting + +Bridge logs go to stderr. 
Check with: + +```bash +echo '{"jsonrpc":"2.0","method":"initialize","params":{"protocolVersion":"2025-03-26","capabilities":{},"clientInfo":{"name":"test","version":"1.0"}},"id":1}' | \ +CODA_MCP_URL="https://your-app.databricksapps.com/mcp" DATABRICKS_PROFILE="your-profile" \ +python3 tools/coda-bridge.py +``` + +If you see `Auth failed (302)`, your Databricks CLI session may have expired. Run: + +```bash +databricks auth login -p <your-profile> +``` diff --git a/docs/mcp-v2-background-execution.md b/docs/mcp-v2-background-execution.md new file mode 100644 index 0000000..3d7557c --- /dev/null +++ b/docs/mcp-v2-background-execution.md @@ -0,0 +1,171 @@ +# CoDA MCP v2 — Background Execution + Inbox Pattern + +## Overview + +CoDA exposes 3 MCP tools so Databricks GenieCode (or any MCP client) can delegate +coding tasks to AI agents running in the background. GenieCode's chat context stays +free while tasks execute — no polling required. + +## Tools + +| Tool | Purpose | +|------|---------| +| `coda_run` | Fire-and-forget task submission | +| `coda_inbox` | Dashboard of all background tasks | +| `coda_get_result` | Pull full structured result | + +## Flow Diagram + +``` +┌─────────────┐ ┌──────────────┐ ┌─────────────┐ +│ GenieCode │ │ CoDA MCP │ │ Hermes │ +│ (caller) │ │ (3 tools) │ │ (executor) │ +└──────┬──────┘ └──────┬───────┘ └──────┬──────┘ + │ │ │ + │ 1. coda_run(prompt) │ │ + │──────────────────────>│ │ + │ │ auto-create session │ + │ │ + PTY + task dir │ + │ │ write prompt.txt │ + │ │ write meta.json │ + │ │ │ + │ {task_id, sess_id, │ hermes -z prompt.txt │ + │ status: "running"} │───────────────────────>│ + │<──────────────────────│ │ + │ │ _watch_task thread │ + │ ✓ context is FREE │ monitors result.json │ + │ user keeps chatting │ │ + │ │ │ works... + │ ... │ │ delegates + │ │ │ to claude/ + │ │ │ codex/gemini + │ │ │ + │ 2. 
coda_inbox() │ │ writes + │──────────────────────>│ │ status.jsonl + │ │ scan all sessions │ + │ {tasks: [...], │ read meta + status │ + │ counts: {run:1}} │ │ + │<──────────────────────│ │ + │ │ │ + │ ... │ │ writes + │ │ │ result.json + │ │ │ + │ │ _watch_task detects │ + │ │ result.json exists │ + │ │ → complete_task() │ + │ │ → auto-close session │ + │ │ → free PTY │ + │ │ │ + │ 3. coda_inbox() │ │ + │──────────────────────>│ │ + │ {tasks: [{status: │ │ + │ "completed", │ │ + │ summary: "..."}]} │ │ + │<──────────────────────│ │ + │ │ │ + │ 4. coda_get_result() │ │ + │──────────────────────>│ │ + │ {summary, files, │ read result.json │ + │ artifacts, errors} │ │ + │<──────────────────────│ │ + │ │ │ + ├── CHAINING ───────────┤ │ + │ │ │ + │ 5. coda_run(prompt, │ │ + │ previous_session_id) │ new session + PTY │ + │──────────────────────>│ inject PRIOR SESSION │ + │ │ block in prompt │ + │ {new task_id, │───────────────────────>│ + │ new sess_id} │ │ reads prior + │<──────────────────────│ │ result.json + │ │ │ for context +``` + +## Key Design Decisions + +### Sessions are ephemeral, tasks are persistent +- Session = PTY + Hermes instance. Auto-closes when task completes. +- Task state (prompt, status, result) persists on disk for 24 hours. +- Continuity via `previous_session_id`, not long-lived sessions. + +### No polling from GenieCode +- `coda_inbox` replaces `coda_get_status` — shows ALL tasks at once. +- GenieCode checks when the user asks, not on a timer. +- CoDA's internal `_watch_task` thread polls the filesystem (invisible to caller). + +### Task chaining +- `previous_session_id` points to a prior session's disk state. +- Hermes reads `~/.coda/sessions/{prev_id}/tasks/*/result.json` for context. +- Chain depth: one level. Hermes can walk deeper if needed. + +### Concurrency +- `CODA_MAX_CONCURRENT` env var (default: 5). +- Each task gets its own session — no "session busy" errors. +- Exceeding the limit returns a clear error. 
+ +## Data Model + +``` +~/.coda/sessions/{session-id}/ + session.json # metadata + auto-close timestamp + tasks/{task-id}/ + prompt.txt # wrapped prompt sent to Hermes + meta.json # {email, created_at, previous_session_id, permissions} + status.jsonl # append-only progress log + result.json # final structured output +``` + +## Tool Reference + +### `coda_run` + +```python +coda_run( + prompt: str, # what to do + email: str, # who's asking + context: str = "{}", # UC metadata (tables, schemas) + previous_session_id: str = "", # chain from prior work + permissions: str = "smart", # "smart" or "yolo" + timeout_s: int = 3600, # max 1 hour default +) +# Returns: {"task_id", "session_id", "status": "running"} +``` + +### `coda_inbox` + +```python +coda_inbox( + email: str = "", # filter by user + status: str = "", # "running", "completed", "failed", or "" for all +) +# Returns: {"tasks": [...], "counts": {"running": N, "completed": N, "failed": N}} +``` + +Each task entry: `task_id`, `session_id`, `status`, `elapsed_s`, `prompt_summary`, +`summary` (completed), `progress` (running), `previous_session_id`, `created_at`. + +### `coda_get_result` + +```python +coda_get_result(task_id: str, session_id: str) +# Returns: {"task_id", "session_id", "status", "summary", +# "files_changed", "artifacts", "errors"} +``` + +## Migration from v1 + +| v1 Tool | v2 Equivalent | +|---------|--------------| +| `coda_create_session` | Removed — auto-created by `coda_run` | +| `coda_run_task` | `coda_run` (simplified, auto-session) | +| `coda_get_status` | `coda_inbox` (all tasks at once) | +| `coda_get_result` | `coda_get_result` (unchanged) | +| `coda_close_session` | Removed — auto-closed on completion | + +## Limitations + +- **Ephemeral filesystem**: On Databricks Apps, `~/.coda/` is local disk. App + redeployment wipes task state. Real artifacts (git commits, jobs, workspace files) + are unaffected. 
+- **No push notifications**: GenieCode must call `coda_inbox` to discover completions. + SSE/streaming is a future consideration if polling proves insufficient. diff --git a/docs/plans/2026-05-01-coda-mcp-server.md b/docs/plans/2026-05-01-coda-mcp-server.md new file mode 100644 index 0000000..1e59ba3 --- /dev/null +++ b/docs/plans/2026-05-01-coda-mcp-server.md @@ -0,0 +1,1179 @@ +# CoDA MCP Server Implementation Plan + +> **⚠️ SUPERSEDED — historical reference only.** This was the v1 implementation plan (5 tools, gunicorn + WSGI bridge). The shipped implementation diverged during iteration: the production design is documented in [`docs/mcp-v2-background-execution.md`](../mcp-v2-background-execution.md) (3 tools — `coda_run`, `coda_inbox`, `coda_get_result` — on uvicorn + native ASGI). Kept in the tree so reviewers can see the design evolution; do not follow this plan as-is. + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Add an MCP server endpoint (`/mcp`) to CoDA so Databricks Genie Code can delegate coding tasks to Hermes Agent via the MCP protocol. + +**Architecture:** Python MCP SDK mounted as a stateless HTTP app at `/mcp` alongside the existing Flask app. A new `task_manager.py` module handles session/task state on disk (`~/.coda/sessions/`). The MCP tools call into the existing PTY infrastructure for session creation and input piping. Hermes is always the agent invoked. + +**Tech Stack:** Python MCP SDK (`mcp` package, already installed), Flask, existing PTY session infrastructure, Hermes Agent CLI (`hermes -z`) + +**Design doc:** `.humantokens/coda-mcp-design.md` (full design with all decisions) + +--- + +### Task 1: Create Task Manager Module + +The task manager handles all disk-based state for MCP sessions and tasks. It's a pure Python module with no Flask dependency — just file I/O. 
+ +**Files:** +- Create: `task_manager.py` +- Create: `tests/test_task_manager.py` + +**Step 1: Write the failing tests** + +```python +# tests/test_task_manager.py +import os +import json +import tempfile +import pytest +from unittest.mock import patch + +# All tests use a temp dir instead of ~/.coda +@pytest.fixture +def task_mgr(tmp_path): + with patch("task_manager.SESSIONS_DIR", str(tmp_path / "sessions")): + import task_manager + # Force reimport to pick up patched path + task_manager.SESSIONS_DIR = str(tmp_path / "sessions") + yield task_manager + + +def test_create_session(task_mgr): + result = task_mgr.create_session(email="alice@example.com", user_id="123") + assert "session_id" in result + assert result["status"] == "ready" + + # Verify session.json on disk + session_dir = os.path.join(task_mgr.SESSIONS_DIR, result["session_id"]) + assert os.path.isdir(session_dir) + with open(os.path.join(session_dir, "session.json")) as f: + data = json.load(f) + assert data["created_by"] == "alice@example.com" + assert data["status"] == "idle" + assert data["current_task"] is None + + +def test_create_task(task_mgr): + session = task_mgr.create_session(email="alice@example.com", user_id="123") + sid = session["session_id"] + + result = task_mgr.create_task( + session_id=sid, + prompt="create a pipeline", + email="alice@example.com", + context={"tables": ["sales.transactions"]}, + ) + assert "task_id" in result + assert result["status"] == "running" + + # Verify task dir and files + task_dir = os.path.join(task_mgr.SESSIONS_DIR, sid, "tasks", result["task_id"]) + assert os.path.isfile(os.path.join(task_dir, "prompt.txt")) + + # Session should be busy + with open(os.path.join(task_mgr.SESSIONS_DIR, sid, "session.json")) as f: + data = json.load(f) + assert data["status"] == "busy" + assert data["current_task"] == result["task_id"] + + +def test_create_task_rejects_when_busy(task_mgr): + session = task_mgr.create_session(email="alice@example.com", user_id="123") + sid = 
session["session_id"] + + task_mgr.create_task(session_id=sid, prompt="task 1", email="alice@example.com") + with pytest.raises(task_mgr.SessionBusyError): + task_mgr.create_task(session_id=sid, prompt="task 2", email="alice@example.com") + + +def test_get_status_running(task_mgr): + session = task_mgr.create_session(email="alice@example.com", user_id="123") + sid = session["session_id"] + task = task_mgr.create_task(session_id=sid, prompt="do work", email="alice@example.com") + + status = task_mgr.get_task_status(task["task_id"], sid) + assert status["status"] == "running" + assert "elapsed_s" in status + assert status.get("progress") is None # no status.jsonl yet + + +def test_get_status_with_progress(task_mgr): + session = task_mgr.create_session(email="alice@example.com", user_id="123") + sid = session["session_id"] + task = task_mgr.create_task(session_id=sid, prompt="do work", email="alice@example.com") + tid = task["task_id"] + + # Simulate agent writing status.jsonl + status_file = os.path.join(task_mgr.SESSIONS_DIR, sid, "tasks", tid, "status.jsonl") + with open(status_file, "a") as f: + f.write(json.dumps({"step": "planning", "message": "Analyzing requirements"}) + "\n") + f.write(json.dumps({"step": "coding", "message": "Writing pipeline"}) + "\n") + + status = task_mgr.get_task_status(tid, sid) + assert status["status"] == "running" + assert status["progress"]["step"] == "coding" + + +def test_get_result_completed(task_mgr): + session = task_mgr.create_session(email="alice@example.com", user_id="123") + sid = session["session_id"] + task = task_mgr.create_task(session_id=sid, prompt="do work", email="alice@example.com") + tid = task["task_id"] + + # Simulate agent writing result.json + result_file = os.path.join(task_mgr.SESSIONS_DIR, sid, "tasks", tid, "result.json") + with open(result_file, "w") as f: + json.dump({ + "status": "completed", + "summary": "Created pipeline", + "files_changed": ["pipeline.py"], + "artifacts": {"job_id": "123"}, + 
"errors": [] + }, f) + + result = task_mgr.get_task_result(tid, sid) + assert result["status"] == "completed" + assert result["summary"] == "Created pipeline" + + +def test_get_result_not_done(task_mgr): + session = task_mgr.create_session(email="alice@example.com", user_id="123") + sid = session["session_id"] + task = task_mgr.create_task(session_id=sid, prompt="do work", email="alice@example.com") + + result = task_mgr.get_task_result(task["task_id"], sid) + assert result["status"] == "running" + assert result.get("summary") is None + + +def test_complete_task(task_mgr): + session = task_mgr.create_session(email="alice@example.com", user_id="123") + sid = session["session_id"] + task = task_mgr.create_task(session_id=sid, prompt="do work", email="alice@example.com") + tid = task["task_id"] + + # Simulate result.json written by agent + result_file = os.path.join(task_mgr.SESSIONS_DIR, sid, "tasks", tid, "result.json") + with open(result_file, "w") as f: + json.dump({"status": "completed", "summary": "Done", "files_changed": [], "artifacts": {}, "errors": []}, f) + + task_mgr.complete_task(sid, tid) + + # Session should be idle again + with open(os.path.join(task_mgr.SESSIONS_DIR, sid, "session.json")) as f: + data = json.load(f) + assert data["status"] == "idle" + assert data["current_task"] is None + assert tid in data["completed_tasks"] + + +def test_close_session(task_mgr): + session = task_mgr.create_session(email="alice@example.com", user_id="123") + sid = session["session_id"] + + result = task_mgr.close_session(sid) + assert result["status"] == "closed" + + with open(os.path.join(task_mgr.SESSIONS_DIR, sid, "session.json")) as f: + data = json.load(f) + assert data["status"] == "closed" + + +def test_wrap_prompt(task_mgr): + wrapped = task_mgr.wrap_prompt( + task_id="task-007", + session_id="sess-abc", + email="alice@example.com", + prompt="create a pipeline", + context={"tables": ["sales.transactions"]}, + results_dir="/tmp/test" + ) + assert 
"---CODA-TASK---" in wrapped + assert "task-007" in wrapped + assert "create a pipeline" in wrapped + assert "sales.transactions" in wrapped + assert "result.json" in wrapped + assert "---END-CODA-TASK---" in wrapped +``` + +**Step 2: Run tests to verify they fail** + +Run: `cd /Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp && uv run pytest tests/test_task_manager.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'task_manager'` + +**Step 3: Write the task_manager module** + +```python +# task_manager.py +"""Disk-based state manager for MCP sessions and tasks. + +Manages the lifecycle of sessions (PTY-backed Hermes instances) and tasks +(units of work within a session). All state is persisted to ~/.coda/sessions/ +so the MCP transport can remain stateless. +""" +import json +import os +import time +import uuid + +HOME = os.environ.get("HOME", os.path.expanduser("~")) +SESSIONS_DIR = os.path.join(HOME, ".coda", "sessions") + + +class SessionBusyError(Exception): + """Raised when a task is submitted to a session that's already running one.""" + pass + + +class SessionNotFoundError(Exception): + """Raised when a session_id doesn't exist.""" + pass + + +def _session_dir(session_id: str) -> str: + return os.path.join(SESSIONS_DIR, session_id) + + +def _task_dir(session_id: str, task_id: str) -> str: + return os.path.join(SESSIONS_DIR, session_id, "tasks", task_id) + + +def _read_session(session_id: str) -> dict: + path = os.path.join(_session_dir(session_id), "session.json") + if not os.path.isfile(path): + raise SessionNotFoundError(f"Session {session_id} not found") + with open(path) as f: + return json.load(f) + + +def _write_session(session_id: str, data: dict): + path = os.path.join(_session_dir(session_id), "session.json") + with open(path, "w") as f: + json.dump(data, f, indent=2) + + +def create_session(email: str, user_id: str = "", label: str = "") -> dict: + """Create a new session directory and session.json. 
Returns {session_id, status}.""" + session_id = f"sess-{uuid.uuid4().hex[:12]}" + session_dir = _session_dir(session_id) + os.makedirs(os.path.join(session_dir, "tasks"), exist_ok=True) + + session_data = { + "created_by": email, + "user_id": user_id, + "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), + "status": "idle", + "current_task": None, + "completed_tasks": [], + "label": label, + } + _write_session(session_id, session_data) + + return {"session_id": session_id, "status": "ready"} + + +def create_task( + session_id: str, + prompt: str, + email: str, + context: dict = None, + context_hint: str = None, + timeout_s: int = 3600, + permissions: str = "smart", +) -> dict: + """Create a new task within a session. Returns {task_id, status}. + + Raises SessionBusyError if the session already has a running task. + """ + session_data = _read_session(session_id) + + if session_data["status"] == "busy": + raise SessionBusyError(f"Session {session_id} is busy with task {session_data['current_task']}") + + if session_data["status"] == "closed": + raise SessionNotFoundError(f"Session {session_id} is closed") + + task_id = f"task-{uuid.uuid4().hex[:8]}" + task_dir = _task_dir(session_id, task_id) + os.makedirs(task_dir, exist_ok=True) + + # Write prompt file + results_dir = task_dir + wrapped = wrap_prompt( + task_id=task_id, + session_id=session_id, + email=email, + prompt=prompt, + context=context, + results_dir=results_dir, + context_hint=context_hint, + ) + with open(os.path.join(task_dir, "prompt.txt"), "w") as f: + f.write(wrapped) + + # Write task metadata + with open(os.path.join(task_dir, "meta.json"), "w") as f: + json.dump({ + "task_id": task_id, + "session_id": session_id, + "email": email, + "started_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), + "timeout_s": timeout_s, + "permissions": permissions, + "context_hint": context_hint, + }, f, indent=2) + + # Update session state + session_data["status"] = "busy" + 
session_data["current_task"] = task_id + _write_session(session_id, session_data) + + return {"task_id": task_id, "status": "running"} + + +def get_task_status(task_id: str, session_id: str) -> dict: + """Get current status of a task. Reads status.jsonl for progress.""" + task_dir = _task_dir(session_id, task_id) + + # Check if result.json exists (task completed) + result_path = os.path.join(task_dir, "result.json") + if os.path.isfile(result_path): + with open(result_path) as f: + result = json.load(f) + return { + "task_id": task_id, + "status": result.get("status", "completed"), + "elapsed_s": _elapsed(task_dir), + } + + # Check for progress in status.jsonl + status_path = os.path.join(task_dir, "status.jsonl") + progress = None + if os.path.isfile(status_path): + with open(status_path) as f: + lines = f.readlines() + if lines: + try: + progress = json.loads(lines[-1].strip()) + except json.JSONDecodeError: + pass + + return { + "task_id": task_id, + "status": "running", + "elapsed_s": _elapsed(task_dir), + "progress": progress, + } + + +def get_task_result(task_id: str, session_id: str) -> dict: + """Get the result of a completed task.""" + task_dir = _task_dir(session_id, task_id) + result_path = os.path.join(task_dir, "result.json") + + if not os.path.isfile(result_path): + return { + "task_id": task_id, + "status": "running", + "elapsed_s": _elapsed(task_dir), + } + + with open(result_path) as f: + result = json.load(f) + + result["task_id"] = task_id + result["elapsed_s"] = _elapsed(task_dir) + return result + + +def complete_task(session_id: str, task_id: str): + """Mark a task as completed and update session state back to idle.""" + session_data = _read_session(session_id) + session_data["status"] = "idle" + session_data["current_task"] = None + if task_id not in session_data.get("completed_tasks", []): + session_data.setdefault("completed_tasks", []).append(task_id) + _write_session(session_id, session_data) + + +def close_session(session_id: str) -> 
dict: + """Mark a session as closed.""" + session_data = _read_session(session_id) + session_data["status"] = "closed" + _write_session(session_id, session_data) + return {"session_id": session_id, "status": "closed"} + + +def wrap_prompt( + task_id: str, + session_id: str, + email: str, + prompt: str, + context: dict = None, + results_dir: str = "", + context_hint: str = None, +) -> str: + """Wrap a user prompt with the CODA-TASK convention.""" + context_block = "" + if context: + context_block = json.dumps(context, indent=2) + + hint_line = "" + if context_hint: + hint_line = f"context_hint: {context_hint}\n" + + return f"""---CODA-TASK--- +task_id: {task_id} +session_id: {session_id} +user: {email} +{hint_line}results_dir: {results_dir} + +CONTEXT: +{context_block} + +TASK: +{prompt} + +INSTRUCTIONS: +1. Append progress to {results_dir}/status.jsonl + Format: {{"step": "label", "message": "description"}} +2. When done, write {results_dir}/result.json with: + {{"status", "summary", "files_changed", "artifacts", "errors"}} +3. If you delegate to a sub-agent (Claude, Codex, Gemini), update + status.jsonl with delegation steps so the caller can track progress. 
+---END-CODA-TASK---""" + + +def _elapsed(task_dir: str) -> float: + """Calculate elapsed seconds since task started.""" + meta_path = os.path.join(task_dir, "meta.json") + if os.path.isfile(meta_path): + with open(meta_path) as f: + meta = json.load(f) + started = meta.get("started_at", "") + if started: + try: + started_ts = time.mktime(time.strptime(started, "%Y-%m-%dT%H:%M:%SZ")) + return round(time.time() - started_ts, 1) + except ValueError: + pass + # Fallback: use directory creation time + return round(time.time() - os.path.getctime(task_dir), 1) +``` + +**Step 4: Run tests to verify they pass** + +Run: `cd /Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp && uv run pytest tests/test_task_manager.py -v` +Expected: All 10 tests PASS + +**Step 5: Commit** + +```bash +cd /Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp +git add task_manager.py tests/test_task_manager.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "feat: add task manager for MCP session/task state" +``` + +--- + +### Task 2: Create MCP Server Module + +The MCP server registers 5 tools and delegates to `task_manager.py` for state. It also integrates with the existing PTY session infrastructure in `app.py` for creating terminal sessions and piping prompts. 
+ +**Files:** +- Create: `mcp_server.py` +- Create: `tests/test_mcp_server.py` + +**Step 1: Write the failing tests** + +```python +# tests/test_mcp_server.py +import json +import pytest +from unittest.mock import patch, MagicMock + + +def test_mcp_tool_list(): + """Verify all 5 tools are registered.""" + from mcp_server import mcp + # The server should have 5 tools registered + tools = mcp._tool_manager._tools # internal access for testing + tool_names = [t.name for t in tools.values()] + assert "create_session" in tool_names + assert "run_task" in tool_names + assert "get_status" in tool_names + assert "get_result" in tool_names + assert "close_session" in tool_names + assert len(tool_names) == 5 +``` + +**Step 2: Run test to verify it fails** + +Run: `cd /Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp && uv run pytest tests/test_mcp_server.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'mcp_server'` + +**Step 3: Write the MCP server module** + +```python +# mcp_server.py +"""MCP server for CoDA — exposes coding agent capabilities to Genie Code. + +Registers 5 tools: create_session, run_task, get_status, get_result, close_session. +Uses the Python MCP SDK with stateless HTTP transport as required by Genie Code. 
+""" +import json +import logging +import os +import threading + +from mcp.server.fastmcp import FastMCP + +import task_manager + +logger = logging.getLogger(__name__) + +mcp = FastMCP( + "coda", + stateless_http=True, +) + +# Reference to app.py's session infrastructure — set by mount_mcp() +_app_create_session = None +_app_send_input = None +_app_close_session = None + + +def set_app_hooks(create_session_fn, send_input_fn, close_session_fn): + """Called by app.py to wire MCP tools to the PTY session infrastructure.""" + global _app_create_session, _app_send_input, _app_close_session + _app_create_session = create_session_fn + _app_send_input = send_input_fn + _app_close_session = close_session_fn + + +@mcp.tool() +def create_session( + email: str, + user_id: str = "", + label: str = "", +) -> str: + """Create a new coding agent session backed by Hermes Agent. + + Returns a session_id that can be used with run_task to send work. + Sessions are long-lived — reuse them for follow-up tasks to maintain context. + """ + # Create task manager state on disk + result = task_manager.create_session(email=email, user_id=user_id, label=label) + session_id = result["session_id"] + + # Create the actual PTY session via app.py infrastructure + if _app_create_session: + pty_session_id = _app_create_session(label="hermes-mcp") + # Map our session_id to the PTY session_id + task_manager._update_session_field(session_id, "pty_session_id", pty_session_id) + + return json.dumps(result) + + +@mcp.tool() +def run_task( + session_id: str, + prompt: str, + email: str, + user_id: str = "", + context: str = "{}", + context_hint: str = "", + timeout_s: int = 3600, + permissions: str = "smart", +) -> str: + """Send a coding task to Hermes Agent in an existing session. + + The task runs asynchronously — use get_status to poll progress + and get_result to retrieve the outcome. 
+ + Args: + session_id: From create_session + prompt: Natural language task description + email: User email for audit trail + context: JSON string with Unity Catalog context (tables, schemas, etc.) + context_hint: "new_topic" to signal unrelated work in same session + timeout_s: Max seconds before timeout (default 3600) + permissions: "smart" (default, safe) or "yolo" (full autonomy) + """ + try: + context_dict = json.loads(context) if context else {} + except json.JSONDecodeError: + context_dict = {} + + try: + result = task_manager.create_task( + session_id=session_id, + prompt=prompt, + email=email, + context=context_dict, + context_hint=context_hint or None, + timeout_s=timeout_s, + permissions=permissions, + ) + except task_manager.SessionBusyError as e: + return json.dumps({"error": str(e)}) + except task_manager.SessionNotFoundError as e: + return json.dumps({"error": str(e)}) + + task_id = result["task_id"] + + # Read the wrapped prompt from disk + task_dir = task_manager._task_dir(session_id, task_id) + with open(os.path.join(task_dir, "prompt.txt")) as f: + wrapped_prompt = f.read() + + # Build hermes command + yolo_flag = " --yolo" if permissions == "yolo" else "" + hermes_cmd = f'hermes -z "{task_dir}/prompt.txt"{yolo_flag}\n' + + # Pipe to PTY session in background + if _app_send_input: + session_data = task_manager._read_session(session_id) + pty_session_id = session_data.get("pty_session_id") + if pty_session_id: + # Send the hermes command to the terminal + _app_send_input(pty_session_id, hermes_cmd) + + # Start background watcher for task completion + thread = threading.Thread( + target=_watch_task, + args=(session_id, task_id, timeout_s), + daemon=True, + ) + thread.start() + + return json.dumps(result) + + +@mcp.tool() +def get_status(task_id: str, session_id: str) -> str: + """Check the current status and progress of a running task. 
+ + Returns status (running/completed/failed/timeout), elapsed time, + and the latest progress update from the agent if available. + """ + try: + result = task_manager.get_task_status(task_id, session_id) + return json.dumps(result) + except Exception as e: + return json.dumps({"error": str(e)}) + + +@mcp.tool() +def get_result(task_id: str, session_id: str) -> str: + """Retrieve the structured result of a completed task. + + Returns summary, files changed, artifacts (job IDs, commit hashes, etc.), + and any errors. If the task isn't done yet, returns running status. + """ + try: + result = task_manager.get_task_result(task_id, session_id) + return json.dumps(result) + except Exception as e: + return json.dumps({"error": str(e)}) + + +@mcp.tool() +def close_session(session_id: str) -> str: + """Close a session and clean up resources. + + The PTY process is terminated and session state is marked as closed. + """ + try: + # Close task manager state + result = task_manager.close_session(session_id) + + # Close the PTY session + if _app_close_session: + session_data = task_manager._read_session(session_id) + pty_session_id = session_data.get("pty_session_id") + if pty_session_id: + _app_close_session(pty_session_id) + + return json.dumps(result) + except Exception as e: + return json.dumps({"error": str(e)}) + + +def _watch_task(session_id: str, task_id: str, timeout_s: int): + """Background thread that watches for task completion or timeout.""" + import time + + task_dir = task_manager._task_dir(session_id, task_id) + result_path = os.path.join(task_dir, "result.json") + status_path = os.path.join(task_dir, "status.jsonl") + start = time.time() + last_activity = start + stale_threshold = 300 # 5 minutes with no status update = stale + + while True: + elapsed = time.time() - start + + # Check for result.json (task completed) + if os.path.isfile(result_path): + task_manager.complete_task(session_id, task_id) + logger.info(f"Task {task_id} completed in {elapsed:.0f}s") + 
return + + # Check for stale (no activity in 5 min) + if os.path.isfile(status_path): + mtime = os.path.getmtime(status_path) + if mtime > last_activity: + last_activity = mtime + + # Timeout: wall clock exceeded AND stale + if elapsed > timeout_s and (time.time() - last_activity) > stale_threshold: + logger.warning(f"Task {task_id} timed out after {elapsed:.0f}s") + # Write a timeout result + with open(result_path, "w") as f: + json.dump({ + "status": "timeout", + "summary": f"Task timed out after {elapsed:.0f} seconds", + "files_changed": [], + "artifacts": {}, + "errors": ["timeout"], + }, f) + task_manager.complete_task(session_id, task_id) + return + + time.sleep(5) # Poll every 5 seconds +``` + +**Step 4: Run tests to verify they pass** + +Run: `cd /Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp && uv run pytest tests/test_mcp_server.py -v` +Expected: PASS + +**Step 5: Commit** + +```bash +cd /Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp +git add mcp_server.py tests/test_mcp_server.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "feat: add MCP server with 5 tools for Genie Code integration" +``` + +--- + +### Task 3: Mount MCP Server in Flask App + +Wire the MCP server into the existing Flask app. Add CORS support, skip auth for `/mcp` (Databricks proxy handles it), and expose helper functions for PTY integration. + +**Files:** +- Modify: `app.py` (add mount + helper functions) +- Modify: `pyproject.toml` (add flask-cors dependency) + +**Step 1: Add flask-cors to dependencies** + +In `pyproject.toml`, add `"flask-cors>=4.0"` to dependencies list. 
+ +**Step 2: Add PTY helper functions to app.py** + +Add these functions after the existing `create_session` route (around line 1081), before the `send_input` route: + +```python +# ── MCP Integration Helpers ────────────────────────────────────────────── + +def mcp_create_pty_session(label: str = "hermes-mcp") -> str: + """Create a PTY session for MCP use. Returns the PTY session_id.""" + master_fd, slave_fd = pty.openpty() + shell_env = os.environ.copy() + shell_env["TERM"] = "xterm-256color" + shell_env.pop("CLAUDECODE", None) + shell_env.pop("CLAUDE_CODE_SESSION", None) + shell_env.pop("DATABRICKS_TOKEN", None) + shell_env.pop("DATABRICKS_HOST", None) + shell_env.pop("GEMINI_API_KEY", None) + if not shell_env.get("HOME") or shell_env["HOME"] == "/": + shell_env["HOME"] = "/app/python/source_code" + local_bin = f"{shell_env['HOME']}/.local/bin" + shell_env["PATH"] = f"{local_bin}:{shell_env.get('PATH', '')}" + projects_dir = os.path.join(shell_env["HOME"], "projects") + os.makedirs(projects_dir, exist_ok=True) + + pid = subprocess.Popen( + ["/bin/bash"], + stdin=slave_fd, stdout=slave_fd, stderr=slave_fd, + preexec_fn=os.setsid, + env=shell_env, + cwd=projects_dir + ).pid + os.close(slave_fd) + + session_id = str(uuid.uuid4()) + with sessions_lock: + if len(sessions) >= MAX_CONCURRENT_SESSIONS: + os.close(master_fd) + try: + os.kill(pid, signal.SIGKILL) + except OSError: + pass + raise RuntimeError(f"Maximum {MAX_CONCURRENT_SESSIONS} concurrent sessions reached") + sessions[session_id] = { + "master_fd": master_fd, + "pid": pid, + "output_buffer": deque(maxlen=1000), + "lock": threading.Lock(), + "last_poll_time": time.time(), + "created_at": time.time(), + "label": label, + } + + thread = threading.Thread(target=read_pty_output, args=(session_id, master_fd), daemon=True) + thread.start() + log_telemetry("agent", label) + return session_id + + +def mcp_send_input(session_id: str, data: str): + """Send input to a PTY session. 
Used by MCP to pipe hermes commands.""" + sess = _get_session(session_id) + if not sess: + return + with sess["lock"]: + try: + os.write(sess["master_fd"], data.encode()) + except OSError: + pass + + +def mcp_close_pty_session(session_id: str): + """Close a PTY session. Used by MCP close_session tool.""" + sess = _get_session(session_id) + if not sess: + return + terminate_session(session_id, sess["pid"], sess["master_fd"]) +``` + +**Step 3: Mount the MCP app and add CORS** + +At the end of `app.py`, before the `if __name__ == "__main__"` block (around line 1298), add: + +```python +# ── MCP Server Mount ───────────────────────────────────────────────────── +from flask_cors import CORS +from mcp_server import mcp, set_app_hooks + +# CORS for Genie Code cross-origin requests +databricks_host = os.environ.get("DATABRICKS_HOST", "") +if databricks_host: + CORS(app, origins=[ensure_https(databricks_host)], supports_credentials=True) + +# Wire MCP tools to PTY infrastructure +set_app_hooks( + create_session_fn=mcp_create_pty_session, + send_input_fn=mcp_send_input, + close_session_fn=mcp_close_pty_session, +) + +# Mount MCP as ASGI app at /mcp +from werkzeug.middleware.dispatcher import DispatcherMiddleware +from asyncio import run as arun + +mcp_asgi_app = mcp.streamable_http_app() + +# Bridge ASGI MCP app into Flask's WSGI world +# We use a thin WSGI wrapper since Flask is WSGI and MCP SDK produces ASGI +import asyncio +from io import BytesIO + +def mcp_wsgi_app(environ, start_response): + """WSGI-to-ASGI bridge for the MCP endpoint.""" + # Read request body + content_length = int(environ.get('CONTENT_LENGTH', 0) or 0) + body = environ['wsgi.input'].read(content_length) if content_length else b'' + + async def run_asgi(): + response_started = False + status_code = None + response_headers = None + response_body = BytesIO() + + async def receive(): + return {"type": "http.request", "body": body} + + async def send(message): + nonlocal response_started, status_code, 
response_headers + if message["type"] == "http.response.start": + status_code = message["status"] + response_headers = [ + (k.decode() if isinstance(k, bytes) else k, + v.decode() if isinstance(v, bytes) else v) + for k, v in message.get("headers", []) + ] + response_started = True + elif message["type"] == "http.response.body": + response_body.write(message.get("body", b"")) + + scope = { + "type": "http", + "asgi": {"version": "3.0"}, + "http_version": "1.1", + "method": environ["REQUEST_METHOD"], + "path": environ.get("PATH_INFO", "/"), + "query_string": environ.get("QUERY_STRING", "").encode(), + "headers": [ + (k.lower().replace("http_", "").replace("_", "-").encode(), + v.encode()) + for k, v in environ.items() + if k.startswith("HTTP_") + ] + ( + [(b"content-type", environ["CONTENT_TYPE"].encode())] + if environ.get("CONTENT_TYPE") else [] + ), + "server": (environ.get("SERVER_NAME", "localhost"), + int(environ.get("SERVER_PORT", 8000))), + } + + await mcp_asgi_app(scope, receive, send) + return status_code, response_headers, response_body.getvalue() + + status_code, headers, body_bytes = asyncio.run(run_asgi()) + status_str = f"{status_code} OK" + start_response(status_str, headers or []) + return [body_bytes] + +app.wsgi_app = DispatcherMiddleware(app.wsgi_app, {"/mcp": mcp_wsgi_app}) +``` + +**Step 4: Update auth bypass for /mcp path** + +In `app.py` line 808, update the auth bypass to include `/mcp`: + +```python +# Before: +if request.path in ("/health", "/api/setup-status", ...): +# After: +if request.path in ("/health", "/api/setup-status", "/api/pat-status", "/api/configure-pat", "/api/app-state") or request.path.startswith("/socket.io") or request.path.startswith("/mcp"): +``` + +Note: `/mcp` auth is handled by the Databricks Apps proxy (same as all other routes), but the Flask `before_request` check would reject because MCP requests from Genie Code may not carry the same headers as browser requests. 
The Databricks Apps proxy still enforces authentication before the request reaches CoDA. + +**Step 5: Run the app locally to verify mount** + +Run: `cd /Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp && uv run python -c "from app import app; print('MCP mounted at /mcp'); print([rule.rule for rule in app.url_map.iter_rules()])"` +Expected: No import errors. `/mcp` will not appear in the printed rules, because it is mounted through `DispatcherMiddleware` rather than registered on `app.url_map`; a clean import is the success signal here. + +**Step 6: Commit** + +```bash +cd /Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp +git add app.py pyproject.toml mcp_server.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "feat: mount MCP server at /mcp with CORS and PTY integration" +``` + +--- + +### Task 4: Add _update_session_field to task_manager + +The MCP server needs to store the `pty_session_id` mapping. Add the helper and its test. + +**Files:** +- Modify: `task_manager.py` (add `_update_session_field`) +- Modify: `tests/test_task_manager.py` (add test) + +**Step 1: Add test** + +```python +# Append to tests/test_task_manager.py + +def test_update_session_field(task_mgr): + session = task_mgr.create_session(email="alice@example.com", user_id="123") + sid = session["session_id"] + + task_mgr._update_session_field(sid, "pty_session_id", "pty-abc-123") + + with open(os.path.join(task_mgr.SESSIONS_DIR, sid, "session.json")) as f: + data = json.load(f) + assert data["pty_session_id"] == "pty-abc-123" +``` + +**Step 2: Add the function to task_manager.py** + +After the `_write_session` function: + +```python +def _update_session_field(session_id: str, key: str, value): + """Update a single field in session.json.""" + data = _read_session(session_id) + data[key] = value + _write_session(session_id, data) +``` + +**Step 3: Run tests** + +Run: `cd /Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp && uv run pytest tests/test_task_manager.py -v` +Expected: All 11 tests PASS + +**Step 4: Commit** + +```bash +cd 
/Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp +git add task_manager.py tests/test_task_manager.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "feat: add _update_session_field helper for PTY mapping" +``` + +--- + +### Task 5: Update requirements.txt + +Regenerate requirements after adding flask-cors. + +**Files:** +- Modify: `pyproject.toml` (already done in Task 3) +- Regenerate: `requirements.txt` + +**Step 1: Regenerate requirements** + +Run: `cd /Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp && uv pip compile pyproject.toml -o requirements.txt` + +**Step 2: Commit** + +```bash +cd /Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp +git add pyproject.toml requirements.txt +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "chore: add flask-cors dependency" +``` + +--- + +### Task 6: Integration Test — End-to-End MCP Flow + +Test the full flow: create session → run task → check status → get result → close session. + +**Files:** +- Create: `tests/test_mcp_integration.py` + +**Step 1: Write the integration test** + +```python +# tests/test_mcp_integration.py +"""Integration test for MCP server flow (no real PTY, mocked app hooks).""" +import json +import os +import pytest +from unittest.mock import patch, MagicMock + +import task_manager +import mcp_server + + +@pytest.fixture(autouse=True) +def setup_env(tmp_path): + """Redirect all state to temp dir and mock PTY hooks.""" + with patch.object(task_manager, "SESSIONS_DIR", str(tmp_path / "sessions")): + # Mock the app hooks (no real PTY in tests) + mcp_server.set_app_hooks( + create_session_fn=lambda label: "pty-mock-123", + send_input_fn=MagicMock(), + close_session_fn=MagicMock(), + ) + yield tmp_path + + +def test_full_mcp_flow(): + """End-to-end: create → run → status → result → close.""" + # 1. 
Create session + result = json.loads(mcp_server.create_session(email="alice@test.com", user_id="u1")) + assert result["status"] == "ready" + sid = result["session_id"] + + # 2. Run task + result = json.loads(mcp_server.run_task( + session_id=sid, + prompt="create a sales pipeline", + email="alice@test.com", + context='{"tables": ["sales.transactions"]}', + )) + assert result["status"] == "running" + tid = result["task_id"] + + # 3. Check status (running, no progress yet) + status = json.loads(mcp_server.get_status(task_id=tid, session_id=sid)) + assert status["status"] == "running" + assert status["progress"] is None + + # 4. Simulate agent writing progress + task_dir = task_manager._task_dir(sid, tid) + with open(os.path.join(task_dir, "status.jsonl"), "w") as f: + f.write(json.dumps({"step": "coding", "message": "Writing pipeline"}) + "\n") + + status = json.loads(mcp_server.get_status(task_id=tid, session_id=sid)) + assert status["progress"]["step"] == "coding" + + # 5. Simulate agent writing result + with open(os.path.join(task_dir, "result.json"), "w") as f: + json.dump({ + "status": "completed", + "summary": "Created sales pipeline with 3 stages", + "files_changed": ["pipelines/sales.py"], + "artifacts": {"job_id": "789"}, + "errors": [] + }, f) + + # 6. Get result + result = json.loads(mcp_server.get_result(task_id=tid, session_id=sid)) + assert result["status"] == "completed" + assert result["summary"] == "Created sales pipeline with 3 stages" + assert result["artifacts"]["job_id"] == "789" + + # 7. 
Complete and close + task_manager.complete_task(sid, tid) + result = json.loads(mcp_server.close_session(session_id=sid)) + assert result["status"] == "closed" + + +def test_busy_session_rejects(): + """Running a second task on a busy session should return error.""" + result = json.loads(mcp_server.create_session(email="bob@test.com")) + sid = result["session_id"] + + # First task + json.loads(mcp_server.run_task(session_id=sid, prompt="task 1", email="bob@test.com")) + + # Second task should fail + result = json.loads(mcp_server.run_task(session_id=sid, prompt="task 2", email="bob@test.com")) + assert "error" in result + assert "busy" in result["error"].lower() +``` + +**Step 2: Run tests** + +Run: `cd /Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp && uv run pytest tests/test_mcp_integration.py -v` +Expected: All 2 tests PASS + +**Step 3: Run all tests together** + +Run: `cd /Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp && uv run pytest tests/ -v` +Expected: All tests PASS + +**Step 4: Commit** + +```bash +cd /Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp +git add tests/test_mcp_integration.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "test: add end-to-end MCP integration test" +``` + +--- + +## Summary + +| Task | What | Files | +|------|------|-------| +| 1 | Task manager (disk state) | `task_manager.py`, `tests/test_task_manager.py` | +| 2 | MCP server (5 tools) | `mcp_server.py`, `tests/test_mcp_server.py` | +| 3 | Flask mount + CORS + PTY helpers | `app.py`, `pyproject.toml` | +| 4 | Session field helper | `task_manager.py`, `tests/test_task_manager.py` | +| 5 | Dependencies | `pyproject.toml`, `requirements.txt` | +| 6 | Integration test | `tests/test_mcp_integration.py` | + +Total: 5 new files, 3 modified files, ~400 lines of production code, ~250 lines of tests. 
diff --git a/docs/superpowers/plans/2026-05-27-coda-mcp-live-session-url.md b/docs/superpowers/plans/2026-05-27-coda-mcp-live-session-url.md new file mode 100644 index 0000000..ade3838 --- /dev/null +++ b/docs/superpowers/plans/2026-05-27-coda-mcp-live-session-url.md @@ -0,0 +1,1900 @@ +# CoDA MCP Live Session URL Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a `viewer_url` to CoDA MCP tool responses so the calling user can watch hermes execute live in a browser, with a 5-minute grace period after task completion and indefinite static replay from an on-disk PTY transcript. + +**Architecture:** Tee PTY bytes to `~/.coda/sessions/{sess}/tasks/{task}/transcript.log` from `read_pty_output`. Replace the immediate post-completion close in `_watch_task` with a `threading.Timer(300, close)`. Mark grace-period PTYs to exempt them from `MAX_CONCURRENT_SESSIONS`. Build `viewer_url` by capturing `X-Forwarded-Host` from inbound requests in an ASGI middleware. The Flask `/api/session/attach` endpoint adds a replay fallback that returns transcript bytes when the live PTY is gone. The SPA reads `?session=` on boot and routes to either the existing `_doAttach` (live) or a new `_doReplay` (static, chunked). + +**Tech Stack:** Python 3 (Flask + FastMCP + python-socketio AsyncServer + Starlette + uvicorn), xterm.js, pytest, `uv` for runs. + +**Spec:** `docs/superpowers/specs/2026-05-27-coda-mcp-live-session-url-design.md` at commit `02431c8` on `feat/coda-mcp-server`. + +--- + +## Conventions used in this plan + +- Worktree: `/Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp/` +- All `git commit` commands use `-c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty"` (per repo convention). No `Co-authored-by` line. 
+- All pytest invocations use `uv run pytest ...` (per repo convention). +- All file paths are relative to the worktree root. + +--- + +## Task 1: `coda_mcp/url_builder.py` — base URL resolution module + +**Files:** +- Create: `coda_mcp/url_builder.py` +- Test: `tests/test_url_builder.py` (new) + +- [ ] **Step 1: Write the failing tests** + +Create `tests/test_url_builder.py`: + +```python +"""Tests for url_builder module — base URL resolution for viewer_url.""" +import os +import importlib +from unittest import mock + +import pytest + + +@pytest.fixture(autouse=True) +def _reset_module(): + """Re-import url_builder fresh for each test (module-level cache).""" + from coda_mcp import url_builder + importlib.reload(url_builder) + yield + + +def test_returns_none_when_neither_env_nor_cache(): + from coda_mcp import url_builder + assert url_builder.build_viewer_url("pty-1") is None + + +def test_env_override_wins(): + from coda_mcp import url_builder + with mock.patch.dict(os.environ, {"CODA_APP_URL": "https://override.example.com"}): + assert url_builder.build_viewer_url("pty-1") == \ + "https://override.example.com/?session=pty-1" + + +def test_env_override_strips_trailing_slash(): + from coda_mcp import url_builder + with mock.patch.dict(os.environ, {"CODA_APP_URL": "https://override.example.com/"}): + assert url_builder.build_viewer_url("pty-1") == \ + "https://override.example.com/?session=pty-1" + + +def test_header_capture_used_when_no_env(): + from coda_mcp import url_builder + url_builder.capture_from_headers("app.databricksapps.com") + assert url_builder.build_viewer_url("pty-1") == \ + "https://app.databricksapps.com/?session=pty-1" + + +def test_env_overrides_header_capture(): + from coda_mcp import url_builder + url_builder.capture_from_headers("captured.example.com") + with mock.patch.dict(os.environ, {"CODA_APP_URL": "https://override.example.com"}): + assert url_builder.build_viewer_url("pty-1") == \ + "https://override.example.com/?session=pty-1" + + 
+def test_header_capture_overwrites_previous(): + from coda_mcp import url_builder + url_builder.capture_from_headers("first.example.com") + url_builder.capture_from_headers("second.example.com") + assert "second.example.com" in url_builder.build_viewer_url("pty-1") + + +def test_capture_empty_string_does_not_overwrite(): + from coda_mcp import url_builder + url_builder.capture_from_headers("good.example.com") + url_builder.capture_from_headers("") + assert "good.example.com" in url_builder.build_viewer_url("pty-1") + + +def test_capture_none_does_not_crash(): + from coda_mcp import url_builder + url_builder.capture_from_headers(None) + assert url_builder.build_viewer_url("pty-1") is None +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `uv run pytest tests/test_url_builder.py -v` +Expected: ImportError on `from coda_mcp import url_builder` — module does not exist yet. + +- [ ] **Step 3: Implement `coda_mcp/url_builder.py`** + +Create `coda_mcp/url_builder.py`: + +```python +"""Builds the viewer_url returned by CoDA MCP tools. + +Resolution order: +1. ``CODA_APP_URL`` env var (explicit override for local dev / power users). +2. Module-level cache populated by ``AppUrlCaptureMiddleware`` from the + ``X-Forwarded-Host`` header (officially provided by Databricks Apps). +3. ``None`` — caller omits the field entirely. + +The cache is process-global (single uvicorn worker per app) and refreshed +on every inbound HTTP request. +""" +from __future__ import annotations + +import os +from typing import Optional + +_app_url_cache: Optional[str] = None + + +def capture_from_headers(host: Optional[str]) -> None: + """Called by the ASGI middleware on every inbound HTTP request. + + No-op when ``host`` is falsy (None or empty) to avoid wiping a good + cache value with a missing header on a probe/CORS preflight. 
+ """ + global _app_url_cache + if host: + _app_url_cache = host + + +def build_viewer_url(pty_session_id: str) -> Optional[str]: + """Return the full viewer URL for a PTY session, or None if no base is known.""" + override = os.environ.get("CODA_APP_URL", "").strip() + if override: + base = override.rstrip("/") + elif _app_url_cache: + base = f"https://{_app_url_cache}" + else: + return None + return f"{base}/?session={pty_session_id}" +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `uv run pytest tests/test_url_builder.py -v` +Expected: 8 passed. + +- [ ] **Step 5: Commit** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + add coda_mcp/url_builder.py tests/test_url_builder.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + commit -m "feat(coda-mcp): url_builder module for viewer_url resolution" +``` + +--- + +## Task 2: `task_manager.find_task_dir_by_pty_session` — reverse lookup with TTL cache + +**Files:** +- Modify: `coda_mcp/task_manager.py` (add new function at end, before `cleanup_expired_tasks`) +- Test: `tests/test_task_manager.py` (extend) + +- [ ] **Step 1: Write the failing tests** + +Append to `tests/test_task_manager.py` (locate existing test file; this assumes pytest fixtures `tmp_path` and patching of `SESSIONS_DIR` already exist in the file — confirm pattern, otherwise use the snippet below as a self-contained module): + +```python +import json +import os +import time +from unittest import mock + +import pytest + +from coda_mcp import task_manager + + +@pytest.fixture +def sessions_root(tmp_path, monkeypatch): + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + # Reset the lookup cache between tests + task_manager._pty_lookup_cache.clear() + return tmp_path + + +def _make_session_dir(root, sess_id, pty_id, current_task=None, completed=None): + sdir = root / sess_id + (sdir / "tasks").mkdir(parents=True) + data = { + "session_id": 
sess_id, + "pty_session_id": pty_id, + "current_task": current_task, + "completed_tasks": completed or [], + "status": "ready", + } + (sdir / "session.json").write_text(json.dumps(data)) + return sdir + + +def test_find_task_dir_hits_current_task(sessions_root): + _make_session_dir(sessions_root, "sess-A", "pty-1", current_task="task-X") + result = task_manager.find_task_dir_by_pty_session("pty-1") + assert result == str(sessions_root / "sess-A" / "tasks" / "task-X") + + +def test_find_task_dir_falls_back_to_last_completed(sessions_root): + _make_session_dir( + sessions_root, "sess-A", "pty-1", + current_task=None, + completed=["task-old", "task-recent"], + ) + result = task_manager.find_task_dir_by_pty_session("pty-1") + assert result == str(sessions_root / "sess-A" / "tasks" / "task-recent") + + +def test_find_task_dir_returns_none_when_no_match(sessions_root): + _make_session_dir(sessions_root, "sess-A", "pty-1", current_task="task-X") + assert task_manager.find_task_dir_by_pty_session("pty-NONEXIST") is None + + +def test_find_task_dir_ignores_corrupt_session_json(sessions_root): + sdir = sessions_root / "sess-bad" + sdir.mkdir() + (sdir / "session.json").write_text("not json {{{") + _make_session_dir(sessions_root, "sess-good", "pty-1", current_task="task-X") + assert task_manager.find_task_dir_by_pty_session("pty-1") == \ + str(sessions_root / "sess-good" / "tasks" / "task-X") + + +def test_find_task_dir_cache_hits_within_ttl(sessions_root): + _make_session_dir(sessions_root, "sess-A", "pty-1", current_task="task-X") + task_manager.find_task_dir_by_pty_session("pty-1") + # Remove session.json — cache should still return the hit + (sessions_root / "sess-A" / "session.json").unlink() + assert task_manager.find_task_dir_by_pty_session("pty-1") == \ + str(sessions_root / "sess-A" / "tasks" / "task-X") + + +def test_find_task_dir_cache_expires(sessions_root, monkeypatch): + monkeypatch.setattr(task_manager, "_PTY_LOOKUP_TTL", 0.01) + 
_make_session_dir(sessions_root, "sess-A", "pty-1", current_task="task-X") + task_manager.find_task_dir_by_pty_session("pty-1") + (sessions_root / "sess-A" / "session.json").unlink() + time.sleep(0.02) + assert task_manager.find_task_dir_by_pty_session("pty-1") is None + + +def test_find_task_dir_no_sessions_dir(sessions_root, monkeypatch): + monkeypatch.setattr(task_manager, "SESSIONS_DIR", "/nonexistent/path/that/does/not/exist") + assert task_manager.find_task_dir_by_pty_session("pty-1") is None +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `uv run pytest tests/test_task_manager.py -v -k find_task_dir` +Expected: 7 errors at fixture setup with `AttributeError: module 'coda_mcp.task_manager' has no attribute '_pty_lookup_cache'` — the `sessions_root` fixture clears the cache before each test body runs, so pytest reports errors rather than failures (`find_task_dir_by_pty_session` does not exist yet either). + +- [ ] **Step 3: Add module-level cache and function** + +Edit `coda_mcp/task_manager.py`. Near the top, after the existing module constants (after `TASK_TTL_S = ...`): + +```python +# ── PTY → task-dir reverse lookup (used by attach_session replay fallback) ── + +_pty_lookup_cache: dict[str, tuple[str, float]] = {} # pty_id -> (task_dir, ts) +_PTY_LOOKUP_TTL = 60.0 # seconds +``` + +Then before `def cleanup_expired_tasks()`, add: + +```python +def find_task_dir_by_pty_session(pty_session_id: str) -> str | None: + """Find the task dir whose session.json carries this pty_session_id. + + Returns the path to the active task dir, or — if the session has completed — + the most recently completed task dir. Returns None on no match. + + Cached for ``_PTY_LOOKUP_TTL`` seconds to avoid disk scans on every browser + refresh. + + Invariant: CoDA MCP sessions are ephemeral — one task per session. If the + lifecycle ever changes to allow multiple tasks per session, this function + must be revisited to pick the active or grace-period task rather than + ``completed_tasks[-1]``.
+ """ + now = time.time() + cached = _pty_lookup_cache.get(pty_session_id) + if cached and (now - cached[1]) < _PTY_LOOKUP_TTL: + return cached[0] + + if not os.path.isdir(SESSIONS_DIR): + return None + + for sess_name in os.listdir(SESSIONS_DIR): + sess_file = os.path.join(SESSIONS_DIR, sess_name, "session.json") + try: + with open(sess_file) as f: + data = json.load(f) + except (OSError, json.JSONDecodeError): + continue + + if data.get("pty_session_id") != pty_session_id: + continue + + candidate = data.get("current_task") or ( + data["completed_tasks"][-1] if data.get("completed_tasks") else None + ) + if candidate: + tdir = os.path.join(SESSIONS_DIR, sess_name, "tasks", candidate) + _pty_lookup_cache[pty_session_id] = (tdir, now) + return tdir + + return None +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `uv run pytest tests/test_task_manager.py -v -k find_task_dir` +Expected: 7 passed. + +- [ ] **Step 5: Commit** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + add coda_mcp/task_manager.py tests/test_task_manager.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + commit -m "feat(coda-mcp): find_task_dir_by_pty_session lookup with TTL cache" +``` + +--- + +## Task 3: `app.py::read_pty_output` — tee PTY bytes to transcript with lock-guarded writes + +**Files:** +- Modify: `app.py` (top: new constant; `read_pty_output` function lines 861-910) +- Test: `tests/test_transcript.py` (new — standalone unit tests for the tee logic; integration tested later) + +- [ ] **Step 1: Write the failing tests** + +Create `tests/test_transcript.py`: + +```python +"""Unit tests for the transcript tee in read_pty_output. + +These tests exercise the tee logic directly by simulating output dispatch into +a synthesized session dict and a real on-disk transcript file. The full PTY +read loop is not exercised here — see test_mcp_integration.py for E2E. 
+""" +import os +import stat +import threading +from pathlib import Path + +import pytest + + +@pytest.fixture +def session_dict(tmp_path): + """Build a minimally valid sessions[pty_id] entry with a real transcript handle.""" + transcript = tmp_path / "transcript.log" + fh = open(transcript, "ab", buffering=0) + os.fchmod(fh.fileno(), 0o600) + return { + "transcript_path": str(transcript), + "transcript_fh": fh, + "transcript_bytes": 0, + "lock": threading.Lock(), + } + + +def _write_chunk(session, output: bytes, cap: int = 10 * 1024 * 1024) -> None: + """Mirror the tee logic from read_pty_output for unit testing.""" + from app import _tee_transcript_chunk + _tee_transcript_chunk(session, output, cap=cap) + + +def test_tee_writes_bytes_and_flushes(session_dict): + _write_chunk(session_dict, b"hello world\n") + assert session_dict["transcript_bytes"] == 12 + assert Path(session_dict["transcript_path"]).read_bytes() == b"hello world\n" + + +def test_tee_chmod_is_0600(session_dict): + mode = stat.S_IMODE(os.stat(session_dict["transcript_path"]).st_mode) + assert mode == 0o600 + + +def test_tee_truncation_at_cap(session_dict): + cap = 16 + _write_chunk(session_dict, b"AAAAAAAAAA", cap=cap) + _write_chunk(session_dict, b"BBBBBBBBBBBBBBBBBBBB", cap=cap) + body = Path(session_dict["transcript_path"]).read_bytes() + # 10 A's, then 6 B's, then truncation marker. + assert body.startswith(b"AAAAAAAAAABBBBBB") + assert b"[transcript truncated at" in body + # Handle is closed after marker + assert session_dict["transcript_fh"] is None + + +def test_tee_no_op_when_fh_is_none(session_dict): + session_dict["transcript_fh"] = None + _write_chunk(session_dict, b"should not write") + assert Path(session_dict["transcript_path"]).read_bytes() == b"" + + +def test_tee_handles_write_error(session_dict, monkeypatch): + # Close the handle out from under the tee — write() will ValueError. 
+ session_dict["transcript_fh"].close() + _write_chunk(session_dict, b"this will fail") + # Handle replaced with None; no crash. + assert session_dict["transcript_fh"] is None +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `uv run pytest tests/test_transcript.py -v` +Expected: ImportError on `from app import _tee_transcript_chunk`. + +- [ ] **Step 3: Add the helper and the constant in `app.py`** + +Near the top of `app.py` (after the existing constants block around line 46-50), add: + +```python +TRANSCRIPT_CAP_BYTES = 10 * 1024 * 1024 # 10 MB soft cap per transcript +``` + +Then add the helper (place it near `read_pty_output`, e.g., immediately above it): + +```python +def _tee_transcript_chunk(session, output: bytes, cap: int = TRANSCRIPT_CAP_BYTES) -> None: + """Append PTY output to the transcript file. Single-writer (read_pty_output). + + All file-handle access is under ``session["lock"]`` so we never race the + Timer-driven close path in ``terminate_session``. The ``ValueError`` catch + is belt-and-suspenders for the tiny window where the handle is closed + between the ``is not None`` check and the actual ``write`` call (the lock + prevents this, but be defensive). 
+ """ + with session["lock"]: + fh = session.get("transcript_fh") + written = session.get("transcript_bytes", 0) + if fh is None: + return + remaining = cap - written + if remaining <= 0: + return + chunk = output[:remaining] + try: + fh.write(chunk) + fh.flush() + session["transcript_bytes"] = written + len(chunk) + if len(chunk) < len(output): + fh.write(b"\n[transcript truncated at %d bytes]\n" % cap) + fh.flush() + fh.close() + session["transcript_fh"] = None + except (OSError, ValueError) as exc: + logger.warning("transcript write failed: %s", exc) + try: + fh.close() + except Exception: + pass + session["transcript_fh"] = None +``` + +- [ ] **Step 4: Wire the tee into `read_pty_output`** + +In `app.py::read_pty_output`, locate the block (currently around line 880-888): + +```python + decoded = output.decode(errors="replace") + with session_lock: + # Buffer for HTTP polling fallback (AC-15) + session["output_buffer"].append(decoded) + session["last_poll_time"] = time.time() # Keep session alive during WS output + # Push via WebSocket to the session room (AC-8) + _emit_from_thread('terminal_output', + {'session_id': session_id, 'output': decoded}, + room=session_id) +``` + +Immediately after the `_emit_from_thread` call (and before the `else:` branch), add: + +```python + # Tee to transcript file if enabled for this session + _tee_transcript_chunk(session, output) +``` + +- [ ] **Step 5: Run unit tests to verify they pass** + +Run: `uv run pytest tests/test_transcript.py -v` +Expected: 5 passed. + +- [ ] **Step 6: Run existing terminal tests to verify no regression** + +Run: `uv run pytest tests/test_terminal_env_strip.py tests/test_session_linger.py tests/test_session_detach.py -v` +Expected: existing pass count unchanged (no failures introduced). 
+ +- [ ] **Step 7: Commit** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + add app.py tests/test_transcript.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + commit -m "feat: tee PTY output to transcript.log with lock-guarded writes" +``` + +--- + +## Task 4: `app.py` — open transcript handle in `mcp_create_pty_session` + close in `terminate_session` + +**Files:** +- Modify: `app.py::mcp_create_pty_session` (lines ~1324-1387) +- Modify: `app.py::terminate_session` (lines ~912-936) + +- [ ] **Step 1: Write the failing tests** + +Append to `tests/test_transcript.py`: + +```python +def test_mcp_create_pty_session_opens_transcript_when_path_given(tmp_path, monkeypatch): + monkeypatch.setattr("app.MAX_CONCURRENT_SESSIONS", 5) + transcript = tmp_path / "transcript.log" + from app import mcp_create_pty_session, sessions, mcp_close_pty_session + sid = mcp_create_pty_session(label="test", transcript_path=str(transcript)) + try: + assert transcript.exists() + mode = stat.S_IMODE(os.stat(transcript).st_mode) + assert mode == 0o600 + sess = sessions[sid] + assert sess["transcript_path"] == str(transcript) + assert sess["transcript_fh"] is not None + assert sess["transcript_bytes"] == 0 + finally: + mcp_close_pty_session(sid) + + +def test_mcp_create_pty_session_no_transcript_when_path_none(monkeypatch): + monkeypatch.setattr("app.MAX_CONCURRENT_SESSIONS", 5) + from app import mcp_create_pty_session, sessions, mcp_close_pty_session + sid = mcp_create_pty_session(label="test") + try: + sess = sessions[sid] + assert sess.get("transcript_fh") is None + assert sess.get("transcript_path") is None + finally: + mcp_close_pty_session(sid) + + +def test_terminate_session_closes_transcript_handle(tmp_path, monkeypatch): + monkeypatch.setattr("app.MAX_CONCURRENT_SESSIONS", 5) + transcript = tmp_path / "transcript.log" + from app import mcp_create_pty_session, sessions, mcp_close_pty_session + sid = 
mcp_create_pty_session(label="test", transcript_path=str(transcript)) + fh = sessions[sid]["transcript_fh"] + mcp_close_pty_session(sid) + assert fh.closed + # Session removed from dict + assert sid not in sessions +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `uv run pytest tests/test_transcript.py -v -k "create_pty or terminate"` +Expected: 2 failures — `mcp_create_pty_session` does not yet accept `transcript_path`. The third selected test, `test_mcp_create_pty_session_no_transcript_when_path_none`, never passes `transcript_path` and is expected to pass already. + +- [ ] **Step 3: Modify `mcp_create_pty_session` signature** + +In `app.py`, change the signature (line ~1324): + +```python +def mcp_create_pty_session(label: str = "hermes-mcp", transcript_path: str | None = None) -> str: +``` + +After the `os.close(slave_fd)` line (around line 1358) and before `session_id = str(uuid.uuid4())`, add the transcript open. Place it inside the existing flow so the file handle is constructed before being stored: + +```python + # Open transcript file (if requested) before locking the session dict. + transcript_fh = None + if transcript_path: + try: + os.makedirs(os.path.dirname(transcript_path), exist_ok=True) + transcript_fh = open(transcript_path, "ab", buffering=0) + os.fchmod(transcript_fh.fileno(), 0o600) + except OSError as exc: + logger.warning("Could not open transcript at %s: %s", transcript_path, exc) + transcript_fh = None +``` + +Modify the `sessions[session_id] = { ...
}` block to include the new fields: + +```python + sessions[session_id] = { + "master_fd": master_fd, + "pid": pid, + "output_buffer": deque(maxlen=1000), + "lock": threading.Lock(), + "last_poll_time": time.time(), + "created_at": time.time(), + "label": label, + "transcript_path": transcript_path if transcript_fh else None, + "transcript_fh": transcript_fh, + "transcript_bytes": 0, + "grace": False, + } +``` + +- [ ] **Step 4: Modify `terminate_session` to close the transcript handle** + +In `app.py::terminate_session` (line ~912), at the top of the function (right after the `logger.info` and the `_emit_from_thread('session_closed', ...)` call), add: + +```python + # Close transcript handle (if any) under per-session lock; swap-then-close + # outside the lock to avoid blocking on slow filesystems. + with sessions_lock: + sess = sessions.get(session_id) + if sess is not None: + with sess["lock"]: + transcript_fh = sess.get("transcript_fh") + sess["transcript_fh"] = None + if transcript_fh is not None: + try: + transcript_fh.close() + except Exception: + pass +``` + +- [ ] **Step 5: Run tests to verify they pass** + +Run: `uv run pytest tests/test_transcript.py -v -k "create_pty or terminate"` +Expected: 3 passed. + +- [ ] **Step 6: Run full transcript test suite** + +Run: `uv run pytest tests/test_transcript.py -v` +Expected: 8 passed. 
+ +- [ ] **Step 7: Commit** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + add app.py tests/test_transcript.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + commit -m "feat: open transcript handle in mcp_create_pty_session; close in terminate_session" +``` + +--- + +## Task 5: `app.py` — grace-period exemption from `MAX_CONCURRENT_SESSIONS` + helper hooks + +**Files:** +- Modify: `app.py` (the two `MAX_CONCURRENT_SESSIONS` check sites + add two new helpers near the bottom near other MCP hook functions) + +- [ ] **Step 1: Write the failing tests** + +Append to `tests/test_transcript.py`: + +```python +def test_grace_period_pty_does_not_count_toward_max(monkeypatch): + monkeypatch.setattr("app.MAX_CONCURRENT_SESSIONS", 2) + from app import mcp_create_pty_session, mcp_close_pty_session, sessions, _mark_grace_for_session + + sid1 = mcp_create_pty_session(label="t1") + sid2 = mcp_create_pty_session(label="t2") + try: + # At cap. A third creation should raise. + with pytest.raises(RuntimeError, match="Maximum"): + mcp_create_pty_session(label="t3") + # Mark one as grace; now we should have headroom. 
+ _mark_grace_for_session(sid1) + assert sessions[sid1]["grace"] is True + sid3 = mcp_create_pty_session(label="t3") + mcp_close_pty_session(sid3) + finally: + for s in [sid1, sid2]: + try: mcp_close_pty_session(s) + except Exception: pass + + +def test_bump_session_last_poll_advances_clock(monkeypatch): + monkeypatch.setattr("app.MAX_CONCURRENT_SESSIONS", 5) + from app import mcp_create_pty_session, mcp_close_pty_session, sessions, _bump_session_last_poll + sid = mcp_create_pty_session(label="t") + try: + baseline = sessions[sid]["last_poll_time"] + _bump_session_last_poll(sid, 300) + assert sessions[sid]["last_poll_time"] >= baseline + 299 + finally: + mcp_close_pty_session(sid) + + +def test_mark_grace_on_missing_session_is_noop(): + from app import _mark_grace_for_session + _mark_grace_for_session("nonexistent-pty-id") # must not raise + + +def test_bump_session_last_poll_missing_is_noop(): + from app import _bump_session_last_poll + _bump_session_last_poll("nonexistent-pty-id", 100) # must not raise +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `uv run pytest tests/test_transcript.py -v -k "grace or bump_session"` +Expected: failures — `_mark_grace_for_session` / `_bump_session_last_poll` don't exist; the cap check still uses raw `len`. + +- [ ] **Step 3: Replace the `MAX_CONCURRENT_SESSIONS` checks** + +There are two checkpoints in `app.py`: + +**Site 1 — `create_session()` (around line 1252):** + +```python + with sessions_lock: + if len(sessions) >= MAX_CONCURRENT_SESSIONS: + return jsonify({"error": f"Maximum {MAX_CONCURRENT_SESSIONS} concurrent sessions reached. Close an existing session first."}), 429 +``` + +Replace with: + +```python + with sessions_lock: + active = sum(1 for s in sessions.values() if not s.get("grace")) + if active >= MAX_CONCURRENT_SESSIONS: + return jsonify({"error": f"Maximum {MAX_CONCURRENT_SESSIONS} concurrent sessions reached. 
Close an existing session first."}), 429 +``` + +**Site 2 — `mcp_create_pty_session()` (around lines 1326-1330 and again 1362-1371):** + +Both `len(sessions) >= MAX_CONCURRENT_SESSIONS` checks become: + +```python + active = sum(1 for s in sessions.values() if not s.get("grace")) + if active >= MAX_CONCURRENT_SESSIONS: + raise RuntimeError( + f"Maximum {MAX_CONCURRENT_SESSIONS} concurrent sessions reached." + ) +``` + +(Apply at both pre-spawn and post-spawn check sites.) + +- [ ] **Step 4: Add the two helper functions** + +Place near `mcp_close_pty_session` (around line 1399): + +```python +def _mark_grace_for_session(session_id: str) -> None: + """Mark a PTY session as 'in grace period' so it doesn't count toward + MAX_CONCURRENT_SESSIONS. Called by ``_watch_task`` immediately before + scheduling the deferred close Timer. + + No-op if the session does not exist (e.g., already torn down). + """ + with sessions_lock: + sess = sessions.get(session_id) + if sess is None: + return + with sess["lock"]: + sess["grace"] = True + + +def _bump_session_last_poll(session_id: str, delta_s: float) -> None: + """Advance ``last_poll_time`` by ``delta_s`` so the idle reaper can't + preempt the Timer's deferred close. Defensive: at the current 24h + SESSION_TIMEOUT_SECONDS the reaper would never win anyway, but a future + tuning shouldn't break the grace window. + + No-op if the session does not exist. + """ + with sessions_lock: + sess = sessions.get(session_id) + if sess is None: + return + with sess["lock"]: + sess["last_poll_time"] = time.time() + delta_s +``` + +- [ ] **Step 5: Run tests to verify they pass** + +Run: `uv run pytest tests/test_transcript.py -v -k "grace or bump_session"` +Expected: 4 passed. + +- [ ] **Step 6: Run full transcript suite + session limit test for regression** + +Run: `uv run pytest tests/test_transcript.py tests/test_session_limit.py -v` +Expected: all pass. 
+ +- [ ] **Step 7: Commit** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + add app.py tests/test_transcript.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + commit -m "feat: exempt grace-period PTYs from MAX_CONCURRENT_SESSIONS" +``` + +--- + +## Task 6: `mcp_server.py` — wire deferred close via `Timer`; update `set_app_hooks` + +**Files:** +- Modify: `coda_mcp/mcp_server.py` (lines 70-90 hook plumbing; lines 94-148 `_watch_task` + helpers) +- Test: `tests/test_mcp_server.py` (extend) + +- [ ] **Step 1: Write the failing tests** + +Append to `tests/test_mcp_server.py`: + +```python +import threading +from unittest import mock + +from coda_mcp import mcp_server, task_manager + + +def test_set_app_hooks_accepts_grace_and_bump_hooks(): + create = mock.MagicMock() + send = mock.MagicMock() + close = mock.MagicMock() + mark_grace = mock.MagicMock() + bump_poll = mock.MagicMock() + mcp_server.set_app_hooks(create, send, close, mark_grace, bump_poll) + assert mcp_server._app_mark_grace is mark_grace + assert mcp_server._app_bump_poll is bump_poll + + +def test_watch_task_schedules_timer_on_completion(tmp_path, monkeypatch): + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + # Create a session + task with a faked result.json + s = task_manager.create_session("u@x", "uid", label="t") + sid = s["session_id"] + task_manager._update_session_field(sid, "pty_session_id", "pty-abc") + t = task_manager.create_task(sid, "do thing", "u@x") + tid = t["task_id"] + tdir = task_manager._task_dir(sid, tid) + task_manager._write_json(tdir + "/result.json", {"status": "completed"}) + + mark = mock.MagicMock() + bump = mock.MagicMock() + closer = mock.MagicMock() + mcp_server.set_app_hooks(mock.MagicMock(), mock.MagicMock(), closer, mark, bump) + + timer_created = [] + real_timer = threading.Timer + + def fake_timer(seconds, fn, args=None, kwargs=None): + 
timer_created.append((seconds, fn, args)) + t = real_timer(seconds, fn, args=args, kwargs=kwargs) + return t + + monkeypatch.setattr(mcp_server.threading, "Timer", fake_timer) + + # Use a very short watch interval and ensure no real Timer fires + monkeypatch.setattr(mcp_server, "GRACE_PERIOD_S", 0.05) + + # Run one iteration manually + mcp_server._watch_task(sid, tid, timeout_s=10) + + # Timer should be scheduled for GRACE_PERIOD_S seconds with closer + pty_session_id + assert len(timer_created) == 1 + delay, fn, args = timer_created[0] + assert delay == 0.05 + assert fn is closer + assert args == ("pty-abc",) + + # _mark_grace and _bump_session_last_poll should have been called + mark.assert_called_once_with("pty-abc") + bump.assert_called_once_with("pty-abc", 0.05) +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `uv run pytest tests/test_mcp_server.py -v -k "set_app_hooks_accepts or watch_task_schedules"` +Expected: failures — extra params on `set_app_hooks` not accepted; `_watch_task` calls close synchronously. + +- [ ] **Step 3: Extend `set_app_hooks` and module state** + +In `coda_mcp/mcp_server.py`, at the top of the "App hooks" block (around line 70), expand: + +```python +_app_create_session = None +_app_send_input = None +_app_close_session = None +_app_mark_grace = None +_app_bump_poll = None + +GRACE_PERIOD_S = 300 # 5 minutes + + +def set_app_hooks( + create_session_fn, + send_input_fn, + close_session_fn, + mark_grace_fn=None, + bump_poll_fn=None, +): + """Wire up Flask app callbacks for PTY operations. + + The two new optional hooks (mark_grace, bump_poll) are used by ``_watch_task`` + to defer PTY close by ``GRACE_PERIOD_S`` after task completion so live viewers + can keep watching for a few minutes. 
+ """ + global _app_create_session, _app_send_input, _app_close_session + global _app_mark_grace, _app_bump_poll + _app_create_session = create_session_fn + _app_send_input = send_input_fn + _app_close_session = close_session_fn + _app_mark_grace = mark_grace_fn + _app_bump_poll = bump_poll_fn +``` + +- [ ] **Step 4: Replace the immediate close inside `_watch_task`** + +Replace the existing `_close_pty_for_session(session_id)` calls in `_watch_task` (one in the completion branch around line 117, one in the timeout branch around line 144) with the deferred-Timer helper. Add a new helper at the bottom of the existing helper section (right after `_close_pty_for_session` around line 161): + +```python +def _schedule_deferred_close(session_id: str) -> None: + """Mark the PTY as in-grace and schedule a delayed close. + + Both completion and timeout paths call this in place of the immediate + ``_close_pty_for_session``. The Timer is a daemon thread so it doesn't + block uvicorn shutdown. + """ + if _app_close_session is None: + return + try: + session = task_manager._read_session(session_id) + except task_manager.SessionNotFoundError: + return + pty_session_id = session.get("pty_session_id") + if not pty_session_id: + return + + if _app_mark_grace is not None: + _app_mark_grace(pty_session_id) + if _app_bump_poll is not None: + _app_bump_poll(pty_session_id, GRACE_PERIOD_S) + + t = threading.Timer(GRACE_PERIOD_S, _app_close_session, args=(pty_session_id,)) + t.daemon = True + t.start() + logger.info( + "Watcher: scheduled deferred close for pty %s in %ds", + pty_session_id, GRACE_PERIOD_S, + ) +``` + +Then in `_watch_task`, replace both occurrences of `_close_pty_for_session(session_id)` with `_schedule_deferred_close(session_id)`. + +- [ ] **Step 5: Run tests to verify they pass** + +Run: `uv run pytest tests/test_mcp_server.py -v -k "set_app_hooks_accepts or watch_task_schedules"` +Expected: 2 passed. 
+ +- [ ] **Step 6: Run full mcp_server test suite for regression** + +Run: `uv run pytest tests/test_mcp_server.py -v` +Expected: all pass (existing tests should be unaffected since hooks default to None). + +- [ ] **Step 7: Commit** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + add coda_mcp/mcp_server.py tests/test_mcp_server.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + commit -m "feat(coda-mcp): defer PTY close by GRACE_PERIOD_S via threading.Timer" +``` + +--- + +## Task 7: `mcp_server.py` — return `viewer_url` from all three tools + pass `transcript_path` to PTY creation + update instructions + +**Files:** +- Modify: `coda_mcp/mcp_server.py` (`coda_run` body, `coda_inbox` body, `coda_get_result` body, `instructions` block) + +- [ ] **Step 1: Write the failing tests** + +Append to `tests/test_mcp_server.py`: + +```python +import asyncio +import json +import os +from unittest import mock + +from coda_mcp import mcp_server, task_manager, url_builder + + +def _run(coro): + return asyncio.get_event_loop().run_until_complete(coro) if not asyncio.iscoroutine(coro) else asyncio.run(coro) + + +def test_coda_run_includes_viewer_url_when_builder_returns_one(tmp_path, monkeypatch): + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + monkeypatch.setattr(url_builder, "_app_url_cache", "app.example.com") + + create = mock.MagicMock(return_value="pty-abc") + send = mock.MagicMock() + closer = mock.MagicMock() + mcp_server.set_app_hooks(create, send, closer, mock.MagicMock(), mock.MagicMock()) + + result_json = asyncio.run(mcp_server.coda_run(prompt="do it", email="u@x")) + result = json.loads(result_json) + assert result["status"] == "running" + assert "?session=pty-abc" in result["viewer_url"] + assert result["viewer_url"].startswith("https://app.example.com") + + +def test_coda_run_omits_viewer_url_when_builder_returns_none(tmp_path, monkeypatch): + 
monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + monkeypatch.setattr(url_builder, "_app_url_cache", None) + monkeypatch.delenv("CODA_APP_URL", raising=False) + + create = mock.MagicMock(return_value="pty-abc") + mcp_server.set_app_hooks(create, mock.MagicMock(), mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + + result_json = asyncio.run(mcp_server.coda_run(prompt="do it", email="u@x")) + result = json.loads(result_json) + # viewer_url present but None when builder returns None + assert result.get("viewer_url") is None + + +def test_coda_run_passes_transcript_path_to_create_session(tmp_path, monkeypatch): + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + create = mock.MagicMock(return_value="pty-abc") + mcp_server.set_app_hooks(create, mock.MagicMock(), mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + + asyncio.run(mcp_server.coda_run(prompt="do it", email="u@x")) + # create_session was called with transcript_path=... pointing into ~/.coda/sessions//tasks//transcript.log + kwargs = create.call_args.kwargs + assert "transcript_path" in kwargs + assert kwargs["transcript_path"].endswith("transcript.log") + assert "tasks" in kwargs["transcript_path"] + + +def test_coda_inbox_decorates_each_task_with_viewer_url(tmp_path, monkeypatch): + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + monkeypatch.setattr(url_builder, "_app_url_cache", "app.example.com") + + # Seed one session with one task and a pty_session_id + s = task_manager.create_session("u@x", "uid", label="t") + sid = s["session_id"] + task_manager._update_session_field(sid, "pty_session_id", "pty-xyz") + task_manager.create_task(sid, "prompt", "u@x") + + result_json = asyncio.run(mcp_server.coda_inbox()) + result = json.loads(result_json) + assert len(result["tasks"]) == 1 + assert "viewer_url" in result["tasks"][0] + assert "?session=pty-xyz" in result["tasks"][0]["viewer_url"] + + +def test_coda_get_result_includes_viewer_url(tmp_path, 
monkeypatch): + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + monkeypatch.setattr(url_builder, "_app_url_cache", "app.example.com") + + s = task_manager.create_session("u@x", "uid", label="t") + sid = s["session_id"] + task_manager._update_session_field(sid, "pty_session_id", "pty-xyz") + t = task_manager.create_task(sid, "prompt", "u@x") + tid = t["task_id"] + tdir = task_manager._task_dir(sid, tid) + task_manager._write_json(tdir + "/result.json", { + "status": "completed", "summary": "ok", + }) + + result_json = asyncio.run(mcp_server.coda_get_result(tid, sid)) + result = json.loads(result_json) + assert "viewer_url" in result + assert "?session=pty-xyz" in result["viewer_url"] +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `uv run pytest tests/test_mcp_server.py -v -k "viewer_url or transcript_path"` +Expected: failures — fields not present, `transcript_path` not passed. + +- [ ] **Step 3: Modify `coda_run`** + +In `coda_mcp/mcp_server.py`, at the top of the file add the import: + +```python +from coda_mcp import url_builder +``` + +In the body of `coda_run` (around line 219), modify the PTY creation block to compute and pass the transcript path: + +```python + # Create PTY if hooks are wired + if _app_create_session is not None: + transcript_path = os.path.join( + task_manager._task_dir(session_id, _new_task_id_preview := task_manager._new_task_id()), + "transcript.log", + ) +``` + +Do NOT apply the snippet above — it is shown only to illustrate the problem: `task_id` isn't known until after `task_manager.create_task` runs, so `transcript_path` cannot be computed before the task exists. Restructure instead: create the task FIRST (so we have task_id), then create the PTY with transcript path, then send the input. The existing order is: create_session → create_pty → update session with pty_id → create_task → send_input. We need: create_session → create_task → create_pty(transcript_path) → update session with pty_id → send_input. 
+ +Replace the existing PTY-create + create_task block (lines ~218-258) with this restructured version: + +```python + # Create task first (we need task_id to compute transcript_path). + result = task_manager.create_task( + session_id=session_id, + prompt=prompt, + email=email, + context=ctx, + timeout_s=timeout_s, + permissions=permissions, + previous_session_id=previous_session_id or None, + ) + task_id = result["task_id"] + + pty_session_id = None + if _app_create_session is not None: + transcript_path = os.path.join( + task_manager._task_dir(session_id, task_id), + "transcript.log", + ) + pty_session_id = _app_create_session( + label="hermes-mcp", + transcript_path=transcript_path, + ) + task_manager._update_session_field( + session_id, "pty_session_id", pty_session_id + ) + + # Send to PTY if hooks are wired + if _app_send_input is not None and pty_session_id is not None: + tdir = task_manager._task_dir(session_id, task_id) + prompt_path = os.path.join(tdir, "prompt.txt") + cmd = f'hermes -z "{prompt_path}"' + if permissions == "yolo": + cmd += " --yolo" + cmd += "\n" + _app_send_input(pty_session_id, cmd) + + # Start background watcher + t = threading.Thread( + target=_watch_task, + args=(session_id, task_id, timeout_s), + daemon=True, + ) + t.start() + + return json.dumps({ + "task_id": task_id, + "session_id": session_id, + "status": "running", + "viewer_url": url_builder.build_viewer_url(pty_session_id) if pty_session_id else None, + }) +``` + +- [ ] **Step 4: Add `viewer_url` to `coda_inbox` entries** + +In `coda_inbox` (around line 300), after the `list_all_tasks` call, decorate each entry. Replace: + +```python + tasks = task_manager.list_all_tasks(email=email, status_filter=status) +``` + +with: + +```python + tasks = task_manager.list_all_tasks(email=email, status_filter=status) + # Decorate each task with its viewer URL (if available). 
+ for t in tasks: + sess = task_manager._read_session_safe(t["session_id"]) + pty = sess.get("pty_session_id") if sess else None + if pty: + vu = url_builder.build_viewer_url(pty) + if vu: + t["viewer_url"] = vu +``` + +This requires adding `_read_session_safe` to `task_manager.py` — a wrapper that returns `None` instead of raising. Add it now in `coda_mcp/task_manager.py` next to `_read_session`: + +```python +def _read_session_safe(session_id: str) -> dict | None: + """Read session.json, returning None on missing/corrupt instead of raising.""" + try: + return _read_session(session_id) + except SessionNotFoundError: + return None +``` + +- [ ] **Step 5: Add `viewer_url` to `coda_get_result`** + +In `coda_get_result` (around line 327), after the existing field-setting block, add: + +```python + # Decorate with viewer_url if known + sess = task_manager._read_session_safe(session_id) + pty = sess.get("pty_session_id") if sess else None + if pty: + vu = url_builder.build_viewer_url(pty) + if vu: + result["viewer_url"] = vu +``` + +Place this immediately before `return json.dumps(result)`. + +- [ ] **Step 6: Update FastMCP `instructions`** + +In `coda_mcp/mcp_server.py`, modify the `instructions=` argument to FastMCP (around line 42) by appending a paragraph at the end of the existing instructions string: + +```python + "CHAINING: pass previous_session_id from a completed task's session_id " + "to give the new task context of what was done before.\n\n" + "SHARE THE LIVE URL: When coda_run returns a viewer_url field (non-null), " + "mention it to the user in plain text (e.g. \"you can watch progress at " + "<viewer_url>\"). The URL is safe to share — it points to the same Databricks App " + "the user is already authenticated against. Do this on the first mention " + "of the task and any time the user asks where the task is or how to see it." 
+``` + +- [ ] **Step 7: Run tests to verify they pass** + +Run: `uv run pytest tests/test_mcp_server.py -v -k "viewer_url or transcript_path"` +Expected: 5 passed. + +- [ ] **Step 8: Run full mcp test suite for regression** + +Run: `uv run pytest tests/test_mcp_server.py tests/test_mcp_integration.py -v` +Expected: all pass. + +- [ ] **Step 9: Commit** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + add coda_mcp/mcp_server.py coda_mcp/task_manager.py tests/test_mcp_server.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + commit -m "feat(coda-mcp): return viewer_url from coda_run/inbox/get_result + transcript wiring" +``` + +--- + +## Task 8: `mcp_asgi.py` — capture `X-Forwarded-Host` via ASGI middleware + +**Files:** +- Modify: `coda_mcp/mcp_asgi.py` (add middleware class + register it on `mcp_starlette`) +- Test: `tests/test_app_url_middleware.py` (new) + +- [ ] **Step 1: Write the failing tests** + +Create `tests/test_app_url_middleware.py`: + +```python +"""Tests for AppUrlCaptureMiddleware — populates url_builder._app_url_cache.""" +import asyncio +import importlib + +import pytest + +from coda_mcp import url_builder + + +@pytest.fixture(autouse=True) +def _reset_cache(): + importlib.reload(url_builder) + yield + + +async def _fake_app(scope, receive, send): + await send({"type": "http.response.start", "status": 200, "headers": []}) + await send({"type": "http.response.body", "body": b"", "more_body": False}) + + +def _make_scope(headers: list[tuple[bytes, bytes]]): + return { + "type": "http", + "asgi": {"version": "3.0"}, + "method": "POST", + "path": "/mcp", + "headers": headers, + } + + +async def _drive(middleware, scope): + sent = [] + async def send(msg): sent.append(msg) + async def receive(): return {"type": "http.request", "body": b"", "more_body": False} + await middleware(scope, receive, send) + + +def test_middleware_captures_x_forwarded_host(): + from 
coda_mcp.mcp_asgi import AppUrlCaptureMiddleware + mw = AppUrlCaptureMiddleware(_fake_app) + scope = _make_scope([(b"x-forwarded-host", b"app.databricksapps.com")]) + asyncio.run(_drive(mw, scope)) + assert url_builder._app_url_cache == "app.databricksapps.com" + + +def test_middleware_falls_back_to_host_when_no_xforwarded(): + from coda_mcp.mcp_asgi import AppUrlCaptureMiddleware + mw = AppUrlCaptureMiddleware(_fake_app) + scope = _make_scope([(b"host", b"localhost:8000")]) + asyncio.run(_drive(mw, scope)) + assert url_builder._app_url_cache == "localhost:8000" + + +def test_middleware_skips_non_http_scope(): + from coda_mcp.mcp_asgi import AppUrlCaptureMiddleware + mw = AppUrlCaptureMiddleware(_fake_app) + scope = {"type": "lifespan"} + async def receive(): return {"type": "lifespan.startup"} + sent = [] + async def send(msg): sent.append(msg) + # Must not crash. Cache stays None. + asyncio.run(mw(scope, receive, send)) + assert url_builder._app_url_cache is None + + +def test_middleware_no_op_when_no_host_header(): + from coda_mcp.mcp_asgi import AppUrlCaptureMiddleware + mw = AppUrlCaptureMiddleware(_fake_app) + scope = _make_scope([]) + asyncio.run(_drive(mw, scope)) + assert url_builder._app_url_cache is None +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `uv run pytest tests/test_app_url_middleware.py -v` +Expected: ImportError on `AppUrlCaptureMiddleware`. + +- [ ] **Step 3: Add the middleware class to `mcp_asgi.py`** + +At the top of `coda_mcp/mcp_asgi.py` (after imports, around line 28), add: + +```python +from coda_mcp import url_builder + + +class AppUrlCaptureMiddleware: + """Capture X-Forwarded-Host (or Host) from every inbound HTTP request and + populate url_builder._app_url_cache. Used so MCP tools can return a + working viewer_url without manual configuration. + + Caveat: /socket.io/ traffic is intercepted by socketio.ASGIApp *before* + reaching mcp_starlette, so WebSocket connect requests never hit this + middleware. 
This is fine in practice — every HTTP request to /mcp and to + Flask routes does hit it, which is enough to keep the cache hot. + """ + + def __init__(self, app): + self.app = app + + async def __call__(self, scope, receive, send): + if scope.get("type") == "http": + headers = dict(scope.get("headers") or []) + host_bytes = headers.get(b"x-forwarded-host") or headers.get(b"host") + if host_bytes: + try: + url_builder.capture_from_headers(host_bytes.decode("latin-1")) + except Exception: + pass + await self.app(scope, receive, send) +``` + +- [ ] **Step 4: Register the middleware on `mcp_starlette`** + +In the existing block that adds CORS (around lines 80-86): + +```python +# CORS for MCP and Flask routes +mcp_starlette.add_middleware( + CORSMiddleware, + allow_origins=ALLOWED_ORIGINS or ["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) +``` + +Add a second `add_middleware` call immediately after: + +```python +# Capture X-Forwarded-Host into url_builder cache (for MCP viewer_url). +# Added AFTER CORS so it wraps the CORS-handled request. +mcp_starlette.add_middleware(AppUrlCaptureMiddleware) +``` + +- [ ] **Step 5: Run tests to verify they pass** + +Run: `uv run pytest tests/test_app_url_middleware.py -v` +Expected: 4 passed. 
+ +- [ ] **Step 6: Commit** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + add coda_mcp/mcp_asgi.py tests/test_app_url_middleware.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + commit -m "feat(coda-mcp): AppUrlCaptureMiddleware seeds url_builder from X-Forwarded-Host" +``` + +--- + +## Task 9: `app.py::attach_session` — replay fallback when PTY is gone + +**Files:** +- Modify: `app.py::attach_session` (lines ~1104-1123) +- Test: `tests/test_replay_attach.py` (new) + +- [ ] **Step 1: Write the failing tests** + +Create `tests/test_replay_attach.py`: + +```python +"""Tests for /api/session/attach replay fallback.""" +import json +import os +from pathlib import Path + +import pytest + +from coda_mcp import task_manager + + +@pytest.fixture +def client(tmp_path, monkeypatch): + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + monkeypatch.setenv("MAX_CONCURRENT_SESSIONS", "5") + from app import app + # Bypass authorization (single-user app pattern used by other tests) + monkeypatch.setattr("app.check_authorization", lambda: True) + with app.test_client() as c: + yield c, tmp_path + + +def _seed_transcript(sessions_root: Path, pty_id: str, content: bytes) -> None: + sess_id = "sess-test" + task_id = "task-test" + sdir = sessions_root / sess_id + tdir = sdir / "tasks" / task_id + tdir.mkdir(parents=True) + (sdir / "session.json").write_text(json.dumps({ + "session_id": sess_id, + "pty_session_id": pty_id, + "current_task": None, + "completed_tasks": [task_id], + "status": "closed", + })) + (tdir / "transcript.log").write_bytes(content) + + +def test_attach_returns_replay_when_pty_gone_and_transcript_exists(client): + c, root = client + _seed_transcript(root, "pty-gone", b"hello\r\nworld\r\n") + resp = c.post("/api/session/attach", json={"session_id": "pty-gone"}) + assert resp.status_code == 200 + data = resp.get_json() + assert data["replay"] is True + assert 
data["output"] == ["hello\r\nworld\r\n"] + assert data["label"] == "hermes-mcp (replay)" + + +def test_attach_404_when_pty_gone_and_no_transcript(client): + c, root = client + resp = c.post("/api/session/attach", json={"session_id": "pty-nope"}) + assert resp.status_code == 404 +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `uv run pytest tests/test_replay_attach.py -v` +Expected: replay test fails (no fallback); 404 test passes already. + +- [ ] **Step 3: Modify `attach_session`** + +In `app.py::attach_session` (around line 1104), replace the body with: + +```python +@app.route("/api/session/attach", methods=["POST"]) +def attach_session(): + """Reattach to an existing session — returns buffered output for replay. + + If the live PTY is gone but an on-disk transcript exists for this + pty_session_id, return the transcript as ``output`` with ``replay: True``. + """ + data = request.get_json(silent=True) or {} + session_id = data.get("session_id", "") + + sess = _get_session(session_id) + if not sess or sess.get("exited"): + # Replay fallback: look up transcript.log by pty_session_id + from coda_mcp import task_manager as _tm + tdir = _tm.find_task_dir_by_pty_session(session_id) + if tdir: + transcript = os.path.join(tdir, "transcript.log") + if os.path.isfile(transcript): + try: + with open(transcript, "rb") as f: + content = f.read() + return jsonify({ + "session_id": session_id, + "label": "hermes-mcp (replay)", + "output": [content.decode("utf-8", errors="replace")], + "replay": True, + "process": None, + "created_at": None, + }) + except OSError: + pass + return jsonify({"error": "Session not found or exited"}), 404 + + # Existing live-attach path + sess["last_poll_time"] = time.time() + return jsonify({ + "session_id": session_id, + "label": sess.get("label", ""), + "output": list(sess["output_buffer"]), + "process": _get_session_process(sess["pid"]), + "created_at": sess.get("created_at"), + }) +``` + +- [ ] **Step 4: Run tests to verify they 
pass** + +Run: `uv run pytest tests/test_replay_attach.py -v` +Expected: 2 passed. + +- [ ] **Step 5: Run regression for the existing session-attach tests** + +Run: `uv run pytest tests/test_session_detach.py -v` +Expected: all pass. + +- [ ] **Step 6: Commit** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + add app.py tests/test_replay_attach.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + commit -m "feat: attach_session replay fallback reads transcript.log when PTY is gone" +``` + +--- + +## Task 10: `static/index.html` — boot URL parse + `_doReplay` + history hygiene + +**Files:** +- Modify: `static/index.html` + +> **Note**: This is the most "real" change. We add ~50-70 LoC of JS. Tested manually (Playwright not configured in this repo). + +- [ ] **Step 1: Locate the SPA boot path** + +Read `static/index.html` lines 990-1030 (the existing session-picker boot logic) to confirm where pane creation happens after the picker. The new URL-driven branch must run before the picker. + +- [ ] **Step 2: Add boot-time URL parse** + +Find the existing function that runs on `DOMContentLoaded` or the IIFE that initializes the app. 
Just before it would invoke the session picker, insert: + +```javascript + // ── Deep-link to a CoDA MCP session via ?session= ── + async function _initFromQueryString() { + const params = new URLSearchParams(location.search); + const sessionId = params.get('session'); + if (!sessionId) return false; + + try { + const resp = await fetch('/api/session/attach', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ session_id: sessionId }) + }); + + if (resp.status === 404) { + _renderExpiredPage(sessionId); + return true; // handled, skip picker + } + + const data = await resp.json(); + const term = createTerminalPane({ sessionId, label: data.label || sessionId }); + + if (data.replay) { + const content = (data.output || []).join(''); + await _doReplay(term, sessionId, content); + } else { + await _doAttach(term, sessionId); + if (typeof socket !== 'undefined' && socket) { + socket.emit('join_session', { session_id: sessionId }); + } + } + + return true; // handled, skip picker + } catch (err) { + console.error('deep-link attach failed:', err); + return false; + } + } +``` + +`createTerminalPane({ sessionId, label })` is the name commonly used in this repo for pane creation; if the actual name differs, substitute the local helper. Read the existing pane creation site to confirm and adjust the call site accordingly. + +- [ ] **Step 3: Add `_doReplay`** + +Place near `_doAttach` (around line 1339): + +```javascript + async function _doReplay(term, sessionId, content) { + // Chunk the write to avoid main-thread jank on multi-MB transcripts. + const CHUNK = 64 * 1024; + for (let i = 0; i < content.length; i += CHUNK) { + term.write(content.slice(i, i + CHUNK)); + await new Promise(r => requestAnimationFrame(r)); + } + // Mount a static banner above the pane. + _showReplayBanner(term, sessionId); + // NOTE: do NOT wire term.onData → terminal_input; do NOT include in heartbeat + // session_ids list; do NOT emit join_session. 
+ return sessionId; + } + + function _showReplayBanner(term, sessionId) { + const pane = getAllPanes().find(p => p.sessionId === sessionId); + if (!pane || !pane.element) return; + const banner = document.createElement('div'); + banner.className = 'replay-banner'; + banner.textContent = 'Task completed — viewing replay'; + banner.style.cssText = 'padding:4px 8px;background:#333;color:#aaa;font-size:12px;text-align:center;'; + pane.element.insertBefore(banner, pane.element.firstChild); + } +``` + +- [ ] **Step 4: Add `_renderExpiredPage`** + +Place near `_doReplay`: + +```javascript + function _renderExpiredPage(sessionId) { + const root = document.body; + root.innerHTML = ` +
<div class="session-expired"> + <h2>Session expired</h2> + <p>Session ${sessionId.replace(/[<>]/g, '')} is gone, and no replay is available.</p> + <p>The transcript may have aged out after the 24-hour retention window.</p> + <p><a href="/">← Back to terminal</a></p> + </div>
+ `; + } +``` + +- [ ] **Step 5: Wire `_initFromQueryString` into the boot path** + +Find where the existing session-picker is shown after `DOMContentLoaded`. Wrap it: + +```javascript + document.addEventListener('DOMContentLoaded', async () => { + // existing init code (sockets, themes, etc.) + + const handled = await _initFromQueryString(); + if (handled) return; + + // existing flow (show session picker, etc.) + }); +``` + +The exact insertion site depends on the existing boot structure — read lines 990-1050 of `static/index.html` to find the right place. + +- [ ] **Step 6: Add history hygiene on pane close** + +Locate the existing pane-close handler. Inside, after the pane is removed, add: + +```javascript + // If this pane was opened via ?session=<id>, drop the query param so a + // refresh doesn't re-attach to a stale id. + const params = new URLSearchParams(location.search); + if (params.get('session') === pane.sessionId) { + history.replaceState({}, '', '/'); + } +``` + +- [ ] **Step 7: Manual smoke test** + +Local dev: + +```bash +uv run uvicorn coda_mcp.mcp_asgi:app --host 0.0.0.0 --port 8000 +``` + +Then open `http://localhost:8000/?session=fake-id` in a browser. Expected: "Session expired" page (404 since no transcript exists). + +Create a live session via the regular UI, note its session_id from the picker, then navigate to `http://localhost:8000/?session=<session_id>` — expected: terminal opens directly attached to that session. 
+ +- [ ] **Step 8: Commit** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + add static/index.html +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + commit -m "feat(spa): deep-link ?session= with live attach + replay rendering" +``` + +--- + +## Task 11: Integration test — E2E grace period + transcript replay + +**Files:** +- Modify: `tests/test_mcp_integration.py` (extend) + +- [ ] **Step 1: Write the failing test** + +Append to `tests/test_mcp_integration.py`: + +```python +import asyncio +import json +import os +import time +from pathlib import Path +from unittest import mock + +import pytest + +from coda_mcp import mcp_server, task_manager, url_builder + + +@pytest.fixture +def mcp_env(tmp_path, monkeypatch): + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + monkeypatch.setattr(url_builder, "_app_url_cache", "app.example.com") + # Shrink grace for the test + monkeypatch.setattr(mcp_server, "GRACE_PERIOD_S", 2) + return tmp_path + + +def test_end_to_end_grace_and_replay(mcp_env, monkeypatch): + """Stub hermes via direct file I/O, then exercise the full coda_run flow.""" + from app import mcp_create_pty_session, mcp_send_input, mcp_close_pty_session + from app import _mark_grace_for_session, _bump_session_last_poll, sessions + + mcp_server.set_app_hooks( + mcp_create_pty_session, mcp_send_input, mcp_close_pty_session, + _mark_grace_for_session, _bump_session_last_poll, + ) + + # Submit a fake task + result_json = asyncio.run(mcp_server.coda_run( + prompt="test", email="u@x", timeout_s=5, + )) + result = json.loads(result_json) + assert result["status"] == "running" + sess_id = result["session_id"] + task_id = result["task_id"] + pty_id = task_manager._read_session(sess_id)["pty_session_id"] + + # viewer_url returned + assert pty_id in result["viewer_url"] + + # Simulate hermes writing to the PTY by sending input that echoes to bash + mcp_send_input(pty_id, "echo 
HELLO_FROM_HERMES\n") + time.sleep(0.5) + + # Now simulate hermes completion by writing result.json + tdir = task_manager._task_dir(sess_id, task_id) + Path(tdir).joinpath("result.json").write_text(json.dumps({ + "status": "completed", "summary": "stub", "files_changed": [], + "artifacts": {}, "errors": [], + })) + + # Wait for watcher to pick it up (polls every 5s — shorten via patch below if slow) + # In practice, the test patches the poll interval. For now, manually invoke: + mcp_server._schedule_deferred_close(sess_id) + + # PTY still alive immediately after grace scheduling + assert pty_id in sessions + assert sessions[pty_id]["grace"] is True + + # Wait past GRACE_PERIOD_S + time.sleep(2.5) + + # PTY now gone + assert pty_id not in sessions + + # Transcript file exists and contains the echoed line + transcript = Path(tdir) / "transcript.log" + assert transcript.exists() + assert b"HELLO_FROM_HERMES" in transcript.read_bytes() + + # find_task_dir_by_pty_session now returns the task dir from the on-disk record + found = task_manager.find_task_dir_by_pty_session(pty_id) + assert found == str(tdir) +``` + +- [ ] **Step 2: Run the test** + +Run: `uv run pytest tests/test_mcp_integration.py -v -k end_to_end_grace_and_replay` +Expected: pass. + +- [ ] **Step 3: Run the full test suite for regression** + +Run: `uv run pytest tests/ -v --timeout=60` +Expected: prior pass count + the new tests. No failures. 
+ +- [ ] **Step 4: Commit** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + add tests/test_mcp_integration.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + commit -m "test: E2E coverage for grace period + transcript replay" +``` + +--- + +## Task 12: Manual smoke + deployment verification + +**Files:** none (verification only) + +- [ ] **Step 1: Deploy the worktree to the test app** + +From the worktree root: + +```bash +databricks bundle deploy --target test-coda +``` + +(Adjust target name to whatever the existing deployment uses — check `databricks.yml` or `app.yaml` notes.) + +- [ ] **Step 2: Verify in Genie Code** + +In the Databricks workspace, open Genie Code and ensure the Custom MCP server `mcp-test-coda` is connected. Submit a simple task: `"List the files in /tmp"`. + +Expected: +- Genie Code's response mentions a `viewer_url` like `https://mcp-test-coda-<workspace-id>.aws.databricksapps.com/?session=<pty_session_id>`. +- Clicking the URL opens the terminal pre-attached to that session. +- Hermes output streams in real time. + +- [ ] **Step 3: Verify replay** + +After the task completes, wait 6+ minutes (grace period + buffer), then reload the same URL. + +Expected: +- Page loads showing the static transcript of what hermes did. +- "Task completed — viewing replay" banner. +- No input is sent when you type. + +- [ ] **Step 4: Verify chmod on transcript** + +From a shell in the deployed app (workspace terminal or `databricks workspace files` API): + +```bash +ls -la ~/.coda/sessions/*/tasks/*/transcript.log +``` + +Expected: files have mode `-rw-------` (0o600). + +- [ ] **Step 5: Verify `viewer_url` falls back to the inbound `Host` header when `CODA_APP_URL` is unset** + +```bash +unset CODA_APP_URL +uv run uvicorn coda_mcp.mcp_asgi:app --host 0.0.0.0 --port 8000 & +SERVER_PID=$! 
+ +# Submit a coda_run via curl-formatted JSON-RPC +curl -s http://localhost:8000/mcp \ + -H 'Content-Type: application/json' \ + -d '{"jsonrpc":"2.0","id":1,"method":"tools/call","params":{"name":"coda_run","arguments":{"prompt":"test","email":"local@dev"}}}' + +kill $SERVER_PID +``` + +Expected: the JSON response contains `"viewer_url": "http://localhost:8000/?session=..."` (because the inbound `Host: localhost:8000` was captured). + +- [ ] **Step 6: Final commit (if any verification turned up a fix)** + +If smoke tests revealed issues, fix them as separate commits, then update this checklist. + +--- + +## Self-review notes + +- All eight spec decisions covered: §1 viewer mode → Task 10 `_doReplay`; §2 transcript tee → Tasks 3-4; §3 deferred Timer → Task 6; §4 grace exemption → Task 5; §5 URL form → Tasks 1, 7; §6 ASGI middleware → Task 8; §7 attach replay fallback → Task 9; §8 SPA → Task 10. +- No "TODO" / "TBD" / "implement later" / placeholder text — every step has concrete code, exact paths, exact commands. +- Type/method consistency: + - `set_app_hooks` signature in Task 6 matches the call site updated in Task 11 (`mcp_server.set_app_hooks(create, send, close, mark_grace, bump_poll)` with optional defaults). + - `_mark_grace_for_session` / `_bump_session_last_poll` defined in Task 5 used by Task 6 and Task 11. + - `transcript_path` kwarg added to `mcp_create_pty_session` in Task 4 used by `coda_run` in Task 7. + - `find_task_dir_by_pty_session` defined in Task 2 used by `attach_session` in Task 9. + - `url_builder.build_viewer_url` defined in Task 1 used by `coda_run`/`coda_inbox`/`coda_get_result` in Task 7. +- Spec §3 "Architecture" diagram preserved as the mental model; data flows §5.1-5.4 map to Tasks 7, 9, 6, 9 respectively. +- Risks §9 (secrets, grace race, multi-tab) accepted in the spec; surface in the test plan via the chmod-600 verification in Task 12 step 4. 
diff --git a/docs/superpowers/plans/2026-05-28-coda-interactive-broaden-source.md b/docs/superpowers/plans/2026-05-28-coda-interactive-broaden-source.md new file mode 100644 index 0000000..1590755 --- /dev/null +++ b/docs/superpowers/plans/2026-05-28-coda-interactive-broaden-source.md @@ -0,0 +1,696 @@ +# `coda_interactive` Broaden Source Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Drop the Git-Folder requirement from `coda_interactive`. `workspace_path` accepts any Databricks Workspace directory. Remove the `branch` parameter. Add a `workspace.get_status` validation step. + +**Architecture:** Single MCP tool simplification on the open PR #67. We replace the Repos API lookup (`client.repos.list` + `client.repos.update`) with a single existence/type check (`client.workspace.get_status` → `_is_directory`). The export helper (`export_workspace_tree`) is unchanged because it already uses the generic Workspace API. Tests are rewritten to match: drop branch-related tests, swap `repos.list` mocks for `workspace.get_status` mocks, add a not-a-directory case. + +**Tech Stack:** Python 3.11, FastMCP, databricks-sdk WorkspaceClient, pytest, MagicMock. 
+ +--- + +## Files modified by this plan + +- **Modify:** `coda_mcp/mcp_server.py` — remove `branch` param, remove repos lookup, add `get_status` validation, update INTERACTIVE HANDOFF instructions paragraph and tool docstring, update import line +- **Modify:** `tests/test_coda_interactive.py` — drop 3 tests, update 4 tests, add 4 tests (3 in Task 1 + 1 instructions-content test in Task 3) +- **Modify:** `docs/superpowers/specs/2026-05-28-coda-interactive-mcp-tool-design.md` — prepend an amendment notice (Task 4) +- **No change:** `coda_mcp/workspace_export.py` — already generic; we just re-use its `_is_directory` helper via import + +## Pre-flight context + +- Worktree path: `/Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp` +- Branch: `feat/coda-mcp-interactive-handoff` (PR #67, open) +- Run tests with `uv run pytest <path>` (per user's `always use uv` directive) +- Commit identity: `-c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty"` (per user's directive). No AI co-author lines. + +The current `coda_interactive` body is at `coda_mcp/mcp_server.py:370-517`. The current INTERACTIVE HANDOFF paragraph is at `coda_mcp/mcp_server.py:79-93`. The current test file is `tests/test_coda_interactive.py` (385 lines, 11 tests). + +--- + +## Task 1: Rewrite tests for the broadened contract (RED state) + +This task replaces the test file's mocking shape and assertions. Implementation in Task 2 is what makes them pass. + +**Files:** +- Modify: `tests/test_coda_interactive.py` (drop 3 tests, update 4 tests, add 3 tests) + +- [ ] **Step 1: Delete the three branch/git-folder-only tests** + +These three tests no longer make sense because the corresponding code paths are being removed. Remove them entirely from `tests/test_coda_interactive.py`: + +1. `test_coda_interactive_workspace_path_not_found` (lines 42-58) — tests `repos.list()` returning empty. The new code uses `workspace.get_status`, not `repos.list`. 
A different test covers the missing-path case. +2. `test_coda_interactive_branch_update_failure` (lines 61-83) — tests `repos.update` raising. The `branch` parameter is going away entirely. +3. `test_coda_interactive_skips_branch_update_when_empty` (lines 86-107) — tests that `repos.update` isn't called when branch is empty. The `branch` parameter is going away entirely. + +- [ ] **Step 2: Update the four tests that have stale mock setup** + +These four tests currently set up `fake_repo` and `fake_client.repos.list.return_value = [fake_repo]`. After the change, `coda_interactive` no longer calls `repos.list`. Replace that scaffolding with a `workspace.get_status` mock returning a directory-typed object. + +Add this helper at the top of the file (just after `_no_wait`): + +```python +def _make_dir_status(): + """Build a mock object_type=DIRECTORY response from workspace.get_status.""" + from unittest.mock import MagicMock + status = MagicMock() + status.object_type = "DIRECTORY" + return status +``` + +Then update these four tests by replacing the `fake_repo` + `fake_client.repos.list.return_value = [fake_repo]` block with: + +```python +fake_client = MagicMock() +fake_client.workspace.get_status.return_value = _make_dir_status() +``` + +The tests: +- `test_coda_interactive_export_failure_cleans_partial_dir` (currently line 110) +- `test_coda_interactive_happy_path_sends_agent_command_and_prompt` (currently line 164) +- `test_coda_interactive_agent_command_matrix` (currently line 224) +- `test_coda_interactive_does_not_use_blocking_sleep` (currently line 272) + +In `test_coda_interactive_happy_path_sends_agent_command_and_prompt`, also remove the assertion line referencing `branch` in the return shape if present (re-check after edit — current return shape includes `"branch"`; the new shape does not). The current test does not assert on `result["branch"]`, so no change needed there, but verify after edit. 
+ +- [ ] **Step 3: Add `test_coda_interactive_workspace_path_does_not_exist`** + +Append to the file: + +```python +def test_coda_interactive_workspace_path_does_not_exist(monkeypatch): + """If workspace.get_status raises, return error and don't proceed to PTY.""" + from unittest.mock import MagicMock + from coda_mcp import mcp_server + + fake_client = MagicMock() + fake_client.workspace.get_status.side_effect = Exception("RESOURCE_DOES_NOT_EXIST") + monkeypatch.setattr(mcp_server, "WorkspaceClient", lambda: fake_client) + + pty_created = [] + monkeypatch.setattr( + mcp_server, "_app_create_session", + lambda **kw: pty_created.append(kw) or "should-not-be-used", + ) + + result_str = asyncio.run(mcp_server.coda_interactive( + prompt="hello", + workspace_path="/Workspace/Users/x/nonexistent", + )) + result = json.loads(result_str) + + assert result["status"] == "error" + assert "not found" in result["error"].lower() or "does_not_exist" in result["error"].lower() + # No PTY may be created if validation fails. 
+ assert pty_created == [], f"PTY must not be created when workspace_path is invalid; got {pty_created}" +``` + +- [ ] **Step 4: Add `test_coda_interactive_workspace_path_not_directory`** + +Append to the file: + +```python +def test_coda_interactive_workspace_path_not_directory(monkeypatch): + """If workspace.get_status returns object_type=FILE (or anything not DIRECTORY), return error.""" + from unittest.mock import MagicMock + from coda_mcp import mcp_server + + file_status = MagicMock() + file_status.object_type = "FILE" + fake_client = MagicMock() + fake_client.workspace.get_status.return_value = file_status + monkeypatch.setattr(mcp_server, "WorkspaceClient", lambda: fake_client) + + pty_created = [] + monkeypatch.setattr( + mcp_server, "_app_create_session", + lambda **kw: pty_created.append(kw) or "should-not-be-used", + ) + + result_str = asyncio.run(mcp_server.coda_interactive( + prompt="hello", + workspace_path="/Workspace/Users/x/some-file.py", + )) + result = json.loads(result_str) + + assert result["status"] == "error" + assert "directory" in result["error"].lower() + assert pty_created == [], "PTY must not be created when workspace_path is not a directory" +``` + +- [ ] **Step 5: Add `test_coda_interactive_no_branch_parameter`** + +Signature regression guard so the `branch` arg cannot quietly come back. Append to the file: + +```python +def test_coda_interactive_no_branch_parameter(): + """The branch parameter must not exist on coda_interactive's signature.""" + import inspect + from coda_mcp import mcp_server + + sig = inspect.signature(mcp_server.coda_interactive) + assert "branch" not in sig.parameters, ( + f"coda_interactive must not accept a `branch` parameter (got {list(sig.parameters)}). " + f"The broadened contract handles git-folder branch state on the caller side." 
+ ) +``` + +- [ ] **Step 6: Run the test file — expect failures** + +Run: `uv run pytest tests/test_coda_interactive.py -v` + +Expected: At least the two new tests (`workspace_path_does_not_exist`, `workspace_path_not_directory`), the signature guard (`no_branch_parameter`), and the four updated mock-shape tests all FAIL — because `coda_interactive` still uses `repos.list` and still accepts `branch`. The unchanged tests (`unknown_agent`, `default_agent_is_claude`, the three `_wait_for_agent_ready` tests) should still PASS. + +This is the intended RED state — proves the new tests actually exercise the new code path. + +- [ ] **Step 7: Commit the tests** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + add tests/test_coda_interactive.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + commit -m "test: rewrite coda_interactive tests for broadened workspace-folder contract" +``` + +--- + +## Task 2: Simplify `coda_interactive` implementation (GREEN state) + +**Files:** +- Modify: `coda_mcp/mcp_server.py` (signature, body, import, return shape) + +- [ ] **Step 1: Update the import line to include the directory check helper** + +In `coda_mcp/mcp_server.py:31`, change: + +```python +from coda_mcp.workspace_export import export_workspace_tree +``` + +to: + +```python +from coda_mcp.workspace_export import export_workspace_tree, _is_directory +``` + +`_is_directory` is currently module-private in `workspace_export.py:35`. We import it directly rather than aliasing for two reasons: (a) it is a stable, narrowly-scoped helper already used internally; (b) renaming it would force an unrelated edit. Python permits underscore imports; the cost is one symbol shared across two modules in the same package. + +- [ ] **Step 2: Replace the function signature and body** + +In `coda_mcp/mcp_server.py:370-517`, replace the entire `async def coda_interactive(...)` definition. 
The full new function body: + +```python +@mcp.tool( + annotations=ToolAnnotations( + readOnlyHint=False, + destructiveHint=False, + idempotentHint=False, + ), +) +async def coda_interactive( + prompt: str, + workspace_path: str, + agent: str = "claude", + email: str = "", +) -> str: + """Launch an interactive agent session in CoDA, handed off via a viewer URL. + + The MCP caller passes a Databricks Workspace directory path (a Git Folder + or a plain Workspace folder — either works). Coda exports its file tree, + launches the chosen agent (claude default) in that directory, auto-types + ``prompt`` as the first user input, and returns a ``viewer_url`` the + calling user opens in a browser to drive the session. + + Pre-condition: ``workspace_path`` must point to a directory that already + exists in the Databricks Workspace. If the directory is a Git Folder and + the caller wants a specific branch checked out, they must do that + themselves before calling — the export is a server-side snapshot. + + Interactive sessions do NOT appear in ``coda_inbox`` and ``coda_get_result`` + will not return anything for them. The viewer URL is the only handle. + + Allowed agents: claude (default), hermes, codex, gemini, opencode. + """ + if agent not in _ALLOWED_AGENTS: + return json.dumps({ + "status": "error", + "error": f"Unknown agent: {agent!r}. Allowed: {sorted(_ALLOWED_AGENTS)}", + }) + + if WorkspaceClient is None: + return json.dumps({ + "status": "error", + "error": "databricks-sdk not installed", + }) + + client = WorkspaceClient() + + # Validate that the path exists and is a directory. 
+ try: + status = client.workspace.get_status(workspace_path) + except Exception as e: + return json.dumps({ + "status": "error", + "error": f"Workspace path not found: {workspace_path}: {e}", + }) + + if not _is_directory(status): + return json.dumps({ + "status": "error", + "error": f"Workspace path is not a directory: {workspace_path}", + }) + + # Create PTY FIRST so we have its session_id for the project_dir name. + if _app_create_session is None: + return json.dumps({ + "status": "error", + "error": "PTY hook not wired", + }) + + pty_session_id = None + project_dir = None + try: + pty_session_id = _app_create_session( + label=f"{agent}-interactive", + replay_only=False, + ) + + # Build the project dir at the canonical path keyed by PTY id. + project_dir = os.path.join( + os.path.expanduser("~/.coda/projects"), + pty_session_id, + ) + + # Export the Workspace tree into project_dir. + try: + export_workspace_tree(client, workspace_path, project_dir) + except Exception as e: + # Close the PTY and clean up the partial dir. + if _app_close_session is not None: + try: + _app_close_session(pty_session_id) + except Exception: + pass + if os.path.isdir(project_dir): + shutil.rmtree(project_dir, ignore_errors=True) + return json.dumps({ + "status": "error", + "error": f"Failed to export workspace tree: {e}", + }) + + # cd into the project dir. + if _app_send_input is None: + return json.dumps({ + "status": "error", + "error": "PTY send hook not wired", + }) + _app_send_input(pty_session_id, f"cd {shlex.quote(project_dir)}\n") + + # Launch the agent. + launch_cmd = _AGENT_LAUNCH_CMDS[agent] + _app_send_input(pty_session_id, launch_cmd + "\n") + + # Wait briefly for agent initialization, then paste the prompt. 
+ await _wait_for_agent_ready(pty_session_id) + _app_send_input(pty_session_id, prompt + "\n") + + viewer_url = url_builder.build_viewer_url(pty_session_id) + + return json.dumps({ + "status": "launched", + "viewer_url": viewer_url, + "agent": agent, + "project_dir": project_dir, + "workspace_path": workspace_path, + "instructions": ( + "Open viewer_url to attach. The agent is loaded with the " + "project files exported from Workspace and your kickoff " + "prompt typed. Type the agent's quit command (e.g. /quit) " + "and then `exit` to end the session. Note: git history is " + "NOT available in the session — files are an export, not " + "a clone." + ), + }) + except Exception as e: + # Catch-all: ensure no resource leak. + if pty_session_id and _app_close_session is not None: + try: + _app_close_session(pty_session_id) + except Exception: + pass + if project_dir and os.path.isdir(project_dir): + shutil.rmtree(project_dir, ignore_errors=True) + return json.dumps({ + "status": "error", + "error": f"coda_interactive failed: {e}", + }) +``` + +Key changes vs. the existing body: +- `branch: str = ""` parameter removed. +- `client.repos.list` / exact-match filter / `client.repos.update` block removed. +- Replaced by `client.workspace.get_status(workspace_path)` + `_is_directory` check. +- `"branch": branch,` dropped from the return JSON. +- Docstring rewritten to say "Git Folder or plain Workspace folder" and drop the "commit and push to remote" admonition. + +- [ ] **Step 3: Run the test file — expect green** + +Run: `uv run pytest tests/test_coda_interactive.py -v` + +Expected: All tests PASS. If any fail, fix the implementation (not the tests) and re-run. + +- [ ] **Step 4: Run the full unit test suite to catch regressions** + +Run: `uv run pytest tests/ -v --no-header -x` (stop on first failure) + +Expected: All previously-passing tests still pass. 
The skipped PTY-gated and Docker-gated tests stay skipped (those auto-skip on this machine; no behaviour to verify here). + +If unrelated tests fail, stop and investigate before committing. + +- [ ] **Step 5: Commit the implementation** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + add coda_mcp/mcp_server.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + commit -m "feat: coda_interactive accepts any Workspace folder, drop branch param + +Replaces the Repos API lookup (repos.list + repos.update) with a single +workspace.get_status check. Caller is now responsible for managing +Git Folder branch state. Workspace path can be a Git Folder or a plain +Workspace folder — either works." +``` + +--- + +## Task 3: Update INTERACTIVE HANDOFF instructions string + +The server-level instructions string surfaced to upstream LLM callers still says "must be a Git Folder ... commit and push to remote." Rewrite to match the broadened contract. + +**Files:** +- Modify: `coda_mcp/mcp_server.py:79-93` (INTERACTIVE HANDOFF paragraph in the `mcp = FastMCP(instructions=...)` block) + +- [ ] **Step 1: Write a test that pins the instructions string content** + +Append to `tests/test_coda_interactive.py`: + +```python +def test_interactive_handoff_instructions_describe_broadened_contract(): + """The server-level INTERACTIVE HANDOFF paragraph must reflect the broadened contract.""" + from coda_mcp import mcp_server + + instructions = mcp_server.mcp.instructions + + # Must mention coda_interactive. + assert "coda_interactive" in instructions + + # Must NOT still claim a Git Folder is required. + lowered = instructions.lower() + assert "must be a databricks workspace git folder" not in lowered, ( + "Instructions still require a Git Folder — broadened contract was not applied." 
+ ) + assert "commit and push" not in lowered, ( + "Instructions still tell the caller to commit and push — only relevant for Git Folders, " + "but the broadened contract accepts plain folders too." + ) + + # Must mention that plain folders work. + # Either "git folder or" phrasing, or "plain workspace folder" — accept either. + assert ( + "git folder or" in lowered + or "plain workspace folder" in lowered + or "plain folder" in lowered + ), "Instructions must mention that plain Workspace folders are accepted." + + # Must surface the upload-then-handoff pattern so upstream callers know + # to push files into the workspace BEFORE calling. + assert ( + "upload" in lowered + or "workspace.import" in lowered + or "post" in lowered + ), ( + "Instructions must tell the upstream caller to upload/import the project " + "files into the Workspace first if they aren't already there — the tool " + "only reads existing Workspace paths, it doesn't accept inline payloads." + ) +``` + +Run: `uv run pytest tests/test_coda_interactive.py::test_interactive_handoff_instructions_describe_broadened_contract -v` + +Expected: FAIL — the current instructions string still says "must be a Databricks Workspace Git Folder." + +- [ ] **Step 2: Rewrite the INTERACTIVE HANDOFF paragraph in `mcp_server.py:79-93`** + +In `coda_mcp/mcp_server.py`, find the block beginning at line 79: + +```python + "INTERACTIVE HANDOFF (coda_interactive): When the user wants a human to " + "drive a coding agent in CoDA — not autonomous execution — call " + "coda_interactive instead of coda_run. The user's project must be a " + "Databricks Workspace Git Folder, and any in-progress changes must be " + "committed and pushed to the Git Folder's remote BEFORE the call. The tool " + "exports the committed HEAD state into a Coda-local directory, launches " + "the chosen agent (claude default; also hermes, codex, gemini, opencode), " + "and types the prompt as the first user input. 
The return shape includes " + "a viewer_url the user opens to attach — share it immediately in plain " + "text; it is the only handle to the session, and the user drives it until " + "they exit. Interactive sessions do NOT appear in coda_inbox, and " + "coda_get_result returns nothing for them — do not try to poll or fetch " + "results. Note that git history is NOT available inside the session " + "(files-only export); if the user needs history context, include a git " + "log summary in the prompt string." +``` + +Replace it with: + +```python + "INTERACTIVE HANDOFF (coda_interactive): When the user wants a human to " + "drive a coding agent in CoDA — not autonomous execution — call " + "coda_interactive instead of coda_run. The tool reads files from a " + "directory that already exists in the Databricks Workspace (a Git " + "Folder or a plain Workspace folder — either works). If your working " + "files are not yet in the Workspace, upload them first (workspace.import " + "via the Databricks SDK, REST, or CLI — any of these) into a folder " + "the user can read, then pass that folder as workspace_path. The tool " + "does NOT accept inline file payloads. If the directory is a Git " + "Folder, ensure the desired branch is checked out and pushed first — " + "the export is a server-side snapshot. The tool exports the directory " + "into a Coda-local working directory, launches the chosen agent " + "(claude default; also hermes, codex, gemini, opencode), and types " + "the prompt as the first user input. The return shape includes a " + "viewer_url the user opens to attach — share it immediately in plain " + "text; it is the only handle to the session, and the user drives it " + "until they exit. Interactive sessions do NOT appear in coda_inbox, " + "and coda_get_result returns nothing for them — do not try to poll " + "or fetch results. 
Note that git history is NOT available inside the " + "session (files-only export); if the user needs history context, " + "include a git log summary in the prompt string." +``` + +- [ ] **Step 3: Run the pinned-instructions test plus full suite** + +Run: `uv run pytest tests/test_coda_interactive.py -v` +Expected: All PASS (including the new instructions test). + +Run: `uv run pytest tests/ -v --no-header` +Expected: All previously-passing tests still pass. + +- [ ] **Step 4: Commit** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + add coda_mcp/mcp_server.py tests/test_coda_interactive.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + commit -m "feat: update INTERACTIVE HANDOFF instructions for broadened contract + +Tells upstream LLM callers that workspace_path can be either a Git Folder +or a plain Workspace folder. Drops the 'commit and push' admonition that +only applied to Git Folders." +``` + +--- + +## Task 4: Amend the original spec doc + +The broadening spec says it "Amends" the original (line 6 of the broadening spec), but the original spec doc on disk still describes the `branch` parameter, `repos.list`/`repos.update`, and Git-Folder-only requirements. Anyone reading the original later would implement the wrong API. Add an amendment notice to its header. 
+ +**Files:** +- Modify: `docs/superpowers/specs/2026-05-28-coda-interactive-mcp-tool-design.md` (prepend amendment notice after the `**Related:**` line) + +- [ ] **Step 1: Read the current header of the original spec** + +Run: `head -10 docs/superpowers/specs/2026-05-28-coda-interactive-mcp-tool-design.md` + +You should see something like: + +``` +# Spec: `coda_interactive` MCP Tool + +**Status:** Draft, pre-critique-gate +**Date:** 2026-05-28 +**Branch:** `feat/coda-mcp-live-session-url` (same as Todo 1) +**Related:** `docs/superpowers/specs/2026-05-28-coda-run-replay-only-design.md` (Todo 1 — establishes the three-mode framework this spec slots into as Mode 2) + +## Goal +``` + +- [ ] **Step 2: Prepend the amendment notice** + +Use Edit to insert a new line after `**Related:**` (and before `## Goal`): + +The block to insert is: + +```markdown +> **Amended by:** [`docs/superpowers/specs/2026-05-28-coda-interactive-broaden-source-design.md`](2026-05-28-coda-interactive-broaden-source-design.md) — the `branch` parameter and the Git-Folder-only requirement have been removed. `coda_interactive` now accepts any Workspace directory (Git Folder or plain). The `repos.list` + `repos.update` flow described in Section 3 of this spec has been replaced by a single `workspace.get_status` directory check. The return shape no longer includes a `"branch"` key. +``` + +After the edit, the header should read: + +``` +**Branch:** `feat/coda-mcp-live-session-url` (same as Todo 1) +**Related:** `docs/superpowers/specs/2026-05-28-coda-run-replay-only-design.md` (Todo 1 — establishes the three-mode framework this spec slots into as Mode 2) + +> **Amended by:** [`docs/superpowers/specs/2026-05-28-coda-interactive-broaden-source-design.md`](2026-05-28-coda-interactive-broaden-source-design.md) — the `branch` parameter and the Git-Folder-only requirement have been removed. `coda_interactive` now accepts any Workspace directory (Git Folder or plain). 
The `repos.list` + `repos.update` flow described in Section 3 of this spec has been replaced by a single `workspace.get_status` directory check. The return shape no longer includes a `"branch"` key. + +## Goal +``` + +- [ ] **Step 3: Commit** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + add docs/superpowers/specs/2026-05-28-coda-interactive-mcp-tool-design.md +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + commit -m "docs: mark original coda_interactive spec as amended by broaden-source spec" +``` + +--- + +## Task 5: Push and update PR #67 description + +**Files:** +- None (remote/PR update) + +- [ ] **Step 1: Verify the branch's git state** + +```bash +git status +git log --oneline origin/feat/coda-mcp-interactive-handoff..HEAD +``` + +Expected: Clean working tree. Four new commits since the previous remote head (tests rewrite, impl, instructions string, spec amendment). + +- [ ] **Step 2: Push the branch** + +```bash +git push origin feat/coda-mcp-interactive-handoff +``` + +Expected: Successful fast-forward. + +- [ ] **Step 3: Update PR #67 description** + +Add a "Follow-up: broadened source" section at the bottom of the PR body via `gh pr edit` (or, if gh CLI's TLS bug hits, via curl + REST). Content: + +``` +## Follow-up: broadened source contract + +`coda_interactive` no longer requires a Databricks Workspace **Git Folder**. +Any Workspace directory (Git Folder or plain Workspace folder) is accepted. +The `branch` parameter has been removed — callers manage Git Folder branch +state themselves before calling. + +API change (no shipped consumers — safe): +- `coda_interactive(prompt, workspace_path, branch=..., agent=..., email=...)` → + `coda_interactive(prompt, workspace_path, agent=..., email=...)` +- Return shape: `"branch"` key dropped. + +Validation is now a `workspace.get_status` call with a directory-type check +(replaces the `repos.list` + exact-match filter). 
+``` + +Try the gh path first: + +```bash +gh pr edit 67 --body-file <(gh pr view 67 --json body -q .body; echo; echo; cat <<'EOF' +## Follow-up: broadened source contract + +`coda_interactive` no longer requires a Databricks Workspace **Git Folder**. +Any Workspace directory (Git Folder or plain Workspace folder) is accepted. +The `branch` parameter has been removed — callers manage Git Folder branch +state themselves before calling. + +API change (no shipped consumers — safe): +- `coda_interactive(prompt, workspace_path, branch=..., agent=..., email=...)` → + `coda_interactive(prompt, workspace_path, agent=..., email=...)` +- Return shape: `"branch"` key dropped. + +Validation is now a `workspace.get_status` call with a directory-type check +(replaces the `repos.list` + exact-match filter). +EOF +) +``` + +If gh fails with the known `x509: OSStatus -26276` issue on this machine, fall back to curl: + +```bash +TOKEN=$(gh auth token) +EXISTING_BODY=$(curl -s -k -H "Authorization: token $TOKEN" \ + https://api.github.com/repos/databrickslabs/coding-agents-databricks-apps/pulls/67 | jq -r .body) + +NEW_BODY="$EXISTING_BODY + +## Follow-up: broadened source contract + +\`coda_interactive\` no longer requires a Databricks Workspace **Git Folder**. +Any Workspace directory (Git Folder or plain Workspace folder) is accepted. +The \`branch\` parameter has been removed — callers manage Git Folder branch +state themselves before calling. + +API change (no shipped consumers — safe): +- \`coda_interactive(prompt, workspace_path, branch=..., agent=..., email=...)\` → + \`coda_interactive(prompt, workspace_path, agent=..., email=...)\` +- Return shape: \`\"branch\"\` key dropped. + +Validation is now a \`workspace.get_status\` call with a directory-type check +(replaces the \`repos.list\` + exact-match filter)." 
+ +jq -n --arg body "$NEW_BODY" '{body: $body}' | curl -s -k -X PATCH \ + -H "Authorization: token $TOKEN" \ + -H "Content-Type: application/json" \ + -d @- \ + https://api.github.com/repos/databrickslabs/coding-agents-databricks-apps/pulls/67 +``` + +Confirm the PR description has the new section by visiting the PR URL or via `gh pr view 67`. + +--- + +## Self-review of this plan against the spec + +**Spec section 1 — Tool signature.** Task 2 Step 2 replaces the signature, dropping `branch`. Task 1 Step 5 adds a signature regression guard. ✓ + +**Spec section 2 — Body of `coda_interactive`.** Task 2 Step 2 contains the full new body. `repos.list`/`repos.update` removed, `workspace.get_status` + `_is_directory` added. ✓ + +**Spec section 3 — Return shape.** Task 2 Step 2 omits the `"branch"` key. The existing happy-path test does not assert on `"branch"`, so no test change needed; the regression is the signature test. ✓ + +**Spec section 4 — Caller pre-condition rewrite.** Task 3 rewrites the INTERACTIVE HANDOFF paragraph. Task 2 also rewrites the tool's docstring. Both surfaces updated. ✓ + +**Spec section 5 — INTERACTIVE HANDOFF string.** Task 3 covers it with a pinned-content test (Step 1) then the rewrite (Step 2). ✓ + +**Spec "Tests to update."** Task 1 covers every bullet: 3 drops, 4 updates, 3 adds (the two validation tests plus the signature guard). The pinned-instructions test in Task 3 is a fourth add. ✓ + +**Spec "Tests for the SDK validation step."** Task 1 Steps 3 and 4 cover the missing-path and not-a-directory cases. ✓ + +**Spec "Out of scope."** This plan does not add single-file workspace_path, branch-info surfacing in the response, or extra cleanup paths. ✓ + +**Spec "Acceptance criteria."** +- `coda_interactive` accepts any Workspace directory → Task 2. ✓ +- No `branch` parameter → Task 2 + signature guard test. ✓ +- Clean error for missing/non-directory paths → Task 2 + 2 new tests. ✓ +- Existing tests pass after updates → Task 1 + Task 2 Steps 3-4. 
✓ +- PR description reflects simpler contract → Task 5 Step 3. ✓ + +**Placeholder scan:** No TBD/TODO. Every step has explicit code or a concrete command. ✓ + +**Type consistency:** `_is_directory(status)` accepts an object with `.object_type` attribute — matches what `workspace.get_status` returns and matches the mock helper in tests. The mock helper in Task 1 Step 2 (`_make_dir_status`) returns a MagicMock with `object_type = "DIRECTORY"`, which `_is_directory` accepts via its string-fallback branch (`str(ot) == "DIRECTORY"`). ✓ diff --git a/docs/superpowers/plans/2026-05-28-coda-interactive-mcp-tool.md b/docs/superpowers/plans/2026-05-28-coda-interactive-mcp-tool.md new file mode 100644 index 0000000..f5f663d --- /dev/null +++ b/docs/superpowers/plans/2026-05-28-coda-interactive-mcp-tool.md @@ -0,0 +1,1631 @@ +# `coda_interactive` MCP Tool Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add `coda_interactive` MCP tool that lets an upstream MCP client hand off a coding session to a human via a CoDA viewer URL — the human attaches to a live PTY with the chosen agent (claude default) already loaded with the user's Databricks Workspace Git Folder as CWD and the kickoff prompt typed. + +**Architecture:** Mode 2 in the three-mode framework (see `docs/superpowers/specs/2026-05-28-coda-run-replay-only-design.md`). The tool resolves a `workspace_path` to a Databricks Workspace Git Folder, optionally updates it to a specified branch, exports the file tree to `~/.coda/projects/<session_id>/`, creates a PTY with that dir as CWD, launches the agent, and auto-pastes the prompt. The PTY inherits Mode 1's existing 24h-idle lifecycle. Cleanup of the project dir is tied to PTY teardown. 
+ +**Tech Stack:** Python 3.11 + FastMCP + Databricks SDK (`databricks-sdk` already in requirements) + Flask + uvicorn + pytest. No new dependencies. All work localized to `app.py`, `coda_mcp/`, and the test suite. + +--- + +## Pre-flight check (do before Task 1) + +- [ ] **P1: Verify baseline tests pass.** + +```bash +cd /Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp +.venv/bin/python -m pytest tests/ --ignore=tests/e2e -q 2>&1 | tail -5 +``` + +Expected: `524 passed, 15 skipped` (or close to it — matches Todo 1's final state). + +- [ ] **P2: Confirm worktree is on the `feat/coda-mcp-live-session-url` branch.** + +```bash +cd /Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp +git branch --show-current +``` + +Expected: `feat/coda-mcp-live-session-url` + +- [ ] **P3: Capture the baseline SHA for downstream code-quality reviews.** + +```bash +git rev-parse HEAD +``` + +Note the SHA — reviewer subagents need it as BASE_SHA. + +--- + +## Task 1: Prerequisite — refactor `mcp_create_pty_session` to use `_build_terminal_shell_env` + +Closes a pre-existing security gap. Today, `mcp_create_pty_session`'s inline env strip only removes 5 keys, while the HTTP `create_session` path uses `_build_terminal_shell_env` which also strips `NPM_TOKEN`, `UV_DEFAULT_INDEX`, `UV_INDEX_*_PASSWORD`, `UV_INDEX_*_USERNAME`, and `npm_config_//*` registry credential patterns. The refactor closes the gap for all MCP-created PTYs (current `coda_run` and future `coda_interactive`). + +**Important context:** The current session dict in `mcp_create_pty_session` (around `app.py:1488`) does **NOT** store the child shell's env. The test below would silently pass if it relied on `sessions[sid]["env"]` alone (a missing key returns `{}` from `.get()`). 
To get a TDD red-then-green cycle that means something, **Task 1 explicitly adds an `"env"` key to the session dict AND swaps the env-strip to use `_build_terminal_shell_env`** — both changes happen together so the test fails ONLY because of credential leaks, not because of a missing key. + +**Files:** +- Modify: `app.py` (function `mcp_create_pty_session` at line 1420, env-strip block at line 1435, session dict insert at line 1488) +- Create: `tests/test_mcp_env_strip.py` + +- [ ] **Step 1: Write the failing test.** + +Create `tests/test_mcp_env_strip.py`: + +```python +"""Tests for env-stripping consistency between MCP and HTTP PTY creation paths.""" +import os +import pytest + +try: + import pty as _pty + _master, _slave = _pty.openpty() + os.close(_master) + os.close(_slave) + _PTY_AVAILABLE = True +except Exception: + _PTY_AVAILABLE = False + +_pty_skip = pytest.mark.skipif( + not _PTY_AVAILABLE, + reason="PTY not allocatable in this environment", +) + + +@_pty_skip +def test_mcp_create_pty_session_strips_registry_credentials(monkeypatch): + """mcp_create_pty_session must strip NPM_TOKEN, UV_DEFAULT_INDEX, UV_INDEX_*_PASSWORD, + UV_INDEX_*_USERNAME, and npm_config_//* from the child shell's environment — + matching the HTTP create_session path. Today, these leak into MCP-created PTYs. + """ + from app import mcp_create_pty_session, mcp_close_pty_session, sessions + + # Plant registry-credential env vars before creating the PTY. 
+ monkeypatch.setenv("NPM_TOKEN", "leak-me-npm") + monkeypatch.setenv("UV_DEFAULT_INDEX", "https://leaked-index.example/") + monkeypatch.setenv("UV_INDEX_MYREG_PASSWORD", "leak-me-uv-pw") + monkeypatch.setenv("UV_INDEX_MYREG_USERNAME", "leak-me-uv-user") + monkeypatch.setenv("npm_config_//registry.example/:_authToken", "leak-me-npm-cfg") + + sid = mcp_create_pty_session(label="t-env-strip") + try: + env = sessions[sid].get("env", {}) + assert "NPM_TOKEN" not in env, f"NPM_TOKEN leaked into MCP PTY: keys={list(env)}" + assert "UV_DEFAULT_INDEX" not in env, "UV_DEFAULT_INDEX leaked" + assert "UV_INDEX_MYREG_PASSWORD" not in env, "UV_INDEX_*_PASSWORD leaked" + assert "UV_INDEX_MYREG_USERNAME" not in env, "UV_INDEX_*_USERNAME leaked" + assert not any(k.startswith("npm_config_//") for k in env), "npm_config_// keys leaked" + finally: + mcp_close_pty_session(sid) +``` + +**Note on the test:** The test reads `sessions[sid]["env"]`. The session dict currently has NO `"env"` key, so without the Step 2 change the test would silently pass (`.get("env", {})` returns `{}` and all `not in {}` assertions trivially pass). The fix is split across two steps: Step 2 adds the `"env"` key, and Step 4 swaps the env-strip to use `_build_terminal_shell_env`. Step 3 runs the test after the key is added but before the strip refactor — that gives a meaningful red — and Step 5 confirms the green once the refactor is in. + +- [ ] **Step 2: Add the `"env"` key to the session dict (this alone makes the test runnable but failing).** + +In `app.py`, find the session dict literal inside `mcp_create_pty_session` (around line 1488 — the block that has `"master_fd"`, `"pid"`, `"output_buffer"`, etc.). Add a new key: + +```python +sessions[session_id] = { + ..., + "replay_only": replay_only, + "env": env_for_child, # NEW — exposed for env-strip test + ... +} +``` + +`env_for_child` is the variable name used in the env-construction block above. If it's named differently in the actual code, use the actual variable name. 
+ +- [ ] **Step 3: Run the test and verify it fails for the RIGHT reason.** + +```bash +.venv/bin/python -m pytest tests/test_mcp_env_strip.py -v 2>&1 | tail -10 +``` + +Expected: FAIL — at least one of NPM_TOKEN/UV_*/npm_config_// keys is present in `sessions[sid]["env"]` (because the existing inline env-strip doesn't remove them). If the test PASSES at this point, the `"env"` key didn't get added — go back to Step 2. + +- [ ] **Step 4: Refactor `mcp_create_pty_session` env-stripping.** + +In `app.py`, find the env-construction block inside `mcp_create_pty_session` (around line 1435). It currently looks like: + +```python +env_for_child = os.environ.copy() +for k in ("CLAUDECODE", "CLAUDE_CODE_SESSION", "DATABRICKS_TOKEN", "DATABRICKS_HOST", "GEMINI_API_KEY"): + env_for_child.pop(k, None) +``` + +Replace with: + +```python +env_for_child = _build_terminal_shell_env(os.environ) +``` + +`_build_terminal_shell_env` is already defined in `app.py` (around line 210). It returns a dict with ALL the right strips applied (registry creds + the 5 keys above + others). + +- [ ] **Step 5: Run the test and verify pass.** + +```bash +.venv/bin/python -m pytest tests/test_mcp_env_strip.py -v 2>&1 | tail -10 +``` + +Expected: PASS — registry credentials are now stripped. + +- [ ] **Step 6: Run the full suite to confirm no regression.** + +```bash +.venv/bin/python -m pytest tests/ --ignore=tests/e2e -q 2>&1 | tail -5 +``` + +Expected: 525 passed, 15 skipped (one more pass than baseline). 
+ +- [ ] **Step 7: Commit.** + +```bash +git add app.py tests/test_mcp_env_strip.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "fix: mcp_create_pty_session strips registry credentials like HTTP path does + +Pre-existing gap: the MCP PTY-creation path stripped only 5 env vars +while the HTTP create_session path used _build_terminal_shell_env which +also strips NPM_TOKEN, UV_DEFAULT_INDEX, UV_INDEX_*_PASSWORD, +UV_INDEX_*_USERNAME, and npm_config_// keys. This let deployer-level +registry credentials leak into the agent's child shell visible via env. +Refactor mcp_create_pty_session to use _build_terminal_shell_env." +``` + +--- + +## Task 2: Add `cwd` kwarg to `mcp_create_pty_session` + +`coda_interactive` needs the spawned bash to start in a specific directory (the exported project dir). Add an optional `cwd: str | None = None` kwarg; default `None` preserves current behavior. + +**Files:** +- Modify: `app.py` (`mcp_create_pty_session` signature and PTY spawn call) +- Modify: `tests/test_mcp_env_strip.py` (add new test in this same file for compactness) + +- [ ] **Step 1: Write the failing test.** + +Append to `tests/test_mcp_env_strip.py`: + +```python +@_pty_skip +def test_mcp_create_pty_session_respects_cwd_kwarg(tmp_path): + """When cwd is passed, the spawned bash starts in that directory.""" + from app import mcp_create_pty_session, mcp_close_pty_session, sessions + + # Create a sentinel file in tmp_path so we can detect the CWD via shell output. + sentinel = tmp_path / "SENTINEL_FILE" + sentinel.write_text("hello") + + sid = mcp_create_pty_session(label="t-cwd", cwd=str(tmp_path)) + try: + # The session dict should record the cwd. 
+ assert sessions[sid].get("cwd") == str(tmp_path) + finally: + mcp_close_pty_session(sid) + + +@_pty_skip +def test_mcp_create_pty_session_cwd_defaults_to_none(): + """When cwd is not passed, sessions[sid]['cwd'] is None (preserves current behavior).""" + from app import mcp_create_pty_session, mcp_close_pty_session, sessions + + sid = mcp_create_pty_session(label="t-no-cwd") + try: + assert sessions[sid].get("cwd") is None + finally: + mcp_close_pty_session(sid) +``` + +- [ ] **Step 2: Run and verify failure.** + +```bash +.venv/bin/python -m pytest tests/test_mcp_env_strip.py::test_mcp_create_pty_session_respects_cwd_kwarg tests/test_mcp_env_strip.py::test_mcp_create_pty_session_cwd_defaults_to_none -v 2>&1 | tail -10 +``` + +Expected: FAIL — `TypeError: unexpected keyword argument 'cwd'` for the first test. + +- [ ] **Step 3: Add the `cwd` kwarg.** + +In `app.py`, change the `mcp_create_pty_session` signature to: + +```python +def mcp_create_pty_session( + label: str = "hermes-mcp", + transcript_path: str | None = None, + replay_only: bool = False, + cwd: str | None = None, +) -> str: +``` + +Inside the function, find the PTY spawn / `subprocess.Popen` call (it's the one that launches bash inside the PTY). It should currently look something like: + +```python +process = subprocess.Popen( + ["/bin/bash", "-l"], + stdin=slave_fd, stdout=slave_fd, stderr=slave_fd, + env=env_for_child, + preexec_fn=os.setsid, + close_fds=True, +) +``` + +Add `cwd=cwd` (which is None by default, meaning the child uses the parent's CWD — current behavior): + +```python +process = subprocess.Popen( + ["/bin/bash", "-l"], + stdin=slave_fd, stdout=slave_fd, stderr=slave_fd, + env=env_for_child, + cwd=cwd, # NEW + preexec_fn=os.setsid, + close_fds=True, +) +``` + +Also add `cwd` to the session dict: + +```python +sessions[session_id] = { + ..., + "cwd": cwd, # NEW + ... 
+} +``` + +- [ ] **Step 4: Run tests and verify pass.** + +```bash +.venv/bin/python -m pytest tests/test_mcp_env_strip.py -v 2>&1 | tail -10 +``` + +Expected: all tests in the file pass. + +- [ ] **Step 5: Run the full suite.** + +```bash +.venv/bin/python -m pytest tests/ --ignore=tests/e2e -q 2>&1 | tail -5 +``` + +Expected: 527 passed, 15 skipped. + +- [ ] **Step 6: Commit.** + +```bash +git add app.py tests/test_mcp_env_strip.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "feat: mcp_create_pty_session accepts cwd kwarg + +Adds optional cwd parameter so callers can spawn the PTY's bash in a +specific directory. Default None preserves current behavior. Required +for coda_interactive (which needs to start agents in the exported +project dir)." +``` + +--- + +## Task 3: Create `coda_mcp/workspace_export.py` helper + +Encapsulates the Workspace-tree-to-local-dir export logic. Single responsibility: given a Databricks Workspace path and a local destination, copy the file tree. 
+ +**Files:** +- Create: `coda_mcp/workspace_export.py` +- Create: `tests/test_workspace_export.py` + +- [ ] **Step 1: Write the failing tests.** + +Create `tests/test_workspace_export.py`: + +```python +"""Tests for coda_mcp.workspace_export.export_workspace_tree.""" +import os +from unittest.mock import MagicMock, patch + +import pytest + +from coda_mcp.workspace_export import export_workspace_tree + + +def _fake_object(path, object_type): + """Minimal stand-in for databricks.sdk.service.workspace.ObjectInfo.""" + o = MagicMock() + o.path = path + o.object_type = object_type + return o + + +def test_export_workspace_tree_creates_dest_dir(tmp_path): + """Helper creates the destination directory if it doesn't exist.""" + dest = tmp_path / "subdir" + assert not dest.exists() + + client = MagicMock() + client.workspace.list.return_value = [] + export_workspace_tree(client, "/Workspace/Users/x/empty", str(dest)) + + assert dest.exists() and dest.is_dir() + + +def test_export_workspace_tree_writes_single_file(tmp_path): + """A workspace with one file gets that file written to the local dir.""" + client = MagicMock() + client.workspace.list.return_value = [ + _fake_object("/Workspace/Users/x/proj/main.py", "FILE"), + ] + # Export returns an object with .content (base64-encoded bytes) + import base64 + mock_export = MagicMock() + mock_export.content = base64.b64encode(b"print('hi')\n").decode("ascii") + client.workspace.export.return_value = mock_export + + export_workspace_tree(client, "/Workspace/Users/x/proj", str(tmp_path)) + + main_py = tmp_path / "main.py" + assert main_py.exists() + assert main_py.read_text() == "print('hi')\n" + + +def test_export_workspace_tree_handles_nested_dirs(tmp_path): + """Nested directory structure is preserved in the destination.""" + client = MagicMock() + # First list call returns the top-level entries + # Subsequent recursive calls return the subdir contents + def list_side_effect(path, **kwargs): + if path == 
"/Workspace/Users/x/proj": + return [ + _fake_object("/Workspace/Users/x/proj/main.py", "FILE"), + _fake_object("/Workspace/Users/x/proj/lib", "DIRECTORY"), + ] + elif path == "/Workspace/Users/x/proj/lib": + return [ + _fake_object("/Workspace/Users/x/proj/lib/util.py", "FILE"), + ] + return [] + client.workspace.list.side_effect = list_side_effect + + import base64 + def export_side_effect(path, **kwargs): + mock = MagicMock() + if path.endswith("main.py"): + mock.content = base64.b64encode(b"main\n").decode("ascii") + else: + mock.content = base64.b64encode(b"util\n").decode("ascii") + return mock + client.workspace.export.side_effect = export_side_effect + + export_workspace_tree(client, "/Workspace/Users/x/proj", str(tmp_path)) + + assert (tmp_path / "main.py").read_text() == "main\n" + assert (tmp_path / "lib" / "util.py").read_text() == "util\n" + + +def test_export_workspace_tree_skips_binary_files_gracefully(tmp_path, caplog): + """Files that fail to export (e.g. binaries) are skipped and logged, not fatal.""" + client = MagicMock() + client.workspace.list.return_value = [ + _fake_object("/Workspace/Users/x/proj/text.py", "FILE"), + _fake_object("/Workspace/Users/x/proj/image.png", "FILE"), + ] + + import base64 + def export_side_effect(path, **kwargs): + if path.endswith(".png"): + raise Exception("400 Bad Request: cannot export binary as SOURCE") + mock = MagicMock() + mock.content = base64.b64encode(b"hello\n").decode("ascii") + return mock + client.workspace.export.side_effect = export_side_effect + + # Should NOT raise; should skip and log. 
+ export_workspace_tree(client, "/Workspace/Users/x/proj", str(tmp_path)) + + assert (tmp_path / "text.py").exists() + assert not (tmp_path / "image.png").exists() + + +def test_export_workspace_tree_empty_workspace(tmp_path): + """Empty workspace path produces empty destination dir (no error).""" + client = MagicMock() + client.workspace.list.return_value = [] + + export_workspace_tree(client, "/Workspace/Users/x/empty", str(tmp_path)) + + assert tmp_path.exists() + assert list(tmp_path.iterdir()) == [] +``` + +- [ ] **Step 2: Run and verify failure.** + +```bash +.venv/bin/python -m pytest tests/test_workspace_export.py -v 2>&1 | tail -10 +``` + +Expected: ImportError (`No module named coda_mcp.workspace_export`). + +- [ ] **Step 3: Implement the helper.** + +Create `coda_mcp/workspace_export.py`: + +```python +"""Export a Databricks Workspace tree (Git Folder contents) to a local directory. + +Used by ``coda_interactive`` to materialize a Workspace Git Folder onto the +Coda container's disk before launching an agent in that directory. + +Only the working tree is exported — Git Folder server-side metadata (the +``.git/`` directory) is not exposed by the Workspace API. +""" +from __future__ import annotations + +import base64 +import logging +import os +from typing import Any + +logger = logging.getLogger(__name__) + + +def export_workspace_tree(client: Any, workspace_path: str, dest_dir: str) -> None: + """Export the Workspace tree rooted at ``workspace_path`` into ``dest_dir``. + + ``client`` is a ``databricks.sdk.WorkspaceClient`` (or compatible mock). + Recursively lists entries, calls ``workspace.export()`` per file with + ``ExportFormat.SOURCE``, decodes the base64 content, and writes to the + local mirror. + + Per-file export errors (e.g. binaries that fail SOURCE export) are logged + and skipped — they do not abort the export. The agent in the session may + not have access to those files; the human can decide whether that matters. 
+ """ + os.makedirs(dest_dir, exist_ok=True) + + try: + from databricks.sdk.service.workspace import ExportFormat + export_format = ExportFormat.SOURCE + except Exception: + export_format = None # mocks won't care + + _export_recursive(client, workspace_path, dest_dir, export_format) + + +def _export_recursive(client, workspace_path: str, dest_dir: str, export_format) -> None: + """Walk one level of the workspace and export files / recurse into dirs.""" + try: + entries = list(client.workspace.list(workspace_path)) + except Exception as e: + logger.warning("workspace.list(%s) failed: %s", workspace_path, e) + return + + for entry in entries: + rel_name = os.path.basename(entry.path) + local_path = os.path.join(dest_dir, rel_name) + object_type = str(getattr(entry, "object_type", "")) + + if object_type == "DIRECTORY" or object_type.endswith(".DIRECTORY"): + # Create the local subdirectory before recursing; otherwise every nested + # file write fails with FileNotFoundError and is silently skipped below. + os.makedirs(local_path, exist_ok=True) + _export_recursive(client, entry.path, local_path, export_format) + elif object_type == "FILE" or object_type.endswith(".FILE") or object_type == "NOTEBOOK" or object_type.endswith(".NOTEBOOK"): + try: + if export_format is not None: + exported = client.workspace.export(path=entry.path, format=export_format) + else: + exported = client.workspace.export(path=entry.path) + content_b64 = getattr(exported, "content", "") or "" + content_bytes = base64.b64decode(content_b64) if content_b64 else b"" + with open(local_path, "wb") as f: + f.write(content_bytes) + except Exception as e: + logger.warning("workspace.export(%s) failed; skipping: %s", entry.path, e) + continue + else: + # Unknown object type; skip with a log line. + logger.info("Skipping unknown object_type=%r at %s", object_type, entry.path) +``` + +- [ ] **Step 4: Run tests and verify pass.** + +```bash +.venv/bin/python -m pytest tests/test_workspace_export.py -v 2>&1 | tail -15 +``` + +Expected: 5 passed. 
+ +- [ ] **Step 5: Commit.** + +```bash +git add coda_mcp/workspace_export.py tests/test_workspace_export.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "feat: add coda_mcp.workspace_export.export_workspace_tree helper + +Recursively exports a Databricks Workspace Git Folder's file tree to +a local directory. Used by coda_interactive (next commit) to +materialize project files before launching an agent. + +Per-file export errors (binary files etc.) are logged and skipped +rather than aborting the export." +``` + +--- + +## Task 4: Extend `mcp_close_pty_session` to clean up the project dir + +When a `coda_interactive` PTY is torn down, the corresponding `~/.coda/projects/<session_id>/` directory should be removed. The same cleanup hook fires on both graceful exit and the idle reaper. + +**Files:** +- Modify: `app.py` (function `mcp_close_pty_session` — find its definition by grep) +- Modify: `tests/test_mcp_env_strip.py` (append cleanup-hook test for compactness; could also be a new file) + +- [ ] **Step 1: Write the failing test.** + +Append to `tests/test_mcp_env_strip.py`: + +```python +@_pty_skip +def test_mcp_close_pty_session_removes_project_dir(tmp_path, monkeypatch): + """When the PTY is closed, any project dir at ~/.coda/projects/<session_id>/ is removed.""" + import os + from app import mcp_create_pty_session, mcp_close_pty_session + + # Point HOME at tmp_path so ~/.coda lives in a controllable place. 
+ monkeypatch.setenv("HOME", str(tmp_path)) + + sid = mcp_create_pty_session(label="t-cleanup") + + project_dir = os.path.join(str(tmp_path), ".coda", "projects", sid) + os.makedirs(project_dir, exist_ok=True) + sentinel = os.path.join(project_dir, "SENTINEL") + with open(sentinel, "w") as f: + f.write("present-before-close") + assert os.path.exists(sentinel) + + mcp_close_pty_session(sid) + + assert not os.path.exists(project_dir), \ + f"Expected project dir to be removed after PTY close: {project_dir} still exists" + + +@_pty_skip +def test_mcp_close_pty_session_handles_missing_project_dir(monkeypatch, tmp_path): + """No project dir present → close still succeeds (no exception).""" + from app import mcp_create_pty_session, mcp_close_pty_session + + monkeypatch.setenv("HOME", str(tmp_path)) + + sid = mcp_create_pty_session(label="t-no-projdir") + # Do NOT create the project dir — verify close still works. + mcp_close_pty_session(sid) # must not raise +``` + +- [ ] **Step 2: Run and verify failure.** + +```bash +.venv/bin/python -m pytest tests/test_mcp_env_strip.py::test_mcp_close_pty_session_removes_project_dir -v 2>&1 | tail -10 +``` + +Expected: FAIL — the sentinel still exists after `mcp_close_pty_session(sid)`. + +- [ ] **Step 3: Add the cleanup hook.** + +In `app.py`, find `def mcp_close_pty_session(` (search for it). Inside the function, after the existing close logic (closing master_fd, killing process, popping from sessions), add the project-dir cleanup: + +```python +def mcp_close_pty_session(session_id: str) -> None: + # ... existing close logic ... + + # NEW: clean up the project dir if coda_interactive created one. 
+ import shutil + project_dir = os.path.join( + os.path.expanduser("~/.coda/projects"), + session_id, + ) + if os.path.isdir(project_dir): + try: + shutil.rmtree(project_dir) + except OSError as e: + logger.warning("Failed to clean up project dir %s: %s", project_dir, e) +``` + +Place this near the END of the function so the PTY is fully closed before disk cleanup. The `try/except OSError` is intentional — a stuck file (rare) shouldn't break the close path. + +- [ ] **Step 4: Run tests and verify pass.** + +```bash +.venv/bin/python -m pytest tests/test_mcp_env_strip.py -v 2>&1 | tail -15 +``` + +Expected: all tests in the file pass. + +- [ ] **Step 5: Run the full suite.** + +```bash +.venv/bin/python -m pytest tests/ --ignore=tests/e2e -q 2>&1 | tail -5 +``` + +Expected: 534 passed, 15 skipped (527 after Task 2, plus the 5 tests from Task 3 and the 2 added here). + +- [ ] **Step 6: Commit.** + +```bash +git add app.py tests/test_mcp_env_strip.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "feat: mcp_close_pty_session removes project dir on teardown + +When coda_interactive creates ~/.coda/projects/<session_id>/, that directory +should be deleted when the PTY is closed. Single cleanup path ties the +project's disk lifecycle to the PTY's lifecycle — no separate timer or +state to track." +``` + +--- + +## Task 5: Stub `coda_interactive` with agent validation + +First slice: register the tool, validate the agent kwarg, return error for unknown agents. No SDK calls, no PTY yet. 
+ +**Files:** +- Modify: `coda_mcp/mcp_server.py` (add tool definition near `coda_run`) +- Create: `tests/test_coda_interactive.py` + +- [ ] **Step 1: Write failing tests.** + +Create `tests/test_coda_interactive.py`: + +```python +"""Tests for the coda_interactive MCP tool.""" +import asyncio +import json +import os + +import pytest + +ALLOWED_AGENTS = {"claude", "hermes", "codex", "gemini", "opencode"} + + +def test_coda_interactive_unknown_agent_returns_error(): + """An agent value not in the allow-list returns status=error and lists allowed values.""" + from coda_mcp import mcp_server + + result_str = asyncio.run(mcp_server.coda_interactive( + prompt="hello", + workspace_path="/Workspace/Users/x/proj", + agent="vim", + )) + result = json.loads(result_str) + assert result["status"] == "error" + assert "vim" in result["error"] + # Error message lists all allowed agents so the calling LLM can correct itself. + for allowed in ALLOWED_AGENTS: + assert allowed in result["error"] + + +def test_coda_interactive_default_agent_is_claude(): + """Calling with no agent kwarg defaults to claude (assertion via signature inspection).""" + import inspect + from coda_mcp import mcp_server + + sig = inspect.signature(mcp_server.coda_interactive) + assert sig.parameters["agent"].default == "claude" +``` + +- [ ] **Step 2: Run and verify failure.** + +```bash +.venv/bin/python -m pytest tests/test_coda_interactive.py -v 2>&1 | tail -10 +``` + +Expected: FAIL — `AttributeError: module 'coda_mcp.mcp_server' has no attribute 'coda_interactive'`. + +- [ ] **Step 3: Add the stub tool to `coda_mcp/mcp_server.py`.** + +In `coda_mcp/mcp_server.py`, locate the `@mcp.tool(...)` block for `coda_run` (around line 190 in the current file). The `coda_run` function ends around line 289 (before `coda_inbox`). 
Add the new tool definition between `coda_run` and `coda_inbox`: + +```python +_ALLOWED_AGENTS = {"claude", "hermes", "codex", "gemini", "opencode"} + + +@mcp.tool( + annotations=ToolAnnotations( + readOnlyHint=False, + destructiveHint=False, + idempotentHint=False, + ), +) +async def coda_interactive( + prompt: str, + workspace_path: str, + branch: str = "", + agent: str = "claude", + email: str = "", +) -> str: + """Launch an interactive agent session in CoDA, handed off via a viewer URL. + + The MCP caller passes a Databricks Workspace Git Folder path; Coda exports + its file tree, launches the chosen agent (claude default) in that directory, + auto-types ``prompt`` as the first user input, and returns a ``viewer_url`` + the calling user opens in a browser to drive the session. + + Pre-condition: ``workspace_path`` must be a Databricks Workspace Git Folder + and any in-progress changes must have been committed and pushed to its + remote before this call. The export reflects the committed HEAD state. + + Interactive sessions do NOT appear in ``coda_inbox`` and ``coda_get_result`` + will not return anything for them. The viewer URL is the only handle. + + Allowed agents: claude (default), hermes, codex, gemini, opencode. + """ + if agent not in _ALLOWED_AGENTS: + return json.dumps({ + "status": "error", + "error": f"Unknown agent: {agent!r}. Allowed: {sorted(_ALLOWED_AGENTS)}", + }) + + # TODO(Task 6+): workspace lookup, branch update, export, PTY launch. + return json.dumps({ + "status": "error", + "error": "Not yet implemented (stub).", + }) +``` + +Notes: +- `json` is already imported at top of file. If not, add `import json`. +- The `# TODO` comment is acceptable here because the function is being built incrementally across Tasks 5–8; each task removes one TODO. + +- [ ] **Step 4: Run tests and verify pass.** + +```bash +.venv/bin/python -m pytest tests/test_coda_interactive.py -v 2>&1 | tail -10 +``` + +Expected: 2 passed. 
+ +- [ ] **Step 5: Commit.** + +```bash +git add coda_mcp/mcp_server.py tests/test_coda_interactive.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "feat: stub coda_interactive MCP tool with agent validation + +First slice. Validates the agent kwarg against the allow-list +(claude, hermes, codex, gemini, opencode); returns a clear error +listing the allowed values when an unknown agent is passed. +Workspace lookup, branch update, export, and PTY launch come in +follow-up commits." +``` + +--- + +## Task 6: Add workspace lookup + branch update to `coda_interactive` + +Resolve `workspace_path` to a Git Folder via `WorkspaceClient.repos.list()`; if `branch` is non-empty, call `repos.update(repo_id, branch=branch)`. + +**Files:** +- Modify: `coda_mcp/mcp_server.py` (`coda_interactive` body) +- Modify: `tests/test_coda_interactive.py` + +- [ ] **Step 1: Write failing tests.** + +Append to `tests/test_coda_interactive.py`: + +```python +def test_coda_interactive_workspace_path_not_found(monkeypatch): + """If repos.list() returns no match for workspace_path, status=error.""" + from unittest.mock import MagicMock + from coda_mcp import mcp_server + + fake_client = MagicMock() + fake_client.repos.list.return_value = [] # no Git Folder at that path + + monkeypatch.setattr(mcp_server, "WorkspaceClient", lambda: fake_client) + + result_str = asyncio.run(mcp_server.coda_interactive( + prompt="hello", + workspace_path="/Workspace/Users/x/nonexistent", + )) + result = json.loads(result_str) + assert result["status"] == "error" + assert "No Git Folder found" in result["error"] + + +def test_coda_interactive_branch_update_failure(monkeypatch): + """If repos.update() raises, return error and don't proceed to PTY.""" + from unittest.mock import MagicMock + from coda_mcp import mcp_server + + fake_repo = MagicMock() + fake_repo.id = 123 + fake_repo.path = "/Workspace/Users/x/proj" + + fake_client = MagicMock() + 
fake_client.repos.list.return_value = [fake_repo] + fake_client.repos.update.side_effect = Exception("404 branch not found: nonexistent") + + monkeypatch.setattr(mcp_server, "WorkspaceClient", lambda: fake_client) + + result_str = asyncio.run(mcp_server.coda_interactive( + prompt="hello", + workspace_path="/Workspace/Users/x/proj", + branch="nonexistent", + )) + result = json.loads(result_str) + assert result["status"] == "error" + assert "branch" in result["error"].lower() or "404" in result["error"] + + +def test_coda_interactive_skips_branch_update_when_empty(monkeypatch): + """If branch is empty, repos.update() must NOT be called.""" + from unittest.mock import MagicMock + from coda_mcp import mcp_server + + fake_repo = MagicMock() + fake_repo.id = 123 + fake_repo.path = "/Workspace/Users/x/proj" + + fake_client = MagicMock() + fake_client.repos.list.return_value = [fake_repo] + + monkeypatch.setattr(mcp_server, "WorkspaceClient", lambda: fake_client) + + # We don't expect a successful return yet (export+PTY not wired); we just + # verify that repos.update was not called. + asyncio.run(mcp_server.coda_interactive( + prompt="hello", + workspace_path="/Workspace/Users/x/proj", + branch="", + )) + fake_client.repos.update.assert_not_called() +``` + +- [ ] **Step 2: Run and verify failure.** + +```bash +.venv/bin/python -m pytest tests/test_coda_interactive.py -v 2>&1 | tail -10 +``` + +Expected: 3 new tests fail (function returns the stub error, not the lookup-based errors expected). + +- [ ] **Step 3: Implement workspace lookup + branch update.** + +In `coda_mcp/mcp_server.py`, near the top of the file (with other imports), add: + +```python +try: + from databricks.sdk import WorkspaceClient +except ImportError: + WorkspaceClient = None # type: ignore +``` + +(The `try/except` keeps the module importable when `databricks-sdk` is not installed. Binding `WorkspaceClient` as a module-level name is what lets the tests swap in a fake by monkey-patching `mcp_server.WorkspaceClient`.) 
+ +Replace the body of `coda_interactive` (the part after the agent-validation `if` block, currently just the `# TODO` and stub return) with: + +```python + # Resolve the Git Folder by listing under the workspace_path prefix. + if WorkspaceClient is None: + return json.dumps({ + "status": "error", + "error": "databricks-sdk not installed", + }) + + client = WorkspaceClient() + + try: + repos = list(client.repos.list(path_prefix=workspace_path)) + except Exception as e: + return json.dumps({ + "status": "error", + "error": f"Failed to list Git Folders: {e}", + }) + + repo = next((r for r in repos if r.path == workspace_path), None) + if repo is None: + return json.dumps({ + "status": "error", + "error": f"No Git Folder found at {workspace_path}", + }) + + # Optional branch update. + if branch: + try: + client.repos.update(repo_id=repo.id, branch=branch) + except Exception as e: + return json.dumps({ + "status": "error", + "error": f"Failed to update Git Folder to branch {branch!r}: {e}", + }) + + # TODO(Task 7+): export tree, create PTY, launch agent. + return json.dumps({ + "status": "error", + "error": "Not yet implemented (stub).", + }) +``` + +- [ ] **Step 4: Run tests and verify pass.** + +```bash +.venv/bin/python -m pytest tests/test_coda_interactive.py -v 2>&1 | tail -15 +``` + +Expected: 5 passed (2 from Task 5 + 3 new). + +- [ ] **Step 5: Commit.** + +```bash +git add coda_mcp/mcp_server.py tests/test_coda_interactive.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "feat: coda_interactive resolves Git Folder and optionally updates branch + +Uses WorkspaceClient.repos.list to resolve workspace_path to a Git +Folder; returns a clear error if no match. If branch is non-empty, +calls repos.update which performs the actual git fetch+checkout +server-side. Export and PTY launch land in follow-up commits." 
+``` + +--- + +## Task 7: Implement `coda_interactive`'s full happy path + +Combined task: create PTY, export workspace tree into the project dir, cd into project dir, launch agent, seed prompt, return viewer URL. **Single task with a single commit** — avoids the intermediate orphaned-state problem of the previous Task 7→Task 8 split (where the project dir's name didn't match the PTY's session id). + +**Ordering insight:** PTY is created FIRST (so we know its session_id), THEN we build `project_dir = ~/.coda/projects/<pty_session_id>/`, THEN export into it, THEN `cd` the PTY into the dir via input, THEN launch the agent, THEN paste the prompt. This single chronology eliminates the chicken-and-egg between project_dir naming and PTY id. + +**Files:** +- Modify: `coda_mcp/mcp_server.py` (`coda_interactive` body — replace the stub return from Task 6 with the full happy path; also add module-level imports and constants) +- Modify: `tests/test_coda_interactive.py` (append happy-path test + export-failure test + agent-matrix test) + +- [ ] **Step 1: Write failing tests.** + +Append to `tests/test_coda_interactive.py`: + +```python +def test_coda_interactive_export_failure_cleans_partial_dir(monkeypatch, tmp_path): + """If export raises mid-way, the partial project dir is removed and the PTY is closed.""" + from unittest.mock import MagicMock + from coda_mcp import mcp_server + + monkeypatch.setenv("HOME", str(tmp_path)) + + fake_repo = MagicMock() + fake_repo.id = 123 + fake_repo.path = "/Workspace/Users/x/proj" + fake_client = MagicMock() + fake_client.repos.list.return_value = [fake_repo] + monkeypatch.setattr(mcp_server, "WorkspaceClient", lambda: fake_client) + + # PTY-creation hook returns a deterministic id we can predict. 
+ monkeypatch.setattr( + mcp_server, "_app_create_session", lambda **kw: "pty-exportfail-id", + ) + + closed = [] + monkeypatch.setattr( + mcp_server, "_app_close_session", lambda sid: closed.append(sid), + ) + + def fake_export(client, workspace_path, dest_dir): + # Create the dir + a partial file, then raise. + os.makedirs(dest_dir, exist_ok=True) + with open(os.path.join(dest_dir, "partial.txt"), "w") as f: + f.write("partial") + raise RuntimeError("simulated export failure") + + monkeypatch.setattr(mcp_server, "export_workspace_tree", fake_export) + + # send_input hook should NOT be called for export-failure path (we close before launch). + sent = [] + monkeypatch.setattr( + mcp_server, "_app_send_input", lambda sid, payload: sent.append((sid, payload)), + ) + + result_str = asyncio.run(mcp_server.coda_interactive( + prompt="hello", + workspace_path="/Workspace/Users/x/proj", + )) + result = json.loads(result_str) + + assert result["status"] == "error" + assert "export" in result["error"].lower() + # PTY was created — must be closed on failure. + assert "pty-exportfail-id" in closed, "PTY must be closed when export fails" + # Project dir cleaned up. 
+ project_dir = tmp_path / ".coda" / "projects" / "pty-exportfail-id" + assert not project_dir.exists(), "Partial project dir must be removed after export failure" + + +def test_coda_interactive_happy_path_sends_agent_command_and_prompt(monkeypatch, tmp_path): + """End-to-end mock: export succeeds, PTY created, cd + agent + prompt sent in order.""" + from unittest.mock import MagicMock + from coda_mcp import mcp_server + + monkeypatch.setenv("HOME", str(tmp_path)) + + fake_repo = MagicMock() + fake_repo.id = 123 + fake_repo.path = "/Workspace/Users/x/proj" + fake_client = MagicMock() + fake_client.repos.list.return_value = [fake_repo] + monkeypatch.setattr(mcp_server, "WorkspaceClient", lambda: fake_client) + + monkeypatch.setattr( + mcp_server, + "export_workspace_tree", + lambda client, ws_path, dest_dir: os.makedirs(dest_dir, exist_ok=True), + ) + monkeypatch.setattr( + mcp_server, "_app_create_session", lambda **kw: "pty-happy-id", + ) + + sent_to_pty = [] + monkeypatch.setattr( + mcp_server, + "_app_send_input", + lambda sid, payload: sent_to_pty.append((sid, payload)), + ) + + # Stub the sleep so the test runs fast. + monkeypatch.setattr(mcp_server, "_PROMPT_SEED_DELAY_S", 0) + + monkeypatch.setattr( + mcp_server.url_builder, + "build_viewer_url", + lambda pty_id: f"https://test.example/?session={pty_id}", + ) + + result_str = asyncio.run(mcp_server.coda_interactive( + prompt="continue debugging the auth flow", + workspace_path="/Workspace/Users/x/proj", + agent="claude", + )) + result = json.loads(result_str) + + assert result["status"] == "launched" + assert result["agent"] == "claude" + assert result["viewer_url"] == "https://test.example/?session=pty-happy-id" + assert result["project_dir"].endswith("/pty-happy-id") + + # Three PTY writes, in order: cd, agent command, prompt. 
+ assert len(sent_to_pty) == 3, f"Expected 3 PTY writes; got {sent_to_pty}" + assert sent_to_pty[0][0] == "pty-happy-id" + assert sent_to_pty[0][1].startswith("cd "), \ + f"First write should be cd; got {sent_to_pty[0][1]!r}" + assert sent_to_pty[1] == ("pty-happy-id", "claude\n") + assert sent_to_pty[2] == ("pty-happy-id", "continue debugging the auth flow\n") + + +def test_coda_interactive_agent_command_matrix(monkeypatch, tmp_path): + """Each allowed agent maps to its expected launch command.""" + from unittest.mock import MagicMock + from coda_mcp import mcp_server + + expected = { + "claude": "claude\n", + "hermes": "hermes chat\n", + "codex": "codex\n", + "gemini": "gemini\n", + "opencode": "opencode\n", + } + + for agent, expected_cmd in expected.items(): + monkeypatch.setenv("HOME", str(tmp_path / agent)) + + fake_repo = MagicMock(); fake_repo.id = 1; fake_repo.path = "/W/x/p" + fake_client = MagicMock() + fake_client.repos.list.return_value = [fake_repo] + monkeypatch.setattr(mcp_server, "WorkspaceClient", lambda: fake_client) + monkeypatch.setattr( + mcp_server, "export_workspace_tree", + lambda client, ws_path, dest_dir: os.makedirs(dest_dir, exist_ok=True), + ) + monkeypatch.setattr( + mcp_server, "_app_create_session", lambda **kw: f"pty-{agent}", + ) + sent = [] + monkeypatch.setattr( + mcp_server, "_app_send_input", lambda sid, p: sent.append(p), + ) + monkeypatch.setattr(mcp_server, "_PROMPT_SEED_DELAY_S", 0) + monkeypatch.setattr( + mcp_server.url_builder, "build_viewer_url", + lambda pty_id: f"https://test/?s={pty_id}", + ) + + result_str = asyncio.run(mcp_server.coda_interactive( + prompt="x", workspace_path="/W/x/p", agent=agent, + )) + result = json.loads(result_str) + assert result["status"] == "launched", f"agent {agent}: {result}" + + # sent[0] is cd, sent[1] is the agent command, sent[2] is the prompt. 
+ assert sent[1] == expected_cmd, \ + f"agent {agent}: expected {expected_cmd!r}, got {sent[1]!r}" +``` + +- [ ] **Step 2: Run and verify failure.** + +```bash +.venv/bin/python -m pytest tests/test_coda_interactive.py -v 2>&1 | tail -15 +``` + +Expected: 3 new tests fail (stub returns "Not yet implemented", happy-path assertions trip). + +- [ ] **Step 3: Implement the full happy path.** + +In `coda_mcp/mcp_server.py`: + +(a) Near the existing imports at the top of the file, add: + +```python +import shlex +import time +from coda_mcp import url_builder +from coda_mcp.workspace_export import export_workspace_tree +``` + +(b) Near other module-level constants, add: + +```python +_PROMPT_SEED_DELAY_S = 2 # seconds to wait for agent to initialize before pasting prompt + +_AGENT_LAUNCH_CMDS = { + "claude": "claude", + "hermes": "hermes chat", + "codex": "codex", + "gemini": "gemini", + "opencode": "opencode", +} +``` + +(c) Replace the trailing stub `return json.dumps({"status": "error", "error": "Not yet implemented (stub)."})` in `coda_interactive` (the one added by Task 6 after the branch-update block) with the full implementation: + +```python + # Create PTY FIRST so we have its session_id for the project_dir name. + if _app_create_session is None: + return json.dumps({ + "status": "error", + "error": "PTY hook not wired", + }) + + pty_session_id = None + project_dir = None + try: + pty_session_id = _app_create_session( + label=f"{agent}-interactive", + replay_only=False, + ) + + # Build the project dir at the canonical path keyed by PTY id. + project_dir = os.path.join( + os.path.expanduser("~/.coda/projects"), + pty_session_id, + ) + + # Export the Workspace tree into project_dir. + try: + export_workspace_tree(client, workspace_path, project_dir) + except Exception as e: + # Close the PTY and clean up the partial dir. 
+ if _app_close_session is not None: + try: + _app_close_session(pty_session_id) + except Exception: + pass + import shutil + if os.path.isdir(project_dir): + shutil.rmtree(project_dir, ignore_errors=True) + return json.dumps({ + "status": "error", + "error": f"Failed to export workspace tree: {e}", + }) + + # cd into the project dir. + if _app_send_input is None: + return json.dumps({ + "status": "error", + "error": "PTY send hook not wired", + }) + _app_send_input(pty_session_id, f"cd {shlex.quote(project_dir)}\n") + + # Launch the agent. + launch_cmd = _AGENT_LAUNCH_CMDS[agent] + _app_send_input(pty_session_id, launch_cmd + "\n") + + # Wait briefly for agent initialization, then paste the prompt. + time.sleep(_PROMPT_SEED_DELAY_S) + _app_send_input(pty_session_id, prompt + "\n") + + viewer_url = url_builder.build_viewer_url(pty_session_id) + + return json.dumps({ + "status": "launched", + "viewer_url": viewer_url, + "agent": agent, + "project_dir": project_dir, + "workspace_path": workspace_path, + "branch": branch, + "instructions": ( + "Open viewer_url to attach. The agent is loaded with the " + "project files exported from Workspace and your kickoff " + "prompt typed. Type the agent's quit command (e.g. /quit) " + "and then `exit` to end the session. Note: git history is " + "NOT available in the session — files are an export, not " + "a clone." + ), + }) + except Exception as e: + # Catch-all: ensure no resource leak. + if pty_session_id and _app_close_session is not None: + try: + _app_close_session(pty_session_id) + except Exception: + pass + if project_dir and os.path.isdir(project_dir): + import shutil + shutil.rmtree(project_dir, ignore_errors=True) + return json.dumps({ + "status": "error", + "error": f"coda_interactive failed: {e}", + }) +``` + +Delete the now-unused `# TODO(Task 7+)` comments from Task 6's stub if they remain. 
+ +- [ ] **Step 4: Run tests and verify pass.** + +```bash +.venv/bin/python -m pytest tests/test_coda_interactive.py -v 2>&1 | tail -15 +``` + +Expected: 8 passed (2 from Task 5 + 3 from Task 6 + 3 from Task 7). If any earlier test breaks because they didn't anticipate `_app_send_input` being called (the export-failure test from Task 6 patches `_app_create_session` but not `_app_send_input`), patch it accordingly with `monkeypatch.setattr(mcp_server, "_app_send_input", lambda *a, **k: None)`. + +- [ ] **Step 5: Run the full suite.** + +```bash +.venv/bin/python -m pytest tests/ --ignore=tests/e2e -q 2>&1 | tail -5 +``` + +Expected: 537+ passed, 15 skipped. + +- [ ] **Step 6: Commit.** + +```bash +git add coda_mcp/mcp_server.py tests/test_coda_interactive.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "feat: coda_interactive end-to-end happy path + +Combined task: creates the PTY first (to get its id), builds the project +dir at ~/.coda/projects/<pty_session_id>/, exports the Workspace tree into it, +cds the PTY into the dir, launches the chosen agent, waits 2s for +initialization, then pastes the prompt as the first user input. +Returns the viewer URL. + +Agent matrix (claude/hermes/codex/gemini/opencode) maps to each +agent's known interactive launch command. Export failure cleanly +closes the PTY and removes the partial project dir." +``` + +**Acknowledgment**: Task 2's `cwd` kwarg on `mcp_create_pty_session` ends up unused by this implementation (we `cd` via PTY input instead because the project_dir doesn't exist when the PTY is spawned). Leaving the tested optional kwarg in place is acceptable; reverting is more churn for no behavioral gain. + +--- + +## Task 8: Register `coda_interactive` in Flask fallback dispatch + +`coda_mcp/mcp_endpoint.py` has a Flask-based MCP fallback used in non-ASGI environments. It needs `coda_interactive` in its dispatch table. 
+ +**Files:** +- Modify: `coda_mcp/mcp_endpoint.py` (imports + `_TOOL_DISPATCH`) + +- [ ] **Step 1: Read the existing dispatch.** + +```bash +grep -n "_TOOL_DISPATCH\|coda_run\|coda_inbox\|coda_get_result" coda_mcp/mcp_endpoint.py +``` + +Confirm the dispatch is a dict keyed by tool name → function reference. + +- [ ] **Step 2: Add the import + dispatch entry.** + +In `coda_mcp/mcp_endpoint.py`, find the import block that pulls in the existing tools (around line 22): + +```python +from coda_mcp.mcp_server import ( + mcp as mcp_instance, + coda_run, + coda_inbox, + coda_get_result, +) +``` + +Add `coda_interactive`: + +```python +from coda_mcp.mcp_server import ( + mcp as mcp_instance, + coda_run, + coda_inbox, + coda_get_result, + coda_interactive, +) +``` + +Find `_TOOL_DISPATCH` (around line 31): + +```python +_TOOL_DISPATCH = { + "coda_run": coda_run, + "coda_inbox": coda_inbox, + "coda_get_result": coda_get_result, +} +``` + +Add `coda_interactive`: + +```python +_TOOL_DISPATCH = { + "coda_run": coda_run, + "coda_inbox": coda_inbox, + "coda_get_result": coda_get_result, + "coda_interactive": coda_interactive, +} +``` + +- [ ] **Step 3: Run the test suite.** + +```bash +.venv/bin/python -m pytest tests/ --ignore=tests/e2e -q 2>&1 | tail -5 +``` + +Expected: all pass. + +- [ ] **Step 4: Spot-check the Flask fallback path with a quick test.** + +```bash +.venv/bin/python -c "from coda_mcp.mcp_endpoint import _TOOL_DISPATCH; print(list(_TOOL_DISPATCH))" +``` + +Expected output: `['coda_run', 'coda_inbox', 'coda_get_result', 'coda_interactive']` + +- [ ] **Step 5: Commit.** + +```bash +git add coda_mcp/mcp_endpoint.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "feat: wire coda_interactive into Flask-fallback MCP dispatch + +The Flask blueprint at coda_mcp/mcp_endpoint.py is the WSGI-compatible +fallback used by tests and local dev. Without this entry, those paths +can't call coda_interactive." 
+``` + +--- + +## Task 9: Update FastMCP `instructions` string + +The instructions block at `coda_mcp/mcp_server.py:43-70` currently describes only `coda_run` (after Todo 1's update). Add a paragraph for `coda_interactive` so MCP-client LLMs understand the new tool's contract. + +**Files:** +- Modify: `coda_mcp/mcp_server.py` (the `instructions` string passed to `FastMCP(...)`) + +- [ ] **Step 1: Read the current instructions block.** + +```bash +grep -n "SHARE THE REPLAY URL\|FIRE AND FORGET\|WORKFLOW" coda_mcp/mcp_server.py | head -10 +``` + +Open the file and locate the `FastMCP(name=..., instructions="""...""")` block. + +- [ ] **Step 2: Add the new paragraph.** + +After the existing `SHARE THE REPLAY URL` paragraph and before the `WORKFLOW` paragraph, insert: + +``` +INTERACTIVE HANDOFF (coda_interactive): When the user wants a human to drive +a coding agent in CoDA — not autonomous execution — call coda_interactive +instead of coda_run. The user must have their project as a Databricks +Workspace Git Folder, and any in-progress changes must be committed and +pushed to the Git Folder's remote BEFORE the call. The tool exports the +committed HEAD state into a Coda-local directory, launches the chosen agent +(claude default; also hermes, codex, gemini, opencode), and types the prompt +as the first user input. Return shape includes a viewer_url the user opens +to attach — they then drive the session until they exit. Interactive sessions +do NOT appear in coda_inbox; coda_get_result returns nothing for them. The +viewer URL is the only handle — pass it to the user immediately. Note that +git history is NOT available inside the session (files-only export); if the +user needs history context, include a git log summary in the prompt string. +``` + +The exact wording can be tightened to match the existing paragraphs' tone — read the surrounding text first. 
+ +- [ ] **Step 3: Run the suite.** + +```bash +.venv/bin/python -m pytest tests/ --ignore=tests/e2e -q 2>&1 | tail -5 +``` + +Expected: all pass (no tests assert on instruction text strings). + +- [ ] **Step 4: Commit.** + +```bash +git add coda_mcp/mcp_server.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "docs: add INTERACTIVE HANDOFF paragraph to MCP instructions + +Describes coda_interactive's contract for calling LLMs: Git Folder +pre-condition, viewer URL handoff, no coda_inbox / coda_get_result +integration, git history unavailable trade-off. Prevents calling LLMs +from treating coda_interactive like coda_run (e.g., trying to poll +results)." +``` + +--- + +## Task 10: Add regression guard test + +Defends the mode separation: calling `coda_run` must NOT create anything under `~/.coda/projects/`. Protects against future drift that accidentally couples the two modes. + +**Files:** +- Modify: `tests/test_replay_only_flag.py` (append to keep regression guards together) + +- [ ] **Step 1: Append the test.** + +Append to `tests/test_replay_only_flag.py`: + +```python +@_pty_skip +def test_coda_run_does_not_create_project_dir(tmp_path, monkeypatch): + """Regression guard: coda_run is Mode 3 (replay-only, no project dir). + Only coda_interactive (Mode 2) creates dirs under ~/.coda/projects/. + + If a future change accidentally calls export_workspace_tree from + coda_run or otherwise creates a per-session project dir, this test fires. + """ + import asyncio + import json + from app import sessions, mcp_close_pty_session + from coda_mcp import mcp_server, task_manager + + monkeypatch.setenv("HOME", str(tmp_path)) + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path / "sessions")) + # Stop the watcher from racing the test. 
+ monkeypatch.setattr(mcp_server, "_watch_task", lambda *a, **kw: None) + + result_str = asyncio.run(mcp_server.coda_run( + prompt="ignored", email="t@example.com", + )) + result = json.loads(result_str) + pty_id = None + try: + sess = task_manager._read_session(result["session_id"]) + pty_id = sess.get("pty_session_id") + + # Project dir must NOT exist for coda_run. + projects_root = os.path.join(str(tmp_path), ".coda", "projects") + assert not os.path.isdir(projects_root) or not os.listdir(projects_root), ( + f"coda_run unexpectedly created project dirs under {projects_root}: " + f"{os.listdir(projects_root) if os.path.isdir(projects_root) else 'n/a'}" + ) + finally: + if pty_id is not None: + mcp_close_pty_session(pty_id) +``` + +- [ ] **Step 2: Run.** + +```bash +.venv/bin/python -m pytest tests/test_replay_only_flag.py -v 2>&1 | tail -10 +``` + +Expected: all pass (this test specifically asserts coda_run's NEGATIVE behavior). + +- [ ] **Step 3: Run the full suite.** + +```bash +.venv/bin/python -m pytest tests/ --ignore=tests/e2e -q 2>&1 | tail -5 +``` + +Expected: ~540 passed, 15 skipped (depending on PTY availability — some Task 7 tests skip on this Mac). + +- [ ] **Step 4: Commit.** + +```bash +git add tests/test_replay_only_flag.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "test: regression guard against coda_run creating project dirs + +Mode separation is the spine of the three-mode framework: coda_run is +replay-only (no project_dir, no workspace export), coda_interactive +is the only path that creates ~/.coda/projects/. If a future refactor +accidentally couples them, this test fails loudly." +``` + +--- + +## Final verification (post-task) + +- [ ] **F1: Full suite green.** + +```bash +.venv/bin/python -m pytest tests/ --ignore=tests/e2e -q 2>&1 | tail -5 +``` + +Expected: all pass. 
+ +- [ ] **F2: No grace/dead references re-introduced.** + +```bash +grep -rn "grace\|GRACE_PERIOD\|_mark_grace\|_bump_session_last_poll\|_schedule_deferred_close" coda_mcp/ app.py | grep -v "graceful\|GRACEFUL_" +``` + +Expected: no matches. + +- [ ] **F3: Mode separation still holds.** + +```bash +grep -n "_TOOL_DISPATCH" coda_mcp/mcp_endpoint.py +.venv/bin/python -c "from coda_mcp.mcp_endpoint import _TOOL_DISPATCH; print(sorted(_TOOL_DISPATCH))" +``` + +Expected: `['coda_get_result', 'coda_inbox', 'coda_interactive', 'coda_run']`. + +- [ ] **F4: Manual smoke (optional, requires deployed environment + a real Workspace Git Folder).** + +1. Restart the app: `uvicorn coda_mcp.mcp_asgi:app`. +2. From an MCP client, call `coda_interactive(prompt="explain this repo", workspace_path="/Workspace/Users/you@db.com/your-git-folder")`. +3. Open the returned `viewer_url`. Confirm: live attach lands you in a session with `claude` running, prompt visible in the chat, CWD is the project dir. +4. Type `/quit` then `exit`. Reattach to the URL — confirm replay or expired-session page. +5. SSH into the container (or check `/health`) — confirm `~/.coda/projects/<pty_session_id>/` is gone. + +--- + +## Self-review checklist (run on completed plan) + +1. **Spec coverage** ✓ + - §1 Tool signature → Task 5 (stub + signature), Task 6 (workspace lookup/branch), Task 7 (full happy path: export+PTY+launch+prompt+viewer_url) + - §1a Caller pre-condition → Task 9 (MCP instructions string) + - §2 Agent launch matrix → Task 7 (`_AGENT_LAUNCH_CMDS`) + - §3 Project source export → Task 3 (`workspace_export.py`) + Task 7 wiring + - §4 Prompt seeding → Task 7 (`_PROMPT_SEED_DELAY_S` + send_input ordering) + - §5 PTY lifecycle → Task 4 (cleanup hook) + - §6 Where this lives + env-strip prereq → Task 1 (env-strip), Task 2 (cwd kwarg), Task 8 (Flask dispatch), Task 9 (instructions) + - Regression guard → Task 10 + +2. **Placeholders** ✓ — every step has concrete code/commands. 
The `# TODO(Task N+)` markers inside intermediate `coda_interactive` versions are explicit hand-offs between tasks, not deferred work. + +3. **Type consistency** ✓ + - `_ALLOWED_AGENTS: set[str]` — used identically in Tasks 5 and 7 + - `_AGENT_LAUNCH_CMDS: dict[str, str]` — defined in Task 7 + - `_PROMPT_SEED_DELAY_S: int` — defined in Task 7 + - `pty_session_id: str` — comes from `_app_create_session(...)`'s return; project_dir built from it + - `workspace_path: str`, `branch: str = ""`, `agent: str = "claude"` consistent across signature, tests, and instructions + +4. **Ordering safety** ✓ + - Prereq env-strip (Task 1) runs first — no Todo-2-specific dependency, just security cleanup + - `cwd` kwarg (Task 2) added before any caller uses it (Task 7, though ultimately unused — see Task 7 acknowledgment) + - `workspace_export.py` (Task 3) created before `coda_interactive` imports it (Task 7) + - Cleanup hook (Task 4) added before any project dir gets created (Task 7) + - `coda_interactive` built incrementally Tasks 5→7 with each task's tests gating progress + - Flask dispatch (Task 8) and instructions (Task 9) come after the tool itself exists + - Regression guard (Task 10) verifies the final state + +5. **Test discipline** ✓ + - Every code-adding task has a failing test in Step 1, verified failure in Step 2, implementation in Step 3, verified pass in Step 4 + - Tasks 8 (wiring) and 9 (docs) are not TDD but are minimal-risk + - Final regression guard (Task 10) defends against future drift + +--- + +## Plan critique gate + +**Cleared** (2026-05-28). Critic verdict: APPROVE WITH CHANGES. All flagged issues incorporated: + +1. **CRITICAL — Task 1 `sessions[sid]["env"]` key didn't exist.** Fixed: Task 1 now has an explicit Step 2 that adds the `"env"` key to the session dict before the env-strip refactor. Step 3 verifies the test fails for the RIGHT reason (credentials present), not silently passes. +2. 
**MAJOR — Task 7→Task 8 orphaned-state rework.** Fixed: Tasks 7 and 8 merged into a single Task 7 that creates the PTY FIRST, then builds the project_dir keyed by the PTY's session_id, then exports + cds + launches + seeds. Eliminates the intermediate state where the project dir's name didn't match the PTY's actual session id. +3. **MAJOR — Line number drift.** Fixed: `app.py:1402` → `app.py:1420`. `mcp_server.py:218` → "around line 190; insert between `coda_run` (ends near 289) and `coda_inbox`". Other line refs verified accurate. + +Original 10 critique questions, all answered in the critique pass: + +1. **Task 7 chicken-and-egg** — Resolved by merging Tasks 7+8. +2. **`cwd` kwarg unused** — Acceptable; tested optional kwarg left in place. Documented in Task 7 Acknowledgment. +3. **`WorkspaceClient` monkeypatch target** — Confirmed correct. Task 6 imports it module-level. +4. **`sessions[sid]["env"]` key** — Added explicitly in Task 1 Step 2 (was missing). +5. **`_PROMPT_SEED_DELAY_S` flake risk** — Tests patch to 0. Acceptable. +6. **`_app_create_session is None` null-check** — Consistent with `coda_run`'s pattern. +7. **`os.makedirs(exist_ok=True)`** — UUID collision probability negligible. Acceptable. +8. **Per-task commits** — Matches Todo 1's commit conventions. +9. **Line numbers** — Two references corrected (see MAJOR #3 above). +10. **Test count expectation** — Plausible estimates; exact counts depend on PTY availability. + +Plus eight additional critic-eye questions (spec coverage, ordering, TDD discipline, line numbers, test correctness, fragile assumptions, plan gate), all resolved. See the critic's verdict in the conversation history. + +Plan is ready for execution. 
diff --git a/docs/superpowers/plans/2026-05-28-coda-interactive-terminal-pull.md b/docs/superpowers/plans/2026-05-28-coda-interactive-terminal-pull.md new file mode 100644 index 0000000..0b244a1 --- /dev/null +++ b/docs/superpowers/plans/2026-05-28-coda-interactive-terminal-pull.md @@ -0,0 +1,581 @@ +# `coda_interactive` Terminal-Side Pull — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: superpowers:subagent-driven-development or superpowers:executing-plans. Steps use `- [ ]` checkboxes. + +**Goal:** Replace `coda_interactive`'s broken server-side Workspace export (runs as the app SP, which can't read the user's folder) with a terminal-side `databricks workspace export-dir` pull (runs as the user), guarded by a split wait + a server-side filesystem post-check. Delete `workspace_export.py`. + +**Architecture:** The MCP server types a chained `cd <project_dir> && databricks workspace export-dir <source_path> ./<name> && cd <name>` into the PTY (which is authenticated as the app owner), waits for the pull to settle, verifies on the local filesystem that files arrived, then launches the agent and seeds the prompt. No `WorkspaceClient` in the tool anymore. + +**Tech stack:** Python 3.11, pytest, FastMCP. No new dependencies. Run tests with `uv run pytest`. + +**Reference:** `docs/superpowers/specs/2026-05-28-coda-interactive-terminal-pull-design.md` (full design, error table, risks). + +--- + +## Files + +- **Modify:** `coda_mcp/mcp_server.py` — remove export import + `WorkspaceClient` usage; add `re` import; add `_safe_dirname`, `_normalize_workspace_path`; refactor `_wait_for_agent_ready` → `_wait_for_output_stable` + wrapper; add `_EXPORT_MAX_WAIT_S`/`_EXPORT_STABILITY_S`; rewrite `coda_interactive` body. +- **Delete:** `coda_mcp/workspace_export.py`, `tests/test_workspace_export.py`. +- **Modify:** `tests/test_replay_only_flag.py` — refresh stale comment (line ~166). +- **Rewrite:** `tests/test_coda_interactive.py`. 
+- **Modify:** `tests/test_mcp_server.py` — add helper + wrapper tests. + +## Pre-flight + +- Worktree: `/Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp`, branch `feat/coda-mcp-interactive-handoff` (already merged with main / deps bump, HEAD `2dd66aa`). +- Commit identity: `-c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty"`. No AI co-author. +- `databricks workspace export-dir SOURCE TARGET` is verified: creates TARGET, recursive, auto notebook extensions, `--overwrite` flag (not needed here). + +--- + +## Task 1: Helpers + wait-helper refactor (TDD) + +**Files:** Modify `coda_mcp/mcp_server.py`; add tests to `tests/test_mcp_server.py`. + +- [ ] **Step 1: Write failing tests** — append to `tests/test_mcp_server.py`: + +```python +class TestInteractiveHelpers: + def test_safe_dirname_basename(self): + from coda_mcp.mcp_server import _safe_dirname + assert _safe_dirname("/Users/x@y.com/WAM") == "WAM" + assert _safe_dirname("/Users/x@y.com/WAM/") == "WAM" + + def test_safe_dirname_sanitizes(self): + from coda_mcp.mcp_server import _safe_dirname + assert _safe_dirname("/Users/x/My Project!") == "My_Project_" + + def test_safe_dirname_empty_fallback(self): + from coda_mcp.mcp_server import _safe_dirname + assert _safe_dirname("/") == "workspace" + assert _safe_dirname("") == "workspace" + + def test_normalize_strips_workspace_prefix(self): + from coda_mcp.mcp_server import _normalize_workspace_path + assert _normalize_workspace_path("/Workspace/Users/x/WAM") == "/Users/x/WAM" + + def test_normalize_leaves_plain_path(self): + from coda_mcp.mcp_server import _normalize_workspace_path + assert _normalize_workspace_path("/Users/x/WAM") == "/Users/x/WAM" + assert _normalize_workspace_path("/Users/x/WAM/") == "/Users/x/WAM" + + @pytest.mark.asyncio + async def test_wait_for_agent_ready_delegates(self, monkeypatch): + """_wait_for_agent_ready calls _wait_for_output_stable with prompt-seed constants.""" + from coda_mcp import 
mcp_server + seen = {} + async def fake_stable(pty, max_wait, stability): + seen["args"] = (pty, max_wait, stability) + monkeypatch.setattr(mcp_server, "_wait_for_output_stable", fake_stable) + await mcp_server._wait_for_agent_ready("pty-1") + assert seen["args"] == ("pty-1", mcp_server._PROMPT_SEED_MAX_WAIT_S, mcp_server._PROMPT_SEED_STABILITY_S) +``` + +- [ ] **Step 2: Run, expect FAIL** — `uv run pytest tests/test_mcp_server.py::TestInteractiveHelpers -v` → all fail (symbols don't exist). + +- [ ] **Step 3: Add `re` import** to `coda_mcp/mcp_server.py` (near `import os` at line 19, keep alphabetical-ish with the stdlib group): + +```python +import re +``` + +- [ ] **Step 4: Add the two helpers** in `coda_mcp/mcp_server.py` just above `_ALLOWED_AGENTS` (line 336): + +```python +def _safe_dirname(workspace_path: str) -> str: + """Local directory name for the pulled folder = sanitized basename.""" + base = os.path.basename(workspace_path.rstrip("/")) + safe = re.sub(r"[^A-Za-z0-9._-]", "_", base) + return safe or "workspace" + + +def _normalize_workspace_path(workspace_path: str) -> str: + """Canonical Workspace API path: drop the /Workspace FUSE prefix if present.""" + p = workspace_path.rstrip("/") + if p.startswith("/Workspace/"): + p = p[len("/Workspace"):] + return p +``` + +- [ ] **Step 5: Refactor the wait helper.** Replace the existing `_wait_for_agent_ready` definition (lines 346-380, the `async def _wait_for_agent_ready(...)` through the end of its `while` loop) with a generalized function plus a thin wrapper. 
Also add the two new constants next to the existing ones (after line 343): + +Add constants (after `_PROMPT_SEED_STABILITY_S = 1.0`): + +```python +_EXPORT_MAX_WAIT_S = 120.0 # generous; export-dir prints per-file so it won't prematurely stabilize mid-pull +_EXPORT_STABILITY_S = 1.5 +``` + +Replace the function: + +```python +async def _wait_for_output_stable(pty_session_id: str, max_wait: float, stability: float) -> None: + """Poll the PTY output buffer; return when it stabilizes or max_wait elapses. + + Stability = buffer length unchanged for ``stability`` seconds, after at + least one byte has appeared. If the session disappears mid-wait, return. + """ + from app import sessions + loop = asyncio.get_running_loop() + deadline = loop.time() + max_wait + last_len = -1 + stable_since: float | None = None + poll_interval = 0.1 + + while loop.time() < deadline: + await asyncio.sleep(poll_interval) + sess = sessions.get(pty_session_id) + if sess is None: + return + current_len = sum(len(chunk) for chunk in sess.get("output_buffer", [])) + if current_len > 0 and current_len == last_len: + if stable_since is None: + stable_since = loop.time() + elif (loop.time() - stable_since) >= stability: + return + else: + stable_since = None + last_len = current_len + + +async def _wait_for_agent_ready(pty_session_id: str) -> None: + """Wait for an agent TUI to settle (prompt-seed budget). Wrapper for back-compat.""" + await _wait_for_output_stable( + pty_session_id, _PROMPT_SEED_MAX_WAIT_S, _PROMPT_SEED_STABILITY_S + ) +``` + +- [ ] **Step 6: Run, expect PASS** — `uv run pytest tests/test_mcp_server.py::TestInteractiveHelpers -v` → all pass. Then `uv run pytest tests/test_mcp_server.py -q` → no regressions (coda_run still uses `_wait_for_agent_ready`). + +- [ ] **Step 7: Ruff** — `uv run ruff check coda_mcp/mcp_server.py tests/test_mcp_server.py` → clean. 
+ +- [ ] **Step 8: Commit** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + add coda_mcp/mcp_server.py tests/test_mcp_server.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + commit -m "feat: add _safe_dirname/_normalize_workspace_path + generalize wait helper + +_wait_for_output_stable(pty, max_wait, stability) is the parametrized poller; +_wait_for_agent_ready becomes a thin wrapper preserving the 5.0/1.0 budget so +coda_run is unaffected. Adds _EXPORT_MAX_WAIT_S/_EXPORT_STABILITY_S for the +upcoming terminal-side pull wait." +``` + +--- + +## Task 2: Rewrite `coda_interactive` + delete export module (TDD) + +**Files:** Modify `coda_mcp/mcp_server.py`; delete `coda_mcp/workspace_export.py` + `tests/test_workspace_export.py`; rewrite `tests/test_coda_interactive.py`; touch `tests/test_replay_only_flag.py` comment. + +- [ ] **Step 1: Rewrite `tests/test_coda_interactive.py`** to the new contract. Replace the whole file with: + +```python +"""Tests for coda_interactive — terminal-side workspace pull (no server-side export).""" +import json +import os + +import pytest + +from coda_mcp import mcp_server + + +@pytest.fixture +def wired(monkeypatch, tmp_path): + """Wire PTY hooks with recording mocks; HOME -> tmp so project_dir is sandboxed. + + The _app_send_input mock simulates a SUCCESSFUL export-dir by creating the + target dir + a file when it sees the pull command. Tests that want the + failure path override `simulate_pull` to False. + """ + monkeypatch.setenv("HOME", str(tmp_path)) + inputs: list[str] = [] + state = {"pty_id": "pty-abc123", "simulate_pull": True, "closed": []} + + def fake_create(label, replay_only=False, **kw): + return state["pty_id"] + + def fake_send(pty_id, text): + inputs.append(text) + # Simulate export-dir landing files on disk. 
+ if state["simulate_pull"] and "export-dir" in text: + # project_dir = ~/.coda/projects/; name parsed from the command tail "cd " + project_dir = os.path.join(os.path.expanduser("~/.coda/projects"), state["pty_id"]) + # name is the final `cd ` token + name = text.rstrip().rsplit("cd ", 1)[-1].strip().strip("'\"") + target = os.path.join(project_dir, name) + os.makedirs(target, exist_ok=True) + with open(os.path.join(target, "README.md"), "w") as f: + f.write("# hi") + + def fake_close(pty_id): + state["closed"].append(pty_id) + + async def fake_wait(*a, **kw): + return None + + monkeypatch.setattr(mcp_server, "_app_create_session", fake_create) + monkeypatch.setattr(mcp_server, "_app_send_input", fake_send) + monkeypatch.setattr(mcp_server, "_app_close_session", fake_close) + monkeypatch.setattr(mcp_server, "_wait_for_output_stable", fake_wait) + monkeypatch.setattr(mcp_server, "_wait_for_agent_ready", fake_wait) + monkeypatch.setattr(mcp_server.url_builder, "build_viewer_url", lambda pid: f"https://viewer/{pid}") + return inputs, state + + +@pytest.mark.asyncio +async def test_pull_command_is_sent_first(wired): + inputs, _ = wired + await mcp_server.coda_interactive( + prompt="analyze", workspace_path="/Workspace/Users/x@y.com/WAM", agent="claude") + first = inputs[0] + assert "databricks workspace export-dir" in first + assert "/Users/x@y.com/WAM" in first # /Workspace prefix stripped + assert "/Workspace/Users" not in first + assert "./WAM" in first and first.rstrip().endswith("WAM") # cd tail + + +@pytest.mark.asyncio +async def test_agent_launches_after_successful_pull(wired): + inputs, _ = wired + await mcp_server.coda_interactive( + prompt="go", workspace_path="/Users/x/WAM", agent="claude") + assert any(t.strip() == "claude" for t in inputs) + + +@pytest.mark.asyncio +async def test_prompt_seeded_with_context_line(wired): + inputs, _ = wired + await mcp_server.coda_interactive( + prompt="DO THE THING", workspace_path="/Users/x/WAM", agent="claude") + 
seeded = inputs[-1] + assert "/Users/x/WAM" in seeded + assert "DO THE THING" in seeded + assert "Workspace" in seeded # precondition (clean fail, not ValueError) + assert seeded.index("Workspace") < seeded.index("DO THE THING") # context precedes prompt + + +def test_instructions_drop_stale_export_wording(): + """Server-level MCP instructions must not claim the deleted server-side export.""" + txt = mcp_server.mcp.instructions + assert "server-side snapshot" not in txt + assert "export-dir" in txt # describes the real terminal-side pull mechanism + + +@pytest.mark.asyncio +async def test_empty_pull_returns_error_and_no_launch(wired): + inputs, state = wired + state["simulate_pull"] = False # export-dir produces nothing + out = json.loads(await mcp_server.coda_interactive( + prompt="go", workspace_path="/Users/x/WAM", agent="claude")) + assert out["status"] == "error" + assert state["closed"] == [state["pty_id"]] # PTY closed + assert not any(t.strip() == "claude" for t in inputs) # agent NOT launched + + +@pytest.mark.asyncio +async def test_happy_path_returns_launched(wired): + out = json.loads(await mcp_server.coda_interactive( + prompt="go", workspace_path="/Users/x/WAM", agent="claude")) + assert out["status"] == "launched" + assert out["viewer_url"] == "https://viewer/pty-abc123" + assert out["project_dir"].endswith(os.path.join("pty-abc123", "WAM")) + + +@pytest.mark.asyncio +async def test_unknown_agent_rejected(wired): + out = json.loads(await mcp_server.coda_interactive( + prompt="x", workspace_path="/Users/x/WAM", agent="bogus")) + assert out["status"] == "error" and "Unknown agent" in out["error"] + + +@pytest.mark.asyncio +async def test_pty_hook_not_wired(monkeypatch): + monkeypatch.setattr(mcp_server, "_app_create_session", None) + monkeypatch.setattr(mcp_server, "_app_send_input", None) + out = json.loads(await mcp_server.coda_interactive( + prompt="x", workspace_path="/Users/x/WAM", agent="claude")) + assert out["status"] == "error" and "PTY hook" 
in out["error"] + + +@pytest.mark.asyncio +@pytest.mark.parametrize("agent,cmd", [ + ("claude", "claude"), ("hermes", "hermes chat"), ("codex", "codex"), + ("gemini", "gemini"), ("opencode", "opencode"), +]) +async def test_agent_matrix(wired, agent, cmd): + inputs, _ = wired + await mcp_server.coda_interactive( + prompt="go", workspace_path="/Users/x/WAM", agent=agent) + assert any(t.strip() == cmd for t in inputs) + + +def test_no_blocking_sleep_in_source(): + import inspect + src = inspect.getsource(mcp_server.coda_interactive) + assert "time.sleep(" not in src + + +def test_no_workspaceclient_in_module(): + """The export-era WorkspaceClient import/use is gone from the module.""" + import inspect + src = inspect.getsource(mcp_server) + assert "export_workspace_tree" not in src + assert "workspace.get_status(" not in src +``` + +- [ ] **Step 2: Run, expect FAIL** — `uv run pytest tests/test_coda_interactive.py -q` → fails (old behavior still in place; `export_workspace_tree`/`get_status` still present). + +- [ ] **Step 3: Rewrite `coda_interactive`** in `coda_mcp/mcp_server.py`. Replace the entire function body (lines 416-523, from `if agent not in _ALLOWED_AGENTS:` through the catch-all `return`) with: + +```python + if agent not in _ALLOWED_AGENTS: + return json.dumps({ + "status": "error", + "error": f"Unknown agent: {agent!r}. Allowed: {sorted(_ALLOWED_AGENTS)}", + }) + + if _app_create_session is None or _app_send_input is None: + return json.dumps({ + "status": "error", + "error": "PTY hook not wired", + }) + + pty_session_id = None + project_dir = None + try: + # Create PTY FIRST so we have its session_id for the project_dir name. 
+ pty_session_id = _app_create_session( + label=f"{agent}-interactive", + replay_only=False, + ) + project_dir = os.path.join( + os.path.expanduser("~/.coda/projects"), + pty_session_id, + ) + os.makedirs(project_dir, exist_ok=True) + + name = _safe_dirname(workspace_path) + source_path = _normalize_workspace_path(workspace_path) + + # Pull the Workspace folder into ./<name> AS THE USER (terminal creds). + # A failed export-dir short-circuits the && chain, leaving <name> absent; + # the filesystem check below turns that into a real error. + pull_cmd = ( + f"cd {shlex.quote(project_dir)} && " + f"databricks workspace export-dir {shlex.quote(source_path)} {shlex.quote('./' + name)} && " + f"cd {shlex.quote(name)}" + ) + _app_send_input(pty_session_id, pull_cmd + "\n") + + # Wait for the pull to finish (shell goes idle), then verify on disk. + await _wait_for_output_stable( + pty_session_id, _EXPORT_MAX_WAIT_S, _EXPORT_STABILITY_S + ) + + target_dir = os.path.join(project_dir, name) + if not os.path.isdir(target_dir) or not os.listdir(target_dir): + if _app_close_session is not None: + try: + _app_close_session(pty_session_id) + except Exception: + pass + if os.path.isdir(project_dir): + shutil.rmtree(project_dir, ignore_errors=True) + return json.dumps({ + "status": "error", + "error": ( + f"No files were pulled from {workspace_path}. Check the path " + f"exists in the Workspace and that you have read access." + ), + }) + + # Launch the agent (fresh — same proven path as before). + launch_cmd = _AGENT_LAUNCH_CMDS[agent] + _app_send_input(pty_session_id, launch_cmd + "\n") + + # Wait for the agent TUI to settle, then paste the kickoff prompt with a + # context line naming the source so the agent knows where the files came from. 
+ await _wait_for_agent_ready(pty_session_id) + seeded_prompt = ( + f"Your working directory contains files exported from the Databricks " + f"Workspace path {workspace_path}.\n\n{prompt}" + ) + _app_send_input(pty_session_id, seeded_prompt + "\n") + + viewer_url = url_builder.build_viewer_url(pty_session_id) + + return json.dumps({ + "status": "launched", + "viewer_url": viewer_url, + "agent": agent, + "project_dir": target_dir, + "workspace_path": workspace_path, + "instructions": ( + "Open viewer_url to attach. The agent is running in a directory " + "holding the files pulled from your Workspace folder, with your " + "kickoff prompt typed. Type the agent's quit command (e.g. /quit) " + "then `exit` to end the session. Note: files are a snapshot pulled " + "via 'databricks workspace export-dir' — git history is not included." + ), + }) + except Exception as e: + if pty_session_id and _app_close_session is not None: + try: + _app_close_session(pty_session_id) + except Exception: + pass + if project_dir and os.path.isdir(project_dir): + shutil.rmtree(project_dir, ignore_errors=True) + return json.dumps({ + "status": "error", + "error": f"coda_interactive failed: {e}", + }) +``` + +- [ ] **Step 4: Update the `coda_interactive` docstring** (lines 398-414). Replace the body text so it no longer says "exports its file tree / server-side snapshot". New docstring: + +```python + """Launch an interactive agent session in CoDA, handed off via a viewer URL. + + The MCP caller passes a Databricks Workspace directory path. CoDA pulls that + folder onto the session's disk IN THE TERMINAL (authenticated as you) via + ``databricks workspace export-dir``, launches the chosen agent (claude + default) in the pulled directory, auto-types ``prompt`` as the first user + input, and returns a ``viewer_url`` the calling user opens to drive it. + + If the pull produces no files (bad path or no read access) the tool returns + a ``status=error`` and does not launch the agent. 
+ + Interactive sessions do NOT appear in ``coda_inbox`` and ``coda_get_result`` + will not return anything for them. The viewer URL is the only handle. + + ``email`` is accepted for forward-compatibility and is currently unused. + + Allowed agents: claude (default), hermes, codex, gemini, opencode. + """ +``` + +- [ ] **Step 4b: Update the server-level `mcp.instructions` blob** (lines ~95-98) so it no longer claims the deleted server-side export. Replace the exact substring: + +``` +"Folder, ensure the desired branch is checked out and pushed first — " +"the export is a server-side snapshot. The tool exports the directory " +"into a Coda-local working directory, launches the chosen agent " +``` + +with: + +``` +"Folder, ensure the desired branch is checked out first — " +"the pull is a point-in-time snapshot. The tool copies the directory " +"into a Coda-local working directory using your credentials (via " +"`databricks workspace export-dir`), launches the chosen agent " +``` + +This keeps the caller-facing contract (pass `workspace_path`, files-only, no git history) but stops describing a mechanism that no longer exists. Guarded by `test_instructions_drop_stale_export_wording`. + +- [ ] **Step 5: Remove the dead export imports.** In `coda_mcp/mcp_server.py` line 31, delete: + +```python +from coda_mcp.workspace_export import export_workspace_tree, _is_directory +``` + +And remove the `WorkspaceClient` import guard (lines ~33-36) IF nothing else in the file uses `WorkspaceClient`. Verify first: + +```bash +grep -n "WorkspaceClient" coda_mcp/mcp_server.py +``` + +If the only hits are the import guard, delete the guard block: + +```python +try: + from databricks.sdk import WorkspaceClient +except Exception: + WorkspaceClient = None # type: ignore +``` + +If `WorkspaceClient` is used elsewhere, leave the guard and only remove `coda_interactive`'s usage (already done in Step 3). 
+ +- [ ] **Step 6: Delete the export module + its tests** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" rm coda_mcp/workspace_export.py tests/test_workspace_export.py +``` + +- [ ] **Step 7: Refresh the stale comment** in `tests/test_replay_only_flag.py` (~line 166). It currently references `export_workspace_tree`. Read the surrounding lines and reword so it describes the invariant generically (e.g. "must not create a project directory / pull workspace files") without naming the deleted symbol. Do NOT change the test's logic. + +- [ ] **Step 8: Run the target tests, expect PASS** + +```bash +uv run pytest tests/test_coda_interactive.py tests/test_mcp_server.py -v +``` +Expect all green. If `test_pull_command_is_sent_first` fails on the `endswith("WAM")` assertion, inspect the actual `pull_cmd` string and adjust the test's tail assertion to match the real (shlex-quoted) form — the production string is the source of truth for *behavior*, but the command MUST contain `databricks workspace export-dir`, the normalized source, and a final `cd <name>`. + +- [ ] **Step 9: Import sanity** — `uv run python -c "import coda_mcp.mcp_server; import app"` → no ImportError (confirms the deleted module isn't imported anywhere at load time). + +- [ ] **Step 10: Ruff** — `uv run ruff check coda_mcp/mcp_server.py tests/test_coda_interactive.py tests/test_replay_only_flag.py` → clean (watch for now-unused imports like `shutil`/`shlex` — both are still used; confirm). 
+ +- [ ] **Step 11: Commit** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + add coda_mcp/mcp_server.py tests/test_coda_interactive.py tests/test_replay_only_flag.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + commit -m "feat: coda_interactive pulls workspace files in the terminal, not server-side + +Root cause of the empty-session bug: the MCP server's WorkspaceClient runs as +the app service principal, which can't list/export the user's Workspace folder, +and the error was swallowed. Now the tool types 'databricks workspace export-dir' +into the PTY (authed as the user), waits for the pull to settle, verifies files +landed on disk, then launches the agent and seeds the prompt. Deletes +workspace_export.py and the server-side WorkspaceClient/get_status path." +``` + +--- + +## Task 3: Full regression sweep + +**Files:** none (verification only). + +- [ ] **Step 1: Targeted suite** + +```bash +uv run pytest tests/test_coda_interactive.py tests/test_mcp_server.py tests/test_task_manager.py tests/test_databricks_preamble.py tests/test_replay_only_flag.py -v +``` +Expect green. `test_replay_only_flag.py::test_coda_run_creates_pty_with_replay_only_true` is PTY-fd flaky in multi-file runs — if it fails, re-run that file alone; if it passes alone, it's environmental. + +- [ ] **Step 2: Confirm `workspace_export` is fully gone** + +```bash +grep -rn "workspace_export\|export_workspace_tree" coda_mcp/ tests/ || echo "CLEAN — no references remain" +``` +Expect only (at most) the reworded comment in `test_replay_only_flag.py` if you kept any mention; ideally CLEAN. + +- [ ] **Step 3: Ruff over the package** + +```bash +uv run ruff check coda_mcp/ tests/test_coda_interactive.py +``` +Expect clean. + +No commit (verification only). Proceed to final critic + push. 
+ +--- + +## Self-review vs spec + +- AC1 (no export/WorkspaceClient/get_status in coda_interactive) → Task 2 Steps 3, 5; guarded by `test_no_workspaceclient_in_module`. +- AC2 (module + tests deleted, no importers) → Task 2 Step 6; Task 3 Step 2. +- AC3 (`_safe_dirname`/`_normalize_workspace_path`) → Task 1 Step 4; tests Step 1. +- AC4 (`_wait_for_output_stable` + wrapper, coda_run unaffected) → Task 1 Step 5; `test_wait_for_agent_ready_delegates` + `tests/test_mcp_server.py` regression. +- AC5 (first input = chained pull, normalized source, `<name>`) → `test_pull_command_is_sent_first`. +- AC6 (launch only if FS check passes; else error + close) → `test_empty_pull_returns_error_and_no_launch`. +- AC7 (prompt prefixed with context line) → `test_prompt_seeded_with_context_line`. +- AC8 (new + existing suites green) → Task 3. + +**Placeholder scan:** none. **Type consistency:** `_wait_for_output_stable(pty, max_wait, stability)` signature identical across Task 1 def, the wrapper, and `coda_interactive`'s two call sites. `_safe_dirname`/`_normalize_workspace_path` names identical in helpers, tests, and `coda_interactive`. + +**Risk flagged for the executor:** the `fake_send` mock in `test_coda_interactive.py` parses `<name>` from the command tail via `rsplit("cd ", 1)`. If the production `pull_cmd` quoting makes that parse brittle, the executor should instead compute `name` in the fixture from the known `workspace_path` basename rather than parsing the command. The intent: simulate files appearing at `~/.coda/projects/<pty_session_id>/<name>/`. 
diff --git a/docs/superpowers/plans/2026-05-28-coda-run-replay-only.md b/docs/superpowers/plans/2026-05-28-coda-run-replay-only.md new file mode 100644 index 0000000..6aacaf6 --- /dev/null +++ b/docs/superpowers/plans/2026-05-28-coda-run-replay-only.md @@ -0,0 +1,1079 @@ +# `coda_run` Replay-Only URL Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Make `coda_run`'s returned `viewer_url` resolve to a read-only static transcript replay (never a live PTY attach), and rip out the unwired 5-minute grace-period machinery from PR #66 as a consequence. + +**Architecture:** Mode 3 in the three-mode framework (see spec `docs/superpowers/specs/2026-05-28-coda-run-replay-only-design.md`). A new `replay_only` boolean on the PTY session dict steers the existing `/api/session/attach` endpoint into the transcript-from-disk path unconditionally for `coda_run`-created sessions. The watcher closes the PTY immediately on task completion — no deferred timer. + +**Tech Stack:** Python 3.11 + Flask + FastMCP + uvicorn (ASGI) + pytest. No new deps. All changes localized to `app.py`, `coda_mcp/mcp_server.py`, and the test suite. + +--- + +## Pre-flight check (do before Task 1) + +- [ ] **P1: Verify baseline tests pass.** + +```bash +cd /Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp +.venv/bin/python -m pytest tests/ -x --ignore=tests/e2e -q 2>&1 | tail -20 +``` + +Expected: All pass (~527 passed + ~11 PTY-gated skipped). If anything fails on `main` for unrelated reasons, stop and report. 
+ +- [ ] **P2: Confirm worktree is on the `feat/coda-mcp-live-session-url` branch.** + +```bash +git branch --show-current +``` + +Expected: `feat/coda-mcp-live-session-url` + +--- + +## Task 1: Add `replay_only` parameter to `mcp_create_pty_session` + +Backward-compatible default (`False`) so existing callers (direct-launch via `create_session`, future `coda_interactive`) keep their behavior unchanged. + +**Files:** +- Modify: `app.py` (function `mcp_create_pty_session`, line ~1402, and the session-dict insert at ~1469) +- Create: `tests/test_replay_only_flag.py` + +- [ ] **Step 1: Write the failing test.** + +Create `tests/test_replay_only_flag.py`: + +```python +"""Tests for the replay_only flag on PTY sessions.""" +import pytest + +# Reuse the PTY-availability guard pattern from the suite. +import os +try: + import pty as _pty + _master, _slave = _pty.openpty() + os.close(_master) + os.close(_slave) + _PTY_AVAILABLE = True +except Exception: + _PTY_AVAILABLE = False + +_pty_skip = pytest.mark.skipif( + not _PTY_AVAILABLE, + reason="PTY not allocatable in this environment", +) + + +@_pty_skip +def test_mcp_create_pty_session_stores_replay_only_flag(): + """Creating a PTY with replay_only=True stores the flag in the session dict.""" + from app import mcp_create_pty_session, mcp_close_pty_session, sessions + + sid = mcp_create_pty_session(label="t1", replay_only=True) + try: + assert sessions[sid].get("replay_only") is True + finally: + mcp_close_pty_session(sid) + + +@_pty_skip +def test_mcp_create_pty_session_defaults_replay_only_false(): + """Default for replay_only is False (backward compat).""" + from app import mcp_create_pty_session, mcp_close_pty_session, sessions + + sid = mcp_create_pty_session(label="t2") + try: + assert sessions[sid].get("replay_only") is False + finally: + mcp_close_pty_session(sid) +``` + +- [ ] **Step 2: Run the test and verify it fails.** + +```bash +.venv/bin/python -m pytest tests/test_replay_only_flag.py -v 2>&1 | tail -15 +``` 
+ +Expected: 2 failures. First test fails with `TypeError: mcp_create_pty_session() got an unexpected keyword argument 'replay_only'`. Second test fails with `assert None is False` (the key doesn't exist yet so `.get` returns None, which is not `False`). + +- [ ] **Step 3: Add the parameter and storage.** + +In `app.py`, change the `mcp_create_pty_session` signature (search for `def mcp_create_pty_session`): + +```python +# Before: +def mcp_create_pty_session(label: str = "hermes-mcp", transcript_path: str | None = None) -> str: + +# After: +def mcp_create_pty_session( + label: str = "hermes-mcp", + transcript_path: str | None = None, + replay_only: bool = False, +) -> str: +``` + +In the same function, add the `replay_only` key to the session dict that's being built (find the dict literal that contains `"grace": False,` — that's the one). Add right after the existing `"grace": False,` line: + +```python + "grace": False, + "replay_only": replay_only, # NEW +``` + +(The `"grace": False,` line gets removed entirely in Task 8 — leave it alone here.) + +- [ ] **Step 4: Run the test and verify it passes.** + +```bash +.venv/bin/python -m pytest tests/test_replay_only_flag.py -v 2>&1 | tail -10 +``` + +Expected: 2 passed. + +- [ ] **Step 5: Commit.** + +```bash +git add app.py tests/test_replay_only_flag.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "feat: add replay_only param to mcp_create_pty_session + +Backward-compatible default (False). Stored in session dict for later +attach-time enforcement." +``` + +--- + +## Task 2: Extract `_serve_transcript_replay` helper from `attach_session` + +Pure refactor. Extracts the transcript-from-disk lookup currently inlined in `attach_session` at `app.py:1170-1188` into a reusable helper. Existing tests (`tests/test_replay_attach.py`) act as the safety net. 
+ +**Files:** +- Modify: `app.py` (`attach_session` at ~1158, plus new helper above it) + +- [ ] **Step 1: Verify existing replay tests pass (the safety net).** + +```bash +.venv/bin/python -m pytest tests/test_replay_attach.py -v 2>&1 | tail -10 +``` + +Expected: 2 passed (the two tests that already exist for transcript-after-PTY-exit replay). + +- [ ] **Step 2: Add the helper just above `attach_session`.** + +In `app.py`, find `@app.route("/api/session/attach"` (around line 1157). Just **above** the `@app.route` decorator, add this helper: + +```python +def _serve_transcript_replay(session_id: str): + """Serve the on-disk transcript for a PTY session as a replay response. + + Used by attach_session() in two cases: + 1. The PTY is gone (existing transcript-fallback path). + 2. The PTY exists but is replay_only=True (new in Task 3). + + Returns either a Flask JSON response with replay=True, or a 404 if no + transcript exists for this pty_session_id. + """ + from coda_mcp import task_manager as _tm + tdir = _tm.find_task_dir_by_pty_session(session_id) + if tdir: + transcript = os.path.join(tdir, "transcript.log") + if os.path.isfile(transcript): + try: + with open(transcript, "rb") as f: + content = f.read() + return jsonify({ + "session_id": session_id, + "label": "hermes-mcp (replay)", + "output": [content.decode("utf-8", errors="replace")], + "replay": True, + "process": None, + "created_at": None, + }) + except OSError: + pass + return jsonify({"error": "Session not found or exited"}), 404 +``` + +- [ ] **Step 3: Replace the inlined block in `attach_session` with a helper call.** + +Inside `attach_session`, find the block: + +```python + sess = _get_session(session_id) + if not sess or sess.get("exited"): + # Replay fallback: look up transcript.log by pty_session_id + from coda_mcp import task_manager as _tm + tdir = _tm.find_task_dir_by_pty_session(session_id) + if tdir: + transcript = os.path.join(tdir, "transcript.log") + if os.path.isfile(transcript): + try: 
+ with open(transcript, "rb") as f: + content = f.read() + return jsonify({ + "session_id": session_id, + "label": "hermes-mcp (replay)", + "output": [content.decode("utf-8", errors="replace")], + "replay": True, + "process": None, + "created_at": None, + }) + except OSError: + pass + return jsonify({"error": "Session not found or exited"}), 404 +``` + +Replace it with: + +```python + sess = _get_session(session_id) + if not sess or sess.get("exited"): + return _serve_transcript_replay(session_id) +``` + +- [ ] **Step 4: Run replay tests to verify behavior is preserved.** + +```bash +.venv/bin/python -m pytest tests/test_replay_attach.py tests/test_transcript.py -v 2>&1 | tail -20 +``` + +Expected: All pass (refactor is behavior-preserving). + +- [ ] **Step 5: Commit.** + +```bash +git add app.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "refactor: extract _serve_transcript_replay helper from attach_session + +Pure refactor — no behavior change. Helper is also used by the new +replay_only short-circuit in the next commit." +``` + +--- + +## Task 3: Enforce `replay_only=True` in `attach_session` + +New early-return: if the live session has `replay_only=True`, serve the transcript regardless of whether the PTY is still alive. + +**Files:** +- Modify: `app.py` (`attach_session`) +- Modify: `tests/test_replay_only_flag.py` + +- [ ] **Step 1: Add two failing tests.** + +Append to `tests/test_replay_only_flag.py`: + +```python +@_pty_skip +def test_attach_session_replay_only_alive_pty_returns_replay(tmp_path, monkeypatch): + """A replay_only=True PTY that is still alive serves the transcript, not the live buffer.""" + from app import app as flask_app, mcp_create_pty_session, mcp_close_pty_session, sessions + from coda_mcp import task_manager + + # Point task_manager at a tmp sessions root so find_task_dir_by_pty_session resolves. 
+ monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + + # Create a fake task dir keyed by the PTY id we'll mint shortly. + sid = mcp_create_pty_session(label="t-replay-alive", replay_only=True) + try: + # Plant a session.json that links task → this pty_session_id, plus a transcript. + sess_id = "sess-fake" + task_id = "task-fake" + sdir = tmp_path / sess_id + tdir = sdir / "tasks" / task_id + tdir.mkdir(parents=True) + (sdir / "session.json").write_text( + '{"session_id": "%s", "pty_session_id": "%s"}' % (sess_id, sid) + ) + (tdir / "transcript.log").write_bytes(b"HELLO TRANSCRIPT") + + # Bust the lookup cache so find_task_dir_by_pty_session sees the new files. + task_manager._pty_lookup_cache.clear() + + client = flask_app.test_client() + resp = client.post("/api/session/attach", json={"session_id": sid}) + + assert resp.status_code == 200 + body = resp.get_json() + assert body["replay"] is True + assert body["output"] == ["HELLO TRANSCRIPT"] + finally: + mcp_close_pty_session(sid) + + +@_pty_skip +def test_attach_session_replay_only_false_alive_pty_returns_live_buffer(): + """A replay_only=False PTY that is still alive returns the live output_buffer (unchanged behavior).""" + from app import app as flask_app, mcp_create_pty_session, mcp_close_pty_session + + sid = mcp_create_pty_session(label="t-live", replay_only=False) + try: + client = flask_app.test_client() + resp = client.post("/api/session/attach", json={"session_id": sid}) + + assert resp.status_code == 200 + body = resp.get_json() + assert body.get("replay") in (False, None) # live path doesn't set replay key + assert "output" in body + finally: + mcp_close_pty_session(sid) +``` + +- [ ] **Step 2: Run the new tests and verify they fail (first one only — second should pass already).** + +```bash +.venv/bin/python -m pytest tests/test_replay_only_flag.py -v 2>&1 | tail -20 +``` + +Expected: `test_attach_session_replay_only_alive_pty_returns_replay` FAILS (because the alive PTY currently 
returns the live buffer, not the transcript). `test_attach_session_replay_only_false_alive_pty_returns_live_buffer` PASSES (existing behavior is correct). The two Task 1 tests still pass. + +- [ ] **Step 3: Add the early-return in `attach_session`.** + +In `app.py`, modify the body of `attach_session`. Find: + +```python + sess = _get_session(session_id) + if not sess or sess.get("exited"): + return _serve_transcript_replay(session_id) +``` + +Insert the new replay-only check **between** the `_get_session` call and the `if not sess` check: + +```python + sess = _get_session(session_id) + + # Replay-only sessions (e.g. those created by coda_run) always serve the + # transcript-from-disk, even when the PTY is still alive. + if sess and sess.get("replay_only"): + return _serve_transcript_replay(session_id) + + if not sess or sess.get("exited"): + return _serve_transcript_replay(session_id) +``` + +- [ ] **Step 4: Run the new tests and verify they pass.** + +```bash +.venv/bin/python -m pytest tests/test_replay_only_flag.py -v 2>&1 | tail -10 +``` + +Expected: 4 passed. + +- [ ] **Step 5: Commit.** + +```bash +git add app.py tests/test_replay_only_flag.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "feat: replay_only PTY sessions short-circuit to transcript in attach_session + +Replay-only sessions always serve the on-disk transcript regardless of +whether the PTY is still alive. Used by coda_run (wired in the next commit)." +``` + +--- + +## Task 4: Wire `coda_run` to pass `replay_only=True` + +One-line change in the call to `_app_create_session` (the hook that points to `mcp_create_pty_session`). 
+ +**Files:** +- Modify: `coda_mcp/mcp_server.py` (around line 289 — the `_app_create_session(...)` call inside `coda_run`) +- Modify: `tests/test_replay_only_flag.py` + +- [ ] **Step 1: Add a failing test.** + +Append to `tests/test_replay_only_flag.py`: + +```python +@_pty_skip +def test_coda_run_creates_pty_with_replay_only_true(tmp_path, monkeypatch): + """coda_run must create its PTY with replay_only=True.""" + import asyncio + import json + from app import sessions + from coda_mcp import mcp_server, task_manager + + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + # Stop the watcher from racing the test — we only care about creation here. + monkeypatch.setattr(mcp_server, "_watch_task", lambda *a, **kw: None) + + result_str = asyncio.run(mcp_server.coda_run(prompt="ignored", email="t@example.com")) + result = json.loads(result_str) + pty_id = task_manager._read_session(result["session_id"])["pty_session_id"] + try: + assert sessions[pty_id].get("replay_only") is True + finally: + from app import mcp_close_pty_session + mcp_close_pty_session(pty_id) +``` + +- [ ] **Step 2: Run and verify failure.** + +```bash +.venv/bin/python -m pytest tests/test_replay_only_flag.py::test_coda_run_creates_pty_with_replay_only_true -v 2>&1 | tail -10 +``` + +Expected: FAIL — `assert None is True` (or `assert False is True`) because `coda_run` is not yet passing the flag. + +- [ ] **Step 3: Modify `coda_run` in `coda_mcp/mcp_server.py`.** + +Find the `_app_create_session(...)` call inside `coda_run` (search for `pty_session_id = _app_create_session(`). 
Currently: + +```python + pty_session_id = _app_create_session( + label="hermes-mcp", + transcript_path=transcript_path, + ) +``` + +Add the new kwarg: + +```python + pty_session_id = _app_create_session( + label="hermes-mcp", + transcript_path=transcript_path, + replay_only=True, # NEW: coda_run URLs are post-hoc review only + ) +``` + +- [ ] **Step 4: Run and verify pass.** + +```bash +.venv/bin/python -m pytest tests/test_replay_only_flag.py -v 2>&1 | tail -10 +``` + +Expected: 5 passed. + +- [ ] **Step 5: Commit.** + +```bash +git add coda_mcp/mcp_server.py tests/test_replay_only_flag.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "feat: coda_run creates PTY sessions with replay_only=True + +Mode 3 in the three-mode framework. The viewer_url returned by coda_run +now always resolves to a transcript-from-disk replay." +``` + +--- + +## Task 5: Switch `_watch_task` to immediate PTY close (pure refactor) + +Replace `_schedule_deferred_close(session_id)` with `_close_pty_immediately(session_id)` in `_watch_task`. Both functions already exist — this is a one-name-for-another swap. **Not a TDD task** — existing tests (specifically `tests/test_mcp_integration.py`, which already calls `_close_pty_immediately`-equivalent paths directly) act as the safety net. The "no timer" behavior is hard to test as a red-green cycle without instrumenting the watcher's polling loop, which isn't worth the complexity here. + +**Files:** +- Modify: `coda_mcp/mcp_server.py` (`_watch_task`, around lines 133 and 160) + +- [ ] **Step 1: Confirm existing safety-net tests pass.** + +```bash +.venv/bin/python -m pytest tests/test_mcp_integration.py tests/test_mcp_server.py -v 2>&1 | tail -10 +``` + +Expected: All pass. These tests cover `_watch_task`'s completion path and `_close_pty_immediately`'s teardown. 
+ +- [ ] **Step 2: Locate the call sites.** + +```bash +grep -n "_schedule_deferred_close" coda_mcp/mcp_server.py +``` + +Expected: 3 matches — one at the function definition (~line 186), two call sites inside `_watch_task` (~lines 133 and 160). You're swapping the two call sites; the definition gets deleted in Task 7. + +- [ ] **Step 3: Swap the calls in `_watch_task`.** + +In `coda_mcp/mcp_server.py`, at each of the **two** call sites inside `_watch_task` (the success branch and the timeout branch), replace: + +```python +# Before: +_schedule_deferred_close(session_id) + +# After: +_close_pty_immediately(session_id) +``` + +Leave the `_schedule_deferred_close` function definition alone for now — it becomes dead code that Task 7 deletes. + +- [ ] **Step 4: Re-run the safety-net tests.** + +```bash +.venv/bin/python -m pytest tests/test_mcp_integration.py tests/test_mcp_server.py -v 2>&1 | tail -10 +``` + +Expected: All pass. Behavior is preserved at the test-observable level (the watcher still drives a teardown after completion); only the timing changes (immediate vs. 5-min deferred), and no current test asserts the 5-min delay (the grace-timing tests use `monkeypatch` to shrink it to milliseconds). + +- [ ] **Step 5: Confirm via grep that `_watch_task` no longer calls `_schedule_deferred_close`.** + +```bash +grep -n "_schedule_deferred_close" coda_mcp/mcp_server.py +``` + +Expected: 1 match (only the function definition itself, which Task 7 will delete). + +- [ ] **Step 6: Commit.** + +```bash +git add coda_mcp/mcp_server.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "refactor: _watch_task uses _close_pty_immediately instead of deferred close + +Pure call-site swap. Behavior change: PTY teardown is immediate rather +than 5-minute-deferred. _schedule_deferred_close becomes dead code, +ripped out in a follow-up commit." 
+``` + +--- + +## Task 6: Drop dead grace tests + +Now that no production code path calls grace machinery, the tests that exercise it can go. Doing this BEFORE the code rip-out keeps the suite green at every commit. + +**Files:** +- Modify: `tests/test_transcript.py` (delete 4 tests) +- Modify: `tests/test_mcp_server.py` (delete 2 tests + setup/teardown grace lines) +- Modify: `tests/test_mcp_integration.py` (delete 1 test) + +- [ ] **Step 1: Delete grace tests from `tests/test_transcript.py`.** + +Open `tests/test_transcript.py`. Delete these **4 test functions in full** (each is one block from `def` line through to the next blank line / next `def`): + +| Test | Approx line | +|---|---| +| `def test_grace_period_pty_does_not_count_toward_max(monkeypatch):` | 135 | +| `def test_bump_session_last_poll_advances_clock(monkeypatch):` | 157 | +| `def test_mark_grace_on_missing_session_is_noop():` | 169 | +| `def test_bump_session_last_poll_missing_is_noop():` | 174 | + +Re-verify after deletion: + +```bash +grep -n "grace\|_mark_grace\|_bump_session\|GRACE" tests/test_transcript.py +``` + +Expected: no matches. + +- [ ] **Step 2: Delete grace tests from `tests/test_mcp_server.py`.** + +Delete: +- `def test_set_app_hooks_accepts_grace_and_bump_hooks():` (around line 361) +- The function that starts at line ~399 (the `monkeypatch.setattr(mcp_server, "GRACE_PERIOD_S", 0.05)` one — search for `GRACE_PERIOD_S` to find it). + +Also in the setup/teardown fixtures at the top of the file (lines 21-22 and 27-28), remove the lines: + +```python + mcp_server._app_mark_grace = None + mcp_server._app_bump_poll = None +``` + +Verify: + +```bash +grep -n "grace\|mark_grace\|bump_poll\|GRACE" tests/test_mcp_server.py +``` + +Expected: no matches. + +- [ ] **Step 3: Delete the grace E2E test from `tests/test_mcp_integration.py`.** + +Delete the entire `# ── 7. E2E: grace period + transcript replay ────────────────────────` section. 
Specifically: +- The section header comment at line ~293 +- The full `def test_end_to_end_grace_and_replay(tmp_path, monkeypatch):` function (starts line 315, ends after line ~408) + +Verify: + +```bash +grep -n "grace\|GRACE\|_mark_grace" tests/test_mcp_integration.py +``` + +Expected: no matches. + +- [ ] **Step 4: Run the full suite — must still pass.** + +```bash +.venv/bin/python -m pytest tests/ -x --ignore=tests/e2e -q 2>&1 | tail -20 +``` + +Expected: All remaining tests pass. The grace tests are gone; nothing imports `_mark_grace_for_session` or `GRACE_PERIOD_S` from test code anymore. + +- [ ] **Step 5: Commit.** + +```bash +git add tests/test_transcript.py tests/test_mcp_server.py tests/test_mcp_integration.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "test: drop dead grace-period tests + +Prep for grace-machinery rip-out in follow-up commits. Removes 7 tests +that exercised code paths now superseded by replay_only + immediate close." +``` + +--- + +## Task 7: Rip out grace machinery from `coda_mcp/mcp_server.py` + +Delete `_schedule_deferred_close`, the grace hook slots, and the `GRACE_PERIOD_S` constant. Also clean up `set_app_hooks` and `_close_pty_immediately`'s docstring. + +**Files:** +- Modify: `coda_mcp/mcp_server.py` + +- [ ] **Step 1: Verify nothing in the suite imports the symbols you're about to delete.** + +```bash +grep -rn "_schedule_deferred_close\|_app_mark_grace\|_app_bump_poll\|GRACE_PERIOD_S" coda_mcp/ tests/ app.py +``` + +Expected: Only matches inside `coda_mcp/mcp_server.py`. If any tests still import these, return to Task 6. + +- [ ] **Step 2: Remove the dead module-level state and the function.** + +In `coda_mcp/mcp_server.py`: + +- Delete lines 79-80: `_app_mark_grace = None` and `_app_bump_poll = None` +- Delete line 82: `GRACE_PERIOD_S = 300 # 5 minutes` +- Delete the entire `_schedule_deferred_close` function (lines ~186-213). 
Search for `def _schedule_deferred_close` and delete from that line through the function's closing line. + +- [ ] **Step 3: Update `set_app_hooks` signature.** + +Find `def set_app_hooks(` (around line 85). Currently it accepts `mark_grace_fn` and `bump_poll_fn` parameters. Remove those parameters from the signature, and remove the lines inside the function body that assign them to the module-level slots (`_app_mark_grace = mark_grace_fn`, `_app_bump_poll = bump_poll_fn`). + +Also update the function's docstring — search for the line that mentions "defer PTY close by ``GRACE_PERIOD_S``" and rewrite the docstring to remove grace references entirely. + +- [ ] **Step 4: Update `_close_pty_immediately` docstring.** + +Find `def _close_pty_immediately(` (around line 167). Its docstring currently says it's for "emergency teardown or tests". Rewrite to reflect that it's the normal close path: + +```python +def _close_pty_immediately(session_id: str) -> None: + """Close the PTY session associated with this task session immediately. + + Called by ``_watch_task`` as soon as the task transitions to completed + or failed. Reads ``pty_session_id`` from the task-manager's session.json + and calls the ``_app_close_session`` hook (i.e. ``mcp_close_pty_session`` + in production). + """ +``` + +- [ ] **Step 5: Update the module-level docstring.** + +At the top of `coda_mcp/mcp_server.py`, find the line that mentions hooks (around line 9: "handled through optional app hooks set via ``set_app_hooks()``."). Make sure it doesn't claim grace functionality. Search for any other comment block referencing grace and remove. + +- [ ] **Step 6: Run the suite.** + +```bash +.venv/bin/python -m pytest tests/ -x --ignore=tests/e2e -q 2>&1 | tail -15 +``` + +Expected: All pass. 
+ +- [ ] **Step 7: Commit.** + +```bash +git add coda_mcp/mcp_server.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "refactor: rip out grace-period machinery from coda_mcp/mcp_server.py + +Removes _schedule_deferred_close, GRACE_PERIOD_S, the unused grace hook +slots, and the corresponding set_app_hooks parameters. The grace hooks +were never wired in production — this is dead code removal, not a +behavior change." +``` + +--- + +## Task 8: Rip out grace machinery from `app.py` + +Delete `_mark_grace_for_session`, `_bump_session_last_poll`, the `grace` key from the session dict creation, and the `MAX_CONCURRENT_SESSIONS` exclusion at all 4 sites. + +**Files:** +- Modify: `app.py` + +- [ ] **Step 1: Remove the `"grace": False,` key from session dict creation in `mcp_create_pty_session`.** + +In `app.py`, find the dict literal in `mcp_create_pty_session` that contains `"grace": False,` (around line 1477). Delete that single line. The `replay_only` line you added in Task 1 stays. + +There may be ANOTHER similar `"grace": False,` line in the other session-creation path inside `create_session` (search the file for `"grace": False,` — there may be 2 occurrences). Delete both. + +```bash +grep -n '"grace"' app.py +``` + +Expected after deletion: no matches. + +- [ ] **Step 2: Revert the `MAX_CONCURRENT_SESSIONS` exclusion at 4 sites.** + +Search for `sum(1 for s in sessions.values() if not s.get("grace"))`: + +```bash +grep -n "if not s.get(\"grace\")" app.py +``` + +Expected: 4 matches at lines around 1329, 1369, 1405, 1456. + +**CRITICAL — locking note:** All 4 sites are **already** inside a `with sessions_lock:` block (the lock is acquired by the surrounding session-creation code immediately before the check). `sessions_lock` is `threading.Lock()` (not `RLock`), so **do NOT** wrap the replacement in another `with sessions_lock:` — that will deadlock. Just use `len(sessions)` directly. 
+ +At each of the 4 sites, replace: + +```python +# Before (inside an existing `with sessions_lock:` block): +active = sum(1 for s in sessions.values() if not s.get("grace")) +if active >= MAX_CONCURRENT_SESSIONS: + ... +``` + +With: + +```python +# After (still inside the same `with sessions_lock:` block — no new lock): +active = len(sessions) +if active >= MAX_CONCURRENT_SESSIONS: + ... +``` + +To verify each site really is inside a lock block, read the ~5 lines preceding each `sum(...)` call. You should see `with sessions_lock:` at lines 1328, 1366 (for site 1369), 1404 (for site 1405), and 1455 (for site 1456). If any site is somehow NOT already locked, stop and ask before proceeding — the original code may have a latent bug worth investigating. + +- [ ] **Step 3: Delete `_mark_grace_for_session` and `_bump_session_last_poll`.** + +Find both functions (around lines 1515 and 1530). Delete each function definition in full. + +- [ ] **Step 4: Verify no stale references.** + +```bash +grep -n "grace\|_mark_grace\|_bump_session_last_poll" app.py +``` + +Expected: no matches (or only comment lines that reference history — delete those too). + +- [ ] **Step 5: Run the suite.** + +```bash +.venv/bin/python -m pytest tests/ -x --ignore=tests/e2e -q 2>&1 | tail -15 +``` + +Expected: All pass. + +- [ ] **Step 6: Commit.** + +```bash +git add app.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "refactor: rip out grace-period machinery from app.py + +Removes _mark_grace_for_session, _bump_session_last_poll, the 'grace' +key on session dicts, and the MAX_CONCURRENT_SESSIONS exclusion at all +4 check sites. Grace was never wired through set_app_hooks in prod, so +this removes dead code." +``` + +--- + +## Task 9: Update MCP `instructions` string + check `mcp_asgi.py` cleanup + +The FastMCP `instructions` string at `mcp_server.py:61-66` currently tells callers to "SHARE THE LIVE URL" and "watch progress". 
With replay-only semantics, that text is wrong. + +**Files:** +- Modify: `coda_mcp/mcp_server.py` +- Spot-check: `coda_mcp/mcp_asgi.py` + +- [ ] **Step 1: Locate the instructions string.** + +```bash +grep -n "SHARE THE LIVE URL\|watch progress\|live URL" coda_mcp/mcp_server.py +``` + +Expected: matches near the `FastMCP(...)` instantiation block (around lines 61-66). + +- [ ] **Step 2: Rewrite the relevant paragraph.** + +In `coda_mcp/mcp_server.py`, find the paragraph that starts "SHARE THE LIVE URL" (or whatever the exact phrasing is at lines 61-66). Replace it with: + +``` +SHARE THE REPLAY URL: After calling coda_run, you receive a ``viewer_url`` +in the response. Pass this URL to your user so they can open it in a browser +to review the agent's transcript — what was prompted, what was reasoned, what +was produced. The URL is read-only and serves a static replay of the session, +so it remains valid indefinitely after the task completes. +``` + +(Exact wording may need adjustment to match the surrounding paragraph style — read the surrounding text first.) + +- [ ] **Step 3: Spot-check `mcp_asgi.py`.** + +```bash +grep -n "set_app_hooks\|grace\|mark_grace\|bump_poll" coda_mcp/mcp_asgi.py +``` + +Expected: a `set_app_hooks(...)` call exists but does **not** pass grace-related kwargs (per critic's finding). No changes needed. If grace kwargs ARE passed (shouldn't be, but verify), remove them. + +- [ ] **Step 4: Verify nothing relies on the old text.** + +```bash +grep -rn "watch progress\|live URL\|LIVE URL" docs/ tests/ static/ +``` + +Expected: matches only in historical documents (specs/plans from prior PRs). No live code depends on the old phrasing. + +- [ ] **Step 5: Run the suite.** + +```bash +.venv/bin/python -m pytest tests/ -x --ignore=tests/e2e -q 2>&1 | tail -15 +``` + +Expected: All pass. 
+ +- [ ] **Step 6: Commit.** + +```bash +git add coda_mcp/mcp_server.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "docs: update MCP instructions string for replay-only viewer_url semantics + +The viewer_url returned by coda_run is no longer a live attach — it is +a static replay. Update the FastMCP instructions text accordingly so +MCP clients describe it correctly to end users." +``` + +--- + +## Task 10: Update / rewrite `test_replay_attach.py` for the new contract + +After the rip-out, `test_replay_attach.py` may pass without changes (the helper extraction and replay-only flag don't break its existing assertions). But the two tests in it should now make the stronger assertion: replay works regardless of PTY state, not just after the PTY has exited. + +**Files:** +- Modify: `tests/test_replay_attach.py` + +- [ ] **Step 1: Read the current contents.** + +```bash +cat tests/test_replay_attach.py +``` + +- [ ] **Step 2: Run the file as-is to confirm green starting point.** + +```bash +.venv/bin/python -m pytest tests/test_replay_attach.py -v 2>&1 | tail -10 +``` + +Expected: 2 passed. + +- [ ] **Step 3: Strengthen the assertions.** + +The existing tests likely create a transcript file and an exited PTY, then assert that attach returns replay. Add a third test that uses a `replay_only=True` PTY which is STILL ALIVE and asserts the same — confirming the new short-circuit. + +**Important:** This test allocates a real PTY (via `mcp_create_pty_session`), so it needs the same `_pty_skip` guard pattern used in `tests/test_replay_only_flag.py`. Add the guard at the top of the file if it isn't there already (next to the existing imports). 
+ +At the top of `tests/test_replay_attach.py`, if not already present, add: + +```python +import os as _os +import pytest as _pytest + +try: + import pty as _pty + _master, _slave = _pty.openpty() + _os.close(_master) + _os.close(_slave) + _PTY_AVAILABLE = True +except Exception: + _PTY_AVAILABLE = False + +_pty_skip = _pytest.mark.skipif( + not _PTY_AVAILABLE, + reason="PTY not allocatable in this environment", +) +``` + +Then add to the end of `tests/test_replay_attach.py`: + +```python +@_pty_skip +def test_attach_session_returns_replay_for_alive_replay_only_pty(tmp_path, monkeypatch): + """A coda_run-style PTY (replay_only=True) that is still alive serves the transcript. + + This is the new contract introduced by the replay-only flag — historically + a live PTY would serve its output_buffer. + """ + import os + from app import app as flask_app, mcp_create_pty_session, mcp_close_pty_session + from coda_mcp import task_manager + + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + + sid = mcp_create_pty_session(label="replay-alive", replay_only=True) + try: + sess_id = "sess-x" + task_id = "task-x" + sdir = tmp_path / sess_id + tdir = sdir / "tasks" / task_id + tdir.mkdir(parents=True) + (sdir / "session.json").write_text( + '{"session_id": "%s", "pty_session_id": "%s"}' % (sess_id, sid) + ) + (tdir / "transcript.log").write_bytes(b"FROM DISK") + # Cache may have stale entries from earlier tests — clear before the lookup. + task_manager._pty_lookup_cache.clear() + + client = flask_app.test_client() + resp = client.post("/api/session/attach", json={"session_id": sid}) + assert resp.status_code == 200 + body = resp.get_json() + assert body["replay"] is True + assert body["output"] == ["FROM DISK"] + finally: + mcp_close_pty_session(sid) +``` + +- [ ] **Step 4: Run.** + +```bash +.venv/bin/python -m pytest tests/test_replay_attach.py -v 2>&1 | tail -10 +``` + +Expected: 3 passed. 
+ +- [ ] **Step 5: Commit.** + +```bash +git add tests/test_replay_attach.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "test: extend test_replay_attach.py for alive-PTY replay_only case + +Confirms the new contract: replay-only sessions always serve the +transcript-from-disk, even when the PTY is still alive." +``` + +--- + +## Task 11: Add regression-guard test + +Prevent future drift that accidentally re-introduces `grace` on the `coda_run` path. + +**Files:** +- Modify: `tests/test_replay_only_flag.py` + +- [ ] **Step 1: Append the regression test.** + +Append to `tests/test_replay_only_flag.py`: + +```python +@_pty_skip +def test_no_grace_key_in_coda_run_session_dict(): + """Regression guard: coda_run-created PTYs must not have a 'grace' key, + and mcp_create_pty_session must not accept a 'grace' kwarg. + + Protects against accidental re-introduction of grace-period machinery + in future changes. + """ + import inspect + from app import mcp_create_pty_session, mcp_close_pty_session, sessions + + # The function signature must not include 'grace'. + sig = inspect.signature(mcp_create_pty_session) + assert "grace" not in sig.parameters, ( + f"mcp_create_pty_session should not accept a 'grace' parameter " + f"(found in signature: {list(sig.parameters)})" + ) + + # And the session dict must not contain a 'grace' key. + sid = mcp_create_pty_session(label="t-no-grace", replay_only=True) + try: + assert "grace" not in sessions[sid], ( + f"session dict should not contain a 'grace' key " + f"(found: {list(sessions[sid].keys())})" + ) + finally: + mcp_close_pty_session(sid) +``` + +- [ ] **Step 2: Run.** + +```bash +.venv/bin/python -m pytest tests/test_replay_only_flag.py -v 2>&1 | tail -15 +``` + +Expected: 7 passed (the previous 6 + this regression-guard). 
+ +- [ ] **Step 3: Run the full suite one final time.** + +```bash +.venv/bin/python -m pytest tests/ -x --ignore=tests/e2e -q 2>&1 | tail -15 +``` + +Expected: Around 525 passed + ~11 skipped (PTY-gated). Net change from baseline: -2 tests. + +- [ ] **Step 4: Commit.** + +```bash +git add tests/test_replay_only_flag.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" commit -m "test: regression guard against re-introduction of grace key + +Asserts mcp_create_pty_session does not accept a 'grace' kwarg and that +coda_run-created session dicts contain no 'grace' key. Catches drift +if a future change tries to bring the grace machinery back." +``` + +--- + +## Final verification (post-task) + +- [ ] **F1: Full suite green.** + +```bash +.venv/bin/python -m pytest tests/ --ignore=tests/e2e -q 2>&1 | tail -10 +``` + +Expected: all pass. + +- [ ] **F2: `grep` confirms no stale references.** + +```bash +grep -rn "grace\|GRACE_PERIOD\|_mark_grace\|_bump_session_last_poll\|_schedule_deferred_close" coda_mcp/ app.py 2>&1 | grep -v ".pyc\|.git" +``` + +Expected: no matches (or only matches in comments that document the removal — those are fine). + +- [ ] **F3: Manual smoke (optional, requires deployed environment).** + +1. Restart the app (`uvicorn coda_mcp.mcp_asgi:app`). +2. Trigger a `coda_run` from an MCP client. Capture the `viewer_url`. +3. Open the URL in a browser **while** hermes is still running. Confirm: read-only replay UI, no terminal input box. +4. Wait for hermes to complete (~30s). Confirm: PTY is gone from `/health` (`active_sessions` returns to baseline). +5. Re-open the URL. Confirm: same read-only replay, full final transcript. + +--- + +## Self-review checklist (run on completed plan) + +1. 
**Spec coverage** ✓ + - Section "Add replay_only flag" → Task 1 + - Section "Enforce replay-only" → Tasks 2 (extract) + 3 (enforce) + - Section "Wire coda_run" → Task 4 + - Section "Rip out grace machinery" → Tasks 6 (tests) + 7 (mcp_server.py) + 8 (app.py) + - Section "Watcher teardown on completion" → Task 5 + - "Docstrings to update" → Tasks 7 (docstring inside) + 9 (MCP instructions) + - "Regression guard" → Task 11 + +2. **Placeholders** ✓ — every step has concrete code/commands. No TBDs. + +3. **Type consistency** ✓ + - `replay_only: bool = False` used identically in signature, dict, and tests + - `_close_pty_immediately(session_id: str) -> None` — task-manager session_id, not pty_session_id (the function takes the task session ID and looks up the PTY internally) + - `_serve_transcript_replay(session_id)` — pty_session_id (passed straight through to `find_task_dir_by_pty_session`) + +4. **Ordering safety** ✓ + - Tests dropped (Task 6) BEFORE code rip-out (Tasks 7, 8) → suite stays green + - `_watch_task` swap (Task 5) BEFORE `_schedule_deferred_close` deletion (Task 7) → no orphan calls + - `replay_only` storage (Task 1) BEFORE attach short-circuit (Task 3) → flag exists before being read + +--- + +## Plan critique gate + +**Cleared** (2026-05-28). Critic verdict: APPROVE WITH CHANGES. Issues found and resolved: + +1. **CRITICAL — locking deadlock in Task 8 Step 2.** Original instruction wrapped the replacement code in `with sessions_lock:`, but all 4 MAX_CONCURRENT sites are already inside `with sessions_lock:` blocks. `sessions_lock` is a non-reentrant `threading.Lock()`, so the wrap would deadlock the server. Fixed: Task 8 Step 2 now explicitly says "do NOT wrap" and replaces the code with bare `active = len(sessions)`. + +2. **MAJOR — TDD violation in Task 5.** Original task tried to wrap the `_watch_task` swap in a red-green cycle, but the test ended up passing on first run (it called `_close_pty_immediately` directly, not through `_watch_task`). 
Fixed: Task 5 relabeled as a non-TDD refactor with existing integration tests as the safety net, in the same style as Task 2. + +3. **MAJOR — missing `_pty_skip` in Task 10 test.** New test in `test_replay_attach.py` allocates a real PTY but didn't carry the PTY-skip guard, so it would error on CI environments without `pty.openpty()`. Fixed: Task 10 now adds the guard pattern at the file top and decorates the new test with `@_pty_skip`. + +4. **MINOR — vague test names in Task 6 Step 1.** Original named 2 of 4 grace tests to delete and said "plus two more". Fixed: all 4 tests now named explicitly in a table. + +Per-dimension verdicts from the critic: +- **Spec coverage**: Complete (all spec sections map to ≥1 task) +- **Task atomicity & ordering**: Sound — green at every commit boundary +- **TDD discipline**: Clean after Task 5 relabel (Tasks 1, 3, 4, 11 do genuine red-green; Tasks 2, 5 are pure refactors with safety-net tests) +- **Line-number accuracy**: Verified exact at every reference (no drift) +- **Test-code correctness**: All fixtures/imports/decorators verified after fixes +- **Concurrency**: Safe after Task 8 lock-wrap fix +- **Commit messages**: Conventional-commits format with `-c user.email=datasciencemonkey@gmail.com` override — correct + +Plan is ready for execution. diff --git a/docs/superpowers/plans/2026-05-28-coda-run-workflow-protocol.md b/docs/superpowers/plans/2026-05-28-coda-run-workflow-protocol.md new file mode 100644 index 0000000..60a29f6 --- /dev/null +++ b/docs/superpowers/plans/2026-05-28-coda-run-workflow-protocol.md @@ -0,0 +1,1156 @@ +# `coda_run` Workflow Protocol Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Inject a Databricks orientation block (CAPABILITIES) and a structured 3-phase workflow protocol (PLAN → EXECUTE → SYNTHESIZE with critique at each phase) into every `coda_run` task's `prompt.txt`. Add a third terminal `result.json` status `"info_needed"` with a required `feedback` field so the calling client can iterate when the agent is blocked. Update `coda_inbox`, `coda_get_result`, and the MCP `instructions` block to know about the new status and its `needs_approval` sibling. + +**Architecture:** +- Pure-function module `coda_mcp/databricks_preamble.py` produces the two new prompt sections (CAPABILITIES, WORKFLOW PROTOCOL). One source of truth for the skill list. Trivially unit-testable. +- `task_manager.wrap_prompt()` gains a `workflow_protocol: bool = True` parameter. When true, inserts the two sections between TASK and INSTRUCTIONS, and updates INSTRUCTIONS to describe new step labels and the `info_needed` status. The flag flows from `coda_run` through `create_task` to `wrap_prompt` — three call sites, one parameter. +- Inbox / result surfaces (`coda_inbox` counts dict, `coda_get_result` docstring, the FastMCP `instructions=` block at server construction) are updated to tolerate and surface the new statuses (`info_needed`, `needs_approval`). +- Tests pin the prompt sections verbatim where it matters, pin the skill list against CLAUDE.md, and guard the new counts-dict keys and docstring content. + +**Tech Stack:** Python 3.11, pytest, MagicMock, FastMCP. No new dependencies. 
+ +--- + +## Files modified by this plan + +- **Create:** `coda_mcp/databricks_preamble.py` — new module, three exports +- **Create:** `tests/test_databricks_preamble.py` — unit tests for the new module +- **Modify:** `coda_mcp/task_manager.py:153-225` — `wrap_prompt` signature, body, INSTRUCTIONS section text +- **Modify:** `coda_mcp/task_manager.py:231-...` — `create_task` signature + forwarding +- **Modify:** `coda_mcp/mcp_server.py:52-99` — FastMCP `instructions=` block (add INFO_NEEDED HANDOFF paragraph) +- **Modify:** `coda_mcp/mcp_server.py:220-227` — `coda_run` signature + forwarding +- **Modify:** `coda_mcp/mcp_server.py:551-559` — `coda_inbox` counts dict +- **Modify:** `coda_mcp/mcp_server.py:573-584` — `coda_get_result` docstring +- **Create:** `tests/test_inbox_status_passthrough.py` — counts dict + docstring + MCP instructions tests + +## Pre-flight context + +- Worktree: `/Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp` +- Branch: `feat/coda-mcp-interactive-handoff` (PR #67, in-flight — this lands as follow-up commits) +- Run tests with `uv run pytest` (per user's `always use uv` directive) +- Commit identity: `-c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty"`. No AI/Claude co-author lines. +- The full spec is `docs/superpowers/specs/2026-05-28-coda-run-workflow-protocol-design.md` — consult for full text of CAPABILITIES, WORKFLOW PROTOCOL, and DISAMBIGUATION sections. +- Skill list source of truth: the "Databricks Skills" markdown table in the project-level `CLAUDE.md` at the repo root (`/Users/sathish.gangichetty/Documents/xterm-experiment/.worktrees/coda-mcp/CLAUDE.md`). + +--- + +## Task 1: Create `databricks_preamble.py` module and unit tests (TDD) + +This task creates a new module with three pure functions and exhaustive tests. 
New module → tests and implementation land in the same commit (the module doesn't exist for the tests to fail against prior to the commit, so "RED-then-GREEN-in-one-commit" is the right shape here). + +**Files:** +- Create: `coda_mcp/databricks_preamble.py` +- Create: `tests/test_databricks_preamble.py` + +- [ ] **Step 1: Write the new module `coda_mcp/databricks_preamble.py`** + +Create the file with this exact content: + +```python +"""Builders for the CoDA prompt envelope's CAPABILITIES and WORKFLOW PROTOCOL sections. + +These are injected into prompt.txt by ``task_manager.wrap_prompt`` when +``workflow_protocol=True``. Pure functions — no side effects, no I/O. +""" +from __future__ import annotations + + +_DATABRICKS_SKILLS: tuple[str, ...] = ( + "agent-bricks", + "databricks-genie", + "databricks-app-python", + "databricks-app-apx", + "databricks-jobs", + "databricks-unity-catalog", + "spark-declarative-pipelines", + "aibi-dashboards", + "model-serving", + "mlflow-evaluation", + "asset-bundles", + "databricks-python-sdk", + "databricks-config", + "databricks-docs", + "synthetic-data-generation", + "unstructured-pdf-generation", +) + + +def get_databricks_skills() -> tuple[str, ...]: + """Return the canonical Databricks skill list. Tests pin this against CLAUDE.md.""" + return _DATABRICKS_SKILLS + + +def build_capabilities() -> str: + """Orientation block: CLI, skills, MCP servers, when to prefer Databricks-native paths.""" + skills_lines = [] + # Pack 4 skills per line for readability in prompt.txt. + for i in range(0, len(_DATABRICKS_SKILLS), 4): + chunk = _DATABRICKS_SKILLS[i:i + 4] + skills_lines.append("- " + ", ".join(chunk)) + skills_block = "\n".join(skills_lines) + return ( + "You are running inside CoDA on a Databricks-authenticated host.\n" + "\n" + "Databricks CLI: pre-configured. 
`databricks current-user me` confirms auth.\n" + "Use it for jobs, workspace, clusters, warehouses, Unity Catalog operations.\n" + "\n" + "Skills available at ~/.claude/skills/ — read each skill's SKILL.md before\n" + "invoking. Relevant Databricks skills:\n" + f"{skills_block}\n" + "\n" + "MCP servers wired:\n" + "- DeepWiki — ask_question, read_wiki_contents for any GitHub repo\n" + "- Exa — web_search_exa, web_fetch_exa for live web context\n" + "- CoDA — chain follow-up tasks via previous_session_id\n" + "\n" + "When the task touches Databricks data, pipelines, jobs, dashboards, agents,\n" + "or model serving, DEFAULT to the skill / CLI / SDK path above instead of\n" + "generic Python or web search." + ) + + +def build_workflow_protocol() -> str: + """3-phase workflow with critique at each phase + info_needed escape hatch.""" + return ( + "You MUST process this task in three phases. Emit status.jsonl events as\n" + "you go (one JSON object per line, format below).\n" + "\n" + "PHASE 1 — PLAN\n" + "- Write a step-by-step plan as a status.jsonl line with step=\"plan\" and\n" + " message containing the numbered steps.\n" + "- Then critique your own plan as if you were a separate reviewer.\n" + " (Spawn a sub-agent for the critique if your agent supports it; otherwise\n" + " write the critique inline as a self-review.) Emit step=\"critique_plan\"\n" + " with the verdict (APPROVE / BLOCK / APPROVE-WITH-FIXES) and findings.\n" + "- If the critique surfaces blockers, revise the plan once and re-emit\n" + " step=\"plan\". Maximum 2 plan iterations total.\n" + "- If after 2 attempts you still cannot produce a viable plan, write\n" + " result.json with status=\"info_needed\" (see below) and stop.\n" + "\n" + "PHASE 2 — EXECUTE\n" + "- Work the plan. 
Emit step=\"execute_<n>\" lines after completing each plan\n" + " step (n is 1-indexed, matches the plan's numbering).\n" + "- After execution, emit step=\"critique_execute\" with a review of what got\n" + " built vs what the plan said. APPROVE / BLOCK / APPROVE-WITH-FIXES.\n" + "- If the critique surfaces correctness or scope gaps, fix them and re-emit\n" + " step=\"critique_execute\". Maximum 2 execute iterations total.\n" + "- If you hit a hard blocker (missing access, missing data, ambiguous\n" + " requirements that the plan revealed only mid-execution), write\n" + " result.json with status=\"info_needed\" and stop.\n" + "\n" + "PHASE 3 — SYNTHESIZE\n" + "- Write result.json with status=\"completed\".\n" + "- Emit step=\"critique_synthesize\" with a review of the result against the\n" + " original TASK.\n" + "- If the critique surfaces gaps, revise result.json. Maximum 2 synthesis\n" + " iterations total.\n" + "\n" + "If at any phase you cannot proceed, use the INFO_NEEDED escape hatch:\n" + "- Set status=\"info_needed\" in result.json.\n" + "- Set \"feedback\" to a precise, actionable string naming exactly what is\n" + " missing (a table name, a decision, an access grant, a clarification).\n" + " The calling client will read this and resubmit with the missing context.\n" + "- \"info_needed\" is NOT a failure — it is a structured request for\n" + " iteration. Use it whenever you would otherwise have to guess.\n" + "\n" + "If you encounter a hard, unrecoverable failure (a command crashed, an SDK\n" + "returned 500, a file is corrupt), use status=\"failed\" with a description\n" + "in \"errors\".\n" + "\n" + "DISAMBIGUATION — two soft statuses already exist and they mean different\n" + "things; use the right one:\n" + "- \"info_needed\" — the CALLER must add missing context (table name,\n" + " business decision, file contents, access grant) before the task can\n" + " proceed.
Used when ambiguity or missing input blocks you.\n" + "- \"needs_approval\" — you have a concrete plan to do something destructive\n" + " (drop a table, delete a job, modify permissions). You will execute it\n" + " if and only if the caller explicitly approves. Used at the SAFETY\n" + " boundary, never for ambiguity. See SAFETY section below.\n" + "\n" + "If both apply (e.g. \"I'd drop a table but I'm not sure which one\"), prefer\n" + "\"info_needed\" — resolving the ambiguity first is cheaper than approving\n" + "the wrong destructive action." + ) +``` + +- [ ] **Step 2: Write `tests/test_databricks_preamble.py`** + +Create the file with this exact content: + +```python +"""Unit tests for coda_mcp.databricks_preamble.""" +import re + +from coda_mcp.databricks_preamble import ( + build_capabilities, + build_workflow_protocol, + get_databricks_skills, +) + + +def test_get_databricks_skills_returns_exactly_sixteen(): + skills = get_databricks_skills() + assert isinstance(skills, tuple) + assert len(skills) == 16, f"Expected 16 skills, got {len(skills)}: {skills}" + + +def test_skills_list_matches_claude_md(): + """The hardcoded skill tuple must match the Databricks Skills table in CLAUDE.md. + + Drift in either direction (added to tuple but not docs, or vice versa) fails + this test. The test is the canary that forces both sources to stay in sync. + """ + import os + repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) + claude_md = os.path.join(repo_root, "CLAUDE.md") + with open(claude_md, "r") as f: + text = f.read() + # Find the Databricks Skills section. Names are comma-separated within table cells. + section_match = re.search( + r"###\s+Databricks Skills.*?(?=\n###|\n##|\Z)", + text, re.DOTALL, + ) + assert section_match, "Could not find 'Databricks Skills' section in CLAUDE.md" + section = section_match.group(0) + # Extract skill names: kebab-case tokens that follow a list pattern. 
Be loose — + # accept anything that looks like a skill identifier inside table cells. + skill_names_in_md = set(re.findall(r"\b([a-z][a-z0-9-]{2,}(?:-[a-z0-9]+)+)\b", section)) + skills_in_code = set(get_databricks_skills()) + # Every skill in code must appear in CLAUDE.md. + missing_from_md = skills_in_code - skill_names_in_md + assert not missing_from_md, ( + f"Skills in code but NOT in CLAUDE.md (update CLAUDE.md): {missing_from_md}" + ) + # Every skill in CLAUDE.md's Databricks section must appear in code. + # Filter out section/category words that match the regex but aren't skill names. + section_noise = { + "ai-agents", "data-engineering", # category labels, hyphenated + } + missing_from_code = (skill_names_in_md - skills_in_code) - section_noise + assert not missing_from_code, ( + f"Skills in CLAUDE.md but NOT in code (update databricks_preamble.py): " + f"{missing_from_code}" + ) + + +def test_capabilities_mentions_cli(): + text = build_capabilities() + assert "Databricks CLI" in text + assert "databricks current-user me" in text + + +def test_capabilities_lists_at_least_ten_skills(): + text = build_capabilities() + skills = get_databricks_skills() + hits = sum(1 for s in skills if s in text) + assert hits >= 10, f"Expected at least 10 skills in CAPABILITIES, found {hits}" + + +def test_capabilities_mentions_all_three_mcp_servers(): + text = build_capabilities() + assert "DeepWiki" in text + assert "Exa" in text + assert "CoDA" in text + + +def test_capabilities_under_token_budget(): + text = build_capabilities() + # ~4 chars/token rough lower bound. 1600 chars ≈ 400 tokens budget. + assert len(text) < 1600, ( + f"CAPABILITIES is {len(text)} chars (~{len(text)//4} tokens); budget is 1600." 
+ ) + + +def test_workflow_protocol_lists_three_phases(): + text = build_workflow_protocol() + assert "PHASE 1 — PLAN" in text + assert "PHASE 2 — EXECUTE" in text + assert "PHASE 3 — SYNTHESIZE" in text + + +def test_workflow_protocol_caps_iterations_at_two(): + text = build_workflow_protocol() + # The string "Maximum 2" should appear once per phase = 3 times. + count = text.count("Maximum 2") + assert count == 3, f"Expected 'Maximum 2' to appear 3 times (once per phase); got {count}" + + +def test_workflow_protocol_describes_info_needed(): + text = build_workflow_protocol() + assert "info_needed" in text + assert "feedback" in text + + +def test_workflow_protocol_disambiguates_needs_approval(): + text = build_workflow_protocol() + assert "needs_approval" in text + assert "DISAMBIGUATION" in text + + +def test_workflow_protocol_under_token_budget(): + text = build_workflow_protocol() + # ~4 chars/token. 3200 chars ≈ 800 tokens budget. + assert len(text) < 3200, ( + f"WORKFLOW PROTOCOL is {len(text)} chars (~{len(text)//4} tokens); budget is 3200." + ) +``` + +- [ ] **Step 3: Run the test file to verify everything passes** + +Run: `uv run pytest tests/test_databricks_preamble.py -v` +Expected: 11 passed. + +If a test fails, fix the module (NOT the test) — the test pins the spec. + +The one possible test that needs adjustment: `test_skills_list_matches_claude_md` reads CLAUDE.md and parses its Databricks Skills section. The regex pattern is loose; if it picks up false-positives (e.g. category labels that contain hyphens), add them to `section_noise`. Don't loosen the assertion itself. + +- [ ] **Step 4: Run ruff check** + +Run: `uv run ruff check coda_mcp/databricks_preamble.py tests/test_databricks_preamble.py` +Expected: All checks passed. 
+ +- [ ] **Step 5: Commit** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + add coda_mcp/databricks_preamble.py tests/test_databricks_preamble.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + commit -m "feat: add databricks_preamble module — CAPABILITIES + WORKFLOW PROTOCOL builders + +Two pure-function builders for the new prompt envelope sections plus the +canonical Databricks skill list. Tests pin the skill list against CLAUDE.md +to catch drift in either direction, and pin both sections to token budgets." +``` + +--- + +## Task 2: Wire `workflow_protocol` flag through wrap_prompt → create_task → coda_run (TDD) + +A single flag, three call sites. TDD: write the tests against the desired flow, watch them fail, then wire the flag. + +**Files:** +- Modify: `coda_mcp/task_manager.py:153-225` (`wrap_prompt` — signature + body) +- Modify: `coda_mcp/task_manager.py:231-...` (`create_task` — signature + forward) +- Modify: `coda_mcp/mcp_server.py:220-227` (`coda_run` — signature + forward) +- Modify (or create): `tests/test_task_manager.py` (extend if exists; create otherwise) + +- [ ] **Step 1: Check whether `tests/test_task_manager.py` already exists** + +Run: `ls -la tests/test_task_manager.py 2>&1 || echo "MISSING"` + +If it exists, you'll append tests. If it doesn't, you'll create it. + +- [ ] **Step 2: Append (or create with) these tests for the flag wiring** + +Add these tests to `tests/test_task_manager.py` (create the file if missing — start with `"""Tests for coda_mcp.task_manager."""` plus imports). 
+ +```python +def test_wrap_prompt_default_includes_capabilities_and_workflow(): + """Default workflow_protocol=True; rendered prompt contains both new sections.""" + from coda_mcp.task_manager import wrap_prompt + + out = wrap_prompt( + task_id="t-1", + session_id="s-1", + email="user@example.com", + prompt="do the thing", + context=None, + results_dir="/tmp/results", + ) + assert "CAPABILITIES:" in out + assert "WORKFLOW PROTOCOL:" in out + # Sanity: still has the existing structure. + assert "TASK:" in out + assert "INSTRUCTIONS:" in out + assert "SAFETY:" in out + + +def test_wrap_prompt_workflow_protocol_false_omits_sections(): + """With workflow_protocol=False, both new sections are absent.""" + from coda_mcp.task_manager import wrap_prompt + + out = wrap_prompt( + task_id="t-1", + session_id="s-1", + email="user@example.com", + prompt="do the thing", + context=None, + results_dir="/tmp/results", + workflow_protocol=False, + ) + assert "CAPABILITIES:" not in out + assert "WORKFLOW PROTOCOL:" not in out + # Existing sections are still present. 
+ assert "TASK:" in out + assert "INSTRUCTIONS:" in out + + +def test_wrap_prompt_workflow_protocol_default_is_true(): + """Signature inspection: default value of workflow_protocol is True.""" + import inspect + from coda_mcp.task_manager import wrap_prompt + + sig = inspect.signature(wrap_prompt) + assert "workflow_protocol" in sig.parameters + assert sig.parameters["workflow_protocol"].default is True + + +def test_create_task_signature_has_workflow_protocol_param(): + """create_task accepts workflow_protocol kwarg with default True.""" + import inspect + from coda_mcp.task_manager import create_task + + sig = inspect.signature(create_task) + assert "workflow_protocol" in sig.parameters + assert sig.parameters["workflow_protocol"].default is True + + +def test_create_task_forwards_workflow_protocol_to_wrap_prompt(monkeypatch, tmp_path): + """create_task must pass workflow_protocol through to wrap_prompt.""" + from coda_mcp import task_manager + + captured: dict = {} + + def fake_wrap_prompt(**kwargs): + captured.update(kwargs) + return "DUMMY PROMPT" + + monkeypatch.setattr(task_manager, "wrap_prompt", fake_wrap_prompt) + monkeypatch.setattr(task_manager, "_session_dir", lambda sid: str(tmp_path)) + monkeypatch.setattr(task_manager, "_task_dir", lambda sid, tid: str(tmp_path)) + # _write_json is the real helper used inside create_task (writes meta.json + session file). + # Stub it out — we're testing flag pass-through, not filesystem behavior. + monkeypatch.setattr(task_manager, "_write_json", lambda *a, **kw: None) + monkeypatch.setattr(task_manager.os, "makedirs", lambda *a, **kw: None) + # Stub the file-open for prompt.txt write. 
+ real_open = open + def fake_open(path, mode="r", *args, **kwargs): + if "prompt.txt" in str(path) and "w" in mode: + import io + return io.StringIO() + return real_open(path, mode, *args, **kwargs) + monkeypatch.setattr("builtins.open", fake_open) + + task_manager.create_task( + session_id="s-1", + prompt="x", + email="u@example.com", + workflow_protocol=False, + ) + assert captured.get("workflow_protocol") is False + + +def test_coda_run_signature_has_workflow_protocol_param(): + """coda_run accepts workflow_protocol kwarg with default True.""" + import inspect + from coda_mcp import mcp_server + + sig = inspect.signature(mcp_server.coda_run) + assert "workflow_protocol" in sig.parameters + assert sig.parameters["workflow_protocol"].default is True +``` + +- [ ] **Step 3: Run the new tests; verify they FAIL** + +Run: `uv run pytest tests/test_task_manager.py -v` (or whichever file you appended to) +Expected: All 6 new tests FAIL — `wrap_prompt`/`create_task`/`coda_run` don't accept the kwarg yet. + +- [ ] **Step 4: Modify `coda_mcp/task_manager.py:153` — `wrap_prompt` signature + body** + +Open `coda_mcp/task_manager.py` and find the existing `wrap_prompt` function (starts around line 153). Change its signature and body as follows. 
+ +Add a new import at the top of the file (if not already present, near other coda_mcp imports): + +```python +from coda_mcp.databricks_preamble import build_capabilities, build_workflow_protocol +``` + +Then change the function signature from: + +```python +def wrap_prompt( + task_id: str, + session_id: str, + email: str, + prompt: str, + context: dict | None, + results_dir: str, + context_hint: str | None = None, + previous_session_id: str | None = None, +) -> str: +``` + +to: + +```python +def wrap_prompt( + task_id: str, + session_id: str, + email: str, + prompt: str, + context: dict | None, + results_dir: str, + context_hint: str | None = None, + previous_session_id: str | None = None, + workflow_protocol: bool = True, +) -> str: +``` + +Update the docstring to mention the new flag: + +```python +"""Build the full prompt string written to ``prompt.txt``. + +Uses the ``---CODA-TASK---`` envelope convention so the agent can +parse metadata from the prompt deterministically. + +When ``workflow_protocol`` is True (default), inserts a CAPABILITIES +section (Databricks CLI, skills, MCP servers) and a WORKFLOW PROTOCOL +section (3-phase PLAN/EXECUTE/SYNTHESIZE with critique at each phase, +plus the info_needed escape hatch). Set False to skip both. +""" +``` + +Update the body. The current return statement looks roughly like this (around lines 184-225): + +```python +return ( + f"---CODA-TASK---\n" + ... + f"TASK:\n" + f"{prompt}\n" + f"\n" + f"INSTRUCTIONS:\n" + ... + f"SAFETY:\n" + ... 
+ f"---END-CODA-TASK---" +) +``` + +Change it to insert the new sections between TASK and INSTRUCTIONS: + +```python +workflow_block = "" +if workflow_protocol: + workflow_block = ( + f"\nCAPABILITIES:\n" + f"{build_capabilities()}\n" + f"\n" + f"WORKFLOW PROTOCOL:\n" + f"{build_workflow_protocol()}\n" + ) + +return ( + f"---CODA-TASK---\n" + f"task_id: {task_id}\n" + f"session_id: {session_id}\n" + f"user: {email}\n" + f"{hint_line}" + f"{prior_session_block}" + f"{context_block}\n" + f"TASK:\n" + f"{prompt}\n" + f"{workflow_block}" + f"\n" + f"INSTRUCTIONS:\n" + f"1. As you work, append progress lines to {results_dir}/status.jsonl\n" + f' Each line must be valid JSON: {{"step": "label", "message": "what you are doing"}}\n' + f"\n" + f"2. When you are COMPLETELY DONE, write a SINGLE FILE at this exact path:\n" + f" {results_dir}/result.json\n" + f" It must contain this JSON structure:\n" + f" {{\n" + f' "status": "completed",\n' + f' "summary": "one paragraph describing what you did",\n' + f' "files_changed": ["list", "of", "file", "paths"],\n' + f' "artifacts": {{}},\n' + f' "errors": []\n' + f" }}\n" + f" If you failed, set status to \"failed\" and describe the error.\n" + f" IMPORTANT: result.json is a FILE not a directory. Write it with:\n" + f" echo '{{...}}' > {results_dir}/result.json\n" + f"\n" + f"3. If you delegate to a sub-agent, update status.jsonl with delegation steps.\n" + f"\n" + f"SAFETY:\n" + f"- Do NOT delete, drop, or truncate tables, schemas, catalogs, or volumes.\n" + f"- Do NOT delete files outside the current project directory.\n" + f"- Do NOT run destructive Databricks CLI commands (e.g. databricks clusters delete, " + f"databricks jobs delete, databricks pipelines delete).\n" + f"- Do NOT modify permissions, grants, or access controls unless explicitly requested.\n" + f"- Prefer CREATE OR REPLACE over DROP+CREATE. 
Prefer INSERT/MERGE over DELETE+INSERT.\n" + f"- If the task requires a destructive operation, describe what you would do in " + f"result.json with status \"needs_approval\" instead of executing it.\n" + f"---END-CODA-TASK---" +) +``` + +Note: the INSTRUCTIONS body itself is updated in Task 3 to mention `info_needed` and the new step labels. For this task, leave the INSTRUCTIONS text exactly as today — only insert the new sections. + +- [ ] **Step 5: Modify `coda_mcp/task_manager.py:231` — `create_task` signature + forward** + +Find the `create_task` function (starts around line 231). Add `workflow_protocol: bool = True` to its parameter list (alongside the existing kwargs like `timeout_s`, `permissions`, `previous_session_id`). Forward it into the `wrap_prompt` call inside the function body. + +The existing function probably looks like: + +```python +def create_task( + session_id: str, + prompt: str, + email: str, + context: dict | None = None, + context_hint: str | None = None, + timeout_s: int | None = None, + permissions: str | None = None, + previous_session_id: str | None = None, +): + ... + wrapped = wrap_prompt( + task_id=task_id, + session_id=session_id, + email=email, + prompt=prompt, + context=context, + results_dir=results_dir, + context_hint=context_hint, + previous_session_id=previous_session_id, + ) + ... +``` + +Change to: + +```python +def create_task( + session_id: str, + prompt: str, + email: str, + context: dict | None = None, + context_hint: str | None = None, + timeout_s: int | None = None, + permissions: str | None = None, + previous_session_id: str | None = None, + workflow_protocol: bool = True, +): + ... + wrapped = wrap_prompt( + task_id=task_id, + session_id=session_id, + email=email, + prompt=prompt, + context=context, + results_dir=results_dir, + context_hint=context_hint, + previous_session_id=previous_session_id, + workflow_protocol=workflow_protocol, + ) + ... 
+``` + +- [ ] **Step 6: Modify `coda_mcp/mcp_server.py:220` — `coda_run` signature + forward** + +Find the `coda_run` function (starts around line 220). Add `workflow_protocol: bool = True` to its parameter list and pass it to `task_manager.create_task`. + +Current signature: + +```python +async def coda_run( + prompt: str, + email: str, + context: str = "{}", + previous_session_id: str = "", + permissions: str = "smart", + timeout_s: int = 3600, +) -> str: +``` + +Change to: + +```python +async def coda_run( + prompt: str, + email: str, + context: str = "{}", + previous_session_id: str = "", + permissions: str = "smart", + timeout_s: int = 3600, + workflow_protocol: bool = True, +) -> str: +``` + +Update the docstring (the existing string ends "Returns JSON with ``task_id``, ``session_id``, and ``status: \"running\"``"). Add this sentence to the docstring body before the Returns line: + +``` +``workflow_protocol`` defaults to True, which injects a Databricks +orientation block and a 3-phase workflow protocol (PLAN/EXECUTE/SYNTHESIZE +with critique at each phase) into the agent's prompt. The protocol also +defines the ``info_needed`` terminal status for clean handoff when the +agent is blocked. Set False to skip — useful for non-Databricks tasks. +``` + +Find the `task_manager.create_task(...)` call (around line 265) and add the new kwarg: + +```python +result = task_manager.create_task( + session_id=session_id, + prompt=prompt, + email=email, + context=ctx, + timeout_s=timeout_s, + permissions=permissions, + previous_session_id=previous_session_id or None, + workflow_protocol=workflow_protocol, +) +``` + +- [ ] **Step 7: Run the new tests; verify they PASS** + +Run: `uv run pytest tests/test_task_manager.py -v` (or whichever file) +Expected: All 6 new tests PASS. 
+ +Also run the full set of target test files plus the new module's tests to check for regressions: + +``` +uv run pytest tests/test_databricks_preamble.py tests/test_task_manager.py tests/test_coda_interactive.py tests/test_mcp_server.py tests/test_replay_only_flag.py -v +``` + +Expected: All pass. If any fail, fix the implementation (not the tests). + +- [ ] **Step 8: Run ruff** + +Run: `uv run ruff check coda_mcp/task_manager.py coda_mcp/mcp_server.py tests/test_task_manager.py` +Expected: clean. + +- [ ] **Step 9: Commit** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + add coda_mcp/task_manager.py coda_mcp/mcp_server.py tests/test_task_manager.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + commit -m "feat: wire workflow_protocol flag through coda_run → create_task → wrap_prompt + +The flag defaults to True. When set, wrap_prompt inserts CAPABILITIES and +WORKFLOW PROTOCOL sections between TASK and INSTRUCTIONS in prompt.txt. +Callers can opt out via workflow_protocol=False on coda_run for purely +non-Databricks tasks." +``` + +--- + +## Task 3: Update INSTRUCTIONS section to document `info_needed` + new step labels + +The INSTRUCTIONS block in `wrap_prompt` still says only "If you failed, set status to 'failed'" — silent about `info_needed`. Update it.
+ +**Files:** +- Modify: `coda_mcp/task_manager.py:153-225` (INSTRUCTIONS portion of `wrap_prompt`'s return) +- Modify (or extend): `tests/test_task_manager.py` + +- [ ] **Step 1: Append the pinning tests to `tests/test_task_manager.py`** + +```python +def test_wrap_prompt_instructions_documents_info_needed(): + """INSTRUCTIONS section must mention the info_needed status and feedback field.""" + from coda_mcp.task_manager import wrap_prompt + + out = wrap_prompt( + task_id="t-1", + session_id="s-1", + email="user@example.com", + prompt="do the thing", + context=None, + results_dir="/tmp/r", + workflow_protocol=False, + ) + # workflow_protocol=False omits the WORKFLOW PROTOCOL section (which also mentions + # these terms), so the assertions below can only be satisfied by INSTRUCTIONS. + assert "info_needed" in out + assert "feedback" in out + + +def test_wrap_prompt_instructions_lists_new_step_labels(): + """INSTRUCTIONS section enumerates the canonical step labels emitted by the agent.""" + from coda_mcp.task_manager import wrap_prompt + + out = wrap_prompt( + task_id="t-1", + session_id="s-1", + email="user@example.com", + prompt="do the thing", + context=None, + results_dir="/tmp/r", + workflow_protocol=False, + ) + for label in ("plan", "critique_plan", "execute", "critique_execute", "synthesize", "critique_synthesize"): + assert label in out, f"Missing step label {label!r} from prompt text" +``` + +- [ ] **Step 2: Run; verify FAIL** + +Run: `uv run pytest tests/test_task_manager.py::test_wrap_prompt_instructions_documents_info_needed tests/test_task_manager.py::test_wrap_prompt_instructions_lists_new_step_labels -v` +Expected: both FAIL — with the protocol sections switched off, the current INSTRUCTIONS text mentions neither `info_needed`/`feedback` nor the step labels. + +- [ ] **Step 3: Update the INSTRUCTIONS section in `wrap_prompt`** + +In `coda_mcp/task_manager.py`, find the line that says `f' Each line must be valid JSON: ...'` (currently around line 197). Replace the entire INSTRUCTIONS portion (steps 1, 2, 3) with this: + +```python +f"INSTRUCTIONS:\n" +f"1.
As you work, append progress lines to {results_dir}/status.jsonl\n" +f' Each line must be valid JSON: {{"step": "label", "message": "what you are doing"}}\n' +f" Canonical step labels (use these when the workflow protocol is active):\n" +f" plan, critique_plan, execute_, critique_execute,\n" +f" synthesize, critique_synthesize, info_needed, failed\n" +f"\n" +f"2. When you are COMPLETELY DONE, write a SINGLE FILE at this exact path:\n" +f" {results_dir}/result.json\n" +f" It must contain this JSON structure (status field has four allowed values):\n" +f" {{\n" +f' "status": "completed" | "failed" | "info_needed" | "needs_approval",\n' +f' "summary": "one paragraph describing what you did or why you stopped",\n' +f' "feedback": "REQUIRED if status=info_needed — what context the caller must add",\n' +f' "files_changed": ["list", "of", "file", "paths"],\n' +f' "artifacts": {{}},\n' +f' "errors": []\n' +f" }}\n" +f" - status=\"completed\": you finished the task.\n" +f" - status=\"failed\": unrecoverable hard error; describe in errors[].\n" +f" - status=\"info_needed\": you are blocked because something the CALLER must\n" +f" supply is missing. The feedback field is REQUIRED and must precisely\n" +f" name what is missing. The caller will resubmit with more context.\n" +f" - status=\"needs_approval\": you have a destructive action ready but need\n" +f" explicit caller approval before executing. See SAFETY section.\n" +f" IMPORTANT: result.json is a FILE not a directory. Write it with:\n" +f" echo '{{...}}' > {results_dir}/result.json\n" +f"\n" +f"3. If you delegate to a sub-agent, update status.jsonl with delegation steps.\n" +f"\n" +``` + +The block above replaces the OLD INSTRUCTIONS steps 1-3 ENTIRELY. The SAFETY section below it stays unchanged. + +- [ ] **Step 4: Run; verify GREEN** + +Run: `uv run pytest tests/test_task_manager.py -v` +Expected: all task_manager tests pass. 
+ +Run: `uv run pytest tests/test_databricks_preamble.py tests/test_task_manager.py tests/test_coda_interactive.py -v` +Expected: still green across the board. + +- [ ] **Step 5: Ruff check** + +Run: `uv run ruff check coda_mcp/task_manager.py tests/test_task_manager.py` +Expected: clean. + +- [ ] **Step 6: Commit** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + add coda_mcp/task_manager.py tests/test_task_manager.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + commit -m "feat: document info_needed status and canonical step labels in INSTRUCTIONS + +The INSTRUCTIONS section of prompt.txt now enumerates the four allowed +result.json status values (completed, failed, info_needed, needs_approval), +describes when to use each, and lists the canonical status.jsonl step +labels emitted by the workflow protocol." +``` + +--- + +## Task 4: Update surfaces — counts dict, get_result docstring, MCP instructions paragraph (TDD) + +Three small surface updates that together let upstream callers understand the new statuses. 
+ +**Files:** +- Modify: `coda_mcp/mcp_server.py:551-559` (counts dict in `coda_inbox`) +- Modify: `coda_mcp/mcp_server.py:573-584` (`coda_get_result` docstring) +- Modify: `coda_mcp/mcp_server.py:52-99` (FastMCP `instructions=` block) +- Create: `tests/test_inbox_status_passthrough.py` + +- [ ] **Step 1: Create the test file `tests/test_inbox_status_passthrough.py`** + +```python +"""Tests covering counts dict, coda_get_result docstring, and MCP instructions +all reflect the new info_needed / needs_approval terminal statuses.""" +import asyncio +import json + + +def test_mcp_instructions_mention_info_needed(): + """Server-level MCP instructions teach calling LLMs about info_needed.""" + from coda_mcp import mcp_server + + txt = mcp_server.mcp.instructions + assert "info_needed" in txt + assert "needs_approval" in txt + assert "feedback" in txt + + +def test_coda_get_result_docstring_mentions_info_needed(): + """coda_get_result docstring lists info_needed / needs_approval alongside completed/failed.""" + from coda_mcp import mcp_server + + doc = (mcp_server.coda_get_result.__doc__ or "").lower() + assert "info_needed" in doc + assert "needs_approval" in doc + + +def test_inbox_counts_dict_includes_new_statuses(monkeypatch): + """coda_inbox counts dict has info_needed and needs_approval keys.""" + from coda_mcp import mcp_server + + fake_tasks = [ + {"task_id": "t1", "session_id": "s1", "status": "running"}, + {"task_id": "t2", "session_id": "s2", "status": "completed"}, + {"task_id": "t3", "session_id": "s3", "status": "failed"}, + {"task_id": "t4", "session_id": "s4", "status": "info_needed"}, + {"task_id": "t5", "session_id": "s5", "status": "needs_approval"}, + {"task_id": "t6", "session_id": "s6", "status": "info_needed"}, + ] + + monkeypatch.setattr( + mcp_server.task_manager, "list_all_tasks", + lambda email, status_filter=None: list(fake_tasks), + ) + # _read_session_safe is called inside the loop; return None so no viewer_url is added. 
+ monkeypatch.setattr( + mcp_server.task_manager, "_read_session_safe", lambda sid: None, + ) + + result_str = asyncio.run(mcp_server.coda_inbox(email="u@e")) + result = json.loads(result_str) + counts = result["counts"] + + assert counts["running"] == 1 + assert counts["completed"] == 1 + assert counts["failed"] == 1 + assert counts["info_needed"] == 2 + assert counts["needs_approval"] == 1 +``` + +- [ ] **Step 2: Run; verify FAIL** + +Run: `uv run pytest tests/test_inbox_status_passthrough.py -v` +Expected: all 3 tests FAIL — instructions don't mention info_needed, docstring doesn't, and counts dict has only 3 keys. + +- [ ] **Step 3: Update the FastMCP `instructions=` block in `coda_mcp/mcp_server.py:52-99`** + +Find the `mcp = FastMCP(...)` constructor (starts around line 50). Inside the `instructions=` argument is a multi-line string concatenation. Locate the existing "CHAINING" paragraph (the one that says `"CHAINING: pass previous_session_id ..."`). After that paragraph and BEFORE the "SHARE THE REPLAY URL" paragraph, insert this new paragraph: + +```python + "INFO_NEEDED HANDOFF: When coda_inbox shows a task with status='info_needed', " + "the agent could not proceed because of missing context. Call coda_get_result " + "to read the 'feedback' field — it tells you exactly what the agent needs (a " + "table name, a decision, a clarification). Add that context to the prompt and " + "resubmit via coda_run with previous_session_id set to the original task's " + "session_id so the agent has the prior attempt's context. 'needs_approval' is " + "similar but means the agent has a destructive plan and is waiting for the " + "caller's explicit go/no-go.\n\n" +``` + +Make sure the trailing newlines match the surrounding string concatenation (the other paragraphs end with `\n\n`). 
+ +- [ ] **Step 4: Update the counts dict in `coda_inbox` (lines 551-559)** + +Find this block: + +```python +counts = {"running": 0, "completed": 0, "failed": 0} +for t in tasks: + s = t.get("status", "") + if s in counts: + counts[s] += 1 + elif s == "done": + counts["completed"] += 1 + elif s == "timeout": + counts["failed"] += 1 +``` + +Change the first line to add the two new keys: + +```python +counts = { + "running": 0, + "completed": 0, + "failed": 0, + "info_needed": 0, + "needs_approval": 0, +} +for t in tasks: + s = t.get("status", "") + if s in counts: + counts[s] += 1 + elif s == "done": + counts["completed"] += 1 + elif s == "timeout": + counts["failed"] += 1 +``` + +The aliasing branches (`done`, `timeout`) are unchanged. + +- [ ] **Step 5: Update `coda_get_result` docstring (line ~579)** + +Find the docstring of `coda_get_result`: + +```python +"""Retrieve the structured result of a completed task. + +Call this AFTER coda_inbox shows a task as "completed" or "failed". + +Returns JSON with ``task_id``, ``session_id``, ``status``, ``summary`` +(what was done), ``files_changed`` (list of modified files), +``artifacts`` (job IDs, commit hashes, etc.), and ``errors`` (if any). +""" +``` + +Change to: + +```python +"""Retrieve the structured result of a completed task. + +Call this AFTER coda_inbox shows a task as "completed", "failed", +"info_needed", or "needs_approval". + +Returns JSON with ``task_id``, ``session_id``, ``status``, ``summary`` +(what was done or why the agent stopped), ``files_changed`` (list of +modified files), ``artifacts`` (job IDs, commit hashes, etc.), +``errors`` (if any), and — when status is "info_needed" — ``feedback`` +(a precise description of what context the caller must add before +resubmitting). +""" +``` + +- [ ] **Step 6: Run the new tests; verify GREEN** + +Run: `uv run pytest tests/test_inbox_status_passthrough.py -v` +Expected: 3 passed. 
+ +- [ ] **Step 7: Run target-area tests to verify no regression** + +Run: `uv run pytest tests/test_inbox_status_passthrough.py tests/test_coda_interactive.py tests/test_databricks_preamble.py tests/test_task_manager.py tests/test_mcp_server.py tests/test_replay_only_flag.py -v` +Expected: all pass. + +- [ ] **Step 8: Ruff** + +Run: `uv run ruff check coda_mcp/mcp_server.py tests/test_inbox_status_passthrough.py` +Expected: clean. + +- [ ] **Step 9: Commit** + +```bash +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + add coda_mcp/mcp_server.py tests/test_inbox_status_passthrough.py +git -c user.email=datasciencemonkey@gmail.com -c user.name="Sathish Gangichetty" \ + commit -m "feat: surface info_needed + needs_approval in inbox counts, get_result doc, MCP instructions + +Three surfaces updated so calling LLMs and dashboards know about the +two soft terminal statuses: +- coda_inbox counts dict gains info_needed and needs_approval keys. +- coda_get_result docstring lists all four valid statuses and the + feedback field that accompanies info_needed. +- FastMCP server-level instructions gain an INFO_NEEDED HANDOFF + paragraph teaching upstream LLMs to read 'feedback' and resubmit + with previous_session_id for the chained context." +``` + +--- + +## Task 5: Push branch and update PR #67 description + +**Files:** +- None (remote/PR update only) + +- [ ] **Step 1: Verify branch state** + +```bash +git status +git log --oneline origin/feat/coda-mcp-interactive-handoff..HEAD +``` + +Expected: working tree clean. The new commits since the last push include the spec, the spec-critic fixes, the plan, and the four implementation commits (Tasks 1-4). + +- [ ] **Step 2: Push** + +```bash +git push origin feat/coda-mcp-interactive-handoff +``` + +Expected: fast-forward push. 
+ +- [ ] **Step 3: Append a follow-up section to PR #67 body** + +Read the current body: + +```bash +gh pr view 67 --json body -q .body > /tmp/pr67-body.md +``` + +Append this section: + +``` +--- + +## Follow-up #2: Workflow protocol + Databricks orientation + +`coda_run` now injects two new sections into `prompt.txt`: +- **CAPABILITIES** — tells hermes about the Databricks CLI (pre-authed), the 16 Databricks skills under `~/.claude/skills/`, and the DeepWiki / Exa / CoDA MCP servers. +- **WORKFLOW PROTOCOL** — imposes a 3-phase pipeline (PLAN → EXECUTE → SYNTHESIZE) with a critique step after each phase (self-review or sub-agent — agent's choice). Max 2 iterations per phase to keep token cost bounded. + +New terminal `result.json` status `"info_needed"` with a required `feedback` field gives the calling client a structured iteration loop when the agent is blocked. The existing `"needs_approval"` status is preserved with explicit disambiguation: `info_needed` = "caller must add context"; `needs_approval` = "caller must approve a destructive action". + +**Three surfaces updated** so upstream LLMs know about the new statuses: +- `coda_inbox` counts dict gains `info_needed` and `needs_approval` keys. +- `coda_get_result` docstring lists all four valid statuses + the new `feedback` field. +- FastMCP server-level instructions gain an INFO_NEEDED HANDOFF paragraph. + +**Flag:** `coda_run(... workflow_protocol=True)` is the default. Set False to skip both new sections for non-Databricks tasks. + +**Artifacts:** +- Spec: `docs/superpowers/specs/2026-05-28-coda-run-workflow-protocol-design.md` +- Plan: `docs/superpowers/plans/2026-05-28-coda-run-workflow-protocol.md` +``` + +Then update the PR body: + +```bash +gh pr edit 67 --body-file /tmp/pr67-body.md +``` + +Or if gh's TLS bug hits on this machine, fall back to curl + REST per the prior follow-up. + +- [ ] **Step 4: Confirm** + +Run `gh pr view 67 --json body -q .body | tail -30` and verify the new section appears. 
+ +--- + +## Self-review of this plan against the spec + +**Spec section 1 — Goal.** Task 1 creates the module; Task 2 wires the flag; Task 3 updates INSTRUCTIONS; Task 4 surfaces the statuses. ✓ + +**Spec section "Components" 1 (databricks_preamble.py).** Task 1 creates it with all three exports. ✓ + +**Components 2 + 3 (CAPABILITIES + WORKFLOW PROTOCOL content).** Task 1's module has the verbatim text from the spec. ✓ + +**Components 4 (expanded INSTRUCTIONS).** Task 3 covers it. ✓ + +**Components 5 (task_manager changes).** Task 2 covers wrap_prompt and create_task. ✓ + +**Components 6 (mcp_server.coda_run changes).** Task 2 covers it. ✓ + +**Components 7 (counts dict + get_result docstring).** Task 4 covers both. ✓ + +**Components 7a (MCP instructions string).** Task 4 covers it. ✓ + +**Components 7b (watcher interaction).** Documented in spec as no-code-change. Plan does not need a task for it. + +**Testing strategy.** Every test listed in the spec maps to a task step in Task 1 (`test_databricks_preamble.py`), Task 2 (extension of `test_task_manager.py`), Task 3 (further extension of same), Task 4 (`test_inbox_status_passthrough.py`). ✓ + +**Acceptance criteria 1-8.** All mapped. ✓ + +**Placeholder scan:** No TBD/TODO. Every step has explicit code or commands. + +**Type consistency:** `workflow_protocol: bool = True` used uniformly across all three call sites (wrap_prompt, create_task, coda_run). Step labels (`plan`, `critique_plan`, etc.) match between Task 1's module text, Task 3's INSTRUCTIONS update, and the spec. + +**Risk: Task 2 Step 5 might leave the `_write_json` mock or other internal helpers' signatures stale.** The test `test_create_task_forwards_workflow_protocol_to_wrap_prompt` monkeypatches `wrap_prompt`, `_session_dir`, `_task_dir`, `_write_json`, `os.makedirs`, and `builtins.open`. If `create_task` calls additional helpers in production, the test will fail with cryptic AttributeError.
If that happens during execution, add the missing helpers to the monkeypatch list — the test's intent is to verify ONLY the flag pass-through, not the file-system side effects. diff --git a/docs/superpowers/specs/2026-05-27-coda-mcp-live-session-url-design.md b/docs/superpowers/specs/2026-05-27-coda-mcp-live-session-url-design.md new file mode 100644 index 0000000..c82bdc6 --- /dev/null +++ b/docs/superpowers/specs/2026-05-27-coda-mcp-live-session-url-design.md @@ -0,0 +1,447 @@ +# CoDA MCP Live Session URL — Design + +**Date:** 2026-05-27 +**Branch:** `feat/coda-mcp-server` +**Status:** Spec approved by user; ready for implementation plan +**Related PR:** databrickslabs/coding-agents-databricks-apps#64 (parent feature) + +## 1. Problem + +`coda_run` is fire-and-forget today: it returns `{task_id, session_id, status: "running"}` and the calling MCP client (Genie Code, Claude Desktop, Cursor) has no way to surface progress to the user. The user only sees a structured `result.json` after the task completes via `coda_inbox`/`coda_get_result`. Status messages from `status.jsonl` are coarse-grained. There is no way to watch hermes execute live, intervene mid-task, or reconstruct what happened after the fact. + +The Flask app side already has a fully working real-time terminal UI (xterm.js + Socket.IO + HTTP polling fallback) that knows how to attach to any active PTY by id. The MCP server already spawns those PTYs to run hermes. **The two halves are not connected by a URL.** + +## 2. Goal + +Give every `coda_run` (and existing tasks listed via `coda_inbox` / fetched via `coda_get_result`) a `viewer_url` that: + +- **During execution** — opens the existing terminal UI attached to that task's live PTY. The user can watch hermes work in real time and type into the session if they want to redirect or take over (single-user app; this is intentional). 
+- **For ~5 minutes after completion** — keeps the PTY alive so a viewer who joined mid-task isn't yanked the instant `result.json` is written. Heartbeats from an active viewer do not extend this window — the grace timer is fixed. +- **Indefinitely after PTY closes** (within the 24h `TASK_TTL_S`) — serves a static "replay" rendering of the captured terminal transcript so a user can scroll the full execution history from `coda_inbox`. + +Out of scope (deferred to separate specs): configurable agent selection (hermes vs claude-code vs codex), multi-user attribution, asciinema-style timed replay. + +## 3. Architecture + +``` +┌────────────────────────────────────────────────────────────────┐ +│ MCP client Browser │ +│ (Genie Code, Claude Desktop) (single user, app URL) │ +└──────────┬──────────────────────────────────┬──────────────────┘ + │ tools/call coda_run │ GET /?session= + ▼ ▼ + ┌───────────────┐ ┌─────────────────────┐ + │ coda_mcp /mcp │ │ Flask /static + WS │ + │ +viewer_url │ │ /api/session/attach│ + └───────┬───────┘ └──────────┬──────────┘ + │ │ + ▼ ▼ + ┌──────────────────────────────────────────────────────┐ + │ Flask app (single process) │ + │ sessions[] → {fd, buffer, transcript_fh, │ + │ grace: bool} │ + │ read_pty_output thread: │ + │ fd → buffer → socketio emit (room=) │ + │ fd → transcript.log (NEW: tee, flush per write) │ + └──────────────────────────────────────────────────────┘ + │ │ + │ writes (chmod 600) │ reads when PTY gone + ▼ ▼ + ~/.coda/sessions/{sess}/tasks/{task}/transcript.log +``` + +Everything between the MCP server and the Flask app already exists. The feature is mostly plumbing: + +1. **Tee PTY output** to `transcript.log` (on disk, per task, chmod 0600, 10 MB soft cap). +2. **Defer PTY close** on task completion by 5 minutes (`threading.Timer`) so live viewers can finish reading. +3. **Build `viewer_url`** in MCP tool responses by capturing `X-Forwarded-Host` from the inbound request. +4. 
**Teach the SPA** to read `?session=` on load and to render replay mode when the PTY is gone but a transcript exists. + +## 4. Components + +### 4.1 `app.py::sessions[pty_id]` dict (additive) + +Four new keys, all optional/defaulting: + +- `transcript_path: str | None` — absolute path to the tee target. +- `transcript_fh: BinaryIO | None` — open file handle owned by `read_pty_output`. +- `transcript_bytes: int` (default 0) — running count to enforce the 10 MB cap. +- `grace: bool` (default False) — set `True` when `_watch_task` schedules deferred close. Used by the concurrency check to exempt this slot. + +No removals. No semantic changes to existing keys. + +### 4.2 `app.py::mcp_create_pty_session(label, transcript_path=None)` + +New optional kwarg. When provided: + +- `os.makedirs(os.path.dirname(transcript_path), exist_ok=True)` +- Open file: `fh = open(transcript_path, "ab", buffering=0)` (binary append, unbuffered) +- `os.fchmod(fh.fileno(), 0o600)` immediately +- Store `transcript_path` and `transcript_fh` on the session dict +- If open fails: log error, set both to `None`, continue (live PTY still works) + +### 4.3 `app.py::read_pty_output` (additive) + +After the existing buffer append and Socket.IO emit, if a transcript handle is present, write under the per-session lock to prevent races against `terminate_session` (which may close the handle from the Timer thread): + +```python +with session_lock: + fh = session.get("transcript_fh") + written = session.get("transcript_bytes", 0) + if fh is not None: + remaining = TRANSCRIPT_CAP_BYTES - written + if remaining > 0: + chunk = output[:remaining] + try: + fh.write(chunk) + fh.flush() + session["transcript_bytes"] = written + len(chunk) + if len(chunk) < len(output): + fh.write(b"\n[transcript truncated at 10MB]\n") + fh.flush() + fh.close() + session["transcript_fh"] = None + except (OSError, ValueError) as exc: + logger.warning("transcript write failed for %s: %s", session_id, exc) + try: fh.close() + except 
Exception: pass + session["transcript_fh"] = None +``` + +`TRANSCRIPT_CAP_BYTES = 10 * 1024 * 1024`. + +**Invariants** (documented for future maintainers): + +- `transcript_fh` is opened in `mcp_create_pty_session`, written exclusively by `read_pty_output`, and closed by either (a) `read_pty_output` on cap/error or (b) `terminate_session` on PTY teardown. All three sites read or reassign `transcript_fh` only while holding `session["lock"]`. +- `transcript_bytes` is incremented only by `read_pty_output`. Single-writer; reads from other threads must hold `session["lock"]`. +- `ValueError` is caught alongside `OSError` to defend against a tiny window where `terminate_session` closes the handle between the `if fh is not None` check and the actual `fh.write` call — the lock prevents this, but the catch is belt-and-suspenders. + +### 4.4 `app.py::terminate_session` (additive) + +Detach the transcript file handle (swap it to `None`) under the per-session lock before the existing fd close, then close the detached handle after the lock is released. The swap-to-`None` is the synchronization point that lets `read_pty_output` notice the handle is gone on its next iteration: + +```python +sess = sessions.get(session_id) +if sess is not None: + with sess["lock"]: + fh = sess.get("transcript_fh") + sess["transcript_fh"] = None # swap first, then close + if fh is not None: + try: fh.close() + except Exception: pass +``` + +(The actual close happens outside the lock to avoid holding it across a potential blocking I/O on a slow filesystem.) + +### 4.5 `app.py::MAX_CONCURRENT_SESSIONS` check (modified) + +At the `if len(sessions) >= MAX_CONCURRENT_SESSIONS` checkpoints in `create_session()` and `mcp_create_pty_session()`, replace the raw length check with a filtered count that excludes grace-period PTYs: + +```python +active = sum(1 for s in sessions.values() if not s.get("grace")) +if active >= MAX_CONCURRENT_SESSIONS: ... 
+``` + +`cleanup_stale_sessions` itself is **unchanged** — it still treats grace-period PTYs like any other session, but the 24h `SESSION_TIMEOUT_SECONDS` is so long the reaper never wins the race against the 5-min Timer. + +`MAX_CONCURRENT_SESSIONS` default stays at 5. + +### 4.6 `coda_mcp/mcp_server.py::_watch_task` (modified) + +Both completion and timeout paths replace immediate `_close_pty_for_session(session_id)` with: + +```python +session_data = task_manager._read_session(session_id) +pty_session_id = session_data.get("pty_session_id") +if pty_session_id and _app_close_session is not None: + _mark_grace(pty_session_id) # sets sessions[pty_id]["grace"] = True + _bump_last_poll(pty_session_id, GRACE_PERIOD_S) # defensive against reaper + threading.Timer( + GRACE_PERIOD_S, + _app_close_session, + args=(pty_session_id,), + ).start() +``` + +`GRACE_PERIOD_S = 300` (5 minutes), defined as a module constant for testability. `_mark_grace` and `_bump_last_poll` are two new hook callbacks wired through `set_app_hooks()` alongside the existing three — consistent with the current pattern (no direct Flask imports from the MCP module). + +The snippet above is abbreviated: the Timer must be a daemon so it doesn't block uvicorn shutdown, so the real code is `t = threading.Timer(...); t.daemon = True; t.start()` rather than a bare `.start()`. + +### 4.7 `coda_mcp/mcp_server.py::coda_run` (additive) + +When creating the PTY through `_app_create_session` (the hook for `mcp_create_pty_session`), compute the transcript path first and pass it in: + +```python +transcript_path = os.path.join( + task_manager._task_dir(session_id, task_id), + "transcript.log", +) +pty_session_id = _app_create_session( + label="hermes-mcp", + transcript_path=transcript_path, +) +``` + +(Note: `_app_create_session` signature gains the kwarg. The implementation in `app.py` is already documented above, in §4.2.) 
+ +Then build the response with the new field: + +```python +return json.dumps({ + "task_id": task_id, + "session_id": session_id, + "status": "running", + "viewer_url": _build_viewer_url(pty_session_id), # may be None +}) +``` + +Tools serialize via `json.dumps` so `None` becomes `null`. Clients that don't recognize the field will ignore it. + +### 4.8 `coda_mcp/url_builder.py` (new tiny module) + +```python +import os +from typing import Optional + +_app_url_cache: Optional[str] = None + +def capture_from_headers(host: Optional[str]) -> None: + """Called by middleware on every inbound request.""" + global _app_url_cache + if host: + _app_url_cache = host + +def build_viewer_url(pty_session_id: str) -> Optional[str]: + override = os.environ.get("CODA_APP_URL", "").strip() + if override: + base = override.rstrip("/") + elif _app_url_cache: + base = f"https://{_app_url_cache}" + else: + return None + return f"{base}/?session={pty_session_id}" +``` + +### 4.9 `coda_mcp/mcp_asgi.py` (additive middleware) + +Insert a small ASGI middleware on `mcp_starlette` (via `mcp_starlette.add_middleware(...)`) that extracts `X-Forwarded-Host` (fallback: `Host`) from every HTTP request and calls `url_builder.capture_from_headers(host)`. Both MCP requests AND inbound browser HTTP requests refresh the cache. + +**Coverage caveat** (not a problem in practice): the top-level ASGI app is `socketio.ASGIApp(sio, other_asgi_app=mcp_starlette)`, so `/socket.io/` traffic is intercepted by socketio *before* it reaches `mcp_starlette` and therefore never hits this middleware. This is fine because (a) the user always loads the SPA via plain HTTP first (which refreshes the cache), and (b) every `coda_run` MCP call is a plain HTTP POST to `/mcp` (also through the middleware). The cache is hot by the time any tool needs the URL. 
+ +```python +class AppUrlCaptureMiddleware: + def __init__(self, app): self.app = app + async def __call__(self, scope, receive, send): + if scope["type"] == "http": + headers = dict(scope.get("headers") or []) + host = headers.get(b"x-forwarded-host") or headers.get(b"host") + if host: + url_builder.capture_from_headers(host.decode()) + await self.app(scope, receive, send) +``` + +### 4.10 `coda_mcp/task_manager.py::find_task_dir_by_pty_session` (new) + +```python +_pty_lookup_cache: dict[str, tuple[str, float]] = {} # pty_id -> (task_dir, ts) +_PTY_LOOKUP_TTL = 60.0 # seconds + +def find_task_dir_by_pty_session(pty_session_id: str) -> str | None: + """Find the task dir whose session.json carries this pty_session_id.""" + now = time.time() + cached = _pty_lookup_cache.get(pty_session_id) + if cached and (now - cached[1]) < _PTY_LOOKUP_TTL: + return cached[0] + # Scan SESSIONS_DIR + if not os.path.isdir(SESSIONS_DIR): + return None + for sess_name in os.listdir(SESSIONS_DIR): + sess_file = os.path.join(SESSIONS_DIR, sess_name, "session.json") + try: + with open(sess_file) as f: + data = json.load(f) + except (OSError, json.JSONDecodeError): + continue + if data.get("pty_session_id") != pty_session_id: + continue + # The session has a current_task or completed_tasks; pick the most recent. + candidate = data.get("current_task") or ( + data["completed_tasks"][-1] if data.get("completed_tasks") else None + ) + if candidate: + tdir = os.path.join(SESSIONS_DIR, sess_name, "tasks", candidate) + _pty_lookup_cache[pty_session_id] = (tdir, now) + return tdir + return None +``` + +TTL handles the rename/close case without manual invalidation. + +**Invariant**: CoDA MCP sessions are ephemeral — one task per session (see `task_manager.create_session` then `complete_task` which sets `current_task=None` and appends to `completed_tasks`). This function therefore returns the right task dir for the lifetime of the URL. 
If the lifecycle ever changes to allow task reuse within a single session, this function must be revisited to pick the *active or grace-period* task rather than `completed_tasks[-1]`. + +### 4.11 `app.py::attach_session` endpoint (additive) + +After the existing `_get_session()` lookup, add a fallback: + +```python +sess = _get_session(session_id) +if not sess or sess.get("exited"): + # NEW: try transcript replay + tdir = task_manager.find_task_dir_by_pty_session(session_id) + if tdir: + transcript = os.path.join(tdir, "transcript.log") + if os.path.isfile(transcript): + with open(transcript, "rb") as f: + content = f.read() + return jsonify({ + "session_id": session_id, + "label": "hermes-mcp (replay)", + "output": [content.decode("utf-8", errors="replace")], + "replay": True, + "process": None, + "created_at": None, + }) + return jsonify({"error": "Session not found or exited"}), 404 +``` + +The response shape (`output: [str]`, `replay: true|absent`, plus existing keys) is **NOT** consumed by the existing `_doAttach` — that function deliberately ignores `data.output` and forces a SIGWINCH redraw of the live application (`static/index.html:1339-1357`, comment at line 1347: "We skip buffer replay because it contains raw escape sequences that produce garbled output"). The replay-mode response is consumed by a new SPA function `_doReplay` described in §4.12, which writes the bytes directly into xterm. + +### 4.12 `static/index.html` (~50-70 LoC) + +Four additions: + +1. **Boot-time URL parse** — before the existing session-picker fetch, check `new URLSearchParams(location.search).get("session")`. If absent → existing flow. If present → call `POST /api/session/attach` once and branch on the response: + - 200 with `replay: true` → call **`_doReplay`** (new, described below). Skip `_doAttach`. Do NOT emit `join_session`. Do NOT wire `terminal_input` to the WS. 
+ - 200 without `replay` → call the existing `_doAttach(term, sessionId)` and the existing `socket.emit('join_session', { session_id })` path. (Reusing `_doAttach` is correct here because the *live* PTY is running an interactive app, and SIGWINCH-redraw is the right behavior.) + - 404 → render a small in-page fallback: "session expired or never existed" + a button to navigate to `/`. + +2. **`_doReplay(term, sessionId, content)` — new function** that handles static replay rendering. It cannot route through `_doAttach` because `_doAttach` discards `data.output` (it relies on a running app to redraw via SIGWINCH; replay mode has no running app). Implementation: + + ```js + async function _doReplay(term, sessionId, content) { + // Chunk the write to avoid main-thread jank on multi-MB transcripts. + // xterm.js write() is internally batched, but a single 10MB call + // still blocks until the parser drains. 64KB slices with rAF gives + // the browser a chance to repaint between chunks. + const CHUNK = 64 * 1024; + for (let i = 0; i < content.length; i += CHUNK) { + term.write(content.slice(i, i + CHUNK)); + await new Promise(r => requestAnimationFrame(r)); + } + // Mount a small "Task completed — viewing replay" banner above the pane. + // No input handler, no WS subscription, no heartbeat for this session id. + } + ``` + +3. **Replay-mode pane behavior** — the tab gets a "(replay)" badge. The xterm input handler is not wired. The session is NOT included in the heartbeat session_ids list (the PTY is dead; heartbeats would 404 the lookup). + +4. **History/URL hygiene** — when the user closes a pane that was opened via `?session={pty_session_id}`, call `history.replaceState({}, '', '/')` so a refresh doesn't re-attach. + +**Estimate revised**: 50-70 LoC including the new `_doReplay` and the 404 fallback. Architecturally the most "real" change in the spec — the rest of the codebase shifts are mostly additive. 
+ +### 4.13 MCP tool `instructions` update (`coda_mcp/mcp_server.py`) + +Append one paragraph to the existing `instructions` block on the FastMCP instance: + +> SHARE THE LIVE URL: When `coda_run` returns a `viewer_url` field, mention it to the user in plain text (e.g. "you can watch progress at {viewer_url}"). The URL is safe to share — it points to the same Databricks App the user is already authenticated against. Do this on the FIRST mention of the task and any time the user asks where the task is or how to see it. + +## 5. Data flow + +### 5.1 Submit + +`MCP client → /mcp coda_run → task_manager.create_session → mcp_create_pty_session(transcript_path) → task_manager.create_task → mcp_send_input("hermes -z ...") → _watch_task thread spawned → return {task_id, session_id, status: "running", viewer_url}`. + +### 5.2 Live view + +`Browser → GET /?session={pty_session_id} → SPA reads ?session → POST /api/session/attach → live output buffer returned → WS join_session → live stream from read_pty_output → terminal_input writes to fd → heartbeat keeps the (already non-grace) PTY alive`. + +### 5.3 Grace window + +At T+0 hermes writes `result.json`. `_watch_task` calls `task_manager.complete_task` (disk status → closed), marks the PTY `grace=True`, bumps `last_poll_time`, schedules `Timer(300, _app_close_session)`. A viewer present at T+0 keeps streaming for up to 5 min. At T+300 the Timer SIGHUPs bash, `read_pty_output` sees EOF, flushes and closes the transcript handle, removes the session entry. + +### 5.4 Replay + +`Browser → GET /?session={pty_session_id} → POST /api/session/attach → PTY not found → find_task_dir_by_pty_session → read transcript.log → return {output: [bytes], replay: true} → SPA renders bytes, no WS subscription`. + +## 6. Error handling + +| Failure | Behavior | +|---|---| +| `CODA_APP_URL` and `X-Forwarded-Host` both absent | `viewer_url: null`. One startup WARN. | +| Transcript open fails | `transcript_fh = None`. Live PTY works; replay disabled. 
| +| Transcript write fails mid-stream | Log once per session, close handle, set `transcript_fh = None`, keep reading PTY. | +| 10 MB cap hit | Write marker, close handle, set `transcript_fh = None`. PTY keeps streaming live (no further teeing). | +| Timer fires after manual close | `terminate_session` is re-entrant; `sessions.pop(_, None)` and `os.kill` wrapped in try/except. No-op. | +| uvicorn restart during grace | In-memory state lost; old `viewer_url` falls through to transcript replay (if file exists) or 404. Acceptable. | +| Browser opens URL mid-grace, grace expires while connected | `read_pty_output` emits `session_exited` to the room. SPA shows "session ended" banner. User reloads → replay mode. | +| Browser opens URL after grace AND transcript reaped | 404. SPA shows expired page. | +| `MAX_CONCURRENT_TASKS` reached | Unchanged "concurrency limit" error. Grace PTYs don't count toward this (disk status = closed). | +| `MAX_CONCURRENT_SESSIONS` reached among active (non-grace) | Existing 429. Grace PTYs don't count. | +| Hermes hangs (no `result.json`) | Existing `_watch_task` timeout path now also defers close via the same Timer mechanism. | + +## 7. Testing + +### 7.1 Unit + +- `coda_mcp/url_builder.py`: env override beats header capture; `None` when both absent; trailing slash on override is stripped. +- `coda_run` returns `viewer_url` only when builder returns non-None; same for `coda_inbox` per-entry and `coda_get_result`. +- `find_task_dir_by_pty_session`: hit, miss, TTL expiry, ignores corrupt session.json. +- `_watch_task`: schedules `Timer` (mocked) with correct args on both completion and timeout paths; never calls `_app_close_session` synchronously. +- `_mark_grace` / `_bump_last_poll` set the session dict fields. + +### 7.2 Integration (`tests/test_mcp_integration.py`) + +- E2E with a stub hermes (`bash -c 'echo hello; touch results/result.json; echo done'`): + - `transcript.log` contains "hello". + - At T+1s, PTY still alive (grace). 
+ - At T+(GRACE+1)s (test uses a 2s grace via patched constant), PTY closed; transcript file persists. + - `/api/session/attach` returns `replay: true` after close; live mode before. +- Concurrency: submit `MAX_CONCURRENT_TASKS` tasks, complete them all (grace begins), submit `MAX_CONCURRENT_TASKS` more — all succeed (grace PTYs don't block). +- 10 MB cap: feed a hermes stub that prints `>10MB` of output; transcript file is exactly `10MB + marker`; PTY keeps running. + +### 7.3 SPA + +- New `tests/test_frontend_deeplink.spec.js` (Playwright if available; else manual checklist): + - `/?session={live_pty_id}` (PTY still running) → live attach, WS room joined, terminal renders. + - `/?session={closed_pty_id}` (PTY gone, transcript on disk) → replay rendered, no WS join, banner visible. + - `/?session={unknown_id}` (no PTY, no transcript) → expired page. + - Closing the pane drops `?session=` from `history`. + +### 7.4 Manual smoke + +- Deploy to `mcp-test-coda` app, connect Genie Code, run a `coda_run`, click `viewer_url` from the chat response, confirm live stream + grace + replay. +- `chmod 600` check: `ls -la ~/.coda/sessions/*/tasks/*/transcript.log` on deployed pod. +- Confirm `viewer_url` is `null` (per §4.7) on a local uvicorn boot without `CODA_APP_URL` and no inbound request yet. + +## 8. Open questions (resolved) + +- ~~Read-only vs interactive viewer?~~ → Interactive (full terminal). +- ~~Grace period mechanism?~~ → `threading.Timer(300, _close)`. +- ~~Replay storage?~~ → Tee to `transcript.log`. +- ~~Configurable agent?~~ → Deferred to a separate spec. +- ~~Base URL resolution?~~ → `CODA_APP_URL` env override → `X-Forwarded-Host` capture (officially provided by Databricks Apps). +- ~~Concurrency under grace?~~ → Exempt grace PTYs from `MAX_CONCURRENT_SESSIONS`. Cap stays at 5. + +## 9. Risks accepted + +- **Transcript on disk contains secrets** if hermes prints them. Single-user app, file is mode 0600, cleaned with the rest of the session at 24h TTL. 
Documented in `docs/mcp-v2-background-execution.md`. 
+- **5 min grace + 0 second active task** means a viewer who opens the URL late may still race the close. Acceptable; replay mode covers them. +- **Browser tabs can interact with the same PTY simultaneously.** Already true for the existing terminal UI; no new exposure. + +## 10. Surface summary + +| Surface | LoC est | Risk | +|---|---|---| +| `app.py` (4 functions touched) | ~60 | Low — additive, no semantic shifts | +| `coda_mcp/mcp_server.py` (2 functions + instructions) | ~40 | Low | +| `coda_mcp/url_builder.py` (new) | ~25 | Low | +| `coda_mcp/mcp_asgi.py` (middleware) | ~15 | Low | +| `coda_mcp/task_manager.py` (new lookup) | ~30 | Low | +| `static/index.html` | ~50-70 | Medium — new boot branch + new `_doReplay` rendering path; live attach still reuses `_doAttach` | +| Tests | ~250 | — | + +**Total**: ~220-240 LoC of production code + ~250 LoC of tests. + +## 11. Next step + +Hand to `writing-plans` skill to produce an executable implementation plan with task ordering, dependencies, and verification gates. diff --git a/docs/superpowers/specs/2026-05-28-coda-interactive-broaden-source-design.md b/docs/superpowers/specs/2026-05-28-coda-interactive-broaden-source-design.md new file mode 100644 index 0000000..0b35f09 --- /dev/null +++ b/docs/superpowers/specs/2026-05-28-coda-interactive-broaden-source-design.md @@ -0,0 +1,162 @@ +# Spec: Broaden `coda_interactive` source to any Workspace folder + +**Status:** Draft, pre-critique-gate +**Date:** 2026-05-28 +**Branch:** `feat/coda-mcp-interactive-handoff` (continues PR #67) +**Amends:** `docs/superpowers/specs/2026-05-28-coda-interactive-mcp-tool-design.md` + +## Goal + +Drop the requirement that `coda_interactive`'s `workspace_path` point to a Databricks Workspace **Git Folder**. The path can be any Workspace directory — a Git Folder *or* a plain Workspace folder. The MCP tool only needs the directory to exist in the workspace; how it got there is the caller's concern. 
+ +## Why + +The original design (PR #67) used the Repos API (`client.repos.list` + `client.repos.update`) to resolve a Git Folder and optionally switch its branch before exporting. Two problems with that: + +1. **Unnecessary friction.** Users with a regular Workspace folder (uploaded via the UI, written via the Jobs API, etc.) cannot hand off to `coda_interactive` even though the underlying Workspace export API (`client.workspace.list` + `client.workspace.export`) works for *both* Git Folders and plain folders. The Repos gate excludes a valid use case for no benefit. +2. **Branch convenience overlaps with caller capabilities.** The upstream MCP caller (Genie Code, Claude Desktop) already has Databricks SDK access — if they want a specific branch checked out on a Git Folder, they can do it themselves before calling. The `branch` parameter on `coda_interactive` was duplicating capability that already lives upstream. + +Broadening the contract makes the tool surface smaller and the call site simpler. The user's framing: *"It may or may not be backed by git."* + +## Changes + +### 1. Tool signature + +**Before:** +```python +async def coda_interactive( + prompt: str, + workspace_path: str, + branch: str = "", + agent: str = "claude", + email: str = "", +) -> str: +``` + +**After:** +```python +async def coda_interactive( + prompt: str, + workspace_path: str, + agent: str = "claude", + email: str = "", +) -> str: +``` + +The `branch` parameter is removed entirely. If the caller wants a Git Folder on a specific branch, they switch it themselves before calling. + +### 2. 
Body of `coda_interactive` + +**Removed:** +- `client.repos.list(path_prefix=workspace_path)` lookup +- The exact-match filter (`next((r for r in repos if r.path == workspace_path), None)`) +- The `client.repos.update(repo_id=repo.id, branch=branch)` call + +**Added (light validation):** +A single `client.workspace.get_status(workspace_path)` call before export, to give callers a clean error when the path doesn't exist or isn't a directory. This replaces the implicit "empty export" failure mode with an explicit error. + +```python +try: + status = client.workspace.get_status(workspace_path) +except Exception as e: + return json.dumps({ + "status": "error", + "error": f"Workspace path not found: {workspace_path}: {e}", + }) + +if not _is_directory(status): + return json.dumps({ + "status": "error", + "error": f"Workspace path is not a directory: {workspace_path}", + }) +``` + +`_is_directory` already exists in `workspace_export.py` and works for both real SDK objects and mocks. Re-use it. + +### 3. Return shape + +**Removed field:** `"branch"`. + +**After:** +```json +{ + "status": "launched", + "viewer_url": "...", + "agent": "claude", + "project_dir": "/home/app/.coda/projects/pty-...", + "workspace_path": "/Workspace/Users/me@db.com/projects/feature-X", + "instructions": "Open viewer_url to attach. The agent is loaded with the project files exported from Workspace and your kickoff prompt typed. Type the agent's quit command (e.g. /quit) and then `exit` to end the session. Note: git history is NOT available in the session — files are an export, not a clone." +} +``` + +The `instructions` string is unchanged — it never claimed git history was preserved, so it stays valid for both Git Folders and plain folders. + +### 4. Caller pre-condition (spec section 1a rewrite) + +**Old contract:** "Project must be a Databricks Workspace Git Folder; commit and push to remote before calling." 
+ +**New contract:** "Project must be a directory at `workspace_path` in the Databricks Workspace. Files visible to `workspace.export` (notebooks, source files) will appear in the session. If the directory is a Git Folder and you want a specific branch, switch it on the Git Folder yourself before calling — the export is a server-side snapshot." + +### 5. INTERACTIVE HANDOFF instructions string (server-level) + +The paragraph in `coda_mcp/mcp_server.py:79` surfaced to upstream LLM callers is rewritten: + +**Before (excerpt):** +> The user's project must be a Databricks Workspace Git Folder ... commit and push any local working changes back to the Git Folder's remote before calling. + +**After:** +> The tool reads files from a directory that already exists in the Databricks Workspace (a Git Folder or a plain Workspace folder — either works). If your working files are not yet in the Workspace, upload them first (`workspace.import` via the Databricks SDK, REST, or CLI) into a folder the user can read, then pass that folder as `workspace_path`. The tool does NOT accept inline file payloads. If the directory is a Git Folder, ensure the desired branch is checked out and pushed first — the export is a server-side snapshot. + +**Why the upload-then-handoff guidance is explicit:** The full workflow this tool enables is *upstream client generates / collects working files → uploads them to a Workspace folder → calls `coda_interactive` with that folder → the user opens the viewer URL and continues live in CoDA*. The instructions string needs to make the upload step visible to the calling LLM; otherwise it might assume `coda_interactive` accepts a file payload or that the user has already wired up the Workspace folder by hand. + +## What does NOT change + +- **`export_workspace_tree` helper** — already generic. No code changes in `coda_mcp/workspace_export.py`. +- **PTY lifecycle, agent launch matrix, prompt-seed stabilization** — unchanged. 
+- **`coda_run` and other tools** — untouched. +- **Three-mode framework table** — Mode 2 column "How invoked" stays the same; the spec for it now reads "any workspace folder, Git Folder or plain." + +## Tests to update + +In `tests/test_coda_interactive.py`: + +1. **Drop:** `test_unknown_workspace_path_returns_error` if it covered the `repos.list` empty-result case → replace with a `workspace.get_status` raises case. +2. **Drop:** `test_branch_update_succeeds` and `test_branch_update_fails` — branch param is gone. +3. **Drop:** any test asserting `"branch"` in the return JSON. +4. **Update:** the happy-path test mock — remove `client.repos.list` and `client.repos.update` setup; add `client.workspace.get_status` returning a directory-typed mock. +5. **Add:** `test_plain_workspace_folder_succeeds` — covers a `workspace.get_status` returning ObjectType.DIRECTORY for a path that is NOT a Repo. Should reach the export step and succeed. +6. **Add:** `test_workspace_path_not_directory_returns_error` — `workspace.get_status` returns a FILE-typed mock; tool returns `"not a directory"` error without creating a PTY. + +Expected test count delta: ~−3 / +2 = net −1 test. + +## Tests for the SDK validation step + +Since we're relying on `client.workspace.get_status` to validate, add a mock-level test that verifies: +- A non-existent path raises an exception from `get_status` → tool returns `"Workspace path not found"` error. +- A directory path returns object_type=DIRECTORY → tool proceeds. +- A file path returns object_type=FILE → tool returns `"not a directory"` error. + +These belong in the same file as the existing tool tests. + +## Out of scope (deferred) + +- **Single-file `workspace_path`.** Not supported. If a caller wants to ship a single file, they create a directory containing it. Keeps `_export_recursive` semantics simple. 
+- **Recovering branch info from a Git Folder for the response.** Not added — caller already knows the branch state, and surfacing it in the response would be ornamental. +- **`workspace.get_status` for the export-failed cleanup path.** The existing `try/except` around `export_workspace_tree` still runs; this change does not affect cleanup. + +## Migration notes + +PR #67 is open and not yet merged — no shipped consumers depend on the `branch` parameter. Removing it is safe. The PR description should note the API change. + +## Risks + +- **A caller that passes `branch="main"`** as a keyword argument (the way MCP clients send tool arguments) will now error with `TypeError: unexpected keyword argument 'branch'`. A direct Python caller passing it positionally would not hit this `TypeError`; the value would bind to whichever parameter now occupies that slot instead. Acceptable because no consumer has shipped. The FastMCP runtime surfaces this as a tool-validation error on the caller side. +- **`workspace.get_status` adds one extra API call** to the happy path. Negligible — same network plane as the export calls that follow. + +## Acceptance criteria + +- `coda_interactive` accepts ANY Workspace directory path, Git Folder or plain. +- `coda_interactive` no longer accepts a `branch` parameter. +- The tool gives a clean error when the path doesn't exist or isn't a directory. +- All existing tests pass (after the test updates above). +- The PR description for #67 reflects the simpler contract. 
diff --git a/docs/superpowers/specs/2026-05-28-coda-interactive-mcp-tool-design.md b/docs/superpowers/specs/2026-05-28-coda-interactive-mcp-tool-design.md new file mode 100644 index 0000000..b395562 --- /dev/null +++ b/docs/superpowers/specs/2026-05-28-coda-interactive-mcp-tool-design.md @@ -0,0 +1,379 @@ +# Spec: `coda_interactive` MCP Tool + +**Status:** Draft, pre-critique-gate +**Date:** 2026-05-28 +**Branch:** `feat/coda-mcp-live-session-url` (same as Todo 1) +**Related:** `docs/superpowers/specs/2026-05-28-coda-run-replay-only-design.md` (Todo 1 — establishes the three-mode framework this spec slots into as Mode 2) + +> **Amended by:** [`docs/superpowers/specs/2026-05-28-coda-interactive-broaden-source-design.md`](2026-05-28-coda-interactive-broaden-source-design.md) — the `branch` parameter and the Git-Folder-only requirement have been removed. `coda_interactive` now accepts any Workspace directory (Git Folder or plain). The `repos.list` + `repos.update` flow described in Section 3 of this spec has been replaced by a single `workspace.get_status` directory check. The return shape no longer includes a `"branch"` key. + +## Goal + +Add a new MCP tool, `coda_interactive`, that lets an upstream MCP client (Genie Code, Claude Desktop, Cursor) hand off an in-flight coding session to a human via a CoDA viewer URL. The handoff carries: +- A **chosen coding agent** (`claude` by default; pluggable to `hermes`, `codex`, `gemini`, `opencode`) +- A **project source**: a Databricks Workspace Git Folder path, optionally on a specific branch +- A **kickoff prompt** that gets auto-typed into the agent as the first user message + +The human opens the URL, attaches to a live PTY where the agent is already loaded with the project as CWD and the prompt already typed, drives the session, and exits when done. The URL is the only handle — no `result.json`, no `coda_get_result`, no `coda_inbox` integration. 
+ +## Why + +Mode 3 (`coda_run`) is fire-and-forget batch — the MCP caller can't iterate mid-task. Mode 1 (direct web UI) requires the human to already be inside CoDA and manually wire their project. Neither covers the "I was working in Genie Code on a repo and want to continue with a coding agent inside CoDA" workflow. + +`coda_interactive` is built for that handoff. **Critically, this design uses Databricks Workspace Git Folders as the source of truth** — Coda already has Databricks authentication via its existing PAT, so no new credentials need to be configured for the tool to clone repos. The MCP caller's Git Folder in Workspace is the durable artifact that survives between local sessions and Coda sessions. + +## The Three-Mode Framework (reminder) + +See Todo 1's spec for the canonical table. This spec finalizes Mode 2: + +| Mode | How invoked | PTY tag | Lifecycle | URL semantics | +|---|---|---|---|---| +| **1. Direct launch** | User opens web UI, creates a tab | (none) | 24h idle / WS-extends | No external URL | +| **2. `coda_interactive`** *(this spec)* | MCP client calls the tool, passes the URL to a human | `replay_only=False` | 24h idle / WS-extends | Live attach | +| **3. `coda_run`** | MCP client fires the tool, URL is post-hoc replay only | `replay_only=True` | Immediate teardown on hermes -z exit | Replay only | + +## Design + +### 1. Tool signature + +```python +@mcp.tool( + annotations=ToolAnnotations( + readOnlyHint=False, + destructiveHint=False, + idempotentHint=False, + ), +) +async def coda_interactive( + prompt: str, # initial kickoff message; auto-typed as first user input + workspace_path: str, # required, e.g. "/Workspace/Users/me@db.com/projects/feature-X" + branch: str = "", # optional — if set, updates the Git Folder to this branch first + agent: str = "claude", # claude | hermes | codex | gemini | opencode + email: str = "", # X-Forwarded-Email passthrough (single-user app, kept for parity) +) -> str: + ... 
+``` + +**Return shape** (JSON string): +```json +{ + "status": "launched", + "viewer_url": "https://..aws.databricksapps.com/?session=pty-...", + "agent": "claude", + "project_dir": "/home/app/.coda/projects/pty-...", + "workspace_path": "/Workspace/Users/me@db.com/projects/feature-X", + "branch": "feature/X", + "instructions": "Open viewer_url to attach. The agent is loaded with the project files exported from Workspace and your kickoff prompt typed. Continue from there; type the agent's quit command (e.g. /quit) and then `exit` to end the session. Note: git history is NOT available in the session — files are an export, not a clone." +} +``` + +### 1a. Caller pre-condition: project must be in Databricks Workspace + +This is a contract the **upstream MCP caller** (Genie Code, Claude Desktop, etc.) is responsible for satisfying — `coda_interactive` cannot create a Git Folder, it can only consume one. + +**The caller must:** +1. Ensure the project of interest is a **Databricks Workspace Git Folder** (created via the workspace UI's "Create > Git Folder" or via the Repos API). Plain Workspace folders without a git remote backing will not work — the branch-update step has no remote to fetch from. +2. **Commit and push** any local working changes back to the Git Folder's remote (GitHub/GitLab/etc.) **before** calling `coda_interactive`. The export is a server-side snapshot — uncommitted local changes are invisible to Coda. +3. If a specific branch is needed, ensure that branch exists on the remote and is reachable by the Databricks Workspace's stored credentials for the Git Folder. + +**The MCP tool's `instructions` string surfaces this requirement to the calling LLM:** + +> Before calling `coda_interactive`, ensure the user's project is a Databricks Workspace Git Folder and that any in-progress changes have been pushed to the Git Folder's remote. The tool exports a server-side snapshot — uncommitted local changes will not appear in the Coda session. 
If unsure, prompt the user to push their changes first or pass `workspace_path` for a recently-synced Git Folder. + +This text becomes the tool's surfaced description in the FastMCP server's instruction block, alongside the existing `coda_run` guidance. + +On error: `{"status": "error", "error": "{message}"}`. No partial state — if export fails or PTY creation fails, no PTY is created and no `viewer_url` is returned. + +### 2. Agent launch matrix + +Each agent has a known interactive-launch command (verified against the deployed setup scripts): + +| `agent` value | Launch command sent to PTY | +|---|---| +| `claude` (default) | `claude\n` | +| `hermes` | `hermes chat\n` | +| `codex` | `codex\n` | +| `gemini` | `gemini\n` | +| `opencode` | `opencode\n` | + +Unknown `agent` values return an error immediately — no Workspace API call, no PTY. + +### 3. Project source: Workspace Git Folder export + +Coda's existing Databricks authentication (PAT in `DATABRICKS_TOKEN`) is sufficient for both steps. No new tokens, no `repo_token` parameter, no GitHub credential plumbing. + +Working directory on Coda: `~/.coda/projects/{pty_session_id}/`. + +**Step 3a — (Optional) Update the Git Folder to the requested branch.** Skip if `branch` is empty. + +**Side-effect note:** `repos.update(branch=...)` mutates the Git Folder's server-side state — the folder is now on the requested branch for *any* tools/processes accessing it (other notebooks, jobs, parallel `coda_interactive` calls, etc.). For Coda's single-user-app model this is acceptable: the user is the only one mutating the Git Folder. If multi-user support is ever added, this design must be revisited — likely by cloning a sibling Git Folder per session. 
+ +```python +from databricks.sdk import WorkspaceClient +w = WorkspaceClient() # picks up DATABRICKS_HOST + DATABRICKS_TOKEN from env + +# Resolve the Repos / Git Folder ID from the workspace_path +repos = w.repos.list(path_prefix=workspace_path) +repo = next((r for r in repos if r.path == workspace_path), None) +if repo is None: + return {"status": "error", "error": f"No Git Folder found at {workspace_path}"} + +# Update to the requested branch — Databricks performs the actual fetch + checkout server-side +w.repos.update(repo_id=repo.id, branch=branch) +``` + +**Step 3b — Export the file tree into Coda's local disk.** + +The Databricks Workspace API exposes a `workspace export-dir`-equivalent through the SDK: + +```python +import os +project_dir = os.path.join(os.path.expanduser("~/.coda/projects"), pty_session_id) +os.makedirs(project_dir, exist_ok=True) + +# Recursive export — files only, no `.git` directory. +# (Implementation may use the workspace.export() loop or shell out to `databricks workspace export-dir`.) +_export_workspace_tree(w, workspace_path, project_dir) +``` + +`_export_workspace_tree` is a small helper that: +1. Lists the workspace_path recursively (`w.workspace.list(workspace_path)` with recursive traversal) +2. For each file: calls `w.workspace.export(path=..., format=ExportFormat.SOURCE)` and writes the content to the local mirror +3. Preserves directory structure +4. Handles files (NOT notebooks — notebooks export to `.py`/`.ipynb` via the `SOURCE` format) + +Implementation note: if the SDK's recursive-export is awkward, fall back to shelling out: `subprocess.run(["databricks", "workspace", "export-dir", workspace_path, project_dir, "--overwrite"], check=True, capture_output=True, timeout=300)`. The CLI is preconfigured on Coda. Either approach is acceptable; the planner will pick after a small spike. + +**Important:** Only the working tree is exported. 
The `.git/` directory is NOT included — Workspace Git Folders manage git state server-side and don't expose `.git` via the API. Git history is unavailable inside the session. This trade-off is acknowledged in §7 (Out of Scope) and surfaced to the caller via the `instructions` field in the response. + +**Snapshot semantics:** `workspace.export()` reflects the **committed HEAD state** of the Git Folder — not any uncommitted changes that exist in the Databricks Workspace UI editor. If the caller's user has uncommitted edits in the Workspace UI for this Git Folder, those changes will NOT appear in the Coda session. This is the same constraint the caller pre-condition (§1a) communicates: push commits first. + +**Binary file handling:** `workspace.export(format=ExportFormat.SOURCE)` may fail (HTTP 400) on binary files (images, PDFs, compiled artifacts). The export helper must wrap each per-file export in a try/except and skip-and-log files that error out, rather than aborting the entire export. The agent in the session gets a partial tree (text/source files); the human can decide whether the missing binaries matter. + +**Empty export:** If the Workspace Git Folder is empty OR if all files are non-exportable, the project dir ends up empty after the export. The PTY is still launched (the agent will sit in an empty dir). This is acceptable — the human can investigate via the agent. + +Export timeout: 300 s (5 min). Big repos may need bumping later; not parameterizable in MVP. + +### 4. Prompt seeding + +After the PTY is created and the agent launched: + +```python +import time +# Wait briefly for the agent to initialize and present its prompt. +time.sleep(2) + +# Type the prompt into the PTY as the first user message. +_app_send_input(pty_session_id, prompt + "\n") +``` + +The 2 s delay is a pragmatic choice — agents typically print a banner + prompt within that window. If the timing misses on slow startup, the prompt still lands; the agent sees it as part of the kickoff. 
No assertion that the agent is "ready" — that's a brittle race we don't need. + +### 5. PTY + project lifecycle + +`coda_interactive` PTYs inherit Mode 1's lifecycle exactly: +- Created with `replay_only=False` +- 24h idle TTL via existing `SESSION_TIMEOUT_SECONDS = 86400` cleanup +- WS heartbeat extends while the human is attached +- Teardown via human typing `exit` (which closes bash, which EOFs the PTY) OR 24h idle + +**Cleanup hook:** `mcp_close_pty_session(pty_id)` (in `app.py`) gains a side-effect: if `~/.coda/projects//` exists, delete it (recursively) after closing the PTY. Single cleanup path means the disk lifecycle matches the PTY lifecycle — no new timer or state. + +### 6. Where this lives in the codebase + +- Modified: `coda_mcp/mcp_server.py` — add `coda_interactive` tool definition next to `coda_run`. **Also update the FastMCP `instructions` string** (currently around lines 43-70) to add a paragraph describing `coda_interactive` so calling LLMs don't treat it like `coda_run` (e.g., don't try to poll for results). The new paragraph must include: the pre-condition that the project must be a Workspace Git Folder, the contract that interactive sessions don't appear in `coda_inbox`, and a note that `coda_get_result` won't return anything for these sessions. +- Modified: `app.py` — extend `mcp_close_pty_session` to clean up the project dir; add `cwd` kwarg to `mcp_create_pty_session` so the spawned bash starts in the project dir. **Prerequisite refactor (security-relevant):** `mcp_create_pty_session`'s inline env-stripping at `app.py:1435-1441` only strips a handful of keys (CLAUDECODE, CLAUDE_CODE_SESSION, DATABRICKS_TOKEN, DATABRICKS_HOST, GEMINI_API_KEY). The HTTP `create_session` route uses `_build_terminal_shell_env(os.environ)` which ALSO strips `NPM_TOKEN`, `UV_DEFAULT_INDEX`, `UV_INDEX_*_PASSWORD`, `UV_INDEX_*_USERNAME`, and `npm_config_//*` registry credential patterns. 
Today, any MCP-created PTY (including `coda_run`'s) leaks these registry credentials to the child shell via `env`. Fix this as a prerequisite to Todo 2: refactor `mcp_create_pty_session` to call `_build_terminal_shell_env(os.environ)` instead of the inline copy. Zero behavioral impact on the happy path; closes a latent security gap. +- Modified: `coda_mcp/mcp_endpoint.py` — register `coda_interactive` in the Flask-fallback tool dispatch (parity with how `coda_run` is wired). +- New helper: `coda_mcp/workspace_export.py` — encapsulates the Workspace-tree-to-local-dir export logic. Keeps `mcp_server.py` focused on tool orchestration. +- New tests: `tests/test_coda_interactive.py` covering signature validation, branch update, export, agent allow-list, prompt seeding, cleanup on PTY close. Plus `tests/test_workspace_export.py` for the helper. Plus `tests/test_mcp_env_strip.py` (or extending an existing env-strip test file) to assert `mcp_create_pty_session` properly strips registry credentials post-refactor. + +**Implementation note on SDK calls:** `WorkspaceClient()` is constructed inside the `coda_interactive` tool function (in the server process). The SDK calls happen BEFORE `mcp_create_pty_session` is invoked, so they execute with the full server environment (including `DATABRICKS_TOKEN`). The PTY child shell's env is separately filtered via `_build_terminal_shell_env` and does NOT receive the Databricks token (which is the correct behavior — we don't want agents in the PTY to see deployer credentials). Future implementers must not move the SDK calls into the PTY subprocess. + +### 7. What does NOT change + +- `coda_run` is untouched (Todo 1 already finalized). +- `coda_inbox` and `coda_get_result` ignore `coda_interactive` PTYs (no task records get written for them). +- The Mode 1 web-UI launch path is untouched. +- `replay_only` flag plumbing from Todo 1 — `coda_interactive` passes `replay_only=False`, which is already the default. 
+- `MAX_CONCURRENT_SESSIONS` enforcement — `coda_interactive` PTYs count against the cap exactly like Mode 1 sessions do. + +## Architecture + +``` + ┌──────────────────────────────────────────┐ + │ MCP client calls coda_interactive │ + │ (prompt, workspace_path, branch, agent) │ + └────────────────┬─────────────────────────┘ + ▼ + ┌────────────────────────────────────────────────────┐ + │ Validate agent ∈ allow-list │ + │ [if branch]: w.repos.update(branch=branch) │ + │ _export_workspace_tree(w, ws_path, project_dir) │ + └────────────────────┬───────────────────────────────┘ + ▼ + ┌────────────────────────────────────────────────────┐ + │ pty_session_id = mcp_create_pty_session( │ + │ label="-interactive", │ + │ replay_only=False, │ + │ cwd=project_dir, # NEW kwarg │ + │ ) │ + │ _app_send_input(pty_session_id, "\n") │ + │ time.sleep(2) │ + │ _app_send_input(pty_session_id, prompt + "\n") │ + │ return {viewer_url, agent, ...} │ + └────────────────────┬───────────────────────────────┘ + ▼ + (Human opens viewer_url; attaches to a live PTY + already cd'd into the exported project, agent + running, kickoff prompt already typed.) + ▼ + ┌────────────────────────────────────────────────────┐ + │ Human types `/quit` (agent) and `exit` (shell), OR │ + │ 24h idle reaper fires │ + │ → mcp_close_pty_session(pty_id) │ + │ → shutil.rmtree(~/.coda/projects/pty_id/) │ + └────────────────────────────────────────────────────┘ +``` + +**New `cwd` kwarg on `mcp_create_pty_session`:** required so the PTY's bash spawns in the exported project dir. Default is the existing behavior (bash uses `$HOME`). Additive change; no other callers need updates. + +## Data flow scenarios + +**Happy path:** +1. User is working locally in their Workspace Git Folder. Pushes recent commits via the Git Folder UI or via the post-commit hook from their existing local Coda environment. +2. 
MCP client (Genie Code) calls `coda_interactive(prompt="continue debugging the auth flow", workspace_path="/Workspace/Users/me@db.com/projects/auth-feature", branch="feature/auth", agent="claude")` +3. Server validates agent; updates Git Folder to `feature/auth` via Repos API (Databricks does the git fetch); exports tree to `~/.coda/projects//`; creates PTY in that dir; launches `claude`; types the prompt. +4. Returns `viewer_url` +5. Human opens URL → attaches to live Claude session in the exported project, with kickoff prompt already in the chat +6. Human iterates with Claude; eventually exits the agent and the shell +7. PTY teardown deletes the project dir + +**Branch update failure:** +1. MCP client passes a nonexistent `branch` +2. `w.repos.update(...)` raises (Databricks API returns 4xx) +3. Server returns `{"status": "error", "error": "Failed to update Git Folder to branch X: "}` +4. No export, no PTY, no leak + +**Workspace path not found:** +1. MCP client passes a `workspace_path` that isn't a Git Folder or doesn't exist +2. The `repos.list(...)` lookup returns no match, OR the workspace API returns 404 +3. Server returns `{"status": "error", "error": "No Git Folder found at "}` +4. No PTY, no leak + +**Agent allow-list rejection:** +1. MCP client passes `agent="vim"` +2. Server returns `{"status": "error", "error": "Unknown agent: vim. Allowed: claude, hermes, codex, gemini, opencode"}` +3. No Workspace API call, no PTY + +**Concurrent-session limit:** +1. `MAX_CONCURRENT_SESSIONS` already at cap when call arrives +2. Server returns `"Maximum 5 concurrent sessions reached."` (same shape as `coda_run`) +3. No export, no PTY + +**Human never attaches:** +1. PTY sits at the agent's prompt, with the kickoff already typed +2. 24h elapses → existing idle cleanup reaps the PTY +3. Project dir deleted as part of `mcp_close_pty_session` + +**Human attaches, drives, but closes tab without exiting agent:** +1. WS heartbeat stops +2. 24h idle countdown begins +3. 
If human reopens within 24h: WS resumes, session continues +4. Else: idle cleanup, project dir cleanup, done + +## Error handling + +| Error | Returned to MCP client | Server-side cleanup | +|---|---|---| +| Unknown `agent` value | `{"status":"error","error":"Unknown agent: ..."}` | None needed | +| `workspace_path` doesn't exist / not a Git Folder | `{"status":"error","error":"No Git Folder found at "}` | None needed | +| `repos.update(branch=...)` fails (bad branch, network) | `{"status":"error","error":"Failed to update Git Folder to branch X: "}` | Remove partial project dir | +| Export fails midway (disk full, network) | `{"status":"error","error":"Failed to export workspace tree: "}` | Remove partial project dir | +| `MAX_CONCURRENT_SESSIONS` reached | `{"status":"error","error":"Maximum N concurrent sessions reached."}` | None needed | +| PTY creation fails | `{"status":"error","error":"Failed to allocate PTY: "}` | Remove project dir | + +No `result.json` is written — no watcher, no completion machinery. Cleanup happens via the PTY's own teardown path. + +## Testing strategy + +### Unit tests (no PTY, mock Databricks SDK) + +1. `test_coda_interactive_unknown_agent_returns_error` — `agent="vim"` → status=error, no SDK call +2. `test_coda_interactive_missing_workspace_path_returns_error` — empty `workspace_path` → error +3. `test_coda_interactive_workspace_not_found` — mock `repos.list()` returns empty → status=error +4. `test_coda_interactive_branch_update_failure_returns_error` — mock `repos.update()` raises → error + no PTY +5. `test_coda_interactive_export_failure_cleans_partial_dir` — mock export raises mid-way → partial dir is removed +6. `test_coda_interactive_skips_branch_update_when_empty` — mock confirms `repos.update()` is NOT called when `branch=""` + +### Integration tests (PTY-gated via `_pty_skip`, with mocked Databricks SDK) + +7. 
`test_coda_interactive_happy_path_mocked_export` — mock the Workspace SDK to "export" a fake tree into the local dir, assert PTY is created with the right CWD, agent command is sent, prompt is typed. +8. `test_coda_interactive_concurrent_limit` — fill up `MAX_CONCURRENT_SESSIONS` → call returns error +9. `test_mcp_close_pty_session_removes_project_dir` — create PTY with project dir, close it, assert dir deleted +10. `test_mcp_close_pty_session_handles_missing_project_dir` — no project dir present → close still succeeds (no exception) +11. `test_mcp_create_pty_session_respects_cwd_kwarg` — bash spawns in the requested dir + +### Helper tests + +12. `tests/test_workspace_export.py`: tests for `_export_workspace_tree` covering: nested dirs, file content fidelity, empty dirs, files-only (skips notebooks), error handling for individual file export failures. + +### Regression guard + +13. `test_coda_run_does_not_create_project_dir` — calling `coda_run` doesn't touch `~/.coda/projects/`. Defends the lifecycle separation between Modes 2 and 3. + +## Out of scope (for Todo 2) + +- **Git history inside the session.** Files-only export. Inside the PTY, `git log`, `git diff`, `git blame` return nothing. If history matters for a particular session, the MCP caller can include a `git log --oneline -50` summary in the `prompt` string. A future Todo can layer on a git-clone path with token-based auth. +- **Notebooks as `.ipynb`.** The export uses `ExportFormat.SOURCE` which converts Databricks notebooks to `.py` (or equivalent). MVP doesn't attempt to round-trip notebooks back to Workspace; agents work on the exported source files. +- **Conversation history transfer from the MCP client's local session.** Not in scope. Caller summarizes context into `prompt`. +- **Listing live `coda_interactive` sessions via `coda_inbox`.** URL is the only handle. +- **`coda_get_result` for interactive sessions.** No result.json, no inbox entry. 
+- **Incremental Workspace updates during the session.** If the user wants to pull newer changes mid-session, they'd need to push to Workspace and re-launch `coda_interactive`. No in-session sync mechanism. +- **Multiple-agent sessions in one PTY.** One agent per call. +- **Non-Workspace sources** (raw zips, external git remotes). Future Todo if needed. +- **Pushing changes BACK from the session to Workspace.** The agent can run Coda's existing post-commit hook (which syncs `~/projects/` to Workspace), but the exported dir at `~/.coda/projects//` is OUTSIDE that hook's scope by design — we don't want every interactive session to clobber Workspace state. If write-back is needed, that's a follow-up design. + +## Migration / Rollout + +- Single commit chain on the `feat/coda-mcp-live-session-url` branch on top of Todo 1's work. +- No data migration: new tool, no existing state to update. +- No config flag — the new tool is unconditionally available once the code lands. +- App restart picks up the new tool registration. +- MCP clients (Genie Code, etc.) will see the new tool listed via `tools/list` and can call it immediately. + +## Critique gate + +**Cleared** (2026-05-28). Critic verdict: APPROVE WITH CHANGES. All flagged issues incorporated above: + +- **MAJOR** — pre-existing env-strip gap in `mcp_create_pty_session` (misses `NPM_TOKEN`, `UV_DEFAULT_INDEX`, `UV_INDEX_*_PASSWORD`, etc.) 
→ added as prerequisite refactor task in §6 +- **HIGH-PRIORITY GAP** — FastMCP `instructions` string update for the new tool → added explicitly in §6 +- Section 3 expanded with snapshot-semantics, binary-file handling, and empty-export notes +- Section 3a expanded with multi-user side-effect caveat +- Section 6 expanded with SDK-call placement note (calls happen in server process, not PTY subprocess) +- Tool description text guidance integrated (instructions string must mention `coda_inbox` invisibility, no `coda_get_result` integration, Git Folder pre-condition) + +Original 10 critique questions, all answered in the critique pass: + +1. **Auth model** — Confirmed. Coda's PAT covers both `repos.update()` and `workspace.export()`; no scope gotcha for single-user. +2. **Export performance** — Both SDK loop and `databricks workspace export-dir` CLI are viable; planner picks after a small spike. CLI is faster. +3. **Git Folder vs. ordinary folder** — Hard error is correct. `repos.list()` returns empty for non-Git folders; clear error message. +4. **Branch update side effect** — Acceptable for single-user app; multi-user caveat added to §3a. +5. **Notebook handling** — `ExportFormat.SOURCE` converts notebooks to `.py`/`.scala`/`.sql`. Acceptable; out-of-scope to round-trip back to notebooks. +6. **Concurrent branch race** — Acceptable for single-user; documented as user error. +7. **Disk lifecycle** — UUID-based session IDs prevent collisions; rmtree failure orphans the dir but doesn't break next session. +8. **Prompt seeding** — 2-second sleep is pragmatic; bash buffers stdin if agent is slow to read. +9. **`cwd` kwarg** — Only `coda_interactive` needs it. Additive change, no other callers affected. +10. **Test coverage** — Mocked Databricks SDK is the right MVP approach; E2E against real workspace deferred as nice-to-have behind CI flag. + +Plus eight additional critic-eye questions (11–18), all resolved: + +11. **Mode separation drift** — No drift. 
Regression guard test (`test_coda_run_does_not_create_project_dir`) defends the separation. +12. **PTY exhaustion** — Production PTY limit is ~4096; `MAX_CONCURRENT_SESSIONS=5` is nowhere near. macOS dev exhaustion is a known local-test concern, handled via `_pty_skip`. +13. **Project dir collision** — UUID-based IDs make collision probability negligible; `exist_ok=True` on `makedirs` handles the unlikely case. +14. **Pre-condition realism** — Realistic for Genie Code (primary target); secondary clients (Claude Desktop, Cursor) get clear guidance via `instructions` string. +15. **Dirty Workspace UI state** — Export reflects committed HEAD; uncommitted UI edits NOT included. Documented in §3 snapshot-semantics note. +16. **Binary files** — Per-file try/except + skip-and-log added to §3 binary-file note. +17. **`coda_inbox` invisibility** — Documented in `instructions` string per §6. +18. **Tool description text** — Spelled out in §6 (instructions string must explain the new tool's contract). + +Spec is ready for planning. diff --git a/docs/superpowers/specs/2026-05-28-coda-interactive-terminal-pull-design.md b/docs/superpowers/specs/2026-05-28-coda-interactive-terminal-pull-design.md new file mode 100644 index 0000000..d6df636 --- /dev/null +++ b/docs/superpowers/specs/2026-05-28-coda-interactive-terminal-pull-design.md @@ -0,0 +1,219 @@ +# Spec: `coda_interactive` Terminal-Side Workspace Pull + +**Status:** Draft, design-critic passed (SOUND-WITH-FIXES, all fixes folded in) +**Date:** 2026-05-28 +**Branch:** `feat/coda-mcp-interactive-handoff` (continues PR #67) +**Supersedes the export mechanism in:** `docs/superpowers/specs/2026-05-28-coda-interactive-mcp-tool-design.md` + +## Problem + +`coda_interactive` currently does a **server-side** export of a Databricks Workspace folder into a local project directory via `WorkspaceClient().workspace.export(...)` (module `coda_mcp/workspace_export.py`), then launches an agent in that directory. 
In the deployed app this produces an **empty directory** — the agent has no idea about the user's files. + +### Confirmed root cause + +The deployed app runs as its **own service principal** (`app-167dcd mcp-test-coda-labs-feat`, client_id `460e920e-…`), confirmed via the Apps API. The MCP server calls `WorkspaceClient()` with no args → that resolves to the **app SP**. The app SP can `get_status` the user's `/Users/.../WAM` folder (so the tool reports `"launched"`) but **cannot `list`/`export` its contents**. `workspace_export.py._export_recursive` **swallows** those errors (`logger.warning` + `return`), so `export_workspace_tree` raises nothing and the agent launches over an empty directory. + +### Evidence + +- **REST as the user** (curl): `list`, `get_status`, and `export` (SOURCE and AUTO, with and without `/Workspace` prefix) all succeed for the 5 `.md` files in WAM. So the API, the export format, and the path prefix are NOT the problem. +- **Live CoDA terminal:** `databricks current-user me` returns the **user** (`sathish.gangichetty@databricks.com`), not the app SP. `databricks workspace list /Users/.../WAM` from the terminal returns the 5 files. +- **Conclusion:** the identity that can read the files is the **terminal** (the app owner / user), not the **MCP server** (app SP). Move the file access to the terminal. + +## Goal + +Stop exporting server-side. `coda_interactive` hands the location to the **terminal** (authenticated as the user) and pulls the files there with `databricks workspace export-dir`, then launches the agent in the pulled directory. Net effect: the agent starts in a directory that actually contains the workspace files, and any failure is visible (a real tool error or terminal output) instead of silently swallowed. + +## Non-goals + +- `/Workspace` FUSE-mount access — `export-dir` works regardless of whether the mount exists. Not pursued. +- Pushing edits back to the Workspace (`import-dir`) — the agent can do that itself if asked. 
Out of scope. +- Git Folder branch checkout — caller's responsibility, as before. +- Changing `coda_run` (mode 3) or any other tool. +- Hardening the existing `_wait_for_agent_ready` heuristic beyond what this change needs (see Risks). + +--- + +## Design + +### New `coda_interactive` flow + +``` +1. Validate `agent` ∈ _ALLOWED_AGENTS (unchanged) +2. Verify PTY hooks wired (_app_create_session/_app_send_input) (unchanged) +3. pty_session_id = _app_create_session(label=f"{agent}-interactive", replay_only=False) +4. project_dir = os.path.join(os.path.expanduser("~/.coda/projects"), pty_session_id) + os.makedirs(project_dir, exist_ok=True) +5. name = _safe_dirname(workspace_path) # e.g. "WAM" + source_path = _normalize_workspace_path(workspace_path) # strip leading /Workspace +6. Type ONE chained line into the PTY (runs as the user): + cd {project_dir} && databricks workspace export-dir {source_path} ./{name} && cd {name} +7. await _wait_for_output_stable(pty, _EXPORT_MAX_WAIT_S, _EXPORT_STABILITY_S) + # wait for the pull to finish — shell goes truly idle after export-dir, + # so stabilization here is reliable (no agent-cold-start gap to confuse it) +8. SERVER-SIDE post-condition check (does NOT depend on the app SP — stats local disk): + target_dir = os.path.join(project_dir, name) + if not os.path.isdir(target_dir) or not os.listdir(target_dir): + close PTY; shutil.rmtree(project_dir, ignore_errors=True) + return {"status":"error", "error": "{message}"} +9. Launch the agent (fresh — identical to the proven existing path): + _app_send_input(pty, _AGENT_LAUNCH_CMDS[agent] + "\n") + await _wait_for_agent_ready(pty) # existing 5s/1s window, unchanged behavior +10. Paste kickoff prompt, prefixed with a context line naming workspace_path: + "Your working directory contains files exported from the Databricks + Workspace path {workspace_path}.\n\n{prompt}" +11. 
return {"status":"launched", "viewer_url", "agent", "project_dir": target_dir, + "workspace_path", "instructions"} +``` + +### Why split the waits (design-critic CRITICAL fix) + +The naive design (`cd && export-dir && cd`, then launch agent, then a single `_wait_for_agent_ready`) risks `_wait_for_agent_ready` returning **early** in the silent gap between `export-dir` finishing and the agent's TUI producing output — pasting the prompt into a half-initialized agent or the shell. + +The split removes that risk: +- **Step 7** waits for the *pull* to finish. After `export-dir` completes the shell is genuinely idle (output stops), so stabilization is reliable. It is NOT waiting across an agent cold-start. +- **Step 9** waits for the *agent* exactly the way the current working code does (launch → wait → prompt), with no preceding network op. It inherits the known-good behavior. +- **Step 8** (the filesystem post-check) is the safety net: if the pull produced nothing, we error out cleanly instead of launching into an empty directory. This also resolves the `&&`-failure ambiguity — a failed `export-dir` short-circuits the chain, leaves `target_dir` absent, and step 8 turns that into a proper tool error. + +### Helpers + +```python +def _safe_dirname(workspace_path: str) -> str: + """Local directory name for the pulled folder = sanitized basename.""" + base = os.path.basename(workspace_path.rstrip("/")) + safe = re.sub(r"[^A-Za-z0-9._-]", "_", base) + return safe or "workspace" + + +def _normalize_workspace_path(workspace_path: str) -> str: + """Canonical Workspace API path: drop the /Workspace FUSE prefix if present. + + The deployed terminal's CLI uses the unprefixed form (/Users/...); REST + accepts both, but normalizing matches what the CLI expects and is harmless. 
+ """ + p = workspace_path.rstrip("/") + if p.startswith("/Workspace/"): + p = p[len("/Workspace"):] # "/Workspace/Users/x" -> "/Users/x" + return p +``` + +### Wait-helper refactor (backward compatible) + +Generalize the existing poller so the export wait can use a longer budget while `coda_run`'s call site stays unchanged: + +```python +_PROMPT_SEED_MAX_WAIT_S = 5.0 # existing — agent TUI settle +_PROMPT_SEED_STABILITY_S = 1.0 # existing +_EXPORT_MAX_WAIT_S = 120.0 # new — generous; export-dir prints per-file so it won't prematurely stabilize on a slow pull +_EXPORT_STABILITY_S = 1.5 # new + +async def _wait_for_output_stable(pty_session_id, max_wait, stability): + # exact body of the current _wait_for_agent_ready, parametrized on max_wait/stability + +async def _wait_for_agent_ready(pty_session_id): + await _wait_for_output_stable(pty_session_id, _PROMPT_SEED_MAX_WAIT_S, _PROMPT_SEED_STABILITY_S) +``` + +`coda_run` already calls `_wait_for_agent_ready` — that call and its behavior are unchanged. + +### `databricks workspace export-dir` (verified) + +`databricks workspace export-dir SOURCE_PATH TARGET_PATH`: +- Exports a directory recursively from the Workspace to the local filesystem. +- **Creates** `TARGET_PATH`. +- Auto-appends notebook extensions (`.py/.scala/.sql/.r`) by language — natively replaces the hand-rolled logic in `workspace_export.py`. +- `--overwrite` flag exists; not needed here (the session `` dir is fresh). + +### Deletions + +- `coda_mcp/workspace_export.py` — whole module. +- `tests/test_workspace_export.py` — whole file. +- In `coda_mcp/mcp_server.py`: remove `from coda_mcp.workspace_export import export_workspace_tree, _is_directory`, the `WorkspaceClient` import guard (verify no other use first), the `WorkspaceClient()` instantiation, the `get_status` validation, and the `_is_directory` call. +- `tests/test_replay_only_flag.py:166` — only a **comment** mentions `export_workspace_tree` (not an import). 
Refresh the wording so it doesn't reference a deleted symbol. Non-breaking. + +### Kept + +PTY creation (`replay_only=False`), `project_dir` + `os.makedirs`, `_wait_for_agent_ready` (now a wrapper), `viewer_url`, `_ALLOWED_AGENTS`, `_AGENT_LAUNCH_CMDS`, the existing try/except resource cleanup. `email` stays in the signature (upstream callers pass it; currently unused, reserved). + +### Cleanup on session end (no new code) + +`app.py:terminate_session` already `shutil.rmtree`s `os.path.expanduser("~/.coda/projects/<pty_session_id>")` on both graceful exit and idle-reaper paths. The pulled `<name>` dir lives inside `project_dir`, so it is cleaned up automatically. + +--- + +## Error handling + +| Situation | Behavior | +|-----------|----------| +| Unknown `agent` | Immediate `{"status":"error"}` (unchanged) | +| PTY hooks not wired | Immediate `{"status":"error"}` (unchanged) | +| Bad `workspace_path` / no access / empty folder | `export-dir` fails or pulls nothing → step-8 FS check fails → close PTY, rmtree, `{"status":"error", "error": "No files were pulled from <workspace_path>; check it exists and you have read access."}` | +| Pull succeeds | Agent launches in `target_dir`; prompt seeded; `{"status":"launched", viewer_url, ...}` | +| Unexpected exception anywhere | Catch-all: close PTY if created, rmtree `project_dir`, `{"status":"error"}` (unchanged) | + +No server-side path validation via `WorkspaceClient` — the app SP can't reliably validate the user's folder anyway (that was the bug). The step-8 FS check is the validation, and it reads the local disk the *terminal* wrote (correct identity).
+ +--- + +## Testing strategy + +### `tests/test_workspace_export.py` — DELETE + +### `tests/test_replay_only_flag.py` — refresh the stale comment at line 166 (no logic change) + +### `tests/test_coda_interactive.py` — rewrite + +Mock `_app_create_session` (returns a fake `pty_session_id`), `_app_send_input` (records inputs; on the pull command, side-effect creates `target_dir` + a dummy file to simulate a successful `export-dir`), `_app_close_session`, and the wait helpers (return immediately). Set `HOME` to a `tmp_path` so `project_dir` resolves under the test sandbox. + +| Test | Pins | +|------|------| +| `test_pull_command_is_sent_first` | First `_app_send_input` is the chained `cd … && databricks workspace export-dir ./ && cd `; source has no `/Workspace` prefix; `` is the sanitized basename | +| `test_agent_launches_after_successful_pull` | After the simulated pull creates files, the launch command (`_AGENT_LAUNCH_CMDS[agent]`) is sent | +| `test_prompt_seeded_with_context_line` | Final input starts with the "exported from the Databricks Workspace path " line, then the user prompt | +| `test_empty_pull_returns_error_and_no_launch` | When the pull side-effect creates nothing, result is `{"status":"error"}`, PTY is closed, and the launch command is NEVER sent | +| `test_no_workspaceclient_or_get_status_called` | `WorkspaceClient` is not referenced (import removed); no `get_status` call path | +| `test_happy_path_returns_launched_with_viewer_url` | `{"status":"launched"}`, `viewer_url` present, `project_dir` == `target_dir` | +| `test_unknown_agent_rejected` | Unknown agent → error (unchanged) | +| `test_pty_hook_not_wired` | Hooks `None` → error (unchanged) | +| `test_agent_matrix` | Each of claude/hermes/codex/gemini/opencode sends the right launch cmd | +| `test_no_blocking_sleep` | `coda_interactive` source contains no `time.sleep(` (async regression guard, kept) | + +### `tests/test_mcp_server.py` (or wherever helpers are tested) — add + +| Test | Pins | 
+|------|------| +| `test_safe_dirname_basename` | `/Users/x/WAM` → `WAM`; trailing slash stripped | +| `test_safe_dirname_sanitizes` | spaces / special chars → `_` | +| `test_safe_dirname_empty_fallback` | `"/"` or `""` → `"workspace"` | +| `test_normalize_strips_workspace_prefix` | `/Workspace/Users/x/WAM` → `/Users/x/WAM` | +| `test_normalize_leaves_plain_path` | `/Users/x/WAM` → `/Users/x/WAM` | +| `test_wait_for_agent_ready_still_wrapper` | `_wait_for_agent_ready` delegates to `_wait_for_output_stable` with the prompt-seed constants | + +### Regression + +Run together (per the established flake note — `test_replay_only_flag.py::test_coda_run_creates_pty_with_replay_only_true` is PTY-fd flaky in multi-file runs; re-run alone if it fails): + +``` +uv run pytest tests/test_coda_interactive.py tests/test_mcp_server.py tests/test_replay_only_flag.py tests/test_task_manager.py tests/test_databricks_preamble.py -v +``` + +--- + +## Acceptance criteria + +1. `coda_interactive` no longer imports or calls `workspace_export` / `WorkspaceClient` / `get_status`. +2. `coda_mcp/workspace_export.py` and `tests/test_workspace_export.py` are deleted; no remaining importers. +3. `_safe_dirname` and `_normalize_workspace_path` exist with the specified behavior. +4. `_wait_for_output_stable(pty, max_wait, stability)` exists; `_wait_for_agent_ready` is a wrapper preserving the `5.0/1.0` budget; `coda_run`'s call is unaffected. +5. The first PTY input is the chained pull command using the normalized (unprefixed) source path and the sanitized ``. +6. The agent launch command is sent **only** when the post-pull FS check finds files; otherwise a `{"status":"error"}` is returned and the PTY is closed. +7. The kickoff prompt is prefixed with the context line naming `workspace_path`. +8. All new/updated tests pass; existing suites (minus the known PTY-fd flake) stay green. + +--- + +## Risks + +1. 
**Slow / huge folders.** `_EXPORT_MAX_WAIT_S = 120s`; if a pull exceeds it, step 7 returns while `export-dir` is still running and step 8 may see a partial dir and (incorrectly) proceed. Mitigation: 120s is generous for the interactive-handoff use case (docs / small projects); `export-dir` prints per-file so it won't prematurely stabilize during an active pull. Larger-folder support is a future tweak, not in scope. +2. **HOME equivalence.** Step 4/8 resolve `project_dir` via `os.path.expanduser` in the MCP-server process; the PTY `cd`/write uses that same absolute string and the terminal's `$HOME` resolves identically in the deployed container (observed: both `/app/python/source_code/.coda/...`). If a future environment gave the server and PTY different `$HOME`, the `cd` and FS check would diverge. Documented assumption; matches existing code (the deleted export and `terminate_session` cleanup already rely on it). +3. **`_wait_for_agent_ready` cold-start (pre-existing).** The agent wait can still, in principle, fire during a long agent cold-start silence — but this is the current production behavior, unchanged by this spec. A marker-based ready gate is a possible future hardening, explicitly out of scope here. +4. **`export-dir` on `/Workspace`-prefixed paths.** Mitigated by `_normalize_workspace_path` (we pass the `/Users/...` form the CLI expects and that REST verified). 
diff --git a/docs/superpowers/specs/2026-05-28-coda-run-replay-only-design.md b/docs/superpowers/specs/2026-05-28-coda-run-replay-only-design.md new file mode 100644 index 0000000..766c749 --- /dev/null +++ b/docs/superpowers/specs/2026-05-28-coda-run-replay-only-design.md @@ -0,0 +1,255 @@ +# Spec: `coda_run` Returns Replay-Only URL + +**Status:** Draft, pre-critique-gate +**Date:** 2026-05-28 +**Branch:** `coda-mcp` +**Related:** PR #66 (introduced the live-attach `viewer_url` we are now narrowing) ; `docs/superpowers/specs/2026-05-27-coda-mcp-live-session-url-design.md` (predecessor design) + +## Goal + +Make `coda_run`'s returned `viewer_url` resolve to a **read-only static replay** of the agent's transcript, never to a live PTY attach. As a consequence, drop the 5-minute "grace period" machinery from the `coda_run` execution path entirely — the PTY session can be torn down immediately when `hermes -z` exits. + +## Why + +PR #66 introduced a dual-purpose `viewer_url` on `coda_run`: live attach during a 5-minute grace window, then static replay after that. The dual mode was sized for "human watches hermes run live, then post-mortem replays the same URL". + +That use case is being split out into a **separate** MCP tool, `coda_interactive` (designed in a follow-up spec). `coda_run` is now exclusively the fire-and-forget batch surface — autonomous execution, post-hoc inspection. The live-attach affordance on its returned URL is no longer useful: by the time most callers' humans click the URL, hermes has already exited; what they get is a dead bash shell, not a live agent. + +## The Three-Mode Framework + +This spec settles the contract by enumerating the three ways CoDA sessions get created: + +The existing PTY lifecycle in `app.py` (`SESSION_TIMEOUT_SECONDS = 86400`, `CLEANUP_INTERVAL_SECONDS = 900`) **already gives sessions a 24h idle TTL** with WS-heartbeat extension. Mode 2 inherits this directly; only Mode 3 needs to deviate (faster teardown). 
+ +| Mode | How invoked | PTY tag | Pre-attach lifecycle | Post-attach lifecycle | Teardown trigger | URL semantics | +|---|---|---|---|---|---|---| +| **1. Direct launch** | User opens web UI, creates a tab | (none) | n/a — user starts attached | 24h idle cleanup; WS heartbeat extends indefinitely | Tab close / disconnect + 24h idle | No external URL | +| **2. `coda_interactive`** (Todo 2, not in this spec) | MCP client fires the tool, passes URL to a human | `replay_only=False` | Same 24h idle cleanup as Mode 1 | Same — WS heartbeat extends | Agent process exit (`exit` / `/quit` / Ctrl-D), 24h idle, or user closes tab + 24h idle | Live attach; fallback to replay if PTY gone | +| **3. `coda_run`** *(this spec)* | MCP client fires the tool, URL is for post-hoc review only | `replay_only=True` | n/a — no live attach exists | n/a | Hermes -z process exit → `result.json` appears → immediate teardown (bypasses 24h idle) | Replay only, always | + +This spec finalizes Mode 3 and embeds Mode 2 as a forward-reference so the critique gate can sanity-check both together. Mode 1 is the existing direct-launch path — no changes; Mode 2 inherits its lifecycle wholesale. + +## Design + +### 1. Add `replay_only` flag to PTY sessions + +In `app.py`'s `mcp_create_pty_session(label, transcript_path=None)`, add a third parameter: + +```python +def mcp_create_pty_session( + label: str = "hermes-mcp", + transcript_path: str | None = None, + replay_only: bool = False, +) -> str: + ... + sessions[session_id] = { + ... + "replay_only": replay_only, + ... + } +``` + +Default is `False` so existing callers (direct-launch via `create_session`, future `coda_interactive`) keep their current behavior. + +### 2. Enforce replay-only in the attach endpoint + +In `app.py`'s `attach_session()` route, **before** the live-attach branch runs, check the flag. If `sess.get("replay_only")` is true, serve the transcript regardless of whether the PTY is still alive: + +```python +def attach_session(): + ... 
+ sess = _get_session(session_id) + + # NEW: replay-only sessions always serve transcript, never live buffer + if sess and sess.get("replay_only"): + return _serve_transcript_replay(session_id) + + # Existing: PTY gone → transcript fallback + if not sess or sess.get("exited"): + return _serve_transcript_replay(session_id) + + # Existing: live attach + ... +``` + +Where `_serve_transcript_replay()` is a helper extracted from the existing transcript-lookup block at `app.py:1170-1188`. The helper takes only the PTY `session_id` — it does not need any fields from the live session dict (`output_buffer`, `pid`, `label`, `created_at`), since the replay path uses `task_manager.find_task_dir_by_pty_session(session_id)` + file I/O on the transcript. Clean extraction, no field synthesis. + +If no transcript file exists for the session (rare — e.g., PTY died before any output flushed), the helper returns the existing 404 page. + +### 3. Wire `coda_run` to pass `replay_only=True` + +In `coda_mcp/mcp_server.py` `coda_run()`: + +```python +pty_session_id = _app_create_session( + label="hermes-mcp", + transcript_path=transcript_path, + replay_only=True, # NEW +) +``` + +### 4. Rip out the grace-period machinery from the `coda_run` path + +**Pre-existing reality check (informational, per critique):** The `mark_grace_fn` and `bump_poll_fn` hooks were *never wired* in production — neither `app.py:1770-1774`'s `set_app_hooks(...)` call nor `mcp_asgi.py:80-84`'s equivalent passes them. At runtime `_app_mark_grace` and `_app_bump_poll` are both `None`, so `_schedule_deferred_close` no-ops through its `if _app_mark_grace is not None:` guard at `mcp_server.py:203`. The Timer fires and the close happens, but the `grace` flag is never set, the `MAX_CONCURRENT_SESSIONS` exclusion never activates. So the rip-out is removing partially dead code — the spec executor should not waste time reproducing or regression-testing grace-period state that never existed in prod. 
+ +The following code added in PR #66 is now dead weight for `coda_run` sessions and should be removed: + +- `coda_mcp/mcp_server.py`: + - `GRACE_PERIOD_S = 300` constant + - `_app_mark_grace` / `_app_bump_poll` hook slots (and the `set_app_hooks` parameters that accept them) + - `_schedule_deferred_close(session_id)` function + - The `threading.Timer(GRACE_PERIOD_S, ...)` call in `_watch_task` +- `app.py`: + - `_mark_grace_for_session(session_id)` function (line ~1515) + - `_bump_session_last_poll(session_id, delta_s)` function (line ~1530) + - `grace` key written to the session dict in `mcp_create_pty_session` (line ~1477) + - The `sum(1 for s in sessions.values() if not s.get("grace"))` exclusion in all 4 `MAX_CONCURRENT_SESSIONS` check sites at `app.py:1329`, `1369`, `1405`, `1456` (revert to simple `len(sessions)` count) +- Docstrings to update: + - `_close_pty_immediately` at `mcp_server.py:167` currently says "only use from emergency teardown or tests" — rewrite to say it is the normal teardown path for `coda_run`. + - MCP `instructions` string at `mcp_server.py:61-66` says "SHARE THE LIVE URL" / "watch progress" — rewrite to say "replay URL" / "review what was done." +- Tests: + - `tests/test_transcript.py`: drop 4 grace-related tests (lines 135, 157, 169, 174) + - `tests/test_replay_attach.py`: rewrite to assert *immediate* replay regardless of PTY state, not "replay-after-grace" + - `tests/test_mcp_server.py`: drop 2 grace tests (lines 361, 372 — hooks test + timer-scheduling test) + - `tests/test_mcp_integration.py`: drop 1 grace test (line 315); the E2E test at `:396` already calls `complete_task` + close directly, keep that pattern + +### 5. 
Watcher teardown on completion + +In `_watch_task` (currently spawned by `coda_run`), when the watcher detects `result.json` and marks the task complete, replace the deferred-close path with the immediate one: + +```python +# Old: +_schedule_deferred_close(session_id) +# New: +_close_pty_immediately(session_id) +``` + +`_close_pty_immediately` already exists at `mcp_server.py:167`. It's a thin wrapper that reads `pty_session_id` from task_manager's `session.json` and calls the `_app_close_session(pty_session_id)` hook (`app.py`'s `mcp_close_pty_session`). After the rip-out it becomes the sole teardown path for `coda_run` — update its docstring to reflect that it's now the normal path, not "emergency teardown." + +## What does NOT change + +- `coda_run`'s **return shape** is unchanged: `{task_id, session_id, status, viewer_url}`. The `viewer_url` string itself is the same format (`{base}/?session={pty}`). The change is purely in what that URL does when followed. +- Transcript writing (the tee in `read_pty_output`) is unchanged. +- The 404-when-no-transcript-found page (`_renderExpiredPage`) is unchanged. +- The frontend (`static/index.html`) `_initFromQueryString`, `_doReplay`, `_doAttach` flow is unchanged. The replay code path already exists and is the one the server will steer all `coda_run` traffic into. +- Direct-launch PTY sessions are unchanged — they keep their existing 24h-idle cleanup (`SESSION_TIMEOUT_SECONDS = 86400`) and WS-heartbeat-extends lifecycle. 
+ +## Architecture + +``` + ┌─────────────────────────────────┐ + │ MCP client calls coda_run │ + └────────────────┬────────────────┘ + ▼ + ┌──────────────────────────────────────────────┐ + │ task_manager.create_task → write prompt.txt │ + │ mcp_create_pty_session(replay_only=True) │ + │ send "hermes -z prompt.txt\n" to PTY │ + │ spawn _watch_task daemon thread │ + │ return {viewer_url: ".../?session=..."} │ + └────────────────┬─────────────────────────────┘ + ▼ + (hermes runs in PTY) + ▼ + ┌──────────────────────────────────────────────┐ + │ hermes writes result.json → exits │ + │ _watch_task detects result.json │ + │ _watch_task calls _close_pty_immediately │ + │ PTY torn down, slot freed │ + └────────────────┬─────────────────────────────┘ + ▼ + (Human clicks the URL at any time — before/during/after task) + ▼ + ┌──────────────────────────────────────────────┐ + │ Frontend POSTs /api/session/attach │ + │ attach_session() sees sess["replay_only"] │ + │ OR sess is gone (post-teardown) │ + │ Returns {replay: true, output: [transcript]} │ + │ Frontend calls _doReplay() — read-only view │ + └──────────────────────────────────────────────┘ +``` + +## Data flow under different timings + +The replay-only contract makes timing irrelevant. Three cases, all converge on the same UX: + +1. **Human clicks URL while hermes is still running:** + PTY exists, `replay_only=True` → server serves the in-progress transcript. Read-only view of partial output. + +2. **Human clicks URL right after hermes exits (no grace):** + `_watch_task` has just called `_close_pty_immediately`. PTY may or may not still be in `sessions`. Either way, `replay_only` is true OR PTY is gone → server serves the final transcript from disk. + +3. **Human clicks URL hours / days later:** + PTY is long gone. Transcript file still on disk. Existing transcript-fallback path serves it. + +In none of these cases does the user need a live PTY attached. The transcript file is always sufficient. 
+ +## Error handling + +- **Transcript file missing / unreadable** (rare — PTY died before flush): existing 404 + `_renderExpiredPage` UI applies. No behavior change. +- **`replay_only` flag on a session that has no `transcript_path`**: should not happen for `coda_run` (we always set transcript_path). If it does, the attach endpoint falls through to the existing 404 path. Defensive — no special handling needed. +- **Race: human clicks URL exactly as `_close_pty_immediately` runs**: both old (PTY still in `sessions`) and new (PTY gone) outcomes resolve to "serve transcript". No race-condition bug. + +## Testing + +### Modified tests +- `tests/test_replay_attach.py`: rewrite the two existing tests to assert immediate replay on a `replay_only=True` session, regardless of `exited` status. Drop the grace-window scenario. +- `tests/test_transcript.py`: drop the 4 tests that exercised grace-period transitions (lines 135, 157, 169, 174). +- `tests/test_mcp_server.py`: drop tests for `_schedule_deferred_close`, `_app_mark_grace`, `_app_bump_poll`. Keep tests for `viewer_url` generation and `find_task_dir_by_pty_session`. +- `tests/test_mcp_integration.py`: drop the 1 grace test (line 315). The E2E test at `:396` already calls `complete_task` + the close path directly — keep that pattern and assert call ordering; do NOT add a ~100ms teardown-timing assertion (the watcher polls every 5s, so it would flake). + +### New tests +- `tests/test_replay_only_flag.py` (new): + 1. `attach_session` on a `replay_only=True` PTY that is still alive returns `{replay: true, output: [transcript]}`, not the live buffer. + 2. `attach_session` on a `replay_only=False` PTY that is still alive returns the live buffer (unchanged behavior). + 3. `mcp_create_pty_session(replay_only=True)` stores the flag in the session dict. + 4. `coda_run` end-to-end (using the existing `test_mcp_integration.py:396` pattern — call `complete_task` + close path directly, do NOT wait for the 5s watcher poll cycle): after the close call, slot count returns to baseline immediately.
**No timing-based assertion** — call ordering is the contract. + 5. **Regression guard**: assert that a session dict created via `coda_run`'s path contains NO `grace` key, and that `mcp_create_pty_session` does not accept a `grace` keyword argument. Prevents future drift that accidentally re-introduces grace on the `coda_run` path. + +### Test count expectation +- Removals: 4 (`test_transcript.py`) + 2 (`test_mcp_server.py`) + 1 (`test_mcp_integration.py`) = 7 grace-only tests dropped. `test_replay_attach.py` has 2 tests that get rewritten, not removed. +- Additions: 5 new tests in `test_replay_only_flag.py`. +- **Net: -2 tests overall.** +- Total: targets ~525 passing + ~10 PTY-gated skipped + +## Out of scope (for Todo 1) + +- **`coda_interactive` tool** (Mode 2): designed in a separate spec / Todo 2. +- Changes to Mode 1 direct-launch lifecycle: untouched. The 24h-idle / WS-heartbeat-extends behavior stays as-is for tabs. +- Backporting the `replay_only` concept to historical `coda_run`-created sessions on disk: not necessary. Old transcripts on disk are served via the same path; the flag matters only at attach-time for alive PTYs. + +## Migration / Rollout + +- Single commit (or small commit chain) to the `coda-mcp` branch, on top of PR #66's merge. +- No data migration: `replay_only` defaults to `False`, so existing sessions in any in-flight worker process behave unchanged. Future `coda_run` invocations get `replay_only=True`. +- No config flag needed — the behavior change is unconditional. +- No deployment ordering constraint: app restart picks up the new behavior cleanly. + +## Open questions + +None blocking. The design is concrete enough for planning. + +## Critique gate + +**Cleared** (2026-05-28). Critic verdict: APPROVE WITH CHANGES. 
All flagged issues incorporated above: +- Pre-existing hooks-never-wired reality documented in Section 4 (informational — simplifies rip-out) +- Step 5 corrected: `_close_pty_immediately(session_id)` exists at `mcp_server.py:167`, not `app.py` +- `_bump_session_last_poll(session_id, delta_s)` added to `app.py` rip-out inventory +- Test count corrected to -2 (was -6); assertion #4 rewritten to be deterministic (call-ordering, not 100ms timing) +- MCP `instructions` string at `mcp_server.py:61-66` added to "docstrings to update" list +- 5th compensating regression test added to prevent future grace re-introduction +- `_serve_transcript_replay()` extraction note expanded with data-source clarification + +Original five critique questions, all answered in the critique pass: +1. **Rip-out scope** — mostly complete; missed `_bump_session_last_poll` (added) and the hooks-never-wired note (added) +2. **Flag placement** — `replay_only` on session dict is correct; disk-based alternative would add latency +3. **Mode 2 forward-compat** — verified clean; 24h idle clock starts from session creation, behaves correctly whether human attaches or not +4. **Replay-only edge cases** — no admin override needed (admins use Mode 1 directly); partial-transcript-during-live behavior is intentional +5. **100ms assertion** — confirmed flake-bait (watcher polls every 5s); replaced with `test_mcp_integration.py:396`-style direct-call assertion + +Plus five additional critic-eye questions, all resolved: +6. **Concurrency/race** — verified safe under GIL + `sessions_lock`; both interleavings serve transcript correctly +7. **Grace was load-bearing** — confirmed obsolete for Mode 3; live-watch case shifts to Mode 2 as designed +8. **Refactor coupling** — `_serve_transcript_replay` extraction is clean, no field synthesis needed +9. **Documentation drift** — `docs/mcp-v2-background-execution.md` predates PR #66 (no drift); only the MCP `instructions` string needs updating +10. 
**Test budget** — confirmed -2 net with the regression-guard test added diff --git a/docs/superpowers/specs/2026-05-28-coda-run-workflow-protocol-design.md b/docs/superpowers/specs/2026-05-28-coda-run-workflow-protocol-design.md new file mode 100644 index 0000000..be6866b --- /dev/null +++ b/docs/superpowers/specs/2026-05-28-coda-run-workflow-protocol-design.md @@ -0,0 +1,422 @@ +# Spec: `coda_run` Workflow Protocol + Databricks Orientation + +**Status:** Draft, pre-critique-gate +**Date:** 2026-05-28 +**Branch:** `feat/coda-mcp-interactive-handoff` (continues PR #67) or follow-up branch +**Related:** +- `docs/superpowers/specs/2026-05-28-coda-interactive-mcp-tool-design.md` (Mode 2) +- `docs/superpowers/specs/2026-05-28-coda-run-replay-only-design.md` (Mode 3 narrowing) + +## Goal + +When a caller invokes `coda_run`, the background hermes session should: + +1. **Know** it is running inside a Databricks-authenticated environment with skills, CLI, and MCP servers available. +2. **Follow** a structured 3-phase workflow (PLAN → EXECUTE → SYNTHESIZE) with a critique step after each phase. +3. **Escape cleanly** when blocked — emit `status="info_needed"` with structured feedback so the calling client can iterate. + +Both behaviors are imposed by writing a richer prompt envelope into the `prompt.txt` file that hermes reads. No PTY-timing hacks, no agent-specific config. + +## Why + +Today's `wrap_prompt` (`task_manager.py:153`) gives the agent: TASK, INSTRUCTIONS (status/result file contract), and SAFETY (don't-delete guardrails). It does NOT tell the agent: +- What capabilities exist on the host (Databricks CLI, skills, MCP servers). +- HOW to work the task (just-jump-in vs plan-first vs self-review). +- WHAT to do when blocked (today, the agent either invents an answer or fails hard). + +The fix is to extend the prompt envelope with two new sections — CAPABILITIES and WORKFLOW PROTOCOL — and a new terminal status, `info_needed`. 
+ +## Non-goals + +- Not changing hermes itself. The protocol is enforced via prompt content; if hermes ignores it, that's a hermes problem to chase separately. +- Not adding protocol enforcement to `coda_interactive`. Interactive sessions are human-driven. +- Not adding dynamic skill discovery. The Databricks skill list is hardcoded; staleness is caught by tests, not runtime introspection. +- Not changing the result.json file location, file name, or top-level convention. Only the value of `status` and the addition of an optional `feedback` field. + +--- + +## Architecture + +``` +coda_run(prompt, ..., workflow_protocol=True) + │ + ▼ +task_manager.create_task(..., workflow_protocol=True) + │ + ▼ +task_manager.wrap_prompt(..., workflow_protocol=True) + │ + ▼ +prompt.txt now contains: + ---CODA-TASK--- + metadata... + TASK: + + CAPABILITIES: ← from coda_mcp/databricks_preamble.py::build_capabilities() + + + WORKFLOW PROTOCOL: ← from coda_mcp/databricks_preamble.py::build_workflow_protocol() + <3-phase + info_needed instructions> + + INSTRUCTIONS: ← existing status.jsonl + result.json contract, + + + SAFETY: ← unchanged + + ---END-CODA-TASK--- + │ + ▼ +hermes -z "/path/to/prompt.txt" + │ + ▼ +Hermes works the task, emits status.jsonl, writes result.json + │ + ▼ +coda_inbox / coda_get_result surface the result, including new "info_needed" status +``` + +--- + +## Components + +### 1. New module: `coda_mcp/databricks_preamble.py` + +Exposes pure-function builders that produce the two new prompt sections. Pure functions for testability — no I/O, no global state. + +```python +"""Builders for the CoDA workflow prompt envelope sections. + +These produce static text that is injected into prompt.txt by +``task_manager.wrap_prompt``. Pure functions — no side effects. 
+""" + +_DATABRICKS_SKILLS = ( + "agent-bricks", "databricks-genie", "databricks-app-python", + "databricks-app-apx", "databricks-jobs", "databricks-unity-catalog", + "spark-declarative-pipelines", "aibi-dashboards", "model-serving", + "mlflow-evaluation", "asset-bundles", "databricks-python-sdk", + "databricks-config", "databricks-docs", "synthetic-data-generation", + "unstructured-pdf-generation", +) + +def build_capabilities() -> str: + """Orientation block: CLI, skills, MCP servers, when to prefer them.""" + +def build_workflow_protocol() -> str: + """3-phase workflow (PLAN/EXECUTE/SYNTHESIZE) + critique + info_needed.""" + +def get_databricks_skills() -> tuple[str, ...]: + """Return the canonical skill list. Used by tests to pin the catalog.""" + return _DATABRICKS_SKILLS +``` + +### 2. `CAPABILITIES:` section content (verbatim) + +``` +You are running inside CoDA on a Databricks-authenticated host. + +Databricks CLI: pre-configured. `databricks current-user me` confirms auth. +Use it for jobs, workspace, clusters, warehouses, Unity Catalog operations. + +Skills available at ~/.claude/skills/ — read each skill's SKILL.md before +invoking. Relevant Databricks skills: +- agent-bricks, databricks-genie, databricks-app-python, databricks-app-apx +- databricks-jobs, databricks-unity-catalog, spark-declarative-pipelines +- aibi-dashboards, model-serving, mlflow-evaluation, asset-bundles +- databricks-python-sdk, databricks-config, databricks-docs +- synthetic-data-generation, unstructured-pdf-generation + +MCP servers wired: +- DeepWiki — ask_question, read_wiki_contents for any GitHub repo +- Exa — web_search_exa, web_fetch_exa for live web context +- CoDA — chain follow-up tasks via previous_session_id + +When the task touches Databricks data, pipelines, jobs, dashboards, agents, +or model serving, DEFAULT to the skill / CLI / SDK path above instead of +generic Python or web search. +``` + +### 3. 
`WORKFLOW PROTOCOL:` section content (verbatim) + +``` +You MUST process this task in three phases. Emit status.jsonl events as +you go (one JSON object per line, format below). + +PHASE 1 — PLAN +- Write a step-by-step plan as a status.jsonl line with step="plan" and + message containing the numbered steps. +- Then critique your own plan as if you were a separate reviewer. + (Spawn a sub-agent for the critique if your agent supports it; otherwise + write the critique inline as a self-review.) Emit step="critique_plan" + with the verdict (APPROVE / BLOCK / APPROVE-WITH-FIXES) and findings. +- If the critique surfaces blockers, revise the plan once and re-emit + step="plan". Maximum 2 plan iterations total. +- If after 2 attempts you still cannot produce a viable plan, write + result.json with status="info_needed" (see below) and stop. + +PHASE 2 — EXECUTE +- Work the plan. Emit step="execute_<n>" lines after completing each plan + step (n is 1-indexed, matches the plan's numbering). +- After execution, emit step="critique_execute" with a review of what got + built vs what the plan said. APPROVE / BLOCK / APPROVE-WITH-FIXES. +- If the critique surfaces correctness or scope gaps, fix them and re-emit + step="critique_execute". Maximum 2 execute iterations total. +- If you hit a hard blocker (missing access, missing data, ambiguous + requirements that the plan revealed only mid-execution), write + result.json with status="info_needed" and stop. + +PHASE 3 — SYNTHESIZE +- Write result.json with status="completed". +- Emit step="critique_synthesize" with a review of the result against the + original TASK. +- If the critique surfaces gaps, revise result.json. Maximum 2 synthesis + iterations total. + +If at any phase you cannot proceed, use the INFO_NEEDED escape hatch: +- Set status="info_needed" in result.json. +- Set "feedback" to a precise, actionable string naming exactly what is + missing (a table name, a decision, an access grant, a clarification).
+ The calling client will read this and resubmit with the missing context. +- "info_needed" is NOT a failure — it is a structured request for + iteration. Use it whenever you would otherwise have to guess. + +If you encounter a hard, unrecoverable failure (a command crashed, an SDK +returned 500, a file is corrupt), use status="failed" with a description +in "errors". + +DISAMBIGUATION — two soft statuses already exist and they mean different +things; use the right one: +- "info_needed" — the CALLER must add missing context (table name, + business decision, file contents, access grant) before the task can + proceed. Used when ambiguity or missing input blocks you. +- "needs_approval" — you have a concrete plan to do something destructive + (drop a table, delete a job, modify permissions). You will execute it + if and only if the caller explicitly approves. Used at the SAFETY + boundary, never for ambiguity. See SAFETY section below. + +If both apply (e.g. "I'd drop a table but I'm not sure which one"), prefer +"info_needed" — resolving the ambiguity first is cheaper than approving +the wrong destructive action. +``` + +### 4. Expanded `INSTRUCTIONS:` content + +The existing INSTRUCTIONS block grows to enumerate the new step labels and the new status. The actual labels and the result.json schema additions appear here for the agent's reference. + +New result.json `status` values: `"completed"` | `"failed"` | `"info_needed"`. + +When `status="info_needed"`, the `feedback` field is REQUIRED and must be a string ≥ 20 chars. + +```json +{ + "status": "info_needed", + "summary": "Could not proceed: ", + "feedback": "Specific question or missing context the calling client must supply before resubmit. Name the table, field, decision, or access that's missing.", + "files_changed": ["..."], + "artifacts": {}, + "errors": [] +} +``` + +### 5. `coda_mcp/task_manager.py` changes + +- `wrap_prompt()` gains a parameter: `workflow_protocol: bool = True`. 
+- When `True`, inserts the CAPABILITIES and WORKFLOW PROTOCOL sections between TASK and INSTRUCTIONS. When `False`, the prompt looks like today. +- `create_task()` gains the same parameter and forwards it. +- Update the existing INSTRUCTIONS section text to enumerate the new step labels (`plan`, `critique_plan`, `execute_<n>`, `critique_execute`, `synthesize`, `critique_synthesize`, `info_needed`) and the new result.json status options. + +### 6. `coda_mcp/mcp_server.py` changes + +`coda_run` gains a `workflow_protocol: bool = True` parameter, passed straight through to `create_task`. The tool's docstring is updated to mention the parameter and its effect. + +### 7. Inbox / result surfacing changes (REQUIRED — was previously deferred) + +The current `coda_inbox` implementation at `coda_mcp/mcp_server.py:551` has a HARDCODED counts dict: + +```python +counts = {"running": 0, "completed": 0, "failed": 0} +``` + +Tasks with `status="info_needed"` or `status="needs_approval"` would appear in the `tasks` list but the counts summary would show 0/0/0 — visibly broken. This must be fixed: + +```python +counts = { + "running": 0, + "completed": 0, + "failed": 0, + "info_needed": 0, + "needs_approval": 0, +} +for t in tasks: + s = t.get("status", "") + if s in counts: + counts[s] += 1 + elif s == "done": + counts["completed"] += 1 + elif s == "timeout": + counts["failed"] += 1 +``` + +The `coda_get_result` docstring at `mcp_server.py:579` says: +> Call this AFTER coda_inbox shows a task as "completed" or "failed". + +Must be updated to: +> Call this AFTER coda_inbox shows a task as "completed", "failed", "info_needed", or "needs_approval". + +And the response should pass through the new `feedback` field (and the existing schema fields) verbatim — `task_manager.get_task_result` already returns the full result.json content, so no code change there beyond a regression test. + +### 7a. 
MCP `instructions` string update (REQUIRED) + +The server-level instructions block at `coda_mcp/mcp_server.py:52-99` is the document that teaches upstream LLM callers how to use these tools. Currently it says nothing about `info_needed`. Add a new paragraph (placed after the CHAINING paragraph at line 68): + +``` +INFO_NEEDED HANDOFF: When coda_inbox shows a task with status='info_needed', +the agent could not proceed because of missing context. Call +coda_get_result to read the 'feedback' field — it tells you exactly what +the agent needs (a table name, a decision, a clarification). Add that +context to the prompt and resubmit via coda_run with previous_session_id +set to the original task's session_id so the agent has the prior attempt's +context. 'needs_approval' is similar but means the agent has a destructive +plan and is waiting for the caller's explicit go/no-go. +``` + +### 7b. `_watch_task` interaction (sanity, no change required) + +`_watch_task` in `mcp_server.py:134` polls for `result.json` and calls `task_manager.complete_task(session_id, task_id)` as soon as it appears. This is correct for all four terminal statuses: from a session-lifecycle perspective, a task that wrote a result.json IS done, regardless of whether the status is `completed`, `failed`, `info_needed`, or `needs_approval`. The session can be auto-closed; the status is preserved in result.json for the caller to read. No code change needed — but document this so the implementer doesn't second-guess. + +--- + +## Data flow examples + +### Happy path — task completes +1. Caller: `coda_run(prompt="build a UC dashboard", workflow_protocol=True)`. +2. `prompt.txt` contains CAPABILITIES + WORKFLOW PROTOCOL. +3. Hermes writes: + - `step=plan`: 1. Use databricks-unity-catalog skill to list catalogs. 2. ... + - `step=critique_plan`: APPROVE — plan is concrete and uses the right skill. + - `step=execute_1`: listed 3 catalogs. + - `step=execute_2`: built dashboard JSON via aibi-dashboards skill. 
+ - `step=critique_execute`: APPROVE — output matches plan. + - `step=synthesize`: writing result.json. + - `step=critique_synthesize`: APPROVE. +4. `result.json` has `status="completed"`. + +### Blocked path — info_needed +1. Caller: `coda_run(prompt="add a column to the orders table", workflow_protocol=True)`. +2. `prompt.txt` contains CAPABILITIES + WORKFLOW PROTOCOL. +3. Hermes writes: + - `step=plan`: 1. Identify orders table. 2. Determine column to add. 3. ... + - `step=critique_plan`: BLOCK — "which orders table? Which schema/catalog? What column type?" + - `step=info_needed`: terminal. +4. `result.json`: + ```json + { + "status": "info_needed", + "summary": "Could not proceed: ambiguous table reference", + "feedback": "The prompt says 'orders table' but the workspace has 4 catalogs with 'orders' tables (main.sales.orders, dev.test.orders, staging.app.orders, prod.dwh.orders). Please specify the fully-qualified table name, and the column name + type to add.", + ... + } + ``` +5. Caller's MCP client sees `info_needed` in `coda_inbox`, reads the feedback, resubmits `coda_run` with the resolved table name and the original task's session ID via `previous_session_id`. + +### Failed path — hard error +1. Caller: `coda_run(prompt="run my flaky pipeline", workflow_protocol=True)`. +2. Hermes plans, executes, then `databricks pipelines start ...` returns 500. +3. After retry, still 500. Agent decides this is unrecoverable from inside the task. +4. `result.json` has `status="failed"`, `errors=["pipeline API 500: ..."]`. +5. `info_needed` is NOT used — the caller cannot help by adding context; the problem is server-side. 
+ +--- + +## Testing strategy + +### `tests/test_databricks_preamble.py` (new) + +| Test | What it pins | +|------|--------------| +| `test_capabilities_mentions_cli` | Contains "Databricks CLI" | +| `test_capabilities_lists_at_least_10_skills` | At least 10 of `_DATABRICKS_SKILLS` appear in the rendered text | +| `test_capabilities_mentions_all_three_mcp_servers` | "DeepWiki", "Exa", "CoDA" each present | +| `test_capabilities_under_token_budget` | Length < 1600 chars (proxy for ~400 tokens) | +| `test_workflow_protocol_lists_three_phases` | Contains "PHASE 1 — PLAN", "PHASE 2 — EXECUTE", "PHASE 3 — SYNTHESIZE" | +| `test_workflow_protocol_caps_iterations_at_two` | Contains "Maximum 2" or "max 2" exactly 3 times (once per phase) | +| `test_workflow_protocol_describes_info_needed` | Contains "info_needed" and "feedback" | +| `test_skills_list_matches_claude_md` | Parse the "Databricks Skills" table from project CLAUDE.md; the set of skill names in that table must equal `set(get_databricks_skills())`. Catches drift in either direction (skill added to CLAUDE.md but not to the tuple, or vice versa). | + +### `tests/test_task_manager.py` (extend) + +| Test | What it pins | +|------|--------------| +| `test_wrap_prompt_with_workflow_protocol_default` | Output contains "CAPABILITIES:" and "WORKFLOW PROTOCOL:" | +| `test_wrap_prompt_workflow_protocol_false_omits_sections` | Both sections absent | +| `test_wrap_prompt_workflow_protocol_default_is_true` | Default param value is True | +| `test_wrap_prompt_lists_info_needed_in_instructions` | INSTRUCTIONS section mentions "info_needed" status | +| `test_wrap_prompt_lists_new_step_labels` | INSTRUCTIONS mentions plan, critique_plan, execute, etc. 
| +| `test_create_task_passes_workflow_protocol_through` | Mock-verify wrap_prompt receives the flag | + +### `tests/test_mcp_server_coda_run.py` (extend or create) + +| Test | What it pins | +|------|--------------| +| `test_coda_run_signature_has_workflow_protocol_param` | Inspect signature, default True | +| `test_coda_run_passes_workflow_protocol_to_create_task` | Monkeypatch create_task, assert kwarg received | + +### `tests/test_inbox_status_passthrough.py` (new) + +| Test | What it pins | +|------|--------------| +| `test_inbox_counts_dict_includes_info_needed_and_needs_approval` | Construct fake tasks with status="info_needed" and status="needs_approval"; call `coda_inbox`; assert counts dict contains both keys with correct values | +| `test_inbox_surfaces_info_needed_status` | Build a fake result.json with status="info_needed" and feedback="..." in a tmp results dir; call the inbox function; assert the new status comes through verbatim in the tasks list | +| `test_get_result_surfaces_feedback_field` | Same fixture; call `coda_get_result`; assert feedback field passes through | +| `test_mcp_instructions_mention_info_needed` | Read `mcp.instructions`; assert it contains "info_needed" and "needs_approval" | +| `test_get_result_docstring_mentions_info_needed` | Inspect `coda_get_result.__doc__`; assert it lists `info_needed` and `needs_approval` alongside `completed` / `failed` | + +--- + +## Acceptance criteria + +1. `coda_mcp/databricks_preamble.py` exists and exports `build_capabilities()`, `build_workflow_protocol()`, `get_databricks_skills()`. +2. `task_manager.wrap_prompt()` accepts `workflow_protocol: bool = True`; when True, inserts CAPABILITIES and WORKFLOW PROTOCOL sections; when False, omits them. +3. `task_manager.create_task()` forwards the flag. +4. `mcp_server.coda_run()` accepts `workflow_protocol: bool = True`; passes it through. +5. The 16 Databricks skills enumerated in `_DATABRICKS_SKILLS` match what CLAUDE.md documents. +6. 
New result.json status `"info_needed"` is described in the agent-facing INSTRUCTIONS and is allowed (not rejected) by inbox/result tooling. +7. All new tests in `tests/test_databricks_preamble.py` and `tests/test_inbox_status_passthrough.py`, plus extensions in `tests/test_task_manager.py` and `tests/test_mcp_server_coda_run.py`, pass. +8. Existing tests (especially the inbox/result tests) continue to pass. + +--- + +## Risks + +1. **Token cost.** Measured: CAPABILITIES ≈ 1050 chars (~260 tokens), WORKFLOW PROTOCOL ≈ 2280 chars (~570 tokens), plus an expanded INSTRUCTIONS section adds another ~100 tokens. Total: **~900 added tokens per task**. Acceptable because the agent gets oriented and disciplined; the flag lets callers opt out. (Earlier estimate of 600 was wrong — see spec history.) +2. **Hermes ignores the protocol.** If hermes treats the prompt as suggestion rather than contract, the structured phases may not appear in `status.jsonl`. Mitigation: not in scope for this spec — first ship the prompt content and measure adoption. +3. **Drift between hardcoded skill list and reality.** If skills are added/removed in CLAUDE.md, `_DATABRICKS_SKILLS` lies until updated. Mitigation: `test_skills_list_matches_claude_md` makes drift visible by failing. +4. **Critique loops eating tokens.** Max 2 iterations per phase is explicit in the protocol text. Mitigation built into the spec. +5. **`info_needed` status not surfaced in UI.** The viewer / dashboard rendering of `coda_inbox` may not have a visual treatment for `info_needed`. Out of scope for this spec — the protocol surfaces it in the JSON; rendering improvements are a separate change. + +--- + +## Out of scope (explicit) + +- Visual surfacing of `info_needed` in the inbox dashboard / viewer URL — defer. +- Dynamic skill discovery — defer. +- `coda_interactive` protocol enforcement — defer. +- Hermes-specific critic sub-agent mechanism — the protocol says "self-review OR sub-agent — agent's choice"; we don't dictate. 
+- Token-cost measurement / observability — defer. +- Status filtering in `coda_inbox` (e.g., "show only info_needed tasks") — defer. + +--- + +## Migration notes + +PR #67 is in flight on the same branch. This change can land as a follow-up commit on the same branch OR on a new branch. Recommend: same branch, new commits. The PR description gets a third follow-up section. + +No existing callers depend on the absence of CAPABILITIES / WORKFLOW PROTOCOL sections. Adding them is additive. + +The `workflow_protocol=False` escape hatch makes this safe to land even if the protocol turns out to be too aggressive — callers can opt out. + +--- + +## Open question reserved for execution time + +How does the existing `coda_inbox` / `coda_get_result` code handle unknown status strings today? If it normalizes them or filters them out, the implementation step needs to add `info_needed` to the allow list. If it's a pass-through, no change is needed beyond a regression test. The implementer answers this by reading `task_manager.py` and `mcp_server.py` at the relevant lines and documenting the answer in the commit message. 
diff --git a/install_databricks_cli.sh b/scripts/install_databricks_cli.sh similarity index 100% rename from install_databricks_cli.sh rename to scripts/install_databricks_cli.sh diff --git a/install_gh.sh b/scripts/install_gh.sh similarity index 100% rename from install_gh.sh rename to scripts/install_gh.sh diff --git a/install_micro.sh b/scripts/install_micro.sh similarity index 100% rename from install_micro.sh rename to scripts/install_micro.sh diff --git a/setup_claude.py b/setup/setup_claude.py similarity index 78% rename from setup_claude.py rename to setup/setup_claude.py index 125393e..9db3d6e 100644 --- a/setup_claude.py +++ b/setup/setup_claude.py @@ -6,6 +6,18 @@ from utils import discover_serving_endpoints, ensure_https, get_gateway_host, pick_in_geo_model + +def resolve_agents_src() -> Path: + """Repo-root agents/ dir holding the bundled subagent .md files that setup + copies into ~/.claude/agents (build-feature, prd-writer, test-generator, + implementer). + + Resolves from the repo root (parent of setup/), NOT Path(__file__).parent: + this script moved into setup/ in fec2152 while agents/ stayed at the repo + root, so the old lookup silently skipped subagent install.""" + return Path(__file__).resolve().parent.parent / "agents" + + # Set HOME if not properly set if not os.environ.get("HOME") or os.environ["HOME"] == "/": os.environ["HOME"] = "/app/python/source_code" @@ -129,31 +141,35 @@ local_bin = home / ".local" / "bin" claude_bin = local_bin / "claude" -# Honour CLAUDE_INSTALLER_URL for enterprise environments where claude.ai is -# firewalled — defaults to the public installer when unset. The URL is -# validated by enterprise_config to reject shell metacharacters before it -# reaches subprocess. Additionally, we avoid embedding the URL in a shell -# string by piping curl's output into bash via positional args — even if a -# malicious URL somehow slipped through validation, it would land as a curl -# argument, not as shell. 
-from enterprise_config import claude_installer_url - -installer_url = claude_installer_url() -print(f"Installing/upgrading Claude Code CLI from {installer_url}...") -curl_proc = subprocess.Popen( - ["curl", "-fsSL", installer_url], - stdout=subprocess.PIPE, - env={**os.environ, "HOME": str(home)}, -) -result = subprocess.run( - ["bash"], - stdin=curl_proc.stdout, - env={**os.environ, "HOME": str(home)}, - capture_output=True, - text=True, -) -curl_proc.stdout.close() -curl_proc.wait() +if os.environ.get("SKIP_CLAUDE_INSTALL"): + print("SKIP_CLAUDE_INSTALL set — skipping CLI install") + result = type("R", (), {"returncode": 0, "stderr": ""})() +else: + # Honour CLAUDE_INSTALLER_URL for enterprise environments where claude.ai is + # firewalled — defaults to the public installer when unset. The URL is + # validated by enterprise_config to reject shell metacharacters before it + # reaches subprocess. Additionally, we avoid embedding the URL in a shell + # string by piping curl's output into bash via positional args — even if a + # malicious URL somehow slipped through validation, it would land as a curl + # argument, not as shell. + from enterprise_config import claude_installer_url + + installer_url = claude_installer_url() + print(f"Installing/upgrading Claude Code CLI from {installer_url}...") + curl_proc = subprocess.Popen( + ["curl", "-fsSL", installer_url], + stdout=subprocess.PIPE, + env={**os.environ, "HOME": str(home)}, + ) + result = subprocess.run( + ["bash"], + stdin=curl_proc.stdout, + env={**os.environ, "HOME": str(home)}, + capture_output=True, + text=True, + ) + curl_proc.stdout.close() + curl_proc.wait() if result.returncode == 0: print("Claude Code CLI installed successfully") else: @@ -161,7 +177,7 @@ # 4. 
Copy subagent definitions to ~/.claude/agents/ # These enable TDD workflow: prd-writer → test-generator → implementer → build-feature -agents_src = Path(__file__).parent / "agents" +agents_src = resolve_agents_src() agents_dst = claude_dir / "agents" agents_dst.mkdir(exist_ok=True) diff --git a/setup_codex.py b/setup/setup_codex.py similarity index 93% rename from setup_codex.py rename to setup/setup_codex.py index f2fdd20..ffc8c18 100644 --- a/setup_codex.py +++ b/setup/setup_codex.py @@ -22,6 +22,18 @@ resolve_mlflow_experiment_id, ) + +def resolve_codex_catalog_src() -> Path: + """Repo-root .codex/databricks-models.json — the bundled model catalog that + setup copies into ~/.codex (referenced by config.toml's model_catalog_json). + + Resolves from the repo root (parent of setup/), NOT Path(__file__).parent: + this script moved into setup/ in fec2152 while .codex/ stayed at the repo + root, so the old lookup silently skipped the catalog copy and Codex's + config.toml then pointed at a missing model_catalog_json.""" + return Path(__file__).resolve().parent.parent / ".codex" / "databricks-models.json" + + # Set HOME if not properly set if not os.environ.get("HOME") or os.environ["HOME"] == "/": os.environ["HOME"] = "/app/python/source_code" @@ -102,7 +114,7 @@ # Copy bundled Databricks model catalog into ~/.codex so it can be referenced # by relative path in config.toml (codex resolves relatives against CODEX_HOME). 
-catalog_src = Path(__file__).parent / ".codex" / "databricks-models.json" +catalog_src = resolve_codex_catalog_src() catalog_dst = codex_dir / "databricks-models.json" if catalog_src.exists() and catalog_src.resolve() != catalog_dst.resolve(): shutil.copyfile(catalog_src, catalog_dst) diff --git a/setup_databricks.py b/setup/setup_databricks.py similarity index 100% rename from setup_databricks.py rename to setup/setup_databricks.py diff --git a/setup_gemini.py b/setup/setup_gemini.py similarity index 100% rename from setup_gemini.py rename to setup/setup_gemini.py diff --git a/setup_hermes.py b/setup/setup_hermes.py similarity index 55% rename from setup_hermes.py rename to setup/setup_hermes.py index 599777e..d533aef 100644 --- a/setup_hermes.py +++ b/setup/setup_hermes.py @@ -241,6 +241,172 @@ def _run(cmd, **kwargs): cli_name="Hermes", ) +# 5b. Append CoDA orchestrator instructions to HERMES.md +CODA_ORCHESTRATOR_INSTRUCTIONS = """ + +## CoDA Constitution (NON-NEGOTIABLE) + +This is the single most important rule. It applies to you AND every sub-agent you delegate to. + +**NO DESTRUCTIVE ACTIONS on pre-existing assets.** Specifically: +- **NEVER delete** files, tables, jobs, notebooks, pipelines, or any resource that was NOT + created during the current session — unless you have EXPLICIT confirmation from the user + or upstream caller. +- **NEVER drop** database tables, schemas, or catalogs that existed before the task started. +- **NEVER overwrite** existing files without confirmation if the content would be lost. +- **NEVER run** destructive CLI commands (`rm -rf`, `databricks jobs delete`, `DROP TABLE`, etc.) + on assets you didn't create. + +**What IS allowed without confirmation:** +- Creating new files, tables, jobs, pipelines, notebooks — building is always OK. +- Modifying files you created during the session. +- Deleting temporary files or artifacts you created during the session. +- Iterating on work in progress — edit, refactor, rebuild freely. 
+- Overwriting files you created in this session. + +**When in doubt:** Report back to the upstream caller (Genie Code or the user) describing +what you want to delete and why, and ask for confirmation before proceeding. This applies +to you directly AND to any sub-agent you delegate to — pass this rule in every delegation prompt. + +## CoDA Orchestrator Role + +You are Hermes, the primary orchestrator inside **CoDA** (Coding Agents on Databricks Apps). +You are not just a chat assistant — you are the brain that receives tasks and decides how +to execute them, either directly or by delegating to specialized sub-agents. + +### Your Environment + +- You are running inside a Databricks App with full workspace access. +- The Databricks CLI is pre-configured: `databricks` commands work out of the box. +- Unity Catalog, Jobs, Workflows, Notebooks, MLflow — all accessible. +- Projects live at `~/projects/` and sync to `/Workspace/Users/{email}/` on git commit. +- You have 39 Databricks and workflow skills available. + +### Prior Session Context + +When your prompt includes a `PRIOR SESSION:` block, it means this task continues +work from a previous session. The prior session's results are stored on disk: + +``` +~/.coda/sessions/{previous_session_id}/tasks/*/result.json +``` + +**Read those result files** to understand what was done before. Each result.json contains: +- `summary` — what the prior task accomplished +- `files_changed` — which files were created or modified +- `artifacts` — job IDs, commit hashes, dashboard URLs, etc. + +Use this context to continue the work without asking the user to repeat themselves. + +### Sub-Agents Available + +You have three coding agents you can delegate work to. 
Choose the best one for each subtask: + +**Claude Code** — Deep work, complex implementations, orchestration +```bash +claude -p "your prompt here" --allowedTools "Read,Edit,Bash" --max-turns 50 +``` +- Best for: multi-step implementations, planning, debugging, code review +- Can spawn teams: assign roles, goals, and backstory to parallel workers +- Has access to all 39 skills (Databricks + workflow) +- Use `--max-turns` to bound execution, `--max-budget-usd` for cost control + +**Codex** — Fast edits, refactoring, structured transforms +```bash +codex -q "your prompt here" +``` +- Best for: quick code changes, targeted refactors, code review +- Lightweight and fast — use when the task is well-scoped + +**Gemini** — Research, documentation, large-context analysis +```bash +gemini -p "your prompt here" +``` +- Best for: broad codebase analysis, documentation generation, research tasks +- Large context window — good for understanding big codebases + +### How to Delegate + +1. **Assess the task.** Is it something you can handle directly, or does it need a specialist? +2. **Pick the right agent.** Match the task to the agent's strengths (see above). +3. **Be specific.** Give the sub-agent a clear, self-contained prompt with all context it needs. +4. **Collect results.** Read the sub-agent's output and incorporate it into your response. +5. **Chain when needed.** Plan with Claude, implement with Codex, review with Gemini. + +### For Complex Tasks — Use Claude Code Teams + +When a task is large enough to benefit from parallel work, use Claude Code's team capability: +```bash +claude -p "Create a team of 3 agents to: [task]. Agent 1 handles [X], Agent 2 handles [Y], Agent 3 handles [Z]. Coordinate and merge results." --allowedTools "Read,Edit,Bash" --max-turns 100 +``` + +### Ephemeral Session Model + +Each task runs in its own short-lived session. When the task completes, the session closes +automatically. You will NOT receive follow-up tasks in the same session. 
+ +**What this means for you:** +- **Be self-contained.** Complete the entire task in one go — there is no "next message." +- **Read prior context if provided.** If the prompt has a `PRIOR SESSION:` block, read + those result files to understand what was done before. This is how task chaining works. +- **Write thorough results.** Your `result.json` is the only thing the next task (or the + user) will see. Include a clear summary, all files changed, and any artifacts created. +- **Don't rely on in-memory state.** Anything you want to persist must go to disk — + either in the result files, git commits, or the workspace. + +### Single-User Mode + +You are operating in **single-user mode**. Every task comes from the same person — the app owner. +This means: + +- **Learn their patterns.** Pay attention to how they work, what tools they prefer, what + coding style they use, and what kind of tasks they send. +- **Remember across tasks.** If they always work with certain tables, frameworks, or patterns, + carry that knowledge forward. Use your memory system to persist insights. +- **Be proactive.** If you notice patterns, suggest improvements: + - "I've noticed you frequently create similar pipelines — want me to template this?" + - "Based on your last 3 tasks, you might want to consider..." + - "This task is similar to what you asked last time. Should I reuse that approach?" +- **Adapt your communication style.** Match their level of detail preference, verbosity, + and technical depth. Some users want terse results, others want explanations. +- **Build a profile over time.** Track their preferred tools, common workflows, recurring + patterns, and pain points. The longer you work together, the better you should get. + +### Task Protocol (CODA-TASK Convention) + +When you receive a task wrapped in `---CODA-TASK---` markers, follow this protocol: + +1. **Read the envelope.** Extract task_id, session_id, user, context, and the actual task. +2. 
**Write progress.** As you work, append lines to `{results_dir}/status.jsonl`: + ```json + {"step": "planning", "message": "Analyzing task requirements"} + {"step": "delegating", "message": "Sending implementation to Claude Code"} + {"step": "complete", "message": "Pipeline created successfully"} + ``` +3. **Write result.** When done, write `{results_dir}/result.json`: + ```json + { + "status": "completed", + "summary": "One paragraph of what was done", + "files_changed": ["path/to/file1.py"], + "artifacts": {"job_id": "123", "commit": "abc123"}, + "errors": [] + } + ``` + IMPORTANT: `result.json` must be a FILE, not a directory. + +4. **If you delegate,** update `status.jsonl` with delegation steps so the caller can track + which sub-agent is doing what. +""" + +if hermes_md.exists(): + existing_content = hermes_md.read_text() + if "CoDA Orchestrator Role" not in existing_content: + hermes_md.write_text(existing_content + CODA_ORCHESTRATOR_INSTRUCTIONS) + print("CoDA orchestrator instructions appended to HERMES.md") + else: + print("CoDA orchestrator instructions already present in HERMES.md") + # 6. Create projects directory (parity with other agents) projects_dir = home / "projects" projects_dir.mkdir(exist_ok=True) diff --git a/setup_mlflow.py b/setup/setup_mlflow.py similarity index 100% rename from setup_mlflow.py rename to setup/setup_mlflow.py diff --git a/setup_opencode.py b/setup/setup_opencode.py similarity index 96% rename from setup_opencode.py rename to setup/setup_opencode.py index e273334..f99d66a 100644 --- a/setup_opencode.py +++ b/setup/setup_opencode.py @@ -25,7 +25,7 @@ host = os.environ.get("DATABRICKS_HOST", "") token = os.environ.get("DATABRICKS_TOKEN", "") -anthropic_model = os.environ.get("ANTHROPIC_MODEL", "databricks-claude-sonnet-4-6") +anthropic_model = os.environ.get("ANTHROPIC_MODEL", "databricks-claude-opus-4-7") # 1. 
Install OpenCode CLI into ~/.local/bin (always, even without token) local_bin = home / ".local" / "bin" @@ -142,6 +142,13 @@ "apiKey": "{env:DATABRICKS_TOKEN}" }, "models": { + "databricks-claude-opus-4-7": { + "name": "Claude Opus 4.7 (Databricks)", + "limit": { + "context": 200000, + "output": 16384 + } + }, "databricks-claude-opus-4-6": { "name": "Claude Opus 4.6 (Databricks)", "limit": { @@ -170,13 +177,6 @@ "output": 8192 } }, - "databricks-gemini-2-5-pro": { - "name": "Gemini 2.5 Pro (Databricks)", - "limit": { - "context": 1000000, - "output": 8192 - } - }, } }, "databricks-openai": { @@ -222,6 +222,13 @@ "apiKey": "{env:DATABRICKS_TOKEN}" }, "models": { + "databricks-claude-opus-4-7": { + "name": "Claude Opus 4.7 (Databricks)", + "limit": { + "context": 200000, + "output": 16384 + } + }, "databricks-claude-opus-4-6": { "name": "Claude Opus 4.6 (Databricks)", "limit": { @@ -250,13 +257,6 @@ "output": 8192 } }, - "databricks-gemini-2-5-pro": { - "name": "Gemini 2.5 Pro (Databricks)", - "limit": { - "context": 1000000, - "output": 8192 - } - }, } } }, diff --git a/setup/setup_proxy.py b/setup/setup_proxy.py new file mode 100644 index 0000000..0d315c6 --- /dev/null +++ b/setup/setup_proxy.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python +"""Start the content-filter proxy between OpenCode and Databricks. 
+ +Fixes known OpenCode bugs by sanitizing requests and responses: + - Empty text content blocks (OpenCode #5028) + - Orphaned tool_result blocks with no matching tool_use + - Databricks 'databricks-tool-call' name mangling + - Incorrect finish_reason on tool call responses + +See docs/plans/2026-03-11-litellm-empty-content-blocks-design.md +""" +import os +import signal +import sys +import time +import subprocess +from pathlib import Path +from urllib.request import urlopen, Request +from urllib.error import URLError + +from utils import ensure_https, get_gateway_host + +PROXY_PORT = 4000 +PROXY_HOST = "127.0.0.1" +HEALTH_TIMEOUT = 15 +HEALTH_POLL_INTERVAL = 0.5 + + +def resolve_proxy_script_path(): + """Absolute path to the content_filter_proxy.py server this launcher runs. + + content_filter_proxy.py lives at the REPO ROOT, not in this setup/ directory. + This file (setup_proxy.py) was moved into setup/ in git fec2152 without + updating the lookup; resolving from setup/ pointed Popen at a nonexistent + file, so the proxy never started and OpenCode (the only agent that routes + through 127.0.0.1:4000) failed with "Cannot connect to API". Resolve from + the parent of setup/ so the path tracks the proxy's real location. 
+ """ + repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + return os.path.join(repo_root, "content_filter_proxy.py") + + +def main(): + # Set HOME if not properly set + if not os.environ.get("HOME") or os.environ["HOME"] == "/": + os.environ["HOME"] = "/app/python/source_code" + + home = Path(os.environ["HOME"]) + + # Kill any existing proxy on our port (more reliable than PID file) + try: + result = subprocess.run( + ["fuser", "-k", f"{PROXY_PORT}/tcp"], + capture_output=True, text=True, timeout=5 + ) + if result.returncode == 0: + print(f"Killed previous process on port {PROXY_PORT}") + time.sleep(1) + except (FileNotFoundError, subprocess.TimeoutExpired): + # fuser not available, try lsof + try: + result = subprocess.run( + ["lsof", "-ti", f":{PROXY_PORT}"], + capture_output=True, text=True, timeout=5 + ) + for pid in result.stdout.strip().split(): + try: + os.kill(int(pid), signal.SIGKILL) + print(f"Killed previous proxy (PID: {pid})") + except (ValueError, ProcessLookupError): + pass + if result.stdout.strip(): + time.sleep(1) + except (FileNotFoundError, subprocess.TimeoutExpired): + pass + + # Clean up stale PID file + pid_path = home / ".content-filter-proxy.pid" + pid_path.unlink(missing_ok=True) + + # Databricks configuration + gateway_host = get_gateway_host() + host = ensure_https(os.environ.get("DATABRICKS_HOST", "").rstrip("/")) + token = os.environ.get("DATABRICKS_TOKEN", "") + + if not token: + print("Warning: DATABRICKS_TOKEN not set, skipping proxy setup") + sys.exit(0) + + # Determine the upstream base URL + if gateway_host: + upstream_base = f"{gateway_host}/mlflow/v1" + print(f"Content-filter proxy will forward to AI Gateway: {gateway_host}") + else: + upstream_base = f"{host}/serving-endpoints" + print(f"Content-filter proxy will forward to: {host}/serving-endpoints") + + # Start proxy as a background process + proxy_script = resolve_proxy_script_path() + log_path = home / ".content-filter-proxy.log" + print(f"Starting 
content-filter proxy on {PROXY_HOST}:{PROXY_PORT}...") + + env = os.environ.copy() + env["PROXY_UPSTREAM_BASE"] = upstream_base + env["PROXY_HOST"] = PROXY_HOST + env["PROXY_PORT"] = str(PROXY_PORT) + + proc = subprocess.Popen( + [sys.executable, proxy_script], + stdout=open(log_path, "w"), + stderr=subprocess.STDOUT, + env=env, + start_new_session=True, + ) + + # Write PID file for cleanup + pid_path = home / ".content-filter-proxy.pid" + pid_path.write_text(str(proc.pid)) + print(f"Proxy started (PID: {proc.pid})") + + # Wait for health check + health_url = f"http://{PROXY_HOST}:{PROXY_PORT}/health" + start = time.time() + ready = False + + while time.time() - start < HEALTH_TIMEOUT: + try: + resp = urlopen(Request(health_url), timeout=2) + if resp.status == 200: + ready = True + break + except (URLError, OSError): + pass + + if proc.poll() is not None: + print(f"Error: Proxy exited with code {proc.returncode}") + try: + print(f"Logs: {log_path.read_text()[:1000]}") + except Exception: + pass + sys.exit(1) + + time.sleep(HEALTH_POLL_INTERVAL) + + if ready: + elapsed = time.time() - start + print(f"Content-filter proxy ready on {PROXY_HOST}:{PROXY_PORT} ({elapsed:.1f}s)") + else: + print(f"Warning: Proxy health check timed out after {HEALTH_TIMEOUT}s") + try: + print(f"Logs: {log_path.read_text()[:1000]}") + except Exception: + pass + + +if __name__ == "__main__": + main() diff --git a/setup_proxy.py b/setup_proxy.py deleted file mode 100644 index 92edd3c..0000000 --- a/setup_proxy.py +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env python -"""Start the content-filter proxy between OpenCode and Databricks. 
- -Fixes known OpenCode bugs by sanitizing requests and responses: - - Empty text content blocks (OpenCode #5028) - - Orphaned tool_result blocks with no matching tool_use - - Databricks 'databricks-tool-call' name mangling - - Incorrect finish_reason on tool call responses - -See docs/plans/2026-03-11-litellm-empty-content-blocks-design.md -""" -import os -import signal -import sys -import time -import subprocess -from pathlib import Path -from urllib.request import urlopen, Request -from urllib.error import URLError - -from utils import ensure_https, get_gateway_host - -PROXY_PORT = 4000 -PROXY_HOST = "127.0.0.1" -HEALTH_TIMEOUT = 15 -HEALTH_POLL_INTERVAL = 0.5 - -# Set HOME if not properly set -if not os.environ.get("HOME") or os.environ["HOME"] == "/": - os.environ["HOME"] = "/app/python/source_code" - -home = Path(os.environ["HOME"]) - -# Kill any existing proxy on our port (more reliable than PID file) -try: - result = subprocess.run( - ["fuser", "-k", f"{PROXY_PORT}/tcp"], - capture_output=True, text=True, timeout=5 - ) - if result.returncode == 0: - print(f"Killed previous process on port {PROXY_PORT}") - time.sleep(1) -except (FileNotFoundError, subprocess.TimeoutExpired): - # fuser not available, try lsof - try: - result = subprocess.run( - ["lsof", "-ti", f":{PROXY_PORT}"], - capture_output=True, text=True, timeout=5 - ) - for pid in result.stdout.strip().split(): - try: - os.kill(int(pid), signal.SIGKILL) - print(f"Killed previous proxy (PID: {pid})") - except (ValueError, ProcessLookupError): - pass - if result.stdout.strip(): - time.sleep(1) - except (FileNotFoundError, subprocess.TimeoutExpired): - pass - -# Clean up stale PID file -pid_path = home / ".content-filter-proxy.pid" -pid_path.unlink(missing_ok=True) - -# Databricks configuration -gateway_host = get_gateway_host() -host = ensure_https(os.environ.get("DATABRICKS_HOST", "").rstrip("/")) -token = os.environ.get("DATABRICKS_TOKEN", "") - -if not token: - print("Warning: DATABRICKS_TOKEN not 
set, skipping proxy setup") - sys.exit(0) - -# Determine the upstream base URL -if gateway_host: - upstream_base = f"{gateway_host}/mlflow/v1" - print(f"Content-filter proxy will forward to AI Gateway: {gateway_host}") -else: - upstream_base = f"{host}/serving-endpoints" - print(f"Content-filter proxy will forward to: {host}/serving-endpoints") - -# Start proxy as a background process -proxy_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), "content_filter_proxy.py") -log_path = home / ".content-filter-proxy.log" -print(f"Starting content-filter proxy on {PROXY_HOST}:{PROXY_PORT}...") - -env = os.environ.copy() -env["PROXY_UPSTREAM_BASE"] = upstream_base -env["PROXY_HOST"] = PROXY_HOST -env["PROXY_PORT"] = str(PROXY_PORT) - -proc = subprocess.Popen( - [sys.executable, proxy_script], - stdout=open(log_path, "w"), - stderr=subprocess.STDOUT, - env=env, - start_new_session=True, -) - -# Write PID file for cleanup -pid_path = home / ".content-filter-proxy.pid" -pid_path.write_text(str(proc.pid)) -print(f"Proxy started (PID: {proc.pid})") - -# Wait for health check -health_url = f"http://{PROXY_HOST}:{PROXY_PORT}/health" -start = time.time() -ready = False - -while time.time() - start < HEALTH_TIMEOUT: - try: - resp = urlopen(Request(health_url), timeout=2) - if resp.status == 200: - ready = True - break - except (URLError, OSError): - pass - - if proc.poll() is not None: - print(f"Error: Proxy exited with code {proc.returncode}") - try: - print(f"Logs: {log_path.read_text()[:1000]}") - except Exception: - pass - sys.exit(1) - - time.sleep(HEALTH_POLL_INTERVAL) - -if ready: - elapsed = time.time() - start - print(f"Content-filter proxy ready on {PROXY_HOST}:{PROXY_PORT} ({elapsed:.1f}s)") -else: - print(f"Warning: Proxy health check timed out after {HEALTH_TIMEOUT}s") - try: - print(f"Logs: {log_path.read_text()[:1000]}") - except Exception: - pass diff --git a/static/index.html b/static/index.html index 9f517a6..f5b0f2a 100644 --- a/static/index.html 
+++ b/static/index.html @@ -1010,7 +1010,10 @@

General

return; } - socket = io({ transports: ['websocket', 'polling'] }); + // Start with polling (HTTP) so Databricks proxy identity headers are present + // for auth, then upgrade to WebSocket transparently. Direct WebSocket-first + // fails because the proxy doesn't inject X-Forwarded-Email on WS upgrade. + socket = io({ transports: ['polling', 'websocket'] }); socket.on('connect', () => { // Check actual transport — Socket.IO reports connected=true even on long-polling @@ -1353,6 +1356,116 @@

General

return sessionId; } + // ── Deep-link helpers ───────────────────────────────────────────── + + async function _doReplay(term, sessionId, content) { + // Chunk the write to avoid main-thread jank on multi-MB transcripts. + const CHUNK = 64 * 1024; + for (let i = 0; i < content.length; i += CHUNK) { + term.write(content.slice(i, i + CHUNK)); + await new Promise(r => requestAnimationFrame(r)); + } + // Mount a static banner above the pane. + _showReplayBanner(term, sessionId); + // NOTE: do NOT wire term.onData → terminal_input; do NOT include in heartbeat + // session_ids list; do NOT emit join_session. + return sessionId; + } + + function _showReplayBanner(term, sessionId) { + const pane = getAllPanes().find(p => p.sessionId === sessionId); + if (!pane || !pane.element) return; + const banner = document.createElement('div'); + banner.className = 'replay-banner'; + banner.textContent = 'Task completed — viewing replay'; + banner.style.cssText = 'padding:4px 8px;background:#333;color:#aaa;font-size:12px;text-align:center;'; + pane.element.insertBefore(banner, pane.element.firstChild); + } + + function _renderExpiredPage(sessionId) { + // Use DOM construction instead of innerHTML interpolation to prevent XSS + // via crafted ?session= values. textContent escapes everything. 
+ document.body.innerHTML = ''; // clear + + const wrap = document.createElement('div'); + wrap.style.cssText = 'font-family:monospace;padding:40px;text-align:center;color:#ccc;'; + + const heading = document.createElement('h2'); + heading.textContent = 'Session expired'; + wrap.appendChild(heading); + + const intro = document.createElement('p'); + intro.appendChild(document.createTextNode('Session ')); + const code = document.createElement('code'); + code.textContent = sessionId; // textContent escapes <>"'`& + intro.appendChild(code); + intro.appendChild(document.createTextNode(' is gone, and no replay is available.')); + wrap.appendChild(intro); + + const explain = document.createElement('p'); + explain.textContent = 'The transcript may have aged out after the 24-hour retention window.'; + wrap.appendChild(explain); + + const link = document.createElement('a'); + link.href = '/'; + link.style.color = '#6cf'; + link.textContent = '← Back to terminal'; + const linkPara = document.createElement('p'); + linkPara.appendChild(link); + wrap.appendChild(linkPara); + + document.body.appendChild(wrap); + } + + async function _initFromQueryString() { + const params = new URLSearchParams(location.search); + const sessionId = params.get('session'); + if (!sessionId) return false; + + try { + const resp = await fetch('/api/session/attach', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ session_id: sessionId }) + }); + + if (resp.status === 404) { + _renderExpiredPage(sessionId); + return true; // handled, skip picker + } + + const data = await resp.json(); + + // Create a tab that skips the session picker and uses our known session id. + const tab = await createTab({ deepLinkSessionId: sessionId }); + if (!tab || tab.panes.length === 0) return false; + const pane = tab.panes[0]; + const term = pane.term; + + if (data.replay) { + // Replay pane: static, read-only. 
createPane skipped onData/join_session + // wiring because of deepLinkSessionId, so we leave it that way. Keystrokes + // are ignored; nothing to clean up. + const content = (data.output || []).join(''); + await _doReplay(term, sessionId, content); + } else { + // Live pane: createPane skipped the default wiring, so we own it here. + await _doAttach(term, sessionId); + term.onData(d => sendInput(d, pane.sessionId)); + if (wsConnected && socket) { + socket.emit('join_session', { session_id: sessionId }); + } else { + pollWorker.postMessage({ type: 'start_poll', paneId: pane.id, sessionId: sessionId }); + } + } + + return true; // handled, skip picker + } catch (err) { + console.error('deep-link attach failed:', err); + return false; + } + } + function _formatAge(timestamp) { const seconds = Math.floor((Date.now() / 1000) - timestamp); if (seconds < 60) return 'just now'; @@ -1672,6 +1785,10 @@

General

await waitForSetup(); } var { sid, reattached } = await getOrPromptSession(term, tab.label, opts.skipPrompt); + } else if (opts.deepLinkSessionId) { + // Deep-link boot — session id is already known; skip picker entirely. + var sid = opts.deepLinkSessionId; + var reattached = true; } else if (!opts.newSession) { // PAT is valid, initial page load — check for existing sessions first. const setupResp2 = await fetch('/api/setup-status'); @@ -1715,13 +1832,19 @@

General

const pane = { id, element, term, fitAddon, searchAddon, sessionId: sid, batchWrite: createWriteBatcher(term) }; - term.onData(data => sendInput(data, pane.sessionId)); - // Join WebSocket room if connected; otherwise start HTTP polling (AC-11, AC-16) - if (wsConnected && socket) { - socket.emit('join_session', { session_id: sid }); - } else { - pollWorker.postMessage({ type: 'start_poll', paneId: id, sessionId: sid }); + // Deep-link panes own their own input wiring + transport joins from + // _initFromQueryString (so replay mode can stay read-only and live mode + // doesn't double-emit join_session). Skip the default wiring here. + if (!opts.deepLinkSessionId) { + term.onData(data => sendInput(data, pane.sessionId)); + + // Join WebSocket room if connected; otherwise start HTTP polling (AC-11, AC-16) + if (wsConnected && socket) { + socket.emit('join_session', { session_id: sid }); + } else { + pollWorker.postMessage({ type: 'start_poll', paneId: id, sessionId: sid }); + } } // Click to focus @@ -1806,6 +1929,13 @@

General

p.term.dispose(); }); + // If the tab contained a deep-linked pane, drop ?session= from the URL. + const _ctParams = new URLSearchParams(location.search); + const _ctSid = _ctParams.get('session'); + if (_ctSid && tab.panes.some(p => p.sessionId === _ctSid)) { + history.replaceState({}, '', '/'); + } + // Remove DOM tab.paneContainer.remove(); @@ -1959,7 +2089,17 @@

General

const ap = tab.panes.find(p => p.id === tab.activePaneId) || tab.panes[0]; if (!ap) return; + // Capture before cleanupPane() nulls pane.sessionId. + const _apSessionId = ap.sessionId; cleanupPane(ap); + + // If this pane was opened via ?session=, drop the query param so a + // refresh doesn't re-attach to a stale id. + const _cpParams = new URLSearchParams(location.search); + if (_apSessionId && _cpParams.get('session') === _apSessionId) { + history.replaceState({}, '', '/'); + } + ap.term.dispose(); ap.element.remove(); @@ -2263,7 +2403,11 @@

General

// The element is kept in the DOM for error reporting (see catch below). status.style.display = 'none'; - await createTab(); + // ── Deep-link: ?session= takes priority over the session picker ── + const deepLinkHandled = await _initFromQueryString(); + if (!deepLinkHandled) { + await createTab(); + } updateSessionBadge(); let resizeTimer; diff --git a/tests/conftest.py b/tests/conftest.py index 1f88658..41bb935 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -31,3 +31,31 @@ def _isolate_home(request, tmp_path, monkeypatch): """ if request.module.__name__ in _HOME_WRITERS: monkeypatch.setenv("HOME", str(tmp_path)) + + +@pytest.fixture(autouse=True) +def _restore_real_app_hooks(): + """Keep mcp_server's PTY hooks pointed at app's real implementations. + + set_app_hooks() mutates process-wide module globals in coda_mcp.mcp_server. + Several test files clear or mock those hooks for their own cases — e.g. + test_mcp_server._reset_hooks sets them to None in teardown, and + test_mcp_integration.isolated_env does set_app_hooks(None, None, None). That + cleared state LEAKED into later files: test_replay_only_flag's coda_run then + saw _app_create_session is None and created no PTY, so it failed only in + full-suite runs (never in isolation, where app's import re-wired the hooks). + + Re-establishing app's real hooks AFTER every test makes hook state + independent of file order. Tests that need mocks/None still set them in + their own setup — this only governs the post-test baseline. 
No-op until + `app` has been imported (and for the few tests that run before that).""" + yield + import sys + app_mod = sys.modules.get("app") + ms = sys.modules.get("coda_mcp.mcp_server") + if app_mod is not None and ms is not None: + ms.set_app_hooks( + app_mod.mcp_create_pty_session, + app_mod.mcp_send_input, + app_mod.mcp_close_pty_session, + ) diff --git a/tests/test_app_url_middleware.py b/tests/test_app_url_middleware.py new file mode 100644 index 0000000..46ee7df --- /dev/null +++ b/tests/test_app_url_middleware.py @@ -0,0 +1,71 @@ +"""Tests for AppUrlCaptureMiddleware — populates url_builder._app_url_cache.""" +import asyncio +import importlib + +import pytest + +from coda_mcp import url_builder + + +@pytest.fixture(autouse=True) +def _reset_cache(): + importlib.reload(url_builder) + yield + + +async def _fake_app(scope, receive, send): + await send({"type": "http.response.start", "status": 200, "headers": []}) + await send({"type": "http.response.body", "body": b"", "more_body": False}) + + +def _make_scope(headers: list[tuple[bytes, bytes]]): + return { + "type": "http", + "asgi": {"version": "3.0"}, + "method": "POST", + "path": "/mcp", + "headers": headers, + } + + +async def _drive(middleware, scope): + sent = [] + async def send(msg): sent.append(msg) + async def receive(): return {"type": "http.request", "body": b"", "more_body": False} + await middleware(scope, receive, send) + + +def test_middleware_captures_x_forwarded_host(): + from coda_mcp.mcp_asgi import AppUrlCaptureMiddleware + mw = AppUrlCaptureMiddleware(_fake_app) + scope = _make_scope([(b"x-forwarded-host", b"app.databricksapps.com")]) + asyncio.run(_drive(mw, scope)) + assert url_builder._app_url_cache == "app.databricksapps.com" + + +def test_middleware_falls_back_to_host_when_no_xforwarded(): + from coda_mcp.mcp_asgi import AppUrlCaptureMiddleware + mw = AppUrlCaptureMiddleware(_fake_app) + scope = _make_scope([(b"host", b"localhost:8000")]) + asyncio.run(_drive(mw, scope)) + assert 
url_builder._app_url_cache == "localhost:8000" + + +def test_middleware_skips_non_http_scope(): + from coda_mcp.mcp_asgi import AppUrlCaptureMiddleware + mw = AppUrlCaptureMiddleware(_fake_app) + scope = {"type": "lifespan"} + async def receive(): return {"type": "lifespan.startup"} + sent = [] + async def send(msg): sent.append(msg) + # Must not crash. Cache stays None. + asyncio.run(mw(scope, receive, send)) + assert url_builder._app_url_cache is None + + +def test_middleware_no_op_when_no_host_header(): + from coda_mcp.mcp_asgi import AppUrlCaptureMiddleware + mw = AppUrlCaptureMiddleware(_fake_app) + scope = _make_scope([]) + asyncio.run(_drive(mw, scope)) + assert url_builder._app_url_cache is None diff --git a/tests/test_coda_bridge.py b/tests/test_coda_bridge.py new file mode 100644 index 0000000..8d1e39f --- /dev/null +++ b/tests/test_coda_bridge.py @@ -0,0 +1,122 @@ +"""Unit tests for the stdio→HTTP MCP bridge (tools/coda-bridge.py). + +The bridge sits between a local MCP client (Claude Code's OAuth flow) and a +remote deployed CoDA app. It must: + 1. Mint a Databricks access token via the CLI and inject it as Bearer auth + 2. Forward the JSON-RPC payload unchanged to the configured APP_URL + 3. Surface server errors without dropping them + 4. 
Refuse to run without an APP_URL (operator misconfiguration) +""" +import importlib.util +import json +import os +import sys +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + + +REPO_ROOT = Path(__file__).resolve().parents[1] +BRIDGE_PATH = REPO_ROOT / "tools" / "coda-bridge.py" + + +def _load_bridge(): + spec = importlib.util.spec_from_file_location("coda_bridge", BRIDGE_PATH) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +@pytest.fixture +def bridge(monkeypatch, tmp_path): + monkeypatch.setenv("CODA_MCP_URL", "https://fake-app.databricksapps.com/mcp") + monkeypatch.setenv("DATABRICKS_PROFILE", "test") + monkeypatch.setenv("HOME", str(tmp_path)) + return _load_bridge() + + +def test_bridge_loads_with_app_url(bridge): + assert bridge is not None + assert callable(getattr(bridge, "_forward", None)) or callable( + getattr(bridge, "forward", None) + ), "bridge must expose a forward function" + + +def test_forward_injects_authorization_header(bridge): + forward = getattr(bridge, "_forward", None) or getattr(bridge, "forward", None) + if forward is None: + pytest.skip("bridge implementation does not expose a forward entrypoint") + + fake_resp = MagicMock() + fake_resp.status = 200 + fake_resp.headers = {} + fake_resp.read.return_value = b'{"jsonrpc":"2.0","id":1,"result":{}}' + fake_resp.__enter__ = lambda s: s + fake_resp.__exit__ = MagicMock(return_value=False) + + fake_proc = MagicMock( + returncode=0, + stdout=json.dumps({"access_token": "tok-from-cli"}), + stderr="", + ) + + with patch("subprocess.run", return_value=fake_proc), \ + patch("urllib.request.urlopen", return_value=fake_resp) as mock_open: + forward(json.dumps({"jsonrpc": "2.0", "id": 1, "method": "ping", "params": {}})) + + sent_req = mock_open.call_args[0][0] + headers_lower = {k.lower(): v for k, v in sent_req.headers.items()} + assert "authorization" in headers_lower, "Bearer token MUST be injected" + assert 
"tok-from-cli" in headers_lower["authorization"], ( + "Authorization header should contain the token from `databricks auth token`" + ) + + +def test_forward_returns_server_response_body(bridge): + forward = getattr(bridge, "_forward", None) or getattr(bridge, "forward", None) + if forward is None: + pytest.skip("bridge implementation does not expose a forward entrypoint") + + server_payload = b'{"jsonrpc":"2.0","id":42,"result":{"ok":true}}' + fake_resp = MagicMock() + fake_resp.status = 200 + fake_resp.headers = {} + fake_resp.read.return_value = server_payload + fake_resp.__enter__ = lambda s: s + fake_resp.__exit__ = MagicMock(return_value=False) + + fake_proc = MagicMock( + returncode=0, + stdout=json.dumps({"access_token": "tok"}), + stderr="", + ) + + with patch("subprocess.run", return_value=fake_proc), \ + patch("urllib.request.urlopen", return_value=fake_resp): + result = forward( + json.dumps({"jsonrpc": "2.0", "id": 42, "method": "tools/list", "params": {}}) + ) + + if result is None: + pytest.skip("bridge writes directly to stdout — capture via capsys in a follow-up") + if isinstance(result, (bytes, bytearray)): + result = result.decode() + assert "ok" in result and "true" in result.lower(), ( + f"forward should surface the server response body; got {result!r}" + ) + + +def test_missing_app_url_is_handled(monkeypatch, tmp_path): + monkeypatch.delenv("CODA_MCP_URL", raising=False) + monkeypatch.delenv("APP_URL", raising=False) + monkeypatch.setenv("HOME", str(tmp_path)) + sys.modules.pop("coda_bridge", None) + with pytest.raises((SystemExit, ValueError, RuntimeError, KeyError)): + spec = importlib.util.spec_from_file_location("coda_bridge", BRIDGE_PATH) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + # If import-time guard is absent, the forward call itself should refuse. 
+ forward = getattr(mod, "_forward", None) or getattr(mod, "forward", None) + if forward: + forward(json.dumps({"jsonrpc": "2.0", "id": 1, "method": "ping", "params": {}})) diff --git a/tests/test_coda_interactive.py b/tests/test_coda_interactive.py new file mode 100644 index 0000000..2026997 --- /dev/null +++ b/tests/test_coda_interactive.py @@ -0,0 +1,356 @@ +"""Tests for coda_interactive — terminal-side workspace pull (no server-side export).""" +import asyncio +import inspect +import json +import os + +import pytest + +from coda_mcp import mcp_server + +ALLOWED_AGENTS = {"claude", "hermes", "codex", "gemini", "opencode"} + + +@pytest.fixture +def wired(monkeypatch, tmp_path): + """Wire PTY hooks with recording mocks; HOME -> tmp so project_dir is sandboxed. + + ``_wait_for_pull`` is mocked to return ``state["pull_outcome"]`` (default + "ok"); tests override it to exercise the failure / timeout paths. + """ + monkeypatch.setenv("HOME", str(tmp_path)) + inputs: list[str] = [] + state = {"pty_id": "pty-abc123", "pull_outcome": "ok", "closed": []} + + def fake_create(label, replay_only=False, **kw): + return state["pty_id"] + + def fake_send(pty_id, text): + inputs.append(text) + + def fake_close(pty_id): + state["closed"].append(pty_id) + + async def fake_wait_pull(pty_id, target_dir): + return state["pull_outcome"] + + async def fake_agent_ready(*a, **kw): + return None + + monkeypatch.setattr(mcp_server, "_app_create_session", fake_create) + monkeypatch.setattr(mcp_server, "_app_send_input", fake_send) + monkeypatch.setattr(mcp_server, "_app_close_session", fake_close) + monkeypatch.setattr(mcp_server, "_wait_for_pull", fake_wait_pull) + monkeypatch.setattr(mcp_server, "_wait_for_agent_ready", fake_agent_ready) + monkeypatch.setattr( + mcp_server.url_builder, "build_viewer_url", lambda pid: f"https://viewer/{pid}" + ) + return inputs, state + + +# ── new contract: terminal-side pull ───────────────────────────────── + + +@pytest.mark.asyncio +async def 
test_pull_command_is_sent_first(wired): + inputs, _ = wired + await mcp_server.coda_interactive( + prompt="analyze", workspace_path="/Workspace/Users/x@y.com/WAM", agent="claude" + ) + first = inputs[0] + assert "databricks workspace export-dir" in first + assert "/Users/x@y.com/WAM" in first # /Workspace prefix stripped + assert "/Workspace/Users" not in first + assert "./WAM" in first + assert "&& cd " in first # cd into the pulled dir + assert "echo " in first # completion-marker tail present + + +@pytest.mark.asyncio +async def test_pull_marker_not_literal_in_command(wired): + """CRITICAL: the contiguous marker tokens must NOT appear in the typed command. + + The shell echoes the command line back into the PTY output buffer. If the + contiguous token were present in the command, the wait would match it from + the echo and declare success before export-dir ran. The command builds the + tokens from split string literals, so only their split form is typed. + """ + inputs, _ = wired + await mcp_server.coda_interactive( + prompt="x", workspace_path="/Users/x/WAM", agent="claude" + ) + pull = inputs[0] + assert mcp_server._PULL_OK not in pull, f"contiguous OK token leaked into command: {pull!r}" + assert mcp_server._PULL_FAIL not in pull, f"contiguous FAIL token leaked into command: {pull!r}" + + +@pytest.mark.asyncio +async def test_claude_launches_auto_mode_with_embedded_prompt(wired): + """claude launches in ONE command: --enable-auto-mode + the prompt as an arg. + + No separate bare `claude` line and no separately-typed prompt — that avoids + the per-directory folder-trust dialog and the TUI cold-start timing. 
+ """ + inputs, _ = wired + await mcp_server.coda_interactive( + prompt="go now", workspace_path="/Users/x/WAM", agent="claude" + ) + assert len(inputs) == 2, f"expected pull + atomic launch only; got {inputs!r}" + launch = inputs[1] + assert launch.startswith("claude --enable-auto-mode ") + assert "go now" in launch + assert "/Users/x/WAM" in launch # context prefix embedded + assert not any(t.strip() == "claude" for t in inputs) # no bare claude launch + + +def test_claude_in_auto_launch_map(): + assert mcp_server._AGENT_AUTO_LAUNCH.get("claude") == "claude --enable-auto-mode" + + +@pytest.mark.asyncio +async def test_prompt_seeded_with_context_line(wired): + inputs, _ = wired + await mcp_server.coda_interactive( + prompt="DO THE THING", workspace_path="/Users/x/WAM", agent="claude" + ) + seeded = inputs[-1] + assert "/Users/x/WAM" in seeded + assert "DO THE THING" in seeded + assert "Workspace" in seeded # precondition (clean fail, not ValueError) + assert seeded.index("Workspace") < seeded.index("DO THE THING") # context precedes prompt + + +@pytest.mark.asyncio +async def test_pull_failure_returns_error_and_no_launch(wired): + inputs, state = wired + state["pull_outcome"] = "fail" + out = json.loads(await mcp_server.coda_interactive( + prompt="go", workspace_path="/Users/x/WAM", agent="claude" + )) + assert out["status"] == "error" + assert "Failed to pull" in out["error"] + assert state["closed"] == [state["pty_id"]] # PTY closed + assert not any(t.strip() == "claude" for t in inputs) # agent NOT launched + + +@pytest.mark.asyncio +async def test_pull_timeout_returns_error(wired): + inputs, state = wired + state["pull_outcome"] = "timeout" + out = json.loads(await mcp_server.coda_interactive( + prompt="go", workspace_path="/Users/x/WAM", agent="claude" + )) + assert out["status"] == "error" + assert "Timed out" in out["error"] + assert state["closed"] == [state["pty_id"]] + assert not any(t.strip() == "claude" for t in inputs) + + +@pytest.mark.asyncio +async 
@pytest.mark.asyncio
async def test_happy_path_returns_launched(wired):
    """A successful run reports ``launched`` plus the viewer URL and project dir."""
    raw = await mcp_server.coda_interactive(
        prompt="go", workspace_path="/Users/x/WAM", agent="claude"
    )
    result = json.loads(raw)
    expected_suffix = os.path.join("pty-abc123", "WAM")
    assert result["status"] == "launched"
    assert result["viewer_url"] == "https://viewer/pty-abc123"
    assert result["project_dir"].endswith(expected_suffix)


@pytest.mark.asyncio
async def test_unknown_agent_rejected(wired):
    """An unknown agent name yields an error that lists every allowed agent."""
    raw = await mcp_server.coda_interactive(
        prompt="x", workspace_path="/Users/x/WAM", agent="bogus"
    )
    result = json.loads(raw)
    assert result["status"] == "error"
    assert "Unknown agent" in result["error"]
    assert all(name in result["error"] for name in ALLOWED_AGENTS)


@pytest.mark.asyncio
async def test_pty_hook_not_wired(monkeypatch):
    """With both PTY hooks unset the tool returns a ``PTY hook`` error."""
    for hook_name in ("_app_create_session", "_app_send_input"):
        monkeypatch.setattr(mcp_server, hook_name, None)
    raw = await mcp_server.coda_interactive(
        prompt="x", workspace_path="/Users/x/WAM", agent="claude"
    )
    result = json.loads(raw)
    assert result["status"] == "error"
    assert "PTY hook" in result["error"]


@pytest.mark.asyncio
@pytest.mark.parametrize("agent,cmd", [
    ("hermes", "hermes chat"),
    ("codex", "codex"),
    ("gemini", "gemini"),
    ("opencode", "opencode"),
])
async def test_fallback_agents_launch_then_type_prompt(wired, agent, cmd):
    """Agents without an auto-launch entry launch bare, then the prompt is typed."""
    typed, _ = wired
    await mcp_server.coda_interactive(
        prompt="go", workspace_path="/Users/x/WAM", agent=agent
    )
    stripped = [entry.strip() for entry in typed]
    assert cmd in stripped  # bare launch present
    assert stripped[-1].endswith("go")  # prompt typed last
    assert "--enable-auto-mode" not in " ".join(typed)  # not the claude path


def test_no_blocking_sleep_in_source():
    """``coda_interactive`` must not contain a ``time.sleep(`` call in its source."""
    assert "time.sleep(" not in inspect.getsource(mcp_server.coda_interactive)


def test_no_workspaceclient_in_module():
    """The export-era WorkspaceClient import/use is gone from the module."""
    module_source = inspect.getsource(mcp_server)
    for stale in ("export_workspace_tree", "workspace.get_status("):
        assert stale not in module_source


# ── _wait_for_pull behavior (real helper, fake sessions buffer) ───────


@pytest.mark.asyncio
async def test_wait_for_pull_ok_with_files(monkeypatch, tmp_path):
    """OK marker in the buffer plus a populated target directory yields ``ok``."""
    from app import sessions

    session_id = "pty-pull-ok"
    project_dir = tmp_path / "WAM"
    project_dir.mkdir()
    project_dir.joinpath("README.md").write_text("# hi")
    ok_line = (mcp_server._PULL_OK + "\n").encode()
    sessions[session_id] = {"output_buffer": [b"Exporting...\n", ok_line]}
    try:
        outcome = await mcp_server._wait_for_pull(session_id, str(project_dir))
        assert outcome == "ok"
    finally:
        sessions.pop(session_id, None)


@pytest.mark.asyncio
async def test_wait_for_pull_ok_marker_but_no_files_is_fail(monkeypatch, tmp_path):
    """OK marker with a target directory that was never created yields ``fail``."""
    from app import sessions

    session_id = "pty-pull-okempty"
    missing_dir = tmp_path / "WAM"  # never created
    ok_line = (mcp_server._PULL_OK + "\n").encode()
    sessions[session_id] = {"output_buffer": [ok_line]}
    try:
        outcome = await mcp_server._wait_for_pull(session_id, str(missing_dir))
        assert outcome == "fail"
    finally:
        sessions.pop(session_id, None)


@pytest.mark.asyncio
async def test_wait_for_pull_fail_marker(monkeypatch, tmp_path):
    """A FAIL marker in the buffer yields ``fail``."""
    from app import sessions

    session_id = "pty-pull-fail"
    fail_line = (mcp_server._PULL_FAIL + "\n").encode()
    sessions[session_id] = {"output_buffer": [b"ERROR: nope\n", fail_line]}
    try:
        outcome = await mcp_server._wait_for_pull(session_id, str(tmp_path / "WAM"))
        assert outcome == "fail"
    finally:
        sessions.pop(session_id, None)
@pytest.mark.asyncio
async def test_wait_for_pull_split_echo_does_not_false_trigger(monkeypatch, tmp_path):
    """The split-literal command echo must NOT be read as the success marker."""
    from app import sessions

    session_id = "pty-pull-splitecho"
    monkeypatch.setattr(mcp_server, "_PULL_MAX_WAIT_S", 0.5)  # keep the test fast
    # What the shell echoes back while the command line is typed: the markers
    # appear only in their SPLIT form ("CODA""_PULL_""OK"), never contiguous.
    typed_echo = (
        'cd /x && databricks workspace export-dir /Users/x/WAM ./WAM && cd WAM '
        '&& echo "CODA""_PULL_""OK" || echo "CODA""_PULL_""FAIL"\n'
    )
    sessions[session_id] = {"output_buffer": [typed_echo.encode()]}
    try:
        # Only the split echo is present (no executed contiguous token) -> timeout.
        outcome = await mcp_server._wait_for_pull(session_id, str(tmp_path))
        assert outcome == "timeout"
    finally:
        sessions.pop(session_id, None)


# ── preserved signature / contract guards ────────────────────────────


def test_default_agent_is_claude():
    """``coda_interactive`` defaults its ``agent`` parameter to ``claude``."""
    agent_param = inspect.signature(mcp_server.coda_interactive).parameters["agent"]
    assert agent_param.default == "claude"


def test_no_branch_parameter():
    """``coda_interactive`` exposes no ``branch`` parameter."""
    assert "branch" not in inspect.signature(mcp_server.coda_interactive).parameters
def test_instructions_drop_stale_export_wording_and_keep_contract():
    """Server-level MCP instructions: no stale server-side export claim; contract intact."""
    instructions = mcp_server.mcp.instructions
    lowered = instructions.lower()
    assert "server-side snapshot" not in instructions
    for required in ("export-dir", "coda_interactive"):
        assert required in instructions
    folder_phrases = ("git folder or", "plain workspace folder", "plain folder")
    assert any(phrase in lowered for phrase in folder_phrases)
    # Local-agent contract: must tell a local caller to copy local files INTO the
    # Workspace first, with the concrete command, since the tool can't see local disk.
    assert "import-dir" in lowered, "instructions must give the `workspace import-dir` command"
    assert "local" in lowered, "instructions must address the local-agent case"


def test_docstring_tells_local_callers_to_import_dir():
    """coda_interactive's own docstring carries the local-upload guidance too."""
    doc = (mcp_server.coda_interactive.__doc__ or "").lower()
    assert "import-dir" in doc
    assert "cannot read your local filesystem" in doc


# ── preserved wait-helper behavior tests (now via the wrapper) ────────


def _timed_wait_for_agent_ready(session_id):
    """Run ``_wait_for_agent_ready`` to completion and return the elapsed seconds.

    The duration is taken from ``time.monotonic`` rather than ``time.time`` so
    a wall-clock adjustment during the wait cannot skew the timing assertions.
    """
    import time

    async def _run():
        started = time.monotonic()
        await mcp_server._wait_for_agent_ready(session_id)
        return time.monotonic() - started

    return asyncio.run(_run())


def test_wait_for_agent_ready_returns_when_buffer_stabilizes(monkeypatch):
    """Wrapper returns once the output buffer has been stable for the window."""
    from app import sessions

    sid = "pty-stabilize-test"
    sessions[sid] = {"output_buffer": [b"banner line\n", b"prompt> "]}
    monkeypatch.setattr(mcp_server, "_PROMPT_SEED_STABILITY_S", 0.05)
    monkeypatch.setattr(mcp_server, "_PROMPT_SEED_MAX_WAIT_S", 2.0)
    try:
        elapsed = _timed_wait_for_agent_ready(sid)
        assert elapsed < 1.0, f"Helper took {elapsed:.2f}s — should return quickly when stable"
    finally:
        sessions.pop(sid, None)


def test_wait_for_agent_ready_times_out_when_buffer_empty(monkeypatch):
    """Wrapper returns at max-wait if the buffer never gets content."""
    from app import sessions

    sid = "pty-empty-test"
    sessions[sid] = {"output_buffer": []}
    monkeypatch.setattr(mcp_server, "_PROMPT_SEED_STABILITY_S", 0.05)
    monkeypatch.setattr(mcp_server, "_PROMPT_SEED_MAX_WAIT_S", 0.3)
    try:
        elapsed = _timed_wait_for_agent_ready(sid)
        assert 0.2 <= elapsed <= 0.8, f"Expected ~0.3s max-wait; got {elapsed:.2f}s"
    finally:
        sessions.pop(sid, None)
immediately if the session is no longer present.""" + monkeypatch.setattr(mcp_server, "_PROMPT_SEED_STABILITY_S", 0.05) + monkeypatch.setattr(mcp_server, "_PROMPT_SEED_MAX_WAIT_S", 5.0) + + async def _run(): + import time + t0 = time.time() + await mcp_server._wait_for_agent_ready("nonexistent-pty-id") + return time.time() - t0 + elapsed = asyncio.run(_run()) + assert elapsed < 0.5, f"Helper took {elapsed:.2f}s — should return when session gone" diff --git a/tests/test_content_filter_proxy.py b/tests/test_content_filter_proxy.py new file mode 100644 index 0000000..4aad029 --- /dev/null +++ b/tests/test_content_filter_proxy.py @@ -0,0 +1,556 @@ +"""Tests for content_filter_proxy — request/response sanitization for OpenCode.""" + +import json +import time + +import pytest +from unittest import mock + + +# --------------------------------------------------------------------------- +# strip_unsupported_schema_keys +# --------------------------------------------------------------------------- + +class TestStripUnsupportedSchemaKeys: + def test_strips_top_level_keys(self): + from content_filter_proxy import strip_unsupported_schema_keys + obj = {"type": "object", "$schema": "http://...", "additionalProperties": False, "title": "Foo"} + result = strip_unsupported_schema_keys(obj) + assert result == {"type": "object", "title": "Foo"} + + def test_strips_nested_keys(self): + from content_filter_proxy import strip_unsupported_schema_keys + obj = { + "type": "object", + "properties": { + "name": {"type": "string", "$ref": "#/defs/Name", "$comment": "ignore"}, + }, + } + result = strip_unsupported_schema_keys(obj) + assert result == { + "type": "object", + "properties": { + "name": {"type": "string"}, + }, + } + + def test_strips_inside_lists(self): + from content_filter_proxy import strip_unsupported_schema_keys + obj = [{"$id": "x", "type": "string"}, {"type": "int"}] + result = strip_unsupported_schema_keys(obj) + assert result == [{"type": "string"}, {"type": "int"}] + + 
class TestSanitizeToolSchemas:
    """Tests for ``sanitize_tool_schemas`` applied to a whole request body."""

    def test_cleans_tool_parameters(self):
        """``$schema`` is removed from a tool's ``parameters`` schema."""
        from content_filter_proxy import sanitize_tool_schemas

        parameters = {"$schema": "x", "type": "object"}
        request = {
            "tools": [{"function": {"name": "foo", "parameters": parameters}}],
        }
        cleaned = sanitize_tool_schemas(request)
        assert cleaned["tools"][0]["function"]["parameters"] == {"type": "object"}

    def test_strips_top_level_request_keys(self):
        """``stream_options`` and ``$schema`` are absent from the returned request."""
        from content_filter_proxy import sanitize_tool_schemas

        tool = {"function": {"name": "foo", "parameters": {"type": "object"}}}
        request = {
            "tools": [tool],
            "stream_options": {"include_usage": True},
            "$schema": "x",
        }
        cleaned = sanitize_tool_schemas(request)
        for dropped_key in ("stream_options", "$schema"):
            assert dropped_key not in cleaned

    def test_no_tools_is_noop(self):
        """A request without ``tools`` compares equal to the input afterwards."""
        from content_filter_proxy import sanitize_tool_schemas

        request = {"messages": [{"role": "user", "content": "hi"}]}
        assert sanitize_tool_schemas(request) == request


# ---------------------------------------------------------------------------
# _extract_tool_ids_from_message
# ---------------------------------------------------------------------------

class TestExtractToolIds:
    """Tests for ``_extract_tool_ids_from_message``."""

    def test_anthropic_format(self):
        """IDs are collected from ``tool_use`` content blocks; text blocks add none."""
        from content_filter_proxy import _extract_tool_ids_from_message

        blocks = [
            {"type": "tool_use", "id": "tu_1", "name": "bash"},
            {"type": "text", "text": "running..."},
            {"type": "tool_use", "id": "tu_2", "name": "read"},
        ]
        assistant = {"role": "assistant", "content": blocks}
        assert _extract_tool_ids_from_message(assistant) == {"tu_1", "tu_2"}

    def test_openai_format(self):
        """IDs are collected from the ``tool_calls`` list."""
        from content_filter_proxy import _extract_tool_ids_from_message

        calls = [
            {"id": "tc_1", "function": {"name": "bash"}},
            {"id": "tc_2", "function": {"name": "read"}},
        ]
        assistant = {"role": "assistant", "tool_calls": calls}
        assert _extract_tool_ids_from_message(assistant) == {"tc_1", "tc_2"}

    def test_no_tools(self):
        """A plain-text assistant message yields an empty set."""
        from content_filter_proxy import _extract_tool_ids_from_message

        assistant = {"role": "assistant", "content": "hello"}
        assert _extract_tool_ids_from_message(assistant) == set()


# ---------------------------------------------------------------------------
# _extract_tool_refs_from_message
# ---------------------------------------------------------------------------

class TestExtractToolRefs:
    """Tests for ``_extract_tool_refs_from_message``."""

    def test_anthropic_tool_result(self):
        """The ``tool_use_id`` of a ``tool_result`` block is returned."""
        from content_filter_proxy import _extract_tool_refs_from_message

        result_block = {"type": "tool_result", "tool_use_id": "tu_1", "content": "ok"}
        user = {"role": "user", "content": [result_block]}
        assert _extract_tool_refs_from_message(user) == {"tu_1"}

    def test_openai_tool_message(self):
        """The ``tool_call_id`` of a ``tool`` role message is returned."""
        from content_filter_proxy import _extract_tool_refs_from_message

        tool_message = {"role": "tool", "tool_call_id": "tc_1", "content": "result"}
        assert _extract_tool_refs_from_message(tool_message) == {"tc_1"}

    def test_no_refs(self):
        """A plain-text user message yields an empty set."""
        from content_filter_proxy import _extract_tool_refs_from_message

        user = {"role": "user", "content": "hi"}
        assert _extract_tool_refs_from_message(user) == set()
class TestSanitizeMessages:
    """Tests for ``sanitize_messages`` — the big one."""

    @staticmethod
    def _sanitize(messages):
        """Import lazily (as each test did) and run ``sanitize_messages``."""
        from content_filter_proxy import sanitize_messages

        return sanitize_messages(messages)

    @staticmethod
    def _tool_use(tool_id, name):
        """Build an Anthropic-style ``tool_use`` content block."""
        return {"type": "tool_use", "id": tool_id, "name": name}

    @staticmethod
    def _tool_result(tool_id, content):
        """Build an Anthropic-style ``tool_result`` content block."""
        return {"type": "tool_result", "tool_use_id": tool_id, "content": content}

    def test_strips_empty_text_blocks(self):
        """Empty and whitespace-only text blocks are removed; real text stays."""
        blocks = [{"type": "text", "text": text} for text in ("hello", "", " ")]
        cleaned = self._sanitize([{"role": "user", "content": blocks}])
        assert len(cleaned) == 1
        remaining = cleaned[0]["content"]
        assert len(remaining) == 1
        assert remaining[0]["text"] == "hello"

    def test_strips_orphaned_tool_result_anthropic(self):
        """tool_result referencing a tool_use ID that doesn't exist in prev assistant msg."""
        conversation = [
            {"role": "assistant", "content": [self._tool_use("tu_1", "bash")]},
            {"role": "user", "content": [
                self._tool_result("tu_1", "ok"),
                self._tool_result("tu_ORPHAN", "stale"),
            ]},
        ]
        cleaned = self._sanitize(conversation)
        assert len(cleaned) == 2
        # Only tu_1 should survive
        surviving = cleaned[1]["content"]
        assert len(surviving) == 1
        assert surviving[0]["tool_use_id"] == "tu_1"

    def test_strips_orphaned_openai_tool_message(self):
        """A ``tool`` message whose ``tool_call_id`` has no matching call is dropped."""
        assistant = {
            "role": "assistant",
            "tool_calls": [{"id": "tc_1", "function": {"name": "bash"}}],
        }
        matched = {"role": "tool", "tool_call_id": "tc_1", "content": "ok"}
        orphan = {"role": "tool", "tool_call_id": "tc_ORPHAN", "content": "stale"}
        cleaned = self._sanitize([assistant, matched, orphan])
        assert len(cleaned) == 2
        kept = cleaned[1]
        assert kept["role"] == "tool"
        assert kept["tool_call_id"] == "tc_1"

    def test_cascading_orphan_removal(self):
        """Two matched tool_use/tool_result turns survive; only unmatched tu_C is removed."""
        conversation = [
            # assistant issues tu_A, user answers it
            {"role": "assistant", "content": [self._tool_use("tu_A", "bash")]},
            {"role": "user", "content": [self._tool_result("tu_A", "ok")]},
            # assistant issues tu_B, user answers tu_B AND an unmatched tu_C
            {"role": "assistant", "content": [self._tool_use("tu_B", "read")]},
            {"role": "user", "content": [
                self._tool_result("tu_B", "ok"),
                self._tool_result("tu_C", "orphan"),
            ]},
        ]
        cleaned = self._sanitize(conversation)
        # tu_C should be stripped, tu_A and tu_B should survive
        assert len(cleaned) == 4
        final_user_blocks = cleaned[3]["content"]
        assert len(final_user_blocks) == 1
        assert final_user_blocks[0]["tool_use_id"] == "tu_B"

    def test_drops_empty_user_message_after_filter(self):
        """If all content blocks are stripped, the user message is dropped entirely."""
        conversation = [
            {"role": "assistant", "content": [self._tool_use("tu_1", "bash")]},
            {"role": "user", "content": [self._tool_result("tu_ORPHAN", "stale")]},
        ]
        cleaned = self._sanitize(conversation)
        # The user message should be dropped (all blocks were orphaned)
        assert len(cleaned) == 1
        assert cleaned[0]["role"] == "assistant"

    def test_keeps_empty_assistant_message(self):
        """Empty assistant messages are kept (not dropped) to preserve alternation."""
        empty_assistant = {"role": "assistant", "content": [{"type": "text", "text": ""}]}
        cleaned = self._sanitize([empty_assistant])
        assert len(cleaned) == 1
        assert cleaned[0]["role"] == "assistant"

    def test_replaces_null_assistant_content(self):
        """``None`` assistant content comes back as the ``"."`` placeholder."""
        cleaned = self._sanitize([{"role": "assistant", "content": None}])
        assert cleaned[0]["content"] == "."

    def test_replaces_empty_string_assistant(self):
        """Whitespace-only assistant content comes back as the ``"."`` placeholder."""
        cleaned = self._sanitize([{"role": "assistant", "content": " "}])
        assert cleaned[0]["content"] == "."

    def test_strips_empty_string_user(self):
        """A user message with empty-string content is removed from the list."""
        conversation = [
            {"role": "user", "content": "hello"},
            {"role": "assistant", "content": "hi"},
            {"role": "user", "content": ""},
        ]
        assert len(self._sanitize(conversation)) == 2  # empty user dropped

    def test_passthrough_non_list(self):
        """Non-list input is handed back as-is."""
        assert self._sanitize("not a list") == "not a list"
        assert self._sanitize(None) is None

    def test_preserves_non_dict_blocks(self):
        """Non-dict items in content list are preserved as-is."""
        mixed_content = ["plain string", {"type": "text", "text": "hi"}]
        cleaned = self._sanitize([{"role": "user", "content": mixed_content}])
        assert len(cleaned[0]["content"]) == 2

    def test_null_assistant_with_tool_calls_not_replaced(self):
        """Assistant msg with null content but tool_calls should NOT get placeholder."""
        assistant = {"role": "assistant", "content": None, "tool_calls": [{"id": "tc_1"}]}
        cleaned = self._sanitize([assistant])
        assert cleaned[0]["content"] is None  # preserved because tool_calls exist
import remap_tool_call + tc = {"id": "tc_1", "function": {"name": "bash", "arguments": '{"cmd": "ls"}'}} + result = remap_tool_call(tc) + assert result["function"]["name"] == "bash" + + def test_handles_invalid_json_args(self): + from content_filter_proxy import remap_tool_call + tc = {"id": "tc_1", "function": {"name": "databricks-tool-call", "arguments": "not json"}} + result = remap_tool_call(tc) + assert result["function"]["name"] == "databricks-tool-call" # unchanged + + +# --------------------------------------------------------------------------- +# fix_response_data +# --------------------------------------------------------------------------- + +class TestFixResponseData: + def test_remaps_tool_calls_in_message(self): + from content_filter_proxy import fix_response_data + data = { + "choices": [{ + "message": { + "tool_calls": [{ + "id": "tc_1", + "function": { + "name": "databricks-tool-call", + "arguments": json.dumps({"name": "run_sql", "q": "SELECT 1"}), + }, + }], + }, + "finish_reason": "stop", + }], + } + result = fix_response_data(data) + assert result["choices"][0]["message"]["tool_calls"][0]["function"]["name"] == "run_sql" + assert result["choices"][0]["finish_reason"] == "tool_calls" + + def test_fixes_streaming_delta(self): + from content_filter_proxy import fix_response_data + data = { + "choices": [{ + "delta": { + "tool_calls": [{ + "id": "tc_1", + "function": { + "name": "databricks-tool-call", + "arguments": json.dumps({"name": "run_sql"}), + }, + }], + }, + "finish_reason": "stop", + }], + } + result = fix_response_data(data) + assert result["choices"][0]["delta"]["tool_calls"][0]["function"]["name"] == "run_sql" + assert result["choices"][0]["finish_reason"] == "tool_calls" + + def test_noop_on_non_dict(self): + from content_filter_proxy import fix_response_data + assert fix_response_data("string") == "string" + assert fix_response_data(None) is None + + def test_no_choices_is_noop(self): + from content_filter_proxy import 
fix_response_data + data = {"id": "resp_1"} + assert fix_response_data(data) == data + + +# --------------------------------------------------------------------------- +# SSEProcessor +# --------------------------------------------------------------------------- + +class TestSSEProcessor: + def test_passthrough_non_data_lines(self): + from content_filter_proxy import SSEProcessor + proc = SSEProcessor() + assert proc.process_line("event: message") == ["event: message"] + assert proc.process_line(": comment") == [": comment"] + + def test_passthrough_done_signal(self): + from content_filter_proxy import SSEProcessor + proc = SSEProcessor() + result = proc.process_line("data: [DONE]") + assert "data: [DONE]" in result + + def test_passthrough_normal_tool(self): + from content_filter_proxy import SSEProcessor + proc = SSEProcessor() + event = { + "choices": [{ + "delta": {"tool_calls": [{"index": 0, "function": {"name": "bash"}}]}, + "finish_reason": None, + }], + } + result = proc.process_line(f"data: {json.dumps(event)}") + assert len(result) == 1 + assert "bash" in result[0] + + def test_buffers_databricks_tool_call(self): + """First chunk with databricks-tool-call name should be buffered.""" + from content_filter_proxy import SSEProcessor + proc = SSEProcessor() + event = { + "choices": [{ + "delta": { + "tool_calls": [{ + "index": 0, + "function": {"name": "databricks-tool-call", "arguments": ""}, + }], + }, + "finish_reason": None, + }], + } + result = proc.process_line(f"data: {json.dumps(event)}") + assert result == [] # buffered, not sent + + def test_resolves_name_from_args(self): + """Once args JSON is complete, name is resolved and buffered events flushed.""" + from content_filter_proxy import SSEProcessor + proc = SSEProcessor() + # First chunk — name is databricks-tool-call + event1 = { + "choices": [{ + "delta": { + "tool_calls": [{ + "index": 0, + "function": {"name": "databricks-tool-call", "arguments": ""}, + }], + }, + "finish_reason": None, + }], + 
} + proc.process_line(f"data: {json.dumps(event1)}") + + # Second chunk — args with real name + event2 = { + "choices": [{ + "delta": { + "tool_calls": [{ + "index": 0, + "function": {"arguments": json.dumps({"name": "execute_sql", "query": "SELECT 1"})}, + }], + }, + "finish_reason": None, + }], + } + result = proc.process_line(f"data: {json.dumps(event2)}") + # Should flush buffered events + current event + assert len(result) >= 1 + # The resolved name should appear in flushed output + combined = " ".join(result) + assert "execute_sql" in combined + + def test_flush_remaining(self): + from content_filter_proxy import SSEProcessor + proc = SSEProcessor() + # Buffer a databricks-tool-call but never resolve it + event = { + "choices": [{ + "delta": { + "tool_calls": [{ + "index": 0, + "function": {"name": "databricks-tool-call", "arguments": '{"partial'}, + }], + }, + "finish_reason": None, + }], + } + proc.process_line(f"data: {json.dumps(event)}") + remaining = proc.flush_remaining() + assert len(remaining) >= 1 # buffered lines flushed as-is + + def test_fixes_finish_reason_on_stop(self): + """finish_reason 'stop' with active tool state should become 'tool_calls'.""" + from content_filter_proxy import SSEProcessor + proc = SSEProcessor() + # Seed tool state + proc._tool_state[0] = {"args_buffer": "", "resolved_name": "bash", "buffered_lines": []} + event = { + "choices": [{"delta": {}, "finish_reason": "stop"}], + } + result = proc.process_line(f"data: {json.dumps(event)}") + parsed = json.loads(result[0][6:]) # strip "data: " + assert parsed["choices"][0]["finish_reason"] == "tool_calls" + + def test_invalid_json_passthrough(self): + from content_filter_proxy import SSEProcessor + proc = SSEProcessor() + result = proc.process_line("data: {invalid json}") + assert result == ["data: {invalid json}"] + + +# --------------------------------------------------------------------------- +# _get_fresh_token +# 
--------------------------------------------------------------------------- + +class TestGetFreshToken: + def setup_method(self): + """Reset token cache before each test.""" + from content_filter_proxy import _TOKEN_CACHE + _TOKEN_CACHE["token"] = None + _TOKEN_CACHE["read_at"] = 0.0 + + def test_reads_from_databrickscfg(self, tmp_path): + from content_filter_proxy import _get_fresh_token, _TOKEN_CACHE + cfg = tmp_path / ".databrickscfg" + cfg.write_text("[DEFAULT]\nhost = https://test.cloud.databricks.com\ntoken = dapi_test123\n") + with mock.patch("content_filter_proxy._DATABRICKSCFG_PATH", str(cfg)): + token = _get_fresh_token() + assert token == "dapi_test123" + assert _TOKEN_CACHE["token"] == "dapi_test123" + + def test_returns_cached_within_ttl(self, tmp_path): + from content_filter_proxy import _get_fresh_token, _TOKEN_CACHE + _TOKEN_CACHE["token"] = "cached_token" + _TOKEN_CACHE["read_at"] = time.time() # just now + # Even with a bad path, should return cached + with mock.patch("content_filter_proxy._DATABRICKSCFG_PATH", "/nonexistent"): + token = _get_fresh_token() + assert token == "cached_token" + + def test_refreshes_after_ttl(self, tmp_path): + from content_filter_proxy import _get_fresh_token, _TOKEN_CACHE + _TOKEN_CACHE["token"] = "old_token" + _TOKEN_CACHE["read_at"] = time.time() - 60 # expired + cfg = tmp_path / ".databrickscfg" + cfg.write_text("[DEFAULT]\nhost = https://test.cloud.databricks.com\ntoken = new_token\n") + with mock.patch("content_filter_proxy._DATABRICKSCFG_PATH", str(cfg)): + token = _get_fresh_token() + assert token == "new_token" + + def test_returns_stale_on_read_error(self, tmp_path): + from content_filter_proxy import _get_fresh_token, _TOKEN_CACHE + _TOKEN_CACHE["token"] = "stale_token" + _TOKEN_CACHE["read_at"] = 0.0 # force re-read + with mock.patch("content_filter_proxy._DATABRICKSCFG_PATH", "/nonexistent"): + token = _get_fresh_token() + assert token == "stale_token" + + def 
test_returns_none_when_no_cache_and_no_file(self): + from content_filter_proxy import _get_fresh_token, _TOKEN_CACHE + _TOKEN_CACHE["token"] = None + _TOKEN_CACHE["read_at"] = 0.0 + with mock.patch("content_filter_proxy._DATABRICKSCFG_PATH", "/nonexistent"): + token = _get_fresh_token() + assert token is None diff --git a/tests/test_databricks_preamble.py b/tests/test_databricks_preamble.py new file mode 100644 index 0000000..f7ef0a5 --- /dev/null +++ b/tests/test_databricks_preamble.py @@ -0,0 +1,113 @@ +"""Unit tests for coda_mcp.databricks_preamble.""" +import re + +from coda_mcp.databricks_preamble import ( + build_capabilities, + build_workflow_protocol, + get_databricks_skills, +) + + +def test_get_databricks_skills_returns_exactly_sixteen(): + skills = get_databricks_skills() + assert isinstance(skills, tuple) + assert len(skills) == 16, f"Expected 16 skills, got {len(skills)}: {skills}" + + +def test_skills_list_matches_claude_md(): + """The hardcoded skill tuple must match the Databricks Skills table in CLAUDE.md. + + Drift in either direction (added to tuple but not docs, or vice versa) fails + this test. The test is the canary that forces both sources to stay in sync. + """ + import os + repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) + claude_md = os.path.join(repo_root, "CLAUDE.md") + with open(claude_md, "r") as f: + text = f.read() + # Find the Databricks Skills section. Names are comma-separated within table cells. + section_match = re.search( + r"###\s+Databricks Skills.*?(?=\n###|\n##|\Z)", + text, re.DOTALL, + ) + assert section_match, "Could not find 'Databricks Skills' section in CLAUDE.md" + section = section_match.group(0) + # Extract skill names: kebab-case tokens that follow a list pattern. Be loose — + # accept anything that looks like a skill identifier inside table cells. 
+ skill_names_in_md = set(re.findall(r"\b([a-z][a-z0-9-]{2,}(?:-[a-z0-9]+)+)\b", section)) + skills_in_code = set(get_databricks_skills()) + # Every skill in code must appear in CLAUDE.md. + missing_from_md = skills_in_code - skill_names_in_md + assert not missing_from_md, ( + f"Skills in code but NOT in CLAUDE.md (update CLAUDE.md): {missing_from_md}" + ) + # Every kebab-case identifier in CLAUDE.md's Databricks section must appear in code. + # The regex deliberately matches lowercase-only, so category labels like + # "AI & Agents" / "Data Engineering" cannot create false positives. + missing_from_code = skill_names_in_md - skills_in_code + assert not missing_from_code, ( + f"Skills in CLAUDE.md but NOT in code (update databricks_preamble.py): " + f"{missing_from_code}" + ) + + +def test_capabilities_mentions_cli(): + text = build_capabilities() + assert "Databricks CLI" in text + assert "databricks current-user me" in text + + +def test_capabilities_lists_at_least_ten_skills(): + text = build_capabilities() + skills = get_databricks_skills() + hits = sum(1 for s in skills if s in text) + assert hits >= 10, f"Expected at least 10 skills in CAPABILITIES, found {hits}" + + +def test_capabilities_mentions_all_three_mcp_servers(): + text = build_capabilities() + assert "DeepWiki" in text + assert "Exa" in text + assert "CoDA" in text + + +def test_capabilities_under_token_budget(): + text = build_capabilities() + # ~4 chars/token rough lower bound. 1600 chars ≈ 400 tokens budget. + assert len(text) < 1600, ( + f"CAPABILITIES is {len(text)} chars (~{len(text)//4} tokens); budget is 1600." + ) + + +def test_workflow_protocol_lists_three_phases(): + text = build_workflow_protocol() + assert "PHASE 1 — PLAN" in text + assert "PHASE 2 — EXECUTE" in text + assert "PHASE 3 — SYNTHESIZE" in text + + +def test_workflow_protocol_caps_iterations_at_two(): + text = build_workflow_protocol() + # The string "Maximum 2" should appear once per phase = 3 times. 
+ count = text.count("Maximum 2") + assert count == 3, f"Expected 'Maximum 2' to appear 3 times (once per phase); got {count}" + + +def test_workflow_protocol_describes_info_needed(): + text = build_workflow_protocol() + assert "info_needed" in text + assert "feedback" in text + + +def test_workflow_protocol_disambiguates_needs_approval(): + text = build_workflow_protocol() + assert "needs_approval" in text + assert "DISAMBIGUATION" in text + + +def test_workflow_protocol_under_token_budget(): + text = build_workflow_protocol() + # ~4 chars/token. 3200 chars ≈ 800 tokens budget. + assert len(text) < 3200, ( + f"WORKFLOW PROTOCOL is {len(text)} chars (~{len(text)//4} tokens); budget is 3200." + ) diff --git a/tests/test_gateway_discovery.py b/tests/test_gateway_discovery.py index 698445a..92ca725 100644 --- a/tests/test_gateway_discovery.py +++ b/tests/test_gateway_discovery.py @@ -132,7 +132,7 @@ def test_workspace_id_whitespace_stripped(self, mock_probe): # Integration tests — verify endpoint URLs constructed by setup scripts # --------------------------------------------------------------------------- -SETUP_DIR = Path(__file__).parent.parent +SETUP_DIR = Path(__file__).parent.parent / "setup" class TestEndpointConstruction: @@ -146,9 +146,11 @@ def _run_setup(self, script_name, tmp_path, env_overrides=None): "DATABRICKS_TOKEN": "dapi_test_token", "DATABRICKS_WORKSPACE_ID": "6280049833385130", "PATH": os.environ.get("PATH", ""), - "PYTHONPATH": str(SETUP_DIR), + "PYTHONPATH": str(SETUP_DIR.parent), # Pre-resolve gateway so subprocess skips the network probe "_GATEWAY_RESOLVED": "", + # Skip CLI install (curl | bash) — tests only verify config files + "SKIP_CLAUDE_INSTALL": "1", } # Ensure DATABRICKS_GATEWAY_HOST is NOT set (test auto-discovery) env.pop("DATABRICKS_GATEWAY_HOST", None) @@ -175,15 +177,15 @@ def test_setup_claude_falls_back_when_gateway_unreachable(self, tmp_path): # Gateway is unreachable from test env, so should fall back import json 
settings_path = tmp_path / ".claude" / "settings.json" - if settings_path.exists(): - settings = json.loads(settings_path.read_text()) - base_url = settings.get("env", {}).get("ANTHROPIC_BASE_URL", "") - assert base_url.endswith("/anthropic") - # Either gateway or serving-endpoints is valid - assert ( - "ai-gateway.cloud.databricks.com" in base_url - or "serving-endpoints/anthropic" in base_url - ) + assert settings_path.exists(), "settings.json was not written" + settings = json.loads(settings_path.read_text()) + base_url = settings.get("env", {}).get("ANTHROPIC_BASE_URL", "") + assert base_url.endswith("/anthropic") + # Either gateway or serving-endpoints is valid + assert ( + "ai-gateway.cloud.databricks.com" in base_url + or "serving-endpoints/anthropic" in base_url + ) def test_setup_claude_explicit_override(self, tmp_path): """setup_claude.py should prefer explicit DATABRICKS_GATEWAY_HOST.""" @@ -196,10 +198,10 @@ def test_setup_claude_explicit_override(self, tmp_path): import json settings_path = tmp_path / ".claude" / "settings.json" - if settings_path.exists(): - settings = json.loads(settings_path.read_text()) - base_url = settings.get("env", {}).get("ANTHROPIC_BASE_URL", "") - assert "custom.gateway.example.com" in base_url + assert settings_path.exists(), "settings.json was not written" + settings = json.loads(settings_path.read_text()) + base_url = settings.get("env", {}).get("ANTHROPIC_BASE_URL", "") + assert "custom.gateway.example.com" in base_url def test_setup_claude_fallback_no_gateway(self, tmp_path): """setup_claude.py falls back to DATABRICKS_HOST when no gateway available.""" @@ -210,10 +212,10 @@ def test_setup_claude_fallback_no_gateway(self, tmp_path): import json settings_path = tmp_path / ".claude" / "settings.json" - if settings_path.exists(): - settings = json.loads(settings_path.read_text()) - base_url = settings.get("env", {}).get("ANTHROPIC_BASE_URL", "") - assert "test.cloud.databricks.com/serving-endpoints/anthropic" in base_url + 
assert settings_path.exists(), "settings.json was not written" + settings = json.loads(settings_path.read_text()) + base_url = settings.get("env", {}).get("ANTHROPIC_BASE_URL", "") + assert "test.cloud.databricks.com/serving-endpoints/anthropic" in base_url @mock.patch("utils._probe_gateway", return_value=True) def test_codex_gateway_url_construction(self, mock_probe): diff --git a/tests/test_inbox_status_passthrough.py b/tests/test_inbox_status_passthrough.py new file mode 100644 index 0000000..bbcb8a9 --- /dev/null +++ b/tests/test_inbox_status_passthrough.py @@ -0,0 +1,56 @@ +"""Tests verifying that the counts dict, coda_get_result docstring, and MCP instructions +all reflect the new info_needed / needs_approval terminal statuses.""" +import asyncio +import json + + +def test_mcp_instructions_mention_info_needed(): + """Server-level MCP instructions teach calling LLMs about info_needed.""" + from coda_mcp import mcp_server + + txt = mcp_server.mcp.instructions + assert "info_needed" in txt + assert "needs_approval" in txt + assert "feedback" in txt + + +def test_coda_get_result_docstring_mentions_info_needed(): + """coda_get_result docstring lists info_needed / needs_approval alongside completed/failed.""" + from coda_mcp import mcp_server + + doc = (mcp_server.coda_get_result.__doc__ or "").lower() + assert "info_needed" in doc + assert "needs_approval" in doc + + +def test_inbox_counts_dict_includes_new_statuses(monkeypatch): + """coda_inbox counts dict has info_needed and needs_approval keys.""" + from coda_mcp import mcp_server + + fake_tasks = [ + {"task_id": "t1", "session_id": "s1", "status": "running"}, + {"task_id": "t2", "session_id": "s2", "status": "completed"}, + {"task_id": "t3", "session_id": "s3", "status": "failed"}, + {"task_id": "t4", "session_id": "s4", "status": "info_needed"}, + {"task_id": "t5", "session_id": "s5", "status": "needs_approval"}, + {"task_id": "t6", "session_id": "s6", "status": "info_needed"}, + ] + + monkeypatch.setattr( + 
mcp_server.task_manager, "list_all_tasks", + lambda email, status_filter=None: list(fake_tasks), + ) + # _read_session_safe is called inside the loop; return None so no viewer_url is added. + monkeypatch.setattr( + mcp_server.task_manager, "_read_session_safe", lambda sid: None, + ) + + result_str = asyncio.run(mcp_server.coda_inbox(email="u@e")) + result = json.loads(result_str) + counts = result["counts"] + + assert counts["running"] == 1 + assert counts["completed"] == 1 + assert counts["failed"] == 1 + assert counts["info_needed"] == 2 + assert counts["needs_approval"] == 1 diff --git a/tests/test_mcp_endpoint.py b/tests/test_mcp_endpoint.py new file mode 100644 index 0000000..bc67e34 --- /dev/null +++ b/tests/test_mcp_endpoint.py @@ -0,0 +1,102 @@ +"""Unit tests for the Flask Blueprint fallback at coda_mcp.mcp_endpoint. + +Production traffic flows through coda_mcp.mcp_asgi (uvicorn + native MCP SDK). +This blueprint is the WSGI-only fallback. These tests pin the JSON-RPC contract +so the two paths stay in lockstep. 
+""" +import json + +import pytest + + +@pytest.fixture +def client(): + from app import app as flask_app + + return flask_app.test_client() + + +def _rpc(method, params=None, rpc_id=1): + return {"jsonrpc": "2.0", "id": rpc_id, "method": method, "params": params or {}} + + +def test_initialize_returns_server_info(client): + r = client.post("/mcp", json=_rpc("initialize", {"protocolVersion": "2025-03-26"})) + assert r.status_code == 200 + body = r.get_json() + assert body["jsonrpc"] == "2.0" + assert body["result"]["serverInfo"]["name"] == "coda" + assert "capabilities" in body["result"] + + +def test_tools_list_returns_v2_tools(client): + r = client.post("/mcp", json=_rpc("tools/list", {}, rpc_id=2)) + assert r.status_code == 200 + tools = r.get_json()["result"]["tools"] + names = {t["name"] for t in tools} + assert names == {"coda_run", "coda_inbox", "coda_get_result", "coda_interactive"}, ( + f"Tool surface drifted from the v2 contract (docs/mcp-v2-background-execution.md). Got: {names}" + ) + + +def test_tools_list_each_tool_has_description_and_schema(client): + r = client.post("/mcp", json=_rpc("tools/list", {}, rpc_id=3)) + for t in r.get_json()["result"]["tools"]: + assert t.get("description"), f"tool {t['name']} missing description (MCP requires it)" + assert isinstance(t.get("inputSchema"), dict), f"tool {t['name']} missing inputSchema" + + +def test_cors_preflight_returns_204(client): + r = client.options( + "/mcp", + headers={ + "Origin": "https://test.cloud.databricks.com", + "Access-Control-Request-Method": "POST", + }, + ) + assert r.status_code == 204 + assert "Access-Control-Allow-Origin" in r.headers + + +def test_ping_returns_empty_result(client): + r = client.post("/mcp", json=_rpc("ping", {}, rpc_id=4)) + assert r.status_code == 200 + body = r.get_json() + assert body["result"] == {} + assert "error" not in body + + +def test_unknown_method_returns_method_not_found(client): + r = client.post("/mcp", json=_rpc("does/not/exist", {}, rpc_id=5)) + 
body = r.get_json() + assert body.get("error", {}).get("code") == -32601, ( + f"Expected JSON-RPC method-not-found (-32601); got {body}" + ) + + +def test_unknown_tool_returns_jsonrpc_error(client): + r = client.post( + "/mcp", + json=_rpc("tools/call", {"name": "not_a_real_tool", "arguments": {}}, rpc_id=6), + ) + body = r.get_json() + assert "error" in body or ( + "result" in body and body["result"].get("isError") is True + ), f"Calling an unknown tool should error; got {body}" + + +def test_jsonrpc_id_is_echoed(client): + for rpc_id in (7, "string-id", 0): + r = client.post("/mcp", json=_rpc("ping", {}, rpc_id=rpc_id)) + assert r.get_json()["id"] == rpc_id + + +def test_post_with_non_json_body_does_not_crash(client): + r = client.post( + "/mcp", + data="not json at all", + headers={"Content-Type": "application/json"}, + ) + assert r.status_code in (200, 400) + if r.status_code == 200: + assert "error" in r.get_json() diff --git a/tests/test_mcp_env_strip.py b/tests/test_mcp_env_strip.py new file mode 100644 index 0000000..756f133 --- /dev/null +++ b/tests/test_mcp_env_strip.py @@ -0,0 +1,191 @@ +"""Tests for _build_terminal_shell_env's credential-stripping behavior. + +Replaces the inline 5-key strip that mcp_create_pty_session used to do. +Both create_session (HTTP path) and mcp_create_pty_session (MCP path) +now call this helper, so it must strip both the original 5 keys and +the registry-credential patterns the HTTP path was already covering. +""" +import os +import pytest + +from app import _build_terminal_shell_env + + +# Keys that must be absent from the child shell's env after the strip. 
+STRIPPED_KEYS = [ + "CLAUDECODE", + "CLAUDE_CODE_SESSION", + "DATABRICKS_TOKEN", + "DATABRICKS_HOST", + "GEMINI_API_KEY", + "NPM_TOKEN", + "UV_DEFAULT_INDEX", + "UV_INDEX_MYREG_PASSWORD", + "UV_INDEX_MYREG_USERNAME", + "npm_config_//registry.example/:_authToken", +] + + +@pytest.mark.parametrize("key", STRIPPED_KEYS) +def test_build_terminal_shell_env_strips_credential_key(key): + """Each known credential / registry-auth key is stripped from the child env.""" + fake_env = { + "PATH": "/usr/bin:/usr/local/bin", # positive control — must survive + "HOME": "/home/test", + key: "leak-me-test-value", + } + result = _build_terminal_shell_env(fake_env) + assert key not in result, ( + f"{key} survived the strip — registry/auth credential leaked into " + f"the child shell's env. Result keys: {sorted(result)}" + ) + + +def test_build_terminal_shell_env_preserves_benign_keys(): + """Positive control: non-credential keys survive the strip. + + Guards against a future regression where the strip becomes too aggressive + and wipes the env entirely. If THIS test fails, the negative assertions + above would silently pass for the wrong reason. 
+ """ + fake_env = { + "PATH": "/usr/bin:/usr/local/bin", + "HOME": "/home/test", + "LANG": "en_US.UTF-8", + } + result = _build_terminal_shell_env(fake_env) + assert result.get("PATH") and "/usr/bin" in result["PATH"] + assert result.get("HOME") == "/home/test" + assert result.get("LANG") == "en_US.UTF-8" + + +try: + import pty as _pty + _master, _slave = _pty.openpty() + os.close(_master) + os.close(_slave) + _PTY_AVAILABLE = True +except Exception: + _PTY_AVAILABLE = False + +_pty_skip = pytest.mark.skipif( + not _PTY_AVAILABLE, + reason="PTY not allocatable in this environment", +) + + +@_pty_skip +def test_mcp_create_pty_session_respects_cwd_kwarg(tmp_path): + """When cwd is passed, sessions[sid]['cwd'] records it.""" + from app import mcp_create_pty_session, mcp_close_pty_session, sessions + + sid = None + try: + sid = mcp_create_pty_session(label="t-cwd", cwd=str(tmp_path)) + assert sessions[sid].get("cwd") == str(tmp_path) + finally: + if sid is not None: + mcp_close_pty_session(sid) + + +@_pty_skip +def test_mcp_create_pty_session_cwd_defaults_to_none(): + """When cwd is not passed, sessions[sid]['cwd'] is None (preserves current behavior).""" + from app import mcp_create_pty_session, mcp_close_pty_session, sessions + + sid = None + try: + sid = mcp_create_pty_session(label="t-no-cwd") + assert sessions[sid].get("cwd") is None + finally: + if sid is not None: + mcp_close_pty_session(sid) + + +@_pty_skip +def test_mcp_close_pty_session_removes_project_dir(tmp_path, monkeypatch): + """When the PTY is closed, any project dir at ~/.coda/projects/<sid>/ is removed.""" + import os + from app import mcp_create_pty_session, mcp_close_pty_session + + # Point HOME at tmp_path so ~/.coda lives in a controllable place. 
+ monkeypatch.setenv("HOME", str(tmp_path)) + + sid = None + try: + sid = mcp_create_pty_session(label="t-cleanup") + + project_dir = os.path.join(str(tmp_path), ".coda", "projects", sid) + os.makedirs(project_dir, exist_ok=True) + sentinel = os.path.join(project_dir, "SENTINEL") + with open(sentinel, "w") as f: + f.write("present-before-close") + assert os.path.exists(sentinel) + + mcp_close_pty_session(sid) + sid = None # session closed; don't double-close in finally + + assert not os.path.exists(project_dir), \ + f"Expected project dir to be removed after PTY close: {project_dir} still exists" + finally: + if sid is not None: + try: + mcp_close_pty_session(sid) + except Exception: + pass + + +@_pty_skip +def test_mcp_close_pty_session_handles_missing_project_dir(tmp_path, monkeypatch): + """No project dir present → close still succeeds (no exception).""" + from app import mcp_create_pty_session, mcp_close_pty_session + + monkeypatch.setenv("HOME", str(tmp_path)) + + sid = None + try: + sid = mcp_create_pty_session(label="t-no-projdir") + # Do NOT create the project dir — verify close still works. + mcp_close_pty_session(sid) # must not raise + sid = None + finally: + if sid is not None: + try: + mcp_close_pty_session(sid) + except Exception: + pass + + +@_pty_skip +def test_terminate_session_removes_project_dir(tmp_path, monkeypatch): + """The idle reaper calls terminate_session directly. Project dir must still be cleaned.""" + import os + from app import mcp_create_pty_session, sessions, terminate_session, mcp_close_pty_session + + monkeypatch.setenv("HOME", str(tmp_path)) + + sid = None + try: + sid = mcp_create_pty_session(label="t-reaper-cleanup") + + # Plant a project dir like coda_interactive would have done. 
+ project_dir = os.path.join(str(tmp_path), ".coda", "projects", sid) + os.makedirs(project_dir, exist_ok=True) + with open(os.path.join(project_dir, "SENTINEL"), "w") as f: + f.write("present") + assert os.path.exists(project_dir) + + # Simulate the reaper's code path: call terminate_session directly. + sess = sessions[sid] + terminate_session(sid, sess["pid"], sess["master_fd"]) + sid = None # session terminated; finally is a no-op + + # Project dir must be removed even though we bypassed mcp_close_pty_session. + assert not os.path.exists(project_dir), \ + "Reaper path must also clean up the project dir — fix terminate_session not mcp_close_pty_session" + finally: + if sid is not None: + try: + mcp_close_pty_session(sid) + except Exception: + pass diff --git a/tests/test_mcp_integration.py b/tests/test_mcp_integration.py new file mode 100644 index 0000000..215d616 --- /dev/null +++ b/tests/test_mcp_integration.py @@ -0,0 +1,292 @@ +"""End-to-end MCP integration tests — v2 background execution + inbox API. + +Exercises the full flow: coda_run -> coda_inbox -> coda_get_result. +No real PTY — app hooks are mocked. 
+""" + +import json +import os +import time +from unittest.mock import MagicMock + +import pytest + + +# ── helpers ────────────────────────────────────────────────────────── + + +def _parse(result: str) -> dict: + """Parse JSON string returned by MCP tools.""" + return json.loads(result) + + +# ── fixture ────────────────────────────────────────────────────────── + + +@pytest.fixture(autouse=True) +def isolated_env(tmp_path): + """Redirect state to tmp and mock PTY hooks.""" + from coda_mcp import task_manager as tm + from coda_mcp import mcp_server as ms + + original_dir = tm.SESSIONS_DIR + tm.SESSIONS_DIR = str(tmp_path / "sessions") + + mock_send = MagicMock() + mock_close = MagicMock() + ms.set_app_hooks( + create_session_fn=lambda label, **kwargs: f"pty-mock-{label}", + send_input_fn=mock_send, + close_session_fn=mock_close, + ) + + yield {"tmp": tmp_path, "mock_send": mock_send, "mock_close": mock_close} + + tm.SESSIONS_DIR = original_dir + ms.set_app_hooks(None, None, None) + + +# ── 1. 
Happy-path: fire-and-forget → inbox → result ───────────────── + + +class TestFullMcpFlow: + @pytest.mark.asyncio + async def test_full_background_flow(self, isolated_env): + """Happy path: run (fire-and-forget) → inbox → result.""" + from coda_mcp import mcp_server as ms + from coda_mcp import task_manager as tm + + # Step 1: submit task (returns immediately) + with MagicMock() as mock_thread: + from coda_mcp import mcp_server + with pytest.MonkeyPatch.context() as mp: + mp.setattr("coda_mcp.mcp_server.threading", mock_thread) + raw = await ms.coda_run( + prompt="create a sales pipeline", + email="alice@test.com", + context='{"tables": ["sales.transactions"]}', + ) + + task = _parse(raw) + assert task["status"] == "running" + task_id = task["task_id"] + session_id = task["session_id"] + assert task_id.startswith("task-") + assert session_id.startswith("sess-") + + # Step 2: inbox shows running task + raw = await ms.coda_inbox() + inbox = _parse(raw) + assert len(inbox["tasks"]) == 1 + assert inbox["tasks"][0]["task_id"] == task_id + assert inbox["tasks"][0]["status"] == "running" + assert inbox["counts"]["running"] == 1 + + # Step 3: simulate agent writing result.json + tdir = tm._task_dir(session_id, task_id) + result_path = os.path.join(tdir, "result.json") + with open(result_path, "w") as f: + json.dump({ + "status": "completed", + "summary": "Created sales pipeline with 3 stages", + "files_changed": ["pipeline.py", "config.yaml"], + "artifacts": ["/workspace/pipeline.py"], + "errors": [], + }, f) + + # Step 4: complete_task (simulating what _watch_task does) + tm.complete_task(session_id, task_id) + + # Step 5: inbox shows completed + raw = await ms.coda_inbox() + inbox = _parse(raw) + assert len(inbox["tasks"]) == 1 + assert inbox["tasks"][0]["status"] == "completed" + assert inbox["tasks"][0]["summary"] == "Created sales pipeline with 3 stages" + assert inbox["counts"]["completed"] == 1 + + # Step 6: get full result + raw = await 
ms.coda_get_result(task_id=task_id, session_id=session_id) + result = _parse(raw) + assert result["task_id"] == task_id + assert result["summary"] == "Created sales pipeline with 3 stages" + assert result["files_changed"] == ["pipeline.py", "config.yaml"] + + # Step 7: session was auto-closed + session = tm._read_session(session_id) + assert session["status"] == "closed" + + +# ── 2. Task chaining with previous_session_id ─────────────────────── + + +class TestTaskChaining: + @pytest.mark.asyncio + async def test_chained_task_references_prior_session(self, isolated_env): + """A chained task includes prior session context in prompt.""" + from coda_mcp import mcp_server as ms + from coda_mcp import task_manager as tm + + # First task + raw = await ms.coda_run( + prompt="build pipeline", + email="bob@test.com", + ) + first = _parse(raw) + first_sid = first["session_id"] + first_tid = first["task_id"] + + # Complete first task + tdir = tm._task_dir(first_sid, first_tid) + with open(os.path.join(tdir, "result.json"), "w") as f: + json.dump({ + "status": "completed", + "summary": "Built pipeline.py", + "files_changed": ["pipeline.py"], + }, f) + tm.complete_task(first_sid, first_tid) + + # Second task chained to first + raw = await ms.coda_run( + prompt="add tests for the pipeline", + email="bob@test.com", + previous_session_id=first_sid, + ) + second = _parse(raw) + second_sid = second["session_id"] + second_tid = second["task_id"] + + # Verify prompt references prior session + prompt_path = os.path.join( + tm._task_dir(second_sid, second_tid), "prompt.txt" + ) + with open(prompt_path) as f: + prompt_text = f.read() + assert f"PRIOR SESSION: {first_sid}" in prompt_text + + # Verify meta.json has previous_session_id + meta_path = os.path.join( + tm._task_dir(second_sid, second_tid), "meta.json" + ) + with open(meta_path) as f: + meta = json.load(f) + assert meta["previous_session_id"] == first_sid + + # Verify inbox shows chaining + raw = await ms.coda_inbox() + inbox = 
_parse(raw) + running_tasks = [t for t in inbox["tasks"] if t["status"] == "running"] + assert len(running_tasks) == 1 + assert running_tasks[0]["previous_session_id"] == first_sid + + +# ── 3. Concurrency limit ──────────────────────────────────────────── + + +class TestConcurrencyLimit: + @pytest.mark.asyncio + async def test_exceeding_limit_returns_error(self, isolated_env): + """Exceeding MAX_CONCURRENT_TASKS returns a clear error.""" + from coda_mcp import mcp_server as ms + from unittest.mock import patch + + with patch("coda_mcp.task_manager.MAX_CONCURRENT_TASKS", 1): + r1 = await ms.coda_run(prompt="task1", email="a@b.com") + assert _parse(r1)["status"] == "running" + + r2 = await ms.coda_run(prompt="task2", email="a@b.com") + d2 = _parse(r2) + assert d2["status"] == "error" + assert "concurrency" in d2["error"].lower() + + +# ── 4. Yolo permissions → --yolo flag ─────────────────────────────── + + +class TestYoloPermissions: + @pytest.mark.asyncio + async def test_yolo_permissions(self, isolated_env): + """permissions='yolo' causes the PTY command to include --yolo.""" + from coda_mcp import mcp_server as ms + + mock_send = isolated_env["mock_send"] + + with MagicMock() as mock_thread: + from coda_mcp import mcp_server + with pytest.MonkeyPatch.context() as mp: + mp.setattr("coda_mcp.mcp_server.threading", mock_thread) + await ms.coda_run( + prompt="deploy everything", + email="dave@test.com", + permissions="yolo", + ) + + mock_send.assert_called_once() + cmd = mock_send.call_args[0][1] + assert "--yolo" in cmd + + +# ── 5. 
Session auto-close on completion ────────────────────────────── + + +class TestAutoClose: + @pytest.mark.asyncio + async def test_session_auto_closes(self, isolated_env): + """Session is auto-closed when task completes.""" + from coda_mcp import mcp_server as ms + from coda_mcp import task_manager as tm + + raw = await ms.coda_run(prompt="quick job", email="a@b.com") + d = _parse(raw) + + # Session should be busy + session = tm._read_session(d["session_id"]) + assert session["status"] == "busy" + + # Complete the task + tdir = tm._task_dir(d["session_id"], d["task_id"]) + with open(os.path.join(tdir, "result.json"), "w") as f: + json.dump({"status": "completed", "summary": "done"}, f) + tm.complete_task(d["session_id"], d["task_id"]) + + # Session should now be closed + session = tm._read_session(d["session_id"]) + assert session["status"] == "closed" + assert "closed_at" in session + + +# ── 6. Cleanup expired tasks ──────────────────────────────────────── + + +class TestCleanup: + @pytest.mark.asyncio + async def test_cleanup_removes_expired(self, isolated_env): + """cleanup_expired_tasks removes old closed sessions.""" + from coda_mcp import mcp_server as ms + from coda_mcp import task_manager as tm + from unittest.mock import patch + + raw = await ms.coda_run(prompt="old task", email="a@b.com") + d = _parse(raw) + + # Complete and close + tdir = tm._task_dir(d["session_id"], d["task_id"]) + with open(os.path.join(tdir, "result.json"), "w") as f: + json.dump({"status": "completed", "summary": "done"}, f) + tm.complete_task(d["session_id"], d["task_id"]) + + # Backdate closed_at to expire it + session = tm._read_session(d["session_id"]) + session["closed_at"] = time.time() - 90000 # 25 hours ago + tm._write_json(tm._session_file(d["session_id"]), session) + + # Cleanup should remove it + removed = tm.cleanup_expired_tasks() + assert removed == 1 + + # Inbox should be empty now + raw = await ms.coda_inbox() + inbox = _parse(raw) + assert len(inbox["tasks"]) == 0 + 
+ diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py new file mode 100644 index 0000000..dd7d004 --- /dev/null +++ b/tests/test_mcp_server.py @@ -0,0 +1,481 @@ +"""Tests for mcp_server — v2 background execution + inbox API.""" + +import asyncio +import json +import os +from unittest import mock + +import pytest +from coda_mcp import mcp_server, task_manager, url_builder + + +# ── helpers ────────────────────────────────────────────────────────── + + +@pytest.fixture(autouse=True) +def _reset_hooks(): + """Clear app hooks before/after each test.""" + from coda_mcp import mcp_server + + mcp_server._app_create_session = None + mcp_server._app_send_input = None + mcp_server._app_close_session = None + yield + mcp_server._app_create_session = None + mcp_server._app_send_input = None + mcp_server._app_close_session = None + + +@pytest.fixture(autouse=True) +def _isolated_sessions(tmp_path): + """Point task_manager.SESSIONS_DIR at a temp dir.""" + sessions_dir = str(tmp_path / ".coda" / "sessions") + with mock.patch("coda_mcp.task_manager.SESSIONS_DIR", sessions_dir): + yield sessions_dir + + +def _parse(result: str) -> dict: + """Parse JSON string returned by MCP tools.""" + return json.loads(result) + + +# ── Tool registration ──────────────────────────────────────────────── + + +class TestToolRegistration: + def test_tools_registered(self): + from coda_mcp import mcp_server + + tool_mgr = mcp_server.mcp._tool_manager + tool_names = set(tool_mgr._tools.keys()) + expected = {"coda_run", "coda_inbox", "coda_get_result", "coda_interactive"} + assert expected == tool_names, f"Expected {expected}, got {tool_names}" + + def test_tool_count(self): + from coda_mcp import mcp_server + + tool_mgr = mcp_server.mcp._tool_manager + assert len(tool_mgr._tools) == 4 + + +# ── coda_run ───────────────────────────────────────────────────────── + + +class TestCodaRun: + @pytest.mark.asyncio + async def test_creates_task_disk_only(self): + """Without app hooks, creates 
session+task on disk, returns immediately.""" + from coda_mcp import mcp_server + + result = await mcp_server.coda_run( + prompt="fix the bug", + email="a@b.com", + ) + data = _parse(result) + assert data["status"] == "running" + assert data["task_id"].startswith("task-") + assert data["session_id"].startswith("sess-") + + @pytest.mark.asyncio + async def test_auto_creates_session(self): + """coda_run auto-creates a session — no separate create_session needed.""" + from coda_mcp import mcp_server + from coda_mcp import task_manager + + result = await mcp_server.coda_run( + prompt="build pipeline", + email="a@b.com", + ) + data = _parse(result) + session = task_manager._read_session(data["session_id"]) + assert session["email"] == "a@b.com" + assert session["status"] == "busy" # task is running + + @pytest.mark.asyncio + async def test_sends_to_pty_when_hooks_set(self): + """With hooks, creates PTY and sends hermes command.""" + from coda_mcp import mcp_server + + mock_create = mock.Mock(return_value="pty-xyz") + mock_send = mock.Mock() + mcp_server.set_app_hooks( + create_session_fn=mock_create, + send_input_fn=mock_send, + close_session_fn=mock.Mock(), + ) + + with mock.patch("coda_mcp.mcp_server.threading"): + result = await mcp_server.coda_run( + prompt="fix the bug", + email="a@b.com", + ) + + data = _parse(result) + assert data["status"] == "running" + mock_create.assert_called_once() + call_kwargs = mock_create.call_args.kwargs + assert call_kwargs["label"] == "hermes-mcp" + assert "transcript_path" in call_kwargs + mock_send.assert_called_once() + assert "hermes" in mock_send.call_args[0][1] + + @pytest.mark.asyncio + async def test_yolo_permission(self): + """permissions='yolo' produces --yolo flag in PTY command.""" + from coda_mcp import mcp_server + + mock_send = mock.Mock() + mcp_server.set_app_hooks( + create_session_fn=mock.Mock(return_value="pty-1"), + send_input_fn=mock_send, + close_session_fn=mock.Mock(), + ) + + with 
mock.patch("coda_mcp.mcp_server.threading"): + await mcp_server.coda_run( + prompt="go fast", + email="a@b.com", + permissions="yolo", + ) + + cmd = mock_send.call_args[0][1] + assert "--yolo" in cmd + + @pytest.mark.asyncio + async def test_previous_session_id_in_prompt(self): + """previous_session_id appears in the wrapped prompt.""" + from coda_mcp import mcp_server + from coda_mcp import task_manager + + # Create a "prior" session with a completed task + prior = task_manager.create_session("a@b.com", "u1") + prior_sid = prior["session_id"] + + result = await mcp_server.coda_run( + prompt="add tests", + email="a@b.com", + previous_session_id=prior_sid, + ) + data = _parse(result) + + # Read the prompt.txt and verify prior session reference + tdir = task_manager._task_dir(data["session_id"], data["task_id"]) + with open(os.path.join(tdir, "prompt.txt")) as f: + prompt_text = f.read() + + assert f"PRIOR SESSION: {prior_sid}" in prompt_text + + @pytest.mark.asyncio + async def test_meta_json_written(self): + """coda_run writes meta.json with task metadata.""" + from coda_mcp import mcp_server + from coda_mcp import task_manager + + result = await mcp_server.coda_run( + prompt="build a dashboard for sales", + email="alice@test.com", + previous_session_id="sess-old", + ) + data = _parse(result) + + meta_path = os.path.join( + task_manager._task_dir(data["session_id"], data["task_id"]), + "meta.json", + ) + with open(meta_path) as f: + meta = json.load(f) + + assert meta["email"] == "alice@test.com" + assert meta["previous_session_id"] == "sess-old" + assert meta["prompt_summary"] == "build a dashboard for sales" + assert "created_at" in meta + + @pytest.mark.asyncio + async def test_concurrency_limit(self): + """Exceeding MAX_CONCURRENT_TASKS returns an error.""" + from coda_mcp import mcp_server + + with mock.patch("coda_mcp.task_manager.MAX_CONCURRENT_TASKS", 1): + # First task succeeds + r1 = await mcp_server.coda_run(prompt="task1", email="a@b.com") + assert 
_parse(r1)["status"] == "running" + + # Second task should fail (1 already running) + r2 = await mcp_server.coda_run(prompt="task2", email="a@b.com") + d2 = _parse(r2) + assert d2["status"] == "error" + assert "concurrency" in d2["error"].lower() + + +# ── coda_inbox ─────────────────────────────────────────────────────── + + +class TestCodaInbox: + @pytest.mark.asyncio + async def test_empty_inbox(self): + """No tasks → empty inbox.""" + from coda_mcp import mcp_server + + result = await mcp_server.coda_inbox() + data = _parse(result) + assert data["tasks"] == [] + assert data["counts"] == {"running": 0, "completed": 0, "failed": 0, "info_needed": 0, "needs_approval": 0} + + @pytest.mark.asyncio + async def test_running_task_in_inbox(self): + """A running task shows up in the inbox.""" + from coda_mcp import mcp_server + + await mcp_server.coda_run(prompt="build pipeline", email="a@b.com") + + result = await mcp_server.coda_inbox() + data = _parse(result) + assert len(data["tasks"]) == 1 + assert data["tasks"][0]["status"] == "running" + assert data["tasks"][0]["prompt_summary"] == "build pipeline" + assert data["counts"]["running"] == 1 + + @pytest.mark.asyncio + async def test_completed_task_in_inbox(self): + """A completed task shows summary in inbox.""" + from coda_mcp import mcp_server + from coda_mcp import task_manager + + r = await mcp_server.coda_run(prompt="fix bug", email="a@b.com") + d = _parse(r) + + # Simulate agent writing result.json + tdir = task_manager._task_dir(d["session_id"], d["task_id"]) + result_path = os.path.join(tdir, "result.json") + with open(result_path, "w") as f: + json.dump({ + "status": "completed", + "summary": "Fixed the login bug", + "files_changed": ["auth.py"], + "artifacts": [], + "errors": [], + }, f) + + result = await mcp_server.coda_inbox() + data = _parse(result) + assert len(data["tasks"]) == 1 + assert data["tasks"][0]["status"] == "completed" + assert data["tasks"][0]["summary"] == "Fixed the login bug" + + 
@pytest.mark.asyncio + async def test_status_filter(self): + """Filtering inbox by status works.""" + from coda_mcp import mcp_server + from coda_mcp import task_manager + + # Create two tasks — one running, one completed + r1 = await mcp_server.coda_run(prompt="task1", email="a@b.com") + d1 = _parse(r1) + + r2 = await mcp_server.coda_run(prompt="task2", email="a@b.com") + d2 = _parse(r2) + + # Complete task2 + tdir = task_manager._task_dir(d2["session_id"], d2["task_id"]) + with open(os.path.join(tdir, "result.json"), "w") as f: + json.dump({"status": "completed", "summary": "done"}, f) + + # Filter running only + result = await mcp_server.coda_inbox(status="running") + data = _parse(result) + assert len(data["tasks"]) == 1 + assert data["tasks"][0]["task_id"] == d1["task_id"] + + @pytest.mark.asyncio + async def test_multiple_tasks_sorted_recent_first(self): + """Inbox returns tasks sorted most recent first.""" + from coda_mcp import mcp_server + + await mcp_server.coda_run(prompt="first", email="a@b.com") + await mcp_server.coda_run(prompt="second", email="a@b.com") + + result = await mcp_server.coda_inbox() + data = _parse(result) + assert len(data["tasks"]) == 2 + # Most recent first + assert data["tasks"][0]["prompt_summary"] == "second" + assert data["tasks"][1]["prompt_summary"] == "first" + + +# ── coda_get_result ────────────────────────────────────────────────── + + +class TestCodaGetResult: + @pytest.mark.asyncio + async def test_returns_result(self): + from coda_mcp import mcp_server + from coda_mcp import task_manager + + r = await mcp_server.coda_run(prompt="go", email="a@b.com") + d = _parse(r) + + # Simulate agent writing result.json + tdir = task_manager._task_dir(d["session_id"], d["task_id"]) + with open(os.path.join(tdir, "result.json"), "w") as f: + json.dump({ + "summary": "Fixed the bug", + "files_changed": ["app.py"], + "artifacts": [], + "errors": [], + }, f) + + result = await mcp_server.coda_get_result( + task_id=d["task_id"], 
session_id=d["session_id"] + ) + data = _parse(result) + assert data["task_id"] == d["task_id"] + assert data["session_id"] == d["session_id"] + assert data["summary"] == "Fixed the bug" + + @pytest.mark.asyncio + async def test_no_result_yet(self): + from coda_mcp import mcp_server + + r = await mcp_server.coda_run(prompt="go", email="a@b.com") + d = _parse(r) + + result = await mcp_server.coda_get_result( + task_id=d["task_id"], session_id=d["session_id"] + ) + data = _parse(result) + assert data["status"] == "running" + assert "not yet available" in data["message"] + + +# ── viewer_url + transcript_path wiring ───────────────────────────── + + +def _run(coro): + return asyncio.get_event_loop().run_until_complete(coro) if not asyncio.iscoroutine(coro) else asyncio.run(coro) + + +def test_coda_run_includes_viewer_url_when_builder_returns_one(tmp_path, monkeypatch): + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + monkeypatch.setattr(url_builder, "_app_url_cache", "app.example.com") + + create = mock.MagicMock(return_value="pty-abc") + send = mock.MagicMock() + closer = mock.MagicMock() + mcp_server.set_app_hooks(create, send, closer) + + result_json = asyncio.run(mcp_server.coda_run(prompt="do it", email="u@x")) + result = json.loads(result_json) + assert result["status"] == "running" + assert "?session=pty-abc" in result["viewer_url"] + assert result["viewer_url"].startswith("https://app.example.com") + + +def test_coda_run_omits_viewer_url_when_builder_returns_none(tmp_path, monkeypatch): + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + monkeypatch.setattr(url_builder, "_app_url_cache", None) + monkeypatch.delenv("CODA_APP_URL", raising=False) + + create = mock.MagicMock(return_value="pty-abc") + mcp_server.set_app_hooks(create, mock.MagicMock(), mock.MagicMock()) + + result_json = asyncio.run(mcp_server.coda_run(prompt="do it", email="u@x")) + result = json.loads(result_json) + # viewer_url present but None when builder 
returns None + assert result.get("viewer_url") is None + + +def test_coda_run_passes_transcript_path_to_create_session(tmp_path, monkeypatch): + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + create = mock.MagicMock(return_value="pty-abc") + mcp_server.set_app_hooks(create, mock.MagicMock(), mock.MagicMock()) + + asyncio.run(mcp_server.coda_run(prompt="do it", email="u@x")) + # create_session was called with transcript_path=... pointing into ~/.coda/sessions/{session_id}/tasks/{task_id}/transcript.log + kwargs = create.call_args.kwargs + assert "transcript_path" in kwargs + assert kwargs["transcript_path"].endswith("transcript.log") + assert "tasks" in kwargs["transcript_path"] + + +def test_coda_inbox_decorates_each_task_with_viewer_url(tmp_path, monkeypatch): + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + monkeypatch.setattr(url_builder, "_app_url_cache", "app.example.com") + + # Seed one session with one task and a pty_session_id + s = task_manager.create_session("u@x", "uid", label="t") + sid = s["session_id"] + task_manager._update_session_field(sid, "pty_session_id", "pty-xyz") + task_manager.create_task(sid, "prompt", "u@x") + + result_json = asyncio.run(mcp_server.coda_inbox()) + result = json.loads(result_json) + assert len(result["tasks"]) == 1 + assert "viewer_url" in result["tasks"][0] + assert "?session=pty-xyz" in result["tasks"][0]["viewer_url"] + + +def test_coda_get_result_includes_viewer_url(tmp_path, monkeypatch): + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + monkeypatch.setattr(url_builder, "_app_url_cache", "app.example.com") + + s = task_manager.create_session("u@x", "uid", label="t") + sid = s["session_id"] + task_manager._update_session_field(sid, "pty_session_id", "pty-xyz") + t = task_manager.create_task(sid, "prompt", "u@x") + tid = t["task_id"] + tdir = task_manager._task_dir(sid, tid) + task_manager._write_json(tdir + "/result.json", { + "status": "completed", "summary": "ok", + }) + + 
result_json = asyncio.run(mcp_server.coda_get_result(tid, sid)) + result = json.loads(result_json) + assert "viewer_url" in result + assert "?session=pty-xyz" in result["viewer_url"] + + +class TestInteractiveHelpers: + def test_safe_dirname_basename(self): + from coda_mcp.mcp_server import _safe_dirname + assert _safe_dirname("/Users/x@y.com/WAM") == "WAM" + assert _safe_dirname("/Users/x@y.com/WAM/") == "WAM" + + def test_safe_dirname_sanitizes(self): + from coda_mcp.mcp_server import _safe_dirname + assert _safe_dirname("/Users/x/My Project!") == "My_Project_" + + def test_safe_dirname_empty_fallback(self): + from coda_mcp.mcp_server import _safe_dirname + assert _safe_dirname("/") == "workspace" + assert _safe_dirname("") == "workspace" + + def test_safe_dirname_rejects_traversal(self): + from coda_mcp.mcp_server import _safe_dirname + assert _safe_dirname("/foo/..") == "workspace" + assert _safe_dirname("/foo/.") == "workspace" + + def test_normalize_strips_workspace_prefix(self): + from coda_mcp.mcp_server import _normalize_workspace_path + assert _normalize_workspace_path("/Workspace/Users/x/WAM") == "/Users/x/WAM" + + def test_normalize_leaves_plain_path(self): + from coda_mcp.mcp_server import _normalize_workspace_path + assert _normalize_workspace_path("/Users/x/WAM") == "/Users/x/WAM" + assert _normalize_workspace_path("/Users/x/WAM/") == "/Users/x/WAM" + + @pytest.mark.asyncio + async def test_wait_for_agent_ready_delegates(self, monkeypatch): + """_wait_for_agent_ready calls _wait_for_output_stable with prompt-seed constants.""" + from coda_mcp import mcp_server + seen = {} + + async def fake_stable(pty, max_wait, stability): + seen["args"] = (pty, max_wait, stability) + + monkeypatch.setattr(mcp_server, "_wait_for_output_stable", fake_stable) + await mcp_server._wait_for_agent_ready("pty-1") + assert seen["args"] == ( + "pty-1", + mcp_server._PROMPT_SEED_MAX_WAIT_S, + mcp_server._PROMPT_SEED_STABILITY_S, + ) diff --git a/tests/test_mlflow_tracing.py 
b/tests/test_mlflow_tracing.py index fb6e975..c72113f 100644 --- a/tests/test_mlflow_tracing.py +++ b/tests/test_mlflow_tracing.py @@ -14,7 +14,7 @@ # Helpers # --------------------------------------------------------------------------- -SETUP_MLFLOW = Path(__file__).parent.parent / "setup_mlflow.py" +SETUP_MLFLOW = Path(__file__).parent.parent / "setup" / "setup_mlflow.py" def run_setup_mlflow(tmp_path, env_overrides=None): diff --git a/tests/test_npm_version_pinning.py b/tests/test_npm_version_pinning.py index ee128dd..a155596 100644 --- a/tests/test_npm_version_pinning.py +++ b/tests/test_npm_version_pinning.py @@ -318,19 +318,40 @@ def test_zero_disables_cooldown_via_env(self, monkeypatch): # 5. Live integration (runs actual npm, skip if npm not available) # --------------------------------------------------------------------------- +def _npm_live_unavailable(): + """True when the live npm-registry probe can't run — SKIP, never ERROR. + + The probe must not raise: a registry timeout/connection failure was being + surfaced as a pytest *collection error* (the old skipif ran the subprocess + inline, so TimeoutExpired propagated out of the condition). Catch everything + and treat any failure as 'skip this live test'.""" + import shutil + import subprocess + if not shutil.which("npm"): + return True + try: + return subprocess.run( + ["npm", "view", "npm", "version"], + capture_output=True, timeout=15, + ).returncode != 0 + except Exception: + return True + + class TestNpmVersionLive: """Run against real npm registry to verify the function works end-to-end.""" @pytest.mark.skipif( - not __import__("shutil").which("npm"), - reason="npm not installed" + _npm_live_unavailable(), + reason="npm not installed or npm registry unreachable", ) def test_resolves_real_package(self): get_npm_version = _get_npm_version() # Use fast path (no cooldown) so this test isn't sensitive to recent # publishes — it's a sanity check that npm + the network work. 
version = get_npm_version("opencode-ai", min_age_days=0) - assert version is not None + if version is None: + pytest.skip("npm registry returned no version (network/registry flake, not a code bug)") # Version should look like a semver (X.Y.Z) parts = version.split(".") assert len(parts) >= 2, f"Expected semver, got: {version}" diff --git a/tests/test_replay_attach.py b/tests/test_replay_attach.py new file mode 100644 index 0000000..89a893e --- /dev/null +++ b/tests/test_replay_attach.py @@ -0,0 +1,105 @@ +"""Tests for /api/session/attach replay fallback.""" +import json +import os +from pathlib import Path + +import pytest + +from coda_mcp import task_manager + +try: + import pty as _pty + _master, _slave = _pty.openpty() + os.close(_master) + os.close(_slave) + _PTY_AVAILABLE = True +except Exception: + _PTY_AVAILABLE = False + +_pty_skip = pytest.mark.skipif( + not _PTY_AVAILABLE, + reason="PTY not allocatable in this environment", +) + + +@pytest.fixture +def client(tmp_path, monkeypatch): + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + monkeypatch.setenv("MAX_CONCURRENT_SESSIONS", "5") + import app as app_module + # Set app_owner so check_authorization returns (True, None) for requests + # with no user header (same pattern used by test_session_detach.py) + app_module.app_owner = "test@example.com" + with app_module.app.test_client() as c: + yield c, tmp_path + + +def _seed_transcript(sessions_root: Path, pty_id: str, content: bytes) -> None: + sess_id = "sess-test" + task_id = "task-test" + sdir = sessions_root / sess_id + tdir = sdir / "tasks" / task_id + tdir.mkdir(parents=True) + (sdir / "session.json").write_text(json.dumps({ + "session_id": sess_id, + "pty_session_id": pty_id, + "current_task": None, + "completed_tasks": [task_id], + "status": "closed", + })) + (tdir / "transcript.log").write_bytes(content) + + +def test_attach_returns_replay_when_pty_gone_and_transcript_exists(client): + c, root = client + _seed_transcript(root, 
"pty-gone", b"hello\r\nworld\r\n") + resp = c.post("/api/session/attach", json={"session_id": "pty-gone"}) + assert resp.status_code == 200 + data = resp.get_json() + assert data["replay"] is True + assert data["output"] == ["hello\r\nworld\r\n"] + assert data["label"] == "hermes-mcp (replay)" + + +def test_attach_404_when_pty_gone_and_no_transcript(client): + c, root = client + resp = c.post("/api/session/attach", json={"session_id": "pty-nope"}) + assert resp.status_code == 404 + + +@_pty_skip +def test_attach_session_returns_replay_for_alive_replay_only_pty(tmp_path, monkeypatch): + """A PTY created with `replay_only=True` (the flag introduced by coda_run's contract) that is still alive serves the transcript-from-disk, not the live output_buffer. + + This is the new contract introduced by the replay-only flag — historically + a live PTY would serve its output_buffer. + """ + from app import app as flask_app, mcp_create_pty_session, mcp_close_pty_session + from coda_mcp import task_manager + + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + + sid = None + try: + sid = mcp_create_pty_session(label="replay-alive", replay_only=True) + sess_id = "sess-x" + task_id = "task-x" + sdir = tmp_path / sess_id + tdir = sdir / "tasks" / task_id + tdir.mkdir(parents=True) + (sdir / "session.json").write_text( + '{"session_id": "%s", "pty_session_id": "%s", "current_task": "%s"}' % (sess_id, sid, task_id) + ) + (tdir / "transcript.log").write_bytes(b"FROM DISK") + # Cache may have stale entries from earlier tests — clear before the lookup. 
+ task_manager._pty_lookup_cache.clear() + + client = flask_app.test_client() + resp = client.post("/api/session/attach", json={"session_id": sid}) + assert resp.status_code == 200 + body = resp.get_json() + assert body["replay"] is True + assert body["output"] == ["FROM DISK"] + finally: + if sid is not None: + mcp_close_pty_session(sid) diff --git a/tests/test_replay_only_flag.py b/tests/test_replay_only_flag.py new file mode 100644 index 0000000..8ee1e46 --- /dev/null +++ b/tests/test_replay_only_flag.py @@ -0,0 +1,197 @@ +"""Tests for the replay_only flag on PTY sessions.""" +import inspect +import pytest + +# Reuse the PTY-availability guard pattern from the suite. +import os +try: + import pty as _pty + _master, _slave = _pty.openpty() + os.close(_master) + os.close(_slave) + _PTY_AVAILABLE = True +except Exception: + _PTY_AVAILABLE = False + +_pty_skip = pytest.mark.skipif( + not _PTY_AVAILABLE, + reason="PTY not allocatable in this environment", +) + + +@_pty_skip +def test_mcp_create_pty_session_stores_replay_only_flag(): + """Creating a PTY with replay_only=True stores the flag in the session dict.""" + from app import mcp_create_pty_session, mcp_close_pty_session, sessions + + sid = mcp_create_pty_session(label="t1", replay_only=True) + try: + assert sessions[sid].get("replay_only") is True + finally: + mcp_close_pty_session(sid) + + +@_pty_skip +def test_mcp_create_pty_session_defaults_replay_only_false(): + """Default for replay_only is False (backward compat).""" + from app import mcp_create_pty_session, mcp_close_pty_session, sessions + + sid = mcp_create_pty_session(label="t2") + try: + assert sessions[sid].get("replay_only") is False + finally: + mcp_close_pty_session(sid) + + +@_pty_skip +def test_attach_session_replay_only_alive_pty_returns_replay(tmp_path, monkeypatch): + """A replay_only=True PTY that is still alive serves the transcript, not the live buffer.""" + from app import app as flask_app, mcp_create_pty_session, mcp_close_pty_session + 
from coda_mcp import task_manager + + # Point task_manager at a tmp sessions root so find_task_dir_by_pty_session resolves. + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + + # Create a fake task dir keyed by the PTY id we'll mint shortly. + sid = mcp_create_pty_session(label="t-replay-alive", replay_only=True) + try: + # Plant a session.json that links task → this pty_session_id, plus a transcript. + sess_id = "sess-fake" + task_id = "task-fake" + sdir = tmp_path / sess_id + tdir = sdir / "tasks" / task_id + tdir.mkdir(parents=True) + (sdir / "session.json").write_text( + '{"session_id": "%s", "pty_session_id": "%s", "current_task": "%s"}' + % (sess_id, sid, task_id) + ) + (tdir / "transcript.log").write_bytes(b"HELLO TRANSCRIPT") + + # Bust the lookup cache so find_task_dir_by_pty_session sees the new files. + task_manager._pty_lookup_cache.clear() + + client = flask_app.test_client() + resp = client.post("/api/session/attach", json={"session_id": sid}) + + assert resp.status_code == 200 + body = resp.get_json() + assert body["replay"] is True + assert body["output"] == ["HELLO TRANSCRIPT"] + finally: + mcp_close_pty_session(sid) + + +@_pty_skip +def test_attach_session_replay_only_false_alive_pty_returns_live_buffer(): + """A replay_only=False PTY that is still alive returns the live output_buffer (unchanged behavior).""" + from app import app as flask_app, mcp_create_pty_session, mcp_close_pty_session + + sid = mcp_create_pty_session(label="t-live", replay_only=False) + try: + client = flask_app.test_client() + resp = client.post("/api/session/attach", json={"session_id": sid}) + + assert resp.status_code == 200 + body = resp.get_json() + assert body.get("replay") in (False, None) # live path doesn't set replay key + assert "output" in body + finally: + mcp_close_pty_session(sid) + + +@_pty_skip +def test_coda_run_creates_pty_with_replay_only_true(tmp_path, monkeypatch): + """coda_run must create its PTY with replay_only=True.""" + import 
asyncio + import json + from app import sessions, mcp_close_pty_session + from coda_mcp import mcp_server, task_manager + + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + # Stop the watcher from racing the test — we only care about creation here. + monkeypatch.setattr(mcp_server, "_watch_task", lambda *a, **kw: None) + + result_str = asyncio.run(mcp_server.coda_run(prompt="ignored", email="t@example.com")) + result = json.loads(result_str) + session = task_manager._read_session(result["session_id"]) + pty_id = session.get("pty_session_id") + try: + assert pty_id is not None + assert sessions[pty_id].get("replay_only") is True + finally: + if pty_id is not None: + mcp_close_pty_session(pty_id) + + +def test_mcp_create_pty_session_signature_has_no_grace_param(): + """Regression guard: mcp_create_pty_session must not accept a 'grace' kwarg. + + Pure signature introspection — no PTY needed, runs unconditionally so + that no-PTY environments (CI, sandboxed runners) still catch regressions. + """ + from app import mcp_create_pty_session + + sig = inspect.signature(mcp_create_pty_session) + assert "grace" not in sig.parameters, ( + f"mcp_create_pty_session should not accept a 'grace' parameter " + f"(found in signature: {list(sig.parameters)})" + ) + + +@_pty_skip +def test_no_grace_key_in_session_dict(): + """Regression guard: session dicts from mcp_create_pty_session must not + contain a 'grace' key. + + Protects against accidental re-introduction of grace-period machinery + in future changes. PTY-gated because it actually allocates one. 
+ """ + from app import mcp_create_pty_session, mcp_close_pty_session, sessions + + sid = mcp_create_pty_session(label="t-no-grace", replay_only=True) + try: + assert "grace" not in sessions[sid], ( + f"session dict should not contain a 'grace' key " + f"(found: {list(sessions[sid].keys())})" + ) + finally: + mcp_close_pty_session(sid) + + +@_pty_skip +def test_coda_run_does_not_create_project_dir(tmp_path, monkeypatch): + """Regression guard: coda_run is Mode 3 (replay-only, no project dir). + Only coda_interactive (Mode 2) creates dirs under ~/.coda/projects/. + + If a future change makes coda_run pull workspace files or otherwise + creates a per-session project dir under ~/.coda/projects/, this test fires. + """ + import asyncio + import json + import os + from app import mcp_close_pty_session + from coda_mcp import mcp_server, task_manager + + monkeypatch.setenv("HOME", str(tmp_path)) + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path / "sessions")) + # Stop the watcher from racing the test. + monkeypatch.setattr(mcp_server, "_watch_task", lambda *a, **kw: None) + + result_str = asyncio.run(mcp_server.coda_run( + prompt="ignored", email="t@example.com", + )) + result = json.loads(result_str) + pty_id = None + try: + sess = task_manager._read_session(result["session_id"]) + pty_id = sess.get("pty_session_id") + + # Project dir must NOT exist for coda_run. 
+ projects_root = os.path.join(str(tmp_path), ".coda", "projects") + assert not os.path.isdir(projects_root) or not os.listdir(projects_root), ( + f"coda_run unexpectedly created project dirs under {projects_root}: " + f"{os.listdir(projects_root) if os.path.isdir(projects_root) else 'n/a'}" + ) + finally: + if pty_id is not None: + mcp_close_pty_session(pty_id) diff --git a/tests/test_run_step.py b/tests/test_run_step.py new file mode 100644 index 0000000..af09733 --- /dev/null +++ b/tests/test_run_step.py @@ -0,0 +1,170 @@ +"""Tests for _run_step and _configure_all_cli_auth — env setup for subprocesses.""" + +import os +import subprocess +from unittest import mock + +import pytest + + +# We need to test _run_step from app.py. It calls subprocess.run, so we mock that. +# The function also updates setup_state, so we mock that too. + + +@pytest.fixture +def patch_app_globals(): + """Patch app.py globals needed by _run_step.""" + with mock.patch("app._update_step"): + yield + + +class TestRunStepEnvStripping: + """Verify _run_step strips OAuth credentials from subprocess env.""" + + def test_strips_databricks_client_id(self, patch_app_globals): + from app import _run_step + with mock.patch.dict(os.environ, { + "DATABRICKS_CLIENT_ID": "sp-client-id", + "DATABRICKS_CLIENT_SECRET": "sp-client-secret", + "HOME": "/tmp/test-home", + }), mock.patch("subprocess.run") as mock_run: + mock_run.return_value = mock.MagicMock( + returncode=0, stdout="ok", stderr="" + ) + _run_step("test-step", "echo hello") + + call_env = mock_run.call_args.kwargs.get("env", {}) + assert "DATABRICKS_CLIENT_ID" not in call_env + assert "DATABRICKS_CLIENT_SECRET" not in call_env + + def test_preserves_other_env_vars(self, patch_app_globals): + from app import _run_step + with mock.patch.dict(os.environ, { + "HOME": "/tmp/test-home", + "MY_CUSTOM_VAR": "keep-this", + "DATABRICKS_CLIENT_ID": "remove-this", + }), mock.patch("subprocess.run") as mock_run: + 
mock_run.return_value = mock.MagicMock(returncode=0, stdout="ok", stderr="") + _run_step("test-step", "echo hello") + + call_env = mock_run.call_args.kwargs.get("env", {}) + assert call_env.get("MY_CUSTOM_VAR") == "keep-this" + + +class TestRunStepPythonpath: + """Verify _run_step injects PYTHONPATH for setup script imports.""" + + def test_sets_pythonpath_to_app_dir(self, patch_app_globals): + from app import _run_step + with mock.patch.dict(os.environ, {"HOME": "/tmp/test-home"}), \ + mock.patch("subprocess.run") as mock_run: + mock_run.return_value = mock.MagicMock(returncode=0, stdout="ok", stderr="") + _run_step("test-step", "echo hello") + + call_env = mock_run.call_args.kwargs.get("env", {}) + # PYTHONPATH should contain the app directory (dirname of app.py) + assert "PYTHONPATH" in call_env + assert call_env["PYTHONPATH"] # non-empty + + def test_prepends_to_existing_pythonpath(self, patch_app_globals): + from app import _run_step + with mock.patch.dict(os.environ, { + "HOME": "/tmp/test-home", + "PYTHONPATH": "/existing/path", + }), mock.patch("subprocess.run") as mock_run: + mock_run.return_value = mock.MagicMock(returncode=0, stdout="ok", stderr="") + _run_step("test-step", "echo hello") + + call_env = mock_run.call_args.kwargs.get("env", {}) + assert "/existing/path" in call_env["PYTHONPATH"] + + +class TestRunStepPath: + """Verify _run_step adds ~/.local/bin to PATH.""" + + def test_adds_local_bin_to_path(self, patch_app_globals): + from app import _run_step + with mock.patch.dict(os.environ, { + "HOME": "/tmp/test-home", + "PATH": "/usr/bin", + }), mock.patch("subprocess.run") as mock_run: + mock_run.return_value = mock.MagicMock(returncode=0, stdout="ok", stderr="") + _run_step("test-step", "echo hello") + + call_env = mock_run.call_args.kwargs.get("env", {}) + assert "/tmp/test-home/.local/bin" in call_env["PATH"] + + def test_skips_if_already_in_path(self, patch_app_globals): + from app import _run_step + with mock.patch.dict(os.environ, { + 
"HOME": "/tmp/test-home", + "PATH": "/tmp/test-home/.local/bin:/usr/bin", + }), mock.patch("subprocess.run") as mock_run: + mock_run.return_value = mock.MagicMock(returncode=0, stdout="ok", stderr="") + _run_step("test-step", "echo hello") + + call_env = mock_run.call_args.kwargs.get("env", {}) + # Should not duplicate + assert call_env["PATH"].count(".local/bin") == 1 + + def test_defaults_home_when_empty(self, patch_app_globals): + """When HOME is empty or '/', should default to /app/python/source_code.""" + from app import _run_step + with mock.patch.dict(os.environ, {"HOME": ""}, clear=False), \ + mock.patch("subprocess.run") as mock_run: + mock_run.return_value = mock.MagicMock(returncode=0, stdout="ok", stderr="") + _run_step("test-step", "echo hello") + + call_env = mock_run.call_args.kwargs.get("env", {}) + assert "/app/python/source_code" in call_env.get("HOME", "") + + +# --------------------------------------------------------------------------- +# _configure_all_cli_auth — PAT reconfiguration path +# --------------------------------------------------------------------------- + +class TestConfigureAllCliAuth: + """Verify _configure_all_cli_auth injects PYTHONPATH for setup script imports. + + This is a separate code path from _run_step — it runs setup scripts via + subprocess.run after PAT rotation. Without PYTHONPATH, the scripts can't + `from utils import ...` since they live in setup/ subdirectory. 
+ """ + + def _call_configure(self, mock_run, tmp_path, token="dapi_test"): + """Helper to call _configure_all_cli_auth with all dependencies mocked.""" + from app import _configure_all_cli_auth + # Create .claude dir so settings.json write succeeds + (tmp_path / ".claude").mkdir(exist_ok=True) + with mock.patch("utils.resolve_and_cache_gateway"), \ + mock.patch("app.get_gateway_host", return_value=None), \ + mock.patch("app.ensure_https", return_value="https://test.databricks.com"), \ + mock.patch("app.pat_rotator"), \ + mock.patch.dict(os.environ, {"HOME": str(tmp_path)}): + _configure_all_cli_auth(token) + + def test_injects_pythonpath(self, tmp_path): + with mock.patch("subprocess.run") as mock_run: + mock_run.return_value = mock.MagicMock(returncode=0, stdout="", stderr="") + self._call_configure(mock_run, tmp_path) + + # Find a subprocess call that runs a setup script + setup_calls = [c for c in mock_run.call_args_list + if any("setup/" in str(a) for a in c[0][0])] + assert len(setup_calls) > 0, "Expected subprocess calls for setup scripts" + + for call in setup_calls: + call_env = call.kwargs.get("env") or call[1].get("env", {}) + assert "PYTHONPATH" in call_env, f"PYTHONPATH missing from env for {call[0][0]}" + assert call_env["PYTHONPATH"], "PYTHONPATH should not be empty" + + def test_passes_token_in_env(self, tmp_path): + with mock.patch("subprocess.run") as mock_run: + mock_run.return_value = mock.MagicMock(returncode=0, stdout="", stderr="") + self._call_configure(mock_run, tmp_path, token="dapi_mytoken") + + setup_calls = [c for c in mock_run.call_args_list + if any("setup/" in str(a) for a in c[0][0])] + for call in setup_calls: + call_env = call.kwargs.get("env") or call[1].get("env", {}) + assert call_env.get("DATABRICKS_TOKEN") == "dapi_mytoken" diff --git a/tests/test_session_detach.py b/tests/test_session_detach.py index c381a40..6e3b60f 100644 --- a/tests/test_session_detach.py +++ b/tests/test_session_detach.py @@ -7,7 +7,6 @@ import os import 
subprocess -import sys import threading import time from collections import deque @@ -40,42 +39,23 @@ def test_detects_child_process_name(self): """When a shell has a child process, return the child's name.""" app_mod = _get_app() - # Launch a shell (bash) with a child process (sleep) - shell = subprocess.Popen( - ["bash", "-c", "sleep 300"], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - # Give the child time to spawn - time.sleep(0.5) - - try: - result = app_mod._get_session_process(shell.pid) - assert result == "sleep", f"Expected 'sleep', got '{result}'" - finally: - shell.kill() - shell.wait() + # Mock pgrep returning a child PID, then ps resolving it to "sleep" + pgrep_result = mock.Mock(returncode=0, stdout="12345\n") + ps_result = mock.Mock(returncode=0, stdout="sleep\n") + with mock.patch("subprocess.run", side_effect=[pgrep_result, ps_result]): + result = app_mod._get_session_process(100) + assert result == "sleep", f"Expected 'sleep', got '{result}'" def test_returns_parent_process_name_when_no_children(self): """When a shell has no foreground children, return the shell name.""" app_mod = _get_app() - # Launch a bare shell that just sleeps via bash built-in wait - # Use cat which will block on stdin with no children of its own - proc = subprocess.Popen( - ["cat"], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - - try: - result = app_mod._get_session_process(proc.pid) - assert result == "cat", f"Expected 'cat', got '{result}'" - finally: - proc.kill() - proc.wait() + # Mock pgrep finding no children (exit 1), then ps resolving the process itself + pgrep_result = mock.Mock(returncode=1, stdout="") + ps_result = mock.Mock(returncode=0, stdout="cat\n") + with mock.patch("subprocess.run", side_effect=[pgrep_result, ps_result]): + result = app_mod._get_session_process(100) + assert result == "cat", f"Expected 'cat', got '{result}'" def test_returns_unknown_for_dead_pid(self): """Return 
'unknown' when the PID does not exist.""" @@ -230,28 +210,31 @@ def setup_app(self): app_module.sessions.clear() def test_exited_session_removed_from_dict(self): - import pty - master_fd, slave_fd = pty.openpty() + fake_master = 50 + # Use a completed process so waitpid works proc = subprocess.Popen( - ["bash", "-c", "echo hello && exit 0"], - stdin=slave_fd, stdout=slave_fd, stderr=slave_fd, - preexec_fn=os.setsid + ["bash", "-c", "exit 0"], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) - os.close(slave_fd) + proc.wait() session_id = "sess-eof-test" with self.app_module.sessions_lock: self.app_module.sessions[session_id] = { "pid": proc.pid, - "master_fd": master_fd, + "master_fd": fake_master, "output_buffer": deque(maxlen=1000), "lock": threading.Lock(), "last_poll_time": time.time(), "created_at": time.time(), } - # read_pty_output should detect EOF and call terminate_session - self.app_module.read_pty_output(session_id, master_fd) + # Simulate EOF: select says readable, os.read returns empty bytes + with mock.patch("select.select", return_value=([fake_master], [], [])), \ + mock.patch("os.read", return_value=b""), \ + mock.patch("os.close"), \ + mock.patch("os.kill"): + self.app_module.read_pty_output(session_id, fake_master) with self.app_module.sessions_lock: assert session_id not in self.app_module.sessions diff --git a/tests/test_setup_proxy.py b/tests/test_setup_proxy.py new file mode 100644 index 0000000..3e4658b --- /dev/null +++ b/tests/test_setup_proxy.py @@ -0,0 +1,45 @@ +"""Regression tests for setup/setup_proxy.py — the content-filter proxy launcher. + +The launcher spawns ``content_filter_proxy.py`` as a subprocess. That file lives +at the REPO ROOT, not in setup/. 
A 2026 refactor moved setup_proxy.py into +setup/ (git fec2152, R100 rename) without updating its relative path lookup, so +the launcher pointed at a nonexistent ``setup/content_filter_proxy.py`` and the +proxy never started — silently breaking OpenCode (the only agent that routes +through the proxy at 127.0.0.1:4000). These tests pin the resolved path to an +existing file so a future move can't regress it again. +""" + +import importlib.util +import os + +REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +SETUP_PROXY_PATH = os.path.join(REPO_ROOT, "setup", "setup_proxy.py") + + +def _load_setup_proxy(): + """Import setup_proxy.py by path WITHOUT running its main() side effects.""" + spec = importlib.util.spec_from_file_location("setup_proxy_under_test", SETUP_PROXY_PATH) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +def test_resolved_proxy_script_exists(): + """The script the launcher hands to Popen must actually exist on disk.""" + mod = _load_setup_proxy() + path = mod.resolve_proxy_script_path() + assert os.path.isfile(path), ( + f"setup_proxy.py resolves the proxy server to a non-existent path: {path}. " + f"content_filter_proxy.py lives at the repo root, not in setup/." 
+ ) + + +def test_resolved_proxy_script_is_repo_root_content_filter_proxy(): + """It must be the repo-root content_filter_proxy.py, not a setup/-relative path.""" + mod = _load_setup_proxy() + path = mod.resolve_proxy_script_path() + assert os.path.basename(path) == "content_filter_proxy.py" + assert os.path.dirname(os.path.abspath(path)) == REPO_ROOT, ( + f"expected the repo-root copy ({REPO_ROOT}), got dir " + f"{os.path.dirname(os.path.abspath(path))}" + ) diff --git a/tests/test_setup_resource_paths.py b/tests/test_setup_resource_paths.py new file mode 100644 index 0000000..bb4727b --- /dev/null +++ b/tests/test_setup_resource_paths.py @@ -0,0 +1,48 @@ +"""Regression tests for bundled-resource path resolution in the setup scripts. + +Commit fec2152 moved every setup script into setup/ (R100 renames) but left the +resources they copy (agents/, .codex/, content_filter_proxy.py) at the repo root. +Scripts that located those resources via ``Path(__file__).parent`` silently broke: +the proxy never launched (OpenCode), Claude subagents weren't installed, and the +Codex model catalog wasn't copied. These tests pin each resolver to an existing +resource so a future move can't silently regress it again. + +The setup scripts run heavy side effects at import (npm installs, curl), so we +extract and execute ONLY the resolver function from the source via AST — this +tests the real resolver code without triggering the script body. 
+""" + +import ast +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +SETUP_DIR = REPO_ROOT / "setup" + + +def _extract_resolver(script_path: Path, func_name: str): + """Compile and exec just ``func_name`` from ``script_path`` (no body run).""" + tree = ast.parse(script_path.read_text()) + for node in tree.body: + if isinstance(node, ast.FunctionDef) and node.name == func_name: + ns = {"Path": Path, "__file__": str(script_path)} + exec(compile(ast.Module(body=[node], type_ignores=[]), str(script_path), "exec"), ns) + return ns[func_name] + raise AssertionError(f"{func_name}() not found in {script_path}") + + +def test_claude_agents_resolver_points_at_existing_dir(): + """setup_claude.py must resolve agents/ to a real dir with the subagents.""" + resolve = _extract_resolver(SETUP_DIR / "setup_claude.py", "resolve_agents_src") + agents = resolve() + assert agents.is_dir(), f"setup_claude resolves agents/ to a missing dir: {agents}" + names = {p.name for p in agents.glob("*.md")} + expected = {"build-feature.md", "implementer.md", "prd-writer.md", "test-generator.md"} + assert expected <= names, f"missing bundled subagents: {expected - names}" + + +def test_codex_catalog_resolver_points_at_existing_file(): + """setup_codex.py must resolve the model catalog to a real file.""" + resolve = _extract_resolver(SETUP_DIR / "setup_codex.py", "resolve_codex_catalog_src") + catalog = resolve() + assert catalog.is_file(), f"setup_codex resolves the model catalog to a missing file: {catalog}" + assert catalog.name == "databricks-models.json" diff --git a/tests/test_sync_to_workspace.py b/tests/test_sync_to_workspace.py new file mode 100644 index 0000000..6faedf4 --- /dev/null +++ b/tests/test_sync_to_workspace.py @@ -0,0 +1,181 @@ +"""Tests for sync_to_workspace — path-escape guard and workspace sync.""" + +import subprocess +from pathlib import Path +from unittest import mock + +import pytest + + +# 
--------------------------------------------------------------------------- +# _read_databrickscfg +# --------------------------------------------------------------------------- + +class TestReadDatabrickscfg: + def test_reads_host_and_token(self, tmp_path): + cfg = tmp_path / ".databrickscfg" + cfg.write_text("[DEFAULT]\nhost = https://test.cloud.databricks.com\ntoken = dapi_abc123\n") + with mock.patch("sync_to_workspace.Path.home", return_value=tmp_path): + from sync_to_workspace import _read_databrickscfg + host, token = _read_databrickscfg() + assert host == "https://test.cloud.databricks.com" + assert token == "dapi_abc123" + + def test_returns_none_when_missing(self, tmp_path): + with mock.patch("sync_to_workspace.Path.home", return_value=tmp_path): + from sync_to_workspace import _read_databrickscfg + host, token = _read_databrickscfg() + assert host is None + assert token is None + + def test_returns_none_for_missing_keys(self, tmp_path): + cfg = tmp_path / ".databrickscfg" + cfg.write_text("[DEFAULT]\n# empty section\n") + with mock.patch("sync_to_workspace.Path.home", return_value=tmp_path): + from sync_to_workspace import _read_databrickscfg + host, token = _read_databrickscfg() + assert host is None + assert token is None + + +# --------------------------------------------------------------------------- +# get_user_email +# --------------------------------------------------------------------------- + +class TestGetUserEmail: + def test_raises_when_no_config(self, tmp_path): + from sync_to_workspace import get_user_email + with mock.patch("sync_to_workspace._read_databrickscfg", return_value=(None, None)): + with pytest.raises(RuntimeError, match="missing host or token"): + get_user_email() + + def test_raises_when_no_token(self): + from sync_to_workspace import get_user_email + with mock.patch("sync_to_workspace._read_databrickscfg", return_value=("https://host", None)): + with pytest.raises(RuntimeError, match="missing host or token"): + 
get_user_email() + + def test_returns_email(self): + from sync_to_workspace import get_user_email + mock_user = mock.MagicMock() + mock_user.user_name = "test@example.com" + mock_client = mock.MagicMock() + mock_client.current_user.me.return_value = mock_user + with mock.patch("sync_to_workspace._read_databrickscfg", return_value=("https://host", "tok")): + with mock.patch("sync_to_workspace.WorkspaceClient", return_value=mock_client): + email = get_user_email() + assert email == "test@example.com" + + +# --------------------------------------------------------------------------- +# sync_project — path-escape guard +# --------------------------------------------------------------------------- + +class TestSyncProject: + def test_rejects_path_outside_projects_dir(self, tmp_path, capsys): + from sync_to_workspace import sync_project + # Create a path outside ~/projects/ + outside = tmp_path / "evil-repo" + outside.mkdir() + with mock.patch("sync_to_workspace.Path.home", return_value=tmp_path): + sync_project(outside) + captured = capsys.readouterr() + assert "SKIP" in captured.err + assert "outside" in captured.err + + def test_accepts_path_inside_projects_dir(self, tmp_path): + from sync_to_workspace import sync_project + projects = tmp_path / "projects" + projects.mkdir() + repo = projects / "my-repo" + repo.mkdir() + + mock_user = mock.MagicMock() + mock_user.user_name = "test@example.com" + mock_client = mock.MagicMock() + mock_client.current_user.me.return_value = mock_user + + with mock.patch("sync_to_workspace.Path.home", return_value=tmp_path), \ + mock.patch("sync_to_workspace._read_databrickscfg", return_value=("https://host", "tok")), \ + mock.patch("sync_to_workspace.WorkspaceClient", return_value=mock_client), \ + mock.patch("sync_to_workspace.subprocess.run") as mock_run: + mock_run.return_value = subprocess.CompletedProcess([], 0, stdout="", stderr="") + sync_project(repo) + + mock_run.assert_called_once() + args = mock_run.call_args + assert 
"databricks" in args[0][0][0] + assert "sync" in args[0][0][1] + + def test_strips_oauth_env_from_subprocess(self, tmp_path): + """Verify OAuth credentials are stripped so CLI falls through to ~/.databrickscfg.""" + from sync_to_workspace import sync_project + projects = tmp_path / "projects" + projects.mkdir() + repo = projects / "my-repo" + repo.mkdir() + + mock_user = mock.MagicMock() + mock_user.user_name = "test@example.com" + mock_client = mock.MagicMock() + mock_client.current_user.me.return_value = mock_user + + with mock.patch("sync_to_workspace.Path.home", return_value=tmp_path), \ + mock.patch("sync_to_workspace._read_databrickscfg", return_value=("https://host", "tok")), \ + mock.patch("sync_to_workspace.WorkspaceClient", return_value=mock_client), \ + mock.patch("sync_to_workspace.subprocess.run") as mock_run, \ + mock.patch.dict("os.environ", { + "DATABRICKS_CLIENT_ID": "sp-id", + "DATABRICKS_CLIENT_SECRET": "sp-secret", + "DATABRICKS_HOST": "https://host", + "DATABRICKS_TOKEN": "dapi_tok", + }): + mock_run.return_value = subprocess.CompletedProcess([], 0, stdout="", stderr="") + sync_project(repo) + + call_env = mock_run.call_args[1].get("env") or mock_run.call_args.kwargs.get("env", {}) + assert "DATABRICKS_CLIENT_ID" not in call_env + assert "DATABRICKS_CLIENT_SECRET" not in call_env + assert "DATABRICKS_HOST" not in call_env + assert "DATABRICKS_TOKEN" not in call_env + + def test_logs_error_on_failure(self, tmp_path, capsys): + from sync_to_workspace import sync_project + projects = tmp_path / "projects" + projects.mkdir() + repo = projects / "my-repo" + repo.mkdir() + + with mock.patch("sync_to_workspace.Path.home", return_value=tmp_path), \ + mock.patch("sync_to_workspace.get_user_email", side_effect=Exception("auth failed")): + sync_project(repo) + + captured = capsys.readouterr() + assert "Sync failed" in captured.err + # Error should be logged to file + error_log = tmp_path / ".sync-errors.log" + assert error_log.exists() + assert "auth 
failed" in error_log.read_text() + + def test_sync_failure_warns(self, tmp_path, capsys): + """Non-zero return code from databricks sync should print warning.""" + from sync_to_workspace import sync_project + projects = tmp_path / "projects" + projects.mkdir() + repo = projects / "my-repo" + repo.mkdir() + + mock_user = mock.MagicMock() + mock_user.user_name = "test@example.com" + mock_client = mock.MagicMock() + mock_client.current_user.me.return_value = mock_user + + with mock.patch("sync_to_workspace.Path.home", return_value=tmp_path), \ + mock.patch("sync_to_workspace._read_databrickscfg", return_value=("https://host", "tok")), \ + mock.patch("sync_to_workspace.WorkspaceClient", return_value=mock_client), \ + mock.patch("sync_to_workspace.subprocess.run") as mock_run: + mock_run.return_value = subprocess.CompletedProcess([], 1, stdout="", stderr="permission denied") + sync_project(repo) + + captured = capsys.readouterr() + assert "Sync warning" in captured.err diff --git a/tests/test_task_manager.py b/tests/test_task_manager.py new file mode 100644 index 0000000..fbbc032 --- /dev/null +++ b/tests/test_task_manager.py @@ -0,0 +1,688 @@ +"""Tests for task_manager — disk-based MCP session/task state.""" + +import json +import os +import time +from unittest import mock + +import pytest + + +@pytest.fixture(autouse=True) +def isolated_sessions(tmp_path): + """Point task_manager.SESSIONS_DIR at a temp dir.""" + sessions_dir = str(tmp_path / ".coda" / "sessions") + with mock.patch("coda_mcp.task_manager.SESSIONS_DIR", sessions_dir): + yield sessions_dir + + +# ── helpers ────────────────────────────────────────────────────────── + + +def _read_json(path): + with open(path) as f: + return json.load(f) + + +def _read_text(path): + with open(path) as f: + return f.read() + + +def _read_jsonl(path): + lines = [] + with open(path) as f: + for line in f: + line = line.strip() + if line: + lines.append(json.loads(line)) + return lines + + +# ── Session lifecycle 
──────────────────────────────────────────────── + + +class TestCreateSession: + def test_returns_session_id_and_status(self): + from coda_mcp import task_manager + + result = task_manager.create_session("a@b.com", "u1", "my-label") + assert result["status"] == "ready" + assert result["session_id"].startswith("sess-") + assert len(result["session_id"]) == 5 + 12 # "sess-" + 12 hex + + def test_creates_session_json_on_disk(self, isolated_sessions): + from coda_mcp import task_manager + + result = task_manager.create_session("a@b.com", "u1", "my-label") + sid = result["session_id"] + path = os.path.join(isolated_sessions, sid, "session.json") + assert os.path.isfile(path) + data = _read_json(path) + assert data["email"] == "a@b.com" + assert data["user_id"] == "u1" + assert data["label"] == "my-label" + assert data["status"] == "ready" + assert data["current_task"] is None + assert data["completed_tasks"] == [] + assert "created_at" in data + + def test_unique_ids(self): + from coda_mcp import task_manager + + ids = {task_manager.create_session("a@b.com", "u1")["session_id"] for _ in range(20)} + assert len(ids) == 20 + + +class TestCloseSession: + def test_marks_session_closed(self, isolated_sessions): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + task_manager.close_session(sid) + data = _read_json(os.path.join(isolated_sessions, sid, "session.json")) + assert data["status"] == "closed" + + def test_close_nonexistent_raises(self): + from coda_mcp import task_manager + + with pytest.raises(task_manager.SessionNotFoundError): + task_manager.close_session("sess-doesnotexist") + + +class TestReadSession: + def test_read_existing(self, isolated_sessions): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1", "lbl")["session_id"] + data = task_manager._read_session(sid) + assert data["email"] == "a@b.com" + + def test_read_nonexistent_raises(self): + from coda_mcp import 
task_manager + + with pytest.raises(task_manager.SessionNotFoundError): + task_manager._read_session("sess-000000000000") + + +class TestUpdateSessionField: + def test_updates_single_field(self, isolated_sessions): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + task_manager._update_session_field(sid, "status", "busy") + data = task_manager._read_session(sid) + assert data["status"] == "busy" + + def test_preserves_other_fields(self, isolated_sessions): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1", "lbl")["session_id"] + task_manager._update_session_field(sid, "status", "busy") + data = task_manager._read_session(sid) + assert data["email"] == "a@b.com" + assert data["label"] == "lbl" + + +# ── Task lifecycle ─────────────────────────────────────────────────── + + +class TestCreateTask: + def test_returns_task_id_and_running(self): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + result = task_manager.create_task(sid, "do something", "a@b.com") + assert result["status"] == "running" + assert result["task_id"].startswith("task-") + assert len(result["task_id"]) == 5 + 8 # "task-" + 8 hex + + def test_creates_task_directory_with_files(self, isolated_sessions): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + tid = task_manager.create_task(sid, "do something", "a@b.com")["task_id"] + task_dir = task_manager._task_dir(sid, tid) + assert os.path.isdir(task_dir) + assert os.path.isfile(os.path.join(task_dir, "prompt.txt")) + assert os.path.isfile(os.path.join(task_dir, "status.jsonl")) + + def test_prompt_txt_contains_wrapped_prompt(self, isolated_sessions): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + tid = task_manager.create_task(sid, "fix the bug", "a@b.com")["task_id"] + prompt = 
_read_text(os.path.join(task_manager._task_dir(sid, tid), "prompt.txt")) + assert "---CODA-TASK---" in prompt + assert "fix the bug" in prompt + + def test_session_marked_busy(self, isolated_sessions): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + task_manager.create_task(sid, "do it", "a@b.com") + data = task_manager._read_session(sid) + assert data["status"] == "busy" + + def test_session_current_task_set(self, isolated_sessions): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + tid = task_manager.create_task(sid, "do it", "a@b.com")["task_id"] + data = task_manager._read_session(sid) + assert data["current_task"] == tid + + def test_busy_session_raises(self): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + task_manager.create_task(sid, "first", "a@b.com") + with pytest.raises(task_manager.SessionBusyError): + task_manager.create_task(sid, "second", "a@b.com") + + def test_nonexistent_session_raises(self): + from coda_mcp import task_manager + + with pytest.raises(task_manager.SessionNotFoundError): + task_manager.create_task("sess-doesnotexist", "p", "e@x.com") + + def test_status_jsonl_has_initial_entry(self, isolated_sessions): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + tid = task_manager.create_task(sid, "go", "a@b.com")["task_id"] + entries = _read_jsonl( + os.path.join(task_manager._task_dir(sid, tid), "status.jsonl") + ) + assert len(entries) == 1 + assert entries[0]["status"] == "running" + + def test_optional_params_stored(self, isolated_sessions): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + tid = task_manager.create_task( + sid, "go", "a@b.com", + context={"repo": "myrepo"}, + context_hint="look at utils.py", + timeout_s=120, + permissions=["read", 
"write"], + )["task_id"] + prompt = _read_text(os.path.join(task_manager._task_dir(sid, tid), "prompt.txt")) + assert "myrepo" in prompt + assert "utils.py" in prompt + + +class TestTaskDir: + def test_returns_correct_path(self, isolated_sessions): + from coda_mcp import task_manager + + path = task_manager._task_dir("sess-aabbccddee01", "task-11223344") + expected = os.path.join( + isolated_sessions, "sess-aabbccddee01", "tasks", "task-11223344" + ) + assert path == expected + + +# ── Task status / result ───────────────────────────────────────────── + + +class TestGetTaskStatus: + def test_returns_latest_status(self, isolated_sessions): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + tid = task_manager.create_task(sid, "go", "a@b.com")["task_id"] + status = task_manager.get_task_status(tid, sid) + assert status["status"] == "running" + + def test_reads_appended_lines(self, isolated_sessions): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + tid = task_manager.create_task(sid, "go", "a@b.com")["task_id"] + # simulate agent appending progress + status_path = os.path.join(task_manager._task_dir(sid, tid), "status.jsonl") + with open(status_path, "a") as f: + f.write(json.dumps({"status": "progress", "pct": 50, "ts": time.time()}) + "\n") + status = task_manager.get_task_status(tid, sid) + assert status["status"] == "progress" + assert status["pct"] == 50 + + def test_missing_task_returns_not_found(self, isolated_sessions): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + status = task_manager.get_task_status("task-nonexist", sid) + assert status["status"] == "not_found" + + +class TestGetTaskResult: + def test_returns_result_when_present(self, isolated_sessions): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + tid = 
task_manager.create_task(sid, "go", "a@b.com")["task_id"] + # simulate agent writing result + result_path = os.path.join(task_manager._task_dir(sid, tid), "result.json") + with open(result_path, "w") as f: + json.dump({"answer": 42}, f) + result = task_manager.get_task_result(tid, sid) + assert result["answer"] == 42 + + def test_returns_none_when_absent(self, isolated_sessions): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + tid = task_manager.create_task(sid, "go", "a@b.com")["task_id"] + result = task_manager.get_task_result(tid, sid) + assert result is None + + def test_missing_task_returns_none(self, isolated_sessions): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + result = task_manager.get_task_result("task-nonexist", sid) + assert result is None + + +# ── Complete task ───────────────────────────────────────────────────── + + +class TestCompleteTask: + def test_marks_session_closed(self, isolated_sessions): + """v2: sessions are ephemeral — complete_task auto-closes the session.""" + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + tid = task_manager.create_task(sid, "go", "a@b.com")["task_id"] + task_manager.complete_task(sid, tid) + data = task_manager._read_session(sid) + assert data["status"] == "closed" + assert "closed_at" in data + + def test_appends_to_completed_tasks(self, isolated_sessions): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + tid = task_manager.create_task(sid, "go", "a@b.com")["task_id"] + task_manager.complete_task(sid, tid) + data = task_manager._read_session(sid) + assert tid in data["completed_tasks"] + + def test_closed_session_rejects_new_task(self, isolated_sessions): + """v2: ephemeral sessions — new tasks need new sessions.""" + from coda_mcp import task_manager + + sid = 
task_manager.create_session("a@b.com", "u1")["session_id"] + tid1 = task_manager.create_task(sid, "first", "a@b.com")["task_id"] + task_manager.complete_task(sid, tid1) + with pytest.raises(task_manager.SessionNotFoundError): + task_manager.create_task(sid, "second", "a@b.com") + + def test_appends_done_to_status_jsonl(self, isolated_sessions): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + tid = task_manager.create_task(sid, "go", "a@b.com")["task_id"] + task_manager.complete_task(sid, tid) + entries = _read_jsonl( + os.path.join(task_manager._task_dir(sid, tid), "status.jsonl") + ) + assert entries[-1]["status"] == "done" + + def test_nonexistent_session_raises(self): + from coda_mcp import task_manager + + with pytest.raises(task_manager.SessionNotFoundError): + task_manager.complete_task("sess-doesnotexist", "task-00000000") + + +# ── Prompt wrapping ────────────────────────────────────────────────── + + +class TestWrapPrompt: + def test_contains_marker(self): + from coda_mcp import task_manager + + wrapped = task_manager.wrap_prompt( + task_id="task-aabbccdd", + session_id="sess-112233445566", + email="a@b.com", + prompt="fix the bug", + context=None, + results_dir="/tmp/r", + context_hint=None, + ) + assert "---CODA-TASK---" in wrapped + assert "fix the bug" in wrapped + assert "task-aabbccdd" in wrapped + assert "sess-112233445566" in wrapped + assert "a@b.com" in wrapped + assert "/tmp/r" in wrapped + + def test_includes_context_when_provided(self): + from coda_mcp import task_manager + + wrapped = task_manager.wrap_prompt( + task_id="task-aabbccdd", + session_id="sess-112233445566", + email="a@b.com", + prompt="go", + context={"repo": "myrepo", "branch": "main"}, + results_dir="/tmp/r", + context_hint=None, + ) + assert "myrepo" in wrapped + assert "main" in wrapped + + def test_includes_context_hint(self): + from coda_mcp import task_manager + + wrapped = task_manager.wrap_prompt( + 
task_id="task-aabbccdd", + session_id="sess-112233445566", + email="a@b.com", + prompt="go", + context=None, + results_dir="/tmp/r", + context_hint="look at utils.py first", + ) + assert "look at utils.py first" in wrapped + + def test_no_context_still_valid(self): + from coda_mcp import task_manager + + wrapped = task_manager.wrap_prompt( + task_id="task-aabbccdd", + session_id="sess-112233445566", + email="a@b.com", + prompt="hello", + context=None, + results_dir="/tmp/r", + context_hint=None, + ) + assert "---CODA-TASK---" in wrapped + assert "hello" in wrapped + + +# ── Edge cases ──────────────────────────────────────────────────────── + + +class TestEdgeCases: + def test_closed_session_rejects_task(self, isolated_sessions): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + task_manager.close_session(sid) + with pytest.raises(task_manager.SessionNotFoundError): + task_manager.create_task(sid, "go", "a@b.com") + + def test_multiple_tasks_across_sessions(self, isolated_sessions): + """v2: each task gets its own ephemeral session; all appear in list_all_tasks.""" + from coda_mcp import task_manager + + tids = [] + for i in range(3): + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + tid = task_manager.create_task(sid, f"task {i}", "a@b.com")["task_id"] + task_manager.complete_task(sid, tid) + tids.append(tid) + # Each session auto-closes + data = task_manager._read_session(sid) + assert data["status"] == "closed" + + all_tasks = task_manager.list_all_tasks() + all_tids = [t["task_id"] for t in all_tasks] + for tid in tids: + assert tid in all_tids + + def test_corrupt_session_json_raises(self, isolated_sessions): + from coda_mcp import task_manager + + sid = task_manager.create_session("a@b.com", "u1")["session_id"] + path = os.path.join(isolated_sessions, sid, "session.json") + with open(path, "w") as f: + f.write("{bad json") + with pytest.raises(task_manager.SessionNotFoundError): + 
task_manager._read_session(sid) + + +# ── find_task_dir_by_pty_session ───────────────────────────────────── + + +@pytest.fixture +def sessions_root(tmp_path, monkeypatch): + from coda_mcp import task_manager + monkeypatch.setattr(task_manager, "SESSIONS_DIR", str(tmp_path)) + # Reset the lookup cache between tests + task_manager._pty_lookup_cache.clear() + return tmp_path + + +def _make_session_dir(root, sess_id, pty_id, current_task=None, completed=None): + sdir = root / sess_id + (sdir / "tasks").mkdir(parents=True) + data = { + "session_id": sess_id, + "pty_session_id": pty_id, + "current_task": current_task, + "completed_tasks": completed or [], + "status": "ready", + } + (sdir / "session.json").write_text(json.dumps(data)) + return sdir + + +def test_find_task_dir_hits_current_task(sessions_root): + from coda_mcp import task_manager + + _make_session_dir(sessions_root, "sess-A", "pty-1", current_task="task-X") + result = task_manager.find_task_dir_by_pty_session("pty-1") + assert result == str(sessions_root / "sess-A" / "tasks" / "task-X") + + +def test_find_task_dir_falls_back_to_last_completed(sessions_root): + from coda_mcp import task_manager + + _make_session_dir( + sessions_root, "sess-A", "pty-1", + current_task=None, + completed=["task-old", "task-recent"], + ) + result = task_manager.find_task_dir_by_pty_session("pty-1") + assert result == str(sessions_root / "sess-A" / "tasks" / "task-recent") + + +def test_find_task_dir_returns_none_when_no_match(sessions_root): + from coda_mcp import task_manager + + _make_session_dir(sessions_root, "sess-A", "pty-1", current_task="task-X") + assert task_manager.find_task_dir_by_pty_session("pty-NONEXIST") is None + + +def test_find_task_dir_ignores_corrupt_session_json(sessions_root): + from coda_mcp import task_manager + + sdir = sessions_root / "sess-bad" + sdir.mkdir() + (sdir / "session.json").write_text("not json {{{") + _make_session_dir(sessions_root, "sess-good", "pty-1", current_task="task-X") + assert 
task_manager.find_task_dir_by_pty_session("pty-1") == \ + str(sessions_root / "sess-good" / "tasks" / "task-X") + + +def test_find_task_dir_cache_hits_within_ttl(sessions_root): + from coda_mcp import task_manager + + _make_session_dir(sessions_root, "sess-A", "pty-1", current_task="task-X") + task_manager.find_task_dir_by_pty_session("pty-1") + # Remove session.json — cache should still return the hit + (sessions_root / "sess-A" / "session.json").unlink() + assert task_manager.find_task_dir_by_pty_session("pty-1") == \ + str(sessions_root / "sess-A" / "tasks" / "task-X") + + +def test_find_task_dir_cache_expires(sessions_root, monkeypatch): + from coda_mcp import task_manager + + monkeypatch.setattr(task_manager, "_PTY_LOOKUP_TTL", 0.01) + _make_session_dir(sessions_root, "sess-A", "pty-1", current_task="task-X") + task_manager.find_task_dir_by_pty_session("pty-1") + (sessions_root / "sess-A" / "session.json").unlink() + time.sleep(0.02) + assert task_manager.find_task_dir_by_pty_session("pty-1") is None + + +def test_find_task_dir_no_sessions_dir(sessions_root, monkeypatch): + from coda_mcp import task_manager + + monkeypatch.setattr(task_manager, "SESSIONS_DIR", "/nonexistent/path/that/does/not/exist") + assert task_manager.find_task_dir_by_pty_session("pty-1") is None + + +# ── workflow_protocol flag wiring ──────────────────────────────────── + + +def test_wrap_prompt_default_includes_capabilities_and_workflow(): + """Default workflow_protocol=True; rendered prompt contains both new sections.""" + from coda_mcp.task_manager import wrap_prompt + + out = wrap_prompt( + task_id="t-1", + session_id="s-1", + email="user@example.com", + prompt="do the thing", + context=None, + results_dir="/tmp/results", + ) + assert "CAPABILITIES:" in out + assert "WORKFLOW PROTOCOL:" in out + # Sanity: still has the existing structure. 
+ assert "TASK:" in out + assert "INSTRUCTIONS:" in out + assert "SAFETY:" in out + + +def test_wrap_prompt_workflow_protocol_false_omits_sections(): + """With workflow_protocol=False, both new sections are absent.""" + from coda_mcp.task_manager import wrap_prompt + + out = wrap_prompt( + task_id="t-1", + session_id="s-1", + email="user@example.com", + prompt="do the thing", + context=None, + results_dir="/tmp/results", + workflow_protocol=False, + ) + assert "CAPABILITIES:" not in out + assert "WORKFLOW PROTOCOL:" not in out + # Existing sections are still present. + assert "TASK:" in out + assert "INSTRUCTIONS:" in out + + +def test_wrap_prompt_workflow_protocol_default_is_true(): + """Signature inspection: default value of workflow_protocol is True.""" + import inspect + from coda_mcp.task_manager import wrap_prompt + + sig = inspect.signature(wrap_prompt) + assert "workflow_protocol" in sig.parameters + assert sig.parameters["workflow_protocol"].default is True + + +def test_create_task_signature_has_workflow_protocol_param(): + """create_task accepts workflow_protocol kwarg with default True.""" + import inspect + from coda_mcp.task_manager import create_task + + sig = inspect.signature(create_task) + assert "workflow_protocol" in sig.parameters + assert sig.parameters["workflow_protocol"].default is True + + +def test_create_task_forwards_workflow_protocol_to_wrap_prompt(monkeypatch, tmp_path): + """create_task must pass workflow_protocol through to wrap_prompt.""" + from coda_mcp import task_manager + + captured: dict = {} + + def fake_wrap_prompt(**kwargs): + captured.update(kwargs) + return "DUMMY PROMPT" + + monkeypatch.setattr(task_manager, "wrap_prompt", fake_wrap_prompt) + monkeypatch.setattr(task_manager, "_session_dir", lambda sid: str(tmp_path)) + monkeypatch.setattr(task_manager, "_task_dir", lambda sid, tid: str(tmp_path)) + # Stub _read_session so create_task sees a valid ready session without disk I/O. 
+ monkeypatch.setattr( + task_manager, + "_read_session", + lambda sid: {"session_id": sid, "status": "ready", "current_task": None, "completed_tasks": []}, + ) + # _write_json is the real helper used inside create_task (writes meta.json + session file). + # Stub it out — we're testing flag pass-through, not filesystem behavior. + monkeypatch.setattr(task_manager, "_write_json", lambda *a, **kw: None) + monkeypatch.setattr(task_manager.os, "makedirs", lambda *a, **kw: None) + # Stub the file-open for prompt.txt and status.jsonl writes. + real_open = open + def fake_open(path, mode="r", *args, **kwargs): + if ("prompt.txt" in str(path) or "status.jsonl" in str(path)) and "w" in mode: + import io + return io.StringIO() + return real_open(path, mode, *args, **kwargs) + monkeypatch.setattr("builtins.open", fake_open) + + task_manager.create_task( + session_id="s-1", + prompt="x", + email="u@example.com", + workflow_protocol=False, + ) + assert captured.get("workflow_protocol") is False + + +def test_coda_run_signature_has_workflow_protocol_param(): + """coda_run accepts workflow_protocol kwarg with default True.""" + import inspect + from coda_mcp import mcp_server + + sig = inspect.signature(mcp_server.coda_run) + assert "workflow_protocol" in sig.parameters + assert sig.parameters["workflow_protocol"].default is True + + +def test_wrap_prompt_instructions_documents_info_needed(): + """INSTRUCTIONS section must mention the info_needed status and feedback field.""" + from coda_mcp.task_manager import wrap_prompt + + out = wrap_prompt( + task_id="t-1", + session_id="s-1", + email="user@example.com", + prompt="do the thing", + context=None, + results_dir="/tmp/r", + ) + # Assert against the full rendered prompt; the INSTRUCTIONS section is not isolated here.
+ assert "info_needed" in out + assert "feedback" in out + + +def test_wrap_prompt_instructions_lists_new_step_labels(): + """INSTRUCTIONS section enumerates the canonical step labels emitted by the agent.""" + from coda_mcp.task_manager import wrap_prompt + + out = wrap_prompt( + task_id="t-1", + session_id="s-1", + email="user@example.com", + prompt="do the thing", + context=None, + results_dir="/tmp/r", + ) + for label in ("plan", "critique_plan", "execute", "critique_execute", "synthesize", "critique_synthesize"): + assert label in out, f"Missing step label {label!r} from prompt text" diff --git a/tests/test_terminate_session_idempotent.py b/tests/test_terminate_session_idempotent.py new file mode 100644 index 0000000..072adc8 --- /dev/null +++ b/tests/test_terminate_session_idempotent.py @@ -0,0 +1,59 @@ +"""Regression test: terminate_session must close master_fd exactly once. + +Both the explicit close path (mcp_close_pty_session) and the read-thread exit +path (read_pty_output, which calls terminate_session when its loop ends) fire +for the same session. If terminate_session closes master_fd on BOTH calls, the +second os.close() can land on a since-reused fd — e.g. an asyncio event loop's +self-pipe allocated by a later test — corrupting unrelated I/O. That is the +source of the intermittent 'OSError: [Errno 9] Bad file descriptor' (EBADF) +flakiness seen when PTY tests and asyncio tests run together. + +terminate_session must be idempotent: claim the session atomically and close +the fd exactly once. 
+""" + +import threading + + +def test_terminate_session_closes_master_fd_exactly_once(monkeypatch): + import app + + closed = [] + monkeypatch.setattr(app.os, "close", lambda fd: closed.append(fd)) + monkeypatch.setattr(app.os, "kill", lambda *a, **k: None) + monkeypatch.setattr(app.time, "sleep", lambda *a, **k: None) + monkeypatch.setattr(app, "_emit_from_thread", lambda *a, **k: None) + + fake_fd = 999777 + sid = "sess-idempotent-test" + with app.sessions_lock: + app.sessions[sid] = { + "lock": threading.Lock(), + "pid": 2147480000, # implausible; os.kill is mocked anyway + "master_fd": fake_fd, + "transcript_fh": None, + } + + # Two callers, same session: explicit close, then read-thread auto-terminate. + app.terminate_session(sid, 2147480000, fake_fd) + app.terminate_session(sid, 2147480000, fake_fd) + + assert closed.count(fake_fd) == 1, ( + f"master_fd was closed {closed.count(fake_fd)}x — a double close can land " + f"on a reused fd and corrupt unrelated I/O (EBADF)" + ) + assert sid not in app.sessions + + +def test_terminate_session_missing_session_is_noop(monkeypatch): + """Terminating an unknown/already-removed session must not close any fd.""" + import app + + closed = [] + monkeypatch.setattr(app.os, "close", lambda fd: closed.append(fd)) + monkeypatch.setattr(app.os, "kill", lambda *a, **k: None) + monkeypatch.setattr(app.time, "sleep", lambda *a, **k: None) + monkeypatch.setattr(app, "_emit_from_thread", lambda *a, **k: None) + + app.terminate_session("sess-does-not-exist", 2147480000, 999778) + assert closed == [] diff --git a/tests/test_transcript.py b/tests/test_transcript.py new file mode 100644 index 0000000..d48dbbd --- /dev/null +++ b/tests/test_transcript.py @@ -0,0 +1,133 @@ +"""Unit tests for the transcript tee in read_pty_output. + +These tests exercise the tee logic directly by simulating output dispatch into +a synthesized session dict and a real on-disk transcript file. 
The full PTY +read loop is not exercised here — see test_mcp_integration.py for E2E. +""" +import os +import stat +import threading +from pathlib import Path + +import pytest + +# The three tests that hit mcp_create_pty_session call pty.openpty(), which +# fails in headless CI containers without TTY allocators. Mark those tests +# explicitly so existing fixture-based tests (test_tee_*) keep running. +def _pty_is_usable() -> bool: + if not hasattr(os, "openpty"): + return False + try: + master, slave = os.openpty() + os.close(master) + os.close(slave) + return True + except OSError: + return False + + +_pty_available = _pty_is_usable() +_pty_skip = pytest.mark.skipif(not _pty_available, reason="pty.openpty() not available") + + +@pytest.fixture +def session_dict(tmp_path): + """Build a minimally valid sessions[pty_id] entry with a real transcript handle.""" + transcript = tmp_path / "transcript.log" + fh = open(transcript, "ab", buffering=0) + os.fchmod(fh.fileno(), 0o600) + return { + "transcript_path": str(transcript), + "transcript_fh": fh, + "transcript_bytes": 0, + "lock": threading.Lock(), + } + + +def _write_chunk(session, output: bytes, cap: int = 10 * 1024 * 1024) -> None: + """Mirror the tee logic from read_pty_output for unit testing.""" + from app import _tee_transcript_chunk + _tee_transcript_chunk(session, output, cap=cap) + + +def test_tee_writes_bytes_and_flushes(session_dict): + _write_chunk(session_dict, b"hello world\n") + assert session_dict["transcript_bytes"] == 12 + assert Path(session_dict["transcript_path"]).read_bytes() == b"hello world\n" + + +def test_tee_chmod_is_0600(session_dict): + mode = stat.S_IMODE(os.stat(session_dict["transcript_path"]).st_mode) + assert mode == 0o600 + + +def test_tee_truncation_at_cap(session_dict): + cap = 16 + _write_chunk(session_dict, b"AAAAAAAAAA", cap=cap) + _write_chunk(session_dict, b"BBBBBBBBBBBBBBBBBBBB", cap=cap) + body = Path(session_dict["transcript_path"]).read_bytes() + # 10 A's, then 6 B's, then 
truncation marker. + assert body.startswith(b"AAAAAAAAAABBBBBB") + assert b"[transcript truncated at" in body + # Handle is closed after marker + assert session_dict["transcript_fh"] is None + + +def test_tee_no_op_when_fh_is_none(session_dict): + session_dict["transcript_fh"] = None + _write_chunk(session_dict, b"should not write") + assert Path(session_dict["transcript_path"]).read_bytes() == b"" + + +def test_tee_handles_write_error(session_dict, monkeypatch): + # Close the handle out from under the tee — write() will ValueError. + session_dict["transcript_fh"].close() + _write_chunk(session_dict, b"this will fail") + # Handle replaced with None; no crash. + assert session_dict["transcript_fh"] is None + + +@_pty_skip +def test_mcp_create_pty_session_opens_transcript_when_path_given(tmp_path, monkeypatch): + monkeypatch.setattr("app.MAX_CONCURRENT_SESSIONS", 5) + transcript = tmp_path / "transcript.log" + from app import mcp_create_pty_session, sessions, mcp_close_pty_session + sid = mcp_create_pty_session(label="test", transcript_path=str(transcript)) + try: + assert transcript.exists() + mode = stat.S_IMODE(os.stat(transcript).st_mode) + assert mode == 0o600 + sess = sessions[sid] + assert sess["transcript_path"] == str(transcript) + assert sess["transcript_fh"] is not None + assert sess["transcript_bytes"] == 0 + finally: + mcp_close_pty_session(sid) + + +@_pty_skip +def test_mcp_create_pty_session_no_transcript_when_path_none(monkeypatch): + monkeypatch.setattr("app.MAX_CONCURRENT_SESSIONS", 5) + from app import mcp_create_pty_session, sessions, mcp_close_pty_session + sid = mcp_create_pty_session(label="test") + try: + sess = sessions[sid] + assert sess.get("transcript_fh") is None + assert sess.get("transcript_path") is None + finally: + mcp_close_pty_session(sid) + + +@_pty_skip +def test_terminate_session_closes_transcript_handle(tmp_path, monkeypatch): + monkeypatch.setattr("app.MAX_CONCURRENT_SESSIONS", 5) + transcript = tmp_path / "transcript.log" + 
from app import mcp_create_pty_session, sessions, mcp_close_pty_session + sid = mcp_create_pty_session(label="test", transcript_path=str(transcript)) + fh = sessions[sid]["transcript_fh"] + mcp_close_pty_session(sid) + assert fh.closed + # Session removed from dict + assert sid not in sessions + + diff --git a/tests/test_url_builder.py b/tests/test_url_builder.py new file mode 100644 index 0000000..907287e --- /dev/null +++ b/tests/test_url_builder.py @@ -0,0 +1,82 @@ +"""Tests for url_builder module — base URL resolution for viewer_url.""" +import os +import importlib +from unittest import mock + +import pytest + + +@pytest.fixture(autouse=True) +def _reset_module(): + """Re-import url_builder fresh for each test (module-level cache).""" + from coda_mcp import url_builder + importlib.reload(url_builder) + yield + + +def test_returns_none_when_neither_env_nor_cache(): + from coda_mcp import url_builder + assert url_builder.build_viewer_url("pty-1") is None + + +def test_env_override_wins(): + from coda_mcp import url_builder + with mock.patch.dict(os.environ, {"CODA_APP_URL": "https://override.example.com"}): + assert url_builder.build_viewer_url("pty-1") == \ + "https://override.example.com/?session=pty-1" + + +def test_env_override_strips_trailing_slash(): + from coda_mcp import url_builder + with mock.patch.dict(os.environ, {"CODA_APP_URL": "https://override.example.com/"}): + assert url_builder.build_viewer_url("pty-1") == \ + "https://override.example.com/?session=pty-1" + + +def test_header_capture_used_when_no_env(): + from coda_mcp import url_builder + url_builder.capture_from_headers("app.databricksapps.com") + assert url_builder.build_viewer_url("pty-1") == \ + "https://app.databricksapps.com/?session=pty-1" + + +def test_env_overrides_header_capture(): + from coda_mcp import url_builder + url_builder.capture_from_headers("captured.example.com") + with mock.patch.dict(os.environ, {"CODA_APP_URL": "https://override.example.com"}): + assert 
url_builder.build_viewer_url("pty-1") == \ + "https://override.example.com/?session=pty-1" + + +def test_header_capture_overwrites_previous(): + from coda_mcp import url_builder + url_builder.capture_from_headers("first.example.com") + url_builder.capture_from_headers("second.example.com") + assert "second.example.com" in url_builder.build_viewer_url("pty-1") + + +def test_capture_empty_string_does_not_overwrite(): + from coda_mcp import url_builder + url_builder.capture_from_headers("good.example.com") + url_builder.capture_from_headers("") + assert "good.example.com" in url_builder.build_viewer_url("pty-1") + + +def test_capture_none_does_not_crash(): + from coda_mcp import url_builder + url_builder.capture_from_headers(None) + assert url_builder.build_viewer_url("pty-1") is None + + +def test_capture_strips_scheme_prefix(): + from coda_mcp import url_builder + url_builder.capture_from_headers("https://app.example.com") + assert url_builder._app_url_cache == "app.example.com" + assert url_builder.build_viewer_url("pty-1") == "https://app.example.com/?session=pty-1" + + +def test_capture_strips_http_scheme_prefix(): + from coda_mcp import url_builder + url_builder.capture_from_headers("http://app.example.com/") + # http stripped, trailing slash stripped + assert url_builder._app_url_cache == "app.example.com" diff --git a/todos.md b/todos.md new file mode 100644 index 0000000..17cf9cf --- /dev/null +++ b/todos.md @@ -0,0 +1,70 @@ +# Pending work (scratch — wipe after done) + +For each todo, the loop is: +1. Brainstorm shape → **critique gate** +2. Plan → **critique gate** +3. Implement → **critique gate** + +A critique pass is mandatory at every gate (use `oh-my-claudecode:critic` or `oh-my-claudecode:architect` subagents, depending on whether the review is about code quality or design/architecture). + +--- + +## Todo 1 — `coda_run` returns replay-only URL (no live attach) + +**Intent.** Split the two use cases by tool, not by URL behavior. 
`coda_run` is fire-and-forget batch — its returned `viewer_url` should be **read-only static replay** of what the agent did. Live interaction is the exclusive surface area of Todo 2. + +**Why.** Today `coda_run`'s `viewer_url` does double duty: live PTY attach during a 5-minute grace window, then static replay forever after. With `coda_interactive` arriving in Todo 2 as the dedicated live-attach tool, the dual-mode on `coda_run` is no longer useful — it just confuses the contract. + +**Scope hint** (to refine in brainstorming): +- Server: `coda_run`'s `viewer_url` should resolve to the static-replay endpoint, not the live-PTY join path +- Static replay reads the on-disk transcript that's already being written (no changes to the tee mechanism) +- The 5-minute PTY grace period for live attach is no longer reachable from `coda_run`'s URL (still applies to `coda_interactive`) +- Update test expectations in `test_mcp_integration.py`, `test_mcp_server.py`, `test_replay_attach.py` + +--- + +## Todo 2 — New MCP tool `coda_interactive` + +**Intent.** MCP caller hands off to a human. Task is "running" until the agent process exits (human types `exit` / `/quit` / Ctrl-D). + +**Default agent.** `claude`. Pluggable via `agent` parameter: `claude` (default), `hermes`, `codex`, `gemini`, `opencode`. + +**Surface** (to refine in brainstorming): +```python +coda_interactive( + prompt: str, + agent: str = "claude", + email: str = "", + context: str = "", + previous_session_id: str = "", + timeout_s: int = 1800, # 30 min — human-driven, generous +) +``` + +**Returns:** `{task_id, session_id, viewer_url, agent, status: "awaiting_human", instructions}` + +**Flow** (to refine in brainstorming): +1. Reuse `coda_run`'s task setup (task_dir, prompt.txt, meta.json, PTY with transcript_path) +2. Send agent launch command per agent matrix +3. Wait briefly for agent to initialize +4. Paste prompt as first user message +5. Watcher polls for PTY child exit (master_fd EOF) — not `result.json` +6. 
On exit, write `result.json` = `{status: "completed", agent, transcript_path, exit_reason}` + +**Agent launch matrix** (verify in brainstorming): +| Agent | Launch command | +|-------|----------------| +| `claude` | `claude` | +| `hermes` | `hermes chat` | +| `codex` | `codex` | +| `gemini` | `gemini` (or `gemini chat`?) | +| `opencode` | `opencode` | + +--- + +## Workflow rules + +- One todo at a time. Finish Todo 1 fully (brainstorm → critique → plan → critique → implement → critique) before starting Todo 2. +- Every critique gate uses a fresh subagent. No skipping. +- Both todos share the same branch (`coda-mcp`). +- Both eventually go into the same PR (or a new PR that subsumes #66 — decide later). diff --git a/tools/coda-bridge.py b/tools/coda-bridge.py new file mode 100644 index 0000000..c67b54c --- /dev/null +++ b/tools/coda-bridge.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +"""Stdio-to-HTTP MCP bridge with Databricks OAuth token injection. + +Proxies MCP JSON-RPC (stdio) to a Databricks App (Streamable HTTP), +injecting fresh OAuth tokens via `databricks auth token`. 
# Config via environment variables (set in Claude Code settings.json):
#
#   CODA_MCP_URL       — App MCP endpoint URL
#   DATABRICKS_PROFILE — Databricks CLI profile for auth

import json
import os
import subprocess
import sys
import time
import urllib.error
import urllib.request

APP_URL = os.environ.get("CODA_MCP_URL", "")
PROFILE = os.environ.get("DATABRICKS_PROFILE", "DEFAULT")
TOKEN_TTL = 1800  # cache 30 min (tokens last 60)

# HTTP statuses that trigger one forced token refresh + retry.
_AUTH_FAILURE_CODES = (302, 401, 403)
_REQUEST_TIMEOUT_S = 300

_cache = {"token": None, "expires_at": 0.0}
_session_id = None


class _NoRedirect(urllib.request.HTTPRedirectHandler):
    """Redirect handler that refuses every redirect.

    urllib's default handler turns a POST answered with 301/302/303 into a
    GET of the redirect target, so a 302 would never reach the auth-retry
    branch in ``_forward``. Returning ``None`` here lets the default error
    handler raise ``HTTPError`` with the original 3xx status instead.
    """

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        return None


_opener = urllib.request.build_opener(_NoRedirect)


def _log(msg):
    """Write a prefixed diagnostic line to stderr (stdout carries JSON-RPC)."""
    print(f"[coda-bridge] {msg}", file=sys.stderr, flush=True)


def _get_token(force=False):
    """Return an OAuth access token, reusing the cached one while it is fresh.

    Args:
        force: When true, ignore the cache and ask the CLI for a new token.

    Raises:
        RuntimeError: The ``databricks auth token`` command exited non-zero
            or printed something that is not a JSON object with an
            ``access_token`` key.
    """
    now = time.time()
    if not force and _cache["token"] and now < _cache["expires_at"]:
        return _cache["token"]
    result = subprocess.run(
        ["databricks", "auth", "token", "-p", PROFILE],
        capture_output=True, text=True, timeout=15,
    )
    if result.returncode != 0:
        raise RuntimeError(f"databricks auth token failed: {result.stderr.strip()}")
    try:
        token = json.loads(result.stdout)["access_token"]
    except (ValueError, KeyError, TypeError) as err:
        # Do not echo stdout here: it may contain credential material.
        raise RuntimeError("databricks auth token returned unexpected output") from err
    _cache["token"] = token
    _cache["expires_at"] = now + TOKEN_TTL
    _log("OAuth token refreshed")
    return token


def _frame_messages(body, content_type=""):
    """Split an HTTP response body into stdio-ready JSON-RPC messages.

    A plain JSON body is returned as a single message with trailing newlines
    removed. A ``text/event-stream`` body is reduced to the ``data:`` payload
    of each event; ``event:``/``id:``/``retry:`` fields and comment lines are
    dropped.

    Args:
        body: Decoded response body.
        content_type: Value of the response ``Content-Type`` header.

    Returns:
        A list of message strings (possibly empty), none ending in a newline.
    """
    if "text/event-stream" not in (content_type or "").lower():
        return [body.rstrip("\n")] if body.strip() else []

    messages = []
    data_lines = []
    for raw in body.splitlines():
        if raw.startswith("data:"):
            value = raw[5:]
            # The SSE format allows one optional space after the colon.
            data_lines.append(value[1:] if value.startswith(" ") else value)
        elif not raw.strip() and data_lines:
            # Blank line terminates the event. Raw newlines are never legal
            # inside a JSON string, so joining with a space keeps the payload
            # equivalent while fitting on one stdio line.
            messages.append(" ".join(data_lines))
            data_lines = []
    if data_lines:  # stream ended without a trailing blank line
        messages.append(" ".join(data_lines))
    return [m for m in messages if m.strip()]


def _post(payload, headers):
    """POST one request to the app and relay its response messages to stdout."""
    global _session_id
    req = urllib.request.Request(APP_URL, data=payload, headers=headers, method="POST")
    with _opener.open(req, timeout=_REQUEST_TIMEOUT_S) as resp:
        sid = resp.headers.get("Mcp-Session-Id")
        if sid:
            _session_id = sid
        content_type = resp.headers.get("Content-Type", "")
        body = resp.read().decode()
    messages = _frame_messages(body, content_type)
    if messages:
        for message in messages:
            sys.stdout.write(message + "\n")
        sys.stdout.flush()


def _forward(line):
    """Forward one JSON-RPC line to the app, retrying once on auth failure.

    Args:
        line: A single JSON-RPC message read from stdin, without newline.

    Raises:
        urllib.error.HTTPError: For non-auth HTTP failures, or when the
            retry with a fresh token also fails.
        RuntimeError: When a token cannot be obtained.
    """
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json, text/event-stream",
        "Authorization": f"Bearer {_get_token()}",
    }
    if _session_id:
        headers["Mcp-Session-Id"] = _session_id

    payload = line.encode()
    try:
        _post(payload, headers)
    except urllib.error.HTTPError as e:
        if e.code not in _AUTH_FAILURE_CODES:
            raise
        _log(f"Auth failed ({e.code}), forcing token refresh")
        headers["Authorization"] = f"Bearer {_get_token(force=True)}"
        _post(payload, headers)


def main():
    """Relay stdin JSON-RPC lines to the app until EOF.

    A failure while forwarding a request is logged to stderr and, when the
    request carried an ``id``, answered with a JSON-RPC error so the client
    is not left waiting.
    """
    if not APP_URL:
        _log("FATAL: CODA_MCP_URL not set")
        sys.exit(1)
    _log(f"Proxying to {APP_URL} (profile={PROFILE})")
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        try:
            _forward(line)
        except Exception as e:
            _log(f"Error: {e}")
            try:
                msg_id = json.loads(line).get("id")
            except Exception:
                # Unparseable or non-object input: nothing to reply to.
                msg_id = None
            if msg_id is not None:
                err = json.dumps({
                    "jsonrpc": "2.0",
                    "id": msg_id,
                    "error": {"code": -32000, "message": str(e)},
                })
                sys.stdout.write(err + "\n")
                sys.stdout.flush()


if __name__ == "__main__":
    main()