diff --git a/CHANGELOG.md b/CHANGELOG.md index 263a091..326725b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,20 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.3.1] - 2026-06-12 + +### Added + +- 🌐 **Browser tools.** Browse the web from chat. Navigate pages, click elements, type into forms, take screenshots, and run JavaScript. Works with local Chrome (auto-launched), Firecrawl, or Browser-Use. Enable in Settings > Browser. +- 🖼️ **Image understanding.** The AI can now read and describe images from your workspace. Open a screenshot or image file and it just works, across all providers. +- 🔴 **Error toasts.** When something goes wrong during a response (API errors, model failures), you'll now see a clear error message in the chat and a toast notification instead of silent failures. + +### Fixed + +- 🔁 **Responses API multi-turn tool calling.** Fixed an issue where tool calls would stop after the first round when using OpenAI's Responses API. The AI now correctly loops through multiple tool calls as expected. +- 💬 **`/new` command in Telegram/Discord.** Starting a new conversation with `/new` now actually creates a fresh chat instead of continuing the previous one. +- 🛡️ **Responses API spec compliance.** Input messages, tool outputs, and error handling now fully follow the Open Responses specification, preventing unexpected 400 errors. 
+ ## [0.3.0] - 2026-06-12 ### Added diff --git a/cptr/app.py b/cptr/app.py index 4898808..53dbea3 100644 --- a/cptr/app.py +++ b/cptr/app.py @@ -69,6 +69,15 @@ async def shutdown(): bot_manager = getattr(app.state, "bot_manager", None) if bot_manager: await bot_manager.stop_all() + # Clean up browser sessions and launched Chrome + try: + from cptr.utils.browser.session import session_manager + from cptr.utils.browser.launcher import shutdown_browser + + await session_manager.close_all() + await shutdown_browser() + except Exception: + pass # Auth middleware diff --git a/cptr/frontend/package.json b/cptr/frontend/package.json index d0a632e..923b45b 100644 --- a/cptr/frontend/package.json +++ b/cptr/frontend/package.json @@ -1,7 +1,7 @@ { "name": "frontend", "private": true, - "version": "0.3.0", + "version": "0.3.1", "type": "module", "scripts": { "dev": "vite dev", diff --git a/cptr/frontend/src/lib/apis/admin.ts b/cptr/frontend/src/lib/apis/admin.ts index 10de548..c6243fb 100644 --- a/cptr/frontend/src/lib/apis/admin.ts +++ b/cptr/frontend/src/lib/apis/admin.ts @@ -54,8 +54,8 @@ export const getAdminConfig = async (): Promise> => { export const updateConfig = (config: Record) => fetchJSON('/api/admin/config', { - method: 'PUT', - ...jsonBody({ config }) + ...jsonBody({ config }), + method: 'PUT' }); // ── Connections ───────────────────────────────────────────── diff --git a/cptr/frontend/src/lib/components/Icon.svelte b/cptr/frontend/src/lib/components/Icon.svelte index fd661f1..71b7c15 100644 --- a/cptr/frontend/src/lib/components/Icon.svelte +++ b/cptr/frontend/src/lib/components/Icon.svelte @@ -333,5 +333,10 @@ {:else if name === 'signal'} + {:else if name === 'browser'} + + + + {/if} diff --git a/cptr/frontend/src/lib/components/Settings/Browser.svelte b/cptr/frontend/src/lib/components/Settings/Browser.svelte new file mode 100644 index 0000000..15a6905 --- /dev/null +++ b/cptr/frontend/src/lib/components/Settings/Browser.svelte @@ -0,0 +1,254 @@ + + +
+

Browser

+ + {#if loading} +
+ {:else} + +

Enable

+ +
+ +

+ Give the AI access to a web browser for navigating pages, clicking elements, and taking screenshots. +

+
+ + {#if enabled} + +

Provider

+ +
+ {#each [ + { value: 'local' as const, label: 'Local CDP' }, + { value: 'firecrawl' as const, label: 'Firecrawl' }, + { value: 'browser_use' as const, label: 'Browser-Use' } + ] as opt} + + {/each} +
+

+ {#if provider === 'local'} + Connects to Chrome via DevTools Protocol. Full interactive browsing with clicking, typing, and screenshots. + {:else if provider === 'firecrawl'} + Cloud API that converts web pages to markdown. Fast extraction, no interactive browsing. + {:else} + Cloud API for LLM-driven browser tasks. Describe what you need in natural language. + {/if} +

+ + + {#if provider === 'local'} +

Connection

+ +
+ + +
+ +
+ + +
+ {#if testResult} +

+ {testResult.message} +

+ {/if} +
+ +
+ +
+ + minutes +
+
+
+ {/if} + + + {#if provider === 'firecrawl'} +

Firecrawl

+ +
+
+ + +
+
+ + +

Change for self-hosted Firecrawl instances

+
+
+ {/if} + + + {#if provider === 'browser_use'} +

Browser-Use

+ +
+
+ + +
+
+ + +
+
+ {/if} + {/if} + + +
+ +
+ {/if} +
diff --git a/cptr/frontend/src/lib/components/SettingsModal.svelte b/cptr/frontend/src/lib/components/SettingsModal.svelte index a5de3f1..4dc97fa 100644 --- a/cptr/frontend/src/lib/components/SettingsModal.svelte +++ b/cptr/frontend/src/lib/components/SettingsModal.svelte @@ -4,6 +4,7 @@ import General from './Settings/General.svelte'; import Account from './Settings/Account.svelte'; import Keyboard from './Settings/Keyboard.svelte'; + import Browser from './Settings/Browser.svelte'; import About from './Settings/About.svelte'; import Users from './Admin/Users.svelte'; import Connections from './Admin/Connections.svelte'; @@ -16,6 +17,7 @@ type Tab = | 'general' | 'keyboard' + | 'browser' | 'account' | 'about' | 'users' @@ -47,6 +49,7 @@ { id: 'connections', label: $t('admin.connections'), icon: 'plug' }, { id: 'models', label: $t('admin.models'), icon: 'cube' }, { id: 'messaging', label: $t('admin.messaging'), icon: 'chat-bubble' }, + { id: 'browser', label: 'Browser', icon: 'browser' }, { id: 'admin_settings', label: $t('settings.configuration'), icon: 'shield' } ]); @@ -106,6 +109,8 @@ {:else if activeTab === 'keyboard'} + {:else if activeTab === 'browser'} + {:else if activeTab === 'account'} {:else if activeTab === 'about'} diff --git a/cptr/frontend/src/lib/components/chat/ChatInput.svelte b/cptr/frontend/src/lib/components/chat/ChatInput.svelte index 54d5d12..7a60586 100644 --- a/cptr/frontend/src/lib/components/chat/ChatInput.svelte +++ b/cptr/frontend/src/lib/components/chat/ChatInput.svelte @@ -401,7 +401,7 @@ if (!child) return; const popupHeight = child.offsetHeight || 200; child.style.position = 'fixed'; - child.style.left = `${Math.max(8, Math.min(rect.left, window.innerWidth - 340))}px`; + child.style.left = `${Math.max(8, Math.min(rect.left, window.innerWidth - 280))}px`; child.style.top = `${rect.top - popupHeight - 8}px`; } diff --git a/cptr/frontend/src/lib/components/chat/ChatPanel.svelte b/cptr/frontend/src/lib/components/chat/ChatPanel.svelte 
index 662950d..e0be5c1 100644 --- a/cptr/frontend/src/lib/components/chat/ChatPanel.svelte +++ b/cptr/frontend/src/lib/components/chat/ChatPanel.svelte @@ -38,6 +38,7 @@ import AssistantMessage from './AssistantMessage.svelte'; import ChatHistory from './ChatHistory.svelte'; import Spinner from '../common/Spinner.svelte'; + import { toast } from 'svelte-sonner'; interface Props { workspace: string; @@ -307,6 +308,7 @@ delta?: string; output?: any; done?: boolean; + error?: string; queue_processed?: boolean; title?: string; }) { @@ -375,6 +377,9 @@ } allMessages = [...allMessages]; } + if (data.error) { + toast.error(data.error, { duration: 8000 }); + } if (data.done) { // Clear streaming indicator for this tab if (tabId) { diff --git a/cptr/frontend/src/lib/components/chat/SkillSuggestionPopup.svelte b/cptr/frontend/src/lib/components/chat/SkillSuggestionPopup.svelte index 97ea264..ea9a5ee 100644 --- a/cptr/frontend/src/lib/components/chat/SkillSuggestionPopup.svelte +++ b/cptr/frontend/src/lib/components/chat/SkillSuggestionPopup.svelte @@ -27,7 +27,7 @@
{#if items.length === 0}
No skills found
diff --git a/cptr/utils/ai.py b/cptr/utils/ai.py index e162836..7845eb2 100644 --- a/cptr/utils/ai.py +++ b/cptr/utils/ai.py @@ -12,6 +12,7 @@ import asyncio import json import logging +import uuid from collections.abc import AsyncIterator from typing import Dict, List @@ -187,6 +188,24 @@ def _to_anthropic_messages(messages: list[dict]) -> list[dict]: content = formatted_content if role == "tool": # tool result → Anthropic tool_result block + # Content may be a string or a list of blocks (multimodal image results) + if isinstance(content, list): + # Multimodal tool result — convert blocks to Anthropic format + tool_content = [] + for block in content: + if block.get("type") == "text": + tool_content.append({"type": "text", "text": block.get("text", "")}) + elif block.get("type") == "image": + tool_content.append({ + "type": "image", + "source": { + "type": "base64", + "media_type": block.get("media_type", "image/jpeg"), + "data": block.get("base64", ""), + } + }) + else: + tool_content = content result.append( { "role": "user", @@ -194,7 +213,7 @@ def _to_anthropic_messages(messages: list[dict]) -> list[dict]: { "type": "tool_result", "tool_use_id": m.get("tool_call_id", ""), - "content": content, + "content": tool_content, } ], } @@ -338,6 +357,9 @@ def _to_openai_messages(messages: list[dict], instructions: str) -> list[dict]: for block in content: if block.get("type") == "text": formatted_content.append({"type": "text", "text": block.get("text", "")}) + elif block.get("type") == "image_url": + # Already in OpenAI-native format (e.g. 
from image extraction) + formatted_content.append(block) elif block.get("type") == "image": data_uri = f"data:{block.get('media_type', 'image/jpeg')};base64,{block.get('base64', '')}" formatted_content.append({ @@ -460,22 +482,39 @@ def _to_responses_input(messages: list[dict], instructions: str) -> list[dict]: if role == "system": continue if role == "tool": + content = m.get("content", "") + if isinstance(content, list): + # Multimodal tool content — extract text only for output + # (images are handled in the agentic loop) + text_parts = [] + for block in content: + if block.get("type") == "text": + text_parts.append(block.get("text", "")) + content = "\n".join(text_parts) items.append( { "type": "function_call_output", "call_id": m.get("tool_call_id", ""), - "output": m.get("content", ""), + "output": content, + "status": "completed", } ) elif role == "assistant" and m.get("tool_calls"): for tc in m["tool_calls"]: + args = tc["function"].get("arguments", "{}") + call_id = tc.get("id", "") + # Responses API requires id to start with "fc_" + fc_id = tc.get("fc_id", "") + if not fc_id or not fc_id.startswith("fc_"): + fc_id = f"fc_{call_id.replace('call_', '', 1) or uuid.uuid4().hex}" items.append( { "type": "function_call", - "id": tc.get("id", ""), - "call_id": tc.get("id", ""), + "id": fc_id, + "call_id": call_id, "name": tc["function"]["name"], - "arguments": tc["function"].get("arguments", "{}"), + "arguments": args if isinstance(args, str) else json.dumps(args), + "status": "completed", } ) else: @@ -486,15 +525,14 @@ def _to_responses_input(messages: list[dict], instructions: str) -> list[dict]: if block.get("type") == "text": formatted_content.append({"type": "input_text", "text": block.get("text", "")}) elif block.get("type") == "image": - # Not all models support input_image, but this is the Responses API spec data_uri = f"data:{block.get('media_type', 'image/jpeg')};base64,{block.get('base64', '')}" formatted_content.append({ "type": "input_image", 
"image_url": data_uri }) - items.append({"role": role, "content": formatted_content}) + items.append({"type": "message", "role": role, "content": formatted_content}) else: - items.append({"role": role, "content": content}) + items.append({"type": "message", "role": role, "content": content}) return items @@ -529,17 +567,26 @@ async def stream_openai_responses( try: async with httpx.AsyncClient(timeout=_STREAM_TIMEOUT) as client: logger.info( - "[stream] openai responses POST %s/responses model=%s", url, form_data.model + "[stream] openai responses POST %s/responses model=%s input_items=%d types=%s", + url, form_data.model, + len(body.get("input", [])), + [i.get("type", i.get("role", "?")) for i in body.get("input", [])], ) async with client.stream( "POST", f"{url}/responses", json=body, headers=headers ) as resp: logger.info("[stream] openai responses status=%s", resp.status_code) + if resp.status_code >= 400: + error_body = await resp.aread() + logger.error("[stream] openai responses error body: %s", error_body.decode(errors="replace")) resp.raise_for_status() async for line in resp.aiter_lines(): if not line.startswith("data: "): continue - event = json.loads(line[6:]) + raw = line[6:] + if raw == "[DONE]": + break + event = json.loads(raw) etype = event.get("type") if etype == "response.output_text.delta": @@ -552,11 +599,17 @@ async def stream_openai_responses( emitted = True yield { "type": "tool_call", + "id": item.get("id", ""), "call_id": item["call_id"], "name": item["name"], "arguments": json.loads(item["arguments"]), } + elif etype == "response.failed": + error = event.get("response", {}).get("error", {}) + msg = error.get("message", "Response failed") + raise RuntimeError(f"Responses API error: {msg}") + elif etype == "response.completed": usage = event.get("response", {}).get("usage", {}) if usage: diff --git a/cptr/utils/bridge.py b/cptr/utils/bridge.py index 4127ffd..32b1772 100644 --- a/cptr/utils/bridge.py +++ b/cptr/utils/bridge.py @@ -174,15 
+174,18 @@ async def delete_bot_config(bot_id: str) -> bool: async def find_chat_for_thread(bot_id: str, external_thread_id: str) -> str | None: - """Find an existing cptr chat_id for a platform thread. + """Find the most recent cptr chat_id for a platform thread. - Scans chats with matching bridge metadata. This is fine for the - small number of active bridge threads (typically < 100). + Scans chats with matching bridge metadata and returns the newest one, + so /new (which creates a new chat with the same thread ID) takes effect. """ from cptr.models import Chat from cptr.utils.db import get_db from sqlalchemy import select + best_id = None + best_ts = -1 + async with await get_db() as db: result = await db.execute( select(Chat).where(Chat.user_id.isnot(None)) @@ -193,8 +196,11 @@ async def find_chat_for_thread(bot_id: str, external_thread_id: str) -> str | No meta.get("bridge_bot_id") == bot_id and meta.get("bridge_external_thread_id") == external_thread_id ): - return chat.id - return None + ts = getattr(chat, "created_at", 0) or 0 + if ts > best_ts: + best_ts = ts + best_id = chat.id + return best_id # ── BotManager ─────────────────────────────────────────────── diff --git a/cptr/utils/browser/__init__.py b/cptr/utils/browser/__init__.py new file mode 100644 index 0000000..defe8f9 --- /dev/null +++ b/cptr/utils/browser/__init__.py @@ -0,0 +1 @@ +"""Browser automation package — pluggable providers behind unified tool interface.""" diff --git a/cptr/utils/browser/browser_use.py b/cptr/utils/browser/browser_use.py new file mode 100644 index 0000000..1c4c439 --- /dev/null +++ b/cptr/utils/browser/browser_use.py @@ -0,0 +1,47 @@ +"""Browser-Use cloud provider — LLM-driven browser tasks via REST API. + +Pure httpx, no SDK dependency. 
logger = logging.getLogger(__name__)

DEFAULT_BASE_URL = "https://api.browser-use.com"

# Upper bound on the number of characters handed back to the caller.
_MAX_RESULT_CHARS = 50_000


async def browse(
    task: str,
    api_key: str,
    base_url: str = DEFAULT_BASE_URL,
) -> str:
    """Run a natural-language browser task on Browser-Use and return its result.

    Sends ``POST {base_url}/v1/run`` with ``{"task": task}`` and a bearer token.

    Args:
        task: Natural-language description of what the remote browser should do.
        api_key: API key sent as ``Authorization: Bearer <api_key>``.
        base_url: API root; a trailing slash is tolerated.

    Returns:
        The ``result`` field of the JSON response, or ``output`` when ``result``
        is missing or empty, converted to ``str`` and capped at
        ``_MAX_RESULT_CHARS`` characters. A fixed explanatory message is
        returned when neither field carries a value.

    Raises:
        httpx.HTTPStatusError: The API answered with a 4xx/5xx status.
    """
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    payload = {"task": task}

    async with httpx.AsyncClient(timeout=120) as http:
        resp = await http.post(
            f"{base_url.rstrip('/')}/v1/run",
            json=payload,
            headers=headers,
        )
        resp.raise_for_status()
        data = resp.json()

    # `or` rather than a .get() default: a present-but-empty/null "result"
    # must still fall through to "output".
    result = data.get("result") or data.get("output")
    if not result:
        return f"Browser-Use returned no result for task: {task}"

    # Stringify first so structured (non-str) results are capped as well.
    text = result if isinstance(result, str) else str(result)
    if len(text) > _MAX_RESULT_CHARS:
        text = text[:_MAX_RESULT_CHARS] + "\n\n[... truncated]"
    return text
logger = logging.getLogger(__name__)

# Ref ID prefix used in accessibility tree snapshots
_REF_PREFIX = "@e"


class CDPClient:
    """Low-level Chrome DevTools Protocol client bound to one page target.

    Commands are sent as JSON over the WebSocket passed to the constructor and
    matched to replies by message id. ``snapshot()`` hands out ref IDs
    (``@e1``, ``@e2``, ...) that ``click()`` and ``type_text()`` accept; the
    mapping is rebuilt on every snapshot.
    """

    # Seconds allowed for one command round-trip, and for one page load.
    _TIMEOUT_SECONDS = 30.0

    # Roles never listed in a snapshot (compared lowercased).
    _SKIPPED_ROLES = frozenset({"none", "generic", "inlinetextbox", "statictext"})

    # Roles that receive a ref ID (compared lowercased).
    _INTERACTIVE_ROLES = frozenset({
        "link", "button", "textbox", "searchbox", "combobox",
        "checkbox", "radio", "tab", "menuitem", "option",
        "switch", "slider", "spinbutton", "textfield",
    })

    def __init__(self, ws: Any, target_id: str) -> None:
        self._ws = ws
        self._target_id = target_id
        self._msg_id = 0
        self._ref_map: dict[str, int] = {}  # ref_id -> backend_node_id
        self._closed = False
        # Count of Page.loadEventFired events seen on the socket. navigate()
        # compares against it so a load event read while another command was
        # waiting for its reply is not lost.
        self._load_events = 0

    # ── Connection ─────────────────────────────────────────

    @classmethod
    async def connect(cls, cdp_url: str = "http://localhost:9222") -> "CDPClient":
        """Connect to a Chrome instance via CDP.

        Lists targets through ``{cdp_url}/json/list``, attaches to the first
        target of type ``page`` (creating an ``about:blank`` tab through
        ``/json/new`` when there is none) and enables the Page, DOM,
        Accessibility and Runtime domains.

        NOTE(review): every client connected to the same browser attaches to
        the same first page target, so separate clients share a tab — confirm
        that this is intended for per-chat sessions.

        Raises:
            httpx.HTTPStatusError: The discovery endpoint returned an error status.
            RuntimeError: Chrome rejected one of the ``*.enable`` commands.
        """
        import httpx
        import websockets

        base = cdp_url.rstrip("/")

        async with httpx.AsyncClient(timeout=5) as http:
            resp = await http.get(f"{base}/json/list")
            resp.raise_for_status()
            page_target = next(
                (t for t in resp.json() if t.get("type") == "page"), None
            )
            if page_target is None:
                # No tab open yet: ask Chrome to create one.
                resp = await http.put(f"{base}/json/new?about:blank")
                resp.raise_for_status()
                page_target = resp.json()

        ws = await websockets.connect(
            page_target["webSocketDebuggerUrl"], max_size=50 * 1024 * 1024
        )
        client = cls(ws, page_target["id"])

        try:
            for domain in ("Page", "DOM", "Accessibility", "Runtime"):
                await client._send(f"{domain}.enable")
        except Exception:
            # Do not leak the socket when the handshake fails half-way.
            await client.close()
            raise

        return client

    # ── Low-level CDP messaging ────────────────────────────

    async def _recv(self, deadline: float) -> dict:
        """Read one message, failing with ``asyncio.TimeoutError`` past *deadline*.

        *deadline* is an absolute time on the running loop's clock.
        """
        remaining = deadline - asyncio.get_running_loop().time()
        if remaining <= 0:
            raise asyncio.TimeoutError
        raw = await asyncio.wait_for(self._ws.recv(), timeout=remaining)
        data = json.loads(raw)
        if data.get("method") == "Page.loadEventFired":
            self._load_events += 1
        return data

    async def _send(self, method: str, params: dict | None = None) -> dict:
        """Send a CDP command and return its ``result`` object.

        Messages that are not the reply to this command (events, stale
        replies) are skipped. The timeout bounds the whole wait, so a page
        that keeps emitting events cannot postpone it indefinitely.

        Raises:
            RuntimeError: Chrome answered with an ``error`` object.
            asyncio.TimeoutError: No reply within ``_TIMEOUT_SECONDS``.
        """
        self._msg_id += 1
        msg_id = self._msg_id
        payload: dict[str, Any] = {"id": msg_id, "method": method}
        if params:
            payload["params"] = params

        await self._ws.send(json.dumps(payload))

        deadline = asyncio.get_running_loop().time() + self._TIMEOUT_SECONDS
        while True:
            data = await self._recv(deadline)
            if data.get("id") != msg_id:
                continue
            if "error" in data:
                raise RuntimeError(f"CDP error: {data['error'].get('message', data['error'])}")
            return data.get("result", {})

    # ── Navigation ─────────────────────────────────────────

    async def navigate(self, url: str) -> dict:
        """Navigate to *url*, wait for the load event and return page info.

        Returns:
            ``{"url": url, "title": <document.title>, "frame_id": <frameId>}``.

        Raises:
            RuntimeError: Chrome reported ``errorText`` for the navigation.
            asyncio.TimeoutError: The load event did not arrive in time.
        """
        loads_before = self._load_events
        result = await self._send("Page.navigate", {"url": url})

        error_text = result.get("errorText")
        if error_text:
            raise RuntimeError(f"Navigation to {url} failed: {error_text}")

        # loaderId is omitted for same-document navigations (e.g. only the
        # #fragment changed); no load event follows those, so do not wait.
        if result.get("loaderId"):
            deadline = asyncio.get_running_loop().time() + self._TIMEOUT_SECONDS
            while self._load_events == loads_before:
                await self._recv(deadline)

        # Small delay for DOM to settle
        await asyncio.sleep(0.5)

        title_result = await self._send(
            "Runtime.evaluate", {"expression": "document.title"}
        )
        title = title_result.get("result", {}).get("value", "")

        return {"url": url, "title": title, "frame_id": result.get("frameId")}

    # ── Accessibility tree snapshot ────────────────────────

    @staticmethod
    def _ax_value(field: Any) -> str:
        """Unwrap an AXValue-style ``{"value": ...}`` dict; stringify anything else."""
        return field.get("value", "") if isinstance(field, dict) else str(field)

    async def snapshot(self) -> str:
        """Capture the accessibility tree as text, one ``[role @ref] name`` per line.

        Interactive elements (links, buttons, inputs, etc.) that map to a DOM
        node get ref IDs like @e1, @e2 usable with click() and type_text().
        Refs from earlier snapshots are discarded. Returns ``"[empty page]"``
        when nothing is listed.
        """
        result = await self._send("Accessibility.getFullAXTree")

        self._ref_map.clear()
        lines: list[str] = []

        for node in result.get("nodes", []):
            role = self._ax_value(node.get("role", {}))
            role_key = role.lower() if role else ""
            if not role_key or role_key in self._SKIPPED_ROLES:
                continue

            name = self._ax_value(node.get("name", {}))
            interactive = role_key in self._INTERACTIVE_ROLES
            if not name and not interactive:
                continue

            # Indentation comes from the node's "level" property when present;
            # the tree is otherwise rendered flat.
            depth = 0
            for prop in node.get("properties", []):
                if isinstance(prop, dict) and prop.get("name") == "level":
                    depth = int(prop.get("value", {}).get("value", 0))
                    break
            indent = " " * min(depth, 4)

            # Only hand out a ref that click()/type_text() can resolve.
            ref_label = ""
            backend_node_id = node.get("backendDOMNodeId")
            if interactive and backend_node_id:
                ref_id = f"{_REF_PREFIX}{len(self._ref_map) + 1}"
                self._ref_map[ref_id] = backend_node_id
                ref_label = f" {ref_id}"

            display_name = f" {name}" if name else ""
            lines.append(f"{indent}[{role}{ref_label}]{display_name}")

        if not lines:
            return "[empty page]"

        return "\n".join(lines)

    # ── Interaction ────────────────────────────────────────

    @staticmethod
    def _normalize_ref(ref: str) -> str:
        """Return *ref* in canonical ``@eN`` form; accepts ``@e3``, ``e3`` and ``3``."""
        token = ref.strip()
        if token.startswith(_REF_PREFIX):
            return token
        if token.startswith("e") and token[1:].isdigit():
            return f"@{token}"
        return f"{_REF_PREFIX}{token}"

    @staticmethod
    def _quad_center(quad: list) -> tuple[float, float] | None:
        """Return the centre of an 8-number quad, or None when it is unusable.

        A quad is ``[x1, y1, x2, y2, x3, y3, x4, y4]``. Short or zero-area
        quads give no meaningful click point.
        """
        if len(quad) < 8:
            return None
        xs, ys = quad[0:8:2], quad[1:8:2]
        if max(xs) == min(xs) or max(ys) == min(ys):
            return None
        return sum(xs) / 4, sum(ys) / 4

    async def click(self, ref: str) -> None:
        """Click an element identified by its ref ID from the latest snapshot.

        The element is scrolled into view and clicked with synthetic mouse
        events at the centre of its content box. When no usable box exists,
        the element's JavaScript ``click()`` is invoked instead.

        Raises:
            ValueError: *ref* is not part of the latest snapshot.
            RuntimeError: The node could not be resolved.
        """
        ref = self._normalize_ref(ref)
        backend_node_id = self._ref_map.get(ref)
        if not backend_node_id:
            raise ValueError(f"Unknown ref '{ref}'. Run browser_snapshot() first to get valid ref IDs.")

        # Resolve to a remote object (needed for the JS fallback).
        resolved = await self._send(
            "DOM.resolveNode", {"backendNodeId": backend_node_id}
        )
        object_id = resolved.get("object", {}).get("objectId")
        if not object_id:
            raise RuntimeError(f"Could not resolve element for ref {ref}")

        # Best effort: scrolling may fail for nodes without layout.
        try:
            await self._send(
                "DOM.scrollIntoViewIfNeeded", {"backendNodeId": backend_node_id}
            )
        except RuntimeError:
            pass

        try:
            box = await self._send(
                "DOM.getBoxModel", {"backendNodeId": backend_node_id}
            )
        except RuntimeError:
            center = None
        else:
            center = self._quad_center(box.get("model", {}).get("content", []))

        if center is None:
            # Fallback: use JS click
            await self._send(
                "Runtime.callFunctionOn",
                {"objectId": object_id, "functionDeclaration": "function() { this.click(); }"},
            )
            return

        x, y = center
        for event_type in ("mousePressed", "mouseReleased"):
            await self._send(
                "Input.dispatchMouseEvent",
                {
                    "type": event_type,
                    "x": x,
                    "y": y,
                    "button": "left",
                    "clickCount": 1,
                },
            )

        # Wait for potential navigation
        await asyncio.sleep(0.3)

    async def type_text(self, ref: str, text: str) -> None:
        """Focus the element behind *ref*, select its content and type *text*.

        Raises:
            ValueError: *ref* is not part of the latest snapshot.
        """
        ref = self._normalize_ref(ref)
        backend_node_id = self._ref_map.get(ref)
        if not backend_node_id:
            raise ValueError(f"Unknown ref '{ref}'. Run browser_snapshot() first.")

        await self._send("DOM.focus", {"backendNodeId": backend_node_id})

        # Select existing content so the typed text replaces it.
        # modifiers=2 is the Ctrl bit of the CDP modifier field.
        # NOTE(review): Cmd on macOS is bit 4 — confirm select-all works there.
        for phase in ("keyDown", "keyUp"):
            await self._send(
                "Input.dispatchKeyEvent",
                {"type": phase, "key": "a", "modifiers": 2},
            )

        # Type each character
        for char in text:
            await self._send(
                "Input.dispatchKeyEvent",
                {"type": "keyDown", "key": char, "text": char},
            )
            await self._send(
                "Input.dispatchKeyEvent",
                {"type": "keyUp", "key": char},
            )

    async def scroll(self, direction: str = "down", amount: int = 3) -> None:
        """Scroll the page by ``300 * amount`` pixels.

        *direction* is matched case-insensitively; anything other than
        ``"down"`` scrolls up.
        """
        sign = 1 if direction.strip().lower() == "down" else -1
        await self._send(
            "Input.dispatchMouseEvent",
            {"type": "mouseWheel", "x": 400, "y": 400, "deltaX": 0, "deltaY": 300 * amount * sign},
        )
        await asyncio.sleep(0.3)

    # ── Observation ────────────────────────────────────────

    async def screenshot(self) -> bytes:
        """Capture a screenshot of the current viewport. Returns PNG bytes."""
        # No "quality": that option only applies to JPEG output.
        result = await self._send("Page.captureScreenshot", {"format": "png"})
        return base64.b64decode(result["data"])

    async def get_text(self) -> str:
        """Return ``document.body.innerText`` (empty string when there is no body)."""
        result = await self._send(
            "Runtime.evaluate",
            {"expression": "document.body?.innerText || ''"},
        )
        return result.get("result", {}).get("value", "")

    async def evaluate(self, expression: str) -> str:
        """Evaluate a JavaScript expression and return its result as a string.

        Returns ``"undefined"`` for an undefined result; otherwise the value,
        or the remote object's description when there is no plain value.
        """
        result = await self._send(
            "Runtime.evaluate",
            {"expression": expression, "returnByValue": True},
        )
        value = result.get("result", {})
        if value.get("type") == "undefined":
            return "undefined"
        return str(value.get("value", value.get("description", "")))

    # ── Lifecycle ──────────────────────────────────────────

    async def close(self) -> None:
        """Close the CDP connection. Safe to call more than once."""
        if self._closed:
            return
        self._closed = True
        try:
            await self._ws.close()
        except Exception:
            # Best effort: the socket may already be gone.
            pass
+ """ + headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"} + payload = {"url": url, "formats": [format]} + + async with httpx.AsyncClient(timeout=30) as http: + resp = await http.post( + f"{base_url.rstrip('/')}/v1/scrape", + json=payload, + headers=headers, + ) + resp.raise_for_status() + data = resp.json() + + if not data.get("success"): + error = data.get("error", "Unknown error") + return f"Firecrawl error: {error}" + + result = data.get("data", {}) + content = result.get(format, result.get("markdown", "")) + + if not content: + return f"Firecrawl returned empty content for {url}" + + # Trim to reasonable size for LLM context + if len(content) > 50_000: + content = content[:50_000] + "\n\n[... truncated]" + + return content diff --git a/cptr/utils/browser/launcher.py b/cptr/utils/browser/launcher.py new file mode 100644 index 0000000..d3354e8 --- /dev/null +++ b/cptr/utils/browser/launcher.py @@ -0,0 +1,168 @@ +"""Chrome/Chromium discovery and auto-launch. + +Finds a running Chrome instance or launches one headless with a debug port. +Supports macOS and Linux. Called automatically when browser tools are invoked. 
logger = logging.getLogger(__name__)

# Common Chrome/Chromium binary paths by platform
_CHROME_PATHS_MACOS = [
    "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
    "/Applications/Chromium.app/Contents/MacOS/Chromium",
    "/Applications/Brave Browser.app/Contents/MacOS/Brave Browser",
    "/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge",
]

_CHROME_PATHS_LINUX = [
    "google-chrome",
    "google-chrome-stable",
    "chromium",
    "chromium-browser",
    "brave-browser",
    "microsoft-edge",
]

# Track launched process so we can kill it on shutdown
_launched_process: asyncio.subprocess.Process | None = None
_user_data_dir: str | None = None


def _find_chrome() -> str | None:
    """Return the path of a Chrome/Chromium binary, or None when none is found.

    On macOS the well-known application bundles are checked first; on every
    platform the executable names in ``_CHROME_PATHS_LINUX`` are then looked
    up on ``PATH``.
    """
    import platform

    if platform.system() == "Darwin":
        for path in _CHROME_PATHS_MACOS:
            if Path(path).exists():
                return path

    for name in _CHROME_PATHS_LINUX:
        found = shutil.which(name)
        if found:
            return found

    return None


async def _probe_cdp(base_url: str) -> bool:
    """Return True when ``{base_url}/json/version`` answers with a successful JSON reply."""
    import httpx

    try:
        async with httpx.AsyncClient() as http:
            resp = await http.get(f"{base_url}/json/version", timeout=3)
            # An error page from some other server on the port is not a CDP endpoint.
            resp.raise_for_status()
            data = resp.json()
            logger.info(
                "Found Chrome %s at %s",
                data.get("Browser", "unknown"),
                base_url,
            )
            return True
    except Exception:
        # Connection refused, timeout, HTTP error, non-JSON body: not available.
        return False


async def ensure_browser(port: int = 9222) -> str:
    """Ensure a Chrome instance is available for CDP connection.

    1. Return immediately if CDP already answers on ``localhost:port``.
    2. Otherwise find a Chrome/Chromium binary and launch it headless with a
       throw-away profile directory.
    3. Poll for up to 10 seconds until CDP answers.

    Returns:
        The CDP base URL, ``http://localhost:{port}``.

    Raises:
        RuntimeError: No browser binary was found, Chrome exited during
            start-up, or CDP did not come up in time. The launched process
            and its temporary profile are cleaned up before raising.
    """
    global _launched_process, _user_data_dir

    base_url = f"http://localhost:{port}"

    # 1. Check if already running
    if await _probe_cdp(base_url):
        return base_url

    # 2. Find Chrome binary
    chrome_path = _find_chrome()
    if not chrome_path:
        raise RuntimeError(
            "No Chrome or Chromium found. Install Google Chrome, Chromium, or Brave, "
            "or set browser.cdp_url to point to a running instance."
        )

    # A previous launch that no longer answers would otherwise be orphaned
    # (process and profile directory) once the globals are overwritten below.
    if _launched_process is not None or _user_data_dir:
        await shutdown_browser()

    # 3. Launch headless with debug port
    _user_data_dir = tempfile.mkdtemp(prefix="cptr-browser-")

    args = [
        chrome_path,
        f"--remote-debugging-port={port}",
        "--headless=new",
        "--no-first-run",
        "--no-default-browser-check",
        "--disable-background-networking",
        "--disable-sync",
        "--disable-translate",
        "--disable-extensions",
        f"--user-data-dir={_user_data_dir}",
        "about:blank",
    ]

    logger.info("Launching Chrome: %s", " ".join(args[:3]))

    process = await asyncio.create_subprocess_exec(
        *args,
        stdout=asyncio.subprocess.DEVNULL,
        stderr=asyncio.subprocess.DEVNULL,
    )
    _launched_process = process

    # Wait for CDP to become available (20 x 0.5s)
    for _ in range(20):
        await asyncio.sleep(0.5)
        if await _probe_cdp(base_url):
            logger.info("Chrome launched successfully on port %d", port)
            return base_url
        if process.returncode is not None:
            # Chrome already died; further polling cannot succeed.
            break

    exit_code = process.returncode
    await shutdown_browser()  # do not leak the process or the temp profile

    if exit_code is not None:
        raise RuntimeError(
            f"Chrome exited with code {exit_code} before CDP became available "
            f"on port {port}. Binary: {chrome_path}"
        )
    raise RuntimeError(
        f"Chrome launched but CDP not responding on port {port} after 10s. "
        f"Binary: {chrome_path}"
    )


async def shutdown_browser() -> None:
    """Kill the Chrome process we launched (if any) and delete its temp profile.

    Called on app shutdown. Safe to call when nothing was launched and safe
    to call repeatedly.
    """
    global _launched_process, _user_data_dir

    # Detach the module state first so a failure below cannot leave stale handles.
    process, _launched_process = _launched_process, None
    profile_dir, _user_data_dir = _user_data_dir, None

    if process is not None and process.returncode is None:
        logger.info("Shutting down launched Chrome (pid %d)", process.pid)
        try:
            process.terminate()
            await asyncio.wait_for(process.wait(), timeout=5)
        except (asyncio.TimeoutError, ProcessLookupError):
            try:
                process.kill()
                # Reap the child so it does not linger as a zombie.
                await asyncio.wait_for(process.wait(), timeout=5)
            except (asyncio.TimeoutError, ProcessLookupError):
                pass

    # Clean up temp profile
    if profile_dir:
        shutil.rmtree(profile_dir, ignore_errors=True)
+""" + +from __future__ import annotations + +import asyncio +import logging +import time + +from cptr.utils.browser.cdp import CDPClient + +logger = logging.getLogger(__name__) + +DEFAULT_TIMEOUT_MINUTES = 10 + + +class BrowserSessionManager: + """One browser session per chat, with idle timeout cleanup.""" + + def __init__(self) -> None: + self._sessions: dict[str, CDPClient] = {} + self._last_used: dict[str, float] = {} + self._cleanup_task: asyncio.Task | None = None + self._timeout_minutes = DEFAULT_TIMEOUT_MINUTES + + def set_timeout(self, minutes: int) -> None: + self._timeout_minutes = max(1, minutes) + + async def get_or_create(self, chat_id: str, cdp_url: str) -> CDPClient: + """Get an existing session for this chat, or create a new one.""" + if chat_id in self._sessions: + client = self._sessions[chat_id] + if not client._closed: + self._last_used[chat_id] = time.monotonic() + return client + # Session was closed externally, remove it + del self._sessions[chat_id] + self._last_used.pop(chat_id, None) + + # Create new session + client = await CDPClient.connect(cdp_url) + self._sessions[chat_id] = client + self._last_used[chat_id] = time.monotonic() + + # Start cleanup loop if not running + if self._cleanup_task is None or self._cleanup_task.done(): + self._cleanup_task = asyncio.create_task(self._cleanup_loop()) + + logger.info("Browser session created for chat %s", chat_id[:8]) + return client + + async def close(self, chat_id: str) -> None: + """Close and remove a specific chat's session.""" + client = self._sessions.pop(chat_id, None) + self._last_used.pop(chat_id, None) + if client: + await client.close() + logger.info("Browser session closed for chat %s", chat_id[:8]) + + async def close_all(self) -> None: + """Close all sessions. 
Called on app shutdown.""" + if self._cleanup_task and not self._cleanup_task.done(): + self._cleanup_task.cancel() + for chat_id in list(self._sessions): + await self.close(chat_id) + + async def _cleanup_loop(self) -> None: + """Periodically close idle sessions.""" + while self._sessions: + await asyncio.sleep(60) # Check every minute + now = time.monotonic() + timeout_seconds = self._timeout_minutes * 60 + expired = [ + cid + for cid, last in self._last_used.items() + if now - last > timeout_seconds + ] + for chat_id in expired: + logger.info("Browser session timed out for chat %s", chat_id[:8]) + await self.close(chat_id) + + +# Singleton instance +session_manager = BrowserSessionManager() diff --git a/cptr/utils/chat_task.py b/cptr/utils/chat_task.py index 74d02c6..93fdfe0 100644 --- a/cptr/utils/chat_task.py +++ b/cptr/utils/chat_task.py @@ -560,7 +560,7 @@ async def _load_message_history( text_content = entry["content"] - # Append file:// references so the AI can read them with view_file + # Append file:// references so the AI can read them with read_file if non_images: from cptr.utils.storage import UPLOADS_DIR file_refs = [] @@ -600,16 +600,18 @@ async def _load_message_history( tool_calls = [] for item in m.output: if item.get("type") == "function_call" and item.get("status") == "completed": - tool_calls.append( - { - "id": item["call_id"], - "type": "function", - "function": { - "name": item["name"], - "arguments": json.dumps(item.get("arguments", {})), - }, - } - ) + tc = { + "id": item["call_id"], + "type": "function", + "function": { + "name": item["name"], + "arguments": json.dumps(item.get("arguments", {})), + }, + } + # Preserve Responses API fc_ ID for round-tripping + if item.get("fc_id"): + tc["fc_id"] = item["fc_id"] + tool_calls.append(tc) if tool_calls: entry["tool_calls"] = tool_calls @@ -627,12 +629,32 @@ async def _load_message_history( return result, existing_summary +def _parse_image_data_uri(result: str) -> tuple[str, str] | None: + 
"""Check if a tool result is a data URI image (from read_file on image files). + + Returns (media_type, base64_data) if it's a data URI image, else None. + """ + if not result.startswith("data:image/"): + return None + # data:image/png;base64,iVBOR... + try: + header, b64_data = result.split(",", 1) + media_type = header.split(";")[0].replace("data:", "") + return media_type, b64_data + except (ValueError, IndexError): + return None + + def _append_tool_to_messages(messages: list[dict], event: dict, result: str, provider: str): """Append a tool call + result to the message history for the next API call.""" - # Guard against oversized tool outputs - if len(result) > CHAT_TOOL_MAX_CHARS: - half = CHAT_TOOL_MAX_CHARS // 2 - result = result[:half] + "\n\n...(truncated)...\n\n" + result[-half:] + # Check for image result before truncation (data URI is large but needed) + image = _parse_image_data_uri(result) + + if not image: + # Guard against oversized tool outputs (skip for images, handled above) + if len(result) > CHAT_TOOL_MAX_CHARS: + half = CHAT_TOOL_MAX_CHARS // 2 + result = result[:half] + "\n\n...(truncated)...\n\n" + result[-half:] # Add assistant message with tool_call messages.append( @@ -642,6 +664,7 @@ def _append_tool_to_messages(messages: list[dict], event: dict, result: str, pro "tool_calls": [ { "id": event["call_id"], + "fc_id": event.get("id", ""), "type": "function", "function": { "name": event["name"], @@ -651,14 +674,35 @@ def _append_tool_to_messages(messages: list[dict], event: dict, result: str, pro ], } ) - # Add tool result - messages.append( - { - "role": "tool", - "tool_call_id": event["call_id"], - "content": result, - } - ) + + if image: + # Structured multimodal content — provider converters handle the + # "image" block type appropriately for each API. 
+ media_type, b64_data = image + path = event["arguments"].get("path", "image") + messages.append( + { + "role": "tool", + "tool_call_id": event["call_id"], + "content": [ + {"type": "text", "text": f"Image file: {path}"}, + { + "type": "image", + "media_type": media_type, + "base64": b64_data, + }, + ], + } + ) + else: + # Plain text tool result + messages.append( + { + "role": "tool", + "tool_call_id": event["call_id"], + "content": result, + } + ) def _find_safe_split(messages: list[dict], target_keep: int) -> int: @@ -798,7 +842,7 @@ def _sync_state(): system += f"\n\n[CONVERSATION SUMMARY]\n{loaded_summary}" if regeneration_prompt: messages.append({"role": "user", "content": regeneration_prompt}) - tools = get_tool_list() + tools = await get_tool_list() # Remove view_skill tool if no skills are available skills = discover_skills(workspace) @@ -902,9 +946,36 @@ def _sync_state(): message_id[:8], len(drop_zone), len(keep_zone), len(summary), ) + # Anthropic supports images natively in tool_result content blocks. + # Chat Completions and Responses API don't support multimodal tool messages, + # so extract images into a follow-up user message. 
+ api_messages = messages + if provider != "anthropic": + image_blocks = [] + api_messages = [] + for m in messages: + if m.get("role") == "tool" and isinstance(m.get("content"), list): + text_parts = [] + for part in m["content"]: + if part.get("type") == "text": + text_parts.append(part.get("text", "")) + elif part.get("type") == "image": + image_blocks.append(part) + api_messages.append({**m, "content": "\n".join(text_parts)}) + else: + api_messages.append(m) + if image_blocks: + api_messages.append({ + "role": "user", + "content": [ + {"type": "text", "text": "Here are the images from the tool results above."}, + *image_blocks, + ], + }) + form_data = ChatCompletionForm( model=model, - messages=messages, + messages=api_messages, instructions=system, tools=tools, ) @@ -935,6 +1006,7 @@ def _sync_state(): "type": "function_call", "id": str(uuid.uuid4()), "call_id": event["call_id"], + "fc_id": event.get("id", ""), "name": name, "arguments": event["arguments"], } @@ -955,7 +1027,7 @@ def _sync_state(): if name == "create_artifact": result = await create_artifact(**event["arguments"], workspace=workspace) else: - result = await execute_tool(name, event["arguments"], {"workspace": workspace, "user_id": user_id, "model_id": model}) + result = await execute_tool(name, event["arguments"], {"workspace": workspace, "user_id": user_id, "model_id": model, "chat_id": chat_id}) # Update status to completed item["status"] = "completed" @@ -1073,15 +1145,35 @@ def _sync_state(): except Exception as e: logger.exception(f"Chat task error for message {message_id}") _flush_text() + error_msg = str(e) + # Try to extract API error body for more detail + if hasattr(e, 'response'): + try: + body = e.response.text or "" + if body: + import json as _json + err_data = _json.loads(body) + api_msg = err_data.get("error", {}).get("message", "") + if api_msg: + error_msg = api_msg + except Exception: + pass + # Append error to content so it's visible in the chat + error_block = f"\n\n> 
**Error:** {error_msg}" + content += error_block + text_buffer += error_block + flushed_item = _flush_text() + if flushed_item: + await emit(output=flushed_item) await ChatMessage.update( message_id, content=content, output=output_items, done=True, - meta={"error": str(e)}, + meta={"error": error_msg}, ) _task_state.pop(message_id, None) - await _emit_done() + await emit(done=True, error=error_msg) finally: _tasks.pop(message_id, None) _task_state.pop(message_id, None) diff --git a/cptr/utils/tools.py b/cptr/utils/tools.py index bce8ff6..7ca14c1 100644 --- a/cptr/utils/tools.py +++ b/cptr/utils/tools.py @@ -78,6 +78,84 @@ def _truncate_output(text: str, max_chars: int = 80_000) -> str: return text[:half] + "\n\n... (truncated) ...\n\n" + text[-half:] +# ── Image support ─────────────────────────────────────────── + +IMAGE_EXTENSIONS = { + ".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".tif", +} + +_IMAGE_MAX_BYTES = 5 * 1024 * 1024 # 5 MB target for API payload + +_IMAGE_MIME = { + ".png": "image/png", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".gif": "image/gif", + ".webp": "image/webp", + ".bmp": "image/bmp", + ".tiff": "image/tiff", + ".tif": "image/tiff", +} + + +def _read_image_file(full: Path, path: str) -> str: + """Read an image file and return a data URI string. + + If the file exceeds _IMAGE_MAX_BYTES, attempts to resize it down + using Pillow. Falls back to a text error if Pillow is unavailable + and the file is too large. 
+ """ + import base64 + + size = full.stat().st_size + ext = full.suffix.lower() + media_type = _IMAGE_MIME.get(ext, "image/png") + data = full.read_bytes() + + if size > _IMAGE_MAX_BYTES: + try: + from PIL import Image + import io + + img = Image.open(io.BytesIO(data)) + # Progressively scale down until under limit + # Use JPEG for lossy formats, PNG for lossless + out_format = "JPEG" if ext in (".jpg", ".jpeg", ".bmp", ".tiff", ".tif") else "PNG" + if out_format == "JPEG": + media_type = "image/jpeg" + # Convert RGBA to RGB for JPEG + if img.mode in ("RGBA", "P"): + img = img.convert("RGB") + else: + media_type = "image/png" + + scale = 0.8 # start at 80% + for _ in range(10): + new_w = int(img.width * scale) + new_h = int(img.height * scale) + if new_w < 100 or new_h < 100: + break + resized = img.resize((new_w, new_h), Image.LANCZOS) + buf = io.BytesIO() + save_kwargs = {"quality": 85} if out_format == "JPEG" else {} + resized.save(buf, format=out_format, **save_kwargs) + if buf.tell() <= _IMAGE_MAX_BYTES: + data = buf.getvalue() + size = len(data) + break + scale *= 0.7 # more aggressive on each pass + else: + return f"Error: image too large ({_human_size(full.stat().st_size)}) and could not be resized below 5MB." + except ImportError: + return ( + f"Error: image file is too large ({_human_size(size)}). " + f"Install Pillow (`pip install Pillow`) to enable automatic resizing." 
+ ) + + b64 = base64.b64encode(data).decode("ascii") + return f"data:{media_type};base64,{b64}" + + # ── Tool functions ────────────────────────────────────────── @@ -100,6 +178,10 @@ async def read_file( if not full.is_file(): return f"Error: file not found: {path}" + # Image files: return base64 JSON instead of garbled text + if full.suffix.lower() in IMAGE_EXTENSIONS: + return await asyncio.to_thread(_read_image_file, full, path) + def _read(): size = full.stat().st_size if size > 500_000: @@ -939,6 +1021,158 @@ async def view_skill( return format_skill_content(skill) +# ── Browser tools ──────────────────────────────────────────── + + +async def _get_browser_config() -> dict: + """Read browser config from DB.""" + try: + from cptr.models import Config + + return { + "enabled": await Config.get("browser.enabled") or False, + "provider": await Config.get("browser.provider") or "local", + "cdp_url": await Config.get("browser.cdp_url") or "http://localhost:9222", + "auto_launch": await Config.get("browser.auto_launch") if await Config.get("browser.auto_launch") is not None else True, + "session_timeout": int(await Config.get("browser.session_timeout_minutes") or 10), + "firecrawl_api_key": await Config.get("browser.firecrawl_api_key") or "", + "firecrawl_base_url": await Config.get("browser.firecrawl_base_url") or "https://api.firecrawl.dev", + "browser_use_api_key": await Config.get("browser.browser_use_api_key") or "", + "browser_use_base_url": await Config.get("browser.browser_use_base_url") or "https://api.browser-use.com", + } + except Exception: + return {"enabled": False, "provider": "local"} + + +async def _get_cdp_session(chat_id: str) -> "CDPClient": + """Get or create a CDP session for the current chat.""" + cfg = await _get_browser_config() + cdp_url = cfg["cdp_url"] + + if cfg.get("auto_launch", True): + from cptr.utils.browser.launcher import ensure_browser + + cdp_url = await ensure_browser(port=int(cdp_url.split(":")[-1])) + + from 
cptr.utils.browser.session import session_manager + + session_manager.set_timeout(cfg.get("session_timeout", 10)) + return await session_manager.get_or_create(chat_id, cdp_url) + + +async def browser_navigate(url: str, *, __context__: dict) -> str: + """Navigate to a URL in the browser. Returns the page title and status. + :param url: The URL to navigate to. + """ + cfg = await _get_browser_config() + provider = cfg.get("provider", "local") + + if provider == "firecrawl": + key = cfg.get("firecrawl_api_key", "") + if not key: + return "Error: Firecrawl API key not configured. Set it in Settings > Browser." + from cptr.utils.browser.firecrawl import scrape + + content = await scrape(url, key, cfg.get("firecrawl_base_url", "")) + return f"Navigated to {url} (via Firecrawl)\n\n{content}" + + if provider == "browser_use": + key = cfg.get("browser_use_api_key", "") + if not key: + return "Error: Browser-Use API key not configured. Set it in Settings > Browser." + from cptr.utils.browser.browser_use import browse + + result = await browse(f"Navigate to {url} and describe what you see", key, cfg.get("browser_use_base_url", "")) + return f"Navigated to {url} (via Browser-Use)\n\n{result}" + + # Local CDP + chat_id = __context__.get("chat_id", "default") + client = await _get_cdp_session(chat_id) + result = await client.navigate(url) + return f"Navigated to {url}\nTitle: {result.get('title', '')}" + + +async def browser_snapshot(*, __context__: dict) -> str: + """Get the current page content. For local browser, returns an accessibility tree with ref IDs (@e1, @e2, etc.) that can be used with browser_click and browser_type. For cloud providers, returns page content as text.""" + cfg = await _get_browser_config() + provider = cfg.get("provider", "local") + + if provider in ("firecrawl", "browser_use"): + return "Snapshot is only meaningful after browser_navigate. The navigate result already contains the page content." 
+ + chat_id = __context__.get("chat_id", "default") + client = await _get_cdp_session(chat_id) + return await client.snapshot() + + +async def browser_click(ref: str, *, __context__: dict) -> str: + """Click an element on the page identified by its ref ID from the snapshot (e.g. @e1). + :param ref: The ref ID of the element to click (e.g. @e1, @e5). + """ + cfg = await _get_browser_config() + if cfg.get("provider", "local") != "local": + return "Error: browser_click requires Local CDP provider. Cloud providers (Firecrawl, Browser-Use) don't support interactive browsing. Switch to Local CDP in Settings > Browser." + + chat_id = __context__.get("chat_id", "default") + client = await _get_cdp_session(chat_id) + await client.click(ref) + # Return updated snapshot so the AI sees the result + return await client.snapshot() + + +async def browser_type(ref: str, text: str, *, __context__: dict) -> str: + """Type text into an input element identified by its ref ID from the snapshot. + :param ref: The ref ID of the input element (e.g. @e3). + :param text: The text to type. + """ + cfg = await _get_browser_config() + if cfg.get("provider", "local") != "local": + return "Error: browser_type requires Local CDP provider. Switch to Local CDP in Settings > Browser." + + chat_id = __context__.get("chat_id", "default") + client = await _get_cdp_session(chat_id) + await client.type_text(ref, text) + return await client.snapshot() + + +async def browser_screenshot(*, __context__: dict) -> str: + """Take a screenshot of the current browser page. Saves the image to the workspace. + """ + cfg = await _get_browser_config() + if cfg.get("provider", "local") != "local": + return "Error: browser_screenshot requires Local CDP provider." 
+ + chat_id = __context__.get("chat_id", "default") + client = await _get_cdp_session(chat_id) + png_bytes = await client.screenshot() + + # Save to workspace + workspace = __context__.get("workspace", ".") + screenshots_dir = Path(workspace) / ".cptr" / "screenshots" + screenshots_dir.mkdir(parents=True, exist_ok=True) + + import time + + filename = f"screenshot_{int(time.time())}.png" + filepath = screenshots_dir / filename + filepath.write_bytes(png_bytes) + + return f"Screenshot saved: {filepath}" + + +async def browser_evaluate(javascript: str, *, __context__: dict) -> str: + """Execute JavaScript in the browser page and return the result. + :param javascript: The JavaScript expression to evaluate. + """ + cfg = await _get_browser_config() + if cfg.get("provider", "local") != "local": + return "Error: browser_evaluate requires Local CDP provider." + + chat_id = __context__.get("chat_id", "default") + client = await _get_cdp_session(chat_id) + return await client.evaluate(javascript) + + # ── Registry ──────────────────────────────────────────────── TOOLS: dict[str, dict] = { @@ -964,6 +1198,16 @@ async def view_skill( "delete_automation": {"fn": delete_automation, "auto": False}, } +# Browser tools — registered conditionally based on browser.enabled config +BROWSER_TOOLS: dict[str, dict] = { + "browser_navigate": {"fn": browser_navigate, "auto": False}, + "browser_snapshot": {"fn": browser_snapshot, "auto": True}, + "browser_click": {"fn": browser_click, "auto": False}, + "browser_type": {"fn": browser_type, "auto": False}, + "browser_screenshot": {"fn": browser_screenshot, "auto": True}, + "browser_evaluate": {"fn": browser_evaluate, "auto": False}, +} + # ── Schema from function signature ────────────────────────── @@ -1017,14 +1261,25 @@ def _fn_to_schema(name: str, fn) -> dict: } -def get_tool_list() -> list[dict]: - """Return tool schemas for the LLM.""" - return [_fn_to_schema(name, t["fn"]) for name, t in TOOLS.items()] +async def get_tool_list() -> 
list[dict]: + """Return tool schemas for the LLM. + + Automatically includes browser tools when browser.enabled is true in config. + """ + tools = dict(TOOLS) + try: + from cptr.models import Config + + if (await Config.get("browser.enabled")) in (True, "true", "1"): + tools.update(BROWSER_TOOLS) + except Exception: + pass + return [_fn_to_schema(name, t["fn"]) for name, t in tools.items()] async def execute_tool(name: str, args: dict, __context__: dict) -> str: """Execute a tool by name, injecting execution context.""" - info = TOOLS.get(name) + info = TOOLS.get(name) or BROWSER_TOOLS.get(name) if not info: return f"Error: unknown tool: {name}" fn = info["fn"] diff --git a/pyproject.toml b/pyproject.toml index d9f00e4..fc5c4c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "cptr" -version = "0.3.0" +version = "0.3.1" description = "Your computer, from anywhere. Code, manage, and control your machine from the web." license = {file = "LICENSE"} readme = "README.md"