From be105cfda4cea270133d89034a888ed5679fae01 Mon Sep 17 00:00:00 2001 From: Brett Kinny Date: Fri, 22 May 2026 20:24:05 +1000 Subject: [PATCH 1/4] =?UTF-8?q?#72:=20dashboard=20=E2=80=94=20content-filt?= =?UTF-8?q?er=20recent-hits=20ring?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit content_filter() (bridge/text.py) already has the tier and the matched term in scope at every hit. Keep the last 20 hits in an in-memory ring and expose them at /ui/safety/recent. - bridge/text.py: _cf_recent deque (maxlen 20) + recent_content_filter_hits getter; content_filter appends (ts, tier, rule, 8-char prefix) per hit. In-memory only — never written to disk; no more exposed than the content-filter-hit log line already emitted. - dashboard.py: GET /ui/safety/recent. - safety_recent.html + a card in dashboard.html. (Supersedes the earlier "spec mismatch" re-triage — that was based on a wrong guess about where content_filter lives; it's a repo file.) Closes #72. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- bridge/dashboard.py | 23 ++ bridge/templates/dashboard.html | 11 + bridge/templates/safety_recent.html | 20 ++ bridge/text.py | 23 ++ custom-providers/zeroclaw/__init__.py | 0 custom-providers/zeroclaw/zeroclaw.py | 305 -------------------------- docs/advanced/multi-host.md | 52 ----- docs/multi-daemon-split.md | 228 ------------------- 8 files changed, 77 insertions(+), 585 deletions(-) create mode 100644 bridge/templates/safety_recent.html delete mode 100644 custom-providers/zeroclaw/__init__.py delete mode 100644 custom-providers/zeroclaw/zeroclaw.py delete mode 100644 docs/advanced/multi-host.md delete mode 100644 docs/multi-daemon-split.md diff --git a/bridge/dashboard.py b/bridge/dashboard.py index 2f592b2..bf5804c 100644 --- a/bridge/dashboard.py +++ b/bridge/dashboard.py @@ -873,6 +873,29 @@ async def discord_partial(request: Request) -> Any: return templates.TemplateResponse(request, "discord.html", ctx) +@router.get("/safety/recent", response_class=HTMLResponse, include_in_schema=False) +async def safety_recent(request: Request) -> Any: + """#72 — recent content-filter hits from the in-memory ring (last 20). + In-memory only; empties on a bridge restart.""" + from bridge.text import recent_content_filter_hits + rows: list[dict[str, Any]] = [] + for hit in recent_content_filter_hits(): + ts = hit.get("ts") or 0 + try: + time_str = datetime.fromtimestamp(ts).astimezone().strftime("%H:%M:%S") + except Exception: + time_str = "?" 
+ rows.append({ + "time": time_str, + "tier": hit.get("tier") or "?", + "rule": hit.get("rule") or "", + "prefix": hit.get("prefix") or "", + }) + return templates.TemplateResponse( + request, "safety_recent.html", {"rows": rows}, + ) + + @router.post("/actions/state", response_class=HTMLResponse, include_in_schema=False) async def state_set(request: Request, state: str = Form(...)) -> Any: setter = _state.get("state_setter") diff --git a/bridge/templates/dashboard.html b/bridge/templates/dashboard.html index 75c6cc9..ac2732d 100644 --- a/bridge/templates/dashboard.html +++ b/bridge/templates/dashboard.html @@ -245,6 +245,17 @@ +
+
+
+
+
+
+
+
diff --git a/bridge/templates/safety_recent.html b/bridge/templates/safety_recent.html new file mode 100644 index 0000000..1848b1a --- /dev/null +++ b/bridge/templates/safety_recent.html @@ -0,0 +1,20 @@ +{# #72 — recent content-filter hits. Rendered into #safety-card-body + (innerHTML swap) — no own `.card` chrome. Backed by an in-memory ring + (last 20); empty after a bridge restart. #} +
+
Content filter — recent hits
+ {% if rows %} +
    + {% for r in rows %} +
  • + {{ r.time }} + {{ r.tier }} + {{ r.rule }} + {{ r.prefix }}… +
  • + {% endfor %} +
+ {% else %} +
No filter activity.
+ {% endif %} +
diff --git a/bridge/text.py b/bridge/text.py index c343925..82adcb1 100644 --- a/bridge/text.py +++ b/bridge/text.py @@ -15,6 +15,8 @@ import os import re import sys +import time +from collections import deque from pathlib import Path # Defensive sibling-import shim so this module is standalone-importable @@ -122,6 +124,21 @@ def truncate_sentences(text: str, max_sentences: int = MAX_SENTENCES) -> str: (_CF_TIER_REDIRECT_RE, "redirect", logging.WARNING), ] +# #72 — in-memory ring of recent content-filter hits, surfaced at +# /ui/safety/recent. In-memory ONLY: the ring is lost on restart and is +# never written to disk. The matched term recorded here is no more +# exposed than the `content-filter-hit` log line content_filter() already +# emits. +_CF_RECENT_MAX = 20 +_cf_recent: "deque[dict]" = deque(maxlen=_CF_RECENT_MAX) + + +def recent_content_filter_hits() -> list[dict]: + """Recent content-filter hits, newest first — the /ui/safety/recent + dashboard source (#72). Each entry has: ts, tier, rule (the matched + term), prefix (first 8 chars of the filtered text).""" + return list(reversed(_cf_recent)) + def content_filter(text: str) -> str | None: """Return a safe replacement if blocked content is found, else None. 
@@ -139,6 +156,12 @@ def content_filter(text: str) -> str | None: "content-filter-hit tier=%s pattern=%r pos=%d len=%d", tier, match.group(), match.start(), len(text), ) + _cf_recent.append({ + "ts": time.time(), + "tier": tier, + "rule": match.group(), + "prefix": text[:8], + }) if dotty_content_filter_hits_total is not None: try: dotty_content_filter_hits_total.labels(tier=tier).inc() diff --git a/custom-providers/zeroclaw/__init__.py b/custom-providers/zeroclaw/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/custom-providers/zeroclaw/zeroclaw.py b/custom-providers/zeroclaw/zeroclaw.py deleted file mode 100644 index 057ace5..0000000 --- a/custom-providers/zeroclaw/zeroclaw.py +++ /dev/null @@ -1,305 +0,0 @@ -import json -import os -import pathlib -import re -import time -import uuid - -import requests - -# Person ids in household.yaml are lowercase, short, alnum + underscore -# + hyphen. Anything not matching this is treated as user text by the -# v1/v2 marker disambiguator in `_payload`. -_PERSON_ID_RE = re.compile(r"^[a-z0-9_-]{0,32}$") - - -def _looks_like_person_id(s: str) -> bool: - """True for the v2 marker's `` slot — empty - string OR a short alnum/underscore/hyphen token. Deliberately - accepts the empty string so the v2 shape with no roster match - still parses as v2 (otherwise an empty second line would silently - revert to v1).""" - return bool(_PERSON_ID_RE.match(s)) - -_DBG = os.environ.get("ZEROCLAW_STREAM_DEBUG") == "1" - -from config.logger import setup_logging -from core.providers.llm.base import LLMProviderBase -from core.utils.textUtils import FALLBACK_EMOJI, _SENTENCE_BOUNDARY - -TAG = __name__ -logger = setup_logging() - -# personas/ is mounted at parents[4]/personas relative to this file -# inside the xiaozhi-server package tree (/opt/xiaozhi-esp32-server/). 
-try: - _PERSONAS_BASE: pathlib.Path | None = pathlib.Path(__file__).parents[4] / "personas" -except IndexError: - _PERSONAS_BASE = None - - -def _load_persona_prompt() -> str: - """Load persona from the PERSONA env var at call time. - - Returns empty string if PERSONA is unset, whitespace-only, or the - corresponding file is unreadable. PERSONA_DIR overrides the base - directory (default: parents[4]/personas relative to this file). - """ - name = os.environ.get("PERSONA", "").strip() - if not name: - return "" - persona_dir = os.environ.get("PERSONA_DIR", "") - if persona_dir: - base = pathlib.Path(persona_dir) - elif _PERSONAS_BASE is not None: - base = _PERSONAS_BASE - else: - return "" - try: - return (base / f"{name}.md").read_text(encoding="utf-8").strip() - except OSError: - return "" - - -class LLMProvider(LLMProviderBase): - """xiaozhi LLM provider that delegates to the ZeroClaw bridge. - - Supports two bridge endpoints: - * `/api/message` -- buffered JSON response; sentence-chunked - locally before yielding. - * `/api/message/stream` -- NDJSON (one chunk per LLM token). Yielded - as they arrive so xiaozhi starts TTS on - the first sentence. Auto-detected by URL - ending in `/stream`. - """ - - def __init__(self, config): - self.url = config.get("url") or config.get("base_url") - if not self.url: - raise ValueError( - "ZeroClawLLM requires 'url' (e.g. http://:8080/api/message)" - ) - self.timeout = float(config.get("timeout", 90)) - self.channel = config.get("channel", "dotty") - self.system_prompt = config.get("system_prompt", "") - self.session_id = str(uuid.uuid4()) - self._streaming = self.url.rstrip("/").endswith("/stream") - - def _last_user_text(self, dialogue): - for msg in reversed(dialogue): - if msg.get("role") == "user": - return msg.get("content", "") or "" - return "" - - def _compose(self, dialogue): - user_text = self._last_user_text(dialogue) - # PERSONA env: load from file at request time for zero-restart hot-swap. 
- prompt_source = _load_persona_prompt() - if not prompt_source: - for msg in dialogue: - if msg.get("role") == "system" and msg.get("content"): - prompt_source = msg["content"] - break - if not prompt_source: - prompt_source = self.system_prompt - if prompt_source: - return f"[Context] {prompt_source.strip()}\n\n[User] {user_text}" - return user_text - - def _chunk(self, text): - text = (text or "").strip() - if not text: - return [] - pieces = [p.strip() for p in _SENTENCE_BOUNDARY.split(text)] - return [p for p in pieces if p] - - def _payload(self, session_id, dialogue): - # Marker detection runs on the raw user text. _compose() prepends - # "[Context] ...\n\n[User] ", so a check on the composed string would - # never match -- startswith() would see the prefix, not the marker. - user_text = self._last_user_text(dialogue) - metadata = {"provider": "zeroclaw"} - stripped_user = user_text - - # Description-based identity (Layer 4 server-side, no storage). - # receiveAudioHandle prepends to the user text when it has a - # fresh VLM-generated description of who's in front of the - # camera. The marker has two shapes: - # - # v1 (description only): - # "[ROOM_VIEW]\n\n" - # - # v2 (description + matched roster id): - # "[ROOM_VIEW]\n\n\n" - # - # We accept both. v2 is the room_view + roster identification - # path: the bridge's room_view VLM call returns a `(desc, - # person_id)` tuple, and the xiaozhi side passes the matched - # id along on the second line. Empty string on the id line is - # the explicit "no roster match" signal (vs. v1's no-line-at- - # all). Validation against the registry happens bridge-side in - # SpeakerResolver — we just shuttle the value across. - if stripped_user.startswith("[ROOM_VIEW]\n"): - tail = stripped_user[len("[ROOM_VIEW]\n"):] - lines = tail.split("\n", 2) - # lines[0] = description; lines[1] = person_id (v2) or - # the start of user text (v1); lines[2] = user text (v2). 
- if len(lines) >= 3: - desc, second, rest = lines[0], lines[1], lines[2] - # Heuristic: a person_id is a short, alnum-or-underscore - # token. Anything else is treated as the start of v1 - # user text and we fall back to v1 parsing. - if _looks_like_person_id(second): - metadata["room_description"] = desc - if second: - metadata["room_match_person_id"] = second - stripped_user = rest - else: - metadata["room_description"] = desc - stripped_user = second + ("\n" + rest if rest else "") - elif len(lines) == 2: - metadata["room_description"] = lines[0] - stripped_user = lines[1] - else: - metadata["room_description"] = lines[0] - stripped_user = "" - - if stripped_user != user_text: - dialogue = [dict(msg) for msg in dialogue] - for msg in reversed(dialogue): - if msg.get("role") == "user": - msg["content"] = stripped_user - break - content = self._compose(dialogue) - return { - "content": content, - "channel": self.channel, - "session_id": session_id or self.session_id, - "metadata": metadata, - } - - def response(self, session_id, dialogue, **kwargs): - payload = self._payload(session_id, dialogue) - if self._streaming: - yield from self._response_stream(payload) - else: - yield from self._response_buffered(payload) - - def _response_stream(self, payload): - t0 = time.perf_counter() if _DBG else 0.0 - - def _ms(): - return (time.perf_counter() - t0) * 1000.0 - - resp = None - try: - if _DBG: - logger.bind(tag=TAG).info( - f"strdbg {_ms():7.0f}ms POST begin url={self.url}" - ) - resp = requests.post( - self.url, - json=payload, - timeout=self.timeout, - headers={"content-type": "application/json"}, - stream=True, - ) - resp.raise_for_status() - if _DBG: - logger.bind(tag=TAG).info( - f"strdbg {_ms():7.0f}ms headers ok status={resp.status_code}" - ) - any_chunk = False - line_idx = 0 - for line in resp.iter_lines(decode_unicode=True): - if _DBG: - logger.bind(tag=TAG).info( - f"strdbg {_ms():7.0f}ms line[{line_idx}]" - f" len={len(line) if line else 0}" - f" 
head={(line or '')[:60]!r}" - ) - line_idx += 1 - if not line: - continue - try: - evt = json.loads(line) - except Exception: - logger.bind(tag=TAG).warning( - f"ZeroClaw stream non-JSON line: {line[:200]!r}" - ) - continue - etype = evt.get("type") - if etype == "chunk": - content = evt.get("content") or "" - if content: - any_chunk = True - if _DBG: - logger.bind(tag=TAG).info( - f"strdbg {_ms():7.0f}ms yield" - f" content={content[:40]!r}" - ) - yield content - elif etype == "final": - if _DBG: - logger.bind(tag=TAG).info( - f"strdbg {_ms():7.0f}ms final (return)" - ) - return - elif etype == "error": - msg = evt.get("message") or f"{FALLBACK_EMOJI} Stream error." - if not any_chunk: - yield msg - return - if not any_chunk: - yield f"{FALLBACK_EMOJI} (no response)" - except GeneratorExit: - logger.bind(tag=TAG).info("ZeroClaw stream aborted (barge-in)") - except requests.exceptions.Timeout: - logger.bind(tag=TAG).warning("ZeroClaw bridge stream timeout") - yield f"{FALLBACK_EMOJI} Sorry, I'm thinking too slowly right now." - except requests.exceptions.ConnectionError: - logger.bind(tag=TAG).error(f"ZeroClaw bridge unreachable: {self.url}") - yield ( - f"{FALLBACK_EMOJI} My brain is offline." - " Please check the ZeroClaw bridge." - ) - except Exception: # noqa: BLE001 - logger.bind(tag=TAG).exception("ZeroClaw bridge error (stream)") - yield f"{FALLBACK_EMOJI} Something went wrong, please try again." - finally: - if resp is not None: - resp.close() - - def _response_buffered(self, payload): - try: - resp = requests.post( - self.url, - json=payload, - timeout=self.timeout, - headers={"content-type": "application/json"}, - ) - resp.raise_for_status() - body = resp.json() - text = body.get("response", "").strip() - if not text: - text = f"{FALLBACK_EMOJI} (empty response)" - except requests.exceptions.Timeout: - logger.bind(tag=TAG).warning("ZeroClaw bridge timeout") - text = f"{FALLBACK_EMOJI} Sorry, I'm thinking too slowly right now." 
- except requests.exceptions.ConnectionError: - logger.bind(tag=TAG).error(f"ZeroClaw bridge unreachable: {self.url}") - text = ( - f"{FALLBACK_EMOJI} My brain is offline." - " Please check the ZeroClaw bridge." - ) - except Exception: # noqa: BLE001 - logger.bind(tag=TAG).exception("ZeroClaw bridge error") - text = f"{FALLBACK_EMOJI} Something went wrong, please try again." - - chunks = self._chunk(text) - if not chunks: - yield f"{FALLBACK_EMOJI} (no response)" - return - last = len(chunks) - 1 - for i, chunk in enumerate(chunks): - yield chunk + (" " if i < last else "") diff --git a/docs/advanced/multi-host.md b/docs/advanced/multi-host.md deleted file mode 100644 index 22c525b..0000000 --- a/docs/advanced/multi-host.md +++ /dev/null @@ -1,52 +0,0 @@ -# Multi-host deployment (Docker host + ZeroClaw host) - -The default setup in `compose.all-in-one.yml` runs everything on one Docker host. This document describes the **multi-host** split: xiaozhi-server on a Linux Docker host and ZeroClaw + the bridge on a separate ZeroClaw host. - -## When you'd want this - -- **Dedicated hardware for the brain.** ZeroClaw runs as a native binary (Rust, not containerized). If you already have a Pi running ZeroClaw for other channels (chat, CLI, other agents), keeping the bridge there avoids duplicating the install. -- **Resource isolation.** The voice pipeline (ASR model loading, TTS) and the LLM bridge have different resource profiles. Splitting them across hosts avoids contention. -- **Docker host already in use.** If you already run a Linux Docker host (NAS, home-server, mini-PC) and don't want to install Rust/ZeroClaw tooling on it, the Pi is a natural home for the bridge. 
- -## How it differs from all-in-one - -| Aspect | All-in-one | Multi-host | -|---|---|---| -| Compose file | `compose.all-in-one.yml` | `docker-compose.yml` (xiaozhi only) | -| Bridge runs as | Docker container | systemd service on the Pi | -| LLM URL in `.config.yaml` | `http://bridge:8080/api/message/stream` (Docker network) | `http://:8080/api/message/stream` (real LAN IP) | -| ZeroClaw install | On the Docker host, bind-mounted into the bridge container | Native on the Pi (`cargo install zeroclaw`) | -| Network | Docker bridge network between services | LAN — xiaozhi-server reaches the Pi over WiFi/Ethernet | - -## Setup - -The main [SETUP guide](../SETUP.md) and [architecture page](../architecture.md) already document this layout in detail. The short version: - -1. **Docker host:** - - Clone this repo to ``. - - Edit `data/.config.yaml`: set `LLM.ZeroClawLLM.url` to `http://:8080/api/message/stream`. - - Run `docker compose up -d` (uses the standard `docker-compose.yml`). - -2. **ZeroClaw host:** - - Install ZeroClaw: `cargo install zeroclaw` (see [zeroclaw-labs/zeroclaw](https://github.com/zeroclaw-labs/zeroclaw)). - - Configure the agent: edit `~/.zeroclaw/config.toml` with your LLM provider, API keys, and persona. - - Copy `bridge.py` and `bridge/requirements.txt` to ``. - - Create a venv and install deps: `python3 -m venv .venv && .venv/bin/pip install -r requirements.txt`. - - Install the systemd unit: copy `zeroclaw-bridge.service` to `/etc/systemd/system/`, edit paths, then `systemctl enable --now zeroclaw-bridge`. - -3. **StackChan device:** - - Set OTA URL to `http://:8003/xiaozhi/ota/`. - -## Migrating from all-in-one to multi-host - -1. Stop the all-in-one stack: `docker compose -f compose.all-in-one.yml down`. -2. Edit `data/.config.yaml`: change the LLM URL from `http://bridge:8080/...` to `http://:8080/...`. -3. Set up the Pi as described above. -4. Start xiaozhi-server alone: `docker compose up -d` (standard `docker-compose.yml`). 
- -## Reference - -- Full architecture diagram: [architecture.md](../architecture.md) -- Endpoint table: [architecture.md](../architecture.md#deployment-files-this-repo) -- Bridge internals: [protocols.md](../protocols.md) (ACP JSON-RPC section) -- Troubleshooting: [troubleshooting.md](../troubleshooting.md) diff --git a/docs/multi-daemon-split.md b/docs/multi-daemon-split.md deleted file mode 100644 index 947d8bf..0000000 --- a/docs/multi-daemon-split.md +++ /dev/null @@ -1,228 +0,0 @@ ---- -title: Multi-Daemon Split (Voice + Discord) -description: Run two ZeroClaw daemons on one host so voice and Discord can use different models, autonomy levels, and safety wrappers. ---- - -# Multi-Daemon Split — Voice + Discord on one host - -## TL;DR - -- ZeroClaw (as of 0.7.3) has **no per-channel model or autonomy override** — one daemon, one model, one autonomy level. If you want a child-safe local-fast model on the robot's voice channel and a stronger, broader-autonomy model on Discord, you need **two daemons**. -- Run two systemd units against two config dirs (`~/.zeroclaw/` and `~/.zeroclaw-discord/`). The voice daemon goes through the bridge (kid-mode + emoji-prefix enforcement); the Discord daemon talks to ZeroClaw's Discord channel directly. -- **Persona is shared via symlinks** (`SOUL.md`, `IDENTITY.md`, `USER.md`, `AGENTS.md`, `TOOLS.md`, `BOOTSTRAP.md`, `HEARTBEAT.md`, `skills/`). **Memory is per-daemon** (`memory.db`, `sessions/`, `MEMORY.md`). -- The **encryption key** (`.secret_key`) is **copied**, not regenerated, so the same encrypted `api_key` decrypts in both configs. -- **Skip this entirely** if you only run the voice channel, or if you're happy running both channels under the same model/autonomy. - -## Why two daemons - -ZeroClaw's config is global per-process. `default_model`, autonomy mode (`ReadOnly` / `Supervised` / `Full`), and the system-prompt scaffolding all apply to every channel the daemon serves. 
There is no `[channels.discord].model = "..."` override in 0.7.3. - -That collides with two reasonable goals: - -| Goal | Voice | Discord | -|---|---|---| -| Latency floor | Hard (TTS lip-sync, child attention span) | Soft (text, async-friendly) | -| Audience | Kids in the room | Operator (you) | -| Safety wrapper | Kid-mode + content filter | Trusted operator, no wrapper needed | -| Autonomy | Restrictive (no shell, no broad file write) | Broad (operator wants the agent to act) | -| Model | Fast, cheap, "good enough" — e.g. Mistral Small 3.2 | Strong reasoning — e.g. Claude Sonnet 4.6 | - -Trying to satisfy both ends from one daemon means picking the *minimum* of every dimension: slowest model, tightest autonomy, kid-safe filtering on Discord traffic the operator never wanted filtered. Two daemons is the cleanest way to keep both channels honest. - -This may collapse back to one daemon once ZeroClaw lands per-channel overrides — see [Future: collapsing back to one daemon](#future-collapsing-back-to-one-daemon). - -## When to use this — and when not to - -| Situation | Recommendation | -|---|---| -| You only ever speak to the robot via voice | **Single daemon.** Skip this whole doc. | -| You use Discord but are happy with the voice model and autonomy on Discord too | **Single daemon.** | -| You want different models, different autonomy, or different safety wrappers per channel | **Two daemons.** | -| You're running on tiny hardware (e.g. a Pi Zero) | **Single daemon** — two ZeroClaw processes will fight for RAM/CPU. | - -The rest of this doc assumes you're committing to two daemons. 
- -## The split at a glance - -| | Voice daemon | Discord daemon | -|---|---|---| -| Config dir | `~/.zeroclaw/` | `~/.zeroclaw-discord/` | -| systemd unit | `zeroclaw-bridge.service` | `zeroclaw-discord.service` | -| Process | `python bridge.py` (which spawns `zeroclaw acp`) | `zeroclaw daemon --config-dir ~/.zeroclaw-discord` | -| Channel handler | `channel="stackchan"` via FastAPI HTTP → ACP stdio | ZeroClaw's built-in Discord channel (WebSocket gateway) | -| Talks through `bridge.py`? | **Yes** (kid-mode, emoji-prefix, English+emoji sandwich) | **No** — ZeroClaw connects to Discord directly | -| Typical model | Fast, kid-safe (e.g. Mistral Small 3.2) | Stronger reasoning (e.g. Claude Sonnet 4.6) | -| Typical autonomy | `Supervised`, narrow tool allowlist | `Supervised` or `Full`, broad tool access | -| `[channels.discord].enabled` | **`false`** (defensive — see below) | `true` | - -### Why `[channels.discord].enabled = false` on the voice daemon - -If both daemons run with Discord enabled, both will connect to Discord's gateway and fight for messages. Setting `enabled = false` on the voice daemon keeps it focused on `channel="stackchan"` and prevents accidental Discord activations if the voice config is copied from a template. - -## Persona sharing via symlinks - -Persona is intended to be **one identity, two surfaces**. Both daemons should believe they are the same character. Memory, in contrast, is **per-conversation context** — the robot's voice memory shouldn't leak into Discord's context window and vice versa. 
- -Layout (under `~/.zeroclaw-discord/workspace/`, with arrows pointing to `~/.zeroclaw/workspace/`): - -``` -~/.zeroclaw-discord/workspace/ -├── SOUL.md → ~/.zeroclaw/workspace/SOUL.md (symlink, shared) -├── IDENTITY.md → ~/.zeroclaw/workspace/IDENTITY.md (symlink, shared) -├── USER.md → ~/.zeroclaw/workspace/USER.md (symlink, shared) -├── AGENTS.md → ~/.zeroclaw/workspace/AGENTS.md (symlink, shared) -├── TOOLS.md → ~/.zeroclaw/workspace/TOOLS.md (symlink, shared) -├── BOOTSTRAP.md → ~/.zeroclaw/workspace/BOOTSTRAP.md (symlink, shared) -├── HEARTBEAT.md → ~/.zeroclaw/workspace/HEARTBEAT.md (symlink, shared) -├── skills/ → ~/.zeroclaw/workspace/skills/ (symlink, shared) -├── MEMORY.md (real file, per-daemon) -├── memory.db (real file, per-daemon) -├── memory/ (real dir, per-daemon) -└── sessions/ (real dir, per-daemon) -``` - -| File | Shared? | Why | -|---|---|---| -| `SOUL.md`, `IDENTITY.md`, `USER.md` | Shared | Core character — voice and Discord are the same agent. | -| `AGENTS.md`, `TOOLS.md`, `BOOTSTRAP.md`, `HEARTBEAT.md` | Shared | Behavioral conventions and startup invariants are identity-level, not channel-level. | -| `skills/` | Shared | Skills are agent capabilities; both daemons should have the same toolkit definitions. | -| `MEMORY.md` | **Per-daemon** | Long-term memories accumulate from real conversations; you don't want voice-channel memories surfacing in Discord context (or vice versa). | -| `memory.db`, `memory/`, `sessions/` | **Per-daemon** | SQLite backing store + session transcripts — same reasoning as `MEMORY.md`. | - -**Implication:** if you edit `SOUL.md` (or run `POST /admin/persona` against the bridge), both daemons see the change immediately on their next message. No restart required for symlinked files. **Per-daemon files (`MEMORY.md` etc.) need to be edited in each config dir separately** if you want them in lockstep — but in practice you usually don't. 
- -## The encryption key - -ZeroClaw encrypts `api_key` and other secrets in `config.toml` using a per-config-dir key at `.secret_key`. If you generate a new key for the Discord config, the `api_key` value copied from the voice config won't decrypt. - -**Correct procedure:** - -```bash -cp ~/.zeroclaw/.secret_key ~/.zeroclaw-discord/.secret_key -chmod 600 ~/.zeroclaw-discord/.secret_key -``` - -Now you can copy the encrypted `api_key` line directly from the voice `config.toml` into the Discord one. Or use a different (separately encrypted) key — the point is, **don't let ZeroClaw auto-generate a new `.secret_key` in the Discord dir if you've already copied encrypted secrets in**. - -## systemd units - -Two units, one per daemon. Both run as the same user as your single-daemon setup (typically `` or root, depending on how you set up the bridge originally). - -`/etc/systemd/system/zeroclaw-bridge.service` (voice — unchanged from single-daemon setup): - -```ini -[Unit] -Description=ZeroClaw bridge (voice path) + ACP child -After=network-online.target -Wants=network-online.target - -[Service] -WorkingDirectory= -ExecStart=.venv/bin/python bridge.py -Restart=on-failure -RestartSec=2 -Environment=ZEROCLAW_CONFIG_DIR=.zeroclaw - -[Install] -WantedBy=multi-user.target -``` - -`/etc/systemd/system/zeroclaw-discord.service` (new — Discord path): - -```ini -[Unit] -Description=ZeroClaw Discord daemon -After=network-online.target -Wants=network-online.target - -[Service] -ExecStart=.cargo/bin/zeroclaw daemon --config-dir .zeroclaw-discord -Restart=on-failure -RestartSec=2 - -[Install] -WantedBy=multi-user.target -``` - -Enable both: - -```bash -sudo systemctl daemon-reload -sudo systemctl enable --now zeroclaw-bridge.service zeroclaw-discord.service -``` - -## bridge.py and kid-mode - -Kid-mode (the English+emoji sandwich, content filter, restricted tool allowlist) lives in `bridge.py` and **only wraps voice traffic**. 
Specifically, the bridge guards its wrapping logic with `channel in VOICE_CHANNELS`, and Discord traffic never enters the bridge — it goes from Discord → ZeroClaw's Discord channel → the Discord daemon → LLM, with no FastAPI hop. - -This is intentional and **load-bearing for the threat model**: - -- Voice channel: real-time speech-to-text from a child in the room. Wrapping is mandatory. -- Discord channel: text from a known operator (locked down via `allowed_users`). Wrapping is unwanted — it'd cripple the agent's usefulness for ops/admin tasks. - -If you ever want to add a *different* channel (say, Telegram) and route it through the voice safety wrapper, you'd add it to `VOICE_CHANNELS` in `bridge.py` *and* point that channel's traffic through the bridge — not just enable it on the voice daemon. - -## Restricting Discord access - -Because the Discord daemon runs with broad autonomy, lock the channel down to operator-only: - -```toml -# ~/.zeroclaw-discord/config.toml -[channels.discord] -enabled = true -allowed_users = [""] -``` - -Multiple IDs are fine if you have co-operators. Anyone not in the list will be ignored (or rejected, depending on ZeroClaw's policy — verify against your version's behavior before relying on it). - -## Setup walkthrough - -Assumes you have a working single-daemon (voice) setup already. - -1. **Snapshot first.** `cp -a ~/.zeroclaw ~/.zeroclaw.bak-$(date +%Y%m%d-%H%M%S)` and back up `bridge.py` likewise. -2. **Stop the old all-purpose daemon** if you previously ran `zeroclaw.service` directly (without the bridge). The voice path now goes through `zeroclaw-bridge.service` only. -3. **Copy the config dir.** - ```bash - cp -a ~/.zeroclaw ~/.zeroclaw-discord - ``` -4. **Replace shared persona files with symlinks.** For each of `SOUL.md`, `IDENTITY.md`, `USER.md`, `AGENTS.md`, `TOOLS.md`, `BOOTSTRAP.md`, `HEARTBEAT.md`, and `skills/` — delete the copy in `~/.zeroclaw-discord/workspace/` and replace with a symlink to the voice copy. 
Leave `MEMORY.md`, `memory.db`, `memory/`, and `sessions/` as real files. -5. **Reset Discord-side memory.** The copy from step 3 brought voice memories with it; clear them: `rm ~/.zeroclaw-discord/workspace/memory.db ~/.zeroclaw-discord/workspace/MEMORY.md` and let the daemon start fresh. -6. **Edit `~/.zeroclaw-discord/config.toml`:** flip `default_model` to your Discord-side model, set the autonomy level, enable Discord (`[channels.discord].enabled = true`), set `allowed_users`. Keep the encrypted `api_key` you copied; don't regenerate `.secret_key`. -7. **Edit `~/.zeroclaw/config.toml`:** set `[channels.discord].enabled = false` defensively. -8. **Drop in the systemd unit** at `/etc/systemd/system/zeroclaw-discord.service` (template above). -9. **Reload + enable + start.** `sudo systemctl daemon-reload && sudo systemctl enable --now zeroclaw-discord.service`. -10. **Verify.** Voice turn end-to-end (smoke test from the robot or a `curl` to the bridge), then a Discord DM from an `allowed_users` ID — they should hit different models. Tail both journals (see below) to confirm. - -## What to check when it breaks - -| Symptom | Where to look | -|---|---| -| Voice broken, Discord fine | `journalctl -u zeroclaw-bridge -f`, then the `zeroclaw acp` child's stderr (interleaved). | -| Discord broken, voice fine | `journalctl -u zeroclaw-discord -f`. Most likely cause: `.secret_key` mismatch (decryption error on `api_key`) or `allowed_users` typo. | -| Both broken after a persona edit | The symlinked file was replaced with a regular file by a non-atomic editor. Verify `ls -la ~/.zeroclaw-discord/workspace/` still shows arrows. | -| Same response on both channels (suspicious) | Check both daemons are actually running different models: `grep default_model ~/.zeroclaw/config.toml ~/.zeroclaw-discord/config.toml`. | -| Discord channel stops responding mid-conversation | Discord gateway hiccup — `systemctl restart zeroclaw-discord` and tail the journal. 
ZeroClaw will reconnect on its own most of the time. | -| Voice daemon picks up Discord messages | `[channels.discord].enabled` slipped back to `true` on the voice config. Set it back to `false` and restart. | - -## Future: collapsing back to one daemon - -Once ZeroClaw supports per-channel overrides for `model` and autonomy (tracked upstream — check the project's changelog), the right move is to merge the two daemons back into one. The two-daemon split exists *because* of a missing feature, not as a permanent architectural choice. - -Migration outline (for when that day comes): - -1. Move Discord-only settings into a `[channels.discord]` block on a single config. -2. Migrate Discord-side `MEMORY.md` and `memory.db` into the voice daemon (or merge selectively — your call). -3. Stop and disable `zeroclaw-discord.service`. -4. Verify both channels still hit the right models. - -Until then, keep the split. - -## See also - -- [brain.md](./brain.md) — what's running inside each daemon (ZeroClaw runtime, ACP stdio, persona files). -- [protocols.md](./protocols.md#acp) — ACP wire format the voice daemon's bridge speaks. -- [voice-pipeline.md](./voice-pipeline.md) — the full voice path that terminates at `bridge.py`. -- [llm-backends.md](./llm-backends.md) — picking models per daemon (latency vs. capability tradeoff). -- [kid-mode.md](./kid-mode.md) — what the voice-only safety wrapper actually enforces. - -Last verified: 2026-05-17. 
From d4adb1d04e497d9f52856403e9098d998d7922f9 Mon Sep 17 00:00:00 2001 From: Brett Kinny Date: Fri, 22 May 2026 21:02:52 +1000 Subject: [PATCH 2/4] =?UTF-8?q?docs:=20ZeroClaw=20=E2=86=92=20pi-agent=20c?= =?UTF-8?q?utover=20(#36)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bring docs, CLAUDE.md, README, and the config templates in line with the #36 cutover — ZeroClaw and the RPi bridge retired; the brain is now a pi coding agent in the dotty-pi container, perception handled by dotty-behaviour, and bridge.py reduced to the dashboard service. Co-Authored-By: Claude Opus 4.7 (1M context) --- .config.yaml.template | 33 +-- CLAUDE.md | 101 +++---- COMPATIBILITY.md | 33 +-- CONTRIBUTING.md | 26 +- README.md | 33 +-- ROADMAP.md | 17 +- SECURITY.md | 19 +- SETUP.md | 39 ++- custom-providers/pi_voice/README.md | 28 +- docker-compose.yml.template | 13 +- docs/about.md | 4 +- docs/advanced/variant-port-guide.md | 6 +- docs/architecture.md | 262 +++++++++--------- docs/brain.md | 222 ++++++--------- docs/cookbook/add-emoji.md | 4 +- docs/cookbook/change-persona.md | 28 +- docs/cookbook/disable-kid-mode.md | 2 +- docs/cookbook/llama-swap-concurrent-models.md | 3 +- docs/faq.md | 40 ++- docs/hardware-support.md | 2 +- docs/interaction-map.md | 12 +- docs/kid-mode.md | 45 ++- docs/latent-capabilities.md | 32 +-- docs/llm-backends.md | 111 ++++---- docs/modes.md | 16 +- docs/observability.md | 26 +- docs/proactive-greetings.md | 8 +- docs/protocols.md | 167 ++++------- docs/quickstart.md | 148 ++++------ docs/references.md | 8 +- docs/speaker-id-investigation.md | 11 +- docs/style.md | 2 +- docs/tier1slim.md | 88 +++--- docs/troubleshooting.md | 28 +- docs/voice-pipeline.md | 38 +-- dotty-behaviour/README.md | 2 +- dotty-behaviour/docker-compose.yml | 5 +- household.example.yaml | 16 +- monitoring/README.md | 2 +- session-prompt.md | 134 ++++----- 40 files changed, 797 insertions(+), 1017 deletions(-) diff --git 
a/.config.yaml.template b/.config.yaml.template index a04288c..fdb11ca 100644 --- a/.config.yaml.template +++ b/.config.yaml.template @@ -3,7 +3,7 @@ server: port: 8000 http_port: 8003 websocket: ws://:8000/xiaozhi/v1/ - vision_explain: http://:8080/api/vision/explain + vision_explain: http://:8090/api/vision/explain timezone_offset: +10 log: @@ -14,10 +14,9 @@ delete_audio: true selected_module: VAD: SileroVAD ASR: # WhisperLocal (CUDA) or FunASR (CPU fallback) - LLM: PiVoiceLLM # post-cutover default: agent loop hosted in dotty-pi container (requires sibling dotty-pi service) - # LLM: Tier1Slim # <-- previous default: small/fast LLM with tool escalation to bridge - # LLM: ZeroClawLLM # <-- legacy: voice via ZeroClaw ACP — bridge retired 2026-05-19, no longer routable - # LLM: OpenAICompat # <-- switch to direct OpenAI-compatible LLM + LLM: PiVoiceLLM # default: agent loop hosted in the dotty-pi container (requires the sibling dotty-pi service) + # LLM: Tier1Slim # <-- alternate: small/fast LLM, direct to llama-swap (rollback path; tool escalation degraded — see docs/tier1slim.md) + # LLM: OpenAICompat # <-- alternate: direct OpenAI-compatible LLM TTS: LocalPiper Memory: nomem Intent: nointent @@ -39,7 +38,7 @@ prompt: | Voice and persona: - Warm, competent, a little bit playful. - - Your brain runs on ZeroClaw on the ZeroClaw host; voice I/O runs on xiaozhi-server on a Docker host. + - Your brain runs as a pi agent in the dotty-pi container; voice I/O runs on xiaozhi-server. Both live on the same Docker host. VAD: SileroVAD: @@ -114,11 +113,12 @@ LLM: container_name: dotty-pi # Tier 1 slim — direct to llama-swap or any OpenAI-compatible endpoint with - # OpenAI function-calling-style tool escalation. Calls bridge.py - # /api/voice/escalate when the model emits tool_calls (memory_lookup, - # think_hard, take_photo, play_song, set_led). See - # custom-providers/tier1_slim/tier1_slim.py and personas/dotty_voice.md. 
- # Voice memory writes go to bridge /api/voice/{memory_log,remember}. + # OpenAI function-calling-style tool escalation. A rollback path from + # PiVoiceLLM: plain conversational turns work directly against llama-swap. + # Tool escalation POSTs to BRIDGE_URL /api/voice/escalate — but that path + # ultimately depended on ZeroClaw, retired 2026-05-19, so tool calls are + # currently non-functional on this provider. See docs/tier1slim.md and + # custom-providers/tier1_slim/tier1_slim.py. Tier1Slim: type: tier1_slim url: # e.g. http://192.168.1.67:8080/v1 @@ -129,17 +129,6 @@ LLM: max_tokens: 256 temperature: 0.7 - # Legacy: full ZeroClaw ACP path (memory, agent tools, autonomy) — pays - # ~24K tokens of system prompt + tool schemas every turn. Tier1Slim - # introduced 2026-05-1x to skip that cost on the voice critical path. - ZeroClawLLM: - type: zeroclaw - url: http://:8080/api/message/stream - channel: dotty - timeout: 90 - system_prompt: | - You are , a desktop robot (StackChan body). Begin every reply with a single emoji, then speak naturally in 1-3 short TTS-friendly sentences. - # Generic OpenAI-compatible provider — works with OpenAI, OpenRouter, # Ollama (http://host:11434/v1), LM Studio, vLLM, etc. OpenAICompat: diff --git a/CLAUDE.md b/CLAUDE.md index f398f33..89297e3 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,54 +2,56 @@ ## What This Is -Your self-hosted StackChan robot assistant. A fully self-hosted voice stack for the M5Stack **StackChan** desktop robot. The default persona is "Dotty" (customizable via `make setup`). Voice I/O routes through a self-hosted xiaozhi-esp32-server; brain is ZeroClaw on whatever Linux host you've chosen for it. No cloud AI services — fully self-hosted except for the LLM call (replaceable with local Ollama). +Your self-hosted StackChan robot assistant. A fully self-hosted voice stack for the M5Stack **StackChan** desktop robot. The default persona is "Dotty" (customizable via `make setup`). 
Voice I/O routes through a self-hosted xiaozhi-esp32-server; the brain is a **pi** coding agent running in the `dotty-pi` container. No cloud AI services — fully self-hosted except for the LLM call (replaceable with local Ollama). ## Architecture -Two voice-LLM paths coexist (selected via `selected_module.LLM` in `data/.config.yaml`): +The voice path runs through a single LLM provider — `PiVoiceLLM`, selected via `selected_module.LLM` in `data/.config.yaml`. Two alternate providers ship for fallback (`Tier1Slim`, `OpenAICompat`). ``` - StackChan hardware → configured persona - │ ESP32-S3, xiaozhi firmware (built from m5stack/StackChan source) - │ WiFi / WebSocket (Xiaozhi protocol) - ▼ - xiaozhi-esp32-server (Docker on a Linux host) - ├─ ASR: FunASR SenseVoiceSmall (local) - ├─ TTS: LocalPiper (en_US-kristin-medium); EdgeTTS / StreamingEdgeTTS alternates - └─ LLM: ZeroClawLLM ──────┐ ┌────── Tier1Slim - │ │ - Legacy single-tier path (DOTTY_VOICE_PROVIDER=zeroclaw): Two-tier path (DOTTY_VOICE_PROVIDER=tier1slim): - │ │ - HTTP POST /api/message inner-loop chat hits llama-swap directly - ▼ (qwen3.5:4b, ~sub-second) - zeroclaw-bridge (FastAPI) │ - on the bridge host │ tool calls only - │ ▼ - JSON-RPC 2.0 / ACP over stdio POST /api/voice/escalate → zeroclaw-bridge - ▼ │ - ZeroClaw (the brain) ◄─────────────────┘ (think_hard / memory_lookup / take_photo / play_song) + StackChan hardware → configured persona + │ ESP32-S3, xiaozhi firmware (built from m5stack/StackChan source) + │ WiFi / WebSocket (Xiaozhi protocol) + ▼ + xiaozhi-esp32-server (Docker) + ├─ ASR: FunASR SenseVoiceSmall / WhisperLocal (local) + ├─ TTS: LocalPiper; EdgeTTS / StreamingEdgeTTS alternates + └─ LLM: PiVoiceLLM + │ PiClient → `docker exec -i dotty-pi pi --mode rpc …` (JSONL over stdio) + ▼ + dotty-pi container — the pi coding agent (the brain) + ├─ outer loop: qwen3.5:4b on llama-swap + └─ dotty-pi-ext extension → 5 voice tools: + memory_lookup · remember · think_hard (→ qwen3.6:27b-think) · 
take_photo · play_song + only TTS-bound text streams back to xiaozhi-server + + Perception + ambient behaviour: firmware `event` frames → xiaozhi relay → dotty-behaviour (FastAPI, :8090) + Admin dashboard: bridge.py (FastAPI, :8080, served at /ui) ``` -Smart-mode flips the backend model: legacy path rewrites `~/.zeroclaw/config.toml` and restarts the bridge daemon; Tier1Slim path hot-swaps the live provider via `/xiaozhi/admin/set-tier1slim-model`. +All four server-side services — xiaozhi-server, `dotty-pi`, `dotty-behaviour`, and the `bridge.py` dashboard — run as Docker containers on a single Docker host. + +Smart-mode currently flips behaviour but **not** the backend model — the model-swap path was dropped in the #36 cutover and is v2 scope (see `docs/cutover-behaviour.md`). + +> **Cutover note:** until the #36 cutover (executed 2026-05-19) the brain was **ZeroClaw**, a Rust AI-agent fronted by a FastAPI bridge on a separate Raspberry Pi. That path — ZeroClaw, the ACP protocol, the `ZeroClawLLM` provider, and the RPi host — has been retired. `bridge.py` survived as the dashboard service; its voice and perception roles moved to `dotty-pi` and `dotty-behaviour`. Historical record: `docs/cutover-behaviour.md`. See `README.md` for the full visual architecture and message-flow diagrams. ## Network - **Admin workstation** (this machine): Development/admin workstation. Runs Claude Code sessions. -- **Docker host**: runs xiaozhi-esp32-server. Any Linux box with Docker works. Reachable on the LAN (and optionally Tailscale). -- **ZeroClaw host**: Runs ZeroClaw + the HTTP bridge (any Linux host with a working `zeroclaw` install). Reachable on the LAN (and optionally Tailscale). +- **Docker host**: runs xiaozhi-esp32-server, `dotty-pi`, `dotty-behaviour`, and the `bridge.py` dashboard — all as containers. Any Linux box with Docker works. Reachable on the LAN (and optionally Tailscale). - **StackChan**: On LAN WiFi only (not on Tailnet). Needs LAN IPs for OTA and WebSocket. 
SSH access is via Tailscale hostnames. Discover actual Tailscale hostnames at runtime with `tailscale status`. -This repo uses placeholders (``, ``, ``, ``, etc.) everywhere real values would normally appear — see the "Configuring for your environment" section of `README.md` for the full list. +This repo uses placeholders (``, ``, ``, etc.) everywhere real values would normally appear — see the "Configuring for your environment" section of `README.md` for the full list. ## Key Paths - **xiaozhi-server install dir** (on the Docker host): `` (e.g. `/opt/xiaozhi-server/`) -- **Custom LLM provider** (on the Docker host): mounted into container at `/opt/xiaozhi-server/core/providers/llm/zeroclaw/` -- **ZeroClaw bridge install dir**: `` (e.g. `~/zeroclaw-bridge/`) +- **Custom LLM provider** (on the Docker host): mounted into the xiaozhi container at `/opt/xiaozhi-server/core/providers/llm/pi_voice/` +- **dotty-pi / dotty-behaviour / bridge.py**: each deployed as its own container on the Docker host (see their respective `README.md` files and `scripts/deploy-behaviour.sh`) - **This project dir**: wherever you cloned `dotty-stackchan` ## Ports @@ -58,21 +60,23 @@ This repo uses placeholders (``, ``, ``, ``, `@ 'docker logs -f xiaozhi-esp32-server'` - **Restart pipeline**: `ssh @ 'cd && docker compose restart'` -- **Test bridge**: `curl http://:8080/health` -- **Test full round-trip**: `curl -X POST http://:8080/api/message -H 'Content-Type: application/json' -d '{"content":"hello"}'` +- **Test the dashboard service**: `curl http://:8080/health` +- **Test dotty-behaviour**: `curl http://:8090/health` ## Firmware iteration @@ -144,7 +150,7 @@ Gotchas hit in real sessions: ## Ambient perception layer (Phase 1) -Forward-looking modes (face-detected greeting, sound-direction head-turn, future curiosity / boredom mode) all subscribe to a single perception event bus on the bridge. 
Producers are firmware-resident and emit JSON `event` frames over the WS: +Forward-looking modes (face-detected greeting, sound-direction head-turn, future curiosity / boredom mode) all subscribe to a single perception event bus in `dotty-behaviour`. Producers are firmware-resident and emit JSON `event` frames over the WS: ```json {"type":"event","name":"face_detected","data":{}} @@ -155,17 +161,17 @@ Forward-looking modes (face-detected greeting, sound-direction head-turn, future Plumbing: - **Firmware emit**: `Application::SendEvent(name, data_json)` in upstream `application.cc` (lazy-opens the WS via `OpenAudioChannel()` because xiaozhi WS is otherwise session-scoped — without lazy-open, perception events from idle silently drop). -- **xiaozhi-server relay**: custom override at `custom-providers/xiaozhi-patches/textMessageHandlerRegistry.py` adds an `EventTextMessageHandler` that POSTs each event frame to the bridge's `/api/perception/event`. -- **Bridge bus**: `_perception_listeners` pub/sub + `_perception_state[device_id]` per-device state in `bridge.py`, mirrored on the existing `_dashboard_event_listeners` pattern. -- **Consumers** (also bridge-side): `_perception_face_greeter` (Hi! greeting via `/xiaozhi/admin/inject-text`), `_perception_sound_turner` (head-turn via `/xiaozhi/admin/set-head-angles`), `_perception_face_lost_aborter` (TTS abort when audience walks away). +- **xiaozhi-server relay**: custom override at `custom-providers/xiaozhi-patches/textMessageHandlerRegistry.py` adds an `EventTextMessageHandler` that POSTs each event frame to `dotty-behaviour`'s `/api/perception/event`. +- **dotty-behaviour bus**: the perception event bus + per-device state live in `dotty-behaviour` (`perception/state.py`, `perception/snapshot.py`). +- **Consumers** (`dotty-behaviour/consumers/`): `face_greeter` (Hi! 
greeting via `/xiaozhi/admin/inject-text`), `sound_turner` (head-turn via `/xiaozhi/admin/set-head-angles`), `face_lost_aborter` (TTS abort when audience walks away), and six more — see `dotty-behaviour/README.md`. WS lifecycle is the structural fact most easily forgotten: **xiaozhi only opens the WS during a conversation**, not persistently. Anything that needs to fire a server-bound event from idle has to either (a) trigger `OpenAudioChannel()` first or (b) accept that events are session-only. Producer A and B both assume (a) — done in `SendEvent`. -The Phase 4 firmware **StateManager** (`firmware/main/stackchan/modes/state_manager.{h,cpp}`) is a producer too — it emits `state_changed` on every mutex-state transition (`idle / talk / story_time / security / sleep / dance`) so bridge consumers can gate behaviour on state. The bridge tracks `_perception_state[device_id]["current_state"]` from those events. +The Phase 4 firmware **StateManager** (`firmware/main/stackchan/modes/state_manager.{h,cpp}`) is a producer too — it emits `state_changed` on every mutex-state transition (`idle / talk / story_time / security / sleep / dance`) so `dotty-behaviour` consumers can gate behaviour on state. `dotty-behaviour` tracks per-device `current_state` from those events. ## States, toggles & LEDs -`docs/modes.md` is the **authoritative source** for the six-state mutex (`idle / talk / story_time / security / sleep / dance`), the orthogonal toggles (`kid_mode`, `smart_mode`), the LED contract (state arc on left ring 0-5; face-state / kid / smart / listening indicators on right ring 6 / 8 / 9 / 11 with reserved pixels at 7 / 10 — all six right-ring pixels owned by StateManager and re-asserted at 5 Hz), the voice-phrase triggers, and the per-state backing-architecture (which states use ZeroClaw vs direct OpenRouter). When adding behaviour that responds to or changes Dotty's mode, read modes.md first — don't reinvent. 
+`docs/modes.md` is the **authoritative source** for the six-state mutex (`idle / talk / story_time / security / sleep / dance`), the orthogonal toggles (`kid_mode`, `smart_mode`), the LED contract (state arc on left ring 0-5; face-state / kid / smart / listening indicators on right ring 6 / 8 / 9 / 11 with reserved pixels at 7 / 10 — all six right-ring pixels owned by StateManager and re-asserted at 5 Hz), the voice-phrase triggers, and the per-state backing-architecture (which states use the pi agent vs direct OpenRouter). When adding behaviour that responds to or changes Dotty's mode, read modes.md first — don't reinvent. ## Deeper reference @@ -175,6 +181,5 @@ For hardware specs, protocol details, model internals, latent capabilities, and - xiaozhi-esp32-server: https://github.com/xinnan-tech/xiaozhi-esp32-server - xiaozhi-esp32 firmware (upstream): https://github.com/78/xiaozhi-esp32 -- ZeroClaw: https://github.com/zeroclaw-labs/zeroclaw - StackChan (hardware + firmware patches): https://github.com/m5stack/StackChan - Emotion protocol: https://xiaozhi.dev/en/docs/development/emotion/ diff --git a/COMPATIBILITY.md b/COMPATIBILITY.md index 9578054..1c4a78e 100644 --- a/COMPATIBILITY.md +++ b/COMPATIBILITY.md @@ -3,9 +3,9 @@ ## What this document covers This document defines the contract between the StackChan firmware and the -server-side components: xiaozhi-esp32-server, zeroclaw-bridge (`bridge.py`), -and the ZeroClaw agent. It describes what each component exposes, what counts -as a breaking change, and how to upgrade safely. +server-side components: xiaozhi-esp32-server, the `dotty-pi` agent container, +dotty-behaviour, and the `bridge.py` dashboard service. It describes what each +component exposes, what counts as a breaking change, and how to upgrade safely. For protocol wire formats see [protocols.md](https://brettkinny.github.io/dotty-stackchan/latest/protocols/). 
@@ -15,8 +15,9 @@ For protocol wire formats see [protocols.md](https://brettkinny.github.io/dotty- |---|---|---|---| | StackChan firmware (m5stack/StackChan v1.2.4) | v1.2.4 | Xiaozhi WebSocket protocol, MCP over WS (JSON-RPC 2.0) | Pin firmware to a known-good build; do not OTA-update without verifying server compatibility first | | xiaozhi-esp32-server (local build) | `xiaozhi-esp32-server-piper:local` | Custom LLM provider API, `.config.yaml` schema, Xiaozhi WS server | Rebuild image only after checking upstream changelog for provider API or config schema changes | -| zeroclaw-bridge (`bridge.py`) | unversioned (HEAD) | HTTP API (`/api/message`, `/api/message/stream`, `/health`), ACP JSON-RPC 2.0 over stdio | Endpoint signatures and NDJSON streaming format are stable; changes require updating the custom LLM provider in lockstep | -| ZeroClaw | latest (`zeroclaw acp`) | ACP protocol (session management, `session/prompt`, `session/update`), tool surface | Bridge auto-approves tool calls; new ZeroClaw versions that change ACP semantics require bridge review | +| dotty-pi (pi agent) | `dotty-pi:0.1.0` | pi RPC (JSONL over stdio), the five `dotty-pi-ext` voice tools | Pin the image tag; pi-version or model changes need end-to-end cutover testing | +| dotty-behaviour | `dotty-behaviour:0.1.0` | HTTP API (`/api/perception/*`, `/api/vision/*`, `/api/audio/*`, `/health`) | Endpoint signatures stable; perception event-schema changes require firmware + xiaozhi review | +| bridge.py (dashboard) | unversioned (HEAD) | `/ui` dashboard, `/admin/*`, `/health` | Dashboard/admin service only post-#36; admin route changes require updating dashboard callers | ## What counts as a breaking change @@ -27,17 +28,17 @@ Any of the following require coordinated updates across components: - **WebSocket frame shape** -- changes to the JSON message-type catalog (`hello`, `listen`, `stt`, `tts`, `llm`, `mcp`, `abort`) or binary audio framing versions. 
-- **Emotion-emoji protocol** -- changes to the emoji allowlist in `bridge.py` - (`_ensure_emoji_prefix`), the upstream 21-emotion catalog, or the +- **Emotion-emoji protocol** -- changes to the emoji allowlist (enforced in + the persona prompts), the upstream 21-emotion catalog, or the `llm`-type frame format. - **OTA handshake** -- changes to the OTA endpoint (`/ota/`), expected headers, or firmware version negotiation. - **Config schema** -- structural changes to `.config.yaml` (new required keys, renamed sections, removed defaults). -- **Bridge HTTP API** -- changes to request/response shapes on `/api/message` - or `/api/message/stream`, or to the NDJSON streaming format. -- **ACP session semantics** -- changes to `session/new`, `session/prompt`, or - `session/request_permission` behavior between ZeroClaw and the bridge. +- **dotty-behaviour HTTP API** -- changes to request/response shapes on + `/api/perception/*`, `/api/vision/*`, or `/api/audio/*`. +- **pi RPC** -- changes to the JSONL message shapes exchanged between + `PiClient` and the `dotty-pi` agent. ## Versioning strategy @@ -57,9 +58,9 @@ compatibility matrix"). When adopted, the plan is: compatible with the versions of the other components you are running. 2. **Back up before upgrading.** Run `scripts/backup.sh` (or the equivalent manual steps) to snapshot config, persona files, and bridge state. -3. **Upgrade one component at a time.** Validate with a round-trip test - (`curl -X POST http://:8080/api/message ...`) before moving to the - next component. +3. **Upgrade one component at a time.** Validate with a health check + (`curl http://:8090/health` and `:8080/health`) plus a live + voice turn before moving to the next component. 4. **Tail logs during validation.** Watch both the xiaozhi-server container logs and the bridge journal simultaneously to catch mismatches early. 5. 
**Roll back if broken.** Restore from the backup taken in step 2 and @@ -78,7 +79,7 @@ Server and firmware are versioned independently: | Bump | Server | Firmware | |------|--------|----------| -| **Major** | Breaking change to bridge HTTP API, NDJSON streaming format, or ACP session semantics | Breaking change to WebSocket frame shape, MCP tool surface, or OTA handshake | +| **Major** | Breaking change to the dotty-behaviour HTTP API or the pi RPC message format | Breaking change to WebSocket frame shape, MCP tool surface, or OTA handshake | | **Minor** | New endpoint, new provider, new config key (backward-compatible) | New emotion, new MCP tool, new config option | | **Patch** | Bug fix, performance improvement, doc-only change | Bug fix, cosmetic animation change | @@ -108,4 +109,4 @@ versions work with which firmware versions. --- -Last verified: 2026-05-17. +Last verified: 2026-05-22. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9836c44..18d9dba 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -27,10 +27,10 @@ modifications — please open an issue first to discuss the approach. deployment. - **Check for leaked placeholders or real values:** - - Files in this repo must use placeholders (``, ``, - ``, ``, etc.) everywhere a real IP, hostname, - username, or filesystem path would appear. See the "Configuring for your - environment" table in `README.md` for the full list. + - Files in this repo must use placeholders (``, ``, + ``, etc.) everywhere a real IP, hostname, username, or filesystem + path would appear. See the "Configuring for your environment" table in + `README.md` for the full list. 
- **Never commit real IPs, hostnames, usernames, API keys, or filesystem paths.** If your diff introduces a literal IP address or path that isn't a well-known default (like `127.0.0.1` or a standard port number), it @@ -46,8 +46,9 @@ Changes tend to fall into one of these areas: | Area | Files | Notes | |---|---|---| -| **Voice pipeline (xiaozhi-server)** | `docker-compose.yml`, `.config.yaml`, custom providers (`zeroclaw.py`, `edge_stream.py`, `fun_local.py`, `piper_local.py`) | These run inside the xiaozhi-server Docker container on whatever Linux Docker host you've chosen. | -| **Bridge** | `bridge.py`, `zeroclaw-bridge.service`, `bridge/` | Runs on the ZeroClaw host. The `bridge/` directory contains the Dockerfile, compose file, and deployment docs. | +| **Voice pipeline (xiaozhi-server)** | `docker-compose.yml`, `.config.yaml`, custom providers (`pi_voice/`, `tier1_slim/`, `edge_stream.py`, `fun_local.py`, `piper_local.py`) | These run inside the xiaozhi-server Docker container on the Docker host. | +| **Brain / behaviour** | `dotty-pi/`, `dotty-pi-ext/`, `dotty-behaviour/` | Docker containers on the same host as xiaozhi-server. `dotty-pi` is the pi agent (voice brain); `dotty-behaviour` is the perception/greeter service. | +| **Admin dashboard** | `bridge.py`, `bridge/` | FastAPI service on port 8080, running as a container on the Docker host. | | **Documentation** | `README.md`, `SETUP.md`, `docs/`, `session-prompt.md` | Docs under `docs/` follow conventions listed in `docs/README.md` (TL;DR at top, tables over prose, freshness footer). | | **CI** | `.github/workflows/` | Currently just the bridge Docker image build. | @@ -67,8 +68,7 @@ forked and configured per-deployment. 
Every value that varies between deployments must use a placeholder: - ``, ``, ``, `` -- ``, ``, ``, `` -- ``, `` +- `` - ``, `` Port numbers (`8000`, `8003`, `8080`, `18789`, `42617`) are product-generic @@ -90,11 +90,11 @@ only relevant for a future version, note that in the PR description. ## Safety-related changes -The child-safety enforcement layer (prompt sandwich in `bridge.py`, audience -framing in `.config.yaml`) is load-bearing. If your change touches the system -prompt, turn suffix, or emoji enforcement logic, please describe your -red-team testing in the PR description. See the commit history for examples -of the red-team battery format. +The child-safety enforcement layer (persona prompt sandwich, audience framing +in `.config.yaml`) is load-bearing. If your change touches the system prompt, +turn suffix, or emoji enforcement logic, please describe your red-team testing +in the PR description. See the commit history for examples of the red-team +battery format. ## Where to ask questions diff --git a/README.md b/README.md index 952b9c1..89fd4a8 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ > > **Known rough edges:** face emoji rendering is missing visual differentiation for 4 of 9 emotions (sad / surprise / love / laughing); sound-direction localizer has a hardware-AEC-related left-bias on M5Stack CoreS3 (energy detection works, direction is unreliable); kid-voice ASR accuracy on SenseVoice has a kid-speech gap that whisper.cpp will close in a follow-up. -Dotty is a fully self-hosted voice stack for the M5Stack StackChan desktop robot. Open-source firmware on the robot, [xiaozhi-esp32-server](https://github.com/xinnan-tech/xiaozhi-esp32-server) for voice I/O, and a small FastAPI bridge to whatever LLM agent you want as the brain. ASR, TTS, and session state all run on your own hardware. 
The LLM is pluggable — the shipped default is a two-tier path (a small fast model handles plain chat; tool calls escalate to a more capable model), with [llama-swap](./docs/cookbook/llama-swap-concurrent-models.md) as the recommended local backend. Swap in [Ollama](./docs/cookbook/run-fully-local.md) for the simpler single-binary option, or point at OpenRouter / any OpenAI-compatible API if you'd rather use the cloud. +Dotty is a fully self-hosted voice stack for the M5Stack StackChan desktop robot. Open-source firmware on the robot, [xiaozhi-esp32-server](https://github.com/xinnan-tech/xiaozhi-esp32-server) for voice I/O, and a local **pi** coding agent as the brain. ASR, TTS, and session state all run on your own hardware. The LLM is pluggable — the shipped default runs a small fast model for plain conversation and escalates hard questions to a more capable model, with [llama-swap](./docs/cookbook/llama-swap-concurrent-models.md) as the recommended local backend. Swap in [Ollama](./docs/cookbook/run-fully-local.md) for the simpler single-binary option, or point at OpenRouter / any OpenAI-compatible API if you'd rather use the cloud. Out of the box, Dotty ships in **Kid Mode** — age-appropriate language, safety guardrails, and content filtering are on by default. Disable Kid Mode for a general-purpose assistant. @@ -27,7 +27,7 @@ So Dotty is the version that passes: every component runs on hardware I own, eve - **Local or cloud TTS** — Piper (offline) or EdgeTTS (cloud). Swap with a config change. - **Streaming responses** — the bridge streams LLM output to the voice pipeline for lower perceived latency. - **Emoji expressions** — every response starts with an emoji that the firmware maps to a face animation (smile, laugh, sad, surprise, thinking, angry, love, sleepy, neutral). -- **MCP tools** — ZeroClaw exposes tools (web search, memory, etc.) to the LLM via the Model Context Protocol. 
+- **Voice tools** — the pi agent can search its memory, escalate hard questions to a bigger model, take a photo, and play songs, all mid-conversation. - **States, toggles & LEDs** — a six-state mutex (`idle / talk / story_time / security / sleep / dance`) plus two orthogonal toggles (`kid_mode`, `smart_mode`), all owned by the firmware StateManager and surfaced on the 12-pixel LED ring. Shipped on the active firmware fork (commit `d78118b`, 2026-04-27); the `firmware/firmware/` submodule pin in this repo lags, so flash from the active fork to get it. See "States, Toggles & LEDs" below and [`docs/modes.md`](./docs/modes.md). - **Vision (camera)** — the robot's built-in camera can capture images for multimodal LLM queries. - **Calendar context** — optional calendar integration feeds upcoming events into the conversation context. @@ -56,7 +56,7 @@ Full state taxonomy, colour palette, transition diagram, and per-state backing a ## Web dashboard (locally hosted) -The bridge serves a web dashboard at `http://:8080/ui` — host status, mode toggles (Kid Mode / Smart Mode), state switcher, perception card (face / identity), emoji presets, and a live event log (turns, perception events, errors). Light and dark themes follow the system preference. It's served from the same FastAPI process as the bridge, so there's nothing extra to deploy and no external service ever sees your data. +The dashboard service serves a web dashboard at `http://:8080/ui` — host status, mode toggles (Kid Mode / Smart Mode), state switcher, perception card (face / identity), emoji presets, and a live event log (turns, perception events, errors). Light and dark themes follow the system preference. It runs as a small FastAPI service (`bridge.py`) on your own hardware — no external service ever sees your data.

Dotty dashboard — light theme @@ -67,29 +67,23 @@ The bridge serves a web dashboard at `http://:8080/ui` — host s ## Reference deployment - **Hardware**: M5Stack StackChan (CoreS3 + servo kit), firmware built from `m5stack/StackChan`. -- **Brain**: a two-tier voice path — `qwen3.5:4b` on local [llama-swap](./docs/cookbook/llama-swap-concurrent-models.md) handles plain conversational turns directly; tool calls escalate to `qwen3.6:27b-think` (also on llama-swap) for hard reasoning or to [ZeroClaw](https://github.com/zeroclaw-labs/zeroclaw) for memory lookups. The legacy single-tier path (ZeroClaw + Qwen3-30B via OpenRouter on every turn) is still supported via `selected_module.LLM: ZeroClawLLM`. See [`docs/tier1slim.md`](./docs/tier1slim.md) and [`docs/brain.md`](./docs/brain.md). -- **Voice I/O**: xiaozhi-esp32-server on Docker (any Linux Docker host; single-host works too). +- **Brain**: a **pi** coding agent running in the `dotty-pi` container. It runs `qwen3.5:4b` on local [llama-swap](./docs/cookbook/llama-swap-concurrent-models.md) for the conversation loop and escalates hard questions to `qwen3.6:27b-think` (also on llama-swap) via its `think_hard` tool. xiaozhi-server's `PiVoiceLLM` provider hands each voice turn to the agent. See [`docs/brain.md`](./docs/brain.md). +- **Voice I/O**: xiaozhi-esp32-server on Docker (any Linux Docker host). 
## What runs where | Component | Host | Notes | |---|---|---| | StackChan (device) | ESP32-S3 on the desk | Firmware built from `m5stack/StackChan` (see `SETUP.md`) | -| xiaozhi-esp32-server | server (``) | Docker, ports 8000 + 8003 | -| zeroclaw-bridge | ZeroClaw host (``) | FastAPI on port 8080, systemd | -| ZeroClaw daemon | ZeroClaw host (``) | `` | +| xiaozhi-esp32-server | server (``) | Docker — voice I/O, ports 8000 + 8003 | +| dotty-pi | server (``) | Docker — the pi agent, Dotty's voice brain | +| dotty-behaviour | server (``) | Docker — FastAPI: perception bus, ambient consumers, vision, greeter; port 8090 | +| dashboard service | server (``) | Docker — FastAPI admin dashboard (`bridge.py`); port 8080 | | Admin workstation | any LAN box | Development / `ssh` only | ## Get it running -The stack is three moving pieces — the device, xiaozhi-server (voice I/O), and zeroclaw-bridge (the FastAPI gateway in front of the LLM/ZeroClaw). You have a choice about how to host the two server pieces: - -| Shape | What you run | When to pick it | -|---|---|---| -| **Single-host** | One Docker compose file ([`compose.all-in-one.yml`](./compose.all-in-one.yml)) brings up xiaozhi-server **and** the bridge as containers on the same host. | Easiest first install. Recommended unless you already have a reason to split. | -| **Multi-host** | xiaozhi-server runs from [`docker-compose.yml`](./docker-compose.yml) on a Docker host; the bridge is installed natively (systemd) on a different machine via [`scripts/install-bridge.sh`](./scripts/install-bridge.sh). | You want the bridge on a low-power always-on box (e.g. a Raspberry Pi) and the GPU/Docker host on a beefier machine. The reference deployment runs this way. | - -Want fully offline? Add [`compose.local.override.yml`](./compose.local.override.yml) to either shape — it layers in an Ollama container so the LLM call no longer leaves the LAN. 
+The stack is the device plus four server-side pieces — xiaozhi-server (voice I/O), `dotty-pi` (the pi agent brain), `dotty-behaviour` (perception, ambient behaviour, and the proactive greeter), and the admin dashboard service. The four server pieces run as Docker containers on a single Docker host, alongside a local model backend ([llama-swap](./docs/cookbook/llama-swap-concurrent-models.md), or [Ollama](./docs/cookbook/run-fully-local.md) for the simpler single-binary option). Then: @@ -103,9 +97,9 @@ For what the stack *is* underneath — hardware specs, protocol docs, model fact - [docs/architecture.md](./docs/architecture.md) — end-to-end data flow, topology, deployment files, admin surface, perception bus, threat model. - [docs/hardware.md](./docs/hardware.md) — M5Stack StackChan body + firmware lineage + on-device MCP tool catalog. - [docs/voice-pipeline.md](./docs/voice-pipeline.md) — xiaozhi-esp32-server internals, FunASR/SenseVoice, VAD, TTS. -- [docs/tier1slim.md](./docs/tier1slim.md) — the default two-tier voice LLM provider, escalation contract, hot-swap. -- [docs/brain.md](./docs/brain.md) — model matrix, ZeroClaw architecture, the FastAPI bridge. -- [docs/protocols.md](./docs/protocols.md) — Xiaozhi WS framing, MCP-over-WS, ACP JSON-RPC, bridge HTTP API, emotion channel. +- [docs/tier1slim.md](./docs/tier1slim.md) — the Tier1Slim two-tier voice LLM provider (an alternate backend), escalation contract, hot-swap. +- [docs/brain.md](./docs/brain.md) — model matrix, the pi agent runtime, and how voice turns reach it. +- [docs/protocols.md](./docs/protocols.md) — Xiaozhi WS framing, MCP-over-WS, pi RPC, the dashboard HTTP API, emotion channel. - [docs/modes.md](./docs/modes.md) — behavioural mode taxonomy + LED contract + transition diagram (with shipped-vs-planned breakdown). - [docs/latent-capabilities.md](./docs/latent-capabilities.md) — features upstream supports that we aren't using yet. 
- [docs/references.md](./docs/references.md) — canonical upstream URLs, model cards, licenses. @@ -114,6 +108,5 @@ For what the stack *is* underneath — hardware specs, protocol docs, model fact - xiaozhi-esp32-server: https://github.com/xinnan-tech/xiaozhi-esp32-server - xiaozhi-esp32 firmware (upstream): https://github.com/78/xiaozhi-esp32 -- ZeroClaw: https://github.com/zeroclaw-labs/zeroclaw - StackChan (hardware + open firmware): https://github.com/m5stack/StackChan - Emotion protocol: https://xiaozhi.dev/en/docs/development/emotion/ diff --git a/ROADMAP.md b/ROADMAP.md index 5387804..81b3f70 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -17,18 +17,15 @@ v0.1 is the first tagged release — early-feedback alpha. Everything in this li - **Calendar context injection** -- Google Calendar events surfaced to the LLM for contextual reminders - **Length-aware brevity** -- default 1-2 short sentences, up to 6 for open-ended asks (story, explanation, list); cap enforced in code via `MAX_SENTENCES` - **ASR noise filtering** -- rejects punctuation-only / sub-threshold utterances -- **ACP session caching** -- long-lived sessions with idle/turn-count/wall-clock rotation -- **Single-host deployment** -- `compose.all-in-one.yml` runs everything on one machine -- **Multi-host deployment** -- documented split across Docker host + ZeroClaw host +- **Single-host deployment** -- all four server services (xiaozhi-server, dotty-pi, dotty-behaviour, bridge.py) run as Docker containers on one machine - **`make setup` wizard** -- interactive first-run: name your robot, fetch models, validate config - **MkDocs Material docs site** -- architecture, protocols, quickstart, troubleshooting, FAQ -- **Kid Mode channel routing** -- voice channels are kid-safe by default; the bridge's kid-mode sandwich (English-pin, emoji prefix, topic blocklist, jailbreak resistance) only applies when the inbound `channel` is in `VOICE_CHANNELS`, so messaging-platform channels (Discord, Telegram, etc.) 
skip it automatically. Pair with a separate ZeroClaw daemon on a more capable model for an unrestricted chat surface -- **Bridge `/admin/*` endpoints** -- localhost-only HTTP API for runtime config mutation: toggle kid-mode (`/admin/kid-mode`), flip smart-mode (`/admin/smart-mode`), overwrite persona files (`/admin/persona`), swap a daemon's `default_model` in its `config.toml` (`/admin/model`), and amend the MCP tool allowlist (`/admin/safety`, py_compile-validated). Paths and systemd unit names are env-configurable +- **Kid Mode channel routing** -- voice channels are kid-safe by default; the kid-mode sandwich (English-pin, emoji prefix, topic blocklist, jailbreak resistance) only applies when the inbound `channel` is in `VOICE_CHANNELS`, so messaging-platform channels (Discord, Telegram, etc.) skip it automatically - **`/xiaozhi/admin/*` endpoints** -- live-session control surface on xiaozhi-server: `set-state`, `set-toggle`, `set-tier1slim-model`, `set-face-identified`, `set-head-angles`, `inject-text`, `abort`, `take-photo`, `play-asset`, `songs`, `say`, `devices`. See [`architecture.md`](https://brettkinny.github.io/dotty-stackchan/latest/architecture/#admin-surface-two-services-two-prefixes) -- **Two-tier voice LLM (Tier1Slim)** -- `qwen3.5:4b` on local llama-swap handles plain conversational turns directly; tool calls (`memory_lookup`, `think_hard`, `take_photo`, `play_song`) escalate to the bridge via `/api/voice/escalate`. Default LLM since commit `b73f583`. See [`tier1slim.md`](https://brettkinny.github.io/dotty-stackchan/latest/tier1slim/) -- **Smart-mode hot-swap** -- when `DOTTY_VOICE_PROVIDER=tier1slim`, smart-mode flips call `/xiaozhi/admin/set-tier1slim-model` to mutate the live provider's `model` / `url` / `api_key` in place — no docker restart, no daemon restart, instant. 
Legacy `=zeroclaw` path still rewrites `config.toml` and restarts the daemon +- **Two-tier voice LLM (Tier1Slim)** -- alternate LLM provider: `qwen3.5:4b` on local llama-swap handles plain conversational turns directly; tool-call escalation path is non-functional post-#36 (escalation targeted the retired ZeroClaw bridge). Kept as a chitchat-only rollback behind `selected_module.LLM: Tier1Slim`. Was the default from commit `b73f583` until superseded by `PiVoiceLLM`. See [`tier1slim.md`](https://brettkinny.github.io/dotty-stackchan/latest/tier1slim/) +- **Smart-mode hot-swap** -- when `DOTTY_VOICE_PROVIDER=tier1slim`, a smart-mode flip calls `/xiaozhi/admin/set-tier1slim-model` to mutate the live provider's `model` / `url` / `api_key` in place — no docker restart, no daemon restart, instant. The `PiVoiceLLM` path does not yet implement model-swap (v2 scope) - **llama-swap voice/coding matrix** -- `qwen3.5:4b` (voice inner loop) + `qwen3.6:27b-think` (think_hard target) co-resident under the `voice` matrix set; `qwen3.6:27b` for `pi` CLI runs alone under `coding`.
See [`cookbook/llama-swap-concurrent-models.md`](https://brettkinny.github.io/dotty-stackchan/latest/cookbook/llama-swap-concurrent-models/) -- **Perception event bus** -- firmware `face_detected` / `face_lost` / `sound_event` / `state_changed` frames relay through xiaozhi's `EventTextMessageHandler` to the bridge's `/api/perception/event`, fanned out to six consumer tasks (face_greeter, sound_turner, face_lost_aborter, wake_word_turner, face_identified_refresher, purr_player) +- **Perception event bus** -- firmware `face_detected` / `face_lost` / `sound_event` / `state_changed` frames relay through xiaozhi's `EventTextMessageHandler` to `dotty-behaviour`'s `/api/perception/event`, fanned out to six consumer tasks (face_greeter, sound_turner, face_lost_aborter, wake_word_turner, face_identified_refresher, purr_player) - **Fully-local backend support** -- `compose.local.override.yml` for Ollama (single binary, simple) plus llama-swap recipe for concurrent multi-model serving. Both shipped; choose based on whether you need multiple models resident at once - **Voice catalog + install helper** -- `docs/voice-catalog.md` (12 Piper + 6 EdgeTTS) + `make voice-install` -- shipped - **Versioned docs via `mike`** -- `/latest/`, `/v0.1/`, `/dev/` URL structure shipped @@ -56,7 +53,7 @@ Actively being worked on or partially complete. **Big push 2026-04-25 evening:** - **First-audio latency reduction** -- two-tier path lands inner-loop turns under 1 s warm; further improvements queued (escalation parallelism, llama.cpp MTP PR #22673 for ~1.5-2× on think_hard) - **ASR accuracy for children's speech** -- post-ASR corrections live; Whisper Phase 1 scaffold landed at v0.1; A/B verification pending - **Face detection + tracking** -- shipped firmware-side; smoother+faster tuning queued (EMA 0.5, speed 500, deadband, MSR thr 0.40). Flash + bench-test pending -- **Layer 4 identity (description-based)** -- shipped + deployed. 
VLM (Gemini 2.0 Flash) returns a free-form description plus a roster name match against `~/.zeroclaw/household.yaml`'s `appearance:` field. No biometrics, no persistent identifiers. The earlier dlib biometric scaffold (`bridge/face_db.py` + `face_recognizer.py` + on-device `FaceRecognizer` + `ParentalGate` + 4 MCP tools) was removed — description-based covers the use case and biometrics conflicted with the no-storage identity posture +- **Layer 4 identity (description-based)** -- shipped + deployed. VLM (Gemini 2.0 Flash) returns a free-form description plus a roster name match against `household.yaml`'s `appearance:` field. No biometrics, no persistent identifiers. The earlier dlib biometric scaffold (`bridge/face_db.py` + `face_recognizer.py` + on-device `FaceRecognizer` + `ParentalGate` + 4 MCP tools) was removed — description-based covers the use case and biometrics conflicted with the no-storage identity posture - **Layer 6 proactive greetings** -- `bridge/proactive_greeter.py` + lifespan wiring shipped. Cooldown + time-of-day windowing + kid-safe sandwich + calendar-aware prompt + template fallback. Depends on Layer 4 for named greetings; works today with `face_detected` (unknown identity) for generic - **Layer 1 privacy-indicator LEDs** -- firmware scaffold drives mic/camera state via RAII peripheral guards. Camera `VIDIOC_STREAMOFF` wiring deferred (closes the always-streaming hole; queued) - **Wake word "Hey Dotty"** -- interim shipped: firmware default switched Chinese → English "Hi, ESP". Custom "Hey Dotty" microWakeWord roadmap documented (`docs/wake-word.md`); needs sample collection + Colab training (~2 weeks calendar) @@ -78,7 +75,7 @@ Designed but not yet started. Roughly in priority order. 
- **Runtime OTA provisioning** -- captive-portal WiFi + OTA URL setup on first boot (no rebuild to retarget) - **Layer 2.5 stereo mic + camera person tracking** -- sound-source localization + camera fusion for 360° awareness in idle mode - **Phase 3 continuous vision classifier** -- EfficientDet/YOLOX at 1Hz on the Docker host GPU once dual RTX 3060s land -- **Sleep-mode "dream" memory compaction** -- while Dotty is in `sleep` state (idle, overnight), a background pass feeds the day's ZeroClaw memory writes (perception events, conversation turns, declared facts, scene snapshots) to the smart model for compaction + summarisation. Two outputs: rewrite/prune the raw memory store (drop duplicates and low-signal perception spam, keep durable facts and notable events), and emit a separate human-readable daily summary that next-day turns can pull as "yesterday's context". Sleep-state-gated so the heavy LLM call never runs during interactive states. Pairs with the per-person memory and ambient scene memory work +- **Sleep-mode "dream" memory compaction** -- while Dotty is in `sleep` state (idle, overnight), a background pass feeds the day's memory writes (perception events, conversation turns, declared facts, scene snapshots) to the smart model for compaction + summarisation. Two outputs: rewrite/prune the raw memory store (drop duplicates and low-signal perception spam, keep durable facts and notable events), and emit a separate human-readable daily summary that next-day turns can pull as "yesterday's context". Sleep-state-gated so the heavy LLM call never runs during interactive states. 
Pairs with the per-person memory and ambient scene memory work - **Variant board port guide** -- walkthrough for adding support for other ESP32-S3 boards ## Community wishlist diff --git a/SECURITY.md b/SECURITY.md index 5700ce5..57babe9 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -11,8 +11,8 @@ typical hobby project; when Kid Mode is active the threat model escalates further because children become the audience: - **Voice pipeline exposure:** ASR transcripts, LLM prompts, and TTS audio - traverse the LAN between the device, the Docker host, and the ZeroClaw host. An - attacker on the LAN could intercept or inject traffic. + traverse the LAN between the device and the Docker host. An attacker on the + LAN could intercept or inject traffic. - **Kid Mode safety:** When Kid Mode is active, children are the intended audience. Prompt injection or jailbreaks that bypass the content-safety enforcement layer could expose a child to harmful content. @@ -25,12 +25,14 @@ further because children become the audience: ## What is in scope -- The FastAPI bridge (`bridge.py`) and its ACP session handling -- Custom xiaozhi-server providers (`zeroclaw.py`, `edge_stream.py`, +- The admin dashboard service (`bridge.py`) and its HTTP endpoints +- The `dotty-pi` agent container and `dotty-behaviour` perception service +- Custom xiaozhi-server providers (`pi_voice/`, `tier1_slim/`, `edge_stream.py`, `fun_local.py`, `piper_local.py`) -- Docker Compose configuration and container security -- Content-safety prompt enforcement (prompt sandwich, emoji prefix enforcement, - Kid Mode filtering) +- Docker Compose configuration and container security (including the + `/var/run/docker.sock` bind-mount used by `PiVoiceLLM`) +- Content-safety prompt enforcement (persona prompt sandwich, emoji prefix + enforcement, Kid Mode filtering) - The bridge Docker image and its CI pipeline - Documentation that could lead to insecure deployments if followed as-is @@ -38,8 +40,7 @@ further because children 
become the audience: - Upstream xiaozhi-esp32-server vulnerabilities (report to [xinnan-tech/xiaozhi-esp32-server](https://github.com/xinnan-tech/xiaozhi-esp32-server)) -- Upstream ZeroClaw vulnerabilities (report to - [zeroclaw-labs/zeroclaw](https://github.com/zeroclaw-labs/zeroclaw)) +- Upstream pi coding agent vulnerabilities (report to the `@earendil-works/pi-coding-agent` maintainer) - Upstream M5Stack StackChan firmware vulnerabilities (report to [m5stack/StackChan](https://github.com/m5stack/StackChan)) - LLM model behavior that is not caused by this project's prompts or code diff --git a/SETUP.md b/SETUP.md index 0ac161e..64e9485 100644 --- a/SETUP.md +++ b/SETUP.md @@ -1,8 +1,8 @@ # First-Boot Setup — Bringing Up Your StackChan Step-by-step for taking a fresh M5Stack StackChan from the box to a working, -fully-self-hosted voice robot. The backend (xiaozhi-server on a Docker host, -ZeroClaw bridge on a separate host) is assumed to already be deployed — if you're +fully-self-hosted voice robot. The backend (all four Docker containers on a +single Docker host) is assumed to already be deployed — if you're starting fresh, skim `README.md` first. > This guide assumes you've already substituted the placeholders from @@ -20,19 +20,19 @@ second or two: curl -s http://:8003/xiaozhi/ota/ # Expect: OTA接口运行正常,向设备发送的websocket地址是:ws://:8000/xiaozhi/v1/ -curl -s http://:8080/health -# Expect: {"status":"ok","service":"zeroclaw-bridge","acp_running":true} +curl -s http://:8080/health +# Expect: {"status":"ok", ...} -curl -s -X POST http://:8080/api/message \ - -H 'content-type: application/json' \ - -d '{"content":"hi"}' | jq . 
-# Expect: {"response":" ","session_id":"..."} +curl -s http://:8090/health +# Expect: {"status":"ok", ...} ``` If any fails, fix the backend before dealing with the robot: - OTA down → `ssh @ 'docker logs --tail 40 xiaozhi-esp32-server'` -- Bridge down → `ssh @ 'sudo journalctl -u zeroclaw-bridge -n 40 --no-pager'` +- Dashboard/bridge down → `ssh @ 'docker logs --tail 40 bridge'` +- dotty-behaviour down → `ssh @ 'docker logs --tail 40 dotty-behaviour'` +- dotty-pi down → `ssh @ 'docker logs --tail 40 dotty-pi'` --- @@ -176,22 +176,21 @@ If the device isn't on the list after 60s: also support a press-to-talk button on the side. 3. Watch the logs — you should see: - An ASR line with transcribed text - - A `ZeroClawLLM` call (hits the bridge) + - A `PiVoiceLLM` call (routes to the dotty-pi brain container) - A TTS line with the response text - Face animation changes to match the leading emoji 4. The robot speaks. If you hear audio but no face change, check that the response starts with one of: 😊 😆 😢 😮 🤔 😠 😐 😍 😴. -Expected first-audio latency: **~2–4s** after you stop speaking. If it's -way slower, check `http://:8080/health` for `acp_running:true` -(a dead ACP child means the bridge is re-spawning on every request). +Expected first-audio latency: **~5–8s** after you stop speaking (the pi agent +cold-start on first turn). Subsequent turns are faster once the model is warm. --- ## 6. Tune if needed All of these are edits to `data/.config.yaml` on the Docker host followed by -`docker compose restart`, except the LLM model (lives on the ZeroClaw host). +`docker compose restart`. 
| Complaint | Edit | File | |---|---|---| @@ -199,7 +198,7 @@ All of these are edits to `data/.config.yaml` on the Docker host followed by | "It waits forever after I stop talking" | lower `min_silence_duration_ms` to 400 | same | | "I don't like the voice" | change `voice:` to any Edge Neural voice | `data/.config.yaml` → TTS.EdgeTTS / StreamingEdgeTTS | | "Responses are too long" | add "Keep replies under 20 words." to the persona | `data/.config.yaml` → `prompt:` block | -| "Too slow to reply" | switch LLM model | `` on ZeroClaw host → `default_model` | +| "Too slow to reply" | switch LLM model | see [dotty-pi/README.md](./dotty-pi/README.md) for model selection rules | | "No facial expression change" | check response actually starts with a supported emoji (tail logs) | — | --- @@ -240,9 +239,7 @@ It just isn't what M5Stack ships today. ## 9. When it's working: bookmark these - **Tail voice pipeline**: `ssh @ 'docker logs -f xiaozhi-esp32-server'` -- **Tail bridge**: `ssh @ 'sudo journalctl -u zeroclaw-bridge -f'` -- **Smoke test end-to-end**: `curl -X POST http://:8080/api/message -H 'content-type: application/json' -d '{"content":"test"}'` -- **ZeroClaw's web UI** (for tweaking the agent persona directly): - `ssh -L 42617:127.0.0.1:42617 @` then open - http://localhost:42617 in a browser — pair with the code printed by - `sudo gateway get-paircode`. +- **Tail brain container**: `ssh @ 'docker logs -f dotty-pi'` +- **Tail perception/behaviour**: `ssh @ 'docker logs -f dotty-behaviour'` +- **Admin dashboard**: open `http://:8080/ui` in a browser. +- **Dashboard health**: `curl http://:8080/health` diff --git a/custom-providers/pi_voice/README.md b/custom-providers/pi_voice/README.md index dd15289..c1253e9 100644 --- a/custom-providers/pi_voice/README.md +++ b/custom-providers/pi_voice/README.md @@ -4,10 +4,7 @@ xiaozhi-server custom LLM provider that routes voice turns through the [`dotty-pi`](../../dotty-pi/) container instead of bridge.py.
The RPi-replacement path per [#36](https://github.com/BrettKinny/dotty-stackchan/issues/36). -**Status: skeleton with working PiClient + LLMProvider, 6/6 unit tests -passing.** Not yet wired into xiaozhi-server's `selected_module:` config. -The legacy provider (`zeroclaw`) and the Tier1Slim provider (`tier1_slim`) -remain the production paths until this is soaked. +**Status: production.** Wired into xiaozhi-server via `selected_module.LLM: PiVoiceLLM`. Tier1Slim (`tier1_slim`) is the documented rollback path. 6/6 unit tests pass. What works: - `pi_client.py` — long-lived `pi --mode rpc` client; spawns once, @@ -123,10 +120,9 @@ about them. The container default is `qwen3.5:4b` outer + `qwen3.6:27b-think` escalation per `dotty-pi/README.md` — using `qwen3.6:27b` here would evict the voice matrix set, see that README's "Model selection" section. -Existing `DOTTY_VOICE_PROVIDER=pi` env-var contract on the bridge will -become the soak-toggle: when the xiaozhi-server side is on `PiVoiceLLM` -and the bridge is still up, the bridge becomes a no-op pass-through; -once soaked, bridge.py goes away entirely. +The `bridge.py` admin dashboard service continues to run independently; +it is no longer in the voice path. Its former `/api/voice/*` and +`/api/message` routes were retired in #36. ### Recovery: known-good rollback @@ -157,15 +153,15 @@ mount above is harmless when running other LLM providers. sandwich into the pi extension's system prompt. Latter is cleaner but means kid-mode toggles need to push a system-prompt swap into the container. -- **Memory write-back.** Tier1Slim posts every turn to - `bridge.py:/api/voice/memory_log` + `/api/voice/remember`. Once - bridge.py retires, those need new homes — likely a small write - inside the pi extension (sqlite_brain_db.write) triggered by a +- **Memory write-back.** Tier1Slim posted every turn to + `bridge.py:/api/voice/memory_log` + `/api/voice/remember` — those + endpoints are retired. 
Memory write-back now belongs in the pi + extension: a small write (sqlite_brain_db.write) triggered by a `[REMEMBER: …]` marker in the final assistant text, plus a per-turn - log row. Belongs in the pi extension, not here. -- **Persona file location.** Tier1Slim reads from a path on the bridge; - the pi extension will need its own path under - `/mnt/user/appdata/dotty-pi/persona/`. Wiring TBD. + log row. +- **Persona file location.** The pi extension reads from + `/mnt/user/appdata/dotty-pi/persona/` (bind-mounted into the container). + Wiring is stable; runtime persona-swap mechanism TBD. ## See also diff --git a/docker-compose.yml.template b/docker-compose.yml.template index 4a74c94..4fb117e 100644 --- a/docker-compose.yml.template +++ b/docker-compose.yml.template @@ -24,9 +24,11 @@ services: - NVIDIA_DRIVER_CAPABILITIES=compute,utility # --- END CUDA ENV --- - TZ= - # EDIT: set to the ZeroClaw bridge URL for vision intent handling - - VISION_BRIDGE_URL=http://:8080 - # Active persona file under ./personas/ (loaded by ZeroClawLLM provider). + # EDIT: set to the dotty-behaviour service URL for vision intent handling. + # dotty-behaviour runs on the same host; use the host LAN IP, not loopback + # (this container is on a bridge network, so 127.0.0.1 resolves to itself). + - VISION_BRIDGE_URL=http://:8090 + # Active persona file under ./personas/ (loaded by the LLM provider). # Currently fixed to "default" — smart.md exists for manual swap when you # want an adult-framed persona on the smart-mode cloud path. 
- PERSONA=default @@ -37,7 +39,6 @@ services: - ./data/.config.yaml:/opt/xiaozhi-esp32-server/data/.config.yaml:ro - ./models/SenseVoiceSmall:/opt/xiaozhi-esp32-server/models/SenseVoiceSmall - ./tmp:/opt/xiaozhi-esp32-server/tmp - - ./custom-providers/zeroclaw:/opt/xiaozhi-esp32-server/core/providers/llm/zeroclaw - ./custom-providers/openai_compat:/opt/xiaozhi-esp32-server/core/providers/llm/openai_compat - ./custom-providers/tier1_slim:/opt/xiaozhi-esp32-server/core/providers/llm/tier1_slim - ./custom-providers/pi_voice:/opt/xiaozhi-esp32-server/core/providers/llm/pi_voice @@ -47,7 +48,7 @@ services: # effective root on the docker host (it can `docker run --privileged # anything`). Acceptable for a single-purpose self-hosted appliance; # do not enable on a multi-tenant host. Required for `selected_module.LLM: - # PiVoiceLLM` — comment both lines out if running Tier1Slim / ZeroClawLLM. + # PiVoiceLLM` — comment both lines out if running Tier1Slim / OpenAICompat. - /var/run/docker.sock:/var/run/docker.sock - /usr/bin/docker:/usr/bin/docker:ro - ./personas:/opt/xiaozhi-esp32-server/personas:ro @@ -70,7 +71,7 @@ services: # DOTTY OTA: patched ota_handler that composes the firmware download URL # directly from local_ip+http_port instead of doing a string-replace on # `vision_explain` (which only worked when vision lived at the canonical - # `/mcp/vision/explain` path; our setup points it at the ZeroClaw host bridge). + # `/mcp/vision/explain` path; our setup points it at the dotty-behaviour service). - ./custom-providers/xiaozhi-patches/ota_handler.py:/opt/xiaozhi-esp32-server/core/api/ota_handler.py:ro # OTA artifact directory — drop m5stack-stack-chan_.bin here # and the device polls + self-flashes on next boot. See docs/ota-verification.md. 
diff --git a/docs/about.md b/docs/about.md index bf1e186..5658d7c 100644 --- a/docs/about.md +++ b/docs/about.md @@ -18,7 +18,7 @@ Every component in the pipeline is swappable: the LLM, the TTS engine, the ASR p - **Local speech recognition.** FunASR SenseVoiceSmall runs on your server. Audio never leaves your LAN. - **Pluggable LLM.** The reference config uses Qwen via OpenRouter. Swap in any OpenAI-compatible API or Ollama for fully local inference. - **Local TTS option.** Piper TTS runs entirely on-host. EdgeTTS (Microsoft's cloud neural voices) is also supported as a low-friction alternative. -- **Emoji-driven facial expressions.** The LLM's response starts with an emoji (smile, laugh, sad, surprise, thinking, angry, neutral, love, sleepy). The firmware parses it into a face animation on the robot's display. Three layers enforce this: the agent prompt, the server system prompt, and a bridge-level fallback. +- **Emoji-driven facial expressions.** The LLM's response starts with an emoji (smile, laugh, sad, surprise, thinking, angry, neutral, love, sleepy). The firmware parses it into a face animation on the robot's display. Two layers enforce this: the pi agent persona prompt and the xiaozhi-server system prompt. - **Fully local deployment.** Ollama (LLM) + Piper (TTS) + FunASR (ASR) = zero outbound network calls. Your data stays on your hardware. ## Who is this for? @@ -42,7 +42,7 @@ This is a hackable starting point, not a product. There are no releases, no inst - A working voice pipeline: robot audio in, transcription, LLM response, speech out, facial expression. - Infrastructure-as-config: Docker Compose files, systemd units, custom provider code, config templates with placeholders. - Documentation for the architecture, protocols, and deployment. -- A reference persona and agent configuration (ZeroClaw + Qwen). +- A reference persona and agent configuration (pi agent + Qwen via llama-swap). 
## What's out of scope diff --git a/docs/advanced/variant-port-guide.md b/docs/advanced/variant-port-guide.md index 6da29a1..550731d 100644 --- a/docs/advanced/variant-port-guide.md +++ b/docs/advanced/variant-port-guide.md @@ -5,7 +5,7 @@ description: How to run Dotty's voice stack on an ESP32-S3 board other than the # Variant port guide -Dotty's server stack (xiaozhi-server, bridge, ZeroClaw) is protocol-agnostic — it doesn't care which ESP32-S3 board is on the other end of the WebSocket. All the interesting porting work is in the firmware. +Dotty's server stack (xiaozhi-server, bridge, dotty-pi, dotty-behaviour) is protocol-agnostic — it doesn't care which ESP32-S3 board is on the other end of the WebSocket. All the interesting porting work is in the firmware. This guide explains how to bring up the voice pipeline on a different ESP32-S3 board, and what hardware adaptation is needed to get the robot-body features (servos, LEDs, display) working. @@ -20,7 +20,7 @@ This guide explains how to bring up the voice pipeline on a different ESP32-S3 b ## Server side: nothing to change -xiaozhi-server, the bridge, and ZeroClaw run on the server host — not on the device. They communicate over the Xiaozhi WebSocket protocol, which is board-agnostic. +xiaozhi-server, the bridge, dotty-pi, and dotty-behaviour all run on the Docker host — not on the device. They communicate over the Xiaozhi WebSocket protocol, which is board-agnostic. The only server-side value that varies per board is the OTA firmware filename, which you set in the device's `sdkconfig` before flashing. @@ -182,7 +182,7 @@ Once the device connects, run through: 2. **Voice round-trip** — speak a simple phrase and confirm ASR → LLM → TTS returns audio to the device. 3. 
**MCP tool call** — send a test instruction through the bridge: ```bash - curl -X POST http://:8080/api/message \ + curl -X POST http://:8080/api/message \ -H 'Content-Type: application/json' \ -d '{"content":"Turn your head to the right"}' ``` diff --git a/docs/architecture.md b/docs/architecture.md index 5a368cd..8b67857 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -1,20 +1,19 @@ --- title: Architecture -description: Three-host architecture and message flow for the self-hosted voice stack. +description: Single-host architecture and message flow for the self-hosted voice stack (post-#36 cutover). --- # Architecture ## TL;DR -- Three hosts: **robot** (StackChan on your desk) → **server** (runs xiaozhi-esp32-server) → **ZeroClaw host** (runs ZeroClaw + a FastAPI bridge). -- Audio goes robot → server → (text) → ZeroClaw host → (response text) → server → (audio) → robot. The ZeroClaw host never touches audio. -- **Two voice paths coexist**, selected by `selected_module.LLM` in `.config.yaml`: - - **`Tier1Slim`** (current default) — a small/fast LLM (`qwen3.5:4b`) handles plain turns directly from xiaozhi-server against a local llama-swap, no bridge round-trip. Tool calls escalate to the bridge via `/api/voice/escalate`. - - **`ZeroClawLLM`** (legacy single-tier) — every turn goes through the bridge → ZeroClaw → OpenRouter (Qwen3-30B). -- Everything is LAN-local **except** cloud-routed LLM calls (smart-mode, ZeroClawLLM, VLM, audio caption). EdgeTTS is cloud too when selected; Piper is fully local. -- The "xiaozhi ↔ brain" seam is HTTP + ACP-over-stdio, not a library call — either side can be swapped independently. -- The robot speaks the **Xiaozhi WebSocket protocol** (see [protocols.md](./protocols.md)). It has no hardcoded knowledge of ZeroClaw. +- Two hosts: **robot** (StackChan on your desk) and a **single Docker host** (``) that runs all four server-side services. 
+- Audio goes robot → xiaozhi-server → (text) → dotty-pi → (response text) → xiaozhi-server → (audio) → robot. `dotty-pi` never touches audio — it only exchanges text with xiaozhi-server, which handles all audio I/O with the robot. +- The default voice provider is **`PiVoiceLLM`**, selected via `selected_module.LLM` in `.config.yaml`. Two documented alternates exist (`Tier1Slim`, `OpenAICompat`) — see [llm-backends.md](./llm-backends.md). +- Everything is LAN-local **except** cloud-routed LLM calls (smart-mode, VLM, audio caption). EdgeTTS is cloud when selected; Piper is fully local. +- The robot speaks the **Xiaozhi WebSocket protocol** (see [protocols.md](./protocols.md)). It has no knowledge of the services running on the Docker host. + +> **Cutover note (2026-05-19, issue #36):** The stack previously ran on three hosts — a separate ZeroClaw host (Raspberry Pi) ran the ZeroClaw Rust agent + a FastAPI bridge under systemd. That host has been retired. The brain is now the `dotty-pi` container; the voice provider is `PiVoiceLLM`. See [cutover-behaviour.md](./cutover-behaviour.md) for the historical runbook. ## Topology @@ -24,175 +23,161 @@ flowchart LR SC["M5Stack StackChan
ESP32-S3"] end - subgraph DockerHost["Server - <XIAOZHI_HOST>"] - XZ["xiaozhi-esp32-server
(Docker)
:8000 WS + :8003 HTTP"] + subgraph DockerHost["Docker host — <XIAOZHI_HOST>"] + XZ["xiaozhi-esp32-server
:8000 WS + :8003 HTTP"] subgraph XZMods["voice pipeline"] VAD["SileroVAD"] - ASR["FunASR
SenseVoiceSmall"] - T1S["Tier1Slim
(default LLM)"] - ZCL["ZeroClawLLM
(legacy LLM)"] + ASR["FunASR SenseVoiceSmall
/ WhisperLocal"] + PV["PiVoiceLLM
(default LLM provider)"] TTS["TTS
(LocalPiper default,
EdgeTTS available)"] end + PI["dotty-pi
(pi agent container)"] + BH["dotty-behaviour
FastAPI :8090"] + BR["bridge.py
FastAPI :8080 (/ui dashboard)"] end - subgraph Llama["llama-swap (Unraid)"] - SLIM["qwen3.5:4b
(Tier1 inner loop)"] + subgraph Llama["llama-swap (same host or LAN GPU host)"] + SLIM["qwen3.5:4b
(pi outer loop)"] THINK["qwen3.6:27b-think
(think_hard target)"] end - subgraph ZCHost["ZeroClaw host - <ZEROCLAW_HOST>"] - Bridge["zeroclaw-bridge
FastAPI :8080
systemd unit"] - ZC["ZeroClaw daemon
(agent persona)"] - end - - Cloud["OpenRouter
(smart_mode, VLM,
audio caption, legacy)"] + Cloud["OpenRouter
(smart_mode, VLM,
audio caption)"] SC -->|"WebSocket
Xiaozhi protocol"| XZ - XZ --- VAD & ASR & T1S & ZCL & TTS - T1S -->|"plain turn (default)"| SLIM - T1S -->|"tool_call escalation
POST /api/voice/escalate"| Bridge - ZCL -->|"HTTP POST
/api/message"| Bridge - Bridge -->|"think_hard"| THINK - Bridge -->|"JSON-RPC 2.0
over stdio (ACP)"| ZC - ZC -.->|"LLM API call"| Cloud - Bridge -.->|"VLM / audio caption"| Cloud - T1S -.->|"smart_mode ON"| Cloud - XZ -->|"perception event
POST /api/perception/event"| Bridge + XZ --- VAD & ASR & PV & TTS + PV -->|"docker exec -i dotty-pi
pi --mode rpc (JSONL stdio)"| PI + PI -->|"outer loop"| SLIM + PI -->|"think_hard escalation"| THINK + PI -.->|"smart_mode ON"| Cloud + XZ -->|"perception event
POST /api/perception/event"| BH + BH -.->|"VLM / audio caption"| Cloud TTS -->|"audio stream"| XZ XZ -->|"WebSocket"| SC ``` -Solid arrows are per-turn data flow; dotted arrows are cloud / conditional. The two voice paths share the same physical xiaozhi container — only one is active at a time via `selected_module.LLM`. +Solid arrows are per-turn data flow; dotted arrows are cloud / conditional. All four server-side services share one Docker host. ## Actors | Actor | Host | Role | Process | |---|---|---|---| | **StackChan** | Desk | Captures audio, plays audio, renders face, runs MCP tools for head/LED/camera | ESP32-S3 firmware built from `m5stack/StackChan` | -| **xiaozhi-esp32-server** | Server | VAD → ASR → LLM (proxy) → TTS pipeline, emotion dispatch, OTA, admin surface | Docker container | -| **Tier1Slim custom provider** | Server (inside container) | Default LLM provider — talks directly to llama-swap for plain turns, posts to `/api/voice/escalate` for tool calls | Python, mounted via volume | -| **ZeroClawLLM custom provider** | Server (inside container) | Legacy single-tier LLM provider — translates xiaozhi's LLM-provider interface to an HTTP POST to the bridge | Python, mounted via volume | -| **llama-swap** | Unraid (or any GPU host) | Routes OpenAI-compatible requests to per-model llama-server children; co-loads `qwen3.5:4b` (voice inner loop) and `qwen3.6:27b-think` (`think_hard` target) | Docker container (`ghcr.io/mostlygeek/llama-swap:cuda`) | -| **zeroclaw-bridge** | ZeroClaw host | Accepts HTTP POSTs from both voice paths + xiaozhi-server perception relay; spawns/holds a `zeroclaw acp` child; speaks ACP JSON-RPC to it | FastAPI + uvicorn under systemd | -| **ZeroClaw daemon** | ZeroClaw host | The configured persona — runs the agent loop, calls the LLM, consults `SOUL.md`/`IDENTITY.md`/`MEMORY.md` | Rust binary (`zeroclaw acp`) | -| **OpenRouter** | Cloud | Routes cloud LLM calls (smart_mode `claude-sonnet-4-6`, VLM `gemini-2.0-flash`, audio 
caption `gemini-2.5-flash`, and the legacy ZeroClaw path's Qwen3-30B) | External | +| **xiaozhi-esp32-server** | Docker host | VAD → ASR → LLM (proxy) → TTS pipeline, emotion dispatch, OTA, admin surface | Docker container | +| **PiVoiceLLM custom provider** | Docker host (inside xiaozhi container) | Default LLM provider — translates each voice turn into a pi RPC request, streams TTS-bound text back | Python, mounted via volume | +| **dotty-pi** | Docker host | The voice-tool brain — pi coding agent with the `dotty-pi-ext` extension; owns the agent loop and tool dispatch | Docker container (`dotty-pi`) | +| **dotty-behaviour** | Docker host | Perception event bus, 9 ambient consumers, vision/audio explain endpoints, proactive greeter, calendar context | FastAPI container, port 8090 | +| **bridge.py** | Docker host | Admin dashboard service (`/ui`, port 8080). Voice and perception roles were retired in #36; dashboard port to dotty-behaviour is pending. | FastAPI container, port 8080 | +| **llama-swap** | Same host or LAN GPU host | Routes OpenAI-compatible requests to per-model llama-server children; co-loads `qwen3.5:4b` (pi outer loop) and `qwen3.6:27b-think` (`think_hard` target) | Docker container (`ghcr.io/mostlygeek/llama-swap:cuda`) | +| **OpenRouter** | Cloud | Routes cloud LLM calls (smart_mode `claude-sonnet-4-6`, VLM `gemini-2.0-flash`, audio caption `gemini-2.5-flash`) | External | -## Data flow (single utterance, Tier1Slim path — plain turn) +## Data flow (single utterance, PiVoiceLLM — normal turn) ```mermaid sequenceDiagram autonumber participant User participant SC as StackChan - participant XZ as xiaozhi-server
(server) + participant XZ as xiaozhi-server + participant PV as PiVoiceLLM / PiClient + participant PI as dotty-pi
(pi agent) participant LS as llama-swap
qwen3.5:4b User->>SC: speaks SC->>XZ: Opus audio frames (WebSocket) XZ->>XZ: SileroVAD → speech end XZ->>XZ: FunASR / Whisper → text - XZ->>LS: chat/completions (tools=auto) - LS-->>XZ: "😊 The sky is blue!" (no tool_calls) + XZ->>PV: generate() call + PV->>PI: docker exec -i dotty-pi pi --mode rpc (JSONL over stdio) + PI->>LS: chat/completions + LS-->>PI: "😊 The sky is blue!" + PI-->>PV: JSONL text chunks (TTS-bound only) + PV-->>XZ: streamed text XZ->>XZ: strip leading emoji → emotion frame XZ->>XZ: TTS (Piper / EdgeTTS) → Opus frames XZ-->>SC: audio + emotion SC-->>User: speaks + face animation ``` -## Data flow (Tier1Slim path — escalated tool call) - -```mermaid -sequenceDiagram - autonumber - participant XZ as xiaozhi-server - participant LS as llama-swap
qwen3.5:4b - participant Br as zeroclaw-bridge - participant ZC as ZeroClaw - participant THK as llama-swap
qwen3.6:27b-think - - XZ->>LS: chat/completions (tools=auto) - LS-->>XZ: tool_calls=[think_hard(...)] - XZ-->>XZ: yield filler ("hmm, let me think…") to TTS - XZ->>Br: POST /api/voice/escalate
{"tool":"think_hard","args":...} - Br->>THK: chat completion - THK-->>Br: reasoned answer - Br-->>XZ: {"result":"..."} - XZ->>LS: chat/completions (tool result in context) - LS-->>XZ: streamed final answer - XZ->>XZ: TTS → Opus frames -``` +## Data flow (PiVoiceLLM — tool call inside the agent) -## Data flow (legacy ZeroClawLLM path) +Tool dispatch happens entirely inside the `dotty-pi` container. The pi agent with the `dotty-pi-ext` extension drives the tool loop; xiaozhi-server and PiVoiceLLM see only the final streamed text. ```mermaid sequenceDiagram autonumber - participant XZ as xiaozhi-server - participant Br as zeroclaw-bridge - participant ZC as ZeroClaw - participant LLM as OpenRouter
→ Qwen3-30B - - XZ->>Br: POST /api/message
{"content":"...","channel":"stackchan"} - Br->>ZC: session/new + session/prompt
(JSON-RPC over stdio) - ZC->>LLM: chat completion - LLM-->>ZC: response - ZC-->>Br: stdout JSON-RPC result - Br->>Br: ensure emoji prefix
enforce English suffix - Br-->>XZ: {"response":"😊 Sure, the weather is..."} + participant PI as dotty-pi
(pi agent + dotty-pi-ext) + participant LS4 as llama-swap
qwen3.5:4b + participant THK as llama-swap
qwen3.6:27b-think + participant BH as dotty-behaviour
:8090 + + PI->>LS4: chat/completions + LS4-->>PI: tool_call: think_hard(question) + PI->>THK: direct POST /v1/chat/completions
(enable_thinking=false, 200-token cap) + THK-->>PI: reasoned answer + PI->>LS4: chat/completions (tool result in context) + LS4-->>PI: streamed final answer + + Note over PI,BH: take_photo tool variant + PI->>BH: GET /api/voice/take_photo + BH-->>PI: latest cached vision description ``` -See [protocols.md](./protocols.md) for every wire format referenced above, and [tier1slim.md](./tier1slim.md) for the four-tool escalation catalogue. +The five voice tools in `dotty-pi-ext`: `memory_lookup`, `remember`, `think_hard`, `take_photo`, `play_song`. See [brain.md](./brain.md) for the full tool catalogue. ## Why this shape -- **Audio lives with the server** because the StackChan firmware already speaks the Xiaozhi WS protocol; xiaozhi-esp32-server is the matching server. Putting the brain next to the mic would require us to reimplement that protocol, and we'd still need a voice server. -- **The brain lives with ZeroClaw** because ZeroClaw already has memory, tools, persona, channels, LLM routing wired up. The robot is just another channel into the same agent. -- **A bridge lives between them** because ZeroClaw's HTTP API is observational; to *prompt* a running agent you have to use ACP (JSON-RPC 2.0 over stdio) against a `zeroclaw acp` child. The bridge is a tiny FastAPI adapter that does exactly that. -- **The seam is a custom xiaozhi LLM provider** (`zeroclaw.py`, mounted into the container). xiaozhi-server thinks it's calling a local Python LLM class; the class just does an HTTP POST to `:8080/api/message`. That means we could swap ZeroClaw for anything HTTP-serviceable without touching xiaozhi. +- **Audio lives with xiaozhi-server** because the StackChan firmware already speaks the Xiaozhi WS protocol. xiaozhi-esp32-server is the matching server; any alternative would require reimplementing that protocol. 
+- **The brain is a container on the same host** because co-locating dotty-pi with xiaozhi-server eliminates network latency on the `docker exec` stdio pipe and avoids a second host to manage. +- **The seam is a custom xiaozhi LLM provider** (`pi_voice.py`, mounted into the container). xiaozhi-server thinks it's calling a local Python LLM class; the class runs pi via `docker exec`. That means the brain can be swapped for anything without touching xiaozhi. +- **dotty-behaviour is a peer container** rather than code inside xiaozhi-server because perception consumers fire 200-token narrative LLM calls; blocking the xiaozhi event loop with that would spike voice latency. A peer container preserves the operational separation that already worked on the Raspberry Pi. -## What each host sees +## What each service sees -**Robot** knows only: +**StackChan** knows only: - An OTA HTTP URL (`http://:8003/xiaozhi/ota/`) - A WS URL (provided by OTA response, typically `ws://:8000/xiaozhi/v1/`) -It does **not** know about the ZeroClaw host, ZeroClaw itself, or any LLM. +It does **not** know about dotty-pi, dotty-behaviour, or any LLM. -**Server / xiaozhi-server** knows: +**xiaozhi-server** knows: - Its own device-facing WS + OTA ports - A handful of pluggable providers selected via `data/.config.yaml` `selected_module:` -- The LLM provider's `base_url: http://:8080` to reach the bridge +- The `container_name: dotty-pi` config key, which PiVoiceLLM uses for `docker exec` -It does **not** know about ACP, ZeroClaw workspace files, or the OpenRouter key. +It does **not** know about llama-swap model names, brain.db, or OpenRouter keys. 
-**ZeroClaw host / bridge** knows: -- How to `subprocess.Popen("zeroclaw acp")` and speak JSON-RPC 2.0 on stdin/stdout -- How to wrap turns that arrive with `channel="stackchan"` in the English+emoji sandwich -- The OpenRouter API key lives in ZeroClaw's config, not in the bridge +**dotty-pi (pi agent)** knows: +- The llama-swap endpoint and model aliases (via `models.json` inside the container) +- The `dotty-pi-ext` extension with the five voice tools +- The persona files and `brain.db` (mounted from host appdata) -**ZeroClaw** knows everything agent-side — provider keys, memory, tools, persona files. It does **not** know what host/channel the request came from beyond the `channel` string passed in. +It does **not** know about the xiaozhi WebSocket protocol or audio. -## Admin surface (two services, two prefixes) +**dotty-behaviour** knows: +- The xiaozhi admin endpoints (for inject-text, set-head-angles, abort) +- Vision/audio caption API keys (for scene synthesis) -Admin routes are split across the two services and reached at different prefixes. Don't conflate them. +**bridge.py** (dashboard service) knows: +- The xiaozhi admin endpoints (for dashboard relay) +- Its voice and perception routing tables were retired in #36; dashboard port to dotty-behaviour is pending -### Bridge `/admin/*` (ZeroClaw host, `127.0.0.1:8080` only) +## Admin surface (two services, two prefixes) -Runtime mutations to ZeroClaw / the bridge process itself. Bound to localhost only — LAN callers get `403`. +Admin routes are split across two services and reached at different prefixes. -| Endpoint | Effect | Restart | -|---|---|---| -| `POST /admin/kid-mode` `{enabled: bool}` | Persists + hot-reloads kid-mode via `_apply_kid_mode()` (no daemon restart needed; module globals re-bind atomically). Pushes the kid pip via the xiaozhi admin relay. | none | -| `POST /admin/smart-mode` `{enabled: bool, device_id?}` | Persists + pushes the smart pip. 
When `DOTTY_VOICE_PROVIDER=tier1slim`: hot-swaps Tier1Slim's model/url/api_key via `/xiaozhi/admin/set-tier1slim-model` (in-process; no restart). When `DOTTY_VOICE_PROVIDER=zeroclaw`: rewrites ZeroClaw's `config.toml` and restarts the voice daemon. | conditional (see above) | -| `POST /admin/persona` `{file, content}` | Overwrites a workspace persona file (`SOUL.md`, `IDENTITY.md`, `USER.md`, `AGENTS.md`, `TOOLS.md`, `BOOTSTRAP.md`, `HEARTBEAT.md`, `MEMORY.md`). Atomic via `.new` + rename. | none | -| `POST /admin/model` `{daemon, model}` | TOML-edits `default_model` in the chosen daemon's `config.toml`. | named daemon | -| `POST /admin/safety` `{action, tool}` | Adds/removes a tool in `MCP_TOOL_ALLOWLIST` via the `# === ADMIN_ALLOWLIST_START/END ===` marker block. py_compile-validated; on syntax error the bridge is left untouched. | bridge (self-restart) | +### bridge.py `/admin/*` (Docker host, `127.0.0.1:8080` only) -Paths and systemd unit names are env-configurable (`ZEROCLAW_VOICE_CFG`, `ZEROCLAW_VOICE_UNIT`, `ZEROCLAW_DISCORD_CFG`, `ZEROCLAW_DISCORD_UNIT`, `ZEROCLAW_WORKSPACE`); defaults match the documented ZeroClaw-host layout. +Runtime mutations. Bound to localhost only — LAN callers get `403`. Note: the voice-path mutations (`smart-mode` daemon restart, `persona` workspace files) referenced pre-cutover ZeroClaw config; those endpoints exist but their ZeroClaw-specific side-effects are retired. + +| Endpoint | Effect | +|---|---| +| `POST /admin/kid-mode` `{enabled: bool}` | Persists + hot-reloads kid-mode. Pushes the kid pip via the xiaozhi admin relay. | +| `POST /admin/smart-mode` `{enabled: bool, device_id?}` | Persists + pushes the smart pip. Smart-mode model swap now handled in PiVoiceLLM. | +| `POST /admin/safety` `{action, tool}` | Edits `MCP_TOOL_ALLOWLIST` via marker block; py_compile-validated. 
| -### xiaozhi-server `/xiaozhi/admin/*` (server, port 8003) +### xiaozhi-server `/xiaozhi/admin/*` (Docker host, port 8003) -Operations that need to touch a live device session — head servos, MCP dispatch, TTS injection, the live LLM provider's runtime config. Exposed by `custom-providers/xiaozhi-patches/http_server.py`. Bound to the xiaozhi container's listen address (not localhost-only); protect with network ACLs if the server is reachable from untrusted networks. +Operations that need to touch a live device session — head servos, MCP dispatch, TTS injection. Exposed by `custom-providers/xiaozhi-patches/http_server.py`. | Endpoint | Purpose | |---|---| @@ -203,11 +188,10 @@ Operations that need to touch a live device session — head servos, MCP dispatc | `POST /xiaozhi/admin/set-state` | Dispatch a `set_state` MCP call to firmware. | | `POST /xiaozhi/admin/set-toggle` | Dispatch a `set_toggle` MCP call (kid/smart pip on the firmware ring). | | `POST /xiaozhi/admin/set-face-identified` | Light the face-identified pixel for ~4 s. | -| `POST /xiaozhi/admin/take-photo` | Trigger a camera capture from the bridge (separate from the firmware MCP path). | -| `POST /xiaozhi/admin/set-tier1slim-model` | Hot-swap the live Tier1Slim provider's `model` / `url` / `api_key`. Driven by smart-mode flips. | +| `POST /xiaozhi/admin/take-photo` | Trigger a camera capture. | | `GET /xiaozhi/admin/songs` | List audio assets available to `play_song`. | | `POST /xiaozhi/admin/play-asset` | Play a named audio asset through the speaker. | -| `POST /xiaozhi/admin/say` | Synthesise + play arbitrary text (lower-level than `inject-text`). | +| `POST /xiaozhi/admin/say` | Synthesise + play arbitrary text. 
| ### Perception event bus @@ -219,7 +203,7 @@ Firmware-resident producers emit JSON `event` frames over the xiaozhi WebSocket: {"type":"event","name":"sound_event","data":{"direction":"left","balance":0.997,"energy":1807933247}} ``` -The xiaozhi-server's `EventTextMessageHandler` (`custom-providers/xiaozhi-patches/textMessageHandlerRegistry.py`) POSTs each frame to the bridge's `POST /api/perception/event`. The bridge maintains a pub/sub bus (`_perception_listeners`) and per-device state (`_perception_state[device_id]`), and runs six consumer tasks against it: +The xiaozhi-server's `EventTextMessageHandler` (`custom-providers/xiaozhi-patches/textMessageHandlerRegistry.py`) POSTs each frame to `dotty-behaviour`'s `POST /api/perception/event`. dotty-behaviour maintains the pub/sub bus and runs 9 ambient consumer tasks against it: | Consumer | What it does | |---|---| @@ -229,45 +213,49 @@ The xiaozhi-server's `EventTextMessageHandler` (`custom-providers/xiaozhi-patche | `_perception_wake_word_turner` | Head-turn toward the speaker on wake-word event. | | `_perception_face_identified_refresher` | Re-asserts the face-identified pixel every ~3 s so the firmware's 4 s timeout doesn't drop it. | | `_perception_purr_player` | Plays an idle purr asset when conditions match. | +| (and 3 additional consumers in dotty-behaviour) | Vision narrative, audio caption, idle photographer. | WebSocket lifecycle gotcha: xiaozhi only opens the WS during a conversation. Firmware-side perception producers must call `OpenAudioChannel()` first, or events from idle silently drop. ## Threat-model implications -- **Device compromise** gives an attacker a WS session to xiaozhi-server and the ability to invoke any server-exposed MCP tool. It does **not** give them the LLM key, ZeroClaw's memory, or network access to OpenRouter beyond what proxied prompts can achieve. -- **Server compromise** gives them access to the bridge over HTTP (no auth currently). 
Anything the bridge can ask ZeroClaw, the attacker can ask it. The `/admin/*` mutation endpoints are unreachable (they're `127.0.0.1`-only). -- **ZeroClaw host compromise** gives them everything — LLM keys, memory DB, workspace persona files. -- **OpenRouter compromise** gives them log access to every prompt sent. Treat prompts as non-confidential. +- **Device compromise** gives an attacker a WS session to xiaozhi-server and the ability to invoke any server-exposed MCP tool. It does **not** give them LLM keys or network access to OpenRouter beyond what proxied prompts can achieve. +- **Docker host compromise** gives them access to all four services — xiaozhi-server, dotty-pi (with brain.db and persona files), dotty-behaviour, bridge.py. The `127.0.0.1`-only binding of the `/admin/*` mutation endpoints on bridge.py is no barrier in this case — an attacker already on the host can reach them locally. +- **OpenRouter compromise** gives log access to every prompt sent via cloud models. Treat prompts as non-confidential. See [`ROADMAP.md`](ROADMAP.md) for related backlog items (privacy-indicator LEDs, child-safety hardening). ## Deployment files (this repo) -The canonical working copies live in this repo. The deployed copies on each host should match — if they drift, redeploy from here. +The canonical working copies live in this repo. 
-| File | Deployed to | Purpose | +| File / Directory | Deployed to | Purpose | |---|---|---| -| `bridge.py` | ZeroClaw host `/bridge.py` | FastAPI HTTP→ZeroClaw translator (ACP over stdio) | -| `bridge/requirements.txt` | bare-metal venv | Pinned Python deps for the bridge (fastapi, uvicorn, pydantic) | -| `zeroclaw-bridge.service` | ZeroClaw host `/etc/systemd/system/` | systemd unit for bridge | -| `custom-providers/zeroclaw/zeroclaw.py` | Server `core/providers/llm/zeroclaw/zeroclaw.py` | xiaozhi LLM provider, proxies to the ZeroClaw bridge | -| `custom-providers/zeroclaw/__init__.py` | Server `core/providers/llm/zeroclaw/__init__.py` | Python package marker | -| `custom-providers/edge_stream/edge_stream.py` | Server `core/providers/tts/edge_stream.py` | Streaming EdgeTTS provider | -| `custom-providers/piper_local/piper_local.py` | Server `core/providers/tts/piper_local.py` | Local Piper TTS provider (offline alternative to EdgeTTS) | -| `custom-providers/asr/fun_local.py` | Server `core/providers/asr/fun_local.py` | Patched FunASR provider (adds `language` config key) | -| `.config.yaml` | Server `data/.config.yaml` | xiaozhi-server config override | -| `.env.example` | reference only | Documented environment variables with defaults | -| `docker-compose.yml` | Server `` | Container definition | - -Volume mounts (server) are listed in [quickstart.md](./quickstart.md#deployment-layout). 
+| `dotty-pi/` | Docker host `/mnt/user/appdata/dotty-pi-src/` | pi agent container (Dockerfile + docker-compose.yml) | +| `dotty-pi-ext/` | Docker host (bind-mounted into dotty-pi) | dotty-pi-ext extension — five voice tools | +| `dotty-behaviour/` | Docker host `/mnt/user/appdata/dotty-behaviour-src/` | Perception + ambient behaviour container | +| `bridge.py` | Docker host (bridge.py container) | Admin dashboard FastAPI service | +| `bridge/requirements.txt` | bridge.py container | Pinned Python deps | +| `custom-providers/pi_voice/` | xiaozhi container `core/providers/llm/pi_voice/` | PiVoiceLLM + PiClient | +| `custom-providers/tier1_slim/` | xiaozhi container `core/providers/llm/tier1_slim/` | Tier1Slim alternate provider | +| `custom-providers/openai_compat/` | xiaozhi container `core/providers/llm/openai_compat/` | OpenAICompat alternate provider | +| `custom-providers/edge_stream/` | xiaozhi container `core/providers/tts/` | Streaming EdgeTTS provider | +| `custom-providers/piper_local/` | xiaozhi container `core/providers/tts/` | Local Piper TTS provider | +| `custom-providers/asr/fun_local.py` | xiaozhi container `core/providers/asr/` | Patched FunASR provider (adds `language` config key) | +| `custom-providers/xiaozhi-patches/` | xiaozhi container (drop-in overrides) | Admin routes + shared_llm singleton | +| `.config.yaml` | Docker host `data/.config.yaml` | xiaozhi-server config override | +| `docker-compose.yml.template` | Docker host `` | Container definition | +| `scripts/deploy-behaviour.sh` | run from admin workstation | Deploy dotty-behaviour to Docker host | + +Volume mounts (xiaozhi-server) are listed in [quickstart.md](./quickstart.md#deployment-layout). ## See also - [hardware.md](./hardware.md) — what the robot actually is. -- [voice-pipeline.md](./voice-pipeline.md) — what the server runs. -- [tier1slim.md](./tier1slim.md) — the default voice LLM and its escalation contract. 
-- [brain.md](./brain.md) — model matrix + what the ZeroClaw host runs. -- [protocols.md](./protocols.md) — what's on the wire (including `/api/voice/escalate` and `/api/perception/event`). +- [voice-pipeline.md](./voice-pipeline.md) — what xiaozhi-server runs. +- [brain.md](./brain.md) — the pi agent, model matrix, and dotty-pi-ext tools. +- [protocols.md](./protocols.md) — what's on the wire (pi RPC mode, `/api/perception/event`). - [quickstart.md](./quickstart.md) — deployment placeholders, volume mounts, common ops. +- [llm-backends.md](./llm-backends.md) — choosing between PiVoiceLLM, Tier1Slim, OpenAICompat. -Last verified: 2026-05-17. +Last verified: 2026-05-22. diff --git a/docs/brain.md b/docs/brain.md index a13d05b..78b3e00 100644 --- a/docs/brain.md +++ b/docs/brain.md @@ -1,182 +1,138 @@ --- title: Brain -description: ZeroClaw agent runtime, the model matrix (Tier1Slim inner loop + escalation targets, legacy ZeroClawLLM), and the FastAPI bridge. +description: The pi agent runtime (dotty-pi container), the model matrix, and the admin dashboard service (bridge.py). --- -# Brain — ZeroClaw + the model matrix + the bridge +# Brain — the pi agent + the model matrix ## TL;DR -- The "brain" is two cooperating pieces: **[ZeroClaw](https://github.com/zeroclaw-labs/zeroclaw)** (a Rust AI-agent runtime, MIT/Apache-2.0 dual-licensed) on the ZeroClaw host, plus a **FastAPI bridge** (`bridge.py`) that fronts it over HTTP. -- The bridge accepts POSTs from the voice path and translates them into **ACP (Agent Client Protocol) JSON-RPC 2.0 over stdio** against a long-running `zeroclaw acp` child. -- **Which LLM runs which turn depends on the active voice provider.** Two paths coexist: - - **Tier1Slim path (default in `.config.yaml`)** — small/fast model (`qwen3.5:4b` on a local llama-swap by default) handles every conversational turn; only tool calls escalate to the bridge. Smart-mode flips the inner-loop model to a cloud model in-process. 
- **ZeroClawLLM path (legacy)** — every turn runs through ZeroClaw with [Qwen3-30B-A3B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507) (a 30.5 B-param MoE, 3.3 B active per token) via OpenRouter. -- The bridge picks the smart-mode dispatch path based on the `DOTTY_VOICE_PROVIDER` env var: `"tier1slim"` → hot-swap via `/xiaozhi/admin/set-tier1slim-model`; `"zeroclaw"` → rewrite ZeroClaw's `config.toml` and restart the daemon. -- Persona lives in two different places depending on path: Tier1Slim reads `personas/dotty_voice.md`; ZeroClawLLM reads `~/.zeroclaw/workspace/{SOUL,IDENTITY,MEMORY,AGENTS}.md`. See [change-persona.md](./cookbook/change-persona.md). -- Known weak spot of Qwen3: it leaks Chinese on long-context English-only prompts. Both paths compensate — the bridge wraps `channel="stackchan"` turns in an English+emoji sandwich; Tier1Slim appends an English-only suffix per turn. +- The "brain" is the **`dotty-pi` Docker container** running the pi coding agent with the `dotty-pi-ext` extension. +- **`PiVoiceLLM`** (the default xiaozhi LLM provider) translates each voice turn into a pi RPC request via `docker exec -i dotty-pi pi --mode rpc`. TTS-bound text streams back to xiaozhi-server; tool dispatch happens entirely inside the container. +- The `dotty-pi-ext` extension exposes **five voice tools** to the agent loop: `memory_lookup`, `remember`, `think_hard`, `take_photo`, `play_song`. +- **Which LLM runs which turn:** the pi outer loop targets `qwen3.5:4b` (local llama-swap, ~500 ms warm); `think_hard` escalates directly to `qwen3.6:27b-think` (co-resident on llama-swap). Smart-mode flips that outer-loop model to a cloud model. +- Two documented alternate voice providers exist: **`Tier1Slim`** (handles plain turns directly against llama-swap, escalates tool calls via POST) and **`OpenAICompat`**. See [llm-backends.md](./llm-backends.md). 
+ +> **Cutover note (2026-05-19, issue #36):** The brain previously ran as the ZeroClaw Rust agent on a Raspberry Pi, fronted by a FastAPI bridge (`bridge.py`) under systemd. ZeroClaw and the RPi host are retired. `bridge.py` survives as the admin dashboard service (port 8080, `/ui`) on the Docker host; its voice and perception roles moved to `PiVoiceLLM`/`dotty-pi` and `dotty-behaviour` respectively. ## Model matrix | Path | Model | Where | When called | |---|---|---|---| -| Tier1Slim inner loop (smart_mode OFF) | `qwen3.5:4b` | local llama-swap (`TIER1SLIM_LOCAL_URL`, defaults to `http://localhost:8080/v1`) | Every plain conversational turn. ~500 ms warm. | -| Tier1Slim inner loop (smart_mode ON) | `anthropic/claude-sonnet-4-6` (`SMART_MODEL`) | OpenRouter | Every conversational turn while smart_mode is on. | -| Tier1Slim escalation: `think_hard` | `qwen3.6:27b-think` | local llama-swap | Multi-step reasoning, 3+ digit arithmetic, anything the small model would have to guess at. | -| Tier1Slim escalation: `memory_lookup` | (no LLM call — FTS) | ZeroClaw memory | `"do you remember…"` queries. | -| Tier1Slim escalation: `take_photo` | `google/gemini-2.0-flash-001` (`VLM_MODEL`) | OpenRouter (or local Ollama if `VLM_API_URL` is repointed) | Camera describe. | -| Tier1Slim escalation: `play_song` | (no LLM call) | Firmware via xiaozhi `/xiaozhi/admin/play-asset` | Song request. | -| ZeroClawLLM (legacy single tier) | `Qwen3-30B-A3B-Instruct-2507` | OpenRouter | Every turn when `selected_module.LLM = ZeroClawLLM`. | -| Vision narrative LLM (security/scene synthesis) | `VISION_MODEL` (`google/gemini-2.0-flash-001` by default) | OpenRouter | Bridge-internal — describes the camera frame for journaling and security mode. | -| Audio captioning (security mode) | `AUDIO_CAPTION_MODEL` (`google/gemini-2.5-flash` by default) | OpenRouter | Bridge-internal — `what does Dotty hear` describer. 
| - -The full Tier1Slim wire format, escalation payload, and `set_runtime()` hot-swap are documented in [tier1slim.md](./tier1slim.md). - -## ZeroClaw architecture - -From [github.com/zeroclaw-labs/zeroclaw](https://github.com/zeroclaw-labs/zeroclaw) (see [references.md](./references.md#brain)): - -| Component | Role | +| PiVoiceLLM outer agent loop | `qwen3.5:4b` | local llama-swap | Every voice turn. ~500 ms warm. | +| pi tool: `think_hard` | `qwen3.6:27b-think` | local llama-swap | Multi-step reasoning; direct POST from dotty-pi-ext, no agent overhead. | +| pi tool: `memory_lookup` | (no LLM call — FTS5) | brain.db inside dotty-pi | `"do you remember…"` queries. | +| pi tool: `take_photo` | `google/gemini-2.0-flash-001` (`VLM_MODEL`) | dotty-behaviour → OpenRouter | Camera describe. | +| pi tool: `play_song` | (no LLM call) | Firmware via `/xiaozhi/admin/play-asset` | Song request. | +| Smart-mode inner loop | `anthropic/claude-sonnet-4-6` (`SMART_MODEL`) | OpenRouter | Every conversational turn while smart_mode is on. | +| Vision narrative (security/scene synthesis) | `VISION_MODEL` (`google/gemini-2.0-flash-001`) | OpenRouter | dotty-behaviour internal — camera frame description. | +| Audio captioning (security mode) | `AUDIO_CAPTION_MODEL` (`google/gemini-2.5-flash`) | OpenRouter | dotty-behaviour internal — ambient sound description. | +| Tier1Slim inner loop (alternate provider, smart_mode OFF) | `qwen3.5:4b` | local llama-swap | Every plain turn when Tier1Slim is selected. | +| Tier1Slim inner loop (alternate provider, smart_mode ON) | `anthropic/claude-sonnet-4-6` | OpenRouter | Every turn when Tier1Slim + smart_mode is on. | + +## The pi agent runtime + +### dotty-pi container + +`dotty-pi` is a pinned `node:25.9-alpine3.23` image with `@earendil-works/pi-coding-agent` installed globally. It idles via `sleep infinity`; voice turns invoke pi on demand via `docker exec -i` from `PiClient` (in `custom-providers/pi_voice/pi_client.py`). 
+ +The runtime contract: +1. **xiaozhi-server** calls `PiVoiceLLM.generate()` with the dialogue. +2. **PiClient** runs `docker exec -i dotty-pi pi --mode rpc` — JSONL messages over stdin/stdout. +3. **pi** runs the prompt against llama-swap (`qwen3.5:4b` by default) with the `dotty-pi-ext` extension loaded. +4. Thinking deltas and extension UI requests are filtered by PiClient; only TTS-bound text chunks reach xiaozhi-server. +5. `PiVoiceLLM` holds one long-lived `PiClient`; between turns it issues `new_session` to reset pi's working state without re-spawning the process. + +Appdata layout on the Docker host: + +``` +/mnt/user/appdata/dotty-pi/ +├── agent/ +│ └── models.json # provider config (llama-swap endpoints + aliases) +├── sessions/ # pi session state +├── persona/ # Dotty persona files +├── memory/ +│ └── brain.db # FTS5 store (see brain-db-fts-only.md memory note) +└── extensions/ + └── dotty-pi-ext/ # voice-tool extension +``` + +### dotty-pi-ext — the five voice tools + +`dotty-pi-ext` is the pi extension that exposes Dotty's voice tools to the agent loop. Installed inside the container at `/root/.pi/extensions/dotty-pi-ext/`. + +| Tool | What it does | |---|---| -| **Gateway** | Control plane: HTTP / WS / SSE, sessions, config, cron, webhooks, web dashboard (localhost:42617 by default) | -| **Runtime** | Agent execution. Two modes: **Native** (direct process, default, fastest) or **Docker** (sandboxed) | -| **Channels** | Pluggable inputs. Supports WhatsApp, Telegram, Slack, Discord, Signal, iMessage, Matrix, IRC, Email, Bluesky, Nostr, Mattermost, Nextcloud Talk, DingTalk, Lark, QQ, Reddit, LinkedIn, Twitter, MQTT, WeChat Work, and others. This bridge uses `channel="stackchan"` as an arbitrary string identifier — ZeroClaw treats it as just another channel. | -| **Providers** | LLM backends. OpenAI, Anthropic (API or OAuth), Gemini, and 17+ OpenAI-compatible endpoints (including OpenRouter, Ollama, GLM). Failover + multi-account auth profiles supported. 
| -| **Memory** | Pluggable backends. SQLite is the default; PostgreSQL and Markdown are options. Hybrid keyword+vector search per upstream wiki. | -| **Tools / MCP** | 70+ built-in tools plus bidirectional MCP — ZeroClaw is both an MCP client (consumes external servers) and an MCP server (can expose its internals to other agents). | - -Resource claims from upstream: ~8.8 MB static binary, <5 MB runtime RAM on release builds. The Pi can comfortably run it. - -### Workspace files — the persona surface - -Under `/root/.zeroclaw/workspace/`: - -| File | Upstream description | What we use it for | -|---|---|---| -| `SOUL.md` | "Core identity and operating principles" | The configured persona's voice, values, role in the household. The dedicated `channel="stackchan"` section was removed after bridge-level English-only enforcement subsumed it. | -| `IDENTITY.md` | Agent personality and role definition | Name, backstory, household/family context specific to your deployment. | -| `USER.md` | User context and preferences | Per-user context; optional. Not heavily populated in our deploy. | -| `MEMORY.md` | "Long-term facts and lessons learned" | Git-visible core memories. Complemented by an SQLite `brain.db`. | -| `AGENTS.md` | "Session conventions and initialization rules" | Cross-agent behavioral guardrails. Self-modifying per upstream — the agent writes to it. | - -**Important**: the bridge wraps voice turns in a hard English+emoji sandwich **outside** of ZeroClaw's persona files. The enforcement lives in `bridge.py`, not in `SOUL.md`, because a persona-level constraint wasn't strong enough to keep Qwen3 from leaking Chinese mid-session. See [Qwen3 caveat](#qwen3-caveat-chinese-leak-and-long-context-drift). - -### The confusing "ACP" terminology in ZeroClaw +| `memory_lookup(query)` | FTS5 search against `brain.db`; returns top-3 snippets, ≤200 chars each. 
| +| `remember(fact)` | Stores a durable fact (≤300 codepoints) into `brain.db` with `category=core`, `importance=0.7`. | +| `think_hard(question)` | Direct POST to llama-swap `qwen3.6:27b-think` (`enable_thinking=false`, 200-token cap, terse answer). | +| `take_photo()` | GET to `dotty-behaviour /api/voice/take_photo` — returns latest cached vision description if ≤30 s old. | +| `play_song(name)` | Resolves free-form name against `/xiaozhi/admin/songs` catalogue (60 s cache), then POSTs `/xiaozhi/admin/play-asset`. | -ZeroClaw uses the acronym **ACP** in two different contexts. Don't conflate them: +In addition, an `agent_end` handler in the extension automatically writes a `category=conversation` row to `brain.db` after every completed user prompt — the agent does not decide to log; every successful turn is recorded. -1. **Autonomy Control (ACP Mode)** — ZeroClaw's README describes autonomy *levels* (`ReadOnly`, `Supervised`, `Full`) under a heading that calls them "ACP Mode". This is about how much the agent is allowed to do without asking. -2. **Agent Client Protocol** — the `zeroclaw acp` CLI subcommand launches ZeroClaw as an ACP server speaking JSON-RPC 2.0 over stdio. This is the Zed-originated [Agent Client Protocol](https://agentclientprotocol.com) — unrelated to autonomy levels, despite sharing an acronym. +### Model selection for dotty-pi -Our bridge uses the **second** one. The robot's autonomy mode is whatever ZeroClaw's `config.toml` sets (likely `Supervised`), and is separate from the stdio protocol. +The outer pi loop must target `qwen3.5:4b` — **not** `qwen3.6:27b`. llama-swap groups models into matrix sets: `voice` (`qwen3.5:4b` + `qwen3.6:27b-think`) and `coding` (`qwen3.6:27b` alone). Requesting the coding-set model evicts both voice models; reloading either voice model is a 30–50 s cold hit. `think_hard` calls `qwen3.6:27b-think` directly from the extension, which is in the `voice` set and resident alongside `4b`. 
See [cookbook/llama-swap-concurrent-models.md](./cookbook/llama-swap-concurrent-models.md). -See [protocols.md](./protocols.md#acp) for the ACP wire format. +## The bridge — `bridge.py` (dashboard service) -## The bridge — `bridge.py` +`bridge.py` was the original HTTP→ZeroClaw translator, running under systemd on the RPi. Post-cutover (#36) it runs as a Docker container on the same Docker host, port 8080. Its **voice path** (`/api/message`, `/api/voice/*`) and **perception relay** (`/api/perception/event`) roles are retired — those functions moved to `PiVoiceLLM`/`dotty-pi` and `dotty-behaviour`. What remains: -Lives at `/bridge.py`, runs under systemd (`zeroclaw-bridge.service`). +- **Admin dashboard** (`/ui`) — the operator web UI for monitoring turns, toggling kid-mode/smart-mode, and viewing scene context and LED state. +- **`/admin/*` endpoints** (localhost-only) — runtime toggles for kid-mode, smart-mode, and the safety allowlist. -**HTTP surface:** +A dashboard port to `dotty-behaviour` is still pending; until then, bridge.py's dashboard panels that relied on the bridge's own perception bus may show stale or empty data. -| Endpoint | Caller | Purpose | -|---|---|---| -| `POST /api/message` | `ZeroClawLLM` (legacy single-tier) | Accept a message + channel, return a response string. | -| `POST /api/voice/escalate` | `Tier1Slim` | Dispatch a tool call (`memory_lookup` / `think_hard` / `take_photo` / `play_song`) and return the result. | -| `POST /api/voice/remember` | `Tier1Slim` | Fire-and-forget: persist a `[REMEMBER: ...]` fact extracted from the model's reply. | -| `POST /api/voice/memory_log` | `Tier1Slim` | Fire-and-forget: log the completed turn so ZeroClaw indexes it for future recall. | -| `POST /api/perception/event` | xiaozhi-server perception relay | Receive `face_detected` / `face_lost` / `sound_event` / `state_changed` events from the firmware via xiaozhi. | -| `GET /health` | health checks | Liveness probe. 
| -| `POST /admin/*` (localhost-only) | operator scripts | Runtime mutations (kid-mode, persona, model, safety). | - -See [protocols.md](./protocols.md) for exact wire formats. - -**Per-turn responsibilities on the `ZeroClawLLM` path:** - -1. Spawn (or reuse) a `zeroclaw acp` child, holding stdin/stdout open. -2. Send JSON-RPC `session/new` to get a fresh session_id. *(Note: currently one `session/new` per turn — see session reuse in [latent-capabilities.md](./latent-capabilities.md#brain-unused).)* -3. Wrap the user content in the English+emoji sandwich when `channel == "stackchan"`. -4. Send JSON-RPC `session/prompt` with the wrapped content. -5. Wait for the terminal result (not streamed — see [latent-capabilities.md](./latent-capabilities.md#brain-unused)). -6. Run `_ensure_emoji_prefix()` — if the first non-whitespace char isn't in the 9-emoji allowlist (😊😆😢😮🤔😠😐😍😴), prepend 😐. -7. Return JSON `{"response": ""}`. - -**Per-turn responsibilities on the `Tier1Slim` path:** - -The bridge doesn't see plain conversational turns at all — they happen entirely inside Tier1Slim against llama-swap. The bridge is invoked only when the small model emits a `tool_call`, at which point it: - -1. Looks up the tool name and dispatches to ZeroClaw (`memory_lookup`), llama-swap's `qwen3.6:27b-think` (`think_hard`), the VLM (`take_photo`), or xiaozhi's `/xiaozhi/admin/play-asset` (`play_song`). -2. Returns the result string truncated to 1000 chars. -3. Separately, accepts `/api/voice/remember` and `/api/voice/memory_log` posts so future `memory_lookup` calls find the new content. +See [protocols.md](./protocols.md) for the admin endpoint wire formats. 
## The LLMs -### Qwen3-30B-A3B-Instruct-2507 (legacy `ZeroClawLLM` path) - -From the [HuggingFace card](https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507): +### Qwen3-30B-A3B-Instruct-2507 (legacy path — retired) -| Fact | Value | -|---|---| -| Total parameters | 30.5 B | -| Activated per token | 3.3 B | -| Non-embedding params | 29.9 B | -| Layers | 48 | -| Attention heads | 32 Q / 4 KV (GQA) | -| Experts | 128 total, 8 active per token | -| Native context | 262 144 tokens (256 K) | -| Extended context | up to 1 M tokens with Dual Chunk Attention + MInference sparse attention | -| Mode | Non-thinking only (no `` blocks) | -| Recommended sampling | T=0.7, top_p=0.8, top_k=20, min_p=0, presence_penalty 0–2 | -| Tool calling | Supported (OpenAI-compatible; Qwen-Agent framework recommended for full features) | -| Suggested output length | 16 384 tokens | - -The "2507" suffix indicates the 2025 revision (the HF card calls it the May 2025 update; the YYMM reading would be July 2025 — these conflict in upstream copy, treat the revision semantically rather than calendar-pinning it). +Previously used by the ZeroClawLLM provider via OpenRouter. Not used in the current architecture. ### Qwen3 caveat — Chinese leak and long-context drift -Qwen3 is multilingual by training and occasionally **leaks Chinese mid-response** when: -- Context is long (persona + memory + history = lots of tokens before the user turn). -- System-prompt adherence is weakened by MoE expert routing. +Qwen3 is multilingual by training and occasionally **leaks Chinese mid-response** when context is long or system-prompt adherence is weakened by MoE expert routing. Observed symptom: the model starts a response in English and drops a Chinese character or phrase partway through; `en-*` EdgeTTS voices return silent audio on non-English input, making it sound like a dead mic. 
-Observed symptom in our deploy: the model would start a response in English and drop a Chinese character or phrase partway through. `en-*` EdgeTTS voices return silent audio on non-English input, so the whole response sounded like a dead mic — it was a language bug, not a TTS bug. +**Mitigation in the current stack:** -**Our mitigation, layered:** +1. The pi agent persona (`persona/dotty_voice.md`) has English hard rules. +2. xiaozhi-server's top-level `prompt:` in `data/.config.yaml` is also English-only. +3. `custom-providers/textUtils.py` appends a per-turn English-only suffix (used by Tier1Slim and PiVoiceLLM). -1. ZeroClaw's own system prompt has English hard rules. -2. xiaozhi-server's top-level `prompt:` is also English-only. -3. The bridge adds **both a prefix and a suffix** around every voice turn — the suffix sits at the end-of-prompt position (max attention) and reiterates the constraint. 8/8 adversarial prompts (broken English, embedded Japanese, short utterances) passed cleanly after this change. +### qwen3.5:4b (pi outer agent loop) -### qwen3.5:4b (Tier1Slim inner loop) +Local on llama-swap (dual RTX 3060). Fast: ~500 ms warm round-trip including TTS dispatch. Trained for tool calling, which is what lets the five-tool catalogue work reliably at 4 B parameters. See the dotty-pi-ext tool table above. -Local on llama-swap (Unraid, dual RTX 3060). Fast: ~500 ms warm round-trip including TTS dispatch. Trained for tool calling, which is what lets the four-tool catalogue work reliably at 4 B. See [tier1slim.md](./tier1slim.md) for the wire format. +### qwen3.6:27b-think (think_hard target) -### qwen3.6:27b-think (Tier1Slim `think_hard` target) - -Local on the same llama-swap, separate alias. ~18 tok/s generation, ~30 s cold-load. Co-resident with `qwen3.5:4b` under the `voice` matrix set in `llama-swap/config.yaml` so an escalation doesn't evict the inner loop. See [cookbook/llama-swap-concurrent-models.md](./cookbook/llama-swap-concurrent-models.md). 
+Local on the same llama-swap, separate alias. ~18 tok/s generation, ~30–50 s cold-load. Co-resident with `qwen3.5:4b` under the `voice` matrix set in `llama-swap/config.yaml` so an escalation doesn't evict the outer agent loop's model. See [cookbook/llama-swap-concurrent-models.md](./cookbook/llama-swap-concurrent-models.md). ### Cloud models (smart_mode + visual + audio) -- **Smart-mode inner loop:** `anthropic/claude-sonnet-4-6` (`SMART_MODEL` env var). Used by Tier1Slim when smart_mode is on; flipped in-process via `set_runtime()`. -- **VLM (`take_photo`, security camera frames):** `google/gemini-2.0-flash-001` (`VLM_MODEL`). -- **Audio captioning (security mode):** `google/gemini-2.5-flash` (`AUDIO_CAPTION_MODEL`). +- **Smart-mode inner loop:** `anthropic/claude-sonnet-4-6` (`SMART_MODEL` env var). Used when smart_mode is on. +- **VLM (`take_photo`, security camera frames):** `google/gemini-2.0-flash-001` (`VLM_MODEL`). Served by dotty-behaviour. +- **Audio captioning (security mode):** `google/gemini-2.5-flash` (`AUDIO_CAPTION_MODEL`). Served by dotty-behaviour. ## OpenRouter -Routing: OpenRouter fronts cloud models (`SMART_MODEL`, `VLM_MODEL`, `AUDIO_CAPTION_MODEL`, and the legacy ZeroClaw path's Qwen3-30B). It handles multiple upstream providers and exposes an OpenAI-compatible API. ZeroClaw's config references it via an `openrouter` provider section with an encrypted API key; the bridge reads `OPENROUTER_API_KEY` from the systemd unit's environment for the VLM and audio-caption calls. +Routing: OpenRouter fronts cloud models (`SMART_MODEL`, `VLM_MODEL`, `AUDIO_CAPTION_MODEL`). It handles multiple upstream providers and exposes an OpenAI-compatible API. The bridge reads `OPENROUTER_API_KEY` from its container environment for dashboard-adjacent calls; dotty-behaviour reads its own copy for vision and audio-caption calls. Observability OpenRouter itself offers (not currently surfaced in this stack): - Per-request latency + cost dashboards. - Multi-model A/B routing. 
- Per-provider failover for the same model. -If per-turn latency ever needs deeper analysis than ZeroClaw's `state/costs.jsonl` and `state/runtime-trace.jsonl`, OpenRouter's dashboard is where to look next. - ## See also -- [tier1slim.md](./tier1slim.md) — the default voice path. -- [protocols.md](./protocols.md#acp) — exact ACP RPC surface the bridge speaks. -- [voice-pipeline.md](./voice-pipeline.md) — what drives the bridge. -- [llm-backends.md](./llm-backends.md) — choosing between Tier1Slim, ZeroClawLLM, OpenAICompat. -- [latent-capabilities.md](./latent-capabilities.md#brain-unused) — streaming, session reuse, tool-use, MCP-server mode. -- [references.md](./references.md#brain) — ZeroClaw, ACP, Qwen3, OpenRouter links. +- [voice-pipeline.md](./voice-pipeline.md) — what xiaozhi-server runs. +- [architecture.md](./architecture.md) — full topology and data-flow diagrams. +- [protocols.md](./protocols.md) — pi RPC mode wire format, admin endpoints. +- [llm-backends.md](./llm-backends.md) — choosing between PiVoiceLLM, Tier1Slim, OpenAICompat. +- [latent-capabilities.md](./latent-capabilities.md) — streaming, session reuse, tool-use, MCP-server mode. +- [references.md](./references.md) — Qwen3, OpenRouter, pi coding agent links. +- [cutover-behaviour.md](./cutover-behaviour.md) — historical runbook for the #36 ZeroClaw → pi-agent cutover. -Last verified: 2026-05-17. +Last verified: 2026-05-22. diff --git a/docs/cookbook/add-emoji.md b/docs/cookbook/add-emoji.md index 5d06268..78bed65 100644 --- a/docs/cookbook/add-emoji.md +++ b/docs/cookbook/add-emoji.md @@ -38,8 +38,8 @@ See [protocols.md](../protocols.md) and the upstream ## 4. 
Restart ```bash -systemctl restart zeroclaw-bridge # ZeroClaw host -docker compose restart xiaozhi-server # Docker host (if config changed) +docker compose restart bridge # bridge container (if bridge.py changed) +docker compose restart xiaozhi-server # xiaozhi container (if config changed) ``` Current set: 😊 smile, 😆 laugh, 😢 sad, 😮 surprise, 🤔 thinking, diff --git a/docs/cookbook/change-persona.md b/docs/cookbook/change-persona.md index d2fb4eb..99b16ef 100644 --- a/docs/cookbook/change-persona.md +++ b/docs/cookbook/change-persona.md @@ -11,9 +11,9 @@ Three personas ship in `personas/`: | File | Style | Used by | |---|---|---| -| `default.md` | Cheerful, curious desktop robot. The general-purpose persona used by `ZeroClawLLM` and other generic providers. | `ZeroClawLLM`, `OpenAICompat` | -| `dotty_voice.md` | Voice-tuned variant of `default.md` — same character but pruned for short replies, with the 4-tool catalogue and `[REMEMBER: ...]` markers baked in. | `Tier1Slim` | -| `smart.md` | More capable, allowed longer answers — for when `smart_mode` is on and the cloud model is doing the heavy lifting. | optional override for either provider | +| `default.md` | Cheerful, curious desktop robot. The general-purpose persona for generic providers. | `OpenAICompat` | +| `dotty_voice.md` | Voice-tuned variant of `default.md` — same character but pruned for short replies, with the tool catalogue and `[REMEMBER: ...]` markers baked in. | `PiVoiceLLM`, `Tier1Slim` | +| `smart.md` | More capable, allowed longer answers — for when `smart_mode` is on and the cloud model is doing the heavy lifting. | optional override | ## Which file controls the persona? @@ -21,15 +21,13 @@ Check `selected_module.LLM` in `.config.yaml`, then read the matching block: | Provider | Persona source | |---|---| -| `Tier1Slim` (current default) | `LLM.Tier1Slim.persona_file` in `.config.yaml`. Defaults to `personas/dotty_voice.md`. 
| -| `ZeroClawLLM` | ZeroClaw's own workspace files: `~/.zeroclaw/workspace/{SOUL,IDENTITY,MEMORY,AGENTS}.md` on the ZeroClaw host. **The `persona_file` key is ignored** — ZeroClaw doesn't read it. | +| `PiVoiceLLM` (current default) | The persona file configured in the pi agent's extension (`dotty-pi-ext`). Defaults to `personas/dotty_voice.md`. | +| `Tier1Slim` | `LLM.Tier1Slim.persona_file` in `.config.yaml`. Defaults to `personas/dotty_voice.md`. | | `OpenAICompat` (and similar generic providers) | `LLM.OpenAICompat.persona_file` in `.config.yaml`. | -Setting `persona_file` for a `ZeroClawLLM` selection is a silent no-op — edit the workspace files instead, or switch the provider first. +## Switch to a different shipped persona -## Switch to a different shipped persona (Tier1Slim or generic providers) - -1. Edit `.config.yaml`: +1. Edit `.config.yaml` (or the pi agent persona config for `PiVoiceLLM`): ```yaml LLM: @@ -39,16 +37,6 @@ Setting `persona_file` for a `ZeroClawLLM` selection is a silent no-op — edit 2. Restart: `docker compose restart xiaozhi-server`. -## Edit ZeroClaw's persona - -1. SSH to the ZeroClaw host. -2. Edit the relevant file under `~/.zeroclaw/workspace/`: - - `SOUL.md` — voice, values, and role in the household. - - `IDENTITY.md` — name, backstory, family context. - - `USER.md` — per-user preferences (optional). - - `MEMORY.md` — long-term facts; the agent also writes to this. -3. No restart needed — ZeroClaw hot-reads these on each turn. - ## Create your own persona 1. Copy an existing file: `cp personas/dotty_voice.md personas/pirate.md`. @@ -61,6 +49,6 @@ Edit the top-level `prompt:` block in `.config.yaml` directly. This is the xiaoz ## Notes -- Always keep the emoji-leader rule in any persona — removing it breaks face animations and `_ensure_emoji_prefix` in the bridge will paper over it with 😐. +- Always keep the emoji-leader rule in any persona — removing it breaks face animations. 
The persona prompt and the xiaozhi-server system prompt are the two enforcement layers. - See [tier1slim.md](../tier1slim.md) for why Tier1Slim treats persona files differently from other providers. - See [protocols.md](../protocols.md) for the emoji → face frame mapping. diff --git a/docs/cookbook/disable-kid-mode.md b/docs/cookbook/disable-kid-mode.md index 19c5dc3..345b488 100644 --- a/docs/cookbook/disable-kid-mode.md +++ b/docs/cookbook/disable-kid-mode.md @@ -16,7 +16,7 @@ Set the environment variable (in `.env` or the shell environment): DOTTY_KID_MODE=false ``` -Restart the bridge: `systemctl restart zeroclaw-bridge` +Restart the bridge container: `docker compose restart bridge` ## What changes (removed) diff --git a/docs/cookbook/llama-swap-concurrent-models.md b/docs/cookbook/llama-swap-concurrent-models.md index 2056e4f..728b640 100644 --- a/docs/cookbook/llama-swap-concurrent-models.md +++ b/docs/cookbook/llama-swap-concurrent-models.md @@ -243,8 +243,7 @@ reasoning-on models). For Qwen3-family models you can also pass ## See also -- [llm-backends.md](../llm-backends.md) — picking among cloud / Ollama / - ZeroClaw backends at the xiaozhi-server slot. +- [llm-backends.md](../llm-backends.md) — picking among `PiVoiceLLM`, `Tier1Slim`, `OpenAICompat`, and other LLM providers at the xiaozhi-server slot. - [run-fully-local.md](./run-fully-local.md) — single-model local setup via the Ollama compose override. - [llama-swap upstream README](https://github.com/mostlygeek/llama-swap) diff --git a/docs/faq.md b/docs/faq.md index 0442232..62c6f81 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -9,7 +9,7 @@ description: Frequently asked questions about hardware, setup, and configuration ### What hardware do I need? -The verified setup is an **M5Stack CoreS3** mounted in the **M5Stack StackChan servo kit** (2x SG90 servos for pan/tilt, 12 RGB LEDs, 3D-printed chassis). 
You also need a Docker-capable host on your LAN (a spare PC or any Linux box with Docker) and a ZeroClaw host for the agent brain. +The verified setup is an **M5Stack CoreS3** mounted in the **M5Stack StackChan servo kit** (2x SG90 servos for pan/tilt, 12 RGB LEDs, 3D-printed chassis). You also need a Docker-capable host on your LAN (a spare PC or any Linux box with Docker) to run the voice and brain containers. See [hardware-support.md](./hardware-support.md) for the full spec table and support tiers. @@ -17,12 +17,13 @@ See [hardware-support.md](./hardware-support.md) for the full spec table and sup ### Can I use a different LLM? -Yes. The LLM is pluggable at two levels: +Yes. The LLM is pluggable via `selected_module.LLM` in `data/.config.yaml`: -1. **At the ZeroClaw level** (the brain): change the `default_model` and provider in ZeroClaw's `config.toml` on the ZeroClaw host. ZeroClaw supports OpenAI-compatible APIs, Anthropic, Gemini, Ollama, and 17+ other backends. Restart the bridge after changing. -2. **At the xiaozhi-server level**: if you want to bypass ZeroClaw entirely, swap the `selected_module` for LLM in `data/.config.yaml` to any of the built-in providers (OpenAI-compatible, Ollama, Dify, FastGPT, etc.). You lose ZeroClaw's agent features (memory, tools, persona files) but gain a simpler stack. +1. **`PiVoiceLLM` (the default)** routes voice turns to the `dotty-pi` container — the pi agent — which runs a local model on llama-swap. To change the model, see [dotty-pi/README.md](../dotty-pi/README.md) for the model-selection rules. +2. **`Tier1Slim`** runs a small/fast model directly against any OpenAI-compatible endpoint. +3. **`OpenAICompat`** points straight at OpenAI, OpenRouter, Ollama, or any OpenAI-compatible API. -The reference config uses Qwen3-30B-A3B via OpenRouter. Any model that handles English well and can follow emoji-prefix instructions will work. Larger models give better persona adherence; smaller models respond faster. 
+See [llm-backends.md](./llm-backends.md) for the full comparison. Any model that handles English well and can follow emoji-prefix instructions will work. Larger models give better persona adherence; smaller models respond faster. --- @@ -32,9 +33,9 @@ Almost, and it can be with two swaps: - **ASR** (speech recognition): already fully local. FunASR runs on your server. - **TTS** (speech synthesis): local if you use Piper TTS. EdgeTTS requires internet (it hits Microsoft's servers). -- **LLM**: cloud by default (OpenRouter). Swap in Ollama pointing at a local model for fully offline inference. +- **LLM**: local by default — the `dotty-pi` agent runs against a local llama-swap model. Cloud is only used if you switch to a cloud backend or turn on smart-mode. -With Piper TTS and a local Ollama model, nothing leaves your LAN. The trade-off is that local LLMs need a GPU or beefy CPU to run at conversational speed. +With Piper TTS and the default local model, nothing leaves your LAN. The trade-off is that local LLMs need a GPU or beefy CPU to run at conversational speed. --- @@ -42,13 +43,11 @@ With Piper TTS and a local Ollama model, nothing leaves your LAN. The trade-off **Hardware (one-time):** - M5Stack StackChan kit: check current pricing on the [M5Stack store](https://shop.m5stack.com/). Expect roughly $60-80 USD for the CoreS3 + servo kit. -- Docker host: whatever you already have. Any machine that can run Docker and has a few GB of RAM. -- ZeroClaw host: any Pi 3 or later. The ZeroClaw binary is ~9 MB and uses <5 MB RAM. +- Docker host: whatever you already have. Any machine that can run Docker (and, for the local LLM, a GPU or beefy CPU). **Recurring:** -- Electricity for the hosts (negligible for most home setups). -- LLM API costs if using a cloud provider. OpenRouter pricing for Qwen3-30B-A3B is on the order of $0.10-0.30 per million tokens — casual household use is pennies per day. 
Check [OpenRouter's pricing page](https://openrouter.ai/qwen/qwen3-30b-a3b-instruct-2507) for current rates. -- $0 if you run a local model via Ollama. +- Electricity for the host (negligible for most home setups). +- LLM API costs **only** if you use a cloud backend or turn on smart-mode — the default local model is free beyond electricity. Cloud backends (OpenRouter, OpenAI, etc.) are pay-per-token. --- @@ -57,12 +56,12 @@ With Piper TTS and a local Ollama model, nothing leaves your LAN. The trade-off **Kid Mode is ON by default** (`DOTTY_KID_MODE=true`). It enforces child-safe guardrails. You can disable it with `DOTTY_KID_MODE=false` for general-purpose use. What Kid Mode enforces: -- Per-turn sandwich enforcement in the bridge forces the LLM to respond in English with an emoji prefix, which limits the scope of unexpected output. -- The ZeroClaw persona files (`SOUL.md`, `IDENTITY.md`) define the robot's personality and boundaries with kid-safe defaults. +- Per-turn sandwich enforcement forces the LLM to respond in English with an emoji prefix, which limits the scope of unexpected output. +- The persona prompt (`personas/dotty_voice.md`) defines the robot's personality and boundaries with kid-safe defaults. - Content and tone are constrained to be age-appropriate. What Kid Mode does **not** do: -- Content-filter the LLM's output at a network level. If the LLM says something inappropriate, the bridge passes it through. +- Content-filter the LLM's output at a network level. If the LLM says something inappropriate, the stack passes it through. - Prevent a determined child from asking adversarial questions. - Guarantee the LLM won't hallucinate inappropriate content (no model can). @@ -72,14 +71,9 @@ This is a self-hosted system — you control the prompt, the model, and every lo ### Can I change the robot's personality? -Yes. The persona is defined in Markdown files on the ZeroClaw host: +Yes. 
The persona is a Markdown file — `personas/dotty_voice.md`, loaded by the active LLM provider. Edit it and restart the relevant container. -- `~/.zeroclaw/workspace/SOUL.md` — core identity and values. -- `~/.zeroclaw/workspace/IDENTITY.md` — name, backstory, role. - -These are hot-read by ZeroClaw — edit them and the next conversation turn picks up the changes, no restart needed. - -There's also a secondary `prompt:` key in `data/.config.yaml` on the server that gets injected as a system message. ZeroClaw's own persona files take precedence, but this is a useful place for voice-pipeline-level hints. +There's also a secondary `prompt:` key in `data/.config.yaml` that gets injected as a system message — a useful place for voice-pipeline-level hints. Full instructions: [cookbook/change-persona.md](./cookbook/change-persona.md). --- @@ -113,4 +107,4 @@ See [hardware-support.md](./hardware-support.md) for the full support matrix. - [troubleshooting.md](./troubleshooting.md) — symptom-based debugging guide. - [SETUP.md](SETUP.md) — deployment guide. -Last verified: 2026-05-17. +Last verified: 2026-05-22. diff --git a/docs/hardware-support.md b/docs/hardware-support.md index ce8192d..972d281 100644 --- a/docs/hardware-support.md +++ b/docs/hardware-support.md @@ -55,7 +55,7 @@ What will likely **not** work without board-specific adaptation: - LED patterns (hardcoded to the kit's 12-LED layout). - MCP tools that touch kit-specific peripherals (head yaw/pitch, LED color, NFC, IR). -If you want to run this stack on a different ESP32-S3 board, you are signing up for firmware-level porting work. The server-side infrastructure (xiaozhi-esp32-server, bridge, ZeroClaw) doesn't care what board is on the other end of the WebSocket. +If you want to run this stack on a different ESP32-S3 board, you are signing up for firmware-level porting work. 
The server-side infrastructure (xiaozhi-esp32-server, bridge, dotty-pi, dotty-behaviour) doesn't care what board is on the other end of the WebSocket. ### Out of scope diff --git a/docs/interaction-map.md b/docs/interaction-map.md index bd75776..492bb29 100644 --- a/docs/interaction-map.md +++ b/docs/interaction-map.md @@ -1,6 +1,6 @@ --- title: Cross-Layer Interaction Map -description: Signal flow between StackChan firmware, xiaozhi-server, and zeroclaw-bridge. +description: Signal flow between StackChan firmware, xiaozhi-server, and the dotty-pi agent. --- # Cross-Layer Interaction Map @@ -11,7 +11,7 @@ One-page reference for every cross-layer signal in the Dotty stack. 1. **StackChan firmware** -- ESP32-S3 (m5stack/StackChan). The physical robot. 2. **xiaozhi-esp32-server** -- Docker on a Linux host. Voice I/O pipeline (ASR, TTS, VAD, emotion parsing). -3. **zeroclaw-bridge** -- FastAPI on ZeroClaw host. LLM brain interface (HTTP to ZeroClaw ACP-over-stdio). +3. **dotty-pi** -- the pi coding agent (Docker container on the same host). The LLM brain; reached by xiaozhi-server's `PiVoiceLLM` provider via `docker exec` pi RPC. (Ambient perception runs in a sibling `dotty-behaviour` container — see [architecture.md](./architecture.md).) --- @@ -29,17 +29,17 @@ One-page reference for every cross-layer signal in the Dotty stack. 
| Signal | Source | Destination | Protocol | Notes | |---|---|---|---|---| -| LLM request | xiaozhi (ZeroClawLLM provider) | bridge | HTTP POST `/api/message/stream` | Carries the user text; bridge wraps it in ACP JSON-RPC to ZeroClaw | -| LLM response | bridge | xiaozhi | NDJSON streaming (HTTP chunked) | Each chunk is one partial sentence; bridge enforces emoji prefix | +| LLM request | xiaozhi (PiVoiceLLM provider) | dotty-pi | `docker exec` pi RPC (JSONL over stdio) | Carries the user text; pi runs the agent loop and tools inside the container | +| LLM response | dotty-pi | xiaozhi | JSONL text chunks over stdio | Only TTS-bound text streams back; tool dispatch stays inside the agent | | Sentence chunks | xiaozhi | TTS then StackChan | Internal then WebSocket Opus | xiaozhi splits response into sentences, synthesizes each, streams audio back | ## Emotion & Expression | Signal | Source | Destination | Protocol | Notes | |---|---|---|---|---| -| Emoji in LLM text | bridge (LLM output) | xiaozhi | First char of NDJSON response text | Three-layer enforcement: ZeroClaw prompt, xiaozhi system prompt, `_ensure_emoji_prefix` fallback | +| Emoji in LLM text | dotty-pi (agent output) | xiaozhi | First char of the response text | Two-layer enforcement: the pi agent persona prompt + the xiaozhi system prompt | | Emotion frame | xiaozhi | StackChan | WebSocket JSON `{"type":"llm","text":"emoji","emotion":"name"}` | Mapped from leading emoji (e.g. 
`😊`=smile, `🤔`=thinking); 9-emoji subset used | -| Thinking emotion | bridge | xiaozhi (forwarded to StackChan) | Emitted before LLM call starts | Shows thinking face while waiting for first token | +| Thinking emotion | xiaozhi-server | StackChan | Emitted before the LLM call starts | Shows thinking face while waiting for first token | | Face animation | StackChan firmware (local) | Avatar renderer (local) | Internal | Firmware maps emotion string to animated face expression | ## MCP Tools diff --git a/docs/kid-mode.md b/docs/kid-mode.md index d6f5a3a..2943738 100644 --- a/docs/kid-mode.md +++ b/docs/kid-mode.md @@ -31,7 +31,7 @@ asks). Only the child-specific rules (4-9) are removed. Both the bridge's `POST /admin/kid-mode` endpoint and the dashboard toggle persist the new value and call `_apply_kid_mode(enabled)`, which atomically re-binds every kid-mode-derived module global (`KID_MODE`, `VISION_SYSTEM_PROMPT`, `MCP_TOOL_DENYLIST`, `VOICE_TURN_SUFFIX`, `VOICE_TURN_SUFFIX_SHORT`) in a single store-global pass. Readers see either the old or new value, never a torn intermediate, and the cost per turn is unchanged. **No daemon restart is required** to flip kid-mode at runtime. -The xiaozhi-server side of kid-mode lives in the active LLM provider's persona / suffix. For `Tier1Slim`, `KID_MODE` is read at module import and baked into `_TURN_SUFFIX`; a flip there does currently require a container restart to take effect on Tier1Slim's side (the bridge side rebinds instantly, but the suffix already loaded into the live `Tier1Slim` instance is unchanged). For `ZeroClawLLM`, the suffix is generated per-turn by the bridge so the flip lands on the very next turn with no restart at all. +The xiaozhi-server side of kid-mode lives in the active LLM provider's persona / suffix. 
For `Tier1Slim`, `KID_MODE` is read at module import and baked into `_TURN_SUFFIX`; a flip there does currently require a container restart to take effect on Tier1Slim's side (the bridge side rebinds instantly, but the suffix already loaded into the live `Tier1Slim` instance is unchanged). For `PiVoiceLLM`, the persona is loaded per-session by the `dotty-pi` agent, so the flip lands on the very next turn with no restart at all. ## Guardrail details @@ -47,12 +47,12 @@ speaker. Each layer reinforces the same rules so that a failure in one layer is caught by the next. > **Provider-dependent layering.** The exact layering depends on which LLM provider is active: -> - **`Tier1Slim`** (current default) — Layer 1 is `personas/dotty_voice.md`; Layer 2 is **skipped** (Tier1Slim deliberately discards xiaozhi's top-level `prompt:` because the 4 B chat template only honours one system message); Layer 3 is `_TURN_SUFFIX` appended per-turn by Tier1Slim itself (read from `build_turn_suffix(KID_MODE)` at module import). Layer 3b/3c only apply to escalated tool calls that pass through the bridge. -> - **`ZeroClawLLM`** (legacy) — all three layers fire on every turn as described below. +> - **`PiVoiceLLM`** (current default) — Layer 1 is `personas/dotty_voice.md` (loaded by the `dotty-pi` agent); Layer 2 is the `prompt:` block in `.config.yaml` injected by xiaozhi-server; Layer 3 enforcement (suffix sandwich, emoji fallback, content filter) was part of the retired ZeroClaw bridge and is not present on the `PiVoiceLLM` path — Layers 1 and 2 are load-bearing. +> - **`Tier1Slim`** (alternate) — Layer 1 is `personas/dotty_voice.md`; Layer 2 is **skipped** (Tier1Slim deliberately discards xiaozhi's top-level `prompt:` because the 4 B chat template only honours one system message); Layer 3 is `_TURN_SUFFIX` appended per-turn by Tier1Slim itself (read from `build_turn_suffix(KID_MODE)` at module import). Layer 3b/3c only apply to escalated tool calls that pass through the bridge. 
-### Layer 1 -- ZeroClaw Agent Prompt (ZeroClaw host) +### Layer 1 -- Agent Persona Prompt (dotty-pi container) -The ZeroClaw agent's own persona prompt sets the baseline: stay cheerful, +The `dotty-pi` agent's persona prompt (`personas/dotty_voice.md`) sets the baseline: stay cheerful, age-appropriate, begin every reply with an emoji. This is the "inner" system prompt that the LLM sees at the top of its context. @@ -73,10 +73,11 @@ prompt: | markdown, no code blocks. ``` -### Layer 3 -- Bridge Prefix + Suffix Sandwich (ZeroClaw host, `bridge.py`) +### Layer 3 -- Bridge Prefix + Suffix Sandwich (`bridge.py` / `Tier1Slim`) -This is the strongest enforcement layer. Every turn on the `stackchan` -channel is wrapped in a prefix and a suffix before being sent to the LLM: +> **Note:** This layer applied to the retired `ZeroClawLLM` path and still applies to the `Tier1Slim` alternate provider. On the current default `PiVoiceLLM` path, Layers 1 and 2 are the active enforcement layers. + +On the `Tier1Slim` path, every turn is wrapped in a prefix and a suffix before being sent to the LLM: ``` VOICE_TURN_PREFIX + context + user_message + suffix @@ -88,16 +89,12 @@ constraints in the suffix are the last thing the model reads before generating its reply, making them the hardest to override. **Per-session suffix caching.** The full ~600-token suffix -(`VOICE_TURN_SUFFIX`) is sent on the first turn of each ACP session. +(`VOICE_TURN_SUFFIX`) is sent on the first turn of each session. Subsequent turns receive a shorter reminder (`VOICE_TURN_SUFFIX_SHORT`) that explicitly restates the English-only, emoji-leader, and child-safe constraints. This saves ~550 tokens per turn while the full rules remain in the LLM's -conversation history from turn 0. Sessions rotate on idle timeout (5 min), -turn count (50), or wall-clock age (30 min), at which point the full suffix -is re-sent.
The suffix choice is made inside `app_lock` (via a `prepare` -callback on `ACPClient.prompt()`) to avoid a TOCTOU race where session -rotation could send a short suffix on a fresh session's first turn. +conversation history from turn 0. **Why a suffix, not just a system prompt?** System prompts are seen once and can be diluted by long conversations. The suffix is re-injected on every @@ -236,12 +233,10 @@ The emoji that begins each reply is not decorative -- the StackChan firmware parses it into a facial expression on the robot's screen. If the emoji is missing, the face stays blank. Three layers enforce it: -1. **ZeroClaw agent prompt** -- tells the model to begin with an emoji. +1. **Agent persona prompt** (`personas/dotty_voice.md`, loaded by `dotty-pi`) -- tells the model to begin with an emoji. 2. **xiaozhi-server system prompt** (`.config.yaml` `prompt:` block) -- repeats the rule with the exact emoji set. -3. **`_ensure_emoji_prefix` in `bridge.py`** -- programmatic fallback. If - the first non-whitespace character is not one of the nine allowed emojis, - the neutral face `😐` is prepended. +3. **`_ensure_emoji_prefix` in `bridge.py`** -- programmatic fallback available on the `Tier1Slim` path only: if the first non-whitespace character is not one of the nine allowed emojis, the neutral face `😐` is prepended. On the default `PiVoiceLLM` path this fallback does not run, so Layers 1 and 2 are load-bearing. Allowed emojis and their face mappings: @@ -271,7 +266,7 @@ than exposing raw error text or going silent: | Failure mode | Response | |---|---| | LLM timeout | `😐 I'm thinking too slowly right now, try again.` | -| ZeroClaw binary missing | `😐 My AI brain is offline.` | +| dotty-pi container unavailable | `😐 My AI brain is offline.` | | Any other exception | `😐 Something went wrong, please try again.` | | Empty LLM response | `😐 (no response)` | @@ -310,7 +305,7 @@ inappropriate content through).
| Fail-safe error responses | `bridge.py` | Exception handlers in both endpoint handlers | | Allowed emoji list | `bridge.py` | `ALLOWED_EMOJIS`, `FALLBACK_EMOJI` | | xiaozhi system prompt | `.config.yaml` | Top-level `prompt:` block | -| LLM provider system prompt | `.config.yaml` | `LLM.ZeroClawLLM.system_prompt` | +| LLM provider system prompt | `personas/dotty_voice.md` | loaded by `dotty-pi` agent | --- @@ -345,7 +340,7 @@ refuse+redirect, refuse+log, and refuse+alert. The current system uses the same LLM for all channels. A planned improvement is to route the `stackchan` channel to a model with stronger built-in safety -(e.g., Claude Haiku) via ZeroClaw's model routing, as an additional layer. +(e.g., Claude Haiku), as an additional layer. --- @@ -353,13 +348,7 @@ is to route the `stackchan` channel to a model with stronger built-in safety ### Modifying the Topic Blocklist -Edit `VOICE_TURN_SUFFIX` in `bridge.py` (lines 25-46). The blocked -topics are in rule 5, as a bulleted list. Add or remove entries, then -restart the bridge service: - -```bash -systemctl restart zeroclaw-bridge -``` +Edit `VOICE_TURN_SUFFIX` in `bridge.py` (lines 25-46) for the `Tier1Slim` path, or edit rule 5 in `personas/dotty_voice.md` for the `PiVoiceLLM` path. After editing, restart the relevant container. ### Changing the Self-Harm Response diff --git a/docs/latent-capabilities.md b/docs/latent-capabilities.md index e240dc2..a211f53 100644 --- a/docs/latent-capabilities.md +++ b/docs/latent-capabilities.md @@ -44,31 +44,31 @@ Features xiaozhi-esp32-server supports upstream that aren't turned on or surface | **Custom wake word** | Replace/add to the stock wake word via ESP-SR MultiNet | Low | **New-task candidate** | | **Voiceprint speaker ID** | Distinguish family members; apply per-user persona/context | Medium | Cross-refs child-safety task (different guardrails for kids vs adults) | | **xiaozhi-server VLLM module** | Server-side "What's in this photo?" 
pipeline | Medium | Already covered by the bridge-side `take_photo` + VLM long-poll path described in [`modes.md`](./modes.md#vision); this row tracks the *upstream* xiaozhi-server VLLM module, which we don't enable. | -| **PowerMem** | Dual-layer short-term + summarized memory (currently ZeroClaw owns memory) | Low | Would overlap with ZeroClaw's memory — probably don't | +| **PowerMem** | Dual-layer short-term + summarized memory (currently the `dotty-pi` agent owns memory via its FTS brain.db) | Low | Would overlap with the pi agent's memory — probably don't | | **Intent router** (`function_call` mode) | Route simple commands (turn off lights, set timer) without round-tripping to the LLM | Medium | **New-task candidate** | | **RagFlow knowledge base** | Retrieval-augmented responses against a household doc store | Low | **New-task candidate** | -| **Multi-device routing** | Run the StackChan as one of several voice surfaces on the same ZeroClaw brain | Low | Needs the full-module deployment (DB-backed) | +| **Multi-device routing** | Run the StackChan as one of several voice surfaces on the same pi agent brain | Low | Needs the full-module deployment (DB-backed) | | **Piper streaming synthesis** | Lower first-audio latency than the current batch synthesis | Medium | `ROADMAP.md` → "Reduce first-audio latency" | | **ffmpeg post-processing on TTS** | Robot-voice character via ring modulator / bitcrush / vocoder | Medium | `ROADMAP.md` → "TTS provider swap — robot-sounding voice" | ## Brain — unused -ZeroClaw + Qwen3 + OpenRouter features that could be wired into the bridge. +`dotty-pi` (pi agent + qwen3.5:4b on llama-swap) + `dotty-pi-ext` features that could be wired up. 
| Capability | What it unlocks | Priority | Cross-ref | |---|---|---|---| -| **ACP `session/update` streaming** | First-token TTS instead of waiting for the full response (perceived-latency win) | **High** | `ROADMAP.md` → "Reduce first-audio latency" | -| **Long-lived ZeroClaw sessions** | Skip `session/new` per turn — carry context across turns within a conversation | Medium | `ROADMAP.md` → "Reduce first-audio latency" (ACP session overhead) | -| **`session/request_permission`** | Bridge confirms tool calls before they execute — useful for child-safety. Bridge now auto-approves (2026-04-25); tool allowlist for child-safety is a follow-up. | Medium | `ROADMAP.md` → "Lock down for child-safe operation" | -| ~~**Qwen3 function-calling / tool-use**~~ | **Wired up (2026-04-25).** ZeroClaw auto-approves tools in `auto_approve` list and sends tool execution as `session/event` notifications. Bridge logs tool calls at INFO level. Works for `weather`, `web_search_tool`, `calculator`, etc. | ~~Medium~~ Done | — | -| **ZeroClaw MCP-server mode** | Expose ZeroClaw's tools/memory to other MCP clients | Low | **New-task candidate** | -| **Qwen3 `role: "system"` injection** | Move the English+emoji constraints into a proper system message instead of a prompt prefix/suffix; better MoE adherence | Medium | Rework of bridge's wrapping logic | -| **Qwen3 extended context (256K native)** | Keep long conversation history / memory verbatim instead of summarising | Low | Costs more tokens per turn — probably not worth it yet | -| **OpenRouter latency/cost dashboard** | Observability beyond the local `state/costs.jsonl` | Low | Already available — just point a browser at it | -| **OpenRouter failover / multi-model** | A/B a smaller faster model for voice turns specifically | Medium | `ROADMAP.md` → "Reduce first-audio latency" (smaller model for voice) | -| **ZeroClaw cost/trace surfacing** | Expose `state/costs.jsonl` + `runtime-trace.jsonl` via the bridge `/health` or a new `/stats` 
endpoint | Low | **New-task candidate** | -| **ZeroClaw cron scheduler** | The robot could say "good morning" on a schedule, not just on demand | Low | **New-task candidate** | +| **Streaming first-token to TTS** | First-token TTS instead of waiting for the full response (perceived-latency win) | **High** | `ROADMAP.md` → "Reduce first-audio latency" | +| **Long-lived pi agent sessions** | Carry context across turns within a conversation without re-loading the persona each time | Medium | `ROADMAP.md` → "Reduce first-audio latency" | +| **Tool pre-approval gate** | Bridge confirms tool calls before they execute — useful for child-safety. | Medium | `ROADMAP.md` → "Lock down for child-safe operation" | +| ~~**Tool-use**~~ | **Wired up.** The `dotty-pi-ext` extension exposes 5 voice tools (`memory_lookup`, `remember`, `think_hard`, `take_photo`, `play_song`). | Done | — | +| **pi agent MCP-server mode** | Expose the agent's tools/memory to other MCP clients | Low | **New-task candidate** | +| **Qwen3 `role: "system"` injection** | Move the English+emoji constraints into a proper system message instead of a prompt prefix/suffix; better MoE adherence | Medium | Rework of persona prompt structure | +| **Qwen3 extended context (96K native)** | Keep long conversation history / memory verbatim instead of summarising | Low | Costs more tokens per turn — probably not worth it yet | +| **llama-swap latency/cost dashboard** | Observability into per-turn inference cost on the local model | Low | **New-task candidate** | +| **Model A/B for voice turns** | Test a smaller/faster model for chitchat, escalate to 27B-think only when needed | Medium | `ROADMAP.md` → "Reduce first-audio latency" | +| **Per-turn cost/trace surfacing** | Expose pi agent trace data via the bridge `/health` or a new `/stats` endpoint | Low | **New-task candidate** | +| **pi agent cron scheduler** | The robot could say "good morning" on a schedule, not just on demand | Low | **New-task candidate** | ## 
Cross-cutting — observability @@ -82,7 +82,7 @@ None of these are feature requests — they're gaps in what we can *see* about t | Per-turn cost breakdown | Whether Qwen3 via OpenRouter is cheaper than a smaller local model | | Per-session trace diff | Whether English-sandwich is still needed after a hypothetical model upgrade | -These are all feeders for the **`ROADMAP.md`** "Map the ZeroClaw ↔ xiaozhi-server ↔ StackChan firmware interaction" backlog item. +These are all feeders for the **`ROADMAP.md`** "Map the dotty-pi ↔ xiaozhi-server ↔ StackChan firmware interaction" backlog item. ## Prioritisation rule of thumb @@ -99,7 +99,7 @@ These are all feeders for the **`ROADMAP.md`** "Map the ZeroClaw ↔ xiaozhi-ser - [ROADMAP.md](ROADMAP.md) — live backlog; this file is a *source* for it, not a replacement. - [hardware.md](./hardware.md) — what the hardware features actually are. - [voice-pipeline.md](./voice-pipeline.md) — what the server supports upstream. -- [brain.md](./brain.md) — what ZeroClaw/Qwen/OpenRouter expose. +- [brain.md](./brain.md) — what the pi agent / Qwen / llama-swap expose. - [references.md](./references.md) — upstream source for every capability claim. Last verified: 2026-05-17. diff --git a/docs/llm-backends.md b/docs/llm-backends.md index f12ff6f..d9ebae5 100644 --- a/docs/llm-backends.md +++ b/docs/llm-backends.md @@ -1,6 +1,6 @@ --- title: Choose Your LLM Backend -description: Side-by-side comparison of LLM backend options for Dotty. +description: Side-by-side comparison of LLM backend options for Dotty — PiVoiceLLM (default), Tier1Slim, OpenAICompat, and llama-swap. --- # Choose Your LLM Backend @@ -11,17 +11,17 @@ and the matching block under `LLM:` in `.config.yaml`. 
## Comparison -| | OpenAI-compatible API | llama-swap (local, multi-model) | Tier1Slim (two-tier voice) | ZeroClaw (single-tier agent) | +| | OpenAI-compatible API | llama-swap (local, multi-model) | Tier1Slim (two-tier voice) | PiVoiceLLM (pi agent — default) | |---|---|---|---|---| -| **Provider key** | `OpenAICompat` | `OpenAICompat` | `Tier1Slim` | `ZeroClawLLM` | -| **Runs where** | Cloud (OpenRouter, OpenAI, etc.) | Local GPU host (Docker, llama.cpp) | Inner loop on llama-swap; escalations through the bridge | ZeroClaw host or server | -| **Latency** | 300-800 ms (network-bound) | 200-600 ms (GPU-bound; `qwen3.5:4b` warm <500 ms) | <500 ms plain chat; +bridge round-trip on tool calls | 500-1500 ms (full agent overhead on every turn) | +| **Provider key** | `OpenAICompat` | `OpenAICompat` | `Tier1Slim` | `PiVoiceLLM` | +| **Runs where** | Cloud (OpenRouter, OpenAI, etc.) | Local GPU host (Docker, llama.cpp) | Inner loop on llama-swap; tool escalation via bridge | dotty-pi container on the Docker host | +| **Latency** | 300-800 ms (network-bound) | 200-600 ms (GPU-bound; `qwen3.5:4b` warm <500 ms) | <500 ms plain chat; +bridge round-trip on tool calls | 500-1500 ms (pi agent turn overhead) | | **Cost** | Pay-per-token | Free (electricity + hardware) | Free for inner loop; pay-per-token in smart mode | Free (electricity + hardware) | -| **Privacy** | Tokens sent to cloud provider | Fully local, nothing leaves LAN | Fully local for plain turns; cloud only when smart_mode is on | Fully local (if local LLM backend) | -| **Setup complexity** | Low — API key + model name | Medium — GPU, Docker, GGUF download | Medium — llama-swap + Tier1Slim block; bridge for escalations | High — ZeroClaw install, bridge, systemd | -| **Memory / tools** | None | None | `memory_lookup` / `think_hard` / `take_photo` / `play_song` via escalation | Yes — persistent memory, 70+ tools, MCP | -| **Hot-swappable** | Restart container | Restart container | **Yes** — `set_runtime()` mutates 
the live provider; smart-mode flip is instant | No — daemon restart on model swap | -| **Best for** | Quick start, best-in-class models | Privacy + concurrent multi-model serving | Default for snappy voice; agent features only when needed | Always-on agentic features, deep tool use | +| **Privacy** | Tokens sent to cloud provider | Fully local, nothing leaves LAN | Fully local for plain turns; cloud only when smart_mode is on | Fully local | +| **Setup complexity** | Low — API key + model name | Medium — GPU, Docker, GGUF download | Medium — llama-swap + Tier1Slim block | Medium — dotty-pi container + llama-swap | +| **Memory / tools** | None | None | Chitchat-only post-cutover (escalation endpoint non-functional) | Yes — memory_lookup, remember, think_hard, take_photo, play_song | +| **Hot-swappable** | Restart container | Restart container | **Yes** — `set_runtime()` mutates the live provider; smart-mode flip is instant | Restart container | +| **Best for** | Quick start, best-in-class models | Privacy + concurrent multi-model serving | Chitchat-only fallback; no tool calls | **Default — snappy voice with full tool support** | ## 1. OpenAI-compatible API @@ -76,18 +76,11 @@ LLM: Caveats when running Anthropic-only (no OpenRouter): -- **Vision intents** (`take_photo`) go through the bridge's `_call_vision_api`, - which reads `VLM_API_KEY` → `VISION_API_KEY` → `OPENROUTER_API_KEY` in turn - and defaults to OpenRouter for the upload. Point those env vars at your - Anthropic key and set the bridge's VLM model+URL env to Anthropic's - endpoint to keep vision working without OpenRouter. -- **Smart-mode escalation** defaults to `anthropic/claude-sonnet-4-6` via - OpenRouter — flip `SMART_MODEL` in `zeroclaw-bridge.service` to a bare - Anthropic model id and `VOICE_CLOUD_PROFILE_KEY` to - `custom:https://api.anthropic.com/v1` to route smart-mode there too. +- **Vision intents** (`take_photo`) rely on the VLM call path. 
Point the + bridge's VLM env vars (`VLM_API_KEY`, `VLM_MODEL`, `VLM_URL`) at your + Anthropic key and endpoint to keep vision working without OpenRouter. - The compat shim doesn't support every OpenAI option (streaming and tools - work; `logprobs`, `seed`, etc. don't). Tier1Slim's `think_hard` / - `memory_lookup` tool calls go through the bridge, so they're unaffected. + work; `logprobs`, `seed`, etc. don't). ## 2. llama-swap (local, multi-model) @@ -135,11 +128,13 @@ LLM: - No memory between sessions — stateless like the cloud option. - If you don't need concurrent multi-model serving, Ollama is the simpler single-binary alternative. -## 3. Tier1Slim (two-tier voice — current default) +## 3. Tier1Slim (two-tier voice — alternate) -The default in the shipped `.config.yaml`. A small, fast model (`qwen3.5:4b` against llama-swap) handles every plain conversational turn without involving the bridge. When the model emits a structured `tool_call`, the provider escalates to `POST /api/voice/escalate` and the bridge dispatches the tool (ZeroClaw memory for `memory_lookup`, `qwen3.6:27b-think` for `think_hard`, the VLM for `take_photo`, or `/xiaozhi/admin/play-asset` for `play_song`). +An alternate voice backend. A small, fast model (`qwen3.5:4b` against llama-swap) handles every plain conversational turn without involving the bridge. When the model emits a structured `tool_call`, the provider escalates to `POST /api/voice/escalate` on the bridge. -Smart-mode flips repoint the inner loop at a cloud model (default `anthropic/claude-sonnet-4-6`) via in-process `set_runtime()` — no docker restart and no daemon restart. +**Post-cutover caveat:** `POST /api/voice/escalate` was served by the ZeroClaw bridge voice path, which was retired in issue #36 (2026-05-19). The escalation endpoint is non-functional in the current stack. 
Tier1Slim is therefore a **chitchat-only rollback path** — plain conversational turns work, tool calls (`memory_lookup`, `think_hard`, `take_photo`, `play_song`) do not. Use `PiVoiceLLM` (the default) for full tool support. + +Smart-mode flips repoint the inner loop at a cloud model (default `anthropic/claude-sonnet-4-6`) via in-process `set_runtime()` — no docker restart. ### `.config.yaml` snippet @@ -159,73 +154,67 @@ LLM: timeout: 60 ``` -Plus environment variables (consumed by the bridge for smart-mode dispatch): - -``` -DOTTY_VOICE_PROVIDER=tier1slim -TIER1SLIM_CLOUD_API_KEY=sk-or-... # required for OFF→ON smart-mode flip -``` - Full reference: [tier1slim.md](./tier1slim.md). ### Notes -- The inner loop bypasses the bridge entirely on plain turns, so `bridge.py` going down doesn't break chitchat (only tool calls fail). -- `set_runtime()` lets the bridge hot-swap the live provider — used for smart-mode flips and would also support per-time-of-day model selection in future. +- The inner loop bypasses the bridge entirely on plain turns, so chitchat works even if `bridge.py` is unreachable. +- `set_runtime()` lets the bridge hot-swap the live provider — used for smart-mode flips. - Persona uses `personas/dotty_voice.md`; the top-level `prompt:` block is deliberately ignored because the 4 B chat template only honours one system message. -## 4. ZeroClaw (always-on single-tier agent) +## 4. PiVoiceLLM (pi agent — default) + +The default in the shipped `.config.yaml`. The `PiVoiceLLM` provider routes each voice turn to the **dotty-pi container** — the pi coding agent running on the same Docker host as xiaozhi-server. + +`PiClient` drives the agent by running `docker exec -i dotty-pi pi --mode rpc …` and exchanging JSONL messages over its stdin/stdout. 
The agent's outer loop uses `qwen3.5:4b` on local llama-swap for fast chitchat and loads the **dotty-pi-ext extension**, which exposes five voice-focused tools: + +| Tool | Purpose | +|---|---| +| `memory_lookup` | Recall a fact from past conversations (FTS on brain.db) | +| `remember` | Stash a new fact into brain.db | +| `think_hard` | Escalate a hard question to `qwen3.6:27b-think` | +| `take_photo` | Describe what Dotty's camera sees via a VLM | +| `play_song` | Play a song through the speaker | -The `ZeroClawLLM` provider routes through the FastAPI bridge on the ZeroClaw host into a long-running ZeroClaw agent process. ZeroClaw handles its own LLM calls (to OpenRouter, Ollama, or any supported provider), persistent memory, tool execution, and MCP integration. Every voice turn round-trips through ZeroClaw — heavier than Tier1Slim, but you get the full agent loop on every turn whether you need it or not. +Only TTS-bound text streams back to xiaozhi-server — tool results stay internal to the agent loop. ### Prerequisites -- ZeroClaw installed on the ZeroClaw host (or another host): `cargo install zeroclaw`. -- `bridge.py` running as a systemd service (`zeroclaw-bridge.service`). -- Persona configured in `~/.zeroclaw/workspace/` (`SOUL.md`, `IDENTITY.md`, etc.). +- dotty-pi container running on the Docker host. +- llama-swap running and reachable by the dotty-pi container (`qwen3.5:4b` for the outer loop; `qwen3.6:27b-think` for `think_hard`). ### `.config.yaml` snippet ```yaml selected_module: - LLM: ZeroClawLLM + LLM: PiVoiceLLM LLM: - ZeroClawLLM: - type: zeroclaw - url: http://:8080/api/message/stream - channel: dotty - timeout: 90 - system_prompt: | - You are , a desktop robot (StackChan body). Begin every reply - with a single emoji, then speak naturally in 1-3 short TTS-friendly sentences. + PiVoiceLLM: + type: pi_voice + container_name: dotty-pi ``` ### Notes -- Higher latency because ZeroClaw may invoke tools or consult memory before - replying. 
The `timeout: 90` accommodates this. -- The bridge enforces an English + emoji sandwich around every turn to prevent - Qwen3's Chinese-leak tendency (see [brain.md](./brain.md)). -- Persistent memory (SQLite-backed) means the robot remembers across sessions. -- Supports 70+ built-in tools plus any MCP servers you connect. -- Set `DOTTY_VOICE_PROVIDER=zeroclaw` (the default) so smart-mode flips know to rewrite ZeroClaw's `config.toml` rather than Tier1Slim's runtime. +- Higher latency than a raw llama-swap call because the pi agent loop adds overhead — tool-using turns are slower than plain chitchat. +- Persistent memory (`brain.db`, FTS5) means the robot remembers across sessions. +- All four server-side services (xiaozhi-server, dotty-pi, dotty-behaviour, bridge.py) run as Docker containers on the same host — no separate "brain host" required. ## Switching backends 1. Edit `.config.yaml` — change `selected_module.LLM` and the relevant `LLM:` block. -2. If you're switching the smart-mode dispatch path, also set `DOTTY_VOICE_PROVIDER` (`tier1slim` or `zeroclaw`) in the bridge's systemd unit env block. -3. Restart xiaozhi-server: `docker compose restart xiaozhi-server`. -4. Test with a voice command or `curl` to the bridge endpoint. +2. Restart xiaozhi-server: `docker compose restart xiaozhi-server`. +3. Test with a voice command or a `curl` to the health endpoint. -All four `LLM:` blocks can coexist in the config; only the one named in `selected_module.LLM` is active. +All `LLM:` blocks can coexist in the config; only the one named in `selected_module.LLM` is active. ## See also -- [tier1slim.md](./tier1slim.md) — the default voice path in detail. -- [brain.md](./brain.md) — model matrix and ZeroClaw architecture. +- [tier1slim.md](./tier1slim.md) — Tier1Slim alternate voice path in detail. +- [brain.md](./brain.md) — model matrix and dotty-pi agent architecture. - [voice-pipeline.md](./voice-pipeline.md) — ASR, TTS, and VAD modules. 
- [architecture.md](./architecture.md) — how the LLM slot fits into the full pipeline. - [cookbook/llama-swap-concurrent-models.md](./cookbook/llama-swap-concurrent-models.md) — running multiple resident models on one GPU. -Last verified: 2026-05-17. +Last verified: 2026-05-22. diff --git a/docs/modes.md b/docs/modes.md index f103da4..f563574 100644 --- a/docs/modes.md +++ b/docs/modes.md @@ -30,7 +30,7 @@ The firmware boots into `idle` with both toggles **off**. The bridge resyncs tog **Speech sub-states** are conveyed by face animations (eye gestures, talking mouth) and the dedicated **listening pixel** at right-ring index 11. `thinking` and `speaking` have no LED — they live on the face. `listening` lights pixel 11 red so the user knows when their voice is being captured as a turn. -**Smart-mode flip is in-process** when `DOTTY_VOICE_PROVIDER=tier1slim` (the recommended default): the bridge calls `/xiaozhi/admin/set-tier1slim-model` to mutate the live Tier1Slim provider's model/url/api_key with no docker restart. Legacy `=zeroclaw` still rewrites ZeroClaw's `config.toml` and restarts the daemon. +**Smart-mode flip is in-process** when `DOTTY_VOICE_PROVIDER=tier1slim`: the bridge calls `/xiaozhi/admin/set-tier1slim-model` to mutate the live Tier1Slim provider's model/url/api_key with no docker restart. For the default `PiVoiceLLM` path, smart-mode model-swap is v2 scope (see `docs/cutover-behaviour.md`). --- @@ -39,8 +39,8 @@ The firmware boots into `idle` with both toggles **off**. The bridge resyncs tog | State | LED arc (left ring 0-5) | Idle profile | Behaviour | Backing path | |---|---|---|---|---| | `idle` | off `(0,0,0)` | NORMAL | Ambient awareness, gentle idle motion. Default. | n/a (no chat in flight) | -| `talk` | dim green `(0,60,0)` | NORMAL (face_tracking overlay active) | Conversation engaged. Listening pixel (right 11) lights red while the user has the turn; `thinking` and `speaking` are face-animation only. 
| xiaozhi → Tier1Slim → llama-swap (default), or xiaozhi → ZeroClawLLM → bridge → ACP (legacy) | -| `story_time` | warm `(100,40,0)` | NORMAL | Long-running interactive story. Bridge bypasses ZeroClaw, calls OpenRouter directly with story persona + rolling context. | bridge → direct OpenRouter (Phase 7 pending) | +| `talk` | dim green `(0,60,0)` | NORMAL (face_tracking overlay active) | Conversation engaged. Listening pixel (right 11) lights red while the user has the turn; `thinking` and `speaking` are face-animation only. | xiaozhi → PiVoiceLLM → dotty-pi (default), or xiaozhi → Tier1Slim → llama-swap (alternate) | +| `story_time` | warm `(100,40,0)` | NORMAL | Long-running interactive story. Bridge calls OpenRouter directly with story persona + rolling context, bypassing the standard LLM provider. | bridge → direct OpenRouter (Phase 7 pending) | | `security` | white `(80,80,80)` **flashing 1 Hz** across all 6 left pixels (`kSecurityFlashHalfMs = 500`) | SURVEILLANCE | Wide deliberate scan, serious face, periodic photo + audio capture. No proactive greet. | bridge ambient task (Phase 6 partial) | | `sleep` | very dim blue `(0,0,16)` | SLEEPY | Head face-down + centred, servo torque off (with `kSleepTorqueReleaseTimeoutMs = 3000` fallback), sleeping emoji on screen, ambient awareness paused. Wakes on face / voice / head-pet. | firmware-only quiescence (Phase 5) | | `dance` | rainbow sweep (left ring) | NORMAL | Transient performance — choreography + audio. Pre-existing dance handler. | `receiveAudioHandle.py::_handle_dance` | @@ -67,8 +67,8 @@ The `idle → talk` trigger is the firmware `face_detected` event (any face, fam | Toggle | Toggle pip (right ring) | What it does | Persistence | |---|---|---|---| -| `kid_mode` | salmon pink `(220, 80, 80)` at index **8** (G == B so PY32 RGB565 quantization stays warm) | Guardrails only — content sandwich, camera tools denied, kid-safe persona. Does not pick the model. 
Bridge-side hot-reload via `_apply_kid_mode()` (no daemon restart). | `/root/zeroclaw-bridge/state/kid-mode` | -| `smart_mode` | orange `(168, 80, 0)` at index **9** | Voice-LLM model selector. ON → `SMART_MODEL` (`claude-sonnet-4-6` by default) via OpenRouter; OFF → local default. Flip is in-process when `DOTTY_VOICE_PROVIDER=tier1slim`; daemon-restart when `=zeroclaw`. | `/root/zeroclaw-bridge/state/smart-mode` | +| `kid_mode` | salmon pink `(220, 80, 80)` at index **8** (G == B so PY32 RGB565 quantization stays warm) | Guardrails only — content sandwich, camera tools denied, kid-safe persona. Does not pick the model. Bridge-side hot-reload via `_apply_kid_mode()` (no daemon restart). | `bridge` container state file | +| `smart_mode` | orange `(168, 80, 0)` at index **9** | Voice-LLM model selector. ON → `SMART_MODEL` (`claude-sonnet-4-6` by default) via OpenRouter; OFF → local default. Flip is instantaneous when `DOTTY_VOICE_PROVIDER=tier1slim` (in-process hot-swap); v2 scope for `PiVoiceLLM`. | `bridge` container state file | The two toggles are orthogonal — they compose freely. `kid_mode = on` AND `smart_mode = on` runs the smart model behind the kid-safe sandwich. Both toggles are sticky across turns, daemon restarts, and reboots. @@ -152,7 +152,7 @@ Both `kid_mode` and `smart_mode` are voice-untoggleable — they are guardian-co | Endpoint | Body | Effect | Where | |---|---|---|---| | `POST /admin/kid-mode` | `{"enabled": bool}` | Persists + hot-reloads kid-mode globals atomically via `_apply_kid_mode()`. No daemon restart. Also pushes the kid pip via xiaozhi `/xiaozhi/admin/set-toggle`. | bridge (localhost-only) | -| `POST /admin/smart-mode` | `{"enabled": bool, "device_id": ""}` | Persists + flips voice provider's model. When `DOTTY_VOICE_PROVIDER=tier1slim`: in-process hot-swap via `/xiaozhi/admin/set-tier1slim-model`. When `=zeroclaw`: rewrites `config.toml` + restarts daemon. Also pushes the smart pip. 
| bridge (localhost-only) | +| `POST /admin/smart-mode` | `{"enabled": bool, "device_id": ""}` | Persists + flips voice provider's model. When `DOTTY_VOICE_PROVIDER=tier1slim`: in-process hot-swap via `/xiaozhi/admin/set-tier1slim-model`. For `PiVoiceLLM`: model-swap is v2 scope. Also pushes the smart pip. | bridge (localhost-only) | | `POST /xiaozhi/admin/set-state` | `{"state": "", "device_id": ""}` | Dispatches MCP `self.robot.set_state` onto the device WS; firmware StateManager applies it. | xiaozhi-server | | `POST /xiaozhi/admin/set-toggle` | `{"name": "kid_mode\|smart_mode", "enabled": bool, "device_id": ""}` | Dispatches MCP `self.robot.set_toggle`; firmware StateManager updates the pip without disturbing the active state. | xiaozhi-server | | `POST /xiaozhi/admin/set-face-identified` | `{"device_id": ""}` | Lights the face-identified pixel green; refresh required every < `kFaceIdentifiedTimeoutMs` (4 s) to hold. | xiaozhi-server | @@ -172,13 +172,13 @@ Both `kid_mode` and `smart_mode` are voice-untoggleable — they are guardian-co | State | Voice path | Memory? | Tools? | |---|---|---|---| | `idle` | n/a | n/a | n/a | -| `talk` | xiaozhi → Tier1Slim → llama-swap (default), or xiaozhi → ZeroClawLLM → bridge → ZeroClaw ACP (legacy). Smart-mode swaps the inner-loop model. | yes (FTS via `memory_lookup` tool / full ZeroClaw memory) | yes (4-tool Tier1 catalogue / full ZeroClaw MCP) | +| `talk` | xiaozhi → PiVoiceLLM → dotty-pi (default), or xiaozhi → Tier1Slim → llama-swap (alternate). Smart-mode swaps the inner-loop model. 
| yes (FTS via `memory_lookup` / `remember` tools in dotty-pi-ext) | yes (5-tool dotty-pi-ext catalogue) | | `story_time` | xiaozhi → bridge → direct OpenRouter (story persona overlay + rolling context) | per-session list (Phase 7) | no | | `security` | bridge ambient task (no voice path active) | logs to journal | photo + audio capture | | `sleep` | mic stays on for "wake up"; no LLM round-trip | n/a | n/a | | `dance` | bridge handler dispatches choreography + audio file | n/a | dance MCP | -`smart_mode` flips the inner-loop model and is sticky across turns. With `DOTTY_VOICE_PROVIDER=tier1slim` (the recommended default) the flip is instantaneous — Tier1Slim's `set_runtime()` mutates the live provider; no docker restart and no daemon restart. `story_time` is the only voice path that bypasses both ZeroClaw and Tier1Slim, with its own session memory (Phase 7). +`smart_mode` flips the inner-loop model and is sticky across turns. With `DOTTY_VOICE_PROVIDER=tier1slim` the flip is instantaneous — Tier1Slim's `set_runtime()` mutates the live provider; no docker restart. For the default `PiVoiceLLM` path, smart-mode model-swap is v2 scope. `story_time` is the only voice path with its own session memory (Phase 7). --- diff --git a/docs/observability.md b/docs/observability.md index 1127d42..8b8d45f 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -1,13 +1,13 @@ --- title: Observability -description: Prometheus metrics and a starter Grafana dashboard for the zeroclaw-bridge. +description: Prometheus metrics and a starter Grafana dashboard for the bridge dashboard service. --- # Observability -The zeroclaw-bridge exposes a Prometheus exposition endpoint at `/metrics` -covering first-audio latency, request rate / errors per endpoint, ACP -session state, perception events, calendar health, and Kid Mode state. 
+The `bridge.py` dashboard service exposes a Prometheus exposition endpoint at `/metrics` +covering first-audio latency, request rate / errors per endpoint, +perception events, calendar health, and Kid Mode state. A starter Grafana dashboard lives at [`monitoring/grafana-dashboard.json`](https://github.com/BrettKinny/dotty-stackchan/blob/main/monitoring/grafana-dashboard.json). @@ -27,8 +27,8 @@ Metrics are on by default once the bridge has its dependency installed: ```bash pip install -r bridge/requirements.txt # picks up prometheus-client -systemctl restart zeroclaw-bridge # or `docker compose restart bridge` -curl -s http://:8080/metrics | head -20 +docker compose restart bridge # or restart however you deployed it +curl -s http://:8080/metrics | head -20 ``` If `prometheus-client` is missing the bridge still serves traffic — it @@ -45,14 +45,14 @@ scrape_configs: metrics_path: /metrics scrape_interval: 15s static_configs: - - targets: [":8080"] + - targets: [":8080"] labels: - service: zeroclaw-bridge + service: dotty-bridge env: home ``` -Replace `` with the LAN address of the box running the -bridge. Reload Prometheus (`SIGHUP` or `/-/reload`) and confirm the +Replace `` with the LAN address of the Docker host running +the bridge. Reload Prometheus (`SIGHUP` or `/-/reload`) and confirm the target shows `UP` under **Status → Targets**. ## Import the Grafana dashboard @@ -65,7 +65,7 @@ target shows `UP` under **Status → Targets**. The dashboard ships with eight panels: first-audio latency (P50/P95/P99), request rate by endpoint, error rate by -endpoint+kind, active ACP sessions, Smart-Mode invocation rate, +endpoint+kind, active sessions (legacy panel — always 0), Smart-Mode invocation rate, perception events per minute (stacked by type), calendar fetch failure rate, and a Kid Mode single-stat toggle. @@ -77,7 +77,7 @@ failure rate, and a Kid Mode single-stat toggle. 
| `dotty_request_duration_seconds{endpoint}` | Histogram | End-to-end duration per endpoint (`message`, `message_stream`, `vision_explain`, `calendar_today`, `perception_event`). | | `dotty_request_errors_total{endpoint,kind}` | Counter | Errors partitioned by endpoint and `kind` (`timeout`, `binary_missing`, `exception`). | | `dotty_llm_tokens_total{kind,model}` | Counter | LLM token volume; reserved for future per-call accounting. | -| `dotty_active_acp_sessions` | Gauge | Live ACP child sessions. The bridge is single-child so this is normally 0 (idle) or 1 (in flight). | +| `dotty_active_acp_sessions` | Gauge | Legacy metric from the retired ZeroClaw path — retained in the schema but always 0. | | `dotty_calendar_fetch_failures_total{kind}` | Counter | Google Calendar fetch errors partitioned by `kind` (`timeout`, `parse`, `other`, `orchestrator`). The cache backs off automatically; sustained failures mean look at the bridge log. A spike of `timeout` reads as a network/quota issue; `parse` usually means the upstream `gws` CLI changed shape. | | `dotty_smart_mode_invocations_total` | Counter | Smart-Mode requests (the `metadata.smart_mode` flag opted into the larger LLM). | | `dotty_kid_mode_active` | Gauge | `1` if Kid Mode guardrails are active, `0` otherwise. Flipped live by the portal admin endpoint. | @@ -96,7 +96,7 @@ home-deployed robot: `sum(rate(dotty_calendar_fetch_failures_total[15m])) > 0.005` for 30 m. - **Bridge target down.** `up{job="dotty-bridge"} == 0` for 5 m. Catches the case where - systemd / Docker hasn't restarted the bridge. + Docker hasn't restarted the bridge container. 
## Adding new metrics diff --git a/docs/proactive-greetings.md b/docs/proactive-greetings.md index a545c2f..44f8eea 100644 --- a/docs/proactive-greetings.md +++ b/docs/proactive-greetings.md @@ -29,15 +29,15 @@ sequenceDiagram autonumber participant FW as Firmware (camera) participant XZ as xiaozhi-server - participant BR as zeroclaw-bridge + participant DB as dotty-behaviour participant PG as ProactiveGreeter participant CAL as Calendar cache participant LLM as LLM (OpenRouter) participant TTS as inject-text + TTS FW->>XZ: face_recognized {identity: "Hudson"} - XZ->>BR: POST /api/perception/event - BR-->>PG: perception bus event + XZ->>DB: POST /api/perception/event + DB-->>PG: perception bus event PG->>PG: cooldown + per-day cap check PG->>CAL: summarize_for_prompt(events, person="Hudson") CAL-->>PG: ["09:00 [Hudson] Library day"] @@ -81,7 +81,7 @@ while this one targets `face_recognized` (Layer 4 output). | `GREETER_GREET_UNKNOWN` | `false` | When true, greet unrecognised faces with a generic "Hello! I don't think we've met." | | `GREETER_COOLDOWN_HOURS` | `4` | Minimum hours between greetings for the same identity. | | `GREETER_PER_DAY_MAX` | `3` | Hard cap on greetings per identity per day. The 4h cooldown already prevents back-to-back firings, so this is a safety ceiling rather than a politeness lever — turn it down if 3 greetings/day feels noisy. | -| `GREETER_STATE_PATH` | `~/.zeroclaw/greeter_state.json` | Persistent greet log so a restart doesn't re-greet everyone. | +| `GREETER_STATE_PATH` | `~/.dotty/greeter_state.json` | Persistent greet log so a restart doesn't re-greet everyone. | | `GREETER_GREETING_MAX_WORDS` | `15` | Word cap fed to the LLM prompt; the model is also told "one sentence". 
| State file format (atomic write, JSON): diff --git a/docs/protocols.md b/docs/protocols.md index a32a1fe..2453746 100644 --- a/docs/protocols.md +++ b/docs/protocols.md @@ -1,6 +1,6 @@ --- title: Protocols -description: Xiaozhi WebSocket protocol, ACP JSON-RPC, and the emotion frame format. +description: Xiaozhi WebSocket protocol, pi RPC transport, emotion frame format, and the HTTP APIs served by dotty-behaviour and bridge.py. --- # Protocols — what's on the wire @@ -10,8 +10,8 @@ description: Xiaozhi WebSocket protocol, ACP JSON-RPC, and the emotion frame for - **Xiaozhi WebSocket protocol** — between device and xiaozhi-server. Opus audio + JSON control frames. Supports MCP over JSON-RPC 2.0 in-band. Canonical spec: `github.com/78/xiaozhi-esp32/blob/main/docs/websocket.md`. - **Emotion channel** — 21 upstream emotion identifiers; the server picks one from the LLM's leading emoji and emits a separate `llm`-type frame. This stack uses a 9-emoji subset. - **MCP over WS** — the device acts as an MCP server; xiaozhi-server calls `tools/list` and `tools/call` against it. Tool names use dotted namespaces like `self.audio_speaker.set_volume`. -- **Bridge HTTP API** — `POST /api/message` (legacy `ZeroClawLLM` path), `POST /api/voice/escalate` + `POST /api/voice/remember` + `POST /api/voice/memory_log` (Tier1Slim), `POST /api/perception/event` (xiaozhi → bridge perception relay). -- **Agent Client Protocol (ACP)** — JSON-RPC 2.0 over stdio between the FastAPI bridge and `zeroclaw acp`. Zed-originated spec, maintained at `agentclientprotocol.com`. +- **pi RPC** — `PiClient` and the dotty-pi agent exchange JSONL messages over the stdin/stdout of `docker exec -i dotty-pi pi --mode rpc`. This is the voice transport for the default `PiVoiceLLM` provider. +- **HTTP APIs** — split across two services: dotty-behaviour (:8090) serves perception, vision, audio, and calendar endpoints; bridge.py (:8080) serves the admin dashboard `/ui` and admin routes. 
## Xiaozhi WebSocket @@ -141,19 +141,20 @@ Server emits a dedicated `llm`-type frame: ### Default emoji allowlist -`bridge.py` enforces a 9-emoji subset: +The persona prompt and xiaozhi-server's top-level `prompt:` block enforce the following 9-emoji subset: ``` 😊 😆 😢 😮 🤔 😠 😐 😍 😴 ``` -If the LLM returns a leading emoji outside the allowlist (or no emoji at all), the bridge prepends 😐. Rationale: smaller set = more predictable face animations, fewer corner-cases in the xiaozhi emoji-stripper. +Smaller set = more predictable face animations, fewer corner-cases in the xiaozhi emoji-stripper. -### Three-layer enforcement +### Two-layer enforcement -1. **ZeroClaw persona prompt** — asks for leading emoji. -2. **xiaozhi-server top-level `prompt:`** — also asks for leading emoji. -3. **Bridge `_ensure_emoji_prefix`** — last line of defence; prepends 😐 if absent. +1. **Persona prompt** (`personas/dotty_voice.md`) — asks for a leading emoji. +2. **xiaozhi-server top-level `prompt:`** — also asks for a leading emoji. + +(A third bridge-side `_ensure_emoji_prefix` fallback existed in the retired ZeroClaw voice path; it is not present in the current `PiVoiceLLM` path.) ## MCP tools over WS @@ -228,145 +229,75 @@ Device signals MCP support in `hello.features.mcp = true`. Server then queries t See [hardware.md](./hardware.md#on-device-mcp-tools) for the default 11-tool MCP surface. - -## Bridge HTTP API - -The FastAPI bridge (`bridge.py`) listens on port 8080 (LAN-reachable, no auth currently). All payloads are JSON unless noted. - -### `POST /api/message` — legacy `ZeroClawLLM` path - -Used by the `ZeroClawLLM` provider on every voice turn. - -Request: - -```json -{"content": "", "channel": "stackchan", "session_id": ""} -``` - -Response: - -```json -{"response": "😊 Sure, the weather is..."} -``` - -The bridge wraps `channel == "stackchan"` content in the English+emoji sandwich and re-enforces the emoji prefix on the response. 
- -### `POST /api/voice/escalate` — Tier1Slim tool dispatch + +## pi RPC — PiVoiceLLM transport -Used by `Tier1Slim` when the small inner-loop model emits a `tool_call`. Blocks until the result returns (or the per-tool timeout fires). +The `PiVoiceLLM` provider communicates with the dotty-pi agent via **pi RPC mode**: JSONL messages exchanged over the stdin/stdout of a `docker exec` invocation. -Request: - -```json -{ - "tool": "", - "args": {"query": "..."}, - "session_id": "" -} -``` - -Response: - -```json -{"result": ""} ``` - -Timeouts: `memory_lookup` 5 s, `think_hard` 30 s, others 5 s (env-overridable via `BRIDGE_TIMEOUT_SHORT` / `BRIDGE_TIMEOUT_LONG`). - -### `POST /api/voice/remember` — Tier1Slim fact-stash (fire-and-forget) - -Triggered when the model embeds a `[REMEMBER: ...]` marker in the reply. - -Request: - -```json -{"fact": "user's favourite colour is blue", "session_id": "..."} +xiaozhi-server + └─ PiClient + └─ docker exec -i dotty-pi pi --mode rpc … + │ ▲ + JSONL request │ + (stdin) │ JSONL response + │ (stdout, streamed) ``` -Response: `{"ok": true}` (Tier1Slim doesn't wait for it; 2 s timeout client-side). +Each turn is a single JSONL object written to stdin; the agent streams JSONL response chunks back on stdout. Only TTS-bound text chunks are forwarded to xiaozhi-server — tool call details stay internal to the agent loop. The agent exits cleanly after each turn; `PiClient` re-invokes `docker exec` for the next turn. -### `POST /api/voice/memory_log` — Tier1Slim turn log (fire-and-forget) +The dotty-pi agent loads the **dotty-pi-ext extension** at startup, which registers the five voice tools (`memory_lookup`, `remember`, `think_hard`, `take_photo`, `play_song`). Tool results never appear in the TTS stream. -Posted at end-of-turn so ZeroClaw can index the conversation for future `memory_lookup`. + +## HTTP APIs -Request: +Server-side HTTP is split across two services. All payloads are JSON unless noted. 
-```json -{"user": "what colour is the sky", "assistant": "😊 the sky is blue!", "session_id": "..."} -``` +### dotty-behaviour — perception, vision, audio, calendar (:8090) -### `POST /api/perception/event` — xiaozhi → bridge perception relay +`dotty-behaviour` is a FastAPI service (port 8090, same Docker host) that owns the ambient behaviour layer. -Used by `EventTextMessageHandler` in `custom-providers/xiaozhi-patches/textMessageHandlerRegistry.py` to forward firmware `event` frames. +| Endpoint | Purpose | +|---|---| +| `POST /api/perception/event` | xiaozhi → dotty-behaviour perception relay (face, sound, state events) | +| `POST /api/vision/explain` | VLM describe-image call | +| `POST /api/audio/explain` | Audio event explanation | +| `POST /api/voice/take_photo` | Voice-triggered camera snapshot + VLM describe | +| `GET /api/calendar/*` | Calendar context queries | -Request mirrors the firmware frame: +`POST /api/perception/event` is the primary inbound path for firmware `event` frames forwarded by `EventTextMessageHandler` in `custom-providers/xiaozhi-patches/textMessageHandlerRegistry.py`: ```json { - "name": "", - "data": {"...": "..."}, + "name": "", + "data": {"…": "…"}, "device_id": "", "session_id": "", "ts": 1715000000.0 } ``` -Response: `{"ok": true}`. The bridge broadcasts the event to all `_perception_listeners` and updates `_perception_state[device_id]` accordingly. Consumer tasks (face_greeter, sound_turner, face_lost_aborter, wake_word_turner, face_identified_refresher, purr_player) each subscribe to the bus and react. See [architecture.md](./architecture.md#perception-event-bus). - -### `GET /health` - -Liveness probe. Returns `{"ok": true}` when the bridge is up and the ACP child is reachable. +Response: `{"ok": true}`. dotty-behaviour broadcasts the event to all perception listeners and updates per-device state. See [architecture.md](./architecture.md#perception-event-bus) for the 9 ambient consumers. 
-### `POST /admin/*` (localhost-only) +### bridge.py — dashboard and admin (:8080) -Administrative mutations — see [architecture.md](./architecture.md#bridge-adminadmin-zeroclaw-host-127001-only). +`bridge.py` is a FastAPI service (port 8080, same Docker host) that serves the admin dashboard. Its voice and perception relay roles were retired in issue #36 (2026-05-19); it survives as the dashboard service. - -## ACP — Agent Client Protocol - -Canonical spec: [agentclientprotocol.com](https://agentclientprotocol.com). Zed-Industries-originated, JSON-RPC 2.0, designed for editor↔agent interop; reusable for any agent-over-stdio situation. - -**Our transport:** `zeroclaw acp` is spawned with `stdin`/`stdout` inherited. The FastAPI bridge reads/writes JSON-RPC 2.0 framed messages (one JSON object per line or Content-Length-prefixed, per ACP spec). - -### Core methods - -| Method | Direction | Params | Returns / effect | -|---|---|---|---| -| `initialize` | client → agent | Protocol version, client capabilities | Agent capabilities, supported tool-sets | -| `session/new` | client → agent | `working_directory` | `sessionId` and metadata | -| `session/prompt` | client → agent | `sessionId`, `prompt: ContentBlock[]` (text/images/resources) | `stopReason: "end_turn" \| "max_tokens" \| "max_turn_requests" \| "refusal" \| "cancelled"` | -| `session/update` | agent → client (notification) | `sessionId`, `update.sessionUpdate: "plan" \| "agent_message_chunk" \| "tool_call" \| "tool_call_update"` with content | Agent streams progress | -| `session/request_permission` | agent → client | `sessionId`, tool call details | Client approves/denies tool execution | -| `session/cancel` | client → agent | `sessionId` | Agent halts; pending `session/prompt` resolves with `cancelled` | - -### What our bridge uses today - -- `initialize` (once at child startup) -- `session/new` (with session caching — reuses across turns, rotates on idle/turn-count/age) -- `session/prompt` (streaming via 
`session/event` chunks; bridge also supports buffered mode) -- `session/event` — tool call/result logging (`tool_call`, `tool_result` types) and streaming text chunks -- `session/request_permission` — auto-approves tool calls (safety net for tools not in ZeroClaw's `auto_approve` list) - -- `session/cancel` → sent on barge-in (device emits `abort`, xiaozhi closes the streaming HTTP connection, bridge cancels the in-flight ACP prompt and drains stale output) - -### ACP vs MCP — how they differ - -| | MCP | ACP | -|---|---|---| -| Purpose | Expose tools to a model | Drive a whole agent | -| Typical client | An LLM harness | A code editor (or here, our bridge) | -| Message shapes | `tools/list`, `tools/call`, `resources/*`, `prompts/*` | `session/prompt`, `session/update`, `session/cancel`, `session/request_permission` | -| Re-uses MCP | — | Yes — shares ContentBlock and resource JSON shapes | +| Endpoint | Purpose | +|---|---| +| `GET /ui` | Admin dashboard web UI | +| `POST /admin/*` | Admin mutations (toggle, kid-mode, smart-mode, set-tier1slim-model, play-asset, etc.) | +| `GET /health` | Liveness probe; returns `{"ok": true}` | -Both are JSON-RPC 2.0. The device's MCP exchanges ride the Xiaozhi WS; the bridge's ACP exchanges ride local stdio. +`POST /api/voice/escalate` (used by the `Tier1Slim` alternate provider) is also defined on bridge.py but is non-functional in the current stack — the ZeroClaw voice dispatch layer it depended on was retired in #36. See [docs/cutover-behaviour.md](./cutover-behaviour.md) for the historical runbook. ## See also - [hardware.md](./hardware.md) — what emits the device-side frames. - [voice-pipeline.md](./voice-pipeline.md) — what xiaozhi-server does between frames. -- [tier1slim.md](./tier1slim.md) — the Tier1Slim provider that drives `/api/voice/escalate`. -- [brain.md](./brain.md) — what the bridge does with the ACP results. 
+- [tier1slim.md](./tier1slim.md) — the Tier1Slim alternate provider and its (now non-functional) escalation wire format. +- [brain.md](./brain.md) — the dotty-pi agent and its tool set. - [architecture.md](./architecture.md#perception-event-bus) — the perception bus consumers. - [references.md](./references.md#protocols) — all protocol spec links. -Last verified: 2026-05-17. +Last verified: 2026-05-22. diff --git a/docs/quickstart.md b/docs/quickstart.md index 2af8c1b..79beef8 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -14,7 +14,7 @@ alternative configurations. | Item | Notes | |------|-------| | **M5Stack CoreS3 + StackChan servo kit** | The robot. See [hardware-support.md](hardware-support.md) for details. | -| **Linux or macOS host with Docker** | Runs the voice pipeline. Any distro works. | +| **Linux or macOS host with Docker** | Runs all four server-side containers. Any distro works. | | **2.4 GHz WiFi** | The ESP32-S3 does not support 5 GHz. | ## 1. Flash the firmware @@ -68,15 +68,12 @@ supports multiple resident models). See [cookbook/run-fully-local.md](cookbook/run-fully-local.md) and [cookbook/llama-swap-concurrent-models.md](cookbook/llama-swap-concurrent-models.md). -The shipped `.config.yaml` selects `Tier1Slim` as the default LLM, -which expects a llama-swap (or other OpenAI-compatible) endpoint at the -URL pointed to by `LLM.Tier1Slim.url` (and a matching `api_key`). If -that endpoint isn't reachable, either: -- stand up llama-swap (cookbook above), or -- switch `selected_module.LLM` to `OpenAICompat` and point it at any - cloud OpenAI-compatible API, or -- switch to `ZeroClawLLM` and run the full ZeroClaw agent on a second - host (see [SETUP.md](SETUP.md)). +The shipped `.config.yaml` selects `PiVoiceLLM` as the default LLM +provider, which runs the `dotty-pi` container (the pi coding agent) +on the same Docker host. 
Alternate providers — `Tier1Slim` (small +model inner loop, no agent overhead) and `OpenAICompat` (any +OpenAI-compatible cloud endpoint) — are available via +`selected_module.LLM` in `data/.config.yaml`. ## 4. Run setup @@ -87,7 +84,7 @@ make setup The interactive wizard prompts for your server IP, robot name, timezone, and LLM provider. It downloads the ASR and TTS models (~100 MB), substitutes placeholders in config files, and starts the Docker -container. +containers. Verify everything is healthy: @@ -98,29 +95,29 @@ make doctor All checks should pass (green). If any fail, see [troubleshooting.md](troubleshooting.md). -## 5. Install the bridge +## 5. Bring up the containers -This step depends on which deployment shape you picked (see the table in the [README](../README.md#get-it-running)). +All four server-side services run as Docker containers on the same host. +`docker compose up -d` (from `docker-compose.yml.template` after `make +setup` substitutes your placeholders) starts the main `xiaozhi-esp32-server` +container. The brain container and the perception/dashboard container +are brought up separately: -**Single-host (compose.all-in-one):** the bridge runs as a container in the same compose stack. There is no separate install step — skip to step 6. +- **dotty-pi** (the voice-tool brain): see [dotty-pi/README.md](../dotty-pi/README.md) + for build and run instructions. +- **dotty-behaviour** (perception bus + admin dashboard): see + [dotty-behaviour/README.md](../dotty-behaviour/README.md) for build and + run instructions. The `scripts/deploy-behaviour.sh` helper deploys it. +- **bridge.py** (admin dashboard service, `:8080`): runs as a container + on the same host; started via the compose file. -**Multi-host (default `make setup`):** install the bridge natively on the host that will run it. 
From a checkout of this repo *on that host* (not the Docker host): - -```bash -sudo scripts/install-bridge.sh \ - --bridge-dir /root/zeroclaw-bridge \ - --zeroclaw-bin "$(which zeroclaw)" -``` - -The script copies `bridge.py` + the `custom-providers/` and `bridge/` trees into the install dir, creates a Python venv, writes a systemd unit, runs an import smoke test, and starts the service. Health check at `http://:8080/health` should return `{"status":"ok",...}`. - -If the bridge host is a different machine from the Docker host, clone the repo there first. +No separate host, no systemd bridge unit, no SSH to a second machine. ## 6. Connect the robot 1. Power on the robot (USB-C or battery). 2. On the device screen, navigate to **Settings > Advanced Options**. -3. Enter the OTA URL: `http://:8003/xiaozhi/ota/` +3. Enter the OTA URL: `http://:8003/xiaozhi/ota/` 4. The robot connects via WebSocket and shows a face. ## 7. First voice turn @@ -156,58 +153,34 @@ This repo uses placeholders in place of real IPs, usernames, and filesystem path | Placeholder | Meaning | |---|---| -| `` | LAN IP of the server running xiaozhi-server. The robot reaches this on WiFi, so it must be a LAN IP, not a Tailscale/VPN IP. | +| `` | LAN IP of the server running all Docker containers. The robot reaches this on WiFi, so it must be a LAN IP, not a Tailscale/VPN IP. | | `` | SSH user for the server (whatever your distro defaults to: `root`, `ubuntu`, `dietpi`, etc.). | | `` | Hostname or Tailscale name of the server (optional, IP works for everything). | -| `` | Path on the server where you clone/install xiaozhi-server (e.g. `/opt/xiaozhi-server/` or `/srv/xiaozhi-server/`). | -| `` | LAN IP of the host running ZeroClaw + the bridge. Anything that runs the `zeroclaw` binary works (a small Linux box, your existing home server, or the same server as xiaozhi-server). | -| `` | SSH user on the ZeroClaw host (whatever your distro defaults to). 
| -| `` | Home directory on the ZeroClaw host for the user that owns the bridge (e.g. `/root/` or `/home//`). | -| `` | Full path to the zeroclaw-bridge working directory (e.g. `/root/zeroclaw-bridge/`). | -| `` | Absolute path to the `zeroclaw` binary (cargo default: `~/.cargo/bin/zeroclaw`). | -| `` | ZeroClaw config file path (default: `/root/.zeroclaw/config.toml`). | +| `` | Path on the server where you clone/install this repo (e.g. `/opt/xiaozhi-server/` or `/srv/xiaozhi-server/`). | | `` | Your name / org, used in the persona prompt in `.config.yaml`. | | `` | Name the robot introduces itself as, referenced in the persona prompt in `.config.yaml`. Any string — pick whatever you want. The default example uses the hardware name ("StackChan"). | -Port numbers (`8000`, `8003`, `8080`, `18789`, `42617`) are product-generic and should not be changed unless you also reconfigure the respective services. +Port numbers (`8000`, `8003`, `8080`, `8090`) are product-generic and should not be changed unless you also reconfigure the respective services. Files you will definitely need to edit before first run: -- `.config.yaml` — replace ``, ``, and customise the `prompt:` block. +- `.config.yaml` — replace `` and customise the `prompt:` block. - `docker-compose.yml` — set `TZ` to your timezone. -- `zeroclaw-bridge.service` — only if you're installing the bridge by hand. `scripts/install-bridge.sh` (see step 5) writes its own copy with the right paths and you shouldn't need to touch this file. --- ## Deployment layout -```mermaid -flowchart TB - subgraph DockerHost_fs["Server filesystem - <XIAOZHI_PATH>"] - direction TB - UA1["data/.config.yaml
(override - voice, persona, endpoints)"] - UA2["models/SenseVoiceSmall/
(model.pt + configs)"] - UA3["custom-providers/zeroclaw/
(zeroclaw.py + __init__.py)"] - UA3b["custom-providers/edge_stream/
(edge_stream.py + __init__.py)"] - UA3d["custom-providers/piper_local/
(piper_local.py + __init__.py)"] - UA3c["custom-providers/asr/fun_local.py
(patched ASR - language key)"] - UA4["tmp/
(TTS audio scratch)"] - UA5["repo/
(git clone, reference only)"] - UA6["docker-compose.yml"] - end - - subgraph ZCHost_fs["ZeroClaw host filesystem"] - direction TB - RA["<BRIDGE_PATH>"] - RA1["bridge.py"] - RA2[".venv/
(fastapi + uvicorn)"] - RB["/etc/systemd/system/
zeroclaw-bridge.service"] - RC["~/.zeroclaw/
(agent persona config)"] - RA --> RA1 & RA2 - end -``` +All four containers run on the single Docker host (``): -Container volume mounts: +| Container | Purpose | Port | +|---|---|---| +| `xiaozhi-esp32-server` | Voice pipeline: ASR, TTS, WebSocket to StackChan | 8000 (WS), 8003 (OTA/HTTP) | +| `dotty-pi` | pi coding agent — the voice-tool brain | internal (via `docker exec`) | +| `dotty-behaviour` | Perception bus + ambient consumers + calendar | 8090 | +| `bridge.py` | Admin dashboard | 8080 | + +Container volume mounts for `xiaozhi-esp32-server`: | Host path | Container path | Purpose | |---|---|---| @@ -215,12 +188,13 @@ Container volume mounts: | `models/SenseVoiceSmall/` | `/opt/xiaozhi-esp32-server/models/SenseVoiceSmall/` | ASR weights | | `models/piper/` | `/opt/xiaozhi-esp32-server/models/piper/` | Piper TTS voice models (`.onnx` + `.json`) | | `tmp/` | `/opt/xiaozhi-esp32-server/tmp/` | Scratch | -| `custom-providers/zeroclaw/` | `/opt/xiaozhi-esp32-server/core/providers/llm/zeroclaw/` | Custom LLM provider (directory mount) | +| `custom-providers/pi_voice/` | `/opt/xiaozhi-esp32-server/core/providers/llm/pi_voice/` | PiVoiceLLM provider (directory mount) | +| `custom-providers/tier1_slim/` | `/opt/xiaozhi-esp32-server/core/providers/llm/tier1_slim/` | Tier1Slim alternate provider | | `custom-providers/edge_stream/edge_stream.py` | `/opt/xiaozhi-esp32-server/core/providers/tts/edge_stream.py` | Streaming EdgeTTS provider (file mount) | | `custom-providers/piper_local/piper_local.py` | `/opt/xiaozhi-esp32-server/core/providers/tts/piper_local.py` | Local Piper TTS provider (file mount) | | `custom-providers/asr/fun_local.py` | `/opt/xiaozhi-esp32-server/core/providers/asr/fun_local.py` | Patched FunASR — adds `language` config key so SenseVoiceSmall can be pinned to English | -The full file inventory (with `/etc/systemd/system/` paths and the bare-metal venv) lives in [architecture.md](./architecture.md#deployment-files-this-repo). 
+The full file inventory lives in [architecture.md](./architecture.md#deployment-files-this-repo). --- @@ -230,23 +204,19 @@ The full file inventory (with `/etc/systemd/system/` paths and the bare-metal ve |---|---|---| | OTA (enter into StackChan settings) | `http://<XIAOZHI_HOST>:8003/xiaozhi/ota/` | The robot on boot | | WebSocket | `ws://<XIAOZHI_HOST>:8000/xiaozhi/v1/` | The robot after OTA handshake | -| Bridge (chat) | `http://<ZEROCLAW_HOST>:8080/api/message` | xiaozhi-server's ZeroClawLLM | -| Bridge (health) | `http://<ZEROCLAW_HOST>:8080/health` | Humans, monitoring | -| Bridge (dashboard) | `http://<ZEROCLAW_HOST>:8080/ui` | Humans (LAN-only HTMX UI) | -| ZeroClaw gateway | `http://127.0.0.1:42617` (host-local) | ZeroClaw's web UI only | +| Perception / ambient events | `http://<XIAOZHI_HOST>:8090` | xiaozhi-server → dotty-behaviour | +| Admin dashboard | `http://<XIAOZHI_HOST>:8080/ui` | Humans (LAN-only HTMX UI) | +| Bridge health | `http://<XIAOZHI_HOST>:8080/health` | Humans, monitoring | --- ## Reboot survival -Both services restart themselves without manual intervention: - -| Host | Mechanism | -|---|---| -| Server | Container `restart: unless-stopped` in `docker-compose.yml` + ensure dockerd starts at boot on your distro. | -| ZeroClaw host | `zeroclaw-bridge.service` is `enabled`, `Restart=on-failure`. | - -Caveat: if you run `docker compose down`, the container is marked stopped and won't come back on reboot. Use `docker compose restart` or `docker restart xiaozhi-esp32-server` for transient restarts instead. +All containers use `restart: unless-stopped`. Ensure dockerd starts at +boot on your distro. Use `docker compose restart` or +`docker restart <container>` for transient restarts rather than `docker +compose down` (which marks the container stopped and prevents +auto-restart on reboot). 
--- @@ -256,35 +226,33 @@ Caveat: if you run `docker compose down`, the container is marked stopped and wo # Tail xiaozhi-server logs (voice pipeline) ssh @ 'docker logs -f xiaozhi-esp32-server' -# Tail bridge logs -ssh @ 'sudo journalctl -u zeroclaw-bridge -f' +# Tail dotty-behaviour logs (perception + dashboard) +ssh @ 'docker logs -f dotty-behaviour' + +# Tail dotty-pi logs (brain container) +ssh @ 'docker logs -f dotty-pi' # Restart voice pipeline after config change ssh @ 'cd && docker compose restart' -# Restart the bridge -ssh @ 'sudo systemctl restart zeroclaw-bridge' - -# Smoke test full round-trip -curl -X POST http://:8080/api/message \ - -H 'content-type: application/json' \ - -d '{"content":"hello","channel":"dotty"}' +# Admin dashboard +open http://:8080/ui # Bridge health -curl http://:8080/health +curl http://:8080/health ``` ### Changing voice The default TTS is `LocalPiper` (offline, runs inside the container). To change the Piper voice, edit `TTS.LocalPiper.voice` and the corresponding `model_path` / `config_path` in `data/.config.yaml`. To switch to cloud EdgeTTS instead, set `selected_module.TTS: EdgeTTS` and edit `TTS.EdgeTTS.voice` (any Microsoft Edge Neural voice ID works, e.g. `en-US-AvaNeural`). Restart the container after changes. ### Changing persona (the robot's personality) -Where the persona lives depends on which LLM provider is active. With the shipped default (`selected_module.LLM: Tier1Slim`), edit `personas/dotty_voice.md` and `docker compose restart` — Tier1Slim deliberately ignores the top-level `prompt:` block because the 4 B chat template only honours one system message. With `selected_module.LLM: ZeroClawLLM`, the persona lives in `` plus the workspace files at `~/.zeroclaw/workspace/{SOUL,IDENTITY}.md` on the ZeroClaw host; the `prompt:` key in `data/.config.yaml` is then a secondary hint that the bridge passes to ZeroClaw as context. Full instructions: [cookbook/change-persona.md](cookbook/change-persona.md). 
+Edit `personas/dotty_voice.md` (for the `PiVoiceLLM` / `Tier1Slim` paths) and restart the relevant container. The `prompt:` key in `data/.config.yaml` is also injected as a secondary system message. Full instructions: [cookbook/change-persona.md](cookbook/change-persona.md). ### Changing VAD sensitivity `VAD.SileroVAD.min_silence_duration_ms` in `data/.config.yaml`. Default: 700 ms. Lower = cuts off quicker. Higher = waits longer for slow speakers. ### Changing the LLM model -For the `Tier1Slim` path (default): edit `LLM.Tier1Slim.model` (or repoint `url` / `api_key`) in `data/.config.yaml` and `docker compose restart`. Or for in-flight swaps, use the bridge's `/admin/smart-mode` toggle — it calls `/xiaozhi/admin/set-tier1slim-model` to hot-swap without a restart (see [tier1slim.md](tier1slim.md)). For the legacy `ZeroClawLLM` path: edit `default_model` near the top of `` on the ZeroClaw host (provider and encrypted api_key live next to it). ACP mode caches config in the long-running child, so restart the bridge (`sudo systemctl restart zeroclaw-bridge`) after editing. Confirm with `sudo status | grep Model`. +For the `PiVoiceLLM` path (default): see [dotty-pi/README.md](../dotty-pi/README.md) for the model selection rules — in particular, the llama-swap matrix DSL constraint that prevents the voice-model set from being evicted. For the `Tier1Slim` path: edit `LLM.Tier1Slim.model` (or repoint `url` / `api_key`) in `data/.config.yaml` and `docker compose restart`. Or for in-flight swaps, use the bridge's `/admin/smart-mode` toggle — it calls `/xiaozhi/admin/set-tier1slim-model` to hot-swap without a restart (see [tier1slim.md](tier1slim.md)). 
--- @@ -293,7 +261,7 @@ For the `Tier1Slim` path (default): edit `LLM.Tier1Slim.model` (or repoint `url` ```bash make doctor # health checks make logs # tail server logs -curl http://:8080/health # test the bridge +curl http://:8080/health # test the bridge/dashboard ``` See [troubleshooting.md](troubleshooting.md) for common issues. diff --git a/docs/references.md b/docs/references.md index c54037a..4580235 100644 --- a/docs/references.md +++ b/docs/references.md @@ -39,7 +39,8 @@ source-of-truth for re-verification. | Resource | URL | What's there | |---|---|---| -| ZeroClaw | https://github.com/zeroclaw-labs/zeroclaw | Agent runtime. Architecture, workspace files, providers, MCP support. Rust / dual MIT+Apache-2.0. | +| pi agent (dotty-pi) | https://github.com/BrettKinny/dotty-stackchan/tree/main/dotty-pi | The Docker container that runs the pi coding agent — the current Dotty brain. | +| dotty-pi-ext | https://github.com/BrettKinny/dotty-stackchan/tree/main/dotty-pi-ext | pi extension providing the 5 voice tools (memory_lookup, remember, think_hard, take_photo, play_song). | | Qwen3-30B-A3B-Instruct-2507 | https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507 | Model card. Param counts, experts, context length, sampling recommendations. | | OpenRouter (Qwen3 page) | https://openrouter.ai/qwen/qwen3-30b-a3b-instruct-2507 | Pricing, latency, provider availability. | | OpenRouter docs | https://openrouter.ai/docs | Tool calling, streaming, failover. | @@ -53,9 +54,7 @@ source-of-truth for re-verification. | Xiaozhi WebSocket protocol | https://github.com/78/xiaozhi-esp32/blob/main/docs/websocket.md | Full message catalog, hello shape, binary audio framing. | | Xiaozhi MCP protocol | https://github.com/78/xiaozhi-esp32/blob/main/docs/mcp-protocol.md | `tools/list`, `tools/call`, `AddTool` vs `AddUserOnlyTool`. | | Xiaozhi emotion docs | https://xiaozhi.dev/en/docs/development/emotion/ | 21-emotion catalog + wire format. 
| -| Agent Client Protocol | https://agentclientprotocol.com | ACP spec (Zed-originated). JSON-RPC 2.0 over stdio/HTTP/WS. | -| ACP prompt-turn spec | https://agentclientprotocol.com/protocol/prompt-turn | `session/prompt`, `session/update`, `session/cancel`, `session/request_permission`. | -| MCP (Model Context Protocol) | https://modelcontextprotocol.io | The base spec ACP reuses JSON shapes from. | +| MCP (Model Context Protocol) | https://modelcontextprotocol.io | JSON-RPC 2.0 tool-use protocol used by the firmware's MCP tools and the pi agent. | ## Pricing (volatile) @@ -70,7 +69,6 @@ OpenRouter pricing changes often — values below are most recently observed per | Component | License | Notes | |---|---|---| -| ZeroClaw | MIT / Apache-2.0 (dual) | Binary is yours to run. | | FunASR / SenseVoiceSmall | Check the HF card | Weights license varies. | | SileroVAD | MIT | — | | Piper engine | MIT | — | diff --git a/docs/speaker-id-investigation.md b/docs/speaker-id-investigation.md index 4627db5..ac268ab 100644 --- a/docs/speaker-id-investigation.md +++ b/docs/speaker-id-investigation.md @@ -36,7 +36,7 @@ similar latent capabilities the deployment doesn't expose. fork) wires: - `self._asr` (ASR module instance, line 60) -- `self._llm` (LLM provider — currently the ZeroClaw bridge) +- `self._llm` (LLM provider — currently `PiVoiceLLM`) - `self._memory` (initialised but disabled — `Memory: nomem` in `.config.yaml`) @@ -45,9 +45,8 @@ There is **no voiceprint hook**. Searching for `speaker`, `voiceprint`, matches (other than the unrelated "small speaker" persona text in the config). -`custom-providers/zeroclaw/zeroclaw.py:_payload()` builds a metadata -dict: `{provider: "zeroclaw", smart_mode: ...}`. There is no slot for -a speaker hint coming from xiaozhi. +`custom-providers/pi_voice/pi_voice.py` builds a metadata dict for the +pi RPC call. There is no slot for a speaker hint coming from xiaozhi. 
## What it would take to wire it @@ -65,8 +64,8 @@ To turn "voiceprint exists upstream" into "Signal E in the resolver": recognised speaker id (and confidence) to the LLM call metadata so the bridge can read it. This is one extra field on the dict already passed to the `LLMProvider`. -4. **Provider passthrough** — `custom-providers/zeroclaw/zeroclaw.py` - forwards xiaozhi metadata into the bridge's `MessageIn.metadata`. +4. **Provider passthrough** — `custom-providers/pi_voice/pi_voice.py` + forwards xiaozhi metadata into the pi RPC request. Add a passthrough for `speaker_id` / `speaker_confidence`. 5. **Resolver Signal E** — extend `bridge/speaker.py:_signal_perception` (or a new `_signal_voiceprint`) to read `payload.metadata` and diff --git a/docs/style.md b/docs/style.md index c364033..c524476 100644 --- a/docs/style.md +++ b/docs/style.md @@ -80,7 +80,7 @@ instead of bullet points. Tables scan faster and diff cleaner. ## Placeholders -Use ``, ``, ``, etc. for any value that varies +Use ``, ``, ``, etc. for any value that varies per deployment. See the full list in `CONTRIBUTING.md`. Never commit real IPs, hostnames, or API keys. diff --git a/docs/tier1slim.md b/docs/tier1slim.md index 607ec7b..54be2cc 100644 --- a/docs/tier1slim.md +++ b/docs/tier1slim.md @@ -1,14 +1,16 @@ --- -title: Tier1Slim — Two-Tier Voice LLM -description: How the Tier1Slim provider runs a small/fast model for inner-loop chat and escalates tool calls to ZeroClaw via the bridge. +title: Tier1Slim — Two-Tier Voice LLM (alternate) +description: How the Tier1Slim alternate provider runs a small/fast model for inner-loop chat and escalates tool calls via the bridge escalation endpoint (non-functional post-cutover). --- -# Tier1Slim — Two-Tier Voice LLM +# Tier1Slim — Two-Tier Voice LLM (alternate) -Tier1Slim is one of two LLM providers Dotty can use for the voice path; the other is `ZeroClawLLM` (full agent runtime, single tier). 
Tier1Slim splits the work in two: +Tier1Slim is an **alternate** voice LLM backend. The default is `PiVoiceLLM` (see [llm-backends.md](./llm-backends.md)). Tier1Slim splits the work in two: - **Inner loop** — every plain conversational turn goes directly to a small, fast model (default: `qwen3.5:4b` against a local llama-swap endpoint), no bridge round-trip. Warm latency is well under 1 s. -- **Escalation** — when the small model emits a structured `tool_call`, Tier1Slim POSTs the call to the bridge's `/api/voice/escalate` endpoint, which dispatches to ZeroClaw memory, the 27 B thinker, or a firmware MCP tool, then streams the final answer back through TTS. +- **Escalation** — when the small model emits a structured `tool_call`, Tier1Slim sends the call to `POST /api/voice/escalate` on bridge.py. + +**Post-cutover status (2026-05-19, issue #36):** `POST /api/voice/escalate` was served by the ZeroClaw bridge voice path, which was retired in #36. The escalation endpoint is non-functional in the current stack. Tier1Slim is therefore a **chitchat-only rollback path** — plain conversational turns work, but tool calls (`memory_lookup`, `think_hard`, `take_photo`, `play_song`) do not reach a live backend. Use `PiVoiceLLM` for full tool support. The provider is selected with `selected_module.LLM: Tier1Slim` in `.config.yaml`. Source: `custom-providers/tier1_slim/tier1_slim.py`. 
@@ -16,32 +18,29 @@ The provider is selected with `selected_module.LLM: Tier1Slim` in `.config.yaml` | You want | Use | |---|---| -| Snappy chitchat ("what colour is the sky?") under 1 s | **Tier1Slim** | -| Every voice turn to go through a full agent loop (memory, multi-step reasoning, tool chains) | `ZeroClawLLM` | -| Voice path that can hot-swap between local and cloud backends with no daemon restart | **Tier1Slim** | +| Snappy plain chitchat ("what colour is the sky?") under 1 s, no tool calls needed | **Tier1Slim** (chitchat-only rollback) | +| Every voice turn to go through a full agent loop (memory, multi-step reasoning, tool chains) | `PiVoiceLLM` (default) | +| Voice path that can hot-swap between local and cloud backends with no daemon restart | **Tier1Slim** (inner loop only; smart-mode flip still works) | -Bridge code reads `DOTTY_VOICE_PROVIDER` to know which path is live. `"zeroclaw"` (default) means smart-mode flips rewrite ZeroClaw's TOML and restart the daemon; `"tier1slim"` means smart-mode flips call `/xiaozhi/admin/set-tier1slim-model` to hot-swap the live provider in xiaozhi-server. +Note: tool escalation (`memory_lookup`, `think_hard`, `take_photo`, `play_song`) via `POST /api/voice/escalate` is non-functional post-cutover. If the small model emits a `tool_call`, the escalation POST will fail. Tier1Slim is best used as a lightweight chitchat fallback when the dotty-pi agent is unavailable. 
## Models and routing ``` - selected_module.LLM = Tier1Slim - │ - ▼ - Tier1Slim (custom-providers/tier1_slim/) - │ - ┌───────────────────┴────────────────────┐ - │ │ - No tool_calls emitted tool_calls emitted - │ │ - ▼ ▼ - llama-swap (default) POST /api/voice/escalate - qwen3.5:4b @ :8080/v1 ──→ bridge.py - ~500 ms warm │ - ┌─────────────────┼─────────────────┬─────────────┐ - ▼ ▼ ▼ ▼ - memory_lookup think_hard play_song take_photo - (ZeroClaw FTS) (qwen3.6:27b-think) (firmware) (VLM via bridge) + selected_module.LLM = Tier1Slim + │ + ▼ + Tier1Slim (custom-providers/tier1_slim/) + │ + ┌───────────────────┴────────────────────┐ + │ │ +No tool_calls emitted tool_calls emitted + │ │ + ▼ ▼ +llama-swap (default) POST /api/voice/escalate → bridge.py +qwen3.5:4b @ :8080/v1 (non-functional post-cutover; endpoint +~500 ms warm was served by the retired ZeroClaw + voice path) ``` Smart-mode flips the inner-loop target between local and cloud: @@ -55,16 +54,16 @@ The flip is in-process and instant — the next turn lands on the new backend wi ## The four escalation tools -The catalogue is intentionally small to stay reliable on a 4 B model. Defined in `tier1_slim.py:TOOLS`. +These tools are defined in `tier1_slim.py:TOOLS` and sent to `POST /api/voice/escalate`. **They are non-functional in the current stack** — the escalation endpoint was served by the retired ZeroClaw voice path. Documented here for reference; use `PiVoiceLLM` for equivalent functionality. -| Tool | Purpose | Bridge dispatch | Filler phrase | +| Tool | Purpose | Escalation target (pre-cutover) | Filler phrase | |---|---|---|---| -| `memory_lookup` | Recall a fact from a past conversation. Use when the user says "do you remember…" or refers to a past topic by name. | ZeroClaw memory (FTS). Short timeout (`BRIDGE_TIMEOUT_SHORT`, default 5 s). | none (lands fast) | -| `think_hard` | Delegate a hard question (multi-step planning, 3+ digit arithmetic). | `qwen3.6:27b-think` via llama-swap. 
Long timeout (`BRIDGE_TIMEOUT_LONG`, default 30 s). | none | -| `play_song` | Play a song through the speaker. | Bridge → xiaozhi `/xiaozhi/admin/play-asset`. | none (fire-and-forget) | -| `take_photo` | Look through Dotty's camera and describe what's visible. | Bridge → VLM (`VLM_MODEL`, default `google/gemini-2.0-flash-001`). | "😮 Let me have a look." | +| `memory_lookup` | Recall a fact from a past conversation. Use when the user says "do you remember…" or refers to a past topic by name. | bridge `/api/voice/escalate` (short timeout, 5 s) | none (lands fast) | +| `think_hard` | Delegate a hard question (multi-step planning, 3+ digit arithmetic). | bridge `/api/voice/escalate` → `qwen3.6:27b-think` via llama-swap (long timeout, 30 s) | none | +| `play_song` | Play a song through the speaker. | bridge → xiaozhi `/xiaozhi/admin/play-asset` (fire-and-forget) | none | +| `take_photo` | Look through Dotty's camera and describe what's visible. | bridge → VLM (`VLM_MODEL`, default `google/gemini-2.0-flash-001`) | "😮 Let me have a look." | -Per-tool filler phrases (`tier1_slim.py:TOOL_FILLERS`) give TTS something to say while a slow tool runs. `None` means silent — used where the action lands instantly or would make a filler misleading. +Per-tool filler phrases (`tier1_slim.py:TOOL_FILLERS`) give TTS something to say while a slow tool runs. `None` means silent. ## Wire format @@ -74,6 +73,8 @@ Plain OpenAI-compatible chat completion with `tools=auto`. The slim 4 B model de ### Escalation call (Tier1Slim → bridge) +Defined in the source for reference; non-functional in the current stack. + ```http POST {BRIDGE_URL}/api/voice/escalate Content-Type: application/json @@ -91,14 +92,12 @@ Response: {"result": ""} ``` -`memory_lookup` and `think_hard` block until the result arrives. `play_song` and `take_photo` block too but the bridge returns quickly because the side effect is dispatched downstream. 
- ### Memory side channel -Two fire-and-forget POSTs run alongside escalation: +Two fire-and-forget POSTs are defined alongside escalation (also non-functional post-cutover): - `POST /api/voice/remember` — `{"fact": "...", "session_id": "..."}`. Triggered when the model emits a `[REMEMBER: ...]` marker inside the final reply. The marker is stripped before TTS. -- `POST /api/voice/memory_log` — `{"user": "...", "assistant": "...", "session_id": "..."}`. Logs the turn so ZeroClaw can index it for future `memory_lookup` calls. Posted at end-of-turn. +- `POST /api/voice/memory_log` — `{"user": "...", "assistant": "...", "session_id": "..."}`. Logs the turn for future `memory_lookup` calls. Posted at end-of-turn. Both have 2 s timeouts and never raise — failures log and continue. @@ -122,11 +121,11 @@ LLM: persona_file: personas/dotty_voice.md ``` -Environment variables (read by the bridge for smart-mode flips): +Environment variables (read by bridge.py for smart-mode flips): | Variable | Default | Purpose | |---|---|---| -| `DOTTY_VOICE_PROVIDER` | `zeroclaw` | Set to `tier1slim` to enable the hot-swap path. | +| `DOTTY_VOICE_PROVIDER` | `tier1slim` | Set to `tier1slim` to enable the Tier1Slim hot-swap path. | | `TIER1SLIM_LOCAL_URL` | `http://localhost:8080/v1` | Inner-loop endpoint when smart_mode is OFF. | | `TIER1SLIM_LOCAL_MODEL` | `qwen3.5:4b` | Model name on the local endpoint. | | `TIER1SLIM_LOCAL_API_KEY` | `dotty-voice` | Sent as `Authorization: Bearer …`. llama-swap ignores. | @@ -139,15 +138,16 @@ Environment variables (read by the bridge for smart-mode flips): ## Persona handling -Tier1Slim uses a single small system prompt (`personas/dotty_voice.md` by default) and discards xiaozhi-server's top-level `prompt:` block. The 4 B chat template only honours one system message, and xiaozhi's default prompt is sized for the ZeroClawLLM agentic path — concatenating both starves the small model's attention. 
If no `persona_file` is set, Tier1Slim falls back to merging the dialogue's system messages. +Tier1Slim uses a single small system prompt (`personas/dotty_voice.md` by default) and discards xiaozhi-server's top-level `prompt:` block. The 4 B chat template only honours one system message, and xiaozhi's default prompt is sized for full agentic paths — concatenating both starves the small model's attention. If no `persona_file` is set, Tier1Slim falls back to merging the dialogue's system messages. The emoji + English rules are appended per turn via `build_turn_suffix(KID_MODE)` (`custom-providers/textUtils.py`). Same set as elsewhere: 😊😆😢😮🤔😠😐😍😴. Fallback prefix is 😐. ## See also +- [llm-backends.md](./llm-backends.md) — choosing between PiVoiceLLM (default), Tier1Slim, and OpenAICompat. - [voice-pipeline.md](./voice-pipeline.md) — where Tier1Slim sits in the ASR → LLM → TTS chain. -- [brain.md](./brain.md) — the ZeroClaw agent that Tier1Slim escalates to. -- [protocols.md](./protocols.md) — `/api/voice/escalate`, `/api/voice/remember`, `/api/voice/memory_log` wire formats. -- [llm-backends.md](./llm-backends.md) — choosing between Tier1Slim, ZeroClawLLM, OpenAICompat. +- [brain.md](./brain.md) — the dotty-pi agent (the active brain) and its tool set. +- [protocols.md](./protocols.md) — `/api/voice/escalate` wire format and its post-cutover status. - [modes.md](./modes.md) — how smart_mode swaps the inner-loop backend. -- [cookbook/llama-swap-concurrent-models.md](./cookbook/llama-swap-concurrent-models.md) — running `qwen3.5:4b` + `qwen3.6:27b-think` concurrently on one GPU pair so Tier1Slim's escalation tools don't evict the inner-loop model. +- [cutover-behaviour.md](./cutover-behaviour.md) — historical runbook for the ZeroClaw→PiVoiceLLM cutover. +- [cookbook/llama-swap-concurrent-models.md](./cookbook/llama-swap-concurrent-models.md) — running `qwen3.5:4b` + `qwen3.6:27b-think` concurrently on one GPU pair. 
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 0cfbe49..b867ad8 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -16,8 +16,8 @@ Symptom-first lookup table covering common and obscure failure modes. Pair with **Cause:** Language mismatch between the TTS voice and the response text. EdgeTTS `en-*` voices return empty audio when given non-English text (Chinese, Japanese, etc.). This is not a throttle or rate limit — it's a silent failure in the EdgeTTS service. **Fix:** -1. Check the bridge logs for the LLM response text. If it contains non-English characters, the LLM is ignoring the English enforcement. -2. Verify the sandwich enforcement in `bridge.py` is active — the `_ensure_emoji_prefix` and per-turn English wrapping should be preventing this. +1. Check the xiaozhi-server logs for the LLM response text. If it contains non-English characters, the LLM is ignoring the English enforcement in the persona prompt and the `.config.yaml` `prompt:` block. +2. Confirm English enforcement is active: check `personas/dotty_voice.md` and the top-level `prompt:` in `data/.config.yaml` both contain explicit English-only instructions. 3. Check `data/.config.yaml` to confirm the TTS voice matches the expected response language (e.g., `en-AU-WilliamNeural` for English). 4. If using Piper TTS instead of EdgeTTS, confirm the selected voice model matches the response language. @@ -32,7 +32,7 @@ Symptom-first lookup table covering common and obscure failure modes. Pair with **Fix:** 1. Confirm the bridge's per-turn sandwich enforcement is active. Static system prompts alone are not enough — the bridge must wrap every turn with explicit English+emoji instructions. This is the `_build_sandwich_prompt` logic in `bridge.py`. 2. Check that the bridge is actually being called (not bypassed). Tail the bridge logs while testing. -3. If the leak happens on the first turn, the ZeroClaw persona files may contain non-English text. 
Check `SOUL.md` and `IDENTITY.md` in `~/.zeroclaw/workspace/`. +3. If the leak happens on the first turn, check the persona file (`personas/dotty_voice.md`) for any non-English text. 4. As a last resort, the ASR may be mis-transcribing English as another language. Check the `ASR.FunASR.language` key in `data/.config.yaml` is set to `en` (not `auto`). --- @@ -65,7 +65,7 @@ Symptom-first lookup table covering common and obscure failure modes. Pair with **Symptom:** The robot boots and connects to WiFi, but never responds to voice. May show a face but no indication of listening. **Fix:** -1. Check the bridge health endpoint: `curl http://:8080/health`. If the bridge is down, restart it. +1. Check the bridge health endpoint: `curl http://:8080/health`. If the bridge is down, restart it. 2. Check xiaozhi-server logs: `docker logs -f xiaozhi-esp32-server` on the server. Look for connection attempts from the robot. 3. Verify the robot's OTA URL hasn't changed. After a firmware update, re-enter the OTA URL (`http://:8003/xiaozhi/ota/`) in the robot's Advanced Options if needed. 4. Open the browser test page (`repo/main/xiaozhi-server/test/test_page.html`) and point it at `ws://:8000/xiaozhi/v1/`. If the browser page works but the robot doesn't, it's a robot-side configuration issue. @@ -81,7 +81,7 @@ Symptom-first lookup table covering common and obscure failure modes. Pair with **Fix:** 1. Check `docker logs xiaozhi-esp32-server` for the exact missing module name. 2. Verify the volume mounts in `docker-compose.yml` match the expected paths. The custom providers must land at: - - `custom-providers/zeroclaw/` -> `/opt/xiaozhi-esp32-server/core/providers/llm/zeroclaw/` + - `custom-providers/pi_voice/` -> `/opt/xiaozhi-esp32-server/core/providers/llm/pi_voice/` - `custom-providers/edge_stream/` -> `/opt/xiaozhi-esp32-server/core/providers/tts/edge_stream/` - `custom-providers/asr/fun_local.py` -> `/opt/xiaozhi-esp32-server/core/providers/asr/fun_local.py` 3. 
If the missing module is a Python dependency (e.g., `pydub`, `edge-tts`), it may not be in the base image. Add it via the compose file's environment or bake a custom image layer. @@ -110,9 +110,9 @@ Symptom-first lookup table covering common and obscure failure modes. Pair with | `😴` | Sleepy | **Fix:** -1. Check bridge logs to see the raw response from ZeroClaw. The bridge has a `_ensure_emoji_prefix` fallback that prepends `😐` if no emoji is detected — if the response still has no emoji, the fallback isn't firing. +1. Check the xiaozhi-server logs for the raw LLM response. Two enforcement layers apply: (a) the configured persona prompt (`personas/dotty_voice.md`), (b) the `prompt:` key in `data/.config.yaml`. If the response still has no emoji after both layers, something is fundamentally wrong with the response path. 2. If the response has an emoji but the face doesn't change, it may be an unsupported emoji. Only the nine listed above are mapped to animations. -3. The three enforcement layers are: (a) ZeroClaw's agent prompt, (b) the `prompt:` key in `data/.config.yaml`, (c) the bridge fallback. If all three fail, something is fundamentally wrong with the response path. +3. On the `PiVoiceLLM` path the `_ensure_emoji_prefix` fallback in `bridge.py` is not active — emoji enforcement relies entirely on the persona prompt and the `.config.yaml` `prompt:` block. --- @@ -128,15 +128,13 @@ Symptom-first lookup table covering common and obscure failure modes. Pair with ## Bridge unreachable / "(no response)" in the robot's voice -**Symptom:** The robot says something like "no response" or goes silent after you speak. xiaozhi-server logs show a failed HTTP POST to the bridge. +**Symptom:** The robot says something like "no response" or goes silent after you speak. xiaozhi-server logs show a failed HTTP POST or a failed `docker exec` call. **Fix:** -1. 
Check bridge status on the ZeroClaw host: - - Bare metal: `systemctl status zeroclaw-bridge` - - Docker: `docker ps | grep zeroclaw-bridge` -2. Test the health endpoint: `curl http://:8080/health` -3. If the bridge is running but unreachable, check firewall rules on the ZeroClaw host. Port 8080 must be open for LAN traffic. -4. If the bridge crashes on startup, check logs for a ZeroClaw binary issue: the bridge spawns `zeroclaw acp` as a child process. If the binary is missing or the config is invalid, the bridge won't start. +1. Check that the `bridge.py` dashboard container and the `dotty-pi` brain container are running: `docker ps | grep -E 'bridge|dotty-pi'` +2. Test the bridge dashboard health endpoint: `curl http://:8080/health` +3. For `PiVoiceLLM` failures, check that the `dotty-pi` container is healthy and the Docker socket is bind-mounted into the xiaozhi container. `docker exec -i dotty-pi echo ok` should return `ok`. +4. If the bridge container crashes on startup, check its logs: `docker logs bridge` --- @@ -156,7 +154,7 @@ Symptom-first lookup table covering common and obscure failure modes. Pair with - [quickstart.md](./quickstart.md) — happy-path setup + common ops + reboot survival. - [voice-pipeline.md](./voice-pipeline.md) — details on ASR, TTS, VAD tuning. -- [protocols.md](./protocols.md) — WebSocket and ACP wire format for debugging. +- [protocols.md](./protocols.md) — WebSocket wire format for debugging. - [hardware.md](./hardware.md) — hardware specs and safety notes. Last verified: 2026-05-17. diff --git a/docs/voice-pipeline.md b/docs/voice-pipeline.md index a6115d1..5b24396 100644 --- a/docs/voice-pipeline.md +++ b/docs/voice-pipeline.md @@ -8,8 +8,8 @@ description: xiaozhi-esp32-server pipeline stages -- VAD, ASR, LLM proxy, and TT ## TL;DR - **Server** is `xinnan-tech/xiaozhi-esp32-server` running in Docker on a Linux host. 
Plugin-based: each of VAD, ASR, LLM, TTS, Memory, Intent is a swappable provider picked via `data/.config.yaml`'s `selected_module:` block. -- Our live pipeline: **SileroVAD** (speech-end) → **FunASR SenseVoiceSmall** or **WhisperLocal** (ASR, pinned to English) → **Tier1Slim** custom provider (current default — talks directly to llama-swap, escalates tool calls to the bridge) or **ZeroClawLLM** (legacy — HTTP POST to bridge for every turn) → **LocalPiper** en_US-kristin-medium (TTS; EdgeTTS / StreamingEdgeTTS as alternates). -- The xiaozhi container also runs a perception relay (`EventTextMessageHandler`) that forwards firmware `face_detected` / `face_lost` / `sound_event` / `state_changed` frames to the bridge's `/api/perception/event`. +- Our live pipeline: **SileroVAD** (speech-end) → **FunASR SenseVoiceSmall** or **WhisperLocal** (ASR, pinned to English) → **PiVoiceLLM** custom provider (current default — `docker exec -i dotty-pi pi --mode rpc` over stdio, brain is the `dotty-pi` container) or **Tier1Slim** (alternate — talks directly to llama-swap, escalates tool calls to the bridge) → **LocalPiper** en_GB-cori-medium (TTS; EdgeTTS / StreamingEdgeTTS as alternates). +- The xiaozhi container also runs a perception relay (`EventTextMessageHandler`) that forwards firmware `face_detected` / `face_lost` / `sound_event` / `state_changed` frames to `dotty-behaviour`'s `/api/perception/event`. - **Emotion** is not a pipeline stage — it's extracted post-hoc from the LLM's emoji prefix and emitted as a separate WS frame. See [protocols.md](./protocols.md#emotion-protocol). - Custom providers are mounted into the container via Docker volumes at `/opt/xiaozhi-esp32-server/core/providers/{asr,tts,llm}/…`. They override the baked-in files at module-import time. - **Lots of upstream features are unused** — voiceprint speaker-ID, VLLM vision, knowledge-base RAG, PowerMem, multi-user routing. See [latent-capabilities.md](./latent-capabilities.md#voice-pipeline-unused). 
@@ -31,7 +31,7 @@ From the `xinnan-tech/xiaozhi-esp32-server` README (see [references.md](./refere | **Intent** | intent_llm, function_call, nointent | | **Knowledge base** | RagFlow | -**What we use:** SileroVAD + FunASR (patched) + custom ZeroClawLLM + LocalPiper (or EdgeTTS on rollback). Every other row is unused. +**What we use:** SileroVAD + FunASR (patched) + custom PiVoiceLLM + LocalPiper (or EdgeTTS on rollback). Every other row is unused. ## Our deployed stages @@ -63,29 +63,21 @@ Model: `FunAudioLLM/SenseVoiceSmall` on HuggingFace. From the model card: Deployment: mounted as a file-level override at `/opt/xiaozhi-esp32-server/core/providers/asr/fun_local.py`. -### LLM — two providers; one selected at a time +### LLM — one provider selected at a time -Pick one via `selected_module.LLM` in `.config.yaml`. +Pick one via `selected_module.LLM` in `.config.yaml`. The default is `PiVoiceLLM`; `Tier1Slim` and `OpenAICompat` are alternates. See [llm-backends.md](./llm-backends.md) for the full comparison. -#### `Tier1Slim` (current default) +#### `PiVoiceLLM` (default) -Custom provider at `custom-providers/tier1_slim/tier1_slim.py` (mounted into `/opt/xiaozhi-esp32-server/core/providers/llm/tier1_slim/`). Talks directly to a local llama-swap endpoint with a ~500-token system prompt and a four-tool catalogue (`memory_lookup`, `think_hard`, `take_photo`, `play_song`). Plain conversational turns are answered by the small inner-loop model (`qwen3.5:4b` by default) in well under 1 s warm — **no bridge round-trip**. +Custom provider at `custom-providers/pi_voice/` (mounted into `/opt/xiaozhi-esp32-server/core/providers/llm/pi_voice/`). It doesn't run a model itself — it hands each voice turn to the **`dotty-pi` container** by running `docker exec -i dotty-pi pi --mode rpc` and exchanging JSONL messages over stdio. 
The pi agent owns the conversation loop (`qwen3.5:4b` on local llama-swap) and the five `dotty-pi-ext` voice tools (`memory_lookup`, `remember`, `think_hard`, `take_photo`, `play_song`); only TTS-bound text streams back. See [brain.md](./brain.md). -When the model emits structured `tool_calls`, the provider: +#### `Tier1Slim` (alternate) -1. Yields an immediate filler phrase to TTS so the user hears something within ~500 ms. -2. POSTs each tool call to the bridge's `/api/voice/escalate` endpoint, which dispatches to ZeroClaw memory (`memory_lookup`), the 27 B thinker on the same llama-swap (`think_hard`), the VLM (`take_photo`), or `/xiaozhi/admin/play-asset` (`play_song`), then returns the result. -3. Makes a second streaming chat call with the tool result in context and streams the final answer to TTS. +Custom provider at `custom-providers/tier1_slim/tier1_slim.py`. Talks directly to a local llama-swap endpoint; plain conversational turns are answered by the small inner-loop model (`qwen3.5:4b`) in well under 1 s warm. When the model emits structured `tool_calls`, the provider POSTs them to `/api/voice/escalate` — but that endpoint was served by the retired ZeroClaw bridge, so tool escalation is non-functional post-#36. Tier1Slim is now a chitchat-only rollback path. See [tier1slim.md](./tier1slim.md). -Tier1Slim also exposes `set_runtime(model, url, api_key)` for the bridge to hot-swap the live provider's backend without a docker restart — this is how smart-mode flips land instantly. See [tier1slim.md](./tier1slim.md) for the full wire format and configuration. +### Perception relay (xiaozhi → dotty-behaviour) -#### `ZeroClawLLM` (legacy single-tier path) - -Custom provider at `custom-providers/zeroclaw/zeroclaw.py`. Not really an LLM — it's a proxy. The `response()` method is a thin HTTP POST to `http://:8080/api/message`. Every turn round-trips through the bridge to ZeroClaw to OpenRouter, with full agent memory and tooling on every call. 
See [brain.md](./brain.md). - -### Perception relay (xiaozhi → bridge) - -`custom-providers/xiaozhi-patches/textMessageHandlerRegistry.py` adds an `EventTextMessageHandler` that intercepts firmware `event` frames over the WS and POSTs each one to the bridge's `/api/perception/event`. This is what feeds the bridge-side `_perception_*` consumers — see [architecture.md](./architecture.md#perception-event-bus). +`custom-providers/xiaozhi-patches/textMessageHandlerRegistry.py` adds an `EventTextMessageHandler` that intercepts firmware `event` frames over the WS and POSTs each one to `dotty-behaviour`'s `/api/perception/event`. This is what feeds the `dotty-behaviour` perception consumers — see [architecture.md](./architecture.md#perception-event-bus). ### TTS — LocalPiper (active) / EdgeTTS (rollback) @@ -125,15 +117,15 @@ xiaozhi-server doesn't run an emotion classifier. It **strips the leading emoji* The TTS provider receives text **with the emoji already stripped**. The device receives the emotion and sets the face animation; the speaker plays the clean text. -**Surprising consequence**: the LLM must emit the emoji as its very first character for emotion dispatch to fire. Our bridge (`bridge.py`) prefixes 😐 as a fallback so the feature never silently fails. See [protocols.md](./protocols.md#emotion-protocol) for the 3-layer enforcement. +**Surprising consequence**: the LLM must emit the emoji as its very first character for emotion dispatch to fire. On the `PiVoiceLLM` path, enforcement relies on the persona prompt and the `.config.yaml` `prompt:` block — the `bridge.py` `_ensure_emoji_prefix` fallback only applies to the retired ZeroClaw path. See [protocols.md](./protocols.md#emotion-protocol) for the enforcement layers. **Note — we don't use SenseVoice's built-in SER.** The model card advertises speech emotion recognition and audio-event detection (bgm / applause / laughter / crying / coughing / sneezing). 
xiaozhi-server's FunASR provider returns only the transcription text; the SER/AED fields aren't piped through. That's a genuine latent capability — see [latent-capabilities.md](./latent-capabilities.md#voice-pipeline-unused). ## See also -- [protocols.md](./protocols.md#xiaozhi-websocket) — how audio gets in and out (and the `/api/voice/escalate` / `/api/perception/event` wire formats). -- [tier1slim.md](./tier1slim.md) — the default LLM provider in detail. -- [brain.md](./brain.md) — the model matrix and the legacy ZeroClawLLM path. +- [protocols.md](./protocols.md#xiaozhi-websocket) — how audio gets in and out (and the `/api/perception/event` wire format). +- [brain.md](./brain.md) — the pi agent, model matrix, and dotty-pi-ext voice tools. +- [tier1slim.md](./tier1slim.md) — the Tier1Slim alternate provider in detail. - [latent-capabilities.md](./latent-capabilities.md#voice-pipeline-unused) — unused upstream features. - [references.md](./references.md#voice) — all upstream voice-stack links. diff --git a/dotty-behaviour/README.md b/dotty-behaviour/README.md index 888b4f4..399d735 100644 --- a/dotty-behaviour/README.md +++ b/dotty-behaviour/README.md @@ -99,7 +99,7 @@ Subsequent slices land: ## Cutover (historical) Cutover landed 2026-05-19. xiaozhi-server's `VISION_BRIDGE_URL` env -var was flipped from `http://:8080` to +var was flipped from the old RPi bridge URL to `http://:8090` (the Unraid LAN IP, not loopback — see the networking note above), and the matching `plugins.vision_explain` URL in `data/.config.yaml` was flipped the same way. Full runbook + diff --git a/dotty-behaviour/docker-compose.yml b/dotty-behaviour/docker-compose.yml index f37a6ca..7b0d419 100644 --- a/dotty-behaviour/docker-compose.yml +++ b/dotty-behaviour/docker-compose.yml @@ -12,8 +12,9 @@ # dashboard's host-status probes work without DNS gymnastics. # # Port 8090 (not the bridge's 8080) — llama-swap owns 8080 on Unraid. 
-# xiaozhi-server's VISION_BRIDGE_URL must move from -# http://:8080 → http://localhost:8090 at cutover. +# xiaozhi-server's VISION_BRIDGE_URL must point to +# http://:8090 (the Unraid LAN IP — loopback works only +# if xiaozhi-server is also on host networking). services: dotty-behaviour: diff --git a/household.example.yaml b/household.example.yaml index acd70bc..bc81508 100644 --- a/household.example.yaml +++ b/household.example.yaml @@ -1,9 +1,9 @@ # Household registry for Dotty. # -# Copy this file to `~/.zeroclaw/household.yaml` (or wherever -# HOUSEHOLD_YAML_PATH points) on the bridge host (the ZeroClaw host) and edit the -# entries to match your household. The bridge hot-reloads on save — no -# restart needed. +# Copy this file to the path set by HOUSEHOLD_YAML_PATH (default: +# /mnt/user/appdata/dotty-behaviour/state/household.yaml) on the Docker host +# and edit the entries to match your household. dotty-behaviour hot-reloads +# on save — no restart needed. # # Schema (all fields under each person are optional except display_name): # @@ -29,10 +29,10 @@ # ("[Sam]") so events get bucketed per person. # voice_print_id null until speaker-ID lands; reserved. # -# Privacy: this file lives on the ZeroClaw host, never egresses. The bridge sends -# only the *compact_description* of the active person to the LLM (~200 -# chars). Birthdate is used locally for "birthday in N days" — never -# sent to the LLM in raw ISO form. +# Privacy: this file lives on the Docker host, never egresses. dotty-behaviour +# sends only the *compact_description* of the active person to the LLM (~200 +# chars). Birthdate is used locally for "birthday in N days" — never sent to +# the LLM in raw ISO form. default_person: _household diff --git a/monitoring/README.md b/monitoring/README.md index eba56cf..82eaac6 100644 --- a/monitoring/README.md +++ b/monitoring/README.md @@ -1,6 +1,6 @@ # monitoring/ -Operational artifacts for observing the zeroclaw-bridge. 
+Operational artifacts for observing the `bridge.py` dashboard service. - `grafana-dashboard.json` — starter Grafana dashboard for the Prometheus metrics exposed by `bridge.py` at `/metrics`. Import it diff --git a/session-prompt.md b/session-prompt.md index 330b1c9..4f9fbc9 100644 --- a/session-prompt.md +++ b/session-prompt.md @@ -1,10 +1,10 @@ # Claude Code Session Prompt — StackChan Infrastructure Setup -> Historical bootstrap prompt. Kept for reference — describes how this -> infra was originally stood up across two remote machines. Not all of it -> matches the current implementation (see `README.md` and `SETUP.md` for -> current truth), and the firmware-provisioning steps in particular now -> require the build-from-source flow in `SETUP.md`. +> Historical bootstrap prompt. Describes how this infra was originally stood +> up. Updated after the 2026-05-19 cutover (#36) which retired the separate +> ZeroClaw/RPi host and consolidated all services onto a single Docker host. +> The firmware-provisioning steps still require the build-from-source flow in +> `SETUP.md`. Paste this into your terminal: @@ -16,75 +16,85 @@ claude --prompt-file ./session-prompt.md ## Prompt content (save as `session-prompt.md`): -I need you to set up infrastructure across two remote machines for an M5Stack StackChan robot. You'll be SSHing from this workstation to both targets via Tailscale. Read the CLAUDE.md in this directory first for full architecture context. +I need you to set up infrastructure on a single Linux Docker host for an M5Stack StackChan robot. You'll be SSHing from this workstation to the target via Tailscale. Read the CLAUDE.md in this directory first for full architecture context. ## What you're building -A self-hosted voice pipeline that routes StackChan's audio through a xiaozhi-esp32-server (ASR + TTS) running on a Linux Docker host, with all AI processing forwarded to a ZeroClaw instance running on a separate host. 
+A self-hosted voice pipeline with four Docker containers on one host: + +1. **xiaozhi-esp32-server** — ASR (FunASR SenseVoiceSmall) + TTS (Piper/EdgeTTS) + WebSocket voice gateway for StackChan. +2. **dotty-pi** — the pi coding agent: the voice-tool brain. Runs `qwen3.5:4b` on a local llama-swap instance; invoked via `docker exec -i` by the `PiVoiceLLM` provider. See `dotty-pi/README.md`. +3. **dotty-behaviour** — perception event bus, ambient consumers, greeter, calendar. FastAPI on `:8090`. See `dotty-behaviour/README.md` and `scripts/deploy-behaviour.sh`. +4. **bridge.py** (admin dashboard) — HTMX admin UI on `:8080`. Runs as a container on the same host. + +All four containers run on the same machine (the Docker host). There is no separate brain host or RPi. ## Discovery steps (do these first) -1. Run `tailscale status` (if you use Tailscale) to find the hostnames and IPs for both the Docker host and the ZeroClaw host. Identify which is which from the OS/hostname. -2. SSH into the Docker host. Find its LAN IP (not Tailscale IP) — StackChan will need this because it's on WiFi, not Tailnet. Check `ip addr` or `hostname -I`. Also confirm Docker is available and pick a directory for the xiaozhi-server install (e.g. `/opt/xiaozhi-server/` or `/srv/xiaozhi-server/`). -3. SSH into the ZeroClaw host. Find its LAN IP similarly. Confirm ZeroClaw is running — check `zeroclaw status` or look for the gateway process on port 18789. Note the exact port and any API endpoints it exposes. Also check what Python version is available and whether pip/fastapi are already installed. -4. Test basic connectivity: from the Docker host, can you reach the ZeroClaw host's LAN IP? You may need to test this from inside a throwaway container (`docker run --rm alpine ping ZEROCLAW_LAN_IP`). +1. Run `tailscale status` (if you use Tailscale) to find the hostname and IP for the Docker host. +2. SSH into the Docker host. 
Find its LAN IP (not Tailscale IP) — StackChan will need this because it's on WiFi, not Tailnet. Check `ip addr` or `hostname -I`. Also confirm Docker is available and pick a directory for the install (e.g. `/opt/xiaozhi-server/` or `/srv/xiaozhi-server/`). +3. Check whether a local model backend is already running — llama-swap at `:8080/health` (or Ollama at `:11434/api/tags`). If not, set up Ollama as the simpler single-binary option (see `cookbook/run-fully-local.md`) or llama-swap if you need concurrent voice+coding model sets. ## Docker host setup (xiaozhi-esp32-server) On the Docker host: 1. Create the directory structure at your chosen install path (e.g. `/opt/xiaozhi-server/`) with subdirs: `data/`, `models/SenseVoiceSmall/`, `tmp/`. -2. Clone `https://github.com/xinnan-tech/xiaozhi-esp32-server.git` into a `repo/` subdir. -3. Download the SenseVoiceSmall ASR model (`model.pt`, ~250MB) into `models/SenseVoiceSmall/`. Try ModelScope first: `https://www.modelscope.cn/models/iic/SenseVoiceSmall/resolve/master/model.pt`. If that's slow, use HuggingFace: `https://huggingface.co/FunAudioLLM/SenseVoiceSmall/resolve/main/model.pt`. Verify the file is >200MB after download. -4. Create the custom ZeroClaw LLM provider at `repo/main/xiaozhi-server/core/providers/llm/zeroclaw/zeroclaw.py` plus `__init__.py`. The provider: - - Extends `LLMProviderBase` from `core.providers.llm.base` - - Sends HTTP POST to the ZeroClaw bridge on the ZeroClaw host - - Passes the user's transcribed text plus a system prompt that enforces emoji-first responses for StackChan face animations - - Handles connection errors gracefully with emoji-prefixed fallback messages - - Implements both `response()` and `response_stream()` (stream can just yield the non-stream result for now) - - Returns `False` from `function_call_supported()` — ZeroClaw handles its own tools -5. Create `data/.config.yaml` with: +2. Clone this repo (`dotty-stackchan`) into the install path or copy the relevant files. 
The `make setup` wizard handles most substitution. +3. Download the SenseVoiceSmall ASR model (`model.pt`, ~250 MB) into `models/SenseVoiceSmall/`. Try ModelScope first: `https://www.modelscope.cn/models/iic/SenseVoiceSmall/resolve/master/model.pt`. If that's slow, use HuggingFace: `https://huggingface.co/FunAudioLLM/SenseVoiceSmall/resolve/main/model.pt`. Verify the file is >200 MB after download. +4. Create `data/.config.yaml` with: - `selected_module.ASR: FunASRLocal` - - `selected_module.LLM: ZeroClawLLM` - - `selected_module.TTS: EdgeTTS` + - `selected_module.LLM: PiVoiceLLM` + - `selected_module.TTS: LocalPiper` (or `EdgeTTS` if you don't want offline TTS) - `selected_module.VAD: SileroVAD` - - EdgeTTS voice: `en-AU-WilliamNeural` - - ZeroClaw URL pointing to the ZeroClaw host's LAN IP, port 8080 + - PiVoiceLLM container name: `dotty-pi` (the `docker exec` target) - A system prompt that identifies as a desktop robot assistant. Enforce emoji-first responses. Keep TTS-friendly (short sentences). - - VAD silence duration 700ms (so it doesn't cut off slow speakers) - - Use the actual LAN IPs you discovered, not placeholders. -6. Create `docker-compose.yml` that: - - Uses `ghcr.io/xinnan-tech/xiaozhi-esp32-server:server_latest` - - Exposes ports 8000 (WebSocket) and 8003 (OTA/HTTP) - - Mounts `data/.config.yaml`, `models/`, `tmp/`, and the custom ZeroClaw provider directory - - Sets TZ to Australia/Brisbane - - Pip installs `aiohttp` on startup (the base image may not have it) - - **Important**: Check the actual container's internal directory structure first before writing the volume mounts. Run `docker run --rm ghcr.io/xinnan-tech/xiaozhi-esp32-server:server_latest ls /opt/xiaozhi-server/` (or wherever the app lives) to find the correct internal paths. The mount targets must match where the app actually loads providers from. -7. Start the container, tail the logs, and confirm you see the WebSocket and OTA addresses in the output. 
- -## ZeroClaw host setup (ZeroClaw HTTP bridge) - -On the ZeroClaw host: - -1. First, understand how ZeroClaw actually accepts messages. Check the running config, look at the gateway's API, examine any webchat or REST endpoints. The bridge needs to translate a simple HTTP POST into whatever ZeroClaw actually expects. Don't assume the API shape — discover it. -2. Create `~/zeroclaw-bridge/bridge.py` — a FastAPI app that: - - Listens on 0.0.0.0:8080 - - Accepts POST `/api/message` with `{"content": "...", "channel": "stackchan", "session_id": "...", "metadata": {...}}` - - Forwards to ZeroClaw's actual API/gateway - - Returns `{"response": "emoji-prefixed text"}` - - Has a GET `/health` endpoint -3. Install deps (fastapi, uvicorn, whatever HTTP client is needed). -4. Create a systemd user service for it so it persists across reboots. -5. Start it and verify the health endpoint responds. + - VAD silence duration 700 ms (so it doesn't cut off slow speakers) + - Use the actual LAN IP for ``, not a placeholder. +5. Bring up the xiaozhi-esp32-server container via `docker compose up -d` (from `docker-compose.yml.template` after `make setup` substitutes your placeholders). +6. Check the container's internal directory structure before writing volume mounts. Run `docker run --rm ghcr.io/xinnan-tech/xiaozhi-esp32-server:server_latest ls /opt/xiaozhi-server/` (or wherever the app lives) to confirm internal paths. The mount targets must match where the app actually loads providers from. +7. Tail the logs and confirm you see the WebSocket and OTA addresses in the output. -## Testing +## Bring up dotty-pi (brain container) + +Follow `dotty-pi/README.md` for build and run instructions. Key points: + +- The container idles via `sleep infinity`; voice turns invoke pi on demand via `docker exec -i`. +- Model target for the outer agent loop: `qwen3.5:4b` (in the llama-swap `voice` matrix set). 
Do **not** use `qwen3.6:27b` here — it evicts the voice model set and causes cold-reload latency on every `think_hard` escalation. +- Mount `persona/`, `memory/brain.db`, and the `dotty-pi-ext` extension directory as documented in `dotty-pi/README.md`. + +## Bring up dotty-behaviour (perception + dashboard backend) -After both sides are up: +Follow `dotty-behaviour/README.md` for build and run instructions. The `scripts/deploy-behaviour.sh` helper handles the deploy step. Key points: + +- Runs in `network_mode: host` on port `:8090`. +- xiaozhi-server talks to it on `http://:8090` (the LAN IP, not loopback — xiaozhi-server is on bridge networking so its loopback resolves to itself). +- Set `VISION_BRIDGE_URL` env var in the xiaozhi-server compose to `http://:8090`. + +## Bring up the admin dashboard (bridge.py) + +The bridge.py dashboard container starts as part of the main compose stack. It serves the HTMX admin UI on `:8080/ui` and a health endpoint at `:8080/health`. No separate deployment step beyond `docker compose up -d`. + +## Testing -1. Curl the bridge health endpoint from the Docker host (from inside a Docker container to simulate the xiaozhi-server's network perspective). -2. Send a test message through the bridge and confirm you get an emoji-prefixed response back. -3. Check xiaozhi-server logs to confirm the WebSocket endpoint is listening and the OTA endpoint reports healthy. -4. If the repo includes a test HTML page (usually at `repo/main/xiaozhi-server/test/test_page.html`), note its location so I can open it in a browser for audio testing. +After all containers are up: + +1. Check the OTA endpoint from a LAN machine: + ```bash + curl -s http://:8003/xiaozhi/ota/ + # Expect: OTA接口运行正常... + ``` +2. Check the dashboard health: + ```bash + curl -s http://:8080/health + # Expect: {"status":"ok", ...} + ``` +3. Check dotty-behaviour health: + ```bash + curl -s http://:8090/health + # Expect: {"status":"ok", ...} + ``` +4. 
Tail the xiaozhi-server logs to confirm the WebSocket endpoint is listening. ## Final output @@ -99,12 +109,11 @@ OTA URL (enter this in StackChan's Advanced Settings): WebSocket endpoint: ws://X.X.X.X:8000/xiaozhi/v1/ -ZeroClaw bridge: - http://X.X.X.X:8080/api/message +Admin dashboard: + http://X.X.X.X:8080/ui -Test page for browser audio testing: - file:///path/to/test_page.html - (point it at the WebSocket endpoint above) +dotty-behaviour (perception bus): + http://X.X.X.X:8090/health When StackChan arrives: 1. Flash open firmware built from https://github.com/m5stack/StackChan @@ -118,7 +127,8 @@ When StackChan arrives: ## Important constraints - Use `micro` if you need to interactively edit files (not nano, not vim). -- Don't install anything on the local workstation — everything happens via SSH to the remote machines. +- Don't install anything on the local workstation — everything happens via SSH to the remote machine. - All IPs in config files must be real LAN IPs discovered at runtime, not Tailscale IPs (StackChan isn't on the Tailnet). - If any step fails, diagnose from the logs before retrying. Don't just re-run blindly. - The xiaozhi-esp32-server Docker image's internal directory structure may differ from the repo layout. Inspect the container before writing volume mounts. +- Be conservative about commands not documented in the component READMEs — link to `dotty-pi/README.md` and `dotty-behaviour/README.md` rather than inventing invocations. 
From ea296e8248e7e4b30b971408c0db5ebe3c67e964 Mon Sep 17 00:00:00 2001 From: Brett Kinny Date: Fri, 22 May 2026 21:18:21 +1000 Subject: [PATCH 3/4] =?UTF-8?q?docs:=20finish=20ZeroClaw=20retirement=20?= =?UTF-8?q?=E2=80=94=20nav,=20index,=20changelog,=20compose?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Completes the reconciliation in d4adb1d: - compose.all-in-one.yml: drop the dead zeroclaw-bridge service; it now brings up xiaozhi-server only and points at the dotty-pi / dotty-behaviour compose files. - mkdocs.yml + docs/README.md: drop the deleted multi-host / multi-daemon-split docs from nav and index. - pyproject.toml: remove the per-file lint rule for the deleted custom-providers/zeroclaw/zeroclaw.py. - bridge/requirements.txt: comment-only — drop the zeroclaw.py reference. - CHANGELOG.md: add the reconciliation entry. - brain.md / proactive-greetings.md: drop a phantom doc reference and correct the greeter state-file path to its real default. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 7 +++ bridge/requirements.txt | 7 +-- compose.all-in-one.yml | 92 +++++++++---------------------------- docs/README.md | 18 ++++---- docs/brain.md | 2 +- docs/proactive-greetings.md | 2 +- mkdocs.yml | 1 - pyproject.toml | 3 -- 8 files changed, 41 insertions(+), 91 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bb3dc99..3314dfe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ### Added - **Bridge systemd unit loads API keys from `${BRIDGE_DIR}/.env`** (#15) — `zeroclaw-bridge.service.template` and `scripts/install-bridge.sh` now emit `EnvironmentFile=-${BRIDGE_DIR}/.env`. 
`install-bridge.sh` creates a mode-0600 stub `.env` containing `OPENROUTER_API_KEY=` (and commented `VISION_API_KEY` / `VLM_API_KEY` placeholders) when one isn't already present, so the missing-vision-key failure surfaces as the bridge's existing ERROR ("camera offline") instead of a silent confabulation. Existing `.env` files are preserved. +### Changed +- **Documentation reconciled to the post-#36 architecture** — `README.md`, `CLAUDE.md`, and the `docs/` tree previously described the retired ZeroClaw bridge and its Raspberry-Pi brain host. They now describe the live stack: the `dotty-pi` pi-agent container (the voice brain, reached via the `PiVoiceLLM` provider), `dotty-behaviour` (perception bus + ambient consumers + greeter, port 8090), and `bridge.py` as the admin dashboard service (port 8080). `.config.yaml.template` and `docker-compose.yml.template` updated to match — `vision_explain` / `VISION_BRIDGE_URL` now point at dotty-behaviour, and the `zeroclaw` provider mount + `ZeroClawLLM` config block are gone. The #36 cutover was executed 2026-05-19; this is the follow-up doc sweep its runbook deferred. + +### Removed +- **`custom-providers/zeroclaw/`** — the `ZeroClawLLM` voice provider, dead since the #36 cutover. +- **`docs/multi-daemon-split.md`, `docs/advanced/multi-host.md`** — both documented ZeroClaw-host topologies that no longer exist. + ## [server-v0.1.0] - 2026-05-17 First git-tagged public release. Covers all server + firmware work shipped to `main` between project inception and 2026-05-17. The earlier `[0.1.0] - 2026-04-25` entry below describes a pre-tag internal milestone — retained for historical reference, but `server-v0.1.0` is the canonical first release. 
diff --git a/bridge/requirements.txt b/bridge/requirements.txt index 18a3355..1490f9b 100644 --- a/bridge/requirements.txt +++ b/bridge/requirements.txt @@ -1,14 +1,11 @@ -# zeroclaw-bridge Python dependencies +# bridge.py (admin dashboard service) Python dependencies # Pin to major.minor — patch bumps are safe; minors may change API. fastapi>=0.115,<1 uvicorn[standard]>=0.34,<1 pydantic>=2.9,<3 jinja2>=3.1,<4 python-multipart>=0.0.9,<1 -# HTTP client — used by bridge.py for /admin endpoints + greeter inject-text -# and by custom-providers/zeroclaw/zeroclaw.py for OpenRouter SSE streaming. -# Was previously imported lazily in 6 functions and only worked on the live -# host because face-recognition pulled it in transitively. +# HTTP client — used by bridge.py for /admin endpoints + greeter inject-text. requests>=2.31,<3 # Observability — Prometheus /metrics endpoint and counters/histograms. # Bridge degrades gracefully if the import fails; see bridge/metrics.py. diff --git a/compose.all-in-one.yml b/compose.all-in-one.yml index bd0cf79..2b6dc30 100644 --- a/compose.all-in-one.yml +++ b/compose.all-in-one.yml @@ -1,22 +1,28 @@ -# compose.all-in-one.yml — single-host deployment +# compose.all-in-one.yml — single-host deployment (xiaozhi-server) # -# Runs both xiaozhi-esp32-server (voice I/O) and zeroclaw-bridge (LLM brain) -# on one Docker host. The two services talk over a shared Docker network; -# no real IPs needed between them. +# Since the #36 cutover (2026-05-19) the Dotty stack is several containers, +# all on one Docker host: # -# For the multi-host (Docker host + ZeroClaw host) deployment, see docs/advanced/multi-host.md. 
+# - xiaozhi-esp32-server — voice I/O ← this file +# - dotty-pi — the pi agent brain ← dotty-pi/docker-compose.yml +# - dotty-behaviour — perception+greeter ← dotty-behaviour/docker-compose.yml +# - bridge.py — the admin dashboard (port 8080) # -# ─── Prerequisites ────────────────────────────────────────────────────── +# This file brings up xiaozhi-esp32-server only. Bring up the brain and +# behaviour containers from their own compose files — see dotty-pi/README.md +# and dotty-behaviour/README.md. For the fully-annotated xiaozhi-server +# service (CUDA block, docker-socket mount, every xiaozhi-patch volume), use +# docker-compose.yml.template — that is the canonical definition; this file +# is the trimmed single-host convenience copy. # -# 1. ZeroClaw must be installed on the Docker host (or inside a volume -# the bridge container can reach). The bridge container needs: -# - The `zeroclaw` binary (default: /root/.cargo/bin/zeroclaw) -# - A ZeroClaw config directory (default: /root/.zeroclaw/) -# Install ZeroClaw: https://github.com/zeroclaw-labs/zeroclaw +# ─── Prerequisites ────────────────────────────────────────────────────── # +# 1. The `dotty-pi` container must be running — the default LLM provider +# (PiVoiceLLM) reaches it via `docker exec`. See dotty-pi/README.md. +# That also requires the host docker socket mounted into this container +# (see docker-compose.yml.template). # 2. ASR model weights must be downloaded into models/SenseVoiceSmall/. # See the main README for setup instructions. -# # 3. If using LocalPiper TTS, download Piper voice files into models/piper/. # # ─── Quick start ──────────────────────────────────────────────────────── @@ -24,15 +30,12 @@ # 1. Copy .env.example to .env and fill in real values. # 2. Edit data/.config.yaml — replace with this host's LAN IP # (the StackChan device needs a LAN-reachable IP for WebSocket/OTA). -# The LLM URL is already set to use the Docker network (http://bridge:8080/...). # 3. 
docker compose -f compose.all-in-one.yml up -d # 4. Point StackChan OTA URL to http://:8003/xiaozhi/ota/ # # ─── What to change for your environment ──────────────────────────────── # # - TZ: set to your IANA timezone (e.g. America/New_York, Europe/London) -# - ZEROCLAW_BIN: path to the zeroclaw binary on the host -# - ZeroClaw config volume: path to your .zeroclaw/ directory on the host # - Piper model paths in data/.config.yaml if you use different voices # @@ -40,8 +43,9 @@ services: # ── xiaozhi-esp32-server ───────────────────────────────────────────── # Voice pipeline: ASR, TTS, emotion parsing, WebSocket to the StackChan. - # The LLM provider (ZeroClawLLM) calls the bridge service over the - # Docker network — no real IP needed. + # The default LLM provider (PiVoiceLLM) reaches the `dotty-pi` container + # via `docker exec` — see docker-compose.yml.template for the docker-socket + # mount that this requires. xiaozhi-server: build: context: . @@ -61,7 +65,7 @@ services: - ./data/.config.yaml:/opt/xiaozhi-esp32-server/data/.config.yaml:ro - ./models/SenseVoiceSmall:/opt/xiaozhi-esp32-server/models/SenseVoiceSmall - ./tmp:/opt/xiaozhi-esp32-server/tmp - - ./custom-providers/zeroclaw:/opt/xiaozhi-esp32-server/core/providers/llm/zeroclaw + - ./custom-providers/pi_voice:/opt/xiaozhi-esp32-server/core/providers/llm/pi_voice - ./custom-providers/edge_stream/edge_stream.py:/opt/xiaozhi-esp32-server/core/providers/tts/edge_stream.py:ro - ./custom-providers/piper_local/piper_local.py:/opt/xiaozhi-esp32-server/core/providers/tts/piper_local.py:ro - ./models/piper:/opt/xiaozhi-esp32-server/models/piper:ro @@ -73,55 +77,3 @@ services: # doesn't contain "/mcp/vision/explain"). See docs/ota-verification.md. 
- ./custom-providers/xiaozhi-patches/ota_handler.py:/opt/xiaozhi-esp32-server/core/api/ota_handler.py:ro - ./data/bin:/opt/xiaozhi-esp32-server/data/bin - networks: - - dotty - depends_on: - bridge: - condition: service_started - - # ── zeroclaw-bridge ────────────────────────────────────────────────── - # FastAPI HTTP bridge: translates xiaozhi LLM provider requests into - # ZeroClaw ACP calls (JSON-RPC 2.0 over stdio to a long-running - # `zeroclaw acp` child process). - # - # PREREQUISITE: ZeroClaw must be installed on the Docker host. - # The binary and config directory are bind-mounted into the container. - # Install: https://github.com/zeroclaw-labs/zeroclaw - bridge: - build: - context: . - dockerfile_inline: | - FROM python:3.12-slim - WORKDIR /app - COPY bridge/requirements.txt /app/requirements.txt - RUN pip install --no-cache-dir -r requirements.txt - COPY bridge.py /app/bridge.py - EXPOSE 8080 - CMD ["python", "bridge.py"] - container_name: zeroclaw-bridge - restart: unless-stopped - environment: - # EDIT: must match the bind-mounted path inside the container - - ZEROCLAW_BIN=/root/.cargo/bin/zeroclaw - - PORT=8080 - - ZEROCLAW_TIMEOUT=90 - - ZEROCLAW_INIT_TIMEOUT=10 - - ZEROCLAW_SESSION_IDLE=300 - - ZEROCLAW_SESSION_MAX_TURNS=50 - - ZEROCLAW_SESSION_MAX_AGE_SEC=1800 - ports: - # Exposed for external health checks: curl http://localhost:8080/health - - "8080:8080" - volumes: - # EDIT: path to the zeroclaw binary on the host. - # Default assumes cargo install under /root/.cargo/bin/. - - /root/.cargo/bin/zeroclaw:/root/.cargo/bin/zeroclaw:ro - # EDIT: path to the ZeroClaw config directory on the host. - # Contains config.toml (agent persona, LLM provider, API keys). 
- - /root/.zeroclaw:/root/.zeroclaw:ro - networks: - - dotty - -networks: - dotty: - driver: bridge diff --git a/docs/README.md b/docs/README.md index 262d6ad..ddabcf1 100644 --- a/docs/README.md +++ b/docs/README.md @@ -19,9 +19,8 @@ re-verify claims against the canonical specs rather than trusting our paraphrase | Understand the overall shape | [architecture.md](./architecture.md) | | Know what the physical robot can do | [hardware.md](./hardware.md) | | Understand the voice pipeline (ASR/TTS/VAD) | [voice-pipeline.md](./voice-pipeline.md) | -| Understand the default voice LLM (Tier1Slim + escalation) | [tier1slim.md](./tier1slim.md) | -| Understand the brain (model matrix + ZeroClaw) | [brain.md](./brain.md) | -| Run different models on voice vs. Discord | [multi-daemon-split.md](./multi-daemon-split.md) | +| Understand the brain (the pi agent + model matrix) | [brain.md](./brain.md) | +| Understand the Tier1Slim voice LLM (alternate backend) | [tier1slim.md](./tier1slim.md) | | Know what's on the wire between components | [protocols.md](./protocols.md) | | See every cross-layer signal at a glance | [interaction-map.md](./interaction-map.md) | | Know what mode the robot is in (and what the LEDs mean) | [modes.md](./modes.md) | @@ -37,10 +36,9 @@ docs/ ├── architecture.md ← high-level data flow, actor responsibilities ├── hardware.md ← M5Stack StackChan body + firmware lineage + MCP tool catalog ├── voice-pipeline.md ← xiaozhi-esp32-server, FunASR/Whisper, VAD, Piper/EdgeTTS -├── tier1slim.md ← two-tier voice LLM provider + escalation contract -├── brain.md ← model matrix (Tier1Slim + ZeroClaw), bridge, OpenRouter -├── multi-daemon-split.md ← split voice + Discord across two ZeroClaw daemons -├── protocols.md ← Xiaozhi WebSocket, MCP-over-WS, ACP JSON-RPC, emotion +├── brain.md ← the pi agent runtime + model matrix + dashboard service +├── tier1slim.md ← Tier1Slim voice LLM provider (alternate) + escalation contract +├── protocols.md ← Xiaozhi WebSocket, 
MCP-over-WS, pi RPC, emotion ├── interaction-map.md ← every cross-layer signal: source, dest, protocol, notes ├── modes.md ← behavioural mode taxonomy + LED contract + transitions ├── latent-capabilities.md ← upstream features we could wire up (cross-refs ROADMAP.md) @@ -55,14 +53,14 @@ docs/ - **Grep-bait headers** — e.g. `## MCP tool handshake`, `## session/prompt` — so you can navigate by header search. - **Relative links only** — `[voice-pipeline.md](./voice-pipeline.md)`; never absolute paths. - **Freshness footer** — every non-index file ends with `Last verified: YYYY-MM-DD`. -- **Placeholders for per-deployment values** — ``, ``, etc. (mapping lives with the deployer, not in this repo). +- **Placeholders for per-deployment values** — ``, ``, etc. (mapping lives with the deployer, not in this repo). - **Soft claims where unverified** — if a fact came from a secondary source or we couldn't verify, the text says so rather than pretending to cite upstream. ## Relationship to the rest of the repo - `../README.md` — deployment & ops (commands, layout, troubleshooting). - `../CLAUDE.md` — agent orientation for this repo specifically. -- `../bridge.py`, `../zeroclaw.py`, `../edge_stream.py`, `../fun_local.py`, `../piper_local.py` — canonical source for the custom provider patches. +- `../bridge.py`, `../custom-providers/` — canonical source for the dashboard service and the custom ASR/LLM/TTS provider patches. - These `docs/` — the *why* and the *what else is possible* behind the above. ## When docs here are stale @@ -78,4 +76,4 @@ Each sub-file has a `Last verified:` date. Freshness decays roughly as follows: If you're reading this a year from now, treat the protocol + model claims as *starting points for re-verification*, not ground truth. -Last verified: 2026-05-17. +Last verified: 2026-05-22. 
diff --git a/docs/brain.md b/docs/brain.md index 78b3e00..6cd4d9e 100644 --- a/docs/brain.md +++ b/docs/brain.md @@ -52,7 +52,7 @@ Appdata layout on the Docker host: ├── sessions/ # pi session state ├── persona/ # Dotty persona files ├── memory/ -│ └── brain.db # FTS5 store (see brain-db-fts-only.md memory note) +│ └── brain.db # FTS5 full-text store └── extensions/ └── dotty-pi-ext/ # voice-tool extension ``` diff --git a/docs/proactive-greetings.md b/docs/proactive-greetings.md index 44f8eea..8e4e281 100644 --- a/docs/proactive-greetings.md +++ b/docs/proactive-greetings.md @@ -81,7 +81,7 @@ while this one targets `face_recognized` (Layer 4 output). | `GREETER_GREET_UNKNOWN` | `false` | When true, greet unrecognised faces with a generic "Hello! I don't think we've met." | | `GREETER_COOLDOWN_HOURS` | `4` | Minimum hours between greetings for the same identity. | | `GREETER_PER_DAY_MAX` | `3` | Hard cap on greetings per identity per day. The 4h cooldown already prevents back-to-back firings, so this is a safety ceiling rather than a politeness lever — turn it down if 3 greetings/day feels noisy. | -| `GREETER_STATE_PATH` | `~/.dotty/greeter_state.json` | Persistent greet log so a restart doesn't re-greet everyone. | +| `GREETER_STATE_PATH` | `/var/lib/dotty-behaviour/state/greeter_state.json` | Persistent greet log so a restart doesn't re-greet everyone. | | `GREETER_GREETING_MAX_WORDS` | `15` | Word cap fed to the LLM prompt; the model is also told "one sentence". 
| State file format (atomic write, JSON): diff --git a/mkdocs.yml b/mkdocs.yml index ed21f3f..b544803 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -59,7 +59,6 @@ nav: - Add Emoji: cookbook/add-emoji.md - Disable Kid Mode: cookbook/disable-kid-mode.md - Advanced: - - Multi-host deployment: advanced/multi-host.md - Variant port guide: advanced/variant-port-guide.md - Reference: - Emoji Mapping: emoji-mapping.md diff --git a/pyproject.toml b/pyproject.toml index 71fe533..7b3269c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,3 @@ show_missing = true # file structure (imports after a duck-typed shim class, etc.) — adding a # top-level import block would diverge from the source they're patching. "custom-providers/xiaozhi-patches/**" = ["E402"] -# zeroclaw.py imports core.utils.textUtils after the try/except dance -# that ensures the bind-mount path resolves first. -"custom-providers/zeroclaw/zeroclaw.py" = ["E402"] From 08b6bd7263078ff58145dd7e01dc12e7509c5145 Mon Sep 17 00:00:00 2001 From: Brett Kinny Date: Sat, 23 May 2026 09:03:59 +1000 Subject: [PATCH 4/4] =?UTF-8?q?test:=20drop=20orphan=20test=5Fzeroclaw=5Fp?= =?UTF-8?q?ersona=20=E2=80=94=20provider=20was=20deleted=20in=20#36?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `custom-providers/zeroclaw/` was retired in the cutover but the unit-test file outlived it. At pytest collection time the test loaded `zeroclaw.py` via importlib and raised FileNotFoundError, aborting the whole suite (CI Python Tests on #90). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_zeroclaw_persona.py | 167 --------------------------------- 1 file changed, 167 deletions(-) delete mode 100644 tests/test_zeroclaw_persona.py diff --git a/tests/test_zeroclaw_persona.py b/tests/test_zeroclaw_persona.py deleted file mode 100644 index 546859d..0000000 --- a/tests/test_zeroclaw_persona.py +++ /dev/null @@ -1,167 +0,0 @@ -"""Unit tests for zeroclaw _load_persona_prompt -- persona hot-swap. - -Pure unit -- uses importlib to load zeroclaw.py directly from its source -path, mocking the xiaozhi-server packages (config.logger, -core.providers.llm.base) that are only available inside the container. -""" -from __future__ import annotations - -import importlib.util -import os -import sys -import tempfile -import unittest -from pathlib import Path -from unittest.mock import MagicMock - - -def _import_zeroclaw(): - """Import zeroclaw.py with xiaozhi-server internal deps mocked out. - - Pre-loads custom-providers/textUtils.py under the canonical - `core.utils.textUtils` name (the bind-mount path inside the - xiaozhi container). zeroclaw.py imports - `from core.utils.textUtils import FALLBACK_EMOJI, _SENTENCE_BOUNDARY` - so we need real values, not a MagicMock — otherwise the regex - operations downstream would fail. - - Restores sys.modules to its pre-call state after exec. Without this, - a MagicMock leaks into sys.modules['core.providers.llm.base'] and - pi_voice.py's `try: from core.providers.llm.base import ...` fallback - binds LLMProviderBase to a Mock attribute, producing a Mock-class - when `class LLMProvider(LLMProviderBase)` runs at import time. 
- """ - polluted_keys = ( - "config", - "config.logger", - "core", - "core.providers", - "core.providers.llm", - "core.providers.llm.base", - "core.utils", - "core.utils.textUtils", - ) - _MISSING = object() - saved = {k: sys.modules.get(k, _MISSING) for k in polluted_keys} - - try: - mock_logger_mod = MagicMock() - mock_logger_mod.setup_logging.return_value = MagicMock() - for pkg in polluted_keys[:-1]: # all except core.utils.textUtils - sys.modules.setdefault(pkg, MagicMock()) - sys.modules["config.logger"] = mock_logger_mod - - repo_root = Path(__file__).resolve().parents[1] - - # Pre-load real textUtils under the canonical bind-mount name so - # zeroclaw's `from core.utils.textUtils import ...` resolves to the - # actual module (not a Mock). - text_utils_path = repo_root / "custom-providers" / "textUtils.py" - text_utils_spec = importlib.util.spec_from_file_location( - "core.utils.textUtils", text_utils_path, - ) - text_utils_mod = importlib.util.module_from_spec(text_utils_spec) # type: ignore[arg-type] - text_utils_spec.loader.exec_module(text_utils_mod) # type: ignore[union-attr] - sys.modules["core.utils.textUtils"] = text_utils_mod - - path = repo_root / "custom-providers" / "zeroclaw" / "zeroclaw.py" - spec = importlib.util.spec_from_file_location("zeroclaw_provider", path) - mod = importlib.util.module_from_spec(spec) # type: ignore[arg-type] - spec.loader.exec_module(mod) # type: ignore[union-attr] - return mod - finally: - for k, v in saved.items(): - if v is _MISSING: - sys.modules.pop(k, None) - else: - sys.modules[k] = v # type: ignore[assignment] - - -_mod = _import_zeroclaw() -_load_persona_prompt = _mod._load_persona_prompt - - -class LoadPersonaPromptTests(unittest.TestCase): - """_load_persona_prompt reads PERSONA env var and loads the persona file.""" - - def setUp(self) -> None: - os.environ.pop("PERSONA", None) - os.environ.pop("PERSONA_DIR", None) - - def tearDown(self) -> None: - os.environ.pop("PERSONA", None) - 
os.environ.pop("PERSONA_DIR", None) - - def test_returns_empty_when_persona_not_set(self): - self.assertEqual(_load_persona_prompt(), "") - - def test_returns_empty_when_persona_empty_string(self): - os.environ["PERSONA"] = "" - self.assertEqual(_load_persona_prompt(), "") - - def test_returns_empty_when_persona_whitespace_only(self): - os.environ["PERSONA"] = " " - self.assertEqual(_load_persona_prompt(), "") - - def test_returns_empty_when_file_not_found(self): - with tempfile.TemporaryDirectory() as tmpdir: - os.environ["PERSONA"] = "nonexistent_persona_xyz" - os.environ["PERSONA_DIR"] = tmpdir - result = _load_persona_prompt() - self.assertEqual(result, "") - - def test_loads_persona_file_content(self): - with tempfile.TemporaryDirectory() as tmpdir: - (Path(tmpdir) / "test_persona.md").write_text( - "You are a test robot.", encoding="utf-8" - ) - os.environ["PERSONA"] = "test_persona" - os.environ["PERSONA_DIR"] = tmpdir - result = _load_persona_prompt() - self.assertEqual(result, "You are a test robot.") - - def test_strips_leading_trailing_whitespace(self): - with tempfile.TemporaryDirectory() as tmpdir: - (Path(tmpdir) / "padded.md").write_text( - "\n\nYou are a robot.\n\n", encoding="utf-8" - ) - os.environ["PERSONA"] = "padded" - os.environ["PERSONA_DIR"] = tmpdir - result = _load_persona_prompt() - self.assertEqual(result, "You are a robot.") - - def test_persona_dir_override_via_env(self): - with tempfile.TemporaryDirectory() as tmpdir: - (Path(tmpdir) / "custom.md").write_text( - "Custom persona.", encoding="utf-8" - ) - os.environ["PERSONA"] = "custom" - os.environ["PERSONA_DIR"] = tmpdir - result = _load_persona_prompt() - self.assertEqual(result, "Custom persona.") - - def test_multiline_persona_preserved(self): - content = "# Persona\n\nYou are Dotty.\nKeep replies short." 
- with tempfile.TemporaryDirectory() as tmpdir: - (Path(tmpdir) / "dotty.md").write_text(content, encoding="utf-8") - os.environ["PERSONA"] = "dotty" - os.environ["PERSONA_DIR"] = tmpdir - result = _load_persona_prompt() - self.assertEqual(result, content) - - def test_returns_empty_on_unreadable_dir(self): - os.environ["PERSONA"] = "any" - os.environ["PERSONA_DIR"] = "/nonexistent_dir_abc123/personas" - self.assertEqual(_load_persona_prompt(), "") - - def test_returns_empty_when_no_persona_dir_and_no_base(self): - # In test environments _PERSONAS_BASE is None (shallow __file__ path). - # Verify graceful empty return rather than AttributeError. - os.environ["PERSONA"] = "default" - # PERSONA_DIR intentionally not set; _PERSONAS_BASE is None in test env. - result = _load_persona_prompt() - self.assertEqual(result, "") - - -if __name__ == "__main__": - unittest.main()