From 163ebf5abdef23d5c89b6bc9304b35fc729cdce3 Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Thu, 4 Jun 2026 02:17:05 -0500 Subject: [PATCH 01/18] Add notifications-only Telegram alerter (#121) Ship a thin, notifications-only Telegram pusher for v1.0: node down/recovered, worker offline/back online, and sync finished. Off by default; no interactive bot (that stays in #45). Consumes signals the data loop already computes rather than re-collecting: - node down/recovered from NodeHealthMonitor's debounced `down` edge (#31) - sync finished from the sync-gate `miner_released` latch (#35) - worker offline/back via a new flap-protected per-worker presence tracker New dashboard modules: - telegram_notifier.py: thin sendMessage transport; enabled only with token + chat_id; fail-silent on offline/Tor-only hosts; never logs the bot token. - worker_presence.py: WorkerPresenceMonitor, the per-worker analogue of NodeHealthMonitor (debounce + recovery hysteresis + silent baseline + reset when the proxy is intentionally stopped). - alert_service.py: folds per-cycle signals into debounced alerts; pure evaluate() + off-thread process(); wired into data_service.run(). Plumbing: config.json telegram.* -> pithead render_env -> per-event env vars -> config.py. bot_token rendered to the owner-only .env and masked in the apply preview. Injected into the dashboard container in docker-compose; added to the advanced example config. Docs: new docs/telegram.md setup guide (BotFather, chat id, per-event toggles, "one chat, two bots" with #79, Tor-only caveat, troubleshooting); cross-refs in the docs index, configuration reference, and CHANGELOG. Tests: 49 new pytest cases (notifier/monitor/alert service), plus stack tests for env propagation and bot-token secrecy. Full suite green; coverage 93%. 
Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 12 + .../mining_dashboard/config/config.py | 42 ++++ .../mining_dashboard/service/alert_service.py | 138 +++++++++++ .../mining_dashboard/service/data_service.py | 22 ++ .../service/telegram_notifier.py | 79 +++++++ .../service/worker_presence.py | 105 +++++++++ .../tests/service/test_alert_service.py | 162 +++++++++++++ .../tests/service/test_telegram_notifier.py | 81 +++++++ .../tests/service/test_worker_presence.py | 126 ++++++++++ config.advanced.example.json | 13 ++ docker-compose.yml | 13 ++ docs/README.md | 1 + docs/configuration.md | 4 + docs/telegram.md | 217 ++++++++++++++++++ pithead | 35 +++ tests/stack/run.sh | 31 +++ 16 files changed, 1081 insertions(+) create mode 100644 build/dashboard/mining_dashboard/service/alert_service.py create mode 100644 build/dashboard/mining_dashboard/service/telegram_notifier.py create mode 100644 build/dashboard/mining_dashboard/service/worker_presence.py create mode 100644 build/dashboard/tests/service/test_alert_service.py create mode 100644 build/dashboard/tests/service/test_telegram_notifier.py create mode 100644 build/dashboard/tests/service/test_worker_presence.py create mode 100644 docs/telegram.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 372a494..c3aa0e8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,18 @@ per the process in [`docs/releasing.md`](docs/releasing.md). ### Added +- **Telegram alerts (notifications-only)** (#121): the stack can push a small, high-value set + of operational alerts to Telegram — **node down / recovered**, **worker offline / back + online**, and **sync finished**. Off by default; enable it with a `telegram` block in + `config.json` (`enabled`, `bot_token`, `chat_id`, and per-event `events` toggles). 
Every + alert is **debounced** so a momentary blip won't ping you and you get one message per real + transition: node edges reuse the existing failover detector, and worker offline/online uses a + new flap-protected per-worker presence tracker. The `bot_token` is treated as a secret + (owner-only `.env`, never logged), and sends **fail silently** on a Tor-only / offline host. + Messages are prefixed with the dashboard hostname so multiple stacks can share one chat. Full + walkthrough — creating a bot, finding your chat id, and the "one chat, two bots" pattern for + sharing a chat with the Healthchecks.io monitor (#79) — in [`docs/telegram.md`](docs/telegram.md). + The interactive bot / command interface remains a separate, later feature (#45). - **P2Pool Earnings (estimated) card** on the dashboard's Advanced view: expected XMR per day / month / year from **P2Pool mining only**, computed from your P2Pool hashrate and the live Monero block reward + network difficulty, plus an expected time-to-share. diff --git a/build/dashboard/mining_dashboard/config/config.py b/build/dashboard/mining_dashboard/config/config.py index d4f8093..1938719 100644 --- a/build/dashboard/mining_dashboard/config/config.py +++ b/build/dashboard/mining_dashboard/config/config.py @@ -105,6 +105,48 @@ NODE_DOWN_AFTER_SEC = int(os.environ.get("NODE_DOWN_AFTER_SEC", 90)) NODE_RECOVERY_AFTER_SEC = int(os.environ.get("NODE_RECOVERY_AFTER_SEC", 60)) +# --- Operator alerts: Telegram (Issue #121) --- +# Notifications-only Telegram pusher: a thin notifier that pushes a small, high-value set of +# operational edges (node down/recovered, worker offline/back, sync finished) to one chat. +# Disabled by default — with TELEGRAM_ENABLED unset/false the stack runs with no Telegram +# config and never sends or errors. The interactive bot / command interface is a separate +# feature (#45); this is the notifications-only split. 
+# +# `bot_token` is a secret: the pithead CLI renders it into the owner-only .env (like the node +# RPC password), and the notifier never writes it to a log line. On a Tor-only / no-clearnet +# host the Telegram API is unreachable and sends fail silently (consistent with #59). +TELEGRAM_ENABLED = os.environ.get("TELEGRAM_ENABLED", "false").strip().lower() == "true" +TELEGRAM_BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "").strip() +TELEGRAM_CHAT_ID = os.environ.get("TELEGRAM_CHAT_ID", "").strip() + + +def _telegram_event_enabled(name, default=True): + """Read one per-event toggle from TELEGRAM_EVENT_<NAME> (rendered from config.json's + telegram.events by pithead). Any toggle left unset defaults to on, so enabling Telegram + turns on the full set and an operator only has to opt *out* of the noisy ones.""" + raw = os.environ.get(f"TELEGRAM_EVENT_{name.upper()}") + if raw is None or raw.strip() == "": + return default + return raw.strip().lower() == "true" + + +# Per-event delivery toggles. Keys here are the canonical event names used throughout the +# alerter (AlertService.EVT_*) and must match the config.json telegram.events block. +TELEGRAM_EVENTS = { + "node_down": _telegram_event_enabled("node_down"), + "node_recovered": _telegram_event_enabled("node_recovered"), + "worker_offline": _telegram_event_enabled("worker_offline"), + "worker_recovered": _telegram_event_enabled("worker_recovered"), + "sync_finished": _telegram_event_enabled("sync_finished"), +} + +# Worker offline/online debounce (Issue #121). A worker must be unseen this long before it's +# reported OFFLINE, and seen continuously this long before "back online" — so a brief miner +# reconnect doesn't spam the chat. Workers flap more than nodes (rig reboots, Wi-Fi blips), +# so the window is wider than the node debounce above. 
+WORKER_OFFLINE_AFTER_SEC = int(os.environ.get("WORKER_OFFLINE_AFTER_SEC", 300)) +WORKER_RECOVERY_AFTER_SEC = int(os.environ.get("WORKER_RECOVERY_AFTER_SEC", 120)) + # --- Monero Configuration --- # Used to determine if the node is local (Docker) or remote MONERO_NODE_HOST = os.environ.get("MONERO_NODE_HOST", "172.28.0.26") diff --git a/build/dashboard/mining_dashboard/service/alert_service.py b/build/dashboard/mining_dashboard/service/alert_service.py new file mode 100644 index 0000000..1e26902 --- /dev/null +++ b/build/dashboard/mining_dashboard/service/alert_service.py @@ -0,0 +1,138 @@ +import asyncio +import logging + +from mining_dashboard.config.config import ( + HOST_IP, + TELEGRAM_BOT_TOKEN, + TELEGRAM_CHAT_ID, + TELEGRAM_ENABLED, + TELEGRAM_EVENTS, +) +from mining_dashboard.service.telegram_notifier import TelegramNotifier +from mining_dashboard.service.worker_presence import WorkerPresenceMonitor + +logger = logging.getLogger("AlertService") + + +def build_default_notifier(): + """Construct the Telegram notifier from the process config (Issue #121).""" + return TelegramNotifier( + enabled=TELEGRAM_ENABLED, + bot_token=TELEGRAM_BOT_TOKEN, + chat_id=TELEGRAM_CHAT_ID, + events=TELEGRAM_EVENTS, + ) + + +class AlertService: + """ + Turns the data loop's per-cycle signals into a small set of debounced operator alerts and + pushes them over Telegram (Issue #121). Notifications-only — no interactive bot (#45). + + It *consumes* signals the loop already computes rather than re-collecting anything: + + - **node down / recovered** — transitions of ``NodeHealthMonitor``'s debounced ``down`` + flag per node (#31). Tari is only alerted when it's treated as required; a non-blocking + Tari going down isn't operator-critical (we keep mining Monero), matching the + worker-rejection rule. + - **sync finished** — the sync gate's ``miner_released`` latch flipping open once (#35). 
+ - **worker offline / back online** — a debounced :class:`WorkerPresenceMonitor` over the + live worker list (the one genuinely new building block this issue adds). + + Edge state is seeded silently on the first observation (``None`` baselines), so a dashboard + restart can't replay a stale transition as a fresh alert. + + :meth:`evaluate` is pure (folds signals into the alert list, no I/O) so it's fully + unit-testable; :meth:`process` calls it and dispatches each message off-thread so a slow or + blocked Telegram send never stalls the data loop. + """ + + # Event keys — must match config.json's telegram.events toggles and TELEGRAM_EVENTS. + EVT_NODE_DOWN = "node_down" + EVT_NODE_RECOVERED = "node_recovered" + EVT_WORKER_OFFLINE = "worker_offline" + EVT_WORKER_RECOVERED = "worker_recovered" + EVT_SYNC_FINISHED = "sync_finished" + + def __init__(self, notifier=None, worker_monitor=None, host_label=HOST_IP): + self.notifier = notifier if notifier is not None else build_default_notifier() + self.workers = worker_monitor if worker_monitor is not None else WorkerPresenceMonitor() + # "Unknown Host" is config.py's placeholder when HOST_IP isn't set — don't prefix with it. + self.host_label = "" if host_label in (None, "", "Unknown Host") else host_label + # None = "not yet observed": the first cycle seeds the baseline without emitting. 
+ self._prev_monero_down = None + self._prev_tari_down = None + self._prev_released = None + + @property + def enabled(self): + return self.notifier.enabled + + def evaluate(self, *, monero_down, tari_down, tari_required, miner_released, + online_workers, workers_expected, now=None): + """Pure: fold this cycle's signals into the list of ``(event_key, text)`` to send, + filtered to the events the operator left enabled.""" + alerts = [] + + # --- Node down / recovered (consume NodeHealthMonitor edges) --- + alerts += self._node_edges("Monero", monero_down, "_prev_monero_down") + if tari_required: + alerts += self._node_edges("Tari", tari_down, "_prev_tari_down") + else: + # Keep the baseline current while Tari is non-blocking, so flipping it back to + # required later doesn't fire a stale edge from a state we never alerted on. + self._prev_tari_down = tari_down + + # --- Sync finished (one-shot when the gate first opens) --- + if self._prev_released is None: + self._prev_released = miner_released + elif miner_released and not self._prev_released: + alerts.append((self.EVT_SYNC_FINISHED, self._fmt( + "✅ Node ready — required chain(s) synced; mining has started."))) + self._prev_released = miner_released + + # --- Worker offline / back online (debounced) --- + # Only meaningful while workers are actually expected: when the proxy is intentionally + # stopped (initial sync hold, or node-down failover) their absence is by design, so we + # reset the tracker instead of aging every rig into a false "offline". 
+ if workers_expected: + for name, event in self.workers.update(online_workers, now=now): + if event == "offline": + alerts.append((self.EVT_WORKER_OFFLINE, + self._fmt(f"\U0001f534 Worker offline: {name}"))) + else: + alerts.append((self.EVT_WORKER_RECOVERED, + self._fmt(f"\U0001f7e2 Worker back online: {name}"))) + else: + self.workers.reset() + + return [(evt, text) for evt, text in alerts if self.notifier.event_enabled(evt)] + + def _node_edges(self, label, down, attr): + prev = getattr(self, attr) + setattr(self, attr, down) + if prev is None or down == prev: + return [] + if down: + return [(self.EVT_NODE_DOWN, self._fmt( + f"\U0001f534 {label} node is DOWN — workers failing over to backup pools."))] + return [(self.EVT_NODE_RECOVERED, self._fmt( + f"\U0001f7e2 {label} node recovered — workers readmitted."))] + + def _fmt(self, text): + return f"[{self.host_label}] {text}" if self.host_label else text + + async def process(self, **signals): + """Evaluate this cycle's signals and dispatch any alerts. No-op (and cheap) when the + notifier is disabled. Each send runs off-thread so a slow Telegram call can't stall + the data loop. 
Returns the alerts that were dispatched (handy for tests/logging).""" + if not self.notifier.enabled: + return [] + try: + alerts = self.evaluate(**signals) + except Exception as exc: # never let an alerting bug break the data loop + logger.debug("Alert evaluation failed (%s)", type(exc).__name__) + return [] + for _evt, text in alerts: + await asyncio.to_thread(self.notifier.send, text) + return alerts diff --git a/build/dashboard/mining_dashboard/service/data_service.py b/build/dashboard/mining_dashboard/service/data_service.py index d69169b..2605f05 100644 --- a/build/dashboard/mining_dashboard/service/data_service.py +++ b/build/dashboard/mining_dashboard/service/data_service.py @@ -16,6 +16,7 @@ from mining_dashboard.collector.logs import get_monero_sync_status from mining_dashboard.collector.system import get_disk_usage, get_hugepages_status, get_memory_usage, get_load_average, get_cpu_usage from mining_dashboard.service.node_health import NodeHealthMonitor +from mining_dashboard.service.alert_service import AlertService logger = logging.getLogger("DataService") @@ -206,6 +207,12 @@ def __init__(self, state_manager, proxy_client, xvb_client): self.docker_control = DockerControl() self.monero_health = NodeHealthMonitor() self.tari_health = NodeHealthMonitor() + + # Notifications-only Telegram alerter (Issue #121). Consumes the loop's existing edges + # (node down/recovered, sync gate open) plus a debounced per-worker presence tracker. + # Disabled unless telegram.enabled + bot_token + chat_id are configured, so this is a + # cheap no-op for the default stack. + self.alert_service = AlertService() # True while we've stopped the proxy to reject workers. Persisted in the snapshot so # a dashboard restart mid-outage still readmits workers once the node recovers. self.workers_rejected = False @@ -437,6 +444,21 @@ async def run(self): if self.miner_released: await self._apply_worker_rejection(monero_down, tari_down) + # 5. 
Operator alerts (Issue #121): push debounced node/worker/sync edges to + # Telegram. Consumes the flags computed above; worker presence is only + # tracked while the proxy is actually serving (miner released and not + # rejected) — its intentional absence otherwise must not read as offline. + # No-op unless Telegram is configured; never raises. + await self.alert_service.process( + monero_down=monero_down, + tari_down=tari_down, + tari_required=TARI_REQUIRED, + miner_released=self.miner_released, + online_workers=[w["name"] for w in final_workers + if w.get("status") == "online"], + workers_expected=self.miner_released and not self.workers_rejected, + ) + # Fetch fresh shares list to populate UI shares_list = await asyncio.to_thread(self.state_manager.get_shares) diff --git a/build/dashboard/mining_dashboard/service/telegram_notifier.py b/build/dashboard/mining_dashboard/service/telegram_notifier.py new file mode 100644 index 0000000..ddef13c --- /dev/null +++ b/build/dashboard/mining_dashboard/service/telegram_notifier.py @@ -0,0 +1,79 @@ +import logging + +import requests + +logger = logging.getLogger("TelegramNotifier") + +# Telegram Bot API base. Overridable in tests so we never touch the network. +TELEGRAM_API_BASE = "https://api.telegram.org" + + +class TelegramNotifier: + """ + Thin, fire-and-forget Telegram push notifier (Issue #121). + + Pushes short operational alerts to a single chat via the Bot API ``sendMessage`` + endpoint. Deliberately minimal — there is no interactive bot, no commands, no polling + (that's #45); this is the notifications-only half. + + Discipline (mirrors the rest of the stack): + + - **Disabled by default.** ``enabled`` is only true when explicitly switched on *and* + both ``bot_token`` and ``chat_id`` are present. A missing/half-filled config leaves it + off, so :meth:`send` is a silent no-op rather than an error on every cycle. 
+ - **Per-event toggles.** ``events`` gates which alert kinds are delivered, so an operator + can enable Telegram and still silence the ones they find noisy. + - **Fail silent.** Any network error (offline host, Tor-only egress with no clearnet, + Telegram unreachable) is swallowed and logged at debug — an alerter must never crash + the data loop or spam ERROR for the very condition it exists to report (#59 discipline). + - **Never logs the token.** ``bot_token`` is a secret; it only ever appears in the request + URL and is never written to a log line — not even inside an exception message (which for + ``requests`` would otherwise include the full URL). + """ + + def __init__(self, enabled=False, bot_token="", chat_id="", events=None, + timeout=10, api_base=TELEGRAM_API_BASE): + self.bot_token = (bot_token or "").strip() + # chat_id may be a negative integer (Telegram group ids look like -1001234567890); + # keep it as a string so render/transport never reformat it. + self.chat_id = str(chat_id or "").strip() + self.events = dict(events or {}) + self.timeout = timeout + self._api_base = api_base.rstrip("/") + self.enabled = bool(enabled and self.bot_token and self.chat_id) + + if enabled and not self.enabled: + # Switched on but unusable — tell the operator once, without leaking the token. + logger.warning( + "Telegram alerts enabled but bot_token/chat_id are missing — alerts stay off." + ) + + def event_enabled(self, event): + """True only when the notifier is usable *and* this event kind is toggled on.""" + return self.enabled and bool(self.events.get(event, False)) + + def send(self, text): + """Push one message. Returns True on a successful 2xx send, False otherwise + (including when disabled). 
Never raises.""" + if not self.enabled: + return False + + url = f"{self._api_base}/bot{self.bot_token}/sendMessage" + try: + resp = requests.post( + url, + json={ + "chat_id": self.chat_id, + "text": text, + "disable_web_page_preview": True, + }, + timeout=self.timeout, + ) + resp.raise_for_status() + return True + except requests.RequestException as exc: + # Log only the exception *type*: a requests error message can embed the full URL, + # which contains the bot token. Telegram being unreachable on a private/Tor-only + # host is expected, so this stays at debug to avoid log noise. + logger.debug("Telegram send failed (%s)", type(exc).__name__) + return False diff --git a/build/dashboard/mining_dashboard/service/worker_presence.py b/build/dashboard/mining_dashboard/service/worker_presence.py new file mode 100644 index 0000000..87e76e5 --- /dev/null +++ b/build/dashboard/mining_dashboard/service/worker_presence.py @@ -0,0 +1,105 @@ +import time + +from mining_dashboard.config.config import ( + WORKER_OFFLINE_AFTER_SEC, + WORKER_RECOVERY_AFTER_SEC, + WORKER_RETENTION_SEC, +) + + +class WorkerPresenceMonitor: + """ + Per-worker, flap-protected offline/online tracker (Issue #121). + + The alerter needs a stable "rig-3 went offline / rig-3 is back" signal, but the raw + per-cycle worker list is noisy: xmrig-proxy keeps a disconnected worker around with a + decaying hashrate, miners briefly reconnect, and a worker can drop out of the list + entirely. This is the per-worker analogue of ``NodeHealthMonitor`` (#31), multiplexed + over many workers keyed by name: + + - **Debounce / hysteresis.** A worker must be *unseen* for ``offline_after`` seconds + before it's declared OFFLINE, and *seen continuously* for ``recovery_after`` seconds + before OFFLINE clears — so a single dropped poll, or a brief reconnect during an + outage, can't emit a recovered→offline spam. 
+ - **Silent baseline.** A worker's first sighting registers it as ONLINE with no edge — a + brand-new rig is not a "recovery", and a dashboard restart must not replay every known + worker as a fresh alert. + - **Bounded memory.** A worker gone longer than ``retention`` is forgotten, so state stays + bounded and a long-absent rig that returns counts as new (mirrors ``StateManager``'s + 7-day worker retention, the persisted analogue of this in-memory tracker). + + :meth:`update` takes the set of worker names seen *online* this cycle and returns a list + of ``(name, event)`` edges, ``event`` in ``{"offline", "recovered"}``. + + Clock defaults to wall-clock ``time.time`` (so it lines up with ``last_seen`` semantics); + injectable for deterministic tests. + """ + + def __init__(self, offline_after=WORKER_OFFLINE_AFTER_SEC, + recovery_after=WORKER_RECOVERY_AFTER_SEC, + retention=WORKER_RETENTION_SEC, clock=time.time): + self.offline_after = offline_after + self.recovery_after = recovery_after + self.retention = retention + self._clock = clock + # name -> {state, seen_since, unseen_since, last_present} + # state : "online" | "offline" (the debounced, edge-emitting state) + # seen_since : when the current continuous-presence streak began (None while absent) + # unseen_since : when the current continuous-absence streak began (None while present) + # last_present : last cycle the worker was seen (drives retention pruning) + self._workers = {} + + def update(self, present, now=None): + """Feed this cycle's online worker names; return the list of debounced transitions.""" + now = self._clock() if now is None else now + present = set(present) + edges = [] + + # Present workers: refresh the presence streak, clear OFFLINE once it's stable. + for name in present: + w = self._workers.get(name) + if w is None: + # First sighting — baseline as ONLINE, no edge. 
+ self._workers[name] = { + "state": "online", "seen_since": now, + "unseen_since": None, "last_present": now, + } + continue + w["last_present"] = now + w["unseen_since"] = None + if w["seen_since"] is None: + w["seen_since"] = now + if w["state"] == "offline" and (now - w["seen_since"]) >= self.recovery_after: + w["state"] = "online" + edges.append((name, "recovered")) + + # Absent workers: age the absence streak, declare OFFLINE once past the threshold. + for name, w in self._workers.items(): + if name in present: + continue + w["seen_since"] = None + if w["unseen_since"] is None: + w["unseen_since"] = now + if w["state"] == "online" and (now - w["unseen_since"]) >= self.offline_after: + w["state"] = "offline" + edges.append((name, "offline")) + + self._prune(now) + return edges + + def reset(self): + """Drop all per-worker state. + + Called when the proxy is *intentionally* stopped — during the initial sync hold (#35) + or node-down worker failover (#31) — so the expected absence of workers doesn't age + into false "offline" alerts, and re-admission re-baselines every worker silently. 
+ """ + self._workers.clear() + + def _prune(self, now): + stale = [ + name for name, w in self._workers.items() + if w["unseen_since"] is not None and (now - w["last_present"]) >= self.retention + ] + for name in stale: + del self._workers[name] diff --git a/build/dashboard/tests/service/test_alert_service.py b/build/dashboard/tests/service/test_alert_service.py new file mode 100644 index 0000000..ab7983e --- /dev/null +++ b/build/dashboard/tests/service/test_alert_service.py @@ -0,0 +1,162 @@ +import pytest + +from mining_dashboard.service.alert_service import AlertService +from mining_dashboard.service.worker_presence import WorkerPresenceMonitor + + +class _FakeNotifier: + """Stand-in transport: records sends, lets tests gate which events are 'enabled'.""" + def __init__(self, enabled=True, allow=None): + self.enabled = enabled + self._allow = allow # None => every event allowed + self.sent = [] + + def event_enabled(self, event): + if not self.enabled: + return False + return True if self._allow is None else event in self._allow + + def send(self, text): + self.sent.append(text) + return True + + +def _svc(notifier=None, **kw): + notifier = notifier if notifier is not None else _FakeNotifier() + kw.setdefault("worker_monitor", WorkerPresenceMonitor(offline_after=300, recovery_after=120)) + kw.setdefault("host_label", "") + return AlertService(notifier=notifier, **kw) + + +def _ev(svc, *, monero_down=False, tari_down=False, tari_required=True, + miner_released=True, online_workers=(), workers_expected=False, now=0): + return svc.evaluate( + monero_down=monero_down, tari_down=tari_down, tari_required=tari_required, + miner_released=miner_released, online_workers=list(online_workers), + workers_expected=workers_expected, now=now) + + +def _keys(alerts): + return [k for k, _ in alerts] + + +class TestNodeEdges: + def test_first_cycle_seeds_baseline_silently(self): + svc = _svc() + # Already-down at startup must not replay as a fresh alert (restart semantics). 
+ assert _ev(svc, monero_down=True) == [] + + def test_down_then_recovered(self): + svc = _svc() + _ev(svc, monero_down=False) # seed + assert _keys(_ev(svc, monero_down=True)) == [AlertService.EVT_NODE_DOWN] + assert _ev(svc, monero_down=True) == [] # no repeat while still down + assert _keys(_ev(svc, monero_down=False)) == [AlertService.EVT_NODE_RECOVERED] + + def test_node_text_names_the_chain(self): + svc = _svc() + _ev(svc, monero_down=False) + _, text = _ev(svc, monero_down=True)[0] + assert "Monero" in text + + +class TestTariGating: + def test_non_blocking_tari_does_not_alert(self): + svc = _svc() + _ev(svc, tari_down=False, tari_required=False) + assert _ev(svc, tari_down=True, tari_required=False) == [] + + def test_no_stale_edge_when_tari_becomes_required(self): + # Tari went down while non-blocking (no alert). Re-marking it required must not then + # replay a down edge for a state we never alerted on. + svc = _svc() + _ev(svc, tari_down=False, tari_required=False) + _ev(svc, tari_down=True, tari_required=False) # silently tracked + assert _ev(svc, tari_down=True, tari_required=True) == [] + # ...but a genuine recovery from there still fires. + assert _keys(_ev(svc, tari_down=False, tari_required=True)) == [AlertService.EVT_NODE_RECOVERED] + + def test_required_tari_alerts(self): + svc = _svc() + _ev(svc, tari_down=False, tari_required=True) + _, text = _ev(svc, tari_down=True, tari_required=True)[0] + assert "Tari" in text + + +class TestSyncFinished: + def test_fires_once_when_gate_opens(self): + svc = _svc() + _ev(svc, miner_released=False) # seed: still syncing + assert _keys(_ev(svc, miner_released=True)) == [AlertService.EVT_SYNC_FINISHED] + assert _ev(svc, miner_released=True) == [] # one-shot + + def test_no_alert_on_restart_after_sync(self): + svc = _svc() + # First observation is already-released (restart after sync) -> baseline, no alert. 
+ assert _ev(svc, miner_released=True) == [] + + +class TestWorkerEdges: + def test_offline_then_recovered(self): + svc = _svc() + assert _ev(svc, online_workers=["rig-1"], workers_expected=True, now=0) == [] + assert _ev(svc, online_workers=[], workers_expected=True, now=0) == [] + assert _keys(_ev(svc, online_workers=[], workers_expected=True, now=300)) == \ + [AlertService.EVT_WORKER_OFFLINE] + _ev(svc, online_workers=["rig-1"], workers_expected=True, now=300) + assert _keys(_ev(svc, online_workers=["rig-1"], workers_expected=True, now=420)) == \ + [AlertService.EVT_WORKER_RECOVERED] + + def test_not_expected_resets_and_silences(self): + svc = _svc() + _ev(svc, online_workers=["rig-1"], workers_expected=True, now=0) + _ev(svc, online_workers=[], workers_expected=True, now=0) + _ev(svc, online_workers=[], workers_expected=True, now=300) # rig-1 now offline + # Proxy intentionally stopped (sync hold / failover): reset, no alert. + assert _ev(svc, online_workers=[], workers_expected=False, now=330) == [] + # Re-admission re-baselines silently — no spurious "recovered". + assert _ev(svc, online_workers=["rig-1"], workers_expected=True, now=360) == [] + + +class TestEventFiltering: + def test_disabled_events_are_dropped(self): + svc = _svc(notifier=_FakeNotifier(allow={AlertService.EVT_NODE_DOWN})) + _ev(svc, online_workers=["rig-1"], workers_expected=True, now=0) + _ev(svc, online_workers=[], workers_expected=True, now=0) + # worker_offline is computed but filtered out because it's not in the allow-set. 
+ assert _ev(svc, online_workers=[], workers_expected=True, now=300) == [] + + +class TestHostLabel: + def test_prefixes_when_set(self): + svc = _svc(host_label="box.lan") + _ev(svc, monero_down=False) + _, text = _ev(svc, monero_down=True)[0] + assert text.startswith("[box.lan] ") + + def test_placeholder_host_is_not_prefixed(self): + svc = _svc(host_label="Unknown Host") + _ev(svc, monero_down=False) + _, text = _ev(svc, monero_down=True)[0] + assert not text.startswith("[") + + +class TestProcess: + async def test_disabled_notifier_is_noop(self): + notifier = _FakeNotifier(enabled=False) + svc = _svc(notifier=notifier) + out = await svc.process(monero_down=True, tari_down=False, tari_required=True, + miner_released=True, online_workers=[], workers_expected=False) + assert out == [] + assert notifier.sent == [] + + async def test_enabled_notifier_dispatches(self): + notifier = _FakeNotifier() + svc = _svc(notifier=notifier) + # seed + await svc.process(monero_down=False, tari_down=False, tari_required=True, + miner_released=True, online_workers=[], workers_expected=False) + out = await svc.process(monero_down=True, tari_down=False, tari_required=True, + miner_released=True, online_workers=[], workers_expected=False) + assert _keys(out) == [AlertService.EVT_NODE_DOWN] + assert len(notifier.sent) == 1 and "DOWN" in notifier.sent[0] diff --git a/build/dashboard/tests/service/test_telegram_notifier.py b/build/dashboard/tests/service/test_telegram_notifier.py new file mode 100644 index 0000000..f3dd17b --- /dev/null +++ b/build/dashboard/tests/service/test_telegram_notifier.py @@ -0,0 +1,81 @@ +from unittest.mock import MagicMock, patch + +import requests + +import mining_dashboard.service.telegram_notifier as tg_mod +from mining_dashboard.service.telegram_notifier import TelegramNotifier + +EVENTS = {"node_down": True, "node_recovered": False} + + +def _enabled(**kw): + opts = dict(enabled=True, bot_token="TOKEN", chat_id="123", events=EVENTS) + opts.update(kw) + 
return TelegramNotifier(**opts) + + +class TestEnabledGating: + def test_disabled_by_default(self): + assert TelegramNotifier().enabled is False + + def test_enabled_requires_token_and_chat(self): + assert TelegramNotifier(enabled=True, bot_token="", chat_id="123").enabled is False + assert TelegramNotifier(enabled=True, bot_token="t", chat_id="").enabled is False + assert TelegramNotifier(enabled=True, bot_token="t", chat_id="123").enabled is True + + def test_enabled_flag_off_disables_even_with_creds(self): + assert TelegramNotifier(enabled=False, bot_token="t", chat_id="123").enabled is False + + def test_event_enabled_respects_toggle_and_enabled(self): + n = _enabled() + assert n.event_enabled("node_down") is True + assert n.event_enabled("node_recovered") is False # toggled off + assert n.event_enabled("worker_offline") is False # absent -> off + # A disabled notifier reports every event as off. + assert TelegramNotifier(events={"node_down": True}).event_enabled("node_down") is False + + +class TestSend: + def test_send_noop_when_disabled(self): + with patch.object(tg_mod.requests, "post") as post: + assert TelegramNotifier().send("hi") is False + post.assert_not_called() + + def test_send_posts_to_bot_api(self): + n = _enabled(api_base="https://tg.test") + resp = MagicMock() + resp.raise_for_status = MagicMock() + with patch.object(tg_mod.requests, "post", return_value=resp) as post: + assert n.send("node down") is True + url = post.call_args.args[0] + assert url == "https://tg.test/botTOKEN/sendMessage" + body = post.call_args.kwargs["json"] + assert body["chat_id"] == "123" + assert body["text"] == "node down" + + def test_send_swallows_network_error(self): + n = _enabled() + with patch.object(tg_mod.requests, "post", + side_effect=requests.RequestException("offline")): + assert n.send("x") is False + + def test_send_swallows_http_error(self): + n = _enabled() + resp = MagicMock() + resp.raise_for_status.side_effect = requests.HTTPError("401") + with 
patch.object(tg_mod.requests, "post", return_value=resp): + assert n.send("x") is False + + +class TestTokenSecrecy: + def test_token_never_logged_on_failure(self, caplog): + # A requests error message embeds the request URL (with the token). Assert the token + # never reaches the logs even when a send fails. + n = _enabled(bot_token="SUPERSECRETTOKEN") + boom = requests.RequestException( + "HTTPSConnectionPool failed for url: " + "https://api.telegram.org/botSUPERSECRETTOKEN/sendMessage") + with caplog.at_level("DEBUG"): + with patch.object(tg_mod.requests, "post", side_effect=boom): + n.send("x") + assert "SUPERSECRETTOKEN" not in caplog.text diff --git a/build/dashboard/tests/service/test_worker_presence.py b/build/dashboard/tests/service/test_worker_presence.py new file mode 100644 index 0000000..eda9dcc --- /dev/null +++ b/build/dashboard/tests/service/test_worker_presence.py @@ -0,0 +1,126 @@ +from mining_dashboard.service.worker_presence import WorkerPresenceMonitor + + +class _Clock: + """Manually advanced clock for deterministic debounce tests.""" + def __init__(self): + self.t = 1000.0 + + def __call__(self): + return self.t + + def advance(self, secs): + self.t += secs + + +def _monitor(offline_after=300, recovery_after=120, retention=7 * 24 * 3600): + clock = _Clock() + m = WorkerPresenceMonitor(offline_after=offline_after, recovery_after=recovery_after, + retention=retention, clock=clock) + return m, clock + + +class TestBaseline: + def test_first_sighting_is_silent(self): + # A brand-new worker is registered ONLINE with no edge — it's not a "recovery". 
+ m, _ = _monitor() + assert m.update({"rig-1"}) == [] + + def test_steady_online_emits_nothing(self): + m, clock = _monitor() + m.update({"rig-1"}) + for _ in range(5): + clock.advance(30) + assert m.update({"rig-1"}) == [] + + +class TestOfflineDebounce: + def test_not_offline_before_threshold(self): + m, clock = _monitor() + m.update({"rig-1"}) # baseline online + clock.advance(30) + assert m.update(set()) == [] # absence streak starts here (within debounce) + clock.advance(269) + assert m.update(set()) == [] # 269s absent — still under the 300s threshold + + def test_offline_after_threshold(self): + m, clock = _monitor() + m.update({"rig-1"}) + m.update(set()) # absence streak starts here + clock.advance(300) + assert m.update(set()) == [("rig-1", "offline")] + + def test_offline_emitted_once(self): + m, clock = _monitor() + m.update({"rig-1"}) + m.update(set()) + clock.advance(300) + assert m.update(set()) == [("rig-1", "offline")] + clock.advance(300) + assert m.update(set()) == [] # already offline — no repeat + + def test_brief_drop_does_not_trip(self): + m, clock = _monitor() + m.update({"rig-1"}) + clock.advance(60); assert m.update(set()) == [] # gone 60s + clock.advance(30); assert m.update({"rig-1"}) == [] # back well before 300s + + +class TestRecoveryHysteresis: + def _take_offline(self, m, clock): + m.update({"rig-1"}) + m.update(set()) + clock.advance(300) + assert m.update(set()) == [("rig-1", "offline")] + + def test_recovered_only_after_stable_window(self): + m, clock = _monitor() + self._take_offline(m, clock) + # Reappears, but "back online" holds until it's been present for recovery_after. + assert m.update({"rig-1"}) == [] + clock.advance(119); assert m.update({"rig-1"}) == [] + clock.advance(1); assert m.update({"rig-1"}) == [("rig-1", "recovered")] + + def test_flap_during_recovery_does_not_emit(self): + # A one-cycle reconnect during an outage must not produce a recovered→offline spam. 
+ m, clock = _monitor() + self._take_offline(m, clock) + clock.advance(30); assert m.update({"rig-1"}) == [] # blink on (still offline) + clock.advance(30); assert m.update(set()) == [] # blink off — no recovered, no re-offline + clock.advance(30); assert m.update(set()) == [] + + +class TestMultipleWorkers: + def test_independent_per_worker_state(self): + m, clock = _monitor() + m.update({"rig-1", "rig-2"}) + # rig-2 stays online; rig-1 drops. + m.update({"rig-2"}) + clock.advance(300) + assert m.update({"rig-2"}) == [("rig-1", "offline")] + + +class TestReset: + def test_reset_clears_state_and_rebaselines_silently(self): + m, clock = _monitor() + m.update({"rig-1"}) + m.update(set()) + clock.advance(300) + assert m.update(set()) == [("rig-1", "offline")] + m.reset() + # After a reset (e.g. proxy intentionally stopped), the worker re-appears as a fresh + # baseline — no "recovered" edge. + assert m.update({"rig-1"}) == [] + + +class TestRetention: + def test_long_absent_worker_is_forgotten(self): + m, clock = _monitor(retention=1000) + m.update({"rig-1"}) + m.update(set()) + clock.advance(300); m.update(set()) # offline emitted + clock.advance(1000) + m.update(set()) # past retention -> pruned + assert "rig-1" not in m._workers + # Returning after retention counts as new: silent baseline, not a recovery. 
+ assert m.update({"rig-1"}) == [] diff --git a/config.advanced.example.json b/config.advanced.example.json index 0bd0026..ff7727f 100644 --- a/config.advanced.example.json +++ b/config.advanced.example.json @@ -48,5 +48,18 @@ "timezone": "auto", "data_dir": "auto", "tari_required": true + }, + + "telegram": { + "enabled": false, + "bot_token": "", + "chat_id": "", + "events": { + "node_down": true, + "node_recovered": true, + "worker_offline": true, + "worker_recovered": true, + "sync_finished": true + } } } diff --git a/docker-compose.yml b/docker-compose.yml index a42fa01..614d297 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -282,6 +282,19 @@ services: # miner waits for Tari's sync, and whether a Tari-only sync takes over the dashboard # (Issue #51). Set dashboard.tari_required:false to make Tari non-blocking. - TARI_REQUIRED=${TARI_REQUIRED:-true} + # --- Operator alerts: Telegram (Issue #121) --- + # Notifications-only push alerter; disabled by default. bot_token is a secret sourced from + # the owner-only .env and is never logged by the dashboard. On a Tor-only / no-clearnet + # host the Telegram API is unreachable and sends fail silently. Per-event toggles default + # on. See docs/telegram.md. + - TELEGRAM_ENABLED=${TELEGRAM_ENABLED:-false} + - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-} + - TELEGRAM_CHAT_ID=${TELEGRAM_CHAT_ID:-} + - TELEGRAM_EVENT_NODE_DOWN=${TELEGRAM_EVENT_NODE_DOWN:-true} + - TELEGRAM_EVENT_NODE_RECOVERED=${TELEGRAM_EVENT_NODE_RECOVERED:-true} + - TELEGRAM_EVENT_WORKER_OFFLINE=${TELEGRAM_EVENT_WORKER_OFFLINE:-true} + - TELEGRAM_EVENT_WORKER_RECOVERED=${TELEGRAM_EVENT_WORKER_RECOVERED:-true} + - TELEGRAM_EVENT_SYNC_FINISHED=${TELEGRAM_EVENT_SYNC_FINISHED:-true} # --- Docker Socket Proxy (read-only) --- # Read-only window onto the Docker API for the dashboard's container stats/logs. 
diff --git a/docs/README.md b/docs/README.md index b30dd61..162666f 100644 --- a/docs/README.md +++ b/docs/README.md @@ -14,6 +14,7 @@ deeper on individual topics once you're up and running. | [Hardware Requirements](hardware.md) | Minimum vs. recommended specs for the **stack host** — CPU, RAM, disk, network, OS — plus lighter-footprint options. (Miner hardware lives in [RigForge](https://github.com/p2pool-starter-stack/rigforge).) | | [Configuration](configuration.md) | Every `config.json` key and default, applying changes safely, **reusing an existing node via data directories**, and connecting to a **remote Monero node**. | | [The Dashboard](dashboard.md) | **Sync Mode**, the live operational view, and how to read every panel. | +| [Telegram Alerts](telegram.md) | Push **operator alerts** (node down/recovered, worker offline/back, sync finished) to Telegram — creating a bot, finding your chat id, and per-event toggles. | | [Connecting Miners](workers.md) | Pointing any existing rig at the stack, plus [RigForge](https://github.com/p2pool-starter-stack/rigforge) for setting up new miners. | | [Architecture](architecture.md) | The nine services, how they fit together, the privacy model, and the algorithmic XvB switching engine. | | [Operations & Maintenance](operations.md) | The full `pithead` command reference, upgrades, backups, and troubleshooting. | diff --git a/docs/configuration.md b/docs/configuration.md index 535000a..99c3b12 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -89,6 +89,10 @@ plain HTTP, edit `config.json` and run `./pithead apply`. | `dashboard.timezone` | `auto` | Timezone for the dashboard's timestamps and charts. `auto` = **the host machine's timezone** (auto-detected, falling back to `Etc/UTC`); set an IANA name (e.g. `America/Chicago`) to override. | | `dashboard.data_dir` | `auto` | Where the dashboard's database lives. `auto` = `./data/dashboard`. 
| | `dashboard.tari_required` | `true` | How much a Tari problem holds up the rest of the stack. Monero is **required** to mine, so its behavior isn't configurable: a monerod outage always rejects workers (stops `xmrig-proxy` so miners **fail over to their backup pools**), and the miner is always held until monerod finishes syncing. Tari is **only needed for merge mining**, so this one flag decides how much it blocks. **`true` (default):** a Tari outage also rejects workers, the miner waits for Tari's initial sync too, and a Tari-only (re)sync shows the full-screen Sync view. **`false` (non-blocking):** keep mining Monero through a Tari outage, start mining as soon as Monero is synced (Tari finishes in the background), and keep the normal dashboard — with a `Tari syncing` indicator — instead of the takeover screen. | +| `telegram.enabled` | `false` | Push operational alerts (node down/recovered, worker offline/back, sync finished) to Telegram. Off by default. Requires `bot_token` + `chat_id` to actually send. Full walkthrough: [Telegram Alerts](telegram.md). | +| `telegram.bot_token` | `""` | Your BotFather bot token. **A secret** — stored owner-only in `.env`, git-ignored, and never logged. Get one from [@BotFather](https://t.me/BotFather). | +| `telegram.chat_id` | `""` | Where alerts are sent. A Telegram group id (negative, e.g. `-1001234567890`) or a personal chat id. See [how to find it](telegram.md#3-find-your-chat-id). | +| `telegram.events.*` | all `true` | Per-event toggles: `node_down`, `node_recovered`, `worker_offline`, `worker_recovered`, `sync_finished`. Each defaults to on once Telegram is enabled; set one `false` to silence just that alert. 
| --- diff --git a/docs/telegram.md b/docs/telegram.md new file mode 100644 index 0000000..5d90c93 --- /dev/null +++ b/docs/telegram.md @@ -0,0 +1,217 @@ +# Telegram Alerts + +Pithead can push a small set of **operational alerts** to Telegram, so you find out the moment +something needs attention — without sitting on the dashboard. It's **off by default**; this guide +takes you from nothing to a working alert in about five minutes. + +> **What this is — and isn't.** This is a **notifications-only** push: the stack sends you +> messages, you don't send it commands. There's no interactive bot, no `/status` command, no +> remote control. (That's a separate, later feature.) Think of it as a pager, not a chat. + +--- + +## What you'll get + +When enabled, the stack sends a short message on each of these events. Every alert is +**debounced** — a momentary blip won't ping you, and you get **one** message per real +transition, not a stream: + +| Alert | When it fires | +|---|---| +| 🔴 **Node down** | Your Monero (or Tari) node has been unreachable long enough to be considered down — the stack has stopped serving your rigs so they **fail over to their backup pools**. | +| 🟢 **Node recovered** | The node is back and stable; the stack has readmitted your rigs. | +| 🔴 **Worker offline** | A rig stopped hashing and hasn't been seen for a few minutes (a reboot, a dropped connection, a dead miner). | +| 🟢 **Worker back online** | A rig that had gone offline is hashing again. | +| ✅ **Sync finished** | The initial blockchain sync completed and mining has started — handy on first run, when the sync can take hours. | + +Every message is prefixed with your dashboard hostname (e.g. `[rig-box.lan]`), so if you point +more than one stack at the same chat you can tell them apart. + +Each of these can be **turned off individually** — see [Choosing which alerts you get](#choosing-which-alerts-you-get). 
+ +--- + +## Setup + +You need two things: a **bot token** (the credential Pithead uses to send) and a **chat id** (where +the messages go). Both come from Telegram, in a few taps. + +### 1. Create a bot and get its token + +1. In Telegram, open a chat with **[@BotFather](https://t.me/BotFather)** (the official bot for + making bots). +2. Send `/newbot` and follow the prompts — pick a name and a username (the username must end in + `bot`, e.g. `my_pithead_bot`). +3. BotFather replies with a **token** that looks like: + ``` + 123456789:AAExampleExampleExampleExampleExample + ``` + This is your `bot_token`. **Treat it like a password** — anyone with it can post as your bot. + +### 2. Pick where alerts go (a group is recommended) + +You can send alerts straight to your own Telegram account, but a **dedicated group** is the +cleaner choice: it keeps alerts out of your personal chats, lets you mute them with one tap, and +lets you add other operators (or a second alert source — see +[One chat, two bots](#one-chat-two-bots)). + +1. Create a new Telegram **group** (e.g. "Pithead alerts"). +2. Add your bot to it: open the group → **Add members** → search for your bot's username. + +> If you'd rather have alerts come as a normal direct message instead, skip the group and just +> **send your bot a `/start`** message — that's enough for it to be allowed to message you back. + +### 3. Find your chat id + +The easiest way: + +1. Add **[@userinfobot](https://t.me/userinfobot)** to the same group (or message it directly for a + 1-to-1 chat). It immediately replies with the chat's **id**. +2. Note the number. **Group ids are negative** and often long, e.g. `-1001234567890`. A direct + chat id is a positive number, e.g. `987654321`. +3. You can remove `@userinfobot` afterwards. + +> **Manual alternative** (no third-party bot): send any message in the group, then open +> `https://api.telegram.org/botYOUR_BOT_TOKEN/getUpdates` (with your own token in place of `YOUR_BOT_TOKEN`) in a browser and read the `chat.id` field +> from the JSON. 
(You may need to send the message *after* adding your bot for it to show up.) + +### 4. Put it in `config.json` + +Add a `telegram` block to your `config.json`. The minimum to switch it on: + +```json +{ + "monero": { "wallet_address": "your_monero_wallet_address" }, + "tari": { "wallet_address": "your_tari_wallet_address" }, + + "telegram": { + "enabled": true, + "bot_token": "123456789:AAExampleExampleExampleExampleExample", + "chat_id": "-1001234567890" + } +} +``` + +`chat_id` can be written as a string (recommended, since group ids are long and negative) or a +number — both work. + +### 5. Apply + +```bash +./pithead apply +``` + +`apply` re-renders the stack and restarts the dashboard with the new settings. On the next health +cycle, alerting is live. To confirm it works end-to-end, you can stop a rig (or briefly stop a +node) and wait for the offline/down alert — remember the debounce means it's a few minutes, not +instant, by design. + +--- + +## Choosing which alerts you get + +Every event is on by default once Telegram is enabled. To silence one, add it to a `telegram.events` +block and set it to `false` — any event you don't list stays on: + +```json +"telegram": { + "enabled": true, + "bot_token": "…", + "chat_id": "…", + "events": { + "worker_offline": false, + "worker_recovered": false + } +} +``` + +| Event key | Default | Alert | +|---|---|---| +| `node_down` | `true` | Monero/Tari node went down | +| `node_recovered` | `true` | …and came back | +| `worker_offline` | `true` | A rig dropped off | +| `worker_recovered` | `true` | …and came back | +| `sync_finished` | `true` | Initial sync done, mining started | + +Run `./pithead apply` after editing. + +> **Tari note.** A node-down/recovered alert fires for **Tari only when Tari is treated as +> required** (`dashboard.tari_required: true`, the default). 
If you've made Tari non-blocking, a +> Tari outage doesn't stop your Monero mining, so it isn't alerted as a node-down — matching how +> the rest of the stack treats a non-blocking Tari. Monero is always alerted. + +--- + +## One chat, two bots + +Pithead's companion **Healthchecks.io** monitor (a "dead-man's switch" that detects the whole host +going dark from *outside* the stack) can deliver its alerts to Telegram too. The two are +complementary: + +- **This (in-stack) alerter** reports everything the host can tell you **while it's alive** — a + node down, a rig offline, sync finished. +- **Healthchecks.io** reports the case this one can't: the **whole host is dead** (power cut, + kernel panic, network gone) and therefore can't send anything itself. + +The clean setup is to point **both at the same Telegram group** — one place for every alert. They +necessarily use **two different bots**: + +- This alerter uses **your own BotFather bot** (`bot_token` above) posting to your `chat_id`. +- Healthchecks.io has **its own** Telegram integration bot that you authorize into the chat on the + Healthchecks.io side — you never paste a token into Healthchecks.io. + +So the thing you share is the **chat**, not the token: create the group, add **both** bots to it, +and use that group's id here. Each source labels its own messages, so you can always tell which is +which. (Keeping them in separate chats is fine too — only useful if you want to mute or route them +differently.) + +> Healthchecks.io setup is documented separately under operator monitoring; see +> [issue #79](https://github.com/p2pool-starter-stack/pithead/issues/79). + +--- + +## Privacy and secrets + +- **The bot token is a secret.** Pithead stores it in `.env`, which is created **owner-only** + (`chmod 600`) and is **git-ignored**, exactly like the Monero node RPC password. The dashboard + **never writes the token to a log line** — not even inside an error message. 
+- **Telegram is a clearnet service.** The dashboard reaches `api.telegram.org` directly. On a + **Tor-only host with no clearnet egress**, the Telegram API is unreachable and sends simply + **fail silently** — no errors, no log spam, the rest of the stack is unaffected. (Same applies if + your network blocks Telegram.) If you run Tor-only and want these alerts, you'll need clearnet + egress for the dashboard, or rely on Healthchecks.io's own delivery. + +--- + +## Tuning the debounce (advanced) + +The defaults err on the side of **not** crying wolf. If you want faster (or quieter) worker alerts, +override these environment variables for the dashboard container — both are in seconds: + +| Variable | Default | Meaning | +|---|---|---| +| `WORKER_OFFLINE_AFTER_SEC` | `300` | A rig must be unseen this long before "offline" fires. | +| `WORKER_RECOVERY_AFTER_SEC` | `120` | A rig must be back this long before "back online" fires. | + +Node-down timing is shared with the existing failover logic (`NODE_DOWN_AFTER_SEC` / +`NODE_RECOVERY_AFTER_SEC`). These are advanced knobs; most operators never touch them. + +--- + +## Troubleshooting + +| Symptom | Likely cause / fix | +|---|---| +| No messages at all | Confirm `telegram.enabled` is `true` **and** both `bot_token` and `chat_id` are set — a missing one keeps alerting **off**. Did you run `./pithead apply`? | +| Still nothing | Make sure the bot has been **added to the group** (or that you sent it `/start` for a direct chat). A bot can't message a chat it isn't in. | +| `chat_id` looks wrong | Group ids are **negative** and long (`-100…`). Re-check with `@userinfobot`. | +| Works for "down" but not a specific alert | Check `telegram.events` — that event may be toggled `false`. | +| Tor-only host | Expected: Telegram is clearnet, so sends fail silently. See [Privacy and secrets](#privacy-and-secrets). | + +--- + +## See also + +- [Configuration](configuration.md) — every `config.json` key, including the `telegram.*` block. 
+- [The Dashboard](dashboard.md) — the live view these alerts complement. +- [Operations & Maintenance](operations.md) — `apply`, upgrades, and troubleshooting. diff --git a/pithead b/pithead index 1ddc7f2..280500b 100755 --- a/pithead +++ b/pithead @@ -1247,6 +1247,24 @@ render_env() { local tari_required tari_required=$(jq -r 'if .dashboard.tari_required != null then .dashboard.tari_required | tostring else "true" end' "$CONFIG_FILE") + # Telegram alerts (#121). Notifications-only push alerter, disabled by default. bot_token is + # a secret: it lives only in this owner-only .env (chmod 600 below) and the dashboard never + # logs it. Per-event toggles default to on, so enabling Telegram turns on the full set and an + # operator only opts *out* of the noisy ones. A blank chat_id/bot_token keeps it off even if + # enabled=true (the dashboard guards that too). See docs/telegram.md. + local tg_enabled tg_token tg_chat + tg_enabled=$(jq -r 'if .telegram.enabled != null then .telegram.enabled | tostring else "false" end' "$CONFIG_FILE") + tg_token=$(jq -r '.telegram.bot_token // empty' "$CONFIG_FILE") + tg_chat=$(jq -r '.telegram.chat_id // empty' "$CONFIG_FILE") + # One toggle per event, defaulting to true when the key is absent. + tg_event() { jq -r --arg k "$1" 'if .telegram.events[$k] != null then .telegram.events[$k] | tostring else "true" end' "$CONFIG_FILE"; } + local tg_ev_node_down tg_ev_node_recovered tg_ev_worker_offline tg_ev_worker_recovered tg_ev_sync_finished + tg_ev_node_down=$(tg_event node_down) + tg_ev_node_recovered=$(tg_event node_recovered) + tg_ev_worker_offline=$(tg_event worker_offline) + tg_ev_worker_recovered=$(tg_event worker_recovered) + tg_ev_sync_finished=$(tg_event sync_finished) + # Tari memory cap (#55). Tari officially needs only a few GB (min 4 GB host, 8 GB+ recommended), # but its memory grows unbounded over time — one 32 GB host was seen at ~11 GB while staying # healthy. 
Uncapped, that growth can OOM the whole host on small machines. So the cap is a SAFETY @@ -1311,6 +1329,14 @@ XVB_ENABLED=$xvb_enabled XVB_DONATION_LEVEL=$xvb_donation_level TARI_REQUIRED=$tari_required TARI_MEM_LIMIT=$tari_mem_limit +TELEGRAM_ENABLED=$tg_enabled +TELEGRAM_BOT_TOKEN=$tg_token +TELEGRAM_CHAT_ID=$tg_chat +TELEGRAM_EVENT_NODE_DOWN=$tg_ev_node_down +TELEGRAM_EVENT_NODE_RECOVERED=$tg_ev_node_recovered +TELEGRAM_EVENT_WORKER_OFFLINE=$tg_ev_worker_offline +TELEGRAM_EVENT_WORKER_RECOVERED=$tg_ev_worker_recovered +TELEGRAM_EVENT_SYNC_FINISHED=$tg_ev_sync_finished P2POOL_URL=172.28.0.28:3333 PROXY_API_PORT=3344 PROXY_AUTH_TOKEN=$PROXY_AUTH_TOKEN @@ -1610,6 +1636,15 @@ describe_change() { msg="Monero node RPC credential updated ($key)." ;; XVB_ENABLED|XVB_POOL_URL|XVB_DONOR_ID|XVB_DONATION_LEVEL) msg="XMRvsBeast setting ($key): $old → $new." ;; + TELEGRAM_ENABLED) + msg="Telegram alerts → $([ "$new" == "true" ] && echo on || echo off)." ;; + TELEGRAM_BOT_TOKEN) + # Never echo the token value into the apply preview — it's a secret. + msg="Telegram bot token updated." ;; + TELEGRAM_CHAT_ID) + msg="Telegram chat id: $old → $new." ;; + TELEGRAM_EVENT_*) + msg="Telegram alert toggle ($key): $old → $new." ;; TARI_REQUIRED) if [ "$new" == "true" ]; then msg="Tari → required — a Tari outage rejects workers, the miner waits for Tari's sync, and a Tari-only sync takes over the dashboard." 
diff --git a/tests/stack/run.sh b/tests/stack/run.sh index 77045f6..5f52c2a 100755 --- a/tests/stack/run.sh +++ b/tests/stack/run.sh @@ -82,6 +82,15 @@ assert_contains "wallet is DEST" "$(run_sourced "$SANDBOX" describe_change M assert_contains "xvb url is INFO" "$(run_sourced "$SANDBOX" describe_change XVB_POOL_URL a b)" "INFO" assert_contains "data_dir is DEST" "$(run_sourced "$SANDBOX" describe_change MONERO_DATA_DIR /a /b)" "DEST" assert_contains "tari mem is INFO" "$(run_sourced "$SANDBOX" describe_change TARI_MEM_LIMIT 2048m 4g)" "INFO" +assert_contains "telegram enable is INFO" "$(run_sourced "$SANDBOX" describe_change TELEGRAM_ENABLED false true)" "INFO" +assert_contains "telegram event is INFO" "$(run_sourced "$SANDBOX" describe_change TELEGRAM_EVENT_NODE_DOWN true false)" "INFO" +# The bot token is a secret: its describe_change line must NOT echo the old/new value. +tg_tok_msg="$(run_sourced "$SANDBOX" describe_change TELEGRAM_BOT_TOKEN oldsecret newsecret)" +assert_contains "telegram token change noted" "$tg_tok_msg" "Telegram bot token updated" +case "$tg_tok_msg" in + *oldsecret*|*newsecret*) bad "telegram token value not leaked in preview" "leaked: $tg_tok_msg" ;; + *) ok "telegram token value not leaked in preview" ;; +esac echo "== unit: env helpers ==" printf 'A=1\nB=two\nPROXY_AUTH_TOKEN=keep=me\n' > "$SANDBOX/old.env" @@ -260,6 +269,28 @@ printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","n out="$(cd "$V" && DOCKER_LOG="$DOCKER_LOG" PATH="$V/bin:$PATH" ./pithead apply -y 2>&1)" assert_eq "tari_required propagated false" "$(run_sourced "$V" env_get_file "$V/.env" TARI_REQUIRED)" "false" +# Telegram defaults (#121): no telegram block => disabled, per-event toggles default on. 
+seed_env +printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","node_password":"p"}, "tari":{"wallet_address":"T"}, "p2pool":{"pool":"mini"}, "dashboard":{"secure":false,"host":"box.lan"} }\n' "$WALLET" > "$V/config.json" +out="$(cd "$V" && DOCKER_LOG="$DOCKER_LOG" PATH="$V/bin:$PATH" ./pithead apply -y 2>&1)" +assert_eq "telegram disabled by default" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_ENABLED)" "false" +assert_eq "telegram event defaults on" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_EVENT_NODE_DOWN)" "true" + +# Telegram enabled: token/chat_id + per-event toggles propagate from config.json into .env. +seed_env +printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","node_password":"p"}, "tari":{"wallet_address":"T"}, "p2pool":{"pool":"mini"}, "dashboard":{"secure":false,"host":"box.lan"}, "telegram":{"enabled":true,"bot_token":"BOTSECRET","chat_id":"-100123","events":{"worker_offline":false}} }\n' "$WALLET" > "$V/config.json" +out="$(cd "$V" && DOCKER_LOG="$DOCKER_LOG" PATH="$V/bin:$PATH" ./pithead apply -y 2>&1)" +assert_eq "telegram enabled propagated" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_ENABLED)" "true" +assert_eq "telegram token propagated" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_BOT_TOKEN)" "BOTSECRET" +assert_eq "telegram chat_id propagated" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_CHAT_ID)" "-100123" +assert_eq "telegram per-event override off" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_EVENT_WORKER_OFFLINE)" "false" +assert_eq "telegram unset event stays on" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_EVENT_NODE_DOWN)" "true" +# The bot token is a secret: the apply preview must not print it. 
+case "$out" in + *BOTSECRET*) bad "telegram token not printed by apply" "leaked in: $out" ;; + *) ok "telegram token not printed by apply" ;; +esac + # An explicit tari.mem_limit is passed through verbatim (overriding the "auto" host-RAM scaling). seed_env printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","node_password":"p"}, "tari":{"wallet_address":"T","mem_limit":"3072m"}, "p2pool":{"pool":"mini"}, "dashboard":{"secure":false,"host":"box.lan"} }\n' "$WALLET" > "$V/config.json" From d5cf71f16402ca66fc850f9257d2a0a49cebbe7d Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Thu, 2 Jul 2026 16:23:44 -0500 Subject: [PATCH 02/18] feat(#45): on-demand Telegram status commands + ruff/format the salvaged alerter Add TelegramCommandBot: an in-process long-poll (getUpdates) loop that answers read-only /status, /hashrate, /workers, /sync, /help from the configured chat. Reuses build_metrics so replies match the dashboard exactly; access-gated to the one chat_id; long-poll needs no inbound port and rides the same egress as the alerts; fail-silent + never logs the token, same discipline as the notifier. Wired as a third background task in main.py (no-op unless telegram.commands.enabled). Config: telegram.commands.enabled -> TELEGRAM_COMMANDS_ENABLED (default false). Also ruff-format + lint-fix the #121 modules the draft predated (import sort, drop an unused import). 37 new unit tests; patch coverage 95%. 
Co-Authored-By: Claude Opus 4.8 --- .../mining_dashboard/config/config.py | 8 + build/dashboard/mining_dashboard/main.py | 10 +- .../mining_dashboard/service/alert_service.py | 52 ++- .../mining_dashboard/service/data_service.py | 7 +- .../service/telegram_commands.py | 304 ++++++++++++++ .../service/telegram_notifier.py | 11 +- .../service/worker_presence.py | 19 +- .../tests/service/test_alert_service.py | 85 ++-- .../tests/service/test_telegram_commands.py | 389 ++++++++++++++++++ .../tests/service/test_telegram_notifier.py | 12 +- .../tests/service/test_worker_presence.py | 36 +- docs/telegram.md | 71 +++- docs/test-inventory.md | 78 +++- tests/stack/run.sh | 26 +- 14 files changed, 1012 insertions(+), 96 deletions(-) create mode 100644 build/dashboard/mining_dashboard/service/telegram_commands.py create mode 100644 build/dashboard/tests/service/test_telegram_commands.py diff --git a/build/dashboard/mining_dashboard/config/config.py b/build/dashboard/mining_dashboard/config/config.py index 7c60d2f..619dbe3 100644 --- a/build/dashboard/mining_dashboard/config/config.py +++ b/build/dashboard/mining_dashboard/config/config.py @@ -233,6 +233,14 @@ TELEGRAM_BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "").strip() TELEGRAM_CHAT_ID = os.environ.get("TELEGRAM_CHAT_ID", "").strip() +# Interactive command interface (#45), separate opt-in from the alerts above. When on, the +# dashboard long-polls Telegram (getUpdates — outbound only, no inbound port) and answers +# read-only status commands from the configured chat_id. Off by default; the alerter works +# without it. See telegram_commands.py / docs/telegram.md. 
+TELEGRAM_COMMANDS_ENABLED = ( + os.environ.get("TELEGRAM_COMMANDS_ENABLED", "false").strip().lower() == "true" +) + def _telegram_event_enabled(name, default=True): """Read one per-event toggle from TELEGRAM_EVENT_ (rendered from config.json's diff --git a/build/dashboard/mining_dashboard/main.py b/build/dashboard/mining_dashboard/main.py index e1956a4..27fc0cb 100644 --- a/build/dashboard/mining_dashboard/main.py +++ b/build/dashboard/mining_dashboard/main.py @@ -14,6 +14,7 @@ from mining_dashboard.service.algo_service import AlgoService from mining_dashboard.service.data_service import DataService from mining_dashboard.service.storage_service import StateManager +from mining_dashboard.service.telegram_commands import TelegramCommandBot from mining_dashboard.web.server import create_app logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") @@ -33,17 +34,24 @@ def build_app() -> web.Application: xvb_client = XvbClient(wallet_address=MONERO_WALLET_ADDRESS) data_service = DataService(state_manager, proxy_client, xvb_client) algo_service = AlgoService(state_manager, proxy_client, data_service) + # On-demand Telegram command interface (#45). Reads the snapshot data_service already collects; + # a no-op unless telegram.enabled + telegram.commands.enabled + bot_token + chat_id are set. 
+ telegram_bot = TelegramCommandBot(data_service) async def start_background_tasks(app): """Initializes background services upon web application startup.""" app["data_task"] = asyncio.create_task(data_service.run()) app["algo_task"] = asyncio.create_task(algo_service.run()) + app["telegram_task"] = asyncio.create_task(telegram_bot.run()) async def cleanup_background_tasks(app): """Stops background tasks and closes resources on shutdown.""" app["data_task"].cancel() app["algo_task"].cancel() - await asyncio.gather(app["data_task"], app["algo_task"], return_exceptions=True) + app["telegram_task"].cancel() + await asyncio.gather( + app["data_task"], app["algo_task"], app["telegram_task"], return_exceptions=True + ) if "state_manager" in app: app["state_manager"].close() diff --git a/build/dashboard/mining_dashboard/service/alert_service.py b/build/dashboard/mining_dashboard/service/alert_service.py index 1e26902..f3f0402 100644 --- a/build/dashboard/mining_dashboard/service/alert_service.py +++ b/build/dashboard/mining_dashboard/service/alert_service.py @@ -68,8 +68,17 @@ def __init__(self, notifier=None, worker_monitor=None, host_label=HOST_IP): def enabled(self): return self.notifier.enabled - def evaluate(self, *, monero_down, tari_down, tari_required, miner_released, - online_workers, workers_expected, now=None): + def evaluate( + self, + *, + monero_down, + tari_down, + tari_required, + miner_released, + online_workers, + workers_expected, + now=None, + ): """Pure: fold this cycle's signals into the list of ``(event_key, text)`` to send, filtered to the events the operator left enabled.""" alerts = [] @@ -87,8 +96,12 @@ def evaluate(self, *, monero_down, tari_down, tari_required, miner_released, if self._prev_released is None: self._prev_released = miner_released elif miner_released and not self._prev_released: - alerts.append((self.EVT_SYNC_FINISHED, self._fmt( - "✅ Node ready — required chain(s) synced; mining has started."))) + alerts.append( + ( + 
self.EVT_SYNC_FINISHED, + self._fmt("✅ Node ready — required chain(s) synced; mining has started."), + ) + ) self._prev_released = miner_released # --- Worker offline / back online (debounced) --- @@ -98,11 +111,16 @@ def evaluate(self, *, monero_down, tari_down, tari_required, miner_released, if workers_expected: for name, event in self.workers.update(online_workers, now=now): if event == "offline": - alerts.append((self.EVT_WORKER_OFFLINE, - self._fmt(f"\U0001f534 Worker offline: {name}"))) + alerts.append( + (self.EVT_WORKER_OFFLINE, self._fmt(f"\U0001f534 Worker offline: {name}")) + ) else: - alerts.append((self.EVT_WORKER_RECOVERED, - self._fmt(f"\U0001f7e2 Worker back online: {name}"))) + alerts.append( + ( + self.EVT_WORKER_RECOVERED, + self._fmt(f"\U0001f7e2 Worker back online: {name}"), + ) + ) else: self.workers.reset() @@ -114,10 +132,20 @@ def _node_edges(self, label, down, attr): if prev is None or down == prev: return [] if down: - return [(self.EVT_NODE_DOWN, self._fmt( - f"\U0001f534 {label} node is DOWN — workers failing over to backup pools."))] - return [(self.EVT_NODE_RECOVERED, self._fmt( - f"\U0001f7e2 {label} node recovered — workers readmitted."))] + return [ + ( + self.EVT_NODE_DOWN, + self._fmt( + f"\U0001f534 {label} node is DOWN — workers failing over to backup pools." 
+ ), + ) + ] + return [ + ( + self.EVT_NODE_RECOVERED, + self._fmt(f"\U0001f7e2 {label} node recovered — workers readmitted."), + ) + ] def _fmt(self, text): return f"[{self.host_label}] {text}" if self.host_label else text diff --git a/build/dashboard/mining_dashboard/service/data_service.py b/build/dashboard/mining_dashboard/service/data_service.py index 23bfe16..0aeed63 100644 --- a/build/dashboard/mining_dashboard/service/data_service.py +++ b/build/dashboard/mining_dashboard/service/data_service.py @@ -48,10 +48,10 @@ pplns_block_time, shares_in_pplns_window, ) +from mining_dashboard.service.alert_service import AlertService from mining_dashboard.service.clearnet_sync import ClearnetSyncSupervisor from mining_dashboard.service.healthchecks import HealthchecksClient from mining_dashboard.service.node_health import NodeHealthMonitor -from mining_dashboard.service.alert_service import AlertService from mining_dashboard.service.update_checker import GitHubReleaseClient, UpdateChecker logger = logging.getLogger("DataService") @@ -788,8 +788,9 @@ async def run(self): tari_down=tari_down, tari_required=TARI_REQUIRED, miner_released=self.miner_released, - online_workers=[w["name"] for w in final_workers - if w.get("status") == "online"], + online_workers=[ + w["name"] for w in final_workers if w.get("status") == "online" + ], workers_expected=self.miner_released and not self.workers_rejected, ) diff --git a/build/dashboard/mining_dashboard/service/telegram_commands.py b/build/dashboard/mining_dashboard/service/telegram_commands.py new file mode 100644 index 0000000..dad6fd5 --- /dev/null +++ b/build/dashboard/mining_dashboard/service/telegram_commands.py @@ -0,0 +1,304 @@ +import asyncio +import logging + +import aiohttp + +from mining_dashboard.config.config import ( + HOST_IP, + TELEGRAM_BOT_TOKEN, + TELEGRAM_CHAT_ID, + TELEGRAM_COMMANDS_ENABLED, + TELEGRAM_ENABLED, +) +from mining_dashboard.helper.utils import format_duration, format_hashrate +from 
mining_dashboard.service.metrics import build_metrics +from mining_dashboard.service.telegram_notifier import TELEGRAM_API_BASE + +logger = logging.getLogger("TelegramCommands") + +# Seconds handed to getUpdates: Telegram holds the request open until an update arrives or this +# elapses, so the bot makes ~one request per interval while idle (long-poll, not busy-poll). +LONG_POLL_SECONDS = 25 +# Quiet retry after a failed poll — a Tor-only / offline host can't reach api.telegram.org, so a +# persistently-blocked bot backs off instead of hot-looping (and never spams ERROR; #59 discipline). +POLL_ERROR_BACKOFF_SECONDS = 15 + +# The commands the bot answers. All are read-only status queries — the bot can never change the +# stack (start/stop/apply live on the CLI), so a leaked chat can at worst read status, not act. +COMMANDS = ("status", "hashrate", "workers", "sync", "help") + +HELP_TEXT = ( + "Pithead bot — commands:\n" + "/status — stack health at a glance\n" + "/hashrate — total + per-worker hashrate\n" + "/workers — each rig's online/offline state\n" + "/sync — Monero + Tari node sync progress\n" + "/help — this message" +) + + +def _prefix(host_label): + """Hostname tag so replies from several stacks sharing one chat stay distinguishable. + 'Unknown Host' is config.py's placeholder when HOST_IP is unset — drop it, don't print it.""" + if host_label in (None, "", "Unknown Host"): + return "" + return f"[{host_label}] " + + +def parse_command(text): + """Extract the command word from a message, or ``None`` if it isn't a slash command. + + Returns the bare command (lowercased, with any ``@botname`` suffix stripped — Telegram appends + it in groups, e.g. ``/status@PitheadBot``). An unrecognized slash command comes back as + ``"unknown"`` so the caller can nudge with the help text; plain chatter returns ``None`` and is + ignored, so the bot never talks over a group it happens to share. 
+ """ + if not text: + return None + text = text.strip() + if not text.startswith("/"): + return None + word = text.split(maxsplit=1)[0] + cmd = word[1:].split("@", 1)[0].lower() + if not cmd: + return None + return cmd if cmd in COMMANDS else "unknown" + + +def _node_state(sync): + """One-glance node health from a :class:`~mining_dashboard.service.metrics.SyncMetric`.""" + if sync.down: + return "\U0001f534 down" + if sync.done: + return "\U0001f7e2 synced" + return f"⏳ syncing {sync.percent:.1f}%" + + +def format_status(metrics, mining_active, host_label=""): + """Overall stack health — the answer to '/status'. Pure: folds a :class:`Metrics` (plus the + mining-active flag the loop derives from the sync gate) into text; no I/O.""" + lines = [ + f"{_prefix(host_label)}\U0001f4ca Pithead status", + f"Monero node: {_node_state(metrics.monero)}", + f"Tari node: {_node_state(metrics.tari)}", + ] + if metrics.global_syncing: + lines.append("Mining: ⏳ holding — chain(s) syncing") + elif mining_active: + lines.append(f"Mining: \U0001f7e2 active ({metrics.mode})") + else: + lines.append("Mining: \U0001f534 not mining") + lines.append(f"Workers: {metrics.workers_online}/{metrics.workers_total} online") + lines.append(f"Hashrate: {format_hashrate(metrics.total_h15)} (10m avg)") + lines.append(f"PPLNS shares: {metrics.shares_in_window} in window") + return "\n".join(lines) + + +def format_hashrate_reply(metrics, workers, host_label=""): + """Total + per-online-worker hashrate — the answer to '/hashrate'.""" + lines = [ + f"{_prefix(host_label)}⚡ Hashrate", + f"Total: {format_hashrate(metrics.total_h15)} (10m avg)", + ] + online = [w for w in workers if w.get("status") == "online"] + if not online: + lines.append("No workers online.") + for w in sorted(online, key=lambda w: w.get("h15", 0) or 0, reverse=True): + lines.append(f"• {w.get('name', '?')}: {format_hashrate(w.get('h15', 0))}") + return "\n".join(lines) + + +def format_workers(workers, host_label=""): + """Per-worker 
online/offline roll-call — the answer to '/workers'. Offline first-sighted + workers are those xmrig-proxy still lists with a dead connection.""" + if not workers: + return f"{_prefix(host_label)}\U0001f477 Workers\nNo workers connected." + lines = [f"{_prefix(host_label)}\U0001f477 Workers"] + # Online first, then by name — the offline ones are what an operator scans for. + for w in sorted(workers, key=lambda w: (w.get("status") != "online", w.get("name", ""))): + if w.get("status") == "online": + up = w.get("uptime") or 0 + tail = f" · up {format_duration(up)}" if up else "" + lines.append( + f"\U0001f7e2 {w.get('name', '?')} — {format_hashrate(w.get('h15', 0))}{tail}" + ) + else: + lines.append(f"\U0001f534 {w.get('name', '?')} — offline") + return "\n".join(lines) + + +def _sync_line(name, sync): + if sync.down: + return f"{name}: \U0001f534 node down" + if sync.done: + return f"{name}: \U0001f7e2 synced" + if sync.has_target: + return f"{name}: ⏳ {sync.percent:.1f}% ({sync.current:,}/{sync.target:,})" + return f"{name}: ⏳ syncing {sync.percent:.1f}%" + + +def format_sync(metrics, host_label=""): + """Monero + Tari sync progress — the answer to '/sync'.""" + return "\n".join( + [ + f"{_prefix(host_label)}\U0001f504 Sync status", + _sync_line("Monero", metrics.monero), + _sync_line("Tari", metrics.tari), + ] + ) + + +class TelegramCommandBot: + """ + On-demand Telegram command interface (Issue #45) — the interactive half of the operator bot. + + Answers a small set of **read-only** status commands (``/status``, ``/hashrate``, ``/workers``, + ``/sync``, ``/help``) from the data the dashboard already collects, so it never re-implements + collection — it reuses :func:`build_metrics`, the same domain layer the web UI renders, so a + Telegram reply and the dashboard can never disagree. 
+ + Discipline (mirrors :class:`TelegramNotifier`): + + - **Off by default, opt-in.** Enabled only when Telegram is on *and* ``telegram.commands.enabled`` + is set *and* both ``bot_token`` and ``chat_id`` are present. Otherwise :meth:`run` returns + immediately, so the background task is a cheap no-op for the default stack. + - **Long-poll, no inbound port.** Uses ``getUpdates`` (outbound only) over the same egress the + notifier uses — a webhook would need a public inbound endpoint the Tor-first appliance can't + offer. Nothing is exposed. + - **Single-chat access control.** Only the configured ``chat_id`` is answered; every other update + is dropped silently, so an unknown chat gets no reply and can't use the bot as a probe oracle. + - **Read-only.** No command mutates the stack (lifecycle stays on the CLI), so a compromised chat + can at worst read status. + - **Fail silent, never leaks the token.** Network errors (offline / Tor-only host) are swallowed + at debug and the poll backs off; the ``bot_token`` only ever appears in the request URL and is + never written to a log line. + - **No stale replay.** On startup the backlog is skipped (offset primed past pending updates), so + a command sent while the dashboard was down isn't executed minutes later on restart. + """ + + def __init__( + self, + data_service, + *, + enabled=None, + bot_token=TELEGRAM_BOT_TOKEN, + chat_id=TELEGRAM_CHAT_ID, + host_label=HOST_IP, + api_base=TELEGRAM_API_BASE, + long_poll=LONG_POLL_SECONDS, + ): + self.data_service = data_service + self._token = (bot_token or "").strip() + # chat_id may be a negative group id (e.g. -1001234567890); keep it a string for exact + # equality against the id Telegram sends back. 
+ self.chat_id = str(chat_id or "").strip() + self.host_label = host_label + self._api_base = api_base.rstrip("/") + self.long_poll = long_poll + if enabled is None: + enabled = bool(TELEGRAM_ENABLED and TELEGRAM_COMMANDS_ENABLED) + self.enabled = bool(enabled and self._token and self.chat_id) + self._offset = None + + def reply_for(self, text): + """Map an incoming message to a reply string, or ``None`` to stay silent. + + Reads the latest snapshot and runs the shared :func:`build_metrics` (a couple of quick local + SQLite reads); the caller runs this off-thread so a slow read can't stall the poll loop. + """ + cmd = parse_command(text) + if cmd is None: + return None + if cmd == "help": + return f"{_prefix(self.host_label)}{HELP_TEXT}" + if cmd == "unknown": + return f"{_prefix(self.host_label)}Unknown command.\n{HELP_TEXT}" + + data = self.data_service.latest_data or {} + metrics = build_metrics(data, self.data_service.state_manager) + if cmd == "status": + mining = bool(data.get("miner_released") and not data.get("workers_rejected")) + return format_status(metrics, mining, self.host_label) + if cmd == "hashrate": + return format_hashrate_reply(metrics, data.get("workers", []), self.host_label) + if cmd == "workers": + return format_workers(data.get("workers", []), self.host_label) + if cmd == "sync": + return format_sync(metrics, self.host_label) + return None + + async def run(self): + """Long-poll for commands until cancelled. 
A no-op when disabled.""" + if not self.enabled: + return + logger.info("Telegram command interface enabled — polling for commands.") + async with aiohttp.ClientSession() as session: + await self._prime_offset(session) + while True: + try: + updates = await self._get_updates(session, self.long_poll) + except asyncio.CancelledError: + raise + except Exception as exc: + logger.debug("Telegram getUpdates failed (%s)", type(exc).__name__) + await asyncio.sleep(POLL_ERROR_BACKOFF_SECONDS) + continue + for update in updates: + self._offset = update.get("update_id", 0) + 1 + await self._handle_update(session, update) + + async def _prime_offset(self, session): + """Advance the offset past any pending backlog without acting on it, so a command queued + while the dashboard was down isn't run on startup.""" + try: + updates = await self._get_updates(session, 0) + if updates: + self._offset = updates[-1].get("update_id", 0) + 1 + except Exception as exc: + logger.debug("Telegram offset prime skipped (%s)", type(exc).__name__) + + async def _get_updates(self, session, poll_timeout): + params = {"timeout": poll_timeout, "allowed_updates": '["message"]'} + if self._offset is not None: + params["offset"] = self._offset + url = f"{self._api_base}/bot{self._token}/getUpdates" + # The client read timeout must outlast Telegram's long-poll hold, or aiohttp aborts the + # request the server is legitimately keeping open. + client_timeout = aiohttp.ClientTimeout(total=poll_timeout + 10) + async with session.get(url, params=params, timeout=client_timeout) as resp: + resp.raise_for_status() + payload = await resp.json() + if not payload.get("ok"): + return [] + return payload.get("result", []) + + async def _handle_update(self, session, update): + message = update.get("message") or {} + chat = message.get("chat") or {} + # Access control: only the configured chat may drive the bot. Anything else is dropped + # silently — no reply, so an unknown chat can't even confirm the bot exists. 
+ if str(chat.get("id")) != self.chat_id: + return + reply = await asyncio.to_thread(self._safe_reply_for, message.get("text", "")) + if reply: + await self._send(session, reply) + + def _safe_reply_for(self, text): + """Never let a formatting/read bug kill the poll loop — a broken command just goes quiet.""" + try: + return self.reply_for(text) + except Exception as exc: + logger.debug("Telegram command handling failed (%s)", type(exc).__name__) + return None + + async def _send(self, session, text): + url = f"{self._api_base}/bot{self._token}/sendMessage" + payload = {"chat_id": self.chat_id, "text": text, "disable_web_page_preview": True} + try: + async with session.post( + url, json=payload, timeout=aiohttp.ClientTimeout(total=10) + ) as resp: + resp.raise_for_status() + except Exception as exc: + # Log only the exception type — a requests/aiohttp error can embed the token-bearing URL. + logger.debug("Telegram reply failed (%s)", type(exc).__name__) diff --git a/build/dashboard/mining_dashboard/service/telegram_notifier.py b/build/dashboard/mining_dashboard/service/telegram_notifier.py index ddef13c..2ebb59d 100644 --- a/build/dashboard/mining_dashboard/service/telegram_notifier.py +++ b/build/dashboard/mining_dashboard/service/telegram_notifier.py @@ -31,8 +31,15 @@ class TelegramNotifier: ``requests`` would otherwise include the full URL). """ - def __init__(self, enabled=False, bot_token="", chat_id="", events=None, - timeout=10, api_base=TELEGRAM_API_BASE): + def __init__( + self, + enabled=False, + bot_token="", + chat_id="", + events=None, + timeout=10, + api_base=TELEGRAM_API_BASE, + ): self.bot_token = (bot_token or "").strip() # chat_id may be a negative integer (Telegram group ids look like -1001234567890); # keep it as a string so render/transport never reformat it. 
diff --git a/build/dashboard/mining_dashboard/service/worker_presence.py b/build/dashboard/mining_dashboard/service/worker_presence.py index 87e76e5..ef55c93 100644 --- a/build/dashboard/mining_dashboard/service/worker_presence.py +++ b/build/dashboard/mining_dashboard/service/worker_presence.py @@ -35,9 +35,13 @@ class WorkerPresenceMonitor: injectable for deterministic tests. """ - def __init__(self, offline_after=WORKER_OFFLINE_AFTER_SEC, - recovery_after=WORKER_RECOVERY_AFTER_SEC, - retention=WORKER_RETENTION_SEC, clock=time.time): + def __init__( + self, + offline_after=WORKER_OFFLINE_AFTER_SEC, + recovery_after=WORKER_RECOVERY_AFTER_SEC, + retention=WORKER_RETENTION_SEC, + clock=time.time, + ): self.offline_after = offline_after self.recovery_after = recovery_after self.retention = retention @@ -61,8 +65,10 @@ def update(self, present, now=None): if w is None: # First sighting — baseline as ONLINE, no edge. self._workers[name] = { - "state": "online", "seen_since": now, - "unseen_since": None, "last_present": now, + "state": "online", + "seen_since": now, + "unseen_since": None, + "last_present": now, } continue w["last_present"] = now @@ -98,7 +104,8 @@ def reset(self): def _prune(self, now): stale = [ - name for name, w in self._workers.items() + name + for name, w in self._workers.items() if w["unseen_since"] is not None and (now - w["last_present"]) >= self.retention ] for name in stale: diff --git a/build/dashboard/tests/service/test_alert_service.py b/build/dashboard/tests/service/test_alert_service.py index ab7983e..5defaed 100644 --- a/build/dashboard/tests/service/test_alert_service.py +++ b/build/dashboard/tests/service/test_alert_service.py @@ -1,14 +1,13 @@ -import pytest - from mining_dashboard.service.alert_service import AlertService from mining_dashboard.service.worker_presence import WorkerPresenceMonitor class _FakeNotifier: """Stand-in transport: records sends, lets tests gate which events are 'enabled'.""" + def __init__(self, 
enabled=True, allow=None): self.enabled = enabled - self._allow = allow # None => every event allowed + self._allow = allow # None => every event allowed self.sent = [] def event_enabled(self, event): @@ -28,12 +27,26 @@ def _svc(notifier=None, **kw): return AlertService(notifier=notifier, **kw) -def _ev(svc, *, monero_down=False, tari_down=False, tari_required=True, - miner_released=True, online_workers=(), workers_expected=False, now=0): +def _ev( + svc, + *, + monero_down=False, + tari_down=False, + tari_required=True, + miner_released=True, + online_workers=(), + workers_expected=False, + now=0, +): return svc.evaluate( - monero_down=monero_down, tari_down=tari_down, tari_required=tari_required, - miner_released=miner_released, online_workers=list(online_workers), - workers_expected=workers_expected, now=now) + monero_down=monero_down, + tari_down=tari_down, + tari_required=tari_required, + miner_released=miner_released, + online_workers=list(online_workers), + workers_expected=workers_expected, + now=now, + ) def _keys(alerts): @@ -48,9 +61,9 @@ def test_first_cycle_seeds_baseline_silently(self): def test_down_then_recovered(self): svc = _svc() - _ev(svc, monero_down=False) # seed + _ev(svc, monero_down=False) # seed assert _keys(_ev(svc, monero_down=True)) == [AlertService.EVT_NODE_DOWN] - assert _ev(svc, monero_down=True) == [] # no repeat while still down + assert _ev(svc, monero_down=True) == [] # no repeat while still down assert _keys(_ev(svc, monero_down=False)) == [AlertService.EVT_NODE_RECOVERED] def test_node_text_names_the_chain(self): @@ -71,10 +84,12 @@ def test_no_stale_edge_when_tari_becomes_required(self): # replay a down edge for a state we never alerted on. 
svc = _svc() _ev(svc, tari_down=False, tari_required=False) - _ev(svc, tari_down=True, tari_required=False) # silently tracked + _ev(svc, tari_down=True, tari_required=False) # silently tracked assert _ev(svc, tari_down=True, tari_required=True) == [] # ...but a genuine recovery from there still fires. - assert _keys(_ev(svc, tari_down=False, tari_required=True)) == [AlertService.EVT_NODE_RECOVERED] + assert _keys(_ev(svc, tari_down=False, tari_required=True)) == [ + AlertService.EVT_NODE_RECOVERED + ] def test_required_tari_alerts(self): svc = _svc() @@ -86,9 +101,9 @@ def test_required_tari_alerts(self): class TestSyncFinished: def test_fires_once_when_gate_opens(self): svc = _svc() - _ev(svc, miner_released=False) # seed: still syncing + _ev(svc, miner_released=False) # seed: still syncing assert _keys(_ev(svc, miner_released=True)) == [AlertService.EVT_SYNC_FINISHED] - assert _ev(svc, miner_released=True) == [] # one-shot + assert _ev(svc, miner_released=True) == [] # one-shot def test_no_alert_on_restart_after_sync(self): svc = _svc() @@ -101,17 +116,19 @@ def test_offline_then_recovered(self): svc = _svc() assert _ev(svc, online_workers=["rig-1"], workers_expected=True, now=0) == [] assert _ev(svc, online_workers=[], workers_expected=True, now=0) == [] - assert _keys(_ev(svc, online_workers=[], workers_expected=True, now=300)) == \ - [AlertService.EVT_WORKER_OFFLINE] + assert _keys(_ev(svc, online_workers=[], workers_expected=True, now=300)) == [ + AlertService.EVT_WORKER_OFFLINE + ] _ev(svc, online_workers=["rig-1"], workers_expected=True, now=300) - assert _keys(_ev(svc, online_workers=["rig-1"], workers_expected=True, now=420)) == \ - [AlertService.EVT_WORKER_RECOVERED] + assert _keys(_ev(svc, online_workers=["rig-1"], workers_expected=True, now=420)) == [ + AlertService.EVT_WORKER_RECOVERED + ] def test_not_expected_resets_and_silences(self): svc = _svc() _ev(svc, online_workers=["rig-1"], workers_expected=True, now=0) _ev(svc, online_workers=[], 
workers_expected=True, now=0) - _ev(svc, online_workers=[], workers_expected=True, now=300) # rig-1 now offline + _ev(svc, online_workers=[], workers_expected=True, now=300) # rig-1 now offline # Proxy intentionally stopped (sync hold / failover): reset, no alert. assert _ev(svc, online_workers=[], workers_expected=False, now=330) == [] # Re-admission re-baselines silently — no spurious "recovered". @@ -145,8 +162,14 @@ class TestProcess: async def test_disabled_notifier_is_noop(self): notifier = _FakeNotifier(enabled=False) svc = _svc(notifier=notifier) - out = await svc.process(monero_down=True, tari_down=False, tari_required=True, - miner_released=True, online_workers=[], workers_expected=False) + out = await svc.process( + monero_down=True, + tari_down=False, + tari_required=True, + miner_released=True, + online_workers=[], + workers_expected=False, + ) assert out == [] assert notifier.sent == [] @@ -154,9 +177,21 @@ async def test_enabled_notifier_dispatches(self): notifier = _FakeNotifier() svc = _svc(notifier=notifier) # seed - await svc.process(monero_down=False, tari_down=False, tari_required=True, - miner_released=True, online_workers=[], workers_expected=False) - out = await svc.process(monero_down=True, tari_down=False, tari_required=True, - miner_released=True, online_workers=[], workers_expected=False) + await svc.process( + monero_down=False, + tari_down=False, + tari_required=True, + miner_released=True, + online_workers=[], + workers_expected=False, + ) + out = await svc.process( + monero_down=True, + tari_down=False, + tari_required=True, + miner_released=True, + online_workers=[], + workers_expected=False, + ) assert _keys(out) == [AlertService.EVT_NODE_DOWN] assert len(notifier.sent) == 1 and "DOWN" in notifier.sent[0] diff --git a/build/dashboard/tests/service/test_telegram_commands.py b/build/dashboard/tests/service/test_telegram_commands.py new file mode 100644 index 0000000..10aa9b0 --- /dev/null +++ 
b/build/dashboard/tests/service/test_telegram_commands.py @@ -0,0 +1,389 @@ +"""Unit tests for the on-demand Telegram command interface (Issue #45). + +Covers command parsing, the pure reply formatters (fed hand-built Metrics), reply routing +(``build_metrics`` stubbed so no DB is touched), single-chat access control, and the +enabled/disabled gating. No network — the transport is stubbed throughout. +""" + +import asyncio +from dataclasses import replace +from types import SimpleNamespace + +import pytest + +from mining_dashboard.service import telegram_commands as tc +from mining_dashboard.service.metrics import Metrics, SyncMetric + +_SYNCED = SyncMetric( + percent=100, current=10, target=10, remaining=0, has_target=True, done=True, down=False +) +_DOWN = SyncMetric( + percent=0, current=0, target=0, remaining=0, has_target=False, done=False, down=True +) +_SYNCING = SyncMetric( + percent=42.5, current=850, target=2000, remaining=1150, has_target=True, done=False, down=False +) + +_BASE = Metrics( + total_h15=10500.0, + p2pool_1h=8000.0, + p2pool_24h=8100.0, + xvb_1h=2100.0, + xvb_24h=2300.0, + xvb_routed_1h=2000.0, + xvb_routed_24h=2050.0, + stratum_h15=10300.0, + stratum_h1h=10400.0, + stratum_h24h=10200.0, + mode="P2POOL", + xvb_enabled=True, + current_tier="Donor", + target_tier="Donor", + target_threshold=1000.0, + target_sustainable=True, + low_hr_warning=False, + xvb_fail_count=0, + xvb_last_update=0, + workers_online=2, + workers_total=3, + shares_in_window=5, + pplns_window=2160, + block_time=10, + pool_type="Mini", + pool_hashrate=120_000_000.0, + pool_difficulty=250_000_000.0, + network_difficulty=380_000_000_000.0, + network_height=3210001, + global_syncing=False, + monero=_SYNCED, + tari=_SYNCED, + monero_mode="Unknown", + tari_mining=True, +) + + +def _metrics(**over): + return replace(_BASE, **over) + + +# --- parse_command ------------------------------------------------------------------------ + + +@pytest.mark.parametrize( + "text,expected", + 
[ + ("/status", "status"), + (" /sync ", "sync"), + ("/HASHRATE", "hashrate"), + ("/status@PitheadBot", "status"), # group @mention suffix stripped + ("/workers now please", "workers"), # only the first word matters + ("/help", "help"), + ("/frobnicate", "unknown"), # a slash command we don't answer + ("hello there", None), # plain chatter is ignored + ("", None), + (None, None), + ("/", None), + ], +) +def test_parse_command(text, expected): + assert tc.parse_command(text) == expected + + +# --- formatters --------------------------------------------------------------------------- + + +def test_status_active(): + out = tc.format_status(_metrics(), mining_active=True) + assert "Monero node: 🟢 synced" in out + assert "Mining: 🟢 active (P2POOL)" in out + assert "Workers: 2/3 online" in out + assert "10.50 kH/s" in out + assert "PPLNS shares: 5 in window" in out + + +def test_status_syncing_beats_mining_flag(): + # While the whole stack is syncing, the reply says "holding", never "active". + out = tc.format_status(_metrics(global_syncing=True), mining_active=True) + assert "holding" in out + assert "active" not in out + + +def test_status_node_down_and_not_mining(): + out = tc.format_status(_metrics(monero=_DOWN), mining_active=False) + assert "Monero node: 🔴 down" in out + assert "Mining: 🔴 not mining" in out + + +def test_hashrate_lists_online_workers_desc(): + workers = [ + {"name": "rig-a", "status": "online", "h15": 3000}, + {"name": "rig-b", "status": "online", "h15": 7000}, + {"name": "rig-c", "status": "offline", "h15": 0}, + ] + out = tc.format_hashrate_reply(_metrics(), workers) + # Highest first, offline excluded. + assert out.index("rig-b") < out.index("rig-a") + assert "rig-c" not in out + + +def test_hashrate_no_online_workers(): + out = tc.format_hashrate_reply(_metrics(), [{"name": "x", "status": "offline"}]) + assert "No workers online." 
in out + + +def test_workers_online_first_with_offline_flagged(): + workers = [ + {"name": "off-1", "status": "offline", "h15": 0}, + {"name": "on-1", "status": "online", "h15": 5000, "uptime": 3661}, + ] + out = tc.format_workers(workers) + lines = out.splitlines() + assert "🟢 on-1" in lines[1] and "up 1h 1m" in lines[1] # online first, uptime shown + assert "🔴 off-1 — offline" in lines[2] + + +def test_workers_empty(): + assert "No workers connected." in tc.format_workers([]) + + +def test_status_node_syncing_percent(): + # _node_state's "syncing %" branch (not down, not done). + out = tc.format_status(_metrics(monero=_SYNCING), mining_active=True) + assert "Monero node: ⏳ syncing 42.5%" in out + + +def test_sync_line_variants(): + out = tc.format_sync(_metrics(monero=_SYNCING, tari=_DOWN)) + assert "Monero: ⏳ 42.5% (850/2,000)" in out + assert "Tari: 🔴 node down" in out + + +def test_sync_line_no_target(): + # A chain that's syncing but hasn't discovered a target height yet. + no_target = SyncMetric( + percent=12.0, current=0, target=0, remaining=0, has_target=False, done=False, down=False + ) + assert "Monero: ⏳ syncing 12.0%" in tc.format_sync(_metrics(monero=no_target)) + + +def test_host_label_prefix(): + assert tc.format_sync(_metrics(), host_label="rig-box").startswith("[rig-box] ") + # The placeholder is never printed. 
+ assert not tc.format_sync(_metrics(), host_label="Unknown Host").startswith("[") + + +# --- reply_for routing -------------------------------------------------------------------- + + +def _bot(monkeypatch, latest_data=None, **over): + monkeypatch.setattr(tc, "build_metrics", lambda data, sm: _metrics(**over)) + ds = SimpleNamespace(latest_data=latest_data or {}, state_manager=object()) + return tc.TelegramCommandBot(ds, enabled=True, bot_token="tok", chat_id="42", host_label="") + + +def test_reply_for_help_and_unknown_need_no_metrics(): + ds = SimpleNamespace(latest_data={}, state_manager=object()) + bot = tc.TelegramCommandBot(ds, enabled=True, bot_token="t", chat_id="1", host_label="") + assert "/status" in bot.reply_for("/help") + assert "Unknown command" in bot.reply_for("/nope") + assert bot.reply_for("just chatting") is None + + +def test_reply_for_status_uses_mining_flag(monkeypatch): + bot = _bot(monkeypatch, latest_data={"miner_released": True, "workers_rejected": False}) + assert "🟢 active" in bot.reply_for("/status") + # Rejected workers (node-down failover) reads as not mining even when released. 
+ bot2 = _bot(monkeypatch, latest_data={"miner_released": True, "workers_rejected": True}) + assert "🔴 not mining" in bot2.reply_for("/status") + + +def test_reply_for_workers_reads_snapshot(monkeypatch): + workers = [{"name": "z", "status": "online", "h15": 1000}] + bot = _bot(monkeypatch, latest_data={"workers": workers}) + assert "z" in bot.reply_for("/workers") + + +# --- enabled gating ----------------------------------------------------------------------- + + +def test_disabled_without_token_or_chat(): + ds = SimpleNamespace(latest_data={}, state_manager=object()) + assert not tc.TelegramCommandBot(ds, enabled=True, bot_token="", chat_id="1").enabled + assert not tc.TelegramCommandBot(ds, enabled=True, bot_token="t", chat_id="").enabled + assert not tc.TelegramCommandBot(ds, enabled=False, bot_token="t", chat_id="1").enabled + assert tc.TelegramCommandBot(ds, enabled=True, bot_token="t", chat_id="1").enabled + + +async def test_run_is_noop_when_disabled(): + ds = SimpleNamespace(latest_data={}, state_manager=object()) + bot = tc.TelegramCommandBot(ds, enabled=False, bot_token="", chat_id="") + # Returns immediately without touching the network — no session, no poll. + await bot.run() + + +# --- access control ----------------------------------------------------------------------- + + +async def test_handle_update_ignores_foreign_chat(monkeypatch): + bot = _bot(monkeypatch) + sent = [] + + async def _record(session, text): + sent.append(text) + + monkeypatch.setattr(bot, "_send", _record) + # chat_id 999 != configured 42 → dropped, nothing sent. 
+ await bot._handle_update(None, {"message": {"chat": {"id": 999}, "text": "/help"}}) + assert sent == [] + + +async def test_handle_update_replies_to_configured_chat(monkeypatch): + bot = _bot(monkeypatch) + sent = [] + + async def _record(session, text): + sent.append(text) + + monkeypatch.setattr(bot, "_send", _record) + await bot._handle_update(None, {"message": {"chat": {"id": 42}, "text": "/help"}}) + assert len(sent) == 1 and "/status" in sent[0] + + +# --- transport (stubbed aiohttp session) -------------------------------------------------- + + +class _Resp: + """Minimal stand-in for an aiohttp response context manager.""" + + def __init__(self, payload=None, raise_on_enter=None, raise_on_status=False): + self._payload = payload or {} + self._raise_on_enter = raise_on_enter + self._raise_on_status = raise_on_status + + async def __aenter__(self): + if self._raise_on_enter: + raise self._raise_on_enter + return self + + async def __aexit__(self, *exc): + return False + + def raise_for_status(self): + if self._raise_on_status: + raise RuntimeError("http error") + + async def json(self): + return self._payload + + +class _Session: + """Stub aiohttp session: hands back queued responses and records calls.""" + + def __init__(self, gets=None, posts=None): + self._gets = list(gets or []) + self._posts = list(posts or []) + self.get_calls = [] + self.post_calls = [] + + def get(self, url, params=None, timeout=None): + self.get_calls.append((url, params)) + return self._gets.pop(0) + + def post(self, url, json=None, timeout=None): + self.post_calls.append((url, json)) + return self._posts.pop(0) + + +def _make_bot(): + ds = SimpleNamespace(latest_data={}, state_manager=object()) + return tc.TelegramCommandBot(ds, enabled=True, bot_token="tok", chat_id="42") + + +async def test_get_updates_parses_results_and_sends_offset(): + bot = _make_bot() + bot._offset = 7 + session = _Session(gets=[_Resp({"ok": True, "result": [{"update_id": 8}]})]) + result = await 
bot._get_updates(session, 0) + assert result == [{"update_id": 8}] + url, params = session.get_calls[0] + assert "bottok" in url and params["offset"] == 7 # token in URL, offset forwarded + + +async def test_get_updates_not_ok_returns_empty(): + bot = _make_bot() + session = _Session(gets=[_Resp({"ok": False})]) + assert await bot._get_updates(session, 0) == [] + + +async def test_prime_offset_skips_backlog(): + bot = _make_bot() + session = _Session(gets=[_Resp({"ok": True, "result": [{"update_id": 3}, {"update_id": 9}]})]) + await bot._prime_offset(session) + assert bot._offset == 10 # past the last pending update + + +async def test_prime_offset_swallows_error(): + bot = _make_bot() + session = _Session(gets=[_Resp(raise_on_enter=OSError("offline"))]) + await bot._prime_offset(session) # must not raise + assert bot._offset is None + + +async def test_send_posts_message(): + bot = _make_bot() + session = _Session(posts=[_Resp({"ok": True})]) + await bot._send(session, "hi") + url, body = session.post_calls[0] + assert "bottok" in url and body["chat_id"] == "42" and body["text"] == "hi" + + +async def test_send_swallows_network_error(): + bot = _make_bot() + session = _Session(posts=[_Resp(raise_on_status=True)]) + await bot._send(session, "hi") # must not raise + + +async def test_run_processes_update_then_honours_cancel(monkeypatch): + bot = _make_bot() + monkeypatch.setattr(bot, "_prime_offset", _async_noop) + handled = [] + + async def _fake_handle(session, update): + handled.append(update) + + calls = {"n": 0} + + async def _fake_get(session, poll_timeout): + calls["n"] += 1 + if calls["n"] == 1: + return [{"update_id": 1}] + raise asyncio.CancelledError + + monkeypatch.setattr(bot, "_handle_update", _fake_handle) + monkeypatch.setattr(bot, "_get_updates", _fake_get) + with pytest.raises(asyncio.CancelledError): + await bot.run() + assert handled == [{"update_id": 1}] and bot._offset == 2 + + +async def test_run_backs_off_on_poll_error(monkeypatch): + bot = 
_make_bot() + monkeypatch.setattr(bot, "_prime_offset", _async_noop) + slept = [] + + async def _sleep(secs): + slept.append(secs) + raise asyncio.CancelledError # break out after the first backoff + + async def _boom(session, poll_timeout): + raise OSError("telegram unreachable") + + monkeypatch.setattr(tc.asyncio, "sleep", _sleep) + monkeypatch.setattr(bot, "_get_updates", _boom) + with pytest.raises(asyncio.CancelledError): + await bot.run() + assert slept == [tc.POLL_ERROR_BACKOFF_SECONDS] + + +async def _async_noop(*args, **kwargs): + return None diff --git a/build/dashboard/tests/service/test_telegram_notifier.py b/build/dashboard/tests/service/test_telegram_notifier.py index f3dd17b..77147c4 100644 --- a/build/dashboard/tests/service/test_telegram_notifier.py +++ b/build/dashboard/tests/service/test_telegram_notifier.py @@ -29,8 +29,8 @@ def test_enabled_flag_off_disables_even_with_creds(self): def test_event_enabled_respects_toggle_and_enabled(self): n = _enabled() assert n.event_enabled("node_down") is True - assert n.event_enabled("node_recovered") is False # toggled off - assert n.event_enabled("worker_offline") is False # absent -> off + assert n.event_enabled("node_recovered") is False # toggled off + assert n.event_enabled("worker_offline") is False # absent -> off # A disabled notifier reports every event as off. 
assert TelegramNotifier(events={"node_down": True}).event_enabled("node_down") is False @@ -55,8 +55,9 @@ def test_send_posts_to_bot_api(self): def test_send_swallows_network_error(self): n = _enabled() - with patch.object(tg_mod.requests, "post", - side_effect=requests.RequestException("offline")): + with patch.object( + tg_mod.requests, "post", side_effect=requests.RequestException("offline") + ): assert n.send("x") is False def test_send_swallows_http_error(self): @@ -74,7 +75,8 @@ def test_token_never_logged_on_failure(self, caplog): n = _enabled(bot_token="SUPERSECRETTOKEN") boom = requests.RequestException( "HTTPSConnectionPool failed for url: " - "https://api.telegram.org/botSUPERSECRETTOKEN/sendMessage") + "https://api.telegram.org/botSUPERSECRETTOKEN/sendMessage" + ) with caplog.at_level("DEBUG"): with patch.object(tg_mod.requests, "post", side_effect=boom): n.send("x") diff --git a/build/dashboard/tests/service/test_worker_presence.py b/build/dashboard/tests/service/test_worker_presence.py index eda9dcc..73b2c02 100644 --- a/build/dashboard/tests/service/test_worker_presence.py +++ b/build/dashboard/tests/service/test_worker_presence.py @@ -3,6 +3,7 @@ class _Clock: """Manually advanced clock for deterministic debounce tests.""" + def __init__(self): self.t = 1000.0 @@ -15,8 +16,9 @@ def advance(self, secs): def _monitor(offline_after=300, recovery_after=120, retention=7 * 24 * 3600): clock = _Clock() - m = WorkerPresenceMonitor(offline_after=offline_after, recovery_after=recovery_after, - retention=retention, clock=clock) + m = WorkerPresenceMonitor( + offline_after=offline_after, recovery_after=recovery_after, retention=retention, clock=clock + ) return m, clock @@ -37,7 +39,7 @@ def test_steady_online_emits_nothing(self): class TestOfflineDebounce: def test_not_offline_before_threshold(self): m, clock = _monitor() - m.update({"rig-1"}) # baseline online + m.update({"rig-1"}) # baseline online clock.advance(30) assert m.update(set()) == [] # absence 
streak starts here (within debounce) clock.advance(269) @@ -46,7 +48,7 @@ def test_not_offline_before_threshold(self): def test_offline_after_threshold(self): m, clock = _monitor() m.update({"rig-1"}) - m.update(set()) # absence streak starts here + m.update(set()) # absence streak starts here clock.advance(300) assert m.update(set()) == [("rig-1", "offline")] @@ -62,8 +64,10 @@ def test_offline_emitted_once(self): def test_brief_drop_does_not_trip(self): m, clock = _monitor() m.update({"rig-1"}) - clock.advance(60); assert m.update(set()) == [] # gone 60s - clock.advance(30); assert m.update({"rig-1"}) == [] # back well before 300s + clock.advance(60) + assert m.update(set()) == [] # gone 60s + clock.advance(30) + assert m.update({"rig-1"}) == [] # back well before 300s class TestRecoveryHysteresis: @@ -78,16 +82,21 @@ def test_recovered_only_after_stable_window(self): self._take_offline(m, clock) # Reappears, but "back online" holds until it's been present for recovery_after. assert m.update({"rig-1"}) == [] - clock.advance(119); assert m.update({"rig-1"}) == [] - clock.advance(1); assert m.update({"rig-1"}) == [("rig-1", "recovered")] + clock.advance(119) + assert m.update({"rig-1"}) == [] + clock.advance(1) + assert m.update({"rig-1"}) == [("rig-1", "recovered")] def test_flap_during_recovery_does_not_emit(self): # A one-cycle reconnect during an outage must not produce a recovered→offline spam. 
m, clock = _monitor() self._take_offline(m, clock) - clock.advance(30); assert m.update({"rig-1"}) == [] # blink on (still offline) - clock.advance(30); assert m.update(set()) == [] # blink off — no recovered, no re-offline - clock.advance(30); assert m.update(set()) == [] + clock.advance(30) + assert m.update({"rig-1"}) == [] # blink on (still offline) + clock.advance(30) + assert m.update(set()) == [] # blink off — no recovered, no re-offline + clock.advance(30) + assert m.update(set()) == [] class TestMultipleWorkers: @@ -118,9 +127,10 @@ def test_long_absent_worker_is_forgotten(self): m, clock = _monitor(retention=1000) m.update({"rig-1"}) m.update(set()) - clock.advance(300); m.update(set()) # offline emitted + clock.advance(300) + m.update(set()) # offline emitted clock.advance(1000) - m.update(set()) # past retention -> pruned + m.update(set()) # past retention -> pruned assert "rig-1" not in m._workers # Returning after retention counts as new: silent baseline, not a recovery. assert m.update({"rig-1"}) == [] diff --git a/docs/telegram.md b/docs/telegram.md index 5d90c93..7076215 100644 --- a/docs/telegram.md +++ b/docs/telegram.md @@ -1,12 +1,15 @@ -# Telegram Alerts +# Telegram Bot Pithead can push a small set of **operational alerts** to Telegram, so you find out the moment -something needs attention — without sitting on the dashboard. It's **off by default**; this guide -takes you from nothing to a working alert in about five minutes. +something needs attention — without sitting on the dashboard. It can also answer a few **read-only +status commands** on demand, so you can check on the stack from your phone. Both are **off by +default**; this guide takes you from nothing to a working alert in about five minutes, then adds +commands if you want them. -> **What this is — and isn't.** This is a **notifications-only** push: the stack sends you -> messages, you don't send it commands. There's no interactive bot, no `/status` command, no -> remote control. 
(That's a separate, later feature.) Think of it as a pager, not a chat. +> **What this is — and isn't.** Alerts are a one-way push (a pager). Commands are **read-only** — +> `/status`, `/hashrate`, and friends report what the dashboard already knows; **nothing controls +> the stack over Telegram** (start/stop/`apply` stay on the CLI). So the worst a leaked chat can do +> is *read* your status, never change anything. --- @@ -43,9 +46,11 @@ the messages go). Both come from Telegram, in a few taps. 2. Send `/newbot` and follow the prompts — pick a name and a username (the username must end in `bot`, e.g. `my_pithead_bot`). 3. BotFather replies with a **token** that looks like: + ``` 123456789:AAExampleExampleExampleExampleExample ``` + This is your `bot_token`. **Treat it like a password** — anyone with it can post as your bot. ### 2. Pick where alerts go (a group is recommended) @@ -142,6 +147,45 @@ Run `./pithead apply` after editing. --- +## Commands + +Beyond alerts, the bot can answer **status queries on demand** — ask it how things are and it +replies with what the dashboard already knows. This is a **separate opt-in** from alerts: turn it on +by adding a `commands` block. + +```json +"telegram": { + "enabled": true, + "bot_token": "…", + "chat_id": "…", + "commands": { "enabled": true } +} +``` + +Run `./pithead apply` after editing. The commands: + +| Command | Reply | +|---|---| +| `/status` | One-glance health: each node up/down/syncing, whether mining is active, workers online, total hashrate, PPLNS shares in window. | +| `/hashrate` | Total hashrate plus a per-rig breakdown of everything currently online. | +| `/workers` | Every rig's online/offline state, with uptime for the ones that are up. | +| `/sync` | Monero and Tari sync progress (percent and block height). | +| `/help` | The command list. | + +The numbers come from the **same source as the dashboard**, so a reply and the web view always +agree. 
In a group, address the bot directly if you like — `/status@your_pithead_bot` works too. + +**Only the configured `chat_id` is answered.** A message from any other chat is ignored with no +reply, so the bot can't be used by anyone you haven't put in that chat. The bot **long-polls** +Telegram (`getUpdates`) rather than exposing a webhook, so it needs **no inbound port** and rides +the same outbound path as the alerts — nothing about your host is exposed to receive commands. + +> **Tor-only host.** Like alerts, commands reach `api.telegram.org` over clearnet. With no clearnet +> egress the poll just fails silently and the bot answers nothing — see +> [Privacy and secrets](#privacy-and-secrets). + +--- + ## One chat, two bots Pithead's companion **Healthchecks.io** monitor (a "dead-man's switch" that detects the whole host @@ -175,11 +219,12 @@ differently.) - **The bot token is a secret.** Pithead stores it in `.env`, which is created **owner-only** (`chmod 600`) and is **git-ignored**, exactly like the Monero node RPC password. The dashboard **never writes the token to a log line** — not even inside an error message. -- **Telegram is a clearnet service.** The dashboard reaches `api.telegram.org` directly. On a - **Tor-only host with no clearnet egress**, the Telegram API is unreachable and sends simply - **fail silently** — no errors, no log spam, the rest of the stack is unaffected. (Same applies if - your network blocks Telegram.) If you run Tor-only and want these alerts, you'll need clearnet - egress for the dashboard, or rely on Healthchecks.io's own delivery. +- **Telegram is a clearnet service.** The dashboard reaches `api.telegram.org` directly — both to + send alerts and (if commands are on) to poll for them. On a **Tor-only host with no clearnet + egress**, the Telegram API is unreachable and both **fail silently** — no errors, no log spam, the + rest of the stack is unaffected. (Same applies if your network blocks Telegram.) 
If you run + Tor-only and want these, you'll need clearnet egress for the dashboard, or rely on Healthchecks.io's + own delivery. --- @@ -206,7 +251,9 @@ Node-down timing is shared with the existing failover logic (`NODE_DOWN_AFTER_SE | Still nothing | Make sure the bot has been **added to the group** (or that you sent it `/start` for a direct chat). A bot can't message a chat it isn't in. | | `chat_id` looks wrong | Group ids are **negative** and long (`-100…`). Re-check with `@userinfobot`. | | Works for "down" but not a specific alert | Check `telegram.events` — that event may be toggled `false`. | -| Tor-only host | Expected: Telegram is clearnet, so sends fail silently. See [Privacy and secrets](#privacy-and-secrets). | +| Alerts work but commands don't | Commands are a **separate** switch: set `telegram.commands.enabled` to `true` and `./pithead apply`. | +| Bot ignores my commands | It only answers the configured `chat_id`. Send from that exact chat, and check the id with `@userinfobot`. | +| Tor-only host | Expected: Telegram is clearnet, so both alerts and command polling fail silently. See [Privacy and secrets](#privacy-and-secrets). | --- diff --git a/docs/test-inventory.md b/docs/test-inventory.md index 727faa5..f1b80a4 100644 --- a/docs/test-inventory.md +++ b/docs/test-inventory.md @@ -4,7 +4,7 @@ _Generated by `make test-inventory` ([`tests/inventory.sh`](../tests/inventory.s edit by hand** — re-run the target to refresh. See [Testing Strategy](testing-strategy.md) for how the tiers fit together._ -**Totals:** 630 dashboard unit tests · 12 contract tests · 64 frontend +**Totals:** 692 dashboard unit tests · 12 contract tests · 64 frontend tests · 52 `pithead` shell sections · 18 harness self-test sections · 9 live config scenarios (17 axis values) · 8 mini-stack scenarios. 
@@ -14,7 +14,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · | Tier | Suite | Cases | |---|---|---| -| 1 — Unit | dashboard pytest | 630 | +| 1 — Unit | dashboard pytest | 692 | | 1 — Unit | frontend (node --test) | 64 | | 1 — Unit | `pithead` shell suite | 52 sections | | 1 — Unit | compose interpolation + hardening (#90) | 1 | @@ -27,7 +27,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · ## Tier 1 — Unit & component -### Dashboard (pytest) — 630 tests +### Dashboard (pytest) — 692 tests #### tests/client/test_docker_control.py — 6 - test_tcp_scheme_rewritten_to_http @@ -210,6 +210,23 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_none_when_no_route - test_socket_is_closed_even_on_error +#### tests/service/test_alert_service.py — 15 +- test_first_cycle_seeds_baseline_silently +- test_down_then_recovered +- test_node_text_names_the_chain +- test_non_blocking_tari_does_not_alert +- test_no_stale_edge_when_tari_becomes_required +- test_required_tari_alerts +- test_fires_once_when_gate_opens +- test_no_alert_on_restart_after_sync +- test_offline_then_recovered +- test_not_expected_resets_and_silences +- test_disabled_events_are_dropped +- test_prefixes_when_set +- test_placeholder_host_is_not_prefixed +- test_disabled_notifier_is_noop +- test_enabled_notifier_dispatches + #### tests/service/test_algo_service.py — 38 - test_xvb_disabled_forces_p2pool - test_zero_shares_forces_p2pool @@ -506,6 +523,46 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_history_older_than_retention_pruned_from_memory - test_old_history_pruned_from_db_when_cleanup_fires +#### tests/service/test_telegram_commands.py — 27 +- test_parse_command +- test_status_active +- test_status_syncing_beats_mining_flag +- test_status_node_down_and_not_mining +- test_hashrate_lists_online_workers_desc +- test_hashrate_no_online_workers +- test_workers_online_first_with_offline_flagged +- 
test_workers_empty +- test_status_node_syncing_percent +- test_sync_line_variants +- test_sync_line_no_target +- test_host_label_prefix +- test_reply_for_help_and_unknown_need_no_metrics +- test_reply_for_status_uses_mining_flag +- test_reply_for_workers_reads_snapshot +- test_disabled_without_token_or_chat +- test_run_is_noop_when_disabled +- test_handle_update_ignores_foreign_chat +- test_handle_update_replies_to_configured_chat +- test_get_updates_parses_results_and_sends_offset +- test_get_updates_not_ok_returns_empty +- test_prime_offset_skips_backlog +- test_prime_offset_swallows_error +- test_send_posts_message +- test_send_swallows_network_error +- test_run_processes_update_then_honours_cancel +- test_run_backs_off_on_poll_error + +#### tests/service/test_telegram_notifier.py — 9 +- test_disabled_by_default +- test_enabled_requires_token_and_chat +- test_enabled_flag_off_disables_even_with_creds +- test_event_enabled_respects_toggle_and_enabled +- test_send_noop_when_disabled +- test_send_posts_to_bot_api +- test_send_swallows_network_error +- test_send_swallows_http_error +- test_token_never_logged_on_failure + #### tests/service/test_update_checker.py — 16 - test_accepts_plain_and_v_prefixed - test_ignores_prerelease_and_build_suffix @@ -524,6 +581,19 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_failed_fetch_keeps_previous_result - test_up_to_date_yields_none +#### tests/service/test_worker_presence.py — 11 +- test_first_sighting_is_silent +- test_steady_online_emits_nothing +- test_not_offline_before_threshold +- test_offline_after_threshold +- test_offline_emitted_once +- test_brief_drop_does_not_trip +- test_recovered_only_after_stable_window +- test_flap_during_recovery_does_not_emit +- test_independent_per_worker_state +- test_reset_clears_state_and_rebaselines_silently +- test_long_absent_worker_is_forgotten + #### tests/sim/test_donation_model.py — 10 - test_holds_tier_without_overshoot - 
test_no_windup_from_cold_start @@ -995,5 +1065,5 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · --- -_Grand total: **793** enumerated cases/sections across the four tiers (plus the live +_Grand total: **855** enumerated cases/sections across the four tiers (plus the live lifecycle and fault-injection phases, which are exercised on a real server)._ diff --git a/tests/stack/run.sh b/tests/stack/run.sh index f6cc476..cb42baa 100755 --- a/tests/stack/run.sh +++ b/tests/stack/run.sh @@ -262,7 +262,7 @@ esac # Telegram (#121): toggles/events are a brief dashboard restart (INFO); the bot token is a secret, # so its change line must NOT echo the old/new value. assert_contains "telegram enable is INFO" "$(run_sourced "$SANDBOX" describe_change TELEGRAM_ENABLED false true)" "INFO" -assert_contains "telegram event is INFO" "$(run_sourced "$SANDBOX" describe_change TELEGRAM_EVENT_NODE_DOWN true false)" "INFO" +assert_contains "telegram event is INFO" "$(run_sourced "$SANDBOX" describe_change TELEGRAM_EVENT_NODE_DOWN true false)" "INFO" tg_tok_msg="$(run_sourced "$SANDBOX" describe_change TELEGRAM_BOT_TOKEN oldsecret newsecret)" assert_contains "telegram token change noted" "$tg_tok_msg" "Telegram bot token updated" case "$tg_tok_msg" in @@ -1517,33 +1517,33 @@ assert_eq "check_for_updates opt-out propagated false" "$(run_sourced "$V" env_g # Telegram defaults (#121): no telegram block => disabled, per-event toggles default on. 
seed_env -printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","node_password":"p"}, "tari":{"wallet_address":"T"}, "p2pool":{"pool":"mini"}, "dashboard":{"secure":false,"host":"box.lan"} }\n' "$WALLET" > "$V/config.json" +printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","node_password":"p"}, "tari":{"wallet_address":"T"}, "p2pool":{"pool":"mini"}, "dashboard":{"secure":false,"host":"box.lan"} }\n' "$WALLET" >"$V/config.json" out="$(cd "$V" && DOCKER_LOG="$DOCKER_LOG" PATH="$V/bin:$PATH" ./pithead apply -y 2>&1)" -assert_eq "telegram disabled by default" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_ENABLED)" "false" -assert_eq "telegram event defaults on" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_EVENT_NODE_DOWN)" "true" +assert_eq "telegram disabled by default" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_ENABLED)" "false" +assert_eq "telegram event defaults on" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_EVENT_NODE_DOWN)" "true" # Telegram enabled: token/chat_id + per-event toggles propagate from config.json into .env. 
seed_env -printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","node_password":"p"}, "tari":{"wallet_address":"T"}, "p2pool":{"pool":"mini"}, "dashboard":{"secure":false,"host":"box.lan"}, "telegram":{"enabled":true,"bot_token":"BOTSECRET","chat_id":"-100123","events":{"worker_offline":false}} }\n' "$WALLET" > "$V/config.json" +printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","node_password":"p"}, "tari":{"wallet_address":"T"}, "p2pool":{"pool":"mini"}, "dashboard":{"secure":false,"host":"box.lan"}, "telegram":{"enabled":true,"bot_token":"BOTSECRET","chat_id":"-100123","events":{"worker_offline":false}} }\n' "$WALLET" >"$V/config.json" out="$(cd "$V" && DOCKER_LOG="$DOCKER_LOG" PATH="$V/bin:$PATH" ./pithead apply -y 2>&1)" -assert_eq "telegram enabled propagated" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_ENABLED)" "true" -assert_eq "telegram token propagated" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_BOT_TOKEN)" "BOTSECRET" -assert_eq "telegram chat_id propagated" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_CHAT_ID)" "-100123" +assert_eq "telegram enabled propagated" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_ENABLED)" "true" +assert_eq "telegram token propagated" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_BOT_TOKEN)" "BOTSECRET" +assert_eq "telegram chat_id propagated" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_CHAT_ID)" "-100123" assert_eq "telegram per-event override off" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_EVENT_WORKER_OFFLINE)" "false" -assert_eq "telegram unset event stays on" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_EVENT_NODE_DOWN)" "true" +assert_eq "telegram unset event stays on" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_EVENT_NODE_DOWN)" "true" # The bot token is a secret: the apply preview must not print it. 
case "$out" in - *BOTSECRET*) bad "telegram token not printed by apply" "leaked in: $out" ;; - *) ok "telegram token not printed by apply" ;; +*BOTSECRET*) bad "telegram token not printed by apply" "leaked in: $out" ;; +*) ok "telegram token not printed by apply" ;; esac # Interactive command interface (#45): off by default, opt-in via telegram.commands.enabled. seed_env -printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","node_password":"p"}, "tari":{"wallet_address":"T"}, "p2pool":{"pool":"mini"}, "dashboard":{"secure":false,"host":"box.lan"} }\n' "$WALLET" > "$V/config.json" +printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","node_password":"p"}, "tari":{"wallet_address":"T"}, "p2pool":{"pool":"mini"}, "dashboard":{"secure":false,"host":"box.lan"} }\n' "$WALLET" >"$V/config.json" out="$(cd "$V" && DOCKER_LOG="$DOCKER_LOG" PATH="$V/bin:$PATH" ./pithead apply -y 2>&1)" assert_eq "telegram commands off by default" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_COMMANDS_ENABLED)" "false" seed_env -printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","node_password":"p"}, "tari":{"wallet_address":"T"}, "p2pool":{"pool":"mini"}, "dashboard":{"secure":false,"host":"box.lan"}, "telegram":{"enabled":true,"bot_token":"BOTSECRET","chat_id":"-100123","commands":{"enabled":true}} }\n' "$WALLET" > "$V/config.json" +printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","node_password":"p"}, "tari":{"wallet_address":"T"}, "p2pool":{"pool":"mini"}, "dashboard":{"secure":false,"host":"box.lan"}, "telegram":{"enabled":true,"bot_token":"BOTSECRET","chat_id":"-100123","commands":{"enabled":true}} }\n' "$WALLET" >"$V/config.json" out="$(cd "$V" && DOCKER_LOG="$DOCKER_LOG" PATH="$V/bin:$PATH" ./pithead apply -y 2>&1)" assert_eq "telegram commands opt-in propagated" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_COMMANDS_ENABLED)" "true" From 
b940ee629e3cde93782f4349caab9af826e8ed2d Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Thu, 2 Jul 2026 16:35:20 -0500 Subject: [PATCH 03/18] refactor(#121): base worker-offline on the DOWN status, not list-absence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per feedback: the dashboard already computes each rig's DOWN state (status != online, shown in the uptime column) and keeps the row visible ~1h before it falls off the proxy table (#182). Drive the worker-offline debounce off that same status instead of inferring offline from absence in the online-name set — so a Telegram 'offline' alert lines up with the rig showing DOWN on screen, and a rig that vanishes from the table entirely is forgotten (the dashboard no longer shows it) rather than aged into a false offline. WorkerPresenceMonitor.update now takes the worker rows (name+status); offline fires while DOWN for offline_after, recovered after recovery_after online, forgotten when the proxy stops listing it. Drops the redundant retention timer (WORKER_RETENTION_SEC now removed — it mirrored the lifecycle's own falloff). worker_presence.py 100% covered; make test green; patch coverage 94%. 
Co-Authored-By: Claude Opus 4.8 --- .../mining_dashboard/config/config.py | 4 - .../mining_dashboard/service/alert_service.py | 7 +- .../mining_dashboard/service/data_service.py | 6 +- .../service/worker_presence.py | 130 ++++++++---------- .../tests/service/test_alert_service.py | 47 ++++--- .../tests/service/test_worker_presence.py | 123 ++++++++++------- docs/test-inventory.md | 18 +-- 7 files changed, 181 insertions(+), 154 deletions(-) diff --git a/build/dashboard/mining_dashboard/config/config.py b/build/dashboard/mining_dashboard/config/config.py index 619dbe3..3d8c385 100644 --- a/build/dashboard/mining_dashboard/config/config.py +++ b/build/dashboard/mining_dashboard/config/config.py @@ -367,10 +367,6 @@ def _telegram_event_enabled(name, default=True): # --- Data Retention Policies --- HISTORY_RETENTION_SEC = 30 * 24 * 3600 # 30 Days -# Retention for the known_workers persistence layer removed in #144. No live consumer in the current -# tree; kept for the deferred Telegram worker-presence monitor (#121), which reuses it as its -# retention default — consult that work before removing. -WORKER_RETENTION_SEC = 7 * 24 * 3600 # 7 Days # How long an offline worker lingers in the live "Workers Alive" table before it falls off (#182). # Operates on the live proxy-sourced list. A reconnect re-adds the worker. 1h keeps a # just-disconnected rig visible (shown as DOWN) but clears ghosts. 
diff --git a/build/dashboard/mining_dashboard/service/alert_service.py b/build/dashboard/mining_dashboard/service/alert_service.py index f3f0402..962ab27 100644 --- a/build/dashboard/mining_dashboard/service/alert_service.py +++ b/build/dashboard/mining_dashboard/service/alert_service.py @@ -75,7 +75,7 @@ def evaluate( tari_down, tari_required, miner_released, - online_workers, + workers, workers_expected, now=None, ): @@ -104,12 +104,13 @@ def evaluate( ) self._prev_released = miner_released - # --- Worker offline / back online (debounced) --- + # --- Worker offline / back online (debounced off the DOWN status) --- + # Driven by each rig's status in the same worker rows the dashboard shows (DOWN = offline). # Only meaningful while workers are actually expected: when the proxy is intentionally # stopped (initial sync hold, or node-down failover) their absence is by design, so we # reset the tracker instead of aging every rig into a false "offline". if workers_expected: - for name, event in self.workers.update(online_workers, now=now): + for name, event in self.workers.update(workers, now=now): if event == "offline": alerts.append( (self.EVT_WORKER_OFFLINE, self._fmt(f"\U0001f534 Worker offline: {name}")) diff --git a/build/dashboard/mining_dashboard/service/data_service.py b/build/dashboard/mining_dashboard/service/data_service.py index 0aeed63..73e0433 100644 --- a/build/dashboard/mining_dashboard/service/data_service.py +++ b/build/dashboard/mining_dashboard/service/data_service.py @@ -788,9 +788,9 @@ async def run(self): tari_down=tari_down, tari_required=TARI_REQUIRED, miner_released=self.miner_released, - online_workers=[ - w["name"] for w in final_workers if w.get("status") == "online" - ], + # The same worker rows the dashboard shows; the monitor reads each rig's + # status (DOWN = offline) so alerts line up with the on-screen state. 
+ workers=final_workers, workers_expected=self.miner_released and not self.workers_rejected, ) diff --git a/build/dashboard/mining_dashboard/service/worker_presence.py b/build/dashboard/mining_dashboard/service/worker_presence.py index ef55c93..3d7858b 100644 --- a/build/dashboard/mining_dashboard/service/worker_presence.py +++ b/build/dashboard/mining_dashboard/service/worker_presence.py @@ -3,7 +3,6 @@ from mining_dashboard.config.config import ( WORKER_OFFLINE_AFTER_SEC, WORKER_RECOVERY_AFTER_SEC, - WORKER_RETENTION_SEC, ) @@ -11,88 +10,90 @@ class WorkerPresenceMonitor: """ Per-worker, flap-protected offline/online tracker (Issue #121). - The alerter needs a stable "rig-3 went offline / rig-3 is back" signal, but the raw - per-cycle worker list is noisy: xmrig-proxy keeps a disconnected worker around with a - decaying hashrate, miners briefly reconnect, and a worker can drop out of the list - entirely. This is the per-worker analogue of ``NodeHealthMonitor`` (#31), multiplexed - over many workers keyed by name: - - - **Debounce / hysteresis.** A worker must be *unseen* for ``offline_after`` seconds - before it's declared OFFLINE, and *seen continuously* for ``recovery_after`` seconds - before OFFLINE clears — so a single dropped poll, or a brief reconnect during an - outage, can't emit a recovered→offline spam. - - **Silent baseline.** A worker's first sighting registers it as ONLINE with no edge — a - brand-new rig is not a "recovery", and a dashboard restart must not replay every known - worker as a fresh alert. - - **Bounded memory.** A worker gone longer than ``retention`` is forgotten, so state stays - bounded and a long-absent rig that returns counts as new (mirrors ``StateManager``'s - 7-day worker retention, the persisted analogue of this in-memory tracker). - - :meth:`update` takes the set of worker names seen *online* this cycle and returns a list - of ``(name, event)`` edges, ``event`` in ``{"offline", "recovered"}``. 
- - Clock defaults to wall-clock ``time.time`` (so it lines up with ``last_seen`` semantics); - injectable for deterministic tests. + The alerter needs a stable "rig-3 went offline / rig-3 is back" signal. It's driven off the + **same worker rows the dashboard shows** — each rig's ``status`` (``"online"`` when xmrig-proxy + reports live connections, else the ``DOWN`` state the UI renders). This is the per-worker + analogue of ``NodeHealthMonitor`` (#31), multiplexed over many workers keyed by name, so a + Telegram "offline" alert lines up with the rig showing **DOWN** on screen: + + - **DOWN drives offline.** A rig is offline-pending while it's shown DOWN (listed by the proxy + but not connected). Once it's been DOWN continuously for ``offline_after`` it's declared + OFFLINE; once it's been back online for ``recovery_after`` the OFFLINE clears — so a single + dropped poll or a brief reconnect can't spam recovered→offline. + - **Silent baseline.** A rig's first sighting registers its current state with no edge — a + brand-new rig is not a "recovery", and a rig already DOWN at dashboard start is not a fresh + "offline" (a restart must not replay a stale transition). + - **Follows the table.** A rig the proxy stops listing entirely (aged off the worker table, + #182) is forgotten — the dashboard no longer shows it, DOWN or otherwise, so neither does the + alerter. Its state is bounded by exactly what's on screen; if it later returns it re-baselines + silently. + + :meth:`update` takes this cycle's worker rows (dicts with ``name`` + ``status``) and returns a + list of ``(name, event)`` edges, ``event`` in ``{"offline", "recovered"}``. + + Clock defaults to wall-clock ``time.time``; injectable for deterministic tests. 
""" def __init__( self, offline_after=WORKER_OFFLINE_AFTER_SEC, recovery_after=WORKER_RECOVERY_AFTER_SEC, - retention=WORKER_RETENTION_SEC, clock=time.time, ): self.offline_after = offline_after self.recovery_after = recovery_after - self.retention = retention self._clock = clock - # name -> {state, seen_since, unseen_since, last_present} + # name -> {state, online_since, down_since} # state : "online" | "offline" (the debounced, edge-emitting state) - # seen_since : when the current continuous-presence streak began (None while absent) - # unseen_since : when the current continuous-absence streak began (None while present) - # last_present : last cycle the worker was seen (drives retention pruning) + # online_since : when the current continuous-online streak began (None while DOWN) + # down_since : when the current continuous-DOWN streak began (None while online) self._workers = {} - def update(self, present, now=None): - """Feed this cycle's online worker names; return the list of debounced transitions.""" + def update(self, workers, now=None): + """Feed this cycle's worker rows; return the list of debounced transitions.""" now = self._clock() if now is None else now - present = set(present) + online = {w.get("name") for w in workers if w.get("status") == "online"} + present = {w.get("name") for w in workers} edges = [] - # Present workers: refresh the presence streak, clear OFFLINE once it's stable. + # Forget rigs the proxy no longer lists at all — they've fallen off the worker table and + # the dashboard no longer shows them, so the alerter drops them too (a later return + # re-baselines silently). + for name in list(self._workers): + if name not in present: + del self._workers[name] + for name in present: - w = self._workers.get(name) - if w is None: - # First sighting — baseline as ONLINE, no edge. 
- self._workers[name] = { - "state": "online", - "seen_since": now, - "unseen_since": None, - "last_present": now, - } - continue - w["last_present"] = now - w["unseen_since"] = None - if w["seen_since"] is None: - w["seen_since"] = now - if w["state"] == "offline" and (now - w["seen_since"]) >= self.recovery_after: + self._step(name, name in online, now, edges) + + return edges + + def _step(self, name, is_online, now, edges): + w = self._workers.get(name) + if w is None: + # First sighting — baseline to the rig's current state, no edge. + self._workers[name] = { + "state": "online" if is_online else "offline", + "online_since": now if is_online else None, + "down_since": None if is_online else now, + } + return + + if is_online: + w["down_since"] = None + if w["online_since"] is None: + w["online_since"] = now + if w["state"] == "offline" and (now - w["online_since"]) >= self.recovery_after: w["state"] = "online" edges.append((name, "recovered")) - - # Absent workers: age the absence streak, declare OFFLINE once past the threshold. - for name, w in self._workers.items(): - if name in present: - continue - w["seen_since"] = None - if w["unseen_since"] is None: - w["unseen_since"] = now - if w["state"] == "online" and (now - w["unseen_since"]) >= self.offline_after: + else: + w["online_since"] = None + if w["down_since"] is None: + w["down_since"] = now + if w["state"] == "online" and (now - w["down_since"]) >= self.offline_after: w["state"] = "offline" edges.append((name, "offline")) - self._prune(now) - return edges - def reset(self): """Drop all per-worker state. @@ -101,12 +102,3 @@ def reset(self): into false "offline" alerts, and re-admission re-baselines every worker silently. 
""" self._workers.clear() - - def _prune(self, now): - stale = [ - name - for name, w in self._workers.items() - if w["unseen_since"] is not None and (now - w["last_present"]) >= self.retention - ] - for name in stale: - del self._workers[name] diff --git a/build/dashboard/tests/service/test_alert_service.py b/build/dashboard/tests/service/test_alert_service.py index 5defaed..46391dd 100644 --- a/build/dashboard/tests/service/test_alert_service.py +++ b/build/dashboard/tests/service/test_alert_service.py @@ -34,7 +34,7 @@ def _ev( tari_down=False, tari_required=True, miner_released=True, - online_workers=(), + workers=(), workers_expected=False, now=0, ): @@ -43,7 +43,7 @@ def _ev( tari_down=tari_down, tari_required=tari_required, miner_released=miner_released, - online_workers=list(online_workers), + workers=list(workers), workers_expected=workers_expected, now=now, ) @@ -53,6 +53,16 @@ def _keys(alerts): return [k for k, _ in alerts] +def _on(*names): + """Worker rows the proxy reports online.""" + return [{"name": n, "status": "online"} for n in names] + + +def _down(*names): + """Worker rows still listed but disconnected — the DOWN state the dashboard shows.""" + return [{"name": n, "status": "offline"} for n in names] + + class TestNodeEdges: def test_first_cycle_seeds_baseline_silently(self): svc = _svc() @@ -113,35 +123,36 @@ def test_no_alert_on_restart_after_sync(self): class TestWorkerEdges: def test_offline_then_recovered(self): + # Offline is driven by the DOWN status the dashboard shows, not by the rig vanishing. 
svc = _svc() - assert _ev(svc, online_workers=["rig-1"], workers_expected=True, now=0) == [] - assert _ev(svc, online_workers=[], workers_expected=True, now=0) == [] - assert _keys(_ev(svc, online_workers=[], workers_expected=True, now=300)) == [ + assert _ev(svc, workers=_on("rig-1"), workers_expected=True, now=0) == [] + assert _ev(svc, workers=_down("rig-1"), workers_expected=True, now=0) == [] + assert _keys(_ev(svc, workers=_down("rig-1"), workers_expected=True, now=300)) == [ AlertService.EVT_WORKER_OFFLINE ] - _ev(svc, online_workers=["rig-1"], workers_expected=True, now=300) - assert _keys(_ev(svc, online_workers=["rig-1"], workers_expected=True, now=420)) == [ + _ev(svc, workers=_on("rig-1"), workers_expected=True, now=300) + assert _keys(_ev(svc, workers=_on("rig-1"), workers_expected=True, now=420)) == [ AlertService.EVT_WORKER_RECOVERED ] def test_not_expected_resets_and_silences(self): svc = _svc() - _ev(svc, online_workers=["rig-1"], workers_expected=True, now=0) - _ev(svc, online_workers=[], workers_expected=True, now=0) - _ev(svc, online_workers=[], workers_expected=True, now=300) # rig-1 now offline + _ev(svc, workers=_on("rig-1"), workers_expected=True, now=0) + _ev(svc, workers=_down("rig-1"), workers_expected=True, now=0) + _ev(svc, workers=_down("rig-1"), workers_expected=True, now=300) # rig-1 now offline # Proxy intentionally stopped (sync hold / failover): reset, no alert. - assert _ev(svc, online_workers=[], workers_expected=False, now=330) == [] + assert _ev(svc, workers=[], workers_expected=False, now=330) == [] # Re-admission re-baselines silently — no spurious "recovered". 
- assert _ev(svc, online_workers=["rig-1"], workers_expected=True, now=360) == [] + assert _ev(svc, workers=_on("rig-1"), workers_expected=True, now=360) == [] class TestEventFiltering: def test_disabled_events_are_dropped(self): svc = _svc(notifier=_FakeNotifier(allow={AlertService.EVT_NODE_DOWN})) - _ev(svc, online_workers=["rig-1"], workers_expected=True, now=0) - _ev(svc, online_workers=[], workers_expected=True, now=0) + _ev(svc, workers=_on("rig-1"), workers_expected=True, now=0) + _ev(svc, workers=_down("rig-1"), workers_expected=True, now=0) # worker_offline is computed but filtered out because it's not in the allow-set. - assert _ev(svc, online_workers=[], workers_expected=True, now=300) == [] + assert _ev(svc, workers=_down("rig-1"), workers_expected=True, now=300) == [] class TestHostLabel: @@ -167,7 +178,7 @@ async def test_disabled_notifier_is_noop(self): tari_down=False, tari_required=True, miner_released=True, - online_workers=[], + workers=[], workers_expected=False, ) assert out == [] @@ -182,7 +193,7 @@ async def test_enabled_notifier_dispatches(self): tari_down=False, tari_required=True, miner_released=True, - online_workers=[], + workers=[], workers_expected=False, ) out = await svc.process( @@ -190,7 +201,7 @@ async def test_enabled_notifier_dispatches(self): tari_down=False, tari_required=True, miner_released=True, - online_workers=[], + workers=[], workers_expected=False, ) assert _keys(out) == [AlertService.EVT_NODE_DOWN] diff --git a/build/dashboard/tests/service/test_worker_presence.py b/build/dashboard/tests/service/test_worker_presence.py index 73b2c02..38abad0 100644 --- a/build/dashboard/tests/service/test_worker_presence.py +++ b/build/dashboard/tests/service/test_worker_presence.py @@ -14,123 +14,148 @@ def advance(self, secs): self.t += secs -def _monitor(offline_after=300, recovery_after=120, retention=7 * 24 * 3600): +def _monitor(offline_after=300, recovery_after=120): clock = _Clock() m = WorkerPresenceMonitor( - 
offline_after=offline_after, recovery_after=recovery_after, retention=retention, clock=clock + offline_after=offline_after, recovery_after=recovery_after, clock=clock ) return m, clock +def _on(*names): + """Worker rows the proxy reports online.""" + return [{"name": n, "status": "online"} for n in names] + + +def _down(*names): + """Worker rows still listed by the proxy but disconnected — the DOWN state the UI shows.""" + return [{"name": n, "status": "offline"} for n in names] + + class TestBaseline: - def test_first_sighting_is_silent(self): - # A brand-new worker is registered ONLINE with no edge — it's not a "recovery". + def test_first_sighting_online_is_silent(self): + # A brand-new worker is baselined ONLINE with no edge — it's not a "recovery". + m, _ = _monitor() + assert m.update(_on("rig-1")) == [] + + def test_first_sighting_down_is_silent(self): + # A rig already DOWN at startup baselines OFFLINE silently — a restart must not replay it. m, _ = _monitor() - assert m.update({"rig-1"}) == [] + assert m.update(_down("rig-1")) == [] + assert m._workers["rig-1"]["state"] == "offline" def test_steady_online_emits_nothing(self): m, clock = _monitor() - m.update({"rig-1"}) + m.update(_on("rig-1")) for _ in range(5): clock.advance(30) - assert m.update({"rig-1"}) == [] + assert m.update(_on("rig-1")) == [] class TestOfflineDebounce: def test_not_offline_before_threshold(self): m, clock = _monitor() - m.update({"rig-1"}) # baseline online + m.update(_on("rig-1")) # baseline online clock.advance(30) - assert m.update(set()) == [] # absence streak starts here (within debounce) + assert m.update(_down("rig-1")) == [] # DOWN streak starts here (within debounce) clock.advance(269) - assert m.update(set()) == [] # 269s absent — still under the 300s threshold + assert m.update(_down("rig-1")) == [] # 269s DOWN — still under the 300s threshold def test_offline_after_threshold(self): m, clock = _monitor() - m.update({"rig-1"}) - m.update(set()) # absence streak starts 
here + m.update(_on("rig-1")) + m.update(_down("rig-1")) # DOWN streak starts here clock.advance(300) - assert m.update(set()) == [("rig-1", "offline")] + assert m.update(_down("rig-1")) == [("rig-1", "offline")] def test_offline_emitted_once(self): m, clock = _monitor() - m.update({"rig-1"}) - m.update(set()) + m.update(_on("rig-1")) + m.update(_down("rig-1")) clock.advance(300) - assert m.update(set()) == [("rig-1", "offline")] + assert m.update(_down("rig-1")) == [("rig-1", "offline")] clock.advance(300) - assert m.update(set()) == [] # already offline — no repeat + assert m.update(_down("rig-1")) == [] # already offline — no repeat - def test_brief_drop_does_not_trip(self): + def test_brief_down_does_not_trip(self): m, clock = _monitor() - m.update({"rig-1"}) + m.update(_on("rig-1")) clock.advance(60) - assert m.update(set()) == [] # gone 60s + assert m.update(_down("rig-1")) == [] # DOWN 60s clock.advance(30) - assert m.update({"rig-1"}) == [] # back well before 300s + assert m.update(_on("rig-1")) == [] # back well before 300s + + def test_vanishing_from_table_is_not_offline(self): + # A rig the proxy stops listing entirely (fell off the worker table) is forgotten, not + # aged to "offline" — the dashboard no longer shows it, so neither does the alerter. + m, clock = _monitor() + m.update(_on("rig-1")) + clock.advance(600) + assert m.update([]) == [] # gone from the table, never went DOWN → no alert + assert "rig-1" not in m._workers class TestRecoveryHysteresis: def _take_offline(self, m, clock): - m.update({"rig-1"}) - m.update(set()) + m.update(_on("rig-1")) + m.update(_down("rig-1")) clock.advance(300) - assert m.update(set()) == [("rig-1", "offline")] + assert m.update(_down("rig-1")) == [("rig-1", "offline")] def test_recovered_only_after_stable_window(self): m, clock = _monitor() self._take_offline(m, clock) - # Reappears, but "back online" holds until it's been present for recovery_after. 
- assert m.update({"rig-1"}) == [] + # Reappears online, but "back online" holds until it's been present for recovery_after. + assert m.update(_on("rig-1")) == [] clock.advance(119) - assert m.update({"rig-1"}) == [] + assert m.update(_on("rig-1")) == [] clock.advance(1) - assert m.update({"rig-1"}) == [("rig-1", "recovered")] + assert m.update(_on("rig-1")) == [("rig-1", "recovered")] def test_flap_during_recovery_does_not_emit(self): # A one-cycle reconnect during an outage must not produce a recovered→offline spam. m, clock = _monitor() self._take_offline(m, clock) clock.advance(30) - assert m.update({"rig-1"}) == [] # blink on (still offline) + assert m.update(_on("rig-1")) == [] # blink online (still offline) clock.advance(30) - assert m.update(set()) == [] # blink off — no recovered, no re-offline + assert m.update(_down("rig-1")) == [] # blink DOWN — no recovered, no re-offline clock.advance(30) - assert m.update(set()) == [] + assert m.update(_down("rig-1")) == [] class TestMultipleWorkers: def test_independent_per_worker_state(self): m, clock = _monitor() - m.update({"rig-1", "rig-2"}) - # rig-2 stays online; rig-1 drops. - m.update({"rig-2"}) + m.update(_on("rig-1", "rig-2")) + # rig-2 stays online; rig-1 goes DOWN. + m.update(_on("rig-2") + _down("rig-1")) clock.advance(300) - assert m.update({"rig-2"}) == [("rig-1", "offline")] + assert m.update(_on("rig-2") + _down("rig-1")) == [("rig-1", "offline")] class TestReset: def test_reset_clears_state_and_rebaselines_silently(self): m, clock = _monitor() - m.update({"rig-1"}) - m.update(set()) + m.update(_on("rig-1")) + m.update(_down("rig-1")) clock.advance(300) - assert m.update(set()) == [("rig-1", "offline")] + assert m.update(_down("rig-1")) == [("rig-1", "offline")] m.reset() # After a reset (e.g. proxy intentionally stopped), the worker re-appears as a fresh # baseline — no "recovered" edge. 
- assert m.update({"rig-1"}) == [] + assert m.update(_on("rig-1")) == [] -class TestRetention: - def test_long_absent_worker_is_forgotten(self): - m, clock = _monitor(retention=1000) - m.update({"rig-1"}) - m.update(set()) +class TestFalloff: + def test_worker_forgotten_when_it_leaves_the_table(self): + m, clock = _monitor() + m.update(_on("rig-1")) + m.update(_down("rig-1")) clock.advance(300) - m.update(set()) # offline emitted - clock.advance(1000) - m.update(set()) # past retention -> pruned + m.update(_down("rig-1")) # offline emitted + # The lifecycle eventually drops the ghost from the worker table (#182). + assert m.update([]) == [] assert "rig-1" not in m._workers - # Returning after retention counts as new: silent baseline, not a recovery. - assert m.update({"rig-1"}) == [] + # Returning after that counts as new: silent baseline, not a recovery. + assert m.update(_on("rig-1")) == [] diff --git a/docs/test-inventory.md b/docs/test-inventory.md index f1b80a4..ed41b2c 100644 --- a/docs/test-inventory.md +++ b/docs/test-inventory.md @@ -4,7 +4,7 @@ _Generated by `make test-inventory` ([`tests/inventory.sh`](../tests/inventory.s edit by hand** — re-run the target to refresh. See [Testing Strategy](testing-strategy.md) for how the tiers fit together._ -**Totals:** 692 dashboard unit tests · 12 contract tests · 64 frontend +**Totals:** 694 dashboard unit tests · 12 contract tests · 64 frontend tests · 52 `pithead` shell sections · 18 harness self-test sections · 9 live config scenarios (17 axis values) · 8 mini-stack scenarios. 
@@ -14,7 +14,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · | Tier | Suite | Cases | |---|---|---| -| 1 — Unit | dashboard pytest | 692 | +| 1 — Unit | dashboard pytest | 694 | | 1 — Unit | frontend (node --test) | 64 | | 1 — Unit | `pithead` shell suite | 52 sections | | 1 — Unit | compose interpolation + hardening (#90) | 1 | @@ -27,7 +27,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · ## Tier 1 — Unit & component -### Dashboard (pytest) — 692 tests +### Dashboard (pytest) — 694 tests #### tests/client/test_docker_control.py — 6 - test_tcp_scheme_rewritten_to_http @@ -581,18 +581,20 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_failed_fetch_keeps_previous_result - test_up_to_date_yields_none -#### tests/service/test_worker_presence.py — 11 -- test_first_sighting_is_silent +#### tests/service/test_worker_presence.py — 13 +- test_first_sighting_online_is_silent +- test_first_sighting_down_is_silent - test_steady_online_emits_nothing - test_not_offline_before_threshold - test_offline_after_threshold - test_offline_emitted_once -- test_brief_drop_does_not_trip +- test_brief_down_does_not_trip +- test_vanishing_from_table_is_not_offline - test_recovered_only_after_stable_window - test_flap_during_recovery_does_not_emit - test_independent_per_worker_state - test_reset_clears_state_and_rebaselines_silently -- test_long_absent_worker_is_forgotten +- test_worker_forgotten_when_it_leaves_the_table #### tests/sim/test_donation_model.py — 10 - test_holds_tier_without_overshoot @@ -1065,5 +1067,5 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · --- -_Grand total: **855** enumerated cases/sections across the four tiers (plus the live +_Grand total: **857** enumerated cases/sections across the four tiers (plus the live lifecycle and fault-injection phases, which are exercised on a real server)._ From 9ecd1886993098df250fac613d7b8e839594d23c Mon Sep 17 00:00:00 2001 
From: Vijit Singh Date: Thu, 2 Jul 2026 16:49:46 -0500 Subject: [PATCH 04/18] feat(#121/#45): worker join/leave + disk/DB alerts, and /system /pool /xvb commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Alerts (all reuse what the dashboard already computes, per feedback): - worker joined/left — woven into WorkerPresenceMonitor via a prime flag so a restart/failover-readmit doesn't replay the roster as joins; 'left' fires when a rig drops off the proxy table entirely (vs 'offline' = DOWN-but-still-listed). - disk filling/critical — crosses the same DISK_WARN/CRITICAL_PERCENT thresholds as the dashboard's low-disk badge (#138); a full disk corrupts monerod's DB. - DB write failing — StateManager.is_db_healthy flipping false (#131). Disk usage is read once in the loop and reused in the snapshot. Commands (read-only, reuse build_metrics + the system snapshot): - /system (disk/RAM/CPU/load/HugePages), /pool (sidechain + Monero network), /xvb (mode/tier/routed/raffle-eligibility). Config: telegram.events gains worker_joined/worker_left/disk_space/db_unhealthy (all default true); plumbed through pithead render, compose, config.reference.json. make test green; patch coverage 97%. 
Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 32 ++++--- .../mining_dashboard/config/config.py | 4 + .../mining_dashboard/service/alert_service.py | 93 ++++++++++++++++--- .../mining_dashboard/service/data_service.py | 17 ++-- .../service/telegram_commands.py | 65 ++++++++++++- .../service/worker_presence.py | 57 +++++++----- .../tests/service/test_alert_service.py | 46 +++++++++ .../tests/service/test_telegram_commands.py | 53 +++++++++++ .../tests/service/test_worker_presence.py | 42 +++++++-- config.reference.json | 6 +- docker-compose.yml | 4 + docs/telegram.md | 15 ++- docs/test-inventory.md | 32 +++++-- pithead | 11 ++- 14 files changed, 400 insertions(+), 77 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0962382..73374e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -100,20 +100,24 @@ cd pithead && cp config.json.template config.json # set your Monero + Tari pay silently when offline / Tor down. The URL is the on/off switch and is stored as a secret in the owner-only `.env`. See [`docs/monitoring.md`](docs/monitoring.md) (#79). - **Telegram operator bot — push alerts + on-demand status** (#121, #45): the dashboard can push a - small, high-value set of operational alerts to Telegram — **node down / recovered**, **worker - offline / back online**, and **sync finished** — and answer status commands on demand: - **`/status`**, **`/hashrate`**, **`/workers`**, **`/sync`**, and **`/help`**. Off by default; - enable it with a `telegram` block in `config.json` (`enabled`, `bot_token`, `chat_id`, per-event - `events` toggles, and an `commands.enabled` switch for the interactive half). Every alert is - **debounced** so a momentary blip won't ping you and you get one message per real transition: node - edges reuse the existing failover detector, and worker offline/online uses a new flap-protected - per-worker presence tracker. 
Commands **long-poll** (`getUpdates`) so they need no inbound port and - ride the same Tor egress as the alerts, and only the configured `chat_id` is answered — every other - update is ignored. The `bot_token` is treated as a secret (owner-only `.env`, never logged), and - both sends and polling **fail silently** on a Tor-only / offline host. Messages are prefixed with the - dashboard hostname so multiple stacks can share one chat. Full walkthrough — creating a bot, finding - your chat id, the command list, and the "one chat, two bots" pattern for sharing a chat with the - Healthchecks.io monitor (#79) — in [`docs/telegram.md`](docs/telegram.md). + high-value set of operational alerts to Telegram — **node down / recovered**, **worker offline / + back online**, **new worker joined / left**, **sync finished**, **data disk filling up**, and + **dashboard DB write failing** — and answer status commands on demand: **`/status`**, + **`/hashrate`**, **`/workers`**, **`/sync`**, **`/system`**, **`/pool`**, **`/xvb`**, and + **`/help`**. Off by default; enable it with a `telegram` block in `config.json` (`enabled`, + `bot_token`, `chat_id`, per-event `events` toggles, and a `commands.enabled` switch for the + interactive half). Every alert is **debounced** so a momentary blip won't ping you and you get one + message per real transition — and each is built by *reusing* what the dashboard already computes: + worker offline/joined/left keys off the same per-rig **DOWN** status the UI shows, and the disk / + DB alerts cross the same thresholds as the dashboard's own low-disk and DB-health badges. Commands + **long-poll** (`getUpdates`) so they need no inbound port and ride the same Tor egress as the + alerts, are **read-only** (they never change the stack), and only the configured `chat_id` is + answered — every other update is ignored. 
The `bot_token` is treated as a secret (owner-only + `.env`, never logged), and both sends and polling **fail silently** on a Tor-only / offline host. + Messages are prefixed with the dashboard hostname so multiple stacks can share one chat. Full + walkthrough — creating a bot, finding your chat id, the command list, and the "one chat, two bots" + pattern for sharing a chat with the Healthchecks.io monitor (#79) — in + [`docs/telegram.md`](docs/telegram.md). - **Optional clearnet initial sync (#183).** A default-off, per-component opt-in (`monero.clearnet_initial_sync` / `tari.clearnet_initial_sync`) that lets a node do its **one-time initial block download over clearnet** — much faster than over bandwidth-capped Tor circuits, which diff --git a/build/dashboard/mining_dashboard/config/config.py b/build/dashboard/mining_dashboard/config/config.py index 3d8c385..9f3caf4 100644 --- a/build/dashboard/mining_dashboard/config/config.py +++ b/build/dashboard/mining_dashboard/config/config.py @@ -259,7 +259,11 @@ def _telegram_event_enabled(name, default=True): "node_recovered": _telegram_event_enabled("node_recovered"), "worker_offline": _telegram_event_enabled("worker_offline"), "worker_recovered": _telegram_event_enabled("worker_recovered"), + "worker_joined": _telegram_event_enabled("worker_joined"), + "worker_left": _telegram_event_enabled("worker_left"), "sync_finished": _telegram_event_enabled("sync_finished"), + "disk_space": _telegram_event_enabled("disk_space"), + "db_unhealthy": _telegram_event_enabled("db_unhealthy"), } # Worker offline/online debounce (Issue #121). 
A worker must be unseen this long before it's diff --git a/build/dashboard/mining_dashboard/service/alert_service.py b/build/dashboard/mining_dashboard/service/alert_service.py index 962ab27..8ad9cbb 100644 --- a/build/dashboard/mining_dashboard/service/alert_service.py +++ b/build/dashboard/mining_dashboard/service/alert_service.py @@ -2,6 +2,8 @@ import logging from mining_dashboard.config.config import ( + DISK_CRITICAL_PERCENT, + DISK_WARN_PERCENT, HOST_IP, TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID, @@ -36,8 +38,14 @@ class AlertService: Tari going down isn't operator-critical (we keep mining Monero), matching the worker-rejection rule. - **sync finished** — the sync gate's ``miner_released`` latch flipping open once (#35). - - **worker offline / back online** — a debounced :class:`WorkerPresenceMonitor` over the - live worker list (the one genuinely new building block this issue adds). + - **worker offline / back online / joined / left** — a debounced :class:`WorkerPresenceMonitor` + over the live worker rows (offline keys off the same DOWN status the dashboard shows; joined / + left track fleet membership). + - **disk filling / critical** — the data disk crossing the same ``DISK_WARN_PERCENT`` / + ``DISK_CRITICAL_PERCENT`` thresholds the dashboard's low-disk badge uses (#138): a full disk + corrupts monerod's DB mid-write. + - **DB write failing** — ``StateManager.is_db_healthy`` flipping false (#131): the dashboard + keeps serving but history/shares/stats stop persisting. Edge state is seeded silently on the first observation (``None`` baselines), so a dashboard restart can't replay a stale transition as a fresh alert. 
@@ -52,7 +60,19 @@ class AlertService: EVT_NODE_RECOVERED = "node_recovered" EVT_WORKER_OFFLINE = "worker_offline" EVT_WORKER_RECOVERED = "worker_recovered" + EVT_WORKER_JOINED = "worker_joined" + EVT_WORKER_LEFT = "worker_left" EVT_SYNC_FINISHED = "sync_finished" + EVT_DISK_SPACE = "disk_space" + EVT_DB_UNHEALTHY = "db_unhealthy" + + # WorkerPresenceMonitor edge -> (event key, message template). + _WORKER_EDGES = { + "offline": (EVT_WORKER_OFFLINE, "\U0001f534 Worker offline: {name}"), + "recovered": (EVT_WORKER_RECOVERED, "\U0001f7e2 Worker back online: {name}"), + "joined": (EVT_WORKER_JOINED, "\U0001f7e2 New worker joined: {name}"), + "left": (EVT_WORKER_LEFT, "⚪ Worker left: {name}"), + } def __init__(self, notifier=None, worker_monitor=None, host_label=HOST_IP): self.notifier = notifier if notifier is not None else build_default_notifier() @@ -63,6 +83,8 @@ def __init__(self, notifier=None, worker_monitor=None, host_label=HOST_IP): self._prev_monero_down = None self._prev_tari_down = None self._prev_released = None + self._prev_disk_level = None + self._prev_db_healthy = None @property def enabled(self): @@ -77,6 +99,8 @@ def evaluate( miner_released, workers, workers_expected, + disk_percent=0, + db_healthy=True, now=None, ): """Pure: fold this cycle's signals into the list of ``(event_key, text)`` to send, @@ -104,27 +128,22 @@ def evaluate( ) self._prev_released = miner_released - # --- Worker offline / back online (debounced off the DOWN status) --- + # --- Worker offline / recovered / joined / left (debounced off the DOWN status) --- # Driven by each rig's status in the same worker rows the dashboard shows (DOWN = offline). # Only meaningful while workers are actually expected: when the proxy is intentionally # stopped (initial sync hold, or node-down failover) their absence is by design, so we # reset the tracker instead of aging every rig into a false "offline". 
if workers_expected: for name, event in self.workers.update(workers, now=now): - if event == "offline": - alerts.append( - (self.EVT_WORKER_OFFLINE, self._fmt(f"\U0001f534 Worker offline: {name}")) - ) - else: - alerts.append( - ( - self.EVT_WORKER_RECOVERED, - self._fmt(f"\U0001f7e2 Worker back online: {name}"), - ) - ) + evt, template = self._WORKER_EDGES[event] + alerts.append((evt, self._fmt(template.format(name=name)))) else: self.workers.reset() + # --- Host health: data disk filling up, dashboard DB write failing --- + alerts += self._disk_edges(disk_percent) + alerts += self._db_edges(db_healthy) + return [(evt, text) for evt, text in alerts if self.notifier.event_enabled(evt)] def _node_edges(self, label, down, attr): @@ -148,6 +167,52 @@ def _node_edges(self, label, down, attr): ) ] + def _disk_edges(self, disk_percent): + """Alert on the data disk crossing the dashboard's own warn/critical thresholds (#138).""" + level = ( + "critical" + if disk_percent >= DISK_CRITICAL_PERCENT + else "warn" + if disk_percent >= DISK_WARN_PERCENT + else "ok" + ) + prev = self._prev_disk_level + self._prev_disk_level = level + if prev is None or level == prev: + return [] + pct = f"{disk_percent:.0f}%" + if level == "critical": + return [ + ( + self.EVT_DISK_SPACE, + self._fmt( + f"\U0001f534 Data disk almost full ({pct}) — free space now; a full disk " + "can corrupt the Monero database." 
+ ), + ) + ] + if level == "warn": + return [(self.EVT_DISK_SPACE, self._fmt(f"\U0001f7e0 Data disk filling up ({pct})."))] + return [(self.EVT_DISK_SPACE, self._fmt(f"\U0001f7e2 Data disk back to healthy ({pct})."))] + + def _db_edges(self, db_healthy): + """Alert when the dashboard can no longer persist to its SQLite DB (#131).""" + prev = self._prev_db_healthy + self._prev_db_healthy = db_healthy + if prev is None or db_healthy == prev: + return [] + if not db_healthy: + return [ + ( + self.EVT_DB_UNHEALTHY, + self._fmt( + "\U0001f534 Dashboard DB write failing — hashrate history, shares and stats " + "won't persist. Check disk space and permissions on the dashboard data dir." + ), + ) + ] + return [(self.EVT_DB_UNHEALTHY, self._fmt("\U0001f7e2 Dashboard DB writes recovered."))] + def _fmt(self, text): return f"[{self.host_label}] {text}" if self.host_label else text diff --git a/build/dashboard/mining_dashboard/service/data_service.py b/build/dashboard/mining_dashboard/service/data_service.py index 73e0433..2fdb749 100644 --- a/build/dashboard/mining_dashboard/service/data_service.py +++ b/build/dashboard/mining_dashboard/service/data_service.py @@ -778,11 +778,14 @@ async def run(self): if self.miner_released: await self._apply_worker_rejection(monero_down, tari_down) - # 5. Operator alerts (Issue #121): push debounced node/worker/sync edges to - # Telegram. Consumes the flags computed above; worker presence is only - # tracked while the proxy is actually serving (miner released and not - # rejected) — its intentional absence otherwise must not read as offline. - # No-op unless Telegram is configured; never raises. + # 5. Operator alerts (Issues #121/#45): push debounced node/worker/sync/host + # edges to Telegram. Consumes the flags computed above; worker presence is only + # tracked while the proxy is actually serving (miner released and not rejected) — + # its intentional absence otherwise must not read as offline. 
Disk usage is read + # once here and reused in the snapshot below. No-op unless Telegram is configured; + # never raises. + disk_usage = get_disk_usage() + db_healthy = self.state_manager.is_db_healthy() await self.alert_service.process( monero_down=monero_down, tari_down=tari_down, @@ -792,6 +795,8 @@ async def run(self): # status (DOWN = offline) so alerts line up with the on-screen state. workers=final_workers, workers_expected=self.miner_released and not self.workers_rejected, + disk_percent=(disk_usage or {}).get("percent", 0) or 0, + db_healthy=db_healthy, ) # Fetch fresh shares list to populate UI @@ -816,7 +821,7 @@ async def run(self): "miner_held": self.miner_held, "clearnet_sync": self.clearnet_sync_state, "system": { - "disk": get_disk_usage(), + "disk": disk_usage, "hugepages": get_hugepages_status(), "memory": get_memory_usage(), "load": get_load_average(), diff --git a/build/dashboard/mining_dashboard/service/telegram_commands.py b/build/dashboard/mining_dashboard/service/telegram_commands.py index dad6fd5..7055d70 100644 --- a/build/dashboard/mining_dashboard/service/telegram_commands.py +++ b/build/dashboard/mining_dashboard/service/telegram_commands.py @@ -25,7 +25,7 @@ # The commands the bot answers. All are read-only status queries — the bot can never change the # stack (start/stop/apply live on the CLI), so a leaked chat can at worst read status, not act. 
-COMMANDS = ("status", "hashrate", "workers", "sync", "help") +COMMANDS = ("status", "hashrate", "workers", "sync", "system", "pool", "xvb", "help") HELP_TEXT = ( "Pithead bot — commands:\n" @@ -33,6 +33,9 @@ "/hashrate — total + per-worker hashrate\n" "/workers — each rig's online/offline state\n" "/sync — Monero + Tari node sync progress\n" + "/system — host disk, RAM, CPU, HugePages\n" + "/pool — P2Pool sidechain + Monero network\n" + "/xvb — XvB mode, tier, and raffle eligibility\n" "/help — this message" ) @@ -148,6 +151,58 @@ def format_sync(metrics, host_label=""): ) +def format_system(system, host_label=""): + """Host resource usage — the answer to '/system'. Reads the ``system`` snapshot the dashboard + already collects (disk / RAM / CPU / load / HugePages).""" + disk = system.get("disk", {}) + mem = system.get("memory", {}) + hp_status, _hp_class, hp_value = system.get("hugepages", ["Unknown", "", "0/0"]) + return "\n".join( + [ + f"{_prefix(host_label)}\U0001f5a5️ System", + f"Disk: {disk.get('used_gb', 0):.1f}/{disk.get('total_gb', 0):.1f} GB " + f"({disk.get('percent_str', '0%')})", + f"RAM: {mem.get('used_gb', 0):.1f}/{mem.get('total_gb', 0):.1f} GB " + f"({mem.get('percent_str', '0%')})", + f"CPU: {system.get('cpu_percent', '0%')} · load {system.get('load', 'n/a')}", + f"HugePages: {hp_status} ({hp_value})", + ] + ) + + +def format_pool(metrics, host_label=""): + """P2Pool sidechain + Monero network figures — the answer to '/pool'.""" + return "\n".join( + [ + f"{_prefix(host_label)}\U0001f30a Pool & network", + f"Sidechain: P2Pool {metrics.pool_type}", + f"Pool hashrate: {format_hashrate(metrics.pool_hashrate)}", + f"Network height: {metrics.network_height:,}", + f"PPLNS shares: {metrics.shares_in_window} in window ({metrics.pplns_window} blocks)", + ] + ) + + +def format_xvb(metrics, host_label=""): + """XvB mode / tier / raffle eligibility — the answer to '/xvb'.""" + prefix = _prefix(host_label) + if not metrics.xvb_enabled: + return 
f"{prefix}\U0001f3b0 XvB is disabled." + lines = [ + f"{prefix}\U0001f3b0 XvB", + f"Mode: {metrics.mode}", + f"Current tier: {metrics.current_tier}", + f"Target tier: {metrics.target_tier}", + f"Routed to XvB: {format_hashrate(metrics.xvb_routed_1h)} (1h)", + ] + # The share half of raffle eligibility (#158): no PPLNS share means XvB wins are skipped. + if metrics.shares_in_window > 0: + lines.append("PPLNS share: \U0001f7e2 held (raffle-eligible)") + else: + lines.append("PPLNS share: ⚠ none — XvB wins skipped") + return "\n".join(lines) + + class TelegramCommandBot: """ On-demand Telegram command interface (Issue #45) — the interactive half of the operator bot. @@ -215,6 +270,10 @@ def reply_for(self, text): return f"{_prefix(self.host_label)}Unknown command.\n{HELP_TEXT}" data = self.data_service.latest_data or {} + # /system reads the raw snapshot only — no need to build the full metrics. + if cmd == "system": + return format_system(data.get("system", {}), self.host_label) + metrics = build_metrics(data, self.data_service.state_manager) if cmd == "status": mining = bool(data.get("miner_released") and not data.get("workers_rejected")) @@ -225,6 +284,10 @@ def reply_for(self, text): return format_workers(data.get("workers", []), self.host_label) if cmd == "sync": return format_sync(metrics, self.host_label) + if cmd == "pool": + return format_pool(metrics, self.host_label) + if cmd == "xvb": + return format_xvb(metrics, self.host_label) return None async def run(self): diff --git a/build/dashboard/mining_dashboard/service/worker_presence.py b/build/dashboard/mining_dashboard/service/worker_presence.py index 3d7858b..10ddd46 100644 --- a/build/dashboard/mining_dashboard/service/worker_presence.py +++ b/build/dashboard/mining_dashboard/service/worker_presence.py @@ -23,13 +23,15 @@ class WorkerPresenceMonitor: - **Silent baseline.** A rig's first sighting registers its current state with no edge — a brand-new rig is not a "recovery", and a rig already DOWN at 
dashboard start is not a fresh "offline" (a restart must not replay a stale transition). - - **Follows the table.** A rig the proxy stops listing entirely (aged off the worker table, - #182) is forgotten — the dashboard no longer shows it, DOWN or otherwise, so neither does the - alerter. Its state is bounded by exactly what's on screen; if it later returns it re-baselines - silently. + - **Joined / left the table.** A rig the proxy reports for the first time is a ``joined`` edge; + one it stops listing entirely (aged off the worker table, #182) is a ``left`` edge — the + fleet-membership signal, distinct from a known rig going DOWN and back. Both are suppressed + until the monitor is *primed*: the very first cycle (and the first after a :meth:`reset`) + baselines whatever's already connected silently, so a dashboard restart doesn't replay every + rig as a fresh "joined", nor a readmission after a failover. :meth:`update` takes this cycle's worker rows (dicts with ``name`` + ``status``) and returns a - list of ``(name, event)`` edges, ``event`` in ``{"offline", "recovered"}``. + list of ``(name, event)`` edges, ``event`` in ``{"offline", "recovered", "joined", "left"}``. Clock defaults to wall-clock ``time.time``; injectable for deterministic tests. """ @@ -48,37 +50,48 @@ def __init__( # online_since : when the current continuous-online streak began (None while DOWN) # down_since : when the current continuous-DOWN streak began (None while online) self._workers = {} + # False until the first cycle has baselined the currently-connected rigs, so joined/left + # edges don't fire for the startup (or post-reset) roster. 
+ self._primed = False def update(self, workers, now=None): """Feed this cycle's worker rows; return the list of debounced transitions.""" now = self._clock() if now is None else now online = {w.get("name") for w in workers if w.get("status") == "online"} present = {w.get("name") for w in workers} + primed = self._primed edges = [] - # Forget rigs the proxy no longer lists at all — they've fallen off the worker table and - # the dashboard no longer shows them, so the alerter drops them too (a later return - # re-baselines silently). + # Rigs the proxy no longer lists at all have fallen off the worker table — the dashboard no + # longer shows them, so forget them (a later return re-baselines) and, once primed, report + # them as having LEFT the fleet. for name in list(self._workers): if name not in present: del self._workers[name] + if primed: + edges.append((name, "left")) for name in present: - self._step(name, name in online, now, edges) - + if name in self._workers: + self._step(name, name in online, now, edges) + else: + # First sighting — baseline to the rig's current state; once primed, a genuinely + # new rig is a JOINED edge. + is_online = name in online + self._workers[name] = { + "state": "online" if is_online else "offline", + "online_since": now if is_online else None, + "down_since": None if is_online else now, + } + if primed: + edges.append((name, "joined")) + + self._primed = True return edges def _step(self, name, is_online, now, edges): - w = self._workers.get(name) - if w is None: - # First sighting — baseline to the rig's current state, no edge. 
- self._workers[name] = { - "state": "online" if is_online else "offline", - "online_since": now if is_online else None, - "down_since": None if is_online else now, - } - return - + """Debounce a *known* rig's online/DOWN status into offline/recovered edges.""" + w = self._workers[name] if is_online: w["down_since"] = None if w["online_since"] is None: @@ -99,6 +112,8 @@ def reset(self): Called when the proxy is *intentionally* stopped — during the initial sync hold (#35) or node-down worker failover (#31) — so the expected absence of workers doesn't age - into false "offline" alerts, and re-admission re-baselines every worker silently. + into false "offline" alerts, and re-admission re-baselines every worker silently (no + spurious "recovered", and no "joined"/"left" for the readmitted roster). """ self._workers.clear() + self._primed = False diff --git a/build/dashboard/tests/service/test_alert_service.py b/build/dashboard/tests/service/test_alert_service.py index 46391dd..acee804 100644 --- a/build/dashboard/tests/service/test_alert_service.py +++ b/build/dashboard/tests/service/test_alert_service.py @@ -36,6 +36,8 @@ def _ev( miner_released=True, workers=(), workers_expected=False, + disk_percent=0, + db_healthy=True, now=0, ): return svc.evaluate( @@ -45,6 +47,8 @@ def _ev( miner_released=miner_released, workers=list(workers), workers_expected=workers_expected, + disk_percent=disk_percent, + db_healthy=db_healthy, now=now, ) @@ -146,6 +150,48 @@ def test_not_expected_resets_and_silences(self): assert _ev(svc, workers=_on("rig-1"), workers_expected=True, now=360) == [] +class TestWorkerMembership: + def test_joined_after_baseline(self): + svc = _svc() + _ev(svc, workers=_on("rig-1"), workers_expected=True) # prime + assert _keys(_ev(svc, workers=_on("rig-1", "rig-2"), workers_expected=True)) == [ + AlertService.EVT_WORKER_JOINED + ] + + def test_left_when_rig_drops_off_the_table(self): + svc = _svc() + _ev(svc, workers=_on("rig-1", "rig-2"), 
workers_expected=True) # prime + assert _keys(_ev(svc, workers=_on("rig-1"), workers_expected=True)) == [ + AlertService.EVT_WORKER_LEFT + ] + + +class TestDiskEdges: + def test_warn_then_critical_then_recover(self): + svc = _svc() + assert _ev(svc, disk_percent=40) == [] # seed silently + assert _keys(_ev(svc, disk_percent=88)) == [AlertService.EVT_DISK_SPACE] # -> warn + assert _ev(svc, disk_percent=90) == [] # still warn, no repeat + assert _keys(_ev(svc, disk_percent=97)) == [AlertService.EVT_DISK_SPACE] # -> critical + _, text = _ev(svc, disk_percent=40)[0] # -> recovered + assert "healthy" in text + + def test_seed_high_does_not_replay(self): + svc = _svc() + # Already-full at startup must not fire (restart semantics). + assert _ev(svc, disk_percent=99) == [] + + +class TestDbEdges: + def test_unhealthy_then_recovered(self): + svc = _svc() + assert _ev(svc, db_healthy=True) == [] # seed + assert _keys(_ev(svc, db_healthy=False)) == [AlertService.EVT_DB_UNHEALTHY] + assert _ev(svc, db_healthy=False) == [] # no repeat + _, text = _ev(svc, db_healthy=True)[0] + assert "recovered" in text + + class TestEventFiltering: def test_disabled_events_are_dropped(self): svc = _svc(notifier=_FakeNotifier(allow={AlertService.EVT_NODE_DOWN})) diff --git a/build/dashboard/tests/service/test_telegram_commands.py b/build/dashboard/tests/service/test_telegram_commands.py index 10aa9b0..37b310d 100644 --- a/build/dashboard/tests/service/test_telegram_commands.py +++ b/build/dashboard/tests/service/test_telegram_commands.py @@ -75,6 +75,9 @@ def _metrics(**over): ("/status", "status"), (" /sync ", "sync"), ("/HASHRATE", "hashrate"), + ("/system", "system"), + ("/pool", "pool"), + ("/xvb", "xvb"), ("/status@PitheadBot", "status"), # group @mention suffix stripped ("/workers now please", "workers"), # only the first word matters ("/help", "help"), @@ -166,6 +169,43 @@ def test_sync_line_no_target(): assert "Monero: ⏳ syncing 12.0%" in tc.format_sync(_metrics(monero=no_target)) +def 
test_system_reads_snapshot(): + system = { + "disk": {"used_gb": 120.4, "total_gb": 500.0, "percent_str": "24%"}, + "memory": {"used_gb": 3.2, "total_gb": 16.0, "percent_str": "20%"}, + "cpu_percent": "12.5%", + "load": "0.50 0.40 0.30", + "hugepages": ["Enabled", "status-ok", "3072/3072"], + } + out = tc.format_system(system) + assert "Disk: 120.4/500.0 GB (24%)" in out + assert "RAM: 3.2/16.0 GB (20%)" in out + assert "CPU: 12.5%" in out + assert "HugePages: Enabled (3072/3072)" in out + + +def test_pool_reads_metrics(): + out = tc.format_pool(_metrics(pool_type="Mini", network_height=3210001)) + assert "P2Pool Mini" in out + assert "Network height: 3,210,001" in out + assert "5 in window" in out # shares_in_window from _BASE + + +def test_xvb_enabled_with_share(): + out = tc.format_xvb(_metrics(xvb_enabled=True, shares_in_window=5)) + assert "Current tier: Donor" in out + assert "raffle-eligible" in out + + +def test_xvb_no_share_warns(): + out = tc.format_xvb(_metrics(xvb_enabled=True, shares_in_window=0)) + assert "wins skipped" in out + + +def test_xvb_disabled(): + assert "disabled" in tc.format_xvb(_metrics(xvb_enabled=False)) + + def test_host_label_prefix(): assert tc.format_sync(_metrics(), host_label="rig-box").startswith("[rig-box] ") # The placeholder is never printed. @@ -203,6 +243,19 @@ def test_reply_for_workers_reads_snapshot(monkeypatch): assert "z" in bot.reply_for("/workers") +def test_reply_for_system_reads_snapshot_without_metrics(): + # /system reads only the raw snapshot — build_metrics must not be needed (left unstubbed). 
+ ds = SimpleNamespace(latest_data={"system": {"cpu_percent": "9%"}}, state_manager=None) + bot = tc.TelegramCommandBot(ds, enabled=True, bot_token="t", chat_id="1", host_label="") + assert "CPU: 9%" in bot.reply_for("/system") + + +def test_reply_for_pool_and_xvb(monkeypatch): + bot = _bot(monkeypatch, latest_data={}, pool_type="Nano") + assert "P2Pool Nano" in bot.reply_for("/pool") + assert "XvB" in bot.reply_for("/xvb") + + # --- enabled gating ----------------------------------------------------------------------- diff --git a/build/dashboard/tests/service/test_worker_presence.py b/build/dashboard/tests/service/test_worker_presence.py index 38abad0..e4ac43e 100644 --- a/build/dashboard/tests/service/test_worker_presence.py +++ b/build/dashboard/tests/service/test_worker_presence.py @@ -85,13 +85,13 @@ def test_brief_down_does_not_trip(self): clock.advance(30) assert m.update(_on("rig-1")) == [] # back well before 300s - def test_vanishing_from_table_is_not_offline(self): - # A rig the proxy stops listing entirely (fell off the worker table) is forgotten, not - # aged to "offline" — the dashboard no longer shows it, so neither does the alerter. + def test_vanishing_from_table_is_left_not_offline(self): + # A rig the proxy stops listing entirely (fell off the worker table) is reported as having + # LEFT — never aged to "offline", which is reserved for the DOWN-but-still-listed state. m, clock = _monitor() - m.update(_on("rig-1")) + m.update(_on("rig-1")) # prime + baseline clock.advance(600) - assert m.update([]) == [] # gone from the table, never went DOWN → no alert + assert m.update([]) == [("rig-1", "left")] assert "rig-1" not in m._workers @@ -154,8 +154,32 @@ def test_worker_forgotten_when_it_leaves_the_table(self): m.update(_down("rig-1")) clock.advance(300) m.update(_down("rig-1")) # offline emitted - # The lifecycle eventually drops the ghost from the worker table (#182). 
- assert m.update([]) == [] + # The lifecycle eventually drops the ghost from the worker table (#182) → LEFT edge. + assert m.update([]) == [("rig-1", "left")] assert "rig-1" not in m._workers - # Returning after that counts as new: silent baseline, not a recovery. - assert m.update(_on("rig-1")) == [] + # Returning after that counts as a fresh JOIN. + assert m.update(_on("rig-1")) == [("rig-1", "joined")] + + +class TestJoinLeave: + def test_first_cycle_baselines_silently(self): + # The startup roster is baselined without joined edges — a restart isn't a fleet change. + m, _ = _monitor() + assert m.update(_on("rig-1", "rig-2")) == [] + + def test_new_worker_after_prime_joins(self): + m, _ = _monitor() + m.update(_on("rig-1")) # prime + assert m.update(_on("rig-1", "rig-2")) == [("rig-2", "joined")] + + def test_worker_leaving_emits_left(self): + m, _ = _monitor() + m.update(_on("rig-1", "rig-2")) # prime + assert m.update(_on("rig-1")) == [("rig-2", "left")] + + def test_reset_reprimes_without_joins(self): + m, _ = _monitor() + m.update(_on("rig-1")) # prime + m.reset() # e.g. proxy stopped for a failover — clears the prime flag + # Readmission re-baselines the whole roster silently — no "joined" spam. 
+ assert m.update(_on("rig-1", "rig-2")) == [] diff --git a/config.reference.json b/config.reference.json index 3b50b79..0298c74 100644 --- a/config.reference.json +++ b/config.reference.json @@ -89,7 +89,11 @@ "node_recovered": true, "worker_offline": true, "worker_recovered": true, - "sync_finished": true + "worker_joined": true, + "worker_left": true, + "sync_finished": true, + "disk_space": true, + "db_unhealthy": true }, "commands": { "enabled": false diff --git a/docker-compose.yml b/docker-compose.yml index 2bb0622..977e8ab 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -436,7 +436,11 @@ services: - TELEGRAM_EVENT_NODE_RECOVERED=${TELEGRAM_EVENT_NODE_RECOVERED:-true} - TELEGRAM_EVENT_WORKER_OFFLINE=${TELEGRAM_EVENT_WORKER_OFFLINE:-true} - TELEGRAM_EVENT_WORKER_RECOVERED=${TELEGRAM_EVENT_WORKER_RECOVERED:-true} + - TELEGRAM_EVENT_WORKER_JOINED=${TELEGRAM_EVENT_WORKER_JOINED:-true} + - TELEGRAM_EVENT_WORKER_LEFT=${TELEGRAM_EVENT_WORKER_LEFT:-true} - TELEGRAM_EVENT_SYNC_FINISHED=${TELEGRAM_EVENT_SYNC_FINISHED:-true} + - TELEGRAM_EVENT_DISK_SPACE=${TELEGRAM_EVENT_DISK_SPACE:-true} + - TELEGRAM_EVENT_DB_UNHEALTHY=${TELEGRAM_EVENT_DB_UNHEALTHY:-true} - TELEGRAM_COMMANDS_ENABLED=${TELEGRAM_COMMANDS_ENABLED:-false} # --- Docker Socket Proxy (read-only) --- diff --git a/docs/telegram.md b/docs/telegram.md index 7076215..199b716 100644 --- a/docs/telegram.md +++ b/docs/telegram.md @@ -23,9 +23,13 @@ transition, not a stream: |---|---| | 🔴 **Node down** | Your Monero (or Tari) node has been unreachable long enough to be considered down — the stack has stopped serving your rigs so they **fail over to their backup pools**. | | 🟢 **Node recovered** | The node is back and stable; the stack has readmitted your rigs. | -| 🔴 **Worker offline** | A rig stopped hashing and hasn't been seen for a few minutes (a reboot, a dropped connection, a dead miner). 
| +| 🔴 **Worker offline** | A rig stopped hashing and hasn't been seen for a few minutes (a reboot, a dropped connection, a dead miner) — it's showing **DOWN** on the dashboard. | | 🟢 **Worker back online** | A rig that had gone offline is hashing again. | +| 🟢 **New worker joined** | A rig the stack hasn't seen before connected — a new miner joined the fleet. | +| ⚪ **Worker left** | A rig dropped off the dashboard entirely (removed from the worker list, not just DOWN). | | ✅ **Sync finished** | The initial blockchain sync completed and mining has started — handy on first run, when the sync can take hours. | +| 🟠 **Disk filling up** | The data disk crossed the warn/critical threshold — a full disk can corrupt the Monero database, so free space before it runs out. | +| 🔴 **DB write failing** | The dashboard can no longer write to its database; hashrate history, shares, and stats will be lost on restart until it's fixed (usually disk space or permissions). | Every message is prefixed with your dashboard hostname (e.g. `[rig-box.lan]`), so if you point more than one stack at the same chat you can tell them apart. @@ -134,9 +138,13 @@ block and set it to `false` — any event you don't list stays on: |---|---|---| | `node_down` | `true` | Monero/Tari node went down | | `node_recovered` | `true` | …and came back | -| `worker_offline` | `true` | A rig dropped off | +| `worker_offline` | `true` | A rig went DOWN | | `worker_recovered` | `true` | …and came back | +| `worker_joined` | `true` | A new rig joined the fleet | +| `worker_left` | `true` | A rig dropped off the dashboard entirely | | `sync_finished` | `true` | Initial sync done, mining started | +| `disk_space` | `true` | Data disk filling up / critical / recovered | +| `db_unhealthy` | `true` | Dashboard database writes failing / recovered | Run `./pithead apply` after editing. @@ -170,6 +178,9 @@ Run `./pithead apply` after editing. 
The commands: | `/hashrate` | Total hashrate plus a per-rig breakdown of everything currently online. | | `/workers` | Every rig's online/offline state, with uptime for the ones that are up. | | `/sync` | Monero and Tari sync progress (percent and block height). | +| `/system` | Host resources: disk, RAM, CPU + load, and HugePages. | +| `/pool` | P2Pool sidechain type, pool hashrate, Monero network height, and PPLNS shares in window. | +| `/xvb` | XvB mode, current and target tier, hashrate routed to XvB, and raffle eligibility (PPLNS share). | | `/help` | The command list. | The numbers come from the **same source as the dashboard**, so a reply and the web view always diff --git a/docs/test-inventory.md b/docs/test-inventory.md index ed41b2c..0e80ac4 100644 --- a/docs/test-inventory.md +++ b/docs/test-inventory.md @@ -4,7 +4,7 @@ _Generated by `make test-inventory` ([`tests/inventory.sh`](../tests/inventory.s edit by hand** — re-run the target to refresh. See [Testing Strategy](testing-strategy.md) for how the tiers fit together._ -**Totals:** 694 dashboard unit tests · 12 contract tests · 64 frontend +**Totals:** 710 dashboard unit tests · 12 contract tests · 64 frontend tests · 52 `pithead` shell sections · 18 harness self-test sections · 9 live config scenarios (17 axis values) · 8 mini-stack scenarios. 
@@ -14,7 +14,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · | Tier | Suite | Cases | |---|---|---| -| 1 — Unit | dashboard pytest | 694 | +| 1 — Unit | dashboard pytest | 710 | | 1 — Unit | frontend (node --test) | 64 | | 1 — Unit | `pithead` shell suite | 52 sections | | 1 — Unit | compose interpolation + hardening (#90) | 1 | @@ -27,7 +27,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · ## Tier 1 — Unit & component -### Dashboard (pytest) — 694 tests +### Dashboard (pytest) — 710 tests #### tests/client/test_docker_control.py — 6 - test_tcp_scheme_rewritten_to_http @@ -210,7 +210,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_none_when_no_route - test_socket_is_closed_even_on_error -#### tests/service/test_alert_service.py — 15 +#### tests/service/test_alert_service.py — 20 - test_first_cycle_seeds_baseline_silently - test_down_then_recovered - test_node_text_names_the_chain @@ -221,6 +221,11 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_no_alert_on_restart_after_sync - test_offline_then_recovered - test_not_expected_resets_and_silences +- test_joined_after_baseline +- test_left_when_rig_drops_off_the_table +- test_warn_then_critical_then_recover +- test_seed_high_does_not_replay +- test_unhealthy_then_recovered - test_disabled_events_are_dropped - test_prefixes_when_set - test_placeholder_host_is_not_prefixed @@ -523,7 +528,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_history_older_than_retention_pruned_from_memory - test_old_history_pruned_from_db_when_cleanup_fires -#### tests/service/test_telegram_commands.py — 27 +#### tests/service/test_telegram_commands.py — 34 - test_parse_command - test_status_active - test_status_syncing_beats_mining_flag @@ -535,10 +540,17 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_status_node_syncing_percent - test_sync_line_variants - 
test_sync_line_no_target +- test_system_reads_snapshot +- test_pool_reads_metrics +- test_xvb_enabled_with_share +- test_xvb_no_share_warns +- test_xvb_disabled - test_host_label_prefix - test_reply_for_help_and_unknown_need_no_metrics - test_reply_for_status_uses_mining_flag - test_reply_for_workers_reads_snapshot +- test_reply_for_system_reads_snapshot_without_metrics +- test_reply_for_pool_and_xvb - test_disabled_without_token_or_chat - test_run_is_noop_when_disabled - test_handle_update_ignores_foreign_chat @@ -581,7 +593,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_failed_fetch_keeps_previous_result - test_up_to_date_yields_none -#### tests/service/test_worker_presence.py — 13 +#### tests/service/test_worker_presence.py — 17 - test_first_sighting_online_is_silent - test_first_sighting_down_is_silent - test_steady_online_emits_nothing @@ -589,12 +601,16 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_offline_after_threshold - test_offline_emitted_once - test_brief_down_does_not_trip -- test_vanishing_from_table_is_not_offline +- test_vanishing_from_table_is_left_not_offline - test_recovered_only_after_stable_window - test_flap_during_recovery_does_not_emit - test_independent_per_worker_state - test_reset_clears_state_and_rebaselines_silently - test_worker_forgotten_when_it_leaves_the_table +- test_first_cycle_baselines_silently +- test_new_worker_after_prime_joins +- test_worker_leaving_emits_left +- test_reset_reprimes_without_joins #### tests/sim/test_donation_model.py — 10 - test_holds_tier_without_overshoot @@ -1067,5 +1083,5 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · --- -_Grand total: **857** enumerated cases/sections across the four tiers (plus the live +_Grand total: **873** enumerated cases/sections across the four tiers (plus the live lifecycle and fault-injection phases, which are exercised on a real server)._ diff --git a/pithead b/pithead index 
88e0a61..bfcc1da 100755 --- a/pithead +++ b/pithead @@ -2093,12 +2093,17 @@ render_env() { tg_commands=$(jq -r 'if .telegram.commands.enabled != null then .telegram.commands.enabled | tostring else "false" end' "$CONFIG_FILE") # One toggle per event, defaulting to true when the key is absent. tg_event() { jq -r --arg k "$1" 'if .telegram.events[$k] != null then .telegram.events[$k] | tostring else "true" end' "$CONFIG_FILE"; } - local tg_ev_node_down tg_ev_node_recovered tg_ev_worker_offline tg_ev_worker_recovered tg_ev_sync_finished + local tg_ev_node_down tg_ev_node_recovered tg_ev_worker_offline tg_ev_worker_recovered + local tg_ev_worker_joined tg_ev_worker_left tg_ev_sync_finished tg_ev_disk_space tg_ev_db_unhealthy tg_ev_node_down=$(tg_event node_down) tg_ev_node_recovered=$(tg_event node_recovered) tg_ev_worker_offline=$(tg_event worker_offline) tg_ev_worker_recovered=$(tg_event worker_recovered) + tg_ev_worker_joined=$(tg_event worker_joined) + tg_ev_worker_left=$(tg_event worker_left) tg_ev_sync_finished=$(tg_event sync_finished) + tg_ev_disk_space=$(tg_event disk_space) + tg_ev_db_unhealthy=$(tg_event db_unhealthy) # Tari memory cap (#55). 
Tari officially needs only a few GB (min 4 GB host, 8 GB+ recommended), # but its memory grows unbounded over time — one 32 GB host was seen at ~11 GB while staying @@ -2185,7 +2190,11 @@ TELEGRAM_EVENT_NODE_DOWN=$tg_ev_node_down TELEGRAM_EVENT_NODE_RECOVERED=$tg_ev_node_recovered TELEGRAM_EVENT_WORKER_OFFLINE=$tg_ev_worker_offline TELEGRAM_EVENT_WORKER_RECOVERED=$tg_ev_worker_recovered +TELEGRAM_EVENT_WORKER_JOINED=$tg_ev_worker_joined +TELEGRAM_EVENT_WORKER_LEFT=$tg_ev_worker_left TELEGRAM_EVENT_SYNC_FINISHED=$tg_ev_sync_finished +TELEGRAM_EVENT_DISK_SPACE=$tg_ev_disk_space +TELEGRAM_EVENT_DB_UNHEALTHY=$tg_ev_db_unhealthy MONERO_MEM_LIMIT=$monero_mem_limit P2POOL_URL=${NETWORK_PREFIX}.28:3333 NETWORK_SUBNET=$NETWORK_SUBNET From fbade8559390b96b12475548dab44ce68c4e9e0c Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Thu, 2 Jul 2026 17:00:19 -0500 Subject: [PATCH 05/18] feat(#121/#45): XvB no-share + clearnet-exposed alerts, /earnings command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed the dashboard's own noteworthy-state catalog (build_badges) and added the high-value, cheap-to-reuse signals: - xvb_no_share — donating to XvB with no PPLNS share in the window means raffle wins are skipped (#158); revenue make-or-break. Gated on XvB enabled. - clearnet_exposed — a node syncing over clearnet exposes the host IP (#183); privacy signal on a Tor-first stack. Reverts to Tor automatically (#234). Both computed in the data loop from existing figures (shares_in_pplns_window, clearnet_sync_state) and passed as scalars, keeping evaluate() pure. Events default true. /earnings command — estimated P2Pool XMR/day, reusing service/earnings.xmr_per_hs_day applied to the displayed P2Pool 1h hashrate (no web-layer import). Larger follow-ups raised as separate issues (block/payout-found, container crash-loop, two-way control, XvB-registration alert, etc). make test green; patch coverage 97%. 
Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 9 +-- .../mining_dashboard/config/config.py | 2 + .../mining_dashboard/service/alert_service.py | 70 +++++++++++++++++++ .../mining_dashboard/service/data_service.py | 16 ++++- .../service/telegram_commands.py | 26 ++++++- .../tests/service/test_alert_service.py | 38 ++++++++++ .../tests/service/test_telegram_commands.py | 18 +++++ config.reference.json | 4 +- docker-compose.yml | 2 + docs/telegram.md | 5 ++ docs/test-inventory.md | 18 +++-- pithead | 5 ++ 12 files changed, 198 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 73374e0..fe77967 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -101,10 +101,11 @@ cd pithead && cp config.json.template config.json # set your Monero + Tari pay owner-only `.env`. See [`docs/monitoring.md`](docs/monitoring.md) (#79). - **Telegram operator bot — push alerts + on-demand status** (#121, #45): the dashboard can push a high-value set of operational alerts to Telegram — **node down / recovered**, **worker offline / - back online**, **new worker joined / left**, **sync finished**, **data disk filling up**, and - **dashboard DB write failing** — and answer status commands on demand: **`/status`**, - **`/hashrate`**, **`/workers`**, **`/sync`**, **`/system`**, **`/pool`**, **`/xvb`**, and - **`/help`**. Off by default; enable it with a `telegram` block in `config.json` (`enabled`, + back online**, **new worker joined / left**, **sync finished**, **data disk filling up**, + **dashboard DB write failing**, **no PPLNS share while donating to XvB** (raffle wins skipped), and + **a node exposed on clearnet** during initial sync — and answer status commands on demand: + **`/status`**, **`/hashrate`**, **`/workers`**, **`/sync`**, **`/system`**, **`/pool`**, + **`/xvb`**, **`/earnings`**, and **`/help`**. 
Off by default; enable it with a `telegram` block in `config.json` (`enabled`, `bot_token`, `chat_id`, per-event `events` toggles, and a `commands.enabled` switch for the interactive half). Every alert is **debounced** so a momentary blip won't ping you and you get one message per real transition — and each is built by *reusing* what the dashboard already computes: diff --git a/build/dashboard/mining_dashboard/config/config.py b/build/dashboard/mining_dashboard/config/config.py index 9f3caf4..a0a9c68 100644 --- a/build/dashboard/mining_dashboard/config/config.py +++ b/build/dashboard/mining_dashboard/config/config.py @@ -264,6 +264,8 @@ def _telegram_event_enabled(name, default=True): "sync_finished": _telegram_event_enabled("sync_finished"), "disk_space": _telegram_event_enabled("disk_space"), "db_unhealthy": _telegram_event_enabled("db_unhealthy"), + "xvb_no_share": _telegram_event_enabled("xvb_no_share"), + "clearnet_exposed": _telegram_event_enabled("clearnet_exposed"), } # Worker offline/online debounce (Issue #121). A worker must be unseen this long before it's diff --git a/build/dashboard/mining_dashboard/service/alert_service.py b/build/dashboard/mining_dashboard/service/alert_service.py index 8ad9cbb..1fc1b7a 100644 --- a/build/dashboard/mining_dashboard/service/alert_service.py +++ b/build/dashboard/mining_dashboard/service/alert_service.py @@ -65,6 +65,8 @@ class AlertService: EVT_SYNC_FINISHED = "sync_finished" EVT_DISK_SPACE = "disk_space" EVT_DB_UNHEALTHY = "db_unhealthy" + EVT_XVB_NO_SHARE = "xvb_no_share" + EVT_CLEARNET_EXPOSED = "clearnet_exposed" # WorkerPresenceMonitor edge -> (event key, message template). 
_WORKER_EDGES = { @@ -85,6 +87,8 @@ def __init__(self, notifier=None, worker_monitor=None, host_label=HOST_IP): self._prev_released = None self._prev_disk_level = None self._prev_db_healthy = None + self._prev_xvb_has_share = None + self._prev_clearnet_active = None @property def enabled(self): @@ -101,6 +105,9 @@ def evaluate( workers_expected, disk_percent=0, db_healthy=True, + xvb_enabled=False, + shares_in_window=0, + clearnet_active=False, now=None, ): """Pure: fold this cycle's signals into the list of ``(event_key, text)`` to send, @@ -144,6 +151,10 @@ def evaluate( alerts += self._disk_edges(disk_percent) alerts += self._db_edges(db_healthy) + # --- Revenue / privacy: XvB PPLNS-share gate, clearnet-sync exposure --- + alerts += self._xvb_share_edges(xvb_enabled, shares_in_window) + alerts += self._clearnet_edges(clearnet_active) + return [(evt, text) for evt, text in alerts if self.notifier.event_enabled(evt)] def _node_edges(self, label, down, attr): @@ -213,6 +224,65 @@ def _db_edges(self, db_healthy): ] return [(self.EVT_DB_UNHEALTHY, self._fmt("\U0001f7e2 Dashboard DB writes recovered."))] + def _xvb_share_edges(self, xvb_enabled, shares_in_window): + """Alert on losing / regaining the PPLNS share XvB needs to bank a raffle win (#158). + + Only meaningful while XvB is on. A donating rig with **no** share in the PPLNS window has + its wins skipped (and accrues a fail) regardless of tier — a make-or-break, revenue-costing + state worth a ping.""" + if not xvb_enabled: + # No XvB → the share gate doesn't apply; drop the baseline so turning XvB back on later + # doesn't replay a stale edge. 
+ self._prev_xvb_has_share = None + return [] + has_share = shares_in_window > 0 + prev = self._prev_xvb_has_share + self._prev_xvb_has_share = has_share + if prev is None or has_share == prev: + return [] + if not has_share: + return [ + ( + self.EVT_XVB_NO_SHARE, + self._fmt( + "⚠ No PPLNS share — XvB raffle wins are skipped until you land one " + "(donations are wasted meanwhile)." + ), + ) + ] + return [ + ( + self.EVT_XVB_NO_SHARE, + self._fmt("\U0001f7e2 PPLNS share restored — XvB raffle wins count again."), + ) + ] + + def _clearnet_edges(self, clearnet_active): + """Alert while a node is doing its initial sync over CLEARNET (#183): the host IP is exposed + to that chain's P2P network until it finishes (it reverts to Tor automatically, #234).""" + prev = self._prev_clearnet_active + self._prev_clearnet_active = clearnet_active + if prev is None or clearnet_active == prev: + return [] + if clearnet_active: + return [ + ( + self.EVT_CLEARNET_EXPOSED, + self._fmt( + "⚠ Clearnet initial sync ACTIVE — this host's IP is exposed to the chain's " + "P2P network until it finishes syncing (reverts to Tor automatically)." + ), + ) + ] + return [ + ( + self.EVT_CLEARNET_EXPOSED, + self._fmt( + "\U0001f7e2 Back on Tor-only — clearnet sync finished, host IP no longer exposed." + ), + ) + ] + def _fmt(self, text): return f"[{self.host_label}] {text}" if self.host_label else text diff --git a/build/dashboard/mining_dashboard/service/data_service.py b/build/dashboard/mining_dashboard/service/data_service.py index 2fdb749..83d23d6 100644 --- a/build/dashboard/mining_dashboard/service/data_service.py +++ b/build/dashboard/mining_dashboard/service/data_service.py @@ -786,6 +786,16 @@ async def run(self): # never raises. disk_usage = get_disk_usage() db_healthy = self.state_manager.is_db_healthy() + # Fetch fresh shares list (also used to populate the UI below) so the PPLNS-share + # gate the XvB alert watches is computed from the same figure the dashboard shows. 
+ shares_list = await asyncio.to_thread(self.state_manager.get_shares) + pool_local = p2pool_stats.get("pool", {}) + pool_type = p2pool_stats.get("p2p", {}).get("type", "Main") + shares_in_window = shares_in_pplns_window( + shares_list, + pool_local.get("pplns_window", DEFAULT_PPLNS_WINDOW), + pplns_block_time(pool_type), + ) await self.alert_service.process( monero_down=monero_down, tari_down=tari_down, @@ -797,11 +807,11 @@ async def run(self): workers_expected=self.miner_released and not self.workers_rejected, disk_percent=(disk_usage or {}).get("percent", 0) or 0, db_healthy=db_healthy, + xvb_enabled=ENABLE_XVB, + shares_in_window=shares_in_window, + clearnet_active=bool(self.clearnet_sync_state.get("active")), ) - # Fetch fresh shares list to populate UI - shares_list = await asyncio.to_thread(self.state_manager.get_shares) - self.latest_data.update( { "workers": final_workers, diff --git a/build/dashboard/mining_dashboard/service/telegram_commands.py b/build/dashboard/mining_dashboard/service/telegram_commands.py index 7055d70..647da1f 100644 --- a/build/dashboard/mining_dashboard/service/telegram_commands.py +++ b/build/dashboard/mining_dashboard/service/telegram_commands.py @@ -11,6 +11,7 @@ TELEGRAM_ENABLED, ) from mining_dashboard.helper.utils import format_duration, format_hashrate +from mining_dashboard.service.earnings import xmr_per_hs_day from mining_dashboard.service.metrics import build_metrics from mining_dashboard.service.telegram_notifier import TELEGRAM_API_BASE @@ -25,7 +26,7 @@ # The commands the bot answers. All are read-only status queries — the bot can never change the # stack (start/stop/apply live on the CLI), so a leaked chat can at worst read status, not act. 
-COMMANDS = ("status", "hashrate", "workers", "sync", "system", "pool", "xvb", "help") +COMMANDS = ("status", "hashrate", "workers", "sync", "system", "pool", "xvb", "earnings", "help") HELP_TEXT = ( "Pithead bot — commands:\n" @@ -36,6 +37,7 @@ "/system — host disk, RAM, CPU, HugePages\n" "/pool — P2Pool sidechain + Monero network\n" "/xvb — XvB mode, tier, and raffle eligibility\n" + "/earnings — estimated P2Pool XMR per day\n" "/help — this message" ) @@ -203,6 +205,26 @@ def format_xvb(metrics, host_label=""): return "\n".join(lines) +def format_earnings(metrics, network, host_label=""): + """Estimated P2Pool XMR earnings — the answer to '/earnings'. Reuses the same rate the dashboard + calculator uses (``xmr_per_hs_day``) applied to the displayed P2Pool 1h-average hashrate; Tari + merge-mining earnings are a separate thing and not included (#12).""" + reward_atomic = (network or {}).get("reward", 0) or 0 + coeff_day = xmr_per_hs_day(reward_atomic, metrics.network_difficulty) + if coeff_day <= 0: + return f"{_prefix(host_label)}\U0001f4b0 Earnings estimate unavailable (waiting on network data)." + daily = coeff_day * metrics.p2pool_1h + return "\n".join( + [ + f"{_prefix(host_label)}\U0001f4b0 Estimated P2Pool earnings", + f"Hashrate (P2Pool 1h): {format_hashrate(metrics.p2pool_1h)}", + f"~{daily:.6f} XMR/day", + f"~{daily * 30:.5f} XMR/30d", + "Estimate only — excludes XvB-donated hashrate and Tari merge-mining.", + ] + ) + + class TelegramCommandBot: """ On-demand Telegram command interface (Issue #45) — the interactive half of the operator bot. 
@@ -288,6 +310,8 @@ def reply_for(self, text): return format_pool(metrics, self.host_label) if cmd == "xvb": return format_xvb(metrics, self.host_label) + if cmd == "earnings": + return format_earnings(metrics, data.get("network", {}), self.host_label) return None async def run(self): diff --git a/build/dashboard/tests/service/test_alert_service.py b/build/dashboard/tests/service/test_alert_service.py index acee804..d1ede6f 100644 --- a/build/dashboard/tests/service/test_alert_service.py +++ b/build/dashboard/tests/service/test_alert_service.py @@ -38,6 +38,9 @@ def _ev( workers_expected=False, disk_percent=0, db_healthy=True, + xvb_enabled=False, + shares_in_window=0, + clearnet_active=False, now=0, ): return svc.evaluate( @@ -49,6 +52,9 @@ def _ev( workers_expected=workers_expected, disk_percent=disk_percent, db_healthy=db_healthy, + xvb_enabled=xvb_enabled, + shares_in_window=shares_in_window, + clearnet_active=clearnet_active, now=now, ) @@ -192,6 +198,38 @@ def test_unhealthy_then_recovered(self): assert "recovered" in text +class TestXvbShareEdges: + def test_no_share_then_restored(self): + svc = _svc() + assert _ev(svc, xvb_enabled=True, shares_in_window=3) == [] # seed: has a share + assert _keys(_ev(svc, xvb_enabled=True, shares_in_window=0)) == [ + AlertService.EVT_XVB_NO_SHARE + ] + assert _ev(svc, xvb_enabled=True, shares_in_window=0) == [] # no repeat + _, text = _ev(svc, xvb_enabled=True, shares_in_window=1)[0] # restored + assert "restored" in text + + def test_silent_while_xvb_disabled(self): + svc = _svc() + # XvB off → the share gate doesn't apply, even with zero shares. + assert _ev(svc, xvb_enabled=False, shares_in_window=0) == [] + # Turning XvB on re-seeds silently (no stale replay), then alerts on a real loss. 
+ assert _ev(svc, xvb_enabled=True, shares_in_window=2) == [] + assert _keys(_ev(svc, xvb_enabled=True, shares_in_window=0)) == [ + AlertService.EVT_XVB_NO_SHARE + ] + + +class TestClearnetEdges: + def test_exposed_then_reverted(self): + svc = _svc() + assert _ev(svc, clearnet_active=False) == [] # seed + assert _keys(_ev(svc, clearnet_active=True)) == [AlertService.EVT_CLEARNET_EXPOSED] + assert _ev(svc, clearnet_active=True) == [] # no repeat + _, text = _ev(svc, clearnet_active=False)[0] + assert "Tor-only" in text + + class TestEventFiltering: def test_disabled_events_are_dropped(self): svc = _svc(notifier=_FakeNotifier(allow={AlertService.EVT_NODE_DOWN})) diff --git a/build/dashboard/tests/service/test_telegram_commands.py b/build/dashboard/tests/service/test_telegram_commands.py index 37b310d..cbd5473 100644 --- a/build/dashboard/tests/service/test_telegram_commands.py +++ b/build/dashboard/tests/service/test_telegram_commands.py @@ -78,6 +78,7 @@ def _metrics(**over): ("/system", "system"), ("/pool", "pool"), ("/xvb", "xvb"), + ("/earnings", "earnings"), ("/status@PitheadBot", "status"), # group @mention suffix stripped ("/workers now please", "workers"), # only the first word matters ("/help", "help"), @@ -206,6 +207,18 @@ def test_xvb_disabled(): assert "disabled" in tc.format_xvb(_metrics(xvb_enabled=False)) +def test_earnings_estimate(): + # network reward present + a real difficulty → a positive daily figure. + out = tc.format_earnings(_metrics(p2pool_1h=8000.0), {"reward": 600_000_000_000}) + assert "XMR/day" in out + assert "XMR/30d" in out + + +def test_earnings_unavailable_without_network_data(): + out = tc.format_earnings(_metrics(), {}) # no reward → coeff 0 + assert "unavailable" in out + + def test_host_label_prefix(): assert tc.format_sync(_metrics(), host_label="rig-box").startswith("[rig-box] ") # The placeholder is never printed. 
@@ -256,6 +269,11 @@ def test_reply_for_pool_and_xvb(monkeypatch): assert "XvB" in bot.reply_for("/xvb") +def test_reply_for_earnings(monkeypatch): + bot = _bot(monkeypatch, latest_data={"network": {"reward": 600_000_000_000}}, p2pool_1h=8000.0) + assert "XMR/day" in bot.reply_for("/earnings") + + # --- enabled gating ----------------------------------------------------------------------- diff --git a/config.reference.json b/config.reference.json index 0298c74..ec545ef 100644 --- a/config.reference.json +++ b/config.reference.json @@ -93,7 +93,9 @@ "worker_left": true, "sync_finished": true, "disk_space": true, - "db_unhealthy": true + "db_unhealthy": true, + "xvb_no_share": true, + "clearnet_exposed": true }, "commands": { "enabled": false diff --git a/docker-compose.yml b/docker-compose.yml index 977e8ab..b112c69 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -441,6 +441,8 @@ services: - TELEGRAM_EVENT_SYNC_FINISHED=${TELEGRAM_EVENT_SYNC_FINISHED:-true} - TELEGRAM_EVENT_DISK_SPACE=${TELEGRAM_EVENT_DISK_SPACE:-true} - TELEGRAM_EVENT_DB_UNHEALTHY=${TELEGRAM_EVENT_DB_UNHEALTHY:-true} + - TELEGRAM_EVENT_XVB_NO_SHARE=${TELEGRAM_EVENT_XVB_NO_SHARE:-true} + - TELEGRAM_EVENT_CLEARNET_EXPOSED=${TELEGRAM_EVENT_CLEARNET_EXPOSED:-true} - TELEGRAM_COMMANDS_ENABLED=${TELEGRAM_COMMANDS_ENABLED:-false} # --- Docker Socket Proxy (read-only) --- diff --git a/docs/telegram.md b/docs/telegram.md index 199b716..f6f16ee 100644 --- a/docs/telegram.md +++ b/docs/telegram.md @@ -30,6 +30,8 @@ transition, not a stream: | ✅ **Sync finished** | The initial blockchain sync completed and mining has started — handy on first run, when the sync can take hours. | | 🟠 **Disk filling up** | The data disk crossed the warn/critical threshold — a full disk can corrupt the Monero database, so free space before it runs out. 
| | 🔴 **DB write failing** | The dashboard can no longer write to its database; hashrate history, shares, and stats will be lost on restart until it's fixed (usually disk space or permissions). | +| ⚠ **No PPLNS share (XvB)** | You're donating to XvB but hold no share in the PPLNS window, so raffle wins are **skipped** — donations are wasted until you land one. Only fires when XvB is enabled. | +| ⚠ **Clearnet sync active** | A node is doing its initial sync over **clearnet**, so this host's IP is exposed to that chain's P2P network until it finishes (it reverts to Tor automatically). | Every message is prefixed with your dashboard hostname (e.g. `[rig-box.lan]`), so if you point more than one stack at the same chat you can tell them apart. @@ -145,6 +147,8 @@ block and set it to `false` — any event you don't list stays on: | `sync_finished` | `true` | Initial sync done, mining started | | `disk_space` | `true` | Data disk filling up / critical / recovered | | `db_unhealthy` | `true` | Dashboard database writes failing / recovered | +| `xvb_no_share` | `true` | XvB on but no PPLNS share (wins skipped) / restored | +| `clearnet_exposed` | `true` | A node is syncing over clearnet (IP exposed) / back on Tor | Run `./pithead apply` after editing. @@ -181,6 +185,7 @@ Run `./pithead apply` after editing. The commands: | `/system` | Host resources: disk, RAM, CPU + load, and HugePages. | | `/pool` | P2Pool sidechain type, pool hashrate, Monero network height, and PPLNS shares in window. | | `/xvb` | XvB mode, current and target tier, hashrate routed to XvB, and raffle eligibility (PPLNS share). | +| `/earnings` | Estimated P2Pool XMR per day/month from your current hashrate (P2Pool only — excludes XvB-donated hashrate and Tari). | | `/help` | The command list. 
| The numbers come from the **same source as the dashboard**, so a reply and the web view always diff --git a/docs/test-inventory.md b/docs/test-inventory.md index 0e80ac4..7add683 100644 --- a/docs/test-inventory.md +++ b/docs/test-inventory.md @@ -4,7 +4,7 @@ _Generated by `make test-inventory` ([`tests/inventory.sh`](../tests/inventory.s edit by hand** — re-run the target to refresh. See [Testing Strategy](testing-strategy.md) for how the tiers fit together._ -**Totals:** 710 dashboard unit tests · 12 contract tests · 64 frontend +**Totals:** 716 dashboard unit tests · 12 contract tests · 64 frontend tests · 52 `pithead` shell sections · 18 harness self-test sections · 9 live config scenarios (17 axis values) · 8 mini-stack scenarios. @@ -14,7 +14,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · | Tier | Suite | Cases | |---|---|---| -| 1 — Unit | dashboard pytest | 710 | +| 1 — Unit | dashboard pytest | 716 | | 1 — Unit | frontend (node --test) | 64 | | 1 — Unit | `pithead` shell suite | 52 sections | | 1 — Unit | compose interpolation + hardening (#90) | 1 | @@ -27,7 +27,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · ## Tier 1 — Unit & component -### Dashboard (pytest) — 710 tests +### Dashboard (pytest) — 716 tests #### tests/client/test_docker_control.py — 6 - test_tcp_scheme_rewritten_to_http @@ -210,7 +210,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_none_when_no_route - test_socket_is_closed_even_on_error -#### tests/service/test_alert_service.py — 20 +#### tests/service/test_alert_service.py — 23 - test_first_cycle_seeds_baseline_silently - test_down_then_recovered - test_node_text_names_the_chain @@ -226,6 +226,9 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_warn_then_critical_then_recover - test_seed_high_does_not_replay - test_unhealthy_then_recovered +- test_no_share_then_restored +- test_silent_while_xvb_disabled +- 
test_exposed_then_reverted - test_disabled_events_are_dropped - test_prefixes_when_set - test_placeholder_host_is_not_prefixed @@ -528,7 +531,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_history_older_than_retention_pruned_from_memory - test_old_history_pruned_from_db_when_cleanup_fires -#### tests/service/test_telegram_commands.py — 34 +#### tests/service/test_telegram_commands.py — 37 - test_parse_command - test_status_active - test_status_syncing_beats_mining_flag @@ -545,12 +548,15 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_xvb_enabled_with_share - test_xvb_no_share_warns - test_xvb_disabled +- test_earnings_estimate +- test_earnings_unavailable_without_network_data - test_host_label_prefix - test_reply_for_help_and_unknown_need_no_metrics - test_reply_for_status_uses_mining_flag - test_reply_for_workers_reads_snapshot - test_reply_for_system_reads_snapshot_without_metrics - test_reply_for_pool_and_xvb +- test_reply_for_earnings - test_disabled_without_token_or_chat - test_run_is_noop_when_disabled - test_handle_update_ignores_foreign_chat @@ -1083,5 +1089,5 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · --- -_Grand total: **873** enumerated cases/sections across the four tiers (plus the live +_Grand total: **879** enumerated cases/sections across the four tiers (plus the live lifecycle and fault-injection phases, which are exercised on a real server)._ diff --git a/pithead b/pithead index bfcc1da..3a7f25a 100755 --- a/pithead +++ b/pithead @@ -2095,6 +2095,7 @@ render_env() { tg_event() { jq -r --arg k "$1" 'if .telegram.events[$k] != null then .telegram.events[$k] | tostring else "true" end' "$CONFIG_FILE"; } local tg_ev_node_down tg_ev_node_recovered tg_ev_worker_offline tg_ev_worker_recovered local tg_ev_worker_joined tg_ev_worker_left tg_ev_sync_finished tg_ev_disk_space tg_ev_db_unhealthy + local tg_ev_xvb_no_share tg_ev_clearnet_exposed 
tg_ev_node_down=$(tg_event node_down) tg_ev_node_recovered=$(tg_event node_recovered) tg_ev_worker_offline=$(tg_event worker_offline) @@ -2104,6 +2105,8 @@ render_env() { tg_ev_sync_finished=$(tg_event sync_finished) tg_ev_disk_space=$(tg_event disk_space) tg_ev_db_unhealthy=$(tg_event db_unhealthy) + tg_ev_xvb_no_share=$(tg_event xvb_no_share) + tg_ev_clearnet_exposed=$(tg_event clearnet_exposed) # Tari memory cap (#55). Tari officially needs only a few GB (min 4 GB host, 8 GB+ recommended), # but its memory grows unbounded over time — one 32 GB host was seen at ~11 GB while staying @@ -2195,6 +2198,8 @@ TELEGRAM_EVENT_WORKER_LEFT=$tg_ev_worker_left TELEGRAM_EVENT_SYNC_FINISHED=$tg_ev_sync_finished TELEGRAM_EVENT_DISK_SPACE=$tg_ev_disk_space TELEGRAM_EVENT_DB_UNHEALTHY=$tg_ev_db_unhealthy +TELEGRAM_EVENT_XVB_NO_SHARE=$tg_ev_xvb_no_share +TELEGRAM_EVENT_CLEARNET_EXPOSED=$tg_ev_clearnet_exposed MONERO_MEM_LIMIT=$monero_mem_limit P2POOL_URL=${NETWORK_PREFIX}.28:3333 NETWORK_SUBNET=$NETWORK_SUBNET From ebc691de707d61b79c393d84083165478085679c Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Thu, 2 Jul 2026 17:26:10 -0500 Subject: [PATCH 06/18] feat(telegram): route over Tor, 'online' heartbeat, more alerts, emoji, egress diagram MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Tor routing (#340): notifier + command bot now dial api.telegram.org through the bridge Tor SOCKS proxy (socks5h), same as Healthchecks/XvB — never leaks the host IP. Command bot swapped from aiohttp to requests-in-a-thread to reuse the SOCKS path (no new dep); getUpdates long-poll runs via asyncio.to_thread. - New alerts (folding in the achievable parts of #339): 'Pithead online' one-shot heartbeat on start, XvB registration rejected/failing/recovered, new-release-available. - Emoji enrichment across all alert messages (⛏️ workers, ⛓️ nodes, 💾 disk, 🗄️ db, 🎰 XvB, 🧅/🌐 clearnet, 🚀 online). 
- Network diagram (docs/architecture.md): now shows each dashboard egress path (Telegram, Healthchecks, XvB stats, GitHub) tagged with its route — all 🟢 Tor. - config.json telegram.events gains xvb_registration/new_release/stack_online (default true); plumbed pithead->compose->config.py; docs updated. Kept as issues (genuinely need new infra): #336 block/payout (no per-node/payout signal), #337 crash-loop (read proxy has no inspect/health), #338 two-way control (auth model). make test green; patch coverage 96%. Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 14 +- .../mining_dashboard/config/config.py | 3 + .../mining_dashboard/service/alert_service.py | 132 +++++++++++++--- .../mining_dashboard/service/data_service.py | 8 + .../service/telegram_commands.py | 79 +++++----- .../service/telegram_notifier.py | 16 +- .../tests/service/test_alert_service.py | 55 ++++++- .../tests/service/test_telegram_commands.py | 142 ++++++++---------- .../tests/service/test_telegram_notifier.py | 12 ++ config.reference.json | 5 +- docker-compose.yml | 3 + docs/architecture.md | 49 ++++-- docs/configuration.md | 2 +- docs/telegram.md | 20 ++- docs/test-inventory.md | 23 ++- pithead | 8 +- 16 files changed, 398 insertions(+), 173 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fe77967..4530ed9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -100,12 +100,14 @@ cd pithead && cp config.json.template config.json # set your Monero + Tari pay silently when offline / Tor down. The URL is the on/off switch and is stored as a secret in the owner-only `.env`. See [`docs/monitoring.md`](docs/monitoring.md) (#79). 
- **Telegram operator bot — push alerts + on-demand status** (#121, #45): the dashboard can push a - high-value set of operational alerts to Telegram — **node down / recovered**, **worker offline / - back online**, **new worker joined / left**, **sync finished**, **data disk filling up**, - **dashboard DB write failing**, **no PPLNS share while donating to XvB** (raffle wins skipped), and - **a node exposed on clearnet** during initial sync — and answer status commands on demand: - **`/status`**, **`/hashrate`**, **`/workers`**, **`/sync`**, **`/system`**, **`/pool`**, - **`/xvb`**, **`/earnings`**, and **`/help`**. Off by default; enable it with a `telegram` block in `config.json` (`enabled`, + high-value set of operational alerts to Telegram — a **🚀 "Pithead online"** heartbeat on start, + **node down / recovered**, **worker offline / back online**, **new worker joined / left**, **sync + finished**, **data disk filling up**, **dashboard DB write failing**, **no PPLNS share while + donating to XvB** (raffle wins skipped), **XvB registration rejected / failing**, **a node exposed + on clearnet** during initial sync, and **a new release being available** — and answer status + commands on demand: **`/status`**, **`/hashrate`**, **`/workers`**, **`/sync`**, **`/system`**, + **`/pool`**, **`/xvb`**, **`/earnings`**, and **`/help`**. All traffic is **routed over Tor** (the + same bridge SOCKS as Healthchecks/XvB), so the bot never exposes the host IP to Telegram. Off by default; enable it with a `telegram` block in `config.json` (`enabled`, `bot_token`, `chat_id`, per-event `events` toggles, and a `commands.enabled` switch for the interactive half). 
Every alert is **debounced** so a momentary blip won't ping you and you get one message per real transition — and each is built by *reusing* what the dashboard already computes: diff --git a/build/dashboard/mining_dashboard/config/config.py b/build/dashboard/mining_dashboard/config/config.py index a0a9c68..32b0b02 100644 --- a/build/dashboard/mining_dashboard/config/config.py +++ b/build/dashboard/mining_dashboard/config/config.py @@ -266,6 +266,9 @@ def _telegram_event_enabled(name, default=True): "db_unhealthy": _telegram_event_enabled("db_unhealthy"), "xvb_no_share": _telegram_event_enabled("xvb_no_share"), "clearnet_exposed": _telegram_event_enabled("clearnet_exposed"), + "xvb_registration": _telegram_event_enabled("xvb_registration"), + "new_release": _telegram_event_enabled("new_release"), + "stack_online": _telegram_event_enabled("stack_online"), } # Worker offline/online debounce (Issue #121). A worker must be unseen this long before it's diff --git a/build/dashboard/mining_dashboard/service/alert_service.py b/build/dashboard/mining_dashboard/service/alert_service.py index 1fc1b7a..c812e3e 100644 --- a/build/dashboard/mining_dashboard/service/alert_service.py +++ b/build/dashboard/mining_dashboard/service/alert_service.py @@ -67,13 +67,16 @@ class AlertService: EVT_DB_UNHEALTHY = "db_unhealthy" EVT_XVB_NO_SHARE = "xvb_no_share" EVT_CLEARNET_EXPOSED = "clearnet_exposed" + EVT_XVB_REGISTRATION = "xvb_registration" + EVT_NEW_RELEASE = "new_release" + EVT_STACK_ONLINE = "stack_online" # WorkerPresenceMonitor edge -> (event key, message template). 
_WORKER_EDGES = { - "offline": (EVT_WORKER_OFFLINE, "\U0001f534 Worker offline: {name}"), - "recovered": (EVT_WORKER_RECOVERED, "\U0001f7e2 Worker back online: {name}"), - "joined": (EVT_WORKER_JOINED, "\U0001f7e2 New worker joined: {name}"), - "left": (EVT_WORKER_LEFT, "⚪ Worker left: {name}"), + "offline": (EVT_WORKER_OFFLINE, "\U0001f534 ⛏️ Worker offline: {name}"), + "recovered": (EVT_WORKER_RECOVERED, "\U0001f7e2 ⛏️ Worker back online: {name}"), + "joined": (EVT_WORKER_JOINED, "\U0001f389 New worker joined: {name}"), + "left": (EVT_WORKER_LEFT, "\U0001f44b Worker left: {name}"), } def __init__(self, notifier=None, worker_monitor=None, host_label=HOST_IP): @@ -89,6 +92,10 @@ def __init__(self, notifier=None, worker_monitor=None, host_label=HOST_IP): self._prev_db_healthy = None self._prev_xvb_has_share = None self._prev_clearnet_active = None + self._prev_xvb_reg = None + self._prev_update_available = None + # One-shot "stack is online" ping, sent on the first cycle after the dashboard starts. 
+ self._announced_online = False @property def enabled(self): @@ -108,12 +115,24 @@ def evaluate( xvb_enabled=False, shares_in_window=0, clearnet_active=False, + xvb_registration_state="", + update_available=False, now=None, ): """Pure: fold this cycle's signals into the list of ``(event_key, text)`` to send, filtered to the events the operator left enabled.""" alerts = [] + # --- Stack online (one-shot on the first cycle after the dashboard starts) --- + if not self._announced_online: + self._announced_online = True + alerts.append( + ( + self.EVT_STACK_ONLINE, + self._fmt("\U0001f680 Pithead is online — dashboard up and monitoring."), + ) + ) + # --- Node down / recovered (consume NodeHealthMonitor edges) --- alerts += self._node_edges("Monero", monero_down, "_prev_monero_down") if tari_required: @@ -155,6 +174,10 @@ def evaluate( alerts += self._xvb_share_edges(xvb_enabled, shares_in_window) alerts += self._clearnet_edges(clearnet_active) + # --- XvB auto-registration health, and a new Pithead release being available --- + alerts += self._registration_edges(xvb_enabled, xvb_registration_state) + alerts += self._release_edges(update_available) + return [(evt, text) for evt, text in alerts if self.notifier.event_enabled(evt)] def _node_edges(self, label, down, attr): @@ -167,14 +190,14 @@ def _node_edges(self, label, down, attr): ( self.EVT_NODE_DOWN, self._fmt( - f"\U0001f534 {label} node is DOWN — workers failing over to backup pools." + f"\U0001f534 ⛓️ {label} node is DOWN — workers failing over to backup pools." ), ) ] return [ ( self.EVT_NODE_RECOVERED, - self._fmt(f"\U0001f7e2 {label} node recovered — workers readmitted."), + self._fmt(f"\U0001f7e2 ⛓️ {label} node recovered — workers readmitted."), ) ] @@ -197,14 +220,24 @@ def _disk_edges(self, disk_percent): ( self.EVT_DISK_SPACE, self._fmt( - f"\U0001f534 Data disk almost full ({pct}) — free space now; a full disk " - "can corrupt the Monero database." 
+ f"\U0001f534 \U0001f4be Data disk almost full ({pct}) — free space now; a " + "full disk can corrupt the Monero database." ), ) ] if level == "warn": - return [(self.EVT_DISK_SPACE, self._fmt(f"\U0001f7e0 Data disk filling up ({pct})."))] - return [(self.EVT_DISK_SPACE, self._fmt(f"\U0001f7e2 Data disk back to healthy ({pct})."))] + return [ + ( + self.EVT_DISK_SPACE, + self._fmt(f"\U0001f7e0 \U0001f4be Data disk filling up ({pct})."), + ) + ] + return [ + ( + self.EVT_DISK_SPACE, + self._fmt(f"\U0001f7e2 \U0001f4be Data disk back to healthy ({pct})."), + ) + ] def _db_edges(self, db_healthy): """Alert when the dashboard can no longer persist to its SQLite DB (#131).""" @@ -217,12 +250,17 @@ def _db_edges(self, db_healthy): ( self.EVT_DB_UNHEALTHY, self._fmt( - "\U0001f534 Dashboard DB write failing — hashrate history, shares and stats " - "won't persist. Check disk space and permissions on the dashboard data dir." + "\U0001f534 \U0001f5c4️ Dashboard DB write failing — hashrate history, shares " + "and stats won't persist. Check disk space + permissions on the data dir." ), ) ] - return [(self.EVT_DB_UNHEALTHY, self._fmt("\U0001f7e2 Dashboard DB writes recovered."))] + return [ + ( + self.EVT_DB_UNHEALTHY, + self._fmt("\U0001f7e2 \U0001f5c4️ Dashboard DB writes recovered."), + ) + ] def _xvb_share_edges(self, xvb_enabled, shares_in_window): """Alert on losing / regaining the PPLNS share XvB needs to bank a raffle win (#158). @@ -245,15 +283,17 @@ def _xvb_share_edges(self, xvb_enabled, shares_in_window): ( self.EVT_XVB_NO_SHARE, self._fmt( - "⚠ No PPLNS share — XvB raffle wins are skipped until you land one " - "(donations are wasted meanwhile)." + "⚠️ \U0001f3b0 No PPLNS share — XvB raffle wins are skipped until you land " + "one (donations are wasted meanwhile)." 
), ) ] return [ ( self.EVT_XVB_NO_SHARE, - self._fmt("\U0001f7e2 PPLNS share restored — XvB raffle wins count again."), + self._fmt( + "\U0001f7e2 \U0001f3b0 PPLNS share restored — XvB raffle wins count again." + ), ) ] @@ -269,8 +309,8 @@ def _clearnet_edges(self, clearnet_active): ( self.EVT_CLEARNET_EXPOSED, self._fmt( - "⚠ Clearnet initial sync ACTIVE — this host's IP is exposed to the chain's " - "P2P network until it finishes syncing (reverts to Tor automatically)." + "⚠️ \U0001f310 Clearnet initial sync ACTIVE — this host's IP is exposed to the " + "chain's P2P network until it finishes syncing (reverts to Tor automatically)." ), ) ] @@ -278,7 +318,61 @@ def _clearnet_edges(self, clearnet_active): ( self.EVT_CLEARNET_EXPOSED, self._fmt( - "\U0001f7e2 Back on Tor-only — clearnet sync finished, host IP no longer exposed." + "\U0001f7e2 \U0001f9c5 Back on Tor-only — clearnet sync finished, host IP no " + "longer exposed." + ), + ) + ] + + def _registration_edges(self, xvb_enabled, state): + """Alert on XvB auto-registration going bad / recovering (#263). ``state`` is one of + ``""`` / ``registered`` / ``invalid`` (wallet rejected — permanent) / ``failing``.""" + if not xvb_enabled: + self._prev_xvb_reg = None + return [] + prev = self._prev_xvb_reg + self._prev_xvb_reg = state + if prev is None or state == prev: + return [] + if state == "invalid": + return [ + ( + self.EVT_XVB_REGISTRATION, + self._fmt( + "\U0001f534 \U0001f3b0 XvB wallet rejected — auto-registration failed " + "(check the payout address); raffle wins won't count." + ), + ) + ] + if state == "failing": + return [ + ( + self.EVT_XVB_REGISTRATION, + self._fmt("⚠️ \U0001f3b0 XvB auto-registration failing — retrying."), + ) + ] + if state == "registered" and prev in ("invalid", "failing"): + return [ + ( + self.EVT_XVB_REGISTRATION, + self._fmt( + "\U0001f7e2 \U0001f3b0 XvB registration recovered — you're in the raffle." 
+ ), + ) + ] + return [] + + def _release_edges(self, update_available): + """One-shot ping when a newer Pithead release becomes available (#224).""" + prev = self._prev_update_available + self._prev_update_available = bool(update_available) + if prev is None or not update_available or update_available == prev: + return [] + return [ + ( + self.EVT_NEW_RELEASE, + self._fmt( + "\U0001f195 A new Pithead release is available — see the dashboard header." ), ) ] diff --git a/build/dashboard/mining_dashboard/service/data_service.py b/build/dashboard/mining_dashboard/service/data_service.py index 83d23d6..adf1792 100644 --- a/build/dashboard/mining_dashboard/service/data_service.py +++ b/build/dashboard/mining_dashboard/service/data_service.py @@ -810,6 +810,14 @@ async def run(self): xvb_enabled=ENABLE_XVB, shares_in_window=shares_in_window, clearnet_active=bool(self.clearnet_sync_state.get("active")), + xvb_registration_state=(self.state_manager.get_xvb_stats() or {}).get( + "registration_state", "" + ), + # From the previous cycle's snapshot (the update check writes it below); a + # one-cycle lag is fine for a one-shot "new release" ping. 
+ update_available=bool( + (self.latest_data.get("update") or {}).get("available") + ), ) self.latest_data.update( diff --git a/build/dashboard/mining_dashboard/service/telegram_commands.py b/build/dashboard/mining_dashboard/service/telegram_commands.py index 647da1f..c09f244 100644 --- a/build/dashboard/mining_dashboard/service/telegram_commands.py +++ b/build/dashboard/mining_dashboard/service/telegram_commands.py @@ -1,7 +1,7 @@ import asyncio import logging -import aiohttp +import requests from mining_dashboard.config.config import ( HOST_IP, @@ -9,6 +9,7 @@ TELEGRAM_CHAT_ID, TELEGRAM_COMMANDS_ENABLED, TELEGRAM_ENABLED, + TOR_SOCKS_PROXY, ) from mining_dashboard.helper.utils import format_duration, format_hashrate from mining_dashboard.service.earnings import xmr_per_hs_day @@ -263,6 +264,7 @@ def __init__( host_label=HOST_IP, api_base=TELEGRAM_API_BASE, long_poll=LONG_POLL_SECONDS, + tor_proxy=TOR_SOCKS_PROXY, ): self.data_service = data_service self._token = (bot_token or "").strip() @@ -272,6 +274,9 @@ def __init__( self.host_label = host_label self._api_base = api_base.rstrip("/") self.long_poll = long_poll + # Route getUpdates + replies over the bridge Tor SOCKS proxy, so polling Telegram never + # exposes the host IP (same discipline as the notifier / Healthchecks pinger). + self._proxies = {"http": tor_proxy, "https": tor_proxy} if tor_proxy else None if enabled is None: enabled = bool(TELEGRAM_ENABLED and TELEGRAM_COMMANDS_ENABLED) self.enabled = bool(enabled and self._token and self.chat_id) @@ -315,51 +320,56 @@ def reply_for(self, text): return None async def run(self): - """Long-poll for commands until cancelled. A no-op when disabled.""" + """Long-poll for commands until cancelled. A no-op when disabled. + + The network calls use ``requests`` (so they ride the same Tor SOCKS proxy as the notifier) + and run off the loop via :func:`asyncio.to_thread`, so a 25s long-poll never blocks it.
+ """ if not self.enabled: return - logger.info("Telegram command interface enabled — polling for commands.") - async with aiohttp.ClientSession() as session: - await self._prime_offset(session) - while True: - try: - updates = await self._get_updates(session, self.long_poll) - except asyncio.CancelledError: - raise - except Exception as exc: - logger.debug("Telegram getUpdates failed (%s)", type(exc).__name__) - await asyncio.sleep(POLL_ERROR_BACKOFF_SECONDS) - continue - for update in updates: - self._offset = update.get("update_id", 0) + 1 - await self._handle_update(session, update) - - async def _prime_offset(self, session): + logger.info("Telegram command interface enabled — polling for commands (over Tor).") + await asyncio.to_thread(self._prime_offset) + while True: + try: + updates = await asyncio.to_thread(self._get_updates, self.long_poll) + except asyncio.CancelledError: + raise + except Exception as exc: + logger.debug("Telegram getUpdates failed (%s)", type(exc).__name__) + await asyncio.sleep(POLL_ERROR_BACKOFF_SECONDS) + continue + for update in updates: + self._offset = update.get("update_id", 0) + 1 + await self._handle_update(update) + + def _prime_offset(self): """Advance the offset past any pending backlog without acting on it, so a command queued while the dashboard was down isn't run on startup.""" try: - updates = await self._get_updates(session, 0) + updates = self._get_updates(0) if updates: self._offset = updates[-1].get("update_id", 0) + 1 except Exception as exc: logger.debug("Telegram offset prime skipped (%s)", type(exc).__name__) - async def _get_updates(self, session, poll_timeout): + def _get_updates(self, poll_timeout): + """Blocking ``getUpdates`` over Tor. 
Called via ``to_thread`` from the loop.""" params = {"timeout": poll_timeout, "allowed_updates": '["message"]'} if self._offset is not None: params["offset"] = self._offset url = f"{self._api_base}/bot{self._token}/getUpdates" - # The client read timeout must outlast Telegram's long-poll hold, or aiohttp aborts the - # request the server is legitimately keeping open. - client_timeout = aiohttp.ClientTimeout(total=poll_timeout + 10) - async with session.get(url, params=params, timeout=client_timeout) as resp: - resp.raise_for_status() - payload = await resp.json() + # The read timeout must outlast Telegram's long-poll hold, or requests aborts the request + # the server is legitimately keeping open; (connect, read) tuple. + resp = requests.get( + url, params=params, timeout=(10, poll_timeout + 10), proxies=self._proxies + ) + resp.raise_for_status() + payload = resp.json() if not payload.get("ok"): return [] return payload.get("result", []) - async def _handle_update(self, session, update): + async def _handle_update(self, update): message = update.get("message") or {} chat = message.get("chat") or {} # Access control: only the configured chat may drive the bot. Anything else is dropped @@ -368,7 +378,7 @@ async def _handle_update(self, session, update): return reply = await asyncio.to_thread(self._safe_reply_for, message.get("text", "")) if reply: - await self._send(session, reply) + await asyncio.to_thread(self._send, reply) def _safe_reply_for(self, text): """Never let a formatting/read bug kill the poll loop — a broken command just goes quiet.""" @@ -378,14 +388,13 @@ def _safe_reply_for(self, text): logger.debug("Telegram command handling failed (%s)", type(exc).__name__) return None - async def _send(self, session, text): + def _send(self, text): + """Blocking reply over Tor. 
Called via ``to_thread``.""" url = f"{self._api_base}/bot{self._token}/sendMessage" payload = {"chat_id": self.chat_id, "text": text, "disable_web_page_preview": True} try: - async with session.post( - url, json=payload, timeout=aiohttp.ClientTimeout(total=10) - ) as resp: - resp.raise_for_status() + resp = requests.post(url, json=payload, timeout=10, proxies=self._proxies) + resp.raise_for_status() except Exception as exc: - # Log only the exception type — a requests/aiohttp error can embed the token-bearing URL. + # Log only the exception type — a requests error can embed the token-bearing URL. logger.debug("Telegram reply failed (%s)", type(exc).__name__) diff --git a/build/dashboard/mining_dashboard/service/telegram_notifier.py b/build/dashboard/mining_dashboard/service/telegram_notifier.py index 2ebb59d..510a972 100644 --- a/build/dashboard/mining_dashboard/service/telegram_notifier.py +++ b/build/dashboard/mining_dashboard/service/telegram_notifier.py @@ -2,6 +2,8 @@ import requests +from mining_dashboard.config.config import TOR_SOCKS_PROXY + logger = logging.getLogger("TelegramNotifier") # Telegram Bot API base. Overridable in tests so we never touch the network. @@ -23,9 +25,12 @@ class TelegramNotifier: off, so :meth:`send` is a silent no-op rather than an error on every cycle. - **Per-event toggles.** ``events`` gates which alert kinds are delivered, so an operator can enable Telegram and still silence the ones they find noisy. - - **Fail silent.** Any network error (offline host, Tor-only egress with no clearnet, - Telegram unreachable) is swallowed and logged at debug — an alerter must never crash - the data loop or spam ERROR for the very condition it exists to report (#59 discipline). + - **Always over Tor.** Sends ride the bridge Tor SOCKS proxy (``socks5h``, so the DNS lookup + goes through Tor too), so Telegram sees a Tor exit, not the host IP — never a clearnet beacon, + matching the Healthchecks.io pinger (#79) and the XvB fetch (#163). 
+ - **Fail silent.** Any network error (offline host, Tor down, Telegram unreachable / blocking a + Tor exit) is swallowed and logged at debug — an alerter must never crash the data loop or spam + ERROR for the very condition it exists to report (#59 discipline). - **Never logs the token.** ``bot_token`` is a secret; it only ever appears in the request URL and is never written to a log line — not even inside an exception message (which for ``requests`` would otherwise include the full URL). @@ -39,6 +44,7 @@ def __init__( events=None, timeout=10, api_base=TELEGRAM_API_BASE, + tor_proxy=TOR_SOCKS_PROXY, ): self.bot_token = (bot_token or "").strip() # chat_id may be a negative integer (Telegram group ids look like -1001234567890); @@ -47,6 +53,9 @@ def __init__( self.events = dict(events or {}) self.timeout = timeout self._api_base = api_base.rstrip("/") + # Route over the bridge Tor SOCKS proxy so the host IP is never exposed to Telegram. + # tor_proxy is a test seam; the default wires the configured proxy. 
+ self._proxies = {"http": tor_proxy, "https": tor_proxy} if tor_proxy else None self.enabled = bool(enabled and self.bot_token and self.chat_id) if enabled and not self.enabled: @@ -75,6 +84,7 @@ def send(self, text): "disable_web_page_preview": True, }, timeout=self.timeout, + proxies=self._proxies, ) resp.raise_for_status() return True diff --git a/build/dashboard/tests/service/test_alert_service.py b/build/dashboard/tests/service/test_alert_service.py index d1ede6f..a0c6476 100644 --- a/build/dashboard/tests/service/test_alert_service.py +++ b/build/dashboard/tests/service/test_alert_service.py @@ -20,11 +20,16 @@ def send(self, text): return True -def _svc(notifier=None, **kw): +def _svc(notifier=None, announce_online=True, **kw): notifier = notifier if notifier is not None else _FakeNotifier() kw.setdefault("worker_monitor", WorkerPresenceMonitor(offline_after=300, recovery_after=120)) kw.setdefault("host_label", "") - return AlertService(notifier=notifier, **kw) + svc = AlertService(notifier=notifier, **kw) + # The one-shot "stack online" ping fires on the first evaluate; mark it already sent so it + # doesn't perturb the per-signal tests. TestStackOnline opts out to exercise it. 
+ if announce_online: + svc._announced_online = True + return svc def _ev( @@ -41,6 +46,8 @@ def _ev( xvb_enabled=False, shares_in_window=0, clearnet_active=False, + xvb_registration_state="", + update_available=False, now=0, ): return svc.evaluate( @@ -55,6 +62,8 @@ def _ev( xvb_enabled=xvb_enabled, shares_in_window=shares_in_window, clearnet_active=clearnet_active, + xvb_registration_state=xvb_registration_state, + update_available=update_available, now=now, ) @@ -230,6 +239,48 @@ def test_exposed_then_reverted(self): assert "Tor-only" in text +class TestStackOnline: + def test_online_fires_once_on_first_cycle(self): + svc = _svc(announce_online=False) + assert _keys(_ev(svc)) == [AlertService.EVT_STACK_ONLINE] + assert _ev(svc) == [] # one-shot — not on later cycles + + def test_online_text_is_friendly(self): + svc = _svc(announce_online=False) + _, text = _ev(svc)[0] + assert "online" in text.lower() + + +class TestXvbRegistration: + def test_invalid_then_recovered(self): + svc = _svc() + assert _ev(svc, xvb_enabled=True, xvb_registration_state="registered") == [] # seed + assert _keys(_ev(svc, xvb_enabled=True, xvb_registration_state="invalid")) == [ + AlertService.EVT_XVB_REGISTRATION + ] + assert _keys(_ev(svc, xvb_enabled=True, xvb_registration_state="registered")) == [ + AlertService.EVT_XVB_REGISTRATION + ] + + def test_failing_alerts(self): + svc = _svc() + _ev(svc, xvb_enabled=True, xvb_registration_state="registered") + _, text = _ev(svc, xvb_enabled=True, xvb_registration_state="failing")[0] + assert "failing" in text.lower() + + def test_silent_while_disabled(self): + svc = _svc() + assert _ev(svc, xvb_enabled=False, xvb_registration_state="invalid") == [] + + +class TestNewRelease: + def test_fires_once_on_rising_edge(self): + svc = _svc() + assert _ev(svc, update_available=False) == [] # seed + assert _keys(_ev(svc, update_available=True)) == [AlertService.EVT_NEW_RELEASE] + assert _ev(svc, update_available=True) == [] # no repeat while still 
available + + class TestEventFiltering: def test_disabled_events_are_dropped(self): svc = _svc(notifier=_FakeNotifier(allow={AlertService.EVT_NODE_DOWN})) diff --git a/build/dashboard/tests/service/test_telegram_commands.py b/build/dashboard/tests/service/test_telegram_commands.py index cbd5473..3f86e49 100644 --- a/build/dashboard/tests/service/test_telegram_commands.py +++ b/build/dashboard/tests/service/test_telegram_commands.py @@ -298,133 +298,121 @@ async def test_run_is_noop_when_disabled(): async def test_handle_update_ignores_foreign_chat(monkeypatch): bot = _bot(monkeypatch) sent = [] - - async def _record(session, text): - sent.append(text) - - monkeypatch.setattr(bot, "_send", _record) + monkeypatch.setattr(bot, "_send", sent.append) # _send is sync now (run via to_thread) # chat_id 999 != configured 42 → dropped, nothing sent. - await bot._handle_update(None, {"message": {"chat": {"id": 999}, "text": "/help"}}) + await bot._handle_update({"message": {"chat": {"id": 999}, "text": "/help"}}) assert sent == [] async def test_handle_update_replies_to_configured_chat(monkeypatch): bot = _bot(monkeypatch) sent = [] - - async def _record(session, text): - sent.append(text) - - monkeypatch.setattr(bot, "_send", _record) - await bot._handle_update(None, {"message": {"chat": {"id": 42}, "text": "/help"}}) + monkeypatch.setattr(bot, "_send", sent.append) + await bot._handle_update({"message": {"chat": {"id": 42}, "text": "/help"}}) assert len(sent) == 1 and "/status" in sent[0] -# --- transport (stubbed aiohttp session) -------------------------------------------------- +# --- transport (stubbed requests, over Tor) ----------------------------------------------- class _Resp: - """Minimal stand-in for an aiohttp response context manager.""" + """Minimal stand-in for a requests.Response.""" - def __init__(self, payload=None, raise_on_enter=None, raise_on_status=False): + def __init__(self, payload=None, raise_status=False): self._payload = payload or {} - 
self._raise_on_enter = raise_on_enter - self._raise_on_status = raise_on_status - - async def __aenter__(self): - if self._raise_on_enter: - raise self._raise_on_enter - return self - - async def __aexit__(self, *exc): - return False + self._raise = raise_status def raise_for_status(self): - if self._raise_on_status: + if self._raise: raise RuntimeError("http error") - async def json(self): + def json(self): return self._payload -class _Session: - """Stub aiohttp session: hands back queued responses and records calls.""" - - def __init__(self, gets=None, posts=None): - self._gets = list(gets or []) - self._posts = list(posts or []) - self.get_calls = [] - self.post_calls = [] - - def get(self, url, params=None, timeout=None): - self.get_calls.append((url, params)) - return self._gets.pop(0) - - def post(self, url, json=None, timeout=None): - self.post_calls.append((url, json)) - return self._posts.pop(0) - - -def _make_bot(): +def _make_bot(tor_proxy="socks5h://tor:9050"): ds = SimpleNamespace(latest_data={}, state_manager=object()) - return tc.TelegramCommandBot(ds, enabled=True, bot_token="tok", chat_id="42") + return tc.TelegramCommandBot( + ds, enabled=True, bot_token="tok", chat_id="42", tor_proxy=tor_proxy + ) -async def test_get_updates_parses_results_and_sends_offset(): +def test_get_updates_parses_results_over_tor(monkeypatch): bot = _make_bot() bot._offset = 7 - session = _Session(gets=[_Resp({"ok": True, "result": [{"update_id": 8}]})]) - result = await bot._get_updates(session, 0) - assert result == [{"update_id": 8}] - url, params = session.get_calls[0] - assert "bottok" in url and params["offset"] == 7 # token in URL, offset forwarded + seen = {} + + def fake_get(url, params=None, timeout=None, proxies=None): + seen.update(url=url, params=params, proxies=proxies) + return _Resp({"ok": True, "result": [{"update_id": 8}]}) + + monkeypatch.setattr(tc.requests, "get", fake_get) + assert bot._get_updates(0) == [{"update_id": 8}] + assert "bottok" in 
seen["url"] and seen["params"]["offset"] == 7 # token + offset forwarded + assert seen["proxies"] == {"http": "socks5h://tor:9050", "https": "socks5h://tor:9050"} -async def test_get_updates_not_ok_returns_empty(): +def test_get_updates_not_ok_returns_empty(monkeypatch): bot = _make_bot() - session = _Session(gets=[_Resp({"ok": False})]) - assert await bot._get_updates(session, 0) == [] + monkeypatch.setattr(tc.requests, "get", lambda *a, **k: _Resp({"ok": False})) + assert bot._get_updates(0) == [] -async def test_prime_offset_skips_backlog(): +def test_prime_offset_skips_backlog(monkeypatch): bot = _make_bot() - session = _Session(gets=[_Resp({"ok": True, "result": [{"update_id": 3}, {"update_id": 9}]})]) - await bot._prime_offset(session) + monkeypatch.setattr( + tc.requests, + "get", + lambda *a, **k: _Resp({"ok": True, "result": [{"update_id": 3}, {"update_id": 9}]}), + ) + bot._prime_offset() assert bot._offset == 10 # past the last pending update -async def test_prime_offset_swallows_error(): +def test_prime_offset_swallows_error(monkeypatch): bot = _make_bot() - session = _Session(gets=[_Resp(raise_on_enter=OSError("offline"))]) - await bot._prime_offset(session) # must not raise + + def boom(*a, **k): + raise OSError("offline") + + monkeypatch.setattr(tc.requests, "get", boom) + bot._prime_offset() # must not raise assert bot._offset is None -async def test_send_posts_message(): +def test_send_posts_over_tor(monkeypatch): bot = _make_bot() - session = _Session(posts=[_Resp({"ok": True})]) - await bot._send(session, "hi") - url, body = session.post_calls[0] - assert "bottok" in url and body["chat_id"] == "42" and body["text"] == "hi" + seen = {} + def fake_post(url, json=None, timeout=None, proxies=None): + seen.update(url=url, body=json, proxies=proxies) + return _Resp({"ok": True}) -async def test_send_swallows_network_error(): + monkeypatch.setattr(tc.requests, "post", fake_post) + bot._send("hi") + assert ( + "bottok" in seen["url"] and 
seen["body"]["chat_id"] == "42" and seen["body"]["text"] == "hi" + ) + assert seen["proxies"]["https"] == "socks5h://tor:9050" + + +def test_send_swallows_network_error(monkeypatch): bot = _make_bot() - session = _Session(posts=[_Resp(raise_on_status=True)]) - await bot._send(session, "hi") # must not raise + monkeypatch.setattr(tc.requests, "post", lambda *a, **k: _Resp(raise_status=True)) + bot._send("hi") # must not raise async def test_run_processes_update_then_honours_cancel(monkeypatch): bot = _make_bot() - monkeypatch.setattr(bot, "_prime_offset", _async_noop) + monkeypatch.setattr(bot, "_prime_offset", lambda: None) handled = [] - async def _fake_handle(session, update): + async def _fake_handle(update): handled.append(update) calls = {"n": 0} - async def _fake_get(session, poll_timeout): + def _fake_get(poll_timeout): calls["n"] += 1 if calls["n"] == 1: return [{"update_id": 1}] @@ -439,14 +427,14 @@ async def _fake_get(session, poll_timeout): async def test_run_backs_off_on_poll_error(monkeypatch): bot = _make_bot() - monkeypatch.setattr(bot, "_prime_offset", _async_noop) + monkeypatch.setattr(bot, "_prime_offset", lambda: None) slept = [] async def _sleep(secs): slept.append(secs) raise asyncio.CancelledError # break out after the first backoff - async def _boom(session, poll_timeout): + def _boom(poll_timeout): raise OSError("telegram unreachable") monkeypatch.setattr(tc.asyncio, "sleep", _sleep) @@ -454,7 +442,3 @@ async def _boom(session, poll_timeout): with pytest.raises(asyncio.CancelledError): await bot.run() assert slept == [tc.POLL_ERROR_BACKOFF_SECONDS] - - -async def _async_noop(*args, **kwargs): - return None diff --git a/build/dashboard/tests/service/test_telegram_notifier.py b/build/dashboard/tests/service/test_telegram_notifier.py index 77147c4..8072c08 100644 --- a/build/dashboard/tests/service/test_telegram_notifier.py +++ b/build/dashboard/tests/service/test_telegram_notifier.py @@ -53,6 +53,18 @@ def test_send_posts_to_bot_api(self): 
assert body["chat_id"] == "123" assert body["text"] == "node down" + def test_send_routes_over_tor(self): + # The bot dial must ride the Tor SOCKS proxy, never leak the host IP to Telegram. + n = _enabled(tor_proxy="socks5h://tor:9050") + resp = MagicMock() + resp.raise_for_status = MagicMock() + with patch.object(tg_mod.requests, "post", return_value=resp) as post: + n.send("x") + assert post.call_args.kwargs["proxies"] == { + "http": "socks5h://tor:9050", + "https": "socks5h://tor:9050", + } + def test_send_swallows_network_error(self): n = _enabled() with patch.object( diff --git a/config.reference.json b/config.reference.json index ec545ef..f08be91 100644 --- a/config.reference.json +++ b/config.reference.json @@ -95,7 +95,10 @@ "disk_space": true, "db_unhealthy": true, "xvb_no_share": true, - "clearnet_exposed": true + "clearnet_exposed": true, + "xvb_registration": true, + "new_release": true, + "stack_online": true }, "commands": { "enabled": false diff --git a/docker-compose.yml b/docker-compose.yml index b112c69..adfcc74 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -443,6 +443,9 @@ services: - TELEGRAM_EVENT_DB_UNHEALTHY=${TELEGRAM_EVENT_DB_UNHEALTHY:-true} - TELEGRAM_EVENT_XVB_NO_SHARE=${TELEGRAM_EVENT_XVB_NO_SHARE:-true} - TELEGRAM_EVENT_CLEARNET_EXPOSED=${TELEGRAM_EVENT_CLEARNET_EXPOSED:-true} + - TELEGRAM_EVENT_XVB_REGISTRATION=${TELEGRAM_EVENT_XVB_REGISTRATION:-true} + - TELEGRAM_EVENT_NEW_RELEASE=${TELEGRAM_EVENT_NEW_RELEASE:-true} + - TELEGRAM_EVENT_STACK_ONLINE=${TELEGRAM_EVENT_STACK_ONLINE:-true} - TELEGRAM_COMMANDS_ENABLED=${TELEGRAM_COMMANDS_ENABLED:-false} # --- Docker Socket Proxy (read-only) --- diff --git a/docs/architecture.md b/docs/architecture.md index 263b53d..37d480b 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -28,9 +28,14 @@ flowchart TB %% ── External actors ── You(["👤 You · Browser"]) Workers(["⛏️ XMRig Workers"]) - XvB(["🎲 XMRvsBeast Pool"]) Net(["🌐 Tor Network / Internet"]) + %% ── External 
services the dashboard calls out to (each labeled with its route) ── + Telegram(["✈️ Telegram<br/>
alerts + commands"]) + HC(["🩺 Healthchecks.io<br/>
dead-man's switch"]) + XvB(["🎲 XMRvsBeast<br/>
pool + stats"]) + GitHub(["🐙 GitHub<br/>
release check"]) + subgraph stack ["🐳 Pithead"] direction TB @@ -52,27 +57,40 @@ flowchart TB Caddy --> Dashboard Workers ==>|"Stratum 3333"| Proxy + %% Dashboard internal control + monitoring (never leaves the box) Dashboard -.->|controls| Proxy Dashboard -.->|monitors| DockerProxy Dashboard -.->|"reads stats & sync"| core + %% ── Dashboard egress — every outbound call is routed through Tor (🟢), so none leak the host IP ── + Dashboard ==>|"🚨 alerts + commands · 🟢 Tor"| Tor + Dashboard ==>|"🩺 liveness ping · 🟢 Tor"| Tor + Dashboard ==>|"📈 XvB stats · 🟢 Tor"| Tor + Dashboard ==>|"🆕 update check · 🟢 Tor"| Tor + Proxy ==>|hashrate| P2Pool - Proxy ==>|hashrate| XvB + Proxy ==>|"hashrate · 🟢 Tor"| Tor P2Pool <-->|"RPC / ZMQ"| Monerod P2Pool -->|merge-mine| Tari - Monerod <-->|tx broadcast| Tor - Tari <-->|P2P| Tor - P2Pool <-->|P2P| Tor + Monerod <-->|"tx + P2P · 🟢 Tor"| Tor + Tari <-->|"P2P · 🟢 Tor"| Tor + P2Pool <-->|"P2P · 🟢 Tor"| Tor Tor <--> Net + %% Tor exit reaches each external service + Net -.-> Telegram + Net -.-> HC + Net -.-> XvB + Net -.-> GitHub + classDef ext fill:#1e293b,stroke:#64748b,color:#e2e8f0; classDef ctrl fill:#1d4ed8,stroke:#93c5fd,color:#eff6ff; classDef priv fill:#6d28d9,stroke:#c4b5fd,color:#f5f3ff; classDef mine fill:#047857,stroke:#6ee7b7,color:#ecfdf5; - class You,Workers,XvB,Net ext; + class You,Workers,Net,Telegram,HC,XvB,GitHub ext; class Caddy,Dashboard ctrl; class Tor,DockerProxy priv; class Proxy,P2Pool,Monerod,Tari mine; @@ -81,11 +99,20 @@ flowchart TB style core stroke:#10b981,stroke-width:1px,stroke-dasharray:5 4; ``` -Reading the diagram: thick arrows carry mining hashrate and inbound connections, dotted arrows are -the dashboard's control and monitoring, and solid arrows are internal service data and anonymized -network traffic. Node colors group services by role: 🟦 control plane (Caddy, Dashboard), 🟪 privacy -and isolation (Tor, Docker socket proxy), and 🟩 the mining core. 
In remote-node mode the bundled 🟠 -Monero node isn't started, and P2Pool talks to your external node instead. +Reading the diagram: thick arrows carry inbound connections and every path that **leaves the box** — +each egress edge is tagged with its route, and **🟢 Tor** means it exits through the Tor daemon (a Tor +exit IP, never your host's). Dotted arrows are the dashboard's internal control and monitoring, which +never leave the machine. The dashboard makes four outbound calls — the **Telegram** bot (alerts + +commands), the **Healthchecks.io** liveness ping, the **XvB** stats fetch, and the **GitHub** release +check — and all four are Tor-routed, so enabling any of them never reveals where your stack runs. Node +colors group services by role: 🟦 control plane (Caddy, Dashboard), 🟪 privacy and isolation (Tor, +Docker socket proxies), and 🟩 the mining core. In remote-node mode the bundled 🟠 Monero node isn't +started, and P2Pool talks to your external node instead. + +> The one exception is **optional clearnet initial sync** (`monero.clearnet_initial_sync` / +> `tari.clearnet_initial_sync`, default **off**): while active, that node's P2P leaves Tor to sync +> faster and its IP is exposed until it finishes, after which it reverts to Tor automatically (#234). +> The Telegram bot alerts you the whole time it's exposed. See [Privacy](privacy.md). ## Privacy by design diff --git a/docs/configuration.md b/docs/configuration.md index e2562ab..8687dfb 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -105,7 +105,7 @@ plain HTTP, edit `config.json` and run `./pithead apply`. | `telegram.enabled` | `false` | Push operational alerts (node down/recovered, worker offline/back, sync finished) to Telegram. Off by default. Requires `bot_token` + `chat_id` to actually send. Full walkthrough: [Telegram Bot](telegram.md). | | `telegram.bot_token` | `""` | Your BotFather bot token. A secret — stored owner-only in `.env`, git-ignored, and never logged. 
Get one from [@BotFather](https://t.me/BotFather). | | `telegram.chat_id` | `""` | Where alerts are sent and the only chat the command interface answers. A Telegram group id (negative, e.g. `-1001234567890`) or a personal chat id. See [how to find it](telegram.md#3-find-your-chat-id). | -| `telegram.events.*` | all `true` | Per-event toggles: `node_down`, `node_recovered`, `worker_offline`, `worker_recovered`, `sync_finished`. Each defaults to on once Telegram is enabled; set one `false` to silence just that alert. | +| `telegram.events.*` | all `true` | Per-event toggles: `stack_online`, `node_down`, `node_recovered`, `worker_offline`, `worker_recovered`, `worker_joined`, `worker_left`, `sync_finished`, `disk_space`, `db_unhealthy`, `xvb_no_share`, `xvb_registration`, `clearnet_exposed`, `new_release`. Each defaults to on once Telegram is enabled; set one `false` to silence just that alert. Full list: [Telegram Bot](telegram.md#choosing-which-alerts-you-get). | | `telegram.commands.enabled` | `false` | Turn on the interactive command interface — the bot answers `/status`, `/hashrate`, `/workers`, `/sync`, and `/help` from the configured `chat_id` (every other chat is ignored). Off by default; alerts work without it. Long-polls over Tor, so it needs no inbound port. See [Telegram › Commands](telegram.md#commands). | --- diff --git a/docs/telegram.md b/docs/telegram.md index f6f16ee..065ceed 100644 --- a/docs/telegram.md +++ b/docs/telegram.md @@ -32,6 +32,9 @@ transition, not a stream: | 🔴 **DB write failing** | The dashboard can no longer write to its database; hashrate history, shares, and stats will be lost on restart until it's fixed (usually disk space or permissions). | | ⚠ **No PPLNS share (XvB)** | You're donating to XvB but hold no share in the PPLNS window, so raffle wins are **skipped** — donations are wasted until you land one. Only fires when XvB is enabled. 
| | ⚠ **Clearnet sync active** | A node is doing its initial sync over **clearnet**, so this host's IP is exposed to that chain's P2P network until it finishes (it reverts to Tor automatically). | +| 🎰 **XvB registration** | XvB auto-registration was rejected (bad payout address) or is failing — raffle wins won't count until it recovers. Only fires when XvB is enabled. | +| 🆕 **New release** | A newer Pithead release is available (the same signal as the dashboard header badge). | +| 🚀 **Pithead online** | Sent once when the dashboard starts — a heartbeat that the stack is up (and confirms the bot works after setup). | Every message is prefixed with your dashboard hostname (e.g. `[rig-box.lan]`), so if you point more than one stack at the same chat you can tell them apart. @@ -149,6 +152,9 @@ block and set it to `false` — any event you don't list stays on: | `db_unhealthy` | `true` | Dashboard database writes failing / recovered | | `xvb_no_share` | `true` | XvB on but no PPLNS share (wins skipped) / restored | | `clearnet_exposed` | `true` | A node is syncing over clearnet (IP exposed) / back on Tor | +| `xvb_registration` | `true` | XvB auto-registration rejected / failing / recovered | +| `new_release` | `true` | A newer Pithead release is available | +| `stack_online` | `true` | One-shot "dashboard is up" heartbeat on start | Run `./pithead apply` after editing. @@ -235,12 +241,12 @@ differently.) - **The bot token is a secret.** Pithead stores it in `.env`, which is created **owner-only** (`chmod 600`) and is **git-ignored**, exactly like the Monero node RPC password. The dashboard **never writes the token to a log line** — not even inside an error message. -- **Telegram is a clearnet service.** The dashboard reaches `api.telegram.org` directly — both to - send alerts and (if commands are on) to poll for them. 
On a **Tor-only host with no clearnet - egress**, the Telegram API is unreachable and both **fail silently** — no errors, no log spam, the - rest of the stack is unaffected. (Same applies if your network blocks Telegram.) If you run - Tor-only and want these, you'll need clearnet egress for the dashboard, or rely on Healthchecks.io's - own delivery. +- **Always over Tor.** Both the alert sends and the command long-poll reach `api.telegram.org` + **through the bundled Tor SOCKS proxy** (`socks5h`, so the DNS lookup goes through Tor too) — the + same routing as the Healthchecks.io pinger and the XvB fetch. Telegram sees a **Tor exit, not your + host IP**, so enabling the bot doesn't expose where your stack runs. If Tor is momentarily down (or + Telegram is blocking that exit), sends and polls **fail silently** — no errors, no log spam, the + rest of the stack is unaffected — and resume on their own. --- @@ -269,7 +275,7 @@ Node-down timing is shared with the existing failover logic (`NODE_DOWN_AFTER_SE | Works for "down" but not a specific alert | Check `telegram.events` — that event may be toggled `false`. | | Alerts work but commands don't | Commands are a **separate** switch: set `telegram.commands.enabled` to `true` and `./pithead apply`. | | Bot ignores my commands | It only answers the configured `chat_id`. Send from that exact chat, and check the id with `@userinfobot`. | -| Tor-only host | Expected: Telegram is clearnet, so both alerts and command polling fail silently. See [Privacy and secrets](#privacy-and-secrets). | +| No messages, Tor issues | Telegram is reached **over Tor**; if Tor is down or Telegram is blocking the exit, sends/polls fail silently and resume on their own. See [Privacy and secrets](#privacy-and-secrets). 
| --- diff --git a/docs/test-inventory.md b/docs/test-inventory.md index 7add683..674b260 100644 --- a/docs/test-inventory.md +++ b/docs/test-inventory.md @@ -4,7 +4,7 @@ _Generated by `make test-inventory` ([`tests/inventory.sh`](../tests/inventory.s edit by hand** — re-run the target to refresh. See [Testing Strategy](testing-strategy.md) for how the tiers fit together._ -**Totals:** 716 dashboard unit tests · 12 contract tests · 64 frontend +**Totals:** 723 dashboard unit tests · 12 contract tests · 64 frontend tests · 52 `pithead` shell sections · 18 harness self-test sections · 9 live config scenarios (17 axis values) · 8 mini-stack scenarios. @@ -14,7 +14,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · | Tier | Suite | Cases | |---|---|---| -| 1 — Unit | dashboard pytest | 716 | +| 1 — Unit | dashboard pytest | 723 | | 1 — Unit | frontend (node --test) | 64 | | 1 — Unit | `pithead` shell suite | 52 sections | | 1 — Unit | compose interpolation + hardening (#90) | 1 | @@ -27,7 +27,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · ## Tier 1 — Unit & component -### Dashboard (pytest) — 716 tests +### Dashboard (pytest) — 723 tests #### tests/client/test_docker_control.py — 6 - test_tcp_scheme_rewritten_to_http @@ -210,7 +210,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_none_when_no_route - test_socket_is_closed_even_on_error -#### tests/service/test_alert_service.py — 23 +#### tests/service/test_alert_service.py — 29 - test_first_cycle_seeds_baseline_silently - test_down_then_recovered - test_node_text_names_the_chain @@ -229,6 +229,12 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_no_share_then_restored - test_silent_while_xvb_disabled - test_exposed_then_reverted +- test_online_fires_once_on_first_cycle +- test_online_text_is_friendly +- test_invalid_then_recovered +- test_failing_alerts +- test_silent_while_disabled +- 
test_fires_once_on_rising_edge - test_disabled_events_are_dropped - test_prefixes_when_set - test_placeholder_host_is_not_prefixed @@ -561,22 +567,23 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_run_is_noop_when_disabled - test_handle_update_ignores_foreign_chat - test_handle_update_replies_to_configured_chat -- test_get_updates_parses_results_and_sends_offset +- test_get_updates_parses_results_over_tor - test_get_updates_not_ok_returns_empty - test_prime_offset_skips_backlog - test_prime_offset_swallows_error -- test_send_posts_message +- test_send_posts_over_tor - test_send_swallows_network_error - test_run_processes_update_then_honours_cancel - test_run_backs_off_on_poll_error -#### tests/service/test_telegram_notifier.py — 9 +#### tests/service/test_telegram_notifier.py — 10 - test_disabled_by_default - test_enabled_requires_token_and_chat - test_enabled_flag_off_disables_even_with_creds - test_event_enabled_respects_toggle_and_enabled - test_send_noop_when_disabled - test_send_posts_to_bot_api +- test_send_routes_over_tor - test_send_swallows_network_error - test_send_swallows_http_error - test_token_never_logged_on_failure @@ -1089,5 +1096,5 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · --- -_Grand total: **879** enumerated cases/sections across the four tiers (plus the live +_Grand total: **886** enumerated cases/sections across the four tiers (plus the live lifecycle and fault-injection phases, which are exercised on a real server)._ diff --git a/pithead b/pithead index 3a7f25a..7e02732 100755 --- a/pithead +++ b/pithead @@ -2095,7 +2095,7 @@ render_env() { tg_event() { jq -r --arg k "$1" 'if .telegram.events[$k] != null then .telegram.events[$k] | tostring else "true" end' "$CONFIG_FILE"; } local tg_ev_node_down tg_ev_node_recovered tg_ev_worker_offline tg_ev_worker_recovered local tg_ev_worker_joined tg_ev_worker_left tg_ev_sync_finished tg_ev_disk_space tg_ev_db_unhealthy - local 
tg_ev_xvb_no_share tg_ev_clearnet_exposed + local tg_ev_xvb_no_share tg_ev_clearnet_exposed tg_ev_xvb_registration tg_ev_new_release tg_ev_stack_online tg_ev_node_down=$(tg_event node_down) tg_ev_node_recovered=$(tg_event node_recovered) tg_ev_worker_offline=$(tg_event worker_offline) @@ -2107,6 +2107,9 @@ render_env() { tg_ev_db_unhealthy=$(tg_event db_unhealthy) tg_ev_xvb_no_share=$(tg_event xvb_no_share) tg_ev_clearnet_exposed=$(tg_event clearnet_exposed) + tg_ev_xvb_registration=$(tg_event xvb_registration) + tg_ev_new_release=$(tg_event new_release) + tg_ev_stack_online=$(tg_event stack_online) # Tari memory cap (#55). Tari officially needs only a few GB (min 4 GB host, 8 GB+ recommended), # but its memory grows unbounded over time — one 32 GB host was seen at ~11 GB while staying @@ -2200,6 +2203,9 @@ TELEGRAM_EVENT_DISK_SPACE=$tg_ev_disk_space TELEGRAM_EVENT_DB_UNHEALTHY=$tg_ev_db_unhealthy TELEGRAM_EVENT_XVB_NO_SHARE=$tg_ev_xvb_no_share TELEGRAM_EVENT_CLEARNET_EXPOSED=$tg_ev_clearnet_exposed +TELEGRAM_EVENT_XVB_REGISTRATION=$tg_ev_xvb_registration +TELEGRAM_EVENT_NEW_RELEASE=$tg_ev_new_release +TELEGRAM_EVENT_STACK_ONLINE=$tg_ev_stack_online MONERO_MEM_LIMIT=$monero_mem_limit P2POOL_URL=${NETWORK_PREFIX}.28:3333 NETWORK_SUBNET=$NETWORK_SUBNET From a956277a8ca7115a1bfb96af01555d280869aee0 Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Thu, 2 Jul 2026 19:28:55 -0500 Subject: [PATCH 07/18] test(#121/#45): guard the event-set surfaces + fill Telegram coverage gaps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Testing to standard (docs/testing-strategy.md — test each behaviour once, at the lowest honest tier): - tier-1 pytest: assert AlertService.EVT_* == config.py TELEGRAM_EVENTS (adding an alert but forgetting its toggle, or vice versa, now fails a test). 
- tier-1 shell (run.sh): assert every telegram.events key in config.reference.json renders into .env AND is declared in docker-compose.yml — guards the config surface the pytest can't see (14 events × 2, +28 assertions). - Fill real branches: process() swallowing an evaluate() error (the never-break-the- loop guard), reply_for /hashrate + /sync, _safe_reply_for's error path, the benign XvB-registration transition. alert_service + telegram_commands now 99%. Docs reviewed against docs/STYLE.md (voice + code-accuracy) — no changes needed. make test green; patch coverage 98%; test-inventory regenerated. Co-Authored-By: Claude Opus 4.8 --- .../tests/service/test_alert_service.py | 34 +++++++++++++++++++ .../tests/service/test_telegram_commands.py | 19 +++++++++++ docs/test-inventory.md | 17 ++++++---- tests/stack/run.sh | 14 ++++++++ 4 files changed, 78 insertions(+), 6 deletions(-) diff --git a/build/dashboard/tests/service/test_alert_service.py b/build/dashboard/tests/service/test_alert_service.py index a0c6476..57e51b0 100644 --- a/build/dashboard/tests/service/test_alert_service.py +++ b/build/dashboard/tests/service/test_alert_service.py @@ -1,7 +1,17 @@ +from mining_dashboard.config.config import TELEGRAM_EVENTS from mining_dashboard.service.alert_service import AlertService from mining_dashboard.service.worker_presence import WorkerPresenceMonitor +def test_every_alert_event_has_a_config_toggle(): + # The canonical event set (AlertService.EVT_*) must line up 1:1 with the per-event toggles in + # config.py TELEGRAM_EVENTS — so adding an alert but forgetting its toggle (or vice versa) fails + # here instead of silently shipping an un-toggleable / dead event. The config-surface side + # (config.reference.json, docker-compose.yml, pithead render) is guarded in tests/stack/run.sh. 
+ evt_values = {v for k, v in vars(AlertService).items() if k.startswith("EVT_")} + assert evt_values == set(TELEGRAM_EVENTS) + + class _FakeNotifier: """Stand-in transport: records sends, lets tests gate which events are 'enabled'.""" @@ -272,6 +282,12 @@ def test_silent_while_disabled(self): svc = _svc() assert _ev(svc, xvb_enabled=False, xvb_registration_state="invalid") == [] + def test_benign_transition_is_silent(self): + # A change that isn't into invalid/failing (nor recovering from one) doesn't alert. + svc = _svc() + _ev(svc, xvb_enabled=True, xvb_registration_state="registered") # seed + assert _ev(svc, xvb_enabled=True, xvb_registration_state="") == [] + class TestNewRelease: def test_fires_once_on_rising_edge(self): @@ -341,3 +357,21 @@ async def test_enabled_notifier_dispatches(self): ) assert _keys(out) == [AlertService.EVT_NODE_DOWN] assert len(notifier.sent) == 1 and "DOWN" in notifier.sent[0] + + async def test_process_swallows_evaluate_error(self, monkeypatch): + # A bug in evaluate() must never break the data loop — process() catches, logs, returns []. 
+ svc = _svc(notifier=_FakeNotifier()) + + def boom(**_kw): + raise RuntimeError("kaboom") + + monkeypatch.setattr(svc, "evaluate", boom) + out = await svc.process( + monero_down=True, + tari_down=False, + tari_required=True, + miner_released=True, + workers=[], + workers_expected=False, + ) + assert out == [] diff --git a/build/dashboard/tests/service/test_telegram_commands.py b/build/dashboard/tests/service/test_telegram_commands.py index 3f86e49..0b135d2 100644 --- a/build/dashboard/tests/service/test_telegram_commands.py +++ b/build/dashboard/tests/service/test_telegram_commands.py @@ -274,6 +274,25 @@ def test_reply_for_earnings(monkeypatch): assert "XMR/day" in bot.reply_for("/earnings") +def test_reply_for_hashrate_and_sync(monkeypatch): + workers = [{"name": "z", "status": "online", "h15": 1000}] + bot = _bot(monkeypatch, latest_data={"workers": workers}) + assert "Hashrate" in bot.reply_for("/hashrate") + assert "Sync status" in bot.reply_for("/sync") + + +def test_safe_reply_for_swallows_errors(monkeypatch): + # A formatting/read bug in reply_for must never kill the poll loop — it just goes quiet. + ds = SimpleNamespace(latest_data={}, state_manager=object()) + bot = tc.TelegramCommandBot(ds, enabled=True, bot_token="t", chat_id="1") + + def boom(_text): + raise RuntimeError("kaboom") + + monkeypatch.setattr(bot, "reply_for", boom) + assert bot._safe_reply_for("/status") is None + + # --- enabled gating ----------------------------------------------------------------------- diff --git a/docs/test-inventory.md b/docs/test-inventory.md index 674b260..bfc7350 100644 --- a/docs/test-inventory.md +++ b/docs/test-inventory.md @@ -4,7 +4,7 @@ _Generated by `make test-inventory` ([`tests/inventory.sh`](../tests/inventory.s edit by hand** — re-run the target to refresh. 
See [Testing Strategy](testing-strategy.md) for how the tiers fit together._ -**Totals:** 723 dashboard unit tests · 12 contract tests · 64 frontend +**Totals:** 728 dashboard unit tests · 12 contract tests · 64 frontend tests · 52 `pithead` shell sections · 18 harness self-test sections · 9 live config scenarios (17 axis values) · 8 mini-stack scenarios. @@ -14,7 +14,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · | Tier | Suite | Cases | |---|---|---| -| 1 — Unit | dashboard pytest | 723 | +| 1 — Unit | dashboard pytest | 728 | | 1 — Unit | frontend (node --test) | 64 | | 1 — Unit | `pithead` shell suite | 52 sections | | 1 — Unit | compose interpolation + hardening (#90) | 1 | @@ -27,7 +27,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · ## Tier 1 — Unit & component -### Dashboard (pytest) — 723 tests +### Dashboard (pytest) — 728 tests #### tests/client/test_docker_control.py — 6 - test_tcp_scheme_rewritten_to_http @@ -210,7 +210,8 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_none_when_no_route - test_socket_is_closed_even_on_error -#### tests/service/test_alert_service.py — 29 +#### tests/service/test_alert_service.py — 32 +- test_every_alert_event_has_a_config_toggle - test_first_cycle_seeds_baseline_silently - test_down_then_recovered - test_node_text_names_the_chain @@ -234,12 +235,14 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_invalid_then_recovered - test_failing_alerts - test_silent_while_disabled +- test_benign_transition_is_silent - test_fires_once_on_rising_edge - test_disabled_events_are_dropped - test_prefixes_when_set - test_placeholder_host_is_not_prefixed - test_disabled_notifier_is_noop - test_enabled_notifier_dispatches +- test_process_swallows_evaluate_error #### tests/service/test_algo_service.py — 38 - test_xvb_disabled_forces_p2pool @@ -537,7 +540,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections 
· - test_history_older_than_retention_pruned_from_memory - test_old_history_pruned_from_db_when_cleanup_fires -#### tests/service/test_telegram_commands.py — 37 +#### tests/service/test_telegram_commands.py — 39 - test_parse_command - test_status_active - test_status_syncing_beats_mining_flag @@ -563,6 +566,8 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_reply_for_system_reads_snapshot_without_metrics - test_reply_for_pool_and_xvb - test_reply_for_earnings +- test_reply_for_hashrate_and_sync +- test_safe_reply_for_swallows_errors - test_disabled_without_token_or_chat - test_run_is_noop_when_disabled - test_handle_update_ignores_foreign_chat @@ -1096,5 +1101,5 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · --- -_Grand total: **886** enumerated cases/sections across the four tiers (plus the live +_Grand total: **891** enumerated cases/sections across the four tiers (plus the live lifecycle and fault-injection phases, which are exercised on a real server)._ diff --git a/tests/stack/run.sh b/tests/stack/run.sh index cb42baa..b42b702 100755 --- a/tests/stack/run.sh +++ b/tests/stack/run.sh @@ -1547,6 +1547,20 @@ printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","n out="$(cd "$V" && DOCKER_LOG="$DOCKER_LOG" PATH="$V/bin:$PATH" ./pithead apply -y 2>&1)" assert_eq "telegram commands opt-in propagated" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_COMMANDS_ENABLED)" "true" +# Event-set consistency (#121/#45): every telegram.events.* key in config.reference.json must be +# rendered by pithead into .env AND declared in docker-compose.yml — so adding an alert event in one +# surface but forgetting another fails here. (The Python side — AlertService.EVT_* vs config.py's +# TELEGRAM_EVENTS — is guarded by a dashboard unit test.) The .env above has all events at their +# default (no events overrides in that config), so each should render "true". 
+compose_text="$(cat "$ROOT/docker-compose.yml")" +while IFS= read -r ev; do + up=$(printf '%s' "$ev" | tr '[:lower:]' '[:upper:]') + assert_eq "telegram event '$ev' rendered to .env" \ + "$(run_sourced "$V" env_get_file "$V/.env" "TELEGRAM_EVENT_$up")" "true" + assert_contains "telegram event '$ev' declared in docker-compose.yml" \ + "$compose_text" "TELEGRAM_EVENT_$up=" +done < <(jq -r '.telegram.events | keys[]' "$ROOT/config.reference.json") + # An explicit tari.mem_limit is passed through verbatim (overriding the "auto" host-RAM scaling). seed_env printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","node_password":"p"}, "tari":{"wallet_address":"T","mem_limit":"3072m"}, "p2pool":{"pool":"mini"}, "dashboard":{"secure":false,"host":"box.lan"} }\n' "$WALLET" >"$V/config.json" From 6f2cd46a97e9156d9127480acb777febc4ed4f6a Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Fri, 3 Jul 2026 00:08:59 -0500 Subject: [PATCH 08/18] feat(telegram): daily status digest + alert-wiring test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Daily summary: a once-a-day status roll-up (nodes/mining/workers/hashrate/shares/ disk) pushed at a configurable local time. telegram.daily_summary_time (default 08:00) + a daily_summary event toggle (default on). Fires once/day at the target; a post-time restart waits for the next day rather than replaying; malformed time disables it. Uses the dashboard container's timezone. Built lazily (only when due) from the same build_metrics the dashboard renders. - Config plumbed config.json -> pithead render -> compose -> config.py; describe_change + the run.sh event-consistency loop + a time-propagation test cover the surfaces. - Wiring test (tier 1): asserts DataService.run() hands AlertService the full signal contract each cycle + calls maybe_daily_summary — closes the one automatable e2e gap (the alert LOGIC was fully unit-tested; the loop->alerter wiring wasn't asserted). 
make test green; patch coverage 97%; docs (telegram.md/configuration.md/CHANGELOG) updated. Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 6 +- .../mining_dashboard/config/config.py | 8 ++ .../mining_dashboard/service/alert_service.py | 62 ++++++++++++++- .../mining_dashboard/service/data_service.py | 12 +++ .../service/telegram_commands.py | 23 ++++++ .../tests/service/test_alert_service.py | 76 +++++++++++++++++++ .../tests/service/test_data_service.py | 72 ++++++++++++++++++ .../tests/service/test_telegram_commands.py | 13 ++++ config.reference.json | 4 +- docker-compose.yml | 2 + docs/configuration.md | 3 +- docs/telegram.md | 2 + docs/test-inventory.md | 21 +++-- pithead | 9 +++ tests/stack/run.sh | 7 ++ 15 files changed, 308 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4530ed9..49f7868 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -106,8 +106,10 @@ cd pithead && cp config.json.template config.json # set your Monero + Tari pay donating to XvB** (raffle wins skipped), **XvB registration rejected / failing**, **a node exposed on clearnet** during initial sync, and **a new release being available** — and answer status commands on demand: **`/status`**, **`/hashrate`**, **`/workers`**, **`/sync`**, **`/system`**, - **`/pool`**, **`/xvb`**, **`/earnings`**, and **`/help`**. All traffic is **routed over Tor** (the - same bridge SOCKS as Healthchecks/XvB), so the bot never exposes the host IP to Telegram. Off by default; enable it with a `telegram` block in `config.json` (`enabled`, + **`/pool`**, **`/xvb`**, **`/earnings`**, and **`/help`**. It also pushes a **📅 once-a-day status + digest** at a configurable local time (`telegram.daily_summary_time`, default **08:00**). All + traffic is **routed over Tor** (the same bridge SOCKS as Healthchecks/XvB), so the bot never + exposes the host IP to Telegram. 
Off by default; enable it with a `telegram` block in `config.json` (`enabled`, `bot_token`, `chat_id`, per-event `events` toggles, and a `commands.enabled` switch for the interactive half). Every alert is **debounced** so a momentary blip won't ping you and you get one message per real transition — and each is built by *reusing* what the dashboard already computes: diff --git a/build/dashboard/mining_dashboard/config/config.py b/build/dashboard/mining_dashboard/config/config.py index 32b0b02..be625fb 100644 --- a/build/dashboard/mining_dashboard/config/config.py +++ b/build/dashboard/mining_dashboard/config/config.py @@ -269,7 +269,15 @@ def _telegram_event_enabled(name, default=True): "xvb_registration": _telegram_event_enabled("xvb_registration"), "new_release": _telegram_event_enabled("new_release"), "stack_online": _telegram_event_enabled("stack_online"), + "daily_summary": _telegram_event_enabled("daily_summary"), } +# ponytail: daily_summary is a scheduled push, not an edge — it lives in the events dict only so it +# gets a per-event on/off toggle like the rest; its time is TELEGRAM_DAILY_SUMMARY_TIME below. + +# Local time (HH:MM, 24-hour) to push the once-daily status digest, when the daily_summary event is +# on. Uses the dashboard container's timezone (dashboard.timezone), so "08:00" means 8am wherever +# the box is. Rendered from config.json telegram.daily_summary_time. +TELEGRAM_DAILY_SUMMARY_TIME = os.environ.get("TELEGRAM_DAILY_SUMMARY_TIME", "08:00").strip() # Worker offline/online debounce (Issue #121). 
A worker must be unseen this long before it's # reported OFFLINE, and seen continuously this long before "back online" — so a brief miner diff --git a/build/dashboard/mining_dashboard/service/alert_service.py b/build/dashboard/mining_dashboard/service/alert_service.py index c812e3e..6cac83c 100644 --- a/build/dashboard/mining_dashboard/service/alert_service.py +++ b/build/dashboard/mining_dashboard/service/alert_service.py @@ -1,5 +1,6 @@ import asyncio import logging +import time from mining_dashboard.config.config import ( DISK_CRITICAL_PERCENT, @@ -7,6 +8,7 @@ HOST_IP, TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID, + TELEGRAM_DAILY_SUMMARY_TIME, TELEGRAM_ENABLED, TELEGRAM_EVENTS, ) @@ -26,6 +28,19 @@ def build_default_notifier(): ) +def _parse_hhmm(value): + """Parse a 'HH:MM' 24-hour string to minutes-since-midnight, or None if malformed (which + disables the daily digest rather than guessing a time).""" + try: + hh, mm = (value or "").strip().split(":") + h, m = int(hh), int(mm) + if 0 <= h < 24 and 0 <= m < 60: + return h * 60 + m + except (ValueError, AttributeError): + pass + return None + + class AlertService: """ Turns the data loop's per-cycle signals into a small set of debounced operator alerts and @@ -70,6 +85,7 @@ class AlertService: EVT_XVB_REGISTRATION = "xvb_registration" EVT_NEW_RELEASE = "new_release" EVT_STACK_ONLINE = "stack_online" + EVT_DAILY_SUMMARY = "daily_summary" # WorkerPresenceMonitor edge -> (event key, message template). 
_WORKER_EDGES = { @@ -79,9 +95,20 @@ class AlertService: "left": (EVT_WORKER_LEFT, "\U0001f44b Worker left: {name}"), } - def __init__(self, notifier=None, worker_monitor=None, host_label=HOST_IP): + def __init__( + self, + notifier=None, + worker_monitor=None, + host_label=HOST_IP, + daily_time=TELEGRAM_DAILY_SUMMARY_TIME, + ): self.notifier = notifier if notifier is not None else build_default_notifier() self.workers = worker_monitor if worker_monitor is not None else WorkerPresenceMonitor() + # Once-daily digest: target local minute-of-day (HH:MM → h*60+m), and the day we last sent + # (so it fires once per day). A malformed time disables it. + self._daily_target_min = _parse_hhmm(daily_time) + self._daily_last = None + self._daily_seeded = False # "Unknown Host" is config.py's placeholder when HOST_IP isn't set — don't prefix with it. self.host_label = "" if host_label in (None, "", "Unknown Host") else host_label # None = "not yet observed": the first cycle seeds the baseline without emitting. @@ -394,3 +421,36 @@ async def process(self, **signals): for _evt, text in alerts: await asyncio.to_thread(self.notifier.send, text) return alerts + + async def maybe_daily_summary(self, now, summary_provider): + """Push a once-daily status digest at the configured local time. + + ``summary_provider()`` builds the digest text and is called **only when a send is actually + due**, so it isn't run every cycle. No-op when the ``daily_summary`` event is off, the time + is malformed, or the digest has already gone out today. On a startup that's already past + today's time it waits for tomorrow rather than firing a stale digest immediately. Returns the + text sent (handy for tests), else ``None``. 
+ """ + if self._daily_target_min is None or not self.notifier.event_enabled( + self.EVT_DAILY_SUMMARY + ): + return None + lt = time.localtime(now) + today = (lt.tm_year, lt.tm_yday) + now_min = lt.tm_hour * 60 + lt.tm_min + if not self._daily_seeded: + self._daily_seeded = True + # Started after today's send time → don't replay it now; wait for tomorrow. + if now_min >= self._daily_target_min: + self._daily_last = today + if self._daily_last == today or now_min < self._daily_target_min: + return None + self._daily_last = today + try: + text = summary_provider() + except Exception as exc: # a bad summary build must not wedge the loop + logger.debug("Daily summary build failed (%s)", type(exc).__name__) + return None + if text: + await asyncio.to_thread(self.notifier.send, text) + return text diff --git a/build/dashboard/mining_dashboard/service/data_service.py b/build/dashboard/mining_dashboard/service/data_service.py index adf1792..878f84c 100644 --- a/build/dashboard/mining_dashboard/service/data_service.py +++ b/build/dashboard/mining_dashboard/service/data_service.py @@ -32,6 +32,7 @@ CLEARNET_STATE_DIR, ENABLE_XVB, GITHUB_RELEASES_API, + HOST_IP, MONERO_CLEARNET_SYNC, REJECT_WORKERS_CONTAINER, SYNC_GATE_CONTAINERS, @@ -51,7 +52,9 @@ from mining_dashboard.service.alert_service import AlertService from mining_dashboard.service.clearnet_sync import ClearnetSyncSupervisor from mining_dashboard.service.healthchecks import HealthchecksClient +from mining_dashboard.service.metrics import build_metrics from mining_dashboard.service.node_health import NodeHealthMonitor +from mining_dashboard.service.telegram_commands import format_daily_summary from mining_dashboard.service.update_checker import GitHubReleaseClient, UpdateChecker logger = logging.getLogger("DataService") @@ -819,6 +822,15 @@ async def run(self): (self.latest_data.get("update") or {}).get("available") ), ) + # Once-daily status digest (built lazily, only when a send is actually due). 
+ await self.alert_service.maybe_daily_summary( + time.time(), + lambda: format_daily_summary( + build_metrics(self.latest_data, self.state_manager), + self.latest_data, + HOST_IP, + ), + ) self.latest_data.update( { diff --git a/build/dashboard/mining_dashboard/service/telegram_commands.py b/build/dashboard/mining_dashboard/service/telegram_commands.py index c09f244..6a6f9ed 100644 --- a/build/dashboard/mining_dashboard/service/telegram_commands.py +++ b/build/dashboard/mining_dashboard/service/telegram_commands.py @@ -226,6 +226,29 @@ def format_earnings(metrics, network, host_label=""): ) +def format_daily_summary(metrics, data, host_label=""): + """The once-a-day status digest pushed by the alerter — an at-a-glance roll-up built from the + same domain values as /status. ``data`` is the latest snapshot (for the disk figure + the + mining-active flag the metrics layer doesn't carry).""" + mining = bool(data.get("miner_released") and not data.get("workers_rejected")) + disk = (data.get("system", {}) or {}).get("disk", {}) or {} + lines = [ + f"{_prefix(host_label)}\U0001f4c5 Daily summary", + f"⛓️ Monero: {_node_state(metrics.monero)} · Tari: {_node_state(metrics.tari)}", + ] + if metrics.global_syncing: + lines.append("⛏️ Mining: ⏳ holding — chain(s) syncing") + elif mining: + lines.append(f"⛏️ Mining: \U0001f7e2 active ({metrics.mode})") + else: + lines.append("⛏️ Mining: \U0001f534 not mining") + lines.append(f"\U0001f477 Workers: {metrics.workers_online}/{metrics.workers_total} online") + lines.append(f"⚡ Hashrate: {format_hashrate(metrics.total_h15)}") + lines.append(f"\U0001f3b0 PPLNS shares: {metrics.shares_in_window} in window") + lines.append(f"\U0001f4be Disk: {disk.get('percent_str', 'n/a')} used") + return "\n".join(lines) + + class TelegramCommandBot: """ On-demand Telegram command interface (Issue #45) — the interactive half of the operator bot. 
diff --git a/build/dashboard/tests/service/test_alert_service.py b/build/dashboard/tests/service/test_alert_service.py index 57e51b0..2ca1645 100644 --- a/build/dashboard/tests/service/test_alert_service.py +++ b/build/dashboard/tests/service/test_alert_service.py @@ -1,3 +1,6 @@ +from types import SimpleNamespace + +import mining_dashboard.service.alert_service as alert_mod from mining_dashboard.config.config import TELEGRAM_EVENTS from mining_dashboard.service.alert_service import AlertService from mining_dashboard.service.worker_presence import WorkerPresenceMonitor @@ -375,3 +378,76 @@ def boom(**_kw): workers_expected=False, ) assert out == [] + + +def _fake_localtime(hour, minute, yday=100, year=2026): + """A time.localtime stand-in with just the fields maybe_daily_summary reads.""" + return lambda _now: SimpleNamespace(tm_year=year, tm_yday=yday, tm_hour=hour, tm_min=minute) + + +def _daily_svc(daily_time="08:00", notifier=None): + notifier = notifier if notifier is not None else _FakeNotifier() + return AlertService( + notifier=notifier, + worker_monitor=WorkerPresenceMonitor(), + host_label="", + daily_time=daily_time, + ) + + +class TestDailySummary: + async def test_fires_at_target_once_per_day(self, monkeypatch): + n = _FakeNotifier() + svc = _daily_svc(notifier=n) + prov = lambda: "digest" # noqa: E731 + # Before the target time → nothing. + monkeypatch.setattr(alert_mod.time, "localtime", _fake_localtime(7, 59)) + assert await svc.maybe_daily_summary(0, prov) is None + # At the target → fires once. + monkeypatch.setattr(alert_mod.time, "localtime", _fake_localtime(8, 0)) + assert await svc.maybe_daily_summary(0, prov) == "digest" + assert n.sent == ["digest"] + # Later the same day → no repeat. + monkeypatch.setattr(alert_mod.time, "localtime", _fake_localtime(9, 30)) + assert await svc.maybe_daily_summary(0, prov) is None + # Next day at the target → fires again. 
+ monkeypatch.setattr(alert_mod.time, "localtime", _fake_localtime(8, 0, yday=101)) + assert await svc.maybe_daily_summary(0, prov) == "digest" + assert n.sent == ["digest", "digest"] + + async def test_late_start_waits_for_next_day(self, monkeypatch): + svc = _daily_svc() + prov = lambda: "digest" # noqa: E731 + # First observation is already past 08:00 → don't replay today. + monkeypatch.setattr(alert_mod.time, "localtime", _fake_localtime(10, 0)) + assert await svc.maybe_daily_summary(0, prov) is None + monkeypatch.setattr(alert_mod.time, "localtime", _fake_localtime(23, 0)) + assert await svc.maybe_daily_summary(0, prov) is None + # Next day at 08:00 → fires. + monkeypatch.setattr(alert_mod.time, "localtime", _fake_localtime(8, 0, yday=101)) + assert await svc.maybe_daily_summary(0, prov) == "digest" + + async def test_malformed_time_disables(self, monkeypatch): + svc = _daily_svc(daily_time="not-a-time") + monkeypatch.setattr(alert_mod.time, "localtime", _fake_localtime(8, 0)) + assert await svc.maybe_daily_summary(0, lambda: "x") is None + + async def test_gated_off_by_event_toggle(self, monkeypatch): + # daily_summary not in the allow-set → the notifier reports it disabled. + svc = _daily_svc(notifier=_FakeNotifier(allow=set())) + monkeypatch.setattr(alert_mod.time, "localtime", _fake_localtime(8, 0)) + assert await svc.maybe_daily_summary(0, lambda: "x") is None + + async def test_provider_error_is_swallowed_and_marks_day_done(self, monkeypatch): + n = _FakeNotifier() + svc = _daily_svc(notifier=n) + + def boom(): + raise RuntimeError("bad build") + + monkeypatch.setattr(alert_mod.time, "localtime", _fake_localtime(8, 0)) + assert await svc.maybe_daily_summary(0, boom) is None + assert n.sent == [] + # Marked done for today even though the build failed → no retry storm. 
+ monkeypatch.setattr(alert_mod.time, "localtime", _fake_localtime(8, 5)) + assert await svc.maybe_daily_summary(0, lambda: "digest") is None diff --git a/build/dashboard/tests/service/test_data_service.py b/build/dashboard/tests/service/test_data_service.py index 34b7eec..4e57c76 100644 --- a/build/dashboard/tests/service/test_data_service.py +++ b/build/dashboard/tests/service/test_data_service.py @@ -768,6 +768,78 @@ async def test_run_holds_miner_while_syncing(self): assert svc.miner_released is False assert svc.latest_data["miner_held"] is True + async def test_run_wires_computed_signals_into_the_alerter(self): + # Wiring guard: the unit tests prove each signal → the right alert in isolation; this proves + # the data loop actually hands the alerter the full contract each cycle (so a dropped/renamed + # kwarg, or forgetting the daily-summary call, fails here rather than silently going dark). + svc, sm, proxy = _make_service() + sm.is_db_healthy.return_value = True + proxy.get_workers.return_value = {"workers": []} + svc._apply_worker_rejection = AsyncMock() + svc.alert_service = MagicMock() + svc.alert_service.process = AsyncMock() + svc.alert_service.maybe_daily_summary = AsyncMock() + + worker_client = MagicMock() + worker_client.get_stats = AsyncMock(return_value={}) + tari_client = MagicMock() + tari_client.get_sync_status = AsyncMock( + return_value={"is_syncing": False, "reachable": True} + ) + tari_client.close = AsyncMock() + + with ( + patch.object(ds_mod, "ClientSession", _FakeClientSession), + patch.object(ds_mod, "XMRigWorkerClient", return_value=worker_client), + patch.object(ds_mod, "TariClient", return_value=tari_client), + patch.object(ds_mod, "get_stratum_stats", return_value={}), + patch.object(ds_mod, "get_network_stats", return_value={"height": 100}), + patch.object(ds_mod, "get_tari_stats", return_value={"active": True, "height": 3}), + patch.object( + ds_mod, + "get_p2pool_stats", + return_value={"pool": {"last_share_time": 0, 
"difficulty": 0}}, + ), + patch.object( + ds_mod, + "get_monero_sync_status", + AsyncMock(return_value={"is_syncing": False, "reachable": True}), + ), + patch.object(ds_mod, "get_disk_usage", return_value={"percent": 42}), + patch.object(ds_mod, "get_hugepages_status", return_value=("Enabled", "ok", "1/2")), + patch.object(ds_mod, "get_memory_usage", return_value={}), + patch.object(ds_mod, "get_load_average", return_value="0"), + patch.object(ds_mod, "get_cpu_usage", return_value="0%"), + patch("asyncio.sleep", AsyncMock(side_effect=StopAsyncIteration)), + ): + with pytest.raises(StopAsyncIteration): + await svc.run() + + svc.alert_service.process.assert_awaited_once() + kw = svc.alert_service.process.await_args.kwargs + # The full signal contract the AlertService.evaluate() unit tests rely on. + assert set(kw) >= { + "monero_down", + "tari_down", + "tari_required", + "miner_released", + "workers", + "workers_expected", + "disk_percent", + "db_healthy", + "xvb_enabled", + "shares_in_window", + "clearnet_active", + "xvb_registration_state", + "update_available", + } + # ...sourced from the real computed values, not placeholders. + assert kw["db_healthy"] is True # from state_manager.is_db_healthy() + assert kw["disk_percent"] == 42 # from get_disk_usage() + assert isinstance(kw["workers"], list) + # The once-daily digest is wired in too. + svc.alert_service.maybe_daily_summary.assert_awaited_once() + async def test_run_releases_despite_height_override(self): # Both nodes are synced per their RPC/gRPC, but p2pool is held so its stats file is # empty → get_network_stats height 0 trips the UI "syncing" override. 
The gate must diff --git a/build/dashboard/tests/service/test_telegram_commands.py b/build/dashboard/tests/service/test_telegram_commands.py index 0b135d2..fc9cf3c 100644 --- a/build/dashboard/tests/service/test_telegram_commands.py +++ b/build/dashboard/tests/service/test_telegram_commands.py @@ -219,6 +219,19 @@ def test_earnings_unavailable_without_network_data(): assert "unavailable" in out +def test_daily_summary_rolls_up_status(): + data = { + "miner_released": True, + "workers_rejected": False, + "system": {"disk": {"percent_str": "42%"}}, + } + out = tc.format_daily_summary(_metrics(mode="P2POOL", workers_online=3, workers_total=3), data) + assert "Daily summary" in out + assert "Mining: 🟢 active (P2POOL)" in out + assert "Workers: 3/3 online" in out + assert "Disk: 42% used" in out + + def test_host_label_prefix(): assert tc.format_sync(_metrics(), host_label="rig-box").startswith("[rig-box] ") # The placeholder is never printed. diff --git a/config.reference.json b/config.reference.json index f08be91..c03fd1f 100644 --- a/config.reference.json +++ b/config.reference.json @@ -98,8 +98,10 @@ "clearnet_exposed": true, "xvb_registration": true, "new_release": true, - "stack_online": true + "stack_online": true, + "daily_summary": true }, + "daily_summary_time": "08:00", "commands": { "enabled": false } diff --git a/docker-compose.yml b/docker-compose.yml index adfcc74..869c900 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -446,6 +446,8 @@ services: - TELEGRAM_EVENT_XVB_REGISTRATION=${TELEGRAM_EVENT_XVB_REGISTRATION:-true} - TELEGRAM_EVENT_NEW_RELEASE=${TELEGRAM_EVENT_NEW_RELEASE:-true} - TELEGRAM_EVENT_STACK_ONLINE=${TELEGRAM_EVENT_STACK_ONLINE:-true} + - TELEGRAM_EVENT_DAILY_SUMMARY=${TELEGRAM_EVENT_DAILY_SUMMARY:-true} + - TELEGRAM_DAILY_SUMMARY_TIME=${TELEGRAM_DAILY_SUMMARY_TIME:-08:00} - TELEGRAM_COMMANDS_ENABLED=${TELEGRAM_COMMANDS_ENABLED:-false} # --- Docker Socket Proxy (read-only) --- diff --git a/docs/configuration.md 
b/docs/configuration.md index 8687dfb..0d4caba 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -105,7 +105,8 @@ plain HTTP, edit `config.json` and run `./pithead apply`. | `telegram.enabled` | `false` | Push operational alerts (node down/recovered, worker offline/back, sync finished) to Telegram. Off by default. Requires `bot_token` + `chat_id` to actually send. Full walkthrough: [Telegram Bot](telegram.md). | | `telegram.bot_token` | `""` | Your BotFather bot token. A secret — stored owner-only in `.env`, git-ignored, and never logged. Get one from [@BotFather](https://t.me/BotFather). | | `telegram.chat_id` | `""` | Where alerts are sent and the only chat the command interface answers. A Telegram group id (negative, e.g. `-1001234567890`) or a personal chat id. See [how to find it](telegram.md#3-find-your-chat-id). | -| `telegram.events.*` | all `true` | Per-event toggles: `stack_online`, `node_down`, `node_recovered`, `worker_offline`, `worker_recovered`, `worker_joined`, `worker_left`, `sync_finished`, `disk_space`, `db_unhealthy`, `xvb_no_share`, `xvb_registration`, `clearnet_exposed`, `new_release`. Each defaults to on once Telegram is enabled; set one `false` to silence just that alert. Full list: [Telegram Bot](telegram.md#choosing-which-alerts-you-get). | +| `telegram.events.*` | all `true` | Per-event toggles: `stack_online`, `node_down`, `node_recovered`, `worker_offline`, `worker_recovered`, `worker_joined`, `worker_left`, `sync_finished`, `disk_space`, `db_unhealthy`, `xvb_no_share`, `xvb_registration`, `clearnet_exposed`, `new_release`, `daily_summary`. Each defaults to on once Telegram is enabled; set one `false` to silence just that alert. Full list: [Telegram Bot](telegram.md#choosing-which-alerts-you-get). | +| `telegram.daily_summary_time` | `08:00` | Local time (24-hour `HH:MM`) to push the once-a-day status digest, when the `daily_summary` event is on. Uses the dashboard's timezone (`dashboard.timezone`). 
A malformed value disables the digest. | | `telegram.commands.enabled` | `false` | Turn on the interactive command interface — the bot answers `/status`, `/hashrate`, `/workers`, `/sync`, and `/help` from the configured `chat_id` (every other chat is ignored). Off by default; alerts work without it. Long-polls over Tor, so it needs no inbound port. See [Telegram › Commands](telegram.md#commands). | --- diff --git a/docs/telegram.md b/docs/telegram.md index 065ceed..2936bad 100644 --- a/docs/telegram.md +++ b/docs/telegram.md @@ -35,6 +35,7 @@ transition, not a stream: | 🎰 **XvB registration** | XvB auto-registration was rejected (bad payout address) or is failing — raffle wins won't count until it recovers. Only fires when XvB is enabled. | | 🆕 **New release** | A newer Pithead release is available (the same signal as the dashboard header badge). | | 🚀 **Pithead online** | Sent once when the dashboard starts — a heartbeat that the stack is up (and confirms the bot works after setup). | +| 📅 **Daily summary** | A once-a-day roll-up (nodes, mining, workers, hashrate, shares, disk) pushed at a set local time — **08:00** by default, set `telegram.daily_summary_time` to change it. | Every message is prefixed with your dashboard hostname (e.g. `[rig-box.lan]`), so if you point more than one stack at the same chat you can tell them apart. @@ -155,6 +156,7 @@ block and set it to `false` — any event you don't list stays on: | `xvb_registration` | `true` | XvB auto-registration rejected / failing / recovered | | `new_release` | `true` | A newer Pithead release is available | | `stack_online` | `true` | One-shot "dashboard is up" heartbeat on start | +| `daily_summary` | `true` | Once-a-day status roll-up (time set by `telegram.daily_summary_time`, default `08:00`) | Run `./pithead apply` after editing. 
diff --git a/docs/test-inventory.md b/docs/test-inventory.md index bfc7350..1052700 100644 --- a/docs/test-inventory.md +++ b/docs/test-inventory.md @@ -4,7 +4,7 @@ _Generated by `make test-inventory` ([`tests/inventory.sh`](../tests/inventory.s edit by hand** — re-run the target to refresh. See [Testing Strategy](testing-strategy.md) for how the tiers fit together._ -**Totals:** 728 dashboard unit tests · 12 contract tests · 64 frontend +**Totals:** 735 dashboard unit tests · 12 contract tests · 64 frontend tests · 52 `pithead` shell sections · 18 harness self-test sections · 9 live config scenarios (17 axis values) · 8 mini-stack scenarios. @@ -14,7 +14,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · | Tier | Suite | Cases | |---|---|---| -| 1 — Unit | dashboard pytest | 728 | +| 1 — Unit | dashboard pytest | 735 | | 1 — Unit | frontend (node --test) | 64 | | 1 — Unit | `pithead` shell suite | 52 sections | | 1 — Unit | compose interpolation + hardening (#90) | 1 | @@ -27,7 +27,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · ## Tier 1 — Unit & component -### Dashboard (pytest) — 728 tests +### Dashboard (pytest) — 735 tests #### tests/client/test_docker_control.py — 6 - test_tcp_scheme_rewritten_to_http @@ -210,7 +210,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_none_when_no_route - test_socket_is_closed_even_on_error -#### tests/service/test_alert_service.py — 32 +#### tests/service/test_alert_service.py — 37 - test_every_alert_event_has_a_config_toggle - test_first_cycle_seeds_baseline_silently - test_down_then_recovered @@ -243,6 +243,11 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_disabled_notifier_is_noop - test_enabled_notifier_dispatches - test_process_swallows_evaluate_error +- test_fires_at_target_once_per_day +- test_late_start_waits_for_next_day +- test_malformed_time_disables +- test_gated_off_by_event_toggle +- 
test_provider_error_is_swallowed_and_marks_day_done #### tests/service/test_algo_service.py — 38 - test_xvb_disabled_forces_p2pool @@ -297,7 +302,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_per_chain_independent - test_marker_write_failure_does_not_restart -#### tests/service/test_data_service.py — 88 +#### tests/service/test_data_service.py — 89 - test_first_poll_baselines_without_backfill - test_delta_records_the_difference - test_no_change_records_nothing @@ -361,6 +366,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_rehold_stops_quietly_after_first_cycle - test_single_iteration_aggregates - test_run_holds_miner_while_syncing +- test_run_wires_computed_signals_into_the_alerter - test_run_releases_despite_height_override - test_run_nonblocking_tari_releases_and_stays_operational - test_healthchecks_pinged_when_healthy @@ -540,7 +546,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_history_older_than_retention_pruned_from_memory - test_old_history_pruned_from_db_when_cleanup_fires -#### tests/service/test_telegram_commands.py — 39 +#### tests/service/test_telegram_commands.py — 40 - test_parse_command - test_status_active - test_status_syncing_beats_mining_flag @@ -559,6 +565,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_xvb_disabled - test_earnings_estimate - test_earnings_unavailable_without_network_data +- test_daily_summary_rolls_up_status - test_host_label_prefix - test_reply_for_help_and_unknown_need_no_metrics - test_reply_for_status_uses_mining_flag @@ -1101,5 +1108,5 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · --- -_Grand total: **891** enumerated cases/sections across the four tiers (plus the live +_Grand total: **898** enumerated cases/sections across the four tiers (plus the live lifecycle and fault-injection phases, which are exercised on a real server)._ diff --git a/pithead 
b/pithead index 7e02732..299a935 100755 --- a/pithead +++ b/pithead @@ -2096,6 +2096,7 @@ render_env() { local tg_ev_node_down tg_ev_node_recovered tg_ev_worker_offline tg_ev_worker_recovered local tg_ev_worker_joined tg_ev_worker_left tg_ev_sync_finished tg_ev_disk_space tg_ev_db_unhealthy local tg_ev_xvb_no_share tg_ev_clearnet_exposed tg_ev_xvb_registration tg_ev_new_release tg_ev_stack_online + local tg_ev_daily_summary tg_summary_time tg_ev_node_down=$(tg_event node_down) tg_ev_node_recovered=$(tg_event node_recovered) tg_ev_worker_offline=$(tg_event worker_offline) @@ -2110,6 +2111,9 @@ render_env() { tg_ev_xvb_registration=$(tg_event xvb_registration) tg_ev_new_release=$(tg_event new_release) tg_ev_stack_online=$(tg_event stack_online) + tg_ev_daily_summary=$(tg_event daily_summary) + # Local time (HH:MM) for the daily digest; default 08:00. + tg_summary_time=$(jq -r '.telegram.daily_summary_time // "08:00"' "$CONFIG_FILE") # Tari memory cap (#55). Tari officially needs only a few GB (min 4 GB host, 8 GB+ recommended), # but its memory grows unbounded over time — one 32 GB host was seen at ~11 GB while staying @@ -2206,6 +2210,8 @@ TELEGRAM_EVENT_CLEARNET_EXPOSED=$tg_ev_clearnet_exposed TELEGRAM_EVENT_XVB_REGISTRATION=$tg_ev_xvb_registration TELEGRAM_EVENT_NEW_RELEASE=$tg_ev_new_release TELEGRAM_EVENT_STACK_ONLINE=$tg_ev_stack_online +TELEGRAM_EVENT_DAILY_SUMMARY=$tg_ev_daily_summary +TELEGRAM_DAILY_SUMMARY_TIME=$tg_summary_time MONERO_MEM_LIMIT=$monero_mem_limit P2POOL_URL=${NETWORK_PREFIX}.28:3333 NETWORK_SUBNET=$NETWORK_SUBNET @@ -2735,6 +2741,9 @@ describe_change() { TELEGRAM_EVENT_*) msg="Telegram alert toggle ($key): $old → $new (#121)." ;; + TELEGRAM_DAILY_SUMMARY_TIME) + msg="Telegram daily summary time: $old → $new (local time; #121)." 
+ ;; MONERO_CLEARNET_SYNC) flag=DEST if [ "$new" == "true" ]; then diff --git a/tests/stack/run.sh b/tests/stack/run.sh index b42b702..7cd1a6a 100755 --- a/tests/stack/run.sh +++ b/tests/stack/run.sh @@ -1547,6 +1547,13 @@ printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","n out="$(cd "$V" && DOCKER_LOG="$DOCKER_LOG" PATH="$V/bin:$PATH" ./pithead apply -y 2>&1)" assert_eq "telegram commands opt-in propagated" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_COMMANDS_ENABLED)" "true" +# Daily-summary time (#121): defaults to 08:00; an explicit telegram.daily_summary_time propagates. +assert_eq "daily summary time defaults to 08:00" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_DAILY_SUMMARY_TIME)" "08:00" +seed_env +printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","node_password":"p"}, "tari":{"wallet_address":"T"}, "p2pool":{"pool":"mini"}, "dashboard":{"secure":false,"host":"box.lan"}, "telegram":{"enabled":true,"bot_token":"BOTSECRET","chat_id":"-100123","daily_summary_time":"21:30"} }\n' "$WALLET" >"$V/config.json" +out="$(cd "$V" && DOCKER_LOG="$DOCKER_LOG" PATH="$V/bin:$PATH" ./pithead apply -y 2>&1)" +assert_eq "daily summary time propagated" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_DAILY_SUMMARY_TIME)" "21:30" + # Event-set consistency (#121/#45): every telegram.events.* key in config.reference.json must be # rendered by pithead into .env AND declared in docker-compose.yml — so adding an alert event in one # surface but forgetting another fails here. 
(The Python side — AlertService.EVT_* vs config.py's From 502f7212f69af72be864832727838cc977977b3b Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Fri, 3 Jul 2026 00:29:22 -0500 Subject: [PATCH 09/18] fix(telegram): consistent /hashrate, egress panel + hashrate-low alert (#339) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - /hashrate consistency: total and per-worker now use one shared effective_hashrate() (10m avg, 1m fallback for a just-connected rig without 10m history) — so per-worker lines sum to the total and a fresh worker shows its real rate, not 0.00. Fixed in /hashrate, /workers, and the daily digest label; _aggregate_hashrate reuses the helper. - Tor network panel (egress #170): the Telegram bot now appears as a dashboard egress path (Tor when enabled, else inactive) in both the egress list and the topology graph. - hashrate_low alert (#339 remainder): edge alert when a fixed XvB tier can't be sustained / recovers, from metrics.low_hr_warning (built once per cycle, only when the bot is on). #340 (Tor routing) was already complete. make test green; patch coverage 98%; docs + CHANGELOG + roadmap #333 updated; 6 issues (#99/#104/#59/#84/#118/#116) got Telegram acceptance-criteria bullets. 
Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 12 ++++---- .../mining_dashboard/config/config.py | 1 + .../mining_dashboard/helper/utils.py | 11 ++++++++ .../mining_dashboard/service/alert_service.py | 28 +++++++++++++++++++ .../mining_dashboard/service/data_service.py | 27 ++++++++++-------- .../mining_dashboard/service/egress.py | 20 +++++++++++-- .../service/telegram_commands.py | 17 +++++++---- .../tests/service/test_alert_service.py | 12 ++++++++ .../tests/service/test_data_service.py | 4 +++ build/dashboard/tests/service/test_egress.py | 10 +++++++ .../tests/service/test_telegram_commands.py | 14 ++++++++++ config.reference.json | 3 +- docker-compose.yml | 1 + docs/configuration.md | 2 +- docs/telegram.md | 2 ++ docs/test-inventory.md | 18 +++++++----- pithead | 4 ++- 17 files changed, 151 insertions(+), 35 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 49f7868..4ef7cb8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -103,11 +103,13 @@ cd pithead && cp config.json.template config.json # set your Monero + Tari pay high-value set of operational alerts to Telegram — a **🚀 "Pithead online"** heartbeat on start, **node down / recovered**, **worker offline / back online**, **new worker joined / left**, **sync finished**, **data disk filling up**, **dashboard DB write failing**, **no PPLNS share while - donating to XvB** (raffle wins skipped), **XvB registration rejected / failing**, **a node exposed - on clearnet** during initial sync, and **a new release being available** — and answer status - commands on demand: **`/status`**, **`/hashrate`**, **`/workers`**, **`/sync`**, **`/system`**, - **`/pool`**, **`/xvb`**, **`/earnings`**, and **`/help`**. It also pushes a **📅 once-a-day status - digest** at a configurable local time (`telegram.daily_summary_time`, default **08:00**). 
All + donating to XvB** (raffle wins skipped), **XvB registration rejected / failing**, **hashrate too + low for the chosen XvB tier**, **a node exposed on clearnet** during initial sync, and **a new + release being available** — and answer status commands on demand: **`/status`**, **`/hashrate`**, + **`/workers`**, **`/sync`**, **`/system`**, **`/pool`**, **`/xvb`**, **`/earnings`**, and + **`/help`**. It also pushes a **📅 once-a-day status digest** at a configurable local time + (`telegram.daily_summary_time`, default **08:00**). The Telegram bot appears in the dashboard's + **network-egress panel** (#170) as a Tor-routed path alongside Healthchecks/XvB/update-check. All traffic is **routed over Tor** (the same bridge SOCKS as Healthchecks/XvB), so the bot never exposes the host IP to Telegram. Off by default; enable it with a `telegram` block in `config.json` (`enabled`, `bot_token`, `chat_id`, per-event `events` toggles, and a `commands.enabled` switch for the diff --git a/build/dashboard/mining_dashboard/config/config.py b/build/dashboard/mining_dashboard/config/config.py index be625fb..082b9b2 100644 --- a/build/dashboard/mining_dashboard/config/config.py +++ b/build/dashboard/mining_dashboard/config/config.py @@ -270,6 +270,7 @@ def _telegram_event_enabled(name, default=True): "new_release": _telegram_event_enabled("new_release"), "stack_online": _telegram_event_enabled("stack_online"), "daily_summary": _telegram_event_enabled("daily_summary"), + "hashrate_low": _telegram_event_enabled("hashrate_low"), } # ponytail: daily_summary is a scheduled push, not an edge — it lives in the events dict only so it # gets a per-event on/off toggle like the rest; its time is TELEGRAM_DAILY_SUMMARY_TIME below. 
diff --git a/build/dashboard/mining_dashboard/helper/utils.py b/build/dashboard/mining_dashboard/helper/utils.py index 45f251e..3df105d 100644 --- a/build/dashboard/mining_dashboard/helper/utils.py +++ b/build/dashboard/mining_dashboard/helper/utils.py @@ -36,6 +36,17 @@ def parse_hashrate(val_str, unit_str=None): return 0.0 +def effective_hashrate(worker): + """The single figure a worker contributes to the live headline total. + + Prefers the 10-minute average (the ``h15`` field — legacy name, it's the proxy's 10m rate), + falling back to the 1-minute rate (``h60`` then ``h10``) when a rig hasn't accumulated 10 + minutes yet, so a freshly-connected worker reads its real live rate instead of 0. Defined once + here so the aggregate total and every per-worker display use the *same* value and can't drift. + """ + return worker.get("h15", 0) or worker.get("h60", 0) or worker.get("h10", 0) or 0 + + def format_hashrate(hashrate): """ Formats a raw hashrate value into a human-readable string with appropriate units. diff --git a/build/dashboard/mining_dashboard/service/alert_service.py b/build/dashboard/mining_dashboard/service/alert_service.py index 6cac83c..23aefd1 100644 --- a/build/dashboard/mining_dashboard/service/alert_service.py +++ b/build/dashboard/mining_dashboard/service/alert_service.py @@ -86,6 +86,7 @@ class AlertService: EVT_NEW_RELEASE = "new_release" EVT_STACK_ONLINE = "stack_online" EVT_DAILY_SUMMARY = "daily_summary" + EVT_HASHRATE_LOW = "hashrate_low" # WorkerPresenceMonitor edge -> (event key, message template). _WORKER_EDGES = { @@ -121,6 +122,7 @@ def __init__( self._prev_clearnet_active = None self._prev_xvb_reg = None self._prev_update_available = None + self._prev_hashrate_low = None # One-shot "stack is online" ping, sent on the first cycle after the dashboard starts. 
self._announced_online = False @@ -144,6 +146,7 @@ def evaluate( clearnet_active=False, xvb_registration_state="", update_available=False, + low_hr_warning=False, now=None, ): """Pure: fold this cycle's signals into the list of ``(event_key, text)`` to send, @@ -204,6 +207,7 @@ def evaluate( # --- XvB auto-registration health, and a new Pithead release being available --- alerts += self._registration_edges(xvb_enabled, xvb_registration_state) alerts += self._release_edges(update_available) + alerts += self._hashrate_low_edges(low_hr_warning) return [(evt, text) for evt, text in alerts if self.notifier.event_enabled(evt)] @@ -404,6 +408,30 @@ def _release_edges(self, update_available): ) ] + def _hashrate_low_edges(self, low_hr_warning): + """Alert when a manually-chosen XvB tier can't be sustained by the current hashrate (#158), + and when it recovers. Edge-only (fires on the transition, not every cycle).""" + prev = self._prev_hashrate_low + self._prev_hashrate_low = bool(low_hr_warning) + if prev is None or bool(low_hr_warning) == prev: + return [] + if low_hr_warning: + return [ + ( + self.EVT_HASHRATE_LOW, + self._fmt( + "⚠️ \U0001f4c9 Hashrate too low for the chosen XvB tier — it can't be " + "sustained; lower the tier or add hashrate." 
+ ), + ) + ] + return [ + ( + self.EVT_HASHRATE_LOW, + self._fmt("\U0001f7e2 \U0001f4c8 Hashrate back above the chosen XvB tier."), + ) + ] + def _fmt(self, text): return f"[{self.host_label}] {text}" if self.host_label else text diff --git a/build/dashboard/mining_dashboard/service/data_service.py b/build/dashboard/mining_dashboard/service/data_service.py index 878f84c..f9f6fac 100644 --- a/build/dashboard/mining_dashboard/service/data_service.py +++ b/build/dashboard/mining_dashboard/service/data_service.py @@ -46,6 +46,7 @@ ) from mining_dashboard.helper.utils import ( DEFAULT_PPLNS_WINDOW, + effective_hashrate, pplns_block_time, shares_in_pplns_window, ) @@ -245,12 +246,7 @@ def _aggregate_hashrate(workers): total_h10 = 0 for w in workers: if w.get("status") == "online": - w_hr = w.get("h15", 0) - if w_hr == 0: - w_hr = w.get("h60", 0) - if w_hr == 0: - w_hr = w.get("h10", 0) - total_hr += w_hr + total_hr += effective_hashrate(w) total_h10 += w.get("h10", 0) return total_hr, total_h10 @@ -799,6 +795,14 @@ async def run(self): pool_local.get("pplns_window", DEFAULT_PPLNS_WINDOW), pplns_block_time(pool_type), ) + # Build the domain metrics once per cycle for the alerter — but only when the + # bot is actually on, so the default (Telegram-off) stack pays nothing. Reused + # for the hashrate-low edge and the daily digest. + alert_metrics = ( + build_metrics(self.latest_data, self.state_manager) + if self.alert_service.enabled + else None + ) await self.alert_service.process( monero_down=monero_down, tari_down=tari_down, @@ -821,15 +825,14 @@ async def run(self): update_available=bool( (self.latest_data.get("update") or {}).get("available") ), + low_hr_warning=bool(alert_metrics and alert_metrics.low_hr_warning), ) - # Once-daily status digest (built lazily, only when a send is actually due). + # Once-daily status digest, reusing the metrics built above (only when the bot + # is on, which is also the only time maybe_daily_summary would send). 
await self.alert_service.maybe_daily_summary( time.time(), - lambda: format_daily_summary( - build_metrics(self.latest_data, self.state_manager), - self.latest_data, - HOST_IP, - ), + # bind this cycle's metrics (the provider runs within this iteration). + lambda m=alert_metrics: format_daily_summary(m, self.latest_data, HOST_IP), ) self.latest_data.update( diff --git a/build/dashboard/mining_dashboard/service/egress.py b/build/dashboard/mining_dashboard/service/egress.py index b410ba5..838bc92 100644 --- a/build/dashboard/mining_dashboard/service/egress.py +++ b/build/dashboard/mining_dashboard/service/egress.py @@ -9,8 +9,9 @@ * The **#270 egress firewall** (``DOCKER-USER``, fail-closed) DROPs non-Tor egress from the *container* subnet — so a container's clearnet route can't actually leave while it's on. * It does **not** cover the **host-networked dashboard** (``network_mode: host``), whose own egress - (XvB stats fetch, update check) bypasses ``DOCKER-USER`` entirely. Those rely solely on their - SOCKS config — a clearnet route there is a real leak regardless of the firewall. + (XvB stats fetch, update check, Healthchecks ping, Telegram bot) bypasses ``DOCKER-USER`` entirely. + Those rely solely on their SOCKS config — a clearnet route there is a real leak regardless of the + firewall. (All four are Tor-routed by default, so none leak.) So a connection is a *leak* only when its route is clearnet AND it isn't neutralised by a backstop. """ @@ -39,6 +40,7 @@ def compute_egress_posture( tari_clearnet_sync, remote_monero, healthchecks_enabled, + telegram_enabled, ): """Pure derivation of the egress posture from config knobs. Returns ``{components, summary}``.""" xvb = _xvb_route(xvb_enabled, xvb_tor) @@ -98,6 +100,8 @@ def compute_egress_posture( {"to": "update check (github)", "route": TOR}, # socks5h, #224 # Healthchecks.io dead-man's-switch ping — always over Tor when a URL is set (#79). 
{"to": "Healthchecks.io ping", "route": TOR if healthchecks_enabled else INACTIVE}, + # Telegram bot (alerts + command long-poll) — always over Tor when on (#121/#340). + {"to": "Telegram bot", "route": TOR if telegram_enabled else INACTIVE}, ], }, { @@ -150,6 +154,7 @@ def egress_posture_from_config(): tari_clearnet_sync=config.TARI_CLEARNET_SYNC, remote_monero=config.MONERO_NODE_HOST != config.LOCAL_MONERO_HOST, healthchecks_enabled=bool(config.HEALTHCHECKS_PING_URL), + telegram_enabled=config.TELEGRAM_ENABLED, ) @@ -201,6 +206,7 @@ def compute_topology( tari_clearnet_sync, remote_monero, healthchecks_enabled, + telegram_enabled, ): """Pure derivation of the stack topology. Returns ``{nodes, edges, summary}``. @@ -217,6 +223,7 @@ def compute_topology( tari_clearnet_sync=tari_clearnet_sync, remote_monero=remote_monero, healthchecks_enabled=healthchecks_enabled, + telegram_enabled=telegram_enabled, ) xvb = _xvb_route(xvb_enabled, xvb_tor) sidechain = CLEARNET if p2pool_clearnet else TOR @@ -242,6 +249,14 @@ def compute_topology( "Healthchecks ping", "egress", ), + # Telegram bot (alerts + command long-poll) — always over Tor when on (#121/#340). + _edge( + "dashboard", + "tor", + TOR if telegram_enabled else INACTIVE, + "Telegram bot", + "egress", + ), # The Tor hub to the network: SOCKS egress for every daemon + onion-service ingress. _edge("tor", "internet", TOR, "SOCKS + onion circuits", "p2p"), # Internal mesh (hidden until expanded). 
@@ -284,4 +299,5 @@ def topology_from_config(): tari_clearnet_sync=config.TARI_CLEARNET_SYNC, remote_monero=config.MONERO_NODE_HOST != config.LOCAL_MONERO_HOST, healthchecks_enabled=bool(config.HEALTHCHECKS_PING_URL), + telegram_enabled=config.TELEGRAM_ENABLED, ) diff --git a/build/dashboard/mining_dashboard/service/telegram_commands.py b/build/dashboard/mining_dashboard/service/telegram_commands.py index 6a6f9ed..4acbe18 100644 --- a/build/dashboard/mining_dashboard/service/telegram_commands.py +++ b/build/dashboard/mining_dashboard/service/telegram_commands.py @@ -11,7 +11,7 @@ TELEGRAM_ENABLED, TOR_SOCKS_PROXY, ) -from mining_dashboard.helper.utils import format_duration, format_hashrate +from mining_dashboard.helper.utils import effective_hashrate, format_duration, format_hashrate from mining_dashboard.service.earnings import xmr_per_hs_day from mining_dashboard.service.metrics import build_metrics from mining_dashboard.service.telegram_notifier import TELEGRAM_API_BASE @@ -101,7 +101,12 @@ def format_status(metrics, mining_active, host_label=""): def format_hashrate_reply(metrics, workers, host_label=""): - """Total + per-online-worker hashrate — the answer to '/hashrate'.""" + """Total + per-online-worker hashrate — the answer to '/hashrate'. + + Both the total and each per-worker figure use the same :func:`effective_hashrate` (10m average, + 1m fallback for a rig without 10m history yet), so the per-worker lines add up to the total — + a just-connected worker reads its real live rate, not 0. 
+ """ lines = [ f"{_prefix(host_label)}⚡ Hashrate", f"Total: {format_hashrate(metrics.total_h15)} (10m avg)", @@ -109,8 +114,8 @@ def format_hashrate_reply(metrics, workers, host_label=""): online = [w for w in workers if w.get("status") == "online"] if not online: lines.append("No workers online.") - for w in sorted(online, key=lambda w: w.get("h15", 0) or 0, reverse=True): - lines.append(f"• {w.get('name', '?')}: {format_hashrate(w.get('h15', 0))}") + for w in sorted(online, key=effective_hashrate, reverse=True): + lines.append(f"• {w.get('name', '?')}: {format_hashrate(effective_hashrate(w))}") return "\n".join(lines) @@ -126,7 +131,7 @@ def format_workers(workers, host_label=""): up = w.get("uptime") or 0 tail = f" · up {format_duration(up)}" if up else "" lines.append( - f"\U0001f7e2 {w.get('name', '?')} — {format_hashrate(w.get('h15', 0))}{tail}" + f"\U0001f7e2 {w.get('name', '?')} — {format_hashrate(effective_hashrate(w))}{tail}" ) else: lines.append(f"\U0001f534 {w.get('name', '?')} — offline") @@ -243,7 +248,7 @@ def format_daily_summary(metrics, data, host_label=""): else: lines.append("⛏️ Mining: \U0001f534 not mining") lines.append(f"\U0001f477 Workers: {metrics.workers_online}/{metrics.workers_total} online") - lines.append(f"⚡ Hashrate: {format_hashrate(metrics.total_h15)}") + lines.append(f"⚡ Hashrate: {format_hashrate(metrics.total_h15)} (10m avg)") lines.append(f"\U0001f3b0 PPLNS shares: {metrics.shares_in_window} in window") lines.append(f"\U0001f4be Disk: {disk.get('percent_str', 'n/a')} used") return "\n".join(lines) diff --git a/build/dashboard/tests/service/test_alert_service.py b/build/dashboard/tests/service/test_alert_service.py index 2ca1645..8697264 100644 --- a/build/dashboard/tests/service/test_alert_service.py +++ b/build/dashboard/tests/service/test_alert_service.py @@ -61,6 +61,7 @@ def _ev( clearnet_active=False, xvb_registration_state="", update_available=False, + low_hr_warning=False, now=0, ): return svc.evaluate( @@ -77,6 +78,7 
@@ def _ev( clearnet_active=clearnet_active, xvb_registration_state=xvb_registration_state, update_available=update_available, + low_hr_warning=low_hr_warning, now=now, ) @@ -300,6 +302,16 @@ def test_fires_once_on_rising_edge(self): assert _ev(svc, update_available=True) == [] # no repeat while still available +class TestHashrateLow: + def test_warns_then_recovers(self): + svc = _svc() + assert _ev(svc, low_hr_warning=False) == [] # seed + assert _keys(_ev(svc, low_hr_warning=True)) == [AlertService.EVT_HASHRATE_LOW] + assert _ev(svc, low_hr_warning=True) == [] # no repeat + _, text = _ev(svc, low_hr_warning=False)[0] + assert "back above" in text + + class TestEventFiltering: def test_disabled_events_are_dropped(self): svc = _svc(notifier=_FakeNotifier(allow={AlertService.EVT_NODE_DOWN})) diff --git a/build/dashboard/tests/service/test_data_service.py b/build/dashboard/tests/service/test_data_service.py index 4e57c76..ecb8098 100644 --- a/build/dashboard/tests/service/test_data_service.py +++ b/build/dashboard/tests/service/test_data_service.py @@ -777,6 +777,9 @@ async def test_run_wires_computed_signals_into_the_alerter(self): proxy.get_workers.return_value = {"workers": []} svc._apply_worker_rejection = AsyncMock() svc.alert_service = MagicMock() + # Disabled → the loop skips the per-cycle build_metrics (a MagicMock state_manager can't feed + # it); process()/maybe_daily_summary are still called every cycle regardless. + svc.alert_service.enabled = False svc.alert_service.process = AsyncMock() svc.alert_service.maybe_daily_summary = AsyncMock() @@ -832,6 +835,7 @@ async def test_run_wires_computed_signals_into_the_alerter(self): "clearnet_active", "xvb_registration_state", "update_available", + "low_hr_warning", } # ...sourced from the real computed values, not placeholders. 
assert kw["db_healthy"] is True # from state_manager.is_db_healthy() diff --git a/build/dashboard/tests/service/test_egress.py b/build/dashboard/tests/service/test_egress.py index 69b62ad..7b38f12 100644 --- a/build/dashboard/tests/service/test_egress.py +++ b/build/dashboard/tests/service/test_egress.py @@ -23,6 +23,7 @@ "tari_clearnet_sync": False, "remote_monero": False, "healthchecks_enabled": False, + "telegram_enabled": False, } @@ -93,6 +94,14 @@ def test_healthchecks_ping_is_tor_when_configured_inactive_otherwise(): assert on["summary"]["leaks"] == 0 +def test_telegram_bot_is_tor_when_enabled_inactive_otherwise(): + # Enabling Telegram adds a dashboard Tor egress (#121/#340); off → inactive, never a leak. + assert _conn(_posture(telegram_enabled=False), "dashboard", "Telegram")["route"] == INACTIVE + on = _posture(telegram_enabled=True, firewall=True) + assert _conn(on, "dashboard", "Telegram")["route"] == TOR + assert on["summary"]["leaks"] == 0 # Tor-routed, so never a leak + + def test_remote_monerod_rpc_is_clearnet(): assert _conn(_posture(remote_monero=False), "p2pool", "monerod RPC")["route"] != CLEARNET assert _conn(_posture(remote_monero=True), "p2pool", "monerod RPC")["route"] == CLEARNET @@ -226,6 +235,7 @@ def test_tari_clearnet_sync_surfaces_in_egress_and_topology(): "tari_clearnet_sync", "remote_monero", "healthchecks_enabled", + "telegram_enabled", ) diff --git a/build/dashboard/tests/service/test_telegram_commands.py b/build/dashboard/tests/service/test_telegram_commands.py index fc9cf3c..87d4d96 100644 --- a/build/dashboard/tests/service/test_telegram_commands.py +++ b/build/dashboard/tests/service/test_telegram_commands.py @@ -135,6 +135,20 @@ def test_hashrate_no_online_workers(): assert "No workers online." in out +def test_hashrate_uses_effective_rate_for_fresh_worker(): + # A just-connected rig has no 10m (h15) history yet but is mining — it must show its live 1m + # rate (the same value the total counts), never 0.00. 
(This was the reported inconsistency.) + workers = [{"name": "fresh", "status": "online", "h15": 0, "h60": 42000, "h10": 42000}] + out = tc.format_hashrate_reply(_metrics(), workers) + assert "42.00 kH/s" in out + assert "0.00 H/s" not in out + + +def test_workers_hashrate_uses_effective_rate(): + workers = [{"name": "fresh", "status": "online", "h15": 0, "h60": 5000, "h10": 5000}] + assert "5.00 kH/s" in tc.format_workers(workers) + + def test_workers_online_first_with_offline_flagged(): workers = [ {"name": "off-1", "status": "offline", "h15": 0}, diff --git a/config.reference.json b/config.reference.json index c03fd1f..344399d 100644 --- a/config.reference.json +++ b/config.reference.json @@ -99,7 +99,8 @@ "xvb_registration": true, "new_release": true, "stack_online": true, - "daily_summary": true + "daily_summary": true, + "hashrate_low": true }, "daily_summary_time": "08:00", "commands": { diff --git a/docker-compose.yml b/docker-compose.yml index 869c900..ab1e511 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -447,6 +447,7 @@ services: - TELEGRAM_EVENT_NEW_RELEASE=${TELEGRAM_EVENT_NEW_RELEASE:-true} - TELEGRAM_EVENT_STACK_ONLINE=${TELEGRAM_EVENT_STACK_ONLINE:-true} - TELEGRAM_EVENT_DAILY_SUMMARY=${TELEGRAM_EVENT_DAILY_SUMMARY:-true} + - TELEGRAM_EVENT_HASHRATE_LOW=${TELEGRAM_EVENT_HASHRATE_LOW:-true} - TELEGRAM_DAILY_SUMMARY_TIME=${TELEGRAM_DAILY_SUMMARY_TIME:-08:00} - TELEGRAM_COMMANDS_ENABLED=${TELEGRAM_COMMANDS_ENABLED:-false} diff --git a/docs/configuration.md b/docs/configuration.md index 0d4caba..b957e0c 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -105,7 +105,7 @@ plain HTTP, edit `config.json` and run `./pithead apply`. | `telegram.enabled` | `false` | Push operational alerts (node down/recovered, worker offline/back, sync finished) to Telegram. Off by default. Requires `bot_token` + `chat_id` to actually send. Full walkthrough: [Telegram Bot](telegram.md). | | `telegram.bot_token` | `""` | Your BotFather bot token. 
A secret — stored owner-only in `.env`, git-ignored, and never logged. Get one from [@BotFather](https://t.me/BotFather). | | `telegram.chat_id` | `""` | Where alerts are sent and the only chat the command interface answers. A Telegram group id (negative, e.g. `-1001234567890`) or a personal chat id. See [how to find it](telegram.md#3-find-your-chat-id). | -| `telegram.events.*` | all `true` | Per-event toggles: `stack_online`, `node_down`, `node_recovered`, `worker_offline`, `worker_recovered`, `worker_joined`, `worker_left`, `sync_finished`, `disk_space`, `db_unhealthy`, `xvb_no_share`, `xvb_registration`, `clearnet_exposed`, `new_release`, `daily_summary`. Each defaults to on once Telegram is enabled; set one `false` to silence just that alert. Full list: [Telegram Bot](telegram.md#choosing-which-alerts-you-get). | +| `telegram.events.*` | all `true` | Per-event toggles: `stack_online`, `node_down`, `node_recovered`, `worker_offline`, `worker_recovered`, `worker_joined`, `worker_left`, `sync_finished`, `disk_space`, `db_unhealthy`, `xvb_no_share`, `xvb_registration`, `clearnet_exposed`, `new_release`, `daily_summary`, `hashrate_low`. Each defaults to on once Telegram is enabled; set one `false` to silence just that alert. Full list: [Telegram Bot](telegram.md#choosing-which-alerts-you-get). | | `telegram.daily_summary_time` | `08:00` | Local time (24-hour `HH:MM`) to push the once-a-day status digest, when the `daily_summary` event is on. Uses the dashboard's timezone (`dashboard.timezone`). A malformed value disables the digest. | | `telegram.commands.enabled` | `false` | Turn on the interactive command interface — the bot answers `/status`, `/hashrate`, `/workers`, `/sync`, and `/help` from the configured `chat_id` (every other chat is ignored). Off by default; alerts work without it. Long-polls over Tor, so it needs no inbound port. See [Telegram › Commands](telegram.md#commands). 
| diff --git a/docs/telegram.md b/docs/telegram.md index 2936bad..aef993b 100644 --- a/docs/telegram.md +++ b/docs/telegram.md @@ -33,6 +33,7 @@ transition, not a stream: | ⚠ **No PPLNS share (XvB)** | You're donating to XvB but hold no share in the PPLNS window, so raffle wins are **skipped** — donations are wasted until you land one. Only fires when XvB is enabled. | | ⚠ **Clearnet sync active** | A node is doing its initial sync over **clearnet**, so this host's IP is exposed to that chain's P2P network until it finishes (it reverts to Tor automatically). | | 🎰 **XvB registration** | XvB auto-registration was rejected (bad payout address) or is failing — raffle wins won't count until it recovers. Only fires when XvB is enabled. | +| 📉 **Hashrate low for tier** | You picked a fixed XvB donation tier your hashrate can't sustain — lower the tier or add hashrate. Fires on the transition and clears when it recovers. | | 🆕 **New release** | A newer Pithead release is available (the same signal as the dashboard header badge). | | 🚀 **Pithead online** | Sent once when the dashboard starts — a heartbeat that the stack is up (and confirms the bot works after setup). | | 📅 **Daily summary** | A once-a-day roll-up (nodes, mining, workers, hashrate, shares, disk) pushed at a set local time — **08:00** by default, set `telegram.daily_summary_time` to change it. | @@ -157,6 +158,7 @@ block and set it to `false` — any event you don't list stays on: | `new_release` | `true` | A newer Pithead release is available | | `stack_online` | `true` | One-shot "dashboard is up" heartbeat on start | | `daily_summary` | `true` | Once-a-day status roll-up (time set by `telegram.daily_summary_time`, default `08:00`) | +| `hashrate_low` | `true` | Hashrate can't sustain the chosen XvB tier / recovered | Run `./pithead apply` after editing. 
diff --git a/docs/test-inventory.md b/docs/test-inventory.md index 1052700..75b3bf2 100644 --- a/docs/test-inventory.md +++ b/docs/test-inventory.md @@ -4,7 +4,7 @@ _Generated by `make test-inventory` ([`tests/inventory.sh`](../tests/inventory.s edit by hand** — re-run the target to refresh. See [Testing Strategy](testing-strategy.md) for how the tiers fit together._ -**Totals:** 735 dashboard unit tests · 12 contract tests · 64 frontend +**Totals:** 739 dashboard unit tests · 12 contract tests · 64 frontend tests · 52 `pithead` shell sections · 18 harness self-test sections · 9 live config scenarios (17 axis values) · 8 mini-stack scenarios. @@ -14,7 +14,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · | Tier | Suite | Cases | |---|---|---| -| 1 — Unit | dashboard pytest | 735 | +| 1 — Unit | dashboard pytest | 739 | | 1 — Unit | frontend (node --test) | 64 | | 1 — Unit | `pithead` shell suite | 52 sections | | 1 — Unit | compose interpolation + hardening (#90) | 1 | @@ -27,7 +27,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · ## Tier 1 — Unit & component -### Dashboard (pytest) — 735 tests +### Dashboard (pytest) — 739 tests #### tests/client/test_docker_control.py — 6 - test_tcp_scheme_rewritten_to_http @@ -210,7 +210,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_none_when_no_route - test_socket_is_closed_even_on_error -#### tests/service/test_alert_service.py — 37 +#### tests/service/test_alert_service.py — 38 - test_every_alert_event_has_a_config_toggle - test_first_cycle_seeds_baseline_silently - test_down_then_recovered @@ -237,6 +237,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_silent_while_disabled - test_benign_transition_is_silent - test_fires_once_on_rising_edge +- test_warns_then_recovers - test_disabled_events_are_dropped - test_prefixes_when_set - test_placeholder_host_is_not_prefixed @@ -399,13 +400,14 @@ tests · 52 `pithead` 
shell sections · 18 harness self-test sections · - test_linear_in_inputs - test_missing_or_bad_inputs_are_zero -#### tests/service/test_egress.py — 26 +#### tests/service/test_egress.py — 27 - test_safe_config_is_all_tor - test_p2pool_clearnet_blocked_by_firewall_is_not_a_leak - test_p2pool_clearnet_without_firewall_is_a_leak - test_host_networked_dashboard_leaks_despite_firewall - test_xvb_disabled_routes_are_inactive - test_healthchecks_ping_is_tor_when_configured_inactive_otherwise +- test_telegram_bot_is_tor_when_enabled_inactive_otherwise - test_remote_monerod_rpc_is_clearnet - test_clearnet_initial_sync_surfaces_only_when_enabled - test_monerod_p2p_always_tor @@ -546,13 +548,15 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_history_older_than_retention_pruned_from_memory - test_old_history_pruned_from_db_when_cleanup_fires -#### tests/service/test_telegram_commands.py — 40 +#### tests/service/test_telegram_commands.py — 42 - test_parse_command - test_status_active - test_status_syncing_beats_mining_flag - test_status_node_down_and_not_mining - test_hashrate_lists_online_workers_desc - test_hashrate_no_online_workers +- test_hashrate_uses_effective_rate_for_fresh_worker +- test_workers_hashrate_uses_effective_rate - test_workers_online_first_with_offline_flagged - test_workers_empty - test_status_node_syncing_percent @@ -1108,5 +1112,5 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · --- -_Grand total: **898** enumerated cases/sections across the four tiers (plus the live +_Grand total: **902** enumerated cases/sections across the four tiers (plus the live lifecycle and fault-injection phases, which are exercised on a real server)._ diff --git a/pithead b/pithead index 299a935..2a042da 100755 --- a/pithead +++ b/pithead @@ -2096,7 +2096,7 @@ render_env() { local tg_ev_node_down tg_ev_node_recovered tg_ev_worker_offline tg_ev_worker_recovered local tg_ev_worker_joined tg_ev_worker_left tg_ev_sync_finished 
tg_ev_disk_space tg_ev_db_unhealthy local tg_ev_xvb_no_share tg_ev_clearnet_exposed tg_ev_xvb_registration tg_ev_new_release tg_ev_stack_online - local tg_ev_daily_summary tg_summary_time + local tg_ev_daily_summary tg_summary_time tg_ev_hashrate_low tg_ev_node_down=$(tg_event node_down) tg_ev_node_recovered=$(tg_event node_recovered) tg_ev_worker_offline=$(tg_event worker_offline) @@ -2112,6 +2112,7 @@ render_env() { tg_ev_new_release=$(tg_event new_release) tg_ev_stack_online=$(tg_event stack_online) tg_ev_daily_summary=$(tg_event daily_summary) + tg_ev_hashrate_low=$(tg_event hashrate_low) # Local time (HH:MM) for the daily digest; default 08:00. tg_summary_time=$(jq -r '.telegram.daily_summary_time // "08:00"' "$CONFIG_FILE") @@ -2211,6 +2212,7 @@ TELEGRAM_EVENT_XVB_REGISTRATION=$tg_ev_xvb_registration TELEGRAM_EVENT_NEW_RELEASE=$tg_ev_new_release TELEGRAM_EVENT_STACK_ONLINE=$tg_ev_stack_online TELEGRAM_EVENT_DAILY_SUMMARY=$tg_ev_daily_summary +TELEGRAM_EVENT_HASHRATE_LOW=$tg_ev_hashrate_low TELEGRAM_DAILY_SUMMARY_TIME=$tg_summary_time MONERO_MEM_LIMIT=$monero_mem_limit P2POOL_URL=${NETWORK_PREFIX}.28:3333 From dd05715798747c9b729f19847da78905e4586a67 Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Fri, 3 Jul 2026 09:14:55 -0500 Subject: [PATCH 10/18] feat(telegram): daily summary as a 24h fleet retrospective MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per feedback, reworked the daily digest from a live snapshot into a 'what happened over the last day' report: - Date + time stamp in the message. - 24h hashrate (sum of each rig's 24h avg) with the P2Pool/XvB split for the day, shown as a fraction of that total so P2Pool + XvB == the headline and the per-rig lines add up (no divergent sources — same lesson as the /hashrate fix). - XvB tier, shares found in the day (24h), estimated daily P2Pool earnings, and a per-machine 24h breakdown (what each rig did). 
- Dropped the node-sync / mining-active lines (they don't change day to day). make test green; patch coverage 98%; docs + CHANGELOG updated. Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 6 +- .../service/telegram_commands.py | 58 +++++++++++++------ .../tests/service/test_telegram_commands.py | 47 ++++++++++++--- docs/telegram.md | 2 +- docs/test-inventory.md | 13 +++-- 5 files changed, 92 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ef7cb8..ce2c828 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -107,8 +107,10 @@ cd pithead && cp config.json.template config.json # set your Monero + Tari pay low for the chosen XvB tier**, **a node exposed on clearnet** during initial sync, and **a new release being available** — and answer status commands on demand: **`/status`**, **`/hashrate`**, **`/workers`**, **`/sync`**, **`/system`**, **`/pool`**, **`/xvb`**, **`/earnings`**, and - **`/help`**. It also pushes a **📅 once-a-day status digest** at a configurable local time - (`telegram.daily_summary_time`, default **08:00**). The Telegram bot appears in the dashboard's + **`/help`**. It also pushes a **📅 once-a-day retrospective** at a configurable local time + (`telegram.daily_summary_time`, default **08:00**) — the last 24h across the fleet: 24h hashrate + with the P2Pool/XvB split, shares found in the day, an estimated daily-earnings figure, and a + per-machine 24h breakdown. The Telegram bot appears in the dashboard's **network-egress panel** (#170) as a Tor-routed path alongside Healthchecks/XvB/update-check. All traffic is **routed over Tor** (the same bridge SOCKS as Healthchecks/XvB), so the bot never exposes the host IP to Telegram. 
Off by default; enable it with a `telegram` block in `config.json` (`enabled`, diff --git a/build/dashboard/mining_dashboard/service/telegram_commands.py b/build/dashboard/mining_dashboard/service/telegram_commands.py index 4acbe18..4bb1ef1 100644 --- a/build/dashboard/mining_dashboard/service/telegram_commands.py +++ b/build/dashboard/mining_dashboard/service/telegram_commands.py @@ -1,5 +1,6 @@ import asyncio import logging +import time import requests @@ -231,25 +232,46 @@ def format_earnings(metrics, network, host_label=""): ) -def format_daily_summary(metrics, data, host_label=""): - """The once-a-day status digest pushed by the alerter — an at-a-glance roll-up built from the - same domain values as /status. ``data`` is the latest snapshot (for the disk figure + the - mining-active flag the metrics layer doesn't carry).""" - mining = bool(data.get("miner_released") and not data.get("workers_rejected")) +def format_daily_summary(metrics, data, host_label="", now=None): + """The once-a-day retrospective pushed by the alerter — **what happened across the fleet over + the last 24h**, not a live snapshot. Reuses the same domain values the dashboard shows. + + Consistency by construction: the fleet 24h figure is the sum of each rig's 24h average, and the + XvB split is that total apportioned by the day's routing fraction — so the per-rig lines add up + to the headline and P2Pool + XvB equals it. ``now`` is injectable for tests; it stamps the + message and bounds the 24h share count. 
+ """ + now = time.time() if now is None else now + stamp = time.strftime("%Y-%m-%d %H:%M", time.localtime(now)) + online = [w for w in data.get("workers", []) if w.get("status") == "online"] + fleet_24h = sum(w.get("h24h", 0) or 0 for w in online) + shares_24h = sum(1 for s in data.get("shares", []) if s.get("ts", 0) >= now - 86400) + + lines = [f"{_prefix(host_label)}\U0001f4c5 Daily summary — {stamp}"] + lines.append(f"⚡ 24h hashrate: {format_hashrate(fleet_24h)}") + if metrics.xvb_enabled: + routed = (metrics.p2pool_24h or 0) + (metrics.xvb_routed_24h or 0) + xvb_frac = (metrics.xvb_routed_24h or 0) / routed if routed else 0 + xvb_hr = fleet_24h * xvb_frac + lines.append( + f" \U0001f535 P2Pool {format_hashrate(fleet_24h - xvb_hr)} · " + f"\U0001f3b2 XvB {format_hashrate(xvb_hr)} ({xvb_frac * 100:.0f}% to XvB)" + ) + lines.append(f"\U0001f3b0 XvB tier: {metrics.current_tier}") + lines.append(f"\U0001f3af Shares (24h): {shares_24h}") + + reward = (data.get("network", {}) or {}).get("reward", 0) or 0 + coeff = xmr_per_hs_day(reward, metrics.network_difficulty) + if coeff > 0: + lines.append( + f"\U0001f4b0 Est. 
earnings: ~{coeff * (metrics.p2pool_24h or 0):.6f} XMR/day (P2Pool)" + ) + + lines.append(f"\U0001f477 Miners: {metrics.workers_online}/{metrics.workers_total} online") + for w in sorted(online, key=lambda w: w.get("h24h", 0) or 0, reverse=True): + lines.append(f" • {w.get('name', '?')}: {format_hashrate(w.get('h24h', 0))}") + disk = (data.get("system", {}) or {}).get("disk", {}) or {} - lines = [ - f"{_prefix(host_label)}\U0001f4c5 Daily summary", - f"⛓️ Monero: {_node_state(metrics.monero)} · Tari: {_node_state(metrics.tari)}", - ] - if metrics.global_syncing: - lines.append("⛏️ Mining: ⏳ holding — chain(s) syncing") - elif mining: - lines.append(f"⛏️ Mining: \U0001f7e2 active ({metrics.mode})") - else: - lines.append("⛏️ Mining: \U0001f534 not mining") - lines.append(f"\U0001f477 Workers: {metrics.workers_online}/{metrics.workers_total} online") - lines.append(f"⚡ Hashrate: {format_hashrate(metrics.total_h15)} (10m avg)") - lines.append(f"\U0001f3b0 PPLNS shares: {metrics.shares_in_window} in window") lines.append(f"\U0001f4be Disk: {disk.get('percent_str', 'n/a')} used") return "\n".join(lines) diff --git a/build/dashboard/tests/service/test_telegram_commands.py b/build/dashboard/tests/service/test_telegram_commands.py index 87d4d96..544b5b2 100644 --- a/build/dashboard/tests/service/test_telegram_commands.py +++ b/build/dashboard/tests/service/test_telegram_commands.py @@ -233,17 +233,50 @@ def test_earnings_unavailable_without_network_data(): assert "unavailable" in out -def test_daily_summary_rolls_up_status(): +def test_daily_summary_is_a_24h_retrospective(): + now = 1_000_000 data = { - "miner_released": True, - "workers_rejected": False, + "workers": [ + {"name": "miner-0", "status": "online", "h24h": 30000}, + {"name": "miner-1", "status": "online", "h24h": 20000}, + {"name": "old", "status": "offline", "h24h": 0}, + ], + # 2 shares within 24h, 1 older. 
+ "shares": [{"ts": now - 100}, {"ts": now - 90000}, {"ts": now - 200}], "system": {"disk": {"percent_str": "42%"}}, + "network": {"reward": 600_000_000_000}, } - out = tc.format_daily_summary(_metrics(mode="P2POOL", workers_online=3, workers_total=3), data) - assert "Daily summary" in out - assert "Mining: 🟢 active (P2POOL)" in out - assert "Workers: 3/3 online" in out + out = tc.format_daily_summary( + _metrics( + xvb_enabled=True, + p2pool_24h=40000, + xvb_routed_24h=10000, + current_tier="Donor", + workers_online=2, + workers_total=3, + ), + data, + now=now, + ) + assert "Daily summary — " in out # date+time stamped + assert "24h hashrate: 50.00 kH/s" in out # sum of per-rig h24h (30k + 20k) + assert "20% to XvB" in out # 10k / (40k + 10k) + assert "P2Pool 40.00 kH/s" in out and "XvB 10.00 kH/s" in out # apportioned, sums to fleet + assert "XvB tier: Donor" in out + assert "Shares (24h): 2" in out + assert "Est. earnings" in out + assert "miner-0: 30.00 kH/s" in out + assert "old" not in out # offline rig excluded assert "Disk: 42% used" in out + # The retrospective drops live-status lines like node sync. + assert "synced" not in out.lower() + + +def test_daily_summary_without_xvb_omits_split(): + data = {"workers": [{"name": "m", "status": "online", "h24h": 5000}], "shares": []} + out = tc.format_daily_summary(_metrics(xvb_enabled=False), data, now=0) + assert "24h hashrate: 5.00 kH/s" in out + assert "to XvB" not in out def test_host_label_prefix(): diff --git a/docs/telegram.md b/docs/telegram.md index aef993b..180536f 100644 --- a/docs/telegram.md +++ b/docs/telegram.md @@ -36,7 +36,7 @@ transition, not a stream: | 📉 **Hashrate low for tier** | You picked a fixed XvB donation tier your hashrate can't sustain — lower the tier or add hashrate. Fires on the transition and clears when it recovers. | | 🆕 **New release** | A newer Pithead release is available (the same signal as the dashboard header badge). 
| | 🚀 **Pithead online** | Sent once when the dashboard starts — a heartbeat that the stack is up (and confirms the bot works after setup). | -| 📅 **Daily summary** | A once-a-day roll-up (nodes, mining, workers, hashrate, shares, disk) pushed at a set local time — **08:00** by default, set `telegram.daily_summary_time` to change it. | +| 📅 **Daily summary** | A once-a-day retrospective of the last 24h across your whole fleet — date/time, **24h hashrate** with the **P2Pool / XvB split**, **shares found in the day**, an **estimated daily earnings** figure, and a **per-machine 24h breakdown** — pushed at a set local time (**08:00** by default; `telegram.daily_summary_time`). | Every message is prefixed with your dashboard hostname (e.g. `[rig-box.lan]`), so if you point more than one stack at the same chat you can tell them apart. diff --git a/docs/test-inventory.md b/docs/test-inventory.md index 75b3bf2..9f4d004 100644 --- a/docs/test-inventory.md +++ b/docs/test-inventory.md @@ -4,7 +4,7 @@ _Generated by `make test-inventory` ([`tests/inventory.sh`](../tests/inventory.s edit by hand** — re-run the target to refresh. See [Testing Strategy](testing-strategy.md) for how the tiers fit together._ -**Totals:** 739 dashboard unit tests · 12 contract tests · 64 frontend +**Totals:** 740 dashboard unit tests · 12 contract tests · 64 frontend tests · 52 `pithead` shell sections · 18 harness self-test sections · 9 live config scenarios (17 axis values) · 8 mini-stack scenarios. 
@@ -14,7 +14,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · | Tier | Suite | Cases | |---|---|---| -| 1 — Unit | dashboard pytest | 739 | +| 1 — Unit | dashboard pytest | 740 | | 1 — Unit | frontend (node --test) | 64 | | 1 — Unit | `pithead` shell suite | 52 sections | | 1 — Unit | compose interpolation + hardening (#90) | 1 | @@ -27,7 +27,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · ## Tier 1 — Unit & component -### Dashboard (pytest) — 739 tests +### Dashboard (pytest) — 740 tests #### tests/client/test_docker_control.py — 6 - test_tcp_scheme_rewritten_to_http @@ -548,7 +548,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_history_older_than_retention_pruned_from_memory - test_old_history_pruned_from_db_when_cleanup_fires -#### tests/service/test_telegram_commands.py — 42 +#### tests/service/test_telegram_commands.py — 43 - test_parse_command - test_status_active - test_status_syncing_beats_mining_flag @@ -569,7 +569,8 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_xvb_disabled - test_earnings_estimate - test_earnings_unavailable_without_network_data -- test_daily_summary_rolls_up_status +- test_daily_summary_is_a_24h_retrospective +- test_daily_summary_without_xvb_omits_split - test_host_label_prefix - test_reply_for_help_and_unknown_need_no_metrics - test_reply_for_status_uses_mining_flag @@ -1112,5 +1113,5 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · --- -_Grand total: **902** enumerated cases/sections across the four tiers (plus the live +_Grand total: **903** enumerated cases/sections across the four tiers (plus the live lifecycle and fault-injection phases, which are exercised on a real server)._ From 8b43bd91c436f6551a32ae56fe92a6dc1450c698 Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Fri, 3 Jul 2026 09:20:33 -0500 Subject: [PATCH 11/18] docs(telegram): step-by-step for adding HealthchecksBot to the 
shared group MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'one chat, two bots' section pointed at issue #79 instead of giving the steps. Add the concrete flow the operator follows: invite @HealthchecksBot to the alerts group, send /start@HealthchecksBot (the @suffix matters — the Pithead bot is already in the group so a bare /start is ambiguous), tap the confirmation link, pick the project, Connect Telegram. Links to healthchecks.io/integrations/add_telegram/ and cross-references monitoring.md both ways. Co-Authored-By: Claude Opus 4.8 --- docs/monitoring.md | 7 ++++--- docs/telegram.md | 20 ++++++++++++++++++-- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/docs/monitoring.md b/docs/monitoring.md index e1533d3..6ac0971 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -40,9 +40,10 @@ pings and nothing is logged. ### 2. Choose where alerts go On the check's **Integrations** tab, point it at however you want to be notified — **email**, -**Telegram**, Slack, Discord, a webhook, and more. If you already use Telegram for other -alerts, you can route Healthchecks.io to the **same** Telegram chat, so host-down alerts and -in-stack events land in one place. +**Telegram**, Slack, Discord, a webhook, and more. If you already run the [Telegram +bot](telegram.md), route Healthchecks.io to the **same** Telegram group, so host-down alerts and +in-stack events land in one place — step-by-step in +[Telegram › Adding Healthchecks.io to the same group](telegram.md#adding-healthchecksio-to-the-same-group). ### 3. Paste the ping URL into `config.json` diff --git a/docs/telegram.md b/docs/telegram.md index 180536f..1137609 100644 --- a/docs/telegram.md +++ b/docs/telegram.md @@ -235,8 +235,24 @@ and use that group's id here. Each source labels its own messages, so you can al which. (Keeping them in separate chats is fine too — only useful if you want to mute or route them differently.) 
-> Healthchecks.io setup is documented separately under operator monitoring; see -> [issue #79](https://github.com/p2pool-starter-stack/pithead/issues/79). +### Adding Healthchecks.io to the same group + +Once your Pithead bot is posting to the group, add Healthchecks.io's bot to it as well. You do +**not** paste any token into Healthchecks.io — you authorize its bot from their side: + +1. In Telegram, **add [@HealthchecksBot](https://t.me/HealthchecksBot) to your alerts group** (the + same group the Pithead bot posts to). It joins as a member with no access to group messages. +2. In the group, send **`/start@HealthchecksBot`**. Use the **`@HealthchecksBot`** suffix, not a + bare `/start` — your Pithead bot is already in the group, so a plain `/start` is ambiguous and + won't reach the right bot. +3. HealthchecksBot replies with a **confirmation link**. Tap it — Healthchecks.io opens in your + browser. +4. **Select the project** your ping URL belongs to and click **"Connect Telegram"**. Done — host-down + alerts now land in the same group as your Pithead bot's alerts. + +Full walkthrough on their site: **<https://healthchecks.io/integrations/add_telegram/>**. For the +rest of the Healthchecks.io setup (creating the check, the ping URL, `config.json`), see +[Monitoring & Alerting](monitoring.md). --- From 5d628ec9d8d810886b790cbb218880692c54fe1e Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Fri, 3 Jul 2026 09:30:04 -0500 Subject: [PATCH 12/18] feat(telegram): daily incident log (#342) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The daily digest reported the day's averages but not what went wrong. Add an incident roll-up at the top: e.g. '🚨 Incidents (24h): 3× worker offline · 1× node down', or '🟢 No incidents in the last 24h'.
- AlertService tallies each problem-state transition it already detects (node down, worker offline, disk warn/critical, DB fail, XvB no-share, XvB registration fail, clearnet exposure, hashrate-low) via _record_incident at the exact edge — recoveries and steady state don't count. In-memory, keyed by event. - drain_incidents() returns + resets the tally; the daily digest drains it when it sends, so the count spans ~the last day. No new config (rides the daily_summary toggle). Closes #342 (v1.2). make test green; patch coverage 98%; docs + CHANGELOG updated. Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 6 ++-- .../mining_dashboard/service/alert_service.py | 24 ++++++++++++++ .../mining_dashboard/service/data_service.py | 10 ++++-- .../service/telegram_commands.py | 32 +++++++++++++++++- .../tests/service/test_alert_service.py | 33 +++++++++++++++++++ .../tests/service/test_telegram_commands.py | 12 +++++++ docs/telegram.md | 2 +- docs/test-inventory.md | 16 +++++---- 8 files changed, 122 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ce2c828..af48e32 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -108,9 +108,9 @@ cd pithead && cp config.json.template config.json # set your Monero + Tari pay release being available** — and answer status commands on demand: **`/status`**, **`/hashrate`**, **`/workers`**, **`/sync`**, **`/system`**, **`/pool`**, **`/xvb`**, **`/earnings`**, and **`/help`**. It also pushes a **📅 once-a-day retrospective** at a configurable local time - (`telegram.daily_summary_time`, default **08:00**) — the last 24h across the fleet: 24h hashrate - with the P2Pool/XvB split, shares found in the day, an estimated daily-earnings figure, and a - per-machine 24h breakdown. 
The Telegram bot appears in the dashboard's + (`telegram.daily_summary_time`, default **08:00**) — the last 24h across the fleet: an incident + roll-up (what went wrong during the day, or an all-clear), 24h hashrate with the P2Pool/XvB split, + shares found in the day, an estimated daily-earnings figure, and a per-machine 24h breakdown. The Telegram bot appears in the dashboard's **network-egress panel** (#170) as a Tor-routed path alongside Healthchecks/XvB/update-check. All traffic is **routed over Tor** (the same bridge SOCKS as Healthchecks/XvB), so the bot never exposes the host IP to Telegram. Off by default; enable it with a `telegram` block in `config.json` (`enabled`, diff --git a/build/dashboard/mining_dashboard/service/alert_service.py b/build/dashboard/mining_dashboard/service/alert_service.py index 23aefd1..7ef38e7 100644 --- a/build/dashboard/mining_dashboard/service/alert_service.py +++ b/build/dashboard/mining_dashboard/service/alert_service.py @@ -123,6 +123,9 @@ def __init__( self._prev_xvb_reg = None self._prev_update_available = None self._prev_hashrate_low = None + # Tally of problem-state transitions since the last daily digest drained it (#342). Keyed by + # event, counted at the exact edge so recoveries / steady state don't inflate it. + self._incidents = {} # One-shot "stack is online" ping, sent on the first cycle after the dashboard starts. 
self._announced_online = False @@ -192,6 +195,8 @@ def evaluate( if workers_expected: for name, event in self.workers.update(workers, now=now): evt, template = self._WORKER_EDGES[event] + if event == "offline": + self._record_incident(self.EVT_WORKER_OFFLINE) alerts.append((evt, self._fmt(template.format(name=name)))) else: self.workers.reset() @@ -217,6 +222,7 @@ def _node_edges(self, label, down, attr): if prev is None or down == prev: return [] if down: + self._record_incident(self.EVT_NODE_DOWN) return [ ( self.EVT_NODE_DOWN, @@ -246,6 +252,8 @@ def _disk_edges(self, disk_percent): if prev is None or level == prev: return [] pct = f"{disk_percent:.0f}%" + if level in ("critical", "warn"): + self._record_incident(self.EVT_DISK_SPACE) if level == "critical": return [ ( @@ -277,6 +285,7 @@ def _db_edges(self, db_healthy): if prev is None or db_healthy == prev: return [] if not db_healthy: + self._record_incident(self.EVT_DB_UNHEALTHY) return [ ( self.EVT_DB_UNHEALTHY, @@ -310,6 +319,7 @@ def _xvb_share_edges(self, xvb_enabled, shares_in_window): if prev is None or has_share == prev: return [] if not has_share: + self._record_incident(self.EVT_XVB_NO_SHARE) return [ ( self.EVT_XVB_NO_SHARE, @@ -336,6 +346,7 @@ def _clearnet_edges(self, clearnet_active): if prev is None or clearnet_active == prev: return [] if clearnet_active: + self._record_incident(self.EVT_CLEARNET_EXPOSED) return [ ( self.EVT_CLEARNET_EXPOSED, @@ -365,6 +376,8 @@ def _registration_edges(self, xvb_enabled, state): self._prev_xvb_reg = state if prev is None or state == prev: return [] + if state in ("invalid", "failing"): + self._record_incident(self.EVT_XVB_REGISTRATION) if state == "invalid": return [ ( @@ -416,6 +429,7 @@ def _hashrate_low_edges(self, low_hr_warning): if prev is None or bool(low_hr_warning) == prev: return [] if low_hr_warning: + self._record_incident(self.EVT_HASHRATE_LOW) return [ ( self.EVT_HASHRATE_LOW, @@ -432,6 +446,16 @@ def _hashrate_low_edges(self, low_hr_warning): ) ] 
+ def _record_incident(self, key): + """Tally one problem-state transition for the daily incident log (#342).""" + self._incidents[key] = self._incidents.get(key, 0) + 1 + + def drain_incidents(self): + """Return the incidents tallied since the last drain and reset the counter. Called by the + daily digest so the count spans ~the last day (since the previous digest).""" + incidents, self._incidents = self._incidents, {} + return incidents + def _fmt(self, text): return f"[{self.host_label}] {text}" if self.host_label else text diff --git a/build/dashboard/mining_dashboard/service/data_service.py b/build/dashboard/mining_dashboard/service/data_service.py index f9f6fac..b27647e 100644 --- a/build/dashboard/mining_dashboard/service/data_service.py +++ b/build/dashboard/mining_dashboard/service/data_service.py @@ -831,8 +831,14 @@ async def run(self): # is on, which is also the only time maybe_daily_summary would send). await self.alert_service.maybe_daily_summary( time.time(), - # bind this cycle's metrics (the provider runs within this iteration). - lambda m=alert_metrics: format_daily_summary(m, self.latest_data, HOST_IP), + # bind this cycle's metrics (the provider runs within this iteration); drain + # the day's incident tally into the digest (#342). + lambda m=alert_metrics: format_daily_summary( + m, + self.latest_data, + HOST_IP, + incidents=self.alert_service.drain_incidents(), + ), ) self.latest_data.update( diff --git a/build/dashboard/mining_dashboard/service/telegram_commands.py b/build/dashboard/mining_dashboard/service/telegram_commands.py index 4bb1ef1..ab14fb8 100644 --- a/build/dashboard/mining_dashboard/service/telegram_commands.py +++ b/build/dashboard/mining_dashboard/service/telegram_commands.py @@ -232,7 +232,34 @@ def format_earnings(metrics, network, host_label=""): ) -def format_daily_summary(metrics, data, host_label="", now=None): +# Friendly labels for the daily incident log (#342), keyed by AlertService event. 
+_INCIDENT_LABELS = { + "node_down": "node down", + "worker_offline": "worker offline", + "disk_space": "disk warning", + "db_unhealthy": "DB write fail", + "xvb_no_share": "XvB no-share", + "xvb_registration": "XvB registration", + "clearnet_exposed": "clearnet exposure", + "hashrate_low": "hashrate low", +} + + +def _incident_line(incidents): + """One-line roll-up of the day's problems, or an all-clear. ``incidents`` is a {event: count} + dict (from ``AlertService.drain_incidents``); ``None`` means the caller didn't track any.""" + if incidents is None: + return None + if not incidents: + return "\U0001f7e2 No incidents in the last 24h" + parts = [ + f"{n}× {_INCIDENT_LABELS.get(k, k)}" + for k, n in sorted(incidents.items(), key=lambda kv: (-kv[1], kv[0])) + ] + return "\U0001f6a8 Incidents (24h): " + " · ".join(parts) + + +def format_daily_summary(metrics, data, host_label="", now=None, incidents=None): """The once-a-day retrospective pushed by the alerter — **what happened across the fleet over the last 24h**, not a live snapshot. Reuses the same domain values the dashboard shows. 
@@ -248,6 +275,9 @@ def format_daily_summary(metrics, data, host_label="", now=None): shares_24h = sum(1 for s in data.get("shares", []) if s.get("ts", 0) >= now - 86400) lines = [f"{_prefix(host_label)}\U0001f4c5 Daily summary — {stamp}"] + incident_line = _incident_line(incidents) + if incident_line: + lines.append(incident_line) lines.append(f"⚡ 24h hashrate: {format_hashrate(fleet_24h)}") if metrics.xvb_enabled: routed = (metrics.p2pool_24h or 0) + (metrics.xvb_routed_24h or 0) diff --git a/build/dashboard/tests/service/test_alert_service.py b/build/dashboard/tests/service/test_alert_service.py index 8697264..e3bf16e 100644 --- a/build/dashboard/tests/service/test_alert_service.py +++ b/build/dashboard/tests/service/test_alert_service.py @@ -312,6 +312,39 @@ def test_warns_then_recovers(self): assert "back above" in text +class TestIncidentLog: + def test_tallies_problems_and_drains(self): + svc = _svc() + _ev(svc, monero_down=False) # seed + _ev(svc, monero_down=True) # +node_down + _ev(svc, db_healthy=True) # seed + _ev(svc, db_healthy=False) # +db_unhealthy + _ev(svc, disk_percent=50) # seed + _ev(svc, disk_percent=97) # +disk_space (critical) + assert svc.drain_incidents() == { + "node_down": 1, + "db_unhealthy": 1, + "disk_space": 1, + } + assert svc.drain_incidents() == {} # drained → reset + + def test_recoveries_are_not_incidents(self): + svc = _svc() + _ev(svc, monero_down=False) + _ev(svc, monero_down=True) # +1 + _ev(svc, monero_down=False) # recovery — not counted + assert svc.drain_incidents() == {"node_down": 1} + + def test_worker_offline_counts_once(self): + svc = _svc() + _ev(svc, workers=_on("r"), workers_expected=True, now=0) # prime + _ev(svc, workers=_down("r"), workers_expected=True, now=0) # DOWN streak + _ev(svc, workers=_down("r"), workers_expected=True, now=300) # offline → incident + _ev(svc, workers=_on("r"), workers_expected=True, now=300) # back online + _ev(svc, workers=_on("r"), workers_expected=True, now=420) # recovered — not 
counted + assert svc.drain_incidents() == {"worker_offline": 1} + + class TestEventFiltering: def test_disabled_events_are_dropped(self): svc = _svc(notifier=_FakeNotifier(allow={AlertService.EVT_NODE_DOWN})) diff --git a/build/dashboard/tests/service/test_telegram_commands.py b/build/dashboard/tests/service/test_telegram_commands.py index 544b5b2..b51761b 100644 --- a/build/dashboard/tests/service/test_telegram_commands.py +++ b/build/dashboard/tests/service/test_telegram_commands.py @@ -279,6 +279,18 @@ def test_daily_summary_without_xvb_omits_split(): assert "to XvB" not in out +def test_daily_summary_incident_log(): + m, data = _metrics(xvb_enabled=False), {"workers": [], "shares": []} + # Incidents present → a roll-up line, highest count first. + out = tc.format_daily_summary(m, data, now=0, incidents={"worker_offline": 3, "node_down": 1}) + assert "Incidents (24h): 3× worker offline · 1× node down" in out + # Empty tally → an explicit all-clear. + assert "No incidents in the last 24h" in tc.format_daily_summary(m, data, now=0, incidents={}) + # Not tracked (None) → no incident line at all. + none = tc.format_daily_summary(m, data, now=0, incidents=None) + assert "Incidents" not in none and "No incidents" not in none + + def test_host_label_prefix(): assert tc.format_sync(_metrics(), host_label="rig-box").startswith("[rig-box] ") # The placeholder is never printed. diff --git a/docs/telegram.md b/docs/telegram.md index 1137609..1d840f9 100644 --- a/docs/telegram.md +++ b/docs/telegram.md @@ -36,7 +36,7 @@ transition, not a stream: | 📉 **Hashrate low for tier** | You picked a fixed XvB donation tier your hashrate can't sustain — lower the tier or add hashrate. Fires on the transition and clears when it recovers. | | 🆕 **New release** | A newer Pithead release is available (the same signal as the dashboard header badge). | | 🚀 **Pithead online** | Sent once when the dashboard starts — a heartbeat that the stack is up (and confirms the bot works after setup). 
| -| 📅 **Daily summary** | A once-a-day retrospective of the last 24h across your whole fleet — date/time, **24h hashrate** with the **P2Pool / XvB split**, **shares found in the day**, an **estimated daily earnings** figure, and a **per-machine 24h breakdown** — pushed at a set local time (**08:00** by default; `telegram.daily_summary_time`). | +| 📅 **Daily summary** | A once-a-day retrospective of the last 24h across your whole fleet — date/time, an **incident roll-up** (what went wrong during the day, or an all-clear), **24h hashrate** with the **P2Pool / XvB split**, **shares found in the day**, an **estimated daily earnings** figure, and a **per-machine 24h breakdown** — pushed at a set local time (**08:00** by default; `telegram.daily_summary_time`). | Every message is prefixed with your dashboard hostname (e.g. `[rig-box.lan]`), so if you point more than one stack at the same chat you can tell them apart. diff --git a/docs/test-inventory.md b/docs/test-inventory.md index 9f4d004..8ca2bb4 100644 --- a/docs/test-inventory.md +++ b/docs/test-inventory.md @@ -4,7 +4,7 @@ _Generated by `make test-inventory` ([`tests/inventory.sh`](../tests/inventory.s edit by hand** — re-run the target to refresh. See [Testing Strategy](testing-strategy.md) for how the tiers fit together._ -**Totals:** 740 dashboard unit tests · 12 contract tests · 64 frontend +**Totals:** 744 dashboard unit tests · 12 contract tests · 64 frontend tests · 52 `pithead` shell sections · 18 harness self-test sections · 9 live config scenarios (17 axis values) · 8 mini-stack scenarios. 
@@ -14,7 +14,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · | Tier | Suite | Cases | |---|---|---| -| 1 — Unit | dashboard pytest | 740 | +| 1 — Unit | dashboard pytest | 744 | | 1 — Unit | frontend (node --test) | 64 | | 1 — Unit | `pithead` shell suite | 52 sections | | 1 — Unit | compose interpolation + hardening (#90) | 1 | @@ -27,7 +27,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · ## Tier 1 — Unit & component -### Dashboard (pytest) — 740 tests +### Dashboard (pytest) — 744 tests #### tests/client/test_docker_control.py — 6 - test_tcp_scheme_rewritten_to_http @@ -210,7 +210,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_none_when_no_route - test_socket_is_closed_even_on_error -#### tests/service/test_alert_service.py — 38 +#### tests/service/test_alert_service.py — 41 - test_every_alert_event_has_a_config_toggle - test_first_cycle_seeds_baseline_silently - test_down_then_recovered @@ -238,6 +238,9 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_benign_transition_is_silent - test_fires_once_on_rising_edge - test_warns_then_recovers +- test_tallies_problems_and_drains +- test_recoveries_are_not_incidents +- test_worker_offline_counts_once - test_disabled_events_are_dropped - test_prefixes_when_set - test_placeholder_host_is_not_prefixed @@ -548,7 +551,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_history_older_than_retention_pruned_from_memory - test_old_history_pruned_from_db_when_cleanup_fires -#### tests/service/test_telegram_commands.py — 43 +#### tests/service/test_telegram_commands.py — 44 - test_parse_command - test_status_active - test_status_syncing_beats_mining_flag @@ -571,6 +574,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_earnings_unavailable_without_network_data - test_daily_summary_is_a_24h_retrospective - test_daily_summary_without_xvb_omits_split +- 
test_daily_summary_incident_log - test_host_label_prefix - test_reply_for_help_and_unknown_need_no_metrics - test_reply_for_status_uses_mining_flag @@ -1113,5 +1117,5 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · --- -_Grand total: **903** enumerated cases/sections across the four tiers (plus the live +_Grand total: **907** enumerated cases/sections across the four tiers (plus the live lifecycle and fault-injection phases, which are exercised on a real server)._ From 233c8aa5b220b4a3f9250ead2d5407c4fb0c2cf0 Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Fri, 3 Jul 2026 10:06:27 -0500 Subject: [PATCH 13/18] =?UTF-8?q?feat(#99):=20hashrate-drop=20detector=20?= =?UTF-8?q?=E2=80=94=20chart=20markers=20+=20Telegram=20alert?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flags a sustained, significant fall in total fleet hashrate (a rig gone dark, a network cut, a stalled miner), distinct from the existing "too low for your XvB tier" warning. - DegradationMonitor: EMA baseline (frozen while degraded so an outage can't redefine normal), debounced loss/recovery edges, cold-start gate. - Persisted chart event markers (amber drop / green recovery), survive a dashboard restart; new events table with additive migration. - hashrate_loss Telegram alert + daily incident tally; two config knobs (dashboard.hashrate_drop_threshold / hashrate_drop_minutes). - README highlights: Telegram operator bot + Healthchecks dead-man's switch. - Docs: telegram.md, configuration.md, dashboard.md, CHANGELOG. - Tests: DegradationMonitor unit, events storage, degradation_alert, data_service wiring, views event points, frontend eventColors, stack knob propagation; event-set consistency auto-covers the new event. 
Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 11 +++ README.md | 6 ++ .../mining_dashboard/config/config.py | 8 ++ .../mining_dashboard/service/alert_service.py | 19 ++++ .../mining_dashboard/service/data_service.py | 28 ++++++ .../mining_dashboard/service/degradation.py | 87 +++++++++++++++++++ .../service/storage_service.py | 46 ++++++++++ .../service/telegram_commands.py | 1 + .../mining_dashboard/web/static/chart.mjs | 28 ++++++ build/dashboard/mining_dashboard/web/views.py | 49 ++++++++++- build/dashboard/tests/frontend/chart.test.mjs | 16 +++- .../tests/service/test_alert_service.py | 29 +++++++ .../tests/service/test_data_service.py | 49 +++++++++++ .../tests/service/test_degradation.py | 81 +++++++++++++++++ .../tests/service/test_storage_service.py | 49 +++++++++++ build/dashboard/tests/web/test_views.py | 46 +++++++++- config.reference.json | 5 +- docker-compose.yml | 4 + docs/configuration.md | 4 +- docs/dashboard.md | 6 ++ docs/telegram.md | 5 ++ docs/test-inventory.md | 42 ++++++--- pithead | 10 ++- tests/stack/run.sh | 9 ++ 24 files changed, 621 insertions(+), 17 deletions(-) create mode 100644 build/dashboard/mining_dashboard/service/degradation.py create mode 100644 build/dashboard/tests/service/test_degradation.py diff --git a/CHANGELOG.md b/CHANGELOG.md index af48e32..59a0fdb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -127,6 +127,17 @@ cd pithead && cp config.json.template config.json # set your Monero + Tari pay walkthrough — creating a bot, finding your chat id, the command list, and the "one chat, two bots" pattern for sharing a chat with the Healthchecks.io monitor (#79) — in [`docs/telegram.md`](docs/telegram.md). +- **Hashrate-drop detector — chart markers + `hashrate_loss` alert** (#99): the dashboard now flags a + **sustained, significant fall** in total fleet hashrate — a rig gone dark, a network cut, a stalled + miner — separately from the existing "too low for your XvB tier" warning. 
It tracks a slow moving + average as the "normal" level (frozen while degraded so an outage can't quietly redefine normal), + and fires once the total stays below **`dashboard.hashrate_drop_threshold`** percent of that + baseline for **`dashboard.hashrate_drop_minutes`** (defaults: **50%** for **10 min**), with a + matching recovery edge. Each edge drops a **diamond marker on the hashrate chart** (amber for the + drop, green for the recovery; hover for the size) that is **persisted**, so an overnight drop is + still visible in the morning, and — when Telegram is on — pushes a **`hashrate_loss`** alert and + counts toward the daily incident roll-up. Both knobs are documented in + [`docs/configuration.md`](docs/configuration.md); the alert in [`docs/telegram.md`](docs/telegram.md). - **Optional clearnet initial sync (#183).** A default-off, per-component opt-in (`monero.clearnet_initial_sync` / `tari.clearnet_initial_sync`) that lets a node do its **one-time initial block download over clearnet** — much faster than over bandwidth-capped Tor circuits, which diff --git a/README.md b/README.md index 5b4ba61..b56f2a3 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,12 @@ a Tor daemon. The `pithead` script renders config, provisions Tor, and drives do address in the miner config; the stack routes the hashrate. - 📊 **Live dashboard.** Hashrate, the P2Pool/XvB split, the PPLNS window, and per-worker updates, served over HTTPS on your LAN. +- 📟 **Telegram operator bot.** Opt-in alerts for a downed node, a worker that dropped off, sync + finishing, low disk, a clearnet leak, or a sustained hashrate drop — plus a daily digest and + read-only commands (`/status`, `/hashrate`, `/workers`, `/earnings`). Routed over Tor. See the + [Telegram guide](docs/telegram.md). +- 🔔 **Dead-man's switch.** An optional [Healthchecks.io](https://healthchecks.io/) ping tells you + when the whole box goes dark — the one failure a monitor running *on* that box can never report. 
- 🚀 **Interactive setup.** `pithead setup` checks dependencies, writes config, provisions Tor, and (on Linux) tunes HugePages for RandomX. It prompts before any GRUB change, then offers to start. - 🔒 **Hardened defaults.** Non-root containers, SHA256-verified binaries, pinned image digests, diff --git a/build/dashboard/mining_dashboard/config/config.py b/build/dashboard/mining_dashboard/config/config.py index 082b9b2..3c741d1 100644 --- a/build/dashboard/mining_dashboard/config/config.py +++ b/build/dashboard/mining_dashboard/config/config.py @@ -271,6 +271,7 @@ def _telegram_event_enabled(name, default=True): "stack_online": _telegram_event_enabled("stack_online"), "daily_summary": _telegram_event_enabled("daily_summary"), "hashrate_low": _telegram_event_enabled("hashrate_low"), + "hashrate_loss": _telegram_event_enabled("hashrate_loss"), } # ponytail: daily_summary is a scheduled push, not an edge — it lives in the events dict only so it # gets a per-event on/off toggle like the rest; its time is TELEGRAM_DAILY_SUMMARY_TIME below. @@ -280,6 +281,13 @@ def _telegram_event_enabled(name, default=True): # the box is. Rendered from config.json telegram.daily_summary_time. TELEGRAM_DAILY_SUMMARY_TIME = os.environ.get("TELEGRAM_DAILY_SUMMARY_TIME", "08:00").strip() +# Hashrate-degradation detector (Issue #99). Flags a sustained drop in total hashrate below +# HASHRATE_DROP_THRESHOLD_PCT of its trailing baseline for HASHRATE_DROP_MINUTES minutes — surfaced +# as a chart event marker (always on) and, when telegram.events.hashrate_loss is on, an alert. +# Rendered from config.json dashboard.hashrate_drop_threshold / dashboard.hashrate_drop_minutes. +HASHRATE_DROP_THRESHOLD_PCT = int(float(os.environ.get("HASHRATE_DROP_THRESHOLD_PCT", 50))) +HASHRATE_DROP_MINUTES = int(float(os.environ.get("HASHRATE_DROP_MINUTES", 10))) + # Worker offline/online debounce (Issue #121). 
A worker must be unseen this long before it's # reported OFFLINE, and seen continuously this long before "back online" — so a brief miner # reconnect doesn't spam the chat. Workers flap more than nodes (rig reboots, Wi-Fi blips), diff --git a/build/dashboard/mining_dashboard/service/alert_service.py b/build/dashboard/mining_dashboard/service/alert_service.py index 7ef38e7..253ade8 100644 --- a/build/dashboard/mining_dashboard/service/alert_service.py +++ b/build/dashboard/mining_dashboard/service/alert_service.py @@ -87,6 +87,7 @@ class AlertService: EVT_STACK_ONLINE = "stack_online" EVT_DAILY_SUMMARY = "daily_summary" EVT_HASHRATE_LOW = "hashrate_low" + EVT_HASHRATE_LOSS = "hashrate_loss" # WorkerPresenceMonitor edge -> (event key, message template). _WORKER_EDGES = { @@ -474,6 +475,24 @@ async def process(self, **signals): await asyncio.to_thread(self.notifier.send, text) return alerts + async def degradation_alert(self, kind, drop_frac): + """Push a hashrate-loss / recovery alert for a :class:`DegradationMonitor` edge (#99). The + detector owns the debounce + thresholds; this only formats and sends (and records the loss + as an incident for the daily log). No-op when the event is toggled off.""" + if kind == "loss": + self._record_incident(self.EVT_HASHRATE_LOSS) + if not self.notifier.event_enabled(self.EVT_HASHRATE_LOSS): + return None + if kind == "loss": + text = self._fmt( + f"⚠️ \U0001f4c9 Hashrate dropped ~{drop_frac * 100:.0f}% — possible outage or a rig " + "gone dark." + ) + else: + text = self._fmt("\U0001f7e2 \U0001f4c8 Hashrate recovered.") + await asyncio.to_thread(self.notifier.send, text) + return text + async def maybe_daily_summary(self, now, summary_provider): """Push a once-daily status digest at the configured local time. 
diff --git a/build/dashboard/mining_dashboard/service/data_service.py b/build/dashboard/mining_dashboard/service/data_service.py index b27647e..82f690b 100644 --- a/build/dashboard/mining_dashboard/service/data_service.py +++ b/build/dashboard/mining_dashboard/service/data_service.py @@ -32,6 +32,8 @@ CLEARNET_STATE_DIR, ENABLE_XVB, GITHUB_RELEASES_API, + HASHRATE_DROP_MINUTES, + HASHRATE_DROP_THRESHOLD_PCT, HOST_IP, MONERO_CLEARNET_SYNC, REJECT_WORKERS_CONTAINER, @@ -47,11 +49,13 @@ from mining_dashboard.helper.utils import ( DEFAULT_PPLNS_WINDOW, effective_hashrate, + format_hashrate, pplns_block_time, shares_in_pplns_window, ) from mining_dashboard.service.alert_service import AlertService from mining_dashboard.service.clearnet_sync import ClearnetSyncSupervisor +from mining_dashboard.service.degradation import DegradationMonitor from mining_dashboard.service.healthchecks import HealthchecksClient from mining_dashboard.service.metrics import build_metrics from mining_dashboard.service.node_health import NodeHealthMonitor @@ -415,6 +419,13 @@ def __init__(self, state_manager, proxy_client, xvb_client): # Disabled unless telegram.enabled + bot_token + chat_id are configured, so this is a # cheap no-op for the default stack. self.alert_service = AlertService() + # Hashrate-degradation detector (Issue #99): flags a sustained total-hashrate drop and its + # recovery. Runs every cycle (cheap, self-contained EMA baseline) so it can mark the chart + # even with Telegram off; a loss also drives a hashrate_loss alert. + self.degradation = DegradationMonitor( + threshold_frac=HASHRATE_DROP_THRESHOLD_PCT / 100, + sustained_sec=HASHRATE_DROP_MINUTES * 60, + ) # True while we've stopped the proxy to reject workers. Persisted in the snapshot so # a dashboard restart mid-outage still readmits workers once the node recovers. self.workers_rejected = False @@ -840,6 +851,23 @@ async def run(self): incidents=self.alert_service.drain_incidents(), ), ) + # 6. 
Degradation detector (#99): a sustained total-hashrate drop / recovery is + # persisted as a chart event marker and pushed as a hashrate_loss alert. + deg_edge = self.degradation.update(total_hr) + if deg_edge: + kind, drop_frac, _baseline, current = deg_edge + if kind == "loss": + ev_type = "hashrate_loss" + detail = ( + f"Hashrate −{drop_frac * 100:.0f}% ({format_hashrate(current)})" + ) + else: + ev_type = "hashrate_recovered" + detail = f"Hashrate recovered ({format_hashrate(current)})" + await asyncio.to_thread( + self.state_manager.add_event, time.time(), ev_type, detail + ) + await self.alert_service.degradation_alert(kind, drop_frac) self.latest_data.update( { diff --git a/build/dashboard/mining_dashboard/service/degradation.py b/build/dashboard/mining_dashboard/service/degradation.py new file mode 100644 index 0000000..afd66ef --- /dev/null +++ b/build/dashboard/mining_dashboard/service/degradation.py @@ -0,0 +1,87 @@ +"""Hashrate-degradation detector (Issue #99). + +Flags a **sustained significant drop** in total effective hashrate — an outage or a rig going +dark — and its recovery, as debounced edges. One detector, consumed by two sinks: an event marker +on the dashboard chart, and a Telegram alert (#121). Defining it here once keeps the thresholds and +debounce in a single place rather than duplicated per sink. + +Design: + +- **Self-contained baseline.** The "normal" level is a slow exponential moving average of the total + hashrate kept in-process — no per-cycle DB read, so the detector can run every loop even when + Telegram is off (the chart marker is a passive dashboard feature). The baseline is **frozen while + degraded**, so a drop that persists doesn't drag the baseline down and mask itself. 
+- **Debounce / hysteresis.** A drop must stay below ``threshold_frac`` of the baseline for + ``sustained_sec`` before a ``loss`` edge fires, and climb back above ``recovery_frac`` for + ``recovery_sec`` before ``recovered`` — so a brief blip doesn't mark the chart or ping you. +- **Cold-start safe.** Until the baseline exceeds ``min_baseline`` (a tiny/just-started fleet), no + edges fire — a stack that hasn't ramped yet can't be "degraded". + +``update(current, now)`` returns ``None`` or a ``(kind, drop_frac, baseline, current)`` tuple, +``kind`` in ``{"loss", "recovered"}``. +""" + +import time + + +class DegradationMonitor: + def __init__( + self, + threshold_frac=0.5, + sustained_sec=600, + recovery_frac=0.8, + recovery_sec=120, + min_baseline=500, + ema_alpha=0.01, + clock=time.monotonic, + ): + self.threshold_frac = threshold_frac + self.sustained_sec = sustained_sec + self.recovery_frac = recovery_frac + self.recovery_sec = recovery_sec + self.min_baseline = min_baseline + self.ema_alpha = ema_alpha + self._clock = clock + self._baseline = None # EMA of total hashrate; the "normal" level + self._degraded = False + self._below_since = None + self._above_since = None + + def update(self, current, now=None): + now = self._clock() if now is None else now + current = current or 0 + # Update the baseline only while healthy, so a sustained drop can't erode it and hide itself. + if self._baseline is None: + self._baseline = current + elif not self._degraded: + self._baseline = (1 - self.ema_alpha) * self._baseline + self.ema_alpha * current + baseline = self._baseline + + if baseline < self.min_baseline: + # Not enough hashrate to judge (cold start / tiny fleet) — no false alarms. 
+ self._below_since = self._above_since = None + return None + + drop_frac = max(0.0, 1 - current / baseline) if baseline else 0.0 + + if not self._degraded: + if current < self.threshold_frac * baseline: + if self._below_since is None: + self._below_since = now + if now - self._below_since >= self.sustained_sec: + self._degraded = True + self._below_since = None + return ("loss", drop_frac, baseline, current) + else: + self._below_since = None + else: + if current >= self.recovery_frac * baseline: + if self._above_since is None: + self._above_since = now + if now - self._above_since >= self.recovery_sec: + self._degraded = False + self._above_since = None + return ("recovered", drop_frac, baseline, current) + else: + self._above_since = None + return None diff --git a/build/dashboard/mining_dashboard/service/storage_service.py b/build/dashboard/mining_dashboard/service/storage_service.py index c5e6599..33af2c4 100644 --- a/build/dashboard/mining_dashboard/service/storage_service.py +++ b/build/dashboard/mining_dashboard/service/storage_service.py @@ -43,6 +43,7 @@ def __init__(self, db_path: str = None): self.state = { "hashrate_history": deque(), "shares": [], + "events": [], # degradation / recovery markers for the chart (#99) "xvb": { "total_donated_time": 0.0, "current_mode": "P2POOL", @@ -119,6 +120,9 @@ def _create_tables(self): self._conn.execute( "CREATE TABLE IF NOT EXISTS shares (ts REAL PRIMARY KEY, difficulty REAL)" ) + # Degradation / recovery event markers for the chart (#99). No PK — two events can share a + # timestamp; type is "hashrate_loss"|"hashrate_recovered"|... and detail is the tooltip text. + self._conn.execute("CREATE TABLE IF NOT EXISTS events (ts REAL, type TEXT, detail TEXT)") def _create_indexes(self): """Creates indexes. Called after migrations so the indexed columns are guaranteed to @@ -234,6 +238,17 @@ def load(self): ) self.state["shares"] = [dict(row) for row in cursor.fetchall()] + # 4. 
Load chart events (#99) — the events table is additive, so guard against a + # pre-migration DB that predates it. + try: + cursor.execute( + "SELECT ts, type, detail FROM events WHERE ts > ? ORDER BY ts ASC", + (history_cutoff,), + ) + self.state["events"] = [dict(row) for row in cursor.fetchall()] + except sqlite3.Error: + self.state["events"] = [] + self.logger.info(f"State successfully loaded from {self.db_path}") except sqlite3.Error as e: self.logger.error(f"DB Load Error: {e}") @@ -366,6 +381,37 @@ def get_shares(self) -> list[dict[str, Any]]: with self._lock: return list(self.state.get("shares", [])) + def add_event(self, ts: float, event_type: str, detail: str = ""): + """Record a chart event marker (#99) — a degradation/recovery point the chart draws and the + history window prunes, mirroring shares. Persisted so it survives a dashboard restart.""" + with self._lock: + self.state.setdefault("events", []).append( + {"ts": ts, "type": event_type, "detail": detail} + ) + cutoff = time.time() - HISTORY_RETENTION_SEC + self.state["events"] = [e for e in self.state["events"] if e["ts"] >= cutoff] + try: + with self._db_lock: + if not self._conn: + return + with self._conn: + self._conn.execute( + "INSERT INTO events (ts, type, detail) VALUES (?, ?, ?)", + (ts, event_type, detail), + ) + if random.random() < 0.05: # noqa: S311 — pruning sampler, not a security context + self._conn.execute( + "DELETE FROM events WHERE ts < ?", + (time.time() - HISTORY_RETENTION_SEC,), + ) + except sqlite3.Error as e: + self._db_error("Event Insert Error", e) + + def get_events(self) -> list[dict[str, Any]]: + """Returns a copy of the chart events (#99).""" + with self._lock: + return list(self.state.get("events", [])) + def get_xvb_stats(self) -> dict[str, Any]: """Returns the current XvB mining statistics dictionary.""" with self._lock: diff --git a/build/dashboard/mining_dashboard/service/telegram_commands.py b/build/dashboard/mining_dashboard/service/telegram_commands.py index 
ab14fb8..8c8513f 100644 --- a/build/dashboard/mining_dashboard/service/telegram_commands.py +++ b/build/dashboard/mining_dashboard/service/telegram_commands.py @@ -242,6 +242,7 @@ def format_earnings(metrics, network, host_label=""): "xvb_registration": "XvB registration", "clearnet_exposed": "clearnet exposure", "hashrate_low": "hashrate low", + "hashrate_loss": "hashrate drop", } diff --git a/build/dashboard/mining_dashboard/web/static/chart.mjs b/build/dashboard/mining_dashboard/web/static/chart.mjs index e030243..cf638b4 100644 --- a/build/dashboard/mining_dashboard/web/static/chart.mjs +++ b/build/dashboard/mining_dashboard/web/static/chart.mjs @@ -76,12 +76,20 @@ function paletteColors() { accent, purple, shares: v("--bad", "#da3633"), + evtLoss: v("--warn", "#d29922"), // degradation event marker (#99) + evtOk: v("--ok", "#3fb950"), // recovery event marker grid: v("--border", "#30363d"), ticks: v("--text-muted", "#8b949e"), band: withAlpha(accent, "26"), // drag-to-zoom selection band (≈ 0.15 alpha) }; } +// Per-point colour for the degradation event markers (#99): green for a recovery, amber for a +// loss. Returns one colour per point so a single dataset can show both. +export function eventColors(events, c) { + return (events || []).map((e) => (e.kind === "hashrate_recovered" ? c.evtOk : c.evtLoss)); +} + // Area-fill gradient stops (Issue #145): strong near the line, fading toward the axis, so a flat // series reads as a solid mass instead of a thin strip. Line a touch thicker than the default so // the top edge pops against the fill. @@ -214,6 +222,20 @@ export class ChartCard extends Component { pointHitRadius: 100, showLine: false, }, + // Degradation/recovery markers (#99) on their own hidden axis, just below the share rug. + // A diamond per event, amber for a loss and green for a recovery; tooltip carries the label. 
+ { + label: "Events", + data: d.events || [], + yAxisID: "events", + pointStyle: "rectRot", + pointRadius: 7, + pointHoverRadius: 10, + pointHitRadius: 100, + showLine: false, + pointBackgroundColor: eventColors(d.events, c), + pointBorderColor: eventColors(d.events, c), + }, ], }, options: { @@ -232,6 +254,7 @@ export class ChartCard extends Component { label(context) { if (context.dataset.label === "Shares") return self.shareCounts[context.dataIndex] + " Shares"; + if (context.dataset.label === "Events") return context.raw.label; let label = context.dataset.label || ""; if (label) label += ": "; if (context.parsed.y !== null) label += context.parsed.y + " H/s"; @@ -276,6 +299,8 @@ export class ChartCard extends Component { // Hidden 0–1 axis the Shares scatter rides on; markers pin near the top (0.93, // set server-side) so they never affect the hashrate y-range (Issue #145). shares: { type: "linear", display: false, min: 0, max: 1 }, + // Hidden 0–1 axis the degradation event markers ride on (#99), pinned near the top. 
+ events: { type: "linear", display: false, min: 0, max: 1 }, }, }, }); @@ -310,6 +335,9 @@ export class ChartCard extends Component { ds[2].borderColor = c.shares; ds[2].backgroundColor = c.shares; ds[2].pointRadius = d.shares.map((s) => s.r); + ds[3].data = d.events || []; + ds[3].pointBackgroundColor = eventColors(d.events, c); + ds[3].pointBorderColor = eventColors(d.events, c); this.chart.options.scales.y.grid.color = c.grid; this.chart.options.scales.y.ticks.color = c.ticks; this.applyVisibility(); diff --git a/build/dashboard/mining_dashboard/web/views.py b/build/dashboard/mining_dashboard/web/views.py index d86ce80..79090af 100644 --- a/build/dashboard/mining_dashboard/web/views.py +++ b/build/dashboard/mining_dashboard/web/views.py @@ -151,7 +151,9 @@ def build_raffle_eligibility(metrics): # -------------------------------------------------------------------------------------- -def build_chart(history, shares, range_arg, window=None, avg_window=DEFAULT_HASHRATE_WINDOW): +def build_chart( + history, shares, range_arg, window=None, avg_window=DEFAULT_HASHRATE_WINDOW, events=None +): """Build the Chart.js datasets from history. Each point carries its real timestamp as the x value (epoch ms) so a linear time axis spaces points to scale; runs of missing samples (outages) are split by a ``null`` break so the line doesn't connect across the gap. 
@@ -188,6 +190,7 @@ def build_chart(history, shares, range_arg, window=None, avg_window=DEFAULT_HASH "p2pool": p2pool, "xvb": xvb, "shares": _share_points(filtered_history, filtered_shares), + "events": _event_points(_filter_events(events or [], range_arg, window)), "tension": _chart_tension(duration_s), } @@ -214,6 +217,21 @@ def _filter_range(history, shares, range_arg, window=None): ) +def _filter_events(events, range_arg, window=None): + """Restrict degradation events (#99) to the selected window — same bounds as ``_filter_range``, + but for the ``ts``-keyed events list.""" + if window is not None: + lo, hi = window + return [e for e in events if lo <= e["ts"] <= hi] + if range_arg == "all": + return events + secs = _RANGE_SECONDS.get(range_arg, 0) + if secs <= 0: + return events + cutoff = time.time() - secs + return [e for e in events if e["ts"] >= cutoff] + + def _window_duration(filtered_history, range_arg, window): """Seconds the chart currently spans — drives adaptive resolution/smoothing. From the window if zoomed, else the preset length, else (``all``/unknown) the actual data extent.""" @@ -339,6 +357,26 @@ def _share_points(filtered_history, filtered_shares): return points +# Event markers ride just below the share rug on their own hidden 0–1 axis (#99), so a "something +# went wrong" marker sits at the event's real time without touching the hashrate y-range. +_EVENT_MARKER_Y = 0.82 + + +def _event_points(filtered_events): + """Sparse degradation/recovery markers (#99): one point per event at its timestamp, carrying the + tooltip ``label`` and ``kind`` (e.g. 
``hashrate_loss`` vs ``hashrate_recovered``) so the client + can colour a loss vs a recovery.""" + return [ + { + "x": int(e["ts"] * 1000), + "y": _EVENT_MARKER_Y, + "kind": e.get("type", ""), + "label": e.get("detail") or e.get("type", "event"), + } + for e in filtered_events + ] + + # -------------------------------------------------------------------------------------- # Section builders: Metrics (+ passthrough) -> display data. # -------------------------------------------------------------------------------------- @@ -933,7 +971,14 @@ def build_state(data, state_mgr, range_arg, window=None, avg_window=DEFAULT_HASH "proxy_summary": build_proxy_summary(data), "egress": egress, "topology": topology, - "chart": build_chart(history, data.get("shares", []), range_arg, window, avg_window), + "chart": build_chart( + history, + data.get("shares", []), + range_arg, + window, + avg_window, + events=state_mgr.get_events(), + ), } diff --git a/build/dashboard/tests/frontend/chart.test.mjs b/build/dashboard/tests/frontend/chart.test.mjs index fccff6c..806b609 100644 --- a/build/dashboard/tests/frontend/chart.test.mjs +++ b/build/dashboard/tests/frontend/chart.test.mjs @@ -9,7 +9,7 @@ import { test } from 'node:test'; import assert from 'node:assert/strict'; -import { withAlpha, padYAxis } from '../../mining_dashboard/web/static/chart.mjs'; +import { withAlpha, padYAxis, eventColors } from '../../mining_dashboard/web/static/chart.mjs'; test('withAlpha: appends an 8-bit alpha to a #rrggbb hex', () => { assert.equal(withAlpha('#58a6ff', '26'), '#58a6ff26'); @@ -52,3 +52,17 @@ test('padYAxis: no-op when the range is non-finite (all series hidden / no data) padYAxis(s); assert.ok(Number.isNaN(s.min) && Number.isNaN(s.max)); }); + +test('eventColors: maps recovery to ok, everything else to loss (#99)', () => { + const c = { evtOk: '#3fb950', evtLoss: '#d29922' }; + const events = [ + { kind: 'hashrate_loss' }, + { kind: 'hashrate_recovered' }, + { kind: '' }, + ]; + 
assert.deepEqual(eventColors(events, c), [c.evtLoss, c.evtOk, c.evtLoss]); +}); + +test('eventColors: tolerates a missing events list', () => { + assert.deepEqual(eventColors(undefined, { evtOk: 'g', evtLoss: 'r' }), []); +}); diff --git a/build/dashboard/tests/service/test_alert_service.py b/build/dashboard/tests/service/test_alert_service.py index e3bf16e..4cfb7c9 100644 --- a/build/dashboard/tests/service/test_alert_service.py +++ b/build/dashboard/tests/service/test_alert_service.py @@ -496,3 +496,32 @@ def boom(): # Marked done for today even though the build failed → no retry storm. monkeypatch.setattr(alert_mod.time, "localtime", _fake_localtime(8, 5)) assert await svc.maybe_daily_summary(0, lambda: "digest") is None + + +class TestDegradationAlert: + """The #99 hashrate-loss / recovery push. The DegradationMonitor owns the debounce; this only + formats + sends, tallies the loss as an incident, and honours the event toggle.""" + + async def test_loss_sends_and_records_incident(self): + n = _FakeNotifier() + svc = _svc(notifier=n) + text = await svc.degradation_alert("loss", 0.62) + assert "62%" in text and "dropped" in text.lower() + assert n.sent == [text] + assert svc.drain_incidents() == {AlertService.EVT_HASHRATE_LOSS: 1} + + async def test_recovery_sends_no_incident(self): + n = _FakeNotifier() + svc = _svc(notifier=n) + text = await svc.degradation_alert("recovered", 0.0) + assert "recovered" in text.lower() + assert n.sent == [text] + assert svc.drain_incidents() == {} # recovery is not an incident + + async def test_gated_off_still_records_loss(self): + # Toggle off suppresses the message but the incident is still tallied for the daily log. 
+ n = _FakeNotifier(allow=set()) + svc = _svc(notifier=n) + assert await svc.degradation_alert("loss", 0.5) is None + assert n.sent == [] + assert svc.drain_incidents() == {AlertService.EVT_HASHRATE_LOSS: 1} diff --git a/build/dashboard/tests/service/test_data_service.py b/build/dashboard/tests/service/test_data_service.py index ecb8098..d379e31 100644 --- a/build/dashboard/tests/service/test_data_service.py +++ b/build/dashboard/tests/service/test_data_service.py @@ -708,6 +708,55 @@ async def test_single_iteration_aggregates(self): sm.update_history.assert_called() sm.save_snapshot.assert_called() + async def test_degradation_edge_records_event_and_alerts(self): + # #99 wiring: when the detector reports an edge, the loop persists a chart marker and pushes + # the hashrate_loss alert. Stub the detector so a single iteration produces a deterministic + # edge (the debounce itself is unit-tested in test_degradation.py). + svc, sm, proxy = _make_service() + proxy.get_workers.return_value = {"workers": []} + proxy.get_summary.return_value = {"results": {}} + svc.degradation = MagicMock() + svc.degradation.update.return_value = ("loss", 0.6, 1000.0, 400.0) + svc.alert_service.degradation_alert = AsyncMock() + + worker_client = MagicMock() + worker_client.get_stats = AsyncMock(return_value={}) + tari_client = MagicMock() + tari_client.get_sync_status = AsyncMock(return_value={"is_syncing": False}) + tari_client.close = AsyncMock() + + with ( + patch.object(ds_mod, "ClientSession", _FakeClientSession), + patch.object(ds_mod, "XMRigWorkerClient", return_value=worker_client), + patch.object(ds_mod, "TariClient", return_value=tari_client), + patch.object(ds_mod, "get_stratum_stats", return_value={}), + patch.object(ds_mod, "get_network_stats", return_value={"height": 100}), + patch.object( + ds_mod, "get_tari_stats", return_value={"active": True, "status": "OK", "height": 3} + ), + patch.object( + ds_mod, + "get_p2pool_stats", + return_value={"pool": {"last_share_time": 0, 
"difficulty": 0}}, + ), + patch.object( + ds_mod, + "get_monero_sync_status", + AsyncMock(return_value={"is_syncing": False, "percent": 100}), + ), + patch.object(ds_mod, "get_disk_usage", return_value={}), + patch.object(ds_mod, "get_hugepages_status", return_value=("Enabled", "ok", "1/2")), + patch.object(ds_mod, "get_memory_usage", return_value={}), + patch.object(ds_mod, "get_load_average", return_value="0"), + patch.object(ds_mod, "get_cpu_usage", return_value="0%"), + patch("asyncio.sleep", AsyncMock(side_effect=StopAsyncIteration)), + ): + with pytest.raises(StopAsyncIteration): + await svc.run() + + assert sm.add_event.call_args.args[1] == "hashrate_loss" + svc.alert_service.degradation_alert.assert_awaited_once_with("loss", 0.6) + async def test_run_holds_miner_while_syncing(self): # A syncing Monero node → gate holds p2pool + xmrig-proxy and #31's failover stays # dormant (no workers to fail over before we've even started mining). diff --git a/build/dashboard/tests/service/test_degradation.py b/build/dashboard/tests/service/test_degradation.py new file mode 100644 index 0000000..e618190 --- /dev/null +++ b/build/dashboard/tests/service/test_degradation.py @@ -0,0 +1,81 @@ +"""Unit tests for the hashrate-degradation detector (Issue #99). + +Pure logic, driven by an injectable clock — no timers, no sleeps. Each test walks the monitor +through a scripted (value, time) sequence and asserts which edges fire. +""" + +from mining_dashboard.service.degradation import DegradationMonitor + + +def _mon(**kw): + # Small thresholds so tests read clearly: baseline established above 500, 50% drop for 600s. 
+ kw.setdefault("min_baseline", 500) + return DegradationMonitor(**kw) + + +def test_steady_state_never_fires(): + m = _mon() + for t in range(0, 6000, 60): + assert m.update(1000, now=t) is None + + +def test_cold_start_below_min_baseline_is_silent(): + m = _mon(min_baseline=500) + # A tiny fleet that never crosses min_baseline can't be "degraded", even at zero hashrate. + for t in range(0, 3000, 60): + assert m.update(100, now=t) is None + assert m.update(0, now=t + 30) is None + + +def test_sustained_drop_fires_loss_once(): + m = _mon(sustained_sec=600, threshold_frac=0.5) + m.update(1000, now=0) # seed baseline + # Drop to zero and hold; nothing until the debounce window elapses. + assert m.update(0, now=60) is None + assert m.update(0, now=600) is None + edge = m.update(0, now=660) # 600s below threshold since t=60 + assert edge is not None + kind, drop_frac, baseline, current = edge + assert kind == "loss" + assert drop_frac > 0.9 + assert current == 0 + # Still down — no repeat. + assert m.update(0, now=1300) is None + + +def test_brief_blip_does_not_fire(): + m = _mon(sustained_sec=600, threshold_frac=0.5) + m.update(1000, now=0) + assert m.update(0, now=60) is None # blip starts + assert m.update(1000, now=120) is None # recovers well before 600s + # Baseline intact, clock reset — a later full window from scratch is needed to fire. + assert m.update(0, now=180) is None + assert m.update(0, now=700) is None + assert m.update(0, now=781) is not None # 600s since t=180 + + +def test_recovery_fires_after_hold(): + m = _mon(sustained_sec=600, recovery_frac=0.8, recovery_sec=120) + m.update(1000, now=0) + m.update(0, now=60) + assert m.update(0, now=660)[0] == "loss" + # Climb back above 80% of baseline and hold recovery_sec. 
+ assert m.update(1000, now=720) is None + edge = m.update(1000, now=840) # 120s above recovery threshold + assert edge is not None and edge[0] == "recovered" + + +def test_baseline_frozen_while_degraded(): + # A sustained drop must not erode the baseline (which would mask the outage and prevent recovery + # detection). Once degraded, the baseline is pinned — a long outage doesn't drag it down. + m = _mon(sustained_sec=600, recovery_frac=0.8, recovery_sec=120) + m.update(1000, now=0) + m.update(0, now=60) + assert m.update(0, now=660)[0] == "loss" + frozen = m._baseline + # Long stretch near zero — an unfrozen EMA would drift the baseline down over dozens of samples. + for t in range(700, 6000, 60): + m.update(10, now=t) + assert m._baseline == frozen # pinned to the pre-drop level + m.update(1000, now=6060) + assert m.update(1000, now=6200)[0] == "recovered" diff --git a/build/dashboard/tests/service/test_storage_service.py b/build/dashboard/tests/service/test_storage_service.py index 54dc62e..dd967b0 100644 --- a/build/dashboard/tests/service/test_storage_service.py +++ b/build/dashboard/tests/service/test_storage_service.py @@ -395,3 +395,52 @@ def test_old_history_pruned_from_db_when_cleanup_fires(self, state_manager, monk (time.time() - HISTORY_RETENTION_SEC,), ).fetchone()[0] assert remaining == 0, "expired DB rows are pruned" + + +class TestChartEvents: + """Degradation/recovery markers for the chart (#99): in-memory tally, disk persistence, and + tolerance of a pre-migration DB with no events table.""" + + def test_add_and_get_roundtrip(self, state_manager): + t0 = time.time() + state_manager.add_event(t0, "loss", "-62%") + state_manager.add_event(t0 + 100, "recovered", "") + evs = state_manager.get_events() + assert [e["type"] for e in evs] == ["loss", "recovered"] + assert evs[0] == {"ts": t0, "type": "loss", "detail": "-62%"} + # returns a copy — mutating it doesn't corrupt stored state + evs.clear() + assert len(state_manager.get_events()) == 2 + + def 
test_old_events_pruned_from_memory(self, state_manager): + state_manager.add_event(1.0, "loss", "ancient") # ts well before the retention cutoff + state_manager.add_event(time.time(), "recovered", "fresh") + details = [e["detail"] for e in state_manager.get_events()] + assert details == ["fresh"] + + def test_events_survive_reload(self, tmp_path): + db = str(tmp_path / "events.db") + sm = StateManager(db_path=db) + sm.add_event(time.time(), "loss", "-50%") + sm.close() + sm2 = StateManager(db_path=db) + try: + evs = sm2.get_events() + assert len(evs) == 1 and evs[0]["type"] == "loss" + finally: + sm2.close() + + def test_load_tolerates_missing_events_table(self, tmp_path): + # Upgrade path: a DB written by a pre-#99 build has no events table. Opening it must not + # crash and must report no events. (StateManager creates the table on open, so load() then + # finds it empty; the sqlite3.Error guard in load() is defence-in-depth for that ordering.) + db = str(tmp_path / "legacy.db") + conn = sqlite3.connect(db) + conn.execute("CREATE TABLE state (key TEXT PRIMARY KEY, value TEXT)") + conn.commit() + conn.close() + sm = StateManager(db_path=db) + try: + assert sm.get_events() == [] + finally: + sm.close() diff --git a/build/dashboard/tests/web/test_views.py b/build/dashboard/tests/web/test_views.py index 8d47798..038d0a4 100644 --- a/build/dashboard/tests/web/test_views.py +++ b/build/dashboard/tests/web/test_views.py @@ -246,7 +246,13 @@ def test_unknown_range_keeps_everything(self): assert len(build_chart(history, [], "bogus")["p2pool"]) == 3 def test_empty_history(self): - assert build_chart([], [], "all") == {"p2pool": [], "xvb": [], "shares": [], "tension": 0.0} + assert build_chart([], [], "all") == { + "p2pool": [], + "xvb": [], + "shares": [], + "events": [], + "tension": 0.0, + } # --- Issue #47: custom zoom window + duration-adaptive resolution/smoothing --------- @@ -1313,3 +1319,41 @@ def test_na_when_xvb_off(self): "eligible": False, "label": "N/A (XvB 
off)", } + + +class TestChartEvents: + """Degradation/recovery markers (#99) flow through build_chart's new `events` kwarg: shaped as + xy points on the hidden 0-1 event axis, carrying kind+label, and window-filtered like history.""" + + def _hist(self, now): + return [{"timestamp": now, "v": 800, "v_p2pool": 800, "v_xvb": 0, "t": "a"}] + + def test_absent_events_default_to_empty(self): + now = time.time() + assert build_chart(self._hist(now), [], "all")["events"] == [] + + def test_event_point_shape(self): + now = time.time() + events = [{"ts": now, "type": "loss", "detail": "-62%"}] + pt = build_chart(self._hist(now), [], "all", events=events)["events"] + assert pt == [ + {"x": int(now * 1000), "y": views._EVENT_MARKER_Y, "kind": "loss", "label": "-62%"} + ] + + def test_label_falls_back_to_type(self): + now = time.time() + events = [{"ts": now, "type": "recovered", "detail": ""}] + assert build_chart(self._hist(now), [], "all", events=events)["events"][0]["label"] == ( + "recovered" + ) + + def test_events_filtered_by_range(self): + now = time.time() + events = [ + {"ts": now - 7200, "type": "loss", "detail": "old"}, # 2h ago + {"ts": now - 60, "type": "recovered", "detail": "recent"}, + ] + labels = [ + e["label"] for e in build_chart(self._hist(now), [], "1h", events=events)["events"] + ] + assert labels == ["recent"] # the 2h-old marker is outside the 1h window diff --git a/config.reference.json b/config.reference.json index 344399d..7f48398 100644 --- a/config.reference.json +++ b/config.reference.json @@ -59,6 +59,8 @@ "data_dir": "auto", "tari_required": true, "check_for_updates": true, + "hashrate_drop_threshold": 50, + "hashrate_drop_minutes": 10, "auth": { "username": "admin", "password": "" @@ -100,7 +102,8 @@ "new_release": true, "stack_online": true, "daily_summary": true, - "hashrate_low": true + "hashrate_low": true, + "hashrate_loss": true }, "daily_summary_time": "08:00", "commands": { diff --git a/docker-compose.yml b/docker-compose.yml index 
ab1e511..522e868 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -448,7 +448,11 @@ services: - TELEGRAM_EVENT_STACK_ONLINE=${TELEGRAM_EVENT_STACK_ONLINE:-true} - TELEGRAM_EVENT_DAILY_SUMMARY=${TELEGRAM_EVENT_DAILY_SUMMARY:-true} - TELEGRAM_EVENT_HASHRATE_LOW=${TELEGRAM_EVENT_HASHRATE_LOW:-true} + - TELEGRAM_EVENT_HASHRATE_LOSS=${TELEGRAM_EVENT_HASHRATE_LOSS:-true} - TELEGRAM_DAILY_SUMMARY_TIME=${TELEGRAM_DAILY_SUMMARY_TIME:-08:00} + # Hashrate-degradation detector (#99). + - HASHRATE_DROP_THRESHOLD_PCT=${HASHRATE_DROP_THRESHOLD_PCT:-50} + - HASHRATE_DROP_MINUTES=${HASHRATE_DROP_MINUTES:-10} - TELEGRAM_COMMANDS_ENABLED=${TELEGRAM_COMMANDS_ENABLED:-false} # --- Docker Socket Proxy (read-only) --- diff --git a/docs/configuration.md b/docs/configuration.md index b957e0c..98ede69 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -95,6 +95,8 @@ plain HTTP, edit `config.json` and run `./pithead apply`. | `dashboard.timezone` | `auto` | Timezone for the dashboard's timestamps and charts. `auto` = the host machine's timezone (auto-detected, falling back to `Etc/UTC`); set an IANA name (e.g. `America/Chicago`) to override. | | `dashboard.data_dir` | `auto` | Where the dashboard's database lives. `auto` = `./data/dashboard`. | | `dashboard.check_for_updates` | `true` _(on)_ | The dashboard periodically asks GitHub whether a newer Pithead release exists and, if so, shows a header badge linking to it (e.g. "New release v1.4.0 available"). Notify-only: it never updates anything; you upgrade with `./pithead upgrade` on your own terms. On by default because the check is routed over Tor (the same bridge SOCKS as the XvB fetch, `socks5h` so the DNS lookup goes through Tor too), so GitHub sees a Tor exit, not your IP. It's cached (hourly) and fails silently offline. Set to `false` to opt out entirely. See [Privacy › Runtime egress](privacy.md#runtime-egress). 
| +| `dashboard.hashrate_drop_threshold` | `50` | Percent below the recent normal that counts as a hashrate drop for the `hashrate_loss` alert and its chart marker. `50` = fire when total fleet hashrate falls to half its baseline. Lower it to catch smaller dips, raise it to flag only near-total outages. | +| `dashboard.hashrate_drop_minutes` | `10` | How many minutes the hashrate must stay below the threshold before the drop is reported — the debounce that keeps a brief blip from pinging you. | | `dashboard.tari_required` | `true` | How much a Tari problem holds up the rest of the stack. Monero is required to mine, so its behavior isn't configurable: a monerod outage always rejects workers (stops `xmrig-proxy` so miners fail over to their backup pools), and the miner is always held until monerod finishes syncing. Tari is only needed for merge mining, so this one flag decides how much it blocks. `true` (default): a Tari outage also rejects workers, the miner waits for Tari's initial sync too, and a Tari-only (re)sync shows the full-screen Sync view. `false` (non-blocking): keep mining Monero through a Tari outage, start mining as soon as Monero is synced (Tari finishes in the background), and keep the normal dashboard, with a `Tari syncing` indicator, instead of the takeover screen. | | `network.subnet` | `172.28.0.0/24` | The private Docker bridge the stack's containers run on. Change it only if install fails with `Pool overlaps with other one on this address space`, i.e. your host already uses `172.28.0.0/24` for another Docker network or interface. Must be a free `X.Y.Z.0/24` block (e.g. `"172.30.0.0/24"`); the services keep their fixed host octets (`.25`–`.31`) within it, so the structured addressing the dashboard and the worker SSRF guard rely on is preserved. | | `network.tor_egress_firewall` | `true` _(on)_ | Privacy-relevant, default on. 
Enforces "behind Tor" fail-closed: at `up`/`apply`, `pithead` installs host firewall rules (Docker's `DOCKER-USER` chain) that drop any direct clearnet dial from the mining containers (monerod/p2pool/tari/xmrig-proxy). Only the Tor container reaches the internet, so a misconfigured or buggy daemon can't leak your IP. Needs root (like the GRUB/HugePages steps); removed at `down`. Set `false` to skip it and rely on per-app Tor config only (e.g. a host where you manage egress yourself, or where `iptables` isn't available). Full detail: [Privacy › Enforced fail-closed](privacy.md#enforced-fail-closed-not-just-configured-270). | @@ -105,7 +107,7 @@ plain HTTP, edit `config.json` and run `./pithead apply`. | `telegram.enabled` | `false` | Push operational alerts (node down/recovered, worker offline/back, sync finished) to Telegram. Off by default. Requires `bot_token` + `chat_id` to actually send. Full walkthrough: [Telegram Bot](telegram.md). | | `telegram.bot_token` | `""` | Your BotFather bot token. A secret — stored owner-only in `.env`, git-ignored, and never logged. Get one from [@BotFather](https://t.me/BotFather). | | `telegram.chat_id` | `""` | Where alerts are sent and the only chat the command interface answers. A Telegram group id (negative, e.g. `-1001234567890`) or a personal chat id. See [how to find it](telegram.md#3-find-your-chat-id). | -| `telegram.events.*` | all `true` | Per-event toggles: `stack_online`, `node_down`, `node_recovered`, `worker_offline`, `worker_recovered`, `worker_joined`, `worker_left`, `sync_finished`, `disk_space`, `db_unhealthy`, `xvb_no_share`, `xvb_registration`, `clearnet_exposed`, `new_release`, `daily_summary`, `hashrate_low`. Each defaults to on once Telegram is enabled; set one `false` to silence just that alert. Full list: [Telegram Bot](telegram.md#choosing-which-alerts-you-get). 
| +| `telegram.events.*` | all `true` | Per-event toggles: `stack_online`, `node_down`, `node_recovered`, `worker_offline`, `worker_recovered`, `worker_joined`, `worker_left`, `sync_finished`, `disk_space`, `db_unhealthy`, `xvb_no_share`, `xvb_registration`, `clearnet_exposed`, `new_release`, `daily_summary`, `hashrate_low`, `hashrate_loss`. Each defaults to on once Telegram is enabled; set one `false` to silence just that alert. Full list: [Telegram Bot](telegram.md#choosing-which-alerts-you-get). | | `telegram.daily_summary_time` | `08:00` | Local time (24-hour `HH:MM`) to push the once-a-day status digest, when the `daily_summary` event is on. Uses the dashboard's timezone (`dashboard.timezone`). A malformed value disables the digest. | | `telegram.commands.enabled` | `false` | Turn on the interactive command interface — the bot answers `/status`, `/hashrate`, `/workers`, `/sync`, and `/help` from the configured `chat_id` (every other chat is ignored). Off by default; alerts work without it. Long-polls over Tor, so it needs no inbound port. See [Telegram › Commands](telegram.md#commands). | diff --git a/docs/dashboard.md b/docs/dashboard.md index a34b7d3..619f167 100644 --- a/docs/dashboard.md +++ b/docs/dashboard.md @@ -130,6 +130,12 @@ progress until it catches up and merge mining resumes. A time-series chart of hashrate with selectable ranges (1h / 24h / 1w / 1mo) that switch without reloading. Shaded bands show the P2Pool/XvB split over time. +Diamond markers along the top flag **hashrate events** (#99): an amber one where total hashrate +dropped sharply and stayed down (an outage or a rig gone dark), a green one where it recovered. +Hover for the size of the drop. They mark the same transitions as the `hashrate_loss` Telegram +alert and survive a dashboard restart, so a drop that happened overnight is still on the chart in the +morning. 
+ An **Avg** control picks the hashrate-averaging window the chart plots: `1 Min` / `10 Min` / `1 Hr` / `12 Hr` / `24 Hr` (the native windows xmrig-proxy reports). It is independent of the Range control: the range sets how much *time* the x-axis spans; the averaging window sets how *smooth* each diff --git a/docs/telegram.md b/docs/telegram.md index 1d840f9..069f73d 100644 --- a/docs/telegram.md +++ b/docs/telegram.md @@ -34,6 +34,7 @@ transition, not a stream: | ⚠ **Clearnet sync active** | A node is doing its initial sync over **clearnet**, so this host's IP is exposed to that chain's P2P network until it finishes (it reverts to Tor automatically). | | 🎰 **XvB registration** | XvB auto-registration was rejected (bad payout address) or is failing — raffle wins won't count until it recovers. Only fires when XvB is enabled. | | 📉 **Hashrate low for tier** | You picked a fixed XvB donation tier your hashrate can't sustain — lower the tier or add hashrate. Fires on the transition and clears when it recovers. | +| ⚠️ **Hashrate drop** | Total fleet hashrate fell sharply below its recent normal and **stayed down** — a rig gone dark, a network cut, or a stalled miner. The dashboard also drops a marker on the hashrate chart at the moment it happened. Fires once on the drop and again on recovery. Thresholds are tunable (`dashboard.hashrate_drop_threshold`, `dashboard.hashrate_drop_minutes`). | | 🆕 **New release** | A newer Pithead release is available (the same signal as the dashboard header badge). | | 🚀 **Pithead online** | Sent once when the dashboard starts — a heartbeat that the stack is up (and confirms the bot works after setup). 
| | 📅 **Daily summary** | A once-a-day retrospective of the last 24h across your whole fleet — date/time, an **incident roll-up** (what went wrong during the day, or an all-clear), **24h hashrate** with the **P2Pool / XvB split**, **shares found in the day**, an **estimated daily earnings** figure, and a **per-machine 24h breakdown** — pushed at a set local time (**08:00** by default; `telegram.daily_summary_time`). | @@ -283,6 +284,10 @@ override these environment variables for the dashboard container — both are in Node-down timing is shared with the existing failover logic (`NODE_DOWN_AFTER_SEC` / `NODE_RECOVERY_AFTER_SEC`). These are advanced knobs; most operators never touch them. +The **hashrate-drop** alert has its own two `config.json` knobs (not env vars): +`dashboard.hashrate_drop_threshold` (percent below the recent normal that counts as a drop, default +`50`) and `dashboard.hashrate_drop_minutes` (how long it must stay down before firing, default `10`). + --- ## Troubleshooting diff --git a/docs/test-inventory.md b/docs/test-inventory.md index 8ca2bb4..ff5db86 100644 --- a/docs/test-inventory.md +++ b/docs/test-inventory.md @@ -4,7 +4,7 @@ _Generated by `make test-inventory` ([`tests/inventory.sh`](../tests/inventory.s edit by hand** — re-run the target to refresh. See [Testing Strategy](testing-strategy.md) for how the tiers fit together._ -**Totals:** 744 dashboard unit tests · 12 contract tests · 64 frontend +**Totals:** 762 dashboard unit tests · 12 contract tests · 66 frontend tests · 52 `pithead` shell sections · 18 harness self-test sections · 9 live config scenarios (17 axis values) · 8 mini-stack scenarios. 
@@ -14,8 +14,8 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · | Tier | Suite | Cases | |---|---|---| -| 1 — Unit | dashboard pytest | 744 | -| 1 — Unit | frontend (node --test) | 64 | +| 1 — Unit | dashboard pytest | 762 | +| 1 — Unit | frontend (node --test) | 66 | | 1 — Unit | `pithead` shell suite | 52 sections | | 1 — Unit | compose interpolation + hardening (#90) | 1 | | 2 — Contract | fake-daemon clients | 12 | @@ -27,7 +27,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · ## Tier 1 — Unit & component -### Dashboard (pytest) — 744 tests +### Dashboard (pytest) — 762 tests #### tests/client/test_docker_control.py — 6 - test_tcp_scheme_rewritten_to_http @@ -210,7 +210,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_none_when_no_route - test_socket_is_closed_even_on_error -#### tests/service/test_alert_service.py — 41 +#### tests/service/test_alert_service.py — 44 - test_every_alert_event_has_a_config_toggle - test_first_cycle_seeds_baseline_silently - test_down_then_recovered @@ -252,6 +252,9 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_malformed_time_disables - test_gated_off_by_event_toggle - test_provider_error_is_swallowed_and_marks_day_done +- test_loss_sends_and_records_incident +- test_recovery_sends_no_incident +- test_gated_off_still_records_loss #### tests/service/test_algo_service.py — 38 - test_xvb_disabled_forces_p2pool @@ -306,7 +309,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_per_chain_independent - test_marker_write_failure_does_not_restart -#### tests/service/test_data_service.py — 89 +#### tests/service/test_data_service.py — 90 - test_first_poll_baselines_without_backfill - test_delta_records_the_difference - test_no_change_records_nothing @@ -369,6 +372,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_partial_start_failure_keeps_latch_closed - 
test_rehold_stops_quietly_after_first_cycle - test_single_iteration_aggregates +- test_degradation_edge_records_event_and_alerts - test_run_holds_miner_while_syncing - test_run_wires_computed_signals_into_the_alerter - test_run_releases_despite_height_override @@ -397,6 +401,14 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_real_worker_is_probed_internal_neighbour_is_not - test_malicious_name_is_never_used_as_a_host +#### tests/service/test_degradation.py — 6 +- test_steady_state_never_fires +- test_cold_start_below_min_baseline_is_silent +- test_sustained_drop_fires_loss_once +- test_brief_blip_does_not_fire +- test_recovery_fires_after_hold +- test_baseline_frozen_while_degraded + #### tests/service/test_earnings.py — 4 - test_matches_closed_form - test_worked_field_example @@ -518,7 +530,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_routed_fraction_in_unit_interval - test_max_donation_fraction_within_reserve_bounds -#### tests/service/test_storage_service.py — 31 +#### tests/service/test_storage_service.py — 35 - test_get_tiers - test_default_xvb_stats - test_partial_updates @@ -550,6 +562,10 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_orphaned_workers_table_dropped_on_upgrade - test_history_older_than_retention_pruned_from_memory - test_old_history_pruned_from_db_when_cleanup_fires +- test_add_and_get_roundtrip +- test_old_events_pruned_from_memory +- test_events_survive_reload +- test_load_tolerates_missing_events_table #### tests/service/test_telegram_commands.py — 44 - test_parse_command @@ -702,7 +718,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_css_lets_hostname_wrap - test_host_at_separator_styled_and_rendered -#### tests/web/test_views.py — 133 +#### tests/web/test_views.py — 137 - test_point_shape_is_xy_with_epoch_ms - test_legacy_rows_attributed_to_p2pool - test_range_filtering @@ -836,14 +852,20 @@ tests · 52 
`pithead` shell sections · 18 harness self-test sections · - test_no_when_below_tier_even_with_a_share - test_no_when_in_tier_but_no_share - test_na_when_xvb_off +- test_absent_events_default_to_empty +- test_event_point_shape +- test_label_falls_back_to_type +- test_events_filtered_by_range -### Frontend logic (node --test) — 64 tests +### Frontend logic (node --test) — 66 tests - withAlpha: appends an 8-bit alpha to a #rrggbb hex - withAlpha: non-#rrggbb values pass through opaque (a palette change cannot break fills) - padYAxis: pads the range and clamps the floor to zero - padYAxis: the floor is clamped to zero, never negative - padYAxis: the magnitude floor applies when the span is flat - padYAxis: no-op when the range is non-finite (all series hidden / no data) +- eventColors: maps recovery to ok, everything else to loss (#99) +- eventColors: tolerates a missing events list - App without state shows the right connection message - App always renders the theme switcher, even before the first load - operational App shows a disconnected banner when not connected @@ -1117,5 +1139,5 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · --- -_Grand total: **907** enumerated cases/sections across the four tiers (plus the live +_Grand total: **927** enumerated cases/sections across the four tiers (plus the live lifecycle and fault-injection phases, which are exercised on a real server)._ diff --git a/pithead b/pithead index 2a042da..42a90c1 100755 --- a/pithead +++ b/pithead @@ -2096,7 +2096,8 @@ render_env() { local tg_ev_node_down tg_ev_node_recovered tg_ev_worker_offline tg_ev_worker_recovered local tg_ev_worker_joined tg_ev_worker_left tg_ev_sync_finished tg_ev_disk_space tg_ev_db_unhealthy local tg_ev_xvb_no_share tg_ev_clearnet_exposed tg_ev_xvb_registration tg_ev_new_release tg_ev_stack_online - local tg_ev_daily_summary tg_summary_time tg_ev_hashrate_low + local tg_ev_daily_summary tg_summary_time tg_ev_hashrate_low tg_ev_hashrate_loss + 
local hr_drop_threshold hr_drop_minutes tg_ev_node_down=$(tg_event node_down) tg_ev_node_recovered=$(tg_event node_recovered) tg_ev_worker_offline=$(tg_event worker_offline) @@ -2113,6 +2114,10 @@ render_env() { tg_ev_stack_online=$(tg_event stack_online) tg_ev_daily_summary=$(tg_event daily_summary) tg_ev_hashrate_low=$(tg_event hashrate_low) + tg_ev_hashrate_loss=$(tg_event hashrate_loss) + # Degradation detector (#99): drop-below-% and sustained-minutes; defaults 50 / 10. + hr_drop_threshold=$(jq -r '.dashboard.hashrate_drop_threshold // 50' "$CONFIG_FILE") + hr_drop_minutes=$(jq -r '.dashboard.hashrate_drop_minutes // 10' "$CONFIG_FILE") # Local time (HH:MM) for the daily digest; default 08:00. tg_summary_time=$(jq -r '.telegram.daily_summary_time // "08:00"' "$CONFIG_FILE") @@ -2213,6 +2218,9 @@ TELEGRAM_EVENT_NEW_RELEASE=$tg_ev_new_release TELEGRAM_EVENT_STACK_ONLINE=$tg_ev_stack_online TELEGRAM_EVENT_DAILY_SUMMARY=$tg_ev_daily_summary TELEGRAM_EVENT_HASHRATE_LOW=$tg_ev_hashrate_low +TELEGRAM_EVENT_HASHRATE_LOSS=$tg_ev_hashrate_loss +HASHRATE_DROP_THRESHOLD_PCT=$hr_drop_threshold +HASHRATE_DROP_MINUTES=$hr_drop_minutes TELEGRAM_DAILY_SUMMARY_TIME=$tg_summary_time MONERO_MEM_LIMIT=$monero_mem_limit P2POOL_URL=${NETWORK_PREFIX}.28:3333 diff --git a/tests/stack/run.sh b/tests/stack/run.sh index 7cd1a6a..ae29b7b 100755 --- a/tests/stack/run.sh +++ b/tests/stack/run.sh @@ -1554,6 +1554,15 @@ printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","n out="$(cd "$V" && DOCKER_LOG="$DOCKER_LOG" PATH="$V/bin:$PATH" ./pithead apply -y 2>&1)" assert_eq "daily summary time propagated" "$(run_sourced "$V" env_get_file "$V/.env" TELEGRAM_DAILY_SUMMARY_TIME)" "21:30" +# Hashrate-loss detector knobs (#99): default 50% over 10 min; explicit dashboard overrides propagate. 
+assert_eq "hashrate drop threshold default 50" "$(run_sourced "$V" env_get_file "$V/.env" HASHRATE_DROP_THRESHOLD_PCT)" "50" +assert_eq "hashrate drop minutes default 10" "$(run_sourced "$V" env_get_file "$V/.env" HASHRATE_DROP_MINUTES)" "10" +seed_env +printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","node_password":"p"}, "tari":{"wallet_address":"T"}, "p2pool":{"pool":"mini"}, "dashboard":{"secure":false,"host":"box.lan","hashrate_drop_threshold":40,"hashrate_drop_minutes":5} }\n' "$WALLET" >"$V/config.json" +out="$(cd "$V" && DOCKER_LOG="$DOCKER_LOG" PATH="$V/bin:$PATH" ./pithead apply -y 2>&1)" +assert_eq "hashrate drop threshold override propagated" "$(run_sourced "$V" env_get_file "$V/.env" HASHRATE_DROP_THRESHOLD_PCT)" "40" +assert_eq "hashrate drop minutes override propagated" "$(run_sourced "$V" env_get_file "$V/.env" HASHRATE_DROP_MINUTES)" "5" + # Event-set consistency (#121/#45): every telegram.events.* key in config.reference.json must be # rendered by pithead into .env AND declared in docker-compose.yml — so adding an alert event in one # surface but forgetting another fails here. (The Python side — AlertService.EVT_* vs config.py's From 8dc3b2328f310c19a93826dd9cfe8476fec702b4 Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Fri, 3 Jul 2026 10:28:49 -0500 Subject: [PATCH 14/18] feat(#104): host-perf warning badges + alerts, warnings in /status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Surface the persistent host conditions setup warns about as live, self-correcting dashboard badges, and alert on the actionable ones. - New badges (from live metrics): ⚠ HugePages off, ⚠ Low RAM (N GB), ⚠ No AVX2. New collector get_cpu_avx2() reads /proc/cpuinfo flags. - Telegram alerts for hugepages + low_ram: persistent host facts fire on FIRST detection (a stable bad box never "transitions"), not seed-silent; HugePages also clears on reboot. Not counted as daily incidents. 
AVX2 is badge-only by judgement — an immutable hardware fact with nothing to act on doesn't warrant a push. - /status now ends with the active warning/error badges (reusing build_badges so it never drifts from the top bar) or "✅ No warnings."; informational states (Syncing…, Miner held) are excluded. - Config plumbing (config.py/pithead/compose/reference) + docs (telegram.md, configuration.md, dashboard.md, CHANGELOG). Also adds the hashrate_loss row that #99 missed in the event-key table. - Tests: avx2 collector, three badges, advisory edges (fire-once + recovery/no-recovery + not-an-incident + gated-off), status_warnings filter + format_status; data_service kwarg contract extended. Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 9 ++++ .../mining_dashboard/collector/system.py | 20 +++++++++ .../mining_dashboard/config/config.py | 6 +++ .../mining_dashboard/service/alert_service.py | 45 ++++++++++++++++++- .../mining_dashboard/service/data_service.py | 17 ++++++- .../service/telegram_commands.py | 32 +++++++++++-- build/dashboard/mining_dashboard/web/views.py | 31 +++++++++++++ .../dashboard/tests/collector/test_system.py | 27 +++++++++++ .../tests/service/test_alert_service.py | 42 +++++++++++++++++ .../tests/service/test_data_service.py | 10 +++++ .../tests/service/test_telegram_commands.py | 41 ++++++++++++++++- build/dashboard/tests/web/test_views.py | 40 +++++++++++++++++ config.reference.json | 4 +- docker-compose.yml | 2 + docs/configuration.md | 2 +- docs/dashboard.md | 15 +++++++ docs/telegram.md | 11 ++++- docs/test-inventory.md | 36 +++++++++++---- pithead | 5 +++ 19 files changed, 376 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 59a0fdb..f85391e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -127,6 +127,15 @@ cd pithead && cp config.json.template config.json # set your Monero + Tari pay walkthrough — creating a bot, finding your chat id, the command list, and the "one chat, two bots" pattern for sharing a chat with 
the Healthchecks.io monitor (#79) — in [`docs/telegram.md`](docs/telegram.md). +- **Host & performance warning badges + alerts** (#104): the top bar now surfaces the persistent + host conditions `setup` warns about, derived from **live** metrics (so they self-correct): **⚠ + HugePages off** (RandomX capped until reserved), **⚠ Low RAM** (under 16 GB — Tari can OOM during + sync), and **⚠ No AVX2** (slow RandomX). The first two also push a Telegram alert (`hugepages`, + `low_ram`) the first time they're seen — unlike the transient edge alerts, a stable bad state + fires on first detection, and HugePages clears with a recovery ping once a reboot applies them. + AVX2 is **badge-only** by design: a fixed hardware fact with nothing to act on at runtime doesn't + warrant a push. The bot's **`/status`** reply now ends with any active warning/error badges (the + same catalog the top bar draws) or an explicit "✅ No warnings." - **Hashrate-drop detector — chart markers + `hashrate_loss` alert** (#99): the dashboard now flags a **sustained, significant fall** in total fleet hashrate — a rig gone dark, a network cut, a stalled miner — separately from the existing "too low for your XvB tier" warning. It tracks a slow moving diff --git a/build/dashboard/mining_dashboard/collector/system.py b/build/dashboard/mining_dashboard/collector/system.py index ef5947d..b015ff6 100644 --- a/build/dashboard/mining_dashboard/collector/system.py +++ b/build/dashboard/mining_dashboard/collector/system.py @@ -6,6 +6,26 @@ BYTES_IN_GB = 1024**3 _last_cpu_times = None +_avx2_supported = None # cached: the CPU flag can't change while the process runs + + +def get_cpu_avx2(): + """Whether the CPU advertises AVX2 (#104). RandomX runs far slower without it, so setup warns on + it — surface the same persistent fact as a live badge. Reads /proc/cpuinfo (host CPU flags are + visible inside the container); cached, since the flag is fixed for the life of the process. 
+ Returns True/False, or None when it can't be determined (non-Linux / unreadable).""" + global _avx2_supported + if _avx2_supported is not None: + return _avx2_supported + try: + with open("/proc/cpuinfo") as f: + for line in f: + if line.startswith("flags"): + _avx2_supported = "avx2" in line.split() + return _avx2_supported + except OSError: + pass + return None # unknown — don't cache, and callers treat None as "can't judge" (no badge/alert) def get_disk_usage(): diff --git a/build/dashboard/mining_dashboard/config/config.py b/build/dashboard/mining_dashboard/config/config.py index 3c741d1..b002a2b 100644 --- a/build/dashboard/mining_dashboard/config/config.py +++ b/build/dashboard/mining_dashboard/config/config.py @@ -16,6 +16,10 @@ DISK_WARN_PERCENT = 85 DISK_CRITICAL_PERCENT = 95 +# Minimum host RAM (GB) below which the dashboard flags a low-RAM badge/alert (#104). Mirrors the +# setup/doctor pre-flight threshold; a code-level default (not a config.json knob). +LOW_RAM_GB = int(float(os.environ.get("LOW_RAM_GB", 16))) + # --- Data Source File Paths --- # File paths for JSON metrics generated by local collectors STRATUM_STATS_PATH = f"{BASE_STATS_DIR}/local/stratum" @@ -272,6 +276,8 @@ def _telegram_event_enabled(name, default=True): "daily_summary": _telegram_event_enabled("daily_summary"), "hashrate_low": _telegram_event_enabled("hashrate_low"), "hashrate_loss": _telegram_event_enabled("hashrate_loss"), + "hugepages": _telegram_event_enabled("hugepages"), + "low_ram": _telegram_event_enabled("low_ram"), } # ponytail: daily_summary is a scheduled push, not an edge — it lives in the events dict only so it # gets a per-event on/off toggle like the rest; its time is TELEGRAM_DAILY_SUMMARY_TIME below. 
diff --git a/build/dashboard/mining_dashboard/service/alert_service.py b/build/dashboard/mining_dashboard/service/alert_service.py index 253ade8..7152d3e 100644 --- a/build/dashboard/mining_dashboard/service/alert_service.py +++ b/build/dashboard/mining_dashboard/service/alert_service.py @@ -63,7 +63,9 @@ class AlertService: keeps serving but history/shares/stats stop persisting. Edge state is seeded silently on the first observation (``None`` baselines), so a dashboard - restart can't replay a stale transition as a fresh alert. + restart can't replay a stale transition as a fresh alert. The exception is the persistent + host-perf advisories (HugePages not reserved, low RAM — #104): a stable bad state never + "transitions", so those fire on first observation instead of seeding silently. :meth:`evaluate` is pure (folds signals into the alert list, no I/O) so it's fully unit-testable; :meth:`process` calls it and dispatches each message off-thread so a slow or @@ -88,6 +90,8 @@ class AlertService: EVT_DAILY_SUMMARY = "daily_summary" EVT_HASHRATE_LOW = "hashrate_low" EVT_HASHRATE_LOSS = "hashrate_loss" + EVT_HUGEPAGES = "hugepages" + EVT_LOW_RAM = "low_ram" # WorkerPresenceMonitor edge -> (event key, message template). _WORKER_EDGES = { @@ -124,6 +128,12 @@ def __init__( self._prev_xvb_reg = None self._prev_update_available = None self._prev_hashrate_low = None + # Persistent host-perf advisories (#104): unlike the transient edges above, these fire on the + # FIRST observation of the problem (a stable low-RAM box would never "transition"), so their + # baseline is "no problem" (False) rather than None — a problem present on the first cycle is + # a real edge and alerts once. + self._prev_hugepages_problem = False + self._prev_low_ram = False # Tally of problem-state transitions since the last daily digest drained it (#342). Keyed by # event, counted at the exact edge so recoveries / steady state don't inflate it. 
self._incidents = {} @@ -151,6 +161,8 @@ def evaluate( xvb_registration_state="", update_available=False, low_hr_warning=False, + hugepages_reserved=True, + low_ram=False, now=None, ): """Pure: fold this cycle's signals into the list of ``(event_key, text)`` to send, @@ -215,6 +227,23 @@ def evaluate( alerts += self._release_edges(update_available) alerts += self._hashrate_low_edges(low_hr_warning) + # --- Persistent host-perf advisories (#104): HugePages not reserved, low RAM --- + alerts += self._advisory_edge( + not hugepages_reserved, + "_prev_hugepages_problem", + self.EVT_HUGEPAGES, + "\U0001f7e0 \U0001f9e0 HugePages not reserved — RandomX hashrate is capped. Apply " + "setup's tuning (or edit GRUB) and reboot.", + recovery_text="\U0001f7e2 \U0001f9e0 HugePages now reserved — RandomX is unthrottled.", + ) + alerts += self._advisory_edge( + low_ram, + "_prev_low_ram", + self.EVT_LOW_RAM, + "\U0001f7e0 \U0001f4be Low RAM for this stack — syncing is memory-heavy (Tari can OOM). " + "Add RAM for a stable node.", + ) + return [(evt, text) for evt, text in alerts if self.notifier.event_enabled(evt)] def _node_edges(self, label, down, attr): @@ -447,6 +476,20 @@ def _hashrate_low_edges(self, low_hr_warning): ) ] + def _advisory_edge(self, problem, attr, event, problem_text, recovery_text=None): + """Persistent host-perf advisory (#104): fires once when ``problem`` is first observed true + (including on the first cycle — a stable bad state must still alert, unlike the seed-silent + transient edges), stays quiet while it persists, and — if ``recovery_text`` is given — fires + once when it clears. 
These are static host facts, not transient incidents, so they aren't + tallied in the daily incident log.""" + prev = getattr(self, attr) + setattr(self, attr, problem) + if problem == prev: + return [] + if problem: + return [(event, self._fmt(problem_text))] + return [(event, self._fmt(recovery_text))] if recovery_text else [] + def _record_incident(self, key): """Tally one problem-state transition for the daily incident log (#342).""" self._incidents[key] = self._incidents.get(key, 0) + 1 diff --git a/build/dashboard/mining_dashboard/service/data_service.py b/build/dashboard/mining_dashboard/service/data_service.py index 82f690b..c8a3ea7 100644 --- a/build/dashboard/mining_dashboard/service/data_service.py +++ b/build/dashboard/mining_dashboard/service/data_service.py @@ -21,6 +21,7 @@ get_tari_stats, ) from mining_dashboard.collector.system import ( + get_cpu_avx2, get_cpu_usage, get_disk_usage, get_hugepages_status, @@ -35,6 +36,7 @@ HASHRATE_DROP_MINUTES, HASHRATE_DROP_THRESHOLD_PCT, HOST_IP, + LOW_RAM_GB, MONERO_CLEARNET_SYNC, REJECT_WORKERS_CONTAINER, SYNC_GATE_CONTAINERS, @@ -795,6 +797,11 @@ async def run(self): # once here and reused in the snapshot below. No-op unless Telegram is configured; # never raises. disk_usage = get_disk_usage() + # Host-perf snapshot (#104), read once and reused for both the alerts and the + # system panel below. Cheap /proc reads. + hugepages = get_hugepages_status() + memory = get_memory_usage() + avx2 = get_cpu_avx2() db_healthy = self.state_manager.is_db_healthy() # Fetch fresh shares list (also used to populate the UI below) so the PPLNS-share # gate the XvB alert watches is computed from the same figure the dashboard shows. @@ -837,6 +844,11 @@ async def run(self): (self.latest_data.get("update") or {}).get("available") ), low_hr_warning=bool(alert_metrics and alert_metrics.low_hr_warning), + # Persistent host-perf conditions (#104). 
HugePages "Disabled" = not + # reserved (recoverable via reboot); low_ram compares live total to the + # threshold. avx2 is badge-only (no alert), so it isn't passed here. + hugepages_reserved=(hugepages[0] != "Disabled"), + low_ram=(0 < (memory.get("total_gb") or 0) < LOW_RAM_GB), ) # Once-daily status digest, reusing the metrics built above (only when the bot # is on, which is also the only time maybe_daily_summary would send). @@ -889,8 +901,9 @@ async def run(self): "clearnet_sync": self.clearnet_sync_state, "system": { "disk": disk_usage, - "hugepages": get_hugepages_status(), - "memory": get_memory_usage(), + "hugepages": hugepages, + "memory": memory, + "avx2": avx2, "load": get_load_average(), "cpu_percent": get_cpu_usage(), }, diff --git a/build/dashboard/mining_dashboard/service/telegram_commands.py b/build/dashboard/mining_dashboard/service/telegram_commands.py index 8c8513f..dde45f1 100644 --- a/build/dashboard/mining_dashboard/service/telegram_commands.py +++ b/build/dashboard/mining_dashboard/service/telegram_commands.py @@ -16,6 +16,7 @@ from mining_dashboard.service.earnings import xmr_per_hs_day from mining_dashboard.service.metrics import build_metrics from mining_dashboard.service.telegram_notifier import TELEGRAM_API_BASE +from mining_dashboard.web.views import build_badges logger = logging.getLogger("TelegramCommands") @@ -81,9 +82,10 @@ def _node_state(sync): return f"⏳ syncing {sync.percent:.1f}%" -def format_status(metrics, mining_active, host_label=""): +def format_status(metrics, mining_active, host_label="", warnings=None): """Overall stack health — the answer to '/status'. 
Pure: folds a :class:`Metrics` (plus the - mining-active flag the loop derives from the sync gate) into text; no I/O.""" + mining-active flag the loop derives from the sync gate, and any active warning/error badges) + into text; no I/O.""" lines = [ f"{_prefix(host_label)}\U0001f4ca Pithead status", f"Monero node: {_node_state(metrics.monero)}", @@ -98,9 +100,30 @@ def format_status(metrics, mining_active, host_label=""): lines.append(f"Workers: {metrics.workers_online}/{metrics.workers_total} online") lines.append(f"Hashrate: {format_hashrate(metrics.total_h15)} (10m avg)") lines.append(f"PPLNS shares: {metrics.shares_in_window} in window") + # Surface the same warning/error badges the dashboard's top bar shows (#104), so /status is a + # one-glance "anything wrong?" — or an explicit all-clear. + if warnings: + lines.append("") + lines.append("⚠️ Warnings:") + lines.extend(f"• {w}" for w in warnings) + else: + lines.append("") + lines.append("✅ No warnings.") return "\n".join(lines) +def status_warnings(data, metrics, db_healthy): + """The active warning/error badges for /status: every ``bad`` badge plus the ``⚠``-flagged + ``warn`` badges (which the informational states — 'Syncing…', 'Miner held' — deliberately lack), + reusing :func:`build_badges` so this never drifts from the dashboard's own top bar. The leading + ``⚠`` is stripped since the section already has one header.""" + out = [] + for b in build_badges(data, metrics, "", db_healthy=db_healthy): + if b["variant"] == "bad" or b["text"].startswith("⚠"): + out.append(b["text"].lstrip("⚠ ").strip()) + return out + + def format_hashrate_reply(metrics, workers, host_label=""): """Total + per-online-worker hashrate — the answer to '/hashrate'. 
@@ -385,7 +408,10 @@ def reply_for(self, text): metrics = build_metrics(data, self.data_service.state_manager) if cmd == "status": mining = bool(data.get("miner_released") and not data.get("workers_rejected")) - return format_status(metrics, mining, self.host_label) + warnings = status_warnings( + data, metrics, self.data_service.state_manager.is_db_healthy() + ) + return format_status(metrics, mining, self.host_label, warnings=warnings) if cmd == "hashrate": return format_hashrate_reply(metrics, data.get("workers", []), self.host_label) if cmd == "workers": diff --git a/build/dashboard/mining_dashboard/web/views.py b/build/dashboard/mining_dashboard/web/views.py index 79090af..08ff6ce 100644 --- a/build/dashboard/mining_dashboard/web/views.py +++ b/build/dashboard/mining_dashboard/web/views.py @@ -23,6 +23,7 @@ HASHRATE_WINDOW_COLUMNS, HASHRATE_WINDOWS, HOST_IP, + LOW_RAM_GB, UPDATE_INTERVAL, ) from mining_dashboard.helper.utils import ( @@ -834,6 +835,36 @@ def build_badges(data, metrics, mode_variant, db_healthy=True): } ) + # Persistent host/performance conditions (#104), derived from live metrics so they self-correct + # (HugePages appear after a reboot, etc.). These mirror the thresholds setup/doctor pre-flight on. + system = data.get("system", {}) or {} + hp_status = (system.get("hugepages") or ["Unknown"])[0] + if hp_status == "Disabled": + badges.append( + { + "text": "⚠ HugePages off", + "variant": "warn", + "title": "HugePages aren't reserved — RandomX hashrate is capped until they are. Run setup's tuning (or edit GRUB) and reboot to apply.", + } + ) + ram_total = (system.get("memory") or {}).get("total_gb", 0) or 0 + if 0 < ram_total < LOW_RAM_GB: + badges.append( + { + "text": f"⚠ Low RAM ({ram_total:.0f} GB)", + "variant": "warn", + "title": f"Under {LOW_RAM_GB} GB of RAM — syncing (Tari especially) is memory-heavy and can OOM. 
Add RAM for a stable node.", + } + ) + if system.get("avx2") is False: + badges.append( + { + "text": "⚠ No AVX2", + "variant": "warn", + "title": "This CPU lacks AVX2 — RandomX mining will be significantly slower. A hardware limit; nothing to change at runtime.", + } + ) + return badges diff --git a/build/dashboard/tests/collector/test_system.py b/build/dashboard/tests/collector/test_system.py index 9e54b43..a4b526a 100644 --- a/build/dashboard/tests/collector/test_system.py +++ b/build/dashboard/tests/collector/test_system.py @@ -79,3 +79,30 @@ def test_allocated_when_unused(self): def test_unknown_when_missing(self): with patch("builtins.open", _fake_open("SomethingElse: 1\n")): assert system.get_hugepages_status() == ("Unknown", "status-warn", "0/0") + + +class TestCpuAvx2: + def setup_method(self): + system._avx2_supported = None # clear the process-lifetime cache between cases + + def test_flag_present(self): + cpuinfo = "processor\t: 0\nflags\t\t: fpu vme avx avx2 sse4_2\n" + with patch("builtins.open", _fake_open(cpuinfo)): + assert system.get_cpu_avx2() is True + + def test_flag_absent(self): + cpuinfo = "processor\t: 0\nflags\t\t: fpu vme avx sse4_2\n" # avx but not avx2 + with patch("builtins.open", _fake_open(cpuinfo)): + assert system.get_cpu_avx2() is False + + def test_unreadable_is_unknown(self): + with patch("builtins.open", side_effect=OSError): + assert system.get_cpu_avx2() is None + + def test_result_is_cached(self): + cpuinfo = "flags\t\t: avx2\n" + with patch("builtins.open", _fake_open(cpuinfo)): + assert system.get_cpu_avx2() is True + # Second call must not re-read (open would now raise) — the cached value stands. 
+ with patch("builtins.open", side_effect=AssertionError("should not re-read")): + assert system.get_cpu_avx2() is True diff --git a/build/dashboard/tests/service/test_alert_service.py b/build/dashboard/tests/service/test_alert_service.py index 4cfb7c9..7a809d8 100644 --- a/build/dashboard/tests/service/test_alert_service.py +++ b/build/dashboard/tests/service/test_alert_service.py @@ -62,6 +62,8 @@ def _ev( xvb_registration_state="", update_available=False, low_hr_warning=False, + hugepages_reserved=True, + low_ram=False, now=0, ): return svc.evaluate( @@ -79,6 +81,8 @@ def _ev( xvb_registration_state=xvb_registration_state, update_available=update_available, low_hr_warning=low_hr_warning, + hugepages_reserved=hugepages_reserved, + low_ram=low_ram, now=now, ) @@ -498,6 +502,44 @@ def boom(): assert await svc.maybe_daily_summary(0, lambda: "digest") is None +class TestHostAdvisories: + """Persistent host-perf advisories (#104): unlike the transient edges, these fire on the FIRST + observation of the problem (a stable bad box would never 'transition'), stay quiet while it + persists, and — for HugePages — clear when fixed. They are not tallied as daily incidents.""" + + def test_hugepages_not_reserved_fires_once_then_recovers(self): + svc = _svc() + # First cycle already bad → fires (not seed-silent). + assert _keys(_ev(svc, hugepages_reserved=False)) == [AlertService.EVT_HUGEPAGES] + # Persists → silent. + assert _keys(_ev(svc, hugepages_reserved=False)) == [] + # Reboot applied HugePages → one recovery edge. 
+ assert _keys(_ev(svc, hugepages_reserved=True)) == [AlertService.EVT_HUGEPAGES] + assert _keys(_ev(svc, hugepages_reserved=True)) == [] + + def test_healthy_hugepages_never_fires(self): + svc = _svc() + assert _keys(_ev(svc, hugepages_reserved=True)) == [] + assert _keys(_ev(svc, hugepages_reserved=True)) == [] + + def test_low_ram_fires_once_no_recovery(self): + svc = _svc() + assert _keys(_ev(svc, low_ram=True)) == [AlertService.EVT_LOW_RAM] + assert _keys(_ev(svc, low_ram=True)) == [] # persists, silent + # RAM "recovering" (unlikely at runtime) is silent — no false good-news ping. + assert _keys(_ev(svc, low_ram=False)) == [] + + def test_advisories_not_counted_as_incidents(self): + # Static host facts shouldn't inflate the daily incident roll-up (#342). + svc = _svc() + _ev(svc, hugepages_reserved=False, low_ram=True) + assert svc.drain_incidents() == {} + + def test_gated_off_by_toggle(self): + svc = _svc(notifier=_FakeNotifier(allow={AlertService.EVT_NODE_DOWN})) + assert _keys(_ev(svc, hugepages_reserved=False, low_ram=True)) == [] + + class TestDegradationAlert: """The #99 hashrate-loss / recovery push. 
The DegradationMonitor owns the debounce; this only formats + sends, tallies the loss as an incident, and honours the event toggle.""" diff --git a/build/dashboard/tests/service/test_data_service.py b/build/dashboard/tests/service/test_data_service.py index d379e31..07ef080 100644 --- a/build/dashboard/tests/service/test_data_service.py +++ b/build/dashboard/tests/service/test_data_service.py @@ -688,6 +688,7 @@ async def test_single_iteration_aggregates(self): patch.object(ds_mod, "get_memory_usage", return_value={}), patch.object(ds_mod, "get_load_average", return_value="0"), patch.object(ds_mod, "get_cpu_usage", return_value="0%"), + patch.object(ds_mod, "get_cpu_avx2", return_value=True), patch("asyncio.sleep", AsyncMock(side_effect=StopAsyncIteration)), ): with pytest.raises(StopAsyncIteration): @@ -749,6 +750,7 @@ async def test_degradation_edge_records_event_and_alerts(self): patch.object(ds_mod, "get_memory_usage", return_value={}), patch.object(ds_mod, "get_load_average", return_value="0"), patch.object(ds_mod, "get_cpu_usage", return_value="0%"), + patch.object(ds_mod, "get_cpu_avx2", return_value=True), patch("asyncio.sleep", AsyncMock(side_effect=StopAsyncIteration)), ): with pytest.raises(StopAsyncIteration): @@ -805,6 +807,7 @@ async def test_run_holds_miner_while_syncing(self): patch.object(ds_mod, "get_memory_usage", return_value={}), patch.object(ds_mod, "get_load_average", return_value="0"), patch.object(ds_mod, "get_cpu_usage", return_value="0%"), + patch.object(ds_mod, "get_cpu_avx2", return_value=True), patch("asyncio.sleep", AsyncMock(side_effect=StopAsyncIteration)), ): with pytest.raises(StopAsyncIteration): @@ -862,6 +865,7 @@ async def test_run_wires_computed_signals_into_the_alerter(self): patch.object(ds_mod, "get_memory_usage", return_value={}), patch.object(ds_mod, "get_load_average", return_value="0"), patch.object(ds_mod, "get_cpu_usage", return_value="0%"), + patch.object(ds_mod, "get_cpu_avx2", return_value=True), 
patch("asyncio.sleep", AsyncMock(side_effect=StopAsyncIteration)), ): with pytest.raises(StopAsyncIteration): @@ -885,6 +889,8 @@ async def test_run_wires_computed_signals_into_the_alerter(self): "xvb_registration_state", "update_available", "low_hr_warning", + "hugepages_reserved", + "low_ram", } # ...sourced from the real computed values, not placeholders. assert kw["db_healthy"] is True # from state_manager.is_db_healthy() @@ -934,6 +940,7 @@ async def test_run_releases_despite_height_override(self): patch.object(ds_mod, "get_memory_usage", return_value={}), patch.object(ds_mod, "get_load_average", return_value="0"), patch.object(ds_mod, "get_cpu_usage", return_value="0%"), + patch.object(ds_mod, "get_cpu_avx2", return_value=True), patch("asyncio.sleep", AsyncMock(side_effect=StopAsyncIteration)), ): with pytest.raises(StopAsyncIteration): @@ -993,6 +1000,7 @@ async def test_run_nonblocking_tari_releases_and_stays_operational(self): patch.object(ds_mod, "get_memory_usage", return_value={}), patch.object(ds_mod, "get_load_average", return_value="0"), patch.object(ds_mod, "get_cpu_usage", return_value="0%"), + patch.object(ds_mod, "get_cpu_avx2", return_value=True), patch("asyncio.sleep", AsyncMock(side_effect=StopAsyncIteration)), ): with pytest.raises(StopAsyncIteration): @@ -1032,6 +1040,7 @@ async def _run_one_iteration(self, svc, monero_sync, tari_sync): patch.object(ds_mod, "get_memory_usage", return_value={}), patch.object(ds_mod, "get_load_average", return_value="0"), patch.object(ds_mod, "get_cpu_usage", return_value="0%"), + patch.object(ds_mod, "get_cpu_avx2", return_value=True), patch("asyncio.sleep", AsyncMock(side_effect=StopAsyncIteration)), ): with pytest.raises(StopAsyncIteration): @@ -1159,6 +1168,7 @@ async def test_run_holds_when_tari_required_and_only_monero_synced(self): patch.object(ds_mod, "get_memory_usage", return_value={}), patch.object(ds_mod, "get_load_average", return_value="0"), patch.object(ds_mod, "get_cpu_usage", 
return_value="0%"), + patch.object(ds_mod, "get_cpu_avx2", return_value=True), patch("asyncio.sleep", AsyncMock(side_effect=StopAsyncIteration)), ): with pytest.raises(StopAsyncIteration): diff --git a/build/dashboard/tests/service/test_telegram_commands.py b/build/dashboard/tests/service/test_telegram_commands.py index b51761b..a52ee6b 100644 --- a/build/dashboard/tests/service/test_telegram_commands.py +++ b/build/dashboard/tests/service/test_telegram_commands.py @@ -300,9 +300,10 @@ def test_host_label_prefix(): # --- reply_for routing -------------------------------------------------------------------- -def _bot(monkeypatch, latest_data=None, **over): +def _bot(monkeypatch, latest_data=None, db_healthy=True, **over): monkeypatch.setattr(tc, "build_metrics", lambda data, sm: _metrics(**over)) - ds = SimpleNamespace(latest_data=latest_data or {}, state_manager=object()) + sm = SimpleNamespace(is_db_healthy=lambda: db_healthy) + ds = SimpleNamespace(latest_data=latest_data or {}, state_manager=sm) return tc.TelegramCommandBot(ds, enabled=True, bot_token="tok", chat_id="42", host_label="") @@ -533,3 +534,39 @@ def _boom(poll_timeout): with pytest.raises(asyncio.CancelledError): await bot.run() assert slept == [tc.POLL_ERROR_BACKOFF_SECONDS] + + +class TestStatusWarnings: + """/status surfaces the same warning/error badges as the dashboard top bar (#104), reusing + build_badges so the two never drift; informational states ('Syncing…') are excluded.""" + + def test_bad_and_flagged_warn_badges_included_stripped(self): + # Low RAM (⚠ warn) + DB failing (bad) both surface; the leading ⚠ is stripped for the list. 
+ warnings = tc.status_warnings( + {"system": {"memory": {"total_gb": 8}}}, _metrics(), db_healthy=False + ) + assert "Low RAM (8 GB)" in warnings + assert "DB write failing" in warnings + assert not any(w.startswith("⚠") for w in warnings) + + def test_informational_states_excluded(self): + # 'Syncing…' / 'Miner held' are warn-variant but informational (no ⚠) — not warnings. + warnings = tc.status_warnings( + {"miner_held": True}, _metrics(global_syncing=True), db_healthy=True + ) + assert warnings == [] + + def test_healthy_is_empty(self): + assert tc.status_warnings({}, _metrics(), db_healthy=True) == [] + + def test_format_status_lists_warnings(self): + text = tc.format_status( + _metrics(), True, warnings=["Low RAM (8 GB)", "HugePages not reserved"] + ) + assert "⚠️ Warnings:" in text + assert "• Low RAM (8 GB)" in text + assert "• HugePages not reserved" in text + + def test_format_status_all_clear(self): + text = tc.format_status(_metrics(), True, warnings=[]) + assert "✅ No warnings." 
in text diff --git a/build/dashboard/tests/web/test_views.py b/build/dashboard/tests/web/test_views.py index 038d0a4..d0e94ea 100644 --- a/build/dashboard/tests/web/test_views.py +++ b/build/dashboard/tests/web/test_views.py @@ -628,6 +628,46 @@ def test_no_disk_badge_when_missing(self): out = build_badges({}, _metrics(), "ok") assert not any("Disk" in b["text"] for b in out) + # --- Host-perf badges (#104): AVX2 / HugePages / low RAM, from live metrics ------------- + def test_hugepages_disabled_badge(self): + out = build_badges( + {"system": {"hugepages": ["Disabled", "status-bad", "0/0"]}}, _metrics(), "ok" + ) + assert any(b["variant"] == "warn" and "HugePages off" in b["text"] for b in out) + + def test_no_hugepages_badge_when_reserved(self): + for status in ("Allocated", "Enabled", "Unknown"): # only "Disabled" is a problem + out = build_badges({"system": {"hugepages": [status, "", "1/2"]}}, _metrics(), "ok") + assert not any("HugePages" in b["text"] for b in out), status + + def test_low_ram_badge(self): + out = build_badges({"system": {"memory": {"total_gb": 8}}}, _metrics(), "ok") + assert any(b["variant"] == "warn" and "Low RAM (8 GB)" in b["text"] for b in out) + + def test_no_low_ram_badge_at_or_above_threshold_or_unknown(self): + assert not any( + "Low RAM" in b["text"] + for b in build_badges({"system": {"memory": {"total_gb": 16}}}, _metrics(), "ok") + ) + # total 0 = couldn't read /proc/meminfo (not "0 GB of RAM") — no false badge. 
+ assert not any( + "Low RAM" in b["text"] + for b in build_badges({"system": {"memory": {"total_gb": 0}}}, _metrics(), "ok") + ) + + def test_avx2_missing_badge(self): + out = build_badges({"system": {"avx2": False}}, _metrics(), "ok") + assert any(b["variant"] == "warn" and "No AVX2" in b["text"] for b in out) + + def test_no_avx2_badge_when_present_or_unknown(self): + assert not any( + "AVX2" in b["text"] for b in build_badges({"system": {"avx2": True}}, _metrics(), "ok") + ) + # None = couldn't determine (non-Linux / unreadable) — stay silent, don't cry wolf. + assert not any( + "AVX2" in b["text"] for b in build_badges({"system": {"avx2": None}}, _metrics(), "ok") + ) + # --- System (presentation thresholds) ------------------------------------------------- diff --git a/config.reference.json b/config.reference.json index 7f48398..ff49e70 100644 --- a/config.reference.json +++ b/config.reference.json @@ -103,7 +103,9 @@ "stack_online": true, "daily_summary": true, "hashrate_low": true, - "hashrate_loss": true + "hashrate_loss": true, + "hugepages": true, + "low_ram": true }, "daily_summary_time": "08:00", "commands": { diff --git a/docker-compose.yml b/docker-compose.yml index 522e868..a39d7a5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -449,6 +449,8 @@ services: - TELEGRAM_EVENT_DAILY_SUMMARY=${TELEGRAM_EVENT_DAILY_SUMMARY:-true} - TELEGRAM_EVENT_HASHRATE_LOW=${TELEGRAM_EVENT_HASHRATE_LOW:-true} - TELEGRAM_EVENT_HASHRATE_LOSS=${TELEGRAM_EVENT_HASHRATE_LOSS:-true} + - TELEGRAM_EVENT_HUGEPAGES=${TELEGRAM_EVENT_HUGEPAGES:-true} + - TELEGRAM_EVENT_LOW_RAM=${TELEGRAM_EVENT_LOW_RAM:-true} - TELEGRAM_DAILY_SUMMARY_TIME=${TELEGRAM_DAILY_SUMMARY_TIME:-08:00} # Hashrate-degradation detector (#99). 
- HASHRATE_DROP_THRESHOLD_PCT=${HASHRATE_DROP_THRESHOLD_PCT:-50} diff --git a/docs/configuration.md b/docs/configuration.md index 98ede69..6adb5ab 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -107,7 +107,7 @@ plain HTTP, edit `config.json` and run `./pithead apply`. | `telegram.enabled` | `false` | Push operational alerts (node down/recovered, worker offline/back, sync finished) to Telegram. Off by default. Requires `bot_token` + `chat_id` to actually send. Full walkthrough: [Telegram Bot](telegram.md). | | `telegram.bot_token` | `""` | Your BotFather bot token. A secret — stored owner-only in `.env`, git-ignored, and never logged. Get one from [@BotFather](https://t.me/BotFather). | | `telegram.chat_id` | `""` | Where alerts are sent and the only chat the command interface answers. A Telegram group id (negative, e.g. `-1001234567890`) or a personal chat id. See [how to find it](telegram.md#3-find-your-chat-id). | -| `telegram.events.*` | all `true` | Per-event toggles: `stack_online`, `node_down`, `node_recovered`, `worker_offline`, `worker_recovered`, `worker_joined`, `worker_left`, `sync_finished`, `disk_space`, `db_unhealthy`, `xvb_no_share`, `xvb_registration`, `clearnet_exposed`, `new_release`, `daily_summary`, `hashrate_low`, `hashrate_loss`. Each defaults to on once Telegram is enabled; set one `false` to silence just that alert. Full list: [Telegram Bot](telegram.md#choosing-which-alerts-you-get). | +| `telegram.events.*` | all `true` | Per-event toggles: `stack_online`, `node_down`, `node_recovered`, `worker_offline`, `worker_recovered`, `worker_joined`, `worker_left`, `sync_finished`, `disk_space`, `db_unhealthy`, `xvb_no_share`, `xvb_registration`, `clearnet_exposed`, `new_release`, `daily_summary`, `hashrate_low`, `hashrate_loss`, `hugepages`, `low_ram`. Each defaults to on once Telegram is enabled; set one `false` to silence just that alert. Full list: [Telegram Bot](telegram.md#choosing-which-alerts-you-get). 
| | `telegram.daily_summary_time` | `08:00` | Local time (24-hour `HH:MM`) to push the once-a-day status digest, when the `daily_summary` event is on. Uses the dashboard's timezone (`dashboard.timezone`). A malformed value disables the digest. | | `telegram.commands.enabled` | `false` | Turn on the interactive command interface — the bot answers `/status`, `/hashrate`, `/workers`, `/sync`, and `/help` from the configured `chat_id` (every other chat is ignored). Off by default; alerts work without it. Long-polls over Tor, so it needs no inbound port. See [Telegram › Commands](telegram.md#commands). | diff --git a/docs/dashboard.md b/docs/dashboard.md index 619f167..4a8e3f2 100644 --- a/docs/dashboard.md +++ b/docs/dashboard.md @@ -89,6 +89,21 @@ to the version badge, linking to the GitHub release. It never updates anything; your IP. Turn it off with `dashboard.check_for_updates: false` (see [Configuration](configuration.md#configuration-reference)). +### Host & performance warnings + +The top bar also surfaces the persistent host conditions that `setup` warns about, derived from +**live** metrics so they self-correct rather than going stale: + +| Badge | Means | Fix | +|---|---|---| +| `⚠ HugePages off` | HugePages aren't reserved — RandomX hashrate is capped. | Run setup's tuning (or edit GRUB) and reboot; the badge clears once they're reserved. | +| `⚠ Low RAM (N GB)` | Under 16 GB of RAM — syncing is memory-heavy and Tari can OOM. | Add RAM for a stable node. | +| `⚠ No AVX2` | The CPU lacks AVX2, so RandomX mining is much slower. | A hardware limit; nothing to change at runtime. | + +The first two also push a Telegram alert (`hugepages`, `low_ram`) when first detected, if the bot is +on; AVX2 is badge-only (see [Telegram Bot](telegram.md#choosing-which-alerts-you-get)). All active +warning badges are echoed in the bot's `/status` reply. 
+ ### Hero band A strip of headline KPIs sits below the top bar: diff --git a/docs/telegram.md b/docs/telegram.md index 069f73d..ca3ed72 100644 --- a/docs/telegram.md +++ b/docs/telegram.md @@ -35,6 +35,8 @@ transition, not a stream: | 🎰 **XvB registration** | XvB auto-registration was rejected (bad payout address) or is failing — raffle wins won't count until it recovers. Only fires when XvB is enabled. | | 📉 **Hashrate low for tier** | You picked a fixed XvB donation tier your hashrate can't sustain — lower the tier or add hashrate. Fires on the transition and clears when it recovers. | | ⚠️ **Hashrate drop** | Total fleet hashrate fell sharply below its recent normal and **stayed down** — a rig gone dark, a network cut, or a stalled miner. The dashboard also drops a marker on the hashrate chart at the moment it happened. Fires once on the drop and again on recovery. Thresholds are tunable (`dashboard.hashrate_drop_threshold`, `dashboard.hashrate_drop_minutes`). | +| 🧠 **HugePages not reserved** | RandomX runs capped until HugePages are reserved. Fires when the dashboard first sees them missing and again once a reboot applies them — so you know the tuning took. | +| 💾 **Low RAM** | This host has less RAM than the stack wants (syncing is memory-heavy; Tari can OOM). Sent once when first detected — a heads-up that instability may be under-provisioning, not a bug. | | 🆕 **New release** | A newer Pithead release is available (the same signal as the dashboard header badge). | | 🚀 **Pithead online** | Sent once when the dashboard starts — a heartbeat that the stack is up (and confirms the bot works after setup). 
| | 📅 **Daily summary** | A once-a-day retrospective of the last 24h across your whole fleet — date/time, an **incident roll-up** (what went wrong during the day, or an all-clear), **24h hashrate** with the **P2Pool / XvB split**, **shares found in the day**, an **estimated daily earnings** figure, and a **per-machine 24h breakdown** — pushed at a set local time (**08:00** by default; `telegram.daily_summary_time`). | @@ -160,9 +162,16 @@ block and set it to `false` — any event you don't list stays on: | `stack_online` | `true` | One-shot "dashboard is up" heartbeat on start | | `daily_summary` | `true` | Once-a-day status roll-up (time set by `telegram.daily_summary_time`, default `08:00`) | | `hashrate_low` | `true` | Hashrate can't sustain the chosen XvB tier / recovered | +| `hashrate_loss` | `true` | Total hashrate dropped sharply and stayed down (outage / rig dark) / recovered | +| `hugepages` | `true` | HugePages not reserved (RandomX capped) / reserved after a reboot | +| `low_ram` | `true` | Host has less RAM than the stack wants (one-shot heads-up) | Run `./pithead apply` after editing. +> The dashboard also shows an **AVX2-missing** badge when the CPU lacks AVX2, but it has **no +> alert** — it's a fixed hardware fact with nothing to do at runtime, so it stays a badge (and shows +> in `/status`) rather than a push you can't act on. + > **Tari note.** A node-down/recovered alert fires for **Tari only when Tari is treated as > required** (`dashboard.tari_required: true`, the default). If you've made Tari non-blocking, a > Tari outage doesn't stop your Monero mining, so it isn't alerted as a node-down — matching how @@ -189,7 +198,7 @@ Run `./pithead apply` after editing. The commands: | Command | Reply | |---|---| -| `/status` | One-glance health: each node up/down/syncing, whether mining is active, workers online, total hashrate, PPLNS shares in window. 
| +| `/status` | One-glance health: each node up/down/syncing, whether mining is active, workers online, total hashrate, PPLNS shares in window — followed by any active **warning/error badges** (the same ones the dashboard's top bar shows), or an explicit "✅ No warnings." | | `/hashrate` | Total hashrate plus a per-rig breakdown of everything currently online. | | `/workers` | Every rig's online/offline state, with uptime for the ones that are up. | | `/sync` | Monero and Tari sync progress (percent and block height). | diff --git a/docs/test-inventory.md b/docs/test-inventory.md index ff5db86..8649ad0 100644 --- a/docs/test-inventory.md +++ b/docs/test-inventory.md @@ -4,7 +4,7 @@ _Generated by `make test-inventory` ([`tests/inventory.sh`](../tests/inventory.s edit by hand** — re-run the target to refresh. See [Testing Strategy](testing-strategy.md) for how the tiers fit together._ -**Totals:** 762 dashboard unit tests · 12 contract tests · 66 frontend +**Totals:** 782 dashboard unit tests · 12 contract tests · 66 frontend tests · 52 `pithead` shell sections · 18 harness self-test sections · 9 live config scenarios (17 axis values) · 8 mini-stack scenarios. 
@@ -14,7 +14,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · | Tier | Suite | Cases | |---|---|---| -| 1 — Unit | dashboard pytest | 762 | +| 1 — Unit | dashboard pytest | 782 | | 1 — Unit | frontend (node --test) | 66 | | 1 — Unit | `pithead` shell suite | 52 sections | | 1 — Unit | compose interpolation + hardening (#90) | 1 | @@ -27,7 +27,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · ## Tier 1 — Unit & component -### Dashboard (pytest) — 762 tests +### Dashboard (pytest) — 782 tests #### tests/client/test_docker_control.py — 6 - test_tcp_scheme_rewritten_to_http @@ -146,7 +146,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_malformed_json_returns_empty - test_valid_json -#### tests/collector/test_system.py — 11 +#### tests/collector/test_system.py — 15 - test_normal - test_error_returns_zeros - test_parses_meminfo @@ -158,6 +158,10 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_enabled_when_used - test_allocated_when_unused - test_unknown_when_missing +- test_flag_present +- test_flag_absent +- test_unreadable_is_unknown +- test_result_is_cached #### tests/config/test_config.py — 11 - test_defaults_load @@ -210,7 +214,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_none_when_no_route - test_socket_is_closed_even_on_error -#### tests/service/test_alert_service.py — 44 +#### tests/service/test_alert_service.py — 49 - test_every_alert_event_has_a_config_toggle - test_first_cycle_seeds_baseline_silently - test_down_then_recovered @@ -252,6 +256,11 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_malformed_time_disables - test_gated_off_by_event_toggle - test_provider_error_is_swallowed_and_marks_day_done +- test_hugepages_not_reserved_fires_once_then_recovers +- test_healthy_hugepages_never_fires +- test_low_ram_fires_once_no_recovery +- test_advisories_not_counted_as_incidents +- 
test_gated_off_by_toggle - test_loss_sends_and_records_incident - test_recovery_sends_no_incident - test_gated_off_still_records_loss @@ -567,7 +576,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_events_survive_reload - test_load_tolerates_missing_events_table -#### tests/service/test_telegram_commands.py — 44 +#### tests/service/test_telegram_commands.py — 49 - test_parse_command - test_status_active - test_status_syncing_beats_mining_flag @@ -612,6 +621,11 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_send_swallows_network_error - test_run_processes_update_then_honours_cancel - test_run_backs_off_on_poll_error +- test_bad_and_flagged_warn_badges_included_stripped +- test_informational_states_excluded +- test_healthy_is_empty +- test_format_status_lists_warnings +- test_format_status_all_clear #### tests/service/test_telegram_notifier.py — 10 - test_disabled_by_default @@ -718,7 +732,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_css_lets_hostname_wrap - test_host_at_separator_styled_and_rendered -#### tests/web/test_views.py — 137 +#### tests/web/test_views.py — 143 - test_point_shape_is_xy_with_epoch_ms - test_legacy_rows_attributed_to_p2pool - test_range_filtering @@ -783,6 +797,12 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_disk_badge_warn - test_no_disk_badge_when_ample - test_no_disk_badge_when_missing +- test_hugepages_disabled_badge +- test_no_hugepages_badge_when_reserved +- test_low_ram_badge +- test_no_low_ram_badge_at_or_above_threshold_or_unknown +- test_avx2_missing_badge +- test_no_avx2_badge_when_present_or_unknown - test_high_usage_levels_and_fill - test_warning_fill_between_70_and_90 - test_unparseable_cpu_is_ok @@ -1139,5 +1159,5 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · --- -_Grand total: **927** enumerated cases/sections across the four tiers (plus the live +_Grand total: 
**947** enumerated cases/sections across the four tiers (plus the live lifecycle and fault-injection phases, which are exercised on a real server)._ diff --git a/pithead b/pithead index 42a90c1..7092c1b 100755 --- a/pithead +++ b/pithead @@ -2097,6 +2097,7 @@ render_env() { local tg_ev_worker_joined tg_ev_worker_left tg_ev_sync_finished tg_ev_disk_space tg_ev_db_unhealthy local tg_ev_xvb_no_share tg_ev_clearnet_exposed tg_ev_xvb_registration tg_ev_new_release tg_ev_stack_online local tg_ev_daily_summary tg_summary_time tg_ev_hashrate_low tg_ev_hashrate_loss + local tg_ev_hugepages tg_ev_low_ram local hr_drop_threshold hr_drop_minutes tg_ev_node_down=$(tg_event node_down) tg_ev_node_recovered=$(tg_event node_recovered) @@ -2115,6 +2116,8 @@ render_env() { tg_ev_daily_summary=$(tg_event daily_summary) tg_ev_hashrate_low=$(tg_event hashrate_low) tg_ev_hashrate_loss=$(tg_event hashrate_loss) + tg_ev_hugepages=$(tg_event hugepages) + tg_ev_low_ram=$(tg_event low_ram) # Degradation detector (#99): drop-below-% and sustained-minutes; defaults 50 / 10. 
hr_drop_threshold=$(jq -r '.dashboard.hashrate_drop_threshold // 50' "$CONFIG_FILE") hr_drop_minutes=$(jq -r '.dashboard.hashrate_drop_minutes // 10' "$CONFIG_FILE") @@ -2219,6 +2222,8 @@ TELEGRAM_EVENT_STACK_ONLINE=$tg_ev_stack_online TELEGRAM_EVENT_DAILY_SUMMARY=$tg_ev_daily_summary TELEGRAM_EVENT_HASHRATE_LOW=$tg_ev_hashrate_low TELEGRAM_EVENT_HASHRATE_LOSS=$tg_ev_hashrate_loss +TELEGRAM_EVENT_HUGEPAGES=$tg_ev_hugepages +TELEGRAM_EVENT_LOW_RAM=$tg_ev_low_ram HASHRATE_DROP_THRESHOLD_PCT=$hr_drop_threshold HASHRATE_DROP_MINUTES=$hr_drop_minutes TELEGRAM_DAILY_SUMMARY_TIME=$tg_summary_time From 38c60c473099ea99168e7c0aa3d5e80b2fced98a Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Fri, 3 Jul 2026 11:23:56 -0500 Subject: [PATCH 15/18] =?UTF-8?q?feat(telegram):=20/info=20command=20?= =?UTF-8?q?=E2=80=94=20version,=20updates,=20DB=20mode,=20privacy=20postur?= =?UTF-8?q?e?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New read-only /info "about this stack" card, reusing facts the stack already computes: - running version + whether a newer release is available (resolve_version + the update-check signal) - Monero DB mode (pruned / full) - P2Pool sidechain (Mini / Main) - egress posture: 🧅 Tor-only, or the count of clearnet paths exposing the host IP (egress_posture_from_config summary) Keeps /status focused on live operational health; /info answers "what is this install?". Registered in COMMANDS/HELP, docs (telegram.md, CHANGELOG) updated, unit tests cover both release/dev + up-to-date/update + pruned/ full + tor/clearnet branches and dispatch routing. 
Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 7 +-- .../service/telegram_commands.py | 51 ++++++++++++++++++- .../tests/service/test_telegram_commands.py | 48 +++++++++++++++++ docs/telegram.md | 1 + docs/test-inventory.md | 14 +++-- 5 files changed, 112 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f85391e..6daeee7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -105,9 +105,10 @@ cd pithead && cp config.json.template config.json # set your Monero + Tari pay finished**, **data disk filling up**, **dashboard DB write failing**, **no PPLNS share while donating to XvB** (raffle wins skipped), **XvB registration rejected / failing**, **hashrate too low for the chosen XvB tier**, **a node exposed on clearnet** during initial sync, and **a new - release being available** — and answer status commands on demand: **`/status`**, **`/hashrate`**, - **`/workers`**, **`/sync`**, **`/system`**, **`/pool`**, **`/xvb`**, **`/earnings`**, and - **`/help`**. It also pushes a **📅 once-a-day retrospective** at a configurable local time + release being available** — and answer status commands on demand: **`/status`**, **`/info`** + (version + update availability, Monero DB mode, P2Pool sidechain, and Tor-only/clearnet privacy + posture), **`/hashrate`**, **`/workers`**, **`/sync`**, **`/system`**, **`/pool`**, **`/xvb`**, + **`/earnings`**, and **`/help`**. It also pushes a **📅 once-a-day retrospective** at a configurable local time (`telegram.daily_summary_time`, default **08:00**) — the last 24h across the fleet: an incident roll-up (what went wrong during the day, or an all-clear), 24h hashrate with the P2Pool/XvB split, shares found in the day, an estimated daily-earnings figure, and a per-machine 24h breakdown. 
The Telegram bot appears in the dashboard's diff --git a/build/dashboard/mining_dashboard/service/telegram_commands.py b/build/dashboard/mining_dashboard/service/telegram_commands.py index dde45f1..d6411c3 100644 --- a/build/dashboard/mining_dashboard/service/telegram_commands.py +++ b/build/dashboard/mining_dashboard/service/telegram_commands.py @@ -14,8 +14,10 @@ ) from mining_dashboard.helper.utils import effective_hashrate, format_duration, format_hashrate from mining_dashboard.service.earnings import xmr_per_hs_day +from mining_dashboard.service.egress import egress_posture_from_config from mining_dashboard.service.metrics import build_metrics from mining_dashboard.service.telegram_notifier import TELEGRAM_API_BASE +from mining_dashboard.version import resolve_version from mining_dashboard.web.views import build_badges logger = logging.getLogger("TelegramCommands") @@ -29,11 +31,23 @@ # The commands the bot answers. All are read-only status queries — the bot can never change the # stack (start/stop/apply live on the CLI), so a leaked chat can at worst read status, not act. -COMMANDS = ("status", "hashrate", "workers", "sync", "system", "pool", "xvb", "earnings", "help") +COMMANDS = ( + "status", + "info", + "hashrate", + "workers", + "sync", + "system", + "pool", + "xvb", + "earnings", + "help", +) HELP_TEXT = ( "Pithead bot — commands:\n" "/status — stack health at a glance\n" + "/info — version, updates, DB mode, privacy posture\n" "/hashrate — total + per-worker hashrate\n" "/workers — each rig's online/offline state\n" "/sync — Monero + Tari node sync progress\n" @@ -112,6 +126,33 @@ def format_status(metrics, mining_active, host_label="", warnings=None): return "\n".join(lines) +def format_info(version, update, metrics, egress_summary, host_label=""): + """The 'about this stack' card — the answer to '/info'. 
Folds the build version, whether an + upgrade is available, the Monero DB mode, the P2Pool sidechain, and the privacy (egress) posture + into one glance. Static-ish facts, kept out of /status (which is live health).""" + lines = [f"{_prefix(host_label)}\U0001f4df Pithead info"] + + ver = (version or {}).get("text", "unknown") + lines.append(f"Version: {ver}{' (dev build)' if (version or {}).get('dev') else ''}") + + update = update or {} + if update.get("available") and update.get("latest"): + lines.append(f"Updates: \U0001f195 {update['latest']} available — ./pithead upgrade") + else: + lines.append("Updates: ✅ Up to date") + + mode = metrics.monero_mode + lines.append(f"Monero DB: {mode}" if mode in ("Pruned", "Full") else "Monero DB: unknown") + lines.append(f"Sidechain: P2Pool {metrics.pool_type}") + + egress_summary = egress_summary or {} + if egress_summary.get("all_tor", True): + lines.append("Egress: \U0001f9c5 Tor-only") + else: + lines.append(f"Egress: ⚠️ {egress_summary.get('label', 'clearnet exposure')}") + return "\n".join(lines) + + def status_warnings(data, metrics, db_healthy): """The active warning/error badges for /status: every ``bad`` badge plus the ``⚠``-flagged ``warn`` badges (which the informational states — 'Syncing…', 'Miner held' — deliberately lack), @@ -412,6 +453,14 @@ def reply_for(self, text): data, metrics, self.data_service.state_manager.is_db_healthy() ) return format_status(metrics, mining, self.host_label, warnings=warnings) + if cmd == "info": + return format_info( + resolve_version(), + data.get("update"), + metrics, + egress_posture_from_config()["summary"], + self.host_label, + ) if cmd == "hashrate": return format_hashrate_reply(metrics, data.get("workers", []), self.host_label) if cmd == "workers": diff --git a/build/dashboard/tests/service/test_telegram_commands.py b/build/dashboard/tests/service/test_telegram_commands.py index a52ee6b..d1adf4f 100644 --- a/build/dashboard/tests/service/test_telegram_commands.py +++ 
b/build/dashboard/tests/service/test_telegram_commands.py @@ -73,6 +73,7 @@ def _metrics(**over): "text,expected", [ ("/status", "status"), + ("/info", "info"), (" /sync ", "sync"), ("/HASHRATE", "hashrate"), ("/system", "system"), @@ -570,3 +571,50 @@ def test_format_status_lists_warnings(self): def test_format_status_all_clear(self): text = tc.format_status(_metrics(), True, warnings=[]) assert "✅ No warnings." in text + + +class TestInfo: + """/info — the 'about this stack' card: build version, update availability, Monero DB mode, + P2Pool sidechain, and privacy (egress) posture. All facts the stack already computes.""" + + def test_release_up_to_date_pruned_tor(self): + out = tc.format_info( + {"text": "v1.1.0", "dev": False}, + {"available": False}, + _metrics(monero_mode="Pruned", pool_type="Mini"), + {"all_tor": True}, + ) + assert "Version: v1.1.0" in out and "(dev build)" not in out + assert "✅ Up to date" in out + assert "Monero DB: Pruned" in out + assert "Sidechain: P2Pool Mini" in out + assert "🧅 Tor-only" in out + + def test_dev_update_available_full_clearnet(self): + out = tc.format_info( + {"text": "dev · main @ abc1234", "dev": True}, + {"available": True, "latest": "v1.2.0"}, + _metrics(monero_mode="Full"), + {"all_tor": False, "label": "2 clearnet egress path(s) exposing your IP"}, + ) + assert "(dev build)" in out + assert "🆕 v1.2.0 available" in out + assert "Monero DB: Full" in out + assert "⚠️ 2 clearnet egress path(s)" in out + + def test_unknown_db_mode_and_missing_update(self): + # monero_mode "Unknown" (remote/early) → no false Pruned/Full; update None → up to date. 
+ out = tc.format_info( + {"text": "v1.1.0"}, None, _metrics(monero_mode="Unknown"), {"all_tor": True} + ) + assert "Monero DB: unknown" in out + assert "✅ Up to date" in out + + def test_reply_for_info_routes(self, monkeypatch): + monkeypatch.setattr(tc, "resolve_version", lambda: {"text": "v1.1.0", "dev": False}) + monkeypatch.setattr( + tc, "egress_posture_from_config", lambda: {"summary": {"all_tor": True}} + ) + bot = _bot(monkeypatch, latest_data={"update": {"available": False}}, monero_mode="Pruned") + out = bot.reply_for("/info") + assert "📟 Pithead info" in out and "Version: v1.1.0" in out and "🧅 Tor-only" in out diff --git a/docs/telegram.md b/docs/telegram.md index ca3ed72..3456ed6 100644 --- a/docs/telegram.md +++ b/docs/telegram.md @@ -199,6 +199,7 @@ Run `./pithead apply` after editing. The commands: | Command | Reply | |---|---| | `/status` | One-glance health: each node up/down/syncing, whether mining is active, workers online, total hashrate, PPLNS shares in window — followed by any active **warning/error badges** (the same ones the dashboard's top bar shows), or an explicit "✅ No warnings." | +| `/info` | About this stack: the running **version** (and whether a newer release is available), the Monero **DB mode** (pruned / full), the P2Pool **sidechain** (Mini / Main), and the **privacy posture** (Tor-only, or how many clearnet paths are exposed). | | `/hashrate` | Total hashrate plus a per-rig breakdown of everything currently online. | | `/workers` | Every rig's online/offline state, with uptime for the ones that are up. | | `/sync` | Monero and Tari sync progress (percent and block height). | diff --git a/docs/test-inventory.md b/docs/test-inventory.md index 8649ad0..213692a 100644 --- a/docs/test-inventory.md +++ b/docs/test-inventory.md @@ -4,7 +4,7 @@ _Generated by `make test-inventory` ([`tests/inventory.sh`](../tests/inventory.s edit by hand** — re-run the target to refresh. 
See [Testing Strategy](testing-strategy.md) for how the tiers fit together._ -**Totals:** 782 dashboard unit tests · 12 contract tests · 66 frontend +**Totals:** 786 dashboard unit tests · 12 contract tests · 66 frontend tests · 52 `pithead` shell sections · 18 harness self-test sections · 9 live config scenarios (17 axis values) · 8 mini-stack scenarios. @@ -14,7 +14,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · | Tier | Suite | Cases | |---|---|---| -| 1 — Unit | dashboard pytest | 782 | +| 1 — Unit | dashboard pytest | 786 | | 1 — Unit | frontend (node --test) | 66 | | 1 — Unit | `pithead` shell suite | 52 sections | | 1 — Unit | compose interpolation + hardening (#90) | 1 | @@ -27,7 +27,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · ## Tier 1 — Unit & component -### Dashboard (pytest) — 782 tests +### Dashboard (pytest) — 786 tests #### tests/client/test_docker_control.py — 6 - test_tcp_scheme_rewritten_to_http @@ -576,7 +576,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_events_survive_reload - test_load_tolerates_missing_events_table -#### tests/service/test_telegram_commands.py — 49 +#### tests/service/test_telegram_commands.py — 53 - test_parse_command - test_status_active - test_status_syncing_beats_mining_flag @@ -626,6 +626,10 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_healthy_is_empty - test_format_status_lists_warnings - test_format_status_all_clear +- test_release_up_to_date_pruned_tor +- test_dev_update_available_full_clearnet +- test_unknown_db_mode_and_missing_update +- test_reply_for_info_routes #### tests/service/test_telegram_notifier.py — 10 - test_disabled_by_default @@ -1159,5 +1163,5 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · --- -_Grand total: **947** enumerated cases/sections across the four tiers (plus the live +_Grand total: **951** enumerated cases/sections across the four tiers 
(plus the live lifecycle and fault-injection phases, which are exercised on a real server)._ From 7a99835915b39bfe3bd1969349e0f63944074afe Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Fri, 3 Jul 2026 11:34:20 -0500 Subject: [PATCH 16/18] feat(telegram): enrich /pool, /xvb, /status with high-value mining data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Polish pass — surface data the stack already computes but the commands didn't show: - /pool: sidechain blocks found, Monero network difficulty, share acceptance (accepted/rejected + reject %), and the best share difficulty found (💎) — a real pool-performance snapshot. Share/blocks lines are omitted before the first proxy poll (no zeroed noise). - /xvb: the credited 1h/24h averages XvB actually measures (what sets your tier, vs routed which is what we send), plus a stale-data warning when the XvB feed is behind. - /status: a merge-mining line from the Tari gRPC-connected flag — a distinct signal from a synced Tari node (the link can be down while the node is up; the exact gap that hid #313). Omitted until Tari is polled. New _human_count() SI formatter for big figures. Docs (telegram.md command table) + tests updated; patch coverage 97%. 
Co-Authored-By: Claude Opus 4.8 --- .../service/telegram_commands.py | 76 ++++++++++++++---- .../tests/service/test_telegram_commands.py | 78 ++++++++++++++++++- docs/telegram.md | 6 +- docs/test-inventory.md | 17 ++-- 4 files changed, 151 insertions(+), 26 deletions(-) diff --git a/build/dashboard/mining_dashboard/service/telegram_commands.py b/build/dashboard/mining_dashboard/service/telegram_commands.py index d6411c3..0cfb8da 100644 --- a/build/dashboard/mining_dashboard/service/telegram_commands.py +++ b/build/dashboard/mining_dashboard/service/telegram_commands.py @@ -96,15 +96,30 @@ def _node_state(sync): return f"⏳ syncing {sync.percent:.1f}%" -def format_status(metrics, mining_active, host_label="", warnings=None): +def _human_count(n): + """Compact SI-suffixed number for large figures like network difficulty (380_000_000_000 → + '380.00 G'). Small values pass through as a plain integer.""" + n = float(n or 0) + for unit in ("", "K", "M", "G", "T", "P"): + if abs(n) < 1000: + return f"{n:.2f} {unit}".strip() if unit else f"{int(n)}" + n /= 1000 + return f"{n:.2f} E" + + +def format_status(metrics, mining_active, host_label="", warnings=None, merge_mining=None): """Overall stack health — the answer to '/status'. Pure: folds a :class:`Metrics` (plus the - mining-active flag the loop derives from the sync gate, and any active warning/error badges) - into text; no I/O.""" + mining-active flag the loop derives from the sync gate, any active warning/error badges, and the + Tari merge-mine link state) into text; no I/O. 
``merge_mining`` is the gRPC-connected flag — a + distinct signal from a synced Tari node (the link can be down while the node is up), or ``None`` + to omit the line (Tari not in play).""" lines = [ f"{_prefix(host_label)}\U0001f4ca Pithead status", f"Monero node: {_node_state(metrics.monero)}", f"Tari node: {_node_state(metrics.tari)}", ] + if merge_mining is not None: + lines.append(f"Merge-mining: {'🟢 Tari linked' if merge_mining else '⏸ Tari not linked'}") if metrics.global_syncing: lines.append("Mining: ⏳ holding — chain(s) syncing") elif mining_active: @@ -243,17 +258,36 @@ def format_system(system, host_label=""): ) -def format_pool(metrics, host_label=""): - """P2Pool sidechain + Monero network figures — the answer to '/pool'.""" - return "\n".join( - [ - f"{_prefix(host_label)}\U0001f30a Pool & network", - f"Sidechain: P2Pool {metrics.pool_type}", - f"Pool hashrate: {format_hashrate(metrics.pool_hashrate)}", - f"Network height: {metrics.network_height:,}", - f"PPLNS shares: {metrics.shares_in_window} in window ({metrics.pplns_window} blocks)", - ] +def format_pool(metrics, data=None, host_label=""): + """P2Pool sidechain + Monero network figures — the answer to '/pool'. Enriched with the share + submission health and best share the proxy tracks, and the node's found blocks (#82).""" + data = data or {} + lines = [ + f"{_prefix(host_label)}\U0001f30a Pool & network", + f"Sidechain: P2Pool {metrics.pool_type}", + f"Pool hashrate: {format_hashrate(metrics.pool_hashrate)}", + ] + blocks = (data.get("pool", {}) or {}).get("pool", {}).get("blocks_found") + if blocks: + lines.append(f"Blocks found: {blocks:,}") + lines.append( + f"Network: height {metrics.network_height:,} · diff {_human_count(metrics.network_difficulty)}" + ) + lines.append( + f"PPLNS shares: {metrics.shares_in_window} in window ({metrics.pplns_window} blocks)" ) + # Share submission health from the xmrig-proxy /summary (#82): accepted/rejected + best found. 
+ summary = data.get("proxy_summary", {}) or {} + accepted = summary.get("accepted", 0) or 0 + rejected = summary.get("rejected", 0) or 0 + if accepted or rejected: + total = accepted + rejected + reject_pct = (rejected / total * 100) if total else 0.0 + lines.append(f"Shares to pool: {accepted:,} ✓ / {rejected:,} ✗ ({reject_pct:.2f}% rejects)") + best = summary.get("best", 0) or 0 + if best: + lines.append(f"Best share: \U0001f48e {int(best):,}") + return "\n".join(lines) def format_xvb(metrics, host_label=""): @@ -267,12 +301,18 @@ def format_xvb(metrics, host_label=""): f"Current tier: {metrics.current_tier}", f"Target tier: {metrics.target_tier}", f"Routed to XvB: {format_hashrate(metrics.xvb_routed_1h)} (1h)", + # Credited averages are what XvB itself measures — the figures that actually set your tier + # (routed above is what we send; credited is what counts). Showing both explains the tier. + f"Credited by XvB: {format_hashrate(metrics.xvb_1h)} (1h) · " + f"{format_hashrate(metrics.xvb_24h)} (24h)", ] # The share half of raffle eligibility (#158): no PPLNS share means XvB wins are skipped. if metrics.shares_in_window > 0: lines.append("PPLNS share: \U0001f7e2 held (raffle-eligible)") else: lines.append("PPLNS share: ⚠ none — XvB wins skipped") + if metrics.xvb_stale: + lines.append("⚠ XvB stats are stale — showing last-known values.") return "\n".join(lines) @@ -452,7 +492,13 @@ def reply_for(self, text): warnings = status_warnings( data, metrics, self.data_service.state_manager.is_db_healthy() ) - return format_status(metrics, mining, self.host_label, warnings=warnings) + # Merge-mine link = the Tari gRPC being READY (not merely the node being up) — the same + # rule the dashboard's ✔ uses (#313). Omitted until Tari has been polled at all. 
+ tari = data.get("tari") + merge = (bool(tari.get("connected")) and bool(tari.get("active"))) if tari else None + return format_status( + metrics, mining, self.host_label, warnings=warnings, merge_mining=merge + ) if cmd == "info": return format_info( resolve_version(), @@ -468,7 +514,7 @@ def reply_for(self, text): if cmd == "sync": return format_sync(metrics, self.host_label) if cmd == "pool": - return format_pool(metrics, self.host_label) + return format_pool(metrics, data, self.host_label) if cmd == "xvb": return format_xvb(metrics, self.host_label) if cmd == "earnings": diff --git a/build/dashboard/tests/service/test_telegram_commands.py b/build/dashboard/tests/service/test_telegram_commands.py index d1adf4f..f034204 100644 --- a/build/dashboard/tests/service/test_telegram_commands.py +++ b/build/dashboard/tests/service/test_telegram_commands.py @@ -200,17 +200,64 @@ def test_system_reads_snapshot(): assert "HugePages: Enabled (3072/3072)" in out +@pytest.mark.parametrize( + "n,expected", + [ + (0, "0"), + (42, "42"), + (999, "999"), + (1500, "1.50 K"), + (380e9, "380.00 G"), + (2.5e12, "2.50 T"), + (3e18, "3.00 E"), # beyond peta — the fallback branch + ], +) +def test_human_count(n, expected): + assert tc._human_count(n) == expected + + def test_pool_reads_metrics(): - out = tc.format_pool(_metrics(pool_type="Mini", network_height=3210001)) + out = tc.format_pool( + _metrics(pool_type="Mini", network_height=3210001, network_difficulty=380e9) + ) assert "P2Pool Mini" in out - assert "Network height: 3,210,001" in out + assert "height 3,210,001" in out + assert "diff 380.00 G" in out assert "5 in window" in out # shares_in_window from _BASE +def test_pool_share_health_and_best_when_present(): + # Proxy /summary + found blocks enrich /pool (#82): acceptance rate, best share, blocks. 
+ data = { + "pool": {"pool": {"blocks_found": 3}}, + "proxy_summary": {"accepted": 125_000, "rejected": 40, "best": 2_345_678}, + } + out = tc.format_pool(_metrics(), data) + assert "Blocks found: 3" in out + assert "125,000 ✓ / 40 ✗ (0.03% rejects)" in out + assert "Best share: 💎 2,345,678" in out + + +def test_pool_omits_share_lines_before_first_poll(): + # No proxy data yet (fresh start) → no zeroed share/best/blocks lines, just the core figures. + out = tc.format_pool(_metrics(), {}) + assert "Shares to pool" not in out + assert "Best share" not in out + assert "Blocks found" not in out + + def test_xvb_enabled_with_share(): - out = tc.format_xvb(_metrics(xvb_enabled=True, shares_in_window=5)) + out = tc.format_xvb(_metrics(xvb_enabled=True, shares_in_window=5, xvb_1h=2100, xvb_24h=2300)) assert "Current tier: Donor" in out assert "raffle-eligible" in out + # Credited averages (what XvB measures → sets the tier) are shown alongside routed. + assert "Credited by XvB: 2.10 kH/s (1h) · 2.30 kH/s (24h)" in out + + +def test_xvb_stale_warns(): + out = tc.format_xvb(_metrics(xvb_enabled=True, shares_in_window=5, xvb_stale=True)) + assert "stale" in out + assert "stale" not in tc.format_xvb(_metrics(xvb_enabled=True, shares_in_window=5)) def test_xvb_no_share_warns(): @@ -222,6 +269,15 @@ def test_xvb_disabled(): assert "disabled" in tc.format_xvb(_metrics(xvb_enabled=False)) +def test_status_merge_mining_line(): + linked = tc.format_status(_metrics(), True, merge_mining=True) + assert "Merge-mining: 🟢 Tari linked" in linked + down = tc.format_status(_metrics(), True, merge_mining=False) + assert "Merge-mining: ⏸ Tari not linked" in down + # None (Tari not yet polled / not in play) omits the line entirely. + assert "Merge-mining" not in tc.format_status(_metrics(), True) + + def test_earnings_estimate(): # network reward present + a real difficulty → a positive daily figure. 
out = tc.format_earnings(_metrics(p2pool_1h=8000.0), {"reward": 600_000_000_000}) @@ -324,6 +380,22 @@ def test_reply_for_status_uses_mining_flag(monkeypatch): assert "🔴 not mining" in bot2.reply_for("/status") +def test_reply_for_status_merge_mining_from_tari_snapshot(monkeypatch): + # gRPC linked = connected AND active (the #313 rule) → the "linked" line. + bot = _bot(monkeypatch, latest_data={"tari": {"connected": True, "active": True}}) + assert "Merge-mining: 🟢 Tari linked" in bot.reply_for("/status") + # Node up but gRPC not ready (the exact gap that hid #313) → "not linked". + bot2 = _bot(monkeypatch, latest_data={"tari": {"connected": False, "active": True}}) + assert "Merge-mining: ⏸ Tari not linked" in bot2.reply_for("/status") + + +def test_reply_for_pool_reads_share_snapshot(monkeypatch): + data = {"proxy_summary": {"accepted": 999, "rejected": 1, "best": 555}} + bot = _bot(monkeypatch, latest_data=data, pool_type="Mini") + out = bot.reply_for("/pool") + assert "Best share: 💎 555" in out and "999 ✓ / 1 ✗" in out + + def test_reply_for_workers_reads_snapshot(monkeypatch): workers = [{"name": "z", "status": "online", "h15": 1000}] bot = _bot(monkeypatch, latest_data={"workers": workers}) diff --git a/docs/telegram.md b/docs/telegram.md index 3456ed6..2ffb5d5 100644 --- a/docs/telegram.md +++ b/docs/telegram.md @@ -198,14 +198,14 @@ Run `./pithead apply` after editing. The commands: | Command | Reply | |---|---| -| `/status` | One-glance health: each node up/down/syncing, whether mining is active, workers online, total hashrate, PPLNS shares in window — followed by any active **warning/error badges** (the same ones the dashboard's top bar shows), or an explicit "✅ No warnings." 
| +| `/status` | One-glance health: each node up/down/syncing, the **Tari merge-mine link** (gRPC connected — distinct from the node being synced), whether mining is active, workers online, total hashrate, PPLNS shares in window — followed by any active **warning/error badges** (the same ones the dashboard's top bar shows), or an explicit "✅ No warnings." | | `/info` | About this stack: the running **version** (and whether a newer release is available), the Monero **DB mode** (pruned / full), the P2Pool **sidechain** (Mini / Main), and the **privacy posture** (Tor-only, or how many clearnet paths are exposed). | | `/hashrate` | Total hashrate plus a per-rig breakdown of everything currently online. | | `/workers` | Every rig's online/offline state, with uptime for the ones that are up. | | `/sync` | Monero and Tari sync progress (percent and block height). | | `/system` | Host resources: disk, RAM, CPU + load, and HugePages. | -| `/pool` | P2Pool sidechain type, pool hashrate, Monero network height, and PPLNS shares in window. | -| `/xvb` | XvB mode, current and target tier, hashrate routed to XvB, and raffle eligibility (PPLNS share). | +| `/pool` | P2Pool sidechain type, pool hashrate, Monero network height + difficulty, PPLNS shares in window, **sidechain blocks found**, **share acceptance** (accepted/rejected + reject %), and the **best share** difficulty found. | +| `/xvb` | XvB mode, current and target tier, hashrate **routed** to XvB, the **credited** 1h/24h averages XvB measures (what sets your tier), raffle eligibility (PPLNS share), and a stale-data warning if the XvB feed is behind. | | `/earnings` | Estimated P2Pool XMR per day/month from your current hashrate (P2Pool only — excludes XvB-donated hashrate and Tari). | | `/help` | The command list. 
| diff --git a/docs/test-inventory.md b/docs/test-inventory.md index 213692a..b46eef3 100644 --- a/docs/test-inventory.md +++ b/docs/test-inventory.md @@ -4,7 +4,7 @@ _Generated by `make test-inventory` ([`tests/inventory.sh`](../tests/inventory.s edit by hand** — re-run the target to refresh. See [Testing Strategy](testing-strategy.md) for how the tiers fit together._ -**Totals:** 786 dashboard unit tests · 12 contract tests · 66 frontend +**Totals:** 793 dashboard unit tests · 12 contract tests · 66 frontend tests · 52 `pithead` shell sections · 18 harness self-test sections · 9 live config scenarios (17 axis values) · 8 mini-stack scenarios. @@ -14,7 +14,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · | Tier | Suite | Cases | |---|---|---| -| 1 — Unit | dashboard pytest | 786 | +| 1 — Unit | dashboard pytest | 793 | | 1 — Unit | frontend (node --test) | 66 | | 1 — Unit | `pithead` shell suite | 52 sections | | 1 — Unit | compose interpolation + hardening (#90) | 1 | @@ -27,7 +27,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · ## Tier 1 — Unit & component -### Dashboard (pytest) — 786 tests +### Dashboard (pytest) — 793 tests #### tests/client/test_docker_control.py — 6 - test_tcp_scheme_rewritten_to_http @@ -576,7 +576,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_events_survive_reload - test_load_tolerates_missing_events_table -#### tests/service/test_telegram_commands.py — 53 +#### tests/service/test_telegram_commands.py — 60 - test_parse_command - test_status_active - test_status_syncing_beats_mining_flag @@ -591,10 +591,15 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_sync_line_variants - test_sync_line_no_target - test_system_reads_snapshot +- test_human_count - test_pool_reads_metrics +- test_pool_share_health_and_best_when_present +- test_pool_omits_share_lines_before_first_poll - test_xvb_enabled_with_share +- test_xvb_stale_warns - 
test_xvb_no_share_warns - test_xvb_disabled +- test_status_merge_mining_line - test_earnings_estimate - test_earnings_unavailable_without_network_data - test_daily_summary_is_a_24h_retrospective @@ -603,6 +608,8 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_host_label_prefix - test_reply_for_help_and_unknown_need_no_metrics - test_reply_for_status_uses_mining_flag +- test_reply_for_status_merge_mining_from_tari_snapshot +- test_reply_for_pool_reads_share_snapshot - test_reply_for_workers_reads_snapshot - test_reply_for_system_reads_snapshot_without_metrics - test_reply_for_pool_and_xvb @@ -1163,5 +1170,5 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · --- -_Grand total: **951** enumerated cases/sections across the four tiers (plus the live +_Grand total: **958** enumerated cases/sections across the four tiers (plus the live lifecycle and fault-injection phases, which are exercised on a real server)._ From b88d1aeff06e06c75b54a09326a8792f79acc6a7 Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Fri, 3 Jul 2026 11:41:15 -0500 Subject: [PATCH 17/18] feat(telegram): add effort to /pool and a 24h earnings estimate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - /pool: current share effort (a luck indicator — <100% = finding shares faster than average), shown once stratum has been polled. - /earnings: a 24h-average estimate alongside the 1h one to smooth the variance a 1h window carries; the steadier 24h figure drives the 30d projection (falls back to 1h before a day of history exists). Docs + tests updated; patch coverage 97%. 
Co-Authored-By: Claude Opus 4.8 --- .../service/telegram_commands.py | 31 +++++++++++++------ .../tests/service/test_telegram_commands.py | 25 +++++++++++++-- docs/telegram.md | 4 +-- docs/test-inventory.md | 12 ++++--- 4 files changed, 53 insertions(+), 19 deletions(-) diff --git a/build/dashboard/mining_dashboard/service/telegram_commands.py b/build/dashboard/mining_dashboard/service/telegram_commands.py index 0cfb8da..38f1d71 100644 --- a/build/dashboard/mining_dashboard/service/telegram_commands.py +++ b/build/dashboard/mining_dashboard/service/telegram_commands.py @@ -276,6 +276,10 @@ def format_pool(metrics, data=None, host_label=""): lines.append( f"PPLNS shares: {metrics.shares_in_window} in window ({metrics.pplns_window} blocks)" ) + # Current share effort — a luck indicator (<100% = finding shares faster than average). + stratum = data.get("stratum", {}) or {} + if "current_effort" in stratum: + lines.append(f"Effort: {stratum['current_effort']:.1f}%") # Share submission health from the xmrig-proxy /summary (#82): accepted/rejected + best found. summary = data.get("proxy_summary", {}) or {} accepted = summary.get("accepted", 0) or 0 @@ -324,16 +328,23 @@ def format_earnings(metrics, network, host_label=""): coeff_day = xmr_per_hs_day(reward_atomic, metrics.network_difficulty) if coeff_day <= 0: return f"{_prefix(host_label)}\U0001f4b0 Earnings estimate unavailable (waiting on network data)." 
- daily = coeff_day * metrics.p2pool_1h - return "\n".join( - [ - f"{_prefix(host_label)}\U0001f4b0 Estimated P2Pool earnings", - f"Hashrate (P2Pool 1h): {format_hashrate(metrics.p2pool_1h)}", - f"~{daily:.6f} XMR/day", - f"~{daily * 30:.5f} XMR/30d", - "Estimate only — excludes XvB-donated hashrate and Tari merge-mining.", - ] - ) + daily_1h = coeff_day * metrics.p2pool_1h + lines = [ + f"{_prefix(host_label)}\U0001f4b0 Estimated P2Pool earnings", + f"1h avg {format_hashrate(metrics.p2pool_1h)} → ~{daily_1h:.6f} XMR/day", + ] + # The 24h average smooths the variance a 1h window carries, so it's the steadier projection — + # shown (and used for the 30-day figure) only once there's a day of history to average. + if metrics.p2pool_24h > 0: + daily_24h = coeff_day * metrics.p2pool_24h + lines.append( + f"24h avg {format_hashrate(metrics.p2pool_24h)} → ~{daily_24h:.6f} XMR/day " + f"· ~{daily_24h * 30:.5f} XMR/30d" + ) + else: + lines.append(f"~{daily_1h * 30:.5f} XMR/30d") + lines.append("Estimate only — excludes XvB-donated hashrate and Tari merge-mining.") + return "\n".join(lines) # Friendly labels for the daily incident log (#342), keyed by AlertService event. diff --git a/build/dashboard/tests/service/test_telegram_commands.py b/build/dashboard/tests/service/test_telegram_commands.py index f034204..8c57263 100644 --- a/build/dashboard/tests/service/test_telegram_commands.py +++ b/build/dashboard/tests/service/test_telegram_commands.py @@ -244,6 +244,15 @@ def test_pool_omits_share_lines_before_first_poll(): assert "Shares to pool" not in out assert "Best share" not in out assert "Blocks found" not in out + assert "Effort" not in out # no stratum data → no effort line + + +def test_pool_effort_when_stratum_present(): + # Effort is a luck indicator; shown only once stratum has been polled (the key is present). 
+ out = tc.format_pool(_metrics(), {"stratum": {"current_effort": 87.3}}) + assert "Effort: 87.3%" in out + # Effort right after a block can legitimately be 0.0 — still shown (key present), not hidden. + assert "Effort: 0.0%" in tc.format_pool(_metrics(), {"stratum": {"current_effort": 0.0}}) def test_xvb_enabled_with_share(): @@ -280,8 +289,20 @@ def test_status_merge_mining_line(): def test_earnings_estimate(): # network reward present + a real difficulty → a positive daily figure. - out = tc.format_earnings(_metrics(p2pool_1h=8000.0), {"reward": 600_000_000_000}) - assert "XMR/day" in out + out = tc.format_earnings( + _metrics(p2pool_1h=8000.0, p2pool_24h=8100.0), {"reward": 600_000_000_000} + ) + assert "1h avg" in out and "XMR/day" in out + # The 24h average is shown once available and drives the steadier 30d projection. + assert "24h avg" in out and "XMR/30d" in out + + +def test_earnings_falls_back_to_1h_30d_without_24h_history(): + # A fresh node with no 24h average yet still gets a 30d figure (from the 1h rate). + out = tc.format_earnings( + _metrics(p2pool_1h=8000.0, p2pool_24h=0.0), {"reward": 600_000_000_000} + ) + assert "24h avg" not in out assert "XMR/30d" in out diff --git a/docs/telegram.md b/docs/telegram.md index 2ffb5d5..1ebc0f6 100644 --- a/docs/telegram.md +++ b/docs/telegram.md @@ -204,9 +204,9 @@ Run `./pithead apply` after editing. The commands: | `/workers` | Every rig's online/offline state, with uptime for the ones that are up. | | `/sync` | Monero and Tari sync progress (percent and block height). | | `/system` | Host resources: disk, RAM, CPU + load, and HugePages. | -| `/pool` | P2Pool sidechain type, pool hashrate, Monero network height + difficulty, PPLNS shares in window, **sidechain blocks found**, **share acceptance** (accepted/rejected + reject %), and the **best share** difficulty found. 
| +| `/pool` | P2Pool sidechain type, pool hashrate, Monero network height + difficulty, PPLNS shares in window, current **effort** (luck indicator), **sidechain blocks found**, **share acceptance** (accepted/rejected + reject %), and the **best share** difficulty found. | | `/xvb` | XvB mode, current and target tier, hashrate **routed** to XvB, the **credited** 1h/24h averages XvB measures (what sets your tier), raffle eligibility (PPLNS share), and a stale-data warning if the XvB feed is behind. | -| `/earnings` | Estimated P2Pool XMR per day/month from your current hashrate (P2Pool only — excludes XvB-donated hashrate and Tari). | +| `/earnings` | Estimated P2Pool XMR per day/month, from both your **1h** and (once available) steadier **24h** average hashrate (P2Pool only — excludes XvB-donated hashrate and Tari). | | `/help` | The command list. | The numbers come from the **same source as the dashboard**, so a reply and the web view always diff --git a/docs/test-inventory.md b/docs/test-inventory.md index b46eef3..2392304 100644 --- a/docs/test-inventory.md +++ b/docs/test-inventory.md @@ -4,7 +4,7 @@ _Generated by `make test-inventory` ([`tests/inventory.sh`](../tests/inventory.s edit by hand** — re-run the target to refresh. See [Testing Strategy](testing-strategy.md) for how the tiers fit together._ -**Totals:** 793 dashboard unit tests · 12 contract tests · 66 frontend +**Totals:** 795 dashboard unit tests · 12 contract tests · 66 frontend tests · 52 `pithead` shell sections · 18 harness self-test sections · 9 live config scenarios (17 axis values) · 8 mini-stack scenarios. 
@@ -14,7 +14,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · | Tier | Suite | Cases | |---|---|---| -| 1 — Unit | dashboard pytest | 793 | +| 1 — Unit | dashboard pytest | 795 | | 1 — Unit | frontend (node --test) | 66 | | 1 — Unit | `pithead` shell suite | 52 sections | | 1 — Unit | compose interpolation + hardening (#90) | 1 | @@ -27,7 +27,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · ## Tier 1 — Unit & component -### Dashboard (pytest) — 793 tests +### Dashboard (pytest) — 795 tests #### tests/client/test_docker_control.py — 6 - test_tcp_scheme_rewritten_to_http @@ -576,7 +576,7 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_events_survive_reload - test_load_tolerates_missing_events_table -#### tests/service/test_telegram_commands.py — 60 +#### tests/service/test_telegram_commands.py — 62 - test_parse_command - test_status_active - test_status_syncing_beats_mining_flag @@ -595,12 +595,14 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · - test_pool_reads_metrics - test_pool_share_health_and_best_when_present - test_pool_omits_share_lines_before_first_poll +- test_pool_effort_when_stratum_present - test_xvb_enabled_with_share - test_xvb_stale_warns - test_xvb_no_share_warns - test_xvb_disabled - test_status_merge_mining_line - test_earnings_estimate +- test_earnings_falls_back_to_1h_30d_without_24h_history - test_earnings_unavailable_without_network_data - test_daily_summary_is_a_24h_retrospective - test_daily_summary_without_xvb_omits_split @@ -1170,5 +1172,5 @@ tests · 52 `pithead` shell sections · 18 harness self-test sections · --- -_Grand total: **958** enumerated cases/sections across the four tiers (plus the live +_Grand total: **960** enumerated cases/sections across the four tiers (plus the live lifecycle and fault-injection phases, which are exercised on a real server)._ From 03a536a7915eef3aa42f5562abf56e1282586102 Mon Sep 17 00:00:00 2001 
From: Vijit Singh Date: Fri, 3 Jul 2026 11:43:51 -0500 Subject: [PATCH 18/18] refactor(telegram): clearer status_warnings prefix strip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use str.removeprefix("⚠ ") instead of lstrip(char-set)+strip — same result, explicit intent, no char-set footgun. Also refresh docs (/pool effort, /earnings 24h) and test-inventory. Co-Authored-By: Claude Opus 4.8 --- build/dashboard/mining_dashboard/service/telegram_commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/dashboard/mining_dashboard/service/telegram_commands.py b/build/dashboard/mining_dashboard/service/telegram_commands.py index 38f1d71..454f76c 100644 --- a/build/dashboard/mining_dashboard/service/telegram_commands.py +++ b/build/dashboard/mining_dashboard/service/telegram_commands.py @@ -176,7 +176,7 @@ def status_warnings(data, metrics, db_healthy): out = [] for b in build_badges(data, metrics, "", db_healthy=db_healthy): if b["variant"] == "bad" or b["text"].startswith("⚠"): - out.append(b["text"].lstrip("⚠ ").strip()) + out.append(b["text"].removeprefix("⚠ ")) return out