From 61cf0e03280d9f4d4d6fdaa557429b46f14a008a Mon Sep 17 00:00:00 2001 From: Jack Zhuang <277994282+os-zhuang@users.noreply.github.com> Date: Fri, 5 Jun 2026 09:42:32 +0800 Subject: [PATCH] =?UTF-8?q?perf(rest):=20cache=20hostname=E2=86=92env=20re?= =?UTF-8?q?solution=20(P1-4);=20document=20cluster=20pub/sub=20durability?= =?UTF-8?q?=20(P1-5)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P1-4: resolveByHostname() ran on every unscoped request (a control-plane DB lookup in the hot path). RestServer.resolveHostnameCached() now caches hostname→environmentId for 30s across all 3 call sites, including negative results so unknown hosts don't hammer the registry; registry errors aren't cached so a transient blip self-heals. +3 tests. P1-5: verified the Redis pub/sub fire-and-forget is by design (at-most-once) and already implied in code. Recorded the durability contract in pubsub.ts: metadata.changed is a cache-invalidation hint only — the durable source of truth is the transactional sys_metadata (+ sys_metadata_history) write, so a missed event self-heals on reload and never loses data. No delivery-semantics change; risk accepted + documented. docs/launch-readiness.md P1-4/P1-5 updated (Verify ✅, Sign-off left for the team). Co-Authored-By: Claude Opus 4.8 --- .changeset/p1-control-plane.md | 18 +++++++++ docs/launch-readiness.md | 11 +++++- packages/rest/src/rest-server.ts | 31 ++++++++++++++-- packages/rest/src/rest.test.ts | 37 +++++++++++++++++++ .../service-cluster-redis/src/pubsub.ts | 14 +++++++ 5 files changed, 106 insertions(+), 5 deletions(-) create mode 100644 .changeset/p1-control-plane.md diff --git a/.changeset/p1-control-plane.md b/.changeset/p1-control-plane.md new file mode 100644 index 000000000..6a3a8624d --- /dev/null +++ b/.changeset/p1-control-plane.md @@ -0,0 +1,18 @@ +--- +'@objectstack/rest': patch +'@objectstack/service-cluster-redis': patch +--- + +perf(rest): cache hostname→environment resolution; document cluster pub/sub durability (P1-4, P1-5) + +- **rest (P1-4):** `resolveByHostname()` ran on every unscoped request — a + control-plane lookup (typically a DB query) in the hot path. `RestServer` now + caches `hostname → environmentId` in-memory with a 30s TTL across all three + resolution sites, caching negative results too so unknown hosts don't hammer the + registry. Registry errors are not cached, so a transient blip self-heals. +- **service-cluster-redis (P1-5):** recorded the durability contract for + `metadata.changed` in `pubsub.ts`. Redis pub/sub is at-most-once **by design**; + the event is a cache-invalidation hint only — the durable source of truth is the + transactional `sys_metadata` (+ `sys_metadata_history`) write, so a missed event + causes a stale cache until the next reload, never data loss. No code change to + the delivery semantics; risk accepted and documented. diff --git a/docs/launch-readiness.md b/docs/launch-readiness.md index 23a1eb924..04827cb97 100644 --- a/docs/launch-readiness.md +++ b/docs/launch-readiness.md @@ -144,7 +144,7 @@ fix or acceptance.** - **Risk:** `resolveByHostname()` runs on every unscoped request → control-plane latency spike under load; silent fallback to default project masks it. - **Action:** Add an in-memory TTL cache (~30s) for `hostname → environmentId`. -- **Owner:** _______ · Verify ☐ · Sign-off ☐ · Notes: _______ +- **Owner:** _______ · Verify ✅ (confirmed real @ `main`) · Sign-off ☐ · Notes: Fixed — `RestServer.resolveHostnameCached()` caches hostname→env (positive **and** negative) for 30s across all 3 call sites; +3 tests. Awaiting human sign-off. ### P1-5 — Cluster pub/sub is fire-and-forget (metadata-changed) - **Area:** `service-cluster-redis` — `src/pubsub.ts:~75–90` @@ -153,7 +153,14 @@ fix or acceptance.** - **Action:** Acceptable for non-critical events if documented; ensure schema mutations re-sync on error boundaries (history exists in `sys_metadata_history`). Record the durability contract. -- **Owner:** _______ · Verify ☐ · Sign-off ☐ · Notes: _______ +- **Durability contract (recorded):** Redis pub/sub is **at-most-once** by design + (already noted in `pubsub.ts`). `metadata.changed` is a **cache-invalidation hint + only** — the durable source of truth is the transactional write to `sys_metadata` + (+ `sys_metadata_history`). A node that misses the event serves its cached schema + until the next reload and **loses no data** (self-heals on reload/restart against + the DB). Documented in `pubsub.ts publish()`. **Accept** for v1: no exactly-once + state may flow through this channel — durable state uses an outbox. +- **Owner:** _______ · Verify ✅ (by design — contract recorded) · Sign-off ☐ · Notes: No code fix needed; risk **accepted** with rationale + code comment. Awaiting human sign-off. --- diff --git a/packages/rest/src/rest-server.ts b/packages/rest/src/rest-server.ts index 89f22712e..09a9872e0 100644 --- a/packages/rest/src/rest-server.ts +++ b/packages/rest/src/rest-server.ts @@ -437,6 +437,15 @@ export class RestServer { private routeManager: RouteManager; private kernelManager?: RestKernelManager; private envRegistry?: RestEnvRegistry; + /** + * Short-TTL cache for `hostname → environmentId` (P1-4). `resolveByHostname` + * is a control-plane lookup (typically a DB query) that otherwise runs on + * *every* unscoped request; caching it — including negative results, so + * unknown hosts don't hammer the registry — removes that per-request cost. + * The TTL is short so a newly-bound hostname becomes routable quickly. + */ + private readonly hostnameCache = new Map(); + private readonly hostnameCacheTtlMs = 30_000; private defaultEnvironmentIdProvider?: () => string | undefined; private authServiceProvider?: (environmentId?: string) => Promise; private objectQLProvider?: (environmentId?: string) => Promise; @@ -501,13 +510,29 @@ export class RestServer { * (and any other client) speak a single, uniform URL family without * duplicating route logic for the platform surface. */ + /** + * Cached wrapper around `envRegistry.resolveByHostname` (P1-4). Returns the + * cached result while fresh; on a miss it queries the registry and caches the + * outcome (positive *and* negative) for {@link hostnameCacheTtlMs}. Registry + * errors are not cached so a transient control-plane blip self-heals on the + * next request. + */ + private async resolveHostnameCached(host: string): Promise<{ environmentId: string } | null | undefined> { + const now = Date.now(); + const hit = this.hostnameCache.get(host); + if (hit && hit.expiresAt > now) return hit.value; + const result = (await this.envRegistry!.resolveByHostname(host)) ?? null; + this.hostnameCache.set(host, { value: result, expiresAt: now + this.hostnameCacheTtlMs }); + return result; + } + private async resolveProtocol(environmentId?: string, req?: any): Promise { if (environmentId === 'platform') return this.protocol; if (!environmentId && req && this.envRegistry && this.kernelManager) { const host = this.extractHostname(req); if (host) { try { - const result = await this.envRegistry.resolveByHostname(host); + const result = await this.resolveHostnameCached(host); if (result?.environmentId) environmentId = result.environmentId; } catch { // fall through to next strategy @@ -567,7 +592,7 @@ export class RestServer { const host = this.extractHostname(req); if (host) { try { - const result = await this.envRegistry.resolveByHostname(host); + const result = await this.resolveHostnameCached(host); if (result?.environmentId) environmentId = result.environmentId; } catch { /* fall through */ } } @@ -644,7 +669,7 @@ export class RestServer { const host = this.extractHostname(req); if (host) { try { - const result = await this.envRegistry.resolveByHostname(host); + const result = await this.resolveHostnameCached(host); if (result?.environmentId) environmentId = result.environmentId; } catch { /* fall through */ } } diff --git a/packages/rest/src/rest.test.ts b/packages/rest/src/rest.test.ts index f68b6e342..74e22e3bd 100644 --- a/packages/rest/src/rest.test.ts +++ b/packages/rest/src/rest.test.ts @@ -1324,6 +1324,43 @@ describe('RestServer.resolveProtocol', () => { expect(f.kernelManager.getOrCreate).toHaveBeenCalledWith('proj_a'); }); + it('caches hostname→env resolution within the TTL and refreshes after it (P1-4)', async () => { + const projectKernel = makeKernel('proj_a'); + const resolveByHostname = vi.fn().mockResolvedValue({ environmentId: 'proj_a' }); + const f = makeFixture({ + envRegistry: { resolveByHostname, resolveById: vi.fn() }, + kernels: { proj_a: projectKernel }, + }); + let now = 1_000_000; + const spy = vi.spyOn(Date, 'now').mockImplementation(() => now); + try { + const req = { headers: { host: 'a.example.com' } }; + await (f.rest as any).resolveProtocol(undefined, req); + await (f.rest as any).resolveProtocol(undefined, req); + await (f.rest as any).resolveProtocol(undefined, req); + expect(resolveByHostname).toHaveBeenCalledTimes(1); // 2nd/3rd served from cache + + now += 31_000; // past the 30s TTL + await (f.rest as any).resolveProtocol(undefined, req); + expect(resolveByHostname).toHaveBeenCalledTimes(2); // refreshed + } finally { + spy.mockRestore(); + } + }); + + it('caches a negative result so unknown hosts do not hammer the registry (P1-4)', async () => { + const resolveByHostname = vi.fn().mockResolvedValue(null); + const f = makeFixture({ + envRegistry: { resolveByHostname, resolveById: vi.fn() }, + defaultProvider: () => 'proj_local', + kernels: { proj_local: makeKernel('proj_local') }, + }); + const req = { headers: { host: 'unknown.example.com' } }; + await (f.rest as any).resolveProtocol(undefined, req); + await (f.rest as any).resolveProtocol(undefined, req); + expect(resolveByHostname).toHaveBeenCalledTimes(1); // negative result cached + }); + it('routes via X-Environment-Id header when hostname resolution fails', async () => { const projectKernel = makeKernel('proj_b'); const resolveById = vi.fn().mockResolvedValue({ /* truthy driver */ }); diff --git a/packages/services/service-cluster-redis/src/pubsub.ts b/packages/services/service-cluster-redis/src/pubsub.ts index 35dc24f0a..c1a64fe70 100644 --- a/packages/services/service-cluster-redis/src/pubsub.ts +++ b/packages/services/service-cluster-redis/src/pubsub.ts @@ -75,6 +75,20 @@ export class RedisPubSub implements IPubSub { }); } + /** + * Durability contract (P1-5): this awaits the Redis `PUBLISH` command (so the + * message left this node), but Redis pub/sub is **at-most-once** — there is no + * delivery guarantee to subscribers and no replay for a node that was down or + * slow at publish time. This is acceptable **only** for events that are pure + * cache-invalidation hints, never the source of truth. + * + * In particular `metadata.changed` is such a hint: the durable record of every + * metadata mutation is the transactional write to `sys_metadata` + * (+ `sys_metadata_history`). A subscriber that misses the event keeps serving + * its cached schema until its next reload and **loses no data** — it self-heals + * on restart / reload against the DB. Do not route any state that must be + * delivered exactly-once through this channel; use a durable outbox instead. + */ async publish( channel: string, payload: T,