From fa63cde6f8b8604744df926be9e8c2a8e7c157cb Mon Sep 17 00:00:00 2001 From: Manas Srivastava <[email protected]> Date: Mon, 8 Jun 2026 23:45:50 +0530 Subject: [PATCH] feat(redis): tier-aware backend routing behind a default-off flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The shared redis backend (REDIS_PROVISION_BACKEND, "k8s" in prod) serves every non-dedicated tier, so a 5MB anonymous cache costs a whole pod, PVC, Service and NetworkPolicy — pod count grows linearly with /cache/new. This is the durable fix for that capacity problem. Adds redis.TierDispatchBackend: routes Provision by tier (Team -> dedicated pod, every non-Team tier -> a shared ACL carve on one pod), and routes Deprovision/StorageBytes/Regrade by provider_resource_id prefix (the same "instant-customer-" convention server.go already uses). Regrade only ever touches dedicated (k8s) pods; a shared-carve resource soft-skips so the shared pod's server-wide maxmemory is never set on behalf of one tenant. Gated by REDIS_TIER_AWARE_ROUTING_ENABLED (default false, fail-closed: only the exact string "true" enables it). When off or unset, server.New wires the configured backend verbatim — behaviour is byte-for-byte identical to today. This is a code-only change: no data migration, no prod config/secret change, no default-backend change. Safe to merge as a no-op until an operator opts in (needs careful review before any enablement; non-Team tiers lose the per-pod maxmemory hard cap when on — memory is then enforced out-of-band via the worker's StorageBytes quota scan, the same posture as the "local" backend). Tests: dispatch_test.go covers Provision (registry-driven tier table), Deprovision/StorageBytes (PRID routing), and Regrade (forward / shared soft-skip / no-Regrader soft-skip). server_redis_tier_routing_test.go proves flag off -> backend unwrapped (no-op) and flag on -> wrapped. config_test.go covers default-false, override-true, and non-"true"-is-false. New code 100% covered; make gate green; golangci-lint clean. Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/backend/redis/backend.go | 9 +- internal/backend/redis/dispatch.go | 143 +++++++ internal/backend/redis/dispatch_test.go | 364 ++++++++++++++++++ internal/config/config.go | 106 ++--- internal/config/config_test.go | 25 ++ internal/server/export_test.go | 23 +- internal/server/server.go | 21 +- .../server/server_redis_tier_routing_test.go | 78 ++++ 8 files changed, 720 insertions(+), 49 deletions(-) create mode 100644 internal/backend/redis/dispatch.go create mode 100644 internal/backend/redis/dispatch_test.go create mode 100644 internal/server/server_redis_tier_routing_test.go diff --git a/internal/backend/redis/backend.go b/internal/backend/redis/backend.go index 5bc1bc0..2c32f5f 100644 --- a/internal/backend/redis/backend.go +++ b/internal/backend/redis/backend.go @@ -12,7 +12,7 @@ import ( // goredisParseURL / goredisNewClient — narrow aliases so we don't import the // goredis package directly in the factory body. Keeps the call sites readable // and the dependency obvious in this file alone. -func goredisParseURL(s string) (*goredis.Options, error) { return goredis.ParseURL(s) } +func goredisParseURL(s string) (*goredis.Options, error) { return goredis.ParseURL(s) } func goredisNewClient(o *goredis.Options) *goredis.Client { return goredis.NewClient(o) } func k8sEnv(key, fallback string) string { @@ -126,6 +126,13 @@ func NewBackend(backendType, redisHost string) Backend { } } +// NewSharedCarveBackend creates a LocalBackend: an ACL user + key-prefix carve +// on a SHARED Redis instance (many tenants per pod). It is the non-Team side of +// tier-aware routing (see TierDispatchBackend). redisHost is "host:port". +func NewSharedCarveBackend(redisHost string) Backend { + return newLocalBackend(redisHost) +} + // NewDedicatedBackend creates a DedicatedProvider for Team-tier Redis provisioning. // adminRedisURL must point to a dedicated Redis instance (separate from the shared cluster). // upstashAPIKey is optional; when set the Upstash API path is used instead. diff --git a/internal/backend/redis/dispatch.go b/internal/backend/redis/dispatch.go new file mode 100644 index 0000000..e5e300e --- /dev/null +++ b/internal/backend/redis/dispatch.go @@ -0,0 +1,143 @@ +package redis + +// dispatch.go — tier-aware Redis backend routing. +// +// # Why this exists (dedicated-pod-per-token capacity problem) +// +// The shared `redisBackend` the server holds is built from REDIS_PROVISION_BACKEND. +// In production that value is "k8s", so the K8sBackend (one Redis pod + namespace +// per token) serves EVERY non-dedicated tier — anonymous, free, hobby, hobby_plus. +// A 5MB anonymous cache therefore costs a whole pod, PVC, Service and NetworkPolicy. +// That does not scale: pod count grows linearly with /cache/new calls. +// +// TierDispatchBackend is the durable fix. It routes per tier: +// - non-Team tiers (capped 5MB–512MB caches) → the SHARED-carve backend +// (LocalBackend: an ACL user + key-prefix namespace on a shared Redis), so +// many tenants share one pod. +// - Team tier (unlimited, justifies dedicated infra) → the DEDICATED backend +// (the configured K8sBackend: one pod per token). +// +// # Flag-gated, default OFF +// +// This type is ONLY constructed when REDIS_TIER_AWARE_ROUTING_ENABLED=true (see +// config.Config.RedisTierAwareRoutingEnabled and server.New). When the flag is +// off or unset the server keeps using the single configured backend for every +// tier — behaviour is byte-for-byte identical to today. The dispatcher changes +// nothing in production until an operator flips the flag. + +import ( + "context" + "log/slog" + "strings" +) + +// dedicatedTier is the single tier that receives a dedicated Redis pod under +// tier-aware routing. Every other tier is routed to the shared-carve backend. +// +// This is deliberately a named constant (not an inline "team" literal) and is +// narrower than the server's isDedicatedTier (pro/team/growth): the whole point +// of tier-aware routing is to move the high-volume capped tiers OFF dedicated +// pods, and only Team's unlimited promise justifies a pod per token. If product +// later decides another tier earns a dedicated pod, change it here. +const dedicatedTier = "team" + +// dedicatedNamespacePrefix is the provider_resource_id / namespace prefix the +// dedicated (k8s) backend stamps on every resource it provisions. It is defined +// as an alias of redisK8sNsPrefix (the constant the k8s backend actually uses in +// k8s.go), so the two can never drift: a rename of redisK8sNsPrefix flows +// through here automatically. +// +// Lifecycle RPCs (Deprovision / StorageBytes) do not carry the tier, so the +// dispatcher routes them by this prefix: a PRID that starts with it was +// provisioned on the dedicated backend; anything else (empty PRID, a pool-token +// marker, etc.) lives on the shared-carve backend. +const dedicatedNamespacePrefix = redisK8sNsPrefix + +// isDedicatedRoutingTier reports whether a tier should be served by the +// dedicated backend under tier-aware routing. +func isDedicatedRoutingTier(tier string) bool { + return tier == dedicatedTier +} + +// hasDedicatedProviderID reports whether a provider_resource_id was produced by +// the dedicated backend (a k8s namespace). Used to route lifecycle RPCs that do +// not carry the tier. +func hasDedicatedProviderID(providerResourceID string) bool { + return strings.HasPrefix(providerResourceID, dedicatedNamespacePrefix) +} + +// TierDispatchBackend routes Redis lifecycle calls to one of two backends based +// on tier (Provision) or provider_resource_id prefix (Deprovision/StorageBytes). +// +// It satisfies redis.Backend and — when the dedicated backend does — redis.Regrader. +type TierDispatchBackend struct { + // sharedCarve serves all non-Team tiers: an ACL carve on a shared Redis. + sharedCarve Backend + // dedicated serves the Team tier: a dedicated pod per token (k8s backend). + dedicated Backend +} + +// Compile-time check: the dispatcher is a Backend. +var _ Backend = (*TierDispatchBackend)(nil) + +// NewTierDispatchBackend builds a tier-aware dispatcher. Both backends must be +// non-nil; the caller (server.New) only constructs the dispatcher once it has +// both a shared-carve and a dedicated backend in hand. +func NewTierDispatchBackend(sharedCarve, dedicated Backend) *TierDispatchBackend { + return &TierDispatchBackend{sharedCarve: sharedCarve, dedicated: dedicated} +} + +// Provision routes by tier: Team → dedicated pod, everyone else → shared carve. +func (b *TierDispatchBackend) Provision(ctx context.Context, token, tier string) (*Credentials, error) { + if isDedicatedRoutingTier(tier) { + slog.Info("redis.dispatch.provision", "route", "dedicated", "token", token, "tier", tier) + return b.dedicated.Provision(ctx, token, tier) + } + slog.Info("redis.dispatch.provision", "route", "shared_carve", "token", token, "tier", tier) + return b.sharedCarve.Provision(ctx, token, tier) +} + +// Deprovision routes by provider_resource_id: a dedicated (k8s namespace) PRID +// → dedicated backend; anything else → shared-carve backend. The tier is not +// available on this call, so the PRID prefix is the only reliable signal — the +// same convention server.go already uses to split dedicated vs shared teardown. +func (b *TierDispatchBackend) Deprovision(ctx context.Context, token, providerResourceID string) error { + if hasDedicatedProviderID(providerResourceID) { + slog.Info("redis.dispatch.deprovision", "route", "dedicated", "token", token, "provider_resource_id", providerResourceID) + return b.dedicated.Deprovision(ctx, token, providerResourceID) + } + slog.Info("redis.dispatch.deprovision", "route", "shared_carve", "token", token, "provider_resource_id", providerResourceID) + return b.sharedCarve.Deprovision(ctx, token, providerResourceID) +} + +// StorageBytes routes by provider_resource_id, mirroring Deprovision. +func (b *TierDispatchBackend) StorageBytes(ctx context.Context, token, providerResourceID string) (int64, error) { + if hasDedicatedProviderID(providerResourceID) { + return b.dedicated.StorageBytes(ctx, token, providerResourceID) + } + return b.sharedCarve.StorageBytes(ctx, token, providerResourceID) +} + +// Regrade delegates to the dedicated backend's Regrader, if it implements one. +// Only dedicated (k8s) pods have a per-tenant maxmemory lever to adjust; the +// shared-carve backend has no per-user maxmemory at the Redis level (Redis ACLs +// cannot cap memory per user — maxmemory is server-wide), so a shared-carve +// resource is simply not regradeable here and returns a soft skip. +// +// Routing uses the same PRID-prefix signal as Deprovision/StorageBytes: only a +// dedicated (k8s namespace) PRID is forwarded to the dedicated Regrader; a +// shared-carve PRID short-circuits to a soft skip without touching the shared +// pod (whose maxmemory is shared by every tenant on it). +func (b *TierDispatchBackend) Regrade(ctx context.Context, token, providerResourceID string, targetMaxmemoryMB int) (RegradeResult, error) { + regrader, ok := b.dedicated.(Regrader) + if !ok { + return RegradeResult{Applied: false, SkipReason: "dedicated backend does not support redis regrade"}, nil + } + if !hasDedicatedProviderID(providerResourceID) { + // A shared-carve resource: no per-tenant maxmemory to set. Soft skip — + // never CONFIG SET maxmemory on the shared pod (that would cap EVERY + // tenant sharing it). + return RegradeResult{Applied: false, SkipReason: "shared-carve redis has no per-tenant maxmemory lever"}, nil + } + return regrader.Regrade(ctx, token, providerResourceID, targetMaxmemoryMB) +} diff --git a/internal/backend/redis/dispatch_test.go b/internal/backend/redis/dispatch_test.go new file mode 100644 index 0000000..747e8d7 --- /dev/null +++ b/internal/backend/redis/dispatch_test.go @@ -0,0 +1,364 @@ +package redis + +// dispatch_test.go — routing tests for TierDispatchBackend. +// +// These tests use lightweight recording stubs (no fake clientset needed — the +// dispatcher only delegates to two Backend interfaces) to assert that: +// - Provision routes Team → dedicated, every non-Team tier → shared carve. +// - Deprovision / StorageBytes route by provider_resource_id prefix. +// - Regrade forwards only dedicated-PRID resources to the dedicated Regrader, +// soft-skips shared-carve resources, and soft-skips when the dedicated +// backend has no Regrader. +// +// The Provision tier table is driven off the live plans registry (rule 18) so a +// new tier added to plans.yaml is automatically exercised — it cannot silently +// route to the wrong backend. + +import ( + "context" + "errors" + "testing" + + "instant.dev/common/plans" +) + +// route is which backend a stub represents, so tests can assert the call landed +// on the expected side without inspecting unexported dispatcher fields. +type route string + +const ( + routeShared route = "shared_carve" + routeDedicated route = "dedicated" +) + +// recordingBackend is a Backend stub that records the last call it received. +// It does NOT implement Regrader (regradeableBackend does) so the +// "dedicated has no Regrader" branch is reachable. +type recordingBackend struct { + route route + + provisionCalled bool + provisionTier string + deprovisionCalled bool + deprovisionPRID string + storageBytesCalled bool + storageBytesPRID string +} + +func (b *recordingBackend) Provision(_ context.Context, _, tier string) (*Credentials, error) { + b.provisionCalled = true + b.provisionTier = tier + // Stamp the route into ProviderResourceID so a caller can confirm which + // backend served the provision. + return &Credentials{ProviderResourceID: string(b.route)}, nil +} + +func (b *recordingBackend) Deprovision(_ context.Context, _, providerResourceID string) error { + b.deprovisionCalled = true + b.deprovisionPRID = providerResourceID + return nil +} + +func (b *recordingBackend) StorageBytes(_ context.Context, _, providerResourceID string) (int64, error) { + b.storageBytesCalled = true + b.storageBytesPRID = providerResourceID + return 0, nil +} + +// regradeableBackend is a recordingBackend that ALSO implements Regrader, so the +// dispatcher's Regrade-forwarding branch can be exercised. +type regradeableBackend struct { + recordingBackend + regradeCalled bool + regradePRID string + regradeMB int + regradeResult RegradeResult + regradeErr error +} + +func (b *regradeableBackend) Regrade(_ context.Context, _, providerResourceID string, targetMaxmemoryMB int) (RegradeResult, error) { + b.regradeCalled = true + b.regradePRID = providerResourceID + b.regradeMB = targetMaxmemoryMB + return b.regradeResult, b.regradeErr +} + +// ─── Provision routing ────────────────────────────────────────────────────── + +// TestDispatchProvision_RoutesByTier_RegistryDriven iterates every tier in the +// live plans registry and asserts the dispatcher sends Team to the dedicated +// backend and every other tier to the shared-carve backend. Registry-driven so +// a new plans.yaml tier is covered automatically (rule 18). +func TestDispatchProvision_RoutesByTier_RegistryDriven(t *testing.T) { + reg := plans.Default() + tiers := reg.All() + if len(tiers) == 0 { + t.Fatal("plans registry returned no tiers — cannot validate routing") + } + + sawTeam := false + sawNonTeam := false + for tier := range tiers { + t.Run(tier, func(t *testing.T) { + shared := &recordingBackend{route: routeShared} + dedicated := &recordingBackend{route: routeDedicated} + d := NewTierDispatchBackend(shared, dedicated) + + creds, err := d.Provision(context.Background(), "tok-"+tier, tier) + if err != nil { + t.Fatalf("Provision(%q) error: %v", tier, err) + } + + wantRoute := routeShared + if tier == dedicatedTier { + wantRoute = routeDedicated + } + + if creds.ProviderResourceID != string(wantRoute) { + t.Errorf("tier %q routed to %q, want %q", tier, creds.ProviderResourceID, wantRoute) + } + if wantRoute == routeDedicated { + if !dedicated.provisionCalled || shared.provisionCalled { + t.Errorf("tier %q: dedicated.called=%v shared.called=%v, want dedicated only", + tier, dedicated.provisionCalled, shared.provisionCalled) + } + sawTeam = true + } else { + if !shared.provisionCalled || dedicated.provisionCalled { + t.Errorf("tier %q: shared.called=%v dedicated.called=%v, want shared only", + tier, shared.provisionCalled, dedicated.provisionCalled) + } + sawNonTeam = true + } + }) + } + + // Guard against a registry that somehow contains only one class of tier: + // the test is only meaningful if it exercised BOTH routes. + if !sawTeam { + t.Errorf("no tier routed to the dedicated backend — expected %q to exist in the registry", dedicatedTier) + } + if !sawNonTeam { + t.Error("no tier routed to the shared-carve backend — registry must have non-Team tiers") + } +} + +// TestDispatchProvision_UnknownTier_RoutesShared asserts the fail-safe default: +// a tier the dispatcher does not recognise is treated as non-Team (shared +// carve), never silently handed a dedicated pod. +func TestDispatchProvision_UnknownTier_RoutesShared(t *testing.T) { + shared := &recordingBackend{route: routeShared} + dedicated := &recordingBackend{route: routeDedicated} + d := NewTierDispatchBackend(shared, dedicated) + + if _, err := d.Provision(context.Background(), "tok", "some-future-tier"); err != nil { + t.Fatalf("Provision error: %v", err) + } + if !shared.provisionCalled || dedicated.provisionCalled { + t.Errorf("unknown tier: shared.called=%v dedicated.called=%v, want shared only", + shared.provisionCalled, dedicated.provisionCalled) + } +} + +// ─── Deprovision routing (by PRID prefix) ─────────────────────────────────── + +func TestDispatchDeprovision_RoutesByPRID(t *testing.T) { + cases := []struct { + name string + prid string + want route + }{ + {"dedicated namespace PRID", dedicatedNamespacePrefix + "abc123", routeDedicated}, + {"empty PRID (live shared carve)", "", routeShared}, + {"pool-token marker PRID", "pooltok:pool-xyz", routeShared}, + {"local cluster PRID", "local:2", routeShared}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + shared := &recordingBackend{route: routeShared} + dedicated := &recordingBackend{route: routeDedicated} + d := NewTierDispatchBackend(shared, dedicated) + + if err := d.Deprovision(context.Background(), "tok", tc.prid); err != nil { + t.Fatalf("Deprovision error: %v", err) + } + if tc.want == routeDedicated { + if !dedicated.deprovisionCalled || shared.deprovisionCalled { + t.Errorf("PRID %q: dedicated=%v shared=%v, want dedicated only", + tc.prid, dedicated.deprovisionCalled, shared.deprovisionCalled) + } + } else { + if !shared.deprovisionCalled || dedicated.deprovisionCalled { + t.Errorf("PRID %q: shared=%v dedicated=%v, want shared only", + tc.prid, shared.deprovisionCalled, dedicated.deprovisionCalled) + } + } + }) + } +} + +// ─── StorageBytes routing (by PRID prefix) ────────────────────────────────── + +func TestDispatchStorageBytes_RoutesByPRID(t *testing.T) { + cases := []struct { + name string + prid string + want route + }{ + {"dedicated namespace PRID", dedicatedNamespacePrefix + "deadbeef", routeDedicated}, + {"empty PRID (live shared carve)", "", routeShared}, + {"pool-token marker PRID", "pooltok:pool-xyz", routeShared}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + shared := &recordingBackend{route: routeShared} + dedicated := &recordingBackend{route: routeDedicated} + d := NewTierDispatchBackend(shared, dedicated) + + if _, err := d.StorageBytes(context.Background(), "tok", tc.prid); err != nil { + t.Fatalf("StorageBytes error: %v", err) + } + if tc.want == routeDedicated { + if !dedicated.storageBytesCalled || shared.storageBytesCalled { + t.Errorf("PRID %q: dedicated=%v shared=%v, want dedicated only", + tc.prid, dedicated.storageBytesCalled, shared.storageBytesCalled) + } + } else { + if !shared.storageBytesCalled || dedicated.storageBytesCalled { + t.Errorf("PRID %q: shared=%v dedicated=%v, want shared only", + tc.prid, shared.storageBytesCalled, dedicated.storageBytesCalled) + } + } + }) + } +} + +// ─── Regrade routing ──────────────────────────────────────────────────────── + +// TestDispatchRegrade_DedicatedPRID_ForwardsToRegrader asserts a dedicated +// (k8s namespace) resource is forwarded to the dedicated backend's Regrader with +// the PRID and target untouched. +func TestDispatchRegrade_DedicatedPRID_ForwardsToRegrader(t *testing.T) { + shared := &recordingBackend{route: routeShared} + dedicated := ®radeableBackend{ + recordingBackend: recordingBackend{route: routeDedicated}, + regradeResult: RegradeResult{Applied: true, AppliedMaxmemory: 512 << 20}, + } + d := NewTierDispatchBackend(shared, dedicated) + + prid := dedicatedNamespacePrefix + "abc" + res, err := d.Regrade(context.Background(), "tok", prid, 512) + if err != nil { + t.Fatalf("Regrade error: %v", err) + } + if !dedicated.regradeCalled { + t.Fatal("dedicated Regrader was not called for a dedicated PRID") + } + if dedicated.regradePRID != prid { + t.Errorf("forwarded PRID = %q, want %q", dedicated.regradePRID, prid) + } + if dedicated.regradeMB != 512 { + t.Errorf("forwarded targetMaxmemoryMB = %d, want 512", dedicated.regradeMB) + } + if !res.Applied { + t.Errorf("Regrade result Applied = false, want true (dedicated result must pass through)") + } +} + +// TestDispatchRegrade_DedicatedPRID_PropagatesError asserts the dedicated +// Regrader's error is propagated verbatim (so the server can map it). +func TestDispatchRegrade_DedicatedPRID_PropagatesError(t *testing.T) { + wantErr := errors.New("boom") + dedicated := ®radeableBackend{ + recordingBackend: recordingBackend{route: routeDedicated}, + regradeErr: wantErr, + } + d := NewTierDispatchBackend(&recordingBackend{route: routeShared}, dedicated) + + _, err := d.Regrade(context.Background(), "tok", dedicatedNamespacePrefix+"x", 50) + if !errors.Is(err, wantErr) { + t.Errorf("Regrade error = %v, want %v", err, wantErr) + } +} + +// TestDispatchRegrade_SharedPRID_SoftSkip is the critical safety guard: a +// shared-carve resource must NEVER reach the dedicated Regrader (which would +// CONFIG SET maxmemory on the shared pod, capping every co-tenant). It returns +// a soft skip with no error and no Regrader call. +func TestDispatchRegrade_SharedPRID_SoftSkip(t *testing.T) { + dedicated := ®radeableBackend{recordingBackend: recordingBackend{route: routeDedicated}} + d := NewTierDispatchBackend(&recordingBackend{route: routeShared}, dedicated) + + for _, prid := range []string{"", "pooltok:pool-xyz", "local:1"} { + res, err := d.Regrade(context.Background(), "tok", prid, 50) + if err != nil { + t.Fatalf("Regrade(%q) error: %v (want soft skip, no error)", prid, err) + } + if res.Applied { + t.Errorf("Regrade(%q) Applied = true, want soft skip", prid) + } + if res.SkipReason == "" { + t.Errorf("Regrade(%q) SkipReason empty, want a reason", prid) + } + } + if dedicated.regradeCalled { + t.Fatal("shared-carve PRID reached the dedicated Regrader — would cap the shared pod for all tenants") + } +} + +// TestDispatchRegrade_DedicatedNoRegrader_SoftSkip asserts that when the +// dedicated backend does NOT implement Regrader, Regrade soft-skips gracefully +// (mirrors the server's existing type-assertion behaviour). +func TestDispatchRegrade_DedicatedNoRegrader_SoftSkip(t *testing.T) { + // recordingBackend does not implement Regrader. + dedicated := &recordingBackend{route: routeDedicated} + d := NewTierDispatchBackend(&recordingBackend{route: routeShared}, dedicated) + + res, err := d.Regrade(context.Background(), "tok", dedicatedNamespacePrefix+"x", 50) + if err != nil { + t.Fatalf("Regrade error: %v (want soft skip)", err) + } + if res.Applied || res.SkipReason == "" { + t.Errorf("Regrade with no Regrader = %+v, want Applied=false with a SkipReason", res) + } +} + +// ─── Interface conformance ────────────────────────────────────────────────── + +// TestDispatchImplementsRegrader documents that the dispatcher is itself a +// Regrader, so the server's `s.redisBackend.(redis.Regrader)` assertion in +// regradeRedis succeeds when tier-aware routing wraps the backend. +func TestDispatchImplementsRegrader(t *testing.T) { + var b Backend = NewTierDispatchBackend(&recordingBackend{}, &recordingBackend{}) + if _, ok := b.(Regrader); !ok { + t.Fatal("TierDispatchBackend does not implement redis.Regrader — server regradeRedis assertion would skip it") + } +} + +// TestNewSharedCarveBackend_IsLocalBackend documents that the shared-carve +// constructor returns a LocalBackend (ACL carve on a shared Redis) — the +// non-Team side of tier-aware routing. +func TestNewSharedCarveBackend_IsLocalBackend(t *testing.T) { + b := NewSharedCarveBackend("localhost:6379") + if _, ok := b.(*LocalBackend); !ok { + t.Fatalf("NewSharedCarveBackend returned %T, want *LocalBackend", b) + } + // A LocalBackend deliberately does NOT implement Regrader (no per-tenant + // maxmemory lever on a shared pod) — guard that invariant, since the + // dispatcher's Regrade safety relies on the shared side having no Regrader. + if _, ok := b.(Regrader); ok { + t.Error("shared-carve backend implements Regrader — a shared pod has no per-tenant maxmemory to regrade") + } +} + +// TestIsDedicatedRoutingTier documents the exact tier→route predicate. +func TestIsDedicatedRoutingTier(t *testing.T) { + if !isDedicatedRoutingTier(dedicatedTier) { + t.Errorf("isDedicatedRoutingTier(%q) = false, want true", dedicatedTier) + } + for _, tier := range []string{"anonymous", "free", "hobby", "hobby_plus", "pro", "growth", ""} { + if isDedicatedRoutingTier(tier) { + t.Errorf("isDedicatedRoutingTier(%q) = true, want false (non-Team must use shared carve)", tier) + } + } +} diff --git a/internal/config/config.go b/internal/config/config.go index 2566c9a..aebf8a0 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -15,11 +15,20 @@ type Config struct { NeonRegionID string // NEON_REGION_ID, default "aws-us-east-1" RedisProvisionBackend string // REDIS_PROVISION_BACKEND, default "local" RedisProvisionHost string // REDIS_PROVISION_HOST, default "localhost:6379" - MongoProvisionBackend string // MONGO_PROVISION_BACKEND, default "local" - MongoAdminURI string // MONGO_ADMIN_URI - MongoHost string // MONGO_HOST - QueueProvisionBackend string // QUEUE_PROVISION_BACKEND, default "local" - ProvisionerSecret string // PROVISIONER_SECRET — shared secret for auth interceptor + + // RedisTierAwareRoutingEnabled is the kill-switch for tier-aware Redis + // backend routing (REDIS_TIER_AWARE_ROUTING_ENABLED). DEFAULT FALSE / + // fail-closed: when false (or unset) the shared redisBackend serves every + // tier exactly as today. When true, the server wraps the shared backend in + // a redis.TierDispatchBackend that routes non-Team tiers to a shared ACL + // carve and the Team tier to a dedicated pod — the durable fix for the + // dedicated-pod-per-token capacity problem. Off until an operator opts in. + RedisTierAwareRoutingEnabled bool // REDIS_TIER_AWARE_ROUTING_ENABLED, default false + MongoProvisionBackend string // MONGO_PROVISION_BACKEND, default "local" + MongoAdminURI string // MONGO_ADMIN_URI + MongoHost string // MONGO_HOST + QueueProvisionBackend string // QUEUE_PROVISION_BACKEND, default "local" + ProvisionerSecret string // PROVISIONER_SECRET — shared secret for auth interceptor // Dedicated provisioning — Team tier. // When set, the provisioner uses these for Team-tier ProvisionResource calls. @@ -54,17 +63,17 @@ type Config struct { // - EKS (MVP): K8S_EXTERNAL_HOST={any-node-external-ip} // - EKS (production): deploy a TCP proxy (Envoy/PgBouncer) behind one NLB, // set K8S_EXTERNAL_HOST to the NLB DNS. See docs/ops-k8s-dedicated.md. - K8sDedicatedBackend bool // K8S_DEDICATED_BACKEND — enable k8s backend for Team tier - K8sKubeconfig string // K8S_KUBECONFIG — path to kubeconfig; empty = in-cluster - K8sExternalHost string // K8S_EXTERNAL_HOST — hostname in returned connection URLs - K8sStorageClass string // K8S_STORAGE_CLASS — "gp3" (EKS) or "local-path" (dev); default "gp3" - K8sPostgresImage string // K8S_POSTGRES_IMAGE — default "postgres:16" - K8sRedisImage string // K8S_REDIS_IMAGE — default "redis:7-alpine" - K8sMongoImage string // K8S_MONGO_IMAGE — default "mongo:7" - K8sPostgresStorageGi int // K8S_POSTGRES_STORAGE_GI — PVC size in GiB, default 50 - K8sRedisStorageGi int // K8S_REDIS_STORAGE_GI — PVC size in GiB, default 10 - K8sMongoStorageGi int // K8S_MONGO_STORAGE_GI — PVC size in GiB, default 50 - K8sNatsImage string // K8S_NATS_IMAGE — default "nats:2.10-alpine" + K8sDedicatedBackend bool // K8S_DEDICATED_BACKEND — enable k8s backend for Team tier + K8sKubeconfig string // K8S_KUBECONFIG — path to kubeconfig; empty = in-cluster + K8sExternalHost string // K8S_EXTERNAL_HOST — hostname in returned connection URLs + K8sStorageClass string // K8S_STORAGE_CLASS — "gp3" (EKS) or "local-path" (dev); default "gp3" + K8sPostgresImage string // K8S_POSTGRES_IMAGE — default "postgres:16" + K8sRedisImage string // K8S_REDIS_IMAGE — default "redis:7-alpine" + K8sMongoImage string // K8S_MONGO_IMAGE — default "mongo:7" + K8sPostgresStorageGi int // K8S_POSTGRES_STORAGE_GI — PVC size in GiB, default 50 + K8sRedisStorageGi int // K8S_REDIS_STORAGE_GI — PVC size in GiB, default 10 + K8sMongoStorageGi int // K8S_MONGO_STORAGE_GI — PVC size in GiB, default 50 + K8sNatsImage string // K8S_NATS_IMAGE — default "nats:2.10-alpine" // NATSHost is the hostname (no port) of the shared NATS server. // Used by the local queue backend for shared-tier provisioning. @@ -106,42 +115,46 @@ func Load() *Config { NeonRegionID: getenv("NEON_REGION_ID", "aws-us-east-1"), RedisProvisionBackend: getenv("REDIS_PROVISION_BACKEND", "local"), RedisProvisionHost: getenv("REDIS_PROVISION_HOST", "localhost:6379"), - MongoProvisionBackend: getenv("MONGO_PROVISION_BACKEND", "local"), - MongoAdminURI: getenv("MONGO_ADMIN_URI", "mongodb://root:root@localhost:27017"), - MongoHost: getenv("MONGO_HOST", "localhost:27017"), - QueueProvisionBackend: getenv("QUEUE_PROVISION_BACKEND", "local"), - ProvisionerSecret: os.Getenv("PROVISIONER_SECRET"), - DedicatedPostgresDSN: os.Getenv("DEDICATED_POSTGRES_DSN"), - DedicatedRedisURL: os.Getenv("DEDICATED_REDIS_URL"), - UpstashAPIKey: os.Getenv("UPSTASH_API_KEY"), - ProvisionerDatabaseURL: os.Getenv("PROVISIONER_DATABASE_URL"), - AESKey: os.Getenv("AES_KEY"), + // Default false: tier-aware routing is opt-in. Any value other than the + // exact string "true" leaves it off (fail-closed) — same pattern as + // K8sDedicatedBackend. + RedisTierAwareRoutingEnabled: os.Getenv("REDIS_TIER_AWARE_ROUTING_ENABLED") == "true", + MongoProvisionBackend: getenv("MONGO_PROVISION_BACKEND", "local"), + MongoAdminURI: getenv("MONGO_ADMIN_URI", "mongodb://root:root@localhost:27017"), + MongoHost: getenv("MONGO_HOST", "localhost:27017"), + QueueProvisionBackend: getenv("QUEUE_PROVISION_BACKEND", "local"), + ProvisionerSecret: os.Getenv("PROVISIONER_SECRET"), + DedicatedPostgresDSN: os.Getenv("DEDICATED_POSTGRES_DSN"), + DedicatedRedisURL: os.Getenv("DEDICATED_REDIS_URL"), + UpstashAPIKey: os.Getenv("UPSTASH_API_KEY"), + ProvisionerDatabaseURL: os.Getenv("PROVISIONER_DATABASE_URL"), + AESKey: os.Getenv("AES_KEY"), // Pool targets default to 10 (was 2/3/2/2). A drained pool now refills // concurrently (see pool.maxRefillConcurrency), so a deeper warm buffer // no longer costs a proportionally longer serial refill — and 10 ready // items absorbs the load test's concurrency-8 burst entirely on the // fast pool-claim path instead of falling through to slow live // provisioning. Override per-type via the POOL_*_SIZE env vars. - PoolPostgresSize: getenvInt("POOL_POSTGRES_SIZE", 10), - PoolRedisSize: getenvInt("POOL_REDIS_SIZE", 10), - PoolMongoSize: getenvInt("POOL_MONGO_SIZE", 10), - PoolQueueSize: getenvInt("POOL_QUEUE_SIZE", 10), - K8sDedicatedBackend: os.Getenv("K8S_DEDICATED_BACKEND") == "true", - K8sKubeconfig: os.Getenv("K8S_KUBECONFIG"), - K8sExternalHost: os.Getenv("K8S_EXTERNAL_HOST"), - K8sStorageClass: getenv("K8S_STORAGE_CLASS", "gp3"), - K8sPostgresImage: getenv("K8S_POSTGRES_IMAGE", "postgres:16"), - K8sRedisImage: getenv("K8S_REDIS_IMAGE", "redis:7-alpine"), - K8sMongoImage: getenv("K8S_MONGO_IMAGE", "mongo:7"), - K8sNatsImage: getenv("K8S_NATS_IMAGE", "nats:2.10-alpine"), - K8sPostgresStorageGi: getenvInt("K8S_POSTGRES_STORAGE_GI", 50), - K8sRedisStorageGi: getenvInt("K8S_REDIS_STORAGE_GI", 10), - K8sMongoStorageGi: getenvInt("K8S_MONGO_STORAGE_GI", 50), - NATSHost: getenv("NATS_HOST", "localhost"), - MinioEndpoint: os.Getenv("MINIO_ENDPOINT"), - MinioRootUser: os.Getenv("MINIO_ROOT_USER"), - MinioRootPassword: os.Getenv("MINIO_ROOT_PASSWORD"), - MinioBucketName: getenv("MINIO_BUCKET_NAME", "instant-shared"), + PoolPostgresSize: getenvInt("POOL_POSTGRES_SIZE", 10), + PoolRedisSize: getenvInt("POOL_REDIS_SIZE", 10), + PoolMongoSize: getenvInt("POOL_MONGO_SIZE", 10), + PoolQueueSize: getenvInt("POOL_QUEUE_SIZE", 10), + K8sDedicatedBackend: os.Getenv("K8S_DEDICATED_BACKEND") == "true", + K8sKubeconfig: os.Getenv("K8S_KUBECONFIG"), + K8sExternalHost: os.Getenv("K8S_EXTERNAL_HOST"), + K8sStorageClass: getenv("K8S_STORAGE_CLASS", "gp3"), + K8sPostgresImage: getenv("K8S_POSTGRES_IMAGE", "postgres:16"), + K8sRedisImage: getenv("K8S_REDIS_IMAGE", "redis:7-alpine"), + K8sMongoImage: getenv("K8S_MONGO_IMAGE", "mongo:7"), + K8sNatsImage: getenv("K8S_NATS_IMAGE", "nats:2.10-alpine"), + K8sPostgresStorageGi: getenvInt("K8S_POSTGRES_STORAGE_GI", 50), + K8sRedisStorageGi: getenvInt("K8S_REDIS_STORAGE_GI", 10), + K8sMongoStorageGi: getenvInt("K8S_MONGO_STORAGE_GI", 50), + NATSHost: getenv("NATS_HOST", "localhost"), + MinioEndpoint: os.Getenv("MINIO_ENDPOINT"), + MinioRootUser: os.Getenv("MINIO_ROOT_USER"), + MinioRootPassword: os.Getenv("MINIO_ROOT_PASSWORD"), + MinioBucketName: getenv("MINIO_BUCKET_NAME", "instant-shared"), } logStartupConfig(cfg) @@ -157,6 +170,7 @@ func logStartupConfig(cfg *Config) { "neon_api_key_set", cfg.NeonAPIKey != "", "redis_provision_backend", cfg.RedisProvisionBackend, "redis_provision_host", cfg.RedisProvisionHost, + "redis_tier_aware_routing_enabled", cfg.RedisTierAwareRoutingEnabled, "mongo_admin_uri_set", cfg.MongoAdminURI != "", "mongo_host", cfg.MongoHost, "queue_provision_backend", cfg.QueueProvisionBackend, diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 0e0e062..03b7573 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -23,6 +23,7 @@ var allConfigEnvKeys = []string{ "NEON_REGION_ID", "REDIS_PROVISION_BACKEND", "REDIS_PROVISION_HOST", + "REDIS_TIER_AWARE_ROUTING_ENABLED", "MONGO_PROVISION_BACKEND", "MONGO_ADMIN_URI", "MONGO_HOST", @@ -192,6 +193,12 @@ func TestLoad_Defaults(t *testing.T) { if cfg.K8sDedicatedBackend { t.Errorf("K8sDedicatedBackend default = true; want false") } + + // Tier-aware Redis routing is opt-in: the default MUST be false so a deploy + // that does not set the env var is a no-op (behaviour identical to today). + if cfg.RedisTierAwareRoutingEnabled { + t.Errorf("RedisTierAwareRoutingEnabled default = true; want false (must be opt-in)") + } } // ── Load: override arm ─────────────────────────────────────────────────────── @@ -207,6 +214,7 @@ func TestLoad_Overrides(t *testing.T) { t.Setenv("NEON_REGION_ID", "aws-eu-west-1") t.Setenv("REDIS_PROVISION_BACKEND", "upstash") t.Setenv("REDIS_PROVISION_HOST", "redis.example:6380") + t.Setenv("REDIS_TIER_AWARE_ROUTING_ENABLED", "true") t.Setenv("MONGO_PROVISION_BACKEND", "k8s") t.Setenv("MONGO_ADMIN_URI", "mongodb://admin:pw@m:27017") t.Setenv("MONGO_HOST", "m.example:27017") @@ -304,6 +312,10 @@ func TestLoad_Overrides(t *testing.T) { if !cfg.K8sDedicatedBackend { t.Errorf("K8sDedicatedBackend with =true; want true") } + + if !cfg.RedisTierAwareRoutingEnabled { + t.Errorf("RedisTierAwareRoutingEnabled with =true; want true") + } } // TestLoad_K8sDedicatedBackend_NonTrueIsFalse asserts only the exact string @@ -318,6 +330,19 @@ func TestLoad_K8sDedicatedBackend_NonTrueIsFalse(t *testing.T) { } } +// TestLoad_RedisTierAwareRouting_NonTrueIsFalse asserts only the exact string +// "true" enables tier-aware routing — any other value leaves it off +// (fail-closed), so a typo'd env value can never silently change routing. +func TestLoad_RedisTierAwareRouting_NonTrueIsFalse(t *testing.T) { + clearEnv(t) + for _, v := range []string{"1", "TRUE", "yes", "on", "false", "True", "enabled"} { + t.Setenv("REDIS_TIER_AWARE_ROUTING_ENABLED", v) + if Load().RedisTierAwareRoutingEnabled { + t.Errorf("REDIS_TIER_AWARE_ROUTING_ENABLED=%q → true; want false (only exact \"true\")", v) + } + } +} + // TestLogStartupConfig_DoesNotPanic exercises logStartupConfig directly with a // populated Config so every slog field reference is covered. Load() already // calls it on the default + override configs above, but this guards the diff --git a/internal/server/export_test.go b/internal/server/export_test.go index 39d0ade..9daf95f 100644 --- a/internal/server/export_test.go +++ b/internal/server/export_test.go @@ -1,6 +1,12 @@ package server -import "instant.dev/common/plans" +import ( + "fmt" + + "instant.dev/common/plans" + + "instant.dev/provisioner/internal/backend/redis" +) // SwapRegradeConnLimits temporarily replaces the package-level plans registry // used by RegradeResource and returns a restore func. It exists so external @@ -13,3 +19,18 @@ func SwapRegradeConnLimits(r *plans.Registry) (restore func()) { regradeConnLimits = r return func() { regradeConnLimits = prev } } + +// RedisBackendIsTierDispatch reports whether the server's shared redis backend +// is wrapped in a tier-aware dispatcher. It lets external tests assert the +// REDIS_TIER_AWARE_ROUTING_ENABLED flag wiring in New() without exporting the +// unexported redisBackend field in production code. +func RedisBackendIsTierDispatch(s *Server) bool { + _, ok := s.redisBackend.(*redis.TierDispatchBackend) + return ok +} + +// RedisBackendTypeName returns the concrete type of the server's shared redis +// backend, for diagnostic assertions in tests. +func RedisBackendTypeName(s *Server) string { + return fmt.Sprintf("%T", s.redisBackend) +} diff --git a/internal/server/server.go b/internal/server/server.go index 47e603e..650820e 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -151,10 +151,29 @@ func New(cfg *config.Config, poolMgr *pool.Manager) *Server { } } + // Redis shared backend. By default this is exactly the backend selected by + // REDIS_PROVISION_BACKEND, used for every non-dedicated tier — unchanged. + // + // REDIS_TIER_AWARE_ROUTING_ENABLED (default OFF) opts into tier-aware + // routing: the configured backend is wrapped in a redis.TierDispatchBackend + // that keeps Team on a dedicated pod (the configured backend, typically k8s) + // but moves every non-Team tier onto a shared ACL carve so a 5MB cache no + // longer costs a whole pod. Off = identical to today; this branch never runs + // in prod until an operator flips the flag. + redisBackend := redis.NewBackend(cfg.RedisProvisionBackend, cfg.RedisProvisionHost) + if cfg.RedisTierAwareRoutingEnabled { + sharedCarve := redis.NewSharedCarveBackend(cfg.RedisProvisionHost) + redisBackend = redis.NewTierDispatchBackend(sharedCarve, redisBackend) + slog.Info("provisioner: redis tier-aware routing ENABLED", + "configured_backend", cfg.RedisProvisionBackend, + "non_team_route", "shared_carve", + "team_route", "dedicated") + } + return NewWithBackends( cfg, postgres.NewBackend(cfg.PostgresProvisionBackend, cfg.PostgresCustomersURL, cfg.PostgresClusterURLs, cfg.NeonAPIKey, cfg.NeonRegionID), - redis.NewBackend(cfg.RedisProvisionBackend, cfg.RedisProvisionHost), + redisBackend, mongo.NewBackend(cfg.MongoProvisionBackend, cfg.MongoAdminURI, cfg.MongoHost), queue.NewBackend(cfg.QueueProvisionBackend, cfg.NATSHost), minioBackend, diff --git a/internal/server/server_redis_tier_routing_test.go b/internal/server/server_redis_tier_routing_test.go new file mode 100644 index 0000000..814f5aa --- /dev/null +++ b/internal/server/server_redis_tier_routing_test.go @@ -0,0 +1,78 @@ +package server_test + +// server_redis_tier_routing_test.go — wiring tests for the +// REDIS_TIER_AWARE_ROUTING_ENABLED flag in server.New. +// +// These tests prove the NO-OP guarantee: with the flag OFF (the default), the +// server's shared redis backend is exactly the backend selected by +// REDIS_PROVISION_BACKEND — NOT wrapped in a dispatcher — so every tier +// (including team) flows through the configured backend exactly as before. With +// the flag ON, the backend is wrapped in a redis.TierDispatchBackend. +// +// The dispatcher's per-tier routing itself is covered by the pure unit tests in +// internal/backend/redis/dispatch_test.go; here we only assert the flag controls +// whether that dispatcher is installed at all. + +import ( + "testing" + + "instant.dev/provisioner/internal/config" + "instant.dev/provisioner/internal/server" +) + +// baseRoutingCfg returns a config whose backends never dial anything, so +// server.New can be exercised offline. +func baseRoutingCfg() *config.Config { + return &config.Config{ + PostgresProvisionBackend: "local", + RedisProvisionBackend: "local", + MongoProvisionBackend: "local", + QueueProvisionBackend: "local", + } +} + +// TestNew_TierRoutingDisabled_BackendUnwrapped is the no-op proof: when the flag +// is unset/false the shared redis backend is NOT a TierDispatchBackend — it is +// the configured backend verbatim, so behaviour is identical to today. +func TestNew_TierRoutingDisabled_BackendUnwrapped(t *testing.T) { + cfg := baseRoutingCfg() + cfg.RedisTierAwareRoutingEnabled = false // explicit; this is also the default + + srv := server.New(cfg, nil) + if srv == nil { + t.Fatal("server.New returned nil") + } + if server.RedisBackendIsTierDispatch(srv) { + t.Fatalf("flag OFF but redis backend is a TierDispatchBackend (%s) — must be the configured backend, unwrapped", + server.RedisBackendTypeName(srv)) + } +} + +// TestNew_TierRoutingDefaultIsDisabled guards the default: a zero-value config +// (flag unset) must NOT wrap the backend. This is the "safe to merge, no-op in +// prod" guarantee — a deploy that does not set the env var changes nothing. +func TestNew_TierRoutingDefaultIsDisabled(t *testing.T) { + cfg := baseRoutingCfg() // RedisTierAwareRoutingEnabled left at its zero value (false) + + srv := server.New(cfg, nil) + if server.RedisBackendIsTierDispatch(srv) { + t.Fatalf("default config wrapped the redis backend (%s) — tier-aware routing must be opt-in", + server.RedisBackendTypeName(srv)) + } +} + +// TestNew_TierRoutingEnabled_BackendWrapped asserts the flag installs the +// dispatcher. +func TestNew_TierRoutingEnabled_BackendWrapped(t *testing.T) { + cfg := baseRoutingCfg() + cfg.RedisTierAwareRoutingEnabled = true + + srv := server.New(cfg, nil) + if srv == nil { + t.Fatal("server.New returned nil") + } + if !server.RedisBackendIsTierDispatch(srv) { + t.Fatalf("flag ON but redis backend is %s — expected a TierDispatchBackend", + server.RedisBackendTypeName(srv)) + } +}