diff --git a/internal/backend/mongo/k8s.go b/internal/backend/mongo/k8s.go index e1abf42..451b66a 100644 --- a/internal/backend/mongo/k8s.go +++ b/internal/backend/mongo/k8s.go @@ -307,15 +307,30 @@ func (b *K8sBackend) Provision(ctx context.Context, token, tier string) (*Creden rollback := func(step string, cause error) error { slog.Error("k8s.mongo.provision.rollback", "step", step, "namespace", ns, "error", cause) - _ = b.cs.CoreV1().Namespaces().Delete(context.Background(), ns, metav1.DeleteOptions{}) + // Cleanup uses a FRESH background ctx with its own bound: the incoming ctx + // may already be cancelled (often WHY we are rolling back), but the + // namespace teardown must still run so a failed provision does not leak a + // half-built namespace. Bounded so a wedged apiserver can't pin the goroutine. + delCtx, delCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer delCancel() + _ = b.cs.CoreV1().Namespaces().Delete(delCtx, ns, metav1.DeleteOptions{}) return fmt.Errorf("k8s mongo: %s: %w", step, cause) } - // Use a fresh background context — pod startup can take minutes, far exceeding - // any gRPC request deadline on the incoming ctx. + // Bound the provisioning sequence by BOTH the caller's deadline/cancellation + // and a hard 5m server-side ceiling (min of the two). Deriving from the + // incoming gRPC ctx — NOT context.Background() — is the pro-provision-hang fix + // (mirrors redis/k8s.go provisionContext, #52): when the api caller's deadline + // fires or it cancels the RPC, every k8s call / waitPodReady poll returns + // promptly instead of blocking up to 5m on a wedged PVC/CSI attach. The api + // grants a generous provision deadline (provisionTimeout: 4m anon / 5m pro), + // so legitimate 30-90s pod startup is unaffected — only pathological hangs and + // early cancellations now fast-fail (mapError classifies the ctx error as a + // retryable gRPC status → api soft-deletes + 503s). The ceiling still backstops + // a caller that passes no deadline at all. // Carry the teamID value forward so applyNamespace can label the namespace // with instant.dev/owner-team (pentest 2026-05-16 fix). - provCtx, provCancel := context.WithTimeout(context.Background(), 5*time.Minute) + provCtx, provCancel := context.WithTimeout(ctx, 5*time.Minute) defer provCancel() if teamID, ok := ctx.Value(ctxkeys.TeamIDKey).(string); ok && teamID != "" { provCtx = context.WithValue(provCtx, ctxkeys.TeamIDKey, teamID) diff --git a/internal/backend/mongo/k8s_caller_deadline_test.go b/internal/backend/mongo/k8s_caller_deadline_test.go new file mode 100644 index 0000000..b05094a --- /dev/null +++ b/internal/backend/mongo/k8s_caller_deadline_test.go @@ -0,0 +1,44 @@ +package mongo + +// k8s_caller_deadline_test.go — regression guard for the pro-provision-hang bug +// class (mirrors redis #52, applied to postgres/mongo/queue on 2026-06-08). +// See the postgres sibling file for the full rationale: provCtx now derives from +// the incoming ctx (with a 5m ceiling), so a stalled provision fast-fails on the +// caller's deadline instead of blocking up to 5m on a background clock. + +import ( + "context" + "errors" + "testing" + "time" + + "k8s.io/client-go/kubernetes/fake" +) + +// TestProvision_HonoursCallerDeadline: a Provision whose pod never becomes Ready +// must return promptly, bounded by the caller's deadline — NOT block for +// mongoK8sReadyTO on a background clock. +func TestProvision_HonoursCallerDeadline(t *testing.T) { + cs := fake.NewClientset() // empty cluster: the mongo pod never becomes Ready + b := &K8sBackend{cs: cs, storageClass: "standard", image: "mongo:7"} + + ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond) + defer cancel() + + start := time.Now() + _, err := b.Provision(ctx, "pro-token", "pro") + elapsed := time.Since(start) + + if err == nil { + t.Fatal("Provision should fail when the mongo pod never becomes Ready; got nil error") + } + if elapsed > 30*time.Second { + t.Fatalf("PROVISION-HANG REGRESSION: Provision took %s; it must honour the caller's "+ + "~300ms deadline and fast-fail, not block for mongoK8sReadyTO. This means provCtx "+ + "no longer derives from the caller's ctx.", elapsed) + } + if !errors.Is(err, context.DeadlineExceeded) { + t.Errorf("Provision error should wrap context.DeadlineExceeded (got %v) so the shared "+ + "server.mapError surfaces a retryable gRPC status (api soft-deletes + 503s)", err) + } +} diff --git a/internal/backend/postgres/k8s.go b/internal/backend/postgres/k8s.go index 38c2c33..093eb87 100644 --- a/internal/backend/postgres/k8s.go +++ b/internal/backend/postgres/k8s.go @@ -244,16 +244,30 @@ func (b *K8sBackend) Provision(ctx context.Context, token, tier string, connLimi rollback := func(step string, cause error) error { slog.Error("k8s.postgres.provision.rollback", "step", step, "namespace", ns, "error", cause) - _ = b.cs.CoreV1().Namespaces().Delete(context.Background(), ns, metav1.DeleteOptions{}) + // Cleanup uses a FRESH background ctx with its own bound: the incoming ctx + // may already be cancelled (often WHY we are rolling back), but the + // namespace teardown must still run so a failed provision does not leak a + // half-built namespace. Bounded so a wedged apiserver can't pin the goroutine. + delCtx, delCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer delCancel() + _ = b.cs.CoreV1().Namespaces().Delete(delCtx, ns, metav1.DeleteOptions{}) return fmt.Errorf("k8s postgres: %s: %w", step, cause) } - // Use a fresh background context for the provisioning sequence. - // The gRPC request context (ctx) has a short deadline that would cancel - // waitPodReady, which can legitimately take 1–3 minutes for pod startup. + // Bound the provisioning sequence by BOTH the caller's deadline/cancellation + // and a hard 5m server-side ceiling (min of the two). Deriving from the + // incoming gRPC ctx — NOT context.Background() — is the pro-provision-hang fix + // (mirrors redis/k8s.go provisionContext, #52): when the api caller's deadline + // fires or it cancels the RPC, every k8s call / waitPodReady poll returns + // promptly instead of blocking up to 5m on a wedged PVC/CSI attach. The api + // grants a generous provision deadline (provisionTimeout: 4m anon / 5m pro), + // so legitimate 30-90s pod startup is unaffected — only pathological hangs and + // early cancellations now fast-fail (mapError classifies the ctx error as a + // retryable gRPC status → api soft-deletes + 503s). The ceiling still backstops + // a caller that passes no deadline at all. // Carry the teamID value forward so applyNamespace can label the namespace // with instant.dev/owner-team (pentest 2026-05-16 fix). - provCtx, provCancel := context.WithTimeout(context.Background(), 5*time.Minute) + provCtx, provCancel := context.WithTimeout(ctx, 5*time.Minute) defer provCancel() if teamID, ok := ctx.Value(ctxkeys.TeamIDKey).(string); ok && teamID != "" { provCtx = context.WithValue(provCtx, ctxkeys.TeamIDKey, teamID) diff --git a/internal/backend/postgres/k8s_caller_deadline_test.go b/internal/backend/postgres/k8s_caller_deadline_test.go new file mode 100644 index 0000000..9a0799c --- /dev/null +++ b/internal/backend/postgres/k8s_caller_deadline_test.go @@ -0,0 +1,49 @@ +package postgres + +// k8s_caller_deadline_test.go — regression guard for the pro-provision-hang bug +// class (mirrors redis #52, applied to postgres/mongo/queue on 2026-06-08). +// +// Before the fix the provisioning context derived from context.Background(), so +// when the api caller's gRPC deadline fired (or it cancelled the RPC) the +// provisioner kept blocking up to 5m on a wedged PVC/CSI attach and the api +// handler hung. The fix derives provCtx from the incoming ctx (with a 5m +// ceiling backstop), so a stalled provision fast-fails bounded by the caller's +// deadline and the shared mapError maps the ctx error to a retryable gRPC status. + +import ( + "context" + "errors" + "testing" + "time" + + "k8s.io/client-go/kubernetes/fake" +) + +// TestProvision_HonoursCallerDeadline: a Provision whose pod never becomes Ready +// must return promptly, bounded by the caller's deadline — NOT block for +// k8sReadyTimeout on a background clock. +func TestProvision_HonoursCallerDeadline(t *testing.T) { + cs := fake.NewClientset() // empty cluster: the postgres pod never becomes Ready + b := &K8sBackend{cs: cs, storageClass: "standard", image: "postgres:16"} + + // Caller deadline far shorter than k8sReadyTimeout and the 5m ceiling. + ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond) + defer cancel() + + start := time.Now() + _, err := b.Provision(ctx, "pro-token", "pro", 8) + elapsed := time.Since(start) + + if err == nil { + t.Fatal("Provision should fail when the postgres pod never becomes Ready; got nil error") + } + if elapsed > 30*time.Second { + t.Fatalf("PROVISION-HANG REGRESSION: Provision took %s; it must honour the caller's "+ + "~300ms deadline and fast-fail, not block for k8sReadyTimeout. This means provCtx "+ + "no longer derives from the caller's ctx.", elapsed) + } + if !errors.Is(err, context.DeadlineExceeded) { + t.Errorf("Provision error should wrap context.DeadlineExceeded (got %v) so the shared "+ + "server.mapError surfaces a retryable gRPC status (api soft-deletes + 503s)", err) + } +} diff --git a/internal/backend/queue/k8s.go b/internal/backend/queue/k8s.go index d30e5e7..ba3a336 100644 --- a/internal/backend/queue/k8s.go +++ b/internal/backend/queue/k8s.go @@ -208,15 +208,30 @@ func (b *K8sBackend) Provision(ctx context.Context, token, tier string) (*Creden rollback := func(step string, cause error) error { slog.Error("k8s.nats.provision.rollback", "step", step, "namespace", ns, "error", cause) - _ = b.cs.CoreV1().Namespaces().Delete(context.Background(), ns, metav1.DeleteOptions{}) + // Cleanup uses a FRESH background ctx with its own bound: the incoming ctx + // may already be cancelled (often WHY we are rolling back), but the + // namespace teardown must still run so a failed provision does not leak a + // half-built namespace. Bounded so a wedged apiserver can't pin the goroutine. + delCtx, delCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer delCancel() + _ = b.cs.CoreV1().Namespaces().Delete(delCtx, ns, metav1.DeleteOptions{}) return fmt.Errorf("k8s nats: %s: %w", step, cause) } - // Use a fresh background context — pod startup can take minutes, far exceeding - // any gRPC request deadline on the incoming ctx. + // Bound the provisioning sequence by BOTH the caller's deadline/cancellation + // and a hard 5m server-side ceiling (min of the two). Deriving from the + // incoming gRPC ctx — NOT context.Background() — is the pro-provision-hang fix + // (mirrors redis/k8s.go provisionContext, #52): when the api caller's deadline + // fires or it cancels the RPC, every k8s call / waitPodReady poll returns + // promptly instead of blocking up to 5m on a wedged PVC/CSI attach. The api + // grants a generous provision deadline (provisionTimeout: 4m anon / 5m pro), + // so legitimate 30-90s pod startup is unaffected — only pathological hangs and + // early cancellations now fast-fail (mapError classifies the ctx error as a + // retryable gRPC status → api soft-deletes + 503s). The ceiling still backstops + // a caller that passes no deadline at all. // Carry the teamID value forward so applyNamespace can label the namespace // with instant.dev/owner-team (pentest 2026-05-16 fix). - provCtx, provCancel := context.WithTimeout(context.Background(), 5*time.Minute) + provCtx, provCancel := context.WithTimeout(ctx, 5*time.Minute) defer provCancel() if teamID, ok := ctx.Value(ctxkeys.TeamIDKey).(string); ok && teamID != "" { provCtx = context.WithValue(provCtx, ctxkeys.TeamIDKey, teamID) diff --git a/internal/backend/queue/k8s_caller_deadline_test.go b/internal/backend/queue/k8s_caller_deadline_test.go new file mode 100644 index 0000000..ed8804b --- /dev/null +++ b/internal/backend/queue/k8s_caller_deadline_test.go @@ -0,0 +1,44 @@ +package queue + +// k8s_caller_deadline_test.go — regression guard for the pro-provision-hang bug +// class (mirrors redis #52, applied to postgres/mongo/queue on 2026-06-08). +// See the postgres sibling file for the full rationale: provCtx now derives from +// the incoming ctx (with a 5m ceiling), so a stalled provision fast-fails on the +// caller's deadline instead of blocking up to 5m on a background clock. + +import ( + "context" + "errors" + "testing" + "time" + + "k8s.io/client-go/kubernetes/fake" +) + +// TestProvision_HonoursCallerDeadline: a Provision whose pod never becomes Ready +// must return promptly, bounded by the caller's deadline — NOT block for +// natsK8sReadyTO on a background clock. +func TestProvision_HonoursCallerDeadline(t *testing.T) { + cs := fake.NewClientset() // empty cluster: the nats pod never becomes Ready + b := &K8sBackend{cs: cs, storageClass: "standard", image: "nats:2.10-alpine"} + + ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond) + defer cancel() + + start := time.Now() + _, err := b.Provision(ctx, "pro-token", "pro") + elapsed := time.Since(start) + + if err == nil { + t.Fatal("Provision should fail when the nats pod never becomes Ready; got nil error") + } + if elapsed > 30*time.Second { + t.Fatalf("PROVISION-HANG REGRESSION: Provision took %s; it must honour the caller's "+ + "~300ms deadline and fast-fail, not block for natsK8sReadyTO. This means provCtx "+ + "no longer derives from the caller's ctx.", elapsed) + } + if !errors.Is(err, context.DeadlineExceeded) { + t.Errorf("Provision error should wrap context.DeadlineExceeded (got %v) so the shared "+ + "server.mapError surfaces a retryable gRPC status (api soft-deletes + 503s)", err) + } +}