InstaNode-dev · mastermanas805 · Jun 8, 2026 · Jun 8, 2026
diff --git a/internal/backend/mongo/k8s.go b/internal/backend/mongo/k8s.go
@@ -307,15 +307,30 @@ func (b *K8sBackend) Provision(ctx context.Context, token, tier string) (*Creden
 
 	rollback := func(step string, cause error) error {
 		slog.Error("k8s.mongo.provision.rollback", "step", step, "namespace", ns, "error", cause)
-		_ = b.cs.CoreV1().Namespaces().Delete(context.Background(), ns, metav1.DeleteOptions{})
+		// Fresh ctx: the incoming one may already be cancelled (often WHY we roll back).
+		// 30s cap so a wedged apiserver can't pin us; log failures so leaks are traceable.
+		delCtx, delCancel := context.WithTimeout(context.Background(), 30*time.Second)
+		defer delCancel()
+		if delErr := b.cs.CoreV1().Namespaces().Delete(delCtx, ns, metav1.DeleteOptions{}); delErr != nil {
+			slog.Warn("k8s.mongo.provision.rollback.cleanup_failed", "namespace", ns, "error", delErr)
+		}
 		return fmt.Errorf("k8s mongo: %s: %w", step, cause)
 	}
 
-	// Use a fresh background context — pod startup can take minutes, far exceeding
-	// any gRPC request deadline on the incoming ctx.
+	// Bound the provisioning sequence by BOTH the caller's deadline/cancellation
+	// and a hard 5m server-side ceiling (min of the two). Deriving from the
+	// incoming gRPC ctx — NOT context.Background() — is the pro-provision-hang fix
+	// (mirrors redis/k8s.go provisionContext, #52): when the api caller's deadline
+	// fires or it cancels the RPC, every k8s call / waitPodReady poll returns
+	// promptly instead of blocking up to 5m on a wedged PVC/CSI attach. The api
+	// grants a generous provision deadline (provisionTimeout: 4m anon / 5m pro),
+	// so legitimate 30-90s pod startup is unaffected — only pathological hangs and
+	// early cancellations now fast-fail (mapError classifies the ctx error as a
+	// retryable gRPC status → api soft-deletes + 503s). The ceiling still backstops
+	// a caller that passes no deadline at all.
 	// Carry the teamID value forward so applyNamespace can label the namespace
 	// with instant.dev/owner-team (pentest 2026-05-16 fix).
-	provCtx, provCancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	provCtx, provCancel := context.WithTimeout(ctx, 5*time.Minute)
 	defer provCancel()
 	if teamID, ok := ctx.Value(ctxkeys.TeamIDKey).(string); ok && teamID != "" {
 		provCtx = context.WithValue(provCtx, ctxkeys.TeamIDKey, teamID)

diff --git a/internal/backend/mongo/k8s_caller_deadline_test.go b/internal/backend/mongo/k8s_caller_deadline_test.go
@@ -0,0 +1,44 @@
+package mongo
+
+// k8s_caller_deadline_test.go — regression guard for the pro-provision-hang bug
+// class (mirrors redis #52, applied to postgres/mongo/queue on 2026-06-08).
+// See the postgres sibling file for the full rationale: provCtx now derives from
+// the incoming ctx (with a 5m ceiling), so a stalled provision fast-fails on the
+// caller's deadline instead of blocking up to 5m on a background clock.
+
+import (
+	"context"
+	"errors"
+	"testing"
+	"time"
+
+	"k8s.io/client-go/kubernetes/fake"
+)
+
+// TestProvision_HonoursCallerDeadline checks that Provision gives up on the
+// caller's deadline when the mongo pod never turns Ready, rather than waiting
+// out mongoK8sReadyTO on a clock detached from the caller.
+func TestProvision_HonoursCallerDeadline(t *testing.T) {
+	// The fake clientset is created with no objects (an empty cluster), so the
+	// mongo pod never becomes Ready.
+	backend := &K8sBackend{cs: fake.NewClientset(), storageClass: "standard", image: "mongo:7"}
+
+	callerCtx, stop := context.WithTimeout(context.Background(), 300*time.Millisecond)
+	defer stop()
+
+	began := time.Now()
+	_, provErr := backend.Provision(callerCtx, "pro-token", "pro")
+	took := time.Since(began)
+
+	switch {
+	case provErr == nil:
+		t.Fatal("Provision should fail when the mongo pod never becomes Ready; got nil error")
+	case took > 30*time.Second:
+		t.Fatalf("PROVISION-HANG REGRESSION: Provision took %s; it must honour the caller's "+
+			"~300ms deadline and fast-fail, not block for mongoK8sReadyTO. This means provCtx "+
+			"no longer derives from the caller's ctx.", took)
+	case !errors.Is(provErr, context.DeadlineExceeded):
+		t.Errorf("Provision error should wrap context.DeadlineExceeded (got %v) so the shared "+
+			"server.mapError surfaces a retryable gRPC status (api soft-deletes + 503s)", provErr)
+	}
+}
diff --git a/internal/backend/postgres/k8s.go b/internal/backend/postgres/k8s.go
@@ -244,16 +244,30 @@ func (b *K8sBackend) Provision(ctx context.Context, token, tier string, connLimi
 
 	rollback := func(step string, cause error) error {
 		slog.Error("k8s.postgres.provision.rollback", "step", step, "namespace", ns, "error", cause)
-		_ = b.cs.CoreV1().Namespaces().Delete(context.Background(), ns, metav1.DeleteOptions{})
+		// Fresh ctx: the incoming one may already be cancelled (often WHY we roll back).
+		// 30s cap so a wedged apiserver can't pin us; log failures so leaks are traceable.
+		delCtx, delCancel := context.WithTimeout(context.Background(), 30*time.Second)
+		defer delCancel()
+		if delErr := b.cs.CoreV1().Namespaces().Delete(delCtx, ns, metav1.DeleteOptions{}); delErr != nil {
+			slog.Warn("k8s.postgres.provision.rollback.cleanup_failed", "namespace", ns, "error", delErr)
+		}
 		return fmt.Errorf("k8s postgres: %s: %w", step, cause)
 	}
 
-	// Use a fresh background context for the provisioning sequence.
-	// The gRPC request context (ctx) has a short deadline that would cancel
-	// waitPodReady, which can legitimately take 1–3 minutes for pod startup.
+	// Bound the provisioning sequence by BOTH the caller's deadline/cancellation
+	// and a hard 5m server-side ceiling (min of the two). Deriving from the
+	// incoming gRPC ctx — NOT context.Background() — is the pro-provision-hang fix
+	// (mirrors redis/k8s.go provisionContext, #52): when the api caller's deadline
+	// fires or it cancels the RPC, every k8s call / waitPodReady poll returns
+	// promptly instead of blocking up to 5m on a wedged PVC/CSI attach. The api
+	// grants a generous provision deadline (provisionTimeout: 4m anon / 5m pro),
+	// so legitimate 30-90s pod startup is unaffected — only pathological hangs and
+	// early cancellations now fast-fail (mapError classifies the ctx error as a
+	// retryable gRPC status → api soft-deletes + 503s). The ceiling still backstops
+	// a caller that passes no deadline at all.
 	// Carry the teamID value forward so applyNamespace can label the namespace
 	// with instant.dev/owner-team (pentest 2026-05-16 fix).
-	provCtx, provCancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	provCtx, provCancel := context.WithTimeout(ctx, 5*time.Minute)
 	defer provCancel()
 	if teamID, ok := ctx.Value(ctxkeys.TeamIDKey).(string); ok && teamID != "" {
 		provCtx = context.WithValue(provCtx, ctxkeys.TeamIDKey, teamID)

diff --git a/internal/backend/postgres/k8s_caller_deadline_test.go b/internal/backend/postgres/k8s_caller_deadline_test.go
@@ -0,0 +1,49 @@
+package postgres
+
+// k8s_caller_deadline_test.go — regression guard for the pro-provision-hang bug
+// class (mirrors redis #52, applied to postgres/mongo/queue on 2026-06-08).
+//
+// Before the fix the provisioning context derived from context.Background(), so
+// when the api caller's gRPC deadline fired (or it cancelled the RPC) the
+// provisioner kept blocking up to 5m on a wedged PVC/CSI attach and the api
+// handler hung. The fix derives provCtx from the incoming ctx (with a 5m
+// ceiling backstop), so a stalled provision fast-fails bounded by the caller's
+// deadline and the shared mapError maps the ctx error to a retryable gRPC status.
+
+import (
+	"context"
+	"errors"
+	"testing"
+	"time"
+
+	"k8s.io/client-go/kubernetes/fake"
+)
+
+// TestProvision_HonoursCallerDeadline checks that Provision gives up on the
+// caller's deadline when the postgres pod never turns Ready, rather than
+// waiting out k8sReadyTimeout on a clock detached from the caller.
+func TestProvision_HonoursCallerDeadline(t *testing.T) {
+	// The fake clientset is created with no objects (an empty cluster), so the
+	// postgres pod never becomes Ready.
+	backend := &K8sBackend{cs: fake.NewClientset(), storageClass: "standard", image: "postgres:16"}
+
+	// The caller's 300ms budget is tiny next to k8sReadyTimeout and the 5m ceiling.
+	callerCtx, stop := context.WithTimeout(context.Background(), 300*time.Millisecond)
+	defer stop()
+
+	began := time.Now()
+	_, provErr := backend.Provision(callerCtx, "pro-token", "pro", 8)
+	took := time.Since(began)
+
+	switch {
+	case provErr == nil:
+		t.Fatal("Provision should fail when the postgres pod never becomes Ready; got nil error")
+	case took > 30*time.Second:
+		t.Fatalf("PROVISION-HANG REGRESSION: Provision took %s; it must honour the caller's "+
+			"~300ms deadline and fast-fail, not block for k8sReadyTimeout. This means provCtx "+
+			"no longer derives from the caller's ctx.", took)
+	case !errors.Is(provErr, context.DeadlineExceeded):
+		t.Errorf("Provision error should wrap context.DeadlineExceeded (got %v) so the shared "+
+			"server.mapError surfaces a retryable gRPC status (api soft-deletes + 503s)", provErr)
+	}
+}
diff --git a/internal/backend/queue/k8s.go b/internal/backend/queue/k8s.go
@@ -208,15 +208,30 @@ func (b *K8sBackend) Provision(ctx context.Context, token, tier string) (*Creden
 
 	rollback := func(step string, cause error) error {
 		slog.Error("k8s.nats.provision.rollback", "step", step, "namespace", ns, "error", cause)
-		_ = b.cs.CoreV1().Namespaces().Delete(context.Background(), ns, metav1.DeleteOptions{})
+		// Fresh ctx: the incoming one may already be cancelled (often WHY we roll back).
+		// 30s cap so a wedged apiserver can't pin us; log failures so leaks are traceable.
+		delCtx, delCancel := context.WithTimeout(context.Background(), 30*time.Second)
+		defer delCancel()
+		if delErr := b.cs.CoreV1().Namespaces().Delete(delCtx, ns, metav1.DeleteOptions{}); delErr != nil {
+			slog.Warn("k8s.nats.provision.rollback.cleanup_failed", "namespace", ns, "error", delErr)
+		}
 		return fmt.Errorf("k8s nats: %s: %w", step, cause)
 	}
 
-	// Use a fresh background context — pod startup can take minutes, far exceeding
-	// any gRPC request deadline on the incoming ctx.
+	// Bound the provisioning sequence by BOTH the caller's deadline/cancellation
+	// and a hard 5m server-side ceiling (min of the two). Deriving from the
+	// incoming gRPC ctx — NOT context.Background() — is the pro-provision-hang fix
+	// (mirrors redis/k8s.go provisionContext, #52): when the api caller's deadline
+	// fires or it cancels the RPC, every k8s call / waitPodReady poll returns
+	// promptly instead of blocking up to 5m on a wedged PVC/CSI attach. The api
+	// grants a generous provision deadline (provisionTimeout: 4m anon / 5m pro),
+	// so legitimate 30-90s pod startup is unaffected — only pathological hangs and
+	// early cancellations now fast-fail (mapError classifies the ctx error as a
+	// retryable gRPC status → api soft-deletes + 503s). The ceiling still backstops
+	// a caller that passes no deadline at all.
 	// Carry the teamID value forward so applyNamespace can label the namespace
 	// with instant.dev/owner-team (pentest 2026-05-16 fix).
-	provCtx, provCancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	provCtx, provCancel := context.WithTimeout(ctx, 5*time.Minute)
 	defer provCancel()
 	if teamID, ok := ctx.Value(ctxkeys.TeamIDKey).(string); ok && teamID != "" {
 		provCtx = context.WithValue(provCtx, ctxkeys.TeamIDKey, teamID)

diff --git a/internal/backend/queue/k8s_caller_deadline_test.go b/internal/backend/queue/k8s_caller_deadline_test.go
@@ -0,0 +1,44 @@
+package queue
+
+// k8s_caller_deadline_test.go — regression guard for the pro-provision-hang bug
+// class (mirrors redis #52, applied to postgres/mongo/queue on 2026-06-08).
+// See the postgres sibling file for the full rationale: provCtx now derives from
+// the incoming ctx (with a 5m ceiling), so a stalled provision fast-fails on the
+// caller's deadline instead of blocking up to 5m on a background clock.
+
+import (
+	"context"
+	"errors"
+	"testing"
+	"time"
+
+	"k8s.io/client-go/kubernetes/fake"
+)
+
+// TestProvision_HonoursCallerDeadline checks that Provision gives up on the
+// caller's deadline when the nats pod never turns Ready, rather than waiting
+// out natsK8sReadyTO on a clock detached from the caller.
+func TestProvision_HonoursCallerDeadline(t *testing.T) {
+	// The fake clientset is created with no objects (an empty cluster), so the
+	// nats pod never becomes Ready.
+	backend := &K8sBackend{cs: fake.NewClientset(), storageClass: "standard", image: "nats:2.10-alpine"}
+
+	callerCtx, stop := context.WithTimeout(context.Background(), 300*time.Millisecond)
+	defer stop()
+
+	began := time.Now()
+	_, provErr := backend.Provision(callerCtx, "pro-token", "pro")
+	took := time.Since(began)
+
+	switch {
+	case provErr == nil:
+		t.Fatal("Provision should fail when the nats pod never becomes Ready; got nil error")
+	case took > 30*time.Second:
+		t.Fatalf("PROVISION-HANG REGRESSION: Provision took %s; it must honour the caller's "+
+			"~300ms deadline and fast-fail, not block for natsK8sReadyTO. This means provCtx "+
+			"no longer derives from the caller's ctx.", took)
+	case !errors.Is(provErr, context.DeadlineExceeded):
+		t.Errorf("Provision error should wrap context.DeadlineExceeded (got %v) so the shared "+
+			"server.mapError surfaces a retryable gRPC status (api soft-deletes + 503s)", provErr)
+	}
+}