Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 19 additions & 4 deletions internal/backend/mongo/k8s.go
Original file line number Diff line number Diff line change
Expand Up @@ -307,15 +307,30 @@ func (b *K8sBackend) Provision(ctx context.Context, token, tier string) (*Creden

rollback := func(step string, cause error) error {
slog.Error("k8s.mongo.provision.rollback", "step", step, "namespace", ns, "error", cause)
_ = b.cs.CoreV1().Namespaces().Delete(context.Background(), ns, metav1.DeleteOptions{})
// Cleanup uses a FRESH background ctx with its own bound: the incoming ctx
// may already be cancelled (often WHY we are rolling back), but the
// namespace teardown must still run so a failed provision does not leak a
// half-built namespace. Bounded so a wedged apiserver can't pin the goroutine.
delCtx, delCancel := context.WithTimeout(context.Background(), 30*time.Second)
defer delCancel()
_ = b.cs.CoreV1().Namespaces().Delete(delCtx, ns, metav1.DeleteOptions{})
return fmt.Errorf("k8s mongo: %s: %w", step, cause)
}

// Use a fresh background context — pod startup can take minutes, far exceeding
// any gRPC request deadline on the incoming ctx.
// Bound the provisioning sequence by BOTH the caller's deadline/cancellation
// and a hard 5m server-side ceiling (min of the two). Deriving from the
// incoming gRPC ctx — NOT context.Background() — is the pro-provision-hang fix
// (mirrors redis/k8s.go provisionContext, #52): when the api caller's deadline
// fires or it cancels the RPC, every k8s call / waitPodReady poll returns
// promptly instead of blocking up to 5m on a wedged PVC/CSI attach. The api
// grants a generous provision deadline (provisionTimeout: 4m anon / 5m pro),
// so legitimate 30-90s pod startup is unaffected — only pathological hangs and
// early cancellations now fast-fail (mapError classifies the ctx error as a
// retryable gRPC status → api soft-deletes + 503s). The ceiling still backstops
// a caller that passes no deadline at all.
// Carry the teamID value forward so applyNamespace can label the namespace
// with instant.dev/owner-team (pentest 2026-05-16 fix).
provCtx, provCancel := context.WithTimeout(context.Background(), 5*time.Minute)
provCtx, provCancel := context.WithTimeout(ctx, 5*time.Minute)
defer provCancel()
if teamID, ok := ctx.Value(ctxkeys.TeamIDKey).(string); ok && teamID != "" {
provCtx = context.WithValue(provCtx, ctxkeys.TeamIDKey, teamID)
Expand Down
44 changes: 44 additions & 0 deletions internal/backend/mongo/k8s_caller_deadline_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package mongo

// k8s_caller_deadline_test.go — regression guard for the pro-provision-hang bug
// class (mirrors redis #52, applied to postgres/mongo/queue on 2026-06-08).
// See the postgres sibling file for the full rationale: provCtx now derives from
// the incoming ctx (with a 5m ceiling), so a stalled provision fast-fails on the
// caller's deadline instead of blocking up to 5m on a background clock.

import (
"context"
"errors"
"testing"
"time"

"k8s.io/client-go/kubernetes/fake"
)

// TestProvision_HonoursCallerDeadline: a Provision whose pod never becomes Ready
// must return promptly, bounded by the caller's deadline — NOT block for
// mongoK8sReadyTO on a background clock.
func TestProvision_HonoursCallerDeadline(t *testing.T) {
cs := fake.NewClientset() // empty cluster: the mongo pod never becomes Ready
b := &K8sBackend{cs: cs, storageClass: "standard", image: "mongo:7"}

ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
defer cancel()

start := time.Now()
_, err := b.Provision(ctx, "pro-token", "pro")
elapsed := time.Since(start)

if err == nil {
t.Fatal("Provision should fail when the mongo pod never becomes Ready; got nil error")
}
if elapsed > 30*time.Second {
t.Fatalf("PROVISION-HANG REGRESSION: Provision took %s; it must honour the caller's "+
"~300ms deadline and fast-fail, not block for mongoK8sReadyTO. This means provCtx "+
"no longer derives from the caller's ctx.", elapsed)
}
if !errors.Is(err, context.DeadlineExceeded) {
t.Errorf("Provision error should wrap context.DeadlineExceeded (got %v) so the shared "+
"server.mapError surfaces a retryable gRPC status (api soft-deletes + 503s)", err)
}
}
24 changes: 19 additions & 5 deletions internal/backend/postgres/k8s.go
Original file line number Diff line number Diff line change
Expand Up @@ -244,16 +244,30 @@ func (b *K8sBackend) Provision(ctx context.Context, token, tier string, connLimi

rollback := func(step string, cause error) error {
slog.Error("k8s.postgres.provision.rollback", "step", step, "namespace", ns, "error", cause)
_ = b.cs.CoreV1().Namespaces().Delete(context.Background(), ns, metav1.DeleteOptions{})
// Cleanup uses a FRESH background ctx with its own bound: the incoming ctx
// may already be cancelled (often WHY we are rolling back), but the
// namespace teardown must still run so a failed provision does not leak a
// half-built namespace. Bounded so a wedged apiserver can't pin the goroutine.
delCtx, delCancel := context.WithTimeout(context.Background(), 30*time.Second)
defer delCancel()
_ = b.cs.CoreV1().Namespaces().Delete(delCtx, ns, metav1.DeleteOptions{})
return fmt.Errorf("k8s postgres: %s: %w", step, cause)
}

// Use a fresh background context for the provisioning sequence.
// The gRPC request context (ctx) has a short deadline that would cancel
// waitPodReady, which can legitimately take 1–3 minutes for pod startup.
// Bound the provisioning sequence by BOTH the caller's deadline/cancellation
// and a hard 5m server-side ceiling (min of the two). Deriving from the
// incoming gRPC ctx — NOT context.Background() — is the pro-provision-hang fix
// (mirrors redis/k8s.go provisionContext, #52): when the api caller's deadline
// fires or it cancels the RPC, every k8s call / waitPodReady poll returns
// promptly instead of blocking up to 5m on a wedged PVC/CSI attach. The api
// grants a generous provision deadline (provisionTimeout: 4m anon / 5m pro),
// so legitimate 30-90s pod startup is unaffected — only pathological hangs and
// early cancellations now fast-fail (mapError classifies the ctx error as a
// retryable gRPC status → api soft-deletes + 503s). The ceiling still backstops
// a caller that passes no deadline at all.
// Carry the teamID value forward so applyNamespace can label the namespace
// with instant.dev/owner-team (pentest 2026-05-16 fix).
provCtx, provCancel := context.WithTimeout(context.Background(), 5*time.Minute)
provCtx, provCancel := context.WithTimeout(ctx, 5*time.Minute)
defer provCancel()
if teamID, ok := ctx.Value(ctxkeys.TeamIDKey).(string); ok && teamID != "" {
provCtx = context.WithValue(provCtx, ctxkeys.TeamIDKey, teamID)
Expand Down
49 changes: 49 additions & 0 deletions internal/backend/postgres/k8s_caller_deadline_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package postgres

// k8s_caller_deadline_test.go — regression guard for the pro-provision-hang bug
// class (mirrors redis #52, applied to postgres/mongo/queue on 2026-06-08).
//
// Before the fix the provisioning context derived from context.Background(), so
// when the api caller's gRPC deadline fired (or it cancelled the RPC) the
// provisioner kept blocking up to 5m on a wedged PVC/CSI attach and the api
// handler hung. The fix derives provCtx from the incoming ctx (with a 5m
// ceiling backstop), so a stalled provision fast-fails bounded by the caller's
// deadline and the shared mapError maps the ctx error to a retryable gRPC status.

import (
"context"
"errors"
"testing"
"time"

"k8s.io/client-go/kubernetes/fake"
)

// TestProvision_HonoursCallerDeadline: a Provision whose pod never becomes Ready
// must return promptly, bounded by the caller's deadline — NOT block for
// k8sReadyTimeout on a background clock.
func TestProvision_HonoursCallerDeadline(t *testing.T) {
cs := fake.NewClientset() // empty cluster: the postgres pod never becomes Ready
b := &K8sBackend{cs: cs, storageClass: "standard", image: "postgres:16"}

// Caller deadline far shorter than k8sReadyTimeout and the 5m ceiling.
ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
defer cancel()

start := time.Now()
_, err := b.Provision(ctx, "pro-token", "pro", 8)
elapsed := time.Since(start)

if err == nil {
t.Fatal("Provision should fail when the postgres pod never becomes Ready; got nil error")
}
if elapsed > 30*time.Second {
t.Fatalf("PROVISION-HANG REGRESSION: Provision took %s; it must honour the caller's "+
"~300ms deadline and fast-fail, not block for k8sReadyTimeout. This means provCtx "+
"no longer derives from the caller's ctx.", elapsed)
}
if !errors.Is(err, context.DeadlineExceeded) {
t.Errorf("Provision error should wrap context.DeadlineExceeded (got %v) so the shared "+
"server.mapError surfaces a retryable gRPC status (api soft-deletes + 503s)", err)
}
}
23 changes: 19 additions & 4 deletions internal/backend/queue/k8s.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,15 +208,30 @@ func (b *K8sBackend) Provision(ctx context.Context, token, tier string) (*Creden

rollback := func(step string, cause error) error {
slog.Error("k8s.nats.provision.rollback", "step", step, "namespace", ns, "error", cause)
_ = b.cs.CoreV1().Namespaces().Delete(context.Background(), ns, metav1.DeleteOptions{})
// Cleanup uses a FRESH background ctx with its own bound: the incoming ctx
// may already be cancelled (often WHY we are rolling back), but the
// namespace teardown must still run so a failed provision does not leak a
// half-built namespace. Bounded so a wedged apiserver can't pin the goroutine.
delCtx, delCancel := context.WithTimeout(context.Background(), 30*time.Second)
defer delCancel()
_ = b.cs.CoreV1().Namespaces().Delete(delCtx, ns, metav1.DeleteOptions{})
return fmt.Errorf("k8s nats: %s: %w", step, cause)
}

// Use a fresh background context — pod startup can take minutes, far exceeding
// any gRPC request deadline on the incoming ctx.
// Bound the provisioning sequence by BOTH the caller's deadline/cancellation
// and a hard 5m server-side ceiling (min of the two). Deriving from the
// incoming gRPC ctx — NOT context.Background() — is the pro-provision-hang fix
// (mirrors redis/k8s.go provisionContext, #52): when the api caller's deadline
// fires or it cancels the RPC, every k8s call / waitPodReady poll returns
// promptly instead of blocking up to 5m on a wedged PVC/CSI attach. The api
// grants a generous provision deadline (provisionTimeout: 4m anon / 5m pro),
// so legitimate 30-90s pod startup is unaffected — only pathological hangs and
// early cancellations now fast-fail (mapError classifies the ctx error as a
// retryable gRPC status → api soft-deletes + 503s). The ceiling still backstops
// a caller that passes no deadline at all.
// Carry the teamID value forward so applyNamespace can label the namespace
// with instant.dev/owner-team (pentest 2026-05-16 fix).
provCtx, provCancel := context.WithTimeout(context.Background(), 5*time.Minute)
provCtx, provCancel := context.WithTimeout(ctx, 5*time.Minute)
defer provCancel()
if teamID, ok := ctx.Value(ctxkeys.TeamIDKey).(string); ok && teamID != "" {
provCtx = context.WithValue(provCtx, ctxkeys.TeamIDKey, teamID)
Expand Down
44 changes: 44 additions & 0 deletions internal/backend/queue/k8s_caller_deadline_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package queue

// k8s_caller_deadline_test.go — regression guard for the pro-provision-hang bug
// class (mirrors redis #52, applied to postgres/mongo/queue on 2026-06-08).
// See the postgres sibling file for the full rationale: provCtx now derives from
// the incoming ctx (with a 5m ceiling), so a stalled provision fast-fails on the
// caller's deadline instead of blocking up to 5m on a background clock.

import (
"context"
"errors"
"testing"
"time"

"k8s.io/client-go/kubernetes/fake"
)

// TestProvision_HonoursCallerDeadline: a Provision whose pod never becomes Ready
// must return promptly, bounded by the caller's deadline — NOT block for
// natsK8sReadyTO on a background clock.
func TestProvision_HonoursCallerDeadline(t *testing.T) {
cs := fake.NewClientset() // empty cluster: the nats pod never becomes Ready
b := &K8sBackend{cs: cs, storageClass: "standard", image: "nats:2.10-alpine"}

ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
defer cancel()

start := time.Now()
_, err := b.Provision(ctx, "pro-token", "pro")
elapsed := time.Since(start)

if err == nil {
t.Fatal("Provision should fail when the nats pod never becomes Ready; got nil error")
}
if elapsed > 30*time.Second {
t.Fatalf("PROVISION-HANG REGRESSION: Provision took %s; it must honour the caller's "+
"~300ms deadline and fast-fail, not block for natsK8sReadyTO. This means provCtx "+
"no longer derives from the caller's ctx.", elapsed)
}
if !errors.Is(err, context.DeadlineExceeded) {
t.Errorf("Provision error should wrap context.DeadlineExceeded (got %v) so the shared "+
"server.mapError surfaces a retryable gRPC status (api soft-deletes + 503s)", err)
}
}
Loading