diff --git a/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go b/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go index 5ba87691aa..2a3aa6e6e9 100644 --- a/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go @@ -158,10 +158,49 @@ type MCPRemoteProxySpec struct { // SessionAffinity controls whether the Service routes repeated client connections to the same pod. // MCP protocols (SSE, streamable-http) are stateful, so ClientIP is the default. // Set to "None" for stateless servers or when using an external load balancer with its own affinity. + // + // Interaction with sessionStorage: when running multiple replicas with + // sessionStorage.provider "redis", set this to "None" so requests are + // distributed across replicas and sessions resolve via the shared store. + // Conversely, "None" without Redis-backed sessionStorage breaks session + // continuity — any request landing on a different pod fails with + // "Session not found". // +kubebuilder:validation:Enum=ClientIP;None // +kubebuilder:default=ClientIP // +optional SessionAffinity string `json:"sessionAffinity,omitempty"` + + // Replicas is the desired number of proxy pod replicas. + // MCPRemoteProxy creates a single Deployment for the proxy process, so there + // is only one replicas field (mirrors VirtualMCPServer.spec.replicas). + // When nil, the operator does not set Deployment.Spec.Replicas, leaving replica + // management to an HPA or other external controller. + // When set above 1, also configure sessionStorage with the redis provider and + // sessionAffinity: "None" so sessions resolve across replicas; otherwise a + // SessionStorageWarning condition is surfaced on the resource status. + // +kubebuilder:validation:Minimum=0 + // +optional + Replicas *int32 `json:"replicas,omitempty"` + + // SessionStorage configures session storage for stateful horizontal scaling. + // When nil, no session storage is configured and the proxy falls back to + // pod-local in-memory session state — incompatible with multi-replica + // deployments behind load balancers that don't preserve client-IP affinity + // (e.g. AWS ALB across multiple AZs). + // + // The transparent proxy validates `Mcp-Session-Id` against this store on + // every non-initialize request (see pkg/transport/proxy/transparent/ + // transparent_proxy.go) and rewrites client-facing session IDs to backend + // session IDs using session metadata. Both lookups require shared state + // across replicas. + // + // When using the Redis provider, also set sessionAffinity to "None" so the + // Service routes requests round-robin and all replicas rely on the shared + // session store rather than pod-local state. + // + // Mirrors MCPServer.spec.sessionStorage and VirtualMCPServer.spec.sessionStorage. + // +optional + SessionStorage *SessionStorageConfig `json:"sessionStorage,omitempty"` } // MCPRemoteProxyStatus defines the observed state of MCPRemoteProxy diff --git a/cmd/thv-operator/api/v1beta1/zz_generated.deepcopy.go b/cmd/thv-operator/api/v1beta1/zz_generated.deepcopy.go index 2cb12d6c3c..4c82672d70 100644 --- a/cmd/thv-operator/api/v1beta1/zz_generated.deepcopy.go +++ b/cmd/thv-operator/api/v1beta1/zz_generated.deepcopy.go @@ -1457,6 +1457,16 @@ func (in *MCPRemoteProxySpec) DeepCopyInto(out *MCPRemoteProxySpec) { *out = new(MCPGroupRef) **out = **in } + if in.Replicas != nil { + in, out := &in.Replicas, &out.Replicas + *out = new(int32) + **out = **in + } + if in.SessionStorage != nil { + in, out := &in.SessionStorage, &out.SessionStorage + *out = new(SessionStorageConfig) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MCPRemoteProxySpec. diff --git a/cmd/thv-operator/controllers/mcpremoteproxy_controller.go b/cmd/thv-operator/controllers/mcpremoteproxy_controller.go index ce18e84019..fa0336f0d0 100644 --- a/cmd/thv-operator/controllers/mcpremoteproxy_controller.go +++ b/cmd/thv-operator/controllers/mcpremoteproxy_controller.go @@ -127,6 +127,9 @@ func (r *MCPRemoteProxyReconciler) validateAndHandleConfigs(ctx context.Context, // Surface advisory condition when primaryUpstreamProvider is set but ignored r.validateAuthzPrimaryUpstreamProviderIgnored(proxy) + // Surface advisory condition when replicas > 1 without Redis session storage + r.validateSessionStorageForReplicas(proxy) + // Handle MCPToolConfig if err := r.handleToolConfig(ctx, proxy); err != nil { ctxLogger.Error(err, "Failed to handle MCPToolConfig") @@ -301,10 +304,15 @@ func (r *MCPRemoteProxyReconciler) ensureDeployment( if newDeployment == nil { return ctrl.Result{}, fmt.Errorf("failed to create updated Deployment object") } - // Update the deployment spec but preserve replica count for HPA compatibility + // Update template and metadata. Also sync Spec.Replicas when spec.replicas + // is non-nil (operator authoritative); preserve it when nil so an HPA or + // other external controller can manage scaling. deployment.Spec.Template = newDeployment.Spec.Template deployment.Labels = newDeployment.Labels deployment.Annotations = ctrlutil.MergeAnnotations(newDeployment.Annotations, deployment.Annotations) + if newDeployment.Spec.Replicas != nil { + deployment.Spec.Replicas = newDeployment.Spec.Replicas + } ctxLogger.Info("Updating Deployment", "Deployment.Namespace", deployment.Namespace, "Deployment.Name", deployment.Name) if err := r.Update(ctx, deployment); err != nil { @@ -1107,6 +1115,40 @@ func (*MCPRemoteProxyReconciler) validateAuthzPrimaryUpstreamProviderIgnored(pro }) } +// validateSessionStorageForReplicas surfaces a SessionStorageWarning condition +// when replicas > 1 but session storage is not configured with the Redis +// backend. Reconciliation continues regardless; this is advisory only. +// Mirrors the MCPServer and VirtualMCPServer validators so the condition is +// consistent across all types that share the replicas + sessionStorage pair. +// The caller is responsible for persisting status. +func (*MCPRemoteProxyReconciler) validateSessionStorageForReplicas(proxy *mcpv1beta1.MCPRemoteProxy) { + condition := func() metav1.Condition { + if proxy.Spec.Replicas == nil || *proxy.Spec.Replicas <= 1 { + return metav1.Condition{ + Status: metav1.ConditionFalse, + Reason: mcpv1beta1.ConditionReasonSessionStorageNotApplicable, + Message: "session storage warning is not active", + } + } + if proxy.Spec.SessionStorage == nil || + proxy.Spec.SessionStorage.Provider != mcpv1beta1.SessionStorageProviderRedis { + return metav1.Condition{ + Status: metav1.ConditionTrue, + Reason: mcpv1beta1.ConditionReasonSessionStorageMissing, + Message: "replicas > 1 but sessionStorage.provider is not redis; sessions are not shared across replicas", + } + } + return metav1.Condition{ + Status: metav1.ConditionFalse, + Reason: mcpv1beta1.ConditionReasonSessionStorageConfigured, + Message: "Redis session storage is configured", + } + }() + condition.Type = mcpv1beta1.ConditionSessionStorageWarning + condition.ObservedGeneration = proxy.Generation + meta.SetStatusCondition(&proxy.Status.Conditions, condition) +} + // ensureRBACResources ensures that the RBAC resources are in place for the remote proxy. // Uses the RBAC client (pkg/kubernetes/rbac) which creates or updates RBAC resources // automatically during operator upgrades. @@ -1300,6 +1342,15 @@ func (r *MCPRemoteProxyReconciler) deploymentNeedsUpdate( return true } + // Check if spec.replicas has changed. Only compare when spec.replicas is + // non-nil; nil means hands-off mode (HPA or another external controller + // manages replicas) and the live count is authoritative. + if proxy.Spec.Replicas != nil { + if deployment.Spec.Replicas == nil || *deployment.Spec.Replicas != *proxy.Spec.Replicas { + return true + } + } + return false } diff --git a/cmd/thv-operator/controllers/mcpremoteproxy_deployment.go b/cmd/thv-operator/controllers/mcpremoteproxy_deployment.go index 67e195575c..a6237ca28c 100644 --- a/cmd/thv-operator/controllers/mcpremoteproxy_deployment.go +++ b/cmd/thv-operator/controllers/mcpremoteproxy_deployment.go @@ -18,6 +18,7 @@ import ( ctrlutil "github.com/stacklok/toolhive/cmd/thv-operator/pkg/controllerutil" "github.com/stacklok/toolhive/cmd/thv-operator/pkg/runconfig/configmap/checksum" "github.com/stacklok/toolhive/pkg/container/kubernetes" + "github.com/stacklok/toolhive/pkg/transport/session" "github.com/stacklok/toolhive/pkg/vmcp/headerforward/wirefmt" ) @@ -26,7 +27,6 @@ func (r *MCPRemoteProxyReconciler) deploymentForMCPRemoteProxy( ctx context.Context, proxy *mcpv1beta1.MCPRemoteProxy, runConfigChecksum string, ) *appsv1.Deployment { ls := labelsForMCPRemoteProxy(proxy.Name) - replicas := int32(1) // Build deployment components using helper functions args := r.buildContainerArgs() @@ -62,7 +62,10 @@ func (r *MCPRemoteProxyReconciler) deploymentForMCPRemoteProxy( Annotations: deploymentAnnotations, }, Spec: appsv1.DeploymentSpec{ - Replicas: &replicas, + // nil leaves the replica count to the apiserver default (1) on create + // and to an HPA or other external controller thereafter; non-nil is + // operator-owned and reconciled by deploymentNeedsUpdate. + Replicas: proxy.Spec.Replicas, Selector: &metav1.LabelSelector{ MatchLabels: ls, }, @@ -234,9 +237,38 @@ func (r *MCPRemoteProxyReconciler) buildEnvVarsForProxy( } } + // Add THV_SESSION_REDIS_PASSWORD when sessionStorage uses a passwordRef. + // The non-sensitive parts (address/db/keyPrefix) are populated into the + // runconfig by populateScalingConfigForRemoteProxy; the password is + // injected separately so it never lands in the ConfigMap. + env = append(env, buildRedisPasswordEnvVarForRemoteProxy(proxy)...) + return ctrlutil.EnsureRequiredEnvVars(ctx, env) } +// buildRedisPasswordEnvVarForRemoteProxy returns the THV_SESSION_REDIS_PASSWORD +// env var sourced from spec.sessionStorage.passwordRef when sessionStorage uses +// the redis provider; returns nil otherwise. Mirrors VirtualMCPServer's +// buildRedisPasswordEnvVar in virtualmcpserver_deployment.go. +func buildRedisPasswordEnvVarForRemoteProxy(proxy *mcpv1beta1.MCPRemoteProxy) []corev1.EnvVar { + if proxy.Spec.SessionStorage == nil || + proxy.Spec.SessionStorage.Provider != mcpv1beta1.SessionStorageProviderRedis || + proxy.Spec.SessionStorage.PasswordRef == nil { + return nil + } + return []corev1.EnvVar{{ + Name: session.RedisPasswordEnvVar, + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: proxy.Spec.SessionStorage.PasswordRef.Name, + }, + Key: proxy.Spec.SessionStorage.PasswordRef.Key, + }, + }, + }} +} + // buildOIDCClientSecretEnvVars returns OIDC client secret env vars when the proxy // references an MCPOIDCConfig with an inline client secret. Returns nil otherwise. func (r *MCPRemoteProxyReconciler) buildOIDCClientSecretEnvVars( diff --git a/cmd/thv-operator/controllers/mcpremoteproxy_deployment_test.go b/cmd/thv-operator/controllers/mcpremoteproxy_deployment_test.go index c16d147c27..5ad27823df 100644 --- a/cmd/thv-operator/controllers/mcpremoteproxy_deployment_test.go +++ b/cmd/thv-operator/controllers/mcpremoteproxy_deployment_test.go @@ -30,6 +30,7 @@ import ( mcpv1beta1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1beta1" ctrlutil "github.com/stacklok/toolhive/cmd/thv-operator/pkg/controllerutil" + "github.com/stacklok/toolhive/pkg/transport/session" ) // TestDeploymentForMCPRemoteProxy tests deployment generation @@ -57,7 +58,7 @@ func TestDeploymentForMCPRemoteProxy(t *testing.T) { t.Helper() assert.Equal(t, "basic-proxy", dep.Name) assert.Equal(t, "default", dep.Namespace) - assert.Equal(t, int32(1), *dep.Spec.Replicas) + assert.Nil(t, dep.Spec.Replicas, "nil spec.replicas leaves the count to the apiserver default") // Verify labels assert.Equal(t, labelsForMCPRemoteProxy("basic-proxy"), dep.Spec.Selector.MatchLabels) @@ -1306,3 +1307,66 @@ func TestMCPRemoteProxyServiceNeedsUpdate(t *testing.T) { }) } } + +// TestBuildRedisPasswordEnvVarForRemoteProxy mirrors VirtualMCPServer's +// TestBuildRedisPasswordEnvVar — the env var must be injected only when +// sessionStorage uses the redis provider AND a passwordRef is set, and it +// must always be a SecretKeyRef (never a plaintext value). +func TestBuildRedisPasswordEnvVarForRemoteProxy(t *testing.T) { + t.Parallel() + + passwordRef := &mcpv1beta1.SecretKeyRef{Name: "redis-secret", Key: "password"} + + tests := []struct { + name string + storage *mcpv1beta1.SessionStorageConfig + expectEnVar bool + }{ + { + name: "nil sessionStorage produces no env var", + storage: nil, + expectEnVar: false, + }, + { + name: "memory provider produces no env var", + storage: &mcpv1beta1.SessionStorageConfig{Provider: "memory"}, + expectEnVar: false, + }, + { + name: "redis without passwordRef produces no env var", + storage: &mcpv1beta1.SessionStorageConfig{Provider: mcpv1beta1.SessionStorageProviderRedis, Address: "redis:6379"}, + expectEnVar: false, + }, + { + name: "redis with passwordRef produces THV_SESSION_REDIS_PASSWORD", + storage: &mcpv1beta1.SessionStorageConfig{ + Provider: mcpv1beta1.SessionStorageProviderRedis, + Address: "redis:6379", + PasswordRef: passwordRef, + }, + expectEnVar: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + proxy := &mcpv1beta1.MCPRemoteProxy{ + ObjectMeta: metav1.ObjectMeta{Name: "test-proxy", Namespace: "default"}, + Spec: mcpv1beta1.MCPRemoteProxySpec{SessionStorage: tc.storage}, + } + env := buildRedisPasswordEnvVarForRemoteProxy(proxy) + if tc.expectEnVar { + require.Len(t, env, 1) + assert.Equal(t, session.RedisPasswordEnvVar, env[0].Name) + assert.Empty(t, env[0].Value, "must not use plaintext Value") + require.NotNil(t, env[0].ValueFrom) + require.NotNil(t, env[0].ValueFrom.SecretKeyRef) + assert.Equal(t, passwordRef.Name, env[0].ValueFrom.SecretKeyRef.Name) + assert.Equal(t, passwordRef.Key, env[0].ValueFrom.SecretKeyRef.Key) + } else { + assert.Empty(t, env) + } + }) + } +} diff --git a/cmd/thv-operator/controllers/mcpremoteproxy_replicas_test.go b/cmd/thv-operator/controllers/mcpremoteproxy_replicas_test.go new file mode 100644 index 0000000000..6ff30a5207 --- /dev/null +++ b/cmd/thv-operator/controllers/mcpremoteproxy_replicas_test.go @@ -0,0 +1,255 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package controllers + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + mcpv1beta1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1beta1" + "github.com/stacklok/toolhive/cmd/thv-operator/pkg/runconfig/configmap/checksum" +) + +// replicasTestScheme registers the types the replica tests seed into the fake +// client: the CRDs plus core and apps (Deployments). +func replicasTestScheme(t *testing.T) *runtime.Scheme { + t.Helper() + scheme := runtime.NewScheme() + require.NoError(t, corev1.AddToScheme(scheme)) + require.NoError(t, appsv1.AddToScheme(scheme)) + require.NoError(t, mcpv1beta1.AddToScheme(scheme)) + return scheme +} + +func replicasTestProxy(replicas *int32) *mcpv1beta1.MCPRemoteProxy { + return &mcpv1beta1.MCPRemoteProxy{ + ObjectMeta: metav1.ObjectMeta{Name: "replicas-proxy", Namespace: "default"}, + Spec: mcpv1beta1.MCPRemoteProxySpec{ + RemoteURL: "https://mcp.example.com", + ProxyPort: 8080, + Replicas: replicas, + }, + } +} + +// TestDeploymentForMCPRemoteProxyReplicas verifies spec.replicas flows into +// the generated Deployment: nil stays nil (apiserver defaults to 1, and an +// HPA can manage the count thereafter), non-nil is set verbatim. +func TestDeploymentForMCPRemoteProxyReplicas(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + replicas *int32 + }{ + {name: "nil replicas leaves Deployment.Spec.Replicas unset", replicas: nil}, + {name: "replicas 3 is set on the Deployment", replicas: int32Ptr(3)}, + {name: "replicas 0 is set on the Deployment", replicas: int32Ptr(0)}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + scheme := replicasTestScheme(t) + r := &MCPRemoteProxyReconciler{ + Client: fake.NewClientBuilder().WithScheme(scheme).Build(), + Scheme: scheme, + } + + dep := r.deploymentForMCPRemoteProxy(context.Background(), replicasTestProxy(tt.replicas), "test-checksum") + require.NotNil(t, dep) + + if tt.replicas == nil { + assert.Nil(t, dep.Spec.Replicas) + } else { + require.NotNil(t, dep.Spec.Replicas) + assert.Equal(t, *tt.replicas, *dep.Spec.Replicas) + } + }) + } +} + +// TestMCPRemoteProxyDeploymentNeedsUpdateReplicas pins the drift semantics: +// non-nil spec.replicas is operator-owned and any divergence triggers an +// update; nil spec.replicas is hands-off so external scaling (HPA) is never +// fought by the reconciler. +func TestMCPRemoteProxyDeploymentNeedsUpdateReplicas(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + specReplicas *int32 + liveReplicas *int32 + wantUpdate bool + }{ + {name: "nil spec ignores externally scaled count", specReplicas: nil, liveReplicas: int32Ptr(5), wantUpdate: false}, + {name: "set spec reconciles drifted count", specReplicas: int32Ptr(3), liveReplicas: int32Ptr(1), wantUpdate: true}, + {name: "set spec matches live count", specReplicas: int32Ptr(3), liveReplicas: int32Ptr(3), wantUpdate: false}, + {name: "set spec with nil live count", specReplicas: int32Ptr(2), liveReplicas: nil, wantUpdate: true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + scheme := replicasTestScheme(t) + r := &MCPRemoteProxyReconciler{ + Client: fake.NewClientBuilder().WithScheme(scheme).Build(), + Scheme: scheme, + } + + proxy := replicasTestProxy(tt.specReplicas) + // Generate the desired deployment so all non-replica comparisons + // in deploymentNeedsUpdate see identical state. + deployment := r.deploymentForMCPRemoteProxy(context.Background(), proxy, "test-checksum") + require.NotNil(t, deployment) + deployment.Spec.Replicas = tt.liveReplicas + + assert.Equal(t, tt.wantUpdate, + r.deploymentNeedsUpdate(context.Background(), deployment, proxy, "test-checksum")) + }) + } +} + +// TestMCPRemoteProxyEnsureDeploymentReplicaSync drives ensureDeployment +// against a fake cluster to verify the update path end to end: a live count +// scaled by an external controller survives when spec.replicas is nil, and is +// overwritten when spec.replicas is set. +func TestMCPRemoteProxyEnsureDeploymentReplicaSync(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + specReplicas *int32 + liveReplicas int32 + wantReplicas int32 + }{ + {name: "nil spec preserves HPA-scaled count", specReplicas: nil, liveReplicas: 5, wantReplicas: 5}, + {name: "set spec overrides drifted count", specReplicas: int32Ptr(3), liveReplicas: 5, wantReplicas: 3}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + scheme := replicasTestScheme(t) + proxy := replicasTestProxy(tt.specReplicas) + + // Seed the RunConfig ConfigMap so getRunConfigChecksum resolves the + // same checksum the test passes to the deployment generator. + runConfigCM := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: proxy.Name + "-runconfig", + Namespace: proxy.Namespace, + Annotations: map[string]string{checksum.ContentChecksumAnnotation: "test-checksum"}, + }, + } + + seedReconciler := &MCPRemoteProxyReconciler{ + Client: fake.NewClientBuilder().WithScheme(scheme).Build(), + Scheme: scheme, + } + liveDeployment := seedReconciler.deploymentForMCPRemoteProxy(context.Background(), proxy, "test-checksum") + require.NotNil(t, liveDeployment) + liveDeployment.Spec.Replicas = int32Ptr(tt.liveReplicas) + // Strip the generated owner reference: the fake client rejects + // owner refs to objects it has not assigned a UID. + liveDeployment.OwnerReferences = nil + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithRuntimeObjects(proxy, runConfigCM, liveDeployment). + Build() + r := &MCPRemoteProxyReconciler{Client: fakeClient, Scheme: scheme} + + _, err := r.ensureDeployment(context.Background(), proxy) + require.NoError(t, err) + + got := &appsv1.Deployment{} + require.NoError(t, fakeClient.Get(context.Background(), + types.NamespacedName{Name: proxy.Name, Namespace: proxy.Namespace}, got)) + require.NotNil(t, got.Spec.Replicas) + assert.Equal(t, tt.wantReplicas, *got.Spec.Replicas) + }) + } +} + +// TestValidateSessionStorageForReplicasRemoteProxy mirrors the MCPServer and +// VirtualMCPServer validators: the SessionStorageWarning condition is True +// only when replicas > 1 without Redis-backed session storage. +func TestValidateSessionStorageForReplicasRemoteProxy(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + replicas *int32 + storage *mcpv1beta1.SessionStorageConfig + wantStatus metav1.ConditionStatus + wantReason string + }{ + { + name: "nil replicas is not applicable", + replicas: nil, + wantStatus: metav1.ConditionFalse, + wantReason: mcpv1beta1.ConditionReasonSessionStorageNotApplicable, + }, + { + name: "single replica is not applicable", + replicas: int32Ptr(1), + wantStatus: metav1.ConditionFalse, + wantReason: mcpv1beta1.ConditionReasonSessionStorageNotApplicable, + }, + { + name: "multiple replicas without session storage warns", + replicas: int32Ptr(3), + wantStatus: metav1.ConditionTrue, + wantReason: mcpv1beta1.ConditionReasonSessionStorageMissing, + }, + { + name: "multiple replicas with memory provider warns", + replicas: int32Ptr(3), + storage: &mcpv1beta1.SessionStorageConfig{Provider: "memory"}, + wantStatus: metav1.ConditionTrue, + wantReason: mcpv1beta1.ConditionReasonSessionStorageMissing, + }, + { + name: "multiple replicas with redis provider is configured", + replicas: int32Ptr(3), + storage: &mcpv1beta1.SessionStorageConfig{ + Provider: mcpv1beta1.SessionStorageProviderRedis, + Address: "redis:6379", + }, + wantStatus: metav1.ConditionFalse, + wantReason: mcpv1beta1.ConditionReasonSessionStorageConfigured, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + proxy := replicasTestProxy(tt.replicas) + proxy.Spec.SessionStorage = tt.storage + + r := &MCPRemoteProxyReconciler{} + r.validateSessionStorageForReplicas(proxy) + + cond := meta.FindStatusCondition(proxy.Status.Conditions, mcpv1beta1.ConditionSessionStorageWarning) + require.NotNil(t, cond, "SessionStorageWarning condition must always be set") + assert.Equal(t, tt.wantStatus, cond.Status) + assert.Equal(t, tt.wantReason, cond.Reason) + }) + } +} diff --git a/cmd/thv-operator/controllers/mcpremoteproxy_runconfig.go b/cmd/thv-operator/controllers/mcpremoteproxy_runconfig.go index 0059404fb1..997ec22923 100644 --- a/cmd/thv-operator/controllers/mcpremoteproxy_runconfig.go +++ b/cmd/thv-operator/controllers/mcpremoteproxy_runconfig.go @@ -168,6 +168,14 @@ func (r *MCPRemoteProxyReconciler) createRunConfigFromMCPRemoteProxy( return nil, err } + // Populate ScalingConfig.SessionRedis from spec.sessionStorage so the + // proxy runner has the address/db/keyPrefix needed to construct a + // shared Redis-backed session store. The Redis password is intentionally + // excluded here — it is injected as the THV_SESSION_REDIS_PASSWORD env + // var by buildRedisPasswordEnvVar in mcpremoteproxy_deployment.go. + // Must run before PopulateMiddlewareConfigs because rate limiting reads SessionRedis. + populateScalingConfigForRemoteProxy(runConfig, proxy) + // Populate middleware configs from the configuration fields // This ensures that middleware_configs is properly set for serialization if err := runner.PopulateMiddlewareConfigs(runConfig); err != nil { @@ -177,6 +185,27 @@ func (r *MCPRemoteProxyReconciler) createRunConfigFromMCPRemoteProxy( return runConfig, nil } +// populateScalingConfigForRemoteProxy mirrors populateScalingConfig from +// mcpserver_runconfig.go but for MCPRemoteProxy (which has no +// BackendReplicas concept). When MCPRemoteProxy.spec.sessionStorage uses +// the redis provider, this populates runner.ScalingConfig.SessionRedis with +// the non-sensitive connection parameters. +func populateScalingConfigForRemoteProxy(runConfig *runner.RunConfig, proxy *mcpv1beta1.MCPRemoteProxy) { + if proxy.Spec.SessionStorage == nil || + proxy.Spec.SessionStorage.Provider != mcpv1beta1.SessionStorageProviderRedis { + return + } + + if runConfig.ScalingConfig == nil { + runConfig.ScalingConfig = &runner.ScalingConfig{} + } + runConfig.ScalingConfig.SessionRedis = &runner.SessionRedisConfig{ + Address: proxy.Spec.SessionStorage.Address, + DB: proxy.Spec.SessionStorage.DB, + KeyPrefix: proxy.Spec.SessionStorage.KeyPrefix, + } +} + // resolveAndAddOIDCConfig resolves OIDC configuration from the shared MCPOIDCConfigRef, // adds the appropriate runner options, and returns the resolved config. func (r *MCPRemoteProxyReconciler) resolveAndAddOIDCConfig( diff --git a/cmd/thv-operator/controllers/mcpremoteproxy_runconfig_test.go b/cmd/thv-operator/controllers/mcpremoteproxy_runconfig_test.go index 40ee973f4b..73e7fe874a 100644 --- a/cmd/thv-operator/controllers/mcpremoteproxy_runconfig_test.go +++ b/cmd/thv-operator/controllers/mcpremoteproxy_runconfig_test.go @@ -856,3 +856,89 @@ func TestLabelsForRunConfigRemoteProxy(t *testing.T) { result := labelsForRunConfigRemoteProxy("test-proxy") assert.Equal(t, expected, result) } + +// TestPopulateScalingConfigForRemoteProxy tests SessionRedis injection into RunConfig +// from MCPRemoteProxy.spec.sessionStorage. Mirrors TestPopulateScalingConfig in +// mcpserver_runconfig_test.go. MCPRemoteProxy has no BackendReplicas concept (no +// backend StatefulSet), so we only cover the SessionRedis path. +func TestPopulateScalingConfigForRemoteProxy(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + storage *mcpv1beta1.SessionStorageConfig + expected func(t *testing.T, sc *runner.ScalingConfig) + }{ + { + name: "nil sessionStorage — ScalingConfig stays nil", + storage: nil, + expected: func(t *testing.T, sc *runner.ScalingConfig) { + t.Helper() + assert.Nil(t, sc) + }, + }, + { + name: "memory provider — ScalingConfig stays nil", + storage: &mcpv1beta1.SessionStorageConfig{Provider: "memory"}, + expected: func(t *testing.T, sc *runner.ScalingConfig) { + t.Helper() + assert.Nil(t, sc) + }, + }, + { + name: "redis — address/db/keyPrefix written to SessionRedis", + storage: &mcpv1beta1.SessionStorageConfig{ + Provider: mcpv1beta1.SessionStorageProviderRedis, + Address: "redis.default.svc:6379", + DB: 2, + KeyPrefix: "thv:", + }, + expected: func(t *testing.T, sc *runner.ScalingConfig) { + t.Helper() + require.NotNil(t, sc) + require.NotNil(t, sc.SessionRedis) + assert.Equal(t, "redis.default.svc:6379", sc.SessionRedis.Address) + assert.Equal(t, int32(2), sc.SessionRedis.DB) + assert.Equal(t, "thv:", sc.SessionRedis.KeyPrefix) + }, + }, + { + name: "redis with passwordRef — password NOT in SessionRedis", + storage: &mcpv1beta1.SessionStorageConfig{ + Provider: mcpv1beta1.SessionStorageProviderRedis, + Address: "redis:6379", + PasswordRef: &mcpv1beta1.SecretKeyRef{ + Name: "redis-secret", + Key: "password", + }, + }, + expected: func(t *testing.T, sc *runner.ScalingConfig) { + t.Helper() + require.NotNil(t, sc) + require.NotNil(t, sc.SessionRedis) + assert.Equal(t, "redis:6379", sc.SessionRedis.Address) + // Password must NOT leak into the RunConfig — it's injected + // separately as the THV_SESSION_REDIS_PASSWORD pod env var + // by buildRedisPasswordEnvVarForRemoteProxy. + data, err := json.Marshal(sc) + require.NoError(t, err) + assert.NotContains(t, string(data), "redis-secret") + assert.NotContains(t, string(data), "password") + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + runConfig := &runner.RunConfig{} + proxy := &mcpv1beta1.MCPRemoteProxy{ + Spec: mcpv1beta1.MCPRemoteProxySpec{ + SessionStorage: tt.storage, + }, + } + populateScalingConfigForRemoteProxy(runConfig, proxy) + tt.expected(t, runConfig.ScalingConfig) + }) + } +} diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml index fe5e8a8a22..777c310691 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml @@ -361,6 +361,19 @@ spec: description: RemoteURL is the URL of the remote MCP server to proxy pattern: ^https?:// type: string + replicas: + description: |- + Replicas is the desired number of proxy pod replicas. + MCPRemoteProxy creates a single Deployment for the proxy process, so there + is only one replicas field (mirrors VirtualMCPServer.spec.replicas). + When nil, the operator does not set Deployment.Spec.Replicas, leaving replica + management to an HPA or other external controller. + When set above 1, also configure sessionStorage with the redis provider and + sessionAffinity: "None" so sessions resolve across replicas; otherwise a + SessionStorageWarning condition is surfaced on the resource status. + format: int32 + minimum: 0 + type: integer resourceOverrides: description: ResourceOverrides allows overriding annotations and labels for resources created by the operator @@ -498,10 +511,78 @@ spec: SessionAffinity controls whether the Service routes repeated client connections to the same pod. MCP protocols (SSE, streamable-http) are stateful, so ClientIP is the default. Set to "None" for stateless servers or when using an external load balancer with its own affinity. + + Interaction with sessionStorage: when running multiple replicas with + sessionStorage.provider "redis", set this to "None" so requests are + distributed across replicas and sessions resolve via the shared store. + Conversely, "None" without Redis-backed sessionStorage breaks session + continuity — any request landing on a different pod fails with + "Session not found". enum: - ClientIP - None type: string + sessionStorage: + description: |- + SessionStorage configures session storage for stateful horizontal scaling. + When nil, no session storage is configured and the proxy falls back to + pod-local in-memory session state — incompatible with multi-replica + deployments behind load balancers that don't preserve client-IP affinity + (e.g. AWS ALB across multiple AZs). + + The transparent proxy validates `Mcp-Session-Id` against this store on + every non-initialize request (see pkg/transport/proxy/transparent/ + transparent_proxy.go) and rewrites client-facing session IDs to backend + session IDs using session metadata. Both lookups require shared state + across replicas. + + When using the Redis provider, also set sessionAffinity to "None" so the + Service routes requests round-robin and all replicas rely on the shared + session store rather than pod-local state. + + Mirrors MCPServer.spec.sessionStorage and VirtualMCPServer.spec.sessionStorage. + properties: + address: + description: Address is the Redis server address (required when + provider is redis) + minLength: 1 + type: string + db: + default: 0 + description: DB is the Redis database number + format: int32 + minimum: 0 + type: integer + keyPrefix: + description: KeyPrefix is an optional prefix for all Redis keys + used by ToolHive + type: string + passwordRef: + description: PasswordRef is a reference to a Secret key containing + the Redis password + properties: + key: + description: Key is the key within the secret + type: string + name: + description: Name is the name of the secret + type: string + required: + - key + - name + type: object + provider: + description: Provider is the session storage backend type + enum: + - memory + - redis + type: string + required: + - provider + type: object + x-kubernetes-validations: + - message: address is required + rule: 'self.provider == ''redis'' ? has(self.address) : true' telemetryConfigRef: description: |- TelemetryConfigRef references an MCPTelemetryConfig resource for shared telemetry configuration. @@ -1013,6 +1094,19 @@ spec: description: RemoteURL is the URL of the remote MCP server to proxy pattern: ^https?:// type: string + replicas: + description: |- + Replicas is the desired number of proxy pod replicas. + MCPRemoteProxy creates a single Deployment for the proxy process, so there + is only one replicas field (mirrors VirtualMCPServer.spec.replicas). + When nil, the operator does not set Deployment.Spec.Replicas, leaving replica + management to an HPA or other external controller. + When set above 1, also configure sessionStorage with the redis provider and + sessionAffinity: "None" so sessions resolve across replicas; otherwise a + SessionStorageWarning condition is surfaced on the resource status. + format: int32 + minimum: 0 + type: integer resourceOverrides: description: ResourceOverrides allows overriding annotations and labels for resources created by the operator @@ -1150,10 +1244,78 @@ spec: SessionAffinity controls whether the Service routes repeated client connections to the same pod. MCP protocols (SSE, streamable-http) are stateful, so ClientIP is the default. Set to "None" for stateless servers or when using an external load balancer with its own affinity. + + Interaction with sessionStorage: when running multiple replicas with + sessionStorage.provider "redis", set this to "None" so requests are + distributed across replicas and sessions resolve via the shared store. + Conversely, "None" without Redis-backed sessionStorage breaks session + continuity — any request landing on a different pod fails with + "Session not found". enum: - ClientIP - None type: string + sessionStorage: + description: |- + SessionStorage configures session storage for stateful horizontal scaling. + When nil, no session storage is configured and the proxy falls back to + pod-local in-memory session state — incompatible with multi-replica + deployments behind load balancers that don't preserve client-IP affinity + (e.g. AWS ALB across multiple AZs). + + The transparent proxy validates `Mcp-Session-Id` against this store on + every non-initialize request (see pkg/transport/proxy/transparent/ + transparent_proxy.go) and rewrites client-facing session IDs to backend + session IDs using session metadata. Both lookups require shared state + across replicas. + + When using the Redis provider, also set sessionAffinity to "None" so the + Service routes requests round-robin and all replicas rely on the shared + session store rather than pod-local state. + + Mirrors MCPServer.spec.sessionStorage and VirtualMCPServer.spec.sessionStorage. + properties: + address: + description: Address is the Redis server address (required when + provider is redis) + minLength: 1 + type: string + db: + default: 0 + description: DB is the Redis database number + format: int32 + minimum: 0 + type: integer + keyPrefix: + description: KeyPrefix is an optional prefix for all Redis keys + used by ToolHive + type: string + passwordRef: + description: PasswordRef is a reference to a Secret key containing + the Redis password + properties: + key: + description: Key is the key within the secret + type: string + name: + description: Name is the name of the secret + type: string + required: + - key + - name + type: object + provider: + description: Provider is the session storage backend type + enum: + - memory + - redis + type: string + required: + - provider + type: object + x-kubernetes-validations: + - message: address is required + rule: 'self.provider == ''redis'' ? has(self.address) : true' telemetryConfigRef: description: |- TelemetryConfigRef references an MCPTelemetryConfig resource for shared telemetry configuration. diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml index 537607ff2b..f54568e66e 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml @@ -364,6 +364,19 @@ spec: description: RemoteURL is the URL of the remote MCP server to proxy pattern: ^https?:// type: string + replicas: + description: |- + Replicas is the desired number of proxy pod replicas. + MCPRemoteProxy creates a single Deployment for the proxy process, so there + is only one replicas field (mirrors VirtualMCPServer.spec.replicas). + When nil, the operator does not set Deployment.Spec.Replicas, leaving replica + management to an HPA or other external controller. + When set above 1, also configure sessionStorage with the redis provider and + sessionAffinity: "None" so sessions resolve across replicas; otherwise a + SessionStorageWarning condition is surfaced on the resource status. + format: int32 + minimum: 0 + type: integer resourceOverrides: description: ResourceOverrides allows overriding annotations and labels for resources created by the operator @@ -501,10 +514,78 @@ spec: SessionAffinity controls whether the Service routes repeated client connections to the same pod. MCP protocols (SSE, streamable-http) are stateful, so ClientIP is the default. Set to "None" for stateless servers or when using an external load balancer with its own affinity. + + Interaction with sessionStorage: when running multiple replicas with + sessionStorage.provider "redis", set this to "None" so requests are + distributed across replicas and sessions resolve via the shared store. + Conversely, "None" without Redis-backed sessionStorage breaks session + continuity — any request landing on a different pod fails with + "Session not found". enum: - ClientIP - None type: string + sessionStorage: + description: |- + SessionStorage configures session storage for stateful horizontal scaling. + When nil, no session storage is configured and the proxy falls back to + pod-local in-memory session state — incompatible with multi-replica + deployments behind load balancers that don't preserve client-IP affinity + (e.g. AWS ALB across multiple AZs). + + The transparent proxy validates `Mcp-Session-Id` against this store on + every non-initialize request (see pkg/transport/proxy/transparent/ + transparent_proxy.go) and rewrites client-facing session IDs to backend + session IDs using session metadata. Both lookups require shared state + across replicas. + + When using the Redis provider, also set sessionAffinity to "None" so the + Service routes requests round-robin and all replicas rely on the shared + session store rather than pod-local state. + + Mirrors MCPServer.spec.sessionStorage and VirtualMCPServer.spec.sessionStorage. + properties: + address: + description: Address is the Redis server address (required when + provider is redis) + minLength: 1 + type: string + db: + default: 0 + description: DB is the Redis database number + format: int32 + minimum: 0 + type: integer + keyPrefix: + description: KeyPrefix is an optional prefix for all Redis keys + used by ToolHive + type: string + passwordRef: + description: PasswordRef is a reference to a Secret key containing + the Redis password + properties: + key: + description: Key is the key within the secret + type: string + name: + description: Name is the name of the secret + type: string + required: + - key + - name + type: object + provider: + description: Provider is the session storage backend type + enum: + - memory + - redis + type: string + required: + - provider + type: object + x-kubernetes-validations: + - message: address is required + rule: 'self.provider == ''redis'' ? has(self.address) : true' telemetryConfigRef: description: |- TelemetryConfigRef references an MCPTelemetryConfig resource for shared telemetry configuration. @@ -1016,6 +1097,19 @@ spec: description: RemoteURL is the URL of the remote MCP server to proxy pattern: ^https?:// type: string + replicas: + description: |- + Replicas is the desired number of proxy pod replicas. + MCPRemoteProxy creates a single Deployment for the proxy process, so there + is only one replicas field (mirrors VirtualMCPServer.spec.replicas). + When nil, the operator does not set Deployment.Spec.Replicas, leaving replica + management to an HPA or other external controller. + When set above 1, also configure sessionStorage with the redis provider and + sessionAffinity: "None" so sessions resolve across replicas; otherwise a + SessionStorageWarning condition is surfaced on the resource status. + format: int32 + minimum: 0 + type: integer resourceOverrides: description: ResourceOverrides allows overriding annotations and labels for resources created by the operator @@ -1153,10 +1247,78 @@ spec: SessionAffinity controls whether the Service routes repeated client connections to the same pod. MCP protocols (SSE, streamable-http) are stateful, so ClientIP is the default. Set to "None" for stateless servers or when using an external load balancer with its own affinity. + + Interaction with sessionStorage: when running multiple replicas with + sessionStorage.provider "redis", set this to "None" so requests are + distributed across replicas and sessions resolve via the shared store. + Conversely, "None" without Redis-backed sessionStorage breaks session + continuity — any request landing on a different pod fails with + "Session not found". enum: - ClientIP - None type: string + sessionStorage: + description: |- + SessionStorage configures session storage for stateful horizontal scaling. + When nil, no session storage is configured and the proxy falls back to + pod-local in-memory session state — incompatible with multi-replica + deployments behind load balancers that don't preserve client-IP affinity + (e.g. AWS ALB across multiple AZs). + + The transparent proxy validates `Mcp-Session-Id` against this store on + every non-initialize request (see pkg/transport/proxy/transparent/ + transparent_proxy.go) and rewrites client-facing session IDs to backend + session IDs using session metadata. Both lookups require shared state + across replicas. + + When using the Redis provider, also set sessionAffinity to "None" so the + Service routes requests round-robin and all replicas rely on the shared + session store rather than pod-local state. + + Mirrors MCPServer.spec.sessionStorage and VirtualMCPServer.spec.sessionStorage. + properties: + address: + description: Address is the Redis server address (required when + provider is redis) + minLength: 1 + type: string + db: + default: 0 + description: DB is the Redis database number + format: int32 + minimum: 0 + type: integer + keyPrefix: + description: KeyPrefix is an optional prefix for all Redis keys + used by ToolHive + type: string + passwordRef: + description: PasswordRef is a reference to a Secret key containing + the Redis password + properties: + key: + description: Key is the key within the secret + type: string + name: + description: Name is the name of the secret + type: string + required: + - key + - name + type: object + provider: + description: Provider is the session storage backend type + enum: + - memory + - redis + type: string + required: + - provider + type: object + x-kubernetes-validations: + - message: address is required + rule: 'self.provider == ''redis'' ? has(self.address) : true' telemetryConfigRef: description: |- TelemetryConfigRef references an MCPTelemetryConfig resource for shared telemetry configuration. diff --git a/docs/operator/crd-api.md b/docs/operator/crd-api.md index 32c720ce85..9d2c604524 100644 --- a/docs/operator/crd-api.md +++ b/docs/operator/crd-api.md @@ -2284,7 +2284,9 @@ _Appears in:_ | `endpointPrefix` _string_ | EndpointPrefix is the path prefix to prepend to SSE endpoint URLs.
This is used to handle path-based ingress routing scenarios where the ingress
strips a path prefix before forwarding to the backend. | | Optional: \{\}
| | `resourceOverrides` _[api.v1beta1.ResourceOverrides](#apiv1beta1resourceoverrides)_ | ResourceOverrides allows overriding annotations and labels for resources created by the operator | | Optional: \{\}
| | `groupRef` _[api.v1beta1.MCPGroupRef](#apiv1beta1mcpgroupref)_ | GroupRef references the MCPGroup this proxy belongs to.
The referenced MCPGroup must be in the same namespace. | | Optional: \{\}
| -| `sessionAffinity` _string_ | SessionAffinity controls whether the Service routes repeated client connections to the same pod.
MCP protocols (SSE, streamable-http) are stateful, so ClientIP is the default.
Set to "None" for stateless servers or when using an external load balancer with its own affinity. | ClientIP | Enum: [ClientIP None]
Optional: \{\}
| +| `sessionAffinity` _string_ | SessionAffinity controls whether the Service routes repeated client connections to the same pod.
MCP protocols (SSE, streamable-http) are stateful, so ClientIP is the default.
Set to "None" for stateless servers or when using an external load balancer with its own affinity.
Interaction with sessionStorage: when running multiple replicas with
sessionStorage.provider "redis", set this to "None" so requests are
distributed across replicas and sessions resolve via the shared store.
Conversely, "None" without Redis-backed sessionStorage breaks session
continuity — any request landing on a different pod fails with
"Session not found". | ClientIP | Enum: [ClientIP None]
Optional: \{\}
| +| `replicas` _integer_ | Replicas is the desired number of proxy pod replicas.
MCPRemoteProxy creates a single Deployment for the proxy process, so there
is only one replicas field (mirrors VirtualMCPServer.spec.replicas).
When nil, the operator does not set Deployment.Spec.Replicas, leaving replica
management to an HPA or other external controller.
When set above 1, also configure sessionStorage with the redis provider and
sessionAffinity: "None" so sessions resolve across replicas; otherwise a
SessionStorageWarning condition is surfaced on the resource status. | | Minimum: 0
Optional: \{\}
| +| `sessionStorage` _[api.v1beta1.SessionStorageConfig](#apiv1beta1sessionstorageconfig)_ | SessionStorage configures session storage for stateful horizontal scaling.
When nil, no session storage is configured and the proxy falls back to
pod-local in-memory session state — incompatible with multi-replica
deployments behind load balancers that don't preserve client-IP affinity
(e.g. AWS ALB across multiple AZs).
The transparent proxy validates `Mcp-Session-Id` against this store on
every non-initialize request (see pkg/transport/proxy/transparent/
transparent_proxy.go) and rewrites client-facing session IDs to backend
session IDs using session metadata. Both lookups require shared state
across replicas.
When using the Redis provider, also set sessionAffinity to "None" so the
Service routes requests round-robin and all replicas rely on the shared
session store rather than pod-local state.
Mirrors MCPServer.spec.sessionStorage and VirtualMCPServer.spec.sessionStorage. | | Optional: \{\}
| #### api.v1beta1.MCPRemoteProxyStatus @@ -3373,6 +3375,7 @@ into the vMCP ConfigMap so the vMCP process receives connection parameters at st _Appears in:_ +- [api.v1beta1.MCPRemoteProxySpec](#apiv1beta1mcpremoteproxyspec) - [api.v1beta1.MCPServerSpec](#apiv1beta1mcpserverspec) - [api.v1beta1.VirtualMCPServerSpec](#apiv1beta1virtualmcpserverspec)